File: pileLettersAt.Rd

package info (click to toggle)
r-bioc-genomicalignments 1.0.6-1
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 2,980 kB
  • ctags: 54
  • sloc: ansic: 1,493; makefile: 4; sh: 3
file content (137 lines) | stat: -rw-r--r-- 4,496 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
\name{pileLettersAt}

\alias{pileLettersAt}


\title{Pile the letters of a set of aligned reads on top of a set
       of individual genomic positions}

\description{
  \code{pileLettersAt} extracts the letters/nucleotides of a set of
  reads that align to a set of individual genomic positions of interest.
  The extracted letters are returned as "piles of letters" (one per
  genomic position of interest) stored in an \link[Biostrings]{XStringSet}
  (typically \link[Biostrings]{DNAStringSet}) object.
}

\usage{
pileLettersAt(x, seqnames, pos, cigar, at)
}

\arguments{
  \item{x}{
    An \link[Biostrings]{XStringSet} (typically
    \link[Biostrings]{DNAStringSet}) object containing N \emph{unaligned}
    read sequences (a.k.a. the query sequences) reported with respect to
    the + strand. 
  }
  \item{seqnames}{
    A factor-\link[IRanges]{Rle} \emph{parallel} to \code{x}.
    For each \code{i}, \code{seqnames[i]} must be the name of the reference
    sequence of the i-th alignment.
  }
  \item{pos}{
    An integer vector \emph{parallel} to \code{x}.
    For each \code{i}, \code{pos[i]} must be the 1-based position
    on the reference sequence of the first aligned letter in \code{x[[i]]}.
  }
  \item{cigar}{
    A character vector \emph{parallel} to \code{x}. Contains the extended
    CIGAR strings of the alignments.
  }
  \item{at}{
    A \link[GenomicRanges]{GRanges} object containing the individual genomic
    positions of interest. \code{seqlevels(at)} must be identical to
    \code{levels(seqnames)}.
  }
}

\details{
  \code{x}, \code{seqnames}, \code{pos}, \code{cigar} must be 4 \emph{parallel}
  vectors describing N aligned reads.
}

\value{
  An \link[Biostrings]{XStringSet} (typically \link[Biostrings]{DNAStringSet})
  object \emph{parallel} to \code{at} (i.e. with 1 string per individual
  genomic position).
}

\author{H. Pages}

\seealso{
  \itemize{
    \item \link[Biostrings]{DNAStringSet} objects in the \pkg{Biostrings}
          package.

    \item \link[GenomicRanges]{GRanges} objects in the \pkg{GenomicRanges}
          package.

    \item The \code{\link{stackStringsFromBam}} function
          for stacking the read sequences (or their quality strings)
          stored in a BAM file on a region of interest.

    \item \link{GAlignments} objects.

    \item \link{cigar-utils} for the CIGAR utility functions used internally
          by \code{pileLettersAt}.

    \item The SAMtools mpileup command available at
          \url{http://samtools.sourceforge.net/} as part of the
          SAMtools project.
  }
}

\examples{
## Input

##   - A BAM file:
bamfile <- BamFile(system.file("extdata", "ex1.bam", package="Rsamtools"))
seqinfo(bamfile)  # to see the seqlevels and seqlengths
stackStringsFromBam(bamfile, param="seq1:1-21")  # a quick look at
                                                 # the reads

##   - A GRanges object containing Individual Genomic Positions Of
##     Interest:
my_IGPOI <- GRanges(Rle(c("seq1", "seq2"), c(7, 2)),
                    IRanges(c(1:5, 21, 1575, 1513:1514), width=1))

## Some preliminary massage on 'my_IGPOI'

seqinfo(my_IGPOI) <- merge(seqinfo(my_IGPOI), seqinfo(bamfile))
seqlevels(my_IGPOI) <- seqlevelsInUse(my_IGPOI)

## Load the BAM file in a GAlignments object. We load only the reads
## aligned to the sequences in 'seqlevels(my_IGPOI)' and we filter out
## reads not passing quality controls (flag bit 0x200) and PCR or
## optical duplicates (flag bit 0x400). See ?ScanBamParam and the SAM
## Spec for more information. 

which <- as(seqinfo(my_IGPOI), "GRanges")
flag <- scanBamFlag(isNotPassingQualityControls=FALSE,
                    isDuplicate=FALSE)
what <- c("seq", "qual")
param <- ScanBamParam(flag=flag, what=c("seq", "qual"), which=which)
gal <- readGAlignmentsFromBam(bamfile, param=param)
seqlevels(gal) <- seqlevels(my_IGPOI) 

## Extract the read sequences (a.k.a. query sequences) and quality
## strings. Both are reported with respect to the + strand.

qseq <- mcols(gal)$seq
qual <- mcols(gal)$qual

nucl_piles <- pileLettersAt(qseq, seqnames(gal), start(gal), cigar(gal),
                            my_IGPOI)
qual_piles <- pileLettersAt(qual, seqnames(gal), start(gal), cigar(gal),
                            my_IGPOI)
mcols(my_IGPOI)$nucl_piles <- nucl_piles
mcols(my_IGPOI)$qual_piles <- qual_piles
my_IGPOI 

## Finally, to summarize A/C/G/T frequencies at each position:
alphabetFrequency(nucl_piles, baseOnly=TRUE)
}

\keyword{methods}
\keyword{manip}