File: summarizeOverlaps-methods.Rd

package info (click to toggle)
r-bioc-genomicalignments 1.0.6-1
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 2,980 kB
  • ctags: 54
  • sloc: ansic: 1,493; makefile: 4; sh: 3
file content (497 lines) | stat: -rw-r--r-- 21,767 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
\name{summarizeOverlaps-methods}

\alias{summarizeOverlaps-methods}

\alias{summarizeOverlaps}
\alias{summarizeOverlaps,GRanges,GAlignments-method}
\alias{summarizeOverlaps,GRangesList,GAlignments-method}
\alias{summarizeOverlaps,GRanges,GAlignmentsList-method}
\alias{summarizeOverlaps,GRangesList,GAlignmentsList-method}
\alias{summarizeOverlaps,GRanges,GAlignmentPairs-method}
\alias{summarizeOverlaps,GRangesList,GAlignmentPairs-method}

\alias{Union}
\alias{IntersectionStrict}
\alias{IntersectionNotEmpty}

\alias{summarizeOverlaps,GRanges,BamFile-method}
\alias{summarizeOverlaps,GRangesList,BamFile-method}
\alias{summarizeOverlaps,GRanges,character-method}
\alias{summarizeOverlaps,GRangesList,character-method}
\alias{summarizeOverlaps,GRanges,BamFileList-method}
\alias{summarizeOverlaps,GRangesList,BamFileList-method}
\alias{summarizeOverlaps,BamViews,missing-method}


\title{Perform overlap queries between reads and genomic features} 

\description{
  \code{summarizeOverlaps} extends \code{findOverlaps} by providing 
  options to resolve reads that overlap multiple features. 
}

\usage{
\S4method{summarizeOverlaps}{GRanges,GAlignments}(
  features, reads, mode, ignore.strand=FALSE, ..., inter.feature=TRUE) 

\S4method{summarizeOverlaps}{GRangesList,GAlignments}(
  features, reads, mode, ignore.strand=FALSE, ..., inter.feature=TRUE) 

\S4method{summarizeOverlaps}{GRanges,GAlignmentPairs}(
  features, reads, mode, ignore.strand=FALSE, ..., inter.feature=TRUE) 

\S4method{summarizeOverlaps}{GRangesList,GAlignmentPairs}(
  features, reads, mode, ignore.strand=FALSE, ..., inter.feature=TRUE) 

## mode funtions
Union(features, reads, ignore.strand=FALSE, inter.feature=TRUE)
IntersectionStrict(features, reads, ignore.strand=FALSE, inter.feature=TRUE)
IntersectionNotEmpty(features, reads, ignore.strand=FALSE, inter.feature=TRUE)

\S4method{summarizeOverlaps}{GRanges,BamFile}(
  features, reads, mode, ignore.strand=FALSE, ..., inter.feature=TRUE,
  singleEnd=TRUE, fragments=FALSE, param=ScanBamParam())

\S4method{summarizeOverlaps}{BamViews,missing}(
  features, reads, mode, ignore.strand=FALSE, ..., inter.feature=TRUE,
  singleEnd=TRUE, fragments=FALSE, param=ScanBamParam())
}

\arguments{
  \item{features}{
    A \link[GenomicRanges]{GRanges} or a \link[GenomicRanges]{GRangesList}
    object of genomic regions of interest. When a \link[GenomicRanges]{GRanges}
    is supplied, each row is considered a feature. When a
    \link[GenomicRanges]{GRangesList} is supplied, each higher list-level is
    considered a feature. This distinction is important when defining
    overlaps.

    Can also be a \link[Rsamtools]{BamViews} object. In that case
    \code{features} are extracted from the \code{bamRanges} of the
    \link[Rsamtools]{BamViews} object.

    Metadata from \code{bamPaths} and \code{bamSamples} are stored in the
    \code{colData} slot of the \link[GenomicRanges]{SummarizedExperiment}
    object. \code{bamExperiment} metadata are in the \code{exptData}
    slot. 
  } 
  \item{reads}{
    A \link[Rsamtools]{BamViews} or \link[Rsamtools]{BamFileList} object
    that represents the data to be counted by \code{summarizeOverlaps}.

    Can be missing when a \link[Rsamtools]{BamViews} object is the only
    argument supplied to \code{summarizeOverlaps}.
    \code{reads} are the files specified in \code{bamPaths} of the
    \link[Rsamtools]{BamViews} object.
  }
  \item{mode}{
    \code{mode} can be one of the pre-defined count methods such as
    "Union", "IntersectionStrict", or "IntersectionNotEmpty" or
    it can be a user supplied count function. For a custom count
    funtion, the input arguments must match those of the pre-defined 
    options and the function must return a vector of counts the same
    length as the annotation ('features' argument). See examples for
    details.

    The pre-defined options are designed after the counting modes 
    available in the HTSeq package by Simon Anders (see references).

    \itemize{
      \item "Union" : (Default) Reads that overlap any portion of exactly one
            feature are counted. Reads that overlap multiple features are
            discarded. This is the most conservative of the 3 modes.
      \item "IntersectionStrict" : A read must fall completely "within" the
            feature to be counted. If a read overlaps multiple features but
            falls "within" only one, the read is counted for that feature.
            If the read is "within" multiple features, the read is discarded.
      \item "IntersectionNotEmpty" : A read must fall in a unique disjoint
            region of a feature to be counted. When a read overlaps multiple
            features, the features are partitioned into disjoint intervals.
            Regions that are shared between the features are discarded leaving
            only the unique disjoint regions. If the read overlaps one of
            these remaining regions, it is assigned to the feature the
            unique disjoint region came from.
      \item user supplied function : A function can be supplied as the
            \code{mode} argument. It must (1) have arguments that correspond
            to \code{features}, \code{reads}, \code{ignore.strand} and
            \code{inter.feature} arguments (as in the defined mode functions)
            and (2) return a vector of counts the same length as
            \code{features}. 
    }
  }
  \item{ignore.strand}{
    A logical indicating if strand should be considered when matching.
  }
  \item{inter.feature}{
    (Default TRUE) A logical indicating if the counting \code{mode} should 
    be aware of overlapping features. When TRUE (default), reads mapping to 
    multiple features are dropped (i.e., not counted). When FALSE, these 
    reads are retained and a count is assigned to each feature they map to.

    There are 6 possible combinations of the \code{mode} and
    \code{inter.feature} arguments. When \code{inter.feature=FALSE} the
    behavior of modes \sQuote{Union} and \sQuote{IntersectionStrict} are 
    essentially \sQuote{countOverlaps} with \sQuote{type=any} and
    \code{type=within}, respectively. \sQuote{IntersectionNotEmpty} does
    not reduce to a simple countOverlaps because common (shared) regions 
    of the annotation are removed before counting.
  }
  \item{...}{
    Additional arguments passed to functions or methods called
    from within \code{summarizeOverlaps}. For BAM file methods
    arguments may include \code{singleEnd}, \code{fragments} or
    \code{param} which apply to reading records from a file
    (see below). Providing \code{count.mapped.reads=TRUE} include
    additional passes through the BAM file to collect statistics
    similar to those from \code{countBam}.
  }
  \item{singleEnd}{
    (Default TRUE) A logical value indicating if reads are single or 
    paired-end. In Bioconductor > 2.12 it is not necessary to sort
    paired-end BAM files by \code{qname}. When counting with 
    \code{summarizeOverlaps}, setting \code{singleEnd=FALSE} will trigger 
    paired-end reading and counting. It is fine to also set 
    \code{asMates=TRUE} in the \code{BamFile} but is not necessary when
    \code{singleEnd=FALSE}.
  }
  \item{fragments}{
    (Default FALSE) Applies to paired-end data only so \code{singleEnd}
    must be FALSE. 

    \code{fragments} is a logical value indicating if singletons, reads 
    with unmapped pairs and other fragments should be included in counting. 
    When \code{fragments=FALSE} the
    \code{\link{readGAlignmentPairs}} function
    from the \pkg{GenomicAlignments} package is used to 
    read in the data, when \code{fragments=TRUE} the
    \code{\link{readGAlignmentsList}}
    is used.

    \code{readGAlignmentPairs} keeps only the read pairs mated by the 
    algorithm while \code{readGAlignmentsList} keeps the pairs as well as 
    all singletons, reads with unmapped pairs and other fragments. When 
    \code{fragments=TRUE} counts will generally be higher because all 
    records are included in the counting, not just the primary alignment 
    pairs. See \code{?\link{readGAlignmentsListFromBam}} for the algorithm
    details.
  }
  \item{param}{An optional \code{\link[Rsamtools]{ScanBamParam}} instance to
    further influence scanning, counting, or filtering.

    See \code{?\link{BamFile}} for details of how records are returned
    when both \code{yieldSize} is specified in a \code{\link{BamFile}} and
    \code{which} is defined in a \code{\link{ScanBamParam}}.
  }
}

\details{
  \describe{
    \item{}{\code{summarizeOverlaps} offers counting modes to resolve reads 
      that overlap multiple features. The \code{mode} argument defines a
      set of rules to resolve the read to a single feature such that each read 
      is counted a maximum of once. New to GenomicRanges >= 1.13.9 is the
      \code{inter.feature} argument which allows reads to be counted for
      each feature they overlap. When \code{inter.feature=TRUE} the counting
      modes are aware of feature overlap and reads overlapping multiple 
      features are dropped and not counted. When \code{inter.feature=FALSE} 
      multiple feature overlap is ignored and reads are counted once for each 
      feature they map to. This essentially reduces modes \sQuote{Union} and
      \sQuote{IntersectionStrict} to \code{countOverlaps} with 
      \code{type="any"}, and \code{type="within"}, respectively.
      \sQuote{IntersectionNotEmpty} is not reduced to a derivative of
      \code{countOverlaps} because the shared regions are removed before 
      counting.

      The \code{BamViews}, \code{BamFile} and \code{BamFileList} methods
      summarize overlaps across one or several files. The latter uses
      \code{bplapply}; control parallel evaluation using the
      \code{\link{register}} interface in the \pkg{BiocParallel} package.

    }
    \item{features :}{
      A \sQuote{feature} can be any portion of a genomic region such as a gene, 
      transcript, exon etc. When the \code{features} argument is a 
      \link[GenomicRanges]{GRanges} the rows define the features. The result
      will be the same length as the \link[GenomicRanges]{GRanges}. When 
      \code{features} is a \link[GenomicRanges]{GRangesList} the highest
      list-level defines the features and the result will be the same length
      as the \link[GenomicRanges]{GRangesList}. 

      When \code{inter.feature=TRUE}, each count \code{mode} attempts to 
      assign a read that overlaps multiple features to a single feature. If 
      there are ranges that should be considered together (e.g., exons by 
      transcript or cds regions by gene) the \link[GenomicRanges]{GRangesList}
      would be appropriate. If there is no grouping in the data then a 
      \link[GenomicRanges]{GRanges} would be appropriate. 
    }
    \item{paired-end reads :}{
      Paired-end reads are counted as a single hit if one or both parts of
      the pair are overlapped. Paired-end records can be counted in a 
      \link{GAlignmentPairs} container or BAM file. 

      Counting pairs in BAM files:
      \itemize{
        \item{The \code{singleEnd} argument should be FALSE.}
        \item{When \code{reads} are supplied as a BamFile or BamFileList, 
              the \code{asMates} argument to the BamFile should be TRUE.}
        \item{When \code{fragments} is TRUE, a \code{GAlignmentPairs}
              object is used in counting (pairs only).}
        \item{When \code{fragments} is FALSE, a \code{GAlignmentsList}
              object is used in counting (pairs, singletons, unmapped
              mates, etc.)}
      }
    }
  }
}

\value{
  A \link[GenomicRanges]{SummarizedExperiment} object. The \code{assays} slot
  holds the counts, \code{rowData} holds the annotation specified in 
  \code{features}. 

  \code{colData} is a DataFrame with columns of \sQuote{object} (class of 
  \code{reads}) and \sQuote{records} (length of \code{reads}). When \code{reads}
  is a BamFile or BamFileList and \code{count.mapped.reads=TRUE},
  the \code{colData} holds the output of a call
  to \code{countBam} with columns of \sQuote{records} (total records in file), 
  \sQuote{nucleotides} and \sQuote{mapped}. The number in \sQuote{mapped} is
  the number of records returned when \code{isUnmappedQuery=FALSE} in the 
  \sQuote{ScanBamParam}.
}

\references{
  HTSeq :
  \url{http://www-huber.embl.de/users/anders/HTSeq/doc/overview.html}

  htseq-count :
  \url{http://www-huber.embl.de/users/anders/HTSeq/doc/count.html}
}

\author{Valerie Obenchain <vobencha@fhcrc.org>}

\seealso{
  \itemize{
    \item The \pkg{DESeq}, \pkg{DEXSeq} and \pkg{edgeR} packages.

    \item The \link{GAlignments} and \link{GAlignmentPairs} classes.

    \item The \link[Rsamtools]{BamFileList} and \link[Rsamtools]{BamViews}
          classes in the \pkg{Rsamtools} package.

    \item The \link{readGAlignments} and \link{readGAlignmentPairs} functions.
  }
}

\examples{
reads <- GAlignments(
    names = c("a","b","c","d","e","f","g"),
    seqnames = Rle(c(rep(c("chr1", "chr2"), 3), "chr1")),
    pos = as.integer(c(1400, 2700, 3400, 7100, 4000, 3100, 5200)),
    cigar = c("500M", "100M", "300M", "500M", "300M", 
              "50M200N50M", "50M150N50M"),
    strand = strand(rep("+", 7)))

gr <- GRanges(
    seqnames = c(rep("chr1", 7), rep("chr2", 4)), strand = "+", 
    ranges = IRanges(c(1000, 3000, 3600, 4000, 4000, 5000, 5400, 
                       2000, 3000, 7000, 7500), 
                     width = c(500, 500, 300, 500, 900, 500, 500, 
                               900, 500, 600, 300),
                     names=c("A", "B", "C1", "C2", "D1", "D2", "E", "F",
                             "G", "H1", "H2"))) 
groups <- factor(c(1,2,3,3,4,4,5,6,7,8,8))
grl <- splitAsList(gr, groups)
names(grl) <- LETTERS[seq_along(grl)]

## ---------------------------------------------------------------------
## Counting modes. 
## ---------------------------------------------------------------------

## First count with a GRanges as the 'features'. 'Union' is the
## most conservative counting mode followed by 'IntersectionStrict' 
## then 'IntersectionNotEmpty'.
counts1 <- 
    data.frame(union=assays(summarizeOverlaps(gr, reads))$counts, 
               intStrict=assays(summarizeOverlaps(gr, reads, 
                                mode="IntersectionStrict"))$counts,
               intNotEmpty=assays(summarizeOverlaps(gr, reads,
                                  mode="IntersectionNotEmpty"))$counts)

colSums(counts1)

## Split the 'features' into a GRangesList and count again.
counts2 <- 
    data.frame(union=assays(summarizeOverlaps(grl, reads))$counts, 
               intStrict=assays(summarizeOverlaps(grl, reads, 
                                mode="IntersectionStrict"))$counts,
               intNotEmpty=assays(summarizeOverlaps(grl, reads,
                                  mode="IntersectionNotEmpty"))$counts)
colSums(counts2)

## The GRangesList ('grl' object) has 8 features whereas the GRanges 
## ('gr' object) has 11. The affect on counting can be seen by looking
## at feature 'H' with mode 'Union'. In the GRanges this feature is 
## represented by ranges 'H1' and 'H2',
gr[c("H1", "H2")]

## and by list element 'H' in the GRangesList, 
grl["H"]
 
## Read "d" hits both 'H1' and 'H2'. This is considered a multi-hit when
## using a GRanges (each range is a separate feature) so the read was 
## dropped and not counted.
counts1[c("H1", "H2"), ]

## When using a GRangesList, each list element is considered a feature.
## The read hits multiple ranges within list element 'H' but only one 
## list element. This is not considered a multi-hit so the read is counted.
counts2["H", ]

## ---------------------------------------------------------------------
## Counting multi-hit reads.
## ---------------------------------------------------------------------

## The goal of the counting modes is to provide a set of rules that
## resolve reads hitting multiple features so each read is counted
## a maximum of once. However, sometimes it may be desirable to count 
## a read for each feature it overlaps. This can be accomplished by 
## setting 'inter.feature' to FALSE.

## When 'inter.feature=FALSE', modes 'Union' and 'IntersectionStrict'
## essentially reduce to countOverlaps() with type="any" and 
## type="within", respectively.

## When 'inter.feature=TRUE' only features "A", "F" and "G" have counts.
se1 <- summarizeOverlaps(gr, reads, mode="Union", inter.feature=TRUE)
assays(se1)$counts

## When 'inter.feature=FALSE' all 11 features have a count. There are 
## 7 total reads so one or more reads were counted more than once.
se2 <- summarizeOverlaps(gr, reads, mode="Union", inter.feature=FALSE)
assays(se2)$counts

## ---------------------------------------------------------------------
## Counting BAM files.
## ---------------------------------------------------------------------

library(pasillaBamSubset)
library(TxDb.Dmelanogaster.UCSC.dm3.ensGene)
exbygene <- exonsBy(TxDb.Dmelanogaster.UCSC.dm3.ensGene, "gene")

## (i) Single-end :

## Large files can be iterated over in chunks by setting a
## 'yieldSize' on the BamFile.
bf_s <- BamFile(untreated1_chr4(), yieldSize=50000)
se_s <- summarizeOverlaps(exbygene, bf_s, singleEnd=TRUE)
table(assays(se_s)$counts > 0)

## When a character (file name) is provided as 'reads' instead 
## of a BamFile object summarizeOverlaps() will create a BamFile
## and set a reasonable default 'yieldSize'.

## (ii) Paired-end :

## A paired-end file may contain singletons, reads with unmapped
## pairs or reads with more than two fragments. When 'fragments=FALSE'
## only reads paired by the algorithm are included in the counting. 
nofrag <- summarizeOverlaps(exbygene, untreated3_chr4(), 
                            singleEnd=FALSE, fragments=FALSE)
table(assays(nofrag)$counts > 0)

## When 'fragments=TRUE' all singletons, reads with unmapped pairs 
## and other fragments will be included in the counting.
bf <- BamFile(untreated3_chr4(), asMates=TRUE)
frag <- summarizeOverlaps(exbygene, bf, singleEnd=FALSE, fragments=TRUE)
table(assays(frag)$counts > 0)

## As expected, using 'fragments=TRUE' results in a larger number 
## of total counts because singletons, unmapped pairs etc. are 
## included in the counting.

## Total reads in the file:
countBam(untreated3_chr4())

## Reads counted with 'fragments=FALSE':
sum(assays(nofrag)$counts)

## Reads counted with 'fragments=TRUE':
sum(assays(frag)$counts)

## ---------------------------------------------------------------------
## Count tables for DESeq or edgeR.
## ---------------------------------------------------------------------

fls <- list.files(system.file("extdata", package="GenomicAlignments"),
                  recursive=TRUE, pattern="*bam$", full=TRUE)
names(fls) <- basename(fls)
bf <- BamFileList(fls, index=character(), yieldSize=1000)
genes <- GRanges(
    seqnames = c(rep("chr2L", 4), rep("chr2R", 5), rep("chr3L", 2)),
    ranges = IRanges(c(1000, 3000, 4000, 7000, 2000, 3000, 3600, 
                       4000, 7500, 5000, 5400), 
                     width=c(rep(500, 3), 600, 900, 500, 300, 900, 
                             300, 500, 500))) 
se <- summarizeOverlaps(genes, bf)

## When the reads are BAM files, the 'colData' contains summary 
## information from a call to countBam().
colData(se)

## Create count tables.
library(DESeq)
deseq <- newCountDataSet(assays(se)$counts, rownames(colData(se)))
library(edgeR)
edger <- DGEList(assays(se)$counts, group=rownames(colData(se)))

## ---------------------------------------------------------------------
## Filter records by map quality before counting. 
## (user-supplied count function) 
## ---------------------------------------------------------------------

## The 'mode' argument can take a custom count function whose
## arguments are the same as those in the current counting modes
## (i.e., Union, IntersectionNotEmpty, IntersectionStrict). 
## In this example records are filtered by map quality before counting.

mapq_filter <- function(features, reads,  ignore.strand, inter.feature) { 
    require(GenomicAlignments) # needed for parallel evaluation
    Union(features, reads[mcols(reads)$mapq >= 20], 
          ignore.strand=ignore.strand, 
          inter.feature=inter.feature) 
}

genes <- GRanges("seq1", IRanges(seq(1, 1500, by=200), width=100))
param <- ScanBamParam(what="mapq")
fl <- system.file("extdata", "ex1.bam", package="Rsamtools")
se <- summarizeOverlaps(genes, fl, mode=mapq_filter, param=param) 
assays(se)$counts

## The count function can be completely custom (i.e., not use the
## pre-defined count functions at all). Requirements are that
## the input arguments match the pre-defined modes and the output
## is a vector of counts the same length as 'features'. 
 
my_count <- function(features, reads,  ignore.strand, inter.feature) { 
    ## perform filtering, or subsetting etc. 
    require(GenomicAlignments) # needed for parallel evaluation
    countOverlaps(features, reads)
}

## ---------------------------------------------------------------------
## summarizeOverlaps() with BamViews
## ---------------------------------------------------------------------

## bamSamples and bamPaths metadata are included in the colData.
## bamExperiment metadata is put into the exptData slot.
fl <- system.file("extdata", "ex1.bam", package="Rsamtools", mustWork=TRUE)
rngs <- GRanges(c("seq1", "seq2"), IRanges(1, c(1575, 1584)))
samp <- DataFrame(info="test", row.names="ex1")
view <- BamViews(fl, bamSamples=samp, bamRanges=rngs)
se <- summarizeOverlaps(view, mode=Union, ignore.strand=TRUE)
colData(se)
exptData(se)
}

\keyword{methods}
\keyword{utilities}