1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618
|
\name{pileup}
\Rdversion{1.1}
% PileupParam
\alias{PileupParam-class}
\alias{PileupParam}
\alias{show,PileupParam-method}
\alias{max_depth}
\alias{min_base_quality}
\alias{min_mapq}
\alias{min_nucleotide_depth}
\alias{min_minor_allele_depth}
\alias{distinguish_strands}
\alias{distinguish_nucleotides}
\alias{ignore_query_Ns}
\alias{include_deletions}
\alias{include_insertions}
\alias{left_bins}
\alias{query_bins}
\alias{cycle_bins}
% pileup
\alias{pileup}
\alias{pileup,character-method}
\alias{pileup,BamFile-method}
% utility
\alias{phred2ASCIIOffset}
\title{
Use filters and output formats to calculate pile-up statistics for a
BAM file.
}
\description{
\code{pileup} uses \code{PileupParam} and \code{ScanBamParam} objects
to calculate pileup statistics for a BAM file. The result is a
\code{data.frame} with columns summarizing counts of reads overlapping
each genomic position, optionally differentiated on nucleotide,
strand, and position within read.
}
\usage{
pileup(file, index=file, ..., scanBamParam=ScanBamParam(),
pileupParam=PileupParam())
## PileupParam constructor
PileupParam(max_depth=250, min_base_quality=13, min_mapq=0,
min_nucleotide_depth=1, min_minor_allele_depth=0,
distinguish_strands=TRUE, distinguish_nucleotides=TRUE,
ignore_query_Ns=TRUE, include_deletions=TRUE, include_insertions=FALSE,
left_bins=NULL, query_bins=NULL, cycle_bins=NULL)
phred2ASCIIOffset(phred=integer(),
scheme= c("Illumina 1.8+", "Sanger", "Solexa", "Illumina 1.3+",
"Illumina 1.5+"))
}
\arguments{
\item{file}{
character(1) or \code{\link{BamFile}}; BAM file path.
}
\item{index}{
When \code{file} is character(1), an optional character(1) of BAM
index file path; see \code{\link{scanBam}}.
}
\item{\dots}{Additional arguments, perhaps used by methods.}
\item{scanBamParam}{An instance of \code{\link{ScanBamParam}}.}
\item{pileupParam}{An instance of \code{\link{PileupParam}}.}
%% args for PileupParam
\item{max_depth}{integer(1); maximum number of overlapping alignments
considered for each position in the pileup.}
\item{min_base_quality}{integer(1); minimum \sQuote{QUAL} value for
each nucleotide in an alignment. Use \code{phred2ASCIIOffset} to
help translate numeric or character values to these offsets.}
\item{min_mapq}{integer(1); minimum \sQuote{MAPQ} value for an
alignment to be included in pileup.}
\item{min_nucleotide_depth}{integer(1); minimum count of each
nucleotide (\emph{independent} of other nucleotides) at a given
position required for said nucleotide to appear in the result.}
\item{min_minor_allele_depth}{integer(1); minimum count of \emph{all}
nucleotides other than the major allele at a given position required
for a particular nucleotide to appear in the result.}
\item{distinguish_strands}{logical(1); \code{TRUE} if result should
differentiate between strands.}
\item{distinguish_nucleotides}{logical(1); \code{TRUE} if result
should differentiate between nucleotides.}
\item{ignore_query_Ns}{logical(1); \code{TRUE} if non-determinate
nucleotides in alignments should be excluded from the pileup.}
\item{include_deletions}{logical(1); \code{TRUE} to include deletions
in pileup.}
\item{include_insertions}{logical(1); \code{TRUE} to include
insertions in pileup.}
\item{left_bins}{numeric; all same sign; unique positions within a
read to delimit bins. Position within read is based on counting from
the \emph{5' end regardless of strand}. Minimum of two values are
required so at least one range can be formed. \code{NULL} (default)
indicates no binning. Use negative values to count from the opposite
end. Sorted order not required. Floating-point values are coerced to
\code{integer}.
If you want to delimit bins based on sequencing cycles to, e.g.,
discard later cycles, \code{\link{query_bins}} probably gives the
desired behavior.
}
\item{query_bins}{numeric; all same sign; unique positions within a
read to delimit bins. Position within a read is based on counting
from the \emph{5' end} when the read aligns to \emph{plus strand},
counting from the \emph{3' end} when read aligns to minus
strand. Minimum of two values are required so at least one range can
be formed. \code{NULL} (default) indicates no binning. Use negative
values to count from the opposite end. Sorted order not
required. Floating-point values are coerced to \code{integer}.}
\item{phred}{Either a numeric() (coerced to integer()) PHRED score
(e.g., in the range (0, 41) for the \sQuote{Illumina 1.8+} scheme)
or character() of printable ASCII characters. When \code{phred} is
character(), it can be of length 1 with 1 or more characters, or of
any length where all elements have exactly 1 character.}
\item{scheme}{Encoding scheme, used to translate numeric() PHRED
scores to their ASCII code. Ignored when \code{phred} is already
character().}
\item{cycle_bins}{DEPRECATED. See \code{\link{left_bins}} for
identical behavior.}
}
\details{
\emph{NB}: Use of \code{pileup} assumes familiarity with
\code{\link{ScanBamParam}}, and use of \code{left_bins} and
\code{query_bins} assumes familiarity with \code{\link{cut}}.
\code{pileup} visits each position in the BAM file, subject to
constraints implied by \code{which} and \code{flag} of
\code{scanBamParam}. For a given position, all reads overlapping the
position that are consistent with constraints dictated by \code{flag}
of \code{scanBamParam} and \code{pileupParam} are used for
counting. \code{pileup} also automatically excludes reads with flags
that indicate any of:
\itemize{
\item{unmapped alignment (\code{\link{isUnmappedQuery}})}
\item{secondary alignment (\code{\link{isSecondaryAlignment}})}
\item{not passing quality controls (\code{\link{isNotPassingQualityControls}})}
\item{PCR or optical duplicate (\code{\link{isDuplicate}})}
}
If no \code{which} argument is supplied to the \code{ScanBamParam},
behavior reflects that of \code{scanBam}: the entire file is visited
and counted.
Use a \code{\link{yieldSize}} value when creating a \code{BamFile}
instance to manage memory consumption when using pileup with large BAM
files. There are some differences in how \code{pileup} behavior is
affected when the \code{yieldSize} value is set on the BAM file. See
more details below.
Many of the parameters of the \code{pileupParam} interact. A simple
illustration is \code{ignore_query_Ns} and
\code{distinguish_nucleotides}, as mentioned in the
\code{ignore_query_Ns} section.
Parameters for \code{pileupParam} belong to two categories: parameters
that affect only the filtering criteria (so-called
\sQuote{behavior-only} policies), and parameters that affect
filtering behavior and the schema of the results
(\sQuote{behavior+structure} policies).
%% behavior-only
%% (other behavior-only arguments are self-evident or sam
%% spcification-evident)
%% behavior+structure
\code{distinguish_nucleotides} and \code{distinguish_strands} when set
to \code{TRUE} each add a column (\code{nucleotide} and \code{strand},
respectively) to the resulting \code{data.frame}. If both are TRUE,
each combination of \code{nucleotide} and \code{strand} at a given
position is counted separately. Setting only one to \code{TRUE}
behaves as expected; for example, if only \code{nucleotide} is set to
\code{TRUE}, each nucleotide at a given position is counted
separately, but the distinction of on which strand the nucleotide
appears is ignored.
\code{ignore_query_Ns} determines how ambiguous nucloetides are
treated. By default, ambiguous nucleotides (typically \sQuote{N} in
BAM files) in alignments are ignored. If \code{ignore_query_Ns} is
\code{FALSE}, ambiguous nucleotides are included in counts; further,
if \code{ignore_query_Ns} is \code{FALSE} and
\code{distinguish_nucleotides} is \code{TRUE} the \sQuote{N}
nucleotide value appears in the nucleotide column when a base at a
given position is ambiguous.
By default, deletions with respect to the reference genome to which
the reads were aligned are included in the counts in a pileup. If
\code{include_deletions} is \code{TRUE} and
\code{distinguish_nucleotides} is \code{TRUE}, the nucleotide column
uses a \sQuote{-} character to indicate a deletion at a position.
The \sQuote{=} nucleotide code in the \code{SEQ} field (to mean
\sQuote{identical to reference genome}) is supported by pileup, such
that a match with the reference genome is counted separately in the
results if \code{distinguish_nucleotides} is \code{TRUE}.
CIGAR support: \code{pileup} handles the extended CIGAR format by only
heeding operations that contribute to counts (\sQuote{M}, \sQuote{D},
\sQuote{I}). If you are confused about how the different CIGAR
operations interact, see the SAM versions of the BAM files used for
\code{pileup} unit testing for a fairly comprehensive human-readable
treatment.
\itemize{
\item{Deletions and clipping:
The extended CIGAR allows a number of operations conceptually
similar to a \sQuote{deletion} with respect to the reference
genome, but offer more specialized meanings than a simple
deletion. CIGAR \sQuote{N} operations (not to be confused with
\sQuote{N} used for non-determinate bases in the \code{SEQ} field)
indicate a large skipped region, \sQuote{S} a soft clip, and
\sQuote{H} a hard clip. \sQuote{N}, \sQuote{S}, and \sQuote{H}
CIGAR operations are never counted: only counts of true deletions
(\sQuote{D} in the CIGAR) can be included by setting
\code{include_deletions} to \code{TRUE}.
Soft clip operations contribute to the relative position of
nucleotides within a read, whereas hard clip and refskip
operations do not. For example, consider a sequence in a bam file
that starts at 11, with a CIGAR \code{2S1M} and \code{SEQ} field
\code{ttA}. The cycle position of the \code{A} nucleotide will be
\code{3}, but the reported position will be \code{11}. If using
\code{left_bins} or \code{query_bins} it might make sense to first
preprocess your files to remove soft clipping.
}
\item{Insertions and padding:
\code{pileup} can include counts of insertion operations by
setting \code{include_insertions} to \code{TRUE}. Details about
insertions are effectively truncated such that each insertion is
reduced to a single \sQuote{+} nucleotide. Information about
e.g. the nucleotide code and base quality within the insertion is
not included.
Because \sQuote{+} is used as a nucleotide code to represent
insertions in \code{pileup}, counts of the \sQuote{+} nucleotide
participate in voting for \code{min_nucleotide_depth} and
\code{min_minor_allele_depth} functionality.
The position of an insertion is the position that precedes it on
the reference sequence. \emph{Note:} this means if
\code{include_insertions} is \code{TRUE} a single read will
contribute two nucleotide codes (\code{+}, plus that of the
non-insertion base) at a given position if the non-insertion base
passes filter criteria.
\sQuote{P} CIGAR (padding) operations never affect counts.
}
}
The \dQuote{bin} arguments \code{query_bins} and \code{left_bins}
allow users to differentiate pileup counts based on arbitrary
non-overlapping regions within a read. \code{pileup} relies on
\code{\link{cut}} to derive bins, but limits input to numeric values
of the same sign (coerced to \code{integer}s), including
\code{+/-Inf}. If a \dQuote{bin} argument is set \code{pileup}
automatically excludes bases outside the implicit outer range. Here
are some important points regarding bin arguments:
\itemize{
\item{\code{query_bins} vs. \code{left_bins}:}{
BAM files store sequence data from the 5' end to the 3' end
(regardless of to which strand the read aligns). That means for
reads aligned to the minus strand, cycle position within a read is
effectively reversed. Take care to use the appropriate bin
argument for your use case.
The most common use of a bin argument is to bin later cycles
separately from earlier cycles; this is because accuracy typically
degrades in later cycles. For this application, \code{query_bins}
yields the correct behavior because bin positions are relative to
cycle order (i.e., sensitive to strand).
\code{left_bins} (in contrast with \code{query_bins}) determines
bin positions from the 5' end, regardless of strand.
}
\item{Positive or negative bin values can be used to delmit bins
based on the effective \dQuote{start} or \dQuote{end} of a
read. For \code{left_bin} the effective start is always the 5' end
(left-to-right as appear in the BAM file).
For \code{query_bin} the effective start is the first cycle of the
read as it came off the sequencer; that means the 5' end for reads
aligned to the plus strand and 3' for reads aligned to the minus
strand.
\itemize{
\item{\emph{From effective start of reads}: use positive values,
\code{0}, and \code{(+)Inf}. Because \code{\link{cut}}
produces ranges in the form (first,last], \sQuote{0} should be
used to create a bin that includes the first position. To
account for variable-length reads in BAM files, use
\code{(+)Inf} as the upper bound on a bin that extends to the
end of all reads.}
\item{\emph{From effective end of reads}: use negative values
and \code{-Inf}. \code{-1} denotes the last position in a
read. Because \code{\link{cut}} produces ranges in the form
(first,last], specify the lower bound of a bin by using one
less than the desired value, e.g., a bin that captures only
the second nucleotide from the last position:
\code{query_bins=c(-3, -2)}. To account for variable-length
reads in BAM files, use \code{-Inf} as the lower bound on a
bin that extends to the beginning of all reads.}
}
}
} %% end subsection bin argument gotchas
\code{pileup} obeys \code{\link{yieldSize}} on \code{BamFile} objects
with some differences from how \code{scanBam} responds to
\code{yieldSize}. Here are some points to keep in mind when using
\code{pileup} in conjunction with \code{yieldSize}:
\itemize{
\item{\code{BamFile}s with a \code{yieldSize} value set, once
opened and used with \code{pileup}, \emph{should not be used} with
other functions that accept a \code{BamFile} instance as
input. Create a new \code{BamFile} instance instead of trying to
reuse.}
\item{\code{pileup} only returns genomic positions for which all
input has been processed. \code{pileup} will hold in reserve
positions for which only partial data has been
processed. Positions held in reserve will be returned upon
subsequent calls to \code{pileup} when all the input for a given
position has been processed.}
\item{The correspondence between yieldSize and the number of rows
in the \code{data.frame} returned from \code{pileup} is not
1-to-1. \code{yieldSize} only limits the number of
\emph{alignments processed} from the BAM file each time the file
is read. Only a handful of alignments can lead to many distinct
records in the result.}
\item{Like \code{scanBam}, \code{pileup} uses an empty return
object (a zero-row \code{data.frame}) to indicate end-of-input.}
\item{Sometimes reading \code{yieldSize} records from the BAM file
does not result in any completed positions. In order to avoid
returning a zero-row \code{data.frame} \code{pileup} will
repeatedly process \code{yieldSize} additional records until at
least one position can be returned to the user.}
}
}
\value{
For \code{pileup} a \code{data.frame} with 1 row per unique
combination of differentiating column values that satisfied filter
criteria, with frequency (\code{count}) of unique combination. Columns
\code{seqnames}, \code{pos}, and \code{count} always appear; optional
\code{strand}, \code{nulceotide}, and \code{left_bin} /
\code{query_bin} columns appear as dictated by arguments to
\code{PileupParam}.
Column details:
\itemize{
\item{\code{seqnames}: factor. SAM \sQuote{RNAME} field of
alignment.}
\item{\code{pos}: integer(1). Genomic position of base. Derived by
offset from SAM \sQuote{POS} field of alignment.}
\item{\code{strand}: factor. \sQuote{strand} to which read aligns.}
\item{\code{nucleotide}: factor. \sQuote{nucleotide} of base,
extracted from SAM \sQuote{SEQ} field of alignment.}
\item{\code{left_bin} / \code{query_bin}: factor. Bin in which base
appears.}
\item{\code{count}: integer(1). Frequency of combination of
differentiating fields, as indicated by values of other columns.}
}
See \code{\link{scanBam}} for more detailed explanation of SAM fields.
If a \code{which} argument is specified for the \code{scanBamParam}, a
\code{which_label} column (\code{factor} in the form
\sQuote{rname:start-end}) is automatically included. The
\code{which_label} column is used to maintain grouping of results,
such that two queries of the same genomic region can be
differentiated.
Order of rows in \code{data.frame} is first by order of
\code{seqnames} column based on the BAM index file, then
non-decreasing order on columns \code{pos}, then \code{nucleotide},
then \code{strand}, then \code{left_bin} / \code{query_bin}.
\code{PileupParam} returns an instance of PileupParam class.
\code{phred2ASCIIOffset} returns a named integer vector of the same
length or number of characters in \code{phred}. The names are the
ASCII encoding, and the values are the offsets to be used with
\code{min_base_quality}.
}
\seealso{
\itemize{
\item{\link{Rsamtools}}
\item{\linkS4class{BamFile}}
\item{\link{ScanBamParam}}
\item{\link{scanBam}}
\item{\link{cut}}
}
For the relationship between PHRED scores and ASCII encoding, see
\url{https://en.wikipedia.org/wiki/FASTQ_format#Encoding}.
}
\author{Nathaniel Hayden \url{nhayden@fredhutch.org}}
%% This information should go somewhere--essential for an outside
%% developer to help build context quickly
%% \excruciatingdetails{
%% There are really three stages of filtering; (1) filtering based on
%% \code{scanBamParam} arguments (via _do_scan_bam, (2) filtering
%% based on arguments of \code{pileupParam} that apply on a per-read
%% and per-alignment basis and lastly, (3) a final filtering stage
%% applied on the intermediate count of a position after all
%% overlapping alignments have been
%% processed. \code{min_nucleotide_depth} and
%% \{min_minor_allele_depth} arguments cannot be applied until the
%% last stage.
%% }
\examples{
## The examples below apply equally to pileup queries for specific
## genomic ranges (specified by the 'which' parameter of 'ScanBamParam')
## and queries across entire files; the entire-file examples are
## included first to make them easy to find. The more detailed examples
## of pileup use queries with a 'which' argument.
library("RNAseqData.HNRNPC.bam.chr14")
fl <- RNAseqData.HNRNPC.bam.chr14_BAMFILES[1]
\donttest{
## no 'which' argument to ScanBamParam: process entire file at once
res <- pileup(fl)
head(res)
table(res$strand, res$nucleotide)
}
\donttest{
## no 'which' argument to ScanBamParam with 'yieldSize' set on BamFile
bf <- open(BamFile(fl, yieldSize=5e4))
repeat {
res <- pileup(bf)
message(nrow(res), " rows in result data.frame")
if(nrow(res) == 0L)
break
}
close(bf)
}
## pileup for the first half of chr14 with default Pileup parameters
## 'which' argument: process only specified genomic range(s)
sbp <- ScanBamParam(which=GRanges("chr14", IRanges(1, 53674770)))
res <- pileup(fl, scanBamParam=sbp)
head(res)
table(res$strand, res$nucleotide)
## specify pileup parameters: include ambiguious nucleotides
## (the 'N' level in the nucleotide column of the data.frame)
p_param <- PileupParam(ignore_query_Ns=FALSE)
res <- pileup(fl, scanBamParam=sbp, pileupParam=p_param)
head(res)
table(res$strand, res$nucleotide)
## Don't distinguish strand, require a minimum frequency of 7 for a
## nucleotide at a genomic position to be included in results.
p_param <- PileupParam(distinguish_strands=TRUE,
min_nucleotide_depth=7)
res <- pileup(fl, scanBamParam=sbp, pileupParam=p_param)
head(res)
table(res$nucleotide, res$strand)
## Any combination of the filtering criteria is possible: let's say we
## want a "coverage pileup" that only counts reads with mapping
## quality >= 13, and bases with quality >= 10 that appear at least 4
## times at each genomic position
p_param <- PileupParam(distinguish_nucleotides=FALSE,
distinguish_strands=FALSE,
min_mapq=13,
min_base_quality=10,
min_nucleotide_depth=4)
res <- pileup(fl, scanBamParam=sbp, pileupParam=p_param)
head(res)
res <- res[, c("pos", "count")] ## drop seqnames and which_label cols
plot(count ~ pos, res, pch=".")
## ASCII offsets to help specify min_base_quality, e.g., quality of at
## least 10 on the Illumina 1.3+ scale
phred2ASCIIOffset(10, "Illumina 1.3+")
## Well-supported polymorphic positions (minor allele present in at
## least 5 alignments) with high map quality
p_param <- PileupParam(min_minor_allele_depth=5,
min_mapq=40,
distinguish_strand=FALSE)
res <- pileup(fl, scanBamParam=sbp, pileupParam=p_param)
dim(res) ## reduced to our biologically interesting positions
head(xtabs(count ~ pos + nucleotide, res))
## query_bins
\donttest{
## basic use of positive bins: single pivot; count bases that appear in
## first 10 cycles of a read separately from the rest
p_param <- PileupParam(query_bins=c(0, 10, Inf))
res <- pileup(fl, scanBamParam=sbp, pileupParam=p_param)
## basic use of positive bins: simple range; only include bases in
## cycle positions 6-10 within read
p_param <- PileupParam(query_bins=c(5, 10))
res <- pileup(fl, scanBamParam=sbp, pileupParam=p_param)
## basic use of negative bins: single pivot; count bases that appear in
## last 3 cycle positions of a read separately from the rest.
p_param <- PileupParam(query_bins=c(-Inf, -4, -1))
res <- pileup(fl, scanBamParam=sbp, pileupParam=p_param)
## basic use of negative bins: drop nucleotides in last two cycle
## positions of each read
p_param <- PileupParam(query_bins=c(-Inf, -3))
res <- pileup(fl, scanBamParam=sbp, pileupParam=p_param)
}
## typical use: beginning, middle, and end segments; because of the
## nature of sequencing technology, it is common for bases in the
## beginning and end segments of each read to be less reliable. pileup
## makes it easy to count each segment separately.
## Assume determined ahead of time that for the data 1-7 makes sense for
## beginning, 8-12 middle, >=13 end (actual cycle positions should be
## tailored to data in actual BAM files).
p_param <- PileupParam(query_bins=c(0, 7, 12, Inf)) ## note non-symmetric
res <- pileup(fl, scanBamParam=sbp, pileupParam=p_param)
xt <- xtabs(count ~ nucleotide + query_bin, res)
print(xt)
t(t(xt) / colSums(xt)) ## cheap normalization for illustrative purposes
## 'implicit outer range': in contrast to c(0, 7, 12, Inf), suppose we
## still want to have beginning, middle, and end segements, but know
## that the first three cycles and any bases beyond the 16th cycle are
## irrelevant. Hence, the implicit outer range is (3,16]; all bases
## outside of that are dropped.
p_param <- PileupParam(query_bins=c(3, 7, 12, 16))
res <- pileup(fl, scanBamParam=sbp, pileupParam=p_param)
xt <- xtabs(count ~ nucleotide + query_bin, res)
print(xt)
t(t(xt) / colSums(xt))
\donttest{
## single-width bins: count each cycle within a read separately.
p_param <- PileupParam(query_bins=seq(1,20))
res <- pileup(fl, scanBamParam=sbp, pileupParam=p_param)
xt <- xtabs(count ~ nucleotide + query_bin, res)
print(xt[ , c(1:3, 18:19)]) ## fit on one screen
print(t(t(xt) / colSums(xt))[ , c(1:3, 18:19)])
}
}
|