1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422
|
\name{junctions-methods}
\alias{junctions-methods}
\alias{junctions}
\alias{junctions,GAlignments-method}
\alias{junctions,GAlignmentPairs-method}
\alias{junctions,GAlignmentsList-method}
\alias{NATURAL_INTRON_MOTIFS}
\alias{summarizeJunctions}
\alias{readTopHatJunctions}
\alias{readSTARJunctions}
% Old stuff:
\alias{introns}
\title{Extract junctions from genomic alignments}
\description{
Given an object \code{x} containing genomic alignments (e.g. a
\link{GAlignments}, \link{GAlignmentPairs}, or \link{GAlignmentsList}
object), \code{junctions(x)} extracts the junctions from it and
\code{summarizeJunctions(x)} extracts and summarizes them.
\code{readTopHatJunctions} and \code{readSTARJunctions} are utilities
for importing the junction file generated by the TopHat and STAR
aligners, respectively.
}
\usage{
## junctions() and summarizeJunctions()
## ------------------------------------
junctions(x, use.mcols=FALSE, ...)
\S4method{junctions}{GAlignments}(x, use.mcols=FALSE)
\S4method{junctions}{GAlignmentPairs}(x, use.mcols=FALSE)
\S4method{junctions}{GAlignmentsList}(x, use.mcols=FALSE, ignore.strand=FALSE)
## summarizeJunctions() and NATURAL_INTRON_MOTIFS
## ----------------------------------------------
summarizeJunctions(x, with.revmap=FALSE, genome=NULL)
NATURAL_INTRON_MOTIFS
## Utilities for importing the junction file generated by some aligners
## --------------------------------------------------------------------
readTopHatJunctions(file, file.is.raw.juncs=FALSE)
readSTARJunctions(file)
}
\arguments{
\item{x}{
A \link{GAlignments}, \link{GAlignmentPairs}, or \link{GAlignmentsList}
object.
}
\item{use.mcols}{
\code{TRUE} or \code{FALSE} (the default).
Whether the metadata columns on \code{x} (accessible with \code{mcols(x)})
should be propagated to the returned object or not.
}
\item{...}{
Additional arguments, for use in specific methods.
}
\item{ignore.strand}{
\code{TRUE} or \code{FALSE} (the default).
If set to \code{TRUE}, then the strand of \code{x} is set to \code{"*"}
prior to any computation.
}
\item{with.revmap}{
\code{TRUE} or \code{FALSE} (the default).
If set to \code{TRUE}, then a \code{revmap} metadata column is added to
the output of \code{summarizeJunctions}. This metadata column is an
\link[IRanges]{IntegerList} object representing the mapping from each
element in the ouput (i.e. each junction) to the corresponding elements in
the input \code{x}.
}
\item{genome}{
\code{NULL} (the default), or the reference genome that was used to
align the reads, specified in a way that is accepted by the
\code{\link[BSgenome]{getBSgenome}} function defined in the \pkg{BSgenome}
software package. In that case the corresponding BSgenome data package
needs to be already installed (see \code{?getBSgenome} for the details).
If \code{genome} is supplied, then the \code{intron_motif} and
\code{intron_strand} metadata columns are computed (based on the
dinucleotides found at the intron boundaries) and added to the output
of \code{summarizeJunctions}. See the Value section below for a
description of these metadata columns.
}
\item{file}{
The path (or a connection) to the junction file generated by the aligner.
This file should be the \emph{junctions.bed} or \emph{new_list.juncs}
file for \code{readTopHatJunctions}, and the \emph{SJ.out.tab} file for
\code{readSTARJunctions}.
}
\item{file.is.raw.juncs}{
\code{TRUE} or \code{FALSE} (the default).
If set to \code{TRUE}, then the input file is assumed to be a TopHat
\emph{.juncs} file instead of the \emph{junctions.bed} file generated
by TopHat. A TopHat \emph{.juncs} file can be obtained by passing the
\emph{junctions.bed} file thru TopHat's \emph{bed_to_juncs} script.
See the TopHat manual at \url{http://tophat.cbcb.umd.edu/manual.shtml}
for more information.
}
}
\details{
An N operation in the CIGAR of a genomic alignment is interpreted as a
junction. \code{junctions(x)} will return the genomic ranges of all
junctions found in \code{x}.
More precisely, if \code{x} is a \link{GAlignments} object,
\code{junctions(x)} is equivalent to:
\preformatted{
psetdiff(granges(x), grglist(x, order.as.in.query=TRUE))
}
On a \code{x} is a \link{GAlignmentPairs} object, it's equivalent to (but
faster than):
\preformatted{
mendoapply(c, junctions(first(x)), junctions(last(x)))
}
\code{NATURAL_INTRON_MOTIFS} is a predefined character vector containing
the 5 natural intron motifs described at
\url{http://www.ncbi.nlm.nih.gov/pmc/articles/PMC84117/}.
}
\value{
\code{junctions(x)} returns the genomic ranges of the junctions in a
\link[GenomicRanges]{GRangesList} object \emph{parallel} to \code{x}
(i.e. with 1 list element per element in \code{x}).
If \code{x} has names on it, they're propagated to the returned object.
If \code{use.mcols} is TRUE and \code{x} has metadata columns on it
(accessible with \code{mcols(x)}), they're propagated to the returned object.
\code{summarizeJunctions} returns the genomic ranges of the unique
junctions in \code{x} in an unstranded \link[GenomicRanges]{GRanges}
object with the following metadata columns:
\itemize{
\item \code{score}: The total number of alignments crossing each
junction, i.e., that have the junction encoded in their CIGAR.
\item \code{plus_score} and \code{minus_score}: The strand-specific
number of alignments crossing each junction.
\item \code{revmap}: [Only if \code{with.revmap} was set to \code{TRUE}.]
An \link[IRanges]{IntegerList} object representing the mapping from
each element in the ouput (i.e. each junction) to the corresponding
elements in input \code{x}.
\item \code{intron_motif} and \code{intron_strand}: [Only if \code{genome}
was supplied.] The intron motif and strand for each junction,
based on the dinucleotides found in the genome sequences at the
intron boundaries.
The \code{intron_motif} metadata column is a factor whose levels
are the 5 natural intron motifs stored in predefined character
vector \code{NATURAL_INTRON_MOTIFS}.
If the dinucleotides found at the intron boundaries don't match
any of these natural intron motifs, then \code{intron_motif} and
\code{intron_strand} are set to \code{NA} and \code{*},
respectively.
}
\code{readTopHatJunctions} and \code{readSTARJunctions} return the
junctions reported in the input file in a stranded
\link[GenomicRanges]{GRanges} object.
With the following metadata columns for \code{readTopHatJunctions} (when
reading in the \emph{junctions.bed} file):
\itemize{
\item \code{name}: An id assigned by TopHat to each junction. This id is
of the form JUNC00000017 and is unique within the
\emph{junctions.bed} file.
\item \code{score}: The total number of alignments crossing each
junction.
}
With the following metadata columns for \code{readSTARJunctions}:
\itemize{
\item \code{intron_motif} and \code{intron_strand}:
The intron motif and strand for each junction, based on the code
found in the input file (0: non-canonical, 1: GT/AG, 2: CT/AC,
3: GC/AG, 4: CT/GC, 5: AT/AC, 6: GT/AT).
Note that of the 5 natural intron motifs stored in predefined
character vector \code{NATURAL_INTRON_MOTIFS}, only the first 3 are
assigned codes by the STAR software (2 codes per motif, one if the
intron is on the plus strand and one if it's on the minus strand).
Thus the \code{intron_motif} metadata column is a factor with only
3 levels. If code is 0, then \code{intron_motif} and
\code{intron_strand} are set to \code{NA} and \code{*}, respectively.
\item \code{um_reads}: The number of uniquely mapping reads crossing
the junction (a pair where the 2 alignments cross the same junction
is counted only once).
\item \code{mm_reads}: The number of multi-mapping reads crossing the
junction (a pair where the 2 alignments cross the same junction is
counted only once).
}
See STAR manual at \url{https://code.google.com/p/rna-star/} for more
information.
}
\author{H. Pages}
\references{
\url{http://www.ncbi.nlm.nih.gov/pmc/articles/PMC84117/} for the 5 natural
intron motifs stored in predefined character vector
\code{NATURAL_INTRON_MOTIFS}.
TopHat2: accurate alignment of transcriptomes in the presence of insertions,
deletions and gene fusions
\itemize{
\item TopHat2 paper: \url{http://genomebiology.com/2013/14/4/r36}
\item TopHat2 software and manual: \url{http://tophat.cbcb.umd.edu/}
}
STAR: ultrafast universal RNA-seq aligner
\itemize{
\item STAR paper: \url{http://bioinformatics.oxfordjournals.org/content/early/2012/10/25/bioinformatics.bts635}
\item STAR software and manual: \url{https://code.google.com/p/rna-star/}
}
}
\seealso{
\itemize{
\item \link{GAlignments}, \link{GAlignmentPairs}, and
\link{GAlignmentsList} objects.
\item The \link[GenomicRanges]{GRanges} and
\link[GenomicRanges]{GRangesList} classes defined and documented
in the \pkg{GenomicRanges} package.
\item The \link[IRanges]{IntegerList} class defined and documented
in the \pkg{IRanges} package.
\item The \code{\link[BSgenome]{getBSgenome}} function in the
\pkg{BSgenome} package, for searching the installed BSgenome
data packages for the specified genome and returning it as a
\link[BSgenome]{BSgenome} object.
\item The \code{\link{readGAlignments}} and
\code{\link{readGAlignmentPairs}} functions for reading genomic
alignments from a file.
\item The \code{\link[IRanges]{extractList}} function in the \pkg{IRanges}
package, for extracting groups of elements from a vector-like object
and returning them into a \link[IRanges]{List} object.
}
}
\examples{
library(RNAseqData.HNRNPC.bam.chr14)
bamfile <- RNAseqData.HNRNPC.bam.chr14_BAMFILES[1]
## ---------------------------------------------------------------------
## A. junctions()
## ---------------------------------------------------------------------
gal <- readGAlignments(bamfile)
table(njunc(gal)) # some alignments have 3 junctions!
juncs <- junctions(gal)
juncs
stopifnot(identical(unname(elementLengths(juncs)), njunc(gal)))
galp <- readGAlignmentPairs(bamfile)
juncs <- junctions(galp)
juncs
stopifnot(identical(unname(elementLengths(juncs)), njunc(galp)))
## ---------------------------------------------------------------------
## B. summarizeJunctions()
## ---------------------------------------------------------------------
## By default, only the "score", "plus_score", and "minus_score"
## metadata columns are returned:
junc_summary <- summarizeJunctions(gal)
junc_summary
## The "score" metadata column reports the total number of alignments
## crossing each junction, i.e., that have the junction encoded in their
## CIGAR:
median(mcols(junc_summary)$score)
## The "plus_score" and "minus_score" metadata columns report the
## strand-specific number of alignments crossing each junction:
stopifnot(identical(mcols(junc_summary)$score,
mcols(junc_summary)$plus_score +
mcols(junc_summary)$minus_score))
## If 'with.revmap' is TRUE, the "revmap" metadata column is added to
## the output. This metadata column is an IntegerList object represen-
## ting the mapping from each element in the ouput (i.e. a junction) to
## the corresponding elements in the input 'x'. Here we're going to use
## this to compute a 'score2' for each junction. We obtain this score
## by summing the mapping qualities of the alignments crossing the
## junction:
gal <- readGAlignments(bamfile, param=ScanBamParam(what="mapq"))
junc_summary <- summarizeJunctions(gal, with.revmap=TRUE)
junc_score2 <- sum(extractList(mcols(gal)$mapq,
mcols(junc_summary)$revmap))
mcols(junc_summary)$score2 <- junc_score2
## If a genome is specified thru the 'genome' argument (in which case
## the corresponding BSgenome data package needs to be installed), then
## summarizeJunctions() returns the intron motif and strand for each
## junction. Since the reads in RNAseqData.HNRNPC.bam.chr14 were aligned
## to the hg19 genome, the following requires that you have
## BSgenome.Hsapiens.UCSC.hg19 installed:
junc_summary <- summarizeJunctions(gal, with.revmap=TRUE, genome="hg19")
mcols(junc_summary)$score2 <- junc_score2 # putting 'score2' back
## The "intron_motif" metadata column is a factor whose levels are the
## 5 natural intron motifs stored in predefined character vector
## 'NATURAL_INTRON_MOTIFS':
table(mcols(junc_summary)$intron_motif)
## ---------------------------------------------------------------------
## C. STRANDED RNA-seq PROTOCOL
## ---------------------------------------------------------------------
## Here is a simple test for checking whether the RNA-seq protocol was
## stranded or not:
strandedTest <- function(plus_score, minus_score)
(sum(plus_score ^ 2) + sum(minus_score ^ 2)) /
sum((plus_score + minus_score) ^ 2)
## The result of this test is guaranteed to be >= 0.5 and <= 1.
## If, for each junction, the strand of the crossing alignments looks
## random (i.e. "plus_score" and "minus_score" are close), then
## strandedTest() will return a value close to 0.5. If it doesn't look
## random (i.e. for each junction, one of "plus_score" and "minus_score"
## is much bigger than the other), then strandedTest() will return a
## value close to 1.
## If the reads are single-end, the test is meaningful when applied
## directly on 'junc_summary'. However, for the test to be meaningful
## on paired-end reads, it needs to be applied on the first and last
## alignments separately:
junc_summary1 <- summarizeJunctions(first(galp))
junc_summary2 <- summarizeJunctions(last(galp))
strandedTest(mcols(junc_summary1)$plus_score,
mcols(junc_summary1)$minus_score)
strandedTest(mcols(junc_summary2)$plus_score,
mcols(junc_summary2)$minus_score)
## Both values are close to 0.5 which suggests that the RNA-seq protocol
## used for this experiment was not stranded.
## ---------------------------------------------------------------------
## UTILITIES FOR IMPORTING THE JUNCTION FILE GENERATED BY SOME ALIGNERS
## ---------------------------------------------------------------------
## The TopHat aligner generates a junctions.bed file where it reports
## all the junctions satisfying some "quality" criteria (see the TopHat
## manual at http://tophat.cbcb.umd.edu/manual.shtml for more
## information). This file can be loaded with readTopHatJunctions():
runname <- names(RNAseqData.HNRNPC.bam.chr14_BAMFILES)[1]
junctions_file <- system.file("extdata", "tophat2_out", runname,
"junctions.bed",
package="RNAseqData.HNRNPC.bam.chr14")
th_junctions <- readTopHatJunctions(junctions_file)
## Comparing the "TopHat junctions" with the result of
## summarizeJunctions():
th_junctions14 <- th_junctions
seqlevels(th_junctions14, force=TRUE) <- "chr14"
mcols(th_junctions14)$intron_strand <- strand(th_junctions14)
strand(th_junctions14) <- "*"
## All the "TopHat junctions" are in 'junc_summary':
stopifnot(all(th_junctions14 \%in\% junc_summary))
## But not all the junctions in 'junc_summary' are reported by TopHat
## (that's because TopHat reports only junctions that satisfy some
## "quality" criteria):
is_in_th_junctions14 <- junc_summary \%in\% th_junctions14
table(is_in_th_junctions14) # 32 junctions are not in TopHat's
# junctions.bed file
junc_summary2 <- junc_summary[is_in_th_junctions14]
## 'junc_summary2' and 'th_junctions14' contain the same junctions in
## the same order:
stopifnot(all(junc_summary2 == th_junctions14))
## Let's merge their metadata columns. We use our own version of
## merge() for this, which is stricter (it checks that the common
## columns are the same in the 2 data frames to merge) and also
## simpler:
merge2 <- function(df1, df2)
{
common_colnames <- intersect(colnames(df1), colnames(df2))
lapply(common_colnames,
function(colname)
stopifnot(all(df1[ , colname] == df2[ , colname])))
extra_mcolnames <- setdiff(colnames(df2), colnames(df1))
cbind(df1, df2[ , extra_mcolnames, drop=FALSE])
}
mcols(th_junctions14) <- merge2(mcols(th_junctions14),
mcols(junc_summary2))
## Here is a peculiar junction reported by TopHat:
idx0 <- which(mcols(th_junctions14)$score2 == 0L)
th_junctions14[idx0]
gal[mcols(th_junctions14)$revmap[[idx0]]]
## The junction is crossed by 5 alignments (score is 5), all of which
## have a mapping quality of 0!
}
\keyword{methods}
\keyword{manip}
|