File: junctions-methods.Rd

package info (click to toggle)
r-bioc-genomicalignments 1.0.6-1
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 2,980 kB
  • ctags: 54
  • sloc: ansic: 1,493; makefile: 4; sh: 3
file content (422 lines) | stat: -rw-r--r-- 17,154 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
\name{junctions-methods}

\alias{junctions-methods}

\alias{junctions}
\alias{junctions,GAlignments-method}
\alias{junctions,GAlignmentPairs-method}
\alias{junctions,GAlignmentsList-method}

\alias{NATURAL_INTRON_MOTIFS}

\alias{summarizeJunctions}

\alias{readTopHatJunctions}
\alias{readSTARJunctions}

% Old stuff:
\alias{introns}


\title{Extract junctions from genomic alignments}

\description{
  Given an object \code{x} containing genomic alignments (e.g. a
  \link{GAlignments}, \link{GAlignmentPairs}, or \link{GAlignmentsList}
  object), \code{junctions(x)} extracts the junctions from it and
  \code{summarizeJunctions(x)} extracts and summarizes them.

  \code{readTopHatJunctions} and \code{readSTARJunctions} are utilities
  for importing the junction file generated by the TopHat and STAR
  aligners, respectively.
}

\usage{
## junctions() and summarizeJunctions()
## ------------------------------------

junctions(x, use.mcols=FALSE, ...)

\S4method{junctions}{GAlignments}(x, use.mcols=FALSE)

\S4method{junctions}{GAlignmentPairs}(x, use.mcols=FALSE)

\S4method{junctions}{GAlignmentsList}(x, use.mcols=FALSE, ignore.strand=FALSE)

## summarizeJunctions() and NATURAL_INTRON_MOTIFS
## ----------------------------------------------

summarizeJunctions(x, with.revmap=FALSE, genome=NULL)

NATURAL_INTRON_MOTIFS

## Utilities for importing the junction file generated by some aligners
## --------------------------------------------------------------------

readTopHatJunctions(file, file.is.raw.juncs=FALSE)

readSTARJunctions(file)
}

\arguments{
  \item{x}{
    A \link{GAlignments}, \link{GAlignmentPairs}, or \link{GAlignmentsList}
    object.
  }
  \item{use.mcols}{
    \code{TRUE} or \code{FALSE} (the default).
    Whether the metadata columns on \code{x} (accessible with \code{mcols(x)})
    should be propagated to the returned object or not.
  }
  \item{...}{
    Additional arguments, for use in specific methods.
  }
  \item{ignore.strand}{
    \code{TRUE} or \code{FALSE} (the default).
    If set to \code{TRUE}, then the strand of \code{x} is set to \code{"*"}
    prior to any computation.
  }
  \item{with.revmap}{
    \code{TRUE} or \code{FALSE} (the default).
    If set to \code{TRUE}, then a \code{revmap} metadata column is added to
    the output of \code{summarizeJunctions}. This metadata column is an
    \link[IRanges]{IntegerList} object representing the mapping from each
    element in the ouput (i.e. each junction) to the corresponding elements in
    the input \code{x}.
  }
  \item{genome}{
    \code{NULL} (the default), or the reference genome that was used to
    align the reads, specified in a way that is accepted by the
    \code{\link[BSgenome]{getBSgenome}} function defined in the \pkg{BSgenome}
    software package. In that case the corresponding BSgenome data package
    needs to be already installed (see \code{?getBSgenome} for the details).

    If \code{genome} is supplied, then the \code{intron_motif} and
    \code{intron_strand} metadata columns are computed (based on the
    dinucleotides found at the intron boundaries) and added to the output
    of \code{summarizeJunctions}. See the Value section below for a
    description of these metadata columns.
  }
  \item{file}{
    The path (or a connection) to the junction file generated by the aligner.
    This file should be the \emph{junctions.bed} or \emph{new_list.juncs}
    file for \code{readTopHatJunctions}, and the \emph{SJ.out.tab} file for
    \code{readSTARJunctions}.
  }
  \item{file.is.raw.juncs}{
    \code{TRUE} or \code{FALSE} (the default).
    If set to \code{TRUE}, then the input file is assumed to be a TopHat
    \emph{.juncs} file instead of the \emph{junctions.bed} file generated
    by TopHat. A TopHat \emph{.juncs} file can be obtained by passing the
    \emph{junctions.bed} file thru TopHat's \emph{bed_to_juncs} script.
    See the TopHat manual at \url{http://tophat.cbcb.umd.edu/manual.shtml}
    for more information.
  }
}

\details{
  An N operation in the CIGAR of a genomic alignment is interpreted as a
  junction. \code{junctions(x)} will return the genomic ranges of all
  junctions found in \code{x}.

  More precisely, if \code{x} is a \link{GAlignments} object,
  \code{junctions(x)} is equivalent to:
\preformatted{
  psetdiff(granges(x), grglist(x, order.as.in.query=TRUE))
}
  On a \code{x} is a \link{GAlignmentPairs} object, it's equivalent to (but
  faster than):
\preformatted{
  mendoapply(c, junctions(first(x)), junctions(last(x)))
}

  \code{NATURAL_INTRON_MOTIFS} is a predefined character vector containing
  the 5 natural intron motifs described at
  \url{http://www.ncbi.nlm.nih.gov/pmc/articles/PMC84117/}.
}

\value{
  \code{junctions(x)} returns the genomic ranges of the junctions in a
  \link[GenomicRanges]{GRangesList} object \emph{parallel} to \code{x}
  (i.e. with 1 list element per element in \code{x}).
  If \code{x} has names on it, they're propagated to the returned object.
  If \code{use.mcols} is TRUE and \code{x} has metadata columns on it
  (accessible with \code{mcols(x)}), they're propagated to the returned object.

  \code{summarizeJunctions} returns the genomic ranges of the unique
  junctions in \code{x} in an unstranded \link[GenomicRanges]{GRanges}
  object with the following metadata columns:
  \itemize{
    \item \code{score}: The total number of alignments crossing each
          junction, i.e., that have the junction encoded in their CIGAR.

    \item \code{plus_score} and \code{minus_score}: The strand-specific
          number of alignments crossing each junction.

    \item \code{revmap}: [Only if \code{with.revmap} was set to \code{TRUE}.]
          An \link[IRanges]{IntegerList} object representing the mapping from
          each element in the ouput (i.e. each junction) to the corresponding
          elements in input \code{x}.

    \item \code{intron_motif} and \code{intron_strand}: [Only if \code{genome}
          was supplied.] The intron motif and strand for each junction,
          based on the dinucleotides found in the genome sequences at the
          intron boundaries.
          The \code{intron_motif} metadata column is a factor whose levels
          are the 5 natural intron motifs stored in predefined character
          vector \code{NATURAL_INTRON_MOTIFS}.
          If the dinucleotides found at the intron boundaries don't match
          any of these natural intron motifs, then \code{intron_motif} and
          \code{intron_strand} are set to \code{NA} and \code{*},
          respectively.
  }

  \code{readTopHatJunctions} and \code{readSTARJunctions} return the
  junctions reported in the input file in a stranded
  \link[GenomicRanges]{GRanges} object.
  With the following metadata columns for \code{readTopHatJunctions} (when
  reading in the \emph{junctions.bed} file):
  \itemize{
    \item \code{name}: An id assigned by TopHat to each junction. This id is
          of the form JUNC00000017 and is unique within the
          \emph{junctions.bed} file.
          
    \item \code{score}: The total number of alignments crossing each
          junction.
  }
  With the following metadata columns for \code{readSTARJunctions}:
  \itemize{
    \item \code{intron_motif} and \code{intron_strand}:
          The intron motif and strand for each junction, based on the code
          found in the input file (0: non-canonical, 1: GT/AG, 2: CT/AC,
          3: GC/AG, 4: CT/GC, 5: AT/AC, 6: GT/AT).
          Note that of the 5 natural intron motifs stored in predefined
          character vector \code{NATURAL_INTRON_MOTIFS}, only the first 3 are
          assigned codes by the STAR software (2 codes per motif, one if the
          intron is on the plus strand and one if it's on the minus strand).
          Thus the \code{intron_motif} metadata column is a factor with only
          3 levels. If code is 0, then \code{intron_motif} and
          \code{intron_strand} are set to \code{NA} and \code{*}, respectively.

    \item \code{um_reads}: The number of uniquely mapping reads crossing
          the junction (a pair where the 2 alignments cross the same junction
          is counted only once).

    \item \code{mm_reads}: The number of multi-mapping reads crossing the
          junction (a pair where the 2 alignments cross the same junction is
          counted only once).
  }
  See STAR manual at \url{https://code.google.com/p/rna-star/} for more
  information.
}

\author{H. Pages}

\references{
  \url{http://www.ncbi.nlm.nih.gov/pmc/articles/PMC84117/} for the 5 natural
  intron motifs stored in predefined character vector
  \code{NATURAL_INTRON_MOTIFS}.

  TopHat2: accurate alignment of transcriptomes in the presence of insertions,
  deletions and gene fusions
  \itemize{
    \item TopHat2 paper: \url{http://genomebiology.com/2013/14/4/r36}
    \item TopHat2 software and manual: \url{http://tophat.cbcb.umd.edu/}
  }

  STAR: ultrafast universal RNA-seq aligner
  \itemize{
    \item STAR paper: \url{http://bioinformatics.oxfordjournals.org/content/early/2012/10/25/bioinformatics.bts635}
    \item STAR software and manual: \url{https://code.google.com/p/rna-star/}
  }
}

\seealso{
  \itemize{
    \item \link{GAlignments}, \link{GAlignmentPairs}, and
          \link{GAlignmentsList} objects.

    \item The \link[GenomicRanges]{GRanges} and
          \link[GenomicRanges]{GRangesList} classes defined and documented
          in the \pkg{GenomicRanges} package.

    \item The \link[IRanges]{IntegerList} class defined and documented
          in the \pkg{IRanges} package.

    \item The \code{\link[BSgenome]{getBSgenome}} function in the
          \pkg{BSgenome} package, for searching the installed BSgenome
          data packages for the specified genome and returning it as a
          \link[BSgenome]{BSgenome} object.

    \item The \code{\link{readGAlignments}} and
          \code{\link{readGAlignmentPairs}} functions for reading genomic
          alignments from a file.

    \item The \code{\link[IRanges]{extractList}} function in the \pkg{IRanges}
          package, for extracting groups of elements from a vector-like object
          and returning them into a \link[IRanges]{List} object.
  }
}

\examples{
library(RNAseqData.HNRNPC.bam.chr14)
bamfile <- RNAseqData.HNRNPC.bam.chr14_BAMFILES[1]

## ---------------------------------------------------------------------
## A. junctions()
## ---------------------------------------------------------------------

gal <- readGAlignments(bamfile)
table(njunc(gal))  # some alignments have 3 junctions!
juncs <- junctions(gal)
juncs

stopifnot(identical(unname(elementLengths(juncs)), njunc(gal)))

galp <- readGAlignmentPairs(bamfile)
juncs <- junctions(galp)
juncs

stopifnot(identical(unname(elementLengths(juncs)), njunc(galp)))

## ---------------------------------------------------------------------
## B. summarizeJunctions()
## ---------------------------------------------------------------------

## By default, only the "score", "plus_score", and "minus_score"
## metadata columns are returned:
junc_summary <- summarizeJunctions(gal)
junc_summary

## The "score" metadata column reports the total number of alignments
## crossing each junction, i.e., that have the junction encoded in their
## CIGAR:
median(mcols(junc_summary)$score)

## The "plus_score" and "minus_score" metadata columns report the
## strand-specific number of alignments crossing each junction:
stopifnot(identical(mcols(junc_summary)$score,
                    mcols(junc_summary)$plus_score +
                    mcols(junc_summary)$minus_score))

## If 'with.revmap' is TRUE, the "revmap" metadata column is added to
## the output. This metadata column is an IntegerList object represen-
## ting the mapping from each element in the ouput (i.e. a junction) to
## the corresponding elements in the input 'x'. Here we're going to use
## this to compute a 'score2' for each junction. We obtain this score
## by summing the mapping qualities of the alignments crossing the
## junction:
gal <- readGAlignments(bamfile, param=ScanBamParam(what="mapq"))
junc_summary <- summarizeJunctions(gal, with.revmap=TRUE)
junc_score2 <- sum(extractList(mcols(gal)$mapq,
                               mcols(junc_summary)$revmap))
mcols(junc_summary)$score2 <- junc_score2

## If a genome is specified thru the 'genome' argument (in which case
## the corresponding BSgenome data package needs to be installed), then
## summarizeJunctions() returns the intron motif and strand for each
## junction. Since the reads in RNAseqData.HNRNPC.bam.chr14 were aligned
## to the hg19 genome, the following requires that you have
## BSgenome.Hsapiens.UCSC.hg19 installed:
junc_summary <- summarizeJunctions(gal, with.revmap=TRUE, genome="hg19")
mcols(junc_summary)$score2 <- junc_score2  # putting 'score2' back

## The "intron_motif" metadata column is a factor whose levels are the
## 5 natural intron motifs stored in predefined character vector
## 'NATURAL_INTRON_MOTIFS':
table(mcols(junc_summary)$intron_motif)

## ---------------------------------------------------------------------
## C. STRANDED RNA-seq PROTOCOL
## ---------------------------------------------------------------------

## Here is a simple test for checking whether the RNA-seq protocol was
## stranded or not:
strandedTest <- function(plus_score, minus_score)
    (sum(plus_score ^ 2) + sum(minus_score ^ 2)) /
        sum((plus_score + minus_score) ^ 2)

## The result of this test is guaranteed to be >= 0.5 and <= 1.
## If, for each junction, the strand of the crossing alignments looks
## random (i.e. "plus_score" and "minus_score" are close), then
## strandedTest() will return a value close to 0.5. If it doesn't look
## random (i.e. for each junction, one of "plus_score" and "minus_score"
## is much bigger than the other), then strandedTest() will return a
## value close to 1.

## If the reads are single-end, the test is meaningful when applied
## directly on 'junc_summary'. However, for the test to be meaningful
## on paired-end reads, it needs to be applied on the first and last
## alignments separately:
junc_summary1 <- summarizeJunctions(first(galp))
junc_summary2 <- summarizeJunctions(last(galp))
strandedTest(mcols(junc_summary1)$plus_score,
             mcols(junc_summary1)$minus_score)
strandedTest(mcols(junc_summary2)$plus_score,
             mcols(junc_summary2)$minus_score)
## Both values are close to 0.5 which suggests that the RNA-seq protocol
## used for this experiment was not stranded.

## ---------------------------------------------------------------------
## UTILITIES FOR IMPORTING THE JUNCTION FILE GENERATED BY SOME ALIGNERS
## ---------------------------------------------------------------------

## The TopHat aligner generates a junctions.bed file where it reports
## all the junctions satisfying some "quality" criteria (see the TopHat
## manual at http://tophat.cbcb.umd.edu/manual.shtml for more
## information). This file can be loaded with readTopHatJunctions():
runname <- names(RNAseqData.HNRNPC.bam.chr14_BAMFILES)[1]
junctions_file <- system.file("extdata", "tophat2_out", runname,
                              "junctions.bed",
                              package="RNAseqData.HNRNPC.bam.chr14")
th_junctions <- readTopHatJunctions(junctions_file)

## Comparing the "TopHat junctions" with the result of
## summarizeJunctions():
th_junctions14 <- th_junctions
seqlevels(th_junctions14, force=TRUE) <- "chr14"
mcols(th_junctions14)$intron_strand <- strand(th_junctions14)
strand(th_junctions14) <- "*"

## All the "TopHat junctions" are in 'junc_summary':
stopifnot(all(th_junctions14 \%in\% junc_summary))

## But not all the junctions in 'junc_summary' are reported by TopHat
## (that's because TopHat reports only junctions that satisfy some
## "quality" criteria):
is_in_th_junctions14 <- junc_summary \%in\% th_junctions14
table(is_in_th_junctions14)  # 32 junctions are not in TopHat's
                             # junctions.bed file
junc_summary2 <- junc_summary[is_in_th_junctions14]

## 'junc_summary2' and 'th_junctions14' contain the same junctions in
## the same order:
stopifnot(all(junc_summary2 == th_junctions14))

## Let's merge their metadata columns. We use our own version of
## merge() for this, which is stricter (it checks that the common
## columns are the same in the 2 data frames to merge) and also
## simpler:
merge2 <- function(df1, df2)
{
    common_colnames <- intersect(colnames(df1), colnames(df2))
    lapply(common_colnames,
           function(colname)
             stopifnot(all(df1[ , colname] == df2[ , colname])))
    extra_mcolnames <- setdiff(colnames(df2), colnames(df1))
    cbind(df1, df2[ , extra_mcolnames, drop=FALSE])
}

mcols(th_junctions14) <- merge2(mcols(th_junctions14),
                                mcols(junc_summary2))

## Here is a peculiar junction reported by TopHat:
idx0 <- which(mcols(th_junctions14)$score2 == 0L)
th_junctions14[idx0]
gal[mcols(th_junctions14)$revmap[[idx0]]]
## The junction is crossed by 5 alignments (score is 5), all of which
## have a mapping quality of 0!
}

\keyword{methods}
\keyword{manip}