File: locateVariants-methods.Rd

package info (click to toggle)
r-bioc-variantannotation 1.52.0-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 4,372 kB
  • sloc: ansic: 1,357; makefile: 2
file content (391 lines) | stat: -rw-r--r-- 17,235 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
\name{locateVariants}
\alias{locateVariants}

\alias{locateVariants,IntegerRanges,TxDb,VariantType-method}
\alias{locateVariants,IntegerRanges,GRangesList,VariantType-method}
\alias{locateVariants,GRanges,TxDb,VariantType-method}
\alias{locateVariants,GRanges,GRangesList,VariantType-method}
\alias{locateVariants,VCF,TxDb,VariantType-method}
\alias{locateVariants,VCF,GRangesList,VariantType-method}

\alias{locateVariants,GRanges,TxDb,CodingVariants-method}
\alias{locateVariants,GRanges,GRangesList,CodingVariants-method}
\alias{locateVariants,GRanges,TxDb,IntronVariants-method}
\alias{locateVariants,GRanges,GRangesList,IntronVariants-method}
\alias{locateVariants,GRanges,TxDb,FiveUTRVariants-method}
\alias{locateVariants,GRanges,GRangesList,FiveUTRVariants-method}
\alias{locateVariants,GRanges,TxDb,ThreeUTRVariants-method}
\alias{locateVariants,GRanges,GRangesList,ThreeUTRVariants-method}
\alias{locateVariants,GRanges,TxDb,IntergenicVariants-method}
\alias{locateVariants,GRanges,GRangesList,IntergenicVariants-method}
\alias{locateVariants,GRanges,TxDb,SpliceSiteVariants-method}
\alias{locateVariants,GRanges,GRangesList,SpliceSiteVariants-method}
\alias{locateVariants,GRanges,TxDb,PromoterVariants-method}
\alias{locateVariants,GRanges,GRangesList,PromoterVariants-method}
\alias{locateVariants,GRanges,TxDb,AllVariants-method}
\alias{locateVariants,GRanges,GRangesList,AllVariants-method}

\title{Locate variants}

\description{Variant location with respect to gene function}

\usage{
locateVariants(query, subject, region, ...)
\S4method{locateVariants}{VCF,TxDb,VariantType}(query, subject, region, ...,
    cache=new.env(parent=emptyenv()), ignore.strand=FALSE, asHits=FALSE)
\S4method{locateVariants}{GRanges,TxDb,VariantType}(query, subject, region, ...,
    cache=new.env(parent=emptyenv()), ignore.strand=FALSE, asHits=FALSE)
}

\arguments{
  \item{query}{A \link[IRanges]{IntegerRanges}, \link[GenomicRanges]{GRanges}
    or \linkS4class{VCF} object containing the variants. Metadata columns are
    allowed but ignored.

    NOTE: Zero-width ranges are treated as width-1 ranges; start values
    are decremented to equal the end value.
  }
  \item{subject}{A \link[GenomicFeatures]{TxDb} or \code{GRangesList}
    object that serves as the annotation. GFF files can be converted to
    \link[GenomicFeatures]{TxDb} objects with
    \code{makeTxDbFromGFF()} in the \code{txdbmaker} package.
  }
  \item{region}{An instance of one of the 8 VariantType classes:
    \code{CodingVariants}, \code{IntronVariants}, \code{FiveUTRVariants},
    \code{ThreeUTRVariants}, \code{IntergenicVariants},
    \code{SpliceSiteVariants}, \code{PromoterVariants}, \code{AllVariants}.
    All objects can be instantiated with no arguments, e.g., CodingVariants()
    will create an object of \code{CodingVariants}.

    \code{AllVariants}, \code{PromoterVariants} and \code{IntergenicVariants}
    have \code{upstream} and \code{downstream} arguments. For
    \code{PromoterVariants} and \code{IntergenicVariants} these are single
    integer values >= 0. For \code{AllVariants} these are integer vectors
    of length 2 named \sQuote{promoter} and \sQuote{intergenic}. See
    ?\code{upstream} for more details.

    When using \code{AllVariants}, a range in \code{query} may fall in
    multiple regions (e.g., 'intergenic' and 'promoter'). In this case
    the result will have a row for each match. All data in the
    row will be equivalent except the LOCATION column.
  }
  \item{\dots}{Additional arguments passed to methods
  }
  \item{cache}{An \code{environment} into which required components
    of \code{subject} are loaded. Provide, and re-use, a cache to
    speed repeated queries to the same \code{subject} across
    different \code{query} instances.
  }
  \item{ignore.strand}{A \code{logical} indicating if strand should be
    ignored when performing overlaps.
  }
  \item{asHits}{A \code{logical} indicating if the results should be
    returned as a \link[S4Vectors]{Hits} object. Not applicable when
    \code{region} is AllVariants or IntergenicVariants.
  }
}

\details{
  \describe{
    \item{Range representation:}{
      The ranges in \code{query} should reflect the position(s) of the
      reference allele. For snps the range will be of width 1. For range
      insertions or deletions the reference allele could be a sequence
      such as GGTG in which case the width of the range should be 4.
    }
    \item{Location:}{
      Possible locations are \sQuote{coding}, \sQuote{intron},
      \sQuote{threeUTR}, \sQuote{fiveUTR}, \sQuote{intergenic},
      \sQuote{spliceSite}, or \sQuote{promoter}.

      Overlap operations for \sQuote{coding}, \sQuote{intron},
      \sQuote{threeUTR}, and \sQuote{fiveUTR} require variants to fall
      completely within the defined region to be classified as such.

      To be classified as a \sQuote{spliceSite} the variant must overlap
      with any portion of the first 2 or last 2 nucleotides in an intron.

      \sQuote{intergenic} variants are ranges that do not fall within a
      defined gene region. \sQuote{transcripts by gene} are extracted from
      the annotation and overlapped with the variant positions. Variants with
      no overlaps are classified as \code{intergenic}. When available, gene
      IDs for the flanking genes are provided as \code{PRECEDEID} and
      \code{FOLLOWID}. \code{upstream} and \code{downstream} arguments define
      the acceptable distance from the query for the flanking genes.
      \code{PRECEDEID} and \code{FOLLOWID} results are lists and contain all
      genes that fall within the defined distance. See the examples for how
      to compute the distance from ranges to PRECEDEID and FOLLOWID.

      \sQuote{promoter} variants fall within a specified range upstream and
      downstream of the transcription start site. Ranges values can be set
      with the \code{upstream} and \code{downstream} arguments when creating
      the \code{PromoterVariants()} or \code{AllVariants()} classes.
    }
    \item{Subject as GRangesList:}{
      The \code{subject} can be a \code{TxDb} or \code{GRangesList}
      object. When using a \code{GRangesList} the type of data required
      is driven by the \code{VariantType} class. Below is a description of
      the appropriate \code{GRangesList} for each \code{VariantType}.
      \describe{
        \item{CodingVariants:}{coding (CDS) by transcript}
        \item{IntronVariants:}{introns by transcript}
        \item{FiveUTRVariants:}{five prime UTR by transcript}
        \item{ThreeUTRVariants:}{three prime UTR by transcript}
        \item{IntergenicVariants:}{transcripts by gene}
        \item{SpliceSiteVariants:}{introns by transcript}
        \item{PromoterVariants:}{list of transcripts}
        \item{AllVariants:}{no GRangeList method available}
      }
    }
    \item{Using the cache:}{
      When processing multiple VCF files performance is enhanced by specifying
      an environment as the \code{cache} argument. This cache is used to store
      and reuse extracted components of the subject (TxDb) required by the
      function. The first call to the function (i.e., processing the first
      VCF file in a list of many) populates the cache; repeated calls
      to \code{locateVariants} will access these objects from the cache vs
      re-extracting the same information.
    }
  }
}
\value{
  A \code{GRanges} object with a row for each variant-transcript match.
  Strand of the output is from the \code{subject} hit
  except in the case of IntergenicVariants. For intergenic, multiple precede
  and follow gene ids are returned for each variant. When
  \code{ignore.strand=TRUE} the return strand is \code{*} because
  genes on both strands are considered and it is possible to have a mixture.
  When \code{ignore.strand=FALSE} the strand will match the \code{query}
  because only genes on the same strand are considered.

  Metadata columns are \code{LOCATION}, \code{QUERYID},
  \code{TXID}, \code{GENEID}, \code{PRECEDEID}, \code{FOLLOWID} and
  \code{CDSID}. Results are ordered by \code{QUERYID}, \code{TXID} and
  \code{GENEID}. Columns are described in detail below.

 \describe{
    \item{\code{LOCATION}}{
      Possible locations are \sQuote{coding}, \sQuote{intron},
      \sQuote{threeUTR}, \sQuote{fiveUTR}, \sQuote{intergenic},
      \sQuote{spliceSite} and \sQuote{promoter}.

      To be classified as \sQuote{coding}, \sQuote{intron}, \sQuote{threeUTR}
      or \sQuote{fiveUTR} the variant must fall completely within the region.

      \sQuote{intergenic} variants do not fall within a transcript. The
      \sQuote{GENEID} for these positions are \code{NA}. Lists of flanking
      genes that fall within the distance defined by \code{upstream} and
      \code{downstream} are given as \sQuote{PRECEDEID} and \sQuote{FOLLOWID}.
      By default, the gene ID is returned in the \sQuote{PRECEDEID} and
      \sQuote{FOLLOWID} columns. To return the transcript ids instead set
      \code{idType = "tx"} in the \code{IntergenicVariants()}
      constructor.

      A \sQuote{spliceSite} variant overlaps any portion of the first 2 or last
      2 nucleotides of an intron.
    }
    \item{\code{LOCSTART, LOCEND}}{
      Genomic position in LOCATION-centric coordinates.
      If LOCATION is `intron`, these are intron-centric coordinates,
      if LOCATION is `coding` then cds-centric. All coordinates are
      relative to the start of the transcript. SpliceSiteVariants,
      IntergenicVariants and PromoterVariants have no formal
      extraction `by transcript` so for these variants LOCSTART and
      LOCEND are NA. Coordinates are computed with \code{mapToTranscripts};
      see ?\code{mapToTranscripts} in the GenomicFeatures package for details.
    }
    \item{\code{QUERYID}}{
      The \code{QUERYID} column provides a map back to the row in the
      original \code{query}. If the \code{query} was a \code{VCF} object this
      index corresponds to the row in the \code{GRanges} object returned by
      the \code{rowRanges} accessor.
    }
    \item{\code{TXID}}{
      The transcript id taken from the \code{TxDb} object.
    }
    \item{\code{CDSID}}{
      The coding sequence id(s) taken from the \code{TxDb} object.
    }
    \item{\code{GENEID}}{
      The gene id taken from the \code{TxDb} object.
    }
    \item{\code{PRECEDEID}}{
      IDs for all genes the query precedes within the defined
      \code{upstream} and \code{downstream} distance. Only applicable
      for \sQuote{intergenic} variants. By default this column contains gene ids;
      to return transcript ids set \code{idType = "tx"} in
      the \code{IntergenicVariants} constructor.
    }
    \item{\code{FOLLOWID}}{
      IDs for all genes the query follows within the defined
      \code{upstream} and \code{downstream} distance. Only applicable
      for \sQuote{intergenic} variants. By default this column contains gene ids;
      to return transcript ids set \code{idType = "tx"} in
      the \code{IntergenicVariants} constructor.
    }
    All ID values will be \sQuote{NA} for variants with a location of
    \code{transcript_region} or \code{NA}.
  }
}

\author{Valerie Obenchain}

\seealso{
  \itemize{
    \item The \link{readVcf} function.
    \item The \link{predictCoding} function.
    \item The promoters function on the
          \link[GenomicRanges]{intra-range-methods} man page in the
          \pkg{GenomicRanges} package.
  }
}

\examples{
  library(TxDb.Hsapiens.UCSC.hg19.knownGene)
  txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene

  ## ---------------------------------------------------------------------
  ## Variants in all gene regions
  ## ---------------------------------------------------------------------
  ## Read variants from a VCF file.
  fl <- system.file("extdata", "gl_chr1.vcf",
                    package="VariantAnnotation")
  vcf <- readVcf(fl, "hg19")

  ## Often the seqlevels in the VCF file do not match those in the TxDb.
  head(seqlevels(vcf))
  head(seqlevels(txdb))
  intersect(seqlevels(vcf), seqlevels(txdb))

  ## Rename seqlevels with renameSeqlevesl().
  vcf <- renameSeqlevels(vcf, paste0("chr", seqlevels(vcf)))

  ## Confirm.
  intersect(seqlevels(vcf), seqlevels(txdb))

  ## Overlaps for all possible variant locations.
  loc_all <- locateVariants(vcf, txdb, AllVariants())
  table(loc_all$LOCATION)

  ## ---------------------------------------------------------------------
  ## Variants in intergenic regions
  ## ---------------------------------------------------------------------
  ## Intergenic variants do not overlap a gene range in the
  ## annotation and therefore 'GENEID' is always NA. Flanking genes
  ## that fall within the 'upstream' and 'downstream' distances are
  ## reported as PRECEDEID and FOLLOWID.
  region <- IntergenicVariants(upstream=70000, downstream=70000)
  loc_int <- locateVariants(vcf, txdb, region)
  mcols(loc_int)[c("LOCATION", "PRECEDEID", "FOLLOWID")]

  ## Distance to the flanking genes can be computed for variants that
  ## have PRECEDEID(s) or FOLLOWID(s). Each variant can have multiple
  ## flanking id's so we first expand PRECEDEID and the corresponding
  ## variant ranges.
  p_ids <- unlist(loc_int$PRECEDEID, use.names=FALSE)
  exp_ranges <- rep(loc_int,  elementNROWS(loc_int$PRECEDEID))

  ## Compute distances with the distance method defined in GenomicFeatures.
  ## Help page can be found at ?`distance,GenomicRanges,TxDb-method`.
  ## The method returns NA for ids that cannot be collapsed into a single
  ## range (e.g., genes with ranges on multiple chromosomes).
  distance(exp_ranges, txdb, id=p_ids, type="gene")

  ## To search for distance by transcript id set idType='tx' in the
  ## IntergenicVariants() constructor, e.g.,
  ## locateVariants(vcf, txdb, region=IntergenicVariants(idType="tx"))

  ## Unlist ids and expand ranges as before to get p_ids and exp_ranges.
  ## Then call distance() with type = "tx":
  ## distance(exp_ranges, txdb, id=p_ids, type="tx")


  ## ---------------------------------------------------------------------
  ## GRangesList as subject
  ## ---------------------------------------------------------------------
  ## When 'subject' is a GRangesList the GENEID is unavailable and
  ## will always be reported as NA. This is because the GRangesList
  ## objects are extractions of region-by-transcript, not region-by-gene.
  \dontrun{
  cdsbytx <- cdsBy(txdb)
  locateVariants(vcf, cdsbytx, CodingVariants())

  intbytx <- intronsByTranscript(txdb)
  locateVariants(vcf, intbytx, IntronVariants())
  }

  ## ---------------------------------------------------------------------
  ## Using the cache
  ## ---------------------------------------------------------------------
  ## When processing multiple VCF files, the 'cache' can be used
  ## to store the extracted components of the TxDb
  ## (i.e., cds by tx, introns by tx etc.). This avoids having to
  ## re-extract these GRangesLists during each loop.
  \dontrun{
  myenv <- new.env()
  files <- list(vcf1, vcf2, vcf3)
  lapply(files,
      function(fl) {
          vcf <- readVcf(fl, "hg19")
          ## modify seqlevels to match TxDb
          seqlevels(vcf_mod) <- paste0("chr", seqlevels(vcf))
          locateVariants(vcf_mod, txdb, AllVariants(), cache=myenv)
      })
  }

  ## ---------------------------------------------------------------------
  ## Parallel implmentation
  ## ---------------------------------------------------------------------
  \dontrun{
  library(BiocParallel)

  ## A connection to a TxDb object is established when
  ## the package is loaded. Because each process reading from an
  ## sqlite db must have a unique connection the TxDb
  ## object cannot be passed as an argument when running in
  ## parallel. Instead the package must be loaded on each worker.

  ## The overhead of the multiple loading may defeat the
  ## purpose of running the job in parallel. An alternative is
  ## to instead pass the appropriate GRangesList as an argument.
  ## The details section on this man page under the heading
  ## 'Subject as GRangesList' explains what GRangesList is
  ## appropriate for each variant type.

  ## A. Passing a GRangesList:

  fun <- function(x, subject, ...)
      locateVariants(x, subject, IntronVariants())

  library(TxDb.Hsapiens.UCSC.hg19.knownGene)
  grl <- intronsByTranscript(TxDb.Hsapiens.UCSC.hg19.knownGene)
  mclapply(c(vcf, vcf), fun, subject=grl)


  ## B. Passing a TxDb:

  ## Forking:
  ## In the case of forking, the TxDb cannot be loaded
  ## in the current workspace.
  ## To detach the NAMESPACE:
  ##     unloadNamespace("TxDb.Hsapiens.UCSC.hg19.knownGene")

  fun <- function(x) {
      library(TxDb.Hsapiens.UCSC.hg19.knownGene)
      locateVariants(x, TxDb.Hsapiens.UCSC.hg19.knownGene,
                     IntronVariants())
  }
  mclapply(c(vcf, vcf), fun)

  ## Clusters:
  cl <- makeCluster(2, type = "SOCK")
  fun <- function(query, subject, region) {
      library(VariantAnnotation)
      library(TxDb.Hsapiens.UCSC.hg19.knownGene)
      locateVariants(query, TxDb.Hsapiens.UCSC.hg19.knownGene, region)
  }
  parLapply(cl, c(vcf, vcf), fun, region=IntronVariants())
  stopCluster(cl)
  }
}
\keyword{methods}