File: readVcf-methods.Rd

package info (click to toggle)
r-bioc-variantannotation 1.10.5-1
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 2,172 kB
  • ctags: 109
  • sloc: ansic: 1,088; sh: 4; makefile: 2
file content (336 lines) | stat: -rw-r--r-- 13,086 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
\name{readVcf}
\alias{readVcf}
\alias{readVcf,character,ANY,ScanVcfParam-method}
\alias{readVcf,character,ANY,missing-method}
\alias{readVcf,character,missing,missing-method}
\alias{readVcf,TabixFile,ANY,ScanVcfParam-method}
\alias{readVcf,TabixFile,ANY,RangedData-method}
\alias{readVcf,TabixFile,ANY,RangesList-method}
\alias{readVcf,TabixFile,ANY,GRanges-method}
\alias{readVcf,TabixFile,ANY,GRangesList-method}
\alias{readVcf,TabixFile,ANY,missing-method}

%% lightweight read* functions
\alias{readInfo}
\alias{readGeno}
\alias{readGT}


\title{Read VCF files}

\description{Read Variant Call Format (VCF) files}

\usage{

  \S4method{readVcf}{TabixFile,ANY,ScanVcfParam}(file, genome, param, ...)
  \S4method{readVcf}{TabixFile,ANY,RangedData}(file, genome, param, ...)
  \S4method{readVcf}{TabixFile,ANY,RangesList}(file, genome, param, ...)
  \S4method{readVcf}{TabixFile,ANY,GRanges}(file, genome, param, ...)
  \S4method{readVcf}{TabixFile,ANY,GRangesList}(file, genome, param, ...)
  \S4method{readVcf}{TabixFile,ANY,missing}(file, genome, param, ...)
  \S4method{readVcf}{character,ANY,ScanVcfParam}(file, genome, param, ...)
  \S4method{readVcf}{character,ANY,missing}(file, genome, param, ...)
  \S4method{readVcf}{character,missing,missing}(file, genome, param, ...)

## Lightweight functions to read a single variable
readInfo(file, x, param=ScanVcfParam(), ..., row.names=TRUE)
readGeno(file, x, param=ScanVcfParam(), ..., row.names=TRUE)
readGT(file, nucleotides=FALSE, param=ScanVcfParam(), ..., row.names=TRUE)
}

\arguments{
  \item{file}{A \code{\link{TabixFile}} instance or character() name of the VCF 
    file to be processed. When ranges are specified in \code{param},
    \code{file} must be a \code{\link{TabixFile}}. 

    Use of the \code{\link{TabixFile}} methods are encouraged as they are
    more efficient than the character() methods. See ?\code{TabixFile} and
    ?\code{indexTabix} for help creating a \code{\link{TabixFile}}.
  }
  \item{genome}{A \code{character} or \code{Seqinfo} object.

    \code{character}: Genome identifier supplied as single string or named 
    character vector (names correspond to seqnames in the VCF file). This
    identifier will relpace the genome information in the VCF \code{Seqinfo} 
    (i.e., \code{seqinfo(vcf)}).

    \code{Seqinfo}: When a \code{Seqinfo} is provided it is merged with
    the existing \code{Seqinfo}. It can have greater than or equal to the
    number of \code{seqlevels} in the current \code{Seqinfo} and the
    \code{seqlevels} must match.
  }
  \item{param}{A instance of \code{\linkS4class{ScanVcfParam}}, \code{GRanges},
    \code{GRangesList}, \code{RangedData} or \code{RangesList}. VCF files can be 
    subset on genomic coordinates (ranges) or elements in the VCF fields. Both 
    genomic coordinates and VCF elements can be specified in a 
    \code{\linkS4class{ScanVcfParam}}. See ?\code{ScanVcfParam} for details.
  }
  \item{x}{\code{character} name of single \code{info} or \code{geno} field to 
    import. Applicable to \code{readInfo} and \code{readGeno} only.
  }
  \item{row.names}{A \code{logical} specifying if rownames should be returned. 
    Applicable to \code{readInfo}, \code{readGeno} and \code{readGT}.
  }
  \item{nucleotides}{A \code{logical} indicating if genotypes should be returned
    as nucleotides instead of the numeric representation.
    Applicable to \code{readGT} only.
  }
  \item{\dots}{Additional arguments, passed to methods.
  }
}

\details{
  \describe{
    \item{Data Import: }{
      \describe{
        \item{VCF object: }{
          \code{readVcf} imports records from bzip compressed or uncompressed 
          VCF files. Data are parsed into a \code{\linkS4class{VCF}} object 
          using the file header information if available. To import a subset 
          of ranges the VCF must have a Tabix index file. An index file can be 
          created with \code{bzip} and \code{indexTabix} functions.

          The \code{readInfo}, \code{readGeno} and \code{readGT} functions
          are lightweight versions of \code{readVcf} and import a single
          variable. The return object is a vector, matrix or CompressedList
          instead of a VCF class. 
          }
      } 
      \code{readVcf} calls \code{\link{scanVcf}}, the details of which can be 
      found with \code{?scanVcf}.
    }
    \item{Data type: }{
      CHROM, POS, ID and REF fields are used to create the \code{GRanges}
      stored in the \code{rowData} slot of the \code{VCF} object. Access
      with \code{rowData} accessor.

      REF, ALT, QUAL and FILTER are parsed into the \code{DataFrame} in the 
      \code{fixed} slot. Because ALT can have more than one value per variant 
      it is represented as a \code{DNAStringSetList}. REF is a \code{DNAStringSet},
      QUAL is \code{numeric} and FILTER is a \code{character}. Accessors include
      \code{fixed}, \code{ref}, \code{alt}, \code{qual}, and \code{filt}.

      Data from the INFO field can be accessed with the \code{info} accessor.
      Genotype data (i.e., data immediately following the FORMAT field in the 
      VCF) can be accessed with the \code{geno} accessor. INFO and genotype data 
      types are determined according to the \sQuote{Number} and \sQuote{Type} 
      information in the file header as follows:

      If \sQuote{Number} is 1, \sQuote{info} data are parsed into a 
      \code{vector}. \sQuote{geno} data are parsed into a \code{matrix} where 
      the columns are the samples.

      If \sQuote{Number} is an integer >1, \sQuote{info} data are parsed into a
      \code{DataFrame} with the indicated number of columns. \sQuote{geno} are
      parsed into an \code{array} with the same dimentions as \sQuote{Number}. 
      Columns of the \sQuote{geno} matrices are the samples.

      If \sQuote{Number} is \sQuote{.}, \sQuote{A} or \sQuote{G}, a \code{matrix} 
      is used for both \sQuote{info} and \sQuote{geno} data.

      When the VCF header does not contain data type information, the data are 
      returned as a single unparsed column named \sQuote{INFO} or \sQuote{GENO}.
    }
    \item{Missing data: }{
      Missing data in VCF files are represented by a dot ("."). \code{readVcf} 
      retains the dot as a character string for data type character and converts 
      it to \code{NA} for data types numeric or double. 

      Because the data are stored in rectangluar data structures there is a
      value for each \code{info} and \code{geno} field element in the \code{VCF} 
      class. If the element was missing or was not collected for a particular 
      variant the value will be \code{NA}.
    }
  }
}

\value{
  \code{readVcf} returns a \code{\linkS4class{VCF}} object. See ?\code{VCF} for 
  complete details of the class structure. \code{readGT}, \code{readInfo} and
  \code{readGeno} return a \code{matrix}.

  \describe{
    \item{rowData: }{
      The CHROM, POS, ID and REF fields are used to create a \code{GRanges}
      object. The ranges are created using POS as the start value and width of 
      the reference allele (REF). The IDs become the rownames. If they are 
      missing (i.e., \sQuote{.}) a string of CHROM:POS_REF/ALT is used instead. 
      The \code{genome} argument is stored in the seqinfo of the \code{GRanges} 
      and can be accessed with \code{genome(<VCF>)}.

      One elementMetadata column, \code{paramRangeID}, is included with the
      \code{rowData}. This ID is meaningful when multiple ranges are specified 
      in the \code{ScanVcfParam}. This ID distinguishes which records match
      each range.
    }
    \item{fixed: }{
      REF, ALT, QUAL and FILTER fields of the VCF are parsed into a 
      \code{DataFrame}.
    }
    \item{info: }{
      Data from the INFO field of the VCF is parsed into a \code{DataFrame}.
    }
    \item{geno: }{
      If present, the genotype data are parsed into a list of \code{matrices} 
      or \code{arrays}. Each list element represents a field in the FORMAT 
      column of the VCF file. Rows are the variants, columns are the samples. 
    }
    \item{colData: }{
      This slot contains a \code{DataFrame} describing the samples. If present, 
      the sample names following FORMAT in the VCF file become the row names.
    }
    \item{exptData: }{ 
      Header information present in the file is put into a \code{SimpleList}
      in \code{exptData}.
    }
  }
  See references for complete details of the VCF file format. 
}

\references{
  \url{http://vcftools.sourceforge.net/specs.html} outlines the VCF
  specification.

  \url{http://samtools.sourceforge.net/mpileup.shtml} contains
  information on the portion of the specification implemented by
  \code{bcftools}.

  \url{http://samtools.sourceforge.net/} provides information on
  \code{samtools}.
}

\author{
  Valerie Obenchain <vobencha@fhcrc.org>
}

\seealso{
  \code{\link{indexTabix}},
  \code{\link{TabixFile}},
  \code{\link{scanTabix}},
  \code{\link{scanBcf}},
  \code{\link{expand,CollapsedVCF-method}}
}

\examples{
  fl <- system.file("extdata", "ex2.vcf", package="VariantAnnotation") 
  vcf <- readVcf(fl, "hg19")
  ## vcf <- readVcf(fl, c("20"="hg19"))  ## 'genome' as named vector

  ## ---------------------------------------------------------------------
  ## Header and genome information 
  ## ---------------------------------------------------------------------
  vcf

  ## all header information
  hdr <- header(vcf)

  ## header information for 'info' and 'fixed' fields
  info(hdr)
  fixed(hdr)

  ## ---------------------------------------------------------------------
  ## Accessors
  ## ---------------------------------------------------------------------
  ## fixed fields together
  head(fixed(vcf), 5)

  ## fixed fields separately 
  filt(vcf)
  ref(vcf) 

  ## info data 
  info(hdr)
  info(vcf)
  info(vcf)$DP

  ## geno data 
  geno(hdr)
  geno(vcf)
  head(geno(vcf)$GT)

  ## genome
  unique(genome(rowData(vcf)))

  ## ---------------------------------------------------------------------
  ## Data subsets with lightweight read* functions 
  ## ---------------------------------------------------------------------

  ## Import a single 'info' or 'geno' variable
  DP <- readInfo(fl, "DP")
  HQ <- readGeno(fl, "HQ")

  ## Import GT as numeric representation 
  GT <- readGT(fl)
  ## Import GT as nucleotides 
  GT <- readGT(fl, nucleotides=TRUE)

  ## ---------------------------------------------------------------------
  ## Data subsets with ScanVcfParam
  ## ---------------------------------------------------------------------

  ## Subset on genome coordinates:
  ## 'file' must have a Tabix index
  rngs <- GRanges("20", IRanges(c(14370, 1110000), c(17330, 1234600)))
  names(rngs) <- c("geneA", "geneB")
  param <- ScanVcfParam(which=rngs) 
  compressVcf <- bgzip(fl, tempfile())
  idx <- indexTabix(compressVcf, "vcf")
  tab <- TabixFile(compressVcf, idx)
  vcf <- readVcf(tab, "hg19", param)

  ## When data are subset by range ('which' argument in ScanVcfParam),
  ## the 'paramRangeID' column provides a map back to the original 
  ## range in 'param'.
  rowData(vcf)[,"paramRangeID"]
  vcfWhich(param)

  ## Subset on samples:
  ## Consult the header for the sample names.
  samples(hdr) 
  ## Specify one or more names in 'samples' in a ScanVcfParam.
  param <- ScanVcfParam(samples="NA00002")
  vcf <- readVcf(tab, "hg19", param)
  geno(vcf)$GT

  ## Subset on 'fixed', 'info' or 'geno' fields:
  param <- ScanVcfParam(fixed="ALT", geno=c("GT", "HQ"), info=c("NS", "AF"))
  vcf_tab <- readVcf(tab, "hg19", param)
  info(vcf_tab)
  geno(vcf_tab)

  ## No ranges are specified in the 'param' so tabix file is not
  ## required. Instead, the uncompressed VCF can be used as 'file'.
  vcf_fname <- readVcf(fl, "hg19", param)

  ## The header will always contain information for all variables
  ## in the original file reguardless of how the data were subset.
  ## For example, all 'geno' fields are listed in the header 
  geno(header(vcf_fname))

  ## but only 'GT' and 'HQ' are present in the VCF object.
  geno(vcf_fname)

  ## Subset on both genome coordinates and 'info', 'geno' fields: 
  param <- ScanVcfParam(geno="HQ", info="AF", which=rngs)
  vcf <- readVcf(tab, "hg19", param)

  ## When any of 'fixed', 'info' or 'geno' are omitted (i.e., no
  ## elements specified) all records are retrieved. Use NA to indicate
  ## that no records should be retrieved. This param specifies
  ## all 'fixed fields, the "GT" 'geno' field and none of 'info'.
  ScanVcfParam(geno="GT", info=NA)

  ## ---------------------------------------------------------------------
  ## Iterate through VCF with 'yieldSize' 
  ## ---------------------------------------------------------------------
  fl <- system.file("extdata", "chr22.vcf.gz", package="VariantAnnotation")
  param <- ScanVcfParam(fixed="ALT", geno=c("GT", "GL"), info=c("LDAF"))
  tab <- TabixFile(fl, yieldSize=4000)
  open(tab)
  while (nrow(vcf <- readVcf(tab, "hg19", param=param)))
      cat("vcf dim:", dim(vcf), "\n")
  close(tab)

}

\keyword{manip}