1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336
|
\name{readVcf}
\alias{readVcf}
\alias{readVcf,character,ANY,ScanVcfParam-method}
\alias{readVcf,character,ANY,missing-method}
\alias{readVcf,character,missing,missing-method}
\alias{readVcf,TabixFile,ANY,ScanVcfParam-method}
\alias{readVcf,TabixFile,ANY,RangedData-method}
\alias{readVcf,TabixFile,ANY,RangesList-method}
\alias{readVcf,TabixFile,ANY,GRanges-method}
\alias{readVcf,TabixFile,ANY,GRangesList-method}
\alias{readVcf,TabixFile,ANY,missing-method}
%% lightweight read* functions
\alias{readInfo}
\alias{readGeno}
\alias{readGT}
\title{Read VCF files}
\description{Read Variant Call Format (VCF) files}
\usage{
\S4method{readVcf}{TabixFile,ANY,ScanVcfParam}(file, genome, param, ...)
\S4method{readVcf}{TabixFile,ANY,RangedData}(file, genome, param, ...)
\S4method{readVcf}{TabixFile,ANY,RangesList}(file, genome, param, ...)
\S4method{readVcf}{TabixFile,ANY,GRanges}(file, genome, param, ...)
\S4method{readVcf}{TabixFile,ANY,GRangesList}(file, genome, param, ...)
\S4method{readVcf}{TabixFile,ANY,missing}(file, genome, param, ...)
\S4method{readVcf}{character,ANY,ScanVcfParam}(file, genome, param, ...)
\S4method{readVcf}{character,ANY,missing}(file, genome, param, ...)
\S4method{readVcf}{character,missing,missing}(file, genome, param, ...)
## Lightweight functions to read a single variable
readInfo(file, x, param=ScanVcfParam(), ..., row.names=TRUE)
readGeno(file, x, param=ScanVcfParam(), ..., row.names=TRUE)
readGT(file, nucleotides=FALSE, param=ScanVcfParam(), ..., row.names=TRUE)
}
\arguments{
\item{file}{A \code{\link{TabixFile}} instance or character() name of the VCF
file to be processed. When ranges are specified in \code{param},
\code{file} must be a \code{\link{TabixFile}}.
Use of the \code{\link{TabixFile}} methods are encouraged as they are
more efficient than the character() methods. See ?\code{TabixFile} and
?\code{indexTabix} for help creating a \code{\link{TabixFile}}.
}
\item{genome}{A \code{character} or \code{Seqinfo} object.
\code{character}: Genome identifier supplied as single string or named
character vector (names correspond to seqnames in the VCF file). This
identifier will relpace the genome information in the VCF \code{Seqinfo}
(i.e., \code{seqinfo(vcf)}).
\code{Seqinfo}: When a \code{Seqinfo} is provided it is merged with
the existing \code{Seqinfo}. It can have greater than or equal to the
number of \code{seqlevels} in the current \code{Seqinfo} and the
\code{seqlevels} must match.
}
\item{param}{A instance of \code{\linkS4class{ScanVcfParam}}, \code{GRanges},
\code{GRangesList}, \code{RangedData} or \code{RangesList}. VCF files can be
subset on genomic coordinates (ranges) or elements in the VCF fields. Both
genomic coordinates and VCF elements can be specified in a
\code{\linkS4class{ScanVcfParam}}. See ?\code{ScanVcfParam} for details.
}
\item{x}{\code{character} name of single \code{info} or \code{geno} field to
import. Applicable to \code{readInfo} and \code{readGeno} only.
}
\item{row.names}{A \code{logical} specifying if rownames should be returned.
Applicable to \code{readInfo}, \code{readGeno} and \code{readGT}.
}
\item{nucleotides}{A \code{logical} indicating if genotypes should be returned
as nucleotides instead of the numeric representation.
Applicable to \code{readGT} only.
}
\item{\dots}{Additional arguments, passed to methods.
}
}
\details{
\describe{
\item{Data Import: }{
\describe{
\item{VCF object: }{
\code{readVcf} imports records from bzip compressed or uncompressed
VCF files. Data are parsed into a \code{\linkS4class{VCF}} object
using the file header information if available. To import a subset
of ranges the VCF must have a Tabix index file. An index file can be
created with \code{bzip} and \code{indexTabix} functions.
The \code{readInfo}, \code{readGeno} and \code{readGT} functions
are lightweight versions of \code{readVcf} and import a single
variable. The return object is a vector, matrix or CompressedList
instead of a VCF class.
}
}
\code{readVcf} calls \code{\link{scanVcf}}, the details of which can be
found with \code{?scanVcf}.
}
\item{Data type: }{
CHROM, POS, ID and REF fields are used to create the \code{GRanges}
stored in the \code{rowData} slot of the \code{VCF} object. Access
with \code{rowData} accessor.
REF, ALT, QUAL and FILTER are parsed into the \code{DataFrame} in the
\code{fixed} slot. Because ALT can have more than one value per variant
it is represented as a \code{DNAStringSetList}. REF is a \code{DNAStringSet},
QUAL is \code{numeric} and FILTER is a \code{character}. Accessors include
\code{fixed}, \code{ref}, \code{alt}, \code{qual}, and \code{filt}.
Data from the INFO field can be accessed with the \code{info} accessor.
Genotype data (i.e., data immediately following the FORMAT field in the
VCF) can be accessed with the \code{geno} accessor. INFO and genotype data
types are determined according to the \sQuote{Number} and \sQuote{Type}
information in the file header as follows:
If \sQuote{Number} is 1, \sQuote{info} data are parsed into a
\code{vector}. \sQuote{geno} data are parsed into a \code{matrix} where
the columns are the samples.
If \sQuote{Number} is an integer >1, \sQuote{info} data are parsed into a
\code{DataFrame} with the indicated number of columns. \sQuote{geno} are
parsed into an \code{array} with the same dimentions as \sQuote{Number}.
Columns of the \sQuote{geno} matrices are the samples.
If \sQuote{Number} is \sQuote{.}, \sQuote{A} or \sQuote{G}, a \code{matrix}
is used for both \sQuote{info} and \sQuote{geno} data.
When the VCF header does not contain data type information, the data are
returned as a single unparsed column named \sQuote{INFO} or \sQuote{GENO}.
}
\item{Missing data: }{
Missing data in VCF files are represented by a dot ("."). \code{readVcf}
retains the dot as a character string for data type character and converts
it to \code{NA} for data types numeric or double.
Because the data are stored in rectangluar data structures there is a
value for each \code{info} and \code{geno} field element in the \code{VCF}
class. If the element was missing or was not collected for a particular
variant the value will be \code{NA}.
}
}
}
\value{
\code{readVcf} returns a \code{\linkS4class{VCF}} object. See ?\code{VCF} for
complete details of the class structure. \code{readGT}, \code{readInfo} and
\code{readGeno} return a \code{matrix}.
\describe{
\item{rowData: }{
The CHROM, POS, ID and REF fields are used to create a \code{GRanges}
object. The ranges are created using POS as the start value and width of
the reference allele (REF). The IDs become the rownames. If they are
missing (i.e., \sQuote{.}) a string of CHROM:POS_REF/ALT is used instead.
The \code{genome} argument is stored in the seqinfo of the \code{GRanges}
and can be accessed with \code{genome(<VCF>)}.
One elementMetadata column, \code{paramRangeID}, is included with the
\code{rowData}. This ID is meaningful when multiple ranges are specified
in the \code{ScanVcfParam}. This ID distinguishes which records match
each range.
}
\item{fixed: }{
REF, ALT, QUAL and FILTER fields of the VCF are parsed into a
\code{DataFrame}.
}
\item{info: }{
Data from the INFO field of the VCF is parsed into a \code{DataFrame}.
}
\item{geno: }{
If present, the genotype data are parsed into a list of \code{matrices}
or \code{arrays}. Each list element represents a field in the FORMAT
column of the VCF file. Rows are the variants, columns are the samples.
}
\item{colData: }{
This slot contains a \code{DataFrame} describing the samples. If present,
the sample names following FORMAT in the VCF file become the row names.
}
\item{exptData: }{
Header information present in the file is put into a \code{SimpleList}
in \code{exptData}.
}
}
See references for complete details of the VCF file format.
}
\references{
\url{http://vcftools.sourceforge.net/specs.html} outlines the VCF
specification.
\url{http://samtools.sourceforge.net/mpileup.shtml} contains
information on the portion of the specification implemented by
\code{bcftools}.
\url{http://samtools.sourceforge.net/} provides information on
\code{samtools}.
}
\author{
Valerie Obenchain <vobencha@fhcrc.org>
}
\seealso{
\code{\link{indexTabix}},
\code{\link{TabixFile}},
\code{\link{scanTabix}},
\code{\link{scanBcf}},
\code{\link{expand,CollapsedVCF-method}}
}
\examples{
fl <- system.file("extdata", "ex2.vcf", package="VariantAnnotation")
vcf <- readVcf(fl, "hg19")
## vcf <- readVcf(fl, c("20"="hg19")) ## 'genome' as named vector
## ---------------------------------------------------------------------
## Header and genome information
## ---------------------------------------------------------------------
vcf
## all header information
hdr <- header(vcf)
## header information for 'info' and 'fixed' fields
info(hdr)
fixed(hdr)
## ---------------------------------------------------------------------
## Accessors
## ---------------------------------------------------------------------
## fixed fields together
head(fixed(vcf), 5)
## fixed fields separately
filt(vcf)
ref(vcf)
## info data
info(hdr)
info(vcf)
info(vcf)$DP
## geno data
geno(hdr)
geno(vcf)
head(geno(vcf)$GT)
## genome
unique(genome(rowData(vcf)))
## ---------------------------------------------------------------------
## Data subsets with lightweight read* functions
## ---------------------------------------------------------------------
## Import a single 'info' or 'geno' variable
DP <- readInfo(fl, "DP")
HQ <- readGeno(fl, "HQ")
## Import GT as numeric representation
GT <- readGT(fl)
## Import GT as nucleotides
GT <- readGT(fl, nucleotides=TRUE)
## ---------------------------------------------------------------------
## Data subsets with ScanVcfParam
## ---------------------------------------------------------------------
## Subset on genome coordinates:
## 'file' must have a Tabix index
rngs <- GRanges("20", IRanges(c(14370, 1110000), c(17330, 1234600)))
names(rngs) <- c("geneA", "geneB")
param <- ScanVcfParam(which=rngs)
compressVcf <- bgzip(fl, tempfile())
idx <- indexTabix(compressVcf, "vcf")
tab <- TabixFile(compressVcf, idx)
vcf <- readVcf(tab, "hg19", param)
## When data are subset by range ('which' argument in ScanVcfParam),
## the 'paramRangeID' column provides a map back to the original
## range in 'param'.
rowData(vcf)[,"paramRangeID"]
vcfWhich(param)
## Subset on samples:
## Consult the header for the sample names.
samples(hdr)
## Specify one or more names in 'samples' in a ScanVcfParam.
param <- ScanVcfParam(samples="NA00002")
vcf <- readVcf(tab, "hg19", param)
geno(vcf)$GT
## Subset on 'fixed', 'info' or 'geno' fields:
param <- ScanVcfParam(fixed="ALT", geno=c("GT", "HQ"), info=c("NS", "AF"))
vcf_tab <- readVcf(tab, "hg19", param)
info(vcf_tab)
geno(vcf_tab)
## No ranges are specified in the 'param' so tabix file is not
## required. Instead, the uncompressed VCF can be used as 'file'.
vcf_fname <- readVcf(fl, "hg19", param)
## The header will always contain information for all variables
## in the original file reguardless of how the data were subset.
## For example, all 'geno' fields are listed in the header
geno(header(vcf_fname))
## but only 'GT' and 'HQ' are present in the VCF object.
geno(vcf_fname)
## Subset on both genome coordinates and 'info', 'geno' fields:
param <- ScanVcfParam(geno="HQ", info="AF", which=rngs)
vcf <- readVcf(tab, "hg19", param)
## When any of 'fixed', 'info' or 'geno' are omitted (i.e., no
## elements specified) all records are retrieved. Use NA to indicate
## that no records should be retrieved. This param specifies
## all 'fixed fields, the "GT" 'geno' field and none of 'info'.
ScanVcfParam(geno="GT", info=NA)
## ---------------------------------------------------------------------
## Iterate through VCF with 'yieldSize'
## ---------------------------------------------------------------------
fl <- system.file("extdata", "chr22.vcf.gz", package="VariantAnnotation")
param <- ScanVcfParam(fixed="ALT", geno=c("GT", "GL"), info=c("LDAF"))
tab <- TabixFile(fl, yieldSize=4000)
open(tab)
while (nrow(vcf <- readVcf(tab, "hg19", param=param)))
cat("vcf dim:", dim(vcf), "\n")
close(tab)
}
\keyword{manip}
|