1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
|
\name{filterVcf}
\alias{filterVcf}
\alias{filterVcf,character-method}
\alias{filterVcf,TabixFile-method}
\title{Filter VCF files}
\description{Filter Variant Call Format (VCF) files from one file to another}
\usage{
\S4method{filterVcf}{character}(file, genome, destination, ..., verbose = TRUE,
index = FALSE, prefilters = FilterRules(), filters = FilterRules(),
param = ScanVcfParam())
\S4method{filterVcf}{TabixFile}(file, genome, destination, ..., verbose = TRUE,
index = FALSE, prefilters = FilterRules(), filters = FilterRules(),
param = ScanVcfParam())
}
\arguments{
\item{file}{A \code{character(1)} file path or \code{\link{TabixFile}}
specifying the VCF file to be filtered.}
\item{genome}{A \code{character(1)} identifier}
\item{destination}{A \code{character(1)} path to the location where
the filtered VCF file will be written.}
\item{...}{Additional arguments, possibly used by future methods.}
\item{verbose}{A \code{logical(1)} indicating whether progress
messages should be printed.}
\item{index}{A \code{logical(1)} indicating whether the filtered file
should be compressed and indexed (using \code{\link{bgzip}} and
\code{indexTabix}).}
\item{prefilters}{A \code{\link{FilterRules}} instance contains rules for
filtering un-parsed lines of the VCF file.}
\item{filters}{A \code{\link{FilterRules}} instance contains rules for
filtering fully parsed VCF objects.}
\item{param}{A \code{\link{ScanVcfParam}} instance restricting input
of particular \code{info} or \code{geno} fields, or genomic
locations. Applicable when applying a \code{filter} only.
Prefiltering involves a grep of unparsed lines in the file;
indexing is not used.}
}
\details{
This function transfers content of one VCF file to another, removing
records that fail to satisfy \code{prefilters} and
\code{filters}. Filtering is done in a memory efficient manner,
iterating over the input VCF file in chunks of default size 100,000
(when invoked with \code{character(1)} for \code{file}) or as
specified by the \code{yieldSize} argument of \code{TabixFile} (when
invoked with \code{TabixFile}).
There are up to two passes. In the first pass, unparsed lines are
passed to \code{prefilters} for filtering, e.g., searching for a fixed
character string. In the second pass lines successfully passing
\code{prefilters} are parsed into \code{VCF} instances and made
available for further filtering. One or both of \code{prefilter} and
\code{filter} can be present.
}
\value{The destination file path as a \code{character(1)}.}
\author{
Martin Morgan \url{mailto:mtmorgan@fhcrc.org} and Paul Shannon
\url{mailto:pshannon@fhcrc.org}.
}
\seealso{
\code{\link{readVcf}}, \code{\link{writeVcf}}.
}
\examples{
fl <- system.file(package="VariantAnnotation", "extdata",
"chr22.vcf.gz")
destination <- tempfile()
pre <- FilterRules(list(isLowCoverageExomeSnp = function(x) {
grepl("LOWCOV,EXOME", x, fixed=TRUE)
}))
filt <- FilterRules(list(isSNP = function(x) info(x)$VT == "SNP"))
filtered <- filterVcf(fl, "hg19", destination, prefilters=pre, filters=filt)
vcf <- readVcf(filtered, "hg19")
}
\keyword{manip}
|