File: scanVcf-methods.Rd

package info (click to toggle)
r-bioc-variantannotation 1.20.2-1
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 2,492 kB
  • ctags: 114
  • sloc: ansic: 1,370; sh: 4; makefile: 2
file content (154 lines) | stat: -rw-r--r-- 5,172 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
\name{scanVcf}
\Rdversion{1.1}

\alias{scanVcfHeader}
\alias{scanVcfHeader,character-method}
\alias{scanVcf}
\alias{scanVcf,character,ScanVcfParam-method}
\alias{scanVcf,character,missing-method}
\alias{scanVcf,connection,missing-method}

\alias{scanVcfHeader,TabixFile-method}
\alias{scanVcf,TabixFile,GRanges-method}
\alias{scanVcf,TabixFile,RangesList-method}
\alias{scanVcf,TabixFile,ScanVcfParam-method}
\alias{scanVcf,TabixFile,missing-method}

\title{
  Import VCF files
}

\description{
  Import Variant Call Format (VCF) files in text or binary format
}

\usage{
scanVcfHeader(file, ...)
\S4method{scanVcfHeader}{character}(file, ...)

scanVcf(file, ..., param)
\S4method{scanVcf}{character,ScanVcfParam}(file, ..., param)
\S4method{scanVcf}{character,missing}(file, ..., param)
\S4method{scanVcf}{connection,missing}(file, ..., param)

\S4method{scanVcfHeader}{TabixFile}(file, ...)
\S4method{scanVcf}{TabixFile,missing}(file, ..., param)
\S4method{scanVcf}{TabixFile,ScanVcfParam}(file, ..., param)
\S4method{scanVcf}{TabixFile,GRanges}(file, ..., param)
\S4method{scanVcf}{TabixFile,RangesList}(file, ..., param)
}

\arguments{
  \item{file}{For \code{scanVcf} and \code{scanVcfHeader}, the character() 
    file name, \code{\link{TabixFile}}, or class \code{connection} 
    (\code{file()} or \code{bgzip()}) of the \sQuote{VCF} file to be 
    processed.
  }
  \item{param}{A instance of \code{\linkS4class{ScanVcfParam}} influencing 
    which records are parsed and the \sQuote{INFO} and \sQuote{GENO} information
    returned.
  }
  \item{...}{Additional arguments for methods
  }
}

\details{
  The argument \code{param} allows portions of the file to be input, but 
  requires that the file be bgzip'd and indexed as a 
  \code{\linkS4class{TabixFile}}.

  \code{scanVcf} with \code{param="missing"} and \code{file="character"}
  or \code{file="connection"} scan the entire file. With
  \code{file="connection"}, an argument \code{n} indicates the number of
  lines of the VCF file to input; a connection open at the beginning of
  the call is open and incremented by \code{n} lines at the end of the
  call, providing a convenient way to stream through large VCF files.

  The INFO field of the scanned VCF file is returned as a single
  \sQuote{packed} vector, as in the VCF file. The GENO field is a list of 
  matrices, each matrix corresponds to a field as defined in the FORMAT 
  field of the VCF header. Each matrix has as many rows as
  scanned in the VCF file, and as many columns as there are samples. As
  with the INFO field, the elements of the matrix are
  \sQuote{packed}. The reason that INFO and GENO are returned packed is
  to facilitate manipulation, e.g., selecting particular rows or
  samples in a consistent manner across elements.
}

\value{
  \code{scanVcfHeader} returns a \code{VCFHeader} object with
  header information parsed into five categories, \code{samples}, 
  \code{meta}, \code{fixed}, \code{info} and \code{geno}. Each
  can be accessed with a `getter' of the same name
  (e.g., info(<VCFHeader>)). If the file header has multiple rows 
  with the same name (e.g., 'source') the row names of the DataFrame
  are made unique in the usual way, 'source', 'source.1' etc. 

  \code{scanVcf} returns a list, with one element per range. Each list
  has 7 elements, obtained from the columns of the VCF specification:
    \describe{
      \item{rowRanges}{
        \code{GRanges} instance derived from \code{CHROM}, \code{POS}, 
        \code{ID}, and the width of \code{REF}
      }
      \item{REF}{
        reference allele
      }
      \item{ALT}{
        alternate allele
      }
      \item{QUAL}{
        phred-scaled quality score for the assertion made in ALT
      }
      \item{FILTER}{
        indicator of whether or not the position passed all filters applied
      }
      \item{INFO}{
        additional information
      }
      \item{GENO}{
        genotype information immediately following the FORMAT field in the VCF
      }
    }
  The \code{GENO} element is itself a list, with elements corresponding
  to those defined in the VCF file header. For \code{scanVcf}, elements
  of GENO are returned as a matrix of records x samples; if the
  description of the element in the file header indicated multiplicity
  other than 1 (e.g., variable number for \dQuote{A}, \dQuote{G}, or
  \dQuote{.}), then each entry in the matrix is a character string with
  sub-entries comma-delimited.
}

\references{
  \url{http://vcftools.sourceforge.net/specs.html} outlines the VCF
  specification.

  \url{http://samtools.sourceforge.net/mpileup.shtml} contains
  information on the portion of the specification implemented by
  \code{bcftools}.

  \url{http://samtools.sourceforge.net/} provides information on
  \code{samtools}.
}

\seealso{
  \code{\link{readVcf}} 
  \code{\link{BcfFile}} 
  \code{\link{TabixFile}}
}

\author{
  Martin Morgan and Valerie Obenchain>
}

\examples{
  fl <- system.file("extdata", "ex2.vcf", package="VariantAnnotation")
  scanVcfHeader(fl)
  vcf <- scanVcf(fl)
  ## value: list-of-lists
  str(vcf)
  names(vcf[[1]][["GENO"]])
  vcf[[1]][["GENO"]][["GT"]]
}

\keyword{ manip }