File: predictCoding-methods.Rd

package info (click to toggle)
r-bioc-variantannotation 1.10.5-1
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 2,172 kB
  • ctags: 109
  • sloc: ansic: 1,088; sh: 4; makefile: 2
file content (204 lines) | stat: -rw-r--r-- 8,646 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
\name{predictCoding}
\alias{predictCoding}
\alias{predictCoding,Ranges,TranscriptDb,ANY,DNAStringSet-method}
\alias{predictCoding,GRanges,TranscriptDb,ANY,DNAStringSet-method}
\alias{predictCoding,CollapsedVCF,TranscriptDb,ANY,missing-method}
\alias{predictCoding,ExpandedVCF,TranscriptDb,ANY,missing-method}

\title{Predict amino acid coding changes for variants}

\description{
  Predict amino acid coding changes for variants a coding regions
}

\usage{
\S4method{predictCoding}{CollapsedVCF,TranscriptDb,ANY,missing}(query, subject, seqSource, varAllele, ..., ignore.strand=FALSE)
\S4method{predictCoding}{ExpandedVCF,TranscriptDb,ANY,missing}(query, subject, seqSource, varAllele, ..., ignore.strand=FALSE)
\S4method{predictCoding}{Ranges,TranscriptDb,ANY,DNAStringSet}(query, subject, seqSource, varAllele, ..., ignore.strand=FALSE)
\S4method{predictCoding}{GRanges,TranscriptDb,ANY,DNAStringSet}(query, subject, seqSource, varAllele, ..., ignore.strand=FALSE)
}

\arguments{
  \item{query}{A \linkS4class{VCF}, \link{Ranges} or \link{GRanges} 
    instance containing the variants to be annotated. If a \link[IRanges]{Ranges} 
    is provided it will be coerced to a \link{GRanges}. If a \linkS4class{VCF}
    is provided the \code{GRanges} from the \code{rowData} slot will be used. All 
    elementMetadata columns are ignored.

    When the \code{query} is not a \code{VCF} object a \code{varAllele} must be 
    provided. The \code{varAllele} must be a \code{DNAStringSet} the same length
    as the \code{query}. If there are multiple alternate alleles per variant
    the \code{query} must be expanded to one row per each alternate allele.
    See examples.
  }
  \item{subject}{A \link[GenomicFeatures]{TranscriptDb} object that serves 
    as the annotation. GFF files can be converted to 
    \link[GenomicFeatures]{TranscriptDb} objects with 
    \code{makeTranscriptDbFromGFF()} in the \code{GenomicFeatures} package.
  } 
  \item{seqSource}{A \code{\link[BSgenome]{BSgenome}} instance or an \link{FaFile}
    to be used for sequence extraction.
  }
  \item{varAllele}{A \link[Biostrings]{DNAStringSet} containing the variant
    (alternate) alleles. The length of \code{varAllele} must equal the length
    of \code{query}. Missing values are represented by a zero width empty 
    element. Ranges with missing \code{varAllele} values are ignored; those with 
    an \sQuote{N} character are not translated.

    When the \code{query} is a \code{VCF} object the \code{varAllele} argument 
    will be missing. This value is taken internally from the \code{VCF} with
    \code{alt(<VCF>)}.
  }
  \item{\dots}{Additional arguments passed to methods. Arguments
    \code{genetic.code} and \code{if.fuzzy.codon} are supported for the
    \code{translate} function.
  }
  \item{ignore.strand}{A \code{logical} indicating if strand should be ignored
    when performing overlaps.
  }
}

\details{
  This function returns the amino acid coding for variants that fall 
  completely `within' a coding region. The reference sequences are 
  taken from a fasta file or \link[BSgenome]{BSgenome}. The width of 
  the reference is determined from the start postion and width of the 
  range in the \code{query}. For guidance on how to represent an insertion, 
  deletion or substitution see the 1000 Genomes VCF format (references). 

  Variant alleles are taken from the \code{varAllele} when supplied.
  When the \code{query} is a \code{VCF} object the \code{varAllele} will
  be missing. This value is taken internally from the \code{VCF} with
  \code{alt(<VCF>)}. The variant allele is substituted 
  into the reference sequences and transcribed. Transcription only 
  occurs if the substitution, insertion or deletion results in a new sequence 
  length divisible by 3.

  When the \code{query} is an unstranded (*) \code{GRanges} \code{predictCoding} 
  will attempt to find overlaps on both the positive and negative strands of the
  \code{subject}. When the subject hit is on the negative strand the return 
  \code{varAllele} is reverse complemented. The strand of the returned 
  \code{GRanges} represents the strand of the subject hit.
} 

\value{
  A \link[GenomicRanges]{GRanges} with a row for each variant-transcript 
  match. The result includes only variants that fell within coding regions.
  The strand of the output \code{GRanges} represents the strand of the 
  \code{subject} hit.

  At a minimum, the \code{elementMetadata} columns include,
  \describe{
    \item{\code{varAllele}}{
      Variant allele. This value is reverse complemented for an unstranded 
      \code{query} that overlaps a \code{subject} on the negative strand.
    }
    \item{\code{QUERYID}}{
      Map back to the row in the original query
    }
    \item{\code{TXID}}{
      Internal transcript id from the annotation
    }
    \item{\code{CDSID}}{
      Internal coding region id from the annotation
    }
    \item{\code{GENEID}}{
      Internal gene id from the annotation
    }
    \item{\code{CDSLOC}}{
      Variant location in coding region-based coordinates. This position is 
      relative to the start of the coding (cds) region defined in the 
      \code{subject} annotation.
    }
    \item{\code{PROTEINLOC}}{
      Variant codon triplet location in coding region-based coordinates.
      This position is relative to the start of the coding (cds) region
      defined in the \code{subject} annotation. 
    }
    \item{\code{CONSEQUENCE}}{
      Possible values are `synonymous', `nonsynonymous', `frameshift', 
      `nonsense' and `not translated'. Variant sequences are translated only 
      when the codon sequence is a multiple of 3. The value will be `frameshift' 
      when a sequence is of incompatible length and it will be `not translated' 
      when the \code{varAllele} is missing or there is an \sQuote{N} in the 
      sequence. `nonsense' is used for premature stop codons.
    }
    \item{\code{REFCODON}}{
      The reference codon sequence. This range is typically greater
      than the width of the range in the \code{GRanges} because it includes 
      all codons involved in the sequence modification. If the reference 
      sequence is of width 2 but the alternate allele is of width 4 then at 
      least two codons must be included in the \code{REFSEQ}.
    }
    \item{\code{VARCODON}}{
      This sequence is the result of inserting, deleting or replacing the 
      position(s) in the reference sequence alternate allele. If the result 
      of this modifiction is not a multiple of 3 no translation is performed 
      and the \code{VARAA} value will be missing.
    }
    \item{\code{REEFAA}}{
      The reference amino acid column contains the translated \code{REFSEQ}.
    }
    \item{\code{VARAA}}{
      The variant amino acid column contains the translated \code{VARSEQ}. When
      translation is not possible this value is missing.
    }
  }
}

\references{
  \url{http://vcftools.sourceforge.net/specs.html}
}

\author{Michael Lawrence and Valerie Obenchain <vobencha@fhcrc.org>}

\seealso{
  \link{readVcf},
  \link{locateVariants},
  \link{refLocsToLocalLocs}
  \link{getTranscriptSeqs}
}

\examples{
  library(BSgenome.Hsapiens.UCSC.hg19)
  library(TxDb.Hsapiens.UCSC.hg19.knownGene)
  txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene 

  ## ----------------------------
  ## VCF object as query 
  ## ----------------------------
  ## Read variants from a VCF file 
  fl <- system.file("extdata", "chr22.vcf.gz", package="VariantAnnotation")
  vcf <- readVcf(fl, "hg19")

  ## Rename seqlevels in the VCF object to match those in the TxDb.
  vcf <- renameSeqlevels(vcf, "chr22")
  ## Confirm common seqlevels
  intersect(seqlevels(vcf), seqlevels(txdb))
 
  ## When 'query' is a VCF object the varAllele argument is missing.
  coding1 <- predictCoding(vcf, txdb, Hsapiens)
  head(coding1, 3)

  ## ----------------------------
  ## GRanges object as query 
  ## ----------------------------
  ## Alternatively, a GRanges can be the 'query' to predictCoding().
  ## The seqlevels were previously adjusted in the VCF object so the GRanges
  ## extracted from rowData() has the correct seqlevels.
  rd <- rowData(vcf)
 
  ## The GRanges must be expanded to have one row per alternate allele. 
  ## Variants 1, 2 and 10 have two alternate alleles.
  altallele <- alt(vcf)
  eltlen <- elementLengths(altallele)
  rd_exp <- rep(rd, eltlen)
 
  ## Call predictCoding() with the expanded GRanges as the 'query'
  ## and the unlisted alternate allele as the 'varAllele'.
  coding2 <- predictCoding(rd_exp, txdb, Hsapiens, unlist(altallele)) 

  identical(coding1, coding2)
}

\keyword{methods}