File: summarizeVariants-methods.Rd

package info (click to toggle)
r-bioc-variantannotation 1.28.10-1
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 2,608 kB
  • sloc: ansic: 1,370; sh: 4; makefile: 2
file content (162 lines) | stat: -rw-r--r-- 6,348 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
\name{summarizeVariants}

\alias{summarizeVariants}
\alias{summarizeVariants,TxDb,VCF,CodingVariants-method}
\alias{summarizeVariants,TxDb,VCF,FiveUTRVariants-method}
\alias{summarizeVariants,TxDb,VCF,ThreeUTRVariants-method}
\alias{summarizeVariants,TxDb,VCF,SpliceSiteVariants-method}
\alias{summarizeVariants,TxDb,VCF,IntronVariants-method}
\alias{summarizeVariants,TxDb,VCF,PromoterVariants-method}
\alias{summarizeVariants,GRangesList,VCF,VariantType-method}
\alias{summarizeVariants,GRangesList,VCF,function-method}

\title{Summarize variants by sample}

\description{
  Variants in a VCF file are overlapped with an annotation 
  region and summarized by sample. Genotype information in
  the VCF is used to determine which samples express 
  each variant.
}

\usage{
\S4method{summarizeVariants}{TxDb,VCF,CodingVariants}(query, subject, mode, ...)
\S4method{summarizeVariants}{TxDb,VCF,FiveUTRVariants}(query, subject, mode, ...)
\S4method{summarizeVariants}{TxDb,VCF,ThreeUTRVariants}(query, subject, mode, ...)
\S4method{summarizeVariants}{TxDb,VCF,SpliceSiteVariants}(query, subject, mode, ...)
\S4method{summarizeVariants}{TxDb,VCF,IntronVariants}(query, subject, mode, ...)
\S4method{summarizeVariants}{TxDb,VCF,PromoterVariants}(query, subject, mode, ...)
\S4method{summarizeVariants}{GRangesList,VCF,VariantType}(query, subject, mode, ...)
\S4method{summarizeVariants}{GRangesList,VCF,function}(query, subject, mode, ...)
}

\arguments{
  \item{query}{A \link[GenomicFeatures]{TxDb} or \code{GRangesList}
    object that serves as the annotation. GFF files can be converted to
    \link[GenomicFeatures]{TxDb} objects with 
    \code{makeTxDbFromGFF()} in the \code{GenomicFeatures} package.
  }
  \item{subject}{A \linkS4class{VCF} object containing the variants.
  }
  \item{mode}{\code{mode} can be a \code{VariantType} class or the
    name of a function. 

    When \code{mode} is a \code{VariantType} class, counting is done with
    \code{locateVariants} and counts are summarized transcript-by-sample. 
    Supported \code{VariantType} classes include 
    \code{CodingVariants}, \code{IntronVariants}, \code{FiveUTRVariants}, 
    \code{ThreeUTRVariants}, \code{SpliceSiteVariants} or \code{PromoterVariants}. 
    \code{AllVariants()} and \code{IntergenicVariants} are not supported. See 
    ?\code{locateVariants} for more detail on the variant classes.

    \code{mode} can also be the name of any counting function that outputs
    a \code{Hits} object. Variants will be summarized by the length of the
    \code{GRangesList} annotation (i.e., 'length-of-GRangesList'-by-sample).
  }
  \item{\dots}{Additional arguments passed to methods such as
    \describe{
      \item{ignore.strand}{A \code{logical} indicating if strand should be
        igored when performing overlaps.
      }
    }
  }
}

\details{
  \code{summarizeVariants} uses the genotype information in a VCF
  file to determine which samples are positive for each variant.
  Variants are overlapped with the annotation and the counts
  are summarized annotation-by-sample. If the annotation is a 
  \code{GRangesList} of transcripts, the count matrix will 
  be transcripts-by-sample. If the \code{GRangesList} is genes,
  the count matrix will be gene-by-sample.

  \itemize{
    \item{Counting with locateVariants() :}{

      Variant counts are always summarized transcript-by-sample.
      When \code{query} is a \code{GRangesList}, it must be compatible 
      with the \code{VariantType}-class given as the \code{mode} argument. 
      The list below specifies the appropriate \code{GRangesList} for each
      \code{mode}.
      \describe{
        \item{CodingVariants :}{coding (CDS) by transcript}
        \item{IntronVariants :}{introns by transcript}
        \item{FiveUTRVariants :}{five prime UTR by transcript}
        \item{ThreeUTRVariants :}{three prime UTR by transcript}
        \item{SpliceSiteVariants :}{introns by transcript}
        \item{PromoterVariants :}{list of transcripts}
      }

      When \code{query} is a \code{TxDb}, the appropriate 
      region-by-transcript \code{GRangesList} listed above is extracted 
      internally and used as the annotation. 

    }
 
    \item{Counting with a user-supplied function :}{

      \code{subject} must be a \code{GRangesList} and \code{mode} must
      be the name of a function. The count function must take 'query'
      and 'subject' arguments and return a \code{Hits} object. Counts are 
      summarized by the outer list elements of the \code{GRangesList}.
    }
  }
}

\value{
  A \code{RangedSummarizedExperiment} object with count summaries in the 
  \code{assays} slot. The \code{rowRanges} contains the annotation
  used for counting. Information in \code{colData} and \code{metadata} 
  are taken from the VCF file.
}

\author{Valerie Obenchain}

\seealso{
  \code{\link{readVcf}},
  \code{\link{predictCoding}},
  \code{\link{locateVariants}}
}

\examples{
  library(TxDb.Hsapiens.UCSC.hg19.knownGene)
  txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene 

  ## Read variants from VCF.
  fl <- system.file("extdata", "chr22.vcf.gz", package="VariantAnnotation")
  vcf <- readVcf(fl, "hg19")
  ## Rename seqlevels to match TxDb; confirm the match.
  seqlevels(vcf) <- paste0("chr", seqlevels(vcf)) 
  intersect(seqlevels(vcf), seqlevels(txdb))

  ## ----------------------------------------
  ## Counting with locateVariants()
  ## ----------------------------------------
  ## TxDb as the 'query'
  coding1 <- summarizeVariants(txdb, vcf, CodingVariants())
  colSums(assays(coding1)$counts)

  ## GRangesList as the 'query'
  cdsbytx <- cdsBy(txdb, "tx")
  coding2 <- summarizeVariants(cdsbytx, vcf, CodingVariants()) 

  stopifnot(identical(assays(coding1)$counts, assays(coding2)$counts))

  ## Promoter region variants summarized by transcript
  tx <- transcripts(txdb)
  txlst <- splitAsList(tx, seq_len(length(tx)))
  promoter <- summarizeVariants(txlst, vcf, 
                                PromoterVariants(upstream=100, downstream=10))
  colSums(assays(promoter)$counts)

  ## ----------------------------------------
  ## Counting with findOverlaps() 
  ## ----------------------------------------

  ## Summarize all variants by transcript
  allvariants <- summarizeVariants(txlst, vcf, findOverlaps)
  colSums(assays(allvariants)$counts)
}

\keyword{methods}