1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97
|
### R code from vignette source 'filterVcf.Rnw'
###################################################
### code chunk number 1: setup
###################################################
options(width=80, prompt = " ", continue = " ")
###################################################
### code chunk number 2: prefilters
###################################################
isGermlinePrefilter <- function(x) {
grepl("Germline", x, fixed=TRUE)
}
notInDbsnpPrefilter <- function(x) {
!(grepl("dbsnp", x, fixed=TRUE))
}
###################################################
### code chunk number 3: filters
###################################################
isSnp <- function(x) {
refSnp <- nchar(ref(x)) == 1L
a <- alt(x)
altSnp <- elementLengths(a) == 1L
ai <- unlist(a[altSnp]) # all length 1, so unlisting is 1:1 map
altSnp[altSnp] <- nchar(ai) == 1L & (ai %in% c("A", "C", "G", "T"))
refSnp & altSnp
}
allelicDepth <- function(x) {
## ratio of AD of the "alternate allele" for the tumor sample
## OR "reference allele" for normal samples to total reads for
## the sample should be greater than some threshold (say 0.1,
## that is: at least 10% of the sample should have the allele
## of interest)
ad <- geno(x)$AD
tumorPct <- ad[,1,2,drop=FALSE] / rowSums(ad[,1,,drop=FALSE])
normPct <- ad[,2,1, drop=FALSE] / rowSums(ad[,2,,drop=FALSE])
test <- (tumorPct > 0.1) | (normPct > 0.1)
!is.na(test) & test
}
###################################################
### code chunk number 4: createFilterRules
###################################################
library(VariantAnnotation)
prefilters <- FilterRules(list(germline=isGermlinePrefilter,
dbsnp=notInDbsnpPrefilter))
filters <- FilterRules(list(isSnp=isSnp, AD=allelicDepth))
###################################################
### code chunk number 5: createFilteredFile
###################################################
file.gz <- system.file("extdata", "chr7-sub.vcf.gz",
package="VariantAnnotation")
file.gz.tbi <- system.file("extdata", "chr7-sub.vcf.gz.tbi",
package="VariantAnnotation")
destination.file <- tempfile()
tabix.file <- TabixFile(file.gz, yieldSize=10000)
filterVcf(tabix.file, "hg19", destination.file,
prefilters=prefilters, filters=filters, verbose=TRUE)
###################################################
### code chunk number 6: mcf7regulatoryRegions (eval = FALSE)
###################################################
## library(AnnotationHub)
## hub <- AnnotationHub()
## # paste operation is only for display purposes on a narrow page
## ctcf.regs <- paste("goldenpath.hg19.encodeDCC.wgEncodeUwTfbs",
## "wgEncodeUwTfbsMcf7CtcfStdPkRep1.narrowPeak_0.0.1.RData",
## sep=".")
## mcf7.gr <- hub[[ctcf.regs]]
###################################################
### code chunk number 7: findOverlaps (eval = FALSE)
###################################################
## vcf <- readVcf(destination.file, "hg19")
## seqlevels(vcf) <- paste("chr", seqlevels(vcf), sep="")
## ov.mcf7 <- findOverlaps(vcf, mcf7.gr)
###################################################
### code chunk number 8: locateVariant (eval = FALSE)
###################################################
## library(TxDb.Hsapiens.UCSC.hg19.knownGene)
## txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
## print(locateVariants(vcf[6,], txdb, AllVariants()))
|