1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98
|
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/mut_strand.R
\name{mut_strand}
\alias{mut_strand}
\title{Find strand of mutations}
\usage{
mut_strand(vcf, ranges, mode = "transcription")
}
\arguments{
\item{vcf}{GRanges containing the VCF object}
\item{ranges}{GRanges object with the genomic ranges of:
1. (transcription mode) the gene bodies with strand (+/-) information, or
2. (replication mode) the replication strand with 'strand_info' metadata}
\item{mode}{"transcription" or "replication", default = "transcription"}
}
\value{
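Character vector with strand information with length of vcf.
In transcription mode: "-" for positions outside gene bodies, "U" for
untranscribed/sense/coding strand, "T" for
transcribed/anti-sense/non-coding strand.
In replication mode: the replication strand on which each base
substitution is located, or "-" for base substitutions in unannotated
genomic regions.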
}
\description{
Find strand of mutations
}
\details{
For transcription mode:
Definitions of gene bodies with strand (+/-) information should be defined
in a GRanges object.
For the base substitutions that are within gene bodies, it is determined whether
the "C" or "T" base is on the same strand as the gene definition. (Since
by convention we regard base substitutions as C>X or T>X.)
Base substitutions on the same strand as the gene definitions are considered
"untranscribed", and on the opposite strand of gene bodies as "transcribed",
since the gene definitions report the coding or sense strand, which is
untranscribed.
No strand information "-" is returned for base substitutions outside gene
bodies, or base substitutions that overlap with more than one gene body on
the same strand.
For replication mode:
Replication directions of genomic ranges should be defined in GRanges object.
The GRanges object should have a "strand_info" metadata column,
which contains only two different annotations, e.g. "left" and "right", or
"leading" and "lagging". The genomic ranges cannot overlap, to allow only one
annotation per location.
For each base substitution it is determined on which strand it is located.
No strand information "-" is returned for base substitutions in unannotated
genomic regions.
With the package we provide an example dataset, see example code.
}
\examples{
## For this example we need our variants from the VCF samples, and
## a known genes dataset. See the 'read_vcfs_as_granges()' example
## for how to load the VCF samples.
vcfs <- readRDS(system.file("states/read_vcfs_as_granges_output.rds",
  package = "MutationalPatterns"
))

## For transcription strand:
## You can obtain the known genes from the UCSC hg19 dataset using
## Bioconductor:
# if (!requireNamespace("BiocManager", quietly = TRUE)) {
#   install.packages("BiocManager")
# }
# BiocManager::install("TxDb.Hsapiens.UCSC.hg19.knownGene")
library("TxDb.Hsapiens.UCSC.hg19.knownGene")
genes_hg19 <- genes(TxDb.Hsapiens.UCSC.hg19.knownGene)

## Strand of each mutation relative to the gene bodies.
mut_strand(vcfs[[1]], genes_hg19, mode = "transcription")

## For replication strand:
## Read the example bed file with replication direction annotation.
repli_file <- system.file("extdata/ReplicationDirectionRegions.bed",
  package = "MutationalPatterns"
)
repli_strand <- read.table(repli_file, header = TRUE)

## Store the annotation in the 'strand_info' metadata column, which is
## what replication mode expects.
## NOTE(review): the '+ 1' on Start presumably converts 0-based bed
## start coordinates to 1-based GRanges coordinates -- confirm.
repli_strand_granges <- GRanges(
  seqnames = repli_strand$Chr,
  ranges = IRanges(
    start = repli_strand$Start + 1,
    end = repli_strand$Stop
  ),
  strand_info = repli_strand$Class
)

## UCSC seqlevelsstyle
seqlevelsStyle(repli_strand_granges) <- "UCSC"

## Replication strand ranges must be used with mode = "replication".
mut_strand(vcfs[[1]], repli_strand_granges, mode = "replication")
}
\seealso{
\code{\link{read_vcfs_as_granges}}
}
|