1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203
|
#' Find strand of mutations
#'
#' @details
#' For transcription mode:
#' Definitions of gene bodies with strand (+/-) information should be defined
#' in a GRanges object.
#'
#' For the base substitutions that are within gene bodies, it is determined whether
#' the "C" or "T" base is on the same strand as the gene definition. (Since
#' by convention we regard base substitutions as C>X or T>X.)
#'
#' Base substitutions on the same strand as the gene definitions are considered
#' "untranscribed", and on the opposite strand of gene bodies as "transcribed",
#' since the gene definitions report the coding or sense strand, which is
#' untranscribed.
#'
#' No strand information "-" is returned for base substitutions outside gene
#' bodies, or base substitutions that overlap with more than one gene body on
#' the same strand.
#'
#' For replication mode:
#' Replication directions of genomic ranges should be defined in GRanges object.
#' The GRanges object should have a "strand_info" metadata column,
#' which contains only two different annotations, e.g. "left" and "right", or
#' "leading" and "lagging". The genomic ranges cannot overlap, to allow only one
#' annotation per location.
#'
#' For each base substitution it is determined on which strand it is located.
#' No strand information "-" is returned for base substitutions in un-annotated
#' genomic regions.
#'
#' With the package we provide an example dataset, see example code.
#'
#'
#' @param vcf GRanges containing the VCF object
#' @param ranges GRanges object with the genomic ranges of:
#' 1. (transcription mode) the gene bodies with strand (+/-) information, or
#' 2. (replication mode) the replication strand with 'strand_info' metadata
#' @param mode "transcription" or "replication", default = "transcription"
#'
#' @return Character vector with transcriptional strand information with
#' length of vcf: "-" for positions outside gene bodies, "U" for
#' untranscribed/sense/coding strand, "T" for
#' transcribed/anti-sense/non-coding strand.
#'
#' @examples
#' ## For this example we need our variants from the VCF samples, and
#' ## a known genes dataset. See the 'read_vcfs_as_granges()' example
#' ## for how to load the VCF samples.
#' vcfs <- readRDS(system.file("states/read_vcfs_as_granges_output.rds",
#' package = "MutationalPatterns"
#' ))
#'
#' ## For transcription strand:
#' ## You can obtain the known genes from the UCSC hg19 dataset using
#' ## Bioconductor:
#' # source("https://bioconductor.org/biocLite.R")
#' # biocLite("TxDb.Hsapiens.UCSC.hg19.knownGene")
#' library("TxDb.Hsapiens.UCSC.hg19.knownGene")
#' genes_hg19 <- genes(TxDb.Hsapiens.UCSC.hg19.knownGene)
#'
#' mut_strand(vcfs[[1]], genes_hg19, mode = "transcription")
#'
#' ## For replication strand:
#' ## Read example bed file with replication direction annotation
#' ## Read replistrand data
#' repli_file <- system.file("extdata/ReplicationDirectionRegions.bed",
#' package = "MutationalPatterns"
#' )
#' repli_strand <- read.table(repli_file, header = TRUE)
#' repli_strand_granges <- GRanges(
#' seqnames = repli_strand$Chr,
#' ranges = IRanges(
#' start = repli_strand$Start + 1,
#' end = repli_strand$Stop
#' ),
#' strand_info = repli_strand$Class
#' )
#' ## UCSC seqlevelsstyle
#' seqlevelsStyle(repli_strand_granges) <- "UCSC"
#'
#' mut_strand(vcfs[[1]], repli_strand_granges, mode = "transcription")
#' @seealso
#' \code{\link{read_vcfs_as_granges}},
#'
#' @export
mut_strand <- function(vcf, ranges, mode = "transcription") {
# Transcription mode
if (mode == "transcription") {
# Reduce gene object to merge gene definitions that overlap on the same strand
genes <- GenomicRanges::reduce(ranges)
# Check consistency of chromosome names.
if (!(all(GenomeInfoDb::seqlevels(vcf) %in% GenomeInfoDb::seqlevels(genes)))) {
stop(paste(
"Chromosome names (seqlevels) of vcf and genes Granges",
"object do not match. Use the seqlevelsStyle() function",
"to rename chromosome names."
), call. = FALSE)
}
# Determine overlap between vcf positions and genes
overlap <- findOverlaps(vcf, genes)
overlap <- as.data.frame(as.matrix(overlap))
colnames(overlap) <- c("vcf_id", "gene_body_id")
# Remove mutations that overlap with multiple genes and therefore cannot
# be determined whether they are on transcribed or untranscribed strand
# duplicated mutations.
dup_pos <- overlap$vcf_id[duplicated(overlap$vcf_id)]
# Index of duplicated mutations
dup_idx <- which(overlap$vcf_id %in% dup_pos)
# Remove all duplicated (non-unique mapping) mutations.
if (length(dup_idx) > 0) {
overlap <- overlap[-dup_idx, ]
}
# Subset of mutations in genes
vcf_overlap <- vcf[overlap$vcf_id]
# Find reference allele of mutations (and strand of reference genome is
# reported in vcf file).
ref <- .get_ref(vcf_overlap)
# Find the strand of C or T (since we regard base substitutions as
# C>X or T>X) which mutations have ref allele C or T.
i <- which(ref == "C" | ref == "T")
# Store mutation strand info in vector.
strand_muts <- rep(0, nrow(overlap))
strand_muts[i] <- "+"
strand_muts[-i] <- "-"
# Find strand of gene bodies of overlaps.
strand_genebodies <- as.character(strand(genes)[overlap$gene_body_id])
# Find if mut and gene_bodies are on the same strand.
same_strand <- (strand_muts == strand_genebodies)
# Subset vcf object for both untranscribed and transcribed
# gene definition represents the untranscribed/sense/coding strand
# if mutation is on same strand as gene, than its untranscribed.
U_index <- which(same_strand == TRUE)
# If mutation is on different strand than gene, then its transcribed.
T_index <- which(same_strand == FALSE)
strand <- rep(0, nrow(overlap))
strand[U_index] <- "untranscribed"
strand[T_index] <- "transcribed"
# Make vector with all positions in input vcf for positions that do
# not overlap with gene bodies, report "-".
strand2 <- rep("-", length(vcf))
strand2[overlap$vcf_id] <- strand
# make factor
strand2 <- factor(strand2, levels = c("untranscribed", "transcribed", "-"))
}
# Replication mode
if (mode == "replication") {
if (is.null(ranges$strand_info)) {
stop("GRanges object with genomic regions does not contain 'strand_info' factor as metadata.")
}
# Manually set the levels of the factor.
levels(ranges$strand_info) <- unique(ranges$strand_info)
# Check that only two different annotations
if (length(levels(ranges$strand_info)) != 2) {
stop("GRanges object metadata: 'strand_info' factor should contain exactly two different
levels, such as 'left' and 'right'.")
}
overlap <- findOverlaps(vcf, ranges)
overlap <- as.data.frame(as.matrix(overlap))
colnames(overlap) <- c("vcf_id", "region_id")
dup_pos <- overlap$vcf_id[duplicated(overlap$vcf_id)]
dup_idx <- which(overlap$vcf_id %in% dup_pos)
if (length(dup_idx) > 0) {
overlap <- overlap[-dup_idx, ]
warning("Some variants overlap with multiple genomic regions in the GRanges object.\n
These variants are assigned '-', as the strand cannot be determined.\n
To avoid this, make sure no genomic regions are overlapping in your GRanges object.")
}
# Combine the strand info from the mutation and the ranges.
strand_repli <- ranges[overlap$region_id]$strand_info
match_f <- BiocGenerics::match(.get_ref(vcf)[overlap$vcf_id], c("C", "T"), nomatch = 0L) > 0L
strand_mut <- ifelse(match_f, "+", "-")
strand_levels <- levels(ranges$strand_info)
strand_f <- (strand_mut == "+" & strand_repli == strand_levels[1]) | (strand_mut == "-" & strand_repli == strand_levels[2])
strand <- ifelse(strand_f, strand_levels[1], strand_levels[2])
# Fill in the strand info where possible
strand2 <- rep("-", length(vcf))
strand2[overlap$vcf_id] <- strand
levels <- c(levels(ranges$strand_info), "-")
strand2 <- factor(strand2, levels = levels)
}
return(strand2)
}
|