File: mut_strand.R

package info (click to toggle)
r-bioc-mutationalpatterns 3.0.1%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 5,908 kB
  • sloc: sh: 8; makefile: 2
file content (203 lines) | stat: -rw-r--r-- 8,145 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
#' Find strand of mutations
#'
#' @details
#' For transcription mode:
#' Definitions of gene bodies with strand (+/-) information should be defined
#' in a GRanges object.
#'
#' For the base substitutions that are within gene bodies, it is determined whether
#' the "C" or "T" base is on the same strand as the gene definition. (Since
#' by convention we regard base substitutions as C>X or T>X.)
#'
#' Base substitutions on the same strand as the gene definitions are considered
#' "untranscribed", and on the opposite strand of gene bodies as "transcribed",
#' since the gene definitions report the coding or sense strand, which is
#' untranscribed.
#'
#' No strand information "-" is returned for base substitutions outside gene
#' bodies, or base substitutions that overlap with more than one gene body on
#' the same strand.
#'
#' For replication mode:
#' Replication directions of genomic ranges should be defined in GRanges object.
#' The GRanges object should have a "strand_info" metadata column,
#' which contains only two different annotations, e.g. "left" and "right", or
#' "leading" and "lagging". The genomic ranges cannot overlap, to allow only one
#' annotation per location.
#'
#' For each base substitution it is determined on which strand it is located.
#' No strand information "-" is returned for base substitutions in un-annotated
#' genomic regions.
#'
#' With the package we provide an example dataset, see example code.
#'
#'
#' @param vcf GRanges containing the VCF object
#' @param ranges GRanges object with the genomic ranges of:
#' 1. (transcription mode) the gene bodies with strand (+/-) information, or
#' 2. (replication mode) the replication strand with 'strand_info' metadata
#' @param mode "transcription" or "replication", default = "transcription"
#'
#' @return Character vector with transcriptional strand information with
#' length of vcf: "-" for positions outside gene bodies, "U" for
#' untranscribed/sense/coding strand, "T" for
#' transcribed/anti-sense/non-coding strand.
#'
#' @examples
#' ## For this example we need our variants from the VCF samples, and
#' ## a known genes dataset.  See the 'read_vcfs_as_granges()' example
#' ## for how to load the VCF samples.
#' vcfs <- readRDS(system.file("states/read_vcfs_as_granges_output.rds",
#'   package = "MutationalPatterns"
#' ))
#'
#' ## For transcription strand:
#' ## You can obtain the known genes from the UCSC hg19 dataset using
#' ## Bioconductor:
#' # source("https://bioconductor.org/biocLite.R")
#' # biocLite("TxDb.Hsapiens.UCSC.hg19.knownGene")
#' library("TxDb.Hsapiens.UCSC.hg19.knownGene")
#' genes_hg19 <- genes(TxDb.Hsapiens.UCSC.hg19.knownGene)
#'
#' mut_strand(vcfs[[1]], genes_hg19, mode = "transcription")
#'
#' ## For replication strand:
#' ## Read example bed file with replication direction annotation
#' ## Read replistrand data
#' repli_file <- system.file("extdata/ReplicationDirectionRegions.bed",
#'   package = "MutationalPatterns"
#' )
#' repli_strand <- read.table(repli_file, header = TRUE)
#' repli_strand_granges <- GRanges(
#'   seqnames = repli_strand$Chr,
#'   ranges = IRanges(
#'     start = repli_strand$Start + 1,
#'     end = repli_strand$Stop
#'   ),
#'   strand_info = repli_strand$Class
#' )
#' ## UCSC seqlevelsstyle
#' seqlevelsStyle(repli_strand_granges) <- "UCSC"
#'
#' mut_strand(vcfs[[1]], repli_strand_granges, mode = "transcription")
#' @seealso
#' \code{\link{read_vcfs_as_granges}},
#'
#' @export

mut_strand <- function(vcf, ranges, mode = "transcription") {
  # Transcription mode
  if (mode == "transcription") {
    # Reduce gene object to merge gene definitions that overlap on the same strand
    genes <- GenomicRanges::reduce(ranges)

    # Check consistency of chromosome names.
    if (!(all(GenomeInfoDb::seqlevels(vcf) %in% GenomeInfoDb::seqlevels(genes)))) {
      stop(paste(
        "Chromosome names (seqlevels) of vcf and genes Granges",
        "object do not match. Use the seqlevelsStyle() function",
        "to rename chromosome names."
      ), call. = FALSE)
    }

    # Determine overlap between vcf positions and genes
    overlap <- findOverlaps(vcf, genes)
    overlap <- as.data.frame(as.matrix(overlap))
    colnames(overlap) <- c("vcf_id", "gene_body_id")

    # Remove mutations that overlap with multiple genes and therefore cannot
    # be determined whether they are on transcribed or untranscribed strand
    # duplicated mutations.
    dup_pos <- overlap$vcf_id[duplicated(overlap$vcf_id)]

    # Index of duplicated mutations
    dup_idx <- which(overlap$vcf_id %in% dup_pos)

    # Remove all duplicated (non-unique mapping) mutations.
    if (length(dup_idx) > 0) {
      overlap <- overlap[-dup_idx, ]
    }

    # Subset of mutations in genes
    vcf_overlap <- vcf[overlap$vcf_id]

    # Find reference allele of mutations (and strand of reference genome is
    # reported in vcf file).
    ref <- .get_ref(vcf_overlap)

    # Find the strand of C or T (since we regard base substitutions as
    # C>X or T>X) which mutations have ref allele C or T.
    i <- which(ref == "C" | ref == "T")

    # Store mutation strand info in vector.
    strand_muts <- rep(0, nrow(overlap))
    strand_muts[i] <- "+"
    strand_muts[-i] <- "-"

    # Find strand of gene bodies of overlaps.
    strand_genebodies <- as.character(strand(genes)[overlap$gene_body_id])

    # Find if mut and gene_bodies are on the same strand.
    same_strand <- (strand_muts == strand_genebodies)

    # Subset vcf object for both untranscribed and transcribed
    # gene definition represents the untranscribed/sense/coding strand
    # if mutation is on same strand as gene, than its untranscribed.
    U_index <- which(same_strand == TRUE)

    # If mutation is on different strand than gene, then its transcribed.
    T_index <- which(same_strand == FALSE)
    strand <- rep(0, nrow(overlap))
    strand[U_index] <- "untranscribed"
    strand[T_index] <- "transcribed"

    # Make vector with all positions in input vcf for positions that do
    # not overlap with gene bodies, report "-".
    strand2 <- rep("-", length(vcf))
    strand2[overlap$vcf_id] <- strand
    # make factor
    strand2 <- factor(strand2, levels = c("untranscribed", "transcribed", "-"))
  }

  # Replication mode
  if (mode == "replication") {
    if (is.null(ranges$strand_info)) {
      stop("GRanges object with genomic regions does not contain 'strand_info' factor as metadata.")
    }

    # Manually set the levels of the factor.
    levels(ranges$strand_info) <- unique(ranges$strand_info)

    # Check that only two different annotations
    if (length(levels(ranges$strand_info)) != 2) {
      stop("GRanges object metadata: 'strand_info' factor should contain exactly two different 
           levels, such as 'left' and 'right'.")
    }
    overlap <- findOverlaps(vcf, ranges)
    overlap <- as.data.frame(as.matrix(overlap))
    colnames(overlap) <- c("vcf_id", "region_id")
    dup_pos <- overlap$vcf_id[duplicated(overlap$vcf_id)]
    dup_idx <- which(overlap$vcf_id %in% dup_pos)
    if (length(dup_idx) > 0) {
      overlap <- overlap[-dup_idx, ]
      warning("Some variants overlap with multiple genomic regions in the GRanges object.\n
              These variants are assigned '-', as the strand cannot be determined.\n
              To avoid this, make sure no genomic regions are overlapping in your GRanges object.")
    }

    # Combine the strand info from the mutation and the ranges.
    strand_repli <- ranges[overlap$region_id]$strand_info
    match_f <- BiocGenerics::match(.get_ref(vcf)[overlap$vcf_id], c("C", "T"), nomatch = 0L) > 0L
    strand_mut <- ifelse(match_f, "+", "-")
    strand_levels <- levels(ranges$strand_info)
    strand_f <- (strand_mut == "+" & strand_repli == strand_levels[1]) | (strand_mut == "-" & strand_repli == strand_levels[2])
    strand <- ifelse(strand_f, strand_levels[1], strand_levels[2])

    # Fill in the strand info where possible
    strand2 <- rep("-", length(vcf))
    strand2[overlap$vcf_id] <- strand
    levels <- c(levels(ranges$strand_info), "-")
    strand2 <- factor(strand2, levels = levels)
  }
  return(strand2)
}