File: plot_lesion_segregation.R

package info (click to toggle)
r-bioc-mutationalpatterns 3.16.0%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 5,360 kB
  • sloc: sh: 8; makefile: 2
file content (273 lines) | stat: -rw-r--r-- 10,080 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
#' Plot the strands of variants to show lesion segregation
#'
#' The strands of variants in a GRanges object is plotted.
#' This way the presence of any lesion segregation is visualized.
#' The function can plot either a single or multiple samples.
#' Per chromosome, the ratio of the mutations on the chromosomal strands is
#' visualised by a line. The position of this line is calculated as the mean of
#' the "+" and "-" strand, where "+" equals 1 and "-" equals 0. In other words:
#' this line lies between the two strands if the mutations are equally
#' distributed between them, and approaches a strand if the majority of
#' mutations on a chromosome lie on that strand.
#' 
#' @param vcf GRanges or RGrangesList object.
#' @param per_chrom Boolean. Determines whether to create a separate plot per chromosome.
#' @param sample_name Name of the sample. Is used as the title of the plot.
#' Not very useful if you have more than one sample.
#' @param min_muts_mean Integer. The minimum of mutations, required for the mean strand 
#' of a chromosome to be calculated.
#' @param chromosomes Character vector. Determines chromosomes to be used and their order.
#' @param subsample Double between 0 and 1. Subsamples the amount of mutations to create
#' a plot with less dots. Such a plot is easier to modify in a vector program like illustrator.
#' (default: NA)
#'
#' @return ggplot2 object
#' @export
#' @seealso
#' \code{\link{calculate_lesion_segregation}}
#' @family Lesion_segregation
#' @examples
#'
#' ## See the 'read_vcfs_as_granges()' example for how we obtained the
#' ## following data:
#' grl <- readRDS(system.file("states/read_vcfs_as_granges_output.rds",
#'   package = "MutationalPatterns"
#' ))
#' 
#' ## Plot lesion segregation
#' plot_lesion_segregation(grl[1:3])
#' 
#' ## Select a single GRanges object to plot.
#' gr <- grl[[1]]
#'
#' ## Plot lesion segregation for a single sample. 
#' ## Also add a title to the plot.
#' plot_lesion_segregation(gr, sample_name = "Colon1")
#'
#' ## Plot lesion segregation per chromosome.
#' ## We here store the results in a list.
#' figure_l = plot_lesion_segregation(gr, per_chrom = TRUE, sample_name = "Colon1")
#' 
#' ## Plot specific chromosomes in a user specified order
#' plot_lesion_segregation(grl[1:3], chromosomes = c(2,3))
#' 
#' ## Subsample the mutations, so less points are plotted.
#' plot_lesion_segregation(grl[1:3], subsample = 0.2)
#' 
plot_lesion_segregation <- function(vcf, 
                                    per_chrom = FALSE, 
                                    sample_name = NA, 
                                    min_muts_mean = 10,
                                    chromosomes = NA,
                                    subsample = NA) {

  # Argument name is vcf, because the function previously only worked on a single GRanges.
  # Because the function now also works on a GRangesList, the name vcf_list is used internally.
  vcf_list <- vcf
  
  # These variables use non standard evaluation.
  # To avoid R CMD check complaints we initialize them to NULL.
  max_pos <- start_mb <- notused <- n <- y <- NULL

  # Convert list to grl if necessary
  if (inherits(vcf_list, "list")) {
    vcf_list <- GenomicRanges::GRangesList(vcf_list)
  }
  # Genome is set to NULL to ensure seqlevels can be changed.
  GenomeInfoDb::genome(vcf_list) <- NA
  GenomeInfoDb::seqlevelsStyle(vcf_list) <- "NCBI" # This takes less space when plotting
  
  
  # Subsample the amount of mutations if necessary
  if (!is.na(subsample)){
    if (inherits(vcf_list, "CompressedGRangesList")){
      vcf_list <- purrr::map(as.list(vcf_list), .subsample_granges, subsample) %>% 
        GenomicRanges::GRangesList()
    } else if (inherits(vcf_list, "GRanges")) {
      vcf_list <- .subsample_granges(vcf_list, subsample)
    } else {
      .not_gr_or_grl(vcf_list)
    }
  }

  
  # get strandedness
  if (inherits(vcf_list, "CompressedGRangesList")){
    tb <- purrr::map(as.list(vcf_list), function(vcf) {
      gr <- .get_strandedness_gr(vcf)
      tb <- .get_strandedness_tb(gr)
      return(tb)
    }) %>% 
      dplyr::bind_rows(.id = "sample")
    sample_names <- names(vcf_list)
    nr_samples <- length(vcf_list)
    max_nr_mutations <- max(S4Vectors::elementNROWS(vcf_list))
  } else if (inherits(vcf_list, "GRanges")) {
    gr <- .get_strandedness_gr(vcf_list)
    tb <- .get_strandedness_tb(gr)
    tb$sample <- "MySample"
    sample_names <- "MySample"
    nr_samples <- 1
    max_nr_mutations <- length(vcf_list)
  } else {
    .not_gr_or_grl(vcf_list)
  }
  
  # Turn the sample column into a factor.
  # This ensures the correct order of the samples in the plot.
  tb <- dplyr::mutate(tb, sample = factor(sample, levels = unique(sample)))
  
  
  ## set chrom order
  if (!.is_na(chromosomes)) {
    chromosomes <- as.character(chromosomes)
    withCallingHandlers(
      {
        GenomeInfoDb::seqlevelsStyle(chromosomes) = "NCBI"
      },
      warning = function(w) {
        if (grepl("found more than one best sequence renaming map compatible with seqname style", 
                  conditionMessage(w))) {
          invokeRestart("muffleWarning")
        }
      }
    )
    
    tb <- tb %>%
      dplyr::filter(seqnames %in% chromosomes) %>%
      dplyr::mutate(seqnames = factor(seqnames, levels = chromosomes))
  } else{
    chromosomes <- GenomeInfoDb::seqlevelsInUse(vcf_list)
    if (!length(chromosomes)){
      chromosomes <- GenomeInfoDb::seqlevels(vcf_list)
    }
    tb$seqnames <- factor(tb$seqnames, levels = chromosomes)
  }
  
  # Create limit to ensure that the entire chromosomes are plotted, 
  # even when mutations don't span the entire chromosome.
  chroms <- levels(tb$seqnames)
  nr_chroms <- length(chroms)
  chrom_lengths <- GenomeInfoDb::seqlengths(vcf_list)[chroms]
  
  chr_starts <- tibble::tibble("start_mb" = rep(1, nr_chroms),
                              "seqnames" = chroms)
  chr_ends <- tibble::tibble("start_mb" = chrom_lengths/1000000,
                             'seqnames' = chroms)
  chr_limits <- rbind(chr_starts, chr_ends)
  
  # Per sample limits
  tb_limits <- tibble::tibble("sample" = factor(rep(sample_names, each = 2 * nr_chroms), levels = levels(tb$sample)),
                          "start_mb" = rep(chr_limits$start_mb, nr_samples),
                          "seqnames" = factor(rep(chr_limits$seqnames, nr_samples), levels = chroms),
                          "y" = 0.5)

  
  # calculate mean strand per chomosome per sample
  tb_mean <- tb %>% 
    dplyr::group_by(seqnames, sample) %>% 
    dplyr::summarize(y = mean(y), n = dplyr::n(), .groups = "drop_last") %>%
    dplyr::ungroup() %>% 
    dplyr::right_join(tb_limits[,c("sample", "seqnames", "start_mb")], 
                                by = c("seqnames", "sample")) %>% 
    dplyr::filter(n >= min_muts_mean) %>% 
    dplyr::mutate(sample = factor(sample, levels = levels(tb$sample)))

  # Set point_sizes
  point_size <- 200 / max_nr_mutations
  if (per_chrom == TRUE) {
    point_size <- point_size * 5
  }
  if (point_size > 2) {
    point_size <- 2
  } else if (point_size < 0.02) {
    point_size <- 0.02
  }

  # Create plots
  if (per_chrom == FALSE) {
    fig <- .plot_lesion_segregation_gg(tb, tb_mean, tb_limits, point_size, sample_name)
    return(fig)
  } else {
    tb_l <- split(tb, tb$seqnames)
    tb_mean_l <- split(tb_mean, tb_mean$seqnames)
    tb_limits_l <- split(tb_limits, tb_limits$seqnames)
    fig_l <- mapply(.plot_lesion_segregation_gg,
      tb_l, tb_mean_l, tb_limits_l,
      MoreArgs = list("point_size" = point_size, "sample_name" = sample_name),
      SIMPLIFY = FALSE
    )
    return(fig_l)
  }
}


#' Subsample granges to a smaller number of mutations
#'
#' @param vcf GRanges object
#' @param subsample Double between 0 and 1. Subsamples the amount of mutations to create
#' a plot with less dots. Such a plot is easier to modify in a vector program like illustrator.
#' 
#' @return GRanges object
#' @noRd
#'
.subsample_granges = function(vcf, subsample){
  nr_muts_kept <- ceiling(subsample * length(vcf))
  muts_kept_i <- sample.int(length(vcf), nr_muts_kept, replace = FALSE)
  vcf <- vcf[muts_kept_i,]
  return(vcf)
}

#' Plot the strands of variants to show lesion segregation
#'
#' This is a helper function for 'plot_lesion_segregation'.
#' It performs the actual plotting.
#'
#' @param tb A tibble with strand information of variants
#' @param tb_mean A tibble with the mean strand per chromosome
#' per sample.
#' @param tb_limits tibble describing the chromosome limits.
#' This ensures entire chromosomes are plotted,
#' instead of just the part with variants
#' @param point_size Scalar describing the point size of the plot
#' @param sample_name Name of the sample
#'
#' @return ggplot2 object
#'
#' @import ggplot2
#' @noRd
#'
.plot_lesion_segregation_gg <- function(tb, tb_mean, tb_limits, point_size, sample_name) {

  # These variables use non standard evaluation.
  # To avoid R CMD check complaints we initialize them to NULL.
  y <- start_mb <- NULL

  if (.is_na(sample_name)) {
    my_labs <- labs(y = "Strand", x = "Coordinate (mb)")
  } else {
    my_labs <- labs(y = "Strand", x = "Coordinate (mb)", title = sample_name)
  }


  # Plot strandedness
  fig <- ggplot(mapping = aes(y = y, x = start_mb, color = y)) +
    geom_line(data = tb_mean, size = 1.5) +
    geom_jitter(data = tb, width = 0.05, height = 0.1, size = point_size) +
    facet_grid(sample ~ seqnames, scales = "free_x", space = "free_x") +
    geom_blank(data = tb_limits) +
    scale_y_continuous(breaks = c(0, 1), labels = c("-", "+"), limits = c(-1, 2)) +
    my_labs +
    theme_bw() +
    theme(text = element_text(size = 16),
          axis.text.x = element_blank(),
          axis.ticks = element_blank(),
          axis.text.y = element_text(size = 16),
          panel.grid = element_blank(),
          panel.grid.major.y = element_line(),
          panel.border = element_blank(),
          legend.position = 'none') +
    scale_color_gradientn(colors = c('#446DF6', 'grey90', '#FFBC63'),limits = c(0,1))
  
  return(fig)
}