File: bin_mutation_density.R

package info (click to toggle)
r-bioc-mutationalpatterns 3.16.0%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 5,360 kB
  • sloc: sh: 8; makefile: 2
file content (140 lines) | stat: -rw-r--r-- 4,543 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#' Bin the genome based on mutation density
#'
#' This function splits the genome based on the mutation density.
#' The density is calculated per chromosome. The density is split
#' into bins. The difference in density between subsequent bins is the same
#' for all bins. In other words, the difference in density between bins 1 and
#' 2 is the same as between bins 2 and 3.
#' The function returns a GRangesList. Each GRanges in the list contains the
#' regions associated with that bin. This can be used with the
#' 'split_muts_region()' function.
#'
#'
#' @param vcf_list GRangesList or GRanges object.
#' @param ref_genome BSgenome reference genome object
#' @param nrbins The number of bins in which to separate the genome
#' @param man_dens_cutoffs Manual density cutoffs to use.
#'
#' @family genomic_regions
#' @return GRangesList
#' @export
#' @importFrom magrittr %>%
#'
#' @examples
#'
#' ### See the 'read_vcfs_as_granges()' example for how we obtained the
#' ## following data:
#' grl <- readRDS(system.file("states/read_vcfs_as_granges_output.rds",
#'   package = "MutationalPatterns"
#' ))
#'
#' ## Load the corresponding reference genome.
#' ref_genome <- "BSgenome.Hsapiens.UCSC.hg19"
#' library(ref_genome, character.only = TRUE)
#'
#' ## Determine region density
#' dens_grl <- bin_mutation_density(grl, ref_genome, nrbins = 3)
#' names(dens_grl) <- c("Low", "Medium", "High")
#'
#'
#' ## You can also use manual cutoffs. This feature is meant for more
#' ## advanced users. It can be usefull if you want to find highly mutated regions, with
#' ## a consistent cutoff between analyses.
#' dens_grl_man <- bin_mutation_density(grl, ref_genome, man_dens_cutoffs = c(0, 2e-08, 1))
bin_mutation_density <- function(vcf_list, 
                                 ref_genome, 
                                 nrbins = 3,
                                 man_dens_cutoffs = NA) {

  # These variables use non standard evaluation.
  # To avoid R CMD check complaints we initialize them to NULL.
  . <- NULL

  # Convert list to grl if necessary
  if (inherits(vcf_list, "list")) {
    vcf_list <- GenomicRanges::GRangesList(vcf_list)
  }

  # Merge if grl.
  if (inherits(vcf_list, "CompressedGRangesList")) {
    gr <- unlist(vcf_list)
  } else if (inherits(vcf_list, "GRanges")) {
    gr <- vcf_list
  } else {
    .not_gr_or_grl(vcf_list)
  }

  # Get ref genome
  ref_genome <- BSgenome::getBSgenome(ref_genome)

  # Sort the input
  gr <- BiocGenerics::sort(gr)

  # Determine density per chromosome
  chroms <- GenomeInfoDb::seqlevelsInUse(gr)
  dens_gr <- purrr::map(chroms, 
                        .get_mutation_density_chrom, 
                        ref_genome, 
                        gr, 
                        chroms) %>%
    do.call(c, .)

  # Set density break type
  if (!.is_na(man_dens_cutoffs)) {
    breaks <- man_dens_cutoffs
  } else {
    breaks <- nrbins
  }

  # Split density in classes
  dens_gr$dens <- cut(dens_gr$dens, breaks = breaks, labels = FALSE)
  dens_grl <- split(dens_gr, dens_gr$dens)
  dens_grl <- GenomicRanges::reduce(dens_grl)

  return(dens_grl)
}

#' Get mutation density for a single chromosome
#'
#' This function determines the mutation density for a single chromosome.
#' It returns a GRanges object, containing the densities.
#'
#' @param chrom String describing a chromosome
#' @param ref_genome BSgenome reference genome object
#' @param gr GRanges object
#' @param chroms String vector of chromosomes
#'
#' @return GRanges
#' @noRd
#'
.get_mutation_density_chrom <- function(chrom, ref_genome, gr, chroms) {

  # Select muts in chrom
  gr <- gr[GenomeInfoDb::seqnames(gr) == chrom]

  # Determine position
  half_width <- (BiocGenerics::end(gr) - BiocGenerics::start(gr)) / 2
  pos <- BiocGenerics::start(gr) + half_width

  # Calculate density. Only calculate within chromosome size
  chr_size <- GenomeInfoDb::seqlengths(ref_genome)[chrom]
  dens <- stats::density(pos, bw = "SJ", from = 1, to = chr_size)

  # Determine location of density bins.
  # The positions of the density estimates x are taken as the middle of the bins
  half_dist <- (dens$x[2] - dens$x[1]) / 2
  start <- ceiling(dens$x - half_dist)
  end <- floor(dens$x + half_dist)

  # Clip again to chromosome size
  start[1] <- 1
  end[length(end)] <- chr_size

  # Transform to GRanges
  dens_gr <- GenomicRanges::GRanges(seqnames = chrom, 
                                    ranges = IRanges::IRanges(start, end))
  dens_gr$dens <- dens$y
  GenomeInfoDb::seqlevels(dens_gr) <- chroms

  return(dens_gr)
}