1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
|
#' Bin the genome based on mutation density
#'
#' This function splits the genome based on the mutation density.
#' The density is calculated per chromosome. The density is split
#' into bins. The difference in density between subsequent bins is the same
#' for all bins. In other words, the difference in density between bins 1 and
#' 2 is the same as between bins 2 and 3.
#' The function returns a GRangesList. Each GRanges in the list contains the
#' regions associated with that bin. This can be used with the
#' 'split_muts_region()' function.
#'
#'
#' @param vcf_list GRangesList or GRanges object.
#' @param ref_genome BSgenome reference genome object
#' @param nrbins The number of bins in which to separate the genome
#' @param man_dens_cutoffs Manual density cutoffs to use.
#'
#' @family genomic_regions
#' @return GRangesList
#' @export
#' @importFrom magrittr %>%
#'
#' @examples
#'
#' ### See the 'read_vcfs_as_granges()' example for how we obtained the
#' ## following data:
#' grl <- readRDS(system.file("states/read_vcfs_as_granges_output.rds",
#' package = "MutationalPatterns"
#' ))
#'
#' ## Load the corresponding reference genome.
#' ref_genome <- "BSgenome.Hsapiens.UCSC.hg19"
#' library(ref_genome, character.only = TRUE)
#'
#' ## Determine region density
#' dens_grl <- bin_mutation_density(grl, ref_genome, nrbins = 3)
#' names(dens_grl) <- c("Low", "Medium", "High")
#'
#'
#' ## You can also use manual cutoffs. This feature is meant for more
#' ## advanced users. It can be usefull if you want to find highly mutated regions, with
#' ## a consistent cutoff between analyses.
#' dens_grl_man <- bin_mutation_density(grl, ref_genome, man_dens_cutoffs = c(0, 2e-08, 1))
bin_mutation_density <- function(vcf_list,
ref_genome,
nrbins = 3,
man_dens_cutoffs = NA) {
# These variables use non standard evaluation.
# To avoid R CMD check complaints we initialize them to NULL.
. <- NULL
# Convert list to grl if necessary
if (inherits(vcf_list, "list")) {
vcf_list <- GenomicRanges::GRangesList(vcf_list)
}
# Merge if grl.
if (inherits(vcf_list, "CompressedGRangesList")) {
gr <- unlist(vcf_list)
} else if (inherits(vcf_list, "GRanges")) {
gr <- vcf_list
} else {
.not_gr_or_grl(vcf_list)
}
# Get ref genome
ref_genome <- BSgenome::getBSgenome(ref_genome)
# Sort the input
gr <- BiocGenerics::sort(gr)
# Determine density per chromosome
chroms <- GenomeInfoDb::seqlevelsInUse(gr)
dens_gr <- purrr::map(chroms,
.get_mutation_density_chrom,
ref_genome,
gr,
chroms) %>%
do.call(c, .)
# Set density break type
if (!.is_na(man_dens_cutoffs)) {
breaks <- man_dens_cutoffs
} else {
breaks <- nrbins
}
# Split density in classes
dens_gr$dens <- cut(dens_gr$dens, breaks = breaks, labels = FALSE)
dens_grl <- split(dens_gr, dens_gr$dens)
dens_grl <- GenomicRanges::reduce(dens_grl)
return(dens_grl)
}
#' Get mutation density for a single chromosome
#'
#' This function determines the mutation density for a single chromosome.
#' It returns a GRanges object, containing the densities.
#'
#' @param chrom String describing a chromosome
#' @param ref_genome BSgenome reference genome object
#' @param gr GRanges object
#' @param chroms String vector of chromosomes
#'
#' @return GRanges
#' @noRd
#'
.get_mutation_density_chrom <- function(chrom, ref_genome, gr, chroms) {
# Select muts in chrom
gr <- gr[GenomeInfoDb::seqnames(gr) == chrom]
# Determine position
half_width <- (BiocGenerics::end(gr) - BiocGenerics::start(gr)) / 2
pos <- BiocGenerics::start(gr) + half_width
# Calculate density. Only calculate within chromosome size
chr_size <- GenomeInfoDb::seqlengths(ref_genome)[chrom]
dens <- stats::density(pos, bw = "SJ", from = 1, to = chr_size)
# Determine location of density bins.
# The positions of the density estimates x are taken as the middle of the bins
half_dist <- (dens$x[2] - dens$x[1]) / 2
start <- ceiling(dens$x - half_dist)
end <- floor(dens$x + half_dist)
# Clip again to chromosome size
start[1] <- 1
end[length(end)] <- chr_size
# Transform to GRanges
dens_gr <- GenomicRanges::GRanges(seqnames = chrom,
ranges = IRanges::IRanges(start, end))
dens_gr$dens <- dens$y
GenomeInfoDb::seqlevels(dens_gr) <- chroms
return(dens_gr)
}
|