1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173
|
#' Fit mutational signatures to a mutation matrix with bootstrapping
#'
#' @description
#' Bootstrapping the signature refitting shows how stable the refit is, when small changes are made to the
#' mutation matrix. You can be more confident in the refitting results, when the differences in signature
#' contributions are small between bootstrap iterations.
#'
#' @details
#' The mutation matrix is resampled 'n_boots' times.
#' Resampling is done per column (sample) with replacement.
#' The row weights are used as probabilities.
#' On each resampled matrix the 'fit_to_signatures()' or 'fit_to_signatures_strict()' function
#' is applied.
#' In the end a matrix is returned with the contributions for each bootstrap iteration.
#' Each row is a single bootstrap iteration from a single sample.
#' The method you choose determines how strict the signature refitting is.
#' The 'regular' and "regular_10+ methods often suffer from a lot of overfitting,
#' however this is less of an issue when you refit on an limited number of signatures.
#' The 'strict' method suffers less from overfitting, but can suffer from more
#' signature misattribution. The best method will depend on your data and
#' research question.
#'
#' @param mut_matrix mutation count matrix (dimensions: x mutation types
#' X n samples)
#' @param signatures Signature matrix (dimensions: x mutation types
#' X n signatures)
#' @param n_boots Number of bootstrap iterations.
#' @param max_delta The maximum difference in original vs reconstructed cosine similarity between two iterations.
#' Only used with method strict.
#' @param method The refitting method to be used.
#' Possible values:
#' * 'strict' Uses fit_to_signatures_strict with the default (backwards selection) method;
#' * 'regular' Uses fit_to_signatures;
#' * 'regular_10+' Uses fit_to_signatures, but removes signatures with less than 10 variants.;
#' * 'strict_best_subset' Uses fit_to_signatures_strict with the 'best_subset' method;
#' * 'strict_backwards' Uses fit_to_signatures_strict with the backwards selection method.
#' This is the same as the 'strict' method;
#' @param verbose Boolean. If TRUE, the function will show how far along it is.
#'
#' @return A matrix showing the signature contributions across all the bootstrap iterations.
#' @export
#'
#' @seealso \code{\link{mut_matrix}},
#' \code{\link{fit_to_signatures_strict}},
#' \code{\link{fit_to_signatures_bootstrapped}}
#'
#' @importFrom magrittr %>%
#' @examples
#' ## See the 'mut_matrix()' example for how we obtained the mutation matrix:
#' mut_mat <- readRDS(system.file("states/mut_mat_data.rds",
#' package = "MutationalPatterns"
#' ))
#'
#' ## Get pre-defined signatures
#' signatures <- get_known_signatures()
#'
#' ## Fit to signatures with bootstrapping
#' ## Here we use a very low "n_boots" to reduce the runtime.
#' ## For real uses, a much higher value is required.
#' contri_boots <- fit_to_signatures_bootstrapped(mut_mat,
#' signatures,
#' n_boots = 2,
#' max_delta = 0.004
#' )
#'
#' ## Use the regular refit method
#' contri_boots <- fit_to_signatures_bootstrapped(mut_mat,
#' signatures,
#' n_boots = 2,
#' method = "regular"
#' )
fit_to_signatures_bootstrapped <- function(mut_matrix,
signatures,
n_boots = 1000,
max_delta = 0.004,
method = c("strict", "regular", "regular_10+", "strict_best_subset", "strict_backwards"),
verbose = TRUE) {
# These variables use non standard evaluation.
# To avoid R CMD check complaints we initialize them to NULL.
sigs <- . <- NULL
# Match argument
method <- match.arg(method)
# Check enough mutations are present
min_nr_muts <- colSums(mut_matrix) %>%
min()
if (min_nr_muts <= 10) {
warning("At least one of your samples has less than 10 mutations.
This will negatively impact the signature refitting.
Please consider removing or pooling this sample.")
}
sig_names_tb <- tibble::tibble("sigs" = colnames(signatures))
contri_list <- vector("list", n_boots)
for (i in seq_len(n_boots)) {
# Resample mut_mat
mut_mat_resampled <- .resample_mut_mat(mut_matrix)
# Perform refit method
if (method == "strict" | method == "strict_backwards") {
refit_out <- fit_to_signatures_strict(mut_mat_resampled, signatures, max_delta = max_delta)
contri <- refit_out$fit_res$contribution
}
else if (method == "regular") {
fit_res <- fit_to_signatures(mut_mat_resampled, signatures)
contri <- fit_res$contribution
}
else if (method == "regular_10+") {
fit_res <- fit_to_signatures(mut_mat_resampled, signatures)
index <- rowSums(fit_res$contribution >= 10) != 0 # Check whether a signature has at least 10 mutations in a single sample
contri <- fit_res$contribution[index, ]
}
else if (method == "strict_best_subset"){
refit_out <- fit_to_signatures_strict(mut_mat_resampled, signatures, max_delta = max_delta, method = "best_subset")
contri <- refit_out$fit_res$contribution
}
# Reformat contribution
colnames(contri) <- paste(colnames(contri), i, sep = "_")
contri_tb <- contri %>%
as.data.frame() %>%
tibble::rownames_to_column("sigs") %>%
dplyr::left_join(sig_names_tb, ., by = "sigs") %>%
dplyr::select(-sigs)
# Add contribution to list
contri_list[[i]] <- contri_tb
if (i %% 10 == 0 & verbose == TRUE) {
message(paste0("Performed ", i, " of ", n_boots, " iterations"))
}
}
# Combine contribution from all samples
contri_boots <- do.call(cbind, contri_list) %>%
t()
# Clean up format
colnames(contri_boots) <- sig_names_tb$sigs
contri_boots[is.na(contri_boots)] <- 0
contri_boots <- contri_boots[, colSums(contri_boots) != 0, drop = FALSE]
return(contri_boots)
}
#' Resample a mutation matrix
#'
#' The mutation matrix is resampled per column (sample).
#' Resampling is done with replacement using the row weights as propabilities.
#'
#' @param mut_matrix mutation count matrix (dimensions: x mutation types
#' X n samples)
#'
#' @return A resamples mutation matrix
#'
#' @noRd
#'
.resample_mut_mat <- function(mut_matrix) {
mut_mat_resampled <- apply(mut_matrix, 2, function(x) {
total_muts <- sum(x)
sample_weights <- x / total_muts
feature_rows <- sample(seq_along(x), total_muts, replace = TRUE, prob = sample_weights)
row_counts <- table(feature_rows)
index <- as.numeric(names(row_counts))
x[index] <- as.vector(row_counts)
x[-index] <- 0
return(x)
})
return(mut_mat_resampled)
}
|