1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163
|
library(tidyverse)
library(VariantAnnotation)
ref_genome <- "BSgenome.Hsapiens.UCSC.hg19"
library(ref_genome, character.only = TRUE)
# Function to add fake MNVs to example data. This is usefull for the unit tests, examples and vignette.
add_fake_mnvs <- function(vcf, ref_genome, nr_mnvs = NA) {
if (is.na(nr_mnvs)) {
nr_mnvs <- sample(seq(5, 10), 1)
}
mnv_lengths <- sample(seq(3, 10),
nr_mnvs,
replace = TRUE,
prob = c(0.35, 0.25, 0.15, 0.1, 0.05, 0.05, 0.025, 0.025)
)
mnvs <- purrr::map(mnv_lengths, create_1_mnv, vcf) %>%
do.call(rbind, .) %>%
sort()
vcf <- rbind(vcf, mnvs)
return(vcf)
}
# helper function of add_fake_mnvs
create_1_mnv <- function(mnv_length, vcf) {
# Select random variant
vcf <- vcf[isSNV(vcf)]
variant <- vcf[sample.int(length(vcf), 1)]
# Combine variant `mnv_length` times
mnv_variant <- purrr::map(seq(1, mnv_length), function(x) {
return(variant)
}) %>%
do.call(rbind, .)
# Replace the positions in the vcf
first_start <- start(mnv_variant)[1] + 100
starts <- seq(first_start, first_start + (mnv_length - 1))
end(mnv_variant) <- starts
start(mnv_variant) <- starts
# Update the ref bases
gr <- granges(mnv_variant)
old_seqnames <- as.vector(seqnames(gr))
seqlevelsStyle(gr) <- "UCSC"
seqlevels(gr) <- str_c("chr", c(1:22, "X", "Y"))
refs <- Biostrings::getSeq(BSgenome::getBSgenome(ref_genome), gr)
ref(mnv_variant) <- refs
possible_alts <- purrr::map(as.vector(refs), ~ setdiff(c("A", "C", "T", "G"), .x))
alts <- purrr::map(possible_alts, ~ sample(.x, 1)) %>%
DNAStringSetList(use.names = FALSE)
alt(mnv_variant) <- alts
# Update the names of the new variants
names(mnv_variant) <- str_c(
old_seqnames,
":",
starts,
"_",
as.vector(ref(mnv_variant)),
"/",
as.vector(unlist(alt(mnv_variant)))
)
return(mnv_variant)
}
# Function to remove data from a vcf, that is not needed for the package.
decrease_vcf_size <- function(vcf_fname) {
# Read vcf
vcf <- readVcf(vcf_fname)
# Remove unnecessary info
info(vcf) <- info(vcf)[, 0, drop = FALSE]
info(header(vcf)) <- info(header(vcf))[0, , drop = FALSE]
# Remove unnecessary genotype data
gt_to_keep <- c("GT", "AD", "DP", "GQ", "PL")
geno(vcf) <- geno(vcf)[gt_to_keep]
geno(header(vcf)) <- geno(header(vcf))[gt_to_keep, , drop = FALSE]
# Remove filter statements from header
fixed(header(vcf))[["FILTER"]] <- fixed(header(vcf))[["FILTER"]][0, , drop = FALSE]
# Remove all but one sample
tmp_name <- "tmp.vcf"
tmp_name2 <- "tmp2.vcf"
writeVcf(vcf, tmp_name)
system(str_c("grep -v '##ALT' ", tmp_name, " > ", tmp_name2))
system(str_c("cut -f1-10 ", tmp_name2, " > ", vcf_fname))
file.remove(tmp_name)
file.remove(tmp_name2)
invisible(0)
}
# Determine vcf and donor names
snv_vcf_fnames <- list.files("~/surfdrive/Shared/Boxtel_General/Data/Mutation_data/SNVs/hg19/Healthy_bone_marrow/",
pattern = "MQ60.vcf",
full.names = TRUE
)
indel_vcf_fnames <- list.files("~/surfdrive/Shared/Boxtel_General/Data/Mutation_data/INDELs/hg19/Healthy_bone_marrow/",
pattern = "CALLABLE.vcf",
full.names = TRUE
)
vcf_fnames <- c(snv_vcf_fnames, indel_vcf_fnames)
sample_names <- vcf_fnames %>%
basename() %>%
str_remove("_.*")
donor_names <- sample_names %>%
str_remove("HSC.*") %>%
str_remove("MPP.*")
used_donors <- c("AC", "ACC55", "BCH")
vcf_f <- donor_names %in% used_donors
vcf_fnames <- vcf_fnames[vcf_f]
donor_names <- donor_names[vcf_f]
# Read vcfs and combine vcfs from the same donor
vcf_l <- purrr::map(vcf_fnames, readVcf, genome = "hg19")
vcf_l <- vcf_l %>%
split(donor_names) %>% # Split per donor into list of lists
purrr::map(., function(x) do.call(rbind, x)) %>% # Combine vcfs from a single donor
purrr::map(sort) %>%
purrr::map(unique)
# Add fake mnvs.
set.seed(42)
vcf_l <- purrr::map(vcf_l, add_fake_mnvs, ref_genome)
# Write vcfs
out_vcf_fnames <- str_c("inst/extdata/blood-", names(vcf_l), ".vcf")
purrr::map2(vcf_l, out_vcf_fnames, writeVcf)
# Decrease vcf size
purrr::map(out_vcf_fnames, decrease_vcf_size)
# Create granges object from the vcfs
vcf_fnames <- list.files("inst/extdata/", pattern = "blood.*.vcf", full.names = TRUE)
vcf_l <- purrr::map(vcf_fnames, readVcf, "hg19")
sample_names <- basename(vcf_fnames) %>%
str_remove("blood-") %>%
str_remove(".vcf")
names(vcf_l) <- sample_names
grl <- purrr::map(vcf_l, granges) %>%
GRangesList()
seqlevelsStyle(grl) <- "UCSC"
seqlevels(grl, pruning.mode = "fine") <- str_c("chr", c(1:22, "X"))
saveRDS(grl, "inst/states/blood_grl.rds")
# Create empty vcf
vcf <- vcf_l[[1]]
vcf <- vcf[0]
writeVcf(vcf, "inst/extdata/empty.vcf")
# Reduce size of old example vcfs
samples = c("colon1", "colon2", "colon3", "intestine1", "intestine2", "intestine3", "liver1", "liver2", "liver3")
vcf_fnames = paste0("inst/extdata/", samples, "-sample.vcf")
purrr::map(vcf_fnames, decrease_vcf_size)
|