File: genomic_distribution.R

package info (click to toggle)
r-bioc-mutationalpatterns 3.16.0%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 5,360 kB
  • sloc: sh: 8; makefile: 2
file content (140 lines) | stat: -rw-r--r-- 5,468 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#' Find overlaps between mutations and a genomic region.
#'
#' Function finds the number of mutations that reside in genomic region and
#' takes surveyed area of genome into account.
#'
#' @param vcf_list GRangesList or GRanges object.
#' @param surveyed_list A GRangesList or a list with GRanges of regions of
#' the genome that have been surveyed (e.g. determined using GATK CallableLoci).
#' @param region_list A GRangesList or a list with GRanges objects containing
#' locations of genomic regions.
#'
#' @return A data.frame containing the number observed and number of expected
#' mutations in each genomic region.
#'
#' @examples
#' ## See the 'read_vcfs_as_granges()' example for how we obtained the
#' ## following data:
#' vcfs <- readRDS(system.file("states/read_vcfs_as_granges_output.rds",
#'   package = "MutationalPatterns"
#' ))
#'
#' ## Use biomaRt to obtain data.
#' ## We can query the BioMart database, but this may take a long time,
#' ## so we take some shortcuts by loading the results from our
#' ## examples. The corresponding code for downloading the data can be
#' ## found above the command we run.
#'
#' # mart="ensemble"
#' # library(biomaRt)
#'
#' # regulatory <- useEnsembl(biomart="regulation",
#' #                          dataset="hsapiens_regulatory_feature",
#' #                          GRCh = 37)
#' regulatory <- readRDS(system.file("states/regulatory_data.rds",
#'   package = "MutationalPatterns"
#' ))
#'
#' ## Download the regulatory CTCF binding sites and convert them to
#' ## a GRanges object.
#' # CTCF <- getBM(attributes = c('chromosome_name',
#' #                             'chromosome_start',
#' #                             'chromosome_end',
#' #                             'feature_type_name'),
#' #              filters = "regulatory_feature_type_name",
#' #              values = "CTCF Binding Site",
#' #              mart = regulatory)
#' #
#' # CTCF_g <- reduce(GRanges(CTCF$chromosome_name,
#' #                 IRanges(CTCF$chromosome_start,
#' #                 CTCF$chromosome_end)))
#'
#' CTCF_g <- readRDS(system.file("states/CTCF_g_data.rds",
#'   package = "MutationalPatterns"
#' ))
#'
#' ## Download the promoter regions and conver them to a GRanges object.
#' # promoter = getBM(attributes = c('chromosome_name', 'chromosome_start',
#' #                                 'chromosome_end', 'feature_type_name'),
#' #                  filters = "regulatory_feature_type_name",
#' #                  values = "Promoter",
#' #                  mart = regulatory)
#' # promoter_g = reduce(GRanges(promoter$chromosome_name,
#' #                     IRanges(promoter$chromosome_start,
#' #                             promoter$chromosome_end)))
#'
#' promoter_g <- readRDS(system.file("states/promoter_g_data.rds",
#'   package = "MutationalPatterns"
#' ))
#'
#' # flanking = getBM(attributes = c('chromosome_name',
#' #                                 'chromosome_start',
#' #                                 'chromosome_end',
#' #                                 'feature_type_name'),
#' #                  filters = "regulatory_feature_type_name",
#' #                  values = "Promoter Flanking Region",
#' #                  mart = regulatory)
#' # flanking_g = reduce(GRanges(
#' #                        flanking$chromosome_name,
#' #                        IRanges(flanking$chromosome_start,
#' #                        flanking$chromosome_end)))
#'
#' flanking_g <- readRDS(system.file("states/promoter_flanking_g_data.rds",
#'   package = "MutationalPatterns"
#' ))
#'
#' regions <- GRangesList(promoter_g, flanking_g, CTCF_g)
#'
#' names(regions) <- c("Promoter", "Promoter flanking", "CTCF")
#'
#' # Use a naming standard consistently.
#' seqlevelsStyle(regions) <- "UCSC"
#'
#' ## Get the filename with surveyed/callable regions
#' surveyed_file <- system.file("extdata/callableloci-sample.bed",
#'   package = "MutationalPatterns"
#' )
#'
#' ## Import the file using rtracklayer and use the UCSC naming standard
#' library(rtracklayer)
#' surveyed <- import(surveyed_file)
#' seqlevelsStyle(surveyed) <- "UCSC"
#'
#' ## For this example we use the same surveyed file for each sample.
#' surveyed_list <- rep(list(surveyed), 9)
#'
#' ## Calculate the number of observed and expected number of mutations in
#' ## each genomic regions for each sample.
#' distr <- genomic_distribution(vcfs, surveyed_list, regions)
#' @seealso
#' \code{\link{read_vcfs_as_granges}}
#'
#' @export

genomic_distribution <- function(vcf_list, surveyed_list, region_list) {

  # Check arguments
  if (length(vcf_list) != length(surveyed_list)) {
    stop("vcf_list and surveyed_list must have the same length", call. = FALSE)
  }

  if (is.null(names(region_list))) {
    stop(paste("Please set the names of region_list using:",
      "    names(region_list) <- c(\"regionA\", \"regionB\", ...)",
      sep = "\n"
    ), call. = FALSE)
  }

  # Perform intersect with region over each combi of vcf and region.
  # Map over the region list. Within this loop map over the vcfs.
  df <- purrr::map(as.list(region_list), function(region) {
    purrr::map2(as.list(vcf_list), as.list(surveyed_list), .intersect_with_region, region) %>%
      dplyr::bind_rows(.id = "sample")
  }) %>% dplyr::bind_rows(.id = "region")

  # Region as factor
  # make sure level order is the same as in region_list input (important
  # for plotting later)
  df$region <- factor(df$region, levels = names(region_list))
  return(df)
}