File: intersect_with_region.R

package info (click to toggle)
r-bioc-mutationalpatterns 3.16.0%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 5,360 kB
  • sloc: sh: 8; makefile: 2
file content (55 lines) | stat: -rw-r--r-- 1,819 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#' Find overlap between mutations and a genomic region
#'
#' Find the number of mutations that reside in genomic region and take
#' surveyed area of genome into account.
#'
#' @param vcf CollapsedVCF object with mutations
#' @param surveyed GRanges object with regions of the genome that were surveyed
#' @param region GRanges object with genomic region(s)
#' @noRd
#' @return A data.frame containing the overlapping mutations for a
#' genomic region.

.intersect_with_region <- function(vcf, surveyed, region) {
  # Number of mutations in vcf file
  n_muts <- length(vcf)

  # Number of base pairs that were surveyed
  surveyed_length <- sum(as.numeric(BiocGenerics::width(surveyed)))

  # Check if chromosome names are the same in the objects
  if (GenomeInfoDb::seqlevelsStyle(vcf) != GenomeInfoDb::seqlevelsStyle(surveyed)) {
    stop(paste(
      "The chromosome names (seqlevels) of the VCF and the",
      "surveyed GRanges object do not match."
    ))
  }

  if (GenomeInfoDb::seqlevelsStyle(region) != GenomeInfoDb::seqlevelsStyle(surveyed)) {
    stop(paste(
      "The chromosome names (seqlevels) of the surveyed and",
      "the region GRanges object do not match."
    ))
  }

  # Intersect genomic region and surveyed region
  surveyed_region <- GenomicRanges::intersect(surveyed, region, ignore.strand = TRUE)
  surveyed_region_length <- sum(width(surveyed_region))

  # Find which mutations lie in surveyed genomic region
  overlap <- GenomicRanges::findOverlaps(vcf, surveyed_region)
  muts_in_region <- as.data.frame(as.matrix(overlap))$queryHits

  observed <- length(muts_in_region)
  prob <- n_muts / surveyed_length
  expected <- prob * surveyed_region_length

  res <- data.frame(
    n_muts,
    surveyed_length,
    prob, surveyed_region_length,
    expected,
    observed
  )
  return(res)
}