1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329
|
#' @include transcriptToX.R
#' @title Map genomic coordinates to transcript coordinates
#'
#' @description
#'
#' `genomeToTranscript` maps genomic coordinates to positions within the
#' transcript (if at the provided genomic position a transcript is encoded).
#' The function does only support mapping of genomic coordinates that are
#' completely within the genomic region at which an exon is encoded. If the
#' genomic region crosses the exon boundary an empty `IRanges` is returned.
#' See examples for details.
#'
#' @details
#'
#' The function first retrieves all exons overlapping the provided genomic
#' coordinates and identifies then exons that are fully containing the
#' coordinates in `x`. The transcript-relative coordinates are calculated based
#' on the relative position of the provided genomic coordinates in this exon.
#'
#' @note
#'
#' The function throws a warning and returns an empty `IRanges` object if the
#' genomic coordinates can not be mapped to a transcript.
#'
#' @param x `GRanges` object with the genomic coordinates that should be
#' mapped.
#'
#' @param db `EnsDb` object.
#'
#' @return
#'
#' An `IRangesList` with length equal to `length(x)`. Each element providing
#' the mapping(s) to position within any encoded transcripts at the respective
#' genomic location as an `IRanges` object. An `IRanges` with negative start
#' coordinates is returned, if the provided genomic coordinates are not
#' completely within the genomic coordinates of an exon.
#'
#' The ID of the exon and its rank (index of the exon in the transcript) are
#' provided in the result's `IRanges` metadata columns as well as the genomic
#' position of `x`.
#'
#' @md
#'
#' @family coordinate mapping functions
#'
#' @author Johannes Rainer
#'
#' @examples
#'
#' library(EnsDb.Hsapiens.v86)
#'
#' ## Subsetting the EnsDb object to chromosome X only to speed up execution
#' ## time of examples
#' edbx <- filter(EnsDb.Hsapiens.v86, filter = ~ seq_name == "X")
#'
#' ## Define a genomic region and calculate within-transcript coordinates
#' gnm <- GRanges("X:107716399-107716401")
#'
#' res <- genomeToTranscript(gnm, edbx)
#' ## Result is an IRanges object with the start and end coordinates within
#' ## each transcript that has an exon at the genomic range.
#' res
#'
#' ## An IRanges with negative coordinates is returned if at the provided
#' ## position no exon is present. Below we use the same coordinates but
#' ## specify that the coordinates are on the forward (+) strand
#' gnm <- GRanges("X:107716399-107716401:+")
#' genomeToTranscript(gnm, edbx)
#'
#' ## Next we provide multiple genomic positions.
#' gnm <- GRanges("X", IRanges(start = c(644635, 107716399, 107716399),
#' end = c(644639, 107716401, 107716401)), strand = c("*", "*", "+"))
#'
#' ## The result of the mapping is an IRangesList each element providing the
#' ## within-transcript coordinates for each input region
#' genomeToTranscript(gnm, edbx)
genomeToTranscript <- function(x, db) {
if (missing(x) || !is(x, "GRanges"))
stop("Argument 'x' is required and has to be a 'GRanges' object")
if (missing(db) || !is(db, "EnsDb"))
stop("Argument 'db' is required and has to be an 'EnsDb' object")
res <- .genome_to_tx(x, db)
if (is(res, "IRanges"))
res <- IRangesList(res)
not_mapped <- sum(any(start(res) < 0))
if (not_mapped > 0)
warning(not_mapped, " genomic region(s) could not be mapped ",
"to a transcript; hint: see ?seqlevelsStyle if you used ",
"UCSC chromosome names", call. = FALSE)
res
}
#' @title Map genomic coordinates to protein coordinates
#'
#' @description
#'
#' Map positions along the genome to positions within the protein sequence if
#' a protein is encoded at the location. The provided coordinates have to be
#' completely within the genomic position of an exon of a protein coding
#' transcript (see [genomeToTranscript()] for details). Also, the provided
#' positions have to be within the genomic region encoding the CDS of a
#' transcript (excluding its stop codon; soo [transcriptToProtein()] for
#' details).
#'
#' For genomic positions for which the mapping failed an `IRanges` with
#' negative coordinates (i.e. a start position of -1) is returned.
#'
#' @details
#'
#' `genomeToProtein` combines calls to [genomeToTranscript()] and
#' [transcriptToProtein()].
#'
#' @param x `GRanges` with the genomic coordinates that should be mapped to
#' within-protein coordinates.
#'
#' @param db `EnsDb` object.
#'
#' @return
#'
#' An `IRangesList` with each element representing the mapping of one of the
#' `GRanges` in `x` (i.e. the length of the `IRangesList` is `length(x)`).
#' Each element in `IRanges` provides the coordinates within the protein
#' sequence, names being the (Ensembl) IDs of the protein. The ID of the
#' transcript encoding the protein, the ID of the exon within which the
#' genomic coordinates are located and its rank in the transcript are provided
#' in metadata columns `"tx_id"`, `"exon_id"` and `"exon_rank"`. Metadata
#' columns `"cds_ok"` indicates whether the length of the CDS matches the
#' length of the encoded protein. Coordinates for which `cds_ok = FALSE` should
#' be taken with caution, as they might not be correct. Metadata columns
#' `"seq_start"`, `"seq_end"`, `"seq_name"` and `"seq_strand"` provide the
#' provided genomic coordinates.
#'
#' For genomic coordinates that can not be mapped to within-protein sequences
#' an `IRanges` with a start coordinate of -1 is returned.
#'
#' @family coordinate mapping functions
#'
#' @author Johannes Rainer
#'
#' @md
#'
#' @examples
#'
#' library(EnsDb.Hsapiens.v86)
#' ## Restrict all further queries to chromosome x to speed up the examples
#' edbx <- filter(EnsDb.Hsapiens.v86, filter = ~ seq_name == "X")
#'
#' ## In the example below we define 4 genomic regions:
#' ## 630898: corresponds to the first nt of the CDS of ENST00000381578
#' ## 644636: last nt of the CDS of ENST00000381578
#' ## 644633: last nt before the stop codon in ENST00000381578
#' ## 634829: position within an intron.
#' gnm <- GRanges("X", IRanges(start = c(630898, 644636, 644633, 634829),
#' width = c(5, 1, 1, 3)))
#' res <- genomeToProtein(gnm, edbx)
#'
#' ## The result is an IRangesList with the same length as gnm
#' length(res)
#' length(gnm)
#'
#' ## The first element represents the mapping for the first GRanges:
#' ## the coordinate is mapped to the first amino acid of the protein(s).
#' ## The genomic coordinates can be mapped to several transcripts (and hence
#' ## proteins).
#' res[[1]]
#'
#' ## The stop codon is not translated, thus the mapping for the second
#' ## GRanges fails
#' res[[2]]
#'
#' ## The 3rd GRanges is mapped to the last amino acid.
#' res[[3]]
#'
#' ## Mapping of intronic positions fail
#' res[[4]]
genomeToProtein <- function(x, db) {
if (missing(x) || !is(x, "GRanges"))
stop("Argument 'x' is required and has to be a 'GRanges' object")
if (missing(db) || !is(db, "EnsDb"))
stop("Argument 'db' is required and has to be an 'EnsDb' object")
txs <- genomeToTranscript(x, db)
int_ids <- rep(1:length(txs), lengths(txs))
txs <- unlist(txs, use.names = FALSE)
if (is.null(names(txs)))
names(txs) <- ""
prts <- transcriptToProtein(txs, db)
mcols(prts) <- cbind(mcols(prts)[, c("tx_id", "cds_ok")], mcols(txs))
prts <- split(prts, int_ids)
names(prts) <- NULL
## Prune the result by removing non-mappable regions from each element
## for which we do have correct mappings.
prts <- IRangesList(lapply(prts, function(x) {
strts <- start(x)
if (any(strts < 0) & any(strts > 0))
x <- x[strts > 0]
x
}))
prts
}
#' This function takes a `GRanges` and a `GRangesList` as input and maps
#' coordinates. We loop through each input range and map that to within
#' transcript positions based on the `exns` argument.
#'
#' @param genome `GRanges` with the genomic positions to be mapped.
#'
#' @param exns `GRangesList` with genomic positions of transcripts' exons, i.e.
#' a `GRangesList` as the one returned by `exonsBy`.
#'
#' @author Johannes Rainer
#'
#' @md
#'
#' @noRd
.genome_to_tx_ranges <- function(genome, exns) {
IRangesList(unlist(
lapply(as(genome, "GRangesList"), FUN = function(rgn, exns) {
if (length(exns)) {
ints <- pintersect(exns, rgn, drop.nohit.ranges = TRUE,
ignore.strand = FALSE)
ints <- ints[width(ints) == width(rgn)]
ints <- ints[lengths(ints) > 0]
ints_map <- mapply(
ints, exns[names(ints)], names(ints), FUN = function(int, tx,
tx_id,
rgn) {
exon_idx <- int$exon_rank
if (exon_idx > 1)
count_up <- sum(width(tx)[1:(exon_idx - 1)])
else count_up <- 0
if (as.character(strand(int)[1]) == "+")
irng <- IRanges(start = count_up + start(rgn) -
start(tx[exon_idx]) + 1,
width = width(rgn))
else
irng <- IRanges(start = count_up + end(tx[exon_idx]) -
end(rgn) + 1,
width = width(rgn))
names(irng) <- tx_id
dfrm <- DataFrame(
tx_id = tx_id,
exon_id = int$exon_id,
exon_rank = exon_idx,
seq_start = start(rgn),
seq_end = end(rgn),
seq_name = as.character(seqnames(rgn)),
seq_strand = as.character(strand(rgn))
)
mcols(irng) <- dfrm
irng
}, MoreArgs = list(rgn = rgn), USE.NAMES = FALSE)
} else ints_map <- NULL
if (length(ints_map))
unlist(IRangesList(ints_map))
else {
metad <- DataFrame(tx_id = NA_character_,
exon_id = NA_character_,
exon_rank = NA_integer_,
seq_start = start(rgn), seq_end = end(rgn),
seq_name = as.character(seqnames(rgn)),
seq_strand = as.character(strand(rgn)))
empty_rng <- IRanges(start = -1, width = 1)
mcols(empty_rng) <- metad
empty_rng
}
}, exns = exns)
, use.names = FALSE))
}
#' @description
#'
#' Takes a `GRanges` object, identifies all exons overlapping that region,
#' checks if the region is completely included in an exons and returns
#' the position within the transcript for that exon.
#'
#' @param db `EnsDb`.
#'
#' @param genome `GRanges`
#'
#' @author Johannes Rainer
#'
#' @md
#'
#' @noRd
#'
#' @examples
#'
#' ## Second example: nt 2 of exon 4 of ENST00000554971 (nt 637 of tx)
#' genome <- GRanges("X", IRanges(start = 601735, end = 601735))
#' db <- edbx
#'
#' ## + strand: Last two nt of CDS for ENST00000381578
#' genome <- GRanges("X", IRanges(start = 605370, end = 605374))
#' ## Length of result is 2.
#' ## Position within ENST00000381578: 259 + 709 + 209 + 58 + 89 + 245 = 1569
#'
#' ## - strand: TSC22D3:
#' ## 1) ENST00000486554 nt 1-3:
#' genome <- GRanges("X", IRanges(end = 106959631, start = 106959629))
#' ## Overlaps 2 tx: ENST00000372390, ENST00000486554
#'
#' ## 1) ENST00000486554 nts 5-8 in exon 2
#' ## exon 1: 503nt: region is 508-511
#' genome <- GRanges("X", IRanges(end = 106957975, width = 4))
#' .genome_to_tx(genome, edbx)
#'
#' ## 3) ENST00000372397, last 3nt
#' ## exon 3: 446 + 52 + 1529 total length: 2025-2027
#' genome <- GRanges("X", IRanges(start = 106956451, width = 3))
#' .genome_to_tx(genome, edbx)
#'
#'
#' ## Example with two genes, on two strands!
.genome_to_tx <- function(genome, db) {
## Get exonsBy for all input ranges.
exns <- exonsBy(db, by = "tx",
filter = AnnotationFilterList(
SeqNameFilter(as.character(unique(seqnames(genome)))),
GeneStartFilter(max(end(genome)), condition = "<="),
GeneEndFilter(min(start(genome)), condition = ">=")
))
res <- .genome_to_tx_ranges(genome, exns)
if (!is.null(names(genome)))
names(res) <- names(genome)
if (length(genome) == 1)
res[[1]]
else res
}
|