File: genomeToX.R

package info (click to toggle)
r-bioc-ensembldb 2.14.0%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 2,764 kB
  • sloc: perl: 331; sh: 15; makefile: 5
file content (329 lines) | stat: -rw-r--r-- 12,928 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
#' @include transcriptToX.R

#' @title Map genomic coordinates to transcript coordinates
#'
#' @description
#'
#' `genomeToTranscript` maps genomic coordinates to positions within the
#' transcript (if at the provided genomic position a transcript is encoded).
#' The function does only support mapping of genomic coordinates that are
#' completely within the genomic region at which an exon is encoded. If the
#' genomic region crosses the exon boundary an empty `IRanges` is returned.
#' See examples for details.
#'
#' @details
#'
#' The function first retrieves all exons overlapping the provided genomic
#' coordinates and identifies then exons that are fully containing the
#' coordinates in `x`. The transcript-relative coordinates are calculated based
#' on the relative position of the provided genomic coordinates in this exon.
#'
#' @note
#'
#' The function throws a warning and returns an empty `IRanges` object if the
#' genomic coordinates can not be mapped to a transcript.
#'
#' @param x `GRanges` object with the genomic coordinates that should be
#'     mapped.
#'
#' @param db `EnsDb` object.
#'
#' @return
#'
#' An `IRangesList` with length equal to `length(x)`. Each element providing
#' the mapping(s) to position within any encoded transcripts at the respective
#' genomic location as an `IRanges` object. An `IRanges` with negative start
#' coordinates is returned, if the provided genomic coordinates are not
#' completely within the genomic coordinates of an exon.
#'
#' The ID of the exon and its rank (index of the exon in the transcript) are
#' provided in the result's `IRanges` metadata columns as well as the genomic
#' position of `x`.
#'
#' @md
#'
#' @family coordinate mapping functions
#'
#' @author Johannes Rainer
#'
#' @examples
#'
#' library(EnsDb.Hsapiens.v86)
#'
#' ## Subsetting the EnsDb object to chromosome X only to speed up execution
#' ## time of examples
#' edbx <- filter(EnsDb.Hsapiens.v86, filter = ~ seq_name == "X")
#'
#' ## Define a genomic region and calculate within-transcript coordinates
#' gnm <- GRanges("X:107716399-107716401")
#'
#' res <- genomeToTranscript(gnm, edbx)
#' ## Result is an IRanges object with the start and end coordinates within
#' ## each transcript that has an exon at the genomic range.
#' res
#'
#' ## An IRanges with negative coordinates is returned if at the provided
#' ## position no exon is present. Below we use the same coordinates but
#' ## specify that the coordinates are on the forward (+) strand
#' gnm <- GRanges("X:107716399-107716401:+")
#' genomeToTranscript(gnm, edbx)
#'
#' ## Next we provide multiple genomic positions.
#' gnm <- GRanges("X", IRanges(start = c(644635, 107716399, 107716399),
#'     end = c(644639, 107716401, 107716401)), strand = c("*", "*", "+"))
#'
#' ## The result of the mapping is an IRangesList each element providing the
#' ## within-transcript coordinates for each input region
#' genomeToTranscript(gnm, edbx)
genomeToTranscript <- function(x, db) {
    if (missing(x) || !is(x, "GRanges"))
        stop("Argument 'x' is required and has to be a 'GRanges' object")
    if (missing(db) || !is(db, "EnsDb"))
        stop("Argument 'db' is required and has to be an 'EnsDb' object")
    res <- .genome_to_tx(x, db)
    if (is(res, "IRanges"))
        res <- IRangesList(res)
    not_mapped <- sum(any(start(res) < 0))
    if (not_mapped > 0)
        warning(not_mapped, " genomic region(s) could not be mapped ",
                "to a transcript; hint: see ?seqlevelsStyle if you used ",
                "UCSC chromosome names", call. = FALSE)
    res
}

#' @title Map genomic coordinates to protein coordinates
#'
#' @description
#'
#' Map positions along the genome to positions within the protein sequence if
#' a protein is encoded at the location. The provided coordinates have to be
#' completely within the genomic position of an exon of a protein coding
#' transcript (see [genomeToTranscript()] for details). Also, the provided
#' positions have to be within the genomic region encoding the CDS of a
#' transcript (excluding its stop codon; soo [transcriptToProtein()] for
#' details).
#'
#' For genomic positions for which the mapping failed an `IRanges` with
#' negative coordinates (i.e. a start position of -1) is returned.
#'
#' @details
#'
#' `genomeToProtein` combines calls to [genomeToTranscript()] and
#' [transcriptToProtein()].
#'
#' @param x `GRanges` with the genomic coordinates that should be mapped to
#'     within-protein coordinates.
#'
#' @param db `EnsDb` object.
#'
#' @return
#'
#' An `IRangesList` with each element representing the mapping of one of the
#' `GRanges` in `x` (i.e. the length of the `IRangesList` is `length(x)`).
#' Each element in `IRanges` provides the coordinates within the protein
#' sequence, names being the (Ensembl) IDs of the protein. The ID of the
#' transcript encoding the protein, the ID of the exon within which the
#' genomic coordinates are located and its rank in the transcript are provided
#' in metadata columns `"tx_id"`, `"exon_id"` and `"exon_rank"`. Metadata
#' columns `"cds_ok"` indicates whether the length of the CDS matches the
#' length of the encoded protein. Coordinates for which `cds_ok = FALSE` should
#' be taken with caution, as they might not be correct. Metadata columns
#' `"seq_start"`, `"seq_end"`, `"seq_name"` and `"seq_strand"` provide the
#' provided genomic coordinates.
#'
#' For genomic coordinates that can not be mapped to within-protein sequences
#' an `IRanges` with a start coordinate of -1 is returned.
#'
#' @family coordinate mapping functions
#'
#' @author Johannes Rainer
#'
#' @md
#'
#' @examples
#'
#' library(EnsDb.Hsapiens.v86)
#' ## Restrict all further queries to chromosome x to speed up the examples
#' edbx <- filter(EnsDb.Hsapiens.v86, filter = ~ seq_name == "X")
#'
#' ## In the example below we define 4 genomic regions:
#' ## 630898: corresponds to the first nt of the CDS of ENST00000381578
#' ## 644636: last nt of the CDS of ENST00000381578
#' ## 644633: last nt before the stop codon in ENST00000381578
#' ## 634829: position within an intron.
#' gnm <- GRanges("X", IRanges(start = c(630898, 644636, 644633, 634829),
#'     width = c(5, 1, 1, 3)))
#' res <- genomeToProtein(gnm, edbx)
#'
#' ## The result is an IRangesList with the same length as gnm
#' length(res)
#' length(gnm)
#'
#' ## The first element represents the mapping for the first GRanges:
#' ## the coordinate is mapped to the first amino acid of the protein(s).
#' ## The genomic coordinates can be mapped to several transcripts (and hence
#' ## proteins).
#' res[[1]]
#'
#' ## The stop codon is not translated, thus the mapping for the second
#' ## GRanges fails
#' res[[2]]
#'
#' ## The 3rd GRanges is mapped to the last amino acid.
#' res[[3]]
#'
#' ## Mapping of intronic positions fail
#' res[[4]]
genomeToProtein <- function(x, db) {
    if (missing(x) || !is(x, "GRanges"))
        stop("Argument 'x' is required and has to be a 'GRanges' object")
    if (missing(db) || !is(db, "EnsDb"))
        stop("Argument 'db' is required and has to be an 'EnsDb' object")
    txs <- genomeToTranscript(x, db)
    int_ids <- rep(1:length(txs), lengths(txs))
    txs <- unlist(txs, use.names = FALSE)
    if (is.null(names(txs)))
        names(txs) <- ""
    prts <- transcriptToProtein(txs, db)
    mcols(prts) <- cbind(mcols(prts)[, c("tx_id", "cds_ok")], mcols(txs))
    prts <- split(prts, int_ids)
    names(prts) <- NULL
    ## Prune the result by removing non-mappable regions from each element
    ## for which we do have correct mappings.
    prts <- IRangesList(lapply(prts, function(x) {
        strts <- start(x)
        if (any(strts < 0) & any(strts > 0))
            x <- x[strts > 0]
        x
    }))
    prts
}

#' This function takes a `GRanges` and a `GRangesList` as input and maps
#' coordinates. We loop through each input range and map that to within
#' transcript positions based on the `exns` argument.
#'
#' @param genome `GRanges` with the genomic positions to be mapped.
#'
#' @param exns `GRangesList` with genomic positions of transcripts' exons, i.e.
#'     a `GRangesList` as the one returned by `exonsBy`.
#'
#' @author Johannes Rainer
#'
#' @md
#'
#' @noRd
.genome_to_tx_ranges <- function(genome, exns) {
    IRangesList(unlist(
        lapply(as(genome, "GRangesList"), FUN = function(rgn, exns) {
            if (length(exns)) {
                ints <- pintersect(exns, rgn, drop.nohit.ranges = TRUE,
                                   ignore.strand = FALSE)
                ints <- ints[width(ints) == width(rgn)]
                ints <- ints[lengths(ints) > 0]
                ints_map <- mapply(
                    ints, exns[names(ints)], names(ints), FUN = function(int, tx,
                                                                         tx_id,
                                                                         rgn) {
                        exon_idx <- int$exon_rank
                        if (exon_idx > 1)
                            count_up <- sum(width(tx)[1:(exon_idx - 1)])
                        else count_up <- 0
                        if (as.character(strand(int)[1]) == "+")
                            irng <- IRanges(start = count_up + start(rgn) -
                                                start(tx[exon_idx]) + 1,
                                            width = width(rgn))
                        else
                            irng <- IRanges(start = count_up + end(tx[exon_idx]) -
                                                end(rgn) + 1,
                                            width = width(rgn))
                        names(irng) <- tx_id
                        dfrm <- DataFrame(
                            tx_id = tx_id,
                            exon_id = int$exon_id,
                            exon_rank = exon_idx,
                            seq_start = start(rgn),
                            seq_end = end(rgn),
                            seq_name = as.character(seqnames(rgn)),
                            seq_strand = as.character(strand(rgn))
                        )
                        mcols(irng) <- dfrm
                        irng
                    }, MoreArgs = list(rgn = rgn), USE.NAMES = FALSE)
            } else ints_map <- NULL
            if (length(ints_map))
                unlist(IRangesList(ints_map))
            else {
                metad <- DataFrame(tx_id = NA_character_,
                                   exon_id = NA_character_,
                                   exon_rank = NA_integer_,
                                   seq_start = start(rgn), seq_end = end(rgn),
                                   seq_name = as.character(seqnames(rgn)),
                                   seq_strand = as.character(strand(rgn)))
                empty_rng <- IRanges(start = -1, width = 1)
                mcols(empty_rng) <- metad
                empty_rng
            }
        }, exns = exns)
      , use.names = FALSE))
}

#' @description
#'
#' Takes a `GRanges` object, identifies all exons overlapping that region,
#' checks if the region is completely included in an exons and returns
#' the position within the transcript for that exon.
#'
#' @param db `EnsDb`.
#'
#' @param genome `GRanges`
#'
#' @author Johannes Rainer
#'
#' @md
#'
#' @noRd
#'
#' @examples
#'
#' ## Second example: nt 2 of exon 4 of ENST00000554971 (nt 637 of tx)
#' genome <- GRanges("X", IRanges(start = 601735, end = 601735))
#' db <- edbx
#'
#' ## + strand: Last two nt of CDS for ENST00000381578
#' genome <- GRanges("X", IRanges(start = 605370, end = 605374))
#' ## Length of result is 2.
#' ## Position within ENST00000381578: 259 + 709 + 209 + 58 + 89 + 245 = 1569
#'
#' ## - strand: TSC22D3:
#' ## 1) ENST00000486554 nt 1-3:
#' genome <- GRanges("X", IRanges(end = 106959631, start = 106959629))
#' ## Overlaps 2 tx: ENST00000372390, ENST00000486554
#'
#' ## 1) ENST00000486554 nts 5-8 in exon 2
#' ## exon 1: 503nt: region is 508-511
#' genome <- GRanges("X", IRanges(end = 106957975, width = 4))
#' .genome_to_tx(genome, edbx)
#'
#' ## 3) ENST00000372397, last 3nt
#' ## exon 3: 446 + 52 + 1529 total length: 2025-2027
#' genome <- GRanges("X", IRanges(start = 106956451, width = 3))
#' .genome_to_tx(genome, edbx)
#'
#'
#' ## Example with two genes, on two strands!
.genome_to_tx <- function(genome, db) {
    ## Get exonsBy for all input ranges.
    exns <- exonsBy(db, by = "tx",
                    filter = AnnotationFilterList(
                        SeqNameFilter(as.character(unique(seqnames(genome)))),
                        GeneStartFilter(max(end(genome)), condition = "<="),
                        GeneEndFilter(min(start(genome)), condition = ">=")
                        ))
    res <- .genome_to_tx_ranges(genome, exns)
    if (!is.null(names(genome)))
        names(res) <- names(genome)
    if (length(genome) == 1)
        res[[1]]
    else res
}