File: transcriptToX.R

package info (click to toggle)
r-bioc-ensembldb 2.14.0%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 2,764 kB
  • sloc: perl: 331; sh: 15; makefile: 5
file content (597 lines) | stat: -rw-r--r-- 25,897 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
#' @include proteinToX.R

#' @title Map transcript-relative coordinates to amino acid residues of the
#'     encoded protein
#'
#' @description
#'
#' `transcriptToProtein` maps within-transcript coordinates to the corresponding
#' coordinates within the encoded protein sequence. The provided coordinates
#' have to be within the coding region of the transcript (excluding the stop
#' codon) but are supposed to be relative to the first nucleotide of the
#' transcript (which includes the 5' UTR). Positions relative to the CDS of a
#' transcript (e.g. /PKP2 c.1643delg/) have to be first converted to
#' transcript-relative coordinates. This can be done with the
#' [cdsToTranscript()] function.
#'
#' @details
#'
#' Transcript-relative coordinates are mapped to the amino acid residues they
#' encode. As an example, positions within the transcript that correspond to
#' nucleotides 1 to 3 in the CDS are mapped to the first position in the
#' protein sequence (see examples for more details).
#'
#' @param x `IRanges` with the coordinates within the transcript. Coordinates
#'     are counted from the start of the transcript (including the 5' UTR). The
#'     Ensembl IDs of the corresponding transcripts have to be provided either
#'     as `names` of the `IRanges`, or in one of its metadata columns.
#'
#' @param db `EnsDb` object.
#'
#' @param id `character(1)` specifying where the transcript identifier can be
#'     found. Has to be either `"name"` or one of `colnames(mcols(prng))`.
#'
#' @return
#'
#' `IRanges` with the same length (and order) than the input `IRanges`
#' `x`. Each element in `IRanges` provides the coordinates within the
#' protein sequence, names being the (Ensembl) IDs of the protein. The
#' original transcript ID and the transcript-relative coordinates are provided
#' as metadata columns. Metadata columns `"cds_ok"` indicates whether the
#' length of the transcript's CDS matches the length of the encoded protein.
#' `IRanges` with a start coordinate of `-1` is returned for transcript
#' coordinates that can not be mapped to protein-relative coordinates
#' (either no transcript was found for the provided ID, the transcript
#' does not encode a protein or the provided coordinates are not within
#' the coding region of the transcript).
#'
#' @family coordinate mapping functions
#'
#' @seealso [cdsToTranscript()] and [transcriptToCds()] for conversion between
#' CDS- and transcript-relative coordinates.
#'
#' @author Johannes Rainer
#'
#' @md
#'
#' @examples
#'
#' library(EnsDb.Hsapiens.v86)
#' ## Restrict all further queries to chromosome x to speed up the examples
#' edbx <- filter(EnsDb.Hsapiens.v86, filter = ~ seq_name == "X")
#'
#' ## Define an IRanges with the positions of the first 2 nucleotides of the
#' ## coding region for the transcript ENST00000381578
#' txpos <- IRanges(start = 692, width = 2, names = "ENST00000381578")
#'
#' ## Map these to the corresponding residues in the protein sequence
#' ## The protein-relative coordinates are returned as an `IRanges` object,
#' ## with the original, transcript-relative coordinates provided in metadata
#' ## columns tx_start and tx_end
#' transcriptToProtein(txpos, edbx)
#'
#' ## We can also map multiple ranges. Note that for any of the 3 nucleotides
#' ## encoding the same amino acid the position of this residue in the
#' ## protein sequence is returned. To illustrate this we map below each of the
#' ## first 4 nucleotides of the CDS to the corresponding position within the
#' ## protein.
#' txpos <- IRanges(start = c(692, 693, 694, 695),
#'     width = rep(1, 4), names = rep("ENST00000381578", 4))
#' transcriptToProtein(txpos, edbx)
#'
#' ## If the mapping fails, an IRanges with negative start position is returned.
#' ## Mapping can fail (as below) because the ID is not known.
#' transcriptToProtein(IRanges(1, 1, names = "unknown"), edbx)
#'
#' ## Or because the provided coordinates are not within the CDS
#' transcriptToProtein(IRanges(1, 1, names = "ENST00000381578"), edbx)
transcriptToProtein <- function(x, db, id = "name") {
    if (missing(x) || !is(x, "IRanges"))
        stop("Argument 'x' is required and has to be an 'IRanges' object")
    if (missing(db) || !is(db, "EnsDb"))
        stop("Argument 'db' is required and has to be an 'EnsDb' object")
    .txs_to_proteins(x, db = db, id = id)
}

#' This version performs the mapping for each IRange separately! Results are
#' the same as in .txs_to_protins. While the code is easier to read and to
#' debug, it is also considerably slower.
#'
#' @noRd
.tx_to_protein <- function(x, db, id = "name") {
    if (length(x) > 1)
        return(unlist(IRangesList(
            lapply(split(x, 1:length(x)), FUN = .tx_to_protein,
                   db = db, id = id)), use.names = FALSE))
    id <- match.arg(id, c("name", colnames(mcols(x))))
    if (id == "name")
        ids <- names(x)
    else ids <- mcols(x)[, id]
    if (any(is.null(ids)))
        stop("All IDs are NULL")
    empty_ranges <- IRanges(start = -1, width = 1)
    mcols(empty_ranges) <- DataFrame(tx_id = ids,
                                     tx_start = start(x),
                                     tx_end = end(x),
                                     cds_ok = NA)
    tx_lens <- .transcriptLengths(db,
                                  filter = TxIdFilter(ids),
                                  with.cds_len = TRUE,
                                  with.utr5_len = TRUE,
                                  with.utr3_len = TRUE)
    ## Does the tx exist?
    if (nrow(tx_lens) == 0) {
        warning("Transcript '", ids, "' can not be found", call. = FALSE)
        return(empty_ranges)
    }
    ## Is it a protein coding tx?
    if (is.na(tx_lens$cds_len)) {
        warning("Transcript '", ids, "' does not encode a protein",
                call. = FALSE)
        return(empty_ranges)
    }
    if (is.na(tx_lens$utr5_len))
        tx_lens$utr5_len <- 0
    if (is.na(tx_lens$utr3_len))
        tx_lens$utr3_len <- 0
    ## Are the coordinates within the CDS? Exclude the stop codon!
    if (start(x) <= tx_lens$utr5_len |
        end(x) > (tx_lens$utr5_len + tx_lens$cds_len - 3)) {
        warning("The provided coordinates for '", ids,
                "' are not within the coding region", call. = FALSE)
        return(empty_ranges)
    }
    ## Define the coordinates
    end(empty_ranges) <- ceiling((end(x) - tx_lens$utr5_len) / 3)
    start(empty_ranges) <- ceiling((start(x) - tx_lens$utr5_len) / 3)
    ## Now check if the CDS length matches the protein sequence length.
    prt <- proteins(db, filter = TxIdFilter(ids), columns = "protein_sequence")
    names(empty_ranges) <- prt$protein_id
    cds_ok <- nchar(prt$protein_sequence) * 3 == (tx_lens$cds_len - 3)
    if (!cds_ok)
        warning("The CDS of '", ids, "' does not match the length of the ",
                "encoded protein. Returned protein coordinates might not be",
                " correct", call. = FALSE)
    mcols(empty_ranges)$cds_ok <- cds_ok
    empty_ranges
}

#' @details
#'
#' Transcript-relative coordinates are mapped to the amino acid residues they
#' encode. As an example, positions within the transcript that corrspond to
#' nucleotides 1 to 3 in the CDS are mapped to the first position in the
#' protein sequence (see examples for more details).
#' (e.g. a transcript relative position corresponding to the first nucleotide
#' of the transcript's CDS is mapped to the first position in the protein
#' sequence, same as if the transcri(no matter which transcript relative
#' position corresponding to the first,
#' second or third nucleotide of the CDS are provided, all (any) of
#'
#' @author Johannes Rainer
#'
#' @md
#'
#' @noRd
.txs_to_proteins <- function(x, db, id = "name") {
    ## Fetch all in one.
    id <- match.arg(id, c("name", colnames(mcols(x))))
    if (id == "name")
        ids <- names(x)
    else ids <- mcols(x)[, id]
    if (any(is.null(ids)))
        stop("One or more of the provided IDs are NULL", call. = FALSE)
    names(x) <- ids
    ## Define internal IDs - we'll need them to return the result in the
    ## correct order.
    internal_ids <- paste0(names(x), ":", start(x), ":", end(x))
    tx_lens <- .transcriptLengths(db,
                                  filter = TxIdFilter(unique(ids)),
                                  with.cds_len = TRUE,
                                  with.utr5_len = TRUE,
                                  with.utr3_len = TRUE)
    ## Process transcripts not found
    not_found <- !(ids %in% tx_lens$tx_id)
    fail_res <- IRanges()
    if (any(not_found)) {
        ## issue #69 does not apply here, even if redundant ids are used,
        ## all of them are considered.
        warning("Transcript(s) ", paste0("'", ids[not_found], "'",
                                         collapse = ", "),
                " could not be found", call. = FALSE)
        fail_res <- IRanges(start = rep(-1, sum(not_found)),
                            width = rep(1, sum(not_found)))
        ## Add metadata columnds.
        mcols(fail_res) <- DataFrame(tx_id = ids[not_found],
                                     tx_start = start(x[not_found]),
                                     tx_end = end(x[not_found]),
                                     cds_ok = NA)
        if (all(not_found))
            return(fail_res[match(internal_ids,
                                  paste0(mcols(fail_res)$tx_id, ":",
                                         mcols(fail_res)$tx_start, ":",
                                         mcols(fail_res)$tx_end))])
    }
    ## Remove non-coding transcripts
    not_coding <- is.na(tx_lens$cds_len)
    if (any(not_coding)) {
        not_coding_id <- tx_lens$tx_id[not_coding]
        warning("Transcript(s) ", paste0("'", not_coding_id, "'",
                                         collapse = ", "),
                " do/does not encode a protein", call. = FALSE)
        ## Subset x to the non-coding ones (issue #69)
        x_sub <- x[ids %in% not_coding_id]
        not_coding_res <- IRanges(start = rep(-1, length(x_sub)),
                                  width = rep(1, length(x_sub)))
        ## Add metadata columnds.
        mcols(not_coding_res) <- DataFrame(tx_id = names(x_sub),
                                           tx_start = start(x_sub),
                                           tx_end = end(x_sub),
                                           cds_ok = NA)
        fail_res <- c(fail_res, not_coding_res)
        if (all(not_coding))
            return(fail_res[match(internal_ids,
                                  paste0(mcols(fail_res)$tx_id, ":",
                                         mcols(fail_res)$tx_start, ":",
                                         mcols(fail_res)$tx_end))])
        tx_lens <- tx_lens[!not_coding, , drop = FALSE]
    }
    ## Ensure we have no missing 3' or 5' UTRs
    tx_lens$utr3_len[is.na(tx_lens$utr3_len)] <- 0
    tx_lens$utr5_len[is.na(tx_lens$utr5_len)] <- 0
    x <- x[ids %in% tx_lens$tx_id]
    id_idx <- match(names(x), tx_lens$tx_id)
    ## Are the coordinates within the CDS? Exclude the stop codon!
    outside_cds <- start(x) <= tx_lens$utr5_len[id_idx] |
        end(x) > (tx_lens$utr5_len[id_idx] + tx_lens$cds_len[id_idx] - 3)
    if (any(outside_cds)) {
        ## Also save against issue #69.
        outside_res <- IRanges(start = rep(-1, sum(outside_cds)),
                               width = rep(1, sum(outside_cds)))
        ## Add metadata columnds.
        mcols(outside_res) <- DataFrame(tx_id = names(x)[outside_cds],
                                           tx_start = start(x[outside_cds]),
                                           tx_end = end(x[outside_cds]),
                                           cds_ok = NA)
        fail_res <- c(fail_res, outside_res)
        warning("Provided coordinates for ",
                paste0("'", names(x)[outside_cds], "'", collapse = ", "),
                " are not within the coding region", call. = FALSE)
        if (all(outside_cds))
            return(fail_res[match(internal_ids,
                                  paste0(mcols(fail_res)$tx_id, ":",
                                         mcols(fail_res)$tx_start, ":",
                                         mcols(fail_res)$tx_end))])
        x <- x[!outside_cds]
    }
    ## Now check if the CDS length matches the protein sequence length.
    prt <- proteins(db, filter = TxIdFilter(names(x)),
                    columns = "protein_sequence")
    id_idx_prt <- match(names(x), prt$tx_id)
    id_idx <- match(names(x), tx_lens$tx_id)
    res <- IRanges(start = ceiling((start(x) - tx_lens$utr5_len[id_idx]) / 3),
                   end = ceiling((end(x) - tx_lens$utr5_len[id_idx]) / 3),
                   names = prt$protein_id[id_idx_prt])
    cds_ok <- nchar(prt$protein_sequence[id_idx_prt]) * 3 ==
        (tx_lens$cds_len[id_idx] - 3)
    if (any(!cds_ok))
        warning("The CDS of ", paste0("'", unique(names(x)[!cds_ok]), "'",
                                      collapse = ", "),
                " does not match the length of the encoded protein. Returned",
                " protein coordinates for this/these transcript(s) might not",
                " be correct", call. = FALSE)
    mcols(res) <- DataFrame(tx_id = names(x), tx_start = start(x),
                            tx_end = end(x), cds_ok = cds_ok)
    res <- c(fail_res, res)
    res[match(internal_ids, paste0(mcols(res)$tx_id, ":",
                                   mcols(res)$tx_start, ":",
                                   mcols(res)$tx_end))]
}

#' @title Map transcript-relative coordinates to genomic coordinates
#'
#' @description
#'
#' `transcriptToGenome` maps transcript-relative coordinates to genomic
#' coordinates. Provided coordinates are expected to be relative to the first
#' nucleotide of the **transcript**, not the **CDS**. CDS-relative coordinates
#' have to be converted to transcript-relative positions first with the
#' [cdsToTranscript()] function.
#'
#' @inheritParams transcriptToProtein
#'
#' @return
#'
#' `GRangesList` with the same length (and order) than the input `IRanges`
#' `x`. Each `GRanges` in the `GRangesList` provides the genomic coordinates
#' corresponding to the provided within-transcript coordinates. The
#' original transcript ID and the transcript-relative coordinates are provided
#' as metadata columns as well as the ID of the individual exon(s). An empty
#' `GRanges` is returned for transcripts that can not be found in the database.
#'
#' @md
#'
#' @author Johannes Rainer
#'
#' @family coordinate mapping functions
#'
#' @seealso [cdsToTranscript()] and [transcriptToCds()] for the mapping between
#' CDS- and transcript-relative coordinates.
#'
#' @examples
#'
#' library(EnsDb.Hsapiens.v86)
#' ## Restrict all further queries to chromosome x to speed up the examples
#' edbx <- filter(EnsDb.Hsapiens.v86, filter = ~ seq_name == "X")
#'
#' ## Below we map positions 1 to 5 within the transcript ENST00000381578 to
#' ## the genome. The ID of the transcript has to be provided either as names
#' ## or in one of the IRanges' metadata columns
#' txpos <- IRanges(start = 1, end = 5, names = "ENST00000381578")
#'
#' transcriptToGenome(txpos, edbx)
#' ## The object returns a GRangesList with the genomic coordinates, in this
#' ## example the coordinates are within the same exon and map to a single
#' ## genomic region.
#'
#' ## Next we map nucleotides 501 to 505 of ENST00000486554 to the genome
#' txpos <- IRanges(start = 501, end = 505, names = "ENST00000486554")
#'
#' transcriptToGenome(txpos, edbx)
#' ## The positions within the transcript are located within two of the
#' ## transcripts exons and thus a `GRanges` of length 2 is returned.
#'
#' ## Next we map multiple regions, two within the same transcript and one
#' ## in a transcript that does not exist.
#' txpos <- IRanges(start = c(501, 1, 5), end = c(505, 10, 6),
#'     names = c("ENST00000486554", "ENST00000486554", "some"))
#'
#' res <- transcriptToGenome(txpos, edbx)
#'
#' ## The length of the result GRangesList has the same length than the
#' ## input IRanges
#' length(res)
#'
#' ## The result for the last region is an empty GRanges, because the
#' ## transcript could not be found in the database
#' res[[3]]
#'
#' res
transcriptToGenome <- function(x, db, id = "name") {
    if (missing(x) || !is(x, "IRanges"))
        stop("Argument 'x' is required and has to be an 'IRanges' object")
    if (missing(db) || !is(db, "EnsDb"))
        stop("Argument 'db' is required and has to be an 'EnsDb' object")
    res <- .tx_to_genome(x, db, id = id)
    not_found <- sum(lengths(res) == 0)
    if (not_found > 0)
        warning(not_found, " transcript(s) could either not be found in the ",
                "database or the specified range is outside the transcript's ",
                "sequence")
    res
}

.tx_to_genome <- function(x, db, id = "name") {
    id <- match.arg(id, c("name", colnames(mcols(x))))
    if (id == "name")
        ids <- names(x)
    else ids <- mcols(x)[, id]
    if (any(is.null(ids)))
        stop("One or more of the provided IDs are NULL", call. = FALSE)
    names(x) <- ids
    exns <- exonsBy(db, filter = TxIdFilter(unique(ids)))
    ## Loop over x
    res <- lapply(split(x, f = 1:length(x)), function(z) {
        if (any(names(exns) == names(z))) {
            gen_map <- .to_genome(exns[[names(z)]], z)
            if (length(gen_map)) {
                mcols(gen_map) <- DataFrame(mcols(gen_map), tx_start = start(z),
                                            tx_end = end(z))
                gen_map[order(gen_map$exon_rank)]
            } else GRanges()
        } else GRanges()
    })
    names(res) <- ids
    as(res, "GRangesList")
}

#' @title Map transcript-relative coordinates to positions within the CDS
#'
#' @description
#'
#' Converts transcript-relative coordinates to positions within the CDS (if the
#' transcript encodes a protein).
#'
#' @param x `IRanges` with the coordinates within the transcript. Coordinates
#'     are expected to be relative to the transcription start (the first
#'     nucleotide of the transcript). The Ensembl IDs of the corresponding
#'     transcripts have to be provided either as `names` of the `IRanges`, or
#'     in one of its metadata columns.
#'
#' @inheritParams transcriptToProtein
#'
#' @return
#'
#' `IRanges` with the same length (and order) than the input `IRanges`
#' `x`. Each element in `IRanges` provides the coordinates within the
#' transcripts CDS. The transcript-relative coordinates are provided
#' as metadata columns.
#' `IRanges` with a start coordinate of `-1` is returned for transcripts
#' that are not known in the database, non-coding transcripts or if the
#' provided start and/or end coordinates are not within the coding region.
#'
#' @family coordinate mapping functions
#'
#' @author Johannes Rainer
#'
#' @md
#'
#' @examples
#'
#' library(EnsDb.Hsapiens.v86)
#' ## Defining transcript-relative coordinates for 4 transcripts of the gene
#' ## BCL2
#' txcoords <- IRanges(start = c(1463, 3, 143, 147), width = 1,
#'     names = c("ENST00000398117", "ENST00000333681",
#'     "ENST00000590515", "ENST00000589955"))
#'
#' ## Map the coordinates.
#' transcriptToCds(txcoords, EnsDb.Hsapiens.v86)
#'
#' ## ENST00000590515 does not encode a protein and thus -1 is returned
#' ## The coordinates within ENST00000333681 are outside the CDS and thus also
#' ## -1 is reported.
transcriptToCds <- function(x, db, id = "name") {
    if (missing(x) || !is(x, "IRanges"))
        stop("Argument 'x' is required and has to be an 'IRanges' object")
    if (missing(db) || !is(db, "EnsDb"))
        stop("Argument 'db' is required and has to be an 'EnsDb' object")
    id <- match.arg(id, c("name", colnames(mcols(x))))
    if (id == "name")
	ids <- names(x)
    else ids <- mcols(x)[, id]
    tx_lens <- .transcriptLengths(db, with.utr5_len = TRUE,
                                  with.cds_len = TRUE,
                                  filter = ~ tx_id %in% ids)
    mcols(x)$tx_start <- start(x)
    mcols(x)$tx_end <- end(x)
    start(x) <- -1
    end(x) <- -1
    if (nrow(tx_lens)) {
        ## IDs not found
        nf <- !(ids %in% tx_lens$tx_id)
        if (any(nf))
            warning("Could not find ", sum(nf), " of ", length(ids),
                    " transcript(s): ", .ids_message(ids[nf]))
        new_start <- mcols(x)$tx_start - tx_lens[ids, "utr5_len"]
        new_end <- mcols(x)$tx_end - tx_lens[ids, "utr5_len"]
        ## non-coding
        isna <- !nf & is.na(new_start)
        if (any(isna))
            warning(sum(isna), " of ", length(ids), " transcript(s) are non-",
                    "coding: ", .ids_message(ids[isna]))
        ## coordinate not in CDS?
        notcds <- !nf & !isna & (new_start <= 0 | new_end <= 0)
        if (any(notcds))
            warning("Coordinates for ", sum(notcds), " of ", length(ids),
                    " transcript(s) are not in the coding region: ",
                    .ids_message(ids[notcds]))
        endout <- !nf & !isna & !notcds & new_end > tx_lens[ids, "cds_len"]
        if (any(endout))
            warning("End coordinate for ", sum(endout), " of ", length(ids),
                    " transcript(s) are outside of the coding region: ",
                    .ids_message(ids[endout]))
        ## Now update coordinates
        end(x[!nf &!isna & !notcds & !endout]) <-
            new_end[!nf &!isna & !notcds & !endout]
        start(x[!nf & !isna & !notcds & !endout]) <-
            new_start[!nf & !isna & !notcds & !endout]
    } else
        warning("Could not find any of the provided transcript IDs")
    x
}

#' @title Map positions within the CDS to coordinates relative to the start of
#'     the transcript
#'
#' @description
#'
#' Converts CDS-relative coordinates to positions within the transcript, i.e.
#' relative to the start of the transcript and hence including its 5' UTR.
#'
#' @param x `IRanges` with the coordinates within the CDS. Coordinates
#'     are expected to be relative to the transcription start (the first
#'     nucleotide of the transcript). The Ensembl IDs of the corresponding
#'     transcripts have to be provided either as `names` of the `IRanges`, or
#'     in one of its metadata columns.
#'
#' @inheritParams transcriptToProtein
#'
#' @return
#'
#' `IRanges` with the same length (and order) than the input `IRanges`
#' `x`. Each element in `IRanges` provides the coordinates within the
#' transcripts CDS. The transcript-relative coordinates are provided
#' as metadata columns.
#' `IRanges` with a start coordinate of `-1` is returned for transcripts
#' that are not known in the database, non-coding transcripts or if the
#' provided start and/or end coordinates are not within the coding region.
#'
#' @family coordinate mapping functions
#'
#' @author Johannes Rainer
#'
#' @md
#'
#' @examples
#'
#' library(EnsDb.Hsapiens.v86)
#' ## Defining transcript-relative coordinates for 4 transcripts of the gene
#' ## BCL2
#' txcoords <- IRanges(start = c(4, 3, 143, 147), width = 1,
#'     names = c("ENST00000398117", "ENST00000333681",
#'     "ENST00000590515", "ENST00000589955"))
#'
#' cdsToTranscript(txcoords, EnsDb.Hsapiens.v86)
#'
#' ## Next we map the coordinate for variants within the gene PKP2 to the
#' ## genome. The variants is PKP2 c.1643DelG and the provided
#' ## position is thus relative to the CDS. We have to convert the
#' ## position first to transcript-relative coordinates.
#' pkp2 <- IRanges(start = 1643, width = 1, name = "ENST00000070846")
#'
#' ## Map the coordinates by first converting the CDS- to transcript-relative
#' ## coordinates
#' transcriptToGenome(cdsToTranscript(pkp2, EnsDb.Hsapiens.v86),
#'     EnsDb.Hsapiens.v86)
cdsToTranscript <- function(x, db, id = "name") {
    if (missing(x) || !is(x, "IRanges"))
        stop("Argument 'x' is required and has to be an 'IRanges' object")
    if (missing(db) || !is(db, "EnsDb"))
        stop("Argument 'db' is required and has to be an 'EnsDb' object")
    id <- match.arg(id, c("name", colnames(mcols(x))))
    if (id == "name")
	ids <- names(x)
    else ids <- mcols(x)[, id]
    tx_lens <- .transcriptLengths(db, with.utr5_len = TRUE,
                                  with.cds_len = TRUE,
                                  filter = ~ tx_id %in% ids)
    mcols(x)$cds_start <- start(x)
    mcols(x)$cds_end <- end(x)
    start(x) <- -1
    end(x) <- -1
    if (nrow(tx_lens)) {
        ## IDs not found
        nf <- !(ids %in% tx_lens$tx_id)
        if (any(nf))
            warning("Could not find ", sum(nf), " of ", length(ids),
                    " transcript(s): ", .ids_message(ids[nf]))
        new_start <- mcols(x)$cds_start + tx_lens[ids, "utr5_len"]
        new_end <- mcols(x)$cds_end + tx_lens[ids, "utr5_len"]
        isna <- !nf & is.na(new_start)
        if (any(isna))
            warning(sum(isna), " of ", length(ids), " transcript(s) are non-",
                    "coding: ", .ids_message(ids[isna]))
        ## coordinate not in CDS?
        notcds <- !nf & !isna &
            (mcols(x)$cds_start > tx_lens[ids, "cds_len"] |
                     mcols(x)$cds_end >= tx_lens[ids, "cds_len"])
        if (any(notcds))
            warning("Coordinates for ", sum(notcds), " of ", length(ids),
                    " transcript(s) are not in the coding region: ",
                    .ids_message(ids[notcds]))
        ## Now update coordinates
        end(x[!nf & !isna & !notcds]) <- new_end[!nf & !isna & !notcds]
        start(x[!nf & !isna & !notcds]) <- new_start[!nf & !isna & !notcds]
    } else
        warning("Could not find any of the provided transcript IDs")
    x
}

.ids_message <- function(x) {
    mess <- paste(x[1:min(3, length(x))], collapse = ", ")
    if (length(x) > 3) {
        mess <- paste0(mess, " ... (", length(x) - 3, " more)")
    }
    mess
}