File: proteinToGenome.Rd

package info (click to toggle)
r-bioc-genomicfeatures 1.50.4%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 4,056 kB
  • sloc: makefile: 6; sh: 2
file content (179 lines) | stat: -rw-r--r-- 6,651 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
\name{proteinToGenome}

\alias{proteinToGenome}
\alias{proteinToGenome,GRangesList-method}
\alias{proteinToGenome,ANY-method}

\title{Map protein-relative coordinates to genomic coordinates}

\description{
  \code{proteinToGenome} is a generic function for mapping
  ranges of protein-relative positions to the genome.

  NOTE: This man page is for the \code{proteinToGenome} S4 generic
  function and methods defined in the \pkg{GenomicFeatures} package,
  which are (loosely) modeled on the \code{\link[ensembldb]{proteinToGenome}}
  function from the \pkg{ensembldb} package.
  See \code{?ensembldb::\link[ensembldb]{proteinToGenome}} for the latter.
}

\usage{
## S4 generic function:
proteinToGenome(x, db, ...)  # dispatch is on 2nd argument 'db'

\S4method{proteinToGenome}{ANY}(x, db)

\S4method{proteinToGenome}{GRangesList}(x, db)
}

\arguments{
  \item{x}{
    A named \link[IRanges]{IRanges} object (or derivative) containing ranges
    of \emph{protein-relative positions} (protein-relative positions are
    positions relative to a protein sequence).

    The names on \code{x} must be transcript names present in \code{db}.
    More precisely, for the default \code{proteinToGenome()} method,
    \code{names(x)} must be a subset of:
    \preformatted{  mcols(transcripts(db, columns="tx_name"))$tx_name
    }
    And for the method for \link[GenomicRanges]{GRangesList} objects,
    \code{names(x)} must be a subset of:
    \preformatted{  names(db)
    }
  }
  \item{db}{
    For the default \code{proteinToGenome()} method: A \link{TxDb}
    object or any object that supports \code{\link{transcripts}()}
    and \code{\link{cdsBy}()} (e.g. an \link[ensembldb]{EnsDb} object
    from the \pkg{ensembldb} package).

    For the method for \link[GenomicRanges]{GRangesList} objects:
    A named \link[GenomicRanges]{GRangesList} object (or derivative)
    where each list element is a \link[GenomicRanges]{GRanges} object
    representing a CDS (the ranges in the \link[GenomicRanges]{GRanges}
    object must represent the CDS parts ordered by ascending exon rank).
  }
  \item{...}{
    Further arguments to be passed to specific methods.
  }
}

\details{
  The \code{proteinToGenome()} method for \link[GenomicRanges]{GRangesList}
  objects is the workhorse behind the default method. Note that the latter
  is a thin wrapper around the former, which simply does the following:
  \enumerate{
    \item Use \code{\link{cdsBy}()} to extract the CDS from \code{db}.
          The CDS are returned in a \link[GenomicRanges]{GRangesList}
          object with transcript names on it.
    \item Call \code{proteinToGenome()} on \code{x} and the
          \link[GenomicRanges]{GRangesList} object returned by
          \code{\link{cdsBy}()}.
  }
}

\value{
  A named \link[GenomicRanges]{GRangesList} object \emph{parallel} to
  \code{x} (the transcript names on \code{x} are propagated).
  The i-th list element in the returned object is the result of mapping
  the range of protein-relative positions \code{x[i]} to the genome.

  Note that a given range in \code{x} can only be mapped to the genome
  if the name on it is the name of a \emph{coding} transcript. If it's
  not (i.e. if it's the name of a \emph{non-coding} transcript), then
  an empty \link[GenomicRanges]{GRanges} object is placed in the returned
  object to indicate the impossible mapping, and a warning is issued.

  Otherwise, if a given range in \code{x} can be mapped to the
  genome, then the result of the mapping is represented by a
  non-empty \link[GenomicRanges]{GRanges} object.
  Note that this object represents the original CDS associated to
  \code{x}, trimmed on its 5' end or 3' end, or on both.
  Furthermore, this object will have the same metadata columns as the
  \link[GenomicRanges]{GRanges} object representing the original CDS,
  plus the 2 following ones:
  \itemize{
    \item \code{protein_start}: The protein-relative start of the mapping.
    \item \code{protein_end}: The protein-relative end of the mapping.
  }
}

\note{
  Unlike \code{ensembldb::\link[ensembldb]{proteinToGenome}()} which
  can work either with Ensembl protein IDs or Ensembl transcript IDs
  on \code{x}, the default \code{proteinToGenome()} method described
  above only accepts \emph{transcript names} on \code{x}.

  This means that, if the user is in possession of protein IDs, they
  must first replace them with the corresponding transcript IDs (referred
  to as \emph{transcript names} in the context of \link{TxDb} objects).
  How to do this exactly depends on the origin of those IDs (UCSC,
  Ensembl, GTF/GFF3 file, FlyBase, etc...)
}

\author{H. Pagès, using \code{ensembldb::proteinToGenome()} for
        inspiration and design.}

\seealso{
  \itemize{
    \item The \code{\link[ensembldb]{proteinToGenome}} function in the
          \pkg{ensembldb} package, which the \code{proteinToGenome()}
          generic and methods documented in this man page are (loosely)
          modeled on.

    \item \link{TxDb} objects.

    \item \link[ensembldb]{EnsDb} objects (\link{TxDb}-like objects) in
          the \pkg{ensembldb} package.

    \item \code{\link{transcripts}} for extracting transcripts from a
          \link{TxDb}-like object.

    \item \code{\link{cdsBy}} for extracting CDS from a \link{TxDb}-like
          object.

    \item \link[IRanges]{IRanges} objects in the \pkg{IRanges} package.

    \item \link[GenomicRanges]{GRanges} and \link[GenomicRanges]{GRangesList}
          objects in the \pkg{GenomicRanges} package.
  }
}

\examples{
## ---------------------------------------------------------------------
## USING TOY CDS
## ---------------------------------------------------------------------
CDS1 <- GRanges(c("chrX:11-60:+", "chrX:101-125:+"))
CDS2 <- GRanges(c("chrY:201-230:-", "chrY:101-125:-", "chrY:11-60:-"))
cds_by_tx <- GRangesList(TX1=CDS1, TX2=CDS2)
cds_by_tx

x1 <- IRanges(start=8, end=20, names="TX1")
proteinToGenome(x1, cds_by_tx)

x2 <- IRanges(start=c(1, 18), end=c(25, 20), names=c("TX1", "TX1"))
x2
proteinToGenome(x2, cds_by_tx)

x3 <- IRanges(start=8, end=15, names="TX2")
proteinToGenome(x3, cds_by_tx)

x4 <- c(x3, x2)
x4
proteinToGenome(x4, cds_by_tx)

## ---------------------------------------------------------------------
## USING A TxDb OBJECT
## ---------------------------------------------------------------------
library(TxDb.Dmelanogaster.UCSC.dm3.ensGene)
txdb <- TxDb.Dmelanogaster.UCSC.dm3.ensGene

## The first transcript (FBtr0309810) is non-coding:
x <- IRanges(c(FBtr0309810="11-55", FBtr0306539="90-300"))
res <- proteinToGenome(x, txdb)
res
}

\keyword{methods}
\keyword{utilities}