File: makeTxDbFromBiomart.Rd

package info (click to toggle)
r-bioc-txdbmaker 1.2.1%2Bds-2
  • links: PTS, VCS
  • area: main
  • in suites: sid, trixie
  • size: 3,168 kB
  • sloc: makefile: 2
file content (322 lines) | stat: -rw-r--r-- 12,601 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
\name{makeTxDbFromBiomart}

\alias{makeTxDbFromBiomart}
\alias{getChromInfoFromBiomart}

\title{
  Make a TxDb object from annotations available on a
  BioMart database
}
\description{
  The \code{makeTxDbFromBiomart} function allows the user to make a
  \link[GenomicFeatures]{TxDb} object from transcript annotations
  available on a BioMart database.

  Note that \code{makeTxDbFromBiomart} is being phased out
  in favor of \code{\link{makeTxDbFromEnsembl}}.
}
\usage{
makeTxDbFromBiomart(biomart="ENSEMBL_MART_ENSEMBL",
                    dataset="hsapiens_gene_ensembl",
                    transcript_ids=NULL,
                    circ_seqs=NULL,
                    filter=NULL,
                    id_prefix="ensembl_",
                    host="https://www.ensembl.org",
                    port,
                    taxonomyId=NA,
                    miRBaseBuild=NA)

getChromInfoFromBiomart(biomart="ENSEMBL_MART_ENSEMBL",
                        dataset="hsapiens_gene_ensembl",
                        id_prefix="ensembl_",
                        host="https://www.ensembl.org",
                        port)
}
\arguments{
  \item{biomart}{which BioMart database to use.
    Get the list of all available BioMart databases with the
    \code{\link[biomaRt]{listMarts}} function from the biomaRt
    package. See the details section below for a list of BioMart
    databases with compatible transcript annotations.}
  \item{dataset}{which dataset from BioMart. For example:
    \code{"hsapiens_gene_ensembl"}, \code{"mmusculus_gene_ensembl"},
    \code{"dmelanogaster_gene_ensembl"}, \code{"celegans_gene_ensembl"},
    etc. in the Ensembl database.
    See the examples section below for how to discover which datasets
    are available in a given BioMart database.}
  \item{transcript_ids}{optionally, only retrieve transcript
    annotation data for the specified set of transcript ids.
    If this is used, then the meta information displayed for the
    resulting \link[GenomicFeatures]{TxDb} object will say 'Full dataset: no'.
    Otherwise it will say 'Full dataset: yes'.}
  \item{circ_seqs}{a character vector to list out which chromosomes
    should be marked as circular.}
  \item{filter}{Additional filters to use in the BioMart query. Must be
    a named list. An example is \code{filter=list(source="entrez")}.}
  \item{id_prefix}{Specifies the prefix used in BioMart attributes. For
    example, some BioMarts may have an attribute specified as
    \code{"ensembl_transcript_id"} whereas others have the same attribute
    specified as \code{"transcript_id"}. Defaults to \code{"ensembl_"}.}
  \item{host}{The host URL of the BioMart. Defaults to
    \code{"https://www.ensembl.org"}.}
  \item{port}{The port to use in the HTTP communication with the host. This
    argument has been deprecated. It is handled by \code{useEnsembl}
    depending on the host input.}
  \item{taxonomyId}{By default this value is NA and the dataset
    selected will be used to look up the correct value for this.  But
    you can use this argument to override that and supply your own
    taxId here (which will be independently checked to make sure it's a
    real taxonomy id).  Normally you should never need to use this.}
  \item{miRBaseBuild}{specify the string for the appropriate build
    information from mirbase.db to use for microRNAs.  This can be
    learned by calling \code{supportedMiRBaseBuildValues}.  By default,
    this value will be set to \code{NA}, which will inactivate the
    \code{microRNAs} accessor.}
}

\details{
  \code{makeTxDbFromBiomart} is a convenience function that feeds
  data from a BioMart database to the lower level
  \code{\link{makeTxDb}} function.
  See \code{?\link{makeTxDbFromUCSC}} for a similar function
  that feeds data from the UCSC source.

  Here is a list of datasets known to be compatible with
  \code{makeTxDbFromBiomart} (list updated on September 18, 2017):
  \enumerate{
    \item All the datasets in the main Ensembl database.
          Get the list with:
\preformatted{    mart <- biomaRt::useEnsembl(biomart="ENSEMBL_MART_ENSEMBL",
                             host="https://www.ensembl.org")
    biomaRt::listDatasets(mart)}

    \item All the datasets in the Ensembl Fungi database.
          Get the list with:
\preformatted{    mart <- biomaRt::useEnsemblGenomes(biomart="fungi_mart")
    biomaRt::listDatasets(mart)}

    \item All the datasets in the Ensembl Metazoa database.
          Get the list with:
\preformatted{    mart <- biomaRt::useEnsemblGenomes(biomart="metazoa_mart")
    biomaRt::listDatasets(mart)}

    \item All the datasets in the Ensembl Plants database.
          Get the list with:
\preformatted{    mart <- biomaRt::useEnsemblGenomes(biomart="plants_mart")
    biomaRt::listDatasets(mart)}

    \item All the datasets in the Ensembl Protists database.
          Get the list with:
\preformatted{    mart <- biomaRt::useEnsemblGenomes(biomart="protists_mart")
    biomaRt::listDatasets(mart)}

    \item All the datasets in the Gramene Mart.
          Get the list with:
\preformatted{    mart <- biomaRt::useEnsembl(biomart="ENSEMBL_MART_PLANT",
                             host="https://ensembl.gramene.org")
    biomaRt::listDatasets(mart)}
  }

  Note that BioMart is not currently available for Ensembl Bacteria.

  Also please note that not all these datasets have CDS information.
}

\value{
  A \link[GenomicFeatures]{TxDb} object for \code{makeTxDbFromBiomart}.

  A data frame with 1 row per chromosome (or scaffold) and with columns
  \code{chrom} and \code{length} for \code{getChromInfoFromBiomart}.
}

\author{
  M. Carlson and H. Pagès
}

\seealso{
  \itemize{
    \item \code{\link{makeTxDbFromUCSC}} and \code{\link{makeTxDbFromEnsembl}}
          for making a \link[GenomicFeatures]{TxDb} object from other online
          resources.

    \item \code{\link{makeTxDbFromGRanges}} and \code{\link{makeTxDbFromGFF}}
          for making a \link[GenomicFeatures]{TxDb} object from a
          \link[GenomicRanges]{GRanges} object, or from a GFF or GTF file.

    \item The \code{\link[biomaRt]{listMarts}},
          \code{\link[biomaRt]{useEnsembl}},
          \code{\link[biomaRt]{listDatasets}}, and
          \code{\link[biomaRt]{listFilters}} functions in the
          \pkg{biomaRt} package.

    \item The \code{\link{supportedMiRBaseBuildValues}} function for
          listing all the possible values for the \code{miRBaseBuild}
          argument.

    \item \link[GenomicFeatures]{TxDb} objects implemented in the
          \pkg{GenomicFeatures} package.

    \item \code{\link{makeTxDb}} for the low-level function used
          by the \code{makeTxDbFrom*} functions to make the
          \link[GenomicFeatures]{TxDb} object returned to the user.
  }
}

\examples{
## ---------------------------------------------------------------------
## A. BASIC USAGE
## ---------------------------------------------------------------------

## We can use listDatasets() from the biomaRt package to list the
## datasets available in the "ENSEMBL_MART_ENSEMBL" BioMart database:
library(biomaRt)
listMarts(host="https://www.ensembl.org")
mart <- useEnsembl(biomart="ENSEMBL_MART_ENSEMBL", host="https://www.ensembl.org")
datasets <- listDatasets(mart)
head(datasets)
subset(datasets, grepl("elegans", dataset, ignore.case=TRUE))

## Retrieve the full transcript dataset for Worm:
txdb1 <- makeTxDbFromBiomart(dataset="celegans_gene_ensembl")
txdb1

## Retrieve an incomplete transcript dataset for Human:
transcript_ids <- c(
    "ENST00000013894",
    "ENST00000268655",
    "ENST00000313243",
    "ENST00000435657",
    "ENST00000384428",
    "ENST00000478783"
)

if (interactive()) {
  txdb2 <- makeTxDbFromBiomart(dataset="hsapiens_gene_ensembl",
                               transcript_ids=transcript_ids)
  txdb2  # note that these annotations match the GRCh38 genome assembly
}

## ---------------------------------------------------------------------
## B. ACCESSING THE EnsemblGenomes MARTS
## ---------------------------------------------------------------------

library(biomaRt)

## Note that BioMart is not currently available for Ensembl Bacteria.

## ---------------------
## --- Ensembl Fungi ---

mart <- useEnsemblGenomes(biomart="fungi_mart")
datasets <- listDatasets(mart)
datasets$dataset
yeast_txdb <- makeTxDbFromBiomart(biomart="fungi_mart",
                                  dataset="scerevisiae_eg_gene",
                                  host="https://fungi.ensembl.org")
yeast_txdb

## -----------------------
## --- Ensembl Metazoa ---

## The metazoa mart is slow and at the same time it doesn't seem to
## support requests that take more than 1 min at the moment. So a call to
## biomaRt::getBM() will fail with a "Timeout was reached" error if the
## requested data takes more than 1 min to download. This unfortunately
## happens with the example below so we don't try to run it for now.

\donttest{
  mart <- useEnsemblGenomes(biomart="metazoa_mart")
  datasets <- listDatasets(mart)
  datasets$dataset
  worm_txdb <- makeTxDbFromBiomart(biomart="metazoa_mart",
                                   dataset="celegans_eg_gene",
                                   host="https://metazoa.ensembl.org")
  worm_txdb

  ## Note that even if the dataset for Worm on Ensembl Metazoa contains
  ## the same transcript as on the main Ensembl database, the transcript
  ## type might be annotated with slightly different terms (e.g. antisense
  ## vs antisense_RNA):
  filter <- list(tx_name="Y71G12B.44")
  transcripts(worm_txdb, filter=filter, columns=c("tx_name", "tx_type"))
  transcripts(txdb1, filter=filter, columns=c("tx_name", "tx_type"))
}
## ----------------------
## --- Ensembl Plants ---

## Like the metazoa mart (see above), the plants mart is also slow and
## doesn't seem to support requests that take more than 1 min either.
## So we don't try to run the example below for now.

\donttest{
mart <- useEnsemblGenomes(biomart="plants_mart")
datasets <- listDatasets(mart)
datasets[ , 1:2]
athaliana_txdb <- makeTxDbFromBiomart(biomart="plants_mart",
                                      dataset="athaliana_eg_gene",
                                      host="https://plants.ensembl.org")
athaliana_txdb
}
## ------------------------
## --- Ensembl Protists ---

mart <- useEnsemblGenomes(biomart="protists_mart")
datasets <- listDatasets(mart)
datasets$dataset
tgondii_txdb <- makeTxDbFromBiomart(biomart="protists_mart",
                                    dataset="tgondii_eg_gene",
                                    host="https://protists.ensembl.org")
tgondii_txdb

## ---------------------------------------------------------------------
## C. USING AN Ensembl MIRROR
## ---------------------------------------------------------------------

## You can use the 'host' argument to access the "ENSEMBL_MART_ENSEMBL"
## BioMart database at a mirror (e.g. at useast.ensembl.org). A gotcha
## when doing this is that the name of the database on the mirror might
## be different! We can check this with listMarts() from the biomaRt
## package:
if (interactive()) {

  listMarts(host="https://useast.ensembl.org")

  txdb3 <- makeTxDbFromBiomart(biomart="ENSEMBL_MART_ENSEMBL",
                               dataset="hsapiens_gene_ensembl",
                               transcript_ids=transcript_ids,
                               host="https://useast.ensembl.org")
  txdb3
}
## Therefore in addition to setting 'host' to "https://useast.ensembl.org",
## we might also need to specify the 'biomart' argument.


## ---------------------------------------------------------------------
## D. USING FILTERS
## ---------------------------------------------------------------------

## We can use listFilters() from the biomaRt package to get valid filter
## names:
mart <- useEnsembl(biomart="ENSEMBL_MART_ENSEMBL",
                dataset="hsapiens_gene_ensembl",
                host="https://www.ensembl.org")
head(listFilters(mart))

## Retrieve transcript dataset for Ensembl gene ENSG00000011198:
my_filter <- list(ensembl_gene_id="ENSG00000011198")

if (interactive()) {
  txdb4 <- makeTxDbFromBiomart(dataset="hsapiens_gene_ensembl",
                               filter=my_filter)
  txdb4
  transcripts(txdb4, columns=c("tx_id", "tx_name", "gene_id"))
  transcriptLengths(txdb4)
}

## ---------------------------------------------------------------------
## E. RETRIEVING CHROMOSOME INFORMATION ONLY
## ---------------------------------------------------------------------

chrominfo <- getChromInfoFromBiomart(dataset="celegans_gene_ensembl")
chrominfo
}