1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322
|
\name{makeTxDbFromBiomart}
\alias{makeTxDbFromBiomart}
\alias{getChromInfoFromBiomart}
\title{
Make a TxDb object from annotations available on a
BioMart database
}
\description{
The \code{makeTxDbFromBiomart} function allows the user to make a
\link[GenomicFeatures]{TxDb} object from transcript annotations
available on a BioMart database.
Note that \code{makeTxDbFromBiomart} is being phased out
in favor of \code{\link{makeTxDbFromEnsembl}}.
}
\usage{
makeTxDbFromBiomart(biomart="ENSEMBL_MART_ENSEMBL",
dataset="hsapiens_gene_ensembl",
transcript_ids=NULL,
circ_seqs=NULL,
filter=NULL,
id_prefix="ensembl_",
host="https://www.ensembl.org",
port,
taxonomyId=NA,
miRBaseBuild=NA)
getChromInfoFromBiomart(biomart="ENSEMBL_MART_ENSEMBL",
dataset="hsapiens_gene_ensembl",
id_prefix="ensembl_",
host="https://www.ensembl.org",
port)
}
\arguments{
\item{biomart}{which BioMart database to use.
Get the list of all available BioMart databases with the
\code{\link[biomaRt]{listMarts}} function from the biomaRt
package. See the details section below for a list of BioMart
databases with compatible transcript annotations.}
\item{dataset}{which dataset from BioMart. For example:
\code{"hsapiens_gene_ensembl"}, \code{"mmusculus_gene_ensembl"},
\code{"dmelanogaster_gene_ensembl"}, \code{"celegans_gene_ensembl"},
etc. in the Ensembl database.
See the examples section below for how to discover which datasets
are available in a given BioMart database.}
\item{transcript_ids}{optionally, only retrieve transcript
annotation data for the specified set of transcript ids.
If this is used, then the meta information displayed for the
resulting \link[GenomicFeatures]{TxDb} object will say 'Full dataset: no'.
Otherwise it will say 'Full dataset: yes'.}
\item{circ_seqs}{a character vector to list out which chromosomes
should be marked as circular.}
\item{filter}{Additional filters to use in the BioMart query. Must be
a named list. An example is \code{filter=list(source="entrez")}.}
\item{id_prefix}{Specifies the prefix used in BioMart attributes. For
example, some BioMarts may have an attribute specified as
\code{"ensembl_transcript_id"} whereas others have the same attribute
specified as \code{"transcript_id"}. Defaults to \code{"ensembl_"}.}
\item{host}{The host URL of the BioMart. Defaults to \code{"https://www.ensembl.org"}.}
\item{port}{The port to use in the HTTP communication with the host. This
argument is deprecated: the port is now chosen by \code{useEnsembl}
based on the \code{host} value.}
\item{taxonomyId}{By default this value is NA and the dataset
selected will be used to look up the correct value for this. But
you can use this argument to override that and supply your own
taxId here (which will be independently checked to make sure it's a
real taxonomy id). Normally you should never need to use this.}
\item{miRBaseBuild}{specify the string for the appropriate build
information from mirbase.db to use for microRNAs. This can be
learned by calling \code{supportedMiRBaseBuildValues}. By default,
this value will be set to \code{NA}, which will inactivate the
\code{microRNAs} accessor.}
}
\details{
\code{makeTxDbFromBiomart} is a convenience function that feeds
data from a BioMart database to the lower level
\code{\link{makeTxDb}} function.
See \code{?\link{makeTxDbFromUCSC}} for a similar function
that feeds data from the UCSC source.
Here is a list of datasets known to be compatible with
\code{makeTxDbFromBiomart} (list updated on September 18, 2017):
\enumerate{
\item All the datasets in the main Ensembl database.
Get the list with:
\preformatted{ mart <- biomaRt::useEnsembl(biomart="ENSEMBL_MART_ENSEMBL",
host="https://www.ensembl.org")
biomaRt::listDatasets(mart)}
\item All the datasets in the Ensembl Fungi database.
Get the list with:
\preformatted{ mart <- biomaRt::useEnsemblGenomes(biomart="fungi_mart")
biomaRt::listDatasets(mart)}
\item All the datasets in the Ensembl Metazoa database.
Get the list with:
\preformatted{ mart <- biomaRt::useEnsemblGenomes(biomart="metazoa_mart")
biomaRt::listDatasets(mart)}
\item All the datasets in the Ensembl Plants database.
Get the list with:
\preformatted{ mart <- biomaRt::useEnsemblGenomes(biomart="plants_mart")
biomaRt::listDatasets(mart)}
\item All the datasets in the Ensembl Protists database.
Get the list with:
\preformatted{ mart <- biomaRt::useEnsemblGenomes(biomart="protists_mart")
biomaRt::listDatasets(mart)}
\item All the datasets in the Gramene Mart.
Get the list with:
\preformatted{ mart <- biomaRt::useEnsembl(biomart="ENSEMBL_MART_PLANT",
host="https://ensembl.gramene.org")
biomaRt::listDatasets(mart)}
}
Note that BioMart is not currently available for Ensembl Bacteria.
Also please note that not all these datasets have CDS information.
}
\value{
A \link[GenomicFeatures]{TxDb} object for \code{makeTxDbFromBiomart}.
A data frame with 1 row per chromosome (or scaffold) and with columns
\code{chrom} and \code{length} for \code{getChromInfoFromBiomart}.
}
\author{
M. Carlson and H. Pagès
}
\seealso{
\itemize{
\item \code{\link{makeTxDbFromUCSC}} and \code{\link{makeTxDbFromEnsembl}}
for making a \link[GenomicFeatures]{TxDb} object from other online
resources.
\item \code{\link{makeTxDbFromGRanges}} and \code{\link{makeTxDbFromGFF}}
for making a \link[GenomicFeatures]{TxDb} object from a
\link[GenomicRanges]{GRanges} object, or from a GFF or GTF file.
\item The \code{\link[biomaRt]{listMarts}},
\code{\link[biomaRt]{useEnsembl}},
\code{\link[biomaRt]{listDatasets}}, and
\code{\link[biomaRt]{listFilters}} functions in the
\pkg{biomaRt} package.
\item The \code{\link{supportedMiRBaseBuildValues}} function for
listing all the possible values for the \code{miRBaseBuild}
argument.
\item \link[GenomicFeatures]{TxDb} objects implemented in the
\pkg{GenomicFeatures} package.
\item \code{\link{makeTxDb}} for the low-level function used
by the \code{makeTxDbFrom*} functions to make the
\link[GenomicFeatures]{TxDb} object returned to the user.
}
}
\examples{
## ---------------------------------------------------------------------
## A. BASIC USAGE
## ---------------------------------------------------------------------
## We can use listDatasets() from the biomaRt package to list the
## datasets available in the "ENSEMBL_MART_ENSEMBL" BioMart database:
library(biomaRt)
listMarts(host="https://www.ensembl.org")
mart <- useEnsembl(biomart="ENSEMBL_MART_ENSEMBL", host="https://www.ensembl.org")
datasets <- listDatasets(mart)
head(datasets)
subset(datasets, grepl("elegans", dataset, ignore.case=TRUE))
## Retrieve the full transcript dataset for Worm:
txdb1 <- makeTxDbFromBiomart(dataset="celegans_gene_ensembl")
txdb1
## Retrieve an incomplete transcript dataset for Human:
transcript_ids <- c(
"ENST00000013894",
"ENST00000268655",
"ENST00000313243",
"ENST00000435657",
"ENST00000384428",
"ENST00000478783"
)
if (interactive()) {
txdb2 <- makeTxDbFromBiomart(dataset="hsapiens_gene_ensembl",
transcript_ids=transcript_ids)
txdb2 # note that these annotations match the GRCh38 genome assembly
}
## ---------------------------------------------------------------------
## B. ACCESSING THE EnsemblGenomes MARTS
## ---------------------------------------------------------------------
library(biomaRt)
## Note that BioMart is not currently available for Ensembl Bacteria.
## ---------------------
## --- Ensembl Fungi ---
mart <- useEnsemblGenomes(biomart="fungi_mart")
datasets <- listDatasets(mart)
datasets$dataset
yeast_txdb <- makeTxDbFromBiomart(biomart="fungi_mart",
dataset="scerevisiae_eg_gene",
host="https://fungi.ensembl.org")
yeast_txdb
## -----------------------
## --- Ensembl Metazoa ---
## The metazoa mart is slow and at the same time it doesn't seem to
## support requests that take more than 1 min at the moment. So a call to
## biomaRt::getBM() will fail with a "Timeout was reached" error if the
## requested data takes more than 1 min to download. This unfortunately
## happens with the example below so we don't try to run it for now.
\donttest{
mart <- useEnsemblGenomes(biomart="metazoa_mart")
datasets <- listDatasets(mart)
datasets$dataset
worm_txdb <- makeTxDbFromBiomart(biomart="metazoa_mart",
dataset="celegans_eg_gene",
host="https://metazoa.ensembl.org")
worm_txdb
## Note that even if the dataset for Worm on Ensembl Metazoa contains
## the same transcript as on the main Ensembl database, the transcript
## type might be annotated with slightly different terms (e.g. antisense
## vs antisense_RNA):
filter <- list(tx_name="Y71G12B.44")
transcripts(worm_txdb, filter=filter, columns=c("tx_name", "tx_type"))
transcripts(txdb1, filter=filter, columns=c("tx_name", "tx_type"))
}
## ----------------------
## --- Ensembl Plants ---
## Like the metazoa mart (see above), the plants mart is also slow and
## doesn't seem to support requests that take more than 1 min either.
## So we don't try to run the example below for now.
\donttest{
mart <- useEnsemblGenomes(biomart="plants_mart")
datasets <- listDatasets(mart)
datasets[ , 1:2]
athaliana_txdb <- makeTxDbFromBiomart(biomart="plants_mart",
dataset="athaliana_eg_gene",
host="https://plants.ensembl.org")
athaliana_txdb
}
## ------------------------
## --- Ensembl Protists ---
mart <- useEnsemblGenomes(biomart="protists_mart")
datasets <- listDatasets(mart)
datasets$dataset
tgondii_txdb <- makeTxDbFromBiomart(biomart="protists_mart",
dataset="tgondii_eg_gene",
host="https://protists.ensembl.org")
tgondii_txdb
## ---------------------------------------------------------------------
## C. USING AN Ensembl MIRROR
## ---------------------------------------------------------------------
## You can use the 'host' argument to access the "ENSEMBL_MART_ENSEMBL"
## BioMart database at a mirror (e.g. at useast.ensembl.org). A gotcha
## when doing this is that the name of the database on the mirror might
## be different! We can check this with listMarts() from the biomaRt
## package:
if (interactive()) {
listMarts(host="https://useast.ensembl.org")
txdb3 <- makeTxDbFromBiomart(biomart="ENSEMBL_MART_ENSEMBL",
dataset="hsapiens_gene_ensembl",
transcript_ids=transcript_ids,
host="https://useast.ensembl.org")
txdb3
}
## Therefore in addition to setting 'host' to "https://useast.ensembl.org", we
## might also need to specify the 'biomart' argument.
## ---------------------------------------------------------------------
## D. USING FILTERS
## ---------------------------------------------------------------------
## We can use listFilters() from the biomaRt package to get valid filter
## names:
mart <- useEnsembl(biomart="ENSEMBL_MART_ENSEMBL",
dataset="hsapiens_gene_ensembl",
host="https://www.ensembl.org")
head(listFilters(mart))
## Retrieve transcript dataset for Ensembl gene ENSG00000011198:
my_filter <- list(ensembl_gene_id="ENSG00000011198")
if (interactive()) {
txdb4 <- makeTxDbFromBiomart(dataset="hsapiens_gene_ensembl",
filter=my_filter)
txdb4
transcripts(txdb4, columns=c("tx_id", "tx_name", "gene_id"))
transcriptLengths(txdb4)
}
## ---------------------------------------------------------------------
## E. RETRIEVING CHROMOSOME INFORMATION ONLY
## ---------------------------------------------------------------------
chrominfo <- getChromInfoFromBiomart(dataset="celegans_gene_ensembl")
chrominfo
}
|