1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315
|
\name{makeEnsembldbPackage}
\alias{ensDbFromAH}
\alias{ensDbFromGRanges}
\alias{ensDbFromGtf}
\alias{ensDbFromGff}
\alias{makeEnsembldbPackage}
\alias{fetchTablesFromEnsembl}
\alias{makeEnsemblSQLiteFromTables}
\title{
Generating an Ensembl annotation package from Ensembl
}
\description{
The functions described on this page can be used to build \code{EnsDb}
annotation objects/databases from Ensembl annotations. The most
complete set of annotations, which also includes the NCBI Entrezgene
identifiers for each gene, can be retrieved by the functions using
the Ensembl Perl API (i.e. functions \code{fetchTablesFromEnsembl},
\code{makeEnsemblSQLiteFromTables}). Alternatively the functions
\code{ensDbFromAH}, \code{ensDbFromGRanges}, \code{ensDbFromGff} and
\code{ensDbFromGtf} can be used to build \code{EnsDb} objects using
GFF or GTF files from Ensembl, which can be either manually downloaded
from the Ensembl ftp server, or directly from within R using
\code{AnnotationHub}.
The generated SQLite database can be packaged into an R package using
the \code{makeEnsembldbPackage} function.
}
\usage{
ensDbFromAH(ah, outfile, path, organism, genomeVersion, version)
ensDbFromGRanges(x, outfile, path, organism, genomeVersion,
version, ...)
ensDbFromGff(gff, outfile, path, organism, genomeVersion,
version, ...)
ensDbFromGtf(gtf, outfile, path, organism, genomeVersion,
version, ...)
fetchTablesFromEnsembl(version, ensemblapi, user="anonymous",
host="ensembldb.ensembl.org", pass="",
port=5306, species="human")
makeEnsemblSQLiteFromTables(path=".", dbname)
makeEnsembldbPackage(ensdb, version, maintainer, author,
destDir=".", license="Artistic-2.0")
}
\arguments{
(in alphabetical order)
\item{ah}{
For \code{ensDbFromAH}: an \code{AnnotationHub} object representing
a single resource (i.e. GTF file from Ensembl) from
\code{AnnotationHub}.
}
\item{author}{
The author of the package.
}
\item{dbname}{
The name for the database (optional). By default a name based on the
species and Ensembl version will be automatically generated (and
returned by the function).
}
\item{destDir}{
Where the package should be saved to.
}
\item{ensdb}{
The file name of the SQLite database generated by \code{makeEnsemblSQLiteFromTables}.
}
\item{ensemblapi}{
The path to the Ensembl perl API installed locally on the
system. The Ensembl perl API version has to match the Ensembl
version specified with \code{version}.
}
\item{genomeVersion}{
For \code{ensDbFromAH}, \code{ensDbFromGtf} and \code{ensDbFromGff}:
the version of the genome (e.g. \code{"GRCh37"}). If not provided
the function will try to guess it from the file name (assuming file
name convention of Ensembl GTF files).
}
\item{gff}{
The GFF file to import.
}
\item{gtf}{
The GTF file name.
}
\item{host}{
The hostname to access the Ensembl database.
}
\item{license}{
The license of the package.
}
\item{maintainer}{
The maintainer of the package.
}
\item{organism}{
For \code{ensDbFromAH}, \code{ensDbFromGff} and \code{ensDbFromGtf}:
the organism name (e.g. \code{"Homo_sapiens"}). If not provided the
function will try to guess it from the file name (assuming file name
convention of Ensembl GTF files).
}
\item{outfile}{
The desired file name of the SQLite file. If not provided the name
of the GTF file will be used.
}
\item{pass}{
The password for the Ensembl database.
}
\item{path}{
The directory in which the tables retrieved by
\code{fetchTablesFromEnsembl} or the SQLite database file generated
by \code{ensDbFromGtf} are stored.
}
\item{port}{
The port to be used to connect to the Ensembl database.
}
\item{species}{
The species for which the annotations should be retrieved.
}
\item{user}{
The username for the Ensembl database.
}
\item{version}{
For \code{fetchTablesFromEnsembl}, \code{ensDbFromGRanges} and \code{ensDbFromGtf}: the
Ensembl version for which the annotation should be retrieved
(e.g. 75). The \code{ensDbFromGtf} function will try to guess the
Ensembl version from the GTF file name if not provided.
For \code{makeEnsembldbPackage}: the version for the package.
}
\item{x}{
For \code{ensDbFromGRanges}: the \code{GRanges} object.
}
\item{...}{
Currently not used.
}
}
\section{Functions}{
\describe{
\item{ensDbFromAH}{
Create an \code{EnsDb} (SQLite) database from a GTF file provided
by \code{AnnotationHub}. The function returns the file name of the
generated database file. For usage see the examples below.
}
\item{ensDbFromGff}{
Create an \code{EnsDb} (SQLite) database from a GFF file from
Ensembl. The function returns the file name of the
generated database file. For usage see the examples below.
}
\item{ensDbFromGtf}{
Create an \code{EnsDb} (SQLite) database from a GTF file from
Ensembl. The function returns the file name of the generated
database file. For usage see the examples below.
}
\item{ensDbFromGRanges}{
Create an \code{EnsDb} (SQLite) database from a GRanges object
(e.g. from \code{AnnotationHub}). The function returns the file
name of the generated database file. For usage see the examples
below.
}
\item{fetchTablesFromEnsembl}{
Uses the Ensembl Perl API to fetch all required data from an
Ensembl database server and stores them locally to text files
(that can be used as input for the
\code{makeEnsemblSQLiteFromTables} function).
}
\item{makeEnsemblSQLiteFromTables}{
Creates the SQLite \code{EnsDb} database from the tables generated
by the \code{fetchTablesFromEnsembl} function.
}
\item{makeEnsembldbPackage}{
Creates an R package containing the \code{EnsDb} database from an
\code{EnsDb} SQLite database created by any of the above
functions \code{ensDbFromAH}, \code{ensDbFromGff},
\code{ensDbFromGtf} or \code{makeEnsemblSQLiteFromTables}.
}
}
}
\details{
The \code{fetchTablesFromEnsembl} function internally calls the perl
script \code{get_gene_transcript_exon_tables.pl} to retrieve all
required information from the Ensembl database using the Ensembl perl
API.
Alternatively, an \code{EnsDb} database file can be generated by the
\code{ensDbFromGtf} or \code{ensDbFromGff} functions from a GTF or GFF file
downloaded from the Ensembl ftp server, or by using \code{ensDbFromAH}
to build a database directly from the corresponding resources in
\code{AnnotationHub}. The returned database file name can then
be used as an input to \code{makeEnsembldbPackage} or it can be
directly loaded and used by the \code{EnsDb} constructor.
}
\note{
A local installation of the Ensembl perl API is required for the
\code{fetchTablesFromEnsembl}. See
\url{http://www.ensembl.org/info/docs/api/api_installation.html} for
installation instructions.
A database generated from GTF/GFF files lacks some features, as they are
not available in the GTF files from Ensembl. These are: NCBI
Entrezgene IDs.
}
\value{
\code{makeEnsemblSQLiteFromTables}, \code{ensDbFromAH},
\code{ensDbFromGRanges}, \code{ensDbFromGff} and \code{ensDbFromGtf}:
the name of the SQLite file.
}
\seealso{
\code{\link{EnsDb}}, \code{\link{genes}}
}
\author{
Johannes Rainer
}
\examples{
\dontrun{
## get all human gene/transcript/exon annotations from Ensembl (75)
## the resulting tables will be stored by default to the current working
## directory; if the correct Ensembl api (version 75) is defined in the
## PERL5LIB environment variable, the ensemblapi parameter can also be omitted.
fetchTablesFromEnsembl(75,
ensemblapi="/home/bioinfo/ensembl/75/API/ensembl/modules",
species="human")
## These tables can then be processed to generate a SQLite database
## containing the annotations
DBFile <- makeEnsemblSQLiteFromTables()
## and finally we can generate the package
makeEnsembldbPackage(ensdb=DBFile, version="0.0.1",
maintainer="Johannes Rainer <johannes.rainer@eurac.edu>",
author="J Rainer")
## Build an annotation database from a GFF file from Ensembl.
## ftp://ftp.ensembl.org/pub/release-83/gff3/rattus_norvegicus
gff <- "Rattus_norvegicus.Rnor_6.0.83.gff3.gz"
DB <- ensDbFromGff(gff=gff)
edb <- EnsDb(DB)
edb
## Build an annotation file from a GTF file.
## the GTF file can be downloaded from
## ftp://ftp.ensembl.org/pub/release-75/gtf/homo_sapiens/
gtffile <- "Homo_sapiens.GRCh37.75.gtf.gz"
## generate the SQLite database file
DB <- ensDbFromGtf(gtf=paste0(ensemblhost, gtffile))
## load the DB file directly
EDB <- EnsDb(DB)
## Alternatively, we could fetch a GTF file directly from AnnotationHub
## and build the database from that:
library(AnnotationHub)
ah <- AnnotationHub()
## Query for all GTF files from Ensembl for Ensembl version 81
query(ah, c("Ensembl", "release-81", "GTF"))
## We could get the one from e.g. Bos taurus:
DB <- ensDbFromAH(ah["AH47941"])
edb <- EnsDb(DB)
edb
}
## Generate a sqlite database for genes encoded on chromosome Y
chrY <- system.file("chrY", package="ensembldb")
DBFile <- makeEnsemblSQLiteFromTables(path=chrY ,dbname=tempfile())
## load this database:
edb <- EnsDb(DBFile)
edb
## Generate a sqlite database from a GRanges object specifying
## genes encoded on chromosome Y
load(system.file("YGRanges.RData", package="ensembldb"))
Y
DB <- ensDbFromGRanges(Y, path=tempdir(), version=75,
organism="Homo_sapiens")
edb <- EnsDb(DB)
}
\keyword{ data }
|