File: makeEnsemblDbPackage.Rd

package info (click to toggle)
r-bioc-ensembldb 2.14.0%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 2,764 kB
  • sloc: perl: 331; sh: 15; makefile: 5
file content (315 lines) | stat: -rw-r--r-- 9,946 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
\name{makeEnsembldbPackage}
\alias{ensDbFromAH}
\alias{ensDbFromGRanges}
\alias{ensDbFromGtf}
\alias{ensDbFromGff}
\alias{makeEnsembldbPackage}
\alias{fetchTablesFromEnsembl}
\alias{makeEnsemblSQLiteFromTables}
\title{
  Generating an Ensembl annotation package from Ensembl
}
\description{
  The functions described on this page allow building \code{EnsDb}
  annotation objects/databases from Ensembl annotations. The most
  complete set of annotations, which include also the NCBI Entrezgene
  identifiers for each gene, can be retrieved by the functions using
  the Ensembl Perl API (i.e. functions \code{fetchTablesFromEnsembl},
  \code{makeEnsemblSQLiteFromTables}). Alternatively the functions
  \code{ensDbFromAH}, \code{ensDbFromGRanges}, \code{ensDbFromGff} and
  \code{ensDbFromGtf} can be used to build \code{EnsDb} objects using
  GFF or GTF files from Ensembl, which can be either manually downloaded
  from the Ensembl ftp server, or directly from within R using
  \code{AnnotationHub}.
  The generated SQLite database can be packaged into an R package using
  the \code{makeEnsembldbPackage}.
}
\usage{

ensDbFromAH(ah, outfile, path, organism, genomeVersion, version)

ensDbFromGRanges(x, outfile, path, organism, genomeVersion,
                 version, ...)

ensDbFromGff(gff, outfile, path, organism, genomeVersion,
             version, ...)

ensDbFromGtf(gtf, outfile, path, organism, genomeVersion,
             version, ...)

fetchTablesFromEnsembl(version, ensemblapi, user="anonymous",
                       host="ensembldb.ensembl.org", pass="",
                       port=5306, species="human")

makeEnsemblSQLiteFromTables(path=".", dbname)

makeEnsembldbPackage(ensdb, version, maintainer, author,
                     destDir=".", license="Artistic-2.0")

}
\arguments{
  (in alphabetical order)

  \item{ah}{
    For \code{ensDbFromAH}: an \code{AnnotationHub} object representing
    a single resource (i.e. GTF file from Ensembl) from
    \code{AnnotationHub}.
  }

  \item{author}{
    The author of the package.
  }

  \item{dbname}{
    The name for the database (optional). By default a name based on the
    species and Ensembl version will be automatically generated (and
    returned by the function).
  }

  \item{destDir}{
    Where the package should be saved to.
  }

  \item{ensdb}{
    The file name of the SQLite database generated by \code{makeEnsemblSQLiteFromTables}.
  }

  \item{ensemblapi}{
    The path to the Ensembl perl API installed locally on the
    system. The version of the Ensembl perl API has to match the
    Ensembl version requested with \code{version}.
  }

  \item{genomeVersion}{
    For \code{ensDbFromAH}, \code{ensDbFromGtf} and \code{ensDbFromGff}:
    the version of the genome (e.g. \code{"GRCh37"}). If not provided
    the function will try to guess it from the file name (assuming file
    name convention of Ensembl GTF files).
  }

  \item{gff}{
    The GFF file to import.
  }

  \item{gtf}{
    The GTF file name.
  }

  \item{host}{
    The hostname to access the Ensembl database.
  }

  \item{license}{
    The license of the package.
  }

  \item{maintainer}{
    The maintainer of the package.
  }

  \item{organism}{
    For \code{ensDbFromAH}, \code{ensDbFromGff} and \code{ensDbFromGtf}:
    the organism name (e.g. \code{"Homo_sapiens"}). If not provided the
    function will try to guess it from the file name (assuming file name
    convention of Ensembl GTF files).
  }

  \item{outfile}{
    The desired file name of the SQLite file. If not provided the name
    of the GTF file will be used.
  }

  \item{pass}{
    The password for the Ensembl database.
  }

  \item{path}{
    The directory in which the tables retrieved by
    \code{fetchTablesFromEnsembl} or the SQLite database file generated
    by \code{ensDbFromGtf} are stored.
  }

  \item{port}{
    The port to be used to connect to the Ensembl database.
  }

  \item{species}{
    The species for which the annotations should be retrieved.
  }

  \item{user}{
    The username for the Ensembl database.
  }

  \item{version}{
    For \code{fetchTablesFromEnsembl}, \code{ensDbFromGRanges} and \code{ensDbFromGtf}: the
    Ensembl version for which the annotation should be retrieved
    (e.g. 75). The \code{ensDbFromGtf} function will try to guess the
    Ensembl version from the GTF file name if not provided.

    For \code{makeEnsembldbPackage}: the version for the package.
  }

  \item{x}{
    For \code{ensDbFromGRanges}: the \code{GRanges} object.
  }

  \item{...}{
    Currently not used.
  }

}
\section{Functions}{
  \describe{
    \item{ensDbFromAH}{
      Create an \code{EnsDb} (SQLite) database from a GTF file provided
      by \code{AnnotationHub}. The function returns the file name of the
      generated database file. For usage see the examples below.
    }

    \item{ensDbFromGff}{
      Create an \code{EnsDb} (SQLite) database from a GFF file from
      Ensembl. The function returns the file name of the
      generated database file. For usage see the examples below.
    }

    \item{ensDbFromGtf}{
      Create an \code{EnsDb} (SQLite) database from a GTF file from
      Ensembl. The function returns the file name of the generated
      database file. For usage see the examples below.
    }

    \item{ensDbFromGRanges}{
      Create an \code{EnsDb} (SQLite) database from a GRanges object
      (e.g. from \code{AnnotationHub}). The function returns the file
      name of the generated database file. For usage see the examples
      below.
    }

    \item{fetchTablesFromEnsembl}{
      Uses the Ensembl Perl API to fetch all required data from an
      Ensembl database server and stores them locally to text files
      (that can be used as input for the
      \code{makeEnsemblSQLiteFromTables} function).
    }

    \item{makeEnsemblSQLiteFromTables}{
      Creates the SQLite \code{EnsDb} database from the tables generated
      by the \code{fetchTablesFromEnsembl}.
    }

    \item{makeEnsembldbPackage}{
      Creates an R package containing the \code{EnsDb} database from an
      \code{EnsDb} SQLite database created by any of the above
      functions \code{ensDbFromAH}, \code{ensDbFromGff},
      \code{ensDbFromGtf} or \code{makeEnsemblSQLiteFromTables}.
    }
  }
}

\details{
  The \code{fetchTablesFromEnsembl} function internally calls the perl
  script \code{get_gene_transcript_exon_tables.pl} to retrieve all
  required information from the Ensembl database using the Ensembl perl
  API.

  Alternatively, an \code{EnsDb} database file can be generated with
  \code{ensDbFromGtf} or \code{ensDbFromGff} from a GTF or GFF file
  downloaded from the Ensembl ftp server, or with \code{ensDbFromAH}
  directly from the corresponding resource in \code{AnnotationHub}.
  The returned database file name can then be used as input to
  \code{makeEnsembldbPackage}, or the database can be loaded and used
  directly via the \code{EnsDb} constructor.
}
\note{
  A local installation of the Ensembl perl API is required for the
  \code{fetchTablesFromEnsembl}. See
  \url{http://www.ensembl.org/info/docs/api/api_installation.html} for
  installation instructions.

  A database generated from GTF/GFF files lacks some features as they
  are not available in the GTF/GFF files from Ensembl. These are: NCBI
  Entrezgene IDs.
}
\value{
  \code{makeEnsemblSQLiteFromTables}, \code{ensDbFromAH},
  \code{ensDbFromGRanges}, \code{ensDbFromGff} and \code{ensDbFromGtf}:
  the name of the SQLite file.
}
\seealso{
  \code{\link{EnsDb}}, \code{\link{genes}}
}
\author{
Johannes Rainer
}
\examples{

\dontrun{

    ## get all human gene/transcript/exon annotations from Ensembl (75)
    ## the resulting tables will be stored by default to the current working
    ## directory; if the correct Ensembl api (version 75) is defined in the
    ## PERL5LIB environment variable, the ensemblapi parameter can also be omitted.
    fetchTablesFromEnsembl(75,
                           ensemblapi="/home/bioinfo/ensembl/75/API/ensembl/modules",
                           species="human")

    ## These tables can then be processed to generate a SQLite database
    ## containing the annotations
    DBFile <- makeEnsemblSQLiteFromTables()

    ## and finally we can generate the package
    makeEnsembldbPackage(ensdb=DBFile, version="0.0.1",
                         maintainer="Johannes Rainer <johannes.rainer@eurac.edu>",
                         author="J Rainer")

    ## Build an annotation database from a GFF file from Ensembl.
    ## ftp://ftp.ensembl.org/pub/release-83/gff3/rattus_norvegicus
    gff <- "Rattus_norvegicus.Rnor_6.0.83.gff3.gz"
    DB <- ensDbFromGff(gff=gff)
    edb <- EnsDb(DB)
    edb

    ## Build an annotation file from a GTF file.
    ## the GTF file can be downloaded from
    ## ftp://ftp.ensembl.org/pub/release-75/gtf/homo_sapiens/
    gtffile <- "Homo_sapiens.GRCh37.75.gtf.gz"
    ## generate the SQLite database file
    DB <- ensDbFromGtf(gtf=paste0(ensemblhost, gtffile))

    ## load the DB file directly
    EDB <- EnsDb(DB)

    ## Alternatively, we could fetch a GTF file directly from AnnotationHub
    ## and build the database from that:
    library(AnnotationHub)
    ah <- AnnotationHub()
    ## Query for all GTF files from Ensembl for Ensembl version 81
    query(ah, c("Ensembl", "release-81", "GTF"))
    ## We could get the one from e.g. Bos taurus:
    DB <- ensDbFromAH(ah["AH47941"])
    edb <- EnsDb(DB)
    edb
}

## Generate a sqlite database for genes encoded on chromosome Y
chrY <- system.file("chrY", package="ensembldb")
DBFile <- makeEnsemblSQLiteFromTables(path=chrY ,dbname=tempfile())
## load this database:
edb <- EnsDb(DBFile)

edb

## Generate a sqlite database from a GRanges object specifying
## genes encoded on chromosome Y
load(system.file("YGRanges.RData", package="ensembldb"))

Y

DB <- ensDbFromGRanges(Y, path=tempdir(), version=75,
                       organism="Homo_sapiens")
edb <- EnsDb(DB)


}
\keyword{ data }