File: dbhelpers.R

package info (click to toggle)
r-bioc-ensembldb 2.14.0%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 2,764 kB
  • sloc: perl: 331; sh: 15; makefile: 5
file content (814 lines) | stat: -rw-r--r-- 33,629 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
############################################################
## EnsDb
## Constructor function.
#' @title Connect to an EnsDb object
#'
#' @description The \code{EnsDb} constructor function connects to the database
#'     specified with argument \code{x} and returns a corresponding
#'     \code{\linkS4class{EnsDb}} object.
#'
#' @details
#'
#' By providing the connection to a MariaDB/MySQL database, it is possible
#' to use MariaDB/MySQL as the database backend and queries will be performed on
#' that database. Note however that this requires the package \code{RMariaDB}
#' to be installed. In addition, the user needs to have access to a MySQL
#' server providing already an EnsDb database, or must have write
#' privileges on a MySQL server, in which case the \code{\link{useMySQL}}
#' method can be used to insert the annotations from an EnsDB package into
#' a MySQL database.
#' 
#' @param x Either a character specifying the \emph{SQLite} database file, or
#'     a \code{DBIConnection} to e.g. a MariaDB/MySQL database.
#' 
#' @return A \code{\linkS4class{EnsDb}} object.
#' 
#' @author Johannes Rainer
#' 
#' @examples
#' ## "Standard" way to create an EnsDb object:
#' library(EnsDb.Hsapiens.v86)
#' EnsDb.Hsapiens.v86
#'
#' ## Alternatively, provide the full file name of a SQLite database file
#' dbfile <- system.file("extdata/EnsDb.Hsapiens.v86.sqlite", package = "EnsDb.Hsapiens.v86")
#' edb <- EnsDb(dbfile)
#' edb
#'
#' ## Third way: connect to a MySQL database
#' \dontrun{
#' library(RMariaDB)
#' dbcon <- dbConnect(MySQL(), user = my_user, pass = my_pass,
#'     host = my_host, dbname = "ensdb_hsapiens_v86")
#' edb <- EnsDb(dbcon)
#' }
EnsDb <- function(x){
    options(useFancyQuotes=FALSE)
    if(missing(x)){
        stop("No sqlite file provided!")
    }
    if (is.character(x)) {
        lite <- dbDriver("SQLite")
        con <- dbConnect(lite, dbname = x, flags=SQLITE_RO)
    }
    else if (is(x, "DBIConnection")) {
        con <- x
    } else {
        stop("'x' should be either a character specifying the SQLite file to",
             " be loaded, or a DBIConnection object providing the connection",
             " to the database.")
    }
    ## Check if the database is valid.
    OK <- dbHasRequiredTables(con)
    if (is.character(OK))
        stop(OK)
    OK <- dbHasValidTables(con)
    if (is.character(OK))
        stop(OK)
    tables <- dbListTables(con)
    ## read the columns for these tables.
    Tables <- vector(length=length(tables), "list")
    for (i in 1:length(Tables)) {
        Tables[[i]] <- colnames(dbGetQuery(con,
                                           paste0("select * from ",
                                                  tables[ i ], " limit 1")))
    }
    names(Tables) <- tables
    EDB <- new("EnsDb", ensdb = con, tables = Tables)
    EDB <- setProperty(EDB, dbSeqlevelsStyle="Ensembl")
    ## Add the db schema version to the properties.
    EDB <- setProperty(EDB, DBSCHEMAVERSION =
                                .getMetaDataValue(con, "DBSCHEMAVERSION"))
    ## Setting the default for the returnFilterColumns
    returnFilterColumns(EDB) <- TRUE
    ## Defining the default for the ordering
    orderResultsInR(EDB) <- FALSE
    ## Check it again...
    OK <- validateEnsDb(EDB)
    if (is.character(OK))
        stop(OK)
    EDB
}

## loadEnsDb <- function(x) {
##     ## con <- ensDb( x )
##     ## EDB <- new( "EnsDb", ensdb=con )
##     return(EnsDb(x))
## }


## x is the connection to the database, name is the name of the entry to fetch
.getMetaDataValue <- function(x, name){
    return(dbGetQuery(x, paste0("select value from metadata where name='",
                                name, "'"))[ 1, 1])
}

############################################################
## prefixColumns
##
## Determines which tables (along with the table attributes) are required for
## the join.
## Updated version of prefixColumns:
## o Uses the order of the tables returned by listTables and adds the first
##   table in which the column was found. That's different to the previous
##   default of trying to join as few tables as possible but avoids problems
##   with table joins between e.g. tx and protein in which not all tx_id are
##   present in the protein table.
prefixColumns <- function(x, columns, clean = TRUE, with.tables){
    if (missing(columns))
        stop("columns is empty! No columns provided!")
    ## first get to the tables that contain these columns
    Tab <- listTables(x)   ## returns the tables by degree!
    if (!missing(with.tables)) {
        with.tables <- with.tables[ with.tables %in% names(Tab) ]
        if (length(with.tables) > 0) {
            Tab <- Tab[ with.tables ]
        } else {
            warning("The submitted table names are not valid in the database",
                    " and were thus dropped.")
        }
        if (length(Tab) == 0)
            stop("None of the tables submitted with with.tables is present",
                 " in the database!")
    }
    if (clean)
        columns <- cleanColumns(x, columns)
    if (length(columns) == 0) {
        return(NULL)
    }
    getCols <- columns
    result <- lapply(Tab, function(z) {
        if (length(getCols) > 0) {
            gotIt <- z[z %in% getCols]
            if (length(gotIt) > 0) {
                getCols <<- getCols[!(getCols %in% gotIt)]
                return(gotIt)
            } else {
                return(character())
            }
        }
    })
    ## If getCols length > 0 it contains columns not present in the db.
    result <- result[lengths(result) > 0]
    if (length(result) == 0)
        stop("None of the columns could be found in the database!")
    result <- mapply(result, names(result), FUN = function(z, y) {
        paste0(y, ".", z)
    }, SIMPLIFY = FALSE)
    return(result)
}

############################################################
## call the prefixColumns function and return just the column
## names, but in the same order than the provided columns.
prefixColumnsKeepOrder <- function(x, columns, clean = TRUE, with.tables) {
    res <- unlist(prefixColumns(x, columns, clean, with.tables),
                  use.names = FALSE)
    res_order <- sapply(columns, function(z) {
        idx <- grep(res, pattern = paste0("\\.", z, "$"))
        if (length(idx) == 0)
            return(NULL)
        return(res[idx[1]])
    })
    return(res_order[!is.null(res_order)])
}

############################################################
## ** NEW JOIN ENGINE **
##
## 1: table 1
## 2: table 2
## 3: on
## 4: suggested join
.JOINS2 <- rbind(
    c("gene", "tx", "on (gene.gene_id=tx.gene_id)", "join"),
    c("gene", "chromosome", "on (gene.seq_name=chromosome.seq_name)", "join"),
    c("tx", "tx2exon", "on (tx.tx_id=tx2exon.tx_id)", "join"),
    c("tx2exon", "exon", "on (tx2exon.exon_id=exon.exon_id)", "join"),
    c("tx", "protein", "on (tx.tx_id=protein.tx_id)", "left outer join"),
    c("gene", "entrezgene", "on (gene.gene_id=entrezgene.gene_id)",
      "left outer join"),
    c("protein", "protein_domain",
      "on (protein.protein_id=protein_domain.protein_id)", "left outer join"),
    c("protein", "uniprot", "on (protein.protein_id=uniprot.protein_id)",
      "left outer join"),
    c("uniprot", "protein_domain",
      "on (uniprot.protein_id=protein_domain.protein_id)", "left outer join")
)
## Takes the names of two tables, determines how to join them and returns the
## join query row, if found.
joinTwoTables <- function(a, b, mysql = FALSE) {
    jns <- .JOINS2
    gotIt <- which((jns[, 1] %in% a & jns[, 2] %in% b) |
                   (jns[, 2] %in% a & jns[, 1] %in% b))
    if (length(gotIt))
        jns[gotIt[1], ]
    else
        stop("Table(s) ", paste(a, collapse = ", "), " can not be joined with ",
             paste(b, collapse = ", "), "!")
}
## x: EnsDb.
## tab: tables to join.
## join: which type of join should be used?
## startWith: optional table name from which the join should start. That's
## specifically important for a left outer join call.
joinQueryOnTables2 <- function(x, tab, join = "suggested", startWith = NULL) {
    ## join can be join, left join, left outer join or suggested in which case
    ## the join defined in the .JOINS2 table will be used.
    join <- match.arg(join, c("join", "left join", "left outer join",
                              "suggested"))
    ## Order the tables.
    ## Start with startWith, or with the first one.
    if (missing(tab))
        stop("Argument 'tab' missing! Need some tables to make a join!")
    if (!is.null(startWith)) {
        if (!any(tab == startWith))
            stop("If provided, 'startWith' has to be the name of one of the",
                 " tables that should be joined!")
    }
    ## Add eventually needed tables to link the ones provided. The tables will
    ## be ordered by degree.
    tab <- addRequiredTables(x, tab)
    if (!is.null(startWith)) {
        alreadyUsed <- startWith
        tab <- tab[tab != startWith]
    } else {
        alreadyUsed <- tab[1]
        tab <- tab[-1]
    }
    Query <- alreadyUsed
    mysql <- is(dbconn(x), "MariaDBConnection")
    ## Iteratively build the query.
    while (length(tab) > 0) {
        res <- joinTwoTables(a = alreadyUsed, b = tab, mysql = mysql)
        newTab <- res[1:2][!(res[1:2] %in% alreadyUsed)]
        ## Could also use the suggested join which is in element 4.
        Query <- paste(Query, ifelse(join == "suggested", res[4], join),
                       newTab, res[3])
        alreadyUsed <- c(alreadyUsed, newTab)
        tab <- tab[tab != newTab]
    }
    return(Query)
}
## this function has to first get all tables that contain the columns,
## and then select, for columns present in more than one
## x... EnsDb
## columns... the columns
## NOTE: if "startWith" is not NULL, we're adding it to the tables!!!!
joinQueryOnColumns2 <- function(x, columns, join = "suggested",
                                startWith = NULL) {
    columns.bytable <- prefixColumns(x, columns)
    ## based on that we can build the query based on the tables we've got.
    ## Note that the function internally
    ## adds tables that might be needed for the join.
    Query <- joinQueryOnTables2(x, tab = c(names(columns.bytable), startWith),
                                join = join,
                                startWith = startWith)
    return(Query)
}


###
## Add additional tables in case the submitted tables are not directly connected
## and can thus not be joined. That's however not so complicated, since the database
## layout is pretty simple.
addRequiredTables <- function(x, tab){
    ## dash it, as long as I can't find a way to get connected objects in a
    ## graph I'll do it manually...
    ## if we have exon and any other table, we need definitely tx2exon
    if(any(tab=="exon") & length(tab) > 1){
        tab <- unique(c(tab, "tx2exon"))
    }
    ## if we have chromosome and any other table, we'll need gene
    if(any(tab=="chromosome") & length(tab) > 1){
        tab <- unique(c(tab, "gene"))
    }
    ## if we have exon and we have gene, we'll need also tx
    if((any(tab=="exon") | (any(tab=="tx2exon"))) & any(tab=="gene")){
        tab <- unique(c(tab, "tx"))
    }
    if (hasProteinData(x)) {
        ## Resolve the proteins: need tx to map between proteome and genome
        if (any(tab %in% c("uniprot", "protein_domain", "protein")) &
            any(tab %in% c("exon", "tx2exon", "gene",
                           "chromosome", "entrezgene")))
            tab <- unique(c(tab, "tx"))
        ## Need protein.
        if (any(tab %in% c("uniprot", "protein_domain")) &
            any(tab %in% c("exon", "tx2exon", "tx", "gene", "chromosome",
                           "entrezgene")))
            tab <- unique(c(tab, "protein"))
    }
    ## entrezgene is only linked via gene
    if (any(tab == "entrezgene") & length(tab) > 1)
        tab <- unique(c(tab, "gene"))
    return(tablesByDegree(x, tab))
}


############################################################
## .buildQuery
##
## The "backbone" function that builds the SQL query based on the specified
## columns, the provided filters etc.
## x an EnsDb object
## startWith: optional table from which the join should start.
.buildQuery <- function(x, columns, filter = AnnotationFilterList(),
                        order.by = "", order.type = "asc", group.by,
                        skip.order.check=FALSE, return.all.columns = TRUE,
                        join = "suggested", startWith = NULL) {
    resultcolumns <- columns    ## just to remember what we really want to give back
    ## 1) get all column names from the filters also removing the prefix.
    if (!is(filter, "AnnotationFilterList"))
        stop("parameter 'filter' has to be an 'AnnotationFilterList'!")
    if (length(filter) > 0) {
        ## check filter!
        ## add the columns needed for the filter
        filtercolumns <- unlist(lapply(filter, ensDbColumn, x))
        ## remove the prefix (column name for these)
        filtercolumns <- sapply(filtercolumns, removePrefix, USE.NAMES = FALSE)
        columns <- unique(c(columns, filtercolumns))
    }
    ## 2) get all column names for the order.by:
    if (any(order.by != "")) {
        ## if we have skip.order.check set we use the order.by as is.
        if (!skip.order.check) {
            order.by <- checkOrderBy(orderBy = order.by, supported = columns)
        }
    } else {
        order.by <- ""
    }
    ## Note: order by is now a vector!!!
    ## columns are now all columns that we want to fetch or that we need to
    ## filter or to sort.
    ## 3) check which tables we need for all of these columns:
    need.tables <- names(prefixColumns(x, columns))
    ##
    ## Now we can begin to build the query parts!
    ## a) the query part that joins all required tables.
    joinquery <- joinQueryOnColumns2(x, columns=columns, join = join,
                                     startWith = startWith)
    ## b) the filter part of the query
    if (length(filter) > 0) {
        ## USE THE ensDbQuery method here!!!
        filterquery <- paste0(" where ", ensDbQuery(filter, x,
                                                    with.tables = need.tables))
    } else {
        filterquery <- ""
    }
    ## c) the order part of the query
    if (any(order.by != "")) {
        if (!skip.order.check) {
            order.by <- paste(prefixColumnsKeepOrder(x = x, columns = order.by,
                                                     with.tables = need.tables),
                              collapse=",")
        }
        orderquery <- paste(" order by", order.by, order.type)
    } else {
        orderquery <- ""
    }
    ## And finally build the final query
    if (return.all.columns) {
        resultcolumns <- columns
    }
    paste0("select distinct ",
           paste(prefixColumnsKeepOrder(x, resultcolumns,
                                        with.tables = need.tables),
                 collapse=","),
           " from ",
           joinquery,
           filterquery,
           orderquery
           )
}


## remove the prefix again...
removePrefix <- function(x, split=".", fixed=TRUE){
    return(sapply(x, function(z){
        tmp <- unlist(strsplit(z, split=split, fixed=fixed))
        return(tmp[ length(tmp) ])
    }))
}


## just to add another layer; basically just calls buildQuery and executes the
## query
## join: what type of join should be performed.
## startWith: the name of the table from which the query should be started.
.getWhat <- function(x, columns, filter = AnnotationFilterList(), order.by = "",
                     order.type = "asc", group.by = NULL,
                     skip.order.check = FALSE, join = "suggested",
                     startWith = NULL) {
    ## That's nasty stuff; for now we support the column tx_name, which we however
    ## don't have in the database. Thus, we are querying everything except that
    ## column and filling it later with the values from tx_id.
    fetchColumns <- columns
    if(any(columns == "tx_name"))
        fetchColumns <- unique(c("tx_id",
                                 fetchColumns[fetchColumns != "tx_name"]))
    if (!is(filter, "AnnotationFilterList"))
        stop("parameter 'filter' has to be an 'AnnotationFilterList'!")
    ## Add also the global filter if present.
    global_filter <- .activeFilter(x)
    if (is(global_filter, "AnnotationFilter") |
        is(global_filter, "AnnotationFilterList"))
        filter <- AnnotationFilterList(global_filter, filter)
    ## If any filter is a SymbolFilter, add "symbol" to the return columns.
    if (length(filter) > 0) {
        if (any(.anyIs(filter, "SymbolFilter")))
            columns <- unique(c(columns, "symbol"))  ## append a filter column.
    }
    ## Catch also a "symbol" in columns
    if(any(columns == "symbol"))
        fetchColumns <- unique(c(fetchColumns[fetchColumns != "symbol"],
                                 "gene_name"))
    ## Shall we do the ordering in R or in SQL?
    if (orderResultsInR(x) & !skip.order.check) {
        ## Build the query
        Q <- .buildQuery(x = x, columns = fetchColumns, filter = filter,
                         order.by = "", order.type = order.type,
                         group.by = group.by,
                         skip.order.check = skip.order.check, join = join,
                         startWith = startWith)
        ## Get the data
        ## cat("Query: ", Q, "\n")
        Res <- dbGetQuery(dbconn(x), Q)
        ## Note: we can only order by the columns that we did get back from the
        ## database; that might be different for the SQL sorting!
        Res <- orderDataFrameBy(Res, by = checkOrderBy(order.by, fetchColumns),
                                decreasing = order.type != "asc")
    } else {
        ## Build the query
        Q <- .buildQuery(x = x, columns = fetchColumns, filter = filter,
                         order.by = order.by, order.type = order.type,
                         group.by = group.by,
                         skip.order.check = skip.order.check, join = join,
                         startWith = startWith)
        ## Get the data
        ## cat("Query: ", Q, "\n")
        Res <- dbGetQuery(dbconn(x), Q)
    }
    ## cat("Query:\n", Q, "\n")
    if(any(columns == "tx_cds_seq_start")) {
        if (!is.integer(Res[, "tx_cds_seq_start"])) {
            suppressWarnings(
                ## column contains "NULL" if not defined and coordinates are
                ## characters as.numeric transforms "NULL" into NA, and ensures
                ## coords are numeric.
                Res[ , "tx_cds_seq_start"] <- as.integer(Res[ , "tx_cds_seq_start"])
            )
        }
    }
    if(any(columns=="tx_cds_seq_end")){
        if (!is.integer(Res[, "tx_cds_seq_end"])) {
            suppressWarnings(
                ## column contains "NULL" if not defined and coordinates are
                ## characters as.numeric transforms "NULL" into NA, and ensures
                ## coords are numeric.
                Res[ , "tx_cds_seq_end" ] <- as.integer(Res[ , "tx_cds_seq_end" ])
            )
        }
    }
    ## Fix for MySQL returning 'numeric' instead of 'integer'.
    ## THIS SHOULD BE REMOVED ONCE THE PROBLEM IS FIXED IN RMySQL!!!
    int_cols <- c("exon_seq_start", "exon_seq_end", "exon_idx", "tx_seq_start",
                  "tx_seq_end", "tx_cds_seq_start", "tx_cds_seq_end",
                  "gene_seq_start", "gene_seq_end", "seq_strand")
    for (the_col in int_cols) {
        if (any(colnames(Res) == the_col))
            if (!is.integer(Res[, the_col]))
                Res[, the_col] <- as.integer(Res[, the_col])
    }
    ## Resolving the "symlinks" again.
    if(any(columns == "tx_name")) {
        Res <- data.frame(Res, tx_name = Res$tx_id, stringsAsFactors = FALSE)
    }
    if(any(columns == "symbol")) {
        Res <- data.frame(Res, symbol = Res$gene_name, stringsAsFactors = FALSE)
    }
    ## Ensure that the ordering is as requested.
    Res <- Res[, columns, drop=FALSE]
    Res
}

############################################################
## Check database validity.
#' @description Return tables with attributes based on the provided schema.
#'
#' @noRd
.ensdb_tables <- function(version = "1.0") {
    .ENSDB_TABLES <- list(`1.0` = list(
                              gene = c("gene_id", "gene_name", "entrezid",
                                       "gene_biotype", "gene_seq_start",
                                       "gene_seq_end", "seq_name", "seq_strand",
                                       "seq_coord_system"),
                              tx = c("tx_id", "tx_biotype", "tx_seq_start",
                                     "tx_seq_end", "tx_cds_seq_start",
                                     "tx_cds_seq_end", "gene_id"),
                              tx2exon = c("tx_id", "exon_id", "exon_idx"),
                              exon = c("exon_id", "exon_seq_start",
                                       "exon_seq_end"),
                              chromosome = c("seq_name", "seq_length",
                                             "is_circular"),
                              metadata = c("name", "value")),
                          `2.0` = list(
                              gene = c("gene_id", "gene_name",
                                       "gene_biotype", "gene_seq_start",
                                       "gene_seq_end", "seq_name", "seq_strand",
                                       "seq_coord_system"),
                              tx = c("tx_id", "tx_biotype", "tx_seq_start",
                                     "tx_seq_end", "tx_cds_seq_start",
                                     "tx_cds_seq_end", "gene_id"),
                              tx2exon = c("tx_id", "exon_id", "exon_idx"),
                              exon = c("exon_id", "exon_seq_start",
                                       "exon_seq_end"),
                              chromosome = c("seq_name", "seq_length",
                                             "is_circular"),
                              entrezgene = c("gene_id", "entrezid"),
                              metadata = c("name", "value"))
                          )
    .ENSDB_TABLES[[version]]
}
.ensdb_protein_tables <- function(version = "1.0") {
    .ENSDB_PROTEIN_TABLES <- list(`1.0` = list(
                                      protein = c("tx_id", "protein_id",
                                                  "protein_sequence"),
                                      uniprot = c("protein_id", "uniprot_id",
                                                  "uniprot_db",
                                                  "uniprot_mapping_type"),
                                      protein_domain = c("protein_id",
                                                         "protein_domain_id",
                                                         "protein_domain_source",
                                                         "interpro_accession",
                                                         "prot_dom_start",
                                                         "prot_dom_end")),
                                  `2.0` = list(
                                      protein = c("tx_id", "protein_id",
                                                  "protein_sequence"),
                                      uniprot = c("protein_id", "uniprot_id",
                                                  "uniprot_db",
                                                  "uniprot_mapping_type"),
                                      protein_domain = c("protein_id",
                                                         "protein_domain_id",
                                                         "protein_domain_source",
                                                         "interpro_accession",
                                                         "prot_dom_start",
                                                         "prot_dom_end"))
                                  )
    .ENSDB_PROTEIN_TABLES[[version]]
}
    
#' @description Extract the database schema version if available in the metadata
#'     database column.
#'
#' @param x Can be either a connection object or an \code{EnsDb} object.
#' 
#' @noRd
dbSchemaVersion <- function(x) {
    if (is(x, "EnsDb")) {
        return(getProperty(x, "DBSCHEMAVERSION"))
    } else {
        tabs <- dbListTables(x)
        if (any(tabs == "metadata")) {
            res <- dbGetQuery(x, "select * from metadata")
            if (any(res$name == "DBSCHEMAVERSION") &
                any(colnames(res) == "value"))
                return(res[res$name == "DBSCHEMAVERSION", "value"])
        }
    }
    return("1.0")
}

dbHasRequiredTables <- function(con, returnError = TRUE,
                                tables = .ensdb_tables(dbSchemaVersion(con))) {
    tabs <- dbListTables(con)
    if (length(tabs) == 0) {
        if (returnError)
            return("Database does not have any tables!")
        return(FALSE)
    }
    not_there <- names(tables)[!(names(tables) %in% tabs)]
    if (length(not_there) > 0) {
        if (returnError)
            return(paste0("Required tables ", paste(not_there, collapse = ", "),
                          " are not present in the database!"))
        return(FALSE)
    }
    return(TRUE)
}
dbHasValidTables <- function(con, returnError = TRUE,
                             tables = .ensdb_tables(dbSchemaVersion(con))) {
    for (tab in names(tables)) {
        cols <- tables[[tab]]
        from_db <- colnames(dbGetQuery(con, paste0("select * from ", tab,
                                                   " limit 1")))
        not_there <- cols[!(cols %in% from_db)]
        if (length(not_there) > 0) {
            if (returnError)
                return(paste0("Table ", tab, " is missing required columns ",
                              paste(not_there, collapse = ", "), "!"))
            return(FALSE)
        }
    }
    return(TRUE)
}

############################################################
## feedEnsDb2MySQL
##
##
feedEnsDb2MySQL <- function(x, mysql, verbose = TRUE) {
    if (!inherits(mysql, "MariaDBConnection"))
        stop("'mysql' is supposed to be a connection to a MySQL database.")
    ## Fetch the tables and feed them to MySQL.
    sqlite_con <- dbconn(x)
    tabs <- dbListTables(sqlite_con)
    for (the_table in tabs) {
        if (verbose)
            message("Fetch table ", the_table, " ... ", appendLF = FALSE)
        tmp <- dbGetQuery(sqlite_con, paste0("select * from ", the_table, ";"))
        if (verbose)
            message("OK\nStoring the table in MySQL ... ", appendLF = FALSE)
        ## Fix tx_cds_seq_start being a character in old databases
        if (any(colnames(tmp) == "tx_cds_seq_start")) {
            suppressWarnings(
                tmp[, "tx_cds_seq_start"] <- as.integer(tmp[, "tx_cds_seq_start"])
            )
            suppressWarnings(
                tmp[, "tx_cds_seq_end"] <- as.integer(tmp[, "tx_cds_seq_end"])
            )
        }
        dbWriteTable(mysql, tmp, name = the_table, row.names = FALSE,
                     field.types = vapply(tmp, .mysql_datatype, character(1)))
        if (verbose)
            message("OK")
    }
    ## Create the indices.
    if (verbose)
        message("Creating indices...", appendLF = FALSE)
    .createEnsDbIndices(mysql, proteins = hasProteinData(x))
    if (verbose)
        message("OK")
    return(TRUE)
}

#' Instead of using the default dbDataType function we define here a more
#' fine-grained data type definition.
#'
#' @noRd
.mysql_datatype <- function(x) {
    res <- "TEXT"
    if (any(is.na(x)) & any(!is.na(x)))
        x <- x[!is.na(x)]
    if (is.character(x)) {
        nc <- max(nchar(x))
        if (nc < 65535)
            res <- paste0("VARCHAR(", nc, ")")
        else res <- "MEDIUMTEXT"
    }
    if (is.logical(x))
        res <- "TINYINT"
    if (is.numeric(x)) {
        if (is.integer(x)) {
            res <- "INT"
            if (any(x < -2147483648 | x > 2147483647))
                res <- "BIGINT"
            if (all(x > -8388608 & x < 8388607))
                res <- "MEDIUMINT"
            if (all(x > -32768 & x < 32767))
                res <- "SMALLINT"
            if (all(x > -128 & x < 127))
                res <- "TINYINT"
        } else res <- "DOUBLE"
    }
    res
}

#' Small helper function to create all the indices.
#'
#' @param con database connection.
#'
#' @param mysql `logical(1)` indicating whether indices specific for
#'     MariaDB/MySQL databases should be created.
#'
#' @param proteins `logical(1)` whether indices for protein tables should be
#'     created too.
#'
#' @noRd
.createEnsDbIndices <- function(con, proteins = FALSE) {
    indexCols <- c(chromosome = "seq_name", gene = "gene_id", gene = "gene_name",
                   gene = "seq_name", tx = "tx_id", tx = "gene_id",
                   exon = "exon_id", tx2exon = "tx_id", tx2exon = "exon_id")
    idxType <- c(chromosome = "unique", gene = "unique", gene = "", gene = "",
                 tx = "unique", tx = "", exon = "unique", tx2exon = "",
                 tx2exon = "")
    if (as.numeric(dbSchemaVersion(con)) > 1) {
        indexCols <- c(indexCols,
                       entrezgene = "gene_id", entrezgene = "entrezid")
        idxType <- c(idxType, entrezgene = "", entrezgene = "")
    }
    if (proteins) {
        indexCols <- c(indexCols, protein = "tx_id", protein = "protein_id",
                       uniprot = "protein_id", uniprot = "uniprot_id",
                       protein_domain = "protein_domain_id",
                       protein_domain = "protein_id")
        idxType <- c(idxType, protein = "", protein = "", uniprot = "",
                     uniprot = "", protein_domain = "", protein_domain = "")
    }
    for (i in 1:length(indexCols)) {
        tabname <- names(indexCols)[i]
        colname <- indexCols[i]
        idx_tp <- idxType[i]
        tmp <- dbExecute(con, paste0("create ", idx_tp, " index ",
                                     tabname, "_", colname, "_idx on ",
                                     tabname, " (", colname, ")"))
    }
    ## Add the one on the numeric index:
    aff_rows <- dbExecute(con, paste0("create index tx2exon_exon_idx_idx on ",
                                      "tx2exon (exon_idx);"))
}

############################################################
## listEnsDbs
## list databases
#' @title List EnsDb databases in a MariaDB/MySQL server
#'
#' @description
#'
#' The \code{listEnsDbs} function lists EnsDb databases in a
#' MariaDB/MySQL server.
#'
#' @details
#'
#' The use of this function requires the \code{RMariaDB} package
#' to be installed. In addition user credentials to access a MySQL server
#' (with already installed EnsDb databases), or with write access are required.
#' For the latter EnsDb databases can be added with the \code{\link{useMySQL}}
#' method. EnsDb databases in a MariaDB/MySQL server follow the same naming
#' conventions than EnsDb packages, with the exception that the name is all
#' lower case and that each \code{"."} is replaced by \code{"_"}.
#' 
#' @param dbcon A \code{DBIConnection} object providing access to a
#'     MariaDB/MySQL database. Either \code{dbcon} or all of the other
#'     arguments have to be specified.
#' 
#' @param host Character specifying the host on which the MySQL server is
#'     running.
#' 
#' @param port The port of the MariaDB/MySQL server (usually \code{3306}).
#' 
#' @param user The username for the MariaDB/MySQL server.
#' 
#' @param pass The password for the MariaDB/MySQL server.
#' 
#' @return A \code{data.frame} listing the database names, organism name
#'     and Ensembl version of the EnsDb databases found on the server.
#' 
#' @author Johannes Rainer
#' 
#' @seealso \code{\link{useMySQL}}
#' 
#' @examples
#' \dontrun{
#' library(RMariaDB)
#' dbcon <- dbConnect(MariaDB(), host = "localhost", user = my_user, pass = my_pass)
#' listEnsDbs(dbcon)
#' }
listEnsDbs <- function(dbcon, host, port, user, pass) {
    if(requireNamespace("RMariaDB", quietly = TRUE)) {
        if (missing(dbcon)) {
            if (missing(host) | missing(user) | missing(port) | missing(host))
                stop("Arguments 'host', 'port', 'user' and 'pass' are required",
                     " if 'dbcon' is not specified.")
            dbcon <- dbConnect(RMariaDB::MariaDB(), host = host, port = port,
                               user = user, pass = pass)
        }
        dbs <- dbGetQuery(dbcon, "show databases;")
        edbs <- dbs[grep(dbs$Database, pattern = "^ensdb_"), "Database"]
        edbTable <- data.frame(matrix(ncol = 3, nrow = length(edbs)))
        colnames (edbTable) <- c("dbname", "organism", "ensembl_version")
        for (i in seq_along(edbs)) {
            edbTable[i, "dbname"] <- edbs[i]
            tmp <- unlist(strsplit(edbs[i], split = "_"))
            edbTable[i, "organism"] <- tmp[2]
            edbTable[i, "ensembl_version"] <- as.numeric(gsub(pattern = "v",
                                                              replacement = "",
                                                              tmp[3]))
        }
        return(edbTable)
    } else {
        stop("Required package 'RMariaDB' is not installed.")
    }
}

#' Simple helper that "translates" R logical operators to SQL.
#' @noRd
.logOp2SQL <- function(x) {
    if (x == "|")
        return("or")
    if (x == "&")
        return("and")
    return(NULL)
}