File: sql-utils.R

package info (click to toggle)
r-bioc-annotationhub 3.14.0%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: sid, trixie
  • size: 592 kB
  • sloc: makefile: 2
file content (304 lines) | stat: -rw-r--r-- 10,552 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
## This function filters the local annotationhub.sqlite metadata db and
## defines the subset exposed by AnnotationHub().
.uid0 <- function(path, date, localHub)
{
    conn <- .db_open(path)
    on.exit(.db_close(conn))

    ## General filter:
    ## All AnnotationHub resources (except OrgDbs, see below) are
    ## available from the time they are added -> infinity unless
    ## they are removed from the web or by author request. The
    ## snapshot date can be changed by the user. We want to return records
    ## with no rdatadateremoved and with rdatadateadded <= snapshot.
    ## All OrgDbs are omitted in the first filter and selectively
    ## exposed in the second filter.
    ##   NOTE: biocversions filter distinguishes between release and devel;
    ##   this is not caught by rdatadate added filter because the timestamp
    ##   is updated with each modification and currently someone using
    ##   an old version of Bioconductor will still get the current db
    ##   which will have a timestamp > the date when the old version of
    ##   Bioconductor was valid.
    ##   NOTE: The 'date' variable is the snapshotDate().

    # Ran into an issue comparing BiocVersion once we hit 3.10
    # 3.10 got truncated to 3.1 and missed values
    bioc_value <- .db_query(conn,
                            "SELECT DISTINCT biocversion FROM biocversions")[[1]]
    indx <- package_version(bioc_value) <= BiocManager::version()
    if (sum(indx) != 0) bioc_value <- bioc_value[indx]
    bioc_value <- paste(paste0('"', bioc_value, '"'), collapse=",")

    query1 <- sprintf(
        'SELECT resources.id
         FROM resources, rdatapaths, biocversions
         WHERE resources.rdatadateadded <= "%s"
         AND biocversions.biocversion IN (%s)
         AND resources.rdatadateremoved IS NULL
         AND rdatapaths.rdataclass != "OrgDb"
         AND biocversions.resource_id == resources.id
         AND rdatapaths.resource_id == resources.id',
         date, bioc_value)
    biocIds1 <- .db_query(conn, query1)[[1]]

    ## Add a query to get resources that have been removed
    ## But were present during a given release
    ## There is a chance that if the data was removed
    ## completely from external location that these
    ## ids won't work
    query3 <- sprintf(
        'SELECT resources.id
         FROM resources, rdatapaths, biocversions
         WHERE resources.rdatadateadded <= "%s"
         AND biocversions.biocversion IN (%s)
         AND resources.rdatadateremoved > "%s"
         AND rdatapaths.rdataclass != "OrgDb"
         AND biocversions.resource_id == resources.id
         AND rdatapaths.resource_id == resources.id',
         date, bioc_value, date)
    biocIds3 <- .db_query(conn, query3)[[1]]
    ## OrgDb sqlite files:
    ##
    ## OrgDbs are the single resource designed to expire at the end of a
    ## release cycle. The sqlite files are built before a release, added to the
    ## devel branch then propagate to the new release branch. For the
    ## duration of a release cycle both release and devel share the same
    ## OrgDb packages. Before the next release, new files are built, added
    ## to devel, propagated to release and so on.
    ##
    ## When new sqlite files are added to the hub they are stamped
    ## with the devel version which immediately becomes the new release version.
    ## For this reason, the devel code loads OrgDbs with the release version
    ## e.g.,
    ##   ifelse(isDevel, biocversion - 0.1, biocversion)
    ##
    ## NOTE: Because OrgDbs are valid for a full devel cycle they are
    ##       not filtered by snapshotDate(); the OrgDbs are valid for all

    query2 <- sprintf(
        'SELECT resources.id
            FROM resources, biocversions, rdatapaths
            WHERE biocversions.biocversion == "%s"
            AND rdatapaths.rdataclass == "OrgDb"
            AND resources.rdatadateremoved IS NULL
            AND biocversions.resource_id == resources.id
            AND rdatapaths.resource_id == resources.id',
        BiocManager::version())
    biocIds2 <- .db_query(conn, query2)[[1]]


    ## make unique and sort
    allIds = sort(unique(c(biocIds1, biocIds2, biocIds3)))
    ## match id to ah_id
    query <- paste0('SELECT ah_id FROM resources ',
                    'WHERE id IN (', paste0(allIds, collapse=","), ')',
                    'ORDER BY id')
    names(allIds) <- .db_query(conn, query)[[1]]
    allIds
}


.resource_table <- function(x)
{
    query <- sprintf(
        'SELECT %s FROM resources
         WHERE resources.id IN (%s)',
        .DB_RESOURCE_FIELDS, .id_as_single_string(x))
    tbl <- .query_as_data.frame(x, query)
    tbl[["tags"]] <- I(.collapse_as_list(x, .tags))
    tbl[["rdataclass"]] <- .collapse_as_string(x, .rdataclass)
    tbl[["rdatapath"]] <- .collapse_as_string(x, .rdatapath)
    tbl[["sourceurl"]] <- .collapse_as_string(x, .sourceurl)
    tbl[["sourcetype"]] <- .collapse_as_string(x, .sourcetype)
    tbl
}

## Used in mcols()
.DB_RESOURCE_FIELDS <- paste(sep=".", collapse=", ", "resources",
    c("ah_id", "title", "dataprovider", "species", "taxonomyid", "genome",
      "description", "coordinate_1_based", "maintainer",
      "rdatadateadded", "preparerclass"))

.id_as_single_string <- function(x)
    paste(sprintf("'%s'", .db_uid(x)), collapse=", ")

.query_as_data.frame <- function(x, query)
{
    tbl <- .db_query(dbfile(x), query)
    ridx <- match(names(x), tbl$ah_id)
    cidx <- match("ah_id", names(tbl))
    rownames(tbl) <- tbl$ah_id
    tbl[ridx, -cidx, drop=FALSE]
}

## Helper to collapse many to one fields (like above) into one space
.collapse_as_string <- function(x, FUN)
{
    uid <- .db_uid(x)
    tbl <- FUN(x)
    lst <- vapply(split(tbl[[1]], tbl[["id"]]), paste0,
                  character(1), collapse=", ")
    lst <- lst[match(uid, names(lst))]
    setNames(lst, names(uid))           # allows for x with no tags
}

.collapse_as_list <- function(x, FUN)
{
    uid <- .db_uid(x)
    tbl <- FUN(x)
    lst <- split(tbl[[1]], tbl$id)
    lst <- lst[match(uid, names(lst))]
    setNames(lst, names(uid))           # allows for x with no tags
}

## helper to retrieve tags
.tags <- function(x) {
    query <- sprintf(
        'SELECT DISTINCT tag, resource_id AS id FROM tags
         WHERE resource_id IN (%s)',
        .id_as_single_string(x))
    .db_query(dbfile(x), query)
}

## helper for extracting rdataclass
.rdataclass <- function(x) {
    query <- sprintf(
        'SELECT DISTINCT rdataclass, resource_id AS id FROM rdatapaths
         WHERE resource_id IN (%s)',
        .id_as_single_string(x))
    .db_query(dbfile(x), query)
}

## helper for extracting rdatapath
.rdatapath <- function(x) {
    query <- sprintf(
        'SELECT DISTINCT rdatapath, resource_id AS id FROM rdatapaths
         WHERE resource_id IN (%s)',
        .id_as_single_string(x))
    .db_query(dbfile(x), query)
}

## helper for extracting sourceUrls
.sourceurl <- function(x) {
    query <- sprintf(
        'SELECT DISTINCT sourceurl, resource_id AS id FROM input_sources
         WHERE resource_id IN (%s)',
        .id_as_single_string(x))
    .db_query(dbfile(x), query)
}

##  helper for extracting sourcetype
.sourcetype <- function(x) {
    query <- sprintf(
        'SELECT DISTINCT sourcetype, resource_id AS id FROM input_sources
         WHERE resource_id IN (%s)',
        .id_as_single_string(x))
    .db_query(dbfile(x), query)
}

.sourcesize <- function(x) {
    query <- sprintf(
        'SELECT DISTINCT sourcesize, resource_id AS id FROM input_sources
         WHERE resource_id IN (%s)',
        .id_as_single_string(x))
    .db_query(dbfile(x), query)
}

.sourcelastmodifieddate <- function(x) {
    query <- sprintf(
        'SELECT DISTINCT sourcelastmodifieddate, resource_id AS id
         FROM input_sources
         WHERE resource_id IN (%s)',
        .id_as_single_string(x))
    .db_query(dbfile(x), query)
}

.dataclass <- function(x)
{
    query <- sprintf(
        'SELECT DISTINCT r.ah_id AS ah_id, rdp.dispatchclass
         FROM rdatapaths AS rdp, resources AS r WHERE
         r.id = rdp.resource_id
         AND rdp.resource_id IN (%s)',
        .id_as_single_string(x))
    .query_as_data.frame(x, query)[[1]]
}

.title_data.frame <-
    function(x)
{
    query <- sprintf(
        "SELECT ah_id, title FROM resources
         WHERE resources.id IN (%s)",
        .id_as_single_string(x))
    .query_as_data.frame(x, query)
}


.resource_columns <- function()
    strsplit(gsub("resources.", "", .DB_RESOURCE_FIELDS), ", ")[[1]]

.resource_column <- function(x, name)
{
    valid <- .resource_columns()
    if (!name %in% valid) {
        msg <- sprintf("%s is not a resource data column", sQuote(name))
        stop(msg)
    }
    query <- sprintf(
        'SELECT ah_id, %s FROM resources WHERE id IN (%s)',
        name, .id_as_single_string(x))
    .query_as_data.frame(x, query)[[1]]
}

.count_resources <-
    function(x, column, limit=10)
{
    query <- sprintf(
        "SELECT %s FROM resources
         WHERE resources.id IN (%s)
         GROUP BY %s ORDER BY COUNT(%s) DESC LIMIT %d",
        column, .id_as_single_string(x), column, column, limit)
    .db_query(dbfile(x), query)[[column]]
}

.count_join_resources <-
    function(x, table, column, limit=10)
{
    query <- sprintf(
        "SELECT %s FROM resources, %s
         WHERE resources.id IN (%s) AND %s.resource_id == resources.id
         GROUP BY %s ORDER BY COUNT(%s) DESC LIMIT %d",
        column, table,
        .id_as_single_string(x), table,
        column, column, limit)
    .db_query(dbfile(x), query)[[column]]
}


.datapathIds <- function(x)
{
    query <- sprintf(
        'SELECT DISTINCT resources.ah_id, rdatapaths.id
         FROM resources, rdatapaths
         WHERE resources.id IN (%s)
         AND resources.id == rdatapaths.resource_id',
        .id_as_single_string(x))
    result <- .db_query(dbfile(x), query)
    setNames(result[[2]], result[[1]])
}

.IdsInfo <- function(x)
{
    query <-
        'SELECT DISTINCT resources.ah_id, rdatapaths.id, resources.title, rdatapaths.rdataclass, statuses.status, biocversions.biocversion, resources.rdatadateadded, resources.rdatadateremoved
         FROM resources, rdatapaths, statuses, biocversions
         WHERE resources.id == rdatapaths.resource_id
         AND resources.status_id == statuses.id
         AND biocversions.resource_id == resources.id'
    mat <- .db_query(dbfile(x), query)
    nms <- names(mat)
    nms[which(nms == "id")] = "fetch_id"
    names(mat) <- nms
    mat
}