1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304
|
## This function filters the local annotationhub.sqlite metadata db and
## defines the subset exposed by AnnotationHub().
.uid0 <- function(path, date, localHub)
{
conn <- .db_open(path)
on.exit(.db_close(conn))
## General filter:
## All AnnotationHub resources (except OrgDbs, see below) are
## available from the time they are added -> infinity unless
## they are removed from the web or by author request. The
## snapshot date can be changed by the user. We want to return records
## with no rdatadateremoved and with rdatadateadded <= snapshot.
## All OrgDbs are omitted in the first filter and selectively
## exposed in the second filter.
## NOTE: biocversions filter distinguishes between release and devel;
## this is not caught by rdatadate added filter because the timestamp
## is updated with each modification and currently someone using
## an old version of Bioconductor will still get the current db
## which will have a timestamp > the date when the old version of
## Bioconductor was valid.
## NOTE: The 'date' variable is the snapshotDate().
# Ran into an issue comparing BiocVersion once we hit 3.10
# 3.10 got truncated to 3.1 and missed values
bioc_value <- .db_query(conn,
"SELECT DISTINCT biocversion FROM biocversions")[[1]]
indx <- package_version(bioc_value) <= BiocManager::version()
if (sum(indx) != 0) bioc_value <- bioc_value[indx]
bioc_value <- paste(paste0('"', bioc_value, '"'), collapse=",")
query1 <- sprintf(
'SELECT resources.id
FROM resources, rdatapaths, biocversions
WHERE resources.rdatadateadded <= "%s"
AND biocversions.biocversion IN (%s)
AND resources.rdatadateremoved IS NULL
AND rdatapaths.rdataclass != "OrgDb"
AND biocversions.resource_id == resources.id
AND rdatapaths.resource_id == resources.id',
date, bioc_value)
biocIds1 <- .db_query(conn, query1)[[1]]
## Add a query to get resources that have been removed
## But were present during a given release
## There is a chance that if the data was removed
## completely from external location that these
## ids won't work
query3 <- sprintf(
'SELECT resources.id
FROM resources, rdatapaths, biocversions
WHERE resources.rdatadateadded <= "%s"
AND biocversions.biocversion IN (%s)
AND resources.rdatadateremoved > "%s"
AND rdatapaths.rdataclass != "OrgDb"
AND biocversions.resource_id == resources.id
AND rdatapaths.resource_id == resources.id',
date, bioc_value, date)
biocIds3 <- .db_query(conn, query3)[[1]]
## OrgDb sqlite files:
##
## OrgDbs are the single resource designed to expire at the end of a
## release cycle. The sqlite files are built before a release, added to the
## devel branch then propagate to the new release branch. For the
## duration of a release cycle both release and devel share the same
## OrgDb packages. Before the next release, new files are built, added
## to devel, propagated to release and so on.
##
## When new sqlite files are added to the hub they are stamped
## with the devel version which immediately becomes the new release version.
## For this reason, the devel code loads OrgDbs with the release version
## e.g.,
## ifelse(isDevel, biocversion - 0.1, biocversion)
##
## NOTE: Because OrgDbs are valid for a full devel cycle they are
## not filtered by snapshotDate(); the OrgDbs are valid for all
query2 <- sprintf(
'SELECT resources.id
FROM resources, biocversions, rdatapaths
WHERE biocversions.biocversion == "%s"
AND rdatapaths.rdataclass == "OrgDb"
AND resources.rdatadateremoved IS NULL
AND biocversions.resource_id == resources.id
AND rdatapaths.resource_id == resources.id',
BiocManager::version())
biocIds2 <- .db_query(conn, query2)[[1]]
## make unique and sort
allIds = sort(unique(c(biocIds1, biocIds2, biocIds3)))
## match id to ah_id
query <- paste0('SELECT ah_id FROM resources ',
'WHERE id IN (', paste0(allIds, collapse=","), ')',
'ORDER BY id')
names(allIds) <- .db_query(conn, query)[[1]]
allIds
}
.resource_table <- function(x)
{
query <- sprintf(
'SELECT %s FROM resources
WHERE resources.id IN (%s)',
.DB_RESOURCE_FIELDS, .id_as_single_string(x))
tbl <- .query_as_data.frame(x, query)
tbl[["tags"]] <- I(.collapse_as_list(x, .tags))
tbl[["rdataclass"]] <- .collapse_as_string(x, .rdataclass)
tbl[["rdatapath"]] <- .collapse_as_string(x, .rdatapath)
tbl[["sourceurl"]] <- .collapse_as_string(x, .sourceurl)
tbl[["sourcetype"]] <- .collapse_as_string(x, .sourcetype)
tbl
}
## Used in mcols()
.DB_RESOURCE_FIELDS <- paste(sep=".", collapse=", ", "resources",
c("ah_id", "title", "dataprovider", "species", "taxonomyid", "genome",
"description", "coordinate_1_based", "maintainer",
"rdatadateadded", "preparerclass"))
.id_as_single_string <- function(x)
paste(sprintf("'%s'", .db_uid(x)), collapse=", ")
.query_as_data.frame <- function(x, query)
{
tbl <- .db_query(dbfile(x), query)
ridx <- match(names(x), tbl$ah_id)
cidx <- match("ah_id", names(tbl))
rownames(tbl) <- tbl$ah_id
tbl[ridx, -cidx, drop=FALSE]
}
## Helper to collapse many to one fields (like above) into one space
.collapse_as_string <- function(x, FUN)
{
uid <- .db_uid(x)
tbl <- FUN(x)
lst <- vapply(split(tbl[[1]], tbl[["id"]]), paste0,
character(1), collapse=", ")
lst <- lst[match(uid, names(lst))]
setNames(lst, names(uid)) # allows for x with no tags
}
.collapse_as_list <- function(x, FUN)
{
uid <- .db_uid(x)
tbl <- FUN(x)
lst <- split(tbl[[1]], tbl$id)
lst <- lst[match(uid, names(lst))]
setNames(lst, names(uid)) # allows for x with no tags
}
## helper to retrieve tags
.tags <- function(x) {
query <- sprintf(
'SELECT DISTINCT tag, resource_id AS id FROM tags
WHERE resource_id IN (%s)',
.id_as_single_string(x))
.db_query(dbfile(x), query)
}
## helper for extracting rdataclass
.rdataclass <- function(x) {
query <- sprintf(
'SELECT DISTINCT rdataclass, resource_id AS id FROM rdatapaths
WHERE resource_id IN (%s)',
.id_as_single_string(x))
.db_query(dbfile(x), query)
}
## helper for extracting rdatapath
.rdatapath <- function(x) {
query <- sprintf(
'SELECT DISTINCT rdatapath, resource_id AS id FROM rdatapaths
WHERE resource_id IN (%s)',
.id_as_single_string(x))
.db_query(dbfile(x), query)
}
## helper for extracting sourceUrls
.sourceurl <- function(x) {
query <- sprintf(
'SELECT DISTINCT sourceurl, resource_id AS id FROM input_sources
WHERE resource_id IN (%s)',
.id_as_single_string(x))
.db_query(dbfile(x), query)
}
## helper for extracting sourcetype
.sourcetype <- function(x) {
query <- sprintf(
'SELECT DISTINCT sourcetype, resource_id AS id FROM input_sources
WHERE resource_id IN (%s)',
.id_as_single_string(x))
.db_query(dbfile(x), query)
}
.sourcesize <- function(x) {
query <- sprintf(
'SELECT DISTINCT sourcesize, resource_id AS id FROM input_sources
WHERE resource_id IN (%s)',
.id_as_single_string(x))
.db_query(dbfile(x), query)
}
.sourcelastmodifieddate <- function(x) {
query <- sprintf(
'SELECT DISTINCT sourcelastmodifieddate, resource_id AS id
FROM input_sources
WHERE resource_id IN (%s)',
.id_as_single_string(x))
.db_query(dbfile(x), query)
}
.dataclass <- function(x)
{
query <- sprintf(
'SELECT DISTINCT r.ah_id AS ah_id, rdp.dispatchclass
FROM rdatapaths AS rdp, resources AS r WHERE
r.id = rdp.resource_id
AND rdp.resource_id IN (%s)',
.id_as_single_string(x))
.query_as_data.frame(x, query)[[1]]
}
.title_data.frame <-
function(x)
{
query <- sprintf(
"SELECT ah_id, title FROM resources
WHERE resources.id IN (%s)",
.id_as_single_string(x))
.query_as_data.frame(x, query)
}
.resource_columns <- function()
strsplit(gsub("resources.", "", .DB_RESOURCE_FIELDS), ", ")[[1]]
.resource_column <- function(x, name)
{
valid <- .resource_columns()
if (!name %in% valid) {
msg <- sprintf("%s is not a resource data column", sQuote(name))
stop(msg)
}
query <- sprintf(
'SELECT ah_id, %s FROM resources WHERE id IN (%s)',
name, .id_as_single_string(x))
.query_as_data.frame(x, query)[[1]]
}
.count_resources <-
function(x, column, limit=10)
{
query <- sprintf(
"SELECT %s FROM resources
WHERE resources.id IN (%s)
GROUP BY %s ORDER BY COUNT(%s) DESC LIMIT %d",
column, .id_as_single_string(x), column, column, limit)
.db_query(dbfile(x), query)[[column]]
}
.count_join_resources <-
function(x, table, column, limit=10)
{
query <- sprintf(
"SELECT %s FROM resources, %s
WHERE resources.id IN (%s) AND %s.resource_id == resources.id
GROUP BY %s ORDER BY COUNT(%s) DESC LIMIT %d",
column, table,
.id_as_single_string(x), table,
column, column, limit)
.db_query(dbfile(x), query)[[column]]
}
.datapathIds <- function(x)
{
query <- sprintf(
'SELECT DISTINCT resources.ah_id, rdatapaths.id
FROM resources, rdatapaths
WHERE resources.id IN (%s)
AND resources.id == rdatapaths.resource_id',
.id_as_single_string(x))
result <- .db_query(dbfile(x), query)
setNames(result[[2]], result[[1]])
}
.IdsInfo <- function(x)
{
query <-
'SELECT DISTINCT resources.ah_id, rdatapaths.id, resources.title, rdatapaths.rdataclass, statuses.status, biocversions.biocversion, resources.rdatadateadded, resources.rdatadateremoved
FROM resources, rdatapaths, statuses, biocversions
WHERE resources.id == rdatapaths.resource_id
AND resources.status_id == statuses.id
AND biocversions.resource_id == resources.id'
mat <- .db_query(dbfile(x), query)
nms <- names(mat)
nms[which(nms == "id")] = "fetch_id"
names(mat) <- nms
mat
}
|