File: studies.R

package info (click to toggle)
r-cran-rotl 3.1.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 1,384 kB
  • sloc: sh: 9; makefile: 5
file content (510 lines) | stat: -rw-r--r-- 21,077 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
##' Return the list of study properties that can be used to search
##' studies and trees used in the synthetic tree.
##'
##' The list returned has 2 elements \code{tree_properties} and
##' \code{studies_properties}. Each of these elements lists additional
##' arguments to customize the API request properties that can be used
##' to search for trees and studies that are contributing to the
##' synthetic tree. The definitions of these properties are available
##' from
##' \url{https://github.com/OpenTreeOfLife/phylesystem-api/wiki/NexSON}
##'
##' @title Properties of the Studies
##' @param ...  additional arguments to customize the API request (see
##'     \code{\link{rotl}} package documentation).
##' @return A list of the study properties that can be used to find
##'     studies and trees that are contributing to the synthetic tree.
##' @seealso \code{\link{studies_find_trees}}
##' @export
##' @examples
##' \dontrun{
##'  all_the_properties <- studies_properties()
##'  unlist(all_the_properties$tree_properties)
##' }

studies_properties <- function(...) {
  res <- .studies_properties(...)
  lapply(res, unlist)
}


##' Return the identifiers of studies that match given properties
##'
##' @title Find a Study
##' @param exact Should exact matching be used? (logical, default
##'     \code{FALSE})
##' @param property The property to be searched on (character)
##' @param value The property value to be searched on (character)
##' @param detailed If \code{TRUE} (default), the function will return
##'     a data frame that summarizes information about the study (see
##'     \sQuote{Value}). Otherwise, it only returns the study
##'     identifiers.
##' @param verbose Should the output include all metadata (logical
##'     default \code{FALSE})
##' @param ...  additional arguments to customize the API request (see
##'     \code{\link{rotl}} package documentation).
##' @return If \code{detailed=TRUE}, the function returns a data frame
##'     listing the study id (\code{study_ids}), the number of trees
##'     associated with this study (\code{n_trees}), the tree ids (at
##'     most 5) associated with the studies (\code{tree_ids}), the
##'     tree id that is a candidate for the synthetic tree if any
##'     (\code{candidate}), the year of publication of the study
##'     (\code{study_year}), the title of the publication for the
##'     study (\code{title}), and the DOI (Digital Object Identifier)
##'     for the study (\code{study_doi}).
##'
##'     If \code{detailed=FALSE}, the function returns a data frame
##'     with a single column containing the study identifiers.
##' @seealso \code{\link{studies_properties}} which lists properties
##'     against which the studies can be
##'     searched. \code{\link{list_trees}} that returns a list for all
##'     tree ids associated with a study.
##' @export
##' @examples
##' \dontrun{
##' ## To match a study for which the identifier is already known
##' one_study <- studies_find_studies(property="ot:studyId", value="pg_719")
##' list_trees(one_study)
##'
##' ## To find studies pertaining to Mammals
##' mammals <- studies_find_studies(property="ot:focalCladeOTTTaxonName",
##'                                 value="mammalia")
##' ## To extract the tree identifiers for each of the studies
##' list_trees(mammals)
##' ## ... or for a given study
##' list_trees(mammals, "ot_308")
##'
##' ## Just the identifiers without other information about the studies
##' mammals <- studies_find_studies(property="ot:focalCladeOTTTaxonName",
##'                                 value="mammalia", detailed=FALSE)
##' }
studies_find_studies <- function(property = NULL, value = NULL, verbose = FALSE,
                                 exact = FALSE, detailed = TRUE, ...) {
  .res <- .studies_find_studies(
    property = property, value = value,
    verbose = verbose, exact = exact, ...
  )

  res <- vapply(
    .res[["matched_studies"]],
    function(x) x[["ot:studyId"]],
    character(1)
  )
  if (detailed) {
    dat <- summarize_meta(res)
  } else {
    meta_raw <- .res
    dat <- data.frame(study_ids = res, stringsAsFactors = FALSE)
    attr(dat, "found_trees") <- paste(
      "If you want to get a list of the",
      "trees associated with the studies,",
      "use", sQuote("detailed = TRUE")
    )
    class(dat) <- c("study_ids", class(dat))
    attr(dat, "metadata") <- meta_raw
  }
  class(dat) <- c("matched_studies", class(dat))
  dat
}

##' @export
print.study_ids <- function(x, ...) {
  print(format(x), ...)
}

##' Return a list of studies for which trees match a given set of
##' properties
##'
##' The list of possible values to be used as values for the argument
##' \code{property} can be found using the function
##' \code{\link{studies_properties}}.
##'
##' @title Find Trees
##' @param property The property to be searched on (character)
##' @param value The property-value to be searched on (character)
##' @param verbose Should the output include all metadata? (logical,
##'     default \code{FALSE})
##' @param exact Should exact matching be used for the value?
##'     (logical, default \code{FALSE})
##' @param detailed Should a detailed report be provided? If
##'     \code{TRUE} (default), the output will include metadata about
##'     the study that include trees matching the property. Otherwise,
##'     only information about the trees will be provided.
##' @param ... additional arguments to customize the API request (see
##'     \code{\link{rotl}} package documentation).
##' @return A data frame that summarizes the trees found (and their
##'     associated studies) for the requested criteria. If a study has
##'     more than 5 trees, the \code{tree_ids} of the first ones will
##'     be shown, followed by \code{...} to indicate that more are
##'     present.
##'
##'     If \code{detailed=FALSE}, the data frame will include the
##'     study ids of the study (\code{study_ids}), the number of trees
##'     in this study that match the search criteria
##'     (\code{n_matched_trees}), the tree ids that match the search
##'     criteria (\code{match_tree_ids}).
##'
##'     If \code{detailed=TRUE}, in addition of the fields listed
##'     above, the data frame will also contain the total number of
##'     trees associated with the study (\code{n_trees}), all the tree
##'     ids associated with the study (\code{tree_ids}), the tree id
##'     that is a potential candidate for inclusion in the synthetic
##'     tree (if any) (\code{candidate}), the year the study was
##'     published (\code{study_year}), the title of the study
##'     (\code{title}), the DOI for the study (\code{study_doi}).
##'
##' @seealso \code{\link{studies_properties}} which lists properties
##'   the studies can be searched on. \code{\link{list_trees}} for
##'   listing the trees that match the query.
##' @export
##' @importFrom stats setNames
##' @examples
##' \dontrun{
##' res <- studies_find_trees(property="ot:ottTaxonName", value="Drosophila",
##'                           detailed=FALSE)
##' ## summary of the trees and associated studies that match this criterion
##' res
##' ## With metadata about the studies (default)
##' res <- studies_find_trees(property="ot:ottTaxonName", value="Drosophila",
##'                           detailed=TRUE)
##' ## The list of trees for each study that match the search criteria
##' list_trees(res)
##' ## the trees for a given study
##' list_trees(res, study_id = "pg_2769")
##' }
studies_find_trees <- function(property = NULL, value = NULL, verbose = FALSE,
                               exact = FALSE, detailed = TRUE, ...) {
  .res <- .studies_find_trees(
    property = property, value = value,
    verbose = verbose, exact = exact, ...
  )
  study_ids <- vapply(
    .res[["matched_studies"]],
    function(x) x[["ot:studyId"]],
    character(1)
  )
  n_matched_trees <- vapply(
    .res[["matched_studies"]],
    function(x) length(x[["matched_trees"]]),
    numeric(1)
  )
  match_tree_ids <- lapply(
    .res[["matched_studies"]],
    function(x) {
      sapply(
        x[["matched_trees"]],
        function(y) y[["ot:treeId"]]
      )
    }
  )
  # this one doesn't return all of the treeids. confusing, bc trees are what is wanted
  # tree_str <- vapply(match_tree_ids, limit_trees, character(1))
  tree_str <- sapply(match_tree_ids, function(x) paste(x, collapse = ", "))
  res <- data.frame(study_ids, n_matched_trees,
    match_tree_ids = tree_str,
    stringsAsFactors = FALSE
  )
  if (detailed) {
    meta <- summarize_meta(study_ids)
    # the next bit seems really slow (JWB)
    res <- merge(meta, res)
    attr(res, "metadata") <- attr(meta, "metadata")
  } else {
    attr(res, "metadata") <- .res
  }
  attr(res, "found_trees") <- stats::setNames(match_tree_ids, study_ids)
  class(res) <- c("matched_studies", class(res))
  res
}



##' Returns the trees associated with a given study
##'
##' If \code{file_format} is missing, the function returns an object
##' of the class \code{phylo} from the \code{ape} package
##' (default), or an object of the class \code{nexml} from the
##' \code{RNeXML} package.
##'
##' Otherwise \code{file_format} can be either \code{newick},
##' \code{nexus}, \code{nexml} or \code{json}, and the function will
##' generate a file of the selected format. In this case, a file name
##' needs to be provided using the argument \code{file}. If a file
##' with the same name already exists, it will be silently
##' overwritten.
##'
##' @title Get all the trees associated with a particular study
##' @param study_id the study ID for the study of interest (character)
##' @param object_format the class of the object the query should
##'     return (either \code{phylo} or \code{nexml}). Ignored if
##'     \code{file_format} is specified.
##' @param file_format the format of the file to be generated
##'     (\code{newick}, \code{nexus}, \code{nexml} or \code{json}).
##' @param file the file name where the output of the function will be
##'     saved.
##' @param ...  additional arguments to customize the API request (see
##'     \code{\link{rotl}} package documentation).
##' @return if \code{file_format} is missing, an object of class
##'     \code{phylo} or \code{nexml}, otherwise a logical indicating
##'     whether the file was successfully created.
##' @seealso \code{\link{get_study_meta}}
##' @export
##' @importFrom jsonlite toJSON
##' @examples
##' \dontrun{
##' that_one_study <- get_study(study_id="pg_719", object_format="phylo")
##' if (require(RNeXML)) { ## if RNeXML is installed get the object directly
##'    nexml_study <- get_study(study_id="pg_719", object_format="nexml")
##' } else { ## otherwise write it to a file
##'    get_study(study_id="pg_719", file_format="nexml", file=tempfile(fileext=".nexml"))
##' }
##' }
get_study <- function(study_id = NULL, object_format = c("phylo", "nexml"),
                      file_format, file, ...) {
  object_format <- match.arg(object_format)
  if (!missing(file)) {
    if (!missing(file_format)) {
      file_format <- match.arg(file_format, c("newick", "nexus", "nexml", "json"))
      res <- .get_study(study_id, format = file_format)
      unlink(file)
      if (identical(file_format, "json")) {
        cat(jsonlite::toJSON(res), file = file)
      } else {
        cat(res, file = file)
      }
      return(invisible(file.exists(file)))
    } else {
      stop(sQuote("file_format"), " must be specified.")
    }
  } else if (identical(object_format, "phylo")) {
    file_format <- "newick"
    res <- .get_study(study_id = study_id, format = file_format, ...)
    res <- phylo_from_otl(res)
  } else if (identical(object_format, "nexml")) {
    file_format <- "nexml"
    res <- .get_study(study_id = study_id, format = file_format, ...)
    res <- nexml_from_otl(res)
  } else {
    stop("Something is very wrong. Contact us.")
  }
  res
}

##' Returns a specific tree from within a study
##'
##' @title Study Tree
##' @param study_id the identifier of a study (character)
##' @param tree_id the identifier of a tree within the study
##' @param object_format the class of the object to be returned
##'     (default and currently only possible value \code{phylo} from
##'     the \code{ape} package).
##' @param tip_label the format of the tip
##'     labels. \dQuote{\code{original_label}} (default) returns the
##'     original labels as provided in the study,
##'     \dQuote{\code{ott_id}} labels are replaced by their ott IDs,
##'     \dQuote{\code{ott_taxon_name}} labels are replaced by their
##'     Open Tree Taxonomy taxon name.
##' @param file_format the format of the file to be generated
##'     (\code{newick} default, \code{nexus}, or \code{json}).
##' @param file the file name where the output of the function will be
##'     saved.
##' @param deduplicate logical (default \code{TRUE}). If the tree
##' returned by the study contains duplicated taxon names, should they
##' be made unique? It is normally illegal for NEXUS/Newick tree
##' strings to contain duplicated tip names. This is a workaround to
##' circumvent this requirement. If \code{TRUE}, duplicated tip labels
##' will be appended \code{_1}, \code{_2}, etc.
##' @param ...  additional arguments to customize the API request (see
##'     \code{\link{rotl}} package documentation).
##' @return if \code{file_format} is missing, an object of class
##'     \code{phylo}, otherwise a logical indicating whether the file
##'     was successfully created.
##' @export
##' @importFrom jsonlite toJSON
##' @examples
##' \dontrun{
##'  tree <- get_study_tree(study_id="pg_1144", tree_id="tree2324")
##'
##'  ## comparison of the first few tip labels depending on the options used
##'  head(get_study_tree(study_id="pg_1144", tree_id="tree2324", tip_label="original_label")$tip.label)
##'  head(get_study_tree(study_id="pg_1144", tree_id="tree2324", tip_label="ott_id")$tip.label)
##'  head(get_study_tree(study_id="pg_1144", tree_id="tree2324", tip_label="ott_taxon_name")$tip.label)
##' }

get_study_tree <- function(study_id = NULL, tree_id = NULL, object_format = c("phylo"),
                           tip_label = c("original_label", "ott_id", "ott_taxon_name"),
                           file_format, file, deduplicate = TRUE, ...) {
  object_format <- match.arg(object_format)
  tip_label <- match.arg(tip_label)
  tip_label <- switch(tip_label,
    original_labels = "ot:originallabel",
    ott_id = "ot:ottid",
    ott_taxon_name = "ot:otttaxonname"
  )
  if (!missing(file)) {
    if (!missing(file_format)) {
      file_format <- match.arg(file_format, c("newick", "nexus", "json"))
      if (missing(file)) stop("You must specify a file to write your output")
      res <- .get_study_tree(
        study_id = study_id, tree_id = tree_id,
        format = file_format, tip_label = tip_label, ...
      )
      unlink(file)
      if (identical(file_format, "json")) {
        cat(jsonlite::toJSON(res), file = file)
      } else {
        cat(res, file = file)
      }
      return(invisible(file.exists(file)))
    } else {
      stop(sQuote("file_format"), " must be specified.")
    }
  } else if (identical(object_format, "phylo")) {
    file_format <- "newick"
    res <- .get_study_tree(
      study_id = study_id, tree_id = tree_id,
      format = file_format, tip_label = tip_label, ...
    )
    res <- phylo_from_otl(res, dedup = deduplicate)
  } else {
    stop("Something is very wrong. Contact us.")
  }
  res
}

##' Retrieve metadata about a study in the Open Tree of Life datastore.
##'
##' \code{get_study_meta} returns a long list of attributes for the
##' studies that are contributing to the synthetic tree. To help with
##' the extraction of relevant information from this list, several
##' helper functions exists: \itemize{
##'
##'   \item {get_tree_ids} { The identifiers of the trees
##'   associated with the study }
##'
##'   \item {get_publication} { The citation information of the
##'   publication for the study. The DOI (or URL) for the study is
##'   available as an attribute to the returned object (i.e.,
##'   \code{attr(object, "DOI")} ) }.
##'
##'   \item {candidate_for_synth} { The identifier of the tree(s) from
##'   the study used in the synthetic tree. This is a subset of the
##'   result of \code{get_tree_ids}.
##'
##'   \item {get_study_year} { The year of publication of the study. }
##'
##'   }
##' }
##'
##' @title Study Metadata
##' @param study_id the study identifier (character)
##' @param ...  additional arguments to customize the API request (see
##'     \code{\link{rotl}} package documentation).
##' @param sm an object created by \code{get_study_meta}
##' @return named-list containing the metadata associated with the
##'     study requested
##' @export
##' @examples
##' \dontrun{
##' req <- get_study_meta("pg_719")
##' get_tree_ids(req)
##' candidate_for_synth(req)
##' get_publication(req)
##' get_study_year(req)
##' }
get_study_meta <- function(study_id, ...) {
  res <- .get_study_meta(study_id = study_id, ...)
  class(res) <- "study_meta"
  attr(res, "study_id") <- study_id
  res
}

##' @export
print.study_meta <- function(x, ...) {
  cat("Metadata for OToL study ", attr(x, "study_id"), ". Contents:\n", sep = "")
  cat(paste0("  $nexml$", names(x$nexml)), sep = "\n")
}

##' Retrieve subtree from a specific tree in the Open Tree of Life data store
##'
##' @title Study Subtree
##' @param study_id the study identifier (character)
##' @param tree_id the tree identifier (character)
##' @param object_format the class of the object returned by the
##'     function (default, and currently only possibility \code{phylo}
##'     from the \code{ape} package)
##' @param tip_label the format of the tip
##'     labels. \dQuote{\code{original_label}} (default) returns the
##'     original labels as provided in the study,
##'     \dQuote{\code{ott_id}} labels are replaced by their ott IDs,
##'     \dQuote{\code{ott_taxon_name}} labels are replaced by their
##'     Open Tree Taxonomy taxon name.
##' @param file_format character, the file format to use to save the
##'     results of the query (possible values, \sQuote{newick} or
##'     \sQuote{nexus}).
##' @param file character, the path and file name where the output
##'     should be written.
##' @param deduplicate logical (default \code{TRUE}). If the tree
##'     returned by the study contains duplicated taxon names, should
##'     they be made unique? It is normally illegal for NEXUS/Newick
##'     tree strings to contain duplicated tip names. This is a
##'     workaround to circumvent this requirement. If \code{TRUE},
##'     duplicated tip labels will be appended \code{_1}, \code{_2},
##'     etc.
##' @param subtree_id, either a node id that specifies a subtree or
##'     \dQuote{ingroup} which returns the ingroup for this subtree.
##' @param ...  additional arguments to customize the API request (see
##'     \code{\link{rotl}} package documentation).
##' @export
##' @examples
##' \dontrun{
##' small_tr <- get_study_subtree(study_id="pg_1144", tree_id="tree5800", subtree_id="node991044")
##' ingroup  <- get_study_subtree(study_id="pg_1144", tree_id="tree5800", subtree_id="ingroup")
##' nexus_file <- tempfile(fileext=".nex")
##' get_study_subtree(study_id="pg_1144", tree_id="tree5800", subtree_id="ingroup", file=nexus_file,
##'                   file_format="nexus")
##' }
get_study_subtree <- function(study_id, tree_id, subtree_id, object_format = c("phylo"),
                              tip_label = c("original_label", "ott_id", "ott_taxon_name"),
                              file_format, file, deduplicate = TRUE, ...) {
  object_format <- match.arg(object_format)
  tip_label <- match.arg(tip_label)
  tip_label <- switch(tip_label,
    original_labels = "ot:originallabel",
    ott_id = "ot:ottid",
    ott_taxon_name = "ot:otttaxonname"
  )
  if (!missing(file)) {
    if (!missing(file_format)) {
      if (missing(file)) stop("You must specify a file to write your output")
      file_format <- match.arg(file_format, c("newick", "nexus"))
      res <- .get_study_subtree(
        study_id = study_id, tree_id = tree_id,
        subtree_id = subtree_id, format = file_format,
        tip_label = tip_label, ...
      )
      unlink(file)
      cat(res, file = file)
      return(invisible(file.exists(file)))
    } else {
      stop(sQuote("file_format"), " must be specified.")
    }
  } else if (identical(object_format, "phylo")) {
    file_format <- "newick"
    res <- .get_study_subtree(
      study_id = study_id, tree_id = tree_id,
      subtree_id = subtree_id, format = file_format,
      tip_label = tip_label, ...
    )
    res <- phylo_from_otl(res, dedup = deduplicate)
    ## NeXML should be possible for both object_format and file_format but it seems there
    ## is something wrong with the server at this time (FM - 2015-06-07)
    ## } else if (identical(object_format, "nexml")) {
    ##    file_format <- "nexml"
    ##    res <- .get_study_subtree(study_id, tree_id, subtree_id, format=file_format)
    ##    res <- nexml_from_otl(res)
  } else {
    stop("Something is very wrong. Contact us.")
  }
  res
}