File: tnrs.R

package info (click to toggle)
r-cran-rotl 3.1.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 1,384 kB
  • sloc: sh: 9; makefile: 5
file content (364 lines) | stat: -rw-r--r-- 13,127 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
##' Match taxonomic names to the Open Tree Taxonomy.
##'
##' Accepts one or more taxonomic names and returns information about
##' potential matches for these names to known taxa in the Open Tree
##' Taxonomy.
##'
##' This service uses taxonomic contexts to disambiguate homonyms and misspelled
##' names; a context may be specified using the \code{context_name} argument.
##' The default value for \code{context_name} is "All life". If no context is
##' specified (i.e., \code{context_name} is set to \code{NULL}), then the
##' context will be inferred (i.e., the shallowest taxonomic context that
##' contains all unambiguous names in the input). Taxonomic contexts are
##' uncontested higher taxa that have been selected to allow limits to be
##' applied to the scope of TNRS searches (e.g. 'match names only within
##' flowering plants'). Once a context has been identified (either
##' user-specified or inferred), all taxon name matches will performed only
##' against taxa within that context. For a list of available taxonomic
##' contexts, see \code{\link{tnrs_contexts}}.
##'
##' A name is considered unambiguous if it is not a synonym and has
##' only one exact match to any taxon name in the entire taxonomy.
##'
##' When the name search returns multiple matches, the taxon with the highest
##' match score is returned. If the name returned is not the one you intended,
##' you can use the \code{inspect} function to check the other taxa returned by
##' your search. The
##' \href{https://docs.ropensci.org/rotl/articles/rotl.html#how-to-change-the-ott-ids-assigned-to-my-taxa}{Getting
##' Started vignette} has more information on how to do this.
##'
##' Several functions listed in the \sQuote{See also} section can be
##' used to inspect and manipulate the object generated by this
##' function.
##'
##'
##' @title Match names to the Open Tree Taxonomy
##' @param names taxon names to be queried. Currently limited to 10,000 names
##'   for exact matches and 2,500 names for approximate matches (character
##'   vector)
##' @param context_name name of the taxonomic context to be searched (length-one
##'   character vector or \code{NULL}). Must match (case sensitive) one of the
##'   values returned by \code{\link{tnrs_contexts}}. Default to "All life".
##' @param do_approximate_matching A logical indicating whether or not to
##'   perform approximate string (a.k.a. \dQuote{fuzzy}) matching. Using
##'   \code{FALSE} will greatly improve speed. Default, however, is \code{TRUE}.
##' @param ids A vector of ids to use for identifying names. These will be
##'   assigned to each name in the names array. If ids is provided, then ids and
##'   names must be identical in length.
##' @param include_suppressed Ordinarily, some quasi-taxa, such as incertae
##'   sedis buckets and other non-OTUs, are suppressed from TNRS results. If
##'   this parameter is true, these quasi-taxa are allowed as possible TNRS
##'   results.
##' @param ... additional arguments to customize the API request (see
##'   \code{\link{rotl}} package documentation).
##' @return A data frame summarizing the results of the query. The original
##'   query output is appended as an attribute to the returned object (and can
##'   be obtained using \code{attr(object, "original_response")}).
##' @seealso \code{\link{inspect.match_names}},
##'   \code{\link{update.match_names}}, \code{\link{synonyms.match_names}}.
##' @examples \dontrun{
##'  deuterostomes <- tnrs_match_names(names=c("echinodermata", "xenacoelomorpha",
##'                                             "chordata", "hemichordata"))
##' }
##' @importFrom stats setNames
##' @export
tnrs_match_names <- function(names = NULL, context_name = "All life",
                             do_approximate_matching = TRUE, ids = NULL,
                             include_suppressed = FALSE, ...) {
  if (!is.null(context_name) &&
    !context_name %in% unlist(tnrs_contexts(...))) {
    stop(
      "The ", sQuote("context_name"),
      " is not valid. Check possible values using tnrs_contexts()"
    )
  }

  ## take care of duplicated names
  if (any(duplicated(tolower(names)))) {
    names <- tolower(names)
    warning("Some names were duplicated: ",
      paste(sQuote(names[duplicated(names)]), collapse = ", "), ".",
      call. = FALSE
    )
    names <- unique(names)
  }

  res <- .tnrs_match_names(
    names = names, context_name = context_name,
    do_approximate_matching = do_approximate_matching,
    ids = ids, include_suppressed = include_suppressed,
    ...
  )

  check_tnrs(res)

  match_ids <- highest_match_score(res)

  if (!identical(length(res[["results"]]), length(match_ids))) {
    stop(
      "The number of matches and the number of 'results' elements should",
      " be the same."
    )
  }

  summary_match <- mapply(
    function(rid, mid) {
      build_summary_match(
        res,
        res_id = rid,
        match_id = mid,
        initial_creation = TRUE
      )
    },
    seq_along(res[["results"]]),
    match_ids,
    SIMPLIFY = FALSE
  )

  ## add taxon names with no maches
  summary_match <- do.call("rbind", summary_match)
  summary_match <- as.data.frame(summary_match, stringsAsFactors = FALSE)

  summary_match$search_string <- gsub("\\\\", "", summary_match$search_string)

  ## reorder to match original query
  ordr <- match(tolower(names), tolower(summary_match$search_string))
  stopifnot(identical(length(match_ids), length(ordr)))

  summary_match <- summary_match[ordr, ]
  match_ids <- match_ids[ordr]

  summary_match[["approximate_match"]] <-
    convert_to_logical(summary_match[["approximate_match"]])
  summary_match[["is_synonym"]] <-
    convert_to_logical(summary_match[["is_synonym"]])
  summary_match[["flags"]] <- convert_to_logical(summary_match[["flags"]])

  has_original_match <- !is.na(summary_match[["number_matches"]])

  json_coords <- data.frame(
    search_string = names,
    original_order = as.numeric(rownames(summary_match)),
    match_id = match_ids,
    has_original_match = has_original_match,
    row.names = seq_along(names),
    stringsAsFactors = FALSE
  )

  attr(summary_match, "original_order") <- as.numeric(rownames(summary_match))
  attr(summary_match, "original_response") <- res
  attr(summary_match, "match_id") <- match_ids
  attr(summary_match, "has_original_match") <- has_original_match
  attr(summary_match, "json_coords") <- json_coords

  class(summary_match) <- c("match_names", "data.frame")
  rownames(summary_match) <- NULL
  summary_match
}

##' @importFrom stats na.omit
convert_to_logical <- function(x) {
  if (all(stats::na.omit(x) %in% c("TRUE", "FALSE"))) {
    x <- as.logical(x)
  } else {
    x
  }
}

check_tnrs <- function(req) {
  no_match <- req[["unmatched_names"]]

  if (any(vapply(no_match, length, integer(1)) > 0)) {
    warning(
      paste(unlist(no_match), collapse = ", "), " are not matched",
      call. = FALSE
    )
  }
}


tnrs_columns <- list(
  "search_string" = function(x) x[["search_string"]],
  "unique_name" = function(x) .tax_unique_name(x[["taxon"]]),
  "approximate_match" = function(x) x[["is_approximate_match"]],
  "score" = function(x) x[["score"]],
  "ott_id" = function(x) .tax_ott_id(x[["taxon"]]),
  "is_synonym" = function(x) x[["is_synonym"]],
  "flags" = function(x) paste(.tax_flags(x[["taxon"]]), collapse = ", ")
)

summary_row_factory <- function(res, res_id, match_id, columns = tnrs_columns) {
  res_address <- res[["results"]][[res_id]][["matches"]][[match_id]]
  ret <- sapply(columns, function(f) f(res_address))
  n_match <- length(res[["results"]][[res_id]][["matches"]])
  c(ret, number_matches = n_match)
}

build_summary_match <- function(res, res_id, match_id = NULL, initial_creation) {
  if (length(res_id) > 1 &&
    (!is.null(match_id) && length(match_id) > 1)) {
    stop("Something is wrong. Please contact us.")
  }

  build_summary_row <- function(rid) {
    if (is.null(match_id)) {
      match_id <- seq_len(length(res[["results"]][[rid]][["matches"]]))
    }
    if (identical(length(match_id), 0L) ||
      is.null(res[["results"]][[rid]][["matches"]][match_id][[1]])) {
      return(build_empty_row(tolower(res[["results"]][[rid]][["name"]])))
    }
    res <- lapply(match_id, function(mid) {
      summary_row_factory(res, rid, mid)
    })
    if (identical(length(match_id), 1L)) {
      unlist(res)
    } else {
      res
    }
  }

  summary_row <- lapply(res_id, build_summary_row)

  if (identical(length(res_id), 1L)) {
    summary_row <- unlist(summary_row, recursive = FALSE)
  }

  ## Needed if only 1 row returned
  if (!inherits(summary_row, "list")) {
    summary_row <- list(summary_row)
  }

  summary_match <- do.call("rbind", summary_row)
  summary_match <- data.frame(summary_match, stringsAsFactors = FALSE)
  names(summary_match) <- c(names(tnrs_columns), "number_matches")
  clean_tnrs_summary(summary_match)
}

##' @importFrom stats setNames
build_empty_row <- function(x) {
  no_match_row <- stats::setNames(
    rep(NA, length(tnrs_columns) + 1),
    c(names(tnrs_columns), "number_matches")
  )
  no_match_row[1] <- x
  no_match_row
}

lowest_ott_id <- function(rsp) {
  vapply(seq_along(rsp[["results"]]), function(x) {
    .r <- build_summary_match(
      res = rsp, res_id = x, match_id = NULL,
      initial_creation = TRUE
    )

    .r <- .r[(!as.logical(.r[["is_synonym"]])) &
      !is.na(.r[["flags"]]) &
      .r[["flags"]] == "", ]

    if (nrow(.r) > 0) {
      which.min(.r[["ott_id"]])
    } else {
      1L
    }
  }, integer(1))
}

highest_match_score <- function(rsp) {
  vapply(seq_along(rsp[["results"]]), function(x) {
    .r <- build_summary_match(
      res = rsp, res_id = x, match_id = NULL,
      initial_creation = TRUE
    )

    .r <- .r[(!as.logical(.r[["is_synonym"]])) &
               !is.na(.r[["flags"]]) &
               .r[["flags"]] == "", ]

    if (nrow(.r) > 0) {
      which.max(.r[["score"]])
    } else {
      1L
    }
  }, integer(1))
}



clean_tnrs_summary <- function(summary_match) {
  summary_match[["approximate_match"]] <-
    convert_to_logical(summary_match[["approximate_match"]])
  summary_match[["score"]] <- as.numeric(summary_match[["score"]])
  summary_match[["is_synonym"]] <-
    convert_to_logical(summary_match[["is_synonym"]])
  summary_match[["flags"]] <- convert_to_logical(summary_match[["flags"]])
  summary_match[["ott_id"]] <- as.integer(summary_match[["ott_id"]])
  summary_match[["number_matches"]] <-
    as.integer(summary_match[["number_matches"]])
  summary_match
}

##' This function returns a list of pre-defined taxonomic contexts
##' (i.e. clades) which can be used to limit the scope of tnrs
##' queries.
##'
##' Taxonomic contexts are available to limit the scope of TNRS
##' searches. These contexts correspond to uncontested higher taxa
##' such as 'Animals' or 'Land plants'. This service returns a list
##' containing all available taxonomic context names, which may be
##' used as input (via the \code{context_name} argument in other
##' functions) to limit the search scope of other services including
##' \code{\link{tnrs_match_names}}.
##' @title TNRS contexts
##' @param ...  additional arguments to customize the API request (see
##'     \code{\link{rotl}} package documentation).
##' @return Returns invisibly a list for each major clades (e.g.,
##'     animals, microbes, plants, fungi, life) whose elements
##'     contains the possible contexts.
##' @export

tnrs_contexts <- function(...) {
  res <- .tnrs_contexts(...)
  class(res) <- "tnrs_contexts"
  res
}

##' @export
print.tnrs_contexts <- function(x, ...) {
  cat("Possible contexts:\n")
  lapply(x, function(t) {
    res <- unlist(t)
    cat("  ", res[1], "\n")
    if (length(res) > 1) {
      lapply(seq(2, length(res), by = 5), function(l) {
        m <- ifelse(l + 5 <= length(res), l + 4, length(res))
        cat("     ", paste(res[l:m], collapse = ", "), "\n")
      })
    }
  })
}

##' Return a taxonomic context given a list of taxonomic names
##'
##' Find the least inclusive taxonomic context that includes all the
##' unambiguous names in the input set. Unambiguous names are names
##' with exact matches to non-homonym taxa. Ambiguous names (those
##' without exact matches to non-homonym taxa) are indicated in
##' results.
##'
##' @title Infer the taxonomic context from a list of names
##' @param names Vector of taxon names.
##' @param ...  additional arguments to customize the API request (see
##'     \code{\link{rotl}} package documentation).
##' @return A list including the context name, the context ott id and
##'     possibly the names in the query that have an ambiguous
##'     taxonomic meaning in the query.
##' @examples
##' \dontrun{
                                ##' res <- tnrs_infer_context(names=c("Stellula calliope", "Struthio camelus"))
                                ##' }
                                ##' @export
                                tnrs_infer_context <- function(names = NULL, ...) {
                                  res <- .tnrs_infer_context(names = names, ...)
                                  return(res)
                                }