1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364
|
##' Match taxonomic names to the Open Tree Taxonomy.
##'
##' Accepts one or more taxonomic names and returns information about
##' potential matches for these names to known taxa in the Open Tree
##' Taxonomy.
##'
##' This service uses taxonomic contexts to disambiguate homonyms and misspelled
##' names; a context may be specified using the \code{context_name} argument.
##' The default value for \code{context_name} is "All life". If no context is
##' specified (i.e., \code{context_name} is set to \code{NULL}), then the
##' context will be inferred (i.e., the shallowest taxonomic context that
##' contains all unambiguous names in the input). Taxonomic contexts are
##' uncontested higher taxa that have been selected to allow limits to be
##' applied to the scope of TNRS searches (e.g. 'match names only within
##' flowering plants'). Once a context has been identified (either
##' user-specified or inferred), all taxon name matches will performed only
##' against taxa within that context. For a list of available taxonomic
##' contexts, see \code{\link{tnrs_contexts}}.
##'
##' A name is considered unambiguous if it is not a synonym and has
##' only one exact match to any taxon name in the entire taxonomy.
##'
##' When the name search returns multiple matches, the taxon with the highest
##' match score is returned. If the name returned is not the one you intended,
##' you can use the \code{inspect} function to check the other taxa returned by
##' your search. The
##' \href{https://docs.ropensci.org/rotl/articles/rotl.html#how-to-change-the-ott-ids-assigned-to-my-taxa}{Getting
##' Started vignette} has more information on how to do this.
##'
##' Several functions listed in the \sQuote{See also} section can be
##' used to inspect and manipulate the object generated by this
##' function.
##'
##'
##' @title Match names to the Open Tree Taxonomy
##' @param names taxon names to be queried. Currently limited to 10,000 names
##' for exact matches and 2,500 names for approximate matches (character
##' vector)
##' @param context_name name of the taxonomic context to be searched (length-one
##' character vector or \code{NULL}). Must match (case sensitive) one of the
##' values returned by \code{\link{tnrs_contexts}}. Default to "All life".
##' @param do_approximate_matching A logical indicating whether or not to
##' perform approximate string (a.k.a. \dQuote{fuzzy}) matching. Using
##' \code{FALSE} will greatly improve speed. Default, however, is \code{TRUE}.
##' @param ids A vector of ids to use for identifying names. These will be
##' assigned to each name in the names array. If ids is provided, then ids and
##' names must be identical in length.
##' @param include_suppressed Ordinarily, some quasi-taxa, such as incertae
##' sedis buckets and other non-OTUs, are suppressed from TNRS results. If
##' this parameter is true, these quasi-taxa are allowed as possible TNRS
##' results.
##' @param ... additional arguments to customize the API request (see
##' \code{\link{rotl}} package documentation).
##' @return A data frame summarizing the results of the query. The original
##' query output is appended as an attribute to the returned object (and can
##' be obtained using \code{attr(object, "original_response")}).
##' @seealso \code{\link{inspect.match_names}},
##' \code{\link{update.match_names}}, \code{\link{synonyms.match_names}}.
##' @examples \dontrun{
##' deuterostomes <- tnrs_match_names(names=c("echinodermata", "xenacoelomorpha",
##' "chordata", "hemichordata"))
##' }
##' @importFrom stats setNames
##' @export
tnrs_match_names <- function(names = NULL, context_name = "All life",
do_approximate_matching = TRUE, ids = NULL,
include_suppressed = FALSE, ...) {
if (!is.null(context_name) &&
!context_name %in% unlist(tnrs_contexts(...))) {
stop(
"The ", sQuote("context_name"),
" is not valid. Check possible values using tnrs_contexts()"
)
}
## take care of duplicated names
if (any(duplicated(tolower(names)))) {
names <- tolower(names)
warning("Some names were duplicated: ",
paste(sQuote(names[duplicated(names)]), collapse = ", "), ".",
call. = FALSE
)
names <- unique(names)
}
res <- .tnrs_match_names(
names = names, context_name = context_name,
do_approximate_matching = do_approximate_matching,
ids = ids, include_suppressed = include_suppressed,
...
)
check_tnrs(res)
match_ids <- highest_match_score(res)
if (!identical(length(res[["results"]]), length(match_ids))) {
stop(
"The number of matches and the number of 'results' elements should",
" be the same."
)
}
summary_match <- mapply(
function(rid, mid) {
build_summary_match(
res,
res_id = rid,
match_id = mid,
initial_creation = TRUE
)
},
seq_along(res[["results"]]),
match_ids,
SIMPLIFY = FALSE
)
## add taxon names with no maches
summary_match <- do.call("rbind", summary_match)
summary_match <- as.data.frame(summary_match, stringsAsFactors = FALSE)
summary_match$search_string <- gsub("\\\\", "", summary_match$search_string)
## reorder to match original query
ordr <- match(tolower(names), tolower(summary_match$search_string))
stopifnot(identical(length(match_ids), length(ordr)))
summary_match <- summary_match[ordr, ]
match_ids <- match_ids[ordr]
summary_match[["approximate_match"]] <-
convert_to_logical(summary_match[["approximate_match"]])
summary_match[["is_synonym"]] <-
convert_to_logical(summary_match[["is_synonym"]])
summary_match[["flags"]] <- convert_to_logical(summary_match[["flags"]])
has_original_match <- !is.na(summary_match[["number_matches"]])
json_coords <- data.frame(
search_string = names,
original_order = as.numeric(rownames(summary_match)),
match_id = match_ids,
has_original_match = has_original_match,
row.names = seq_along(names),
stringsAsFactors = FALSE
)
attr(summary_match, "original_order") <- as.numeric(rownames(summary_match))
attr(summary_match, "original_response") <- res
attr(summary_match, "match_id") <- match_ids
attr(summary_match, "has_original_match") <- has_original_match
attr(summary_match, "json_coords") <- json_coords
class(summary_match) <- c("match_names", "data.frame")
rownames(summary_match) <- NULL
summary_match
}
##' @importFrom stats na.omit
convert_to_logical <- function(x) {
if (all(stats::na.omit(x) %in% c("TRUE", "FALSE"))) {
x <- as.logical(x)
} else {
x
}
}
check_tnrs <- function(req) {
no_match <- req[["unmatched_names"]]
if (any(vapply(no_match, length, integer(1)) > 0)) {
warning(
paste(unlist(no_match), collapse = ", "), " are not matched",
call. = FALSE
)
}
}
tnrs_columns <- list(
"search_string" = function(x) x[["search_string"]],
"unique_name" = function(x) .tax_unique_name(x[["taxon"]]),
"approximate_match" = function(x) x[["is_approximate_match"]],
"score" = function(x) x[["score"]],
"ott_id" = function(x) .tax_ott_id(x[["taxon"]]),
"is_synonym" = function(x) x[["is_synonym"]],
"flags" = function(x) paste(.tax_flags(x[["taxon"]]), collapse = ", ")
)
summary_row_factory <- function(res, res_id, match_id, columns = tnrs_columns) {
res_address <- res[["results"]][[res_id]][["matches"]][[match_id]]
ret <- sapply(columns, function(f) f(res_address))
n_match <- length(res[["results"]][[res_id]][["matches"]])
c(ret, number_matches = n_match)
}
build_summary_match <- function(res, res_id, match_id = NULL, initial_creation) {
if (length(res_id) > 1 &&
(!is.null(match_id) && length(match_id) > 1)) {
stop("Something is wrong. Please contact us.")
}
build_summary_row <- function(rid) {
if (is.null(match_id)) {
match_id <- seq_len(length(res[["results"]][[rid]][["matches"]]))
}
if (identical(length(match_id), 0L) ||
is.null(res[["results"]][[rid]][["matches"]][match_id][[1]])) {
return(build_empty_row(tolower(res[["results"]][[rid]][["name"]])))
}
res <- lapply(match_id, function(mid) {
summary_row_factory(res, rid, mid)
})
if (identical(length(match_id), 1L)) {
unlist(res)
} else {
res
}
}
summary_row <- lapply(res_id, build_summary_row)
if (identical(length(res_id), 1L)) {
summary_row <- unlist(summary_row, recursive = FALSE)
}
## Needed if only 1 row returned
if (!inherits(summary_row, "list")) {
summary_row <- list(summary_row)
}
summary_match <- do.call("rbind", summary_row)
summary_match <- data.frame(summary_match, stringsAsFactors = FALSE)
names(summary_match) <- c(names(tnrs_columns), "number_matches")
clean_tnrs_summary(summary_match)
}
##' @importFrom stats setNames
build_empty_row <- function(x) {
no_match_row <- stats::setNames(
rep(NA, length(tnrs_columns) + 1),
c(names(tnrs_columns), "number_matches")
)
no_match_row[1] <- x
no_match_row
}
lowest_ott_id <- function(rsp) {
vapply(seq_along(rsp[["results"]]), function(x) {
.r <- build_summary_match(
res = rsp, res_id = x, match_id = NULL,
initial_creation = TRUE
)
.r <- .r[(!as.logical(.r[["is_synonym"]])) &
!is.na(.r[["flags"]]) &
.r[["flags"]] == "", ]
if (nrow(.r) > 0) {
which.min(.r[["ott_id"]])
} else {
1L
}
}, integer(1))
}
highest_match_score <- function(rsp) {
vapply(seq_along(rsp[["results"]]), function(x) {
.r <- build_summary_match(
res = rsp, res_id = x, match_id = NULL,
initial_creation = TRUE
)
.r <- .r[(!as.logical(.r[["is_synonym"]])) &
!is.na(.r[["flags"]]) &
.r[["flags"]] == "", ]
if (nrow(.r) > 0) {
which.max(.r[["score"]])
} else {
1L
}
}, integer(1))
}
clean_tnrs_summary <- function(summary_match) {
summary_match[["approximate_match"]] <-
convert_to_logical(summary_match[["approximate_match"]])
summary_match[["score"]] <- as.numeric(summary_match[["score"]])
summary_match[["is_synonym"]] <-
convert_to_logical(summary_match[["is_synonym"]])
summary_match[["flags"]] <- convert_to_logical(summary_match[["flags"]])
summary_match[["ott_id"]] <- as.integer(summary_match[["ott_id"]])
summary_match[["number_matches"]] <-
as.integer(summary_match[["number_matches"]])
summary_match
}
##' This function returns a list of pre-defined taxonomic contexts
##' (i.e. clades) which can be used to limit the scope of tnrs
##' queries.
##'
##' Taxonomic contexts are available to limit the scope of TNRS
##' searches. These contexts correspond to uncontested higher taxa
##' such as 'Animals' or 'Land plants'. This service returns a list
##' containing all available taxonomic context names, which may be
##' used as input (via the \code{context_name} argument in other
##' functions) to limit the search scope of other services including
##' \code{\link{tnrs_match_names}}.
##' @title TNRS contexts
##' @param ... additional arguments to customize the API request (see
##' \code{\link{rotl}} package documentation).
##' @return Returns invisibly a list for each major clades (e.g.,
##' animals, microbes, plants, fungi, life) whose elements
##' contains the possible contexts.
##' @export
tnrs_contexts <- function(...) {
res <- .tnrs_contexts(...)
class(res) <- "tnrs_contexts"
res
}
##' @export
print.tnrs_contexts <- function(x, ...) {
cat("Possible contexts:\n")
lapply(x, function(t) {
res <- unlist(t)
cat(" ", res[1], "\n")
if (length(res) > 1) {
lapply(seq(2, length(res), by = 5), function(l) {
m <- ifelse(l + 5 <= length(res), l + 4, length(res))
cat(" ", paste(res[l:m], collapse = ", "), "\n")
})
}
})
}
##' Return a taxonomic context given a list of taxonomic names
##'
##' Find the least inclusive taxonomic context that includes all the
##' unambiguous names in the input set. Unambiguous names are names
##' with exact matches to non-homonym taxa. Ambiguous names (those
##' without exact matches to non-homonym taxa) are indicated in
##' results.
##'
##' @title Infer the taxonomic context from a list of names
##' @param names Vector of taxon names.
##' @param ... additional arguments to customize the API request (see
##' \code{\link{rotl}} package documentation).
##' @return A list including the context name, the context ott id and
##' possibly the names in the query that have an ambiguous
##' taxonomic meaning in the query.
##' @examples
##' \dontrun{
##' res <- tnrs_infer_context(names=c("Stellula calliope", "Struthio camelus"))
##' }
##' @export
tnrs_infer_context <- function(names = NULL, ...) {
res <- .tnrs_infer_context(names = names, ...)
return(res)
}
|