1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168
|
#' @title Find variables by their names, variable or value labels
#' @name data_seek
#'
#' @description This functions seeks variables in a data frame, based on patterns
#' that either match the variable name (column name), variable labels, value labels
#' or factor levels. Matching variable and value labels only works for "labelled"
#' data, i.e. when the variables either have a `label` attribute or `labels`
#' attribute.
#'
#' `data_seek()` is particular useful for larger data frames with labelled
#' data - finding the correct variable name can be a challenge. This function
#' helps to find the required variables, when only certain patterns of variable
#' names or labels are known.
#'
#' @param data A data frame.
#' @param pattern Character string (regular expression) to be matched in `data`.
#' May also be a character vector of length > 1. `pattern` is searched for in
#' column names, variable label and value labels attributes, or factor levels of
#' variables in `data`.
#' @param seek Character vector, indicating where `pattern` is sought. Use one
#' or more of the following options:
#'
#' - `"names"`: Searches in column names. `"column_names"` and `"columns"` are
#' aliases for `"names"`.
#' - `"labels"`: Searches in variable labels. Only applies when a `label` attribute
#' is set for a variable.
#' - `"values"`: Searches in value labels or factor levels. Only applies when a
#' `labels` attribute is set for a variable, or if a variable is a factor.
#' `"levels"` is an alias for `"values"`.
#' - `"all"`: Searches in all of the above.
#' @param fuzzy Logical. If `TRUE`, "fuzzy matching" (partial and close distance
#' matching) will be used to find `pattern`.
#'
#' @return A data frame with three columns: the column index, the column name
#' and - if available - the variable label of all matched variables in `data`.
#'
#' @examples
#' # seek variables with "Length" in variable name or labels
#' data_seek(iris, "Length")
#'
#' # seek variables with "dependency" in names or labels
#' # column "e42dep" has a label-attribute "elder's dependency"
#' data(efc)
#' data_seek(efc, "dependency")
#'
#' # "female" only appears as value label attribute - default search is in
#' # variable names and labels only, so no match
#' data_seek(efc, "female")
#' # when we seek in all sources, we find the variable "e16sex"
#' data_seek(efc, "female", seek = "all")
#'
#' # typo, no match
#' data_seek(iris, "Lenght")
#' # typo, fuzzy match
#' data_seek(iris, "Lenght", fuzzy = TRUE)
#' @export
data_seek <- function(data, pattern, seek = c("names", "labels"), fuzzy = FALSE) {
# check valid args
if (!is.data.frame(data)) {
insight::format_error("`data` must be a data frame.")
}
# check valid args
seek <- intersect(seek, c("names", "labels", "values", "levels", "column_names", "columns", "all"))
if (is.null(seek) || !length(seek)) {
insight::format_error("`seek` must be one of \"names\", \"labels\", \"values\", a combination of these options, or \"all\".") # nolint
}
pos1 <- pos2 <- pos3 <- NULL
pos <- unlist(lapply(pattern, function(search_pattern) {
# search in variable names?
if (any(seek %in% c("names", "columns", "column_names", "all"))) {
pos1 <- grep(search_pattern, colnames(data))
# find in near distance?
if (fuzzy) {
pos1 <- c(pos1, .fuzzy_grep(x = colnames(data), pattern = search_pattern))
}
}
# search in variable labels?
if (any(seek %in% c("labels", "all"))) {
var_labels <- insight::compact_character(unlist(lapply(data, attr, which = "label", exact = TRUE)))
if (!is.null(var_labels) && length(var_labels)) {
found <- grepl(search_pattern, var_labels)
pos2 <- match(names(var_labels)[found], colnames(data))
# find in near distanc?
if (fuzzy) {
found <- .fuzzy_grep(x = var_labels, pattern = search_pattern)
if (length(found)) {
pos2 <- c(pos2, match(names(var_labels)[found], colnames(data)))
}
}
}
}
# search for pattern in value labels or levels?
if (any(seek %in% c("values", "levels", "all"))) {
values <- insight::compact_list(lapply(data, function(i) {
l <- attr(i, "labels", exact = TRUE)
if (is.null(l) && is.factor(i)) {
levels(i)
} else {
names(l)
}
}))
if (!is.null(values) && length(values)) {
found <- vapply(values, function(i) any(grepl(search_pattern, i)), logical(1))
pos3 <- match(names(found)[found], colnames(data))
# find in near distance
if (fuzzy) {
found <- vapply(
values,
function(i) {
length(.fuzzy_grep(x = i, pattern = search_pattern)) > 0
},
logical(1)
)
if (any(found)) {
pos3 <- c(pos3, match(names(found)[found], colnames(data)))
}
}
}
}
c(pos1, pos2, pos3)
}))
# clean up
pos <- unique(pos)
# variable labels of matching variables
var_labels <- vapply(
colnames(data[pos]),
function(i) {
l <- attr(data[[i]], "label", exact = TRUE)
if (is.null(l)) {
i
} else {
l
}
},
character(1)
)
out <- data.frame(
index = pos,
column = colnames(data)[pos],
labels = var_labels,
stringsAsFactors = FALSE
)
# no row names
rownames(out) <- NULL
class(out) <- c("data_seek", "data.frame")
out
}
# methods ---------------------------------------------------------------------
#' @export
print.data_seek <- function(x, ...) {
if (nrow(x) == 0) {
cat("No matches found.\n")
} else {
cat(insight::export_table(x, ...))
}
}
|