File: data_seek.R

package info (click to toggle)
r-cran-datawizard 1.0.1%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,300 kB
  • sloc: sh: 13; makefile: 2
file content (168 lines) | stat: -rw-r--r-- 5,745 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
#' @title Find variables by their names, variable or value labels
#' @name data_seek
#'
#' @description This functions seeks variables in a data frame, based on patterns
#' that either match the variable name (column name), variable labels, value labels
#' or factor levels. Matching variable and value labels only works for "labelled"
#' data, i.e. when the variables either have a `label` attribute or `labels`
#' attribute.
#'
#' `data_seek()` is particular useful for larger data frames with labelled
#' data - finding the correct variable name can be a challenge. This function
#' helps to find the required variables, when only certain patterns of variable
#' names or labels are known.
#'
#' @param data A data frame.
#' @param pattern Character string (regular expression) to be matched in `data`.
#' May also be a character vector of length > 1. `pattern` is searched for in
#' column names, variable label and value labels attributes, or factor levels of
#' variables in `data`.
#' @param seek Character vector, indicating where `pattern` is sought. Use one
#' or more of the following options:
#'
#' - `"names"`: Searches in column names. `"column_names"` and `"columns"` are
#'   aliases for `"names"`.
#' - `"labels"`: Searches in variable labels. Only applies when a `label` attribute
#'   is set for a variable.
#' - `"values"`: Searches in value labels or factor levels. Only applies when a
#'   `labels` attribute is set for a variable, or if a variable is a factor.
#'   `"levels"` is an alias for `"values"`.
#' - `"all"`: Searches in all of the above.
#' @param fuzzy Logical. If `TRUE`, "fuzzy matching" (partial and close distance
#' matching) will be used to find `pattern`.
#'
#' @return A data frame with three columns: the column index, the column name
#' and - if available - the variable label of all matched variables in `data`.
#'
#' @examples
#' # seek variables with "Length" in variable name or labels
#' data_seek(iris, "Length")
#'
#' # seek variables with "dependency" in names or labels
#' # column "e42dep" has a label-attribute "elder's dependency"
#' data(efc)
#' data_seek(efc, "dependency")
#'
#' # "female" only appears as value label attribute - default search is in
#' # variable names and labels only, so no match
#' data_seek(efc, "female")
#' # when we seek in all sources, we find the variable "e16sex"
#' data_seek(efc, "female", seek = "all")
#'
#' # typo, no match
#' data_seek(iris, "Lenght")
#' # typo, fuzzy match
#' data_seek(iris, "Lenght", fuzzy = TRUE)
#' @export
data_seek <- function(data, pattern, seek = c("names", "labels"), fuzzy = FALSE) {
  # check valid args
  if (!is.data.frame(data)) {
    insight::format_error("`data` must be a data frame.")
  }

  # check valid args
  seek <- intersect(seek, c("names", "labels", "values", "levels", "column_names", "columns", "all"))
  if (is.null(seek) || !length(seek)) {
    insight::format_error("`seek` must be one of \"names\", \"labels\", \"values\", a combination of these options, or \"all\".") # nolint
  }

  pos1 <- pos2 <- pos3 <- NULL

  pos <- unlist(lapply(pattern, function(search_pattern) {
    # search in variable names?
    if (any(seek %in% c("names", "columns", "column_names", "all"))) {
      pos1 <- grep(search_pattern, colnames(data))
      # find in near distance?
      if (fuzzy) {
        pos1 <- c(pos1, .fuzzy_grep(x = colnames(data), pattern = search_pattern))
      }
    }

    # search in variable labels?
    if (any(seek %in% c("labels", "all"))) {
      var_labels <- insight::compact_character(unlist(lapply(data, attr, which = "label", exact = TRUE)))
      if (!is.null(var_labels) && length(var_labels)) {
        found <- grepl(search_pattern, var_labels)
        pos2 <- match(names(var_labels)[found], colnames(data))
        # find in near distanc?
        if (fuzzy) {
          found <- .fuzzy_grep(x = var_labels, pattern = search_pattern)
          if (length(found)) {
            pos2 <- c(pos2, match(names(var_labels)[found], colnames(data)))
          }
        }
      }
    }

    # search for pattern in value labels or levels?
    if (any(seek %in% c("values", "levels", "all"))) {
      values <- insight::compact_list(lapply(data, function(i) {
        l <- attr(i, "labels", exact = TRUE)
        if (is.null(l) && is.factor(i)) {
          levels(i)
        } else {
          names(l)
        }
      }))
      if (!is.null(values) && length(values)) {
        found <- vapply(values, function(i) any(grepl(search_pattern, i)), logical(1))
        pos3 <- match(names(found)[found], colnames(data))
        # find in near distance
        if (fuzzy) {
          found <- vapply(
            values,
            function(i) {
              length(.fuzzy_grep(x = i, pattern = search_pattern)) > 0
            },
            logical(1)
          )
          if (any(found)) {
            pos3 <- c(pos3, match(names(found)[found], colnames(data)))
          }
        }
      }
    }
    c(pos1, pos2, pos3)
  }))

  # clean up
  pos <- unique(pos)

  # variable labels of matching variables
  var_labels <- vapply(
    colnames(data[pos]),
    function(i) {
      l <- attr(data[[i]], "label", exact = TRUE)
      if (is.null(l)) {
        i
      } else {
        l
      }
    },
    character(1)
  )

  out <- data.frame(
    index = pos,
    column = colnames(data)[pos],
    labels = var_labels,
    stringsAsFactors = FALSE
  )
  # no row names
  rownames(out) <- NULL

  class(out) <- c("data_seek", "data.frame")
  out
}


# methods ---------------------------------------------------------------------

#' @export
print.data_seek <- function(x, ...) {
  if (nrow(x) == 0) {
    cat("No matches found.\n")
  } else {
    cat(insight::export_table(x, ...))
  }
}