File: reduce_parameters.R

package info (click to toggle)
r-cran-parameters 0.24.2-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 3,852 kB
  • sloc: sh: 16; makefile: 2
file content (212 lines) | stat: -rw-r--r-- 8,132 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
#' Dimensionality reduction (DR) / Features Reduction
#'
#' This function performs a reduction in the parameter space (the number of
#' variables). It starts by creating a new set of variables, based on the given
#' method (the default method is "PCA", but other are available via the
#' `method` argument, such as "cMDS", "DRR" or "ICA"). Then, it names this
#' new dimensions using the original variables that correlates the most with it.
#' For instance, a variable named `'V1_0.97/V4_-0.88'` means that the V1 and the
#' V4 variables correlate maximally (with respective coefficients of .97 and
#' -.88) with this dimension. Although this function can be useful in
#' exploratory data analysis, it's best to perform the dimension reduction step
#' in a separate and dedicated stage, as this is a very important process in the
#' data analysis workflow. `reduce_data()` is an alias for
#' `reduce_parameters.data.frame()`.
#'
#' @inheritParams principal_components
#' @param method The feature reduction method. Can be one of `"PCA"`, `"cMDS"`,
#'   `"DRR"`, `"ICA"` (see the 'Details' section).
#' @param distance The distance measure to be used. Only applies when
#'   `method = "cMDS"`. This must be one of `"euclidean"`, `"maximum"`,
#'   `"manhattan"`, `"canberra"`, `"binary"` or `"minkowski"`. Any unambiguous
#'   substring can be given.
#'
#' @details
#' The different methods available are described below:
#'
#' ## Supervised Methods
#' - **PCA**: See [`principal_components()`].
#'
#' - **cMDS / PCoA**: Classical Multidimensional Scaling (cMDS) takes a
#'   set of dissimilarities (i.e., a distance matrix) and returns a set of points
#'   such that the distances between the points are approximately equal to the
#'   dissimilarities.
#'
#' - **DRR**: Dimensionality Reduction via Regression (DRR) is a very
#'   recent technique extending PCA (*Laparra et al., 2015*). Starting from a
#'   rotated PCA, it predicts redundant information from the remaining components
#'   using non-linear regression. Some of the most notable advantages of
#'   performing DRR are avoidance of multicollinearity between predictors and
#'   overfitting mitigation. DRR tends to perform well when the first principal
#'   component is enough to explain most of the variation in the predictors.
#'   Requires the **DRR** package to be installed.
#'
#' - **ICA**: Performs an Independent Component Analysis using the
#'   FastICA algorithm. Contrary to PCA, which attempts to find uncorrelated
#'   sources (through least squares minimization), ICA attempts to find
#'   independent sources, i.e., the source space that maximizes the
#'   "non-gaussianity" of all sources. Contrary to PCA, ICA does not rank each
#'   source, which makes it a poor tool for dimensionality reduction. Requires the
#'   **fastICA** package to be installed.
#'
#' See also [package vignette](https://easystats.github.io/parameters/articles/parameters_reduction.html).
#'
#' @references
#' - Nguyen, L. H., and Holmes, S. (2019). Ten quick tips for effective
#'  dimensionality reduction. PLOS Computational Biology, 15(6).
#'
#' - Laparra, V., Malo, J., and Camps-Valls, G. (2015). Dimensionality
#'  reduction via regression in hyperspectral imagery. IEEE Journal of Selected
#'  Topics in Signal Processing, 9(6), 1026-1036.
#'
#' @examples
#' data(iris)
#' model <- lm(Sepal.Width ~ Species * Sepal.Length + Petal.Width, data = iris)
#' model
#' reduce_parameters(model)
#'
#' out <- reduce_data(iris, method = "PCA", n = "max")
#' head(out)
#' @export
reduce_parameters <- function(x, method = "PCA", n = "max", distance = "euclidean", ...) {
  UseMethod("reduce_parameters")
}


#' @rdname reduce_parameters
#' @export
reduce_data <- function(x, method = "PCA", n = "max", distance = "euclidean", ...) {
  if (!is.data.frame(x)) {
    insight::format_error("Only works on data frames.")
  }
  reduce_parameters(x, method = method, n = n, distance = distance, ...)
}


#' @export
reduce_parameters.data.frame <- function(x, method = "PCA", n = "max", distance = "euclidean", ...) {
  x <- datawizard::to_numeric(x, dummy_factors = TRUE)

  # N factors
  if (n == "max") {
    nfac <- ncol(x) - 1
  } else {
    nfac <- .get_n_factors(x, n = n, type = "PCA", rotation = "none")
  }

  # compute new features
  if (tolower(method) %in% c("pca", "principal")) {
    features <- principal_components(x, n = nfac, ...)
    features <- as.data.frame(attributes(features)$scores)
  } else if (tolower(method) %in% c("cmds", "pcoa")) {
    features <- .cmds(x, n = nfac, distance = distance, ...)
  } else if (tolower(method) == "drr") {
    features <- .drr(x, n = nfac, ...)
  } else if (tolower(method) == "ica") {
    features <- .ica(x, n = nfac, ...)
  } else {
    insight::format_error("`method` must be one of \"PCA\", \"cMDS\", \"DRR\" or \"ICA\".")
  }

  # Get weights / pseudo-loadings (correlations)
  cormat <- as.data.frame(stats::cor(x = x, y = features))
  cormat <- cbind(data.frame(Variable = row.names(cormat)), cormat)
  pca_weights <- as.data.frame(.sort_loadings(cormat, cols = 2:ncol(cormat)))

  if (n == "max") {
    pca_weights <- .filter_loadings(pca_weights, threshold = "max", 2:ncol(pca_weights))
    non_empty <- vapply(pca_weights[2:ncol(pca_weights)], function(x) !all(is.na(x)), TRUE)
    pca_weights <- pca_weights[c(TRUE, non_empty)]
    features <- features[, non_empty]
    pca_weights[is.na(pca_weights)] <- 0
    pca_weights <- .filter_loadings(.sort_loadings(pca_weights, cols = 2:ncol(pca_weights)), threshold = "max", 2:ncol(pca_weights))
  }

  # Create varnames
  varnames <- vapply(pca_weights[2:ncol(pca_weights)], function(x) {
    name <- pca_weights$Variable[!is.na(x)]
    weight <- insight::format_value(x[!is.na(x)])
    paste0(paste(name, weight, sep = "_"), collapse = "/")
  }, character(1))
  names(features) <- as.character(varnames)

  # Attributes
  attr(features, "loadings") <- pca_weights
  class(features) <- c("parameters_reduction", class(features))

  # Out
  features
}


#' @export
reduce_parameters.lm <- function(x, method = "PCA", n = "max", distance = "euclidean", ...) {
  model_data <- reduce_parameters(
    datawizard::to_numeric(insight::get_predictors(x, ...), ..., dummy_factors = TRUE),
    method = method,
    n = n,
    distance = distance
  )

  y <- data.frame(.row = seq_along(insight::get_response(x)))
  y[insight::find_response(x)] <- insight::get_response(x)
  y$.row <- NULL

  new_formula <- paste(insight::find_response(x), "~", paste(paste0("`", names(model_data), "`"), collapse = " + "))
  stats::update(x, formula = new_formula, data = cbind(model_data, y))
}

#' @export
reduce_parameters.merMod <- reduce_parameters.lm


#' @export
principal_components.lm <- function(x, ...) {
  reduce_parameters(x, method = "PCA", ...)
}

#' @export
principal_components.merMod <- principal_components.lm


#' @keywords internal
.cmds <- function(x, n = "all", distance = "euclidean", ...) {
  n <- .get_n_factors(x, n = n, type = "PCA", rotation = "none")

  d <- stats::dist(x, method = distance)
  cmd <- stats::cmdscale(d, k = n, eig = TRUE)

  features <- as.data.frame(cmd$points)
  names(features) <- paste0("CMDS", seq_len(ncol(features)))
  features
}


#' @keywords internal
.drr <- function(x, n = "all", ...) {
  n <- .get_n_factors(x, n = n, type = "PCA", rotation = "none")

  insight::check_if_installed("DRR")

  junk <- utils::capture.output(suppressMessages({
    rez <- DRR::drr(x, n)
  }))

  features <- as.data.frame(rez$fitted.data)
  names(features) <- paste0("DRR", seq_len(ncol(features)))
  features
}


#' @keywords internal
.ica <- function(x, n = "all", ...) {
  n <- .get_n_factors(x, n = n, type = "PCA", rotation = "none")

  insight::check_if_installed("fastICA")

  rez <- fastICA::fastICA(x, n.comp = ncol(x) - 1)

  features <- as.data.frame(rez$S)
  names(features) <- paste0("ICA", seq_len(ncol(features)))
  features
}