File: findCorrelation.R

package info (click to toggle)
r-cran-caret 7.0-1%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 4,036 kB
  • sloc: ansic: 210; sh: 10; makefile: 2
file content (164 lines) | stat: -rw-r--r-- 5,642 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
#' @importFrom stats complete.cases
findCorrelation_fast <- function(x, cutoff = .90, verbose = FALSE){
  if(any(!complete.cases(x)))
    stop("The correlation matrix has some missing values.")
  averageCorr <- colMeans(abs(x))
  averageCorr <- as.numeric(as.factor(averageCorr))
  x[lower.tri(x, diag = TRUE)] <- NA
  combsAboveCutoff <- which(abs(x) > cutoff)

  colsToCheck <- ceiling(combsAboveCutoff / nrow(x))
  rowsToCheck <- combsAboveCutoff %% nrow(x)

  colsToDiscard <- averageCorr[colsToCheck] > averageCorr[rowsToCheck]
  rowsToDiscard <- !colsToDiscard

  if(verbose){
    colsFlagged <- pmin(ifelse(colsToDiscard, colsToCheck, NA),
                        ifelse(rowsToDiscard, rowsToCheck, NA), na.rm = TRUE)
    values <- round(x[combsAboveCutoff], 3)
    cat('\n',paste('Combination row', rowsToCheck, 'and column', colsToCheck,
                   'is above the cut-off, value =', values,
                   '\n \t Flagging column', colsFlagged, '\n'
    ))
  }

  deletecol <- c(colsToCheck[colsToDiscard], rowsToCheck[rowsToDiscard])
  deletecol <- unique(deletecol)
  deletecol
}

findCorrelation_exact <- function(x, cutoff = 0.90, verbose = FALSE)
{
  varnum <- dim(x)[1]

  if (!isTRUE(all.equal(x, t(x)))) stop("correlation matrix is not symmetric")
  if (varnum == 1) stop("only one variable given")

  x <- abs(x)

  # re-ordered columns based on max absolute correlation
  originalOrder <- 1:varnum

  averageCorr <- function(x) mean(x, na.rm = TRUE)
  tmp <- x
  diag(tmp) <- NA

  maxAbsCorOrder <- order(apply(tmp, 2, averageCorr), decreasing = TRUE)
  x <- x[maxAbsCorOrder, maxAbsCorOrder]
  newOrder <- originalOrder[maxAbsCorOrder]
  rm(tmp)

  deletecol <- rep(FALSE, varnum)

  x2 <- x
  diag(x2) <- NA

  for (i in 1:(varnum - 1)) {
    if(!any(x2[!is.na(x2)] > cutoff)){
      if (verbose) cat("All correlations <=", cutoff, "\n")
      break()
    }
    if (deletecol[i]) next
    for (j in (i + 1):varnum) {
      if (!deletecol[i] & !deletecol[j]) {

        if (x[i, j] > cutoff) {
          mn1 <- mean(x2[i,], na.rm = TRUE)
          mn2 <- mean(x2[-j,], na.rm = TRUE)
          if(verbose) cat("Compare row", newOrder[i],
                          " and column ", newOrder[j],
                          "with corr ", round(x[i,j], 3), "\n")
          if (verbose) cat("  Means: ", round(mn1, 3), "vs", round(mn2, 3))
          if (mn1 > mn2) {
            deletecol[i] <- TRUE
            x2[i, ] <- NA
            x2[, i] <- NA
            if (verbose) cat(" so flagging column", newOrder[i], "\n")
          }
          else {
            deletecol[j] <- TRUE
            x2[j, ] <- NA
            x2[, j] <- NA
            if (verbose) cat(" so flagging column", newOrder[j], "\n")
          }
        }
      }
    }
  }
  newOrder[which(deletecol)]
}



#' Determine highly correlated variables
#'
#' This function searches through a correlation matrix and returns a vector of
#' integers corresponding to columns to remove to reduce pair-wise
#' correlations.
#'
#' The absolute values of pair-wise correlations are considered. If two
#' variables have a high correlation, the function looks at the mean absolute
#' correlation of each variable and removes the variable with the largest mean
#' absolute correlation.
#'
#' Using \code{exact = TRUE} will cause the function to re-evaluate the average
#' correlations at each step while \code{exact = FALSE} uses all the
#' correlations regardless of whether they have been eliminated or not. The
#' exact calculations will remove a smaller number of predictors but can be
#' much slower when the problem dimensions are "big".
#'
#' @param x A correlation matrix
#' @param cutoff A numeric value for the pair-wise absolute correlation cutoff
#' @param verbose A boolean for printing the details
#' @param names a logical; should the column names be returned (\code{TRUE}) or
#' the column index (\code{FALSE})?
#' @param exact a logical; should the average correlations be recomputed at
#' each step? See Details below.
#' @return A vector of indices denoting the columns to remove (when \code{names
#' = TRUE}) otherwise a vector of column names. If no correlations meet the
#' criteria, \code{integer(0)} is returned.
#' @author Original R code by Dong Li, modified by Max Kuhn
#' @seealso \code{\link{findLinearCombos}}
#' @keywords manip
#' @examples
#'
#' R1 <- structure(c(1, 0.86, 0.56, 0.32, 0.85, 0.86, 1, 0.01, 0.74, 0.32,
#'                   0.56, 0.01, 1, 0.65, 0.91, 0.32, 0.74, 0.65, 1, 0.36,
#'                   0.85, 0.32, 0.91, 0.36, 1),
#'                 .Dim = c(5L, 5L))
#' colnames(R1) <- rownames(R1) <- paste0("x", 1:ncol(R1))
#' R1
#'
#' findCorrelation(R1, cutoff = .6, exact = FALSE)
#' findCorrelation(R1, cutoff = .6, exact = TRUE)
#' findCorrelation(R1, cutoff = .6, exact = TRUE, names = FALSE)
#'
#'
#' R2 <- diag(rep(1, 5))
#' R2[2, 3] <- R2[3, 2] <- .7
#' R2[5, 3] <- R2[3, 5] <- -.7
#' R2[4, 1] <- R2[1, 4] <- -.67
#'
#' corrDF <- expand.grid(row = 1:5, col = 1:5)
#' corrDF$correlation <- as.vector(R2)
#' levelplot(correlation ~ row + col, corrDF)
#'
#' findCorrelation(R2, cutoff = .65, verbose = TRUE)
#'
#' findCorrelation(R2, cutoff = .99, verbose = TRUE)
#'
#' @export findCorrelation
findCorrelation <- function(x, cutoff = 0.90, verbose = FALSE, names = FALSE, exact = ncol(x) < 100) {
  if(names & is.null(colnames(x)))
    stop("'x' must have column names when `names = TRUE`")
  out <- if(exact)
    findCorrelation_exact(x = x, cutoff = cutoff, verbose = verbose) else
      findCorrelation_fast(x = x, cutoff = cutoff, verbose = verbose)
  out
  if(names) out <- colnames(x)[out]
  out
}