File: helper_formula.R

package info (click to toggle)
r-cran-projpred 2.0.2%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 740 kB
  • sloc: cpp: 355; sh: 14; makefile: 2
file content (111 lines) | stat: -rw-r--r-- 3,436 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#' Utilities to handle formulas for the external user
#' @name helper_formula
NULL

#' Sometimes there can be terms in a formula that refer to a matrix instead of a
#' single predictor. Because we can handle search_terms of predictors, this
#' function breaks the matrix term into individual predictors to handle
#' separately, as that is probably the intention of the user.
#' @param formula A formula for a valid model.
#' @param data The original data frame with a matrix as predictor.
#' @return a  list containing the expanded formula and the expanded data frame.
#' @export
break_up_matrix_term <- function(formula, data) {
  tt <- terms(formula)
  response <- attr(tt, "response")
  ## when converting the variables to a list the first element is
  ## "list" itself, so we remove it
  variables_list <- as.list(attr(tt, "variables")[-1])

  ## if there is a response, take it out from the variables as
  ## it is located at the first position after "list"
  if (response) {
    variables_list <- variables_list[-1]
  }

  term_labels <- attr(tt, "term.labels")

  mm <- model.matrix(formula, data)
  assign <- attr(mm, "assign")

  new_data <- data
  for (assignee in unique(assign)) {
    if (assignee == 0) { ## intercept
      next
    }
    appearances <- assign[assign == assignee]
    if (length(appearances) > 1) {
      ## check if special term
      current <- term_labels[assignee]
      int <- grepl(":", current)
      mulilevel <- grepl("\\|", current)
      special <- grepl("[a-z]+\\(([a-z]+)\\)", current)
      individual <- !mulilevel & !int

      linear <- individual & !special
      linear_int <- int & !special

      if (linear) {
        ## if linear we can split it
        split_term <- split_linear_term(current, data)

        formula <- update(formula, paste0(
          ". ~ . - ", current, " + ",
          paste(split_term, collapse = " + ")
        ))
        split_matrix <- mm[, assign == assignee]

        new_data_tmp <- as.data.frame(new_data[, colnames(new_data) != current])
        colnames(new_data_tmp) <-
          colnames(new_data)[colnames(new_data) != current]
        new_data <- cbind(new_data_tmp, split_matrix)
      }

      if (linear_int) {
        ## we can also flatten linear interactions
        vars <- strsplit(current, ":")
        split_terms <- lapply(unlist(vars), function(v) {
          split_linear_term(v, data)
        })

        combined_terms <- c()
        for (v1 in split_terms[[1]]) {
          for (v2 in split_terms[[2]]) {
            combined_terms <- c(combined_terms, paste0(v1, ":", v2))
          }
        }

        formula <- update(formula, paste0(
          ". ~ . - ", current,
          " + ",
          paste(combined_terms, collapse = " + ")
        ))
        ## no need to update the data because the interaction terms
        ## do not appear as features
      }
    }
  }

  tryCatch(model.matrix(formula, data = new_data),
    error = function(e) print(e)
  )
  list(formula = formula, data = new_data)
}

## Splits a linear term into individual predictors.
## @param term A matrix term.
## @param data The original data frame.
## @return a list of the expanded linear matrix term.
split_linear_term <- function(term, data) {
  appearances <- ncol(data[, term])

  if (appearances > 1) {
    split_term <- sapply(
      1:appearances,
      function(i) paste0(term, i)
    )
  } else {
    split_term <- term
  }
  split_term
}