File: search.R

Package: r-cran-projpred 2.0.2+dfsg-1
search_forward <- function(p_ref, refmodel, family, intercept, nterms_max,
                           verbose = TRUE, opt, search_terms = NULL,
                           increasing_order = TRUE) {
  ## initialize the forward selection
  ## projfun projects the reference model's draws onto a given set of terms
  ## and returns the resulting KL divergence and fitted submodel
  projfun <- .get_proj_handle(refmodel, p_ref, family, opt$regul, intercept)

  formula <- refmodel$formula
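  ## decile checkpoints (10%, 20%, ..., 100% of nterms_max) at which the
  ## verbose progress messages below are printed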
  iq <- ceiling(quantile(seq_len(nterms_max), 1:10 / 10))
  if (is.null(search_terms)) {
    allterms <- split_formula(formula, data = refmodel$fetch_data())
  } else {
    allterms <- search_terms
  }

  chosen <- NULL
  total_terms <- count_terms_in_formula(formula)
  stop_search <- min(total_terms, nterms_max)
  submodels <- c()

  for (size in seq_len(stop_search)) {
    cands <- select_possible_terms_size(chosen, allterms, size = size)
    full_cands <- lapply(cands, function(cand) c(chosen, cand))
    sub <- sapply(full_cands, projfun)

    ## select the candidate with the smallest KL divergence
    imin <- which.min(sapply(seq_len(NCOL(sub)), function(i) {
      min(unlist(sub["kl", i]))
    }))
    chosen <- c(chosen, cands[imin])

    ## append submodels
    submodels <- c(submodels, sub["sub_fit", imin])

    if (verbose && length(chosen) %in% iq) {
      print(paste0(names(iq)[max(which(length(chosen) == iq))],
                   " of terms selected."))
    }
  }

  ## reduce chosen to a list of non-redundant accumulated models
  return(list(solution_terms = setdiff(reduce_models(chosen), "1"),
              sub_fits = submodels))
}
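
## Usage sketch (illustrative only, not run): `p_ref`, `refmodel`, `family`,
## and `opt` are assumed to have been prepared by varsel()'s internals, with
## `opt` carrying at least a regularization value, e.g. opt = list(regul = 1e-4).
##   fwd <- search_forward(p_ref, refmodel, family, intercept = TRUE,
##                         nterms_max = 5L, opt = list(regul = 1e-4))
##   fwd$solution_terms  # terms in the order they were added
##   fwd$sub_fits        # projected submodel fit for each selected size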

# copied over from search until we resolve the TODO below
search_L1_surrogate <- function(p_ref, d_train, family, intercept, nterms_max,
                                penalty, opt) {

  ## predictive mean and variance of the reference model (with parameters
  ## integrated out)
  mu <- p_ref$mu
  v <- p_ref$var

  if (NCOL(mu) > 1 || NCOL(v) > 1) {
    stop("Internal error: search_L1 received multiple draws. ",
         "Please report to the developers.")
  }

  ## L1-penalized projection (projection path).
  ## (Note: pmax = nterms_max + 1 is used so that the computation is carried
  ## all the way down to the least regularization even for model size
  ## nterms_max.)
  search <- glm_elnet(d_train$x, mu, family,
    lambda_min_ratio = opt$lambda_min_ratio, nlambda = opt$nlambda,
    pmax = nterms_max + 1, pmax_strict = FALSE, offset = d_train$offset,
    weights = d_train$weights, intercept = intercept, obsvar = v,
    penalty = penalty, thresh = opt$thresh)

  ## sort the variables according to the order in which they enter the model in
  ## the L1-path
  entering_indices <- apply(search$beta != 0, 1, function(num)
    which(num)[1]) # NA for variables that never enter
  ## variables that entered at some point
  entered_variables <-
    c(seq_len(NCOL(d_train$x)))[!is.na(entering_indices)]
  ## variables that did not enter at any point
  notentered_variables <-
    c(seq_len(NCOL(d_train$x)))[is.na(entering_indices)]
  order_of_entered <- sort(entering_indices, index.return = TRUE)$ix
  order <- c(entered_variables[order_of_entered], notentered_variables)
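  ## Toy illustration of the entry-order bookkeeping above (comments only;
  ## `beta` is a made-up 3 x 4 coefficient path, rows = variables, columns =
  ## lambda steps from strongest to weakest regularization):
  ##   beta <- rbind(c(0, 0.2, 0.3, 0.4),   # variable 1 enters at step 2
  ##                 c(0, 0,   0,   0.1),   # variable 2 enters at step 4
  ##                 c(0, 0,   0.5, 0.6))   # variable 3 enters at step 3
  ##   apply(beta != 0, 1, function(num) which(num)[1])  # gives 2, 4, 3
  ## so the resulting order of entry is variable 1, then 3, then 2.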

  ## fetch the coefficients at the points along the search path where a new
  ## variable enters
  nvar <- length(order)
  n <- nrow(p_ref$mu)
  out <- list(
    alpha = rep(NA, nterms_max + 1),
    beta = matrix(0, nrow = nterms_max, ncol = nterms_max + 1),
    lambda = rep(NA, nterms_max + 1),
    w = matrix(NA, nrow = n, ncol = nterms_max + 1)
  )
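  ## entry k + 1 of alpha/lambda (and column k + 1 of beta/w) corresponds to
  ## the submodel with k terms; k = 0 is the intercept-only model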
  for (k in 0:nterms_max) {
    if (k == 0) {
      out$alpha[1] <- search$beta0[1]
      out$lambda[1] <- search$lambda[1]
      out$w[, 1] <- search$w[, 1]
    } else {
      ## find the points in the L1 path where only the k most relevant
      ## features can have nonzero coefficients, and fetch the coefficients
      ## from the least regularized of those points
      ivar <- utils::tail(order, nvar - k)
      steps_k_var <- which(colSums(search$beta[ivar, , drop = FALSE] != 0) == 0)
      if (length(steps_k_var) > 0) {
        j <- utils::tail(steps_k_var, 1)
      } else {
        ## there are no steps where all the variables in set ivar have zero
        ## coefficients (possibly because one or more of them have penalty = 0
        ## and are therefore always in the model), so fall back to the
        ## coefficients at the start of the path
        j <- 1
      }
      out$alpha[k + 1] <- search$beta0[j]
      out$beta[1:k, k + 1] <- search$beta[order[1:k], j]
      out$lambda[k + 1] <- search$lambda[j]
      out$w[, k + 1] <- search$w[, j]
    }
  }

  out$solution_terms <- order[1:nterms_max]
  if (any(is.na(out$solution_terms)) &&
      length(entered_variables) < nterms_max) {
    if (length(setdiff(notentered_variables,
                       which(penalty == Inf))) > 0) {
      warning("Less than nterms_max variables entered L1-path. ",
              "Try reducing lambda_min_ratio. ")
    }
  }

  return(out)
}

search_L1 <- function(p_ref, refmodel, family, intercept, nterms_max, penalty,
                      opt) {
  frame <- model.frame(refmodel$formula, refmodel$fetch_data())
  contrasts_arg <- get_contrasts_arg_list(
    refmodel$formula,
    refmodel$fetch_data()
  )
  x <- model.matrix(delete.intercept(refmodel$formula),
    data = frame,
    contrasts.arg = contrasts_arg
  )
  ## it is important to keep the original column order because that is the
  ## order in which the lasso path estimates the coefficients
  tt <- terms(refmodel$formula)
  terms_ <- attr(tt, "term.labels")
  search_path <- search_L1_surrogate(
    p_ref, list(refmodel, x = x), family,
    intercept, ncol(x), penalty, opt
  )
  solution_terms <- collapse_contrasts_solution_path(
    refmodel$formula, colnames(x)[search_path$solution_terms],
    refmodel$fetch_data()
  )
  sub_fits <- lapply(0:length(solution_terms), function(nterms) {
    if (nterms == 0) {
      formula <- make_formula(c("1"))
      beta <- NULL
    } else {
      formula <- make_formula(solution_terms[seq_len(nterms)])
      variables <- unlist(lapply(
        solution_terms[seq_len(nterms)],
        function(term) {
          form <- as.formula(paste("~ 0 +", term))
          contrasts_arg <- get_contrasts_arg_list(
            form,
            refmodel$fetch_data()
          )
          return(colnames(model.matrix(form,
            data = refmodel$fetch_data(),
            contrasts.arg = contrasts_arg
          )))
        }
      ))
      indices <- match(variables, colnames(x)[search_path$solution_terms])
      beta <- search_path$beta[indices, max(indices) + 1, drop = FALSE]
    }
    sub <- nlist(
      alpha = search_path$alpha[nterms + 1],
      beta,
      w = search_path$w[, nterms + 1],
      formula,
      x
    )
    class(sub) <- "subfit"
    return(sub)
  })
  return(nlist(
    solution_terms[seq_len(nterms_max)],
    sub_fits[seq_len(nterms_max + 1)]
  ))
}
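
## Usage sketch (illustrative only, not run): as for search_forward() above,
## `p_ref`, `refmodel`, `family`, `penalty`, and `opt` are assumed to come
## from varsel()'s internals; here `opt` needs the elastic-net settings used
## by glm_elnet(), e.g.
## opt = list(lambda_min_ratio = 1e-5, nlambda = 150, thresh = 1e-6).
##   l1 <- search_L1(p_ref, refmodel, family, intercept = TRUE,
##                   nterms_max = 5L, penalty = NULL, opt = opt)
##   l1[[1]]  # solution terms in their order of entry along the L1 path
##   l1[[2]]  # corresponding "subfit" objects for sizes 0, ..., nterms_max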