File: RLearner_regr_randomForest.R

Package: r-cran-mlr 2.19.2+dfsg-1
#' @export
makeRLearner.regr.randomForest = function() {
  makeRLearnerRegr(
    cl = "regr.randomForest",
    package = "randomForest",
    par.set = makeParamSet(
      makeIntegerLearnerParam(id = "ntree", default = 500L, lower = 1L),
      makeIntegerLearnerParam(id = "se.ntree", default = 100L, lower = 1L, when = "both", requires = quote(se.method == "bootstrap")),
      makeDiscreteLearnerParam(id = "se.method", default = "sd",
        values = c("bootstrap", "jackknife", "sd"),
        requires = quote(se.method %in% "jackknife" && keep.inbag == TRUE),
        when = "both"),
      makeIntegerLearnerParam(id = "se.boot", default = 50L, lower = 1L, when = "both"),
      makeIntegerLearnerParam(id = "mtry", lower = 1L),
      makeLogicalLearnerParam(id = "replace", default = TRUE),
      makeUntypedLearnerParam(id = "strata", tunable = FALSE),
      makeIntegerVectorLearnerParam(id = "sampsize", lower = 1L),
      makeIntegerLearnerParam(id = "nodesize", default = 5L, lower = 1L),
      makeIntegerLearnerParam(id = "maxnodes", lower = 1L),
      makeLogicalLearnerParam(id = "importance", default = FALSE),
      makeLogicalLearnerParam(id = "localImp", default = FALSE),
      makeIntegerLearnerParam(id = "nPerm", default = 1L),
      makeLogicalLearnerParam(id = "proximity", default = FALSE, tunable = FALSE),
      makeLogicalLearnerParam(id = "oob.prox", requires = quote(proximity == TRUE), tunable = FALSE),
      makeLogicalLearnerParam(id = "do.trace", default = FALSE, tunable = FALSE),
      makeLogicalLearnerParam(id = "keep.forest", default = TRUE, tunable = FALSE),
      makeLogicalLearnerParam(id = "keep.inbag", default = FALSE, tunable = FALSE)
    ),
    properties = c("numerics", "factors", "ordered", "se", "oobpreds", "featimp"),
    name = "Random Forest",
    short.name = "rf",
    note = "See the section about 'regr.randomForest' in `?makeLearner` for information about se estimation. Note that the rf can freeze the R process if trained on a task with 1 feature which is constant. This can happen in feature forward selection, also due to resampling, and you need to remove such features with removeConstantFeatures. keep.inbag is NULL by default but if predict.type = 'se' and se.method = 'jackknife' (the default) then it is automatically set to TRUE.",
    callees = "randomForest"
  )
}
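
# A minimal usage sketch (illustration only, not part of the package code):
# constructing this learner through mlr's public API and requesting standard
# errors. Assumes the mlr and randomForest packages are installed; bh.task is
# the BostonHousing example task shipped with mlr.
library(mlr)
lrn = makeLearner("regr.randomForest", predict.type = "se",
  se.method = "jackknife", keep.inbag = TRUE, ntree = 200L)
mod = train(lrn, bh.task)
head(as.data.frame(predict(mod, bh.task)))  # contains 'response' and 'se' columns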

#' @export
trainLearner.regr.randomForest = function(.learner, .task, .subset, .weights = NULL, se.method = "sd", keep.inbag = NULL, se.boot = 50L, se.ntree = 100L, ...) {
  data = getTaskData(.task, .subset, target.extra = TRUE)
  # the jackknife SE estimator needs the inbag counts, so record them unless
  # the user set keep.inbag explicitly
  if (is.null(keep.inbag)) keep.inbag = (se.method == "jackknife" && .learner$predict.type == "se")
  m = randomForest::randomForest(x = data[["data"]], y = data[["target"]], keep.inbag = keep.inbag, ...)
  if (.learner$predict.type == "se" && se.method == "bootstrap") {
    # for the bootstrap SE estimator, additionally fit se.boot bagged forests
    # of se.ntree trees each on the training data
    base.lrn = setPredictType(.learner, "response")
    base.lrn = setHyperPars(base.lrn, ntree = se.ntree)
    bag.rf = makeBaggingWrapper(base.lrn, se.boot, bw.replace = TRUE)
    m2 = train(bag.rf, .task, .subset, .weights)
    m = list(single.model = m, bagged.models = m2)
  }
  return(m)
}
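
# Sketch (same assumptions as above): with se.method = "bootstrap" the fitted
# object is not a plain randomForest but a list of the full forest and the
# bagged helper models, as built in the branch above.
lrn.boot = makeLearner("regr.randomForest", predict.type = "se",
  se.method = "bootstrap", se.boot = 10L, se.ntree = 25L, ntree = 100L)
mod.boot = train(lrn.boot, bh.task)
names(getLearnerModel(mod.boot))  # "single.model" "bagged.models"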

#' @export
predictLearner.regr.randomForest = function(.learner, .model, .newdata, se.method = "sd", ...) {
  if (se.method == "bootstrap") {
    pred = predict(.model$learner.model$single.model, newdata = .newdata, ...)
  } else {
    # predict.all = TRUE additionally returns the per-tree predictions needed
    # by the jackknife and sd estimators below
    pred = predict(.model$learner.model, newdata = .newdata, predict.all = (.learner$predict.type == "se"), ...)
  }
  if (.learner$predict.type == "se") {
    if (se.method == "bootstrap") {
      se = bootstrapStandardError(.learner, .model, .newdata, ...)
      return(cbind(pred, se))
    } else if (se.method == "jackknife") {
      se = jacknifeStandardError(
        aggregated.predictions = pred$aggregate,
        individual.predictions = pred$individual,
        bag.counts = .model$learner.model$inbag)
    } else if (se.method == "sd") {
      se = sdStandardError(individual.predictions = pred$individual)
    }
    return(cbind(pred$aggregate, se))
  } else {
    return(pred)
  }
}
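
# Sketch (continuing the example above): the two columns returned here surface
# as the 'response' and 'se' columns of the Prediction object.
pred.se = predict(mod, bh.task)
summary(getPredictionSE(pred.se))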

#' @export
getOOBPredsLearner.regr.randomForest = function(.learner, .model) {
  getLearnerModel(.model, more.unwrap = TRUE)$predicted
}
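
# Sketch (continuing the example above): out-of-bag predictions are accessed
# through mlr's getOOBPreds(), which dispatches to the method above.
oob.pred = getOOBPreds(mod, bh.task)
performance(oob.pred, measures = mse)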

# Computes the brute-force or the noisy bootstrap standard error
# Set se.ntree = ntree for the brute-force bootstrap
# Set se.ntree << ntree for the noisy bootstrap (Monte Carlo bias corrected)
bootstrapStandardError = function(.learner, .model, .newdata,
  se.ntree = 100L, se.boot = 50L, ...) {

  single.model = getLearnerModel(.model)$single.model # get raw RF model
  bagged.models = getLearnerModel(getLearnerModel(.model)$bagged.models) # get list of unbagged mlr models
  pred.bagged = lapply(bagged.models, function(x) predict(getLearnerModel(x), newdata = .newdata, predict.all = TRUE))
  pred.boot.all = extractSubList(pred.bagged, "individual", simplify = FALSE)
  ntree = single.model$ntree
  # following the formula in 3.3 in Sexton and Laake 2009 - Standard errors for bagged and random forest estimators
  # M = ntree    # number of ensembles
  # R = se.ntree # new (reduced) number of ensembles
  # B = se.boot  # number of bootstrap samples
  # Bias is defined as
  # (1/R - 1/M) / (BR*(R-1)) *
  # (sum over all B:
  #   (sum over all R:
  #     (prediction for x of ensemble r in bootstrap b - average prediction for x over all ensembles in bootstrap b)^2
  #   )
  # )
  bias = rowSums(matrix(vapply(pred.boot.all, function(p) rowSums((p - rowMeans(p))^2), numeric(nrow(pred.boot.all[[1]]))), nrow = nrow(.newdata), ncol = se.boot, byrow = FALSE))
  bias = ((1 / se.ntree) - (1 / ntree)) / (se.boot * se.ntree * (se.ntree - 1)) * bias
  pred.boot.aggregated = extractSubList(pred.bagged, "aggregate")
  pred.boot.aggregated = matrix(pred.boot.aggregated, nrow = nrow(.newdata), ncol = se.boot, byrow = FALSE)
  var.boot = apply(pred.boot.aggregated, 1, var) - bias
  var.boot = pmax(var.boot, 0)
  sqrt(var.boot)
}
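
# Toy illustration (not package code): the Monte Carlo bias term written out
# for two test points, mirroring the vapply/rowSums pipeline above. p.list
# plays the role of pred.boot.all: one n x R matrix of per-tree predictions
# for each of the B bootstrap replicates.
set.seed(1)
R = 5L; B = 3L; M = 50L  # se.ntree, se.boot, ntree
p.list = replicate(B, matrix(rnorm(2 * R), nrow = 2L), simplify = FALSE)
inner = vapply(p.list, function(p) rowSums((p - rowMeans(p))^2), numeric(2L))  # n x B
((1 / R) - (1 / M)) / (B * R * (R - 1)) * rowSums(inner)  # per-observation bias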

# Computes the Monte Carlo bias-corrected jackknife-after-bootstrap
# @param aggregated.predictions `vector(n)`
#   Vector of length n of predictions, aggregated over all individual predictions
# @param individual.predictions `matrix`
#   The individual predictions. Each row represents one observation and each column represents the predictions of one base learner.
# @param bag.counts `matrix`
#   The inbag counts of the model. Each row represents an observation of the training set and each column represents one base learner.
#   The number indicates how often this observation occurs in the bootstrap sample of the respective base learner.
jacknifeStandardError = function(aggregated.predictions, individual.predictions, bag.counts) {

  nbase = ncol(individual.predictions)
  # keep only training observations that are out-of-bag in at least one tree
  bag.counts = bag.counts[rowSums(bag.counts == 0) > 0, , drop = FALSE]
  n = nrow(bag.counts)
  oob = bag.counts == 0
  # one jackknife replicate per retained training observation: average the
  # predictions of all trees for which that observation was out-of-bag
  jack.n = apply(oob, 1, function(x) rowMeans(individual.predictions[, x, drop = FALSE]))
  if (is.vector(jack.n)) {
    jack.n = t(as.matrix(jack.n))
  }
  jack = (n - 1) / n * rowSums((jack.n - aggregated.predictions)^2)
  bias = (exp(1) - 1) * n / nbase^2 * rowSums((individual.predictions - aggregated.predictions)^2)
  jab = pmax(jack - bias, 0)
  sqrt(jab)
}
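
# Toy call (not package code): three test points, four trees, five training
# observations; every row of counts has at least one zero, i.e. every training
# observation is out-of-bag in at least one tree.
set.seed(1)
preds = matrix(rnorm(12L), nrow = 3L)  # 3 test points x 4 trees
counts = matrix(c(
  0L, 1L, 2L, 1L,
  1L, 0L, 1L, 2L,
  2L, 1L, 0L, 1L,
  1L, 2L, 1L, 0L,
  0L, 2L, 1L, 1L), nrow = 5L, byrow = TRUE)
jacknifeStandardError(rowMeans(preds), preds, counts)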

# Computes the standard deviation of the predictions across trees
# @param individual.predictions `matrix`
#   The individual predictions. Each row represents one observation and each column represents the predictions of one base learner.
sdStandardError = function(individual.predictions) {
  apply(individual.predictions, 1, sd)
}
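
# E.g. (not package code): sdStandardError(matrix(1:6, nrow = 2L)) treats the
# rows c(1, 3, 5) and c(2, 4, 6) as per-tree predictions of two observations
# and returns c(2, 2), the standard deviation across the three trees.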

#' @export
getFeatureImportanceLearner.regr.randomForest = function(.learner, .model, ...) {
  getFeatureImportanceLearner.classif.randomForest(.learner, .model, ...)
}
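
# Sketch (same assumptions as the earlier examples): importance values are
# queried through mlr's getFeatureImportance(); fit with importance = TRUE to
# get permutation-based importance from randomForest.
lrn.imp = makeLearner("regr.randomForest", importance = TRUE)
mod.imp = train(lrn.imp, bh.task)
getFeatureImportance(mod.imp)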