File: aggregations.R

package info (click to toggle)
r-cran-mlr 2.13-1
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 6,760 kB
  • sloc: ansic: 65; sh: 13; makefile: 2
file content (263 lines) | stat: -rw-r--r-- 8,263 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
#' @title Aggregation methods.
#'
#' @description
#' \itemize{
#'   \item{**test.mean**}{\cr Mean of performance values on test sets.}
#'   \item{**test.sd**}{\cr Standard deviation of performance values on test sets.}
#'   \item{**test.median**}{\cr Median of performance values on test sets.}
#'   \item{**test.min**}{\cr Minimum of performance values on test sets.}
#'   \item{**test.max**}{\cr Maximum of performance values on test sets.}
#'   \item{**test.sum**}{\cr Sum of performance values on test sets.}
#'   \item{**test.range**}{\cr Difference between the maximum and the minimum of performance
#'     values on test sets.}
#'   \item{**test.rmse**}{\cr Square root of the mean of the squared performance values on test sets.}
#'   \item{**train.mean**}{\cr Mean of performance values on training sets.}
#'   \item{**train.sd**}{\cr Standard deviation of performance values on training sets.}
#'   \item{**train.median**}{\cr Median of performance values on training sets.}
#'   \item{**train.min**}{\cr Minimum of performance values on training sets.}
#'   \item{**train.max**}{\cr Maximum of performance values on training sets.}
#'   \item{**train.sum**}{\cr Sum of performance values on training sets.}
#'   \item{**train.range**}{\cr Difference between the maximum and the minimum of performance
#'     values on training sets.}
#'   \item{**train.rmse**}{\cr Square root of the mean of the squared performance values on
#'     training sets.}
#'   \item{**b632**}{\cr Aggregation for B632 bootstrap.}
#'   \item{**b632plus**}{\cr Aggregation for B632+ bootstrap.}
#'   \item{**testgroup.mean**}{\cr Performance values on test sets are grouped according
#'     to resampling method. The mean for every group is calculated, then the mean of those means.
#'     Mainly used for repeated CV.}
#'   \item{**testgroup.sd**}{\cr Similar to **testgroup.mean** - after
#'     the mean for every group is calculated, the standard deviation of those means is obtained.
#'     Mainly used for repeated CV.}
#'   \item{**test.join**}{\cr Performance measure on joined test sets.
#'     This is especially useful for small sample sizes, where unbalanced group sizes have a
#'     significant impact on the aggregation; in particular, **test.join** can make sense
#'     for cross-validation.
#'     For repeated CV, the performance is calculated on each repetition and then aggregated
#'     with the arithmetic mean.}
#' }
#' @format None
#' @seealso [Aggregation]
#' @name aggregations
#' @rdname aggregations
NULL

# Aggregation "test.mean": arithmetic mean of the test-set performance values.
# Only `perf.test` is read; the other arguments are part of the common
# aggregation-function signature and are not used here.
#' @export
#' @rdname aggregations
test.mean = makeAggregation(
  id = "test.mean",
  name = "Test mean",
  properties = "req.test",
  fun = function(task, perf.test, perf.train, measure, group, pred) {
    mean(perf.test)
  }
)

# Aggregation "test.sd": standard deviation (via sd()) of the test-set
# performance values. Only `perf.test` is read.
#' @export
#' @rdname aggregations
test.sd = makeAggregation(
  id = "test.sd",
  name = "Test sd",
  properties = "req.test",
  fun = function(task, perf.test, perf.train, measure, group, pred) {
    sd(perf.test)
  }
)

# Aggregation "test.median": median of the test-set performance values.
# Only `perf.test` is read.
#' @export
#' @rdname aggregations
test.median = makeAggregation(
  id = "test.median",
  name = "Test median",
  properties = "req.test",
  fun = function(task, perf.test, perf.train, measure, group, pred) {
    median(perf.test)
  }
)

# Aggregation "test.min": smallest of the test-set performance values.
# Only `perf.test` is read.
#' @export
#' @rdname aggregations
test.min = makeAggregation(
  id = "test.min",
  name = "Test minimum",
  properties = "req.test",
  fun = function(task, perf.test, perf.train, measure, group, pred) {
    min(perf.test)
  }
)

# Aggregation "test.max": largest of the test-set performance values.
# Only `perf.test` is read.
#' @export
#' @rdname aggregations
test.max = makeAggregation(
  id = "test.max",
  name = "Test maximum",
  properties = "req.test",
  fun = function(task, perf.test, perf.train, measure, group, pred) {
    max(perf.test)
  }
)

# Aggregation "test.sum": sum of the test-set performance values.
# Only `perf.test` is read.
#' @export
#' @rdname aggregations
test.sum = makeAggregation(
  id = "test.sum",
  name = "Test sum",
  properties = "req.test",
  fun = function(task, perf.test, perf.train, measure, group, pred) {
    sum(perf.test)
  }
)

# Aggregation "test.range": width of the interval spanned by the test-set
# performance values, i.e. maximum minus minimum. Only `perf.test` is read.
#' @export
#' @rdname aggregations
test.range = makeAggregation(
  id = "test.range",
  name = "Test range",
  properties = "req.test",
  fun = function(task, perf.test, perf.train, measure, group, pred) {
    # range() yields c(min, max); diff() turns that pair into max - min
    extremes = range(perf.test)
    diff(extremes)
  }
)

# Aggregation "test.rmse": square root of the mean of the squared test-set
# performance values. Only `perf.test` is read.
#' @export
#' @rdname aggregations
test.rmse = makeAggregation(
  id = "test.rmse",
  name = "Test RMSE",
  properties = "req.test",
  fun = function(task, perf.test, perf.train, measure, group, pred) {
    squared = perf.test^2
    sqrt(mean(squared))
  }
)

# Aggregation "train.mean": arithmetic mean of the training-set performance
# values. Only `perf.train` is read; the other arguments are part of the common
# aggregation-function signature and are not used here.
#' @export
#' @rdname aggregations
train.mean = makeAggregation(
  id = "train.mean",
  name = "Training mean",
  properties = "req.train",
  fun = function(task, perf.test, perf.train, measure, group, pred) {
    mean(perf.train)
  }
)

# Aggregation "train.sd": standard deviation (via sd()) of the training-set
# performance values. Only `perf.train` is read.
#' @export
#' @rdname aggregations
train.sd = makeAggregation(
  id = "train.sd",
  name = "Training sd",
  properties = "req.train",
  fun = function(task, perf.test, perf.train, measure, group, pred) {
    sd(perf.train)
  }
)

# Aggregation "train.median": median of the training-set performance values.
# Only `perf.train` is read.
#' @export
#' @rdname aggregations
train.median = makeAggregation(
  id = "train.median",
  name = "Training median",
  properties = "req.train",
  fun = function(task, perf.test, perf.train, measure, group, pred) {
    median(perf.train)
  }
)

# Aggregation "train.min": smallest of the training-set performance values.
# Only `perf.train` is read.
#' @export
#' @rdname aggregations
train.min = makeAggregation(
  id = "train.min",
  name = "Training min",
  properties = "req.train",
  fun = function(task, perf.test, perf.train, measure, group, pred) {
    min(perf.train)
  }
)

# Aggregation "train.max": largest of the training-set performance values.
# Only `perf.train` is read.
#' @export
#' @rdname aggregations
train.max = makeAggregation(
  id = "train.max",
  name = "Training max",
  properties = "req.train",
  fun = function(task, perf.test, perf.train, measure, group, pred) {
    max(perf.train)
  }
)

# Aggregation "train.sum": sum of the training-set performance values.
# Only `perf.train` is read.
#' @export
#' @rdname aggregations
train.sum = makeAggregation(
  id = "train.sum",
  name = "Training sum",
  properties = "req.train",
  fun = function(task, perf.test, perf.train, measure, group, pred) {
    sum(perf.train)
  }
)

# Aggregation "train.range": width of the interval spanned by the training-set
# performance values, i.e. maximum minus minimum. Only `perf.train` is read.
#' @export
#' @rdname aggregations
train.range = makeAggregation(
  id = "train.range",
  name = "Training range",
  properties = "req.train",
  fun = function(task, perf.test, perf.train, measure, group, pred) {
    # range() yields c(min, max); diff() turns that pair into max - min
    extremes = range(perf.train)
    diff(extremes)
  }
)

# Aggregation "train.rmse": square root of the mean of the squared training-set
# performance values. Only `perf.train` is read.
#' @export
#' @rdname aggregations
train.rmse = makeAggregation(
  id = "train.rmse",
  name = "Training RMSE",
  properties = "req.train",
  fun = function(task, perf.test, perf.train, measure, group, pred) {
    squared = perf.train^2
    sqrt(mean(squared))
  }
)

# Aggregation "b632" (.632 bootstrap): element-wise blend of test and training
# performance with fixed weights 0.632 (test) and 0.368 (train), averaged over
# all elements. Reads both `perf.test` and `perf.train`.
#' @export
#' @rdname aggregations
b632 = makeAggregation(
  id = "b632",
  name = ".632 Bootstrap",
  properties = c("req.train", "req.test"),
  fun = function(task, perf.test, perf.train, measure, group, pred) {
    blended = 0.632 * perf.test + 0.368 * perf.train
    mean(blended)
  }
)


# Aggregation "b632plus" (.632+ bootstrap).
#
# For every index i of `perf.test` (matched against the `iter` column of the
# prediction table):
#   1. gamma: `measure` evaluated on a prediction built from ALL pairings of a
#      truth value with a response value of that iteration (expand.grid), i.e.
#      the performance when truth and response are combined irrespective of
#      which observation they belong to.
#   2. R = (test - train) / (gamma - train), the relative overfitting rate.
#   3. w = 0.632 / (1 - 0.368 * R) and a[i] = (1 - w) * train + w * test.
# The mean of the a[i] is returned.
#
# R is restricted to [0, 1]: it is set to 0 unless test and gamma lie strictly
# on the same side of train, and it is capped at 1. Without this restriction a
# 0/0 produced NaN, and values of R outside [0, 1] produced weights outside
# [0.632, 1] (unbounded near R = 1 / 0.368). Missing performance values still
# propagate as NA. The rule is written symmetrically so that it does not depend
# on whether the measure is minimized or maximized.
#
# FIXME: read this again properly and double check it
# NOTE(review): rows are selected by `iter` only; if as.data.frame(pred) also
# carries training-set rows they enter the gamma computation - confirm.
#' @export
#' @rdname aggregations
b632plus = makeAggregation(
  id = "b632plus",
  name = ".632 Bootstrap plus",
  properties = c("req.train", "req.test"),
  fun = function(task, perf.test, perf.train, measure, group, pred) {
    df = as.data.frame(pred)
    a = numeric(length(perf.test))
    for (i in seq_along(a)) {
      df2 = df[df$iter == i, , drop = FALSE]
      y1 = df2$truth
      y2 = df2$response
      # every truth value paired with every response value of this iteration
      grid = expand.grid(y1, y2, KEEP.OUT.ATTRS = FALSE)
      pred2 = makePrediction(task.desc = pred$task.desc, row.names = rownames(grid),
        id = NULL, truth = grid[, 1L], predict.type = "response", y = grid[, 2L],
        time = NA_real_)
      gamma = performance(pred2, measures = measure)
      # relative overfitting rate, guarded against 0/0 and kept inside [0, 1]
      num = perf.test[i] - perf.train[i]
      den = gamma - perf.train[i]
      R = num / den
      if (!is.na(num) && !is.na(den)) {
        # isTRUE() also covers the Inf * 0 = NaN comparison
        R = if (isTRUE(num * den > 0)) min(R, 1) else 0
      }
      w = 0.632 / (1 - 0.368 * R)
      a[i] = (1 - w) * perf.train[i] + w * perf.test[i]
    }
    mean(a)
  }
)

# Aggregation "testgroup.mean": the test-set performance values are split by
# `group`, each group is averaged, and the mean of those group means is returned.
#' @export
#' @rdname aggregations
testgroup.mean = makeAggregation(
  id = "testgroup.mean",
  name = "Test group mean",
  properties = "req.test",
  fun = function(task, perf.test, perf.train, measure, group, pred) {
    per.group = split(perf.test, group)
    group.means = vnapply(per.group, mean)
    mean(group.means)
  }
)

# Aggregation "testgroup.sd": the test-set performance values are split by
# `group`, each group is averaged, and the standard deviation of those group
# means is returned.
#' @export
#' @rdname aggregations
testgroup.sd = makeAggregation(
  id = "testgroup.sd",
  name = "Test group standard deviation",
  properties = "req.test",
  fun = function(task, perf.test, perf.train, measure, group, pred) {
    per.group = split(perf.test, group)
    group.means = BBmisc::vnapply(per.group, mean)
    sd(group.means)
  }
)

# Aggregation "test.join": instead of aggregating the precomputed `perf.test`
# values, the prediction table of `pred` is split into parts, a new prediction
# object is rebuilt for each part, `measure` is evaluated on it, and the mean of
# these per-part values is returned.
# Parts: if `group` is non-empty, rows are assigned via group[iter]; otherwise
# all rows form one single part.
# Only the predict types "response" and "prob" assign `y`; for any other type
# `y` is not defined inside the helper.
#' @export
#' @rdname aggregations
test.join = makeAggregation(
  id = "test.join",
  name = "Test join",
  properties = "req.test",
  fun = function(task, perf.test, perf.train, measure, group, pred) {
    all.rows = as.data.frame(pred)
    if (length(group)) {
      part.id = group[all.rows$iter]
    } else {
      part.id = factor(rep(1L, nrow(all.rows)))
    }
    # rebuild a prediction from one part of the table and score it with `measure`
    scorePart = function(part) {
      if (pred$predict.type == "response") {
        y = part$response
      } else if (pred$predict.type == "prob") {
        is.prob.col = stri_startswith_fixed(colnames(part), "prob.")
        y = part[, is.prob.col, drop = FALSE]
        # drop the 5-character "prob." prefix from the column names
        colnames(y) = stri_sub(colnames(y), 6L)
      }
      npred = makePrediction(task.desc = pred$task.desc, row.names = rownames(part),
        id = NULL, truth = part$truth, predict.type = pred$predict.type, y = y,
        time = NA_real_)
      performance(npred, measure)
    }
    part.perf = vnapply(split(all.rows, part.id), scorePart)
    mean(part.perf)
  }
)