File: test_base_imbal_overundersample.R

package info (click to toggle)
r-cran-mlr 2.19.1%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 8,392 kB
  • sloc: ansic: 65; sh: 13; makefile: 5
file content (144 lines) | stat: -rwxr-xr-x 5,571 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144

# Checks the class counts produced by oversample() and undersample() when no
# class is specified explicitly.
test_that("over and undersample works", {
  y = binaryclass.df[, binaryclass.target]
  counts.orig = table(y)

  # rate = 2: the "R" count is expected to double, the "M" count to stay put
  task.over = oversample(binaryclass.task, rate = 2)
  counts.over = table(getTaskData(task.over)[, binaryclass.target])
  expect_equal(counts.over["M"], counts.orig["M"])
  expect_equal(counts.over["R"], counts.orig["R"] * 2)

  # rate = 0.5: the "M" count is expected to be halved (rounded), "R" unchanged
  task.under = undersample(binaryclass.task, rate = 0.5)
  counts.under = table(getTaskData(task.under)[, binaryclass.target])
  expect_equal(counts.under["M"], round(counts.orig["M"] / 2))
  expect_equal(counts.under["R"], counts.orig["R"])
})

# Smoke test: resampling a learner wrapped for under- or oversampling has to
# yield a non-missing aggregated performance value.
test_that("over and undersample wrapper", {
  cv.desc = makeResampleDesc("CV", iters = 2)
  base.lrn = makeLearner("classif.rpart")

  under.lrn = makeUndersampleWrapper(base.lrn, usw.rate = 0.5)
  res.under = resample(under.lrn, binaryclass.task, cv.desc)
  expect_false(is.na(res.under$aggr))

  over.lrn = makeOversampleWrapper(base.lrn, osw.rate = 1.5)
  res.over = resample(over.lrn, binaryclass.task, cv.desc)
  expect_false(is.na(res.over$aggr))
})

# Both sampling functions are expected to fail with an error mentioning
# "binary" when given the task built from the multiclass fixture data.
test_that("over and undersample arg check works", {
  task = makeClassifTask(data = multiclass.df, target = multiclass.target)
  expect_error(undersample(task, rate = 0.5), "binary")
  # Use a rate that is valid for oversampling (>= 1), so the task itself is
  # the only possible cause of the error and the test does not depend on the
  # order in which oversample() validates its arguments.
  expect_error(oversample(task, rate = 2), "binary")
})

# Undersampling a weighted task must shrink the weight vector, and every
# remaining weight must be one of the original weights.
test_that("over and undersample works with weights", {
  n.obs = nrow(binaryclass.df)
  weighted.task = makeClassifTask(data = binaryclass.df,
    target = binaryclass.target, weights = seq_len(n.obs))
  reduced.task = undersample(weighted.task, rate = 0.5)

  w.before = weighted.task$weights
  w.after = reduced.task$weights
  expect_lt(length(w.after), length(w.before))
  expect_true(all(w.after %in% w.before))
})

# Every index reported in min.inds must still be present among the indices
# returned by sampleBinaryClass() when that class is sampled with rate 1.05.
test_that("oversampling keeps all min / max obs", {
  target = binaryclass.df[, binaryclass.target]
  classes = getMinMaxClass(target)
  sampled.inds = sampleBinaryClass(target, 1.05, cl = classes$min.name,
    resample.other.class = FALSE)
  expect_true(all(classes$min.inds %in% sampled.inds))
})

# Checks that the `cl` argument (functions) and the `usw.cl` / `osw.cl`
# arguments (wrappers) select the class that is resampled.
test_that("control which class gets over or under sampled", {
  set.seed(getOption("mlr.debug.seed"))

  # --- oversample() / undersample() with an explicitly chosen class ---
  y = binaryclass.df[, binaryclass.target]
  counts.orig = table(y)
  classes = getMinMaxClass(y)

  # oversampling max.name: the "M" count doubles, the "R" count is unchanged
  task.over = oversample(binaryclass.task, rate = 2, cl = classes$max.name)
  counts.over = table(getTaskData(task.over)[, binaryclass.target])
  expect_equal(counts.over["R"], counts.orig["R"])
  expect_equal(counts.over["M"], counts.orig["M"] * 2)

  # undersampling min.name: the "R" count is halved, the "M" count is unchanged
  task.under = undersample(binaryclass.task, rate = 0.5, cl = classes$min.name)
  counts.under = table(getTaskData(task.under)[, binaryclass.target])
  expect_equal(counts.under["R"], round(counts.orig["R"] / 2))
  expect_equal(counts.under["M"], counts.orig["M"])

  # --- over- and undersample wrappers with an explicitly chosen class ---
  classes = getMinMaxClass(binaryclass.df[, binaryclass.target])
  cv.desc = makeResampleDesc("CV", iters = 2)
  base.lrn = makeLearner("classif.rpart")

  under.lrn = makeUndersampleWrapper(base.lrn, usw.rate = 0.1,
    usw.cl = classes$min.name)
  res.under = resample(under.lrn, binaryclass.task, cv.desc)
  expect_false(is.na(res.under$aggr))

  over.lrn = makeOversampleWrapper(base.lrn, osw.rate = 1.5,
    osw.cl = classes$max.name)
  res.over = resample(over.lrn, binaryclass.task, cv.desc)
  expect_false(is.na(res.over$aggr))
})

# With predict = "both", a measure aggregated with train.mean is evaluated on
# the training predictions; the number of those predictions is compared with
# half the task size for an undersampling and an oversampling wrapper.
test_that("training performance works as expected (#1357)", {
  # custom measure: number of responses in the prediction it is evaluated on
  count.responses = function(task, model, pred, feats, extra.args) {
    length(pred$data$response)
  }
  num = makeMeasure(id = "num", minimize = FALSE,
    properties = c("classif", "classif.multi", "req.pred", "req.truth"),
    name = "Number", fun = count.responses)
  train.measures = list(setAggregation(num, train.mean))

  target = binaryclass.df[, binaryclass.target]
  classes = getMinMaxClass(target)
  holdout = makeResampleDesc("Holdout", split = 0.5, predict = "both")
  half.size = getTaskSize(binaryclass.task) * 0.5

  # undersampling max.name: fewer training predictions than half the task
  under.lrn = makeUndersampleWrapper("classif.rpart", usw.rate = 0.1,
    usw.cl = classes$max.name)
  res.under = resample(under.lrn, binaryclass.task, holdout,
    measures = train.measures)
  expect_lt(res.under$measures.train$num, half.size - 1)

  # oversampling max.name: more training predictions than half the task
  over.lrn = makeOversampleWrapper("classif.rpart", osw.rate = 2,
    osw.cl = classes$max.name)
  res.over = resample(over.lrn, binaryclass.task, holdout,
    measures = train.measures)
  expect_gt(res.over$measures.train$num, half.size + 1)
})

# Regression test for issue #2047: observation weights have to be resampled
# together with the observations by the over-/undersample wrappers, both when
# the weights come from the task and when they are passed to train().
test_that("Wrapper works with weights, we had issue #2047", {
  n = nrow(binaryclass.df)
  # seq_len() instead of 1:n, consistent with the other weights test in this file
  w = seq_len(n)
  task = makeClassifTask(data = binaryclass.df, target = binaryclass.target, weights = w)
  targets = getTaskTargets(task)
  b = table(targets)
  mock.lrn = "classif.__mlrmocklearners__6"

  # Train the wrapped learner and return the `weights` element of the fully
  # unwrapped model (presumably the weights the mock learner was trained
  # with -- verify against the mock learner definition).
  trainedWeights = function(lrn, ...) {
    m = train(lrn, task, ...)
    getLearnerModel(m, more.unwrap = TRUE)$weights
  }

  # weights from task, use all (rate = 1 must not change the set of weights)
  lrn = makeOversampleWrapper(mock.lrn, osw.rate = 1)
  expect_set_equal(trainedWeights(lrn), w)

  lrn = makeUndersampleWrapper(mock.lrn, usw.rate = 1)
  expect_set_equal(trainedWeights(lrn), w)

  # weights from task, really sample
  lrn = makeOversampleWrapper(mock.lrn, osw.rate = 2)
  u = trainedWeights(lrn)
  expect_equal(length(u), min(b) * 2 + max(b))
  expect_subset(u, w)

  lrn = makeUndersampleWrapper(mock.lrn, usw.rate = 0.5)
  u = trainedWeights(lrn)
  expect_equal(length(u), round(max(b) / 2) + min(b))
  expect_subset(u, w)

  # weights from train: use the first 5 observations of each class
  subset = c(
    head(which(targets == names(b)[1]), 5),
    head(which(targets == names(b)[2]), 5)
  )

  # 5 observations doubled + 5 untouched = 15
  lrn = makeOversampleWrapper(mock.lrn, osw.rate = 2)
  u = trainedWeights(lrn, subset = subset, weights = 1:10)
  expect_equal(length(u), 15)
  expect_subset(u, 1:10)

  # 5 observations reduced to 2 (rate 2 / 5) + 5 untouched = 7
  lrn = makeUndersampleWrapper(mock.lrn, usw.rate = 2 / 5)
  u = trainedWeights(lrn, subset = subset, weights = 1:10)
  expect_equal(length(u), 7)
  expect_subset(u, 1:10)
})