1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221
|
test_that("Impute data frame", {
data = data.frame(f = letters[c(1, 1, 1, 1, 2)], x = rep(1., 5),
y = c(1, 2, 3, 3, 4), z = NA, stringsAsFactors = TRUE)
target = "z"
data[6, ] = NA
# median
imputed = impute(data, target = target, cols = list(x = imputeMedian(),
y = imputeMedian()))$data
expect_equal(imputed$x[6], 1)
expect_equal(imputed$y[6], 3)
# mode
imputed = impute(data, target = target, cols = list(f = imputeMode(),
x = imputeMode(), y = imputeMode()))$data
expect_equal(as.character(imputed$f[6]), "a")
expect_equal(imputed$x[6], 1)
expect_equal(imputed$y[6], 3)
# min / max
imputed = impute(data, target = target, cols = list(x = imputeMin(2),
y = imputeMax(0)))$data
expect_equal(imputed$x[6], 1)
expect_equal(imputed$y[6], max(data$y, na.rm = TRUE))
imputed = impute(data, target = target, classes = list(numeric = imputeMax()))$data
expect_equal(imputed$x[6], 1)
expect_equal(imputed$y[6], max(data$y, na.rm = TRUE) + diff(range(data$y, na.rm = TRUE)))
# normal
imputed = impute(data, target = target, cols = list(x = imputeNormal(),
y = imputeNormal()))$data
expect_equal(imputed$x[6], 1)
# hist / table
imputed = impute(data, target = target, cols = list(f = imputeHist(),
x = imputeHist(), y = imputeHist(breaks = 1, use.mids = FALSE)))$data
expect_true(imputed$f[6] %in% c("a", "b"))
expect_equal(imputed$x[6], 0.5)
expect_equal(imputed$x[6], 0.5)
expect_true(imputed$y[6] >= 0 && imputed$y[6] <= 5)
# learner
data2 = data.frame(V1 = 1:10, V2 = 1:10, V3 = 1:10,
col = factor(rep(1:2, c(3, 7))), z = 1:10)
data2$V2[9:10] = NA
data2$V3[1:2] = NA
data2$col[8:10] = NA
# we impute col case 1: feature used for imputation (V1) does not have
# missings used to check functionality with a learner that does not have
# property "missings" (see #1035)
lrn = makeLearner("classif.lda")
expect_false(hasLearnerProperties(lrn, "missings"))
imputed = impute(data2, target = target, cols = list(col = imputeLearner(lrn, features = "V1")))$data
expect_true(all(imputed$col[8:10] == "2"))
# case 2: feature used for imputation (V2) has missings only in rows where col
# has missings in this case the imputation task does not have property
# "missings", but a learner with property "missings" is required for
# imputation
expect_error(impute(data2, target = target, cols = list(col = imputeLearner("classif.lda", features = "V2"))), "used for imputation has/have missing values, but learner")
lrn = makeLearner("classif.naiveBayes")
expect_true(hasLearnerProperties(lrn, "missings"))
imputed = impute(data2, target = target, cols = list(col = imputeLearner(lrn, features = "V2")))$data
expect_true(all(imputed$col[8:10] == "2"))
# case 3: feature used for imputation (V3) has missings only in rows where col
# does not have missings in this case the imputation task has property
# "missings"
expect_error(impute(data2, target = target, cols = list(col = imputeLearner("classif.lda", features = "V3"))), "used for imputation has/have missing values, but learner")
imputed = impute(data2, target = target, cols = list(col = imputeLearner("classif.naiveBayes", features = "V3")))$data
expect_true(all(imputed$col[8:10] == "2"))
# we had an issue here (see #26) where e.g. imputation for integer/numeric
# features via a classif learner showed inconsistent behavior and resulted in
# weird error messages
data2$col2 = as.integer(data2$col)
# case 1: impute an integer (data2$col2) with a classif learner (integers are
# coerced to factors by checkTaskData) using learner classif.lvq1 because it
# doesn't work with integer targets (see #26)
set.seed(getOption("mlr.debug.seed"))
imputed = impute(data2, cols = list(col2 = imputeLearner("classif.lvq1",
features = "V1")))$data
expect_true(all(imputed$col2[8:10] == "2"))
# case 2: impute a numeric (data$x) with a classif learner
expect_error(impute(data, target = target,
cols = list(x = imputeLearner("classif.naiveBayes"))),
"Assertion on 'x' failed")
# case 3: impute a factor (data$f) with a regr learner
expect_error(impute(data, target = target,
cols = list(f = imputeLearner("regr.rpart"))),
"Assertion on 'f' failed")
# constant replacements
imputed = impute(data, target = target, cols = list(f = "xxx", x = 999, y = 1000))$data
expect_equal(as.character(imputed$f[6]), "xxx")
expect_equal(imputed$x[6], 999)
expect_equal(imputed$y[6], 1000)
# some reimputations
x = impute(data, target = target, cols = list(f = "xxx", x = imputeMode(),
y = imputeMax(2)), impute.new.levels = TRUE)
imputed = reimpute(data, x$desc)
expect_equal(x$data, imputed)
expect_error(reimpute(data.frame(f = factor("newlvl"), x = NA), x$desc), "column types have changed")
imputed = reimpute(data.frame(f = factor("newlvl"), x = NA_real_), x$desc)
expect_equal(as.character(imputed$f), "xxx")
expect_equal(imputed$x, 1)
# FIXME: y was never in input data? therefore next test fails?
# expect_equal(imputed$y, 8)
x = impute(data, target = target, cols = list(f = "xxx"), impute.new.levels = FALSE)
imputed = reimpute(data.frame(f = factor("newlvl"), x = NA_real_), x$desc)
expect_true(is.na(imputed$f))
expect_true("xxx" %in% levels(imputed$f))
# dummies
x = impute(data, target = target, dummy.cols = "x")
expect_equal(x$data[["x.dummy"]], as.factor(c(rep(FALSE, 5), TRUE)))
expect_equal(reimpute(data, x$desc), x$data)
x = impute(data, target = target, dummy.cols = "x", dummy.type = "numeric")
expect_equal(x$data[["x.dummy"]], as.numeric(c(rep(FALSE, 5), TRUE)))
expect_equal(reimpute(data, x$desc), x$data)
x = impute(data, target = target, dummy.classes = "numeric")$data
expect_true(setequal(names(x), c(names(data), "x.dummy", "y.dummy")))
x = impute(data, target = target, dummy.classes = "numeric", dummy.cols = "z")$data
expect_true(setequal(names(x), c(names(data), "x.dummy", "y.dummy", "z.dummy")))
x = impute(data, target = character(0), classes = list(factor = imputeMode(), numeric = imputeMedian(),
integer = imputeMedian(), logical = imputeConstant(1)))
expect_true(all(!is.na(x$data)))
data2 = data[1:5, ]
x = impute(data2, target = target, dummy.classes = c("numeric", "logical", "factor"), force.dummies = TRUE)
expect_true(setequal(x$desc$dummies, c("f", "x", "y")))
x = impute(data2, target = target, dummy.classes = c("numeric", "logical", "factor"), force.dummies = FALSE)
expect_true(setequal(x$desc$dummies, character(0)))
})
test_that("Impute and reimpute task", {
data = data.frame(f = letters[c(1, 1, 1, 1, 2)], x = rep(1., 5), y = c(1, 2, 3, 3, 4),
stringsAsFactors = TRUE)
data[6L, ] = NA
classif.tar = factor(c(rep(c("a", "b"), 3L)))
regr.tar = rep(c(.1, .2), 3L)
# additional data-frame to check reimpute
data2 = data.frame(f = letters[c(2, 1, 1, 1, 1)], x = rep(2., 5), y = c(2, 4, 2, 3, 3),
stringsAsFactors = TRUE)
data2[6L, ] = NA
data2$z = classif.tar
# test classif task
data$z = classif.tar
data2$z = classif.tar
classif.tsk = makeClassifTask(data = data, target = "z")
imputed = impute(classif.tsk,
cols = list(f = imputeConstant("c"), x = imputeMean(), y = imputeMode()))
imputed.tsk.data = getTaskData(imputed$task)
imputed.data = impute(data, target = "z",
cols = list(f = imputeConstant("c"), x = imputeMean(), y = imputeMode()))$data
test.tsk = makeClassifTask(data = data2, target = "z")
imputed.desc = imputed$desc
imputed.test.tsk = reimpute(test.tsk, imputed.desc)
test.imputed = reimpute(data2, imputed.desc)
expect_identical(imputed.data, imputed.tsk.data)
expect_equal(class(classif.tsk), class(imputed$task))
expect_identical(test.imputed, imputed.test.tsk$env$data)
expect_equal(class(test.tsk), class(imputed.test.tsk))
})
test_that("ImputeWrapper", {
d = iris[seq(1, 150, 3), ]
d[1, 1] = NA_real_
task = makeClassifTask(data = d, target = "Species")
lrn = makeImputeWrapper("classif.rpart", classes = list(numeric = imputeMedian()))
m = train(lrn, task)
p = predict(m, task)
expect_true(!any(is.na(p$data$response)))
mm = getLearnerModel(m, more.unwrap = TRUE)
expect_output(print(mm), "root")
expect_s3_class(mm, "rpart")
mm = getLearnerModel(m, more.unwrap = FALSE)
expect_output(print(mm), "Model")
expect_s3_class(mm, "WrappedModel")
expect_match(lrn$id, "[.]imputed$")
})
test_that("Impute works on non missing data", { # we had issues here: 848,893
data = data.frame(a = c(1, 1, 2), b = 1:3)
impute.methods = list(
imputeConstant(0),
imputeMedian(),
imputeMean(),
imputeMode(),
imputeMin(),
imputeMax(),
imputeUniform(),
imputeNormal(),
imputeHist(),
imputeLearner(learner = makeLearner("regr.fnn"))
)
for (impute.method in impute.methods) {
imputed = impute(data, cols = list(a = impute.method))$data
expect_equal(data, imputed)
}
# test it in resampling
dat = data.frame(y = rnorm(10), a = c(NA, rnorm(9)), b = rnorm(10))
task = makeRegrTask(data = dat, target = "y")
implrn = imputeLearner(makeLearner("regr.rpart"))
lrn = makeImputeWrapper(makeLearner("regr.lm"), cols = list(a = implrn))
holdout(lrn, task)
})
test_that("Logicals are casted to factors instead of character (#1522)", {
x = data.frame(a = c(TRUE, FALSE, NA))
y = impute(x, cols = list(a = imputeConstant("__miss__")))
res = factor(c("TRUE", "FALSE", "__miss__"), levels = c("FALSE", "TRUE", "__miss__"))
expect_equal(y$data$a, res)
})
|