1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212
|
#' @title Create a classification, regression, survival, cluster, cost-sensitive classification or
#' multilabel task.
#'
#' @description
#' The task encapsulates the data and specifies - through its subclasses -
#' the type of the task.
#' It also contains a description object detailing further aspects of the data.
#'
#' Useful operators are:
#' - [getTaskFormula],
#' - [getTaskFeatureNames],
#' - [getTaskData],
#' - [getTaskTargets], and
#' - [subsetTask].
#'
#' Object members:
#' \describe{
#' \item{env (`environment`)}{Environment where data for the task are stored.
#' Use [getTaskData] in order to access it.}
#' \item{weights ([numeric])}{See argument. `NULL` if not present.}
#' \item{blocking ([factor])}{See argument. `NULL` if not present.}
#' \item{task.desc ([TaskDesc])}{Encapsulates further information about the task.}
#' }
#'
#' Functional data can be added to a task via matrix columns. For more information refer to
#' [makeFunctionalData].
#'
#' @param id (`character(1)`)\cr
#' Id string for object.
#' Default is the name of the R variable passed to `data`.
#' @param data ([data.frame])\cr
#' A data frame containing the features and target variable(s).
#' @param target (`character(1)` | `character(2)` | `character(n.classes)`)\cr
#' Name(s) of the target variable(s).
#' For survival analysis these are the names of the survival time and event columns,
#' so it has length 2. For multilabel classification it contains the names of the logical
#' columns that encode whether a label is present or not and its length corresponds to the
#' number of classes.
#' @param costs ([data.frame])\cr
#' A numeric matrix or data frame containing the costs of misclassification.
#' We assume the general case of observation specific costs.
#' This means we have n rows, corresponding to the observations, in the same order as `data`.
#' The columns correspond to classes and their names are the class labels
#' (if unnamed we use y1 to yk as labels).
#' Each entry (i,j) of the matrix specifies the cost of predicting class j
#' for observation i.
#' @param weights ([numeric])\cr
#' Optional, non-negative case weight vector to be used during fitting.
#' Cannot be set for cost-sensitive learning.
#' Default is `NULL` which means no (= equal) weights.
#' @param blocking ([factor])\cr
#' An optional factor of the same length as the number of observations.
#' Observations with the same blocking level \dQuote{belong together}.
#' Specifically, they are either put all in the training or the test set
#' during a resampling iteration.
#' Default is `NULL` which means no blocking.
#' @param positive (`character(1)`)\cr
#' Positive class for binary classification (otherwise ignored and set to NA).
#' Default is the first factor level of the target attribute.
#' @param fixup.data (`character(1)`)\cr
#' Should some basic cleaning up of data be performed?
#' Currently this means removing empty factor levels for the columns.
#' Possible choices are:
#' \dQuote{no} = Don't do it.
#' \dQuote{warn} = Do it but warn about it.
#' \dQuote{quiet} = Do it but keep silent.
#' Default is \dQuote{warn}.
#' @param check.data (`logical(1)`)\cr
#' Should sanity of data be checked initially at task creation?
#' You should have good reasons to turn this off (one might be speed).
#' Default is `TRUE`.
#' @param coordinates ([data.frame])\cr
#' Coordinates of a spatial data set that will be used for spatial partitioning of the data in a spatial cross-validation resampling setting.
#' Coordinates have to be numeric values.
#' Provided [data.frame] needs to have the same number of rows as data and consist of at least two dimensions.
#' @return [Task].
#' @name Task
#' @seealso [ClassifTask] [ClusterTask] [CostSensTask] [MultilabelTask] [RegrTask] [SurvTask]
#' @rdname Task
#' @examples
#' if (requireNamespace("mlbench")) {
#' library(mlbench)
#' data(BostonHousing)
#' data(Ionosphere)
#'
#' makeClassifTask(data = iris, target = "Species")
#' makeRegrTask(data = BostonHousing, target = "medv")
#' # an example of a classification task with more than those standard arguments:
#' blocking = factor(c(rep(1, 51), rep(2, 300)))
#' makeClassifTask(id = "myIonosphere", data = Ionosphere, target = "Class",
#' positive = "good", blocking = blocking)
#' makeClusterTask(data = iris[, -5L])
#' }
NULL
#' Exported for internal use.
#' @param id (`character(1)`)\cr
#' task id
#' @param data ([data.frame])\cr
#' data
#' @param target ([character])\cr
#' target columns
#' @param weights ([numeric])\cr
#' weights
#' @param blocking ([numeric])\cr
#' task data blocking
#' @param coordinates ([data.frame])\cr
#' Coordinates of a spatial data set that will be used for spatial partitioning of the data in a spatial cross-validation resampling setting.
#' Coordinates have to be numeric values.
#' Provided ([data.frame]) needs to have the same number of rows as data and consist of at least two dimensions.
#' @keywords internal
#' @name makeTaskDesc
NULL
# Builds the base "Task" S3 object: optionally drops empty factor levels from
# `data`, optionally validates the inputs, stores the data in a fresh
# environment and bundles everything into the returned object.
#
# Arguments:
#   type         Task type identifier; stored unchanged in the result.
#   data         Data frame holding the task data.
#   weights      Optional non-negative numeric vector with one entry per row
#                of `data`, or NULL.
#   blocking     Optional factor with one entry per row of `data`, or NULL.
#   fixup.data   "no" leaves `data` untouched, "quiet" drops empty factor
#                levels silently, "warn" drops them and warns about the
#                affected columns.
#   check.data   If TRUE, validate `data`, `weights`, `blocking` and
#                `coordinates`.
#   coordinates  Optional object with at least two columns and one row per
#                row of `data`, or NULL. A non-data-frame only triggers a warning.
#
# Returns an object of class "Task" with members type, env (environment whose
# `data` entry holds the data), weights, blocking, coordinates and task.desc.
# task.desc is set to NA here.
# NOTE(review): task.desc is presumably filled in by the caller - confirm.
makeTask = function(type, data, weights = NULL, blocking = NULL, fixup.data = "warn", check.data = TRUE, coordinates = NULL) {
  if (fixup.data == "quiet") {
    data = droplevels(data)
  } else if (fixup.data == "warn") {
    # go column by column so the warning can name exactly the affected columns
    dropped = logical(ncol(data))
    for (i in seq_col(data)) {
      x = data[[i]]
      if (is.factor(x) && hasEmptyLevels(x)) {
        dropped[i] = TRUE
        data[[i]] = droplevels(x)
      }
    }
    if (any(dropped)) {
      warningf("Empty factor levels were dropped for columns: %s", collapse(colnames(data)[dropped]))
    }
  }

  if (check.data) {
    assertDataFrame(data, col.names = "strict")
    # subclasses of data.frame are accepted but converted to a plain data.frame
    if (class(data)[1] != "data.frame") {
      warningf("Provided data is not a pure data.frame but from class %s, hence it will be converted.", class(data)[1])
      data = as.data.frame(data)
    }
    if (!is.null(weights)) {
      assertNumeric(weights, len = nrow(data), any.missing = FALSE, lower = 0)
    }
    if (!is.null(blocking)) {
      # `len` already enforces one blocking entry per observation
      assertFactor(blocking, len = nrow(data), any.missing = FALSE)
    }
    if (!is.null(coordinates)) {
      # NROW/NCOL treat a dimensionless vector as a single column, so such input
      # ends in one of the messages below instead of failing on `if (NULL != n)`
      if (NROW(coordinates) != nrow(data)) {
        stop("Coordinates need to have the same length data! Or pass none at all.")
      }
      if (NCOL(coordinates) < 2L) {
        stop("Supplied coordinates need to consist of at least two dimensions.")
      }
      if (!is.data.frame(coordinates)) {
        # class() can return several entries (e.g. for a matrix); report only
        # the first one so the message is formatted exactly once
        warningf("Provided coordinates are not given as a data frame but as class %s. Please provide a data frame.", class(coordinates)[1L])
      }
    }
  }

  # the data lives in its own environment that does not inherit from any other
  env = new.env(parent = emptyenv())
  env$data = data
  makeS3Obj("Task",
    type = type,
    env = env,
    weights = weights,
    blocking = blocking,
    coordinates = coordinates,
    task.desc = NA
  )
}
# Validates the columns `cols` of `data` and stops at the first problem found.
#
# Numeric columns must not contain infinite or NaN values, factor columns must
# not have empty levels, and any other column type is rejected.
#
# Arguments:
#   data  Data frame (or list-like object) to inspect.
#   cols  Columns to check; defaults to all columns of `data`.
#
# Returns TRUE invisibly if every checked column passes.
checkTaskData = function(data, cols = names(data)) {
  selected = data[cols]
  for (idx in seq_along(cols)) {
    col.name = cols[[idx]]
    values = selected[[idx]]

    if (is.numeric(values)) {
      if (anyInfinite(values)) {
        stopf("Column '%s' contains infinite values.", col.name)
      }
      if (anyNaN(values)) {
        stopf("Column '%s' contains NaN values.", col.name)
      }
      next
    }

    # anything that is neither numeric nor factor is not supported
    if (!is.factor(values)) {
      stopf("Unsupported feature type (%s) in column '%s'.", class(values)[1L], col.name)
    }
    if (hasEmptyLevels(values)) {
      stopf("Column '%s' contains empty factor levels.", col.name)
    }
  }
  invisible(TRUE)
}
# Print method for "Task" objects: writes a short summary, read from the task
# description `x$task.desc`, to the console.
#
# Arguments:
#   x              Task object; only `x$task.desc` is read.
#   print.weights  If TRUE, include the "Has weights" line.
#   ...            Not used by this method.
#
# Returns `x` invisibly, following the convention for print methods.
#' @export
print.Task = function(x, print.weights = TRUE, ...) {
  td = x$task.desc
  catf("Task: %s", td$id)
  catf("Type: %s", td$type)
  catf("Observations: %i", td$size)
  catf("Features:")
  # pass the rendered table as data, not as the format string, so a "%" in it
  # cannot be misread as a conversion specification
  catf("%s", printToChar(td$n.feat, collapse = "\n"))
  catf("Missings: %s", td$has.missings)
  if (print.weights) {
    catf("Has weights: %s", td$has.weights)
  }
  catf("Has blocking: %s", td$has.blocking)
  catf("Has coordinates: %s", td$has.coordinates)
  invisible(x)
}
|