File: Task.R

package info (click to toggle)
r-cran-mlr 2.19.2%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 8,264 kB
  • sloc: ansic: 65; sh: 13; makefile: 5
file content (212 lines) | stat: -rw-r--r-- 8,194 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
#' @title Create a classification, regression, survival, cluster, cost-sensitive classification or
#' multilabel task.
#'
#' @description
#' The task encapsulates the data and specifies - through its subclasses -
#' the type of the task.
#' It also contains a description object detailing further aspects of the data.
#'
#' Useful operators are:
#' - [getTaskFormula],
#' - [getTaskFeatureNames],
#' - [getTaskData],
#' - [getTaskTargets], and
#' - [subsetTask].
#'
#' Object members:
#' \describe{
#' \item{env (`environment`)}{Environment where data for the task are stored.
#'   Use [getTaskData] in order to access it.}
#' \item{weights ([numeric])}{See argument. `NULL` if not present.}
#' \item{blocking ([factor])}{See argument. `NULL` if not present.}
#' \item{task.desc ([TaskDesc])}{Encapsulates further information about the task.}
#' }
#'
#' Functional data can be added to a task via matrix columns. For more information refer to
#' [makeFunctionalData].
#'
#' @param id (`character(1)`)\cr
#'   Id string for object.
#'   Default is the name of the R variable passed to `data`.
#' @param data ([data.frame])\cr
#'   A data frame containing the features and target variable(s).
#' @param target (`character(1)` | `character(2)` | `character(n.classes)`)\cr
#'   Name(s) of the target variable(s).
#'   For survival analysis these are the names of the survival time and event columns,
#'   so it has length 2. For multilabel classification it contains the names of the logical
#'   columns that encode whether a label is present or not and its length corresponds to the
#'   number of classes.
#' @param costs ([data.frame])\cr
#'   A numeric matrix or data frame containing the costs of misclassification.
#'   We assume the general case of observation specific costs.
#'   This means we have n rows, corresponding to the observations, in the same order as `data`.
#'   The columns correspond to classes and their names are the class labels
#'   (if unnamed we use y1 to yk as labels).
#'   Each entry (i,j) of the matrix specifies the cost of predicting class j
#'   for observation i.
#' @param weights ([numeric])\cr
#'   Optional, non-negative case weight vector to be used during fitting.
#'   Cannot be set for cost-sensitive learning.
#'   Default is `NULL` which means no (= equal) weights.
#' @param blocking ([factor])\cr
#'   An optional factor of the same length as the number of observations.
#'   Observations with the same blocking level \dQuote{belong together}.
#'   Specifically, they are either put all in the training or the test set
#'   during a resampling iteration.
#'   Default is `NULL` which means no blocking.
#' @param positive (`character(1)`)\cr
#'   Positive class for binary classification (otherwise ignored and set to NA).
#'   Default is the first factor level of the target attribute.
#' @param fixup.data (`character(1)`)\cr
#'   Should some basic cleaning up of data be performed?
#'   Currently this means removing empty factor levels for the columns.
#'   Possible choices are:
#'   \dQuote{no} = Don't do it.
#'   \dQuote{warn} = Do it but warn about it.
#'   \dQuote{quiet} = Do it but keep silent.
#'   Default is \dQuote{warn}.
#' @param check.data (`logical(1)`)\cr
#'   Should sanity of data be checked initially at task creation?
#'   You should have good reasons to turn this off (one might be speed).
#'   Default is `TRUE`.
#' @param coordinates ([data.frame])\cr
#'   Coordinates of a spatial data set that will be used for spatial partitioning of the data in a spatial cross-validation resampling setting.
#'   Coordinates have to be numeric values.
#'   Provided [data.frame] needs to have the same number of rows as data and consist of at least two dimensions.
#' @return [Task].
#' @name Task
#' @seealso [ClassifTask] [ClusterTask] [CostSensTask] [MultilabelTask] [RegrTask] [SurvTask]
#' @rdname Task
#' @examples
#' if (requireNamespace("mlbench")) {
#'   library(mlbench)
#'   data(BostonHousing)
#'   data(Ionosphere)
#'
#'   makeClassifTask(data = iris, target = "Species")
#'   makeRegrTask(data = BostonHousing, target = "medv")
#'   # an example of a classification task with more than those standard arguments:
#'   blocking = factor(c(rep(1, 51), rep(2, 300)))
#'   makeClassifTask(id = "myIonosphere", data = Ionosphere, target = "Class",
#'     positive = "good", blocking = blocking)
#'   makeClusterTask(data = iris[, -5L])
#' }
NULL

#' Exported for internal use.
#' @param id (`character(1)`)\cr
#'   task id
#' @param data ([data.frame])\cr
#'   data
#' @param target ([character])\cr
#'   target columns
#' @param weights ([numeric])\cr
#'   weights
#' @param blocking ([factor])\cr
#'   task data blocking
#' @param coordinates ([data.frame])\cr
#'   Coordinates of a spatial data set that will be used for spatial partitioning of the data in a spatial cross-validation resampling setting.
#'   Coordinates have to be numeric values.
#'   The provided [data.frame] needs to have the same number of rows as data and consist of at least two dimensions.
#' @keywords internal
#' @name makeTaskDesc
NULL

# Internal constructor for the common "Task" S3 object.
#
# Optionally drops empty factor levels from `data` (see `fixup.data`),
# optionally validates the inputs (see `check.data`), stores `data` in a fresh
# environment and returns an S3 object of class "Task" with the members
# `type`, `env`, `weights`, `blocking`, `coordinates` and `task.desc`.
# `task.desc` is only a placeholder (`NA`) in the returned object.
#
# Arguments:
#   type        - task type; stored unchanged in the result.
#   data        - data frame holding the task data.
#   weights     - NULL or a non-negative numeric vector without missing
#                 values, one entry per row of `data`.
#   blocking    - NULL or a factor without missing values, one entry per row
#                 of `data`.
#   fixup.data  - "no" (leave data untouched), "quiet" (drop empty factor
#                 levels silently) or "warn" (drop them and warn).
#   check.data  - if TRUE, validate `data`, `weights`, `blocking` and
#                 `coordinates`; if FALSE, all of them are stored unchecked.
#   coordinates - NULL or a two-(or more)-column object with as many rows as
#                 `data`; anything that is not a data frame triggers a warning.
makeTask = function(type, data, weights = NULL, blocking = NULL, fixup.data = "warn", check.data = TRUE, coordinates = NULL) {
  if (fixup.data == "quiet") {
    data = droplevels(data)
  } else if (fixup.data == "warn") {
    # drop levels column by column so that the affected columns can be named
    # in the warning message
    dropped = logical(ncol(data))
    for (i in seq_col(data)) {
      x = data[[i]]
      if (is.factor(x) && hasEmptyLevels(x)) {
        dropped[i] = TRUE
        data[[i]] = droplevels(x)
      }
    }
    if (any(dropped)) {
      warningf("Empty factor levels were dropped for columns: %s", collapse(colnames(data)[dropped]))
    }
  }

  if (check.data) {
    assertDataFrame(data, col.names = "strict")
    # subclasses of data.frame pass the assertion above; normalise them
    if (class(data)[1L] != "data.frame") {
      warningf("Provided data is not a pure data.frame but from class %s, hence it will be converted.", class(data)[1L])
      data = as.data.frame(data)
    }
    if (!is.null(weights)) {
      assertNumeric(weights, len = nrow(data), any.missing = FALSE, lower = 0)
    }
    if (!is.null(blocking)) {
      # `len` already enforces one blocking entry per observation
      assertFactor(blocking, len = nrow(data), any.missing = FALSE)
    }
    if (!is.null(coordinates)) {
      # NROW()/NCOL() agree with nrow()/ncol() for data frames and matrices,
      # but also return a number for dimensionless input (e.g. a plain vector),
      # so such input fails with one of the messages below instead of an
      # "argument is of length zero" error
      if (NROW(coordinates) != nrow(data)) {
        stop("Coordinates need to have the same length data! Or pass none at all.")
      }
      if (NCOL(coordinates) < 2L) {
        stop("Supplied coordinates need to consist of at least two dimensions.")
      }
      if (!is.data.frame(coordinates)) {
        # class() can have several entries (a matrix is c("matrix", "array")
        # on R >= 4.0); report only the first one so that exactly one message
        # is produced
        warningf("Provided coordinates are not given as a data frame but as class %s. Please provide a data frame.", class(coordinates)[1L])
      }
    }
  }

  # the data lives in its own environment, detached from any search path
  env = new.env(parent = emptyenv())
  env$data = data
  makeS3Obj("Task",
    type = type,
    env = env,
    weights = weights,
    blocking = blocking,
    coordinates = coordinates,
    task.desc = NA
  )
}

# Sanity-check the columns `cols` of `data`.
#
# Numeric columns must not contain infinite or NaN values, factor columns must
# not have empty levels, and any other column type is rejected. The first
# violation raises an error naming the column. Returns TRUE invisibly when all
# selected columns pass.
checkTaskData = function(data, cols = names(data)) {
  # validate one column; errors on the first problem, otherwise returns NULL
  checkColumn = function(name, values) {
    if (is.numeric(values)) {
      if (anyInfinite(values)) {
        stopf("Column '%s' contains infinite values.", name)
      }
      if (anyNaN(values)) {
        stopf("Column '%s' contains NaN values.", name)
      }
      return(NULL)
    }
    if (is.factor(values)) {
      if (hasEmptyLevels(values)) {
        stopf("Column '%s' contains empty factor levels.", name)
      }
      return(NULL)
    }
    stopf("Unsupported feature type (%s) in column '%s'.", class(values)[1L], name)
  }

  # walk column names and column contents in parallel; results are discarded
  mapply(checkColumn, cols, data[cols], SIMPLIFY = FALSE)
  invisible(TRUE)
}

# Print method for objects of class "Task".
#
# Writes a short summary taken from the task description `x$task.desc`:
# id, type, number of observations, the feature counts in `n.feat`, and the
# missing-value / weights / blocking / coordinates flags.
#
# Arguments:
#   x             - the task to print.
#   print.weights - if FALSE, the "Has weights" line is omitted.
#   ...           - accepted for compatibility with the print generic; unused.
#
# Returns `x` invisibly, so the call can be used inside pipelines or
# assignments.
#' @export
print.Task = function(x, print.weights = TRUE, ...) {

  td = x$task.desc
  catf("Task: %s", td$id)
  catf("Type: %s", td$type)
  catf("Observations: %i", td$size)
  catf("Features:")
  # pass the rendered feature table as data, not as the format string, so a
  # literal "%" in it can never be interpreted as a conversion specification
  catf("%s", printToChar(td$n.feat, collapse = "\n"))
  catf("Missings: %s", td$has.missings)
  if (print.weights) {
    catf("Has weights: %s", td$has.weights)
  }
  catf("Has blocking: %s", td$has.blocking)
  catf("Has coordinates: %s", td$has.coordinates)
  invisible(x)
}