File: prediction.R

package info (click to toggle)
r-cran-rocr 1.0-11-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, bullseye, sid, trixie
  • size: 676 kB
  • sloc: sh: 13; makefile: 2
file content (259 lines) | stat: -rw-r--r-- 9,990 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
#' @name prediction
#'
#' @title Function to create prediction objects
#'
#' @description
#' Every classifier evaluation using ROCR starts with creating a
#' \code{prediction} object. This function is used to transform the input data
#' (which can be in vector, matrix, data frame, or list form) into a
#' standardized format.
#'
#' @details
#' \code{predictions} and \code{labels} can simply be vectors of the same
#' length. However, in the case of cross-validation data, different
#' cross-validation runs can be provided as the *columns* of a matrix or
#' data frame, or as the entries of a list. In the case of a matrix or
#' data frame, all cross-validation runs must have the same length, whereas
#' in the case of a list, the lengths can vary across the cross-validation
#' runs. Internally, as described in section 'Value', all of these input
#' formats are converted to list representation.
#'
#' Since scoring classifiers give relative tendencies towards a negative
#' (low scores) or positive (high scores) class, it has to be declared
#' which class label denotes the negative, and which the positive class.
#' Ideally, labels should be supplied as ordered factor(s), the lower
#' level corresponding to the negative class, the upper level to the
#' positive class. If the labels are factors (unordered), numeric,
#' logical or characters, ordering of the labels is inferred from
#' R's built-in \code{<} relation (e.g. 0 < 1, -1 < 1, 'a' < 'b',
#' FALSE < TRUE). Use \code{label.ordering} to override this default
#' ordering. Please note that the ordering can be locale-dependent
#' e.g. for character labels '-1' and '1'.
#'
#' Currently, ROCR supports only binary classification (extensions toward
#' multiclass classification are scheduled for the next release,
#' however). If there are more than two distinct label symbols, execution
#' stops with an error message. If all predictions use the same two
#' symbols that are used for the labels, categorical predictions are
#' assumed. If there are more than two predicted values, but all numeric,
#' continuous predictions are assumed (i.e. a scoring
#' classifier). Otherwise, if more than two symbols occur in the
#' predictions, and not all of them are numeric, execution stops with an
#' error message.
#'
#' @param predictions A vector, matrix, list, or data frame containing the
#'   predictions.
#' @param labels A vector, matrix, list, or data frame containing the true class
#'   labels. Must have the same dimensions as \code{predictions}.
#' @param label.ordering The default ordering (cf.details)  of the classes can
#'   be changed by supplying a vector containing the negative and the positive
#'   class label.
#'
#' @return An S4 object of class \code{prediction}.
#'
#' @references
#' A detailed list of references can be found on the ROCR homepage at
#' \url{http://rocr.bioinf.mpi-sb.mpg.de}.
#'
#' @author
#' Tobias Sing \email{tobias.sing@gmail.com}, Oliver Sander
#' \email{osander@gmail.com}
#'
#' @seealso
#' \code{\link{prediction-class}},
#' \code{\link{performance}},
#' \code{\link{performance-class}},
#' \code{\link{plot.performance}}
#'
#' @export
#'
#' @examples
#' # create a simple prediction object
#' library(ROCR)
#' data(ROCR.simple)
#' pred <- prediction(ROCR.simple$predictions,ROCR.simple$labels)
#' pred
prediction <- function(predictions, labels, label.ordering=NULL) {

  ## bring 'predictions' and 'labels' into list format,
  ## each list entry representing one x-validation run

  ## convert predictions into canonical list format
  if (is.data.frame(predictions)) {
    names(predictions) <- c()
    predictions <- as.list(predictions)
  } else if (is.matrix(predictions)) {
    predictions <- as.list(data.frame(predictions))
    names(predictions) <- c()
  } else if (is.vector(predictions) && !is.list(predictions)) {
    predictions <- list(predictions)
  } else if (!is.list(predictions)) {
    stop("Format of predictions is invalid. It couldn't be coerced to a list.",
         call. = FALSE)
  }
  ## if predictions is a list -> keep unaltered
  if(any(vapply(predictions,anyNA,logical(1)))){
    stop("'predictions' contains NA.", call. = FALSE)
  }

  ## convert labels into canonical list format
  if (is.data.frame(labels)) {
    names(labels) <- c()
    labels <- as.list( labels)
  } else if (is.matrix(labels)) {
    labels <- as.list( data.frame( labels))
    names(labels) <- c()
  } else if ((is.vector(labels) ||
              is.ordered(labels) ||
              is.factor(labels)) &&
             !is.list(labels)) {
    labels <- list( labels)
  } else if (!is.list(labels)) {
    stop("Format of labels is invalid. It couldn't be coerced to a list.",
         call. = FALSE)
  }
  ## if labels is a list -> keep unaltered

  ## Length consistency checks
  if (length(predictions) != length(labels))
    stop(paste("Number of cross-validation runs must be equal",
               "for predictions and labels."))
  if (! all(sapply(predictions, length) == sapply(labels, length)))
    stop(paste("Number of predictions in each run must be equal",
               "to the number of labels for each run."))

  ## only keep prediction/label pairs that are finite numbers
  for (i in 1:length(predictions)) {
    finite.bool <- is.finite( predictions[[i]] )
    predictions[[i]] <- predictions[[i]][ finite.bool ]
    labels[[i]] <- labels[[i]][ finite.bool ]
  }

  ## abort if 'labels' format is inconsistent across
  ## different cross-validation runs
  label.format=""  ## one of 'normal','factor','ordered'
  if (all(sapply( labels, is.factor)) &&
      !any(sapply(labels, is.ordered))) {
    label.format <- "factor"
  } else if (all(sapply( labels, is.ordered))) {
    label.format <- "ordered"
  } else if (all(sapply( labels, is.character)) ||
             all(sapply( labels, is.numeric)) ||
             all(sapply( labels, is.logical))) {
    label.format <- "normal"
  } else {
    stop(paste("Inconsistent label data type across different",
               "cross-validation runs."))
  }

  ## abort if levels are not consistent across different
  ## cross-validation runs
  if (! all(sapply(labels, levels)==levels(labels[[1]])) ) {
    stop(paste("Inconsistent factor levels across different",
               "cross-validation runs."))
  }

  ## convert 'labels' into ordered factors, aborting if the number
  ## of classes is not equal to 2.
  levels <- c()
  if ( label.format == "ordered" ) {
    if (!is.null(label.ordering)) {
      stop(paste("'labels' is already ordered. No additional",
                 "'label.ordering' must be supplied."))
    } else {
      levels <- levels(labels[[1]])
    }
  } else {
    if ( is.null( label.ordering )) {
      if ( label.format == "factor" ) levels <- sort(levels(labels[[1]]))
      else levels <- sort( unique( unlist( labels)))
    } else {
      ## if (!setequal( levels, label.ordering)) {
      if (!setequal( unique(unlist(labels)), label.ordering )) {
        stop("Label ordering does not match class labels.")
      }
      levels <- label.ordering
    }
    for (i in 1:length(labels)) {
      if (is.factor(labels))
        labels[[i]] <- ordered(as.character(labels[[i]]),
                               levels=levels)
      else labels[[i]] <- ordered( labels[[i]], levels=levels)
    }

  }

  if (length(levels) != 2) {
    message <- paste("Number of classes is not equal to 2.\n",
                     "ROCR currently supports only evaluation of ",
                     "binary classification tasks.",sep="")
    stop(message)
  }

  ## determine whether predictions are continuous or categorical
  ## (in the latter case stop; scheduled for the next ROCR version)
  if (!is.numeric( unlist( predictions ))) {
    stop("Currently, only continuous predictions are supported by ROCR.")
  }

  ## compute cutoff/fp/tp data

  cutoffs <- list()
  fp <- list()
  tp <- list()
  fn <- list()
  tn <- list()
  n.pos <- list()
  n.neg <- list()
  n.pos.pred <- list()
  n.neg.pred <- list()
  for (i in 1:length(predictions)) {
    n.pos <- c( n.pos, sum( labels[[i]] == levels[2] ))
    n.neg <- c( n.neg, sum( labels[[i]] == levels[1] ))
    ans <- .compute.unnormalized.roc.curve( predictions[[i]], labels[[i]] )
    cutoffs <- c( cutoffs, list( ans$cutoffs ))
    fp <- c( fp, list( ans$fp ))
    tp <- c( tp, list( ans$tp ))
    fn <- c( fn, list( n.pos[[i]] - tp[[i]] ))
    tn <- c( tn, list( n.neg[[i]] - fp[[i]] ))
    n.pos.pred <- c(n.pos.pred, list(tp[[i]] + fp[[i]]) )
    n.neg.pred <- c(n.neg.pred, list(tn[[i]] + fn[[i]]) )
  }


  return( new("prediction", predictions=predictions,
              labels=labels,
              cutoffs=cutoffs,
              fp=fp,
              tp=tp,
              fn=fn,
              tn=tn,
              n.pos=n.pos,
              n.neg=n.neg,
              n.pos.pred=n.pos.pred,
              n.neg.pred=n.neg.pred))
}


## fast fp/tp computation based on cumulative summing
.compute.unnormalized.roc.curve <- function( predictions, labels ) {
  ## determine the labels that are used for the pos. resp. neg. class :
  pos.label <- levels(labels)[2]
  neg.label <- levels(labels)[1]

  pred.order <- order(predictions, decreasing=TRUE)
  predictions.sorted <- predictions[pred.order]
  tp <- cumsum(labels[pred.order]==pos.label)
  fp <- cumsum(labels[pred.order]==neg.label)

  ## remove fp & tp for duplicated predictions
  ## as duplicated keeps the first occurrence, but we want the last, two
  ## rev are used.
  ## Highest cutoff (Infinity) corresponds to tp=0, fp=0
  dups <- rev(duplicated(rev(predictions.sorted)))
  tp <- c(0, tp[!dups])
  fp <- c(0, fp[!dups])
  cutoffs <- c(Inf, predictions.sorted[!dups])

  return(list( cutoffs=cutoffs, fp=fp, tp=tp ))
}