File: roc.Rd

package info (click to toggle)
r-cran-proc 1.18.5-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,260 kB
  • sloc: cpp: 144; sh: 14; makefile: 2
file content (429 lines) | stat: -rw-r--r-- 18,440 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
\encoding{UTF-8}
\name{roc}
\alias{roc}
\alias{roc_}
\alias{roc.formula}
\alias{roc.data.frame}
\alias{roc.default}
\title{
 Build a ROC curve
}
\description{
  This is the main function of the pROC package. It builds a ROC
  curve and returns a \dQuote{roc} object, a list of class
  \dQuote{roc}. This object can be \code{print}ed, \code{plot}ted, or
  passed to the functions \code{\link{auc}}, \code{\link{ci}},
  \code{\link{smooth.roc}} and \code{\link{coords}}. Additionally, two
  \code{roc} objects can be compared with \code{\link{roc.test}}. 
}
\usage{
roc(...)
\S3method{roc}{formula}(formula, data, ...)
\S3method{roc}{data.frame}(data, response, predictor,
ret = c("roc", "coords", "all_coords"), ...)
\S3method{roc}{default}(response, predictor, controls, cases,
density.controls, density.cases,
levels=base::levels(as.factor(response)), percent=FALSE, na.rm=TRUE,
direction=c("auto", "<", ">"), algorithm = 6, quiet = FALSE, 
smooth=FALSE, auc=TRUE, ci=FALSE, plot=FALSE, smooth.method="binormal",
smooth.n=512, ci.method=NULL, density=NULL, ...)
roc_(data, response, predictor, ret = c("roc", "coords", "all_coords"), ...)
}

\arguments{
  \item{response}{a factor, numeric or character vector of
    responses (true class), typically encoded with 0 (controls) and 1 (cases).
    Only two classes can be used in a ROC curve. If the vector
    contains more than two unique values, or if their order could be
    ambiguous, use \code{levels} to specify which values must be used as
    control and case value.
    If the first argument was a \code{\link{data.frame}}, \code{response}
    should be the name of the column in \code{data} containing the
    response, quoted for \code{roc_}, and optionally quoted for 
    \code{roc.data.frame} (non-standard evaluation or NSE).
  }
  \item{predictor}{a \code{\link{numeric}} or \code{\link{ordered}} vector
  of the same length as \code{response}, containing the predicted
  value of each observation.
    If the first argument was a \code{\link{data.frame}}, \code{predictor}
    should be the name of the column in \code{data} containing the
    predictor, quoted for \code{roc_}, and optionally quoted for 
    \code{roc.data.frame} (non-standard evaluation or NSE).
  }
  \item{controls, cases}{instead of \code{response}, \code{predictor},
    the data can be supplied as two \code{\link{numeric}} or
    \code{\link{ordered}} vectors containing the predictor
    values for control and case observations.
  }
  \item{density.controls, density.cases}{a smoothed ROC curve can be
    built directly from two densities on identical \code{x} points, as in
    \code{\link[=smooth.roc]{smooth}}.
  }
  \item{formula, data}{a formula of the type \code{response~predictor}. If multiple predictors 
    are passed, a named list of \code{roc} objects will be returned. Additional arguments
    \code{data} and \code{subset}, but not \code{na.action} are supported, see 
    \code{\link{model.frame}} for more details.
  }
  \item{levels}{the value of the response for controls and cases
    respectively. By default, the first two values of
    \code{levels(as.factor(response))} are taken, and the remaining levels are ignored.
    It usually captures two-class factor data correctly, but will
    frequently fail for other data types (response factor with more than 2 levels,
    or for example if your response is coded \dQuote{controls} and \dQuote{cases},
    the levels will be inverted) and must then be specified here.
    If your data is coded as \code{0} and \code{1} with \code{0}
    being the controls, you can safely omit this argument.
  }
  \item{percent}{if the sensitivities, specificities and AUC must be
    given in percent (\code{TRUE}) or in fraction (\code{FALSE}, default).
  }
  \item{na.rm}{if \code{TRUE}, the \code{NA} values will be removed
    (ignored by \code{roc.formula}).
  }
  \item{direction}{in which direction to make the comparison?
    \dQuote{auto} (default): automatically define in which group the
    median is higher and take the direction accordingly.
    \dQuote{>}: if the predictor values for the control group are
    higher than the values of the case group (controls > t >= cases).
    \dQuote{<}: if the predictor values for the control group are lower
    than or equal to the values of the case group (controls < t <= cases).
    You should set this explicitly to \dQuote{>} or \dQuote{<} whenever
    you are resampling or randomizing the data, otherwise the 
    curves will be biased towards higher AUC values.
  }
  \item{algorithm}{the method used to compute sensitivity and specificity, 
    an integer of length 1 between \code{0} and \code{6}.
    \code{1}: a safe, well-tested, pure-\R code that is efficient when the 
    number of thresholds is low. It goes with O(T*N).
    \code{2}: an alternative pure-\R algorithm that goes in
    O(N). Typically faster than \code{1} when the number of thresholds of
    the ROC curve is above 1000. Less tested than \code{1}.
    \code{3}: a C++ 
    implementation of \code{1}, about 3-5x faster. Typically the fastest with
    ROC curves with less than 50-100 thresholds, but has a very bad worst-case
    when that number increases.
    \code{4} (debug only, slow):
    runs algorithms 1 to 3 and makes sure they return the same values.
    \code{5}: select \code{2} or \code{3} based on the number of thresholds.
    \code{6} (default): quickly select the algorithm based on the class of the data: \code{2}
    for \code{\link{numeric}} and \code{3} for \code{\link{ordered}}.
    \code{0}: use \pkg{microbenchmark} to choose between \code{2} and \code{3}.
  }
  \item{ret}{for \code{roc.data.frame} only, whether to return the
	threshold sensitivity and specificity at all thresholds (\dQuote{coords}),
	all the coordinates at all thresholds (\dQuote{all_coords}) or the
	\code{roc} object (\dQuote{roc}).}
  \item{quiet}{set to \code{TRUE} to turn off \code{\link{message}}s
	 when \code{direction} and \code{levels} are auto-detected.
  }
  \item{smooth}{if TRUE, the ROC curve is passed to \code{\link{smooth}}
    to be smoothed.
  }
  \item{auc}{compute the area under the curve (AUC)? If \code{TRUE}
    (default), additional arguments can be passed to \code{\link{auc}}.
  }
  \item{ci}{compute the confidence interval (CI)? If set to \code{TRUE}, 
    additional arguments can be passed to \code{\link{ci}}.
  }
  \item{plot}{plot the ROC curve? If \code{TRUE}, additional
    arguments can be passed to \code{\link{plot.roc}}.
  }
  \item{smooth.method, smooth.n, ci.method}{in \code{roc.formula} and
	\code{roc.default}, the \code{method} and \code{n} arguments to
	\code{\link[=smooth.roc]{smooth}} (if \code{smooth=TRUE}) and the
	\code{method} argument to \code{\link{ci}} (if \code{ci=TRUE} and
	\code{of="auc"}) must be passed as
	\code{smooth.method}, \code{smooth.n} and \code{ci.method} to avoid confusion.
  }
  \item{density}{\code{density} argument passed to \code{\link[=smooth.roc]{smooth}}.}
  \item{\dots}{further arguments passed to or from other methods, and
    especially:
    \itemize{
      \item \code{\link{auc}}: \code{partial.auc}, \code{partial.auc.focus}, \code{partial.auc.correct}.
      \item \code{\link{ci}}: \code{of}, \code{conf.level}, \code{boot.n}, \code{boot.stratified}, \code{progress}
      \item \code{\link{ci.auc}}: \code{reuse.auc}, \code{method}
      \item \code{\link{ci.thresholds}}: \code{thresholds}
      \item \code{\link{ci.se}}: \code{sensitivities}
      \item \code{\link{ci.sp}}: \code{specificities}
      \item \code{\link{plot.roc}}: \code{add}, \code{col} and most
        other arguments to the \code{\link{plot.roc}} function. See
	\code{\link{plot.roc}} directly for more details.
      \item \code{\link{smooth}}: \code{method}, \code{n}, and all other
        arguments. See \code{\link{smooth}} for more details.
    }
  }
}
\details{
  This function's main job is to build a ROC object. See the
  \dQuote{Value} section of this page for more details. Before
  returning, it will call (in this order) the \code{\link[=smooth.roc]{smooth}},
  \code{\link{auc}}, \code{\link{ci}} and \code{\link{plot.roc}}
  functions if the \code{smooth}, \code{auc}, \code{ci} and \code{plot}
  (respectively) arguments are set to TRUE. By default, only \code{auc}
  is called.

  Data can be provided as \code{response, predictor}, where the
  predictor is the numeric (or ordered) level of the evaluated signal, and
  the response encodes the observation class (control or case). The
  \code{levels} argument specifies which response level must be taken as
  controls (first value of \code{levels}) or cases (second). It can
  safely be ignored when the response is encoded as \code{0} and
  \code{1}, but it will frequently fail otherwise. By default, the first
  two values of \code{levels(as.factor(response))} are taken, and the
  remaining levels are ignored. This means that if your response is
  coded \dQuote{control} and \dQuote{case}, the levels will be
  inverted.
	
  In some cases, it is more convenient to pass the data as
  \code{controls, cases}, but both arguments are ignored if
  \code{response, predictor} was specified to non-\code{NULL} values.
  It is also possible to pass density data with \code{density.controls,
  density.cases}, which will result in a smoothed ROC curve even if
  \code{smooth=FALSE}, but are ignored if \code{response, predictor} or
  \code{controls, cases} are provided.

  Specifications for \code{\link{auc}}, \code{\link{ci}} and
  \code{\link{plot.roc}} are not kept if \code{auc}, \code{ci} or \code{plot} are set to
  \code{FALSE}. Especially, in the following case:
  
  \preformatted{
    myRoc <- roc(..., auc.polygon=TRUE, grid=TRUE, plot=FALSE)
    plot(myRoc)
  }

  the plot will not have the AUC polygon nor the grid. Similarly, when
  comparing \dQuote{roc} objects, the following is not possible:

  \preformatted{
    roc1 <- roc(..., partial.auc=c(1, 0.8), auc=FALSE)
    roc2 <- roc(..., partial.auc=c(1, 0.8), auc=FALSE)
    roc.test(roc1, roc2)
  }

  This will produce a test on the full AUC, not the partial AUC. To make
  a comparison on the partial AUC, you must repeat the specifications
  when calling \code{\link{roc.test}}:

  \preformatted{
    roc.test(roc1, roc2, partial.auc=c(1, 0.8))
  }

  Note that if \code{roc} was called with \code{auc=TRUE}, the latter syntax will not
  allow redefining the AUC specifications. You must use \code{reuse.auc=FALSE} for that.
  
}
\value{
  If the data contained any \code{NA} value and \code{na.rm=FALSE}, \code{NA} is
  returned. Otherwise, if \code{smooth=FALSE}, a list of class
  \dQuote{roc} with the following fields: 
  \item{auc}{if called with \code{auc=TRUE}, a numeric of class \dQuote{auc} as
    defined in \code{\link{auc}}.
  }
  \item{ci}{if called with \code{ci=TRUE}, a numeric of class \dQuote{ci} as
    defined in \code{\link{ci}}.
  }
  \item{response}{the response vector. Patients whose response is not
    \code{\link{\%in\%}} \code{levels} are discarded. If \code{NA} values
    were removed, a \code{na.action} attribute similar
    to \code{\link{na.omit}} stores the row numbers. 
  }
  \item{predictor}{the predictor vector converted to numeric as used to build the ROC
    curve. Patients whose response is not \code{\link{\%in\%}} \code{levels} are discarded. If
    \code{NA} values were removed, a \code{na.action} attribute similar
    to \code{\link{na.omit}} stores the row numbers.
  }
  \item{original.predictor, original.response}{the response and predictor vectors as passed in argument.}
  \item{levels}{the levels of the response as defined in argument.}
  \item{controls}{the predictor values for the control observations.}
  \item{cases}{the predictor values for the cases.}
  \item{percent}{if the sensitivities, specificities and AUC are
    reported in percent, as defined in argument.
  }
  \item{direction}{the direction of the comparison, as defined in argument.}
  \item{fun.sesp}{the function used to compute sensitivities and specificities.
    Will be re-used in bootstrap operations.}
  \item{sensitivities}{the sensitivities defining the ROC curve.}
  \item{specificities}{the specificities defining the ROC curve.}
  \item{thresholds}{the thresholds at which the sensitivities and
    specificities were computed. See below for details.
  }
  \item{call}{how the function was called. See \code{\link{match.call}} for
    more details.
  }

  If \code{smooth=TRUE} a list of class \dQuote{smooth.roc} as returned
  by \code{\link{smooth}}, with or without additional elements
  \code{auc} and \code{ci} (according to the call).
}

\section{Thresholds}{
  
  Thresholds are selected as the means between any two consecutive values 
  observed in the data. This choice is aimed to facilitate their interpretation,
  as any data point will be unambiguously positive or negative
  regardless of whether the comparison operator includes equality
  or not.
  
  In rare cases it might not be possible to represent the
  mean between two consecutive values, or one might want to use a custom 
  threshold. In those cases, the semantics of the comparison
  are as follows: with \code{direction = '>'},
  observations are positive when they are smaller than or equal 
  (\code{<=}) to the threshold.
  With \code{direction = '<'}, observations are positive when they
  are greater than or equal (\code{>=}) to the threshold.
  
  As a corollary, thresholds do not correspond to actual values
  in the data.
  
}

\section{Experimental: pipelines}{
	Since version 1.15.0, the \code{roc} function can be used in pipelines, for instance with \pkg{dplyr} or \pkg{magrittr}. This is still a highly experimental feature and will change significantly in future versions  (see \href{https://github.com/xrobin/pROC/issues/54}{issue 54}).
	The \code{roc.data.frame} method supports both standard and non-standard evaluation (NSE):
	
    \preformatted{
library(dplyr)
# Standard evaluation:
aSAH \%>\%
    filter(gender == "Female") \%>\%
    roc("outcome", "s100b")
# Non-Standard Evaluation:
aSAH \%>\%
    filter(gender == "Female") \%>\%
    roc(outcome, s100b)
	}
	
	For tasks involving programming and variable column names, the \code{roc_} function provides
	standard evaluation:
	
    \preformatted{
# Standard evaluation:
aSAH \%>\%
    filter(gender == "Female") \%>\%
    roc_("outcome", "s100b")
    }
	
	By default it returns the \code{\link{roc}} object, which can then be piped to
	the \code{\link{coords}} function to extract coordinates that can be used
	in further pipelines.
	
    \preformatted{
# Returns thresholds, sensitivities and specificities:
aSAH  \%>\%
    roc(outcome, s100b) \%>\%
    coords(transpose = FALSE) \%>\%
    filter(sensitivity > 0.6, 
           specificity > 0.6)

# Returns all existing coordinates, then select precision and recall:
aSAH  \%>\%
    roc(outcome, s100b) \%>\%
    coords(ret = "all", transpose = FALSE) \%>\%
    select(precision, recall)
    }
}

\section{Errors}{
  If no control or case observation exists for the given levels of
  response, no ROC curve can be built and an error is triggered with
  message \dQuote{No control observation} or \dQuote{No case
    observation}.

  If the predictor is not a numeric or ordered, as defined by
  \code{\link{as.numeric}} or \code{\link{as.ordered}}, the message
  \dQuote{Predictor must be numeric or ordered} is returned.

  The message \dQuote{No valid data provided} is issued when the data
  wasn't properly passed. Remember you need both \code{response} and
  \code{predictor} of the same (not null) length, or both \code{controls}
  and \code{cases}. Combinations such as \code{predictor} and
  \code{cases} are not valid and will trigger this error.
  
  Infinite values of the predictor cannot always be thresholded by
  infinity and can cause ROC curves to not reach 0 or 100\% 
  specificity or sensitivity. Since version 1.13.0, pROC returns \code{NaN}
  with a warning message \dQuote{Infinite value(s) in predictor} if
  \code{predictor} contains any \link[=is.infinite]{infinite} values.
}

\references{
  Tom Fawcett (2006) ``An introduction to ROC analysis''. \emph{Pattern
    Recognition Letters} \bold{27}, 861--874. DOI:
  \doi{10.1016/j.patrec.2005.10.010}.
  
  Xavier Robin, Natacha Turck, Alexandre Hainard, \emph{et al.}
  (2011) ``pROC: an open-source package for R and S+ to analyze and
  compare ROC curves''. \emph{BMC Bioinformatics}, \bold{12}, 77.
  DOI: \doi{10.1186/1471-2105-12-77}.
}

\seealso{
 \code{\link{auc}}, \code{\link{ci}}, \code{\link{plot.roc}}, \code{\link{print.roc}}, \code{\link{roc.test}}
}

\examples{
data(aSAH)

# Basic example
roc(aSAH$outcome, aSAH$s100b,
    levels=c("Good", "Poor"))
# As levels aSAH$outcome == c("Good", "Poor"),
# this is equivalent to:
roc(aSAH$outcome, aSAH$s100b)
# In some cases, ignoring levels could lead to unexpected results
# Equivalent syntaxes:
roc(outcome ~ s100b, aSAH)
roc(aSAH$outcome ~ aSAH$s100b)
with(aSAH, roc(outcome, s100b))
with(aSAH, roc(outcome ~ s100b))

# With a formula:
roc(outcome ~ s100b, data=aSAH)

\dontrun{
library(dplyr)
aSAH \%>\%
    filter(gender == "Female") \%>\%
    roc(outcome, s100b)
}

# Using subset (only with formula)
roc(outcome ~ s100b, data=aSAH, subset=(gender == "Male"))
roc(outcome ~ s100b, data=aSAH, subset=(gender == "Female"))

# With numeric controls/cases
roc(controls=aSAH$s100b[aSAH$outcome=="Good"], cases=aSAH$s100b[aSAH$outcome=="Poor"])
# With ordered controls/cases
roc(controls=aSAH$wfns[aSAH$outcome=="Good"], cases=aSAH$wfns[aSAH$outcome=="Poor"])

# Inverted the levels: "Poor" are now controls and "Good" cases:
roc(aSAH$outcome, aSAH$s100b,
    levels=c("Poor", "Good"))

# The result was exactly the same because of direction="auto".
# The following will give an AUC < 0.5:
roc(aSAH$outcome, aSAH$s100b,
    levels=c("Poor", "Good"), direction="<")

# If we are sure about levels and direction auto-detection,
# we can turn off the messages:
roc(aSAH$outcome, aSAH$s100b, quiet = TRUE)

# If we prefer counting in percent:
roc(aSAH$outcome, aSAH$s100b, percent=TRUE)

# Plot and CI (see plot.roc and ci for more options):
roc(aSAH$outcome, aSAH$s100b,
    percent=TRUE, plot=TRUE, ci=TRUE)

# Smoothed ROC curve
roc(aSAH$outcome, aSAH$s100b, smooth=TRUE)
# this is not identical to
smooth(roc(aSAH$outcome, aSAH$s100b))
# because in the latter case, the returned object contains no AUC
}

\keyword{univar}
\keyword{nonparametric}
\keyword{utilities}
\keyword{roc}