File: stability.Rd

package info (click to toggle)
r-cran-stablelearner 0.1-5%2Bds-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 380 kB
  • sloc: makefile: 2
file content (125 lines) | stat: -rw-r--r-- 5,484 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
\name{stability}
\alias{stability}
\alias{print.stablelearner}
\alias{print.stablelearnerList}

\title{Stability Assessment for Results from Supervised Statistical Learning}

\description{
  Stability assessment of results from supervised statistical learning (e.g., 
  recursive partitioning, support vector machines, neural networks, etc.). The 
  procedure involves the pairwise comparison of results generated from learning 
  samples randomly drawn from the original data set or directly from the 
  data-generating process (if available).
}

\usage{
  stability(x, \dots, data = NULL, control = stab_control(), weights = NULL, 
    applyfun = NULL, cores = NULL, names = NULL)
}

\arguments{
  \item{x}{fitted model object. Any model object can be used whose class is 
    registered in \code{\link{LearnerList}}. Users can add classes for the 
    current \R session to \code{\link{LearnerList}}, see \code{\link{addLearner}}.}
  \item{\ldots}{additional fitted model objects.}
  \item{data}{an optional \code{data.frame} or a data-generating function. By 
    default the learning data from \code{x} is used (if this can be inferred 
    from the \code{\link{getCall}} of \code{x}).}
  \item{control}{a list with control parameters, see \code{\link{stab_control}}.}
  \item{weights}{an optional matrix of dimension n * B that can be used to 
    weight the observations from the original learning data when the models 
    are refitted. If \code{weights = TRUE}, the weights are computed internally 
    according to the \code{sampler} defined in \code{control}. If 
    \code{weights = NULL} (default), no case-weights are used and the 
    \code{sampler} defined in \code{control} will be applied to the original 
    data set.}
  \item{applyfun}{a \code{\link{lapply}}-like function. The default is to use
    \code{\link{lapply}} unless \code{cores} is specified in which case
    \code{\link{mclapply}} is used (for multicore computations on platforms
    that support these).}
  \item{cores}{integer. The number of cores to use in multicore computations
    using \code{\link{mclapply}} (see above).}
  \item{names}{a character vector specifying a name for each fitted model 
    object. By default, the objects are named by their class.}
}

\details{
  Assesses the (overall) stability of a result from supervised statistical 
  learning by quantifying the similarity of realizations from the distribution 
  of possible results (given the algorithm, the formulated model, the 
  data-generating process, the sample size, etc.). The stability distribution 
  is estimated by repeatedly assessing the similarity between the results 
  generated by training the algorithm on two different learning samples, by 
  means of a similarity metric. The learning samples are generated by sampling 
  from the learning data or the data-generating process in case of a simulation 
  study. For more details, see Philipp et al. (2018).
}

\value{
  For a single fitted model object, \code{stability} returns an object of 
  class \code{"stablelearner"} with the following components:
  
  \item{call}{the call from the model object \code{x},}
  \item{learner}{the information about the learner retrieved from \code{\link{LearnerList}},}
  \item{B}{the number of repetitions,}
  \item{sval}{a matrix containing the estimated similarity values for each 
  similarity measure specified in \code{control},}
  \item{sampstat}{a list containing information on the size of the learning 
  samples (\code{ls}), the size of the overlap between the learning samples 
  (\code{lo}), the size of the evaluation sample (\code{es}) and the size of
  the overlap between the evaluation and the learning samples (\code{eo})
  in each repetition,}
  \item{data}{a language object referring to the \code{data.frame} or the
    data-generating function used for assessing the stability,}
  \item{control}{a list with control parameters used for assessing the 
  stability.}
  
  For several fitted model objects, \code{stability} returns an object of 
  class \code{"stablelearnerList"} which is a list of objects of class 
  \code{"stablelearner"}.
}

\references{
Philipp M, Rusch T, Hornik K, Strobl C (2018).
  \dQuote{Measuring the Stability of Results from Supervised Statistical Learning}.
  \emph{Journal of Computational and Graphical Statistics}, \bold{27}(4), 685--700.
  \doi{10.1080/10618600.2018.1473779}
}

\seealso{\code{\link{boxplot.stablelearnerList}}, \code{\link{summary.stablelearner}}}

\examples{

\donttest{
## assessing the stability of a single result
library("partykit")
r1 <- ctree(Species ~ ., data = iris)
stab <- stability(r1)
summary(stab)

## assessing the stability of several results
library("rpart")
r2 <- rpart(Species ~ ., data = iris)
stab <- stability(r1, r2, control = stab_control(seed = 0))
summary(stab, names = c("ctree", "rpart"))

## using case-weights instead of resampling
stability(r1, weights = TRUE)

## using self-defined case-weights
n <- nrow(iris)
B <- 500
w <- array(sample(c(0, 1), size = n * B * 3, replace = TRUE), dim = c(n, B, 3))
stability(r1, weights = w)

## assessing stability for a given data-generating process
my_dgp <- function() dgp_twoclass(n = 100, p = 2, noise = 4, rho = 0.2)
res <- ctree(class ~ ., data = my_dgp())
stability(res, data = my_dgp)
}

}

\keyword{resampling}
\keyword{similarity}