File: adjOutlyingness.Rd

package info (click to toggle)
robustbase 0.99-4-1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 4,552 kB
  • sloc: fortran: 3,245; ansic: 3,243; sh: 15; makefile: 2
file content (220 lines) | stat: -rw-r--r-- 9,989 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
\name{adjOutlyingness}
\alias{adjOutlyingness}
\title{Compute (Skewness-adjusted) Multivariate Outlyingness}
\description{
  For an \eqn{n \times p}{n * p} data matrix (or data frame) \code{x},
  compute the \dQuote{\emph{outlyingness}} of all \eqn{n} observations.
  Outlyingness here is a generalization of the Donoho-Stahel
  outlyingness measure, where skewness is taken into account via the
  medcouple, \code{\link{mc}()}.
}
\usage{
adjOutlyingness(x, ndir = 250, p.samp = p, clower = 4, cupper = 3,
                IQRtype = 7,
                alpha.cutoff = 0.75, coef = 1.5,
                qr.tol = 1e-12, keep.tol = 1e-12,
                only.outlyingness = FALSE, maxit.mult = max(100, p),
                trace.lev = 0,
                mcReflect = n <= 100, mcScale = TRUE, mcMaxit = 2*maxit.mult,
                mcEps1 = 1e-12, mcEps2 = 1e-15,
                mcTrace = max(0, trace.lev-1))
}
\arguments{
  \item{x}{a numeric \eqn{n \times p}{n * p} \code{\link{matrix}} or
    \code{\link{data.frame}},
    which must be of full rank \eqn{p}.}
  \item{ndir}{positive integer specifying the number of directions that
    should be searched.}
  \item{p.samp}{the sample size to use for finding good random
    directions, must be at least \code{p}.  The default, \code{p} had
    been hard coded previously.}
  \item{clower, cupper}{the constant to be used for the lower and upper
    tails, in order to transform the data towards symmetry.  You can set
    \code{clower = 0, cupper = 0} to get the \emph{non}-adjusted,
    i.e., classical (\dQuote{central} or \dQuote{symmetric})
    outlyingness.  In that case, \code{\link{mc}()} is not used.}
  \item{IQRtype}{a number from \code{1:9}, denoting type of empirical
    quantile computation for the \code{\link{IQR}()}.  The default 7
    corresponds to \code{\link{quantile}}'s and \code{\link{IQR}}'s
    default.  MM has evidence that \code{type=6} would be a bit more stable
    for small sample size.}
  \item{alpha.cutoff}{number in (0,1) specifying the quantiles
    \eqn{(\alpha, 1-\alpha)} which determine the \dQuote{outlier}
    cutoff.  The default, using quartiles, corresponds to the definition
    of the medcouple (\code{\link{mc}}), but there is no stringent
    reason for using the same alpha for the outlier cutoff.}
  \item{coef}{positive number specifying the factor with which the
    interquartile range (\code{\link{IQR}}) is multiplied to determine
    \sQuote{boxplot hinges}-like upper and lower bounds.}
  \item{qr.tol}{positive tolerance to be used for \code{\link{qr}} and
    \code{\link{solve.qr}} for determining the \code{ndir} directions,
    each determined by a random sample of \eqn{p} (out of \eqn{n})
    observations.  Note that the default \eqn{10^{-12}} is rather small,
    and \code{\link{qr}}'s default \code{= 1e-7} may be more appropriate.}
  \item{keep.tol}{positive tolerance to determine which of the sample
    direction should be kept, namely only those for which
    \eqn{\|x\| \cdot \|B\|}{||x|| * ||B||} is larger than \code{keep.tol}.}
  \item{only.outlyingness}{logical indicating if the final outlier
    determination should be skipped.  In that case, a vector is
    returned, see \sQuote{Value:} below.}
  \item{maxit.mult}{integer factor; \code{maxit <- maxit.mult * ndir}
    will determine the maximal number of direction searching
    iterations.  May need to be increased for higher dimensional data,
    though increasing \code{ndir} may be more important.}
  \item{trace.lev}{an integer, if positive allows to monitor the
    direction search.}
  %% new (Aug-Dec 2020), related to mc():  see >>> ./mc.Rd <<<
  \item{mcReflect}{passed as \code{doReflect} to \code{\link{mc}()}.}
  \item{mcScale}{passed as \code{doScale} to \code{\link{mc}()}.}
  \item{mcMaxit}{passed as \code{maxit} to \code{\link{mc}()}.}
  \item{mcEps1}{passed as \code{eps1} to \code{\link{mc}()}; the default is slightly
    looser (100 larger) than the default for \code{mc}().}
  \item{mcEps2}{passed as \code{eps2} to \code{\link{mc}()}.}
  \item{mcTrace}{passed as \code{trace.lev} to \code{\link{mc}()}.}
}
\note{
  % During Lukas Graz' Master's thesis (Spring 2021), it finally became clear to MM:
  If there are too many degrees of freedom for the projections, i.e., when
  \eqn{n \le 4p}{n <= 4*p}, the current definition of adjusted outlyingness
  is ill-posed, as one of the projections may lead to a denominator
  (quartile difference) of zero, and hence formally an adjusted
  outlyingness of infinity.
  The current implementation avoids \code{Inf} results, but will return
  seemingly random \code{adjout} values of around \eqn{10^{14} -- 10^{15}} which may
  be completely misleading, see, e.g., the \code{longley} data example.

  The result is \emph{random} as it depends on the sample of
  \code{ndir} directions chosen; specifically, to get sub samples the algorithm uses
  \code{\link{sample.int}(n, p.samp)}
  which from \R version 3.6.0 depends on
  \code{\link{RNGkind}(*, sample.kind)}.  Exact reproducibility of results
  from \R versions 3.5.3 and earlier, requires setting
  \code{\link{RNGversion}("3.5.0")}.% same text in ./glmrob.Rd ("MT")
  In any case, do use \code{\link{set.seed}()} yourself
  for reproducibility!

 Till Aug/Oct. 2014, the default values for \code{clower} and \code{cupper} were
 accidentally reversed, and the signs inside \code{exp(.)} where swapped
 in the (now corrected) two expressions \preformatted{
 tup <- Q3 + coef * IQR * exp(.... + clower * tmc * (tmc < 0))
 tlo <- Q1 - coef * IQR * exp(.... - cupper * tmc * (tmc < 0))
}
 already in the code from Antwerpen (\file{mcrsoft/adjoutlingness.R}),
 contrary to the published reference.

 Further, the original algorithm had not been scale-equivariant in the
 direction construction, which has been amended in 2014-10 as well.

 The results, including diagnosed outliers, therefore have changed,
 typically slightly, since \pkg{robustbase} version 0.92-0.
}
\details{
  \bold{FIXME}:  Details in the comment of the Matlab code;
  also in the reference(s).
  %% SEE /u/maechler/R/MM/STATISTICS/robust/MC/mcmatl/adjoutlyingness.m
  %% ---- which has notes about input/output etc of the corresponding
  %%      Matlab code

  The method as described can be useful as preprocessing in
  FASTICA (\url{http://research.ics.aalto.fi/ica/fastica/}
  see also the \R package \CRANpkg{fastICA}.
}
\value{
  If \code{only.outlyingness} is true, a vector \code{adjout},
  otherwise, as by default, a list with components
  \item{adjout}{numeric of \code{length(n)} giving the adjusted
    outlyingness of each observation.}
  \item{cutoff}{cutoff for \dQuote{outlier} with respect to the adjusted
    outlyingnesses, and depending on \code{alpha.cutoff}.}
  \item{nonOut}{logical of \code{length(n)}, \code{TRUE} when the
    corresponding observation is \bold{non}-outlying with respect to the
    cutoff and the adjusted outlyingnesses.}
}
\references{
  Brys, G., Hubert, M., and Rousseeuw, P.J. (2005)
  A Robustification of Independent Component Analysis;
  \emph{Journal of Chemometrics}, \bold{19}, 1--12.

  Hubert, M., Van der Veeken, S. (2008)
  Outlier detection for skewed data;
  \emph{Journal of Chemometrics} \bold{22}, 235--246;
  \doi{10.1002/cem.1123}.
  %% preprint \url{http://wis.kuleuven.be/stat/robust/papers/2008/outlierdetectionskeweddata-revision.pdf}
  %%MM: Journal-pdf  ~/save/papers/robust-diverse/Hubert_VdV_skewed-Chemom_2008.pdf
  %%MM: Compstat 2010: Slides (of talk) and paper of Mia H:
  %% ~/save/papers/robust-diverse/Hubert_skewed-CS2010-slides.pdf  and
  %% ~/save/papers/robust-diverse/Hubert_skewed-CS2010-paper.pdf (slides are better !!)

  For the up-to-date reference, please consult
 %\url{https://wis.kuleuven.be/stat/robust}% to sound modern:
  \url{https://wis.kuleuven.be/statdatascience/robust}
}
\author{Guy Brys; help page and improvements by Martin Maechler}
\seealso{the adjusted boxplot, \code{\link{adjbox}} and the medcouple,
  \code{\link{mc}}.
}
\examples{
## An Example with bad condition number and "border case" outliers

dim(longley) # 16 x 7  // set seed, as result is random :
set.seed(31)
ao1 <- adjOutlyingness(longley, mcScale=FALSE)
## which are outlying ?
which(!ao1$nonOut) ## for this seed, two: "1956", "1957"; (often: none)
## For seeds 1:100, we observe (Linux 64b)
if(FALSE) {
  adjO <- sapply(1:100, function(iSeed) {
            set.seed(iSeed); adjOutlyingness(longley)$nonOut })
  table(nrow(longley) - colSums(adjO))
}
## #{outl.}:  0  1  2  3
## #{cases}: 74 17  6  3


## An Example with outliers :

dim(hbk)
set.seed(1)
ao.hbk <- adjOutlyingness(hbk)
str(ao.hbk)
hist(ao.hbk $adjout)## really two groups
table(ao.hbk$nonOut)## 14 outliers, 61 non-outliers:
## outliers are :
which(! ao.hbk$nonOut) # 1 .. 14   --- but not for all random seeds!

## here, they are(*) the same as found by (much faster) MCD:
## *) only "almost", since the 2023-05 change to covMcd() 
cc <- covMcd(hbk)
table(cc = cc$mcd.wt, ao = ao.hbk$nonOut)# one differ..:
stopifnot(sum(cc$mcd.wt != ao.hbk$nonOut) <= 1)

## This is revealing: About 1--2 cases, where outliers are *not* == 1:14
## (2023: ~ 1/8 [sec] per call)
if(interactive()) {
  for(i in 1:30) {
    print(system.time(ao.hbk <- adjOutlyingness(hbk)))
    if(!identical(iout <- which(!ao.hbk$nonOut), 1:14)) {
	 cat("Outliers:\n"); print(iout)
    }
  }
}

## "Central" outlyingness: *not* calling mc()  anymore, since 2014-12-11:
trace(mc)
out <- capture.output(
  oo <- adjOutlyingness(hbk, clower=0, cupper=0)
)
untrace(mc)
stopifnot(length(out) == 0)

## A rank-deficient case
T <- tcrossprod(data.matrix(toxicity))
try(adjOutlyingness(T, maxit. = 20, trace.lev = 2)) # fails and recommends:
T. <- fullRank(T)
aT <- adjOutlyingness(T.)
plot(sort(aT$adjout, decreasing=TRUE), log="y")
plot(T.[,9:10], col = (1:2)[1 + (aT$adjout > 10000)])
## .. (not conclusive; directions are random, more 'ndir' makes a difference!)
}
\keyword{robust}
\keyword{multivariate}