File: xtrData.Rd

package info (click to toggle)
robustbase 0.99-4-1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 4,552 kB
  • sloc: fortran: 3,245; ansic: 3,243; sh: 15; makefile: 2
file content (93 lines) | stat: -rw-r--r-- 3,389 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
\name{xtrData}
\alias{x30o50}
\docType{data}
\title{Extreme Data examples}
\description{
  \code{x30o50}, called \sQuote{'XX'} in the thesis, has been a running
  case for which \code{mc()} had failed to converge.
  A numeric vector of 50 values, 30 of which are very close to zero,
  specifically, their absolute values are less than \code{1.5e-15}.

  The remaining 20 values (11 negative, 9 positive) have absolute values
  between 0.0022 and 1.66.
}
\usage{
data(x30o50, package="robustbase")
}
\format{
  A summary is \preformatted{
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max.
-1.66006  0.00000  0.00000 -0.04155  0.00000  1.29768
  }
  notably the 1st to 3rd quartiles are all very close to zero.
}
\details{
  a good robust method will treat the 60\% \dQuote{almost zero} values as
  \dQuote{good} data and all other as outliers.

  This is somewhat counter intuitive to typical human perception where the
  30 almost-zero numbers would be considered as inliers and the remaining
  20 as \dQuote{good} data.

  The original \code{mc()} algorithm and also the amendments up to 2022
  (\pkg{robustbase} versions before 0.95) would fail to converge unless (in
  newer versions) \code{eps1} was increased, e.g., only by a factor of 10,
  to \code{eps1 = 1e-13}.
}
\references{
  Lukas Graz (2021); unpublished BSc thesis, see \code{\link{mc}}.
}
\examples{
data(x30o50)
## have 4  duplicated values :
table(dX <- duplicated(x30o50))
     x30o50[dX] # 0 2.77e-17 4.16e-17 2.08e-16
sort(x30o50[dX]) * 2^56 #  0  2  3 15
## and they are  c(0,2,3,15)*2^-56

table(sml <- abs(x30o50) < 1e-11)# 20 30
summary(x30o50[ sml]) # -1.082e-15 ... 1.499e-15 ; mean = 9.2e-19 ~~ 0
summary(x30o50[!sml])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
## -1.6601 -0.4689 -0.0550 -0.1039  0.3986  1.2977

op <- par(mfrow=c(3,1), mgp=c(1.5, .6, 0), mar = .3+c(2,3:1))
(Fn. <- ecdf(x30o50)) # <- only 46 knots (as have 4 duplications)
plot(Fn.) ## and zoom in (*drastically*) to around x=0 :
for(f in c(1e-13, 1.5e-15)) {
  plot(Fn., xval=f*seq(-1,1, length.out = 1001), ylim=c(0,1), main="[zoomed in]")
  if(f == 1e-13) rect(-1e-15,0, +1e-15, 1, col="thistle", border=1)
  plot(Fn., add=TRUE)
}
par(op)

mcOld <- function(x, ..., doScale=TRUE)  mc(x, doScale=doScale, c.huberize=Inf, ...)
try( mcOld(x30o50) ) # Error: .. not 'converged' in 100 iteration
mcOld(x30o50, eps1 = 1e-12) # -0.152
(mcX <- mc(x30o50)) # -7.10849e-13
stopifnot(exprs = {
    all.equal(-7.10848988e-13, mcX, tol = 1e-9)
    all.equal(mcX, mc(1e30*x30o50), tol = 4e-4) # not so close
})
table(sml <- abs(x30o50) < 1e-8)# 20 30
range(x30o50[sml])
x0o50 <- x30o50; x0o50[sml] <- 0
(mcX0 <- mc(x0o50))
stopifnot(exprs = {
    all.equal(-0.378445401788, mcX0, tol=1e-12)
    all.equal(-0.099275805349, mc(x30o50[!sml]) -> mcL, tol=2e-11)
    all.equal(mcL, mcOld(x30o50[!sml]))
})
## -- some instability also wrt c.huberize:
mcHubc <- function(dat, ...)
    function(cc) vapply(cc, function(c) mc(dat, c.huberize = c, ...), -1.)
mcH50 <- mcHubc(x30o50)
head(cHs <- c(sort(outer(c(1, 2, 5), 10^(2:15))), Inf), 9)
mcXc <- mcH50(cHs)
plot(  mcXc  ~ cHs, type="b", log="x" , xlab=quote(c[huberize]))
plot((-mcXc) ~ cHs, type="b", log="xy", xlab=quote(c[huberize]))
## but for "regular" outlier skew data, there's no such dependency:
mcXcu <- mcHubc(cushny)(cHs)
stopifnot( abs(mcXcu - mcXcu[1]) < 1e-15)
}
\keyword{datasets}