File: adjoutlyingness.R

package info (click to toggle)
robustbase 0.99-6-1
  • links: PTS, VCS
  • area: main
  • in suites: forky
  • size: 4,584 kB
  • sloc: fortran: 3,245; ansic: 3,243; sh: 15; makefile: 2
file content (218 lines) | stat: -rw-r--r-- 9,021 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
#### -*- mode: R; kept-new-versions: 30; kept-old-versions: 20 -*-

#### MC-adjusted Outlyingness
#### ------------------------
###
### Original code from  the web site from the Antwerpen Statistics Group :
###  http://www.agoras.ua.ac.be/Robustn.htm
### which has a "MC" section and for the software links to
### ftp://ftp.win.ua.ac.be/pub/software/agoras/newfiles/mc.tar.gz
### and that contains  mcrsoft/adjoutlyingness.R

##_NEW_ (2014): moved from Antwerpen to Leuwen,
## ===> http://wis.kuleuven.be/stat/robust/software
##      has several links to 'robustbase', and S-plus code
## http://wis.kuleuven.be/stat/robust/Programs/adjusted-boxplot/adjusted-boxplot.ssc
## (copy in ../misc/Adjusted-boxplot.ssc

## MM [ FIXME ]:
## -----------

## 1)   Use  *transposed*  B[] and A[] (now called 'E') matrices   -- DONE
## 2)   use  IQR() instead of   quantile(., .75) - quantile(., .25)

##-->  but only *after* testing original code
##     ^^^^^^^^^^^^^^^^^^^^^^^^


adjOutlyingness <- function(x, ndir = 250, p.samp = p, clower=4, cupper=3,
                            IQRtype = 7,
                            alpha.cutoff = 0.75, coef = 1.5,
                            qr.tol = 1e-12, keep.tol = 1e-12,
                            only.outlyingness = FALSE, maxit.mult = max(100, p),
                            trace.lev = 0,
                            ## these are all passed to mc() {when applied to the projected data}
                            mcReflect = n <= 100, mcScale = TRUE, mcMaxit = 2*maxit.mult,
                            mcEps1 = 1e-12, mcEps2 = 1e-15,
                            mcTrace = max(0, trace.lev-1))
## Skewness-Adjusted Outlyingness
{
    x <- data.matrix(x)
    n <- nrow(x)
    p <- ncol(x)
    stopifnot(n >= 1, p >= 1, p.samp >= p, is.numeric(x))
    if (p <= n) {
        B <- matrix(0, p, ndir)
        E <- matrix(1, p.samp, 1)
        x. <- unname(x) # for speed in subsequent subsetting and solve
        maxit <- as.integer(maxit.mult * ndir)
        ## ^^ original code had 'Inf', i.e. no iter.count check;
        ## often, maxit == ndir would suffice
        if(trace.lev >= 2) p10 <- 10 ^ max(0, min(6 - trace.lev, floor(log10(maxit))))
	i <- 1L
	it <- 0L
	while (i <= ndir  &&  (it <- it+1L) < maxit) {
            ## we sort to get *identical* projections instead of "almost"
            P <- x.[sort(sample.int(n, p.samp)), , drop=FALSE]
            if ((qrP <- qr(P, tol = qr.tol))$rank == p) {
                B[,i] <- solve(qrP, E, tol = qr.tol)
                ## if(trace.lev >= 2) cat(" it=",it,"; found direction # ", i,"\n", sep="")
		i <- i+1L
            } else if(trace.lev >= 2) {
                if(it %% p10 == 0)
                    cat(" it=",it,": rank(qr(P ..)) = ", qrP$rank, " < p = ",p,"\n", sep="")
            }
        }
	if(it == maxit) {
	    rnk.x <- qr(x, tol = qr.tol)$rank
	    if(rnk.x < p)
		stop("Matrix 'x' is not of full rank: rankM(x) = ",rnk.x," < p = ",p,
                     "\n Use fullRank(x) instead")
	    ## else
	    stop("** direction sampling iterations were not sufficient. Maybe try increasing 'maxit.mult'")
	}
        Bnorm <- sqrt(colSums(B^2))
        Nx <- mean(abs(x.)) ## so the comparison is scale-equivariant:
        keep <- Bnorm*Nx > keep.tol
        if(!all(keep)) {
            if(trace.lev)
                cat("keeping ", sum(keep), "(out of", length(keep),") normalized directions\n")
            Bnorm <- Bnorm[  keep ]
            B     <-     B[, keep , drop=FALSE]
        } else if(trace.lev) cat("keeping *all* ",length(keep)," normalized directions\n")

        B <- B / rep(Bnorm, each = nrow(B)) # normalized B {called 'A' in orig.code}
    }
    else {
        stop('More dimensions than observations: not yet implemented')
        ## MM: In LIBRA(matlab) they have it implemented:
        ##    seed=0;
        ##    nrich1=n*(n-1)/2;
        ##    ndirect=min(250,nrich1);
        ##    true = (ndirect == nrich1);
        ##    B=extradir(x,ndir,seed,true); % n*ri
        ##      ======== % Calculates ndirect directions through
        ##               % two random choosen data points from data
        ##    for i=1:size(B,1)
        ##        Bnorm(i)=norm(B(i,:),2);
        ##    end
        ##    Bnorm=Bnorm(Bnorm > 1.e-12); %ndirect*1
        ##    B=B(Bnorm > 1.e-12,:);       %ndirect*n
        ##    A=diag(1./Bnorm)*B;         %ndirect*n

    }
    ## NB:  colSums( B^2 ) == 1
    Y <- x %*% B # (n x p) %*% (p, nd') == (n x nd');
    ##  nd' = ndir.final := ndir - {those not in 'keep'}

    ## Compute and sweep out the median
    med <- colMedians(Y)
    if(trace.lev) {
        cat("med <- colMedians(Y): ", length(med), " values; summary():\n")
        print(summary(med))
    }
    Y <- Y - rep(med, each=n)
    ## central :<==> non-adjusted  <==> "classical" outlyingness
    central <- clower == 0 && cupper == 0
    if(!central) {
        ## MM: mc() could be made faster if we could tell it that med(..) = 0
        ##                  vv
        tmc <- apply(Y, 2L, mc, doReflect=mcReflect, doScale=mcScale,
                     maxit=mcMaxit, eps1=mcEps1, eps2=mcEps2, trace.lev = mcTrace)
        ## original Antwerpen *wrongly*: tmc <- mc(Y)
        if(trace.lev) {
            cat("Columnwise mc() got ", length(tmc), " values; summary():\n")
            print(summary(tmc))
        }
    }
    Q13 <- apply(Y, 2, quantile, c(.25, .75), names=FALSE, type = IQRtype)
    Q1 <- Q13[1L,]; Q3 <- Q13[2L,]
    IQR <- Q3 - Q1
    ## NOTA BENE(MM): simplified definition of tup/tlo here and below
    ## 2014-10-18: "flipped" sign (which Pieter Setaert (c/o Mia H) proposed, Jul.30,2014:
    tup <- Q3 + coef*
	(if(central) IQR else IQR*exp( cupper*tmc*(tmc >= 0) + clower*tmc*(tmc < 0)))
    tlo <- Q1 - coef*
	(if(central) IQR else IQR*exp(-clower*tmc*(tmc >= 0) - cupper*tmc*(tmc < 0)))
    ## Note: all(tlo < med & med < tup) # where med = 0
    if(trace.lev >= 3) {
        print( cbind(tlo, Q1, Q3, tup) )
    }

    ## Instead of the loop:
    ##  for (i in 1:ndir) {
    ##      tup[i] <-  max(Y[Y[,i] < tup[i], i])
    ##      tlo[i] <- -min(Y[Y[,i] > tlo[i], i])
    ##      ## MM:  FIXED typo-bug :       ^^^ this was missing!
    ##      ## But after the fix, the function stops "working" for longley..
    ##      ## because tlo[] becomes 0 too often, YZ[.,.] = c / 0 = Inf !
    ##  }
    Yup <- Ylo <- Y
    Yup[Y >= rep(tup, each=n)] <- -Inf
    Ylo[Y <= rep(tlo, each=n)] <-  Inf
    y.up <-  apply(Yup, 2, max) # =  max{ Y[i,] ; Y[i,] < tup[i] }
    y.lo <- -apply(Ylo, 2, min) # = -min{ Y[i,] ; Y[i,] > tlo[i] }
    if(trace.lev) {
        cat(length(y.up), "lower & upper Y (:= X - med(.)) values:\n")
        print(summary(y.lo))
        print(summary(y.up))
    }
    tY <- t(Y)
    ## Note: column-wise medians are all 0 : "x_i > m" <==> y > 0
    ## Note: this loop is pretty fast
    for (j in 1:n) { # when y = (X-med) = 0  ==> adjout = 0 rather than
	## 0 / 0 --> NaN; e.g, in  set.seed(3); adjOutlyingness(longley)
	non0 <- 0 != (y <- tY[,j]); y <- y[non0]; I <- (y > 0)
	D <- I*y.up[non0] + (1 - I)*y.lo[non0]
        if(trace.lev >= 3) {
            cat(sprintf("j=%2d: #{non0}= %2d; quantile(D)=\n", j, sum(non0)))
            print(quantile(D), digits=3)
        }
	tY[non0, j] <- abs(y) / D
    }
    ## We get +Inf above for "small n"; e.g. set.seed(11); adjOutlyingness(longley)
    if(trace.lev) {
        cat("outlyingnesses for all directions (of which max(.) will be chosen:\n")
        print(quantile(tY, digits=3))
    }
    adjout <- apply(tY, 2, function(x) max(x[is.finite(x)]))
    ##----                             ---
    if(abs(trace.lev %% 1  - 0.7) < 1e-3) { ## really not for the end user ..
        cat("Plotting outlyingnesses vs. observation i:\n")
        matplot(t(tY), log="y");           axis(2, at=1); abline(h=1, lty=3)
        Sys.sleep(2)
        ## somewhat revealing: 3  groups:  very large |  medium | very small (incl. 0 which are *not* plotted)
        matplot(t(tY), log="y", type="b"); axis(2, at=1); abline(h=1, lty=3)
        browser()  ## <<<<<<<<<<<<<<<<<<<
    }

    if(only.outlyingness)
	adjout
    else {
	Qadj <- quantile(adjout, probs = c(1 - alpha.cutoff, alpha.cutoff))
	mcadjout <- if(cupper != 0) mc(adjout, doScale=mcScale, eps1=mcEps1, eps2=mcEps2) else 0
	##			    ===
	cutoff <- Qadj[2] + coef * (Qadj[2] - Qadj[1]) *
	    (if(mcadjout > 0) exp(cupper*mcadjout) else 1)

	list(adjout = adjout, iter = it, ndir.final = sum(keep),
	     MCadjout = mcadjout, Qalph.adjout = Qadj, cutoff = cutoff,
	     nonOut = (adjout <= cutoff))
    }
}

##' Compute a "full rank" version of matrix x,
##' by removing columns (or rows when nrow(x) < ncol(x)), using qr() and it's pivots
fullRank <- function(x, tol = 1e-7, qrx = qr(x, tol=tol)) {
    d <- dim(x)
    n <- d[[1L]]; p <- d[[2L]]
    if(n < p)
        return( t(fullRank(t(x), tol=tol)) )
    ## else n >= p >= rank(.)
    rnk <- qrx$rank
    if(rnk == p)
        x
    else
        x[, qrx$pivot[seq_len(rnk)], drop=FALSE]
}