File: predab.resample.s

package info (click to toggle)
design 2.0.9-2
links: PTS
area: main
in suites: sarge
size: 1,412 kB
ctags: 1,385
sloc: asm: 13,815; fortran: 626; sh: 28; makefile: 12
file content (290 lines) | stat: -rw-r--r-- 10,131 bytes
parent folder | download | duplicates (2)
#Requires fastbw

# predab.resample: resampling estimates of optimism in indexes of
# predictive accuracy.
#
# The model is refit on B training samples drawn from the rows of
# fit.orig$x / fit.orig$y, the accuracy indexes returned by measure() are
# computed on the training and test rows, and the averages are combined
# into an optimism estimate that is subtracted from the apparent indexes.
#
# Arguments (as used by the code below):
#   fit.orig  original fit; must have elements x and y, and supplies coef,
#             na.action, fail (when bw=TRUE), non.slopes, assign, terms, Design
#   fit       fitting function called as fit(x, y, iter=, tol=, ...) and,
#             after variable selection, with xcol=; must return a list
#             with elements fail and coef
#   measure   function called as measure(xbeta, y, fit=, iter=, evalfit=,
#             fit.orig=, kint=, ...); returns a numeric vector of indexes
#   method    "boot", "crossvalidation", ".632" or "randomization"
#   bw        if TRUE, run fastbw() on the original fit and on every refit
#   B         number of resamples (number of groups for crossvalidation)
#   pr        if TRUE, print per-sample indexes (and fastbw results)
#             instead of the iteration counter
#   rule, type, sls, aics   passed to fastbw(); tol is passed to fit() and
#             to fastbw() as eps
#   strata    if TRUE, attr(fit.orig$x, "strata") is appended to y as an
#             extra column (a column of 1s when the attribute is absent)
#   non.slopes.in.x, kint   control how intercepts enter the linear
#             predictor (see Xb and x.index below)
#   cluster   optional cluster identifiers; whole clusters are resampled
#             (method="boot" only)
#   subset    optional logical vector, or vector of positive or negative
#             row indexes, restricting the rows passed to measure();
#             all rows are still used for fitting
#   group     optional grouping vector; bootstrap sampling is done within
#             each group (method="boot" only, not with cluster)
#
# Value: a matrix with one row per index and columns index.orig, training,
# test, optimism, index.corrected and n (number of resamples in which the
# index was not NA).  Returns NULL when bw=TRUE and fit.orig$fail is TRUE.
predab.resample <- function(fit.orig,  fit, measure,
                            method=c("boot","crossvalidation",".632","randomization"),
                            bw=FALSE, B=50, pr=FALSE, rule="aic", type="residual",
                            sls=.05, aics=0, strata=FALSE, tol=1e-12,
                            non.slopes.in.x=TRUE, kint=1, cluster, subset, group=NULL, ...) {

  method <- match.arg(method)
  # Use 4 digits for everything printed here; restore the caller's setting on exit
  oldopt <- options(digits=4)
  on.exit(options(oldopt))

  # Following logic prevents having to load a copy of a large x object
  if(any(match(c("x","y"), names(fit.orig), 0) == 0))
    stop("must have specified x=T and y=T on original fit")
  # Design information copied onto every refit so fastbw/measure can use it
  fparms <- fit.orig[c("non.slopes","assign","terms","Design")]
  if(!length(fparms$Design)) fparms$Design <- getOldDesign(fit.orig)

  non.slopes <- num.intercepts(fit.orig)
  # Map coefficient positions to columns of x: identity when x carries the
  # intercept columns (or there are none), otherwise drop intercept
  # positions and shift the remaining ones down by non.slopes
  x.index <- if(non.slopes==0 || non.slopes.in.x) function(i,...) i else
    function(i, ns) { if(any(i>ns)) i[i>ns]-ns else NULL }

  # Linear predictor; when intercepts are not columns of x the kint-th
  # intercept is added to the slope part
  Xb <- function(x, b, non.slopes, non.slopes.in.x, n, kint=1) {
    if(length(x)) {
      if(non.slopes==0 || non.slopes.in.x) x %*% b else
        b[kint] + x %*% b[-(1:non.slopes)]
    } else {
      if(non.slopes==0) rep(0,n) else rep(b[kint],n)
    }
  }

  nac <- fit.orig$na.action

  x <- as.matrix(fit.orig$x)
  n <- nrow(x)
  attr(x,'class') <- NULL  # Remove model.matrix class for subset operations later

  y <- fit.orig$y
  y <- as.matrix(if(is.category(y)) oldUnclass(y) else y)

  # TRUE for rows of the pre-deletion data that were retained in the fit;
  # used to trim group/cluster/subset vectors given at their original length
  rows.used <- function() !is.na(naresid(nac, y) %*% rep(1, ncol(y)))

  multi <- !missing(cluster)   # some subjects have multiple records now

  if(length(group)) {
    if(multi || method!='boot')
      stop('group is currently allowed only when method="boot" and cluster is not given')
    # Missing observations were deleted during fit
    if(length(group) > n && length(nac)) group <- group[rows.used()]
    if(length(group) != n)
      stop('length of group does not match # rows used in fit')
    group.inds <- split(1:n, group)
    ngroup <- length(group.inds)
  } else ngroup <- 0

  if(multi) {
    if(method!='boot') stop('cluster only implemented for method="boot"')
    # Missing observations were deleted during fit
    if(length(cluster) > n && length(nac)) cluster <- cluster[rows.used()]
    if(length(cluster) != n)
      stop('length of cluster does not match # rows used in fit')
    if(any(is.na(cluster))) stop('cluster has NAs')
    n.orig <- length(unique(cluster))
    # drop=TRUE: unused factor levels would otherwise create empty clusters,
    # making length(cl.samp) exceed n.orig and misaligning the sampling below
    cl.samp <- split(1:n, cluster, drop=TRUE)
  } else n.orig <- n

  if(!missing(subset)) {
    if(length(subset) > n && length(nac)) subset <- subset[rows.used()]
    # Only a logical subset must have one element per row; index vectors
    # (positive or negative) may be any length and are converted below
    if(is.logical(subset) && length(subset) != n)
      stop('length of subset does not match # rows used in fit')
    if(any(is.na(subset))) stop('subset has NAs')
    if(!is.logical(subset)) {
      subset2 <- rep(FALSE, n)
      subset2[subset] <- TRUE
      subset <- subset2
      subset2 <- NULL
    }
  }

  if(strata) {
    stra <- attr(fit.orig$x, "strata")
    if(!length(stra)) stra <- rep(1, nrow(y))
    y <- cbind(y, stra)
  }

  if(bw) {
    if(fit.orig$fail) return()
    cat("\n		Backwards Step-down - Original Model\n")
    fbw <- fastbw(fit.orig, rule=rule, type=type, sls=sls, aics=aics, eps=tol)
    print(fbw)
    orig.col.kept <- fbw$parms.kept
    if(!length(orig.col.kept)) stop("no variables kept in original model")
    xcol <- x.index(orig.col.kept, non.slopes)
    fit.orig <- fit(x[,xcol,drop=FALSE], y, iter=0, tol=tol, xcol=xcol, ...)
  } else
    orig.col.kept <- 1:length(fit.orig$coef)

  b <- fit.orig$coef
  xcol <- x.index(orig.col.kept, non.slopes)
  xb <- Xb(x[,xcol,drop=FALSE], b, non.slopes, non.slopes.in.x, n,
           kint=kint)

  # Apparent (original-sample) indexes
  index.orig <- if(missing(subset))
    measure(xb, y, fit=fit.orig,
            iter=0, evalfit=TRUE, fit.orig=fit.orig, kint=kint, ...) else
    measure(xb[subset], y[subset,,drop=FALSE], fit=fit.orig,
            iter=0, evalfit=FALSE, fit.orig=fit.orig, kint=kint, ...)

  test.stat <- single(length(index.orig))
  train.stat <- test.stat
  name <- fparms$Design$name
  if(bw) {
    varin <- matrix("", nrow=B, ncol=length(name))
    nvarin <- rep(NA, B)
  }

  j <- 0     # number of resamples whose fits succeeded
  num <- 0   # per-index count of resamples with non-NA train and test values

  if(method=="crossvalidation") {
    per.group <- n/B
    if(per.group<2) stop("B>n/2")
    # Cross-val keeps using same random set of indexes, without replacement
    sb <- sample(n, replace=FALSE)
  }

  ntest <- 0

  if(method==".632") {
    # Must do assignments ahead of time so can weight estimates
    # according to representation in bootstrap samples
    S <- matrix(integer(1), nrow=n, ncol=B)
    W <- matrix(TRUE, nrow=n, ncol=B)
    for(i in 1:B) {
      S[,i] <- s <- sample(n, replace=TRUE)
      W[s,i] <- FALSE  # now these obs are NOT omitted
    }
    nomit <- drop(W %*% rep(1,ncol(W)))  # no. boot samples omitting each obs
    if(min(nomit)==0) stop("not every observation omitted at least once in bootstrap samples.\nRe--run with larger B")
    W <- apply(W/nomit, 2, sum)/n
    cat("\n\nWeights for .632 method (ordinary bootstrap weights ",
        format(1/B),")\n",sep="")
    print(summary(W))
  }

  if(!pr) cat("Iteration:\n")

  for(i in 1:B) {
    if(!pr) { cat(i,""); if(i %% 20 == 0) cat("\n") }

    # Choose training and test rows for this resample
    switch(method,
      crossvalidation = {
        is <- 1 + round((i-1)*per.group)
        ie <- min(n, round(is+per.group-1))
        test <- sb[is:ie]
        train <- -test
      },
      boot = {
        if(ngroup) {
          train <- integer(n.orig)
          for(si in 1:ngroup) {
            gi <- group.inds[[si]]
            lgi <- length(gi)
            # sample behaves differently when first arg is a single integer
            train[gi] <- if(lgi==1) gi else sample(gi, lgi, replace=TRUE)
          }
        } else {
          train <- sample(n.orig, replace=TRUE)
          if(multi) train <- unlist(cl.samp[train])
        }
        test <- 1:n
      },
      ".632" = {
        train <- S[,i]
        test <- -train
      },
      randomization = {
        train <- sample(n, replace=FALSE)
        test <- 1:n
      })

    # Randomization permutes y only; x stays in its original order
    xtrain <- if(method=="randomization") 1:n else train
    f <- fit(x[xtrain,,drop=FALSE], y[train,,drop=FALSE], iter=i, tol=tol, ...)
    f$assign <- NULL  # Some programs put a NULL assign (e.g. ols.val fit)

    fail <- f$fail
    if(!fail) {
      if((ni <- num.intercepts(f)) != non.slopes)
        stop(paste('\nA training sample has a different number of intercepts (',
                   ni,')\n than the original model fit (',non.slopes,').  \nYou probably fit an ordinal model with sparse cells and a re-sample\ndid not select at least one observation for each value of Y.\nAdd the argument group=y where y is the response variable.\nThis will force balanced sampling on levels of y.',sep=''))
      clf <- attr(f,"class")
      f[names(fparms)] <- fparms
      attr(f, "class") <- clf
      if(!bw) {
        coef <- f$coef
        col.kept <- 1:length(coef)
      } else {
        f <- fastbw(f, rule=rule, type=type,
                    sls=sls, aics=aics, eps=tol)
        if(pr) print(f)
        # Remember the selection now (f is overwritten by the refit) but
        # record it only once the refit is known to have succeeded
        factors.kept <- f$factors.kept
        col.kept <- f$parms.kept
        if(!length(col.kept))
          f <- fit(NULL, y[train,,drop=FALSE], iter=i, tol=tol, ...) else {
          xcol <- x.index(col.kept, non.slopes)
          f <- fit(x[xtrain,xcol,drop=FALSE], y[train,,drop=FALSE],
                   iter=i, tol=tol, xcol=xcol, ...)
        }
        if(f$fail) fail <- TRUE else coef <- f$coef
      }
    }

    if(!fail) {
      j <- j+1
      if(bw) {
        varin[j, factors.kept] <- "*"
        nvarin[j] <- length(factors.kept)
      }
      xcol <- x.index(col.kept, non.slopes)
      xb <- Xb(x[,xcol,drop=FALSE], coef, non.slopes, non.slopes.in.x, n,
               kint=kint)
      if(missing(subset)) {
        train.statj <- measure(xb[xtrain], y[train,,drop=FALSE],
                               fit=f, iter=i, fit.orig=fit.orig, evalfit=TRUE,
                               kint=kint, ...)
        test.statj <- measure(xb[test], y[test,,drop=FALSE], fit=f,
                              iter=i, fit.orig=fit.orig, evalfit=FALSE,
                              kint=kint, ...)
      } else {
        # Convert negative indexes to positive ones, then keep only the
        # rows that belong to subset
        ii <- xtrain
        if(any(ii<0)) ii <- (1:n)[ii]
        ii <- ii[subset[ii]]
        train.statj <- measure(xb[ii], y[ii,,drop=FALSE],
                               fit=f, iter=i, fit.orig=fit.orig, evalfit=FALSE,
                               kint=kint, ...)
        ii <- test
        if(any(ii<0)) ii <- (1:n)[ii]
        ii <- ii[subset[ii]]
        test.statj <- measure(xb[ii], y[ii,,drop=FALSE], fit=f,
                              iter=i, fit.orig=fit.orig, evalfit=FALSE,
                              kint=kint, ...)
      }
      # Indexes that are NA in this resample contribute 0 to the sums and
      # are not counted in num
      na <- is.na(train.statj+test.statj)
      num <- num + !na
      if(pr) print(cbind(training=train.statj, test=test.statj))
      train.statj[na] <- 0
      test.statj[na] <- 0
      if(method==".632") {
        wt <- W[i]
        if(any(na)) warning('method=".632" does not properly handle missing summary indexes')
      } else wt <- 1
      train.stat <- train.stat + train.statj
      test.stat <- test.stat + test.statj * wt
      ntest <- ntest + 1
    }
  }

  if(!pr) cat("\n\n")
  if(j!=B) cat("\nDivergence or singularity in",B-j,"samples\n")
  train.stat <- train.stat/num
  if(method!=".632") {
    test.stat <- test.stat/num
    optimism <- train.stat - test.stat
  } else {
    # For .632 test.stat is already a weighted average (weights W)
    optimism <- .632 * (index.orig - test.stat)
  }
  res <- cbind(index.orig=index.orig, training=train.stat, test=test.stat,
               optimism=optimism, index.corrected=index.orig-optimism, n=num)

  if(bw) {
    # seq_len handles j == 0 (no successful resamples); 1:j would pick row 1
    done <- seq_len(j)
    varin <- varin[done, , drop=FALSE]
    nvarin <- nvarin[done]
    dimnames(varin) <- list(rep("",j), name)
    cat("\n		Factors Retained in Backwards Elimination\n\n")
    print(varin, quote=FALSE)
    cat("\n         Frequencies of Numbers of Factors Retained\n\n")
    tvarin <- table(nvarin)
    if(.R.) names(dimnames(tvarin)) <- NULL
    print(tvarin)
  }

  res
}