File: modelsearch2-calcDistMax.R

package info (click to toggle)
r-cran-lavasearch2 2.0.3%2Bdfsg-1
links: PTS, VCS
area: main
in suites: forky, sid, trixie
size: 1,832 kB
sloc: cpp: 28; sh: 13; makefile: 2
file content (419 lines) | stat: -rw-r--r-- 17,220 bytes
### calcDistMax.R --- 
#----------------------------------------------------------------------
## author: Brice Ozenne
## created: jun 21 2017 (16:44) 
## Version: 
## last-updated: jun 20 2019 (10:28) 
##           By: Brice Ozenne
##     Update #: 630
#----------------------------------------------------------------------
## 
### Commentary: 
## 
### Change Log:
#----------------------------------------------------------------------
## 
### Code:

## * documentation
#' @title Adjust the p.values Using the Quantiles of the Max Statistic
#' @description Adjust the p.values using the quantiles of the max statistic.
#' @name calcDistMax
#'
#' @param statistic [numeric vector] the observed Wald statistic.
#' Each statistic correspond to a null hypothesis (i.e. a coefficient) that one wish to test.
#' @param iid [matrix] zero-mean iid decomposition of the coefficient used to compute the statistic.
#' @param df [numeric] the degree of freedom defining the multivariate Student's t distribution.
#' If \code{NULL} the multivariate Gaussian distribution will be used instead.
#' @param iid.previous [matrix, EXPERIMENTAL] zero-mean iid decomposition of previously tested coefficient.
#' @param quantile.compute [logical] should the rejection quantile be computed?
#' @param quantile.previous [numeric, EXPERIMENTAL] rejection quantiles of the previously tested hypotheses. If not \code{NULL} the values should correspond the variable in to the first column(s) of the argument \code{iid.previous}.
#' @param method [character] the method used to compute the p-values.
#' @param alpha [numeric 0-1] the significance cutoff for the p-values.
#' When the p-value is below, the corresponding link will be retained.
#' @param cpus [integer >0] the number of processors to use.
#' If greater than 1, the computation of the p-value relative to each test is performed in parallel. 
#' @param cl [cluster] a parallel socket cluster generated by \code{parallel::makeCluster}
#' that has been registered using \code{registerDoParallel}.
#' @param n.sim [integer >0] the number of bootstrap simulations used to compute each p-values.
#' Disregarded when the p-values are computed using numerical integration.
#' @param n.repmax [integer >0] the maximum number of rejection for each bootstrap sample before switching to a new bootstrap sample.
#' Only relevant when conditioning on a previous test.
#' Disregarded when the p-values are computed using numerical integration.
#' @param trace [logical] should the execution of the function be traced?
#'
#' @return A list containing
#' \itemize{
#' \item p.adjust: the adjusted p-values.
#' \item z: the rejection threshold.
#' \item Sigma: the correlation matrix between the test statistic.
#' \item correctedLevel: the alpha level corrected for conditioning on previous tests.
#' }
#' 
#' @examples 
#' library(mvtnorm)
#'
#' set.seed(10)
#' n <- 100
#' p <- 4
#' link <- letters[1:p]
#' n.sim <- 1e3 # number of bootstrap simulations 
#'
#' #### test - not conditional ####
#' X.iid <- rmvnorm(n, mean = rep(0,p), sigma = diag(1,p))
#' colnames(X.iid) <- link
#' statistic <- setNames(1:p,link)
#'
#' 
#' r1 <- calcDistMaxIntegral(statistic = statistic, iid = X.iid, 
#'             trace = FALSE, alpha = 0.05, df = 1e6) 
#' 
#' r3 <- calcDistMaxBootstrap(statistic = statistic, iid = X.iid,
#'             method = "residual",
#'             trace = FALSE, alpha = 0.05, n.sim = n.sim)
#'
#' r4 <- calcDistMaxBootstrap(statistic = statistic, iid = X.iid,
#'             method = "wild",
#'             trace = FALSE, alpha = 0.05, n.sim = n.sim)
#' 
#' rbind(integration = c(r1$p.adjust, quantile = r1$z),
#'       bootResidual = c(r3$p.adjust, quantile = r3$z),
#'       bootWild    = c(r4$p.adjust, quantile = r4$z))
#'
#' #### test - conditional ####
#' \dontrun{
#' Z.iid <- rmvnorm(n, mean = rep(0,p+1), sigma = diag(1,p+1))
#' seqQuantile <- qmvnorm(p = 0.95, delta = rep(0,p+1), sigma = diag(1,p+1), 
#'                     tail = "both.tails")$quantile
#' 
#' r1c <- calcDistMaxIntegral(statistic = statistic, iid = X.iid,
#'             iid.previous = Z.iid, quantile.previous =  seqQuantile, 
#'             trace = FALSE, alpha = 0.05, df = NULL)
#' 
#' r3c <- calcDistMaxBootstrap(statistic = statistic, iid = X.iid,
#'             iid.previous = Z.iid, quantile.previous =  seqQuantile, method = "residual",
#'             trace = FALSE, alpha = 0.05, n.sim = n.sim)
#'
#' r4c <- calcDistMaxBootstrap(statistic = statistic, iid = X.iid,
#'             iid.previous = Z.iid, quantile.previous =  seqQuantile, method = "wild",
#'             trace = FALSE, alpha = 0.05, n.sim = n.sim)
#'
#' rbind(integration = c(r1c$p.adjust, quantile = r1c$z),
#'       bootResidual = c(r3c$p.adjust, quantile = r3c$z),
#'       bootWild    = c(r4c$p.adjust, quantile = r4c$z))
#' }
#' @concept modelsearch
#' @concept post-selection inference


## * calcDistMaxIntegral
#' @rdname calcDistMax
#' @export
calcDistMaxIntegral <- function(statistic, iid, df, 
                                iid.previous = NULL, quantile.previous = NULL,
                                quantile.compute = lava.options()$search.calc.quantile.int,
                                alpha, cpus = 1, cl = NULL, trace){

    ## ** normalize arguments
    p.iid <- NCOL(iid)
    n <- NROW(iid)
    conditional <- length(quantile.previous)
    if(length(quantile.previous)>1){
        stop("Can only condition on one previous step \n")
    }
    if(is.null(df)){
        distribution.statistic <- "gaussian"
    }else{
        distribution.statistic <- "student"
    }
    
    iid.all <- cbind(iid,iid.previous)
    index.new <- 1:NCOL(iid)
    index.previous <- setdiff(1:NCOL(iid.all),index.new)
    p.iid.all <- NCOL(iid.all)
   
    ## ** Compute the correlation matrix between the test statistics
    # center to be under the null
    # scale since we want the distribution of the Wald statistic (i.e. statistic with unit variance)
    iid.statistic <- scale(iid.all, center = TRUE, scale = TRUE)
    Sigma.statistic <- stats::cov(iid.statistic, use = "pairwise.complete.obs")
    out <- list(p.adjust = NULL, z = NULL, Sigma = Sigma.statistic[index.new,index.new,drop=FALSE])

    ## ** Definition of the functions used to compute the quantiles
    warperQ <- function(alpha){
        .calcQmaxIntegration(alpha = alpha, p = p.iid,
                             Sigma = Sigma.statistic[index.new,index.new,drop=FALSE],
                             df = df, distribution = distribution.statistic)
    }
    warperP <- function(index){
        .calcPmaxIntegration(statistic = statistic[index], p = p.iid,
                             Sigma = Sigma.statistic[index.new,index.new,drop=FALSE],
                             df = df, distribution = distribution.statistic)
    }
        
    ## ** correction for conditioning on the previous steps
    if(conditional==TRUE){
        out$correctedLevel <- calcType1postSelection(1-alpha, quantile.previous =  quantile.previous,
                                                      mu = rep(0,p.iid.all), Sigma = Sigma.statistic,
                                                      distribution =  distribution.statistic,
                                                      df = df)
        alpha <- 1-out$correctedLevel
    }else{
        out$correctedLevel <- NA
    }

    if(quantile.compute){       
        out$z <- warperQ(alpha)
    }else{
        out$z <- NA
    }

    ## ** start parallel computation
    init.cpus <- (cpus > 1 && is.null(cl))
    if(init.cpus){            
        ## define cluster
        if(trace>0){
            cl <- parallel::makeCluster(cpus, outfile = "")
        }else{
            cl <- parallel::makeCluster(cpus)
        }
        ## link to foreach
        doParallel::registerDoParallel(cl)
    }
    
    ## ** Computation
    if(trace > 0){ cat("Computation of multivariate student probabilities to adjust the p.values \n") }
    if(cpus > 1){
        ## *** parallel computations

        if(trace>0){
            pb <- utils::txtProgressBar(max = length(index.new), style = 3)                   
        }

        ## export package
        parallel::clusterCall(cl, fun = function(x){
            suppressPackageStartupMessages(requireNamespace("mvtnorm", quietly = TRUE))
        })
        
        value <- NULL # [:for CRAN check] foreach
        out$p.adjust <- foreach::`%dopar%`(
                                     foreach::foreach(value = 1:length(statistic),
                                                      .export = c(".calcPmaxIntegration"),
                                                      .combine = "list"),
                                     {
                                         if(trace>0){utils::setTxtProgressBar(pb, value)}
                                         return(warperP(index.new[value]))
                                     })

        if(trace>0){close(pb)}
            
    }else{
        ## *** sequential computations
        if(trace>0){      
            out$p.adjust <- pbapply::pblapply(1:length(statistic), function(iStat){ warperP(index.new[iStat]) })            
        }else{
            out$p.adjust <- lapply(1:length(statistic), function(iStat){ warperP(index.new[iStat]) })
        }
                        
    }
    out$error <- stats::setNames(unlist(lapply(out$p.adjust, function(x){attr(x,"error")})), names(statistic))
    out$p.adjust <- stats::setNames(unlist(out$p.adjust), names(statistic))

    ## ** end parallel computation
    if(init.cpus){
        parallel::stopCluster(cl)
    }

    
    ## ** export
    return(out)
}

## * calcDistMaxBootstrap
#' @rdname calcDistMax
#' @export
calcDistMaxBootstrap <- function(statistic, iid, iid.previous = NULL, quantile.previous = NULL,
                                 method, alpha, cpus = 1, cl = NULL, n.sim, trace, n.repmax = 100){

    ## ** normalize arguments
    n <- NROW(iid)
    conditional <- length(quantile.previous)>0
    if(length(quantile.previous)>1){
        stop("Can only condition on one previous step \n")
    }

    iid.all <- cbind(iid,iid.previous)
    index.new <- 1:NCOL(iid)
    index.previous <- setdiff(1:NCOL(iid.all),index.new)

    ## ** Function used for the simulations
    warperBoot <- .bootMaxDist
    
    ## ** Compute the correlation matrix between the test statistics
    # center to be under the null
    # scale since we want the distribution of the Wald statistic (i.e. statistic with unit variance)
    iid.statistic <- scale(iid.all, center = TRUE, scale = TRUE)
    Sigma.statistic <- stats::cov(iid.statistic, use = "pairwise.complete.obs")

    ## ** start parallel computation
    init.cpus <- (cpus > 1 && is.null(cl))
    if(init.cpus){            
        ## define cluster
        if(trace>0){
            cl <- parallel::makeCluster(cpus, outfile = "")
        }else{
            cl <- parallel::makeCluster(cpus)
        }
        ## link to foreach
        doParallel::registerDoParallel(cl)
    }

    ## ** Computation
    if(trace > 0){ cat("Bootsrap simulations to get the 95% quantile of the max statistic: ") }

    if(cpus>1){
        n.simCpus <- rep(round(n.sim/cpus),cpus)
        n.simCpus[1] <- n.sim-sum(n.simCpus[-1])
        
        i <- NULL # [:for CRAN check] foreach
        distMax <- foreach::`%dopar%`(
                                foreach::foreach(i = 1:cpus, .packages =  c("MASS"),
                                                 .export = "calcDistMax",
                                                 .combine = "c"),{
                                                     replicate(n.simCpus[i],
                                                               warperBoot(iid = iid.all, sigma = Sigma.statistic,
                                                                          n = n, method = method,
                                                                          index.new = index.new, index.previous = index.previous,
                                                                          quantile.previous = quantile.previous, n.repmax = n.repmax))
                                                 })
        
    }else{

        if(trace>0){
            distMax <- pbapply::pbsapply(1:n.sim, warperBoot, method = method,
                                         iid = iid.all, sigma = Sigma.statistic, n = n,                                         
                                         index.new = index.new, index.previous = index.previous,
                                         quantile.previous = quantile.previous, n.repmax = n.repmax)
            
        }else{
            distMax <- sapply(1:n.sim, warperBoot, method = method,
                              iid = iid.all, sigma = Sigma.statistic, n = n,
                              index.new = index.new, index.previous = index.previous,
                              quantile.previous = quantile.previous, n.repmax = n.repmax)
        }
        
    }
     
    if(trace > 0){ cat("done \n") }

    ## ** end parallel calculation
    if(init.cpus){
        parallel::stopCluster(cl)
    }

    ## ** export
    out <- list()
    out$z <- stats::quantile(distMax, probs = 1-alpha, na.rm = TRUE)
    out$p.adjust <- sapply(abs(statistic), function(x){mean(distMax>x,na.rm=TRUE)})
    out$Sigma <- Sigma.statistic
    out$correctedLevel <- NA
    return(out)
}

## * .calcQmaxIntegration: numerical integration to compute the critical threshold
.calcQmaxIntegration <- function(alpha, p, Sigma, df, distribution){

    if(distribution == "gaussian"){
        if(p==1){
            q.alpha <- stats::qnorm(1-alpha, mean = 0, sd = 1)
        }else{
            q.alpha <- mvtnorm::qmvnorm(1-alpha,
                                        mean = rep(0,p),
                                        corr = Sigma,
                                        tail = "both.tails")$quantile
        }
    }else if(distribution == "student"){
        if(p==1){
            q.alpha <- stats::qt(1-alpha, df = df)
        }else{
            q.alpha <- mvtnorm::qmvt(1-alpha,
                                     delta = rep(0,p),
                                     corr = Sigma,
                                     df = df,
                                     tail = "both.tails")$quantile
        }
    }

    return(q.alpha)
}
    
## * .calcPmaxIntegration_firstStep: numerical integration to compute the p.values
.calcPmaxIntegration <- function(statistic, p, Sigma, df, distribution){
    value <- abs(statistic)
    if(!is.na(value)){
        if(distribution == "gaussian"){
            if(p==1){
                out <- stats::pnorm(value, mean = 0, sd = Sigma)-stats::pnorm(-value, mean = 0, sd = Sigma)
                attr(out, "error") <- 0
            }else{      
                out <- mvtnorm::pmvnorm(lower = -value, upper = value,
                                        mean = rep(0, p), corr = Sigma)
            }
        }else if(distribution == "student"){
            if(p==1){
                out <- stats::pt(value, df = df)-stats::pt(-value, df = df)
                attr(out, "error") <- 0
            }else{
                out <- mvtnorm::pmvt(lower = -value, upper = value,
                                     delta = rep(0, p), corr = Sigma, df = df)
            }
        }
        return(1-out)
    }else{
        return(NA)
    }   
}

## * .bootMaxDist: bootstrap simulation
.bootMaxDist <- function(iid, sigma, n, method,
                         index.new, index.previous, quantile.previous, n.repmax,
                         ...){

    iRep <- 0
    cv <- FALSE

    while(iRep < n.repmax && cv == FALSE){

        ## ** resample to obtain a new influence function
        if(method == "residual"){
            iid.sim <- MASS::mvrnorm(n,rep(0,NCOL(sigma)),sigma)                    
        }else if(method == "wild"){
            e <- stats::rnorm(n,mean=0,sd=1)
            iid.sim <- sapply(1:NCOL(sigma),function(x){e*iid[,x]})        
        }
        if(!is.null(quantile.previous)){
            iid.previous <- iid.sim[,index.previous]
            test.previous <- apply(iid.previous,2,function(x){sqrt(n)*mean(x)/stats::sd(x)})
            max.previous <- max(abs(test.previous))        
            if(max.previous<quantile.previous){
                iRep <- iRep + 1
            }else{
                iid.sim <- iid.sim[,index.new]
                cv <- TRUE
            }
        }else{
            cv <- TRUE
        }
    }
    
    ## ** compute the bootstrap test statistic
    if(cv){
        Test <- apply(iid.sim,2,function(x){sqrt(n)*mean(x)/stats::sd(x)})
    }else{
        Test <- NA
    }
    return(max(abs(Test)))
}

#----------------------------------------------------------------------
### calcDistMax.R ends here