1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152
|
#-----------------------------------------------------------------------#
# Package: High-dimensional Undirected Graph Estimation #
# huge.mb(): Meinshausen & Buhlmann graph estimation (mb) #
#-----------------------------------------------------------------------#
#' Meinshausen & Buhlmann graph estimation
#'
#' See more details in \code{\link{huge}}
#' @param x There are 2 options: (1) \code{x} is an \code{n} by \code{d} data matrix (2) a \code{d} by \code{d} sample covariance matrix. The program automatically identifies the input matrix by checking the symmetry. (\code{n} is the sample size and \code{d} is the dimension).
#' @param lambda A sequence of decreasing positive numbers to control the regularization when \code{method = "mb"}, \code{"glasso"} or \code{"tiger"}, or the thresholding in \code{method = "ct"}. Typical usage is to leave the input \code{lambda = NULL} and have the program compute its own \code{lambda} sequence based on \code{nlambda} and \code{lambda.min.ratio}. Users can also specify a sequence to override this. When \code{method = "mb"}, \code{"glasso"} or \code{"tiger"}, use with care - it is better to supply a decreasing sequence values than a single (small) value.
#' @param nlambda The number of regularization/thresholding parameters. The default value is \code{30} for \code{method = "ct"} and \code{10} for \code{method = "mb"}, \code{"glasso"} or \code{"tiger"}.
#' @param lambda.min.ratio If \code{method = "mb"}, \code{"glasso"} or \code{"tiger"}, it is the smallest value for \code{lambda}, as a fraction of the upperbound (\code{MAX}) of the regularization/thresholding parameter which makes all estimates equal to \code{0}. The program can automatically generate \code{lambda} as a sequence of length = \code{nlambda} starting from \code{MAX} to \code{lambda.min.ratio*MAX} in log scale. If \code{method = "ct"}, it is the largest sparsity level for estimated graphs. The program can automatically generate \code{lambda} as a sequence of length = \code{nlambda}, which makes the sparsity level of the graph path increases from \code{0} to \code{lambda.min.ratio} evenly.The default value is \code{0.1} when \code{method = "mb"}, \code{"glasso"} or \code{"tiger"}, and 0.05 \code{method = "ct"}.
#' @param scr If \code{scr = TRUE}, the lossy screening rule is applied to preselect the neighborhood before the graph estimation. The default value is \code{FALSE}. NOT applicable when \code{method = "ct"}, {"mb"}, or {"tiger"}.
#' @param scr.num The neighborhood size after the lossy screening rule (the number of remaining neighbors per node). ONLY applicable when \code{scr = TRUE}. The default value is \code{n-1}. An alternative value is \code{n/log(n)}. ONLY applicable when \code{scr = TRUE} and \code{method = "mb"}.
#' @param idx.mat Index matrix for screening.
#' @param sym Symmetrize the output graphs. If \code{sym = "and"}, the edge between node \code{i} and node \code{j} is selected ONLY when both node \code{i} and node \code{j} are selected as neighbors for each other. If \code{sym = "or"}, the edge is selected when either node \code{i} or node \code{j} is selected as the neighbor for each other. The default value is \code{"or"}. ONLY applicable when \code{method = "mb"} or {"tiger"}.
#' @param verbose If \code{verbose = FALSE}, tracing information printing is disabled. The default value is \code{TRUE}.
#' @seealso \code{\link{huge}}, and \code{\link{huge-package}}.
#' @export
huge.mb = function(x, lambda = NULL, nlambda = NULL, lambda.min.ratio = NULL, scr = NULL, scr.num = NULL, idx.mat = NULL, sym = "or", verbose = TRUE)
{
gcinfo(FALSE)
n = nrow(x);
d = ncol(x);
maxdf = min(d,n);
fit = list()
fit$cov.input = isSymmetric(x);
if(fit$cov.input)
{
if(verbose) cat("The input is identified as the covariance matrix.\n")
S = cov2cor(x);
}
if(!fit$cov.input)
{
x = scale(x)
S = cor(x)
}
gc()
if(is.null(idx.mat))
{
if(is.null(scr))
scr = FALSE
if(scr)
{
if(is.null(scr.num))
{
if(n<d)
scr.num = n-1
if(n>=d)
{
if(verbose) cat("lossy screening is skipped without specifying scr.num.\n")
scr = FALSE
}
}
}
fit$scr = scr
}
if(!is.null(idx.mat))
{
scr = TRUE
fit$scr = scr
scr.num = nrow(idx.mat)
}
if(!is.null(lambda)) nlambda = length(lambda)
if(is.null(lambda))
{
if(is.null(nlambda))
nlambda = 10
if(is.null(lambda.min.ratio))
lambda.min.ratio = 0.1
lambda.max = max(max(S-diag(d)),-min(S-diag(d)))
lambda.min = lambda.min.ratio*lambda.max
lambda = exp(seq(log(lambda.max), log(lambda.min), length = nlambda))
rm(lambda.max,lambda.min,lambda.min.ratio)
gc()
}
if(scr)
{
if(verbose)
{
cat("Conducting Meinshausen & Buhlmann graph estimation (mb) with lossy screening....")
flush.console()
}
if(is.null(idx.mat))
idx.mat = apply(-abs(S),2,order)[2:(scr.num+1),] - 1
fit$idx.mat = idx.mat
out = .Call("_huge_SPMBscr", S, lambda, nlambda, d, maxdf, idx.mat, scr.num)
}
if(!scr)
{
if(verbose)
{
cat("Conducting Meinshausen & Buhlmann graph estimation (mb)....")
flush.console()
}
fit$idx_mat = NULL
out = .Call("_huge_SPMBgraph", S, lambda, nlambda, d, maxdf)
}
for(i in 1:d)
{
if(out$col_cnz[i+1]>out$col_cnz[i])
{
idx.tmp = (out$col_cnz[i]+1):out$col_cnz[i+1]
ord = order(out$row_idx[idx.tmp])
out$row_idx[idx.tmp] = out$row_idx[ord + out$col_cnz[i]]
out$x[idx.tmp] = out$x[ord + out$col_cnz[i]]
}
}
G = new("dgCMatrix", Dim = as.integer(c(d*nlambda,d)), x = as.vector(out$x[1:out$col_cnz[d+1]]),p = as.integer(out$col_cnz), i = as.integer(out$row_idx[1:out$col_cnz[d+1]]))
fit$beta = list()
fit$path = list()
fit$df = matrix(0,d,nlambda)
fit$rss = matrix(0,d,nlambda)
fit$sparsity = rep(0,nlambda)
for(i in 1:nlambda)
{
fit$beta[[i]] = G[((i-1)*d+1):(i*d),]
fit$path[[i]] = abs(fit$beta[[i]])
fit$df[,i] = apply(sign(fit$path[[i]]),2,sum)
if(sym == "or")
fit$path[[i]] = sign(fit$path[[i]] + t(as.matrix(fit$path[[i]])))
if(sym == "and")
fit$path[[i]] = sign(fit$path[[i]] * t(as.matrix(fit$path[[i]])))
fit$sparsity[i] = sum(fit$path[[i]])/d/(d-1)
}
rm(x, G, out)
fit$lambda = lambda
if(verbose)
{
cat("done\n")
flush.console()
}
rm(verbose,nlambda)
gc()
return(fit)
}
|