1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213
|
\name{ssden}
\alias{ssden}
\alias{ssden1}
\title{Estimating Probability Density Using Smoothing Splines}
\description{
Estimate probability densities using smoothing spline ANOVA models.
The symbolic model specification via \code{formula} follows the same
rules as in \code{\link{lm}}, but with the response missing.
}
\usage{
ssden(formula, type=NULL, data=list(), alpha=1.4, weights=NULL,
subset, na.action=na.omit, id.basis=NULL, nbasis=NULL, seed=NULL,
domain=as.list(NULL), quad=NULL, qdsz.depth=NULL, bias=NULL,
prec=1e-7, maxiter=30, skip.iter=FALSE)
ssden1(formula, type=NULL, data=list(), alpha=1.4, weights=NULL,
subset, na.action=na.omit, id.basis=NULL, nbasis=NULL, seed=NULL,
domain=as.list(NULL), quad=NULL, prec=1e-7, maxiter=30)
}
\arguments{
\item{formula}{Symbolic description of the model to be fit.}
\item{type}{List specifying the type of spline for each variable.
See \code{\link{mkterm}} for details.}
\item{data}{Optional data frame containing the variables in the
model.}
\item{alpha}{Parameter defining cross-validation score for smoothing
parameter selection.}
\item{weights}{Optional vector of bin-counts for histogram data.}
\item{subset}{Optional vector specifying a subset of observations
to be used in the fitting process.}
\item{na.action}{Function which indicates what should happen when
the data contain NAs.}
\item{id.basis}{Index of observations to be used as "knots."}
\item{nbasis}{Number of "knots" to be used. Ignored when
\code{id.basis} is specified.}
\item{seed}{Seed to be used for the random generation of "knots."
Ignored when \code{id.basis} is specified.}
\item{domain}{Data frame specifying marginal support of density.}
\item{quad}{Quadrature for calculating integral. Mandatory if
variables other than factors or numerical vectors are involved.}
\item{qdsz.depth}{Depth to be used in \code{\link{smolyak.quad}} for
the generation of quadrature.}
\item{bias}{Input for sampling bias.}
\item{prec}{Precision requirement for internal iterations.}
\item{maxiter}{Maximum number of iterations allowed for
internal iterations.}
\item{skip.iter}{Flag indicating whether to use initial values of
theta and skip theta iteration. See \code{\link{ssanova}} for
notes on skipping theta iteration.}
}
\details{
The model specification via \code{formula} is for the log density.
For example, \code{~x1*x2} prescribes a model of the form
\deqn{
log f(x1,x2) = g_{1}(x1) + g_{2}(x2) + g_{12}(x1,x2) + C
}
with the terms denoted by \code{"x1"}, \code{"x2"}, and
\code{"x1:x2"}; the constant is determined by the fact that a
density integrates to one.
The selective term elimination may characterize (conditional)
independence structures between variables. For example,
\code{~x1*x2+x1*x3} yields the conditional independence of x2 and x3
given x1.
Parallel to those in a \code{\link{ssanova}} object, the model terms
are sums of unpenalized and penalized terms. Attached to every
penalized term there is a smoothing parameter, and the model
complexity is largely determined by the number of smoothing
parameters.
The selection of smoothing parameters is through a cross-validation
mechanism described in the references, with a parameter
\code{alpha}; \code{alpha=1} is "unbiased" for the minimization of
Kullback-Leibler loss but may yield severe undersmoothing, whereas
larger \code{alpha} yields smoother estimates.
A subset of the observations are selected as "knots." Unless
specified via \code{id.basis} or \code{nbasis}, the number of
"knots" \eqn{q} is determined by \eqn{max(30,10n^{2/9})}, which is
appropriate for the default cubic splines for numerical vectors.
}
\note{
In \code{ssden}, default quadrature will be constructed for
numerical vectors on a hyper cube, then outer product with factor
levels will be taken if factors are involved. The sides of the
hyper cube are specified by \code{domain}; for \code{domain$x}
missing, the default is
\code{c(min(x),max(x))+c(-1,1)*(max(x)-mimn(x))*.05}. In 1-D, the
quadrature is the 200-point Gauss-Legendre formula returned from
\code{\link{gauss.quad}}. In multi-D, delayed Smolyak cubatures
from \code{\link{smolyak.quad}} are used on cubes with the marginals
properly transformed; see Gu and Wang (2003) for the marginal
transformations.
For reasonable execution time in higher dimensions, set
\code{skip.iter=TRUE} in call to \code{ssden}.
If you get an error message from \code{ssden} stating \code{"Newton
iteration diverges"}, try to use a larger \code{qdsz.depth} which
will execute slower, or switch to \code{ssden1}. The default values
of \code{qdsz.depth} for dimensions 4, 5, 6+ are 12, 11, 10.
\code{ssden1} does not involve multi-D quadrature but does not
perform as well as \code{ssden}. It can be used in very high
dimensions where \code{ssden} is infeasible.
The results may vary from run to run. For consistency, specify
\code{id.basis} or set \code{seed}.
}
\value{
\code{ssden} returns a list object of class \code{"ssden"}.
\code{ssden1} returns a list object of class
\code{c("ssden1","ssden")}.
\code{\link{dssden}} and \code{\link{cdssden}} can be used to
evaluate the estimated joint density and conditional density;
\code{\link{pssden}}, \code{\link{qssden}}, \code{\link{cpssden}},
and \code{\link{cqssden}} can be used to evaluate (conditional) cdf
and quantiles.
The method \code{\link{project.ssden}} can be used to calculate the
Kullback-Leibler projection of \code{"ssden"} objects for model
selection; \code{\link{project.ssden1}} can be used to calculate the
square error projection of \code{"ssden1"} objects.
}
\author{Chong Gu, \email{chong@stat.purdue.edu}}
\references{
Gu, C. and Wang, J. (2003), Penalized likelihood density
estimation: Direct cross-validation and scalable approximation.
\emph{Statistica Sinica}, \bold{13}, 811--826.
Gu, C. (2013), \emph{Smoothing Spline ANOVA Models (2nd Ed)}. New
York: Springer-Verlag.
Chong Gu (2014), Smoothing Spline ANOVA Models: R Package gss.
\emph{Journal of Statistical Software}, 58(5), 1-25. URL
http://www.jstatsoft.org/v58/i05/.
}
\examples{
## 1-D estimate: Buffalo snowfall
data(buffalo)
buff.fit <- ssden(~buffalo,domain=data.frame(buffalo=c(0,150)))
plot(xx<-seq(0,150,len=101),dssden(buff.fit,xx),type="l")
plot(xx,pssden(buff.fit,xx),type="l")
plot(qq<-seq(0,1,len=51),qssden(buff.fit,qq),type="l")
## Clean up
\dontrun{rm(buffalo,buff.fit,xx,qq)
dev.off()}
## 2-D with triangular domain: AIDS incubation
data(aids)
## rectangular quadrature
quad.pt <- expand.grid(incu=((1:40)-.5)/40*100,infe=((1:40)-.5)/40*100)
quad.pt <- quad.pt[quad.pt$incu<=quad.pt$infe,]
quad.wt <- rep(1,nrow(quad.pt))
quad.wt[quad.pt$incu==quad.pt$infe] <- .5
quad.wt <- quad.wt/sum(quad.wt)*5e3
## additive model (pre-truncation independence)
aids.fit <- ssden(~incu+infe,data=aids,subset=age>=60,
domain=data.frame(incu=c(0,100),infe=c(0,100)),
quad=list(pt=quad.pt,wt=quad.wt))
## conditional (marginal) density of infe
jk <- cdssden(aids.fit,xx<-seq(0,100,len=51),data.frame(incu=50))
plot(xx,jk$pdf,type="l")
## conditional (marginal) quantiles of infe (TIME-CONSUMING)
\dontrun{
cqssden(aids.fit,c(.05,.25,.5,.75,.95),data.frame(incu=50))
}
## Clean up
\dontrun{rm(aids,quad.pt,quad.wt,aids.fit,jk,xx)
dev.off()}
## One factor plus one vector
data(gastric)
gastric$trt
fit <- ssden(~futime*trt,data=gastric)
## conditional density
cdssden(fit,c("1","2"),cond=data.frame(futime=150))
## conditional quantiles
cqssden(fit,c(.05,.25,.5,.75,.95),data.frame(trt="1"))
## Clean up
\dontrun{rm(gastric,fit)}
## Sampling bias
## (X,T) is truncated to T<X<1 for T~U(0,1), so X is length-biased
rbias <- function(n) {
t <- runif(n)
x <- rnorm(n,.5,.15)
ok <- (x>t)&(x<1)
while(m<-sum(!ok)) {
t[!ok] <- runif(m)
x[!ok] <- rnorm(m,.5,.15)
ok <- (x>t)&(x<1)
}
cbind(x,t)
}
xt <- rbias(100)
x <- xt[,1]; t <- xt[,2]
## length-biased
bias1 <- list(t=1,wt=1,fun=function(t,x){x[,]})
fit1 <- ssden(~x,domain=list(x=c(0,1)),bias=bias1)
plot(xx<-seq(0,1,len=101),dssden(fit1,xx),type="l")
## truncated
bias2 <- list(t=t,wt=rep(1/100,100),fun=function(t,x){x[,]>t})
fit2 <- ssden(~x,domain=list(x=c(0,1)),bias=bias2)
plot(xx,dssden(fit2,xx),type="l")
## Clean up
\dontrun{rm(rbias,xt,x,t,bias1,fit1,bias2,fit2)}
}
\keyword{smooth}
\keyword{models}
\keyword{distribution}
|