File: refmodel-init-get.Rd

package info (click to toggle)
r-cran-projpred 2.3.0%2Bdfsg-1
links: PTS, VCS
area: main
in suites: bookworm
size: 1,180 kB
sloc: cpp: 296; sh: 14; makefile: 5
file content (333 lines) | stat: -rw-r--r-- 17,661 bytes
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/refmodel.R
\name{refmodel-init-get}
\alias{refmodel-init-get}
\alias{get_refmodel}
\alias{get_refmodel.refmodel}
\alias{get_refmodel.vsel}
\alias{get_refmodel.default}
\alias{get_refmodel.stanreg}
\alias{init_refmodel}
\title{Reference model and more general information}
\usage{
get_refmodel(object, ...)

\method{get_refmodel}{refmodel}(object, ...)

\method{get_refmodel}{vsel}(object, ...)

\method{get_refmodel}{default}(object, formula, family = NULL, ...)

\method{get_refmodel}{stanreg}(object, ...)

init_refmodel(
  object,
  data,
  formula,
  family,
  ref_predfun = NULL,
  div_minimizer = NULL,
  proj_predfun = NULL,
  extract_model_data,
  cvfun = NULL,
  cvfits = NULL,
  dis = NULL,
  cvrefbuilder = NULL,
  ...
)
}
\arguments{
\item{object}{For \code{\link[=init_refmodel]{init_refmodel()}}, an object that the functions from
arguments \code{extract_model_data} and \code{ref_predfun} can be applied to, with a
\code{NULL} object being treated specially (see section "Value" below). For
\code{\link[=get_refmodel.default]{get_refmodel.default()}}, an object of type \code{list} that (i) function
\code{\link[=family]{family()}} can be applied to in order to retrieve the family (if argument
\code{family} is \code{NULL}) and (ii) has an element called \code{data} containing the
original dataset (see argument \code{data} of \code{\link[=init_refmodel]{init_refmodel()}}), in addition
to the properties required for \code{\link[=init_refmodel]{init_refmodel()}}. For non-default methods
of \code{\link[=get_refmodel]{get_refmodel()}}, an object of the corresponding class.}

\item{...}{For \code{\link[=get_refmodel.default]{get_refmodel.default()}} and \code{\link[=get_refmodel.stanreg]{get_refmodel.stanreg()}}:
arguments passed to \code{\link[=init_refmodel]{init_refmodel()}}. For the \code{\link[=get_refmodel]{get_refmodel()}} generic:
arguments passed to the appropriate method. Else: ignored.}

\item{formula}{The full formula to use for the search procedure. For custom
reference models, this does not necessarily coincide with the reference
model's formula. For general information on formulas in \R, see
\code{\link{formula}}. For multilevel formulas, see also package \pkg{lme4} (in
particular, functions \code{\link[lme4:lmer]{lme4::lmer()}} and \code{\link[lme4:glmer]{lme4::glmer()}}). For additive
formulas, see also packages \pkg{mgcv} (in particular, function
\code{\link[mgcv:gam]{mgcv::gam()}}) and \pkg{gamm4} (in particular, function \code{\link[gamm4:gamm4]{gamm4::gamm4()}})
as well as the notes in section "Formula terms" below.}

\item{family}{An object of class \code{family} representing the observation model
(i.e., the distributional family for the response) of the \emph{submodels}.
(However, the link and the inverse-link function of this \code{family} are also
used for quantities like predictions and fitted values related to the
\emph{reference model}.) May be \code{NULL} for \code{\link[=get_refmodel.default]{get_refmodel.default()}} in which
case the family is retrieved from \code{object}. For custom reference models,
\code{family} does not have to coincide with the family of the reference model
(if the reference model possesses a formal \code{family} at all). In typical
reference models, however, these families do coincide.}

\item{data}{A \code{data.frame} containing the data to use for the projection
predictive variable selection. Any \code{contrasts} attributes of the dataset's
columns are silently removed. For custom reference models, the columns of
\code{data} do not necessarily have to coincide with those of the dataset used
for fitting the reference model, but keep in mind that a row-subset of
\code{data} is used for argument \code{newdata} of \code{ref_predfun} during \eqn{K}-fold
CV.}

\item{ref_predfun}{Prediction function for the linear predictor of the
reference model, including offsets (if existing). See also section
"Arguments \code{ref_predfun}, \code{proj_predfun}, and \code{div_minimizer}" below. If
\code{object} is \code{NULL}, \code{ref_predfun} is ignored and an internal default is
used instead.}

\item{div_minimizer}{A function for minimizing the Kullback-Leibler (KL)
divergence from the reference model to a submodel (i.e., for performing the
projection of the reference model onto a submodel). The output of
\code{div_minimizer} is used, e.g., by \code{proj_predfun}'s argument \code{fits}. See
also section "Arguments \code{ref_predfun}, \code{proj_predfun}, and \code{div_minimizer}"
below.}

\item{proj_predfun}{Prediction function for the linear predictor of a
submodel onto which the reference model is projected. See also section
"Arguments \code{ref_predfun}, \code{proj_predfun}, and \code{div_minimizer}" below.}

\item{extract_model_data}{A function for fetching some variables (response,
observation weights, offsets) from the original dataset (supplied to
argument \code{data}) or from a new dataset. See also section "Argument
\code{extract_model_data}" below.}

\item{cvfun}{For \eqn{K}-fold CV only. A function that, given a fold indices
vector, fits the reference model separately for each fold and returns the
\eqn{K} model fits as a \code{list}. Each of the \eqn{K} model fits needs to be
a \code{list}. If \code{object} is \code{NULL}, \code{cvfun} may be \code{NULL} for using an
internal default. Only one of \code{cvfits} and \code{cvfun} needs to be provided
(for \eqn{K}-fold CV). Note that \code{cvfits} takes precedence over \code{cvfun},
i.e., if both are provided, \code{cvfits} is used.}

\item{cvfits}{For \eqn{K}-fold CV only. A \code{list} containing a sub-\code{list}
called \code{fits} containing the \eqn{K} model fits from which reference model
structures are created. The \code{cvfits} \code{list} (i.e., the super-\code{list}) needs
to have attributes \code{K} and \code{folds}: \code{K} has to be a single integer giving
the number of folds and \code{folds} has to be an integer vector giving the fold
indices (one fold index per observation). Each element of \code{cvfits$fits}
(i.e., each of the \eqn{K} model fits) needs to be a list. Only one of
\code{cvfits} and \code{cvfun} needs to be provided (for \eqn{K}-fold CV). Note that
\code{cvfits} takes precedence over \code{cvfun}, i.e., if both are provided,
\code{cvfits} is used.}

\item{dis}{A vector of posterior draws for the reference model's dispersion
parameter or---more precisely---the posterior values for the reference
model's parameter-conditional predictive variance (assuming that this
variance is the same for all observations). May be \code{NULL} if the submodels
have no dispersion parameter or if the submodels do have a dispersion
parameter, but \code{object} is \code{NULL} (in which case \code{0} is used for \code{dis}).
Note that for the \code{\link[=gaussian]{gaussian()}} \code{family}, \code{dis} is the standard deviation,
not the variance.}

\item{cvrefbuilder}{For \eqn{K}-fold CV only. A function that, given a
reference model fit for fold \eqn{k \in \{1, ..., K\}}{k = 1, ..., K} (this
model fit is the \eqn{k}-th element of the return value of \code{cvfun} or the
\eqn{k}-th element of \code{cvfits$fits}, extended by elements \code{omitted}
(containing the indices of the left-out observations in that fold) and
\code{projpred_k} (containing the integer \eqn{k})), returns an object of the
same type as \code{\link[=init_refmodel]{init_refmodel()}} does. Argument \code{cvrefbuilder} may be \code{NULL}
for using an internal default: \code{\link[=get_refmodel]{get_refmodel()}} if \code{object} is not \code{NULL}
and a function calling \code{\link[=init_refmodel]{init_refmodel()}} appropriately (with the assumption
\code{dis = 0}) if \code{object} is \code{NULL}.}
}
\value{
An object that can be passed to all the functions that take the
reference model fit as the first argument, such as \code{\link[=varsel]{varsel()}},
\code{\link[=cv_varsel]{cv_varsel()}}, \code{\link[=project]{project()}}, \code{\link[=proj_linpred]{proj_linpred()}}, and \code{\link[=proj_predict]{proj_predict()}}.
Usually, the returned object is of class \code{refmodel}. However, if \code{object}
is \code{NULL}, the returned object is of class \code{datafit} as well as of class
\code{refmodel} (with \code{datafit} being first). Objects of class \code{datafit} are
handled differently at several places throughout this package.
}
\description{
Function \code{\link[=get_refmodel]{get_refmodel()}} is a generic function whose methods usually call
\code{\link[=init_refmodel]{init_refmodel()}} which is the underlying workhorse (and may also be used
directly without a call to \code{\link[=get_refmodel]{get_refmodel()}}).

Both \code{\link[=get_refmodel]{get_refmodel()}} and \code{\link[=init_refmodel]{init_refmodel()}} create an object containing
information needed for the projection predictive variable selection, namely
about the reference model, the submodels, and how the projection should be
carried out. For the sake of simplicity, the documentation may refer to the
resulting object also as "reference model" or "reference model object", even
though it also contains information about the submodels and the projection.

A "typical" reference model object is created by \code{\link[=get_refmodel.stanreg]{get_refmodel.stanreg()}} and
\code{\link[brms:get_refmodel.brmsfit]{brms::get_refmodel.brmsfit()}}, either implicitly by a call to a top-level
function such as \code{\link[=project]{project()}}, \code{\link[=varsel]{varsel()}}, and \code{\link[=cv_varsel]{cv_varsel()}} or explicitly by
a call to \code{\link[=get_refmodel]{get_refmodel()}}. All non-"typical" reference model objects will be
called "custom" reference model objects.

Some arguments are for \eqn{K}-fold cross-validation (\eqn{K}-fold CV) only;
see \code{\link[=cv_varsel]{cv_varsel()}} for the use of \eqn{K}-fold CV in \pkg{projpred}.
}
\section{Formula terms}{
For additive models (still an experimental feature), only \code{\link[mgcv:s]{mgcv::s()}} and
\code{\link[mgcv:t2]{mgcv::t2()}} are currently supported as smooth terms. Furthermore, these need
to be called without any arguments apart from the predictor names (symbols).
For example, for smoothing the effect of a predictor \code{x}, only \code{s(x)} or
\code{t2(x)} are allowed. As another example, for smoothing the joint effect of
two predictors \code{x} and \code{z}, only \code{s(x, z)} or \code{t2(x, z)} are allowed (and
analogously for higher-order joint effects, e.g., of three predictors).
}

\section{Arguments \code{ref_predfun}, \code{proj_predfun}, and \code{div_minimizer}}{
Arguments \code{ref_predfun}, \code{proj_predfun}, and \code{div_minimizer} may be \code{NULL}
for using an internal default (see \link{projpred-package} for the functions used
by the default divergence minimizer). Otherwise, let \eqn{N} denote the
number of observations (in case of CV, these may be reduced to each fold),
\eqn{S_{\mathrm{ref}}}{S_ref} the number of posterior draws for the reference
model's parameters, and \eqn{S_{\mathrm{prj}}}{S_prj} the number of draws for
the parameters of a submodel that the reference model has been projected onto
(short: the number of projected draws). Then the functions supplied to these
arguments need to have the following prototypes:
\itemize{
\item \code{ref_predfun}: \code{ref_predfun(fit, newdata = NULL)} where:
\itemize{
\item \code{fit} accepts the reference model fit as given in argument \code{object}
(but possibly re-fitted to a subset of the observations, as done in
\eqn{K}-fold CV).
\item \code{newdata} accepts either \code{NULL} (for using the original dataset,
typically stored in \code{fit}) or data for new observations (at least in the
form of a \code{data.frame}).
}
\item \code{proj_predfun}: \code{proj_predfun(fits, newdata)} where:
\itemize{
\item \code{fits} accepts a \code{list} of length \eqn{S_{\mathrm{prj}}}{S_prj}
containing this number of submodel fits. This \code{list} is the same as that
returned by \code{\link[=project]{project()}} in its output element \code{submodl} (which in turn is
the same as the return value of \code{div_minimizer}, except if \code{\link[=project]{project()}}
was used with an \code{object} of class \code{vsel} based on an L1 search as well
as with \code{refit_prj = FALSE}).
\item \code{newdata} accepts data for new observations (at least in the form of a
\code{data.frame}).
}
\item \code{div_minimizer} does not need to have a specific prototype, but it needs to
be able to be called with the following arguments:
\itemize{
\item \code{formula} accepts either a standard \code{\link{formula}} with a single response
(if \eqn{S_{\mathrm{prj}} = 1}{S_prj = 1}) or a \code{\link{formula}} with
\eqn{S_{\mathrm{prj}} > 1}{S_prj > 1} response variables \code{\link[=cbind]{cbind()}}-ed on
the left-hand side in which case the projection has to be performed for
each of the response variables separately.
\item \code{data} accepts a \code{data.frame} to be used for the projection.
\item \code{family} accepts an object of class \code{family}.
\item \code{weights} accepts either observation weights (at least in the form of a
numeric vector) or \code{NULL} (for using a vector of ones as weights).
\item \code{projpred_var} accepts an \eqn{N \times S_{\mathrm{prj}}}{N x S_prj}
matrix of predictive variances (necessary for \pkg{projpred}'s internal
GLM fitter).
\item \code{projpred_regul} accepts a single numeric value as supplied to argument
\code{regul} of \code{\link[=project]{project()}}, for example.
\item \code{...} accepts further arguments specified by the user.
}
}

The return value of these functions needs to be:
\itemize{
\item \code{ref_predfun}: an \eqn{N \times S_{\mathrm{ref}}}{N x S_ref} matrix.
\item \code{proj_predfun}: an \eqn{N \times S_{\mathrm{prj}}}{N x S_prj} matrix.
\item \code{div_minimizer}: a \code{list} of length \eqn{S_{\mathrm{prj}}}{S_prj}
containing this number of submodel fits.
}
}

\section{Argument \code{extract_model_data}}{
The function supplied to argument \code{extract_model_data} needs to have the
prototype

\if{html}{\out{<div class="sourceCode r">}}\preformatted{extract_model_data(object, newdata, wrhs = NULL, orhs = NULL, extract_y = TRUE)
}\if{html}{\out{</div>}}

where:
\itemize{
\item \code{object} accepts the reference model fit as given in argument \code{object} (but
possibly re-fitted to a subset of the observations, as done in \eqn{K}-fold
CV).
\item \code{newdata} accepts either \code{NULL} (for using the original dataset, typically
stored in \code{object}) or data for new observations (at least in the form of a
\code{data.frame}).
\item \code{wrhs} accepts at least either \code{NULL} (for using a vector of ones) or a
right-hand side formula consisting only of the variable in \code{newdata}
containing the weights.
\item \code{orhs} accepts at least either \code{NULL} (for using a vector of zeros) or a
right-hand side formula consisting only of the variable in \code{newdata}
containing the offsets.
\item \code{extract_y} accepts a single logical value indicating whether output
element \code{y} (see below) shall be \code{NULL} (\code{FALSE}) or not (\code{TRUE}).
}

The return value of \code{extract_model_data} needs to be a \code{list} with elements
\code{y}, \code{weights}, and \code{offset}, each being a numeric vector containing the data
for the response, the observation weights, and the offsets, respectively. An
exception is that \code{y} may also be \code{NULL} (depending on argument \code{extract_y})
or a \code{factor}.

The weights and offsets returned by \code{extract_model_data} will be assumed to
hold for the reference model as well as for the submodels.
}

\examples{
if (requireNamespace("rstanarm", quietly = TRUE)) {
  # Data:
  dat_gauss <- data.frame(y = df_gaussian$y, df_gaussian$x)

  # The "stanreg" fit which will be used as the reference model (with small
  # values for `chains` and `iter`, but only for technical reasons in this
  # example; this is not recommended in general):
  fit <- rstanarm::stan_glm(
    y ~ X1 + X2 + X3 + X4 + X5, family = gaussian(), data = dat_gauss,
    QR = TRUE, chains = 2, iter = 500, refresh = 0, seed = 9876
  )

  # Define the reference model explicitly:
  ref <- get_refmodel(fit)
  print(class(ref)) # gives `"refmodel"`
  # Now see, for example, `?varsel`, `?cv_varsel`, and `?project` for
  # possible post-processing functions. Most of the post-processing functions
  # call get_refmodel() internally at the beginning, so you will rarely need
  # to call get_refmodel() yourself.

  # A custom reference model which may be used in a variable selection where
  # the candidate predictors are not a subset of those used for the reference
  # model's predictions:
  ref_cust <- init_refmodel(
    fit,
    data = dat_gauss,
    formula = y ~ X6 + X7,
    family = gaussian(),
    extract_model_data = function(object, newdata = NULL, wrhs = NULL,
                                  orhs = NULL, extract_y = TRUE) {
      # A `NULL` response formula is used when the response is not requested
      # (`extract_y = FALSE`); otherwise the response is column `y`:
      resp_form <- if (extract_y) ~ y else NULL

      # Fall back to the original dataset if no new data is supplied:
      if (is.null(newdata)) {
        newdata <- dat_gauss
      }

      # The variable names must be kept as they are because nlist() uses them
      # as the names of the arguments passed on below:
      args <- projpred:::nlist(object, newdata, wrhs, orhs, resp_form)
      return(projpred::do_call(projpred:::.extract_model_data, args))
    },
    cvfun = function(folds) {
      # rstanarm is only loaded via requireNamespace() above (i.e., it is not
      # attached), so kfold() needs to be namespace-qualified here; this
      # function is only needed for K-fold CV:
      rstanarm::kfold(
        fit, K = max(folds), save_fits = TRUE, folds = folds, cores = 1
      )$fits[, "fit"]
    },
    dis = as.matrix(fit)[, "sigma"]
  )
  # Now, the post-processing functions mentioned above (for example,
  # varsel(), cv_varsel(), and project()) may be applied to `ref_cust`.
}

}