File: kNN.Rd

package info (click to toggle)
r-cran-vim 6.2.2%2Bdfsg-1
links: PTS, VCS
area: main
in suites: bookworm, forky, sid, trixie
size: 1,556 kB
sloc: cpp: 141; sh: 12; makefile: 2
file content (132 lines) | stat: -rw-r--r-- 4,254 bytes
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/kNN.R
\name{kNN}
\alias{kNN}
\title{k-Nearest Neighbour Imputation}
\usage{
kNN(
  data,
  variable = colnames(data),
  metric = NULL,
  k = 5,
  dist_var = colnames(data),
  weights = NULL,
  numFun = median,
  catFun = maxCat,
  makeNA = NULL,
  NAcond = NULL,
  impNA = TRUE,
  donorcond = NULL,
  mixed = vector(),
  mixed.constant = NULL,
  trace = FALSE,
  imp_var = TRUE,
  imp_suffix = "imp",
  addRF = FALSE,
  onlyRF = FALSE,
  addRandom = FALSE,
  useImputedDist = TRUE,
  weightDist = FALSE,
  methodStand = "range",
  ordFun = medianSamp
)
}
\arguments{
\item{data}{data.frame or matrix}

\item{variable}{variables where missing values should be imputed}

\item{metric}{metric to be used for calculating the distances between observations}

\item{k}{number of Nearest Neighbours used}

\item{dist_var}{names of variables to be used for distance calculation}

\item{weights}{weights for the variables for distance calculation.
If \code{weights = "auto"} weights will be selected based on variable importance from random forest regression, using function \code{\link[ranger:ranger]{ranger::ranger()}}.
Weights are calculated for each variable separately.}

\item{numFun}{function for aggregating the k Nearest Neighbours in the case
of a numerical variable}

\item{catFun}{function for aggregating the k Nearest Neighbours in the case
of a categorical variable}

\item{makeNA}{list of length equal to the number of variables, with values, that should be converted to NA for each variable}

\item{NAcond}{list of length equal to the number of variables, with a condition for imputing a NA}

\item{impNA}{TRUE/FALSE whether NA should be imputed}

\item{donorcond}{list of length equal to the number of variables, with a donor condition as character string.
E.g. a list element can be ">5" or c(">5","<10"). If the list element for a variable is NULL, no condition will be applied for this variable.}

\item{mixed}{names of mixed variables}

\item{mixed.constant}{vector with length equal to the number of
semi-continuous variables specifying the point of the semi-continuous
distribution with non-zero probability}

\item{trace}{TRUE/FALSE if additional information about the imputation
process should be printed}

\item{imp_var}{TRUE/FALSE if a TRUE/FALSE variable for each imputed
variable should be created showing the imputation status}

\item{imp_suffix}{suffix for the TRUE/FALSE variables showing the imputation
status}

\item{addRF}{TRUE/FALSE if TRUE each variable will be modelled using random forest regression (\code{\link[ranger:ranger]{ranger::ranger()}}) and used as additional distance variable.}

\item{onlyRF}{TRUE/FALSE if TRUE only additional distance variables created from random forest regression will be used as distance variables.}

\item{addRandom}{TRUE/FALSE if an additional random variable should be added
for distance calculation}

\item{useImputedDist}{TRUE/FALSE if an imputed value should be used for distance calculation for imputing another variable.
Be aware that this results in a dependency on the ordering of the variables.}

\item{weightDist}{TRUE/FALSE if the distances of the k nearest neighbours should be used as weights in the
aggregation step}

\item{methodStand}{either "range" or "iqr" to be used in the standardization of numeric variables in the Gower distance}

\item{ordFun}{function for aggregating the k Nearest Neighbours in the case
of an ordered factor variable}
}
\value{
the imputed data set.
}
\description{
k-Nearest Neighbour Imputation based on a variation of the Gower Distance
for numerical, categorical, ordered and semi-continuous variables.
}
\examples{

data(sleep)
kNN(sleep)
library(laeken)
kNN(sleep, numFun = weightedMean, weightDist=TRUE)

}
\references{
A. Kowarik, M. Templ (2016) Imputation with
the R Package VIM.  \emph{Journal of
Statistical Software}, 74(7), 1-16.
}
\seealso{
Other imputation methods: 
\code{\link{hotdeck}()},
\code{\link{impPCA}()},
\code{\link{irmi}()},
\code{\link{matchImpute}()},
\code{\link{medianSamp}()},
\code{\link{rangerImpute}()},
\code{\link{regressionImp}()},
\code{\link{sampleCat}()}
}
\author{
Alexander Kowarik, Statistik Austria
}
\concept{imputation methods}
\keyword{manip}