File: discretize.Rd

package info (click to toggle)
r-cran-recipes 0.1.15%2Bdfsg-1
links: PTS, VCS
area: main
in suites: bullseye
size: 2,496 kB
sloc: sh: 37; makefile: 2
file content (116 lines) | stat: -rw-r--r-- 3,513 bytes
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/discretize.R
\name{discretize}
\alias{discretize}
\alias{discretize.default}
\alias{discretize.numeric}
\alias{predict.discretize}
\title{Discretize Numeric Variables}
\usage{
discretize(x, ...)

\method{discretize}{default}(x, ...)

\method{discretize}{numeric}(
  x,
  cuts = 4,
  labels = NULL,
  prefix = "bin",
  keep_na = TRUE,
  infs = TRUE,
  min_unique = 10,
  ...
)

\method{predict}{discretize}(object, new_data, ...)
}
\arguments{
\item{x}{A numeric vector}

\item{...}{Options to pass to
\code{\link[stats:quantile]{stats::quantile()}} that should not include \code{x}
or \code{probs}.}

\item{cuts}{An integer defining how many cuts to make of the
data.}

\item{labels}{A character vector defining the factor levels
that will be in the new factor (from smallest to largest). This
should have length \code{cuts+1} and should not include a level
for missing (see \code{keep_na} below).}

\item{prefix}{A single character string to be used as a prefix
for the factor levels (e.g. \code{bin1}, \code{bin2}, ...). If
the string is not a valid R name, it is coerced to one.}

\item{keep_na}{A logical for whether a factor level should be
created to identify missing values in \code{x}.}

\item{infs}{A logical indicating whether the smallest and
largest cut point should be infinite.}

\item{min_unique}{An integer defining the minimum sample size
threshold for the binning. If the number of unique values
divided by \code{cuts+1} is less than \code{min_unique}, no
discretization takes place.}

\item{object}{An object of class \code{discretize}.}

\item{new_data}{A new numeric object to be binned.}
}
\value{
\code{discretize} returns an object of class
\code{discretize} and \code{predict.discretize} returns a factor
vector.
}
\description{
\code{discretize} converts a numeric vector into a factor with
bins having approximately the same number of data points (based
on a training set).
}
\details{
\code{discretize} estimates the cut points from
\code{x} using percentiles. For example, if \code{cuts = 4}, the
function estimates the quartiles of \code{x} and uses these as
the cut points. If \code{cuts = 2}, the bins are defined as
being above or below the median of \code{x}.

The \code{predict} method can then be used to turn numeric
vectors into factor vectors.

If \code{keep_na = TRUE}, a suffix of "_missing" is used as a
factor level (see the examples below).

If \code{infs = FALSE} and a new value is greater than the
largest value of \code{x} (or smaller than the smallest value of
\code{x}), a missing value will result.
}
\examples{
library(modeldata)
data(biomass)

biomass_tr <- biomass[biomass$dataset == "Training",]
biomass_te <- biomass[biomass$dataset == "Testing",]

median(biomass_tr$carbon)
discretize(biomass_tr$carbon, cuts = 2)
discretize(biomass_tr$carbon, cuts = 2, infs = FALSE)
discretize(biomass_tr$carbon, cuts = 2, infs = FALSE, keep_na = FALSE)
discretize(biomass_tr$carbon, cuts = 2, prefix = "maybe a bad idea to bin")

carbon_binned <- discretize(biomass_tr$carbon)
table(predict(carbon_binned, biomass_tr$carbon))

carbon_no_infs <- discretize(biomass_tr$carbon, infs = FALSE)
predict(carbon_no_infs, c(50, 100))

rec <- recipe(HHV ~ carbon + hydrogen + oxygen + nitrogen + sulfur,
              data = biomass_tr)
rec <- rec \%>\% step_discretize(carbon, hydrogen)
rec <- prep(rec, biomass_tr)
binned_te <- bake(rec, biomass_te)
table(binned_te$carbon)
}
\concept{discretization}
\concept{factors}
\concept{preprocessing}
\keyword{datagen}