File: split.Rd

package info (click to toggle)
r-cran-data.table 1.12.0%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 13,084 kB
  • sloc: ansic: 12,667; sh: 13; makefile: 6
file content (76 lines) | stat: -rw-r--r-- 3,641 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
\name{split}
\alias{split}
\alias{split.data.table}
\title{ Split data.table into chunks in a list }
\description{
  Split method for data.table. Faster and more flexible. Be aware that processing list of data.tables will be generally much slower than manipulation in single data.table by group using \code{by} argument, read more on \code{\link{data.table}}.
}
\usage{
\method{split}{data.table}(x, f, drop = FALSE,
      by, sorted = FALSE, keep.by = TRUE, flatten = TRUE,
      \dots, verbose = getOption("datatable.verbose"))
}
\arguments{
  \item{x}{data.table }
  \item{f}{factor or list of factors. Same as \code{\link[base:split]{split.data.frame}}. Use \code{by} argument instead, this is just for consistency with data.frame method.}
  \item{drop}{logical. Default \code{FALSE} will not drop empty list elements caused by factor levels not referred by that factors. Works also with new arguments of split data.table method.}
  \item{by}{character vector. Column names on which split should be made. For \code{length(by) > 1L} and \code{flatten} FALSE it will result nested lists with data.tables on leafs.}
  \item{sorted}{When default \code{FALSE} it will retain the order of groups we are splitting on. When \code{TRUE} then sorted list(s) are returned. Does not have effect for \code{f} argument.}
  \item{keep.by}{logical default \code{TRUE}. Keep column provided to \code{by} argument.}
  \item{flatten}{logical default \code{TRUE} will unlist nested lists of data.tables. When using \code{f} results are always flattened to list of data.tables.}
  \item{\dots}{passed to data.frame way of processing when using \code{f} argument.}
  \item{verbose}{logical default \code{FALSE}. When \code{TRUE} it will print to console data.table split query used to split data.}
}
\details{
    Argument \code{f} is just for consistency in usage to data.frame method. Recommended is to use \code{by} argument instead, it will be faster, more flexible, and by default will preserve order according to order in data.
}
\value{
    List of \code{data.table}s. If using \code{flatten} FALSE and \code{length(by) > 1L} then recursively nested lists having \code{data.table}s as leafs of grouping according to \code{by} argument.
}
\seealso{ \code{\link{data.table}}, \code{\link{rbindlist}} }
\examples{
set.seed(123)
DT = data.table(x1 = rep(letters[1:2], 6),
                x2 = rep(letters[3:5], 4),
                x3 = rep(letters[5:8], 3),
                y = rnorm(12))
DT = DT[sample(.N)]
DF = as.data.frame(DT)

# split consistency with data.frame: `x, f, drop`
all.equal(
    split(DT, list(DT$x1, DT$x2)),
    lapply(split(DF, list(DF$x1, DF$x2)), setDT)
)

# nested list using `flatten` arguments
split(DT, by=c("x1", "x2"))
split(DT, by=c("x1", "x2"), flatten=FALSE)

# dealing with factors
fdt = DT[, c(lapply(.SD, as.factor), list(y=y)), .SDcols=x1:x3]
fdf = as.data.frame(fdt)
sdf = split(fdf, list(fdf$x1, fdf$x2))
all.equal(
    split(fdt, by=c("x1", "x2"), sorted=TRUE),
    lapply(sdf[sort(names(sdf))], setDT)
)

# factors having unused levels, drop FALSE, TRUE
fdt = DT[, .(x1 = as.factor(c(as.character(x1), "c"))[-13L],
             x2 = as.factor(c("a", as.character(x2)))[-1L],
             x3 = as.factor(c("a", as.character(x3), "z"))[c(-1L,-14L)],
             y = y)]
fdf = as.data.frame(fdt)
sdf = split(fdf, list(fdf$x1, fdf$x2))
all.equal(
    split(fdt, by=c("x1", "x2"), sorted=TRUE),
    lapply(sdf[sort(names(sdf))], setDT)
)
sdf = split(fdf, list(fdf$x1, fdf$x2), drop=TRUE)
all.equal(
    split(fdt, by=c("x1", "x2"), sorted=TRUE, drop=TRUE),
    lapply(sdf[sort(names(sdf))], setDT)
)
}
\keyword{ data }