1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239
|
guess <- function(x) {
if ("value" %chin% names(x))
return("value")
if ("(all)" %chin% names(x))
return("(all)")
var <- names(x)[ncol(x)]
message("Using '", var, "' as value column. Use 'value.var' to override")
return(var)
}
dcast <- function(data, formula, fun.aggregate = NULL, ..., margins = NULL,
subset = NULL, fill = NULL, value.var = guess(data)) {
if (is.data.table(data))
UseMethod("dcast", data)
else {
# reshape2::dcast is not generic so we have to call it explicitly. See comments at the top of fmelt.R too.
# nocov start
ns = tryCatch(getNamespace("reshape2"), error=function(e)
stop("The dcast generic in data.table has been passed a ",class(data)[1L]," (not a data.table) but the reshape2 package is not installed to process this type. Please either install reshape2 and try again, or pass a data.table to dcast instead."))
ns$dcast(data, formula, fun.aggregate = fun.aggregate, ..., margins = margins,
subset = subset, fill = fill, value.var = value.var)
# nocov end
}
}
check_formula <- function(formula, varnames, valnames) {
if (is.character(formula)) formula = as.formula(formula)
if (!inherits(formula, "formula") || length(formula) != 3L)
stop("Invalid formula. Cast formula should be of the form LHS ~ RHS, for e.g., a + b ~ c.") # nocov; couldn't find a way to construct a test formula with length!=3L
vars = all.vars(formula)
vars = vars[!vars %chin% c(".", "...")]
allvars = c(vars, valnames)
if (any(allvars %chin% varnames[duplicated(varnames)]))
stop('data.table to cast must have unique column names')
ans = deparse_formula(as.list(formula)[-1L], varnames, allvars)
}
deparse_formula <- function(expr, varnames, allvars) {
lvars = lapply(expr, function(this) {
if (is.call(this)) {
if (this[[1L]] == quote(`+`))
unlist(deparse_formula(as.list(this)[-1L], varnames, allvars))
else this
} else if (is.name(this)) {
if (this == quote(`...`)) {
subvars = setdiff(varnames, allvars)
lapply(subvars, as.name)
} else this
}
})
lvars = lapply(lvars, function(x) if (length(x) && !is.list(x)) list(x) else x)
}
value_vars <- function(value.var, varnames) {
if (is.character(value.var))
value.var = list(value.var)
value.var = lapply(value.var, unique)
valnames = unique(unlist(value.var))
iswrong = which(!valnames %chin% varnames)
if (length(iswrong))
stop("value.var values [", paste(value.var[iswrong], collapse=", "), "] are not found in 'data'.")
value.var
}
aggregate_funs <- function(funs, vals, sep="_", ...) {
if (is.call(funs) && funs[[1L]] == "eval")
funs = eval(funs[[2L]], parent.frame(2L), parent.frame(2L))
if (is.call(funs) && as.character(funs[[1L]]) %chin% c("c", "list"))
funs = lapply(as.list(funs)[-1L], function(x) {
if (is.call(x) && as.character(x[[1L]]) %chin% c("c", "list")) as.list(x)[-1L] else x
})
else funs = list(funs)
if (length(funs) != length(vals)) {
if (length(vals) == 1L)
vals = replicate(length(funs), vals)
else stop("When 'fun.aggregate' and 'value.var' are both lists, 'value.var' must be either of length =1 or =length(fun.aggregate).")
}
only_one_fun = length(unlist(funs)) == 1L
dots = list(...)
construct_funs <- function(fun, val) {
if (!is.list(fun)) fun = list(fun)
ans = vector("list", length(fun)*length(val))
nms = vector("character", length(ans))
k = 1L
for (i in fun) {
for (j in val) {
expr = list(i, as.name(j))
if (length(dots))
expr = c(expr, dots)
ans[[k]] = as.call(expr)
# changed order of arguments here, #1153
nms[k] = if (only_one_fun) j else
paste(j, all.names(i, max.names=1L, functions=TRUE), sep=sep)
k = k+1L;
}
}
setattr(ans, 'names', nms)
}
ans = mapply(construct_funs, funs, vals, SIMPLIFY=FALSE)
as.call(c(quote(list), unlist(ans)))
}
dcast.data.table <- function(data, formula, fun.aggregate = NULL, sep = "_", ..., margins = NULL, subset = NULL, fill = NULL, drop = TRUE, value.var = guess(data), verbose = getOption("datatable.verbose")) {
if (!is.data.table(data)) stop("'data' must be a data.table.")
drop = as.logical(rep(drop, length.out=2L))
if (anyNA(drop)) stop("'drop' must be logical TRUE/FALSE")
lvals = value_vars(value.var, names(data))
valnames = unique(unlist(lvals))
lvars = check_formula(formula, names(data), valnames)
lvars = lapply(lvars, function(x) if (!length(x)) quote(`.`) else x)
# tired of lapply and the way it handles environments!
allcols = c(unlist(lvars), lapply(valnames, as.name))
dat = vector("list", length(allcols))
for (i in seq_along(allcols)) {
x = allcols[[i]]
dat[[i]] = if (identical(x, quote(`.`))) rep(".", nrow(data))
else eval(x, data, parent.frame())
if (is.function(dat[[i]]))
stop("Column [", deparse(x), "] not found or of unknown type.")
}
setattr(lvars, 'names', c("lhs", "rhs"))
# Have to take care of duplicate names, and provide names for expression columns properly.
varnames = make.unique(vapply_1c(unlist(lvars), all.vars, max.names=1L), sep=sep)
dupidx = which(valnames %chin% varnames)
if (length(dupidx)) {
dups = valnames[dupidx]
valnames = tail(make.unique(c(varnames, valnames)), -length(varnames))
lvals = lapply(lvals, function(x) { x[x %chin% dups] = valnames[dupidx]; x })
}
lhsnames = head(varnames, length(lvars$lhs))
rhsnames = tail(varnames, -length(lvars$lhs))
setattr(dat, 'names', c(varnames, valnames))
setDT(dat)
if (any(vapply_1b(as.list(dat)[varnames], is.list))) {
stop("Columns specified in formula can not be of type list")
}
m <- as.list(match.call()[-1L])
subset <- m[["subset"]][[2L]]
if (!is.null(subset)) {
if (is.name(subset)) subset = as.call(list(quote(`(`), subset))
idx = which(eval(subset, data, parent.frame())) # any advantage thro' secondary keys?
dat = .Call(CsubsetDT, dat, idx, seq_along(dat))
}
if (!nrow(dat) || !ncol(dat)) stop("Can not cast an empty data.table")
fun.call = m[["fun.aggregate"]]
fill.default = NULL
if (is.null(fun.call)) {
oo = forderv(dat, by=varnames, retGrp=TRUE)
if (attr(oo, 'maxgrpn') > 1L) {
message("Aggregate function missing, defaulting to 'length'")
fun.call = quote(length)
}
}
if (!is.null(fun.call)) {
fun.call = aggregate_funs(fun.call, lvals, sep, ...)
errmsg = "Aggregating function(s) should take vector inputs and return a single value (length=1). However, function(s) returns length!=1. This value will have to be used to fill any missing combinations, and therefore must be length=1. Either override by setting the 'fill' argument explicitly or modify your function to handle this case appropriately."
if (is.null(fill)) {
fill.default <- suppressWarnings(dat[0L][, eval(fun.call)])
# tryCatch(fill.default <- dat[0L][, eval(fun.call)], error = function(x) stop(errmsg, call.=FALSE))
if (nrow(fill.default) != 1L) stop(errmsg, call.=FALSE)
}
if (!any(valnames %chin% varnames)) {
dat = dat[, eval(fun.call), by=c(varnames)]
} else {
dat = dat[, { .SD; eval(fun.call) }, by=c(varnames), .SDcols = valnames]
}
}
order_ <- function(x) {
o = forderv(x, retGrp=TRUE, sort=TRUE)
idx = attr(o, 'starts')
if (!length(o)) o = seq_along(x)
o[idx] # subsetVector retains attributes, using R's subset for now
}
cj_uniq <- function(DT) {
do.call("CJ", lapply(DT, function(x)
if (is.factor(x)) {
xint = seq_along(levels(x))
setattr(xint, 'levels', levels(x))
setattr(xint, 'class', class(x))
} else .Call(CsubsetVector, x, order_(x))
))}
valnames = setdiff(names(dat), varnames)
# 'dat' != 'data'? then setkey to speed things up (slightly), else ad-hoc (for now). Still very fast!
if (!is.null(fun.call) || !is.null(subset))
setkeyv(dat, varnames)
if (length(rhsnames)) {
lhs = shallow(dat, lhsnames); rhs = shallow(dat, rhsnames); val = shallow(dat, valnames)
# handle drop=TRUE/FALSE - Update: Logic moved to R, AND faster than previous version. Take that... old me :-).
if (all(drop)) {
map = setDT(lapply(list(lhsnames, rhsnames), function(cols) frankv(dat, cols=cols, ties.method="dense")))
maporder = lapply(map, order_)
mapunique = lapply(seq_along(map), function(i) .Call(CsubsetVector, map[[i]], maporder[[i]]))
lhs = .Call(CsubsetDT, lhs, maporder[[1L]], seq_along(lhs))
rhs = .Call(CsubsetDT, rhs, maporder[[2L]], seq_along(rhs))
} else {
lhs_ = if (!drop[1L]) cj_uniq(lhs) else setkey(unique(lhs, by=names(lhs)))
rhs_ = if (!drop[2L]) cj_uniq(rhs) else setkey(unique(rhs, by=names(rhs)))
map = vector("list", 2L)
.Call(Csetlistelt, map, 1L, lhs_[lhs, which=TRUE])
.Call(Csetlistelt, map, 2L, rhs_[rhs, which=TRUE])
setDT(map)
mapunique = vector("list", 2L)
.Call(Csetlistelt, mapunique, 1L, seq_len(nrow(lhs_)))
.Call(Csetlistelt, mapunique, 2L, seq_len(nrow(rhs_)))
lhs = lhs_; rhs = rhs_
}
maplen = vapply_1i(mapunique, length)
idx = do.call("CJ", mapunique)[map, I := .I][["I"]] # TO DO: move this to C and avoid materialising the Cross Join.
ans = .Call(Cfcast, lhs, val, maplen[[1L]], maplen[[2L]], idx, fill, fill.default, is.null(fun.call))
allcols = do.call("paste", c(rhs, sep=sep))
if (length(valnames) > 1L)
allcols = do.call("paste", if (identical(".", allcols)) list(valnames, sep=sep)
else c(CJ(valnames, allcols, sorted=FALSE), sep=sep))
# removed 'setcolorder()' here, #1153
setattr(ans, 'names', c(lhsnames, allcols))
setDT(ans); setattr(ans, 'sorted', lhsnames)
} else {
# formula is of the form x + y ~ . (rare case)
if (drop) {
if (is.null(subset) && is.null(fun.call)) {
dat = copy(dat) # can't be avoided
setkeyv(dat, lhsnames)
}
ans = dat
} else {
lhs = shallow(dat, lhsnames)
val = shallow(dat, valnames)
lhs_ = cj_uniq(lhs)
idx = lhs_[lhs, I := .I][["I"]]
lhs_[, I := NULL]
ans = .Call(Cfcast, lhs_, val, nrow(lhs_), 1L, idx, fill, fill.default, is.null(fun.call))
setDT(ans); setattr(ans, 'sorted', lhsnames)
setnames(ans, c(lhsnames, valnames))
}
if (length(valnames) == 1L)
setnames(ans, valnames, value.var)
}
return (ans)
}
|