1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225
|
guess = function(x) {
if ("value" %chin% names(x))
return("value")
if ("(all)" %chin% names(x))
return("(all)")
var = names(x)[ncol(x)]
message("Using '", var, "' as value column. Use 'value.var' to override")
return(var)
}
dcast <- function(
data, formula, fun.aggregate = NULL, ..., margins = NULL,
subset = NULL, fill = NULL, value.var = guess(data)
) {
if (is.data.table(data)) UseMethod("dcast", data)
# nocov start
else {
data_name = deparse(substitute(data))
ns = tryCatch(getNamespace("reshape2"), error=function(e)
stop("The dcast generic in data.table has been passed a ",class(data)[1L],", but data.table::dcast currently only has a method for data.tables. Please confirm your input is a data.table, with setDT(", data_name, ") or as.data.table(", data_name, "). If you intend to use a reshape2::dcast, try installing that package first, but do note that reshape2 is deprecated and you should be migrating your code away from using it."))
warning("The dcast generic in data.table has been passed a ", class(data)[1L], " and will attempt to redirect to the reshape2::dcast; please note that reshape2 is deprecated, and this redirection is now deprecated as well. Please do this redirection yourself like reshape2::dcast(", data_name, "). In the next version, this warning will become an error.")
ns$dcast(data, formula, fun.aggregate = fun.aggregate, ..., margins = margins,
subset = subset, fill = fill, value.var = value.var)
}
# nocov end
}
check_formula = function(formula, varnames, valnames) {
if (is.character(formula)) formula = as.formula(formula)
if (!inherits(formula, "formula") || length(formula) != 3L)
stop("Invalid formula. Cast formula should be of the form LHS ~ RHS, for e.g., a + b ~ c.") # nocov; couldn't find a way to construct a test formula with length!=3L
vars = all.vars(formula)
vars = vars[!vars %chin% c(".", "...")]
allvars = c(vars, valnames)
if (any(allvars %chin% varnames[duplicated(varnames)]))
stop('data.table to cast must have unique column names')
deparse_formula(as.list(formula)[-1L], varnames, allvars)
}
deparse_formula = function(expr, varnames, allvars) {
lvars = lapply(expr, function(this) {
if (!is.language(this)) return(NULL)
if (this %iscall% '+') return(unlist(deparse_formula(this[-1L], varnames, allvars)))
if (is.name(this) && this == quote(`...`)) {
subvars = setdiff(varnames, allvars)
return(lapply(subvars, as.name))
}
this
})
lvars = lapply(lvars, function(x) if (length(x) && !is.list(x)) list(x) else x)
}
value_vars = function(value.var, varnames) {
if (is.character(value.var))
value.var = list(value.var)
value.var = lapply(value.var, unique)
valnames = unique(unlist(value.var))
iswrong = which(!valnames %chin% varnames)
if (length(iswrong))
stop("value.var values [", paste(value.var[iswrong], collapse=", "), "] are not found in 'data'.")
value.var
}
aggregate_funs = function(funs, vals, sep="_", ...) {
if (funs %iscall% 'eval')
funs = eval(funs[[2L]], parent.frame(2L), parent.frame(2L))
if (funs %iscall% c('c', 'list')) {
funs = lapply(as.list(funs)[-1L], function(x) {
if (x %iscall% c('c', 'list')) as.list(x)[-1L] else x
})
} else funs = eval(funs, parent.frame(2L), parent.frame(2L))
if(is.function(funs)) funs = list(funs) # needed for cases as shown in test#1700.1
if (length(funs) != length(vals)) {
if (length(vals) == 1L)
vals = replicate(length(funs), vals)
else stop("When 'fun.aggregate' and 'value.var' are both lists, 'value.var' must be either of length =1 or =length(fun.aggregate).")
}
only_one_fun = length(unlist(funs)) == 1L
dots = list(...)
construct_funs = function(fun, nm, val) {
ans = vector("list", length(fun)*length(val))
nms = vector("character", length(ans))
k = 1L
for (i in fun) {
for (j in val) {
expr = list(i, as.name(j))
if (length(dots))
expr = c(expr, dots)
ans[[k]] = as.call(expr)
# changed order of arguments here, #1153
nms[k] = if (only_one_fun) j else paste(j, nm, sep=sep)
k = k+1L;
}
}
setattr(ans, 'names', nms)
}
ans = lapply(seq_along(funs), function(i) {
nm = names(funs[i])
if (is.null(nm) || !nzchar(nm)) {
nm = all.names(funs[[i]], max.names=1L, functions=TRUE)
}
if (!length(nm)) nm <- paste0("fun", i)
construct_funs(funs[i], nm, vals[[i]])
})
as.call(c(quote(list), unlist(ans)))
}
dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ..., margins = NULL, subset = NULL, fill = NULL, drop = TRUE, value.var = guess(data), verbose = getOption("datatable.verbose")) {
if (!is.data.table(data)) stop("'data' must be a data.table.")
drop = as.logical(rep(drop, length.out=2L))
if (anyNA(drop)) stop("'drop' must be logical TRUE/FALSE")
# #2980 if explicitly providing fun.aggregate=length but not a value.var,
# just use the last column (as guess(data) would do) because length will be
# the same on all columns
if (missing(value.var) && !missing(fun.aggregate) && identical(fun.aggregate, length))
value.var = names(data)[ncol(data)]
lvals = value_vars(value.var, names(data))
valnames = unique(unlist(lvals))
lvars = check_formula(formula, names(data), valnames)
lvars = lapply(lvars, function(x) if (length(x)) x else quote(`.`))
# tired of lapply and the way it handles environments!
allcols = c(unlist(lvars), lapply(valnames, as.name))
dat = vector("list", length(allcols))
for (i in seq_along(allcols)) {
x = allcols[[i]]
dat[[i]] = if (identical(x, quote(`.`))) rep(".", nrow(data)) else eval(x, data, parent.frame())
if (is.function(dat[[i]]))
stop("Column [", deparse(x), "] not found or of unknown type.")
}
setattr(lvars, 'names', c("lhs", "rhs"))
# Have to take care of duplicate names, and provide names for expression columns properly.
varnames = make.unique(vapply_1c(unlist(lvars), all.vars, max.names=1L), sep=sep)
dupidx = which(valnames %chin% varnames)
if (length(dupidx)) {
dups = valnames[dupidx]
valnames = tail(make.unique(c(varnames, valnames)), -length(varnames))
lvals = lapply(lvals, function(x) { x[x %chin% dups] = valnames[dupidx]; x })
}
lhsnames = head(varnames, length(lvars$lhs))
rhsnames = tail(varnames, -length(lvars$lhs))
setattr(dat, 'names', c(varnames, valnames))
if (any(vapply_1b(dat[varnames], is.list))) {
stop("Columns specified in formula can not be of type list")
}
setDT(dat)
m = as.list(match.call()[-1L])
subset = m[["subset"]][[2L]]
if (!is.null(subset)) {
if (is.name(subset)) subset = as.call(list(quote(`(`), subset))
idx = which(eval(subset, data, parent.frame())) # any advantage thro' secondary keys?
dat = .Call(CsubsetDT, dat, idx, seq_along(dat))
}
if (!nrow(dat) || !ncol(dat)) stop("Can not cast an empty data.table")
fun.call = m[["fun.aggregate"]]
fill.default = NULL
if (is.null(fun.call)) {
oo = forderv(dat, by=varnames, retGrp=TRUE)
if (attr(oo, 'maxgrpn', exact=TRUE) > 1L) {
message("Aggregate function missing, defaulting to 'length'")
fun.call = quote(length)
}
}
if (!is.null(fun.call)) {
fun.call = aggregate_funs(fun.call, lvals, sep, ...)
errmsg = "Aggregating function(s) should take vector inputs and return a single value (length=1). However, function(s) returns length!=1. This value will have to be used to fill any missing combinations, and therefore must be length=1. Either override by setting the 'fill' argument explicitly or modify your function to handle this case appropriately."
if (is.null(fill)) {
fill.default = suppressWarnings(dat[0L][, eval(fun.call)])
# tryCatch(fill.default <- dat[0L][, eval(fun.call)], error = function(x) stop(errmsg, call.=FALSE))
if (nrow(fill.default) != 1L) stop(errmsg, call.=FALSE)
}
dat = dat[, eval(fun.call), by=c(varnames)]
}
order_ = function(x) {
o = forderv(x, retGrp=TRUE, sort=TRUE)
idx = attr(o, 'starts', exact=TRUE)
if (!length(o)) o = seq_along(x)
o[idx] # subsetVector retains attributes, using R's subset for now
}
cj_uniq = function(DT) {
do.call("CJ", lapply(DT, function(x)
if (is.factor(x)) {
xint = seq_along(levels(x))
setattr(xint, 'levels', levels(x))
setattr(xint, 'class', class(x))
} else .Call(CsubsetVector, x, order_(x))
))}
valnames = setdiff(names(dat), varnames)
# 'dat' != 'data'? then setkey to speed things up (slightly), else ad-hoc (for now). Still very fast!
if (!is.null(fun.call) || !is.null(subset))
setkeyv(dat, varnames)
if (length(rhsnames)) {
lhs = shallow(dat, lhsnames); rhs = shallow(dat, rhsnames); val = shallow(dat, valnames)
# handle drop=TRUE/FALSE - Update: Logic moved to R, AND faster than previous version. Take that... old me :-).
if (all(drop)) {
map = setDT(lapply(list(lhsnames, rhsnames), function(cols) frankv(dat, cols=cols, ties.method="dense", na.last=FALSE))) # #2202 fix
maporder = lapply(map, order_)
mapunique = lapply(seq_along(map), function(i) .Call(CsubsetVector, map[[i]], maporder[[i]]))
lhs = .Call(CsubsetDT, lhs, maporder[[1L]], seq_along(lhs))
rhs = .Call(CsubsetDT, rhs, maporder[[2L]], seq_along(rhs))
} else {
lhs_ = if (!drop[1L]) cj_uniq(lhs) else setkey(unique(lhs, by=names(lhs)))
rhs_ = if (!drop[2L]) cj_uniq(rhs) else setkey(unique(rhs, by=names(rhs)))
map = vector("list", 2L)
.Call(Csetlistelt, map, 1L, lhs_[lhs, which=TRUE])
.Call(Csetlistelt, map, 2L, rhs_[rhs, which=TRUE])
setDT(map)
mapunique = vector("list", 2L)
.Call(Csetlistelt, mapunique, 1L, seq_len(nrow(lhs_)))
.Call(Csetlistelt, mapunique, 2L, seq_len(nrow(rhs_)))
lhs = lhs_; rhs = rhs_
}
maplen = vapply_1i(mapunique, length)
idx = do.call("CJ", mapunique)[map, 'I' := .I][["I"]] # TO DO: move this to C and avoid materialising the Cross Join.
ans = .Call(Cfcast, lhs, val, maplen[[1L]], maplen[[2L]], idx, fill, fill.default, is.null(fun.call))
allcols = do.call("paste", c(rhs, sep=sep))
if (length(valnames) > 1L)
allcols = do.call("paste", if (identical(".", allcols)) list(valnames, sep=sep)
else c(CJ(valnames, allcols, sorted=FALSE), sep=sep))
# removed 'setcolorder()' here, #1153
setattr(ans, 'names', c(lhsnames, allcols))
setDT(ans); setattr(ans, 'sorted', lhsnames)
} else stop("Internal error -- empty rhsnames in dcast; please report") # nocov
return (ans)
}
|