File: fcast.R

package info (click to toggle)
r-cran-data.table 1.14.8%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 15,936 kB
  • sloc: ansic: 15,680; sh: 100; makefile: 6
file content (225 lines) | stat: -rw-r--r-- 10,688 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
guess = function(x) {
  if ("value" %chin% names(x))
    return("value")
  if ("(all)" %chin% names(x))
    return("(all)")
  var = names(x)[ncol(x)]
  message("Using '", var, "' as value column. Use 'value.var' to override")
  return(var)
}

dcast <- function(
  data, formula, fun.aggregate = NULL, ..., margins = NULL,
  subset = NULL, fill = NULL, value.var = guess(data)
) {
  if (is.data.table(data)) UseMethod("dcast", data)
  # nocov start
  else {
    data_name = deparse(substitute(data))
    ns = tryCatch(getNamespace("reshape2"), error=function(e)
      stop("The dcast generic in data.table has been passed a ",class(data)[1L],", but data.table::dcast currently only has a method for data.tables. Please confirm your input is a data.table, with setDT(", data_name, ") or as.data.table(", data_name, "). If you intend to use a reshape2::dcast, try installing that package first, but do note that reshape2 is deprecated and you should be migrating your code away from using it."))
    warning("The dcast generic in data.table has been passed a ", class(data)[1L], " and will attempt to redirect to the reshape2::dcast; please note that reshape2 is deprecated, and this redirection is now deprecated as well. Please do this redirection yourself like reshape2::dcast(", data_name, "). In the next version, this warning will become an error.")
    ns$dcast(data, formula, fun.aggregate = fun.aggregate, ..., margins = margins,
             subset = subset, fill = fill, value.var = value.var)
  }
  # nocov end
}

check_formula = function(formula, varnames, valnames) {
  if (is.character(formula)) formula = as.formula(formula)
  if (!inherits(formula, "formula") || length(formula) != 3L)
    stop("Invalid formula. Cast formula should be of the form LHS ~ RHS, for e.g., a + b ~ c.")  # nocov; couldn't find a way to construct a test formula with length!=3L
  vars = all.vars(formula)
  vars = vars[!vars %chin% c(".", "...")]
  allvars = c(vars, valnames)
  if (any(allvars %chin% varnames[duplicated(varnames)]))
    stop('data.table to cast must have unique column names')
  deparse_formula(as.list(formula)[-1L], varnames, allvars)
}

deparse_formula = function(expr, varnames, allvars) {
  lvars = lapply(expr, function(this) {
    if (!is.language(this)) return(NULL)
    if (this %iscall% '+') return(unlist(deparse_formula(this[-1L], varnames, allvars)))
    if (is.name(this) && this == quote(`...`)) {
      subvars = setdiff(varnames, allvars)
      return(lapply(subvars, as.name))
    }
    this
  })
  lvars = lapply(lvars, function(x) if (length(x) && !is.list(x)) list(x) else x)
}

value_vars = function(value.var, varnames) {
  if (is.character(value.var))
    value.var = list(value.var)
  value.var = lapply(value.var, unique)
  valnames = unique(unlist(value.var))
  iswrong = which(!valnames %chin% varnames)
  if (length(iswrong))
    stop("value.var values [", paste(value.var[iswrong], collapse=", "), "] are not found in 'data'.")
  value.var
}

aggregate_funs = function(funs, vals, sep="_", ...) {
  if (funs %iscall% 'eval')
    funs = eval(funs[[2L]], parent.frame(2L), parent.frame(2L))
  if (funs %iscall% c('c', 'list')) {
    funs = lapply(as.list(funs)[-1L], function(x) {
      if (x %iscall% c('c', 'list')) as.list(x)[-1L] else x
    })
  } else funs = eval(funs, parent.frame(2L), parent.frame(2L))
  if(is.function(funs)) funs = list(funs) # needed for cases as shown in test#1700.1
  if (length(funs) != length(vals)) {
    if (length(vals) == 1L)
      vals = replicate(length(funs), vals)
    else stop("When 'fun.aggregate' and 'value.var' are both lists, 'value.var' must be either of length =1 or =length(fun.aggregate).")
  }
  only_one_fun = length(unlist(funs)) == 1L
  dots = list(...)
  construct_funs = function(fun, nm, val) {
    ans = vector("list", length(fun)*length(val))
    nms = vector("character", length(ans))
    k = 1L
    for (i in fun) {
      for (j in val) {
        expr = list(i, as.name(j))
        if (length(dots))
          expr = c(expr, dots)
        ans[[k]] = as.call(expr)
        # changed order of arguments here, #1153
        nms[k] = if (only_one_fun) j else paste(j, nm, sep=sep)
        k = k+1L;
      }
    }
    setattr(ans, 'names', nms)
  }
  ans = lapply(seq_along(funs), function(i) {
    nm = names(funs[i])
    if (is.null(nm) || !nzchar(nm)) {
      nm = all.names(funs[[i]], max.names=1L, functions=TRUE)
    }
    if (!length(nm)) nm <- paste0("fun", i)
    construct_funs(funs[i], nm, vals[[i]])
  })
  as.call(c(quote(list), unlist(ans)))
}

dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ..., margins = NULL, subset = NULL, fill = NULL, drop = TRUE, value.var = guess(data), verbose = getOption("datatable.verbose")) {
  if (!is.data.table(data)) stop("'data' must be a data.table.")
  drop = as.logical(rep(drop, length.out=2L))
  if (anyNA(drop)) stop("'drop' must be logical TRUE/FALSE")
  # #2980 if explicitly providing fun.aggregate=length but not a value.var,
  #   just use the last column (as guess(data) would do) because length will be
  #   the same on all columns
  if (missing(value.var) && !missing(fun.aggregate) && identical(fun.aggregate, length))
    value.var = names(data)[ncol(data)]
  lvals = value_vars(value.var, names(data))
  valnames = unique(unlist(lvals))
  lvars = check_formula(formula, names(data), valnames)
  lvars = lapply(lvars, function(x) if (length(x)) x else quote(`.`))
  # tired of lapply and the way it handles environments!
  allcols = c(unlist(lvars), lapply(valnames, as.name))
  dat = vector("list", length(allcols))
  for (i in seq_along(allcols)) {
    x = allcols[[i]]
    dat[[i]] = if (identical(x, quote(`.`))) rep(".", nrow(data)) else eval(x, data, parent.frame())
    if (is.function(dat[[i]]))
      stop("Column [", deparse(x), "] not found or of unknown type.")
  }
  setattr(lvars, 'names', c("lhs", "rhs"))
  # Have to take care of duplicate names, and provide names for expression columns properly.
  varnames = make.unique(vapply_1c(unlist(lvars), all.vars, max.names=1L), sep=sep)
  dupidx = which(valnames %chin% varnames)
  if (length(dupidx)) {
    dups = valnames[dupidx]
    valnames = tail(make.unique(c(varnames, valnames)), -length(varnames))
    lvals = lapply(lvals, function(x) { x[x %chin% dups] = valnames[dupidx]; x })
  }
  lhsnames = head(varnames, length(lvars$lhs))
  rhsnames = tail(varnames, -length(lvars$lhs))
  setattr(dat, 'names', c(varnames, valnames))
  if (any(vapply_1b(dat[varnames], is.list))) {
    stop("Columns specified in formula can not be of type list")
  }
  setDT(dat)

  m = as.list(match.call()[-1L])
  subset = m[["subset"]][[2L]]
  if (!is.null(subset)) {
    if (is.name(subset)) subset = as.call(list(quote(`(`), subset))
    idx = which(eval(subset, data, parent.frame())) # any advantage thro' secondary keys?
    dat = .Call(CsubsetDT, dat, idx, seq_along(dat))
  }
  if (!nrow(dat) || !ncol(dat)) stop("Can not cast an empty data.table")
  fun.call = m[["fun.aggregate"]]
  fill.default = NULL
  if (is.null(fun.call)) {
    oo = forderv(dat, by=varnames, retGrp=TRUE)
    if (attr(oo, 'maxgrpn', exact=TRUE) > 1L) {
      message("Aggregate function missing, defaulting to 'length'")
      fun.call = quote(length)
    }
  }
  if (!is.null(fun.call)) {
    fun.call = aggregate_funs(fun.call, lvals, sep, ...)
    errmsg = "Aggregating function(s) should take vector inputs and return a single value (length=1). However, function(s) returns length!=1. This value will have to be used to fill any missing combinations, and therefore must be length=1. Either override by setting the 'fill' argument explicitly or modify your function to handle this case appropriately."
    if (is.null(fill)) {
      fill.default = suppressWarnings(dat[0L][, eval(fun.call)])
      # tryCatch(fill.default <- dat[0L][, eval(fun.call)], error = function(x) stop(errmsg, call.=FALSE))
      if (nrow(fill.default) != 1L) stop(errmsg, call.=FALSE)
    }
    dat = dat[, eval(fun.call), by=c(varnames)]
  }
  order_ = function(x) {
    o = forderv(x, retGrp=TRUE, sort=TRUE)
    idx = attr(o, 'starts', exact=TRUE)
    if (!length(o)) o = seq_along(x)
    o[idx] # subsetVector retains attributes, using R's subset for now
  }
  cj_uniq = function(DT) {
    do.call("CJ", lapply(DT, function(x)
      if (is.factor(x)) {
        xint = seq_along(levels(x))
        setattr(xint, 'levels', levels(x))
        setattr(xint, 'class', class(x))
      } else .Call(CsubsetVector, x, order_(x))
  ))}
  valnames = setdiff(names(dat), varnames)
  # 'dat' != 'data'? then setkey to speed things up (slightly), else ad-hoc (for now). Still very fast!
  if (!is.null(fun.call) || !is.null(subset))
    setkeyv(dat, varnames)
  if (length(rhsnames)) {
    lhs = shallow(dat, lhsnames); rhs = shallow(dat, rhsnames); val = shallow(dat, valnames)
    # handle drop=TRUE/FALSE - Update: Logic moved to R, AND faster than previous version. Take that... old me :-).
    if (all(drop)) {
      map = setDT(lapply(list(lhsnames, rhsnames), function(cols) frankv(dat, cols=cols, ties.method="dense", na.last=FALSE))) # #2202 fix
      maporder = lapply(map, order_)
      mapunique = lapply(seq_along(map), function(i) .Call(CsubsetVector, map[[i]], maporder[[i]]))
      lhs = .Call(CsubsetDT, lhs, maporder[[1L]], seq_along(lhs))
      rhs = .Call(CsubsetDT, rhs, maporder[[2L]], seq_along(rhs))
    } else {
      lhs_ = if (!drop[1L]) cj_uniq(lhs) else setkey(unique(lhs, by=names(lhs)))
      rhs_ = if (!drop[2L]) cj_uniq(rhs) else setkey(unique(rhs, by=names(rhs)))
      map = vector("list", 2L)
      .Call(Csetlistelt, map, 1L, lhs_[lhs, which=TRUE])
      .Call(Csetlistelt, map, 2L, rhs_[rhs, which=TRUE])
      setDT(map)
      mapunique = vector("list", 2L)
      .Call(Csetlistelt, mapunique, 1L, seq_len(nrow(lhs_)))
      .Call(Csetlistelt, mapunique, 2L, seq_len(nrow(rhs_)))
      lhs = lhs_; rhs = rhs_
    }
    maplen = vapply_1i(mapunique, length)
    idx = do.call("CJ", mapunique)[map, 'I' := .I][["I"]] # TO DO: move this to C and avoid materialising the Cross Join.
    ans = .Call(Cfcast, lhs, val, maplen[[1L]], maplen[[2L]], idx, fill, fill.default, is.null(fun.call))
    allcols = do.call("paste", c(rhs, sep=sep))
    if (length(valnames) > 1L)
      allcols = do.call("paste", if (identical(".", allcols)) list(valnames, sep=sep)
            else c(CJ(valnames, allcols, sorted=FALSE), sep=sep))
      # removed 'setcolorder()' here, #1153
    setattr(ans, 'names', c(lhsnames, allcols))
    setDT(ans); setattr(ans, 'sorted', lhsnames)
  } else stop("Internal error -- empty rhsnames in dcast; please report") # nocov
  return (ans)
}