File: fcast.R

package info (click to toggle)
r-cran-data.table 1.12.0%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 13,084 kB
  • sloc: ansic: 12,667; sh: 13; makefile: 6
file content (239 lines) | stat: -rw-r--r-- 10,511 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
guess <- function(x) {
  if ("value" %chin% names(x))
    return("value")
  if ("(all)" %chin% names(x))
    return("(all)")
  var <- names(x)[ncol(x)]
  message("Using '", var, "' as value column. Use 'value.var' to override")
  return(var)
}

dcast <- function(data, formula, fun.aggregate = NULL, ..., margins = NULL,
      subset = NULL, fill = NULL, value.var = guess(data)) {
  if (is.data.table(data))
    UseMethod("dcast", data)
  else {
    # reshape2::dcast is not generic so we have to call it explicitly. See comments at the top of fmelt.R too.
    # nocov start
    ns = tryCatch(getNamespace("reshape2"), error=function(e)
         stop("The dcast generic in data.table has been passed a ",class(data)[1L]," (not a data.table) but the reshape2 package is not installed to process this type. Please either install reshape2 and try again, or pass a data.table to dcast instead."))
    ns$dcast(data, formula, fun.aggregate = fun.aggregate, ..., margins = margins,
             subset = subset, fill = fill, value.var = value.var)
    # nocov end
  }
}

check_formula <- function(formula, varnames, valnames) {
  if (is.character(formula)) formula = as.formula(formula)
  if (!inherits(formula, "formula") || length(formula) != 3L)
    stop("Invalid formula. Cast formula should be of the form LHS ~ RHS, for e.g., a + b ~ c.")  # nocov; couldn't find a way to construct a test formula with length!=3L
  vars = all.vars(formula)
  vars = vars[!vars %chin% c(".", "...")]
  allvars = c(vars, valnames)
  if (any(allvars %chin% varnames[duplicated(varnames)]))
    stop('data.table to cast must have unique column names')
  ans = deparse_formula(as.list(formula)[-1L], varnames, allvars)
}

deparse_formula <- function(expr, varnames, allvars) {
  lvars = lapply(expr, function(this) {
    if (is.call(this)) {
      if (this[[1L]] == quote(`+`))
        unlist(deparse_formula(as.list(this)[-1L], varnames, allvars))
      else this
    } else if (is.name(this)) {
      if (this == quote(`...`)) {
        subvars = setdiff(varnames, allvars)
        lapply(subvars, as.name)
      } else this
    }
  })
  lvars = lapply(lvars, function(x) if (length(x) && !is.list(x)) list(x) else x)
}

value_vars <- function(value.var, varnames) {
  if (is.character(value.var))
    value.var = list(value.var)
  value.var = lapply(value.var, unique)
  valnames = unique(unlist(value.var))
  iswrong = which(!valnames %chin% varnames)
  if (length(iswrong))
    stop("value.var values [", paste(value.var[iswrong], collapse=", "), "] are not found in 'data'.")
  value.var
}

aggregate_funs <- function(funs, vals, sep="_", ...) {
  if (is.call(funs) && funs[[1L]] == "eval")
    funs = eval(funs[[2L]], parent.frame(2L), parent.frame(2L))
  if (is.call(funs) && as.character(funs[[1L]]) %chin% c("c", "list"))
    funs = lapply(as.list(funs)[-1L], function(x) {
      if (is.call(x) && as.character(x[[1L]]) %chin% c("c", "list")) as.list(x)[-1L] else x
    })
  else funs = list(funs)
  if (length(funs) != length(vals)) {
    if (length(vals) == 1L)
      vals = replicate(length(funs), vals)
    else stop("When 'fun.aggregate' and 'value.var' are both lists, 'value.var' must be either of length =1 or =length(fun.aggregate).")
  }
  only_one_fun = length(unlist(funs)) == 1L
  dots = list(...)
  construct_funs <- function(fun, val) {
    if (!is.list(fun)) fun = list(fun)
    ans = vector("list", length(fun)*length(val))
    nms = vector("character", length(ans))
    k = 1L
    for (i in fun) {
      for (j in val) {
        expr = list(i, as.name(j))
        if (length(dots))
          expr = c(expr, dots)
        ans[[k]] = as.call(expr)
        # changed order of arguments here, #1153
        nms[k] = if (only_one_fun) j else
              paste(j, all.names(i, max.names=1L, functions=TRUE), sep=sep)
        k = k+1L;
      }
    }
    setattr(ans, 'names', nms)
  }
  ans = mapply(construct_funs, funs, vals, SIMPLIFY=FALSE)
  as.call(c(quote(list), unlist(ans)))
}

dcast.data.table <- function(data, formula, fun.aggregate = NULL, sep = "_", ..., margins = NULL, subset = NULL, fill = NULL, drop = TRUE, value.var = guess(data), verbose = getOption("datatable.verbose")) {
  if (!is.data.table(data)) stop("'data' must be a data.table.")
  drop = as.logical(rep(drop, length.out=2L))
  if (anyNA(drop)) stop("'drop' must be logical TRUE/FALSE")
  lvals = value_vars(value.var, names(data))
  valnames = unique(unlist(lvals))
  lvars = check_formula(formula, names(data), valnames)
  lvars = lapply(lvars, function(x) if (!length(x)) quote(`.`) else x)
  # tired of lapply and the way it handles environments!
  allcols = c(unlist(lvars), lapply(valnames, as.name))
  dat = vector("list", length(allcols))
  for (i in seq_along(allcols)) {
    x = allcols[[i]]
    dat[[i]] = if (identical(x, quote(`.`))) rep(".", nrow(data))
            else eval(x, data, parent.frame())
    if (is.function(dat[[i]]))
      stop("Column [", deparse(x), "] not found or of unknown type.")
  }
  setattr(lvars, 'names', c("lhs", "rhs"))
  # Have to take care of duplicate names, and provide names for expression columns properly.
  varnames = make.unique(vapply_1c(unlist(lvars), all.vars, max.names=1L), sep=sep)
  dupidx = which(valnames %chin% varnames)
  if (length(dupidx)) {
    dups = valnames[dupidx]
    valnames = tail(make.unique(c(varnames, valnames)), -length(varnames))
    lvals = lapply(lvals, function(x) { x[x %chin% dups] = valnames[dupidx]; x })
  }
  lhsnames = head(varnames, length(lvars$lhs))
  rhsnames = tail(varnames, -length(lvars$lhs))
  setattr(dat, 'names', c(varnames, valnames))
  setDT(dat)
  if (any(vapply_1b(as.list(dat)[varnames], is.list))) {
    stop("Columns specified in formula can not be of type list")
  }
  m <- as.list(match.call()[-1L])
  subset <- m[["subset"]][[2L]]
  if (!is.null(subset)) {
    if (is.name(subset)) subset = as.call(list(quote(`(`), subset))
    idx = which(eval(subset, data, parent.frame())) # any advantage thro' secondary keys?
    dat = .Call(CsubsetDT, dat, idx, seq_along(dat))
  }
  if (!nrow(dat) || !ncol(dat)) stop("Can not cast an empty data.table")
  fun.call = m[["fun.aggregate"]]
  fill.default = NULL
  if (is.null(fun.call)) {
    oo = forderv(dat, by=varnames, retGrp=TRUE)
    if (attr(oo, 'maxgrpn') > 1L) {
      message("Aggregate function missing, defaulting to 'length'")
      fun.call = quote(length)
    }
  }
  if (!is.null(fun.call)) {
    fun.call = aggregate_funs(fun.call, lvals, sep, ...)
    errmsg = "Aggregating function(s) should take vector inputs and return a single value (length=1). However, function(s) returns length!=1. This value will have to be used to fill any missing combinations, and therefore must be length=1. Either override by setting the 'fill' argument explicitly or modify your function to handle this case appropriately."
    if (is.null(fill)) {
      fill.default <- suppressWarnings(dat[0L][, eval(fun.call)])
      # tryCatch(fill.default <- dat[0L][, eval(fun.call)], error = function(x) stop(errmsg, call.=FALSE))
      if (nrow(fill.default) != 1L) stop(errmsg, call.=FALSE)
    }
    if (!any(valnames %chin% varnames)) {
      dat = dat[, eval(fun.call), by=c(varnames)]
    } else {
      dat = dat[, { .SD; eval(fun.call) }, by=c(varnames), .SDcols = valnames]
    }
  }
  order_ <- function(x) {
    o = forderv(x, retGrp=TRUE, sort=TRUE)
    idx = attr(o, 'starts')
    if (!length(o)) o = seq_along(x)
    o[idx] # subsetVector retains attributes, using R's subset for now
  }
  cj_uniq <- function(DT) {
    do.call("CJ", lapply(DT, function(x)
      if (is.factor(x)) {
        xint = seq_along(levels(x))
        setattr(xint, 'levels', levels(x))
        setattr(xint, 'class', class(x))
      } else .Call(CsubsetVector, x, order_(x))
  ))}
  valnames = setdiff(names(dat), varnames)
  # 'dat' != 'data'? then setkey to speed things up (slightly), else ad-hoc (for now). Still very fast!
  if (!is.null(fun.call) || !is.null(subset))
    setkeyv(dat, varnames)
  if (length(rhsnames)) {
    lhs = shallow(dat, lhsnames); rhs = shallow(dat, rhsnames); val = shallow(dat, valnames)
    # handle drop=TRUE/FALSE - Update: Logic moved to R, AND faster than previous version. Take that... old me :-).
    if (all(drop)) {
      map = setDT(lapply(list(lhsnames, rhsnames), function(cols) frankv(dat, cols=cols, ties.method="dense")))
      maporder = lapply(map, order_)
      mapunique = lapply(seq_along(map), function(i) .Call(CsubsetVector, map[[i]], maporder[[i]]))
      lhs = .Call(CsubsetDT, lhs, maporder[[1L]], seq_along(lhs))
      rhs = .Call(CsubsetDT, rhs, maporder[[2L]], seq_along(rhs))
    } else {
      lhs_ = if (!drop[1L]) cj_uniq(lhs) else setkey(unique(lhs, by=names(lhs)))
      rhs_ = if (!drop[2L]) cj_uniq(rhs) else setkey(unique(rhs, by=names(rhs)))
      map = vector("list", 2L)
      .Call(Csetlistelt, map, 1L, lhs_[lhs, which=TRUE])
      .Call(Csetlistelt, map, 2L, rhs_[rhs, which=TRUE])
      setDT(map)
      mapunique = vector("list", 2L)
      .Call(Csetlistelt, mapunique, 1L, seq_len(nrow(lhs_)))
      .Call(Csetlistelt, mapunique, 2L, seq_len(nrow(rhs_)))
      lhs = lhs_; rhs = rhs_
    }
    maplen = vapply_1i(mapunique, length)
    idx = do.call("CJ", mapunique)[map, I := .I][["I"]] # TO DO: move this to C and avoid materialising the Cross Join.
    ans = .Call(Cfcast, lhs, val, maplen[[1L]], maplen[[2L]], idx, fill, fill.default, is.null(fun.call))
    allcols = do.call("paste", c(rhs, sep=sep))
    if (length(valnames) > 1L)
      allcols = do.call("paste", if (identical(".", allcols)) list(valnames, sep=sep)
            else c(CJ(valnames, allcols, sorted=FALSE), sep=sep))
      # removed 'setcolorder()' here, #1153
    setattr(ans, 'names', c(lhsnames, allcols))
    setDT(ans); setattr(ans, 'sorted', lhsnames)
  } else {
    # formula is of the form x + y ~ . (rare case)
    if (drop) {
      if (is.null(subset) && is.null(fun.call)) {
        dat = copy(dat) # can't be avoided
        setkeyv(dat, lhsnames)
      }
      ans = dat
    } else {
      lhs = shallow(dat, lhsnames)
      val = shallow(dat, valnames)
      lhs_ = cj_uniq(lhs)
      idx = lhs_[lhs, I := .I][["I"]]
      lhs_[, I := NULL]
      ans = .Call(Cfcast, lhs_, val, nrow(lhs_), 1L, idx, fill, fill.default, is.null(fun.call))
      setDT(ans); setattr(ans, 'sorted', lhsnames)
      setnames(ans, c(lhsnames, valnames))
    }
    if (length(valnames) == 1L)
      setnames(ans, valnames, value.var)
  }
  return (ans)
}