File: duplicated.R

Package: r-cran-data.table 1.12.0+dfsg-1 (Debian buster)

warning_oldUniqueByKey = "The deprecated option 'datatable.old.unique.by.key' is being used. Please stop using it and pass 'by=key(DT)' instead for clarity. For more information please search the NEWS file for this option."

duplicated.data.table <- function(x, incomparables=FALSE, fromLast=FALSE, by=seq_along(x), ...) {
  if (!cedta()) return(NextMethod("duplicated")) #nocov
  if (!identical(incomparables, FALSE)) {
    .NotYetUsed("incomparables != FALSE")
  }
  if (missing(by) && isTRUE(getOption("datatable.old.unique.by.key"))) {  #1284
    by = key(x)
    warning(warning_oldUniqueByKey)
  }
  if (nrow(x) == 0L || ncol(x) == 0L) return(logical(0L)) # fix for bug #5582
  if (is.na(fromLast) || !is.logical(fromLast)) stop("'fromLast' must be TRUE or FALSE")
  query <- .duplicated.helper(x, by)
  # fix for bug #5405 - unique on null data table returns error (because of 'forderv')
  # however, in this case we can bypass having to go to forderv at all.
  if (!length(query$by)) return(logical(0L))

  if (query$use.keyprefix) {
    f = uniqlist(shallow(x, query$by))
    if (fromLast) f = cumsum(uniqlengths(f, nrow(x)))
  } else {
    o = forderv(x, by=query$by, sort=FALSE, retGrp=TRUE)
    if (attr(o, 'maxgrpn') == 1L) return(rep.int(FALSE, nrow(x)))
    f = attr(o,"starts")
    if (fromLast) f = cumsum(uniqlengths(f, nrow(x)))
    if (length(o)) f = o[f]
  }
  res <- rep.int(TRUE, nrow(x))
  res[f] = FALSE
  res
}
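
# Illustrative usage of duplicated.data.table (not run; a hedged sketch, not part of the
# package source -- the data.table 'DT' below is hypothetical):
# DT = data.table(A = c(1L, 1L, 2L), B = c("x", "x", "y"))
# duplicated(DT)                   # FALSE TRUE FALSE : row 2 repeats row 1 on all columns
# duplicated(DT, by = "A")         # FALSE TRUE FALSE : uniqueness judged on column A only
# duplicated(DT, fromLast = TRUE)  # TRUE FALSE FALSE : last occurrence of each group kept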

unique.data.table <- function(x, incomparables=FALSE, fromLast=FALSE, by=seq_along(x), ...) {
  if (!cedta()) return(NextMethod("unique")) # nocov
  if (!identical(incomparables, FALSE)) {
    .NotYetUsed("incomparables != FALSE")
  }
  if (nrow(x) <= 1L) return(x)
  if (missing(by) && isTRUE(getOption("datatable.old.unique.by.key"))) {
    by = key(x)
    warning(warning_oldUniqueByKey)
  } else if (is.null(by)) by=seq_along(x)
  o = forderv(x, by=by, sort=FALSE, retGrp=TRUE)
  # if by=key(x), forderv tests for orderedness within it quickly and will short-circuit
  # there isn't any need in unique() to call uniqlist like duplicated does; uniqlist returns a new nrow(x) vector anyway and isn't
  # as efficient as forderv returning an empty o when the input is already ordered
  if (attr(o, 'maxgrpn') == 1L) return(x)  # avoid copy. Oftentimes, user just wants to check DT is unique with perhaps nrow(unique(DT))==nrow(DT)
  f = attr(o,"starts")
  if (fromLast) f = cumsum(uniqlengths(f, nrow(x)))
  if (length(o)) f = o[f]
  if (length(o <- forderv(f))) f = f[o]  # don't sort the uniques too
  .Call(CsubsetDT, x, f, seq_len(ncol(x)))
  # TO DO: allow by=NULL to mean all, for further speed gain.
  #        See news for v1.9.3 for link to benchmark use-case on datatable-help.
}
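
# Illustrative usage of unique.data.table (not run; a hedged sketch with a hypothetical DT):
# DT = data.table(A = c(1L, 1L, 2L), B = c("x", "x", "y"))
# unique(DT)              # 2 rows: (1,"x") and (2,"y"), kept in order of first appearance
# unique(DT, by = "A")    # first row of each distinct value of A; other columns come along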

# Test for #2013 unique() memory efficiency improvement in v1.10.5
# set.seed(1)
# Create unique 7.6GB DT on 16GB laptop
# DT = data.table(
#  A = sample(1e8, 2e8, TRUE),
#  B = sample(1e8, 2e8, TRUE),
#  C = 1:2e8,
#  D = 1:2e8,
#  E = 1:2e8,
#  F = 1:2e8,
#  G = 1:2e8,
#  H = 1:2e8,
#  I = 1:2e8,
#  J = 1:2e8
# )
# print(dim(unique(DT)))  # works now, failed with oom in 1.10.4-3


## Specify the column names to be used in the uniqueness query, and whether this
## query can take advantage of the key of `x` (if present).
## Returns a list.
##
## This was dropped into a helper because the initial implementations of
## unique.data.table and duplicated.data.table both needed it. However,
## unique.data.table has since been refactored and no longer uses this helper,
## but let's leave it here just in case.
.duplicated.helper <- function(x, by) {
  use.sub.cols <- !is.null(by) # && !isTRUE(by) # Fixing bug #5424

  if (use.sub.cols) {
    ## Did the user specify (integer) indexes for the columns?
    if (is.numeric(by)) {
      if (any(as.integer(by) != by) || any(by<1L) || any(by>ncol(x))) {
        stop("Integer values between 1 and ncol are required for 'by' when ",
             "column indices. It's often better to use column names.")
      }
      by <- names(x)[by]
    }
    if (!is.character(by)) {
      stop("Only NULL, column indices or column names are allowed in by")
    }
    bad.cols <- setdiff(by, names(x))
    if (length(bad.cols)) {
      stop("by specifies column names that do not exist. First 5: ",paste(head(bad.cols,5),collapse=","))
    }

    use.keyprefix = haskey(x) &&
      length(by) <= length(key(x)) &&
      all(head(key(x), length(by)) == by)
  } else {
    ## by is NULL (either by default or passed explicitly); use all columns
    use.keyprefix = FALSE
    by = names(x)
  }

  list(use.keyprefix=use.keyprefix, by=by)
}
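
# Illustrative behaviour of .duplicated.helper (not run; a hedged sketch, DT is hypothetical):
# DT = data.table(A = 1:3, B = 4:6, key = "A")
# .duplicated.helper(DT, "A")          # list(use.keyprefix=TRUE,  by="A") : 'by' is a key prefix
# .duplicated.helper(DT, c(2L, 1L))    # list(use.keyprefix=FALSE, by=c("B","A"))
# .duplicated.helper(DT, NULL)         # list(use.keyprefix=FALSE, by=c("A","B")) : all columns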

# FR #5172 anyDuplicated.data.table
# Note that base's anyDuplicated is faster than any(duplicated(.)) (for vectors) - for data.frames it still pastes before calling duplicated
# In that sense, this anyDuplicated is *not* the same as base's - it is not a separate, faster implementation.
# It is just a wrapper around duplicated(). That being said, it should be incredibly fast on data.tables (due to data.table's fast forder)
anyDuplicated.data.table <- function(x, incomparables=FALSE, fromLast=FALSE, by=seq_along(x), ...) {
  if (!cedta()) return(NextMethod("anyDuplicated")) # nocov
  if (missing(by) && isTRUE(getOption("datatable.old.unique.by.key"))) {
    by = key(x)
    warning(warning_oldUniqueByKey)
  }
  dups <- duplicated(x, incomparables, fromLast, by, ...)
  if (fromLast) idx = tail(which(dups), 1L) else idx = head(which(dups), 1L)
  if (!length(idx)) idx=0L
  idx
}
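
# Illustrative usage of anyDuplicated.data.table (not run; a hedged sketch, DT is hypothetical):
# DT = data.table(A = c(1L, 2L, 1L))
# anyDuplicated(DT)                   # 3L : row 3 is the first duplicate row
# anyDuplicated(DT, fromLast = TRUE)  # 1L : scanning from the end, row 1 duplicates row 3
# anyDuplicated(unique(DT))           # 0L : no duplicate rows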

# simple straightforward helper function to get the number
# of distinct groups in a vector or data.table. Here by data.table,
# we really mean `.SD` - as used in a grouping operation
# TODO: optimise uniqueN further with GForce.
uniqueN <- function(x, by = if (is.list(x)) seq_along(x) else NULL, na.rm=FALSE) { # na.rm, #1455
  if (missing(by) && is.data.table(x) && isTRUE(getOption("datatable.old.unique.by.key"))) {
    by = key(x)
    warning(warning_oldUniqueByKey)
  }
  if (is.null(x)) return(0L)
  if (!is.atomic(x) && !is.data.frame(x))
    stop("x must be an atomic vector or data.frames/data.tables")
  if (is.atomic(x)) {
    if (is.logical(x)) return(.Call(CuniqueNlogical, x, na.rm=na.rm))
    x = as_list(x)
  }
  if (is.null(by)) by = seq_along(x)
  o = forderv(x, by=by, retGrp=TRUE, na.last=if (!na.rm) FALSE else NA)
  starts = attr(o, 'starts')
  if (!na.rm) {
    length(starts)
  } else {
    # TODO: internal efficient sum
    # fix for #1771, account for already sorted input
    sum( (if (length(o)) o[starts] else starts) != 0L)
  }
}
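
# Illustrative usage of uniqueN (not run; a hedged sketch with hypothetical inputs):
# uniqueN(c(1, 1, 2, NA))               # 3L : NA counts as a distinct value by default
# uniqueN(c(1, 1, 2, NA), na.rm=TRUE)   # 2L : NA excluded from the count
# DT = data.table(g = c("a", "a", "b"), v = 1:3)
# uniqueN(DT, by = "g")                 # 2L : distinct values of g
# DT[, .(nv = uniqueN(v)), by = g]      # distinct v per group: a=2, b=1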