File: bmerge.R

package info (click to toggle)
r-cran-data.table 1.12.0%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 13,084 kB
  • sloc: ansic: 12,667; sh: 13; makefile: 6
file content (107 lines) | stat: -rw-r--r-- 6,858 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107

bmerge <- function(i, x, leftcols, rightcols, xo, roll, rollends, nomatch, mult, ops, nqgrp, nqmaxgrp, verbose)
{
  # TO DO: rename leftcols to icols, rightcols to xcols
  # TO DO: xo could be moved inside Cbmerge
  # bmerge moved to be separate function now that list() doesn't copy in R
  # types of i join columns are promoted to match x's types (with warning or verbose)

  # Important that i is already passed in as a shallow copy, due to these coercions for factors.
  # i.e. bmerge(i<-shallow(i),...)
  # The caller ([.data.table) then uses the coerced columns to build the output

  # careful to only plonk syntax (full column) on i from now on (otherwise i would change)
  # TO DO: enforce via .internal.shallow attribute and expose shallow() to users
  # This is why shallow() is very importantly internal only, currently.

  origi = shallow(i)      # Needed for factor to factor/character joins, to recover the original levels
                          # Otherwise, types of i join columns are anyways promoted to match x's
                          # types (with warning or verbose)
  resetifactor = NULL     # Keep track of any factor to factor/character join cols (only time we keep orig)
  for (a in seq_along(leftcols)) {
    # This loop is simply to support joining factor columns
    # Note that if i is keyed, if this coerces, i's key gets dropped and the key may not be retained
    lc = leftcols[a]   # i   # TO DO: rename left and right to i and x
    rc = rightcols[a]  # x
    icnam = names(i)[lc]
    xcnam = names(x)[rc]
    if (is.character(x[[rc]])) {
      if (is.character(i[[lc]])) next
      if (!is.factor(i[[lc]]))
        stop("x.'",xcnam,"' is a character column being joined to i.'",icnam,"' which is type '",typeof(i[[lc]]),"'. Character columns must join to factor or character columns.")
      if (verbose) cat("Coercing factor column i.'",icnam,"' to character to match type of x.'",xcnam,"'.\n",sep="")
      set(i,j=lc,value=as.character(i[[lc]]))
      # no longer copies all of i, thanks to shallow() and :=/set
      next
    }
    if (is.factor(x[[rc]])) {
      if (is.character(i[[lc]])) {
        if (verbose) cat("Coercing character column i.'",icnam,"' to factor to match type of x.'",xcnam,"'. If possible please change x.'",xcnam,"' to character. Character columns are now preferred in joins.\n",sep="")
        set(origi, j=lc, value=factor(origi[[lc]])) # note the use of 'origi' here - see #499 and #945
        # TO DO: we need a way to avoid copying 'value' for internal purposes
        # that would allow setting: set(i, j=lc, value=origi[[lc]]) without resulting in a copy.
        # until then using 'val <- origi[[lc]]' below to avoid another copy.
      } else {
        if (!is.factor(i[[lc]]))
          stop("x.'",xcnam,"' is a factor column being joined to i.'",icnam,"' which is type '",typeof(i[[lc]]),"'. Factor columns must join to factor or character columns.")
      }
      # Retain original levels of i's factor columns in factor to factor joins (important when NAs,
      # see tests 687 and 688).
      # Moved it outside of 'else' to fix #499 and #945.
      resetifactor = c(resetifactor,lc)
      if (roll!=0.0 && a==length(leftcols)) stop("Attempting roll join on factor column x.",names(x)[rc],". Only integer, double or character colums may be roll joined.")   # because the chmatch on next line returns <strike>NA</strike> <new>0</new> for missing chars in x (rather than some integer greater than existing). Note roll!=0.0 is ok in this 0 special floating point case e.g. as.double(FALSE)==0.0 is ok, and "nearest"!=0.0 is also true.
      val = origi[[lc]] # note: using 'origi' here because set(..., value = .) always copies '.', we need a way to avoid it in internal cases.
      lx = levels(x[[rc]])
      li = levels(val)
      newfactor = chmatch(li, lx, nomatch=0L)[val] # fix for #945, a hacky solution for now.
      levels(newfactor) = lx
      class(newfactor) = "factor"
      set(i, j=lc, value=newfactor)
      # COMMENT BELOW IS NOT TRUE ANYMORE... had to change nomatch to 0L to take care of case where 'NA' occurs as a separate value... See #945.
      # <OUTDATED> NAs can be produced by this level match, in which case the C code (it knows integer value NA)
      # can skip over the lookup. It's therefore important we pass NA rather than 0 to the C code.
    }
    # Fix for #1108.
    # TODO: clean this code up...
    # NOTE: bit64::is.double(int64) returns FALSE.. but base::is.double returns TRUE
    is.int64 <- function(x) inherits(x, 'integer64')
    is.strictlydouble <- function(x) !is.int64(x) && is.double(x)
    if (is.integer(x[[rc]]) && (base::is.double(i[[lc]]) || is.logical(i[[lc]]))) {
      # TO DO: add warning if reallyreal about loss of precision
      # or could coerce in binary search on the fly, at cost
      if (verbose) cat("Coercing ", typeof(i[[lc]])," column i.'",icnam,"' to integer to match type of x.'",xcnam,"'. Please avoid coercion for efficiency.\n",sep="")
      newval = i[[lc]]
      if (is.int64(newval))
        newval = as.integer(newval)
      else mode(newval) = "integer"  # retains column attributes (such as IDateTime class)
      set(i, j=lc, value=newval)
    } else if (is.int64(x[[rc]]) && (is.integer(i[[lc]]) || is.logical(i[[lc]]) || is.strictlydouble(i[[lc]]) )) {
      if (verbose) cat("Coercing ",typeof(i[[lc]])," column i.'",icnam,"' to double to match type of x.'",xcnam,"'. Please avoid coercion for efficiency.\n",sep="")
      newval = bit64::as.integer64(i[[lc]])
      set(i, j=lc, value=newval)
    } else if (is.strictlydouble(x[[rc]]) && (is.integer(i[[lc]]) || is.logical(i[[lc]]) || is.int64(i[[lc]]) )) {
      if (verbose) cat("Coercing ",typeof(i[[lc]])," column i.'",icnam,"' to double to match type of x.'",xcnam,"'. Please avoid coercion for efficiency.\n",sep="")
      newval = i[[lc]]
      if (is.int64(newval))
        newval = as.numeric(newval)
      else mode(newval) = "double"
      set(i, j=lc, value=newval)
    }
  }
  ## after all modifications of i, check if i has a proper key on all leftcols
  io <- identical(leftcols, head(chmatch(key(i), names(i)), length(leftcols)))
  if (verbose) {last.started.at=proc.time();cat("Starting bmerge ...");flush.console()}
  ans = .Call(Cbmerge, i, x, as.integer(leftcols), as.integer(rightcols), io, xo, roll, rollends, nomatch, mult, ops, nqgrp, nqmaxgrp)
  if (verbose) {cat("done in",timetaken(last.started.at),"\n"); flush.console()}

  # in the caller's shallow copy,  see comment at the top of this function for usage
  # We want to leave the coercions to i in place otherwise, since the caller depends on that to build the result
  if (length(resetifactor)) {
    for (ii in resetifactor)
      set(i,j=ii,value=origi[[ii]])
    if (haskey(origi))
      setattr(i, 'sorted', key(origi))
  }
  return(ans)
}