File: word.R

package info (click to toggle)
r-cran-diffobj 0.3.5-1
links: PTS, VCS
area: main
in suites: bookworm, forky, sid, trixie
size: 2,432 kB
sloc: ansic: 455; javascript: 96; sh: 32; makefile: 8
file content (671 lines) | stat: -rwxr-xr-x 25,400 bytes
# Copyright (C) 2021 Brodie Gaslam
#
# This file is part of "diffobj - Diffs for R Objects"
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# Go to <https://www.r-project.org/Licenses/GPL-2> for a copy of the license.

# Used to initialize the word difference index lists; represents a non matching
# result for use with `regmatches`

.word.diff.atom <- -1L
attr(.word.diff.atom, "match.length") <- -1L

# Matches syntactically valid R variable names

.reg.r.ident <- "(?:\\.[[:alpha:]]|[[:alpha:]])[[:alnum:]_.]*"

# Helper function when lining up word in a word diff to the lines they came from
#
# This one used to be simple but grew out of control as we discovered corner
# cases; would be good to see if there is a better collapse algorithm that
# naturally handles the corner cases (note: we added general handling of the
# situation where many hunks share the same line, but have not yet removed
# specific handling for corner cases generated by that issue so there is
# redundant code in here).
#
# lines: is a list of what lines are in each hunk,
# cont: is a logical vector of same length as lines denoting whether a
#   particular value in lines is context or diff
# hunk.diff: logical vector denoting if for the other object the hunk contains
#   only differences (seemingly not used in the most recent algorithm)
#
# What do we do about lines that are fully context?  These are flexible in as
# much as we can put them anyplace between the two diff hunks.  We are trying to
# maximize overlapping context elements.

reassign_lines2 <- function(lines, cont, hunk.diff) {
  # Find out what lines show up as duplicated

  hunk.count <- length(cont)
  hunk.len <- vapply(lines, length, integer(1L))
  hunk.n <- seq_along(cont)
  nums <- unlist(lines)
  nums.l <- unlist(
    lapply(seq_along(lines), function(x) rep(x, length(lines[[x]])))
  )
  nums.d <- unique(nums[duplicated(nums)])

  # For each duplicated number, find range of hunks that contain it and remove
  # it from inappropriate hunks / add it to proper ones

  lines.p <- lines

  for(n in nums.d) {
    n.r <- range(nums.l[nums == n])

    # If any of the non-empty hunks are diff hunks, remove line reference from
    # every hunk except the first non-empty diff hunk, otherwise remove the
    # reference from everything except the first non-empty matching hunk

    b.w <- hunk.n >= n.r[[1L]] & hunk.n <= n.r[[2L]]
    min.diff.h <- head(which(!cont & b.w & hunk.len), 1L)
    min.mtch.h <- head(which(cont & b.w & hunk.len), 1L)

    keep.h <- if(length(min.diff.h)) min.diff.h else min.mtch.h

    if(length(keep.h))
      for(i in n.r[[1L]]:n.r[[2L]])
        if(i != keep.h) lines.p[[i]] <- lines.p[[i]][lines.p[[i]] != n]
  }
  lines.p
}
## Helper Function for Mapping Word Diffs to Lines
##
## Used when we're doing a wrapped diff for atomic vectors.  We expect to
## receive `tar/cur.dat` with any meta data lines (e.g. factor levels, time
## series meta data) removed already.  The challenge is we need to then be able
## to re-map back each word back to the line it was on originally before
## unwrapping.  This may include adding padding line blanks in the case one hunk
## displays across more lines than another.
#
## This function does two things: inserts padding lines when hunks in one object
## end up longer than in the other and similar lines don't align, and computes
## mock character strings that will be used by to force alignments. We
## manufacture unique strings that either match or don't match across the two
## objects depending on the word contents of each line, and then pass those
## back as the `comp` component of the `tar.dat` and `cur.dat` returned.  The
## subsequent line diff will use `comp` and cause the relevant lines to be lined
## up.  This is inefficient and round-about, but has the huge benefit of
## allowing us to plug in the wrapped diff into our existing line diff
## infrastructure
##
## Note that in "word" mode the returned values may be longer than the input ones
## as it may be necessary to add lines to get things to match-up.  Added lines
## are indicated by TRUE values in the `fill` component of the `*.dat` return
## values
##
## We have been through several iterations trying to get the most intuitive
## behavior and the result is a fairly non-intuitive and likely inefficient
## algorithm.  It works for the most part, so we leave it as is, but is long,
## messy, and should be replaced by a more elegant solution.
##
## @param tar.ends and cur.ends are the indices of the last elements in each
## line of the vector
## @param tar.dat and cur.dat are the data, see `line_diff` body for detailed
##   description of them (about 100 lines in).  Note that the data has been
##   subset to just the portion of it that has row headers (e.g. excluding
##   factor levels, etc.)
## @param tar/cur.ends the position in the unwrapped vector of the last "word"
##   in each line.

word_to_line_map <- function(
  hunks, tar.dat, cur.dat, tar.ends, cur.ends
) {
  # Once we've done all the replication and disambiguation, we need to make sure
  # diff hunks have the same number of lines.  Start mix lines or start/end mix
  # lines should go at beginning (this includes full "context" lines where there
  # is an insertion or deletion in middle. End mix lines should go at end.
  #
  # For each hunk, we need to identify what lines it contains, and whether the
  # lines are contained in full or not
  #
  # If a diff hunk is empty for tar/cur, and the corresponding cur/tar hunk
  # does not begin/end at beginning of line, then must add lines containing
  # adjoining elements to the diff

  find_word_line <- function(h.i, pos, ends.a, ends.b, hunks) {
    inds_pos <- function(h) c(h$A, h$B)[c(h$A, h$B) > 0L]
    inds_neg <- function(h) abs(c(h$A, h$B)[c(h$A, h$B) < 0L])

    h <- hunks[[h.i]]
    h.prev <- if(h.i > 1L) hunks[[h.i - 1L]]
    h.next <- if(h.i < length(hunks)) hunks[[h.i + 1L]]

    inds.a <- if(pos) inds_pos(h) else inds_neg(h)
    inds.b <- if(pos) inds_neg(h) else inds_pos(h)

    ints.a <- c(1L, head(ends.a, -1L) + 1L)
    ints.b <- c(1L, head(ends.b, -1L) + 1L)
    ends.b.m <- max(ends.b)

    # If a diff hunk and empty, but the matching hunk isn't empty, then add
    # the last element of prior hunk and first element of next hunk

    if(!h$context && !length(inds.a) && length(inds.b)) {
      inds.prev <- if(h.i > 1L) if(pos) inds_pos(h.prev) else inds_neg(h.prev)
      inds.next <- if(h.i < length(hunks))
        if(pos) inds_pos(h.next) else inds_neg(h.next)
      ind.b.min <- min(inds.b)
      ind.b.max <- max(inds.b)
      add.left <- if(!ind.b.min %in% ints.b) max(inds.prev)
      add.right <- if(!ind.b.max %in% ends.b) min(inds.next)
      inds.a <- if(length(add.left) && length(add.right))
        seq(from=add.left, to=add.right, by=1L) else c(add.left, add.right)
    }
    sort(unique(findInterval(inds.a, ints.a)))
  }
  find_full_diff_line <- function(dat, ends, diffs) {
    w.t <- vapply(
      dat$word.ind,
      function(x) if(is.null(a.val <- attr(x, "word.count"))) -1L else a.val,
      integer(1L)
    )
    inds.d.l <- findInterval(diffs, c(1L, head(ends, -1L) + 1L))
    inds.tab <- tabulate(inds.d.l, length(ends))
    diff.full <- which(inds.tab == w.t & inds.tab)
  }
  h.seq <- seq_along(hunks)
  tar.lines <- lapply(h.seq, find_word_line, TRUE, tar.ends, cur.ends, hunks)
  cur.lines <- lapply(h.seq, find_word_line, FALSE, cur.ends, tar.ends, hunks)

  # which hunks are context hunks?

  h.cont <- vapply(hunks, "[[", logical(1L), "context")

  # Compute what indices are in each lines; we are going to use this to
  # categorize what type of line this is; some of this might be duplicative with
  # what we did earlier, but that was so long ago I don't want to get back into
  # it.

  tar.idx <- Map(seq, c(1L, head(tar.ends, -1L) + 1L), tar.ends, by=1L)
  cur.idx <- Map(seq, c(1L, head(cur.ends, -1L) + 1L), cur.ends, by=1L)

  tar.diff <- unlist(
    lapply(hunks[!h.cont], function(x) with(x, abs(c(A, B))[c(A, B) > 0]))
  )
  cur.diff <- unlist(
    lapply(hunks[!h.cont], function(x) with(x, abs(c(A, B))[c(A, B) < 0]))
  )
  # identify whether a line starts with context, ends with context, neither, or
  # both

  context_type <- function(idx, diffs) {
    idx.in <- idx %in% diffs
    if(!length(idx)) {  # arbitrarily assign this case to both
      "both"
    } else {
      if(head(idx.in, 1L) && tail(idx.in, 1L)) {
        "neither"
      } else if(head(idx.in, 1L)) {
        "ends"
      } else if(tail(idx.in, 1L)) {
        "starts"
      } else {
        "both"
      }
    }
  }
  tar.end.mix <- vapply(tar.idx, context_type, character(1L), diffs=tar.diff)
  cur.end.mix <- vapply(cur.idx, context_type, character(1L), diffs=cur.diff)

  # Handle cases where line is shared by multiple hunks; also need to know which
  # hunks contain only lines that are fully different (and by extension, are
  # themselves fully different) as these don't need to have a line from the
  # opposite object brought in for alignment

  diff.inds <- unlist(lapply(hunks[!h.cont], "[",  c("A", "B")))
  if(is.null(diff.inds)) diff.inds <- integer()
  tar.inds.d <- diff.inds[diff.inds > 0]
  cur.inds.d <- abs(diff.inds[diff.inds < 0])

  tar.tot.diff.l <- find_full_diff_line(tar.dat, tar.ends, tar.inds.d)
  cur.tot.diff.l <- find_full_diff_line(cur.dat, cur.ends, cur.inds.d)

  # Remove duplicated line references

  tar.lines.u <- reassign_lines2(tar.lines, h.cont)
  cur.lines.u <- reassign_lines2(cur.lines, h.cont)

  # Search for aligned matching hunks that are empty, and if both those have
  # adjacent empty diff hunks, remove the matched and diff hunks from both
  # NOTE: this changes the number of hunks in the word diff!

  len.orig <- length(tar.lines.u)
  tar.lines.p <- tar.lines.u
  cur.lines.p <- cur.lines.u
  j <- if(h.cont[[1L]]) 1L else 2L
  l.cont <- as.list(h.cont)
  k <- 0

  while(j < length(tar.lines.p)) {
    if((k <- k + 1L) > len.orig) {
      # nocov start
      stop("Logic Error: infine loop in atomic hunk align; contact maintainer.")
      # nocov end
    }
    if(!length(tar.lines.p[[j]]) && !length(cur.lines.p[[j]])) {
      if(j > 1L) {
        tar.lo <- !length(tar.lines.p[[j - 1L]])
        cur.lo <- !length(cur.lines.p[[j - 1L]])
      } else tar.lo <- cur.lo <- FALSE
      tar.hi <- !length(tar.lines.p[[j + 1L]])
      cur.hi <- !length(cur.lines.p[[j + 1L]])

      # Need to remove paired empty match and diff; since we are shortening the
      # list we don't need to increment J (note possible memory inefficiency
      # here)
      if((tar.lo || tar.hi) && (cur.lo || cur.hi))  {
        if(tar.lo) tar.lines.p[(j - 1L):j] <- NULL else
          tar.lines.p[j:(j + 1L)] <- NULL
        if(cur.lo) cur.lines.p[(j - 1L):j] <- NULL else
          cur.lines.p[j:(j + 1L)] <- NULL
        l.cont[j:(j + 1L)] <- NULL
      } else {
        j <- j + 1L
      }
    } else j <- j + 1L
  }
  # Update our context vector since we have now possibly removed hunks

  h.cont <- unlist(l.cont)

  # If necessary, populate empty diff hunks with matching lines; this happens
  # if one of tar/cur has differences but the other doesn't

  steal_matching_line <- function(lines, i) {
    lines.p <- lines
    l.len <- length(lines)
    if(l.len > i && length(lines[[i + 1L]])) {
      lines.p[[i]] <- head(lines.p[[i + 1L]], 1L)
      lines.p[[i + 1L]] <- tail(lines.p[[i + 1L]], -1L)
    } else if (i > 1L && length(lines[[i - 1L]])) {
      lines.p[[i]] <- tail(lines.p[[i - 1L]], 1L)
      lines.p[[i - 1L]] <- head(lines.p[[i - 1L]], -1L)
    }
    lines.p
  }
  tar.lines.f <- tar.lines.p
  cur.lines.f <- cur.lines.p

  # lines that are all diffs

  hunk_diff <- function(vec, tot.diffs) length(vec) && all(vec %in% tot.diffs)
  tar.tot.diff.h <- vapply(tar.lines, hunk_diff, logical(1L), tar.tot.diff.l)
  cur.tot.diff.h <- vapply(cur.lines, hunk_diff, logical(1L), cur.tot.diff.l)

  for(i in seq_along(h.cont)) {
    if(!h.cont[[i]]) {
      t.i <- tar.lines.f[[i]]
      c.i <- cur.lines.f[[i]]
      if(!length(t.i) && length(c.i) && !cur.tot.diff.h[[i]]) {
        tar.lines.f <- steal_matching_line(tar.lines.f, i)
      } else if (!length(c.i) && length(t.i) && !tar.tot.diff.h[[i]]) {
        cur.lines.f <- steal_matching_line(cur.lines.f, i)
      }
    }
  }
  # We now need to make sure that every hunk is the same length

  if(
    length(tar.lines.f) != length(cur.lines.f) ||
    length(tar.lines.f) != length(h.cont)
  )
    # nocov start
    stop(
      "Logic error: mismatched hunk sizes when aligning words to lines; ",
      "contact maintainer."
    )
    # nocov end

  tar.lines.f2 <- tar.lines.f
  cur.lines.f2 <- cur.lines.f

  # add padding vector as close to middle of input vector as possible, except
  # in special cases (only one short line, or first or last hunks)

  pad_in_middle <- function(vec, pad)
    c(
      head(vec, ceiling(length(vec) / 2)),
      pad,
      tail(vec, floor(length(vec) / 2))
    )

  for(i in seq_along(tar.lines.f)) {
    if(length(tar.lines.f[[i]]) != length(cur.lines.f[[i]])) {
      tar.long <- length(tar.lines.f[[i]]) > length(cur.lines.f[[i]])
      long <- if(tar.long) tar.lines.f[[i]] else cur.lines.f[[i]]
      short <- if(!tar.long) tar.lines.f[[i]] else cur.lines.f[[i]]
      long.type <- if(tar.long) tar.end.mix[long] else cur.end.mix[long]
      short.type <- if(!tar.long) tar.end.mix[short] else cur.end.mix[short]

      pad <- rep(NA, length(long) - length(short))

      short.pad <- if(i == 1L && length(tar.lines.f) > 1L) {
        c(pad, short)
      } else if (i == length(tar.lines.f)) {
        c(short, pad)
      } else if(h.cont[[i]] || length(short) != 1L) {
        pad_in_middle(short, pad)
      } else {
        if(
          short.type == "ends" && (long.type[[1L]] %in% c("ends", "neither"))
        ) {
          c(pad, short)
        } else c(short, pad)
      }
      if(tar.long) cur.lines.f2[[i]] <- short.pad
      else tar.lines.f2[[i]] <- short.pad
  } }
  # Augment the input vectors by the blanks we added; these blanks are
  # represented by NAs in our index vector.

  augment <- function(dat, lines) {
    lines.u <- unlist(lines)
    lines.len <- length(lines.u)
    for(i in names(dat)) {
      i.vec <- vector(typeof(dat[[i]]), length(lines.u))
      i.vec[!is.na(lines.u)] <- dat[[i]]
      if(i == "word.ind") {
        i.vec[is.na(lines.u)] <- list(.word.diff.atom)
      } else if (i == "fill") {
        # warning: this is also used/subverted for augmenting the original
        # indices so think before you change it
        i.vec[is.na(lines.u)] <- TRUE
      }
      dat[[i]] <- i.vec
    }
    dat
  }
  tar.dat.aug <- augment(tar.dat, tar.lines.f2)
  cur.dat.aug <- augment(cur.dat, cur.lines.f2)

  # Generate the final vectors to do the diffs on; these should be unique
  # and matching for the matches, and unique and mismatching for the
  # mismatches

  hunk_match <- function(i, l) rep(h.cont[i], length(l[[i]]))
  tar.match <- unlist(lapply(seq_along(h.cont), hunk_match, l=tar.lines.f2))
  cur.match <- unlist(lapply(seq_along(h.cont), hunk_match, l=cur.lines.f2))

  pos.nums <- sum(tar.match)
  if(pos.nums != length(unlist(cur.lines.f2[h.cont]))) {
    # nocov start
    stop("Logic Error: pos nums incorrect; contact maintainer")
    # nocov end
  }
  neg.nums <- sum(!tar.match, !cur.match)

  strings <- make_unique_strings(
    pos.nums + neg.nums, c(tar.dat.aug$raw, cur.dat.aug$raw)
  )
  strings.pos <- strings[seq.int(pos.nums)]
  strings.neg <- tail(strings, neg.nums)
  if(neg.nums + pos.nums != length(strings)) {
    # nocov start
    stop("Logic Error: num-string maping failed; contact maintainer")
    # nocov end
  }

  tar.dat.aug$comp[tar.match] <- strings.pos
  cur.dat.aug$comp[cur.match] <- strings.pos
  tar.dat.aug$comp[!tar.match] <- head(strings.neg, sum(!tar.match))
  cur.dat.aug$comp[!cur.match] <- tail(strings.neg, sum(!cur.match))
  list(tar.dat=tar.dat.aug, cur.dat=cur.dat.aug)
}
# Pull out mismatching words from the word regexec; helper functions

reg_pull <- function(ind, reg) {
  reg.out <- reg[ind]
  attr(reg.out, "match.length") <- attr(reg, "match.length")[ind]
  attr(reg.out, "useBytes") <- attr(reg, "useBytes")
  attr(reg.out, "word.count") <- length(reg)
  reg.out
}
# Generate the indices in each row and apply the pulling functions
# - reg list produced by `gregexpr` and such
# - ends length of each line in words
# - mismatch index of mismatching words
#

reg_apply <- function(reg, ends, mismatch) {
  if(!length(reg)) {
    reg
  } else {
    use.bytes <- attr(reg[[1L]], "useBytes") # assume useBytes value unchanging
    regs.fin <- reg
    buckets <- head(c(0L, ends) + 1L, -1L)
    mism.lines <- findInterval(mismatch, buckets)
    mism.lines.u <- unique(mism.lines)
    mtch.lines.u <- which(!seq_along(ends) %in% mism.lines.u )
    # These don't have any mismatches
    attr(.word.diff.atom, "useBytes") <- use.bytes
    regs.fin[mtch.lines.u] <-
      replicate(length(mtch.lines.u), .word.diff.atom, simplify=FALSE)
    # These do have mismatches, we need to split them up in list elements and
    # substract the starting index to identify position within each sub-list

    if(length(mism.lines.u)) {
      inds.msm <- Map(
        "-", unname(split(mismatch, mism.lines)), buckets[mism.lines.u] - 1L
      )
      regs.fin[mism.lines.u] <- Map(reg_pull, inds.msm, reg[mism.lines.u])
    }
    regs.fin
  }
}
# Modify `tar.dat` and `cur.dat` by generating `regmatches` indices for the
# words that are different
#
# If `diff.mode` is "wrap", then wrapped atomic vector output is unwrapped and
# the diff is carried out in the unwrapped form, and then re-assembled.  See
# `word_to_line_map` for details in how its done.  Return values may be longer
# than input in this mode.
#
# `match.quotes` will make "words" starting and ending with quotes; it should
# only be used with atomic character vectors or possibly deparsed objects.

diff_word2 <- function(
  tar.dat, cur.dat, tar.ind, cur.ind, etc, match.quotes=FALSE, diff.mode,
  warn=TRUE
) {
  stopifnot(
    is.TF(match.quotes), is.TF(warn)
    # isTRUE(valid_dat(tar.dat)), isTRUE(valid_dat(cur.dat)) # too expensive
  )
  # Compute the char by char diffs for each line

  reg <- paste0(
    # grab leading spaces for each word; these will be stripped before actual
    # word diff, but we want them to be part of mismatch so they are removed
    # when we construct the equal strings as that allows better matching b/w
    # strings with differences removed; could do trailing spaces instead
    "\\s*(?:",
    # Some attempt at matching R identifiers; note we explicitly chose not to
    # match `.` or `..`, etc, since those could easily be punctuation
    sprintf("%s|", .reg.r.ident),
    # Not whitespaces that doesn't include quotes
    "[^ \"]+|",
    # Quoted phrases as structured in atomic character vectors
    if(match.quotes) "(?:(?<= )|(?<=^))\"(?:[^\"]|\\\")*?\"(?:(?= )|(?=$))|",
    # Other quoted phrases we might see in expressions or deparsed chr vecs,
    # this is a bit lazy currently b/c we're not forcing precise matching b/w
    # starting and ending delimiters
    "(?:(?<=[ ([,{])|(?<=^))\"(?:[^\"]|\\\"|\"(?=[^ ]))*?",
    "\"(?:(?=[ ,)\\]}])|(?=$))|",
    # Other otherwise 'illegal' quotes that couldn't be matched to one of the
    # known valid quote structures
    "\")"
  )
  tar.chr <- tar.dat$trim[tar.ind]
  cur.chr <- cur.dat$trim[cur.ind]
  tar.reg <- gregexpr(reg, tar.chr, perl=TRUE)
  cur.reg <- gregexpr(reg, cur.chr, perl=TRUE)

  tar.split <- regmatches(tar.chr, tar.reg)
  cur.split <- regmatches(cur.chr, cur.reg)

  # Collapse into one line if to do the diff across lines, but record
  # item counts so we can reconstitute the lines at the end

  tar.lens <- vapply(tar.split, length, integer(1L))
  cur.lens <- vapply(cur.split, length, integer(1L))

  tar.unsplit <- unlist(tar.split)
  cur.unsplit <- unlist(cur.split)
  if(is.null(tar.unsplit)) tar.unsplit <- character(0L)
  if(is.null(cur.unsplit)) cur.unsplit <- character(0L)

  # Remove the leading spaces we grabbed for each word

  tar.unsplit <- trimws(tar.unsplit, "left")
  cur.unsplit <- trimws(cur.unsplit, "left")

  # Run the word diff as a line diff configured in a manner compatible for the
  # word diff

  etc@line.limit <- etc@hunk.limit <- etc@context <- -1L
  etc@mode <- "context"

  diffs <- char_diff(
    tar.unsplit, cur.unsplit, etc=etc, diff.mode=diff.mode, warn=warn
  )
  # Need to figure out which elements match, and which ones do not

  hunks.flat <- diffs$hunks
  tar.mism <- unlist(
    lapply(hunks.flat, function(x) if(!x$context) x$A else integer(0L))
  )
  cur.mism <- abs(
    unlist(lapply(hunks.flat, function(x) if(!x$context) x$B else integer(0L)))
  )
  # Figure out which line each of these elements came from, and what index
  # in each of those lines they are; we use the recorded lengths in words of
  # each line to reconstruct this; also record original line length so we
  # can compute token ratios

  tar.ends <- cumsum(tar.lens)
  cur.ends <- cumsum(cur.lens)

  tar.dat$word.ind[tar.ind] <- reg_apply(tar.reg, tar.ends, tar.mism)
  cur.dat$word.ind[cur.ind] <- reg_apply(cur.reg, cur.ends, cur.mism)

  # If in wrap mode (which is really atomic mode), generate a spoofed
  # `comp` vector (see word_to_line_map)
  #
  # Note that we're only operating on a subset of the data via tar.ind and
  # cur.ind, these are supposed to be the contiguous block of lines that have
  # row headers.

  tar.dat.fin <- tar.dat
  cur.dat.fin <- cur.dat
  if(diff.mode == "wrap") {
    tar.dat.ind <- lapply(tar.dat, '[', tar.ind)
    cur.dat.ind <- lapply(cur.dat, '[', cur.ind)
    word.line.mapped <- word_to_line_map(
      hunks.flat, tar.dat.ind, cur.dat.ind, tar.ends, cur.ends
    )
    # Merge back the mapped data, need to account for possiblity of padding
    # lines being added.

    tar.len.old <- length(tar.dat[[1L]])
    cur.len.old <- length(cur.dat[[1L]])

    tar.ind.lo <- seq_len(head(tar.ind, 1L) - 1L)
    tar.ind.hi <- seq_len(tar.len.old - tail(tar.ind, 1L)) + tail(tar.ind, 1L)
    cur.ind.lo <- seq_len(head(cur.ind, 1L) - 1L)
    cur.ind.hi <- seq_len(cur.len.old - tail(cur.ind, 1L)) + tail(cur.ind, 1L)

    interleave <- function(idx, new, old, lo, hi)
      c(old[[idx]][lo], new[[idx]], old[[idx]][hi])

    tar.dat.fin <- setNames(
      lapply(
        seq_along(tar.dat), interleave,
        new=word.line.mapped[['tar.dat']], old=tar.dat,
        lo=tar.ind.lo, hi=tar.ind.hi
      ),
      names(tar.dat)
    )
    cur.dat.fin <- setNames(
      lapply(
        seq_along(cur.dat), interleave,
        new=word.line.mapped[['cur.dat']], old=cur.dat,
        lo=cur.ind.lo, hi=cur.ind.hi
      ),
      names(cur.dat)
    )
  }
  list(
    tar.dat=tar.dat.fin, cur.dat=cur.dat.fin, hit.diffs.max=diffs$hit.diffs.max
  )
}
# Make unique strings
#
# Makes gibberish strings that are 16 characters long, are unique, and don't
# overlap with `invalid`.  This allows us to generate strings we can use to
# cause a specific diff outcome.
#
# n: how long the character vector should be
# invalid: what values cannot be contained in the returned values

make_unique_strings <- function(n, invalid) {
  pool <- c(
    letters, LETTERS, 0:9, "_", ".", "*", "+", "-", "=", "(", ")", "{",
    "}", "~", "`", "!", "@", "#", "$", "%", "^", "&", ";", ":", "<", ">", "?",
    ",", "/"
  )
  cols <- 16 # use 16 character samples, should be more than big enough
  dat <- matrix("", ncol=16, nrow=n)
  rows <- 1:n
  safety <- 0
  repeat {
    dat[rows, ] <-
      matrix(sample(pool, cols * length(rows), replace=TRUE), ncol=cols)
    dat.chr <- do.call(paste0, split(dat, col(dat)))
    rows <- which(duplicated(dat.chr) | dat.chr %in% invalid)
    if(!length(rows)) break
    # nocov start
    if(safety <- safety + 1 > 100)
      stop(
        "Logic Error: unable to generate unique strings; this should be ",
        "incredibly rare as we are sampling from 10^31 elements, so try ",
        "again and if it happens again contact maintainer"
      )
    # nocov end
  }
  dat.chr
}
# Add word diff highlighting

word_color <- function(txt, inds, fun) {
  word.list <- regmatches(txt, inds)
  word.lens <- vapply(word.list, length, integer(1L))

  # remove leading space before coloring
  words.u <- if(length(word.list)) unlist(word.list) else character(0L)
  words.u.trim.ind <- regexpr("\\S.*", words.u)
  words.u.trim <- regmatches(words.u, words.u.trim.ind)

  # color and re-insert back into space
  words.c.trim <- fun(words.u.trim)
  regmatches(words.u, words.u.trim.ind) <- words.c.trim

  # split back into original lines
  words.res <- vector("list", length(word.list))
  words.res[!!word.lens] <- split(
    words.u, rep(seq_along(word.lens), times=word.lens)
  )
  words.res[!word.lens] <- list(character(0L))
  regmatches(txt, inds) <- words.res
  txt
}