File: core.R

package info (click to toggle)
r-cran-diffobj 0.3.5-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 2,432 kB
  • sloc: ansic: 455; javascript: 96; sh: 32; makefile: 8
file content (791 lines) | stat: -rwxr-xr-x 28,553 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
# Copyright (C) 2021 Brodie Gaslam
#
# This file is part of "diffobj - Diffs for R Objects"
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# Go to <https://www.r-project.org/Licenses/GPL-2> for a copy of the license.

#' @include s4.R

NULL


#' Generate a character representation of Shortest Edit Sequence
#'
#' @keywords internal
#' @seealso \code{\link{ses}}
#' @param x S4 object of class \code{MyersMbaSes}
#' @param ... unused
#' @return character vector

setMethod("as.character", "MyersMbaSes",
  function(x, ...) {
    dat <- as.data.frame(x)

    # Split our data into sections that have either deletes or inserts and get
    # rid of the matches

    dat <- dat[dat$type != "Match", ]
    d.s <- split(dat, dat$section)

    # For each section, compute whether we should display, change, insert,
    # delete, or both, and based on that append to the ses string

    ses_rng <- function(off, len)
      paste0(off, if(len > 1L) paste0(",", off + len - 1L))

    vapply(
      unname(d.s),
      function(d) {
        del <- sum(d$len[d$type == "Delete"])
        ins <- sum(d$len[d$type == "Insert"])
        if(del) {
          del.first <- which(d$type == "Delete")[[1L]]
          del.off <- d$off[del.first]
        }
        if(ins) {
          ins.first <- which(d$type == "Insert")[[1L]]
          ins.off <- d$off[ins.first]
        }
        if(del && ins) {
          paste0(ses_rng(del.off, del), "c", ses_rng(ins.off, ins))
        } else if (del) {
          paste0(ses_rng(del.off, del), "d", d$last.b[[1L]])
        } else if (ins) {
          paste0(d$last.a[[1L]], "a", ses_rng(ins.off, ins))
        } else {
          stop("Logic Error: unexpected edit type; contact maintainer.") # nocov
        }
      },
      character(1L)
) } )
# Used for mapping edit actions to numbers so we can use numeric matrices
# absolutely must be used to create the @type factor in the MBA object.
#
# DO NOT CHANGE LIGHTLY; SOME CODE MIGHT RELY ON THE UNDERLYING INTEGER POSITIONS

.edit.map <- c("Match", "Insert", "Delete")

setMethod("as.matrix", "MyersMbaSes",
  function(x, row.names=NULL, optional=FALSE, ...) {
    # map del/ins/match to numbers

    len <- length(x@type)
    matches <- x@type == "Match"
    section <- cumsum(matches + c(0L, head(matches, -1L)))

    # Track what the max offset observed so far for elements of the `a` string
    # so that if we have an insert command we can get the insert position in
    # `a`

    last.a <- c(
      if(len) 0L,
      head(
        cummax(
          ifelse(x@type != "Insert", x@offset + x@length, 1L)
        ) - 1L, -1L
    ) )
    # Do same thing with `b`, complicated because the matching entries are all
    # in terms of `a`

    last.b <- c(
      if(len) 0L,
      head(cumsum(ifelse(x@type != "Delete", x@length, 0L)), -1L)
    )
    cbind(
      type=as.integer(x@type), len=x@length, off=x@offset,
      section=section, last.a=last.a, last.b = last.b
    )
} )
setMethod("as.data.frame", "MyersMbaSes",
  function(x, row.names=NULL, optional=FALSE, ...) {
    len <- length(x@type)
    mod <- c("Insert", "Delete")
    dat <- data.frame(type=x@type, len=x@length, off=x@offset)
    matches <- dat$type == "Match"
    dat$section <- cumsum(matches + c(0L, head(matches, -1L)))

    # Track what the max offset observed so far for elements of the `a` string
    # so that if we have an insert command we can get the insert position in
    # `a`

    dat$last.a <- c(
      if(nrow(dat)) 0L,
      head(
        cummax(ifelse(dat$type != "Insert", dat$off + dat$len, 1L)) - 1L, -1L
    ) )

    # Do same thing with `b`, complicated because the matching entries are all
    # in terms of `a`

    dat$last.b <- c(
      if(nrow(dat)) 0L,
      head(cumsum(ifelse(dat$type != "Delete", dat$len, 0L)), -1L)
    )
    dat
} )
#' Shortest Edit Script
#'
#' Computes shortest edit script to convert \code{a} into \code{b} by removing
#' elements from \code{a} and adding elements from \code{b}.  Intended primarily
#' for debugging or for other applications that understand that particular
#' format.  See \href{http://www.gnu.org/software/diffutils/manual/diffutils.html#Detailed-Normal}{GNU diff docs}
#' for how to interpret the symbols.
#'
#' \code{ses} will be much faster than any of the
#' \code{\link[=diffPrint]{diff*}} methods, particularly for large inputs with
#' limited numbers of differences.
#'
#' NAs are treated as the string \dQuote{NA}.  Non-character inputs are coerced
#' to character.
#'
#' \code{ses_dat} provides a semi-processed \dQuote{machine-readable} version of
#' precursor data to \code{ses} that may be useful for those desiring to use the
#' raw diff data and not the printed output of \code{diffobj}, but do not wish
#' to manually parse the \code{ses} output.  Whether it is faster than
#' \code{ses} or not depends on the ratio of matching to non-matching values as
#' \code{ses_dat} includes matching values whereas \code{ses} does not.
#' \code{ses_dat} objects have a print method that makes it easy to interpret
#' the diff, but are actually data.frames.  You can see the underlying data by
#' using \code{as.data.frame}, removing the "ses_dat" class, etc..
#'
#' @export
#' @param a character
#' @param b character
#' @param extra TRUE (default) or FALSE, whether to also return the indices in
#'   \code{a} and \code{b} the diff values are taken from.  Set to FALSE for a
#'   small performance gain.
#' @inheritParams diffPrint
#' @param warn TRUE (default) or FALSE whether to warn if we hit
#'   \code{max.diffs}.
#' @return character shortest edit script, or a machine readable version of it
#'   as a \code{ses_dat} object, which is a \code{data.frame} with columns
#'   \code{op} (factor, values \dQuote{Match}, \dQuote{Insert}, or
#'   \dQuote{Delete}), \code{val} character corresponding to the value taken
#'   from either \code{a} or \code{b}, and if \code{extra} is TRUE, integer
#'   columns \code{id.a} and \code{id.b} corresponding to the indices in
#'   \code{a} or \code{b} that \code{val} was taken from.  See Details.
#' @examples
#' a <- letters[1:6]
#' b <- c('b', 'CC', 'DD', 'd', 'f')
#' ses(a, b)
#' (dat <- ses_dat(a, b))
#' str(dat)                 # data.frame with a print method
#'
#' ## use `ses_dat` output to construct a minimal diff
#' ## color with ANSI CSI SGR
#' diff <- dat[['val']]
#' del <- dat[['op']] == 'Delete'
#' ins <- dat[['op']] == 'Insert'
#' if(any(del))
#'   diff[del] <- paste0("\033[33m- ", diff[del], "\033[m")
#' if(any(ins))
#'   diff[ins] <- paste0("\033[34m+ ", diff[ins], "\033[m")
#' if(any(!ins & !del))
#'   diff[!ins & !del] <- paste0("  ", diff[!ins & !del])
#' writeLines(diff)
#'
#' ## We can recover `a` and `b` from the data
#' identical(subset(dat, op != 'Insert', val)[[1]], a)
#' identical(subset(dat, op != 'Delete', val)[[1]], b)

ses <- function(a, b, max.diffs=gdo("max.diffs"), warn=gdo("warn")) {
  args <- ses_prep(a=a, b=b, max.diffs=max.diffs, warn=warn)
  as.character(
    diff_myers(
      args[['a']], args[['b']], max.diffs=args[['max.diffs']],
      warn=args[['warn']]
  ) )
}
#' @export
#' @rdname ses

ses_dat <- function(
  a, b, extra=TRUE, max.diffs=gdo("max.diffs"), warn=gdo("warn")
) {
  args <- ses_prep(a=a, b=b, max.diffs=max.diffs, warn=warn)
  if(!is.TF(extra)) stop("Argument `extra` must be TRUE or FALSE.")
  mba <- diff_myers(
    args[['a']], args[['b']], max.diffs=args[['max.diffs']],
    warn=args[['warn']]
  )
  # reorder so that deletes are before (lack of foresight in setting factor
  # levels...) inserts in each section

  sec <- cumsum(mba@type == 'Match')
  o <- order(sec, c(1L,3L,2L)[as.integer(mba@type)])
  type <- mba@type[o]
  len <- mba@length[o]
  off <- mba@offset[o]

  # offsets are indices in `a` for 'Match' and 'Delete', and in `b` for insert
  # see `diff_myers` for details

  id <- rep(seq_along(type), len)
  type2 <- type[id]
  off2 <- off[id]
  id2 <- sequence(len) + off2 - 1L

  use.a <- type2 %in% c('Match', 'Delete')
  use.b <- !use.a
  values <- character(length(id))
  values[use.a] <- a[id2[use.a]]
  values[use.b] <- b[id2[use.b]]
  res <- if(extra) {
    id.a <- id.b <- rep(NA_integer_, length(values))
    id.a[use.a] <- id2[use.a]
    id.b[use.b] <- id2[use.b]

    data.frame(
      op=type2, val=values, id.a=id.a, id.b=id.b, stringsAsFactors=FALSE
    )
  } else {
    data.frame(op=type2, val=values, stringsAsFactors=FALSE)
  }
  structure(res, class=c('ses_dat', class(res)))
}
#' @export

print.ses_dat <- function(x, quote=FALSE, ...) {
  op <- x[['op']]
  diff <- matrix(
    "", 3, nrow(x),
    dimnames=list(c('D:', 'M:', 'I:'), character(nrow(x)))
  )
  d <- op == 'Delete'
  m <- op == 'Match'
  i <- op == 'Insert'
  diff[1, d] <- x[['val']][d]
  diff[2, m] <- x[['val']][m]
  diff[3, i] <- x[['val']][i]
  writeLines(
    sprintf(
      "\"ses_dat\" object (Match: %d, Delete: %d, Insert: %d):",
      sum(m), sum(d), sum(i)
  ) )
  print(diff, quote=quote, ...)
  invisible(x)
}

# Internal validation fun for ses_*

ses_prep <- function(a, b, max.diffs, warn) {
  if(!is.character(a)) {
    a <- try(as.character(a))
    if(inherits(a, "try-error"))
      stop("Argument `a` is not character and could not be coerced to such")
  }
  if(!is.character(b)) {
    b <- try(as.character(b))
    if(inherits(b, "try-error"))
      stop("Argument `b` is not character and could not be coerced to such")
  }
  if(is.numeric(max.diffs)) max.diffs <- as.integer(max.diffs)
  if(!is.int.1L(max.diffs)) stop("Argument `max.diffs` must be scalar integer.")
  if(!is.TF(warn)) stop("Argument `warn` must be TRUE or FALSE.")
  if(anyNA(a)) a[is.na(a)] <- "NA"
  if(anyNA(b)) b[is.na(b)] <- "NA"
  list(a=a, b=b, max.diffs=max.diffs, warn=warn)
}
#' Diff two character vectors
#'
#' Implementation of Myer's Diff algorithm with linear space refinement
#' originally implemented by Mike B. Allen as part of
#' \href{http://www.ioplex.com/~miallen/libmba/}{libmba}
#' version 0.9.1.  This implementation is a heavily modified version of the
#' original C code and is not compatible with the \code{libmba} library.
#' The C code is simplified by using fixed size arrays instead of variable
#' ones for tracking the longest reaching paths and for recording the shortest
#' edit scripts.  Additionally all error handling and memory allocation calls
#' have been moved to the internal R functions designed to handle those things.
#' A failover result is provided in the case where max diffs allowed is
#' exceeded.  Ability to provide custom comparison functions is removed.
#'
#' The result format indicates operations required to convert \code{a} into
#' \code{b} in a precursor format to the GNU diff shortest edit script.  The
#' operations are \dQuote{Match} (do nothing), \dQuote{Insert} (insert one or
#' more values of \code{b} into \code{a}), and \dQuote{Delete} (remove one or
#' more values from \code{a}).  The \code{length} slot dictates how
#' many values to advance along, insert into, or delete from \code{a}.  The
#' \code{offset} slot changes meaning depending on the operation.  For
#' \dQuote{Match} and \dQuote{Delete}, it is the starting index of that
#' operation in \code{a}.  For \dQuote{Insert}, it is the starting index in
#' \code{b} of the values to insert into \code{a}; the index in \code{a} to
#' insert at is implicit in previous operations.
#'
#' @keywords internal
#' @param a character
#' @param b character
#' @param max.diffs integer(1L) how many differences before giving up; set to
#'   -1 to allow as many as there are up to the maximum allowed (~INT_MAX/4).
#' @param warn TRUE or FALSE, whether to warn if we hit `max.diffs`.
#' @return MyersMbaSes object
#' @useDynLib diffobj, .registration=TRUE, .fixes="DIFFOBJ_"

diff_myers <- function(a, b, max.diffs=-1L, warn=FALSE) {
  stopifnot(
    is.character(a), is.character(b), all(!is.na(c(a, b))), is.int.1L(max.diffs),
    is.TF(warn)
  )
  a <- enc2utf8(a)
  b <- enc2utf8(b)
  res <- .Call(DIFFOBJ_diffobj, a, b, max.diffs)
  res <- setNames(res, c("type", "length", "offset", "diffs"))
  types <- .edit.map
  # silly that we have to generate a factor when we have the integer vector and
  # levels...  Two unncessary hashes.
  res$type <- factor(types[res$type], levels=types)
  res$offset <- res$offset + 1L  # C 0-indexing originally
  res.s4 <- try(do.call("new", c(list("MyersMbaSes", a=a, b=b), res)))
  if(inherits(res.s4, "try-error"))
    # nocov start
    stop(
      "Logic Error: unable to instantiate shortest edit script object; contact ",
      "maintainer."
    )
    # nocov end
  if(isTRUE(warn) && res$diffs < 0) {
    warning(
      "Exceeded `max.diffs`: ", abs(res$diffs), " vs ", max.diffs, " allowed. ",
      "Diff is probably suboptimal."
    )
  }
  res.s4
}
# Print Method for Shortest Edit Path
#
# Bare bones display of shortest edit path using GNU diff conventions
#
# @param object object to display
# @return character the shortest edit path character representation, invisibly
# @rdname diffobj_s4method_doc

#' @rdname diffobj_s4method_doc

setMethod("show", "MyersMbaSes",
  function(object) {
    res <- as.character(object)
    cat(res, sep="\n")
    invisible(res)
} )

#' Summary Method for Shortest Edit Path
#'
#' Displays the data required to generate the shortest edit path for comparison
#' between two strings.
#'
#' @export
#' @keywords internal
#' @param object the \code{diff_myers} object to display
#' @param with.match logical(1L) whether to show what text the edit command
#'   refers to
#' @param ... forwarded to the data frame print method used to actually display
#'   the data
#' @return whatever the data frame print method returns

setMethod("summary", "MyersMbaSes",
  function(object, with.match=FALSE, ...) {
    what <- vapply(
      seq_along(object@type),
      function(y) {
        t <- object@type[[y]]
        o <- object@offset[[y]]
        l <- object@length[[y]]
        vec <- if(t == "Insert") object@b else object@a
        paste0(vec[o:(o + l - 1L)], collapse="")
      },
      character(1L)
    )
    res <- data.frame(
      type=object@type, string=what, len=object@length, offset=object@offset,
      stringsAsFactors=FALSE
    )
    if(!with.match) res <- res[-2L]
    print(res, ...)
} )
# mode is display mode (sidebyside, etc.)
# diff.mode is whether we are doing the first pass line diff, or doing the
#   in-hunk or word-wrap versions
# warn is to allow us to suppress warnings after first hunk warning

char_diff <- function(x, y, context=-1L, etc, diff.mode, warn) {
  stopifnot(
    diff.mode %in% c("line", "hunk", "wrap"),
    isTRUE(warn) || identical(warn, FALSE)
  )
  max.diffs <- etc@max.diffs
  # probably shouldn't generate S4, but easier...
  diff <- diff_myers(x, y, max.diffs, warn=FALSE)

  hunks <- as.hunks(diff, etc=etc)
  hit.diffs.max <- FALSE
  if(diff@diffs < 0L) {
    hit.diffs.max <- TRUE
    diff@diffs <- -diff@diffs
    diff.msg <- c(
      line="overall", hunk="in-hunk word", wrap="atomic wrap-word"
    )
    if(warn)
      warning(
        "Exceeded diff limit during diff computation (",
        diff@diffs, " vs. ", max.diffs, " allowed); ",
        diff.msg[diff.mode], " diff is likely not optimal",
        call.=FALSE
      )
  }
  # used to be a `DiffDiffs` object, but too slow

  list(hunks=hunks, hit.diffs.max=hit.diffs.max)
}
# Compute the character representation of a hunk header

make_hh <- function(h.g, mode, tar.dat, cur.dat, ranges.orig) {
  h.ids <- vapply(h.g, "[[", integer(1L), "id")
  h.head <- vapply(h.g, "[[", logical(1L), "guide")

  # exclude header hunks from contributing to range, and adjust ranges for
  # possible fill lines added to the data

  h.ids.nh <- h.ids[!h.head]

  tar.rng <- find_rng(h.ids.nh, ranges.orig[1:2, , drop=FALSE], tar.dat$fill)
  tar.rng.f <- cumsum(!tar.dat$fill)[tar.rng]

  cur.rng <- find_rng(h.ids.nh, ranges.orig[3:4, , drop=FALSE], cur.dat$fill)
  cur.rng.f <- cumsum(!cur.dat$fill)[cur.rng]

  hh.a <- paste0(rng_as_chr(tar.rng.f))
  hh.b <- paste0(rng_as_chr(cur.rng.f))

  if(mode == "sidebyside") sprintf("@@ %s @@", c(hh.a, hh.b)) else {
    sprintf("@@ %s / %s @@", hh.a, hh.b)
  }
}
# Do not allow `useBytes=TRUE` if there are any matches with `useBytes=FALSE`
#
# Clean up word.ind to avoid issues where we have mixed UTF-8 and non
# UTF-8 strings in different hunks, and gregexpr is trying to optimize
# buy using useBytes=TRUE in ASCII only strings without knowing that in a
# different hunk there are UTF-8 strings

fix_word_ind <- function(x) {
  matches <- vapply(x, function(y) length(y) > 1L || y != -1L, logical(1L))
  useBytes <- vapply(x, function(y) isTRUE(attr(y, "useBytes")), logical(1L))
  if(!all(useBytes[matches])) x <- lapply(x, `attr<-`, "useBytes", NULL)
  x
}
# Variation on `char_diff` used for the overall diff where we don't need
# to worry about overhead from creating the `Diff` object

line_diff <- function(
  target, current, tar.capt, cur.capt, context, etc, warn=TRUE, strip=TRUE
) {
  if(!is.valid.guide.fun(etc@guides))
    # nocov start
    stop(
      "Logic Error: guides are not a valid guide function; contact maintainer"
    )
    # nocov end
  etc@guide.lines <-
    make_guides(target, tar.capt, current, cur.capt, etc@guides)

  # Need to remove new lines as the processed captures do that anyway and we
  # end up with mismatched lengths if we don't

  if(any(nzchar(tar.capt)))
    tar.capt <- split_new_line(tar.capt, sgr.supported=etc@sgr.supported)
  if(any(nzchar(cur.capt)))
    cur.capt <- split_new_line(cur.capt, sgr.supported=etc@sgr.supported)

  # Some debate as to whether we want to do this first, or last.  First has
  # many benefits so that everything is consistent, width calcs can work fine,
  # etc., but only issue is that user provided trim functions might not expect
  # the transformation of the data; this needs to be documented with the trim
  # docs.

  tar.capt.p <- tar.capt
  cur.capt.p <- cur.capt
  if(etc@convert.hz.white.space) {
    tar.capt.p <- strip_hz_control(
      tar.capt, stops=etc@tab.stops, sgr.supported=etc@sgr.supported
    )
    cur.capt.p <- strip_hz_control(
      cur.capt, stops=etc@tab.stops, sgr.supported=etc@sgr.supported
    )
  }
  # Remove whitespace and CSI SGR if warranted

  if(etc@strip.sgr) {
    if(has.style.1 <- any(crayon::has_style(tar.capt.p)))
      tar.capt.p <- crayon::strip_style(tar.capt.p)
    if(has.style.2 <- any(crayon::has_style(cur.capt.p)))
      cur.capt.p <- crayon::strip_style(cur.capt.p)
    if(has.style.1 || has.style.2)
      etc@warn(
        "`target` or `current` contained ANSI CSI SGR when rendered; these ",
        "were stripped.  Use `strip.sgr=FALSE` to preserve them in the diffs."
      )
  }
  # Apply trimming to remove row heads, etc, but only if something gets trimmed
  # from both elements

  tar.trim.ind <- apply_trim(target, tar.capt.p, etc@trim)
  tar.trim <- do.call(
    substr, list(tar.capt.p, tar.trim.ind[, 1L], tar.trim.ind[, 2L])
  )
  cur.trim.ind <- apply_trim(current, cur.capt.p, etc@trim)
  cur.trim <- do.call(
    substr, list(cur.capt.p, cur.trim.ind[, 1L], cur.trim.ind[, 2L])
  )
  if(identical(tar.trim, tar.capt.p) || identical(cur.trim, cur.capt.p)) {
    # didn't trim in both, so go back to original
    tar.trim <- tar.capt.p
    tar.trim.ind <- cbind(
      rep(1L, length(tar.capt.p)),
      nchar(tar.capt.p)
    )
    cur.trim <- cur.capt.p
    cur.trim.ind <- cbind(
      rep(1L, length(cur.capt.p)),
      nchar(cur.capt.p)
    )
  }
  tar.comp <- tar.trim
  cur.comp <- cur.trim

  if(etc@ignore.white.space) {
    tar.comp <- normalize_whitespace(tar.comp)
    cur.comp <- normalize_whitespace(cur.comp)
  }
  # Word diff is done in three steps: create an empty template vector structured
  # as the result of a call to `gregexpr` without matches, if dealing with
  # compliant atomic vectors in print mode, then update with the word diff
  # matches, finally, update with in-hunk word diffs for hunks that don't have
  # any existing word diffs:

  # Set up data lists with all relevant info; need to pass to diff_word so it
  # can be modified.
  # - orig: the very original string
  # - raw: the original captured text line by line, with strip_hz applied
  # - trim: as above, but with row meta data removed
  # - trim.ind: the indices used to re-insert `trim` into `raw`
  # - comp: the strings that will have the line diffs run on, these can be
  #   modified to force a particular outcome, e.g. by word_to_line_map
  # - eq: the portion of `trim` that is equal post word-diff
  # - fin: the final character string for display to user
  # - word.ind: for use by `regmatches<-` to re-insert colored words
  # - tok.rat: for use by `align_eq` when lining up lines within hunks

  tar.dat <- list(
    orig=tar.capt, raw=tar.capt.p, trim=tar.trim,
    trim.ind.start=tar.trim.ind[, 1L], trim.ind.end=tar.trim.ind[, 2L],
    comp=tar.comp, eq=tar.comp, fin=tar.capt.p,
    fill=logical(length(tar.capt.p)),
    word.ind=replicate(length(tar.capt.p), .word.diff.atom, simplify=FALSE),
    tok.rat=rep(1, length(tar.capt.p))
  )
  cur.dat <- list(
    orig=cur.capt, raw=cur.capt.p, trim=cur.trim,
    trim.ind.start=cur.trim.ind[, 1L], trim.ind.end=cur.trim.ind[, 2L],
    comp=cur.comp, eq=cur.comp, fin=cur.capt.p,
    fill=logical(length(cur.capt.p)),
    word.ind=replicate(length(cur.capt.p), .word.diff.atom, simplify=FALSE),
    tok.rat=rep(1, length(cur.capt.p))
  )
  # Word diffs in wrapped form is atomic; note this will potentially change
  # the length of the vectors.

  tar.wrap.diff <- integer(0L)
  cur.wrap.diff <- integer(0L)
  tar.dat.w <- tar.dat
  cur.dat.w <- cur.dat

  if(
    is.atomic(target) && is.atomic(current) &&
    is.null(dim(target)) && is.null(dim(current)) &&
    length(tar.rh <- which_atomic_cont(tar.capt.p, target)) &&
    length(cur.rh <- which_atomic_cont(cur.capt.p, current)) &&
    is.null(names(target)) && is.null(names(current)) &&
    etc@unwrap.atomic && etc@word.diff
  ) {
    # For historical compatibility we allow `diffChr` to get into this step if
    # the text format is right, even though it is arguable whether it should be
    # allowed or not.

    if(!all(diff(tar.rh) == 1L) || !all(diff(cur.rh)) == 1L){
      # nocov start
      stop("Logic Error, row headers must be sequential; contact maintainer.")
      # nocov end
    }
    # Only do this for the portion of the data that actually matches up with
    # the atomic row headers.

    diff.word <- diff_word2(
      tar.dat, cur.dat, tar.ind=tar.rh, cur.ind=cur.rh,
      diff.mode="wrap", warn=warn, etc=etc
    )
    warn <- !diff.word$hit.diffs.max
    tar.dat.w <- diff.word$tar.dat
    cur.dat.w <- diff.word$cur.dat

    # Mark the lines that were wrapped diffed; necessary b/c tar/cur.rh are
    # defined even if other conditions to get in this loop are not, and also
    # because the addition of the fill lines moves everything around
    # (effectively tar/cur.wrap.diff are the fill-offset versions of tar/cur.rh)

    tar.wrap.diff <- seq_along(tar.dat.w$fill)[!tar.dat.w$fill][tar.rh]
    cur.wrap.diff <- seq_along(cur.dat.w$fill)[!cur.dat.w$fill][cur.rh]
  }
  # Actual line diff

  diffs <- char_diff(
    tar.dat.w$comp, cur.dat.w$comp, etc=etc, diff.mode="line", warn=warn
  )
  warn <- !diffs$hit.diffs.max

  hunks.flat <- diffs$hunks

  # For each of those hunks, run the word diffs and store the results in the
  # word.diffs list; bad part here is that we keep overwriting the overall
  # diff data for each hunk, which might be slow

  tar.dat.ww <- tar.dat.w
  cur.dat.ww <- cur.dat.w

  if(etc@word.diff) {
    # Word diffs on hunks, excluding all values that have already been wrap
    # diffed as in tar.rh and cur.rh / tar.wrap.diff and cur.wrap.diff

    for(h.a in hunks.flat) {
      if(h.a$context) next
      h.a.ind <- c(h.a$A, h.a$B)
      h.a.tar.ind <- setdiff(h.a.ind[h.a.ind > 0], tar.wrap.diff)
      h.a.cur.ind <- setdiff(abs(h.a.ind[h.a.ind < 0]), cur.wrap.diff)
      h.a.w.d <- diff_word2(
        tar.dat.ww, cur.dat.ww, h.a.tar.ind, h.a.cur.ind, diff.mode="hunk",
        warn=warn, etc=etc
      )
      tar.dat.ww <- h.a.w.d[['tar.dat']]
      cur.dat.ww <- h.a.w.d[['cur.dat']]
      warn <- warn || !h.a.w.d[['hit.diffs.max']]
    }
    # Compute the token ratios

    tok_ratio_compute <- function(z) vapply(
      z,
      function(y)
        if(is.null(wc <- attr(y, "word.count"))) 1
        else max(0, (wc - length(y)) / wc),
      numeric(1L)
    )
    tar.dat.ww$tok.rat <- tok_ratio_compute(tar.dat.ww$word.ind)
    cur.dat.ww$tok.rat <- tok_ratio_compute(cur.dat.ww$word.ind)

    # Deal with mixed UTF/plain strings

    tar.dat.ww$word.ind <- fix_word_ind(tar.dat.ww$word.ind)
    cur.dat.ww$word.ind <- fix_word_ind(cur.dat.ww$word.ind)

    # Remove different words to make equal strings

    tar.dat.ww$eq <- with(tar.dat.ww, `regmatches<-`(trim, word.ind, value=""))
    cur.dat.ww$eq <- with(cur.dat.ww, `regmatches<-`(trim, word.ind, value=""))
  }
  # Instantiate result

  hunk.grps.raw <- group_hunks(
    hunks.flat, etc=etc, tar.capt=tar.dat.ww$raw, cur.capt=cur.dat.ww$raw
  )
  gutter.dat <- etc@gutter
  max.w <- etc@text.width

  # Recompute line limit accounting for banner len, needed for correct trim

  etc.group <- etc
  if(etc.group@line.limit[[1L]] >= 0L) {
    etc.group@line.limit <-
      pmax(integer(2L), etc@line.limit - banner_len(etc@mode))
  }
  # Trim hunks to the extent needed to make sure we fit in lines

  hunk.grps <-
    trim_hunks(hunk.grps.raw, etc.group, tar.dat.ww$raw, cur.dat.ww$raw)
  hunks.flat <- unlist(hunk.grps, recursive=FALSE)

  # Compact to width of widest element, so retrieve all char values; also
  # need to generate all the hunk headers b/c we need to use them in width
  # computation as well; under no circumstances are hunk headers allowed to
  # wrap as they are always assumed to take one line.
  #
  # Note: this used to be done after trimming / subbing, which is technically
  # better since we might have trimmed away long rows, but we need to do it
  # here so that we can can record the new text width in the outgoing object;
  # also, logic a bit circuitous b/c this was originally done elsewhere; might
  # be faster to use tar.dat and cur.dat directly

  chr.ind <- unlist(lapply(hunks.flat, "[", c("A", "B")))
  chr.dat <- get_dat_raw(chr.ind, tar.dat.ww$raw, cur.dat.ww$raw)
  chr.size <- integer(length(chr.dat))

  ranges <- vapply(
    hunks.flat, function(h.a) c(h.a$tar.rng.trim, h.a$cur.rng.trim),
    integer(4L)
  )
  # compute ranges excluding fill lines
  rng_non_fill <- function(rng, fill) {
    if(!rng[[1L]]) rng else {
      rng.seq <- seq(rng[[1L]], rng[[2L]], by=1L)
      seq.not.fill <- rng.seq[!rng.seq %in% fill]
      if(!length(seq.not.fill)) {
        integer(2L)
      } else {
        range(seq.not.fill)
  } } }
  ranges.orig <- vapply(
    hunks.flat, function(h.a) {
      with(
        h.a, c(
          rng_non_fill(tar.rng.sub, which(tar.dat.ww$fill)),
          rng_non_fill(cur.rng.sub, which(cur.dat.ww$fill))
      ) )
    },
    integer(4L)
  )
  # We need a version of ranges that adjust for the fill lines that are counted
  # in the ranges but don't represent actual lines of output.  This does mean
  # that adjusted ranges are not necessarily contiguous

  hunk.heads <-
    lapply(hunk.grps, make_hh, etc@mode, tar.dat.ww, cur.dat.ww, ranges.orig)
  h.h.chars <- nchar2(
    chr_trim(
      unlist(hunk.heads), etc@line.width, sgr.supported=etc@sgr.supported
    ),
    sgr.supported=etc@sgr.supported
  )
  chr.size <- nchar2(chr.dat, sgr.supported=etc@sgr.supported)
  max.col.w <- max(
    max(0L, chr.size, .min.width + gutter.dat@width), h.h.chars
  )
  max.w <- if(max.col.w < max.w) max.col.w else max.w

  # future calculations should assume narrower display

  etc@text.width <- max.w
  etc@line.width <- max.w + gutter.dat@width

  new(
    "Diff", diffs=hunk.grps, target=target, current=current,
    hit.diffs.max=!warn, tar.dat=tar.dat.ww, cur.dat=cur.dat.ww, etc=etc,
    hunk.heads=hunk.heads, trim.dat=attr(hunk.grps, 'meta')
  )
}