1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791
|
# Copyright (C) 2021 Brodie Gaslam
#
# This file is part of "diffobj - Diffs for R Objects"
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# Go to <https://www.r-project.org/Licenses/GPL-2> for a copy of the license.
#' @include s4.R
NULL
#' Generate a character representation of Shortest Edit Sequence
#'
#' @keywords internal
#' @seealso \code{\link{ses}}
#' @param x S4 object of class \code{MyersMbaSes}
#' @param ... unused
#' @return character vector
setMethod("as.character", "MyersMbaSes",
function(x, ...) {
dat <- as.data.frame(x)
# Split our data into sections that have either deletes or inserts and get
# rid of the matches
dat <- dat[dat$type != "Match", ]
d.s <- split(dat, dat$section)
# For each section, compute whether we should display, change, insert,
# delete, or both, and based on that append to the ses string
ses_rng <- function(off, len)
paste0(off, if(len > 1L) paste0(",", off + len - 1L))
vapply(
unname(d.s),
function(d) {
del <- sum(d$len[d$type == "Delete"])
ins <- sum(d$len[d$type == "Insert"])
if(del) {
del.first <- which(d$type == "Delete")[[1L]]
del.off <- d$off[del.first]
}
if(ins) {
ins.first <- which(d$type == "Insert")[[1L]]
ins.off <- d$off[ins.first]
}
if(del && ins) {
paste0(ses_rng(del.off, del), "c", ses_rng(ins.off, ins))
} else if (del) {
paste0(ses_rng(del.off, del), "d", d$last.b[[1L]])
} else if (ins) {
paste0(d$last.a[[1L]], "a", ses_rng(ins.off, ins))
} else {
stop("Logic Error: unexpected edit type; contact maintainer.") # nocov
}
},
character(1L)
) } )
# Used for mapping edit actions to numbers so we can use numeric matrices
# absolutely must be used to create the @type factor in the MBA object.
#
# DO NOT CHANGE LIGHTLY; SOME CODE MIGHT RELY ON THE UNDERLYING INTEGER POSITIONS
.edit.map <- c("Match", "Insert", "Delete")
setMethod("as.matrix", "MyersMbaSes",
function(x, row.names=NULL, optional=FALSE, ...) {
# map del/ins/match to numbers
len <- length(x@type)
matches <- x@type == "Match"
section <- cumsum(matches + c(0L, head(matches, -1L)))
# Track what the max offset observed so far for elements of the `a` string
# so that if we have an insert command we can get the insert position in
# `a`
last.a <- c(
if(len) 0L,
head(
cummax(
ifelse(x@type != "Insert", x@offset + x@length, 1L)
) - 1L, -1L
) )
# Do same thing with `b`, complicated because the matching entries are all
# in terms of `a`
last.b <- c(
if(len) 0L,
head(cumsum(ifelse(x@type != "Delete", x@length, 0L)), -1L)
)
cbind(
type=as.integer(x@type), len=x@length, off=x@offset,
section=section, last.a=last.a, last.b = last.b
)
} )
setMethod("as.data.frame", "MyersMbaSes",
function(x, row.names=NULL, optional=FALSE, ...) {
len <- length(x@type)
mod <- c("Insert", "Delete")
dat <- data.frame(type=x@type, len=x@length, off=x@offset)
matches <- dat$type == "Match"
dat$section <- cumsum(matches + c(0L, head(matches, -1L)))
# Track what the max offset observed so far for elements of the `a` string
# so that if we have an insert command we can get the insert position in
# `a`
dat$last.a <- c(
if(nrow(dat)) 0L,
head(
cummax(ifelse(dat$type != "Insert", dat$off + dat$len, 1L)) - 1L, -1L
) )
# Do same thing with `b`, complicated because the matching entries are all
# in terms of `a`
dat$last.b <- c(
if(nrow(dat)) 0L,
head(cumsum(ifelse(dat$type != "Delete", dat$len, 0L)), -1L)
)
dat
} )
#' Shortest Edit Script
#'
#' Computes shortest edit script to convert \code{a} into \code{b} by removing
#' elements from \code{a} and adding elements from \code{b}. Intended primarily
#' for debugging or for other applications that understand that particular
#' format. See \href{http://www.gnu.org/software/diffutils/manual/diffutils.html#Detailed-Normal}{GNU diff docs}
#' for how to interpret the symbols.
#'
#' \code{ses} will be much faster than any of the
#' \code{\link[=diffPrint]{diff*}} methods, particularly for large inputs with
#' limited numbers of differences.
#'
#' NAs are treated as the string \dQuote{NA}. Non-character inputs are coerced
#' to character.
#'
#' \code{ses_dat} provides a semi-processed \dQuote{machine-readable} version of
#' precursor data to \code{ses} that may be useful for those desiring to use the
#' raw diff data and not the printed output of \code{diffobj}, but do not wish
#' to manually parse the \code{ses} output. Whether it is faster than
#' \code{ses} or not depends on the ratio of matching to non-matching values as
#' \code{ses_dat} includes matching values whereas \code{ses} does not.
#' \code{ses_dat} objects have a print method that makes it easy to interpret
#' the diff, but are actually data.frames. You can see the underlying data by
#' using \code{as.data.frame}, removing the "ses_dat" class, etc..
#'
#' @export
#' @param a character
#' @param b character
#' @param extra TRUE (default) or FALSE, whether to also return the indices in
#' \code{a} and \code{b} the diff values are taken from. Set to FALSE for a
#' small performance gain.
#' @inheritParams diffPrint
#' @param warn TRUE (default) or FALSE whether to warn if we hit
#' \code{max.diffs}.
#' @return character shortest edit script, or a machine readable version of it
#' as a \code{ses_dat} object, which is a \code{data.frame} with columns
#' \code{op} (factor, values \dQuote{Match}, \dQuote{Insert}, or
#' \dQuote{Delete}), \code{val} character corresponding to the value taken
#' from either \code{a} or \code{b}, and if \code{extra} is TRUE, integer
#' columns \code{id.a} and \code{id.b} corresponding to the indices in
#' \code{a} or \code{b} that \code{val} was taken from. See Details.
#' @examples
#' a <- letters[1:6]
#' b <- c('b', 'CC', 'DD', 'd', 'f')
#' ses(a, b)
#' (dat <- ses_dat(a, b))
#' str(dat) # data.frame with a print method
#'
#' ## use `ses_dat` output to construct a minimal diff
#' ## color with ANSI CSI SGR
#' diff <- dat[['val']]
#' del <- dat[['op']] == 'Delete'
#' ins <- dat[['op']] == 'Insert'
#' if(any(del))
#' diff[del] <- paste0("\033[33m- ", diff[del], "\033[m")
#' if(any(ins))
#' diff[ins] <- paste0("\033[34m+ ", diff[ins], "\033[m")
#' if(any(!ins & !del))
#' diff[!ins & !del] <- paste0(" ", diff[!ins & !del])
#' writeLines(diff)
#'
#' ## We can recover `a` and `b` from the data
#' identical(subset(dat, op != 'Insert', val)[[1]], a)
#' identical(subset(dat, op != 'Delete', val)[[1]], b)
ses <- function(a, b, max.diffs=gdo("max.diffs"), warn=gdo("warn")) {
args <- ses_prep(a=a, b=b, max.diffs=max.diffs, warn=warn)
as.character(
diff_myers(
args[['a']], args[['b']], max.diffs=args[['max.diffs']],
warn=args[['warn']]
) )
}
#' @export
#' @rdname ses
ses_dat <- function(
a, b, extra=TRUE, max.diffs=gdo("max.diffs"), warn=gdo("warn")
) {
args <- ses_prep(a=a, b=b, max.diffs=max.diffs, warn=warn)
if(!is.TF(extra)) stop("Argument `extra` must be TRUE or FALSE.")
mba <- diff_myers(
args[['a']], args[['b']], max.diffs=args[['max.diffs']],
warn=args[['warn']]
)
# reorder so that deletes are before (lack of foresight in setting factor
# levels...) inserts in each section
sec <- cumsum(mba@type == 'Match')
o <- order(sec, c(1L,3L,2L)[as.integer(mba@type)])
type <- mba@type[o]
len <- mba@length[o]
off <- mba@offset[o]
# offsets are indices in `a` for 'Match' and 'Delete', and in `b` for insert
# see `diff_myers` for details
id <- rep(seq_along(type), len)
type2 <- type[id]
off2 <- off[id]
id2 <- sequence(len) + off2 - 1L
use.a <- type2 %in% c('Match', 'Delete')
use.b <- !use.a
values <- character(length(id))
values[use.a] <- a[id2[use.a]]
values[use.b] <- b[id2[use.b]]
res <- if(extra) {
id.a <- id.b <- rep(NA_integer_, length(values))
id.a[use.a] <- id2[use.a]
id.b[use.b] <- id2[use.b]
data.frame(
op=type2, val=values, id.a=id.a, id.b=id.b, stringsAsFactors=FALSE
)
} else {
data.frame(op=type2, val=values, stringsAsFactors=FALSE)
}
structure(res, class=c('ses_dat', class(res)))
}
#' @export
print.ses_dat <- function(x, quote=FALSE, ...) {
op <- x[['op']]
diff <- matrix(
"", 3, nrow(x),
dimnames=list(c('D:', 'M:', 'I:'), character(nrow(x)))
)
d <- op == 'Delete'
m <- op == 'Match'
i <- op == 'Insert'
diff[1, d] <- x[['val']][d]
diff[2, m] <- x[['val']][m]
diff[3, i] <- x[['val']][i]
writeLines(
sprintf(
"\"ses_dat\" object (Match: %d, Delete: %d, Insert: %d):",
sum(m), sum(d), sum(i)
) )
print(diff, quote=quote, ...)
invisible(x)
}
# Internal validation fun for ses_*
ses_prep <- function(a, b, max.diffs, warn) {
if(!is.character(a)) {
a <- try(as.character(a))
if(inherits(a, "try-error"))
stop("Argument `a` is not character and could not be coerced to such")
}
if(!is.character(b)) {
b <- try(as.character(b))
if(inherits(b, "try-error"))
stop("Argument `b` is not character and could not be coerced to such")
}
if(is.numeric(max.diffs)) max.diffs <- as.integer(max.diffs)
if(!is.int.1L(max.diffs)) stop("Argument `max.diffs` must be scalar integer.")
if(!is.TF(warn)) stop("Argument `warn` must be TRUE or FALSE.")
if(anyNA(a)) a[is.na(a)] <- "NA"
if(anyNA(b)) b[is.na(b)] <- "NA"
list(a=a, b=b, max.diffs=max.diffs, warn=warn)
}
#' Diff two character vectors
#'
#' Implementation of Myer's Diff algorithm with linear space refinement
#' originally implemented by Mike B. Allen as part of
#' \href{http://www.ioplex.com/~miallen/libmba/}{libmba}
#' version 0.9.1. This implementation is a heavily modified version of the
#' original C code and is not compatible with the \code{libmba} library.
#' The C code is simplified by using fixed size arrays instead of variable
#' ones for tracking the longest reaching paths and for recording the shortest
#' edit scripts. Additionally all error handling and memory allocation calls
#' have been moved to the internal R functions designed to handle those things.
#' A failover result is provided in the case where max diffs allowed is
#' exceeded. Ability to provide custom comparison functions is removed.
#'
#' The result format indicates operations required to convert \code{a} into
#' \code{b} in a precursor format to the GNU diff shortest edit script. The
#' operations are \dQuote{Match} (do nothing), \dQuote{Insert} (insert one or
#' more values of \code{b} into \code{a}), and \dQuote{Delete} (remove one or
#' more values from \code{a}). The \code{length} slot dictates how
#' many values to advance along, insert into, or delete from \code{a}. The
#' \code{offset} slot changes meaning depending on the operation. For
#' \dQuote{Match} and \dQuote{Delete}, it is the starting index of that
#' operation in \code{a}. For \dQuote{Insert}, it is the starting index in
#' \code{b} of the values to insert into \code{a}; the index in \code{a} to
#' insert at is implicit in previous operations.
#'
#' @keywords internal
#' @param a character
#' @param b character
#' @param max.diffs integer(1L) how many differences before giving up; set to
#' -1 to allow as many as there are up to the maximum allowed (~INT_MAX/4).
#' @param warn TRUE or FALSE, whether to warn if we hit `max.diffs`.
#' @return MyersMbaSes object
#' @useDynLib diffobj, .registration=TRUE, .fixes="DIFFOBJ_"
diff_myers <- function(a, b, max.diffs=-1L, warn=FALSE) {
stopifnot(
is.character(a), is.character(b), all(!is.na(c(a, b))), is.int.1L(max.diffs),
is.TF(warn)
)
a <- enc2utf8(a)
b <- enc2utf8(b)
res <- .Call(DIFFOBJ_diffobj, a, b, max.diffs)
res <- setNames(res, c("type", "length", "offset", "diffs"))
types <- .edit.map
# silly that we have to generate a factor when we have the integer vector and
# levels... Two unncessary hashes.
res$type <- factor(types[res$type], levels=types)
res$offset <- res$offset + 1L # C 0-indexing originally
res.s4 <- try(do.call("new", c(list("MyersMbaSes", a=a, b=b), res)))
if(inherits(res.s4, "try-error"))
# nocov start
stop(
"Logic Error: unable to instantiate shortest edit script object; contact ",
"maintainer."
)
# nocov end
if(isTRUE(warn) && res$diffs < 0) {
warning(
"Exceeded `max.diffs`: ", abs(res$diffs), " vs ", max.diffs, " allowed. ",
"Diff is probably suboptimal."
)
}
res.s4
}
# Print Method for Shortest Edit Path
#
# Bare bones display of shortest edit path using GNU diff conventions
#
# @param object object to display
# @return character the shortest edit path character representation, invisibly
# @rdname diffobj_s4method_doc
#' @rdname diffobj_s4method_doc
setMethod("show", "MyersMbaSes",
function(object) {
res <- as.character(object)
cat(res, sep="\n")
invisible(res)
} )
#' Summary Method for Shortest Edit Path
#'
#' Displays the data required to generate the shortest edit path for comparison
#' between two strings.
#'
#' @export
#' @keywords internal
#' @param object the \code{diff_myers} object to display
#' @param with.match logical(1L) whether to show what text the edit command
#' refers to
#' @param ... forwarded to the data frame print method used to actually display
#' the data
#' @return whatever the data frame print method returns
setMethod("summary", "MyersMbaSes",
function(object, with.match=FALSE, ...) {
what <- vapply(
seq_along(object@type),
function(y) {
t <- object@type[[y]]
o <- object@offset[[y]]
l <- object@length[[y]]
vec <- if(t == "Insert") object@b else object@a
paste0(vec[o:(o + l - 1L)], collapse="")
},
character(1L)
)
res <- data.frame(
type=object@type, string=what, len=object@length, offset=object@offset,
stringsAsFactors=FALSE
)
if(!with.match) res <- res[-2L]
print(res, ...)
} )
# mode is display mode (sidebyside, etc.)
# diff.mode is whether we are doing the first pass line diff, or doing the
# in-hunk or word-wrap versions
# warn is to allow us to suppress warnings after first hunk warning
char_diff <- function(x, y, context=-1L, etc, diff.mode, warn) {
stopifnot(
diff.mode %in% c("line", "hunk", "wrap"),
isTRUE(warn) || identical(warn, FALSE)
)
max.diffs <- etc@max.diffs
# probably shouldn't generate S4, but easier...
diff <- diff_myers(x, y, max.diffs, warn=FALSE)
hunks <- as.hunks(diff, etc=etc)
hit.diffs.max <- FALSE
if(diff@diffs < 0L) {
hit.diffs.max <- TRUE
diff@diffs <- -diff@diffs
diff.msg <- c(
line="overall", hunk="in-hunk word", wrap="atomic wrap-word"
)
if(warn)
warning(
"Exceeded diff limit during diff computation (",
diff@diffs, " vs. ", max.diffs, " allowed); ",
diff.msg[diff.mode], " diff is likely not optimal",
call.=FALSE
)
}
# used to be a `DiffDiffs` object, but too slow
list(hunks=hunks, hit.diffs.max=hit.diffs.max)
}
# Compute the character representation of a hunk header
make_hh <- function(h.g, mode, tar.dat, cur.dat, ranges.orig) {
h.ids <- vapply(h.g, "[[", integer(1L), "id")
h.head <- vapply(h.g, "[[", logical(1L), "guide")
# exclude header hunks from contributing to range, and adjust ranges for
# possible fill lines added to the data
h.ids.nh <- h.ids[!h.head]
tar.rng <- find_rng(h.ids.nh, ranges.orig[1:2, , drop=FALSE], tar.dat$fill)
tar.rng.f <- cumsum(!tar.dat$fill)[tar.rng]
cur.rng <- find_rng(h.ids.nh, ranges.orig[3:4, , drop=FALSE], cur.dat$fill)
cur.rng.f <- cumsum(!cur.dat$fill)[cur.rng]
hh.a <- paste0(rng_as_chr(tar.rng.f))
hh.b <- paste0(rng_as_chr(cur.rng.f))
if(mode == "sidebyside") sprintf("@@ %s @@", c(hh.a, hh.b)) else {
sprintf("@@ %s / %s @@", hh.a, hh.b)
}
}
# Do not allow `useBytes=TRUE` if there are any matches with `useBytes=FALSE`
#
# Clean up word.ind to avoid issues where we have mixed UTF-8 and non
# UTF-8 strings in different hunks, and gregexpr is trying to optimize
# buy using useBytes=TRUE in ASCII only strings without knowing that in a
# different hunk there are UTF-8 strings
fix_word_ind <- function(x) {
matches <- vapply(x, function(y) length(y) > 1L || y != -1L, logical(1L))
useBytes <- vapply(x, function(y) isTRUE(attr(y, "useBytes")), logical(1L))
if(!all(useBytes[matches])) x <- lapply(x, `attr<-`, "useBytes", NULL)
x
}
# Variation on `char_diff` used for the overall diff where we don't need
# to worry about overhead from creating the `Diff` object
line_diff <- function(
target, current, tar.capt, cur.capt, context, etc, warn=TRUE, strip=TRUE
) {
if(!is.valid.guide.fun(etc@guides))
# nocov start
stop(
"Logic Error: guides are not a valid guide function; contact maintainer"
)
# nocov end
etc@guide.lines <-
make_guides(target, tar.capt, current, cur.capt, etc@guides)
# Need to remove new lines as the processed captures do that anyway and we
# end up with mismatched lengths if we don't
if(any(nzchar(tar.capt)))
tar.capt <- split_new_line(tar.capt, sgr.supported=etc@sgr.supported)
if(any(nzchar(cur.capt)))
cur.capt <- split_new_line(cur.capt, sgr.supported=etc@sgr.supported)
# Some debate as to whether we want to do this first, or last. First has
# many benefits so that everything is consistent, width calcs can work fine,
# etc., but only issue is that user provided trim functions might not expect
# the transformation of the data; this needs to be documented with the trim
# docs.
tar.capt.p <- tar.capt
cur.capt.p <- cur.capt
if(etc@convert.hz.white.space) {
tar.capt.p <- strip_hz_control(
tar.capt, stops=etc@tab.stops, sgr.supported=etc@sgr.supported
)
cur.capt.p <- strip_hz_control(
cur.capt, stops=etc@tab.stops, sgr.supported=etc@sgr.supported
)
}
# Remove whitespace and CSI SGR if warranted
if(etc@strip.sgr) {
if(has.style.1 <- any(crayon::has_style(tar.capt.p)))
tar.capt.p <- crayon::strip_style(tar.capt.p)
if(has.style.2 <- any(crayon::has_style(cur.capt.p)))
cur.capt.p <- crayon::strip_style(cur.capt.p)
if(has.style.1 || has.style.2)
etc@warn(
"`target` or `current` contained ANSI CSI SGR when rendered; these ",
"were stripped. Use `strip.sgr=FALSE` to preserve them in the diffs."
)
}
# Apply trimming to remove row heads, etc, but only if something gets trimmed
# from both elements
tar.trim.ind <- apply_trim(target, tar.capt.p, etc@trim)
tar.trim <- do.call(
substr, list(tar.capt.p, tar.trim.ind[, 1L], tar.trim.ind[, 2L])
)
cur.trim.ind <- apply_trim(current, cur.capt.p, etc@trim)
cur.trim <- do.call(
substr, list(cur.capt.p, cur.trim.ind[, 1L], cur.trim.ind[, 2L])
)
if(identical(tar.trim, tar.capt.p) || identical(cur.trim, cur.capt.p)) {
# didn't trim in both, so go back to original
tar.trim <- tar.capt.p
tar.trim.ind <- cbind(
rep(1L, length(tar.capt.p)),
nchar(tar.capt.p)
)
cur.trim <- cur.capt.p
cur.trim.ind <- cbind(
rep(1L, length(cur.capt.p)),
nchar(cur.capt.p)
)
}
tar.comp <- tar.trim
cur.comp <- cur.trim
if(etc@ignore.white.space) {
tar.comp <- normalize_whitespace(tar.comp)
cur.comp <- normalize_whitespace(cur.comp)
}
# Word diff is done in three steps: create an empty template vector structured
# as the result of a call to `gregexpr` without matches, if dealing with
# compliant atomic vectors in print mode, then update with the word diff
# matches, finally, update with in-hunk word diffs for hunks that don't have
# any existing word diffs:
# Set up data lists with all relevant info; need to pass to diff_word so it
# can be modified.
# - orig: the very original string
# - raw: the original captured text line by line, with strip_hz applied
# - trim: as above, but with row meta data removed
# - trim.ind: the indices used to re-insert `trim` into `raw`
# - comp: the strings that will have the line diffs run on, these can be
# modified to force a particular outcome, e.g. by word_to_line_map
# - eq: the portion of `trim` that is equal post word-diff
# - fin: the final character string for display to user
# - word.ind: for use by `regmatches<-` to re-insert colored words
# - tok.rat: for use by `align_eq` when lining up lines within hunks
tar.dat <- list(
orig=tar.capt, raw=tar.capt.p, trim=tar.trim,
trim.ind.start=tar.trim.ind[, 1L], trim.ind.end=tar.trim.ind[, 2L],
comp=tar.comp, eq=tar.comp, fin=tar.capt.p,
fill=logical(length(tar.capt.p)),
word.ind=replicate(length(tar.capt.p), .word.diff.atom, simplify=FALSE),
tok.rat=rep(1, length(tar.capt.p))
)
cur.dat <- list(
orig=cur.capt, raw=cur.capt.p, trim=cur.trim,
trim.ind.start=cur.trim.ind[, 1L], trim.ind.end=cur.trim.ind[, 2L],
comp=cur.comp, eq=cur.comp, fin=cur.capt.p,
fill=logical(length(cur.capt.p)),
word.ind=replicate(length(cur.capt.p), .word.diff.atom, simplify=FALSE),
tok.rat=rep(1, length(cur.capt.p))
)
# Word diffs in wrapped form is atomic; note this will potentially change
# the length of the vectors.
tar.wrap.diff <- integer(0L)
cur.wrap.diff <- integer(0L)
tar.dat.w <- tar.dat
cur.dat.w <- cur.dat
if(
is.atomic(target) && is.atomic(current) &&
is.null(dim(target)) && is.null(dim(current)) &&
length(tar.rh <- which_atomic_cont(tar.capt.p, target)) &&
length(cur.rh <- which_atomic_cont(cur.capt.p, current)) &&
is.null(names(target)) && is.null(names(current)) &&
etc@unwrap.atomic && etc@word.diff
) {
# For historical compatibility we allow `diffChr` to get into this step if
# the text format is right, even though it is arguable whether it should be
# allowed or not.
if(!all(diff(tar.rh) == 1L) || !all(diff(cur.rh)) == 1L){
# nocov start
stop("Logic Error, row headers must be sequential; contact maintainer.")
# nocov end
}
# Only do this for the portion of the data that actually matches up with
# the atomic row headers.
diff.word <- diff_word2(
tar.dat, cur.dat, tar.ind=tar.rh, cur.ind=cur.rh,
diff.mode="wrap", warn=warn, etc=etc
)
warn <- !diff.word$hit.diffs.max
tar.dat.w <- diff.word$tar.dat
cur.dat.w <- diff.word$cur.dat
# Mark the lines that were wrapped diffed; necessary b/c tar/cur.rh are
# defined even if other conditions to get in this loop are not, and also
# because the addition of the fill lines moves everything around
# (effectively tar/cur.wrap.diff are the fill-offset versions of tar/cur.rh)
tar.wrap.diff <- seq_along(tar.dat.w$fill)[!tar.dat.w$fill][tar.rh]
cur.wrap.diff <- seq_along(cur.dat.w$fill)[!cur.dat.w$fill][cur.rh]
}
# Actual line diff
diffs <- char_diff(
tar.dat.w$comp, cur.dat.w$comp, etc=etc, diff.mode="line", warn=warn
)
warn <- !diffs$hit.diffs.max
hunks.flat <- diffs$hunks
# For each of those hunks, run the word diffs and store the results in the
# word.diffs list; bad part here is that we keep overwriting the overall
# diff data for each hunk, which might be slow
tar.dat.ww <- tar.dat.w
cur.dat.ww <- cur.dat.w
if(etc@word.diff) {
# Word diffs on hunks, excluding all values that have already been wrap
# diffed as in tar.rh and cur.rh / tar.wrap.diff and cur.wrap.diff
for(h.a in hunks.flat) {
if(h.a$context) next
h.a.ind <- c(h.a$A, h.a$B)
h.a.tar.ind <- setdiff(h.a.ind[h.a.ind > 0], tar.wrap.diff)
h.a.cur.ind <- setdiff(abs(h.a.ind[h.a.ind < 0]), cur.wrap.diff)
h.a.w.d <- diff_word2(
tar.dat.ww, cur.dat.ww, h.a.tar.ind, h.a.cur.ind, diff.mode="hunk",
warn=warn, etc=etc
)
tar.dat.ww <- h.a.w.d[['tar.dat']]
cur.dat.ww <- h.a.w.d[['cur.dat']]
warn <- warn || !h.a.w.d[['hit.diffs.max']]
}
# Compute the token ratios
tok_ratio_compute <- function(z) vapply(
z,
function(y)
if(is.null(wc <- attr(y, "word.count"))) 1
else max(0, (wc - length(y)) / wc),
numeric(1L)
)
tar.dat.ww$tok.rat <- tok_ratio_compute(tar.dat.ww$word.ind)
cur.dat.ww$tok.rat <- tok_ratio_compute(cur.dat.ww$word.ind)
# Deal with mixed UTF/plain strings
tar.dat.ww$word.ind <- fix_word_ind(tar.dat.ww$word.ind)
cur.dat.ww$word.ind <- fix_word_ind(cur.dat.ww$word.ind)
# Remove different words to make equal strings
tar.dat.ww$eq <- with(tar.dat.ww, `regmatches<-`(trim, word.ind, value=""))
cur.dat.ww$eq <- with(cur.dat.ww, `regmatches<-`(trim, word.ind, value=""))
}
# Instantiate result
hunk.grps.raw <- group_hunks(
hunks.flat, etc=etc, tar.capt=tar.dat.ww$raw, cur.capt=cur.dat.ww$raw
)
gutter.dat <- etc@gutter
max.w <- etc@text.width
# Recompute line limit accounting for banner len, needed for correct trim
etc.group <- etc
if(etc.group@line.limit[[1L]] >= 0L) {
etc.group@line.limit <-
pmax(integer(2L), etc@line.limit - banner_len(etc@mode))
}
# Trim hunks to the extent needed to make sure we fit in lines
hunk.grps <-
trim_hunks(hunk.grps.raw, etc.group, tar.dat.ww$raw, cur.dat.ww$raw)
hunks.flat <- unlist(hunk.grps, recursive=FALSE)
# Compact to width of widest element, so retrieve all char values; also
# need to generate all the hunk headers b/c we need to use them in width
# computation as well; under no circumstances are hunk headers allowed to
# wrap as they are always assumed to take one line.
#
# Note: this used to be done after trimming / subbing, which is technically
# better since we might have trimmed away long rows, but we need to do it
# here so that we can can record the new text width in the outgoing object;
# also, logic a bit circuitous b/c this was originally done elsewhere; might
# be faster to use tar.dat and cur.dat directly
chr.ind <- unlist(lapply(hunks.flat, "[", c("A", "B")))
chr.dat <- get_dat_raw(chr.ind, tar.dat.ww$raw, cur.dat.ww$raw)
chr.size <- integer(length(chr.dat))
ranges <- vapply(
hunks.flat, function(h.a) c(h.a$tar.rng.trim, h.a$cur.rng.trim),
integer(4L)
)
# compute ranges excluding fill lines
rng_non_fill <- function(rng, fill) {
if(!rng[[1L]]) rng else {
rng.seq <- seq(rng[[1L]], rng[[2L]], by=1L)
seq.not.fill <- rng.seq[!rng.seq %in% fill]
if(!length(seq.not.fill)) {
integer(2L)
} else {
range(seq.not.fill)
} } }
ranges.orig <- vapply(
hunks.flat, function(h.a) {
with(
h.a, c(
rng_non_fill(tar.rng.sub, which(tar.dat.ww$fill)),
rng_non_fill(cur.rng.sub, which(cur.dat.ww$fill))
) )
},
integer(4L)
)
# We need a version of ranges that adjust for the fill lines that are counted
# in the ranges but don't represent actual lines of output. This does mean
# that adjusted ranges are not necessarily contiguous
hunk.heads <-
lapply(hunk.grps, make_hh, etc@mode, tar.dat.ww, cur.dat.ww, ranges.orig)
h.h.chars <- nchar2(
chr_trim(
unlist(hunk.heads), etc@line.width, sgr.supported=etc@sgr.supported
),
sgr.supported=etc@sgr.supported
)
chr.size <- nchar2(chr.dat, sgr.supported=etc@sgr.supported)
max.col.w <- max(
max(0L, chr.size, .min.width + gutter.dat@width), h.h.chars
)
max.w <- if(max.col.w < max.w) max.col.w else max.w
# future calculations should assume narrower display
etc@text.width <- max.w
etc@line.width <- max.w + gutter.dat@width
new(
"Diff", diffs=hunk.grps, target=target, current=current,
hit.diffs.max=!warn, tar.dat=tar.dat.ww, cur.dat=cur.dat.ww, etc=etc,
hunk.heads=hunk.heads, trim.dat=attr(hunk.grps, 'meta')
)
}
|