File: text.R

package info (click to toggle)
r-cran-diffobj 0.3.5-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 2,432 kB
  • sloc: ansic: 455; javascript: 96; sh: 32; makefile: 8
file content (428 lines) | stat: -rwxr-xr-x 14,982 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
# Copyright (C) 2021 Brodie Gaslam
#
# This file is part of "diffobj - Diffs for R Objects"
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# Go to <https://www.r-project.org/Licenses/GPL-2> for a copy of the license.

# borrowed from crayon, will lobby to get it exported

ansi_regex <- paste0("(?:(?:\\x{001b}\\[)|\\x{009b})",
                     "(?:(?:[0-9]{1,3})?(?:(?:;[0-9]{0,3})*)?[A-M|f-m])",
                     "|\\x{001b}[A-M]")

# Function to split a character vector by newlines; handles some special cases

split_new_line <- function(x, sgr.supported) {
  y <- x
  y[!nzchar(x)] <- "\n"
  unlist(strsplit2(y, "\n", sgr.supported=sgr.supported))
}
html_ent_sub <- function(x, style) {
  if(is(style, "StyleHtml") && style@escape.html.entities) {
    x <- gsub("&", "&amp;", x, fixed=TRUE)
    x <- gsub("<", "&lt;", x, fixed=TRUE)
    x <- gsub(">", "&gt;", x, fixed=TRUE)
    x <- gsub("\n", "<br />", x, fixed=TRUE)
    # x <- gsub(" ", "&#32;", x, fixed=TRUE)
  }
  x
}
# Helper function for align_eq; splits up a vector into matched elements and
# interstitial elements, including possibly empty interstitial elements when
# two matches are abutting

align_split <- function(v, m) {
  match.len <- sum(!!m)
  res.len <- match.len * 2L + 1L
  splits <- cumsum(
    c(
      if(length(m)) 1L,
      (!!diff(m) < 0L & !tail(m, -1L)) | (head(m, -1L) & tail(m, -1L))
  ) )
  m.all <- match(m, sort(unique(m[!!m])), nomatch=0L)  # normalize
  m.all[!m.all] <- -ave(m.all, splits, FUN=max)[!m.all]
  m.all[!m.all] <- -match.len - 1L  # trailing zeros
  m.fin <- ifelse(m.all < 0, -m.all * 2 - 1, m.all * 2)
  if(any(diff(m.fin) < 0L))
    stop("Logic Error: non monotonic alignments; contact maintainer") # nocov
  res <- replicate(res.len, character(0L), simplify=FALSE)
  res[unique(m.fin)] <- unname(split(v, m.fin))
  res
}
# Align lists based on equalities on other vectors
#
# This is used for hunks that are word diffed.  Once the word differences are
# accounted for, the remaining strings (A.eq/B.eq) are compared to try to align
# them with a naive algorithm on a line basis.  This works best when lines as a
# whole are equal except for a few differences.  There can be funny situations
# where matched words are on one line in e.g. A, but spread over multiple lines
# in B.  This isn't really handled well currently.
#
# See issue #37.
#
# The A/B vecs  will be split up into matchd elements, and non-matched elements.
# Each matching element will be surrounding by (possibly empty) non-matching
# elements.
#
# Need to reconcile the padding that happens as a result of alignment as well
# as the padding that happens with atomic vectors

align_eq <- function(A, B, x, context) {
  stopifnot(
    is.integer(A), is.integer(B), !anyNA(c(A, B)),
    is(x, "Diff")
  )
  A.fill <- get_dat(x, A, "fill")
  B.fill <- get_dat(x, B, "fill")
  A.fin <- get_dat(x, A, "fin")
  B.fin <- get_dat(x, B, "fin")

  if(context) {             # Nothing to align if this is context hunk
    A.chunks <- list(A.fin)
    B.chunks <- list(B.fin)
  } else {
    etc <- x@etc
    A.eq <- get_dat(x, A, "eq")
    B.eq <- get_dat(x, B, "eq")

    # Cleanup so only relevant stuff is allowed to match

    A.tok.ratio <- get_dat(x, A, "tok.rat")
    B.tok.ratio <- get_dat(x, B, "tok.rat")

    if(etc@align@count.alnum.only) {
      A.eq.trim <- gsub("[^[:alnum:]]", "", A.eq, perl=TRUE)
      B.eq.trim <- gsub("[^[:alnum:]]", "", B.eq, perl=TRUE)
    } else {
      A.eq.trim <- A.eq
      B.eq.trim <- B.eq
    }
    # TBD whether nchar here should be ansi-aware; probably if in alnum only
    # mode...

    A.valid <- which(
      nchar2(A.eq.trim, sgr.supported=etc@sgr.supported) >= etc@align@min.chars &
      A.tok.ratio >= etc@align@threshold
    )
    B.valid <- which(
      nchar2(B.eq.trim, sgr.supported=etc@sgr.supported) >= etc@align@min.chars &
      B.tok.ratio >= etc@align@threshold
    )
    B.eq.seq <- seq_along(B.eq.trim)

    align <- integer(length(A.eq))
    min.match <- 0L

    # Need to match each element in A.eq to B.eq, though each match consumes the
    # match so we can't use `match`; unfortunately this is slow; for context
    # hunks the match is one to one for each line; also, this whole matching
    # needs to be improved (see issue #37)

    if(length(A.valid) & length(B.valid)) {
      B.max <- length(B.valid)
      B.eq.val <- B.eq.trim[B.valid]

      for(i in A.valid) {
        if(min.match >= B.max) break
        B.match <- which(
          A.eq.trim[[i]] == if(min.match)
            tail(B.eq.val, -min.match) else B.eq.val
        )
        if(length(B.match)) {
          align[[i]] <- B.valid[B.match[[1L]] + min.match]
          min.match <- B.match[[1L]] + min.match
        }
      }
    }
    # Group elements together.  We number the interstitial buckest as the
    # negative of the next match.  There are always matches together, split
    # by possibly empty interstitial elements

    align.b <- seq_along(B.eq)
    align.b[!align.b %in% align] <- 0L
    A.chunks <- align_split(A.fin, align)
    B.chunks <- align_split(B.fin, align.b)
  }
  if(length(A.chunks) != length(B.chunks))
    # nocov start
    stop("Logic Error: aligned chunks unequal length; contact maintainer.")
    # nocov end

  list(A=A.chunks, B=B.chunks, A.fill=A.fill, B.fill=B.fill)
}
# Calculate how many lines of screen space are taken up by the diff hunks
#
# `disp.width` should be the available display width, this function computes
# the net real estate account for mode, padding, etc.

nlines <- function(txt, disp.width, mode, etc) {
  # stopifnot(is.character(txt), all(!is.na(txt)))
  capt.width <- calc_width_pad(disp.width, mode)
  pmax(
    1L,
    as.integer(
      ceiling(
        nchar2(txt, sgr.supported=etc@sgr.supported
        ) / capt.width
) ) ) }
# Gets rid of tabs and carriage returns
#
# Assumes each line is one screen line
# @param stops may be a single positive integer value, or a vector of values
#   whereby the last value will be repeated as many times as necessary

strip_hz_c_int <- function(txt, stops, sgr.supported) {

  # remove trailing and leading CRs (need to record if trailing remains to add
  # back at end? no, not really since by structure next thing must be a newline

  w.chr <- nzchar(txt)  # corner case with strsplit and zero length strings
  txt <- gsub("^\r+|\r+$", "", txt)
  has.tabs <- grep("\t", txt, fixed=TRUE)
  has.crs <- grep("\r", txt, fixed=TRUE)
  txt.s <- as.list(txt)
  txt.s[has.crs] <- if(!any(has.crs)) list()
    else strsplit2(txt[has.crs], "\r+", sgr.supported=sgr.supported)

  # Assume \r resets tab stops as it would on a type writer; so now need to
  # generate the set maximum set of possible tab stops; approximate here by
  # using largest stop

  if(length(has.tabs)) {
    max.stop <- max(stops)
    width.w.tabs <- max(
      vapply(
        txt.s[has.tabs], function(x) {
          # add number of chars and number of tabs times max tab length
          sum(
            nchar2(x, sgr.supported=sgr.supported) + (
              vapply(
                strsplit2(x, "\t", sgr.supported=sgr.supported),
                length, integer(1L)
              ) +
              grepl("\t$", x) - 1L
            ) * max.stop
          )
        }, integer(1L)
    ) )
    extra.chars <- width.w.tabs - sum(stops)
    extra.stops <- ceiling(extra.chars / tail(stops, 1L))
    stop.vec <- cumsum(c(stops, rep(tail(stops, 1L), extra.stops)))

    # For each line, assess effects of tabs

    txt.s[has.tabs] <- lapply(txt.s[has.tabs],
      function(x) {
        if(length(h.t <- grep("\t", x, fixed=T))) {
          # workaround for strsplit dropping trailing tabs
          x.t <- sub("\t$", "\t\t", x[h.t])
          x.s <- strsplit2(x.t, "\t", sgr.supported=sgr.supported)

          # Now cycle through each line with tabs and replace them with
          # spaces

          res <- vapply(x.s,
            function(y) {
              topad <- head(y, -1L)
              rest <- tail(y, 1L)
              chrs <- nchar2(topad, sgr.supported=sgr.supported)
              pads <- character(length(topad))
              txt.len <- 0L
              for(i in seq_along(topad)) {
                txt.len <- chrs[i] + txt.len
                tab.stop <- head(which(stop.vec > txt.len), 1L)
                if(!length(tab.stop))
                  # nocov start
                  stop(
                    "Logic Error: failed trying to find tab stop; contact ",
                    "maintainer"
                  )
                  # nocov end
                tab.len <- stop.vec[tab.stop]
                pads[i] <- paste0(rep(" ", tab.len - txt.len), collapse="")
                txt.len <- tab.len
              }
              paste0(paste0(topad, pads, collapse=""), rest)
            },
            character(1L)
          )
          x[h.t] <- res
        }
        x
  } ) }
  # Simulate the effect of \r by collapsing every \r separated element on top
  # of each other with some special handling for ansi escape seqs

  txt.fin <- txt.s
  txt.fin[has.crs] <- vapply(
    txt.s[has.crs],
    function(x) {
      if(length(x) > 1L) {
        chrs <- nchar2(x, sgr.supported=sgr.supported)
        max.disp <- c(tail(rev(cummax(rev(chrs))), -1L), 0L)
        res <- paste0(
          rev(
            substr2(x, max.disp + 1L, chrs, sgr.supported=sgr.supported)
          ),
          collapse=""
        )
        # add back every ANSI esc sequence from last line to very end
        # to ensure that we leave in correct ANSI escaped state

        if(grepl(ansi_regex, res, perl=TRUE)) {
          res <- paste0(
            res,
            gsub(paste0(".*", ansi_regex, ".*"), "\\1", tail(x, 1L), perl=TRUE)
        ) }
        res
      } else x # nocov has.cr elements can't have length zero after split...
    },
    character(1L)
  )
  # txt.fin should only have one long char vectors as elements
  if(!length(txt.fin)) txt else {
    # handle strsplit corner case where splitting empty string
    txt.fin[!nzchar(txt)] <- ""
    unlist(txt.fin)
  }
}
#' Replace Horizontal Spacing Control Characters
#'
#' Removes tabs, newlines, and manipulates the text so that
#' it looks the same as it did with those horizontal control
#' characters embedded.  Currently carriage returns are also processed, but
#' in the future they no longer will be.  This function is used when the
#' \code{convert.hz.white.space} parameter to the
#' \code{\link[=diffPrint]{diff*}} methods is active.  The term \dQuote{strip}
#' is a misnomer that remains for legacy reasons and lazyness.
#'
#' This is an internal function with exposed documentation because it is
#' referenced in an external function's documentation.
#'
#' @keywords internal
#' @param txt character to covert
#' @param stops integer, what tab stops to use
#' @param sgr.supported logical whether the current display device supports
#'   ANSI CSI SGR.  See \code{\link[=diffPrint]{diff*}}'s \code{sgr.supported}
#'   parameter.
#' @return character, `txt` with horizontal control sequences
#'   replaced.

strip_hz_control <- function(txt, stops=8L, sgr.supported) {
  # stopifnot(
  #   is.character(txt), !anyNA(txt),
  #   is.integer(stops), length(stops) >= 1L, !anyNA(stops), all(stops > 0L)
  # )

  # for speed in case no special chars, just skip; obviously this adds a penalty
  # for other cases but it is small

  if(!any(grepl("\n|\t|\r", txt, perl=TRUE))) {
    txt
  } else {
    if(length(has.n <- grep("\n", txt, fixed=TRUE))) {
      txt.l <- as.list(txt)
      txt.l.n <- strsplit2(txt[has.n], "\n", sgr.supported=sgr.supported)
      txt.l[has.n] <- txt.l.n
      txt <- unlist(txt.l)
    }
    has.ansi <- grepl(ansi_regex, txt, perl=TRUE)
    w.ansi <- which(has.ansi)
    wo.ansi <- which(!has.ansi)

    # since for the time being the crayon funs are a bit slow, only us them on
    # strings that are known to have ansi escape sequences

    strip_hz_c_int(txt, stops, sgr.supported=sgr.supported)
  }
}
# Normalize strings so whitespace differences don't show up as differences

normalize_whitespace <- function(txt)
  gsub(" ([[:punct:]])", "\\1", gsub("(\t| )+", " ", trimws(txt)))

# Simple text manip functions

chr_trim <- function(text, width, sgr.supported) {
  stopifnot(all(width > 2L))
  ifelse(
    nchar2(text, sgr.supported=sgr.supported) > width,
    paste0(substr2(text, 1L, width - 2L, sgr.supported=sgr.supported), ".."),
    text
  )
}
rpad <- function(text, width, pad.chr=" ", sgr.supported) {
  stopifnot(is.character(pad.chr), length(pad.chr) == 1L, nchar(pad.chr) == 1L)
  pad.count <- width - nchar2(text, sgr.supported=sgr.supported)
  pad.count[pad.count < 0L] <- 0L
  pad.chrs <- vapply(
    pad.count, function(x) paste0(rep(pad.chr, x), collapse=""), character(1L)
  )
  paste0(text, pad.chrs)
}
# Breaks long character vectors into vectors of length width
#
# Right pads them to full length if requested.  Only attempt to wrap if
# longer than width since wrapping is pretty expensive
#
# Returns a list of split vectors

wrap_int <- function(txt, width, sgr.supported) {
  nchars <- nchar2(txt, sgr.supported=sgr.supported)
  res <- as.list(txt)
  too.wide <- which(nchars > width)
  res[too.wide] <- lapply(
    too.wide,
    function(i) {
      split.end <- seq(
        from=width, by=width, length.out=ceiling(nchars[[i]] / width)
      )
      split.start <- split.end - width + 1L
      substr2(
        rep(txt[[i]], length(split.start)), split.start, split.end,
        sgr.supported=sgr.supported
      )
  } )
  res
}
wrap <- function(txt, width, pad=FALSE, sgr.supported) {
  if(length(grep("\n", txt, fixed=TRUE)))
    # nocov start
    stop("Logic error: wrap input contains newlines; contact maintainer.")
    # nocov end

  # If there are ansi escape sequences, account for them; either way, create
  # a vector of character positions after which we should split our character
  # vector

  has.na <- is.na(txt)
  has.chars <- nchar2(txt, sgr.supported=sgr.supported) & !has.na
  w.chars <- which(has.chars)
  wo.chars <- which(!has.chars & !has.na)

  txt.sub <- txt[has.chars]

  # Wrap differently depending on whether contains ansi or not, exclude zero
  # length char elements

  res.l <- vector("list", length(txt))
  res.l[has.na] <- NA_character_
  res.l[wo.chars] <- ""
  res.l[w.chars] <- wrap_int(txt.sub, width, sgr.supported=sgr.supported)

  # pad if requested

  if(pad) res.l[!has.na] <- 
    lapply(res.l[!has.na], rpad, width=width, sgr.supported=sgr.supported)
  res.l
}