1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175
|
# Copyright (C) 2021 Brodie Gaslam
#
# This file is part of "diffobj - Diffs for R Objects"
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# Go to <https://www.r-project.org/Licenses/GPL-2> for a copy of the license.
# These are deprecated legacy functions from before we incorporated the
# libmba versions of the myers algo
# Alternate implementation of Myers algorithm in R, without linear space
# modification. Included here mostly for reference purposes and not intended
# for use since the MBA myers implemenation should be far superior
myers_simple <- function(target, current) {
path <- myers_simple_int(target, current)
diff_path_to_diff(path, target, current)
}
myers_simple_int <- function(A, B) {
N <- length(A)
M <- length(B)
MAX <- M + N + 1L
OFF <- MAX + 1L # offset to adjust to R indexing
Vl <- vector("list", MAX)
for(D in seq_len(MAX) - 1L) {
Vl[[D + 1L]] <- if(!D) integer(2L * MAX + 1L) else Vl[[D]]
for(k in seq(-D, D, by=2L)) {
# not sure of precendence for || vs &&
# k == -D means x == 0
V <- Vl[[D + 1L]]
if(k == -D || (k != D && V[k - 1L + OFF] < V[k + 1L + OFF])) {
x <- V[k + 1L + OFF]
} else {
x <- V[k - 1L + OFF] + 1L
}
y <- x - k
# Move on diagonal
while (x < N && y < M && A[x + 1L] == B[y + 1L]) {
x <- x + 1L
y <- y + 1L
}
# Record last match or end; if a mismatch no longer increment
Vl[[D + 1L]][k + OFF] <- x
if(x >= N && y >= M) {
# Create matrix to hold entire result path; should be longest of
# A and B plus recorded differences
path.len <- D + max(N, M)
res <- matrix(integer(1L), nrow=path.len, ncol=2)
res[path.len, ] <- c(x, y)
path.len <- path.len - 1L
for(d in rev(seq_len(D))) {
Vp <- Vl[[d]]
break.out <- FALSE
repeat {
# can't match to zero since that is the initialized value
shift.up <- Vp[k + 1L + OFF] == x && x
shift.left <- Vp[k - 1L + OFF] == x - 1L && x > 1L
if(x <= 0L && y <= 0L) {
break
} else if(!shift.up && !shift.left) {
# must be on snake or about to hit 0,0
x <- max(x - 1L, 0L)
y <- max(y - 1L, 0L)
} else {
if(shift.up) {
y <- y - 1L
k <- k + 1L
} else {
x <- x - 1L
k <- k - 1L
}
break.out <- TRUE
}
res[path.len, ] <- c(x, y)
path.len <- path.len - 1L
if(break.out) break
}
}
if(any(res < 0L)) {
# nocov start
stop(
"Logic Error: diff generated illegal coords; contact maintainer."
)
# nocov end
}
return(res)
}
}
}
stop("Logic Error, should not get here") # nocov
}
# Translates a diff path produced by the simple Myers Algorithm into the
# standard format we use in the rest of the package
diff_path_to_diff <- function(path, target, current) {
stopifnot(
is.character(target), is.character(current),
is.matrix(path), is.integer(path), ncol(path) == 2,
all(path[, 1L] %in% c(0L, seq_along(target))),
all(path[, 2L] %in% c(0L, seq_along(current)))
)
# Path specifies 0s as well as duplicate coordinates, which we don't use
# in our other formats. For dupes, find first value for each index that is
# lined up with a real value in the other column
get_dupe <- function(x) {
base <- !logical(length(x))
if(!length(y <- which(x != 0L)))
base[[1L]] <- FALSE else base[[min(y)]] <- FALSE
base
}
cur.dup <- as.logical(ave(path[, 1L], path[, 2L], FUN=get_dupe))
tar.dup <- as.logical(ave(path[, 2L], path[, 1L], FUN=get_dupe))
path[!path] <- NA_integer_
path[tar.dup, 1L] <- NA_integer_
path[cur.dup, 2L] <- NA_integer_
# Now create the character equivalents of the path matrix
tar.path <- target[path[, 1L]]
cur.path <- current[path[, 2L]]
# Mark the equalities in the path matrix by setting them negative
path[which(tar.path == cur.path), ] <- -path[which(tar.path == cur.path), ]
# Remaining numbers are the mismatches which we will arbitrarily assign to
# each other; to do so we first split our data into groups of matches and
# mismatches and do the mapping there-in. We also get rid of non-matching
# entries.
matched <- ifelse(!is.na(path[, 1]) & path[, 1] < 0L, 1L, 0L)
splits <- cumsum(abs(diff(c(0, matched))))
chunks <- split.data.frame(path, splits)
res.tar <- res.cur <- vector("list", length(chunks))
mm.count <- 0L # for tracking matched mismatches
for(i in seq_along(chunks)) {
x <- chunks[[i]]
if((neg <- any(x < 0L, na.rm=TRUE)) && !all(x < 0L, na.rm=TRUE))
stop("Internal Error: match group error; contact maintainer") # nocov
if(neg) {
# Matches, so equal length and set to zero
res.tar[[i]] <- res.cur[[i]] <- integer(nrow(x))
} else {
# Mismatches
tar.mm <- Filter(Negate(is.na), x[, 1L])
cur.mm <- Filter(Negate(is.na), x[, 2L])
x.min.len <- min(length(tar.mm), length(cur.mm))
res.tar[[i]] <- res.cur[[i]] <- seq_len(x.min.len) + mm.count
mm.count <- x.min.len + mm.count
length(res.tar[[i]]) <- length(tar.mm)
length(res.cur[[i]]) <- length(cur.mm)
}
}
if(!length(res.tar)) res.tar <- integer()
if(!length(res.cur)) res.cur <- integer()
return(list(target=unlist(res.tar), current=unlist(res.cur)))
}
|