1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339
|
#
#
# Nim's Runtime Library
# (c) Copyright 2018 Nim contributors
#
# See the file "copying.txt", included in this
# distribution, for details about the copyright.
#
## This module implements an algorithm to compute the
## `diff`:idx: between two sequences of lines.
##
## - To learn more see `Diff on Wikipedia. <http://wikipedia.org/wiki/Diff>`_
runnableExamples:
assert diffInt(
[0, 1, 2, 3, 4, 5, 6, 7, 8],
[-1, 1, 2, 3, 4, 5, 666, 7, 42]) ==
@[Item(startA: 0, startB: 0, deletedA: 1, insertedB: 1),
Item(startA: 6, startB: 6, deletedA: 1, insertedB: 1),
Item(startA: 8, startB: 8, deletedA: 1, insertedB: 1)]
runnableExamples:
# 2 samples of text (from "The Call of Cthulhu" by Lovecraft)
let txt0 = """
abc
def ghi
jkl2"""
let txt1 = """
bacx
abc
def ghi
jkl"""
assert diffText(txt0, txt1) ==
@[Item(startA: 0, startB: 0, deletedA: 0, insertedB: 1),
Item(startA: 2, startB: 3, deletedA: 1, insertedB: 1)]
# code owner: Arne Döring
#
# This is based on C# code written by Matthias Hertel, http://www.mathertel.de
#
# This Class implements the Difference Algorithm published in
# "An O(ND) Difference Algorithm and its Variations" by Eugene Myers
# Algorithmica Vol. 1 No. 2, 1986, p 251.
import std/[tables, strutils]
when defined(nimPreviewSlimSystem):
import std/assertions
type
Item* = object ## An Item in the list of differences.
startA*: int ## Start Line number in Data A.
startB*: int ## Start Line number in Data B.
deletedA*: int ## Number of changes in Data A.
insertedB*: int ## Number of changes in Data B.
DiffData = object ## Data on one input file being compared.
data: seq[int] ## Buffer of numbers that will be compared.
modified: seq[bool] ## Array of booleans that flag for modified
## data. This is the result of the diff.
## This means deletedA in the first Data or
## inserted in the second Data.
Smsrd = object
x, y: int
# template to avoid a seq copy. Required until `sink` parameters are ready.
template newDiffData(initData: seq[int]; L: int): DiffData =
DiffData(
data: initData,
modified: newSeq[bool](L + 2)
)
proc len(d: DiffData): int {.inline.} = d.data.len
proc diffCodes(aText: string; h: var Table[string, int]): DiffData =
## This function converts all textlines of the text into unique numbers for every unique textline
## so further work can work only with simple numbers.
## `aText` the input text
## `h` This extern initialized hashtable is used for storing all ever used textlines.
## `trimSpace` ignore leading and trailing space characters
## Returns a array of integers.
var lastUsedCode = h.len
result.data = newSeq[int]()
for s in aText.splitLines:
if h.contains s:
result.data.add h[s]
else:
inc lastUsedCode
h[s] = lastUsedCode
result.data.add lastUsedCode
result.modified = newSeq[bool](result.data.len + 2)
proc optimize(data: var DiffData) =
## If a sequence of modified lines starts with a line that contains the same content
## as the line that appends the changes, the difference sequence is modified so that the
## appended line and not the starting line is marked as modified.
## This leads to more readable diff sequences when comparing text files.
var startPos = 0
while startPos < data.len:
while startPos < data.len and not data.modified[startPos]:
inc startPos
var endPos = startPos
while endPos < data.len and data.modified[endPos]:
inc endPos
if endPos < data.len and data.data[startPos] == data.data[endPos]:
data.modified[startPos] = false
data.modified[endPos] = true
else:
startPos = endPos
proc sms(dataA: var DiffData; lowerA, upperA: int; dataB: DiffData; lowerB, upperB: int;
downVector, upVector: var openArray[int]): Smsrd =
## This is the algorithm to find the Shortest Middle Snake (sms).
## `dataA` sequence A
## `lowerA` lower bound of the actual range in dataA
## `upperA` upper bound of the actual range in dataA (exclusive)
## `dataB` sequence B
## `lowerB` lower bound of the actual range in dataB
## `upperB` upper bound of the actual range in dataB (exclusive)
## `downVector` a vector for the (0,0) to (x,y) search. Passed as a parameter for speed reasons.
## `upVector` a vector for the (u,v) to (N,M) search. Passed as a parameter for speed reasons.
## Returns a MiddleSnakeData record containing x,y and u,v.
let max = dataA.len + dataB.len + 1
let downK = lowerA - lowerB # the k-line to start the forward search
let upK = upperA - upperB # the k-line to start the reverse search
let delta = (upperA - lowerA) - (upperB - lowerB)
let oddDelta = (delta and 1) != 0
# The vectors in the publication accepts negative indexes. the vectors implemented here are 0-based
# and are access using a specific offset: upOffset upVector and downOffset for downVector
let downOffset = max - downK
let upOffset = max - upK
let maxD = ((upperA - lowerA + upperB - lowerB) div 2) + 1
downVector[downOffset + downK + 1] = lowerA
upVector[upOffset + upK - 1] = upperA
for D in 0 .. maxD:
# Extend the forward path.
for k in countup(downK - D, downK + D, 2):
# find the only or better starting point
var x: int
if k == downK - D:
x = downVector[downOffset + k + 1] # down
else:
x = downVector[downOffset + k - 1] + 1 # a step to the right
if k < downK + D and downVector[downOffset + k + 1] >= x:
x = downVector[downOffset + k + 1] # down
var y = x - k
# find the end of the furthest reaching forward D-path in diagonal k.
while x < upperA and y < upperB and dataA.data[x] == dataB.data[y]:
inc x
inc y
downVector[downOffset + k] = x
# overlap ?
if oddDelta and upK - D < k and k < upK + D:
if upVector[upOffset + k] <= downVector[downOffset + k]:
return Smsrd(x: downVector[downOffset + k],
y: downVector[downOffset + k] - k)
# Extend the reverse path.
for k in countup(upK - D, upK + D, 2):
# find the only or better starting point
var x: int
if k == upK + D:
x = upVector[upOffset + k - 1] # up
else:
x = upVector[upOffset + k + 1] - 1 # left
if k > upK - D and upVector[upOffset + k - 1] < x:
x = upVector[upOffset + k - 1] # up
var y = x - k
while x > lowerA and y > lowerB and dataA.data[x - 1] == dataB.data[y - 1]:
dec x
dec y
upVector[upOffset + k] = x
# overlap ?
if not oddDelta and downK-D <= k and k <= downK+D:
if upVector[upOffset + k] <= downVector[downOffset + k]:
return Smsrd(x: downVector[downOffset + k],
y: downVector[downOffset + k] - k)
assert false, "the algorithm should never come here."
proc lcs(dataA: var DiffData; lowerA, upperA: int; dataB: var DiffData; lowerB, upperB: int;
downVector, upVector: var openArray[int]) =
## This is the divide-and-conquer implementation of the longes common-subsequence (lcs)
## algorithm.
## The published algorithm passes recursively parts of the A and B sequences.
## To avoid copying these arrays the lower and upper bounds are passed while the sequences stay constant.
## `dataA` sequence A
## `lowerA` lower bound of the actual range in dataA
## `upperA` upper bound of the actual range in dataA (exclusive)
## `dataB` sequence B
## `lowerB` lower bound of the actual range in dataB
## `upperB` upper bound of the actual range in dataB (exclusive)
## `downVector` a vector for the (0,0) to (x,y) search. Passed as a parameter for speed reasons.
## `upVector` a vector for the (u,v) to (N,M) search. Passed as a parameter for speed reasons.
# make mutable copy:
var lowerA = lowerA
var lowerB = lowerB
var upperA = upperA
var upperB = upperB
# Fast walkthrough equal lines at the start
while lowerA < upperA and lowerB < upperB and dataA.data[lowerA] == dataB.data[lowerB]:
inc lowerA
inc lowerB
# Fast walkthrough equal lines at the end
while lowerA < upperA and lowerB < upperB and dataA.data[upperA - 1] == dataB.data[upperB - 1]:
dec upperA
dec upperB
if lowerA == upperA:
# mark as inserted lines.
while lowerB < upperB:
dataB.modified[lowerB] = true
inc lowerB
elif lowerB == upperB:
# mark as deleted lines.
while lowerA < upperA:
dataA.modified[lowerA] = true
inc lowerA
else:
# Find the middle snake and length of an optimal path for A and B
let smsrd = sms(dataA, lowerA, upperA, dataB, lowerB, upperB, downVector, upVector)
# Debug.Write(2, "MiddleSnakeData", String.Format("{0},{1}", smsrd.x, smsrd.y))
# The path is from LowerX to (x,y) and (x,y) to UpperX
lcs(dataA, lowerA, smsrd.x, dataB, lowerB, smsrd.y, downVector, upVector)
lcs(dataA, smsrd.x, upperA, dataB, smsrd.y, upperB, downVector, upVector) # 2002.09.20: no need for 2 points
proc createDiffs(dataA, dataB: DiffData): seq[Item] =
## Scan the tables of which lines are inserted and deleted,
## producing an edit script in forward order.
var startA = 0
var startB = 0
var lineA = 0
var lineB = 0
while lineA < dataA.len or lineB < dataB.len:
if lineA < dataA.len and not dataA.modified[lineA] and
lineB < dataB.len and not dataB.modified[lineB]:
# equal lines
inc lineA
inc lineB
else:
# maybe deleted and/or inserted lines
startA = lineA
startB = lineB
while lineA < dataA.len and (lineB >= dataB.len or dataA.modified[lineA]):
inc lineA
while lineB < dataB.len and (lineA >= dataA.len or dataB.modified[lineB]):
inc lineB
if (startA < lineA) or (startB < lineB):
result.add Item(startA: startA,
startB: startB,
deletedA: lineA - startA,
insertedB: lineB - startB)
proc diffInt*(arrayA, arrayB: openArray[int]): seq[Item] =
## Find the difference in 2 arrays of integers.
##
## `arrayA` A-version of the numbers (usually the old one)
##
## `arrayB` B-version of the numbers (usually the new one)
##
## Returns a sequence of Items that describe the differences.
# The A-Version of the data (original data) to be compared.
var dataA = newDiffData(@arrayA, arrayA.len)
# The B-Version of the data (modified data) to be compared.
var dataB = newDiffData(@arrayB, arrayB.len)
let max = dataA.len + dataB.len + 1
# vector for the (0,0) to (x,y) search
var downVector = newSeq[int](2 * max + 2)
# vector for the (u,v) to (N,M) search
var upVector = newSeq[int](2 * max + 2)
lcs(dataA, 0, dataA.len, dataB, 0, dataB.len, downVector, upVector)
result = createDiffs(dataA, dataB)
proc diffText*(textA, textB: string): seq[Item] =
## Find the difference in 2 text documents, comparing by textlines.
##
## The algorithm itself is comparing 2 arrays of numbers so when comparing 2 text documents
## each line is converted into a (hash) number. This hash-value is computed by storing all
## textlines into a common hashtable so i can find duplicates in there, and generating a
## new number each time a new textline is inserted.
##
## `textA` A-version of the text (usually the old one)
##
## `textB` B-version of the text (usually the new one)
##
## Returns a seq of Items that describe the differences.
# See also `gitutils.diffStrings`.
# prepare the input-text and convert to comparable numbers.
var h = initTable[string, int]() # TextA.len + TextB.len <- probably wrong initial size
# The A-Version of the data (original data) to be compared.
var dataA = diffCodes(textA, h)
# The B-Version of the data (modified data) to be compared.
var dataB = diffCodes(textB, h)
h.clear # free up hashtable memory (maybe)
let max = dataA.len + dataB.len + 1
# vector for the (0,0) to (x,y) search
var downVector = newSeq[int](2 * max + 2)
# vector for the (u,v) to (N,M) search
var upVector = newSeq[int](2 * max + 2)
lcs(dataA, 0, dataA.len, dataB, 0, dataB.len, downVector, upVector)
optimize(dataA)
optimize(dataB)
result = createDiffs(dataA, dataB)
|