1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188
|
'''
====================================================================
Copyright (c) 2003-2006 Barry A Scott. All rights reserved.
This software is licensed as described in the file LICENSE.txt,
which you should have received as part of this distribution.
====================================================================
wb_diff_difflib.py
'''
import sys
import re
import difflib
import wb_exceptions
import wb_read_file
# define what "junk" means
def IS_LINE_JUNK(line, pat=None):
return False
def IS_CHARACTER_JUNK(ch, ws=" \t"):
return False
class Difference:
'Difference'
def __init__( self, text_body ):
self.text_body = text_body
# meant for dumping lines
def dump( self, fn, x, lo, hi ):
for i in xrange(lo, hi):
fn( x[i] )
def plain_replace( self, a, alo, ahi, b, blo, bhi ):
assert alo < ahi and blo < bhi
# dump the shorter block first -- reduces the burden on short-term
# memory if the blocks are of very different sizes
if bhi - blo < ahi - alo:
self.dump(self.text_body.addInsertedLine, b, blo, bhi)
self.dump(self.text_body.addDeletedLine, a, alo, ahi)
else:
self.dump(self.text_body.addDeletedLine, a, alo, ahi)
self.dump(self.text_body.addInsertedLine, b, blo, bhi)
# When replacing one block of lines with another, this guy searches
# the blocks for *similar* lines; the best-matching pair (if any) is
# used as a synch point, and intraline difference marking is done on
# the similar pair. Lots of work, but often worth it.
def fancy_replace( self, a, alo, ahi, b, blo, bhi):
# don't synch up unless the lines have a similarity score of at
# least cutoff; best_ratio tracks the best score seen so far
best_ratio, cutoff = 0.51, 0.52
cruncher = difflib.SequenceMatcher(IS_CHARACTER_JUNK)
eqi, eqj = None, None # 1st indices of equal lines (if any)
# search for the pair that matches best without being identical
# (identical lines must be junk lines, & we don't want to synch up
# on junk -- unless we have to)
for j in xrange(blo, bhi):
bj = b[j]
cruncher.set_seq2(bj)
for i in xrange(alo, ahi):
ai = a[i]
if ai == bj:
if eqi is None:
eqi, eqj = i, j
continue
cruncher.set_seq1(ai)
# computing similarity is expensive, so use the quick
# upper bounds first -- have seen this speed up messy
# compares by a factor of 3.
# note that ratio() is only expensive to compute the first
# time it's called on a sequence pair; the expensive part
# of the computation is cached by cruncher
if( cruncher.real_quick_ratio() > best_ratio
and cruncher.quick_ratio() > best_ratio
and cruncher.ratio() > best_ratio ):
best_ratio, best_i, best_j = cruncher.ratio(), i, j
if best_ratio < cutoff:
# no non-identical "pretty close" pair
if eqi is None:
# no identical pair either -- treat it as a straight replace
self.plain_replace( a, alo, ahi, b, blo, bhi )
return
# no close pair, but an identical pair -- synch up on that
best_i, best_j, best_ratio = eqi, eqj, 1.0
else:
# there's a close pair, so forget the identical pair (if any)
eqi = None
# a[best_i] very similar to b[best_j]; eqi is None iff they're not
# identical
# pump out diffs from before the synch point
self.fancy_helper( a, alo, best_i, b, blo, best_j )
# do intraline marking on the synch pair
aelt, belt = a[ best_i ], b[ best_j ]
if eqi is None:
self.text_body.addChangedLineBegin()
cruncher.set_seqs( aelt, belt )
for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
la, lb = ai2 - ai1, bj2 - bj1
if tag == 'replace':
self.text_body.addChangedLineReplace( aelt[ai1:ai2], belt[bj1:bj2] )
elif tag == 'delete':
self.text_body.addChangedLineDelete( aelt[ai1:ai2] )
elif tag == 'insert':
self.text_body.addChangedLineInsert( belt[bj1:bj2] )
elif tag == 'equal':
self.text_body.addChangedLineEqual( belt[bj1:bj2] )
else:
raise ValueError, 'unknown tag ' + str(tag)
self.text_body.addChangedLineEnd()
else:
self.text_body.addNormalLine( aelt )
# pump out diffs from after the synch point
self.fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi)
def fancy_helper( self, a, alo, ahi, b, blo, bhi):
if alo < ahi:
if blo < bhi:
self.fancy_replace( a, alo, ahi, b, blo, bhi)
else:
self.dump(self.text_body.addDeletedLine, a, alo, ahi)
elif blo < bhi:
self.dump(self.text_body.addInsertedLine, b, blo, bhi)
def fail(self, msg):
out = sys.stderr.write
out(msg + "\n\n")
return 0
# filename can be a list of lines of the name of a file to open
def filecompare( self, filename_left, filename_right ):
if type(filename_left) == type([]):
lines_left = filename_left
else:
try:
lines_left = wb_read_file.readFileContentsAsUnicode( filename_left ).split('\n')
except IOError, detail:
print 'Error opening %s\n%s' % (filename_left, detail)
return 0
if type(filename_right) == type([]):
lines_right = filename_right
else:
try:
lines_right = wb_read_file.readFileContentsAsUnicode( filename_right ).split('\n')
except IOError, detail:
print 'Error opening %s\n%s' % (filename_right, detail)
return 0
lines_left = [eolRemoval( line ) for line in lines_left]
lines_right = [eolRemoval( line ) for line in lines_right]
matcher = difflib.SequenceMatcher( IS_LINE_JUNK, lines_left, lines_right )
for tag, left_lo, left_hi, right_lo, right_hi in matcher.get_opcodes():
if tag == 'replace':
self.fancy_replace( lines_left, left_lo, left_hi, lines_right, right_lo, right_hi )
elif tag == 'delete':
self.dump( self.text_body.addDeletedLine, lines_left, left_lo, left_hi )
elif tag == 'insert':
self.dump( self.text_body.addInsertedLine, lines_right, right_lo, right_hi )
elif tag == 'equal':
self.dump( self.text_body.addNormalLine, lines_left, left_lo, left_hi )
else:
raise ValueError, 'unknown tag ' + str( tag )
self.text_body.addEnd()
return 1
# need to strip any \n or \r thats on the end of the line
def eolRemoval( line ):
while line and line[-1] in ['\n','\r']:
line = line[:-1]
return line
|