File: wb_diff_difflib.py

package info (click to toggle)
svn-workbench 1.5.0-1
  • links: PTS
  • area: main
  • in suites: etch, etch-m68k
  • size: 2,400 kB
  • ctags: 1,585
  • sloc: python: 12,163; sh: 74; makefile: 46; ansic: 9
file content (188 lines) | stat: -rw-r--r-- 7,343 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
'''
 ====================================================================
 Copyright (c) 2003-2006 Barry A Scott.  All rights reserved.

 This software is licensed as described in the file LICENSE.txt,
 which you should have received as part of this distribution.

 ====================================================================

    wb_diff_difflib.py

'''
import sys
import re
import difflib
import wb_exceptions
import wb_read_file

# define what "junk" means

def IS_LINE_JUNK(line, pat=None):
    return False

def IS_CHARACTER_JUNK(ch, ws=" \t"):
    return False

class Difference:
    'Difference'
    def __init__( self, text_body ):
        self.text_body = text_body

    # meant for dumping lines
    def dump( self, fn, x, lo, hi ):
        for i in xrange(lo, hi):
            fn( x[i] )

    def plain_replace( self, a, alo, ahi, b, blo, bhi ):
        assert alo < ahi and blo < bhi
        # dump the shorter block first -- reduces the burden on short-term
        # memory if the blocks are of very different sizes
        if bhi - blo < ahi - alo:
            self.dump(self.text_body.addInsertedLine, b, blo, bhi)
            self.dump(self.text_body.addDeletedLine, a, alo, ahi)
        else:
            self.dump(self.text_body.addDeletedLine, a, alo, ahi)
            self.dump(self.text_body.addInsertedLine, b, blo, bhi)

    # When replacing one block of lines with another, this guy searches
    # the blocks for *similar* lines; the best-matching pair (if any) is
    # used as a synch point, and intraline difference marking is done on
    # the similar pair.  Lots of work, but often worth it.

    def fancy_replace( self, a, alo, ahi, b, blo, bhi):

        # don't synch up unless the lines have a similarity score of at
        # least cutoff; best_ratio tracks the best score seen so far
        best_ratio, cutoff = 0.51, 0.52
        cruncher = difflib.SequenceMatcher(IS_CHARACTER_JUNK)
        eqi, eqj = None, None   # 1st indices of equal lines (if any)

        # search for the pair that matches best without being identical
        # (identical lines must be junk lines, & we don't want to synch up
        # on junk -- unless we have to)
        for j in xrange(blo, bhi):
            bj = b[j]
            cruncher.set_seq2(bj)
            for i in xrange(alo, ahi):
                ai = a[i]
                if ai == bj:
                    if eqi is None:
                        eqi, eqj = i, j
                    continue
                cruncher.set_seq1(ai)
                # computing similarity is expensive, so use the quick
                # upper bounds first -- have seen this speed up messy
                # compares by a factor of 3.
                # note that ratio() is only expensive to compute the first
                # time it's called on a sequence pair; the expensive part
                # of the computation is cached by cruncher
                if( cruncher.real_quick_ratio() > best_ratio
                and cruncher.quick_ratio() > best_ratio
                and cruncher.ratio() > best_ratio ):
                    best_ratio, best_i, best_j = cruncher.ratio(), i, j

        if best_ratio < cutoff:
            # no non-identical "pretty close" pair
            if eqi is None:
                # no identical pair either -- treat it as a straight replace
                self.plain_replace( a, alo, ahi, b, blo, bhi )
                return
            # no close pair, but an identical pair -- synch up on that
            best_i, best_j, best_ratio = eqi, eqj, 1.0
        else:
            # there's a close pair, so forget the identical pair (if any)
            eqi = None

        # a[best_i] very similar to b[best_j]; eqi is None iff they're not
        # identical

        # pump out diffs from before the synch point
        self.fancy_helper( a, alo, best_i, b, blo, best_j )

        # do intraline marking on the synch pair
        aelt, belt = a[ best_i ], b[ best_j ]
        if eqi is None:
            self.text_body.addChangedLineBegin()

            cruncher.set_seqs( aelt, belt )
            for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
                la, lb = ai2 - ai1, bj2 - bj1
                if tag == 'replace':
                    self.text_body.addChangedLineReplace( aelt[ai1:ai2], belt[bj1:bj2] )
                elif tag == 'delete':
                    self.text_body.addChangedLineDelete( aelt[ai1:ai2] )
                elif tag == 'insert':
                    self.text_body.addChangedLineInsert( belt[bj1:bj2] )
                elif tag == 'equal':
                    self.text_body.addChangedLineEqual( belt[bj1:bj2] )
                else:
                    raise ValueError, 'unknown tag ' + str(tag)

            self.text_body.addChangedLineEnd()
        else:
            self.text_body.addNormalLine( aelt )

        # pump out diffs from after the synch point
        self.fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi)

    def fancy_helper( self, a, alo, ahi, b, blo, bhi):
        if alo < ahi:
            if blo < bhi:
                self.fancy_replace( a, alo, ahi, b, blo, bhi)
            else:
                self.dump(self.text_body.addDeletedLine, a, alo, ahi)
        elif blo < bhi:
            self.dump(self.text_body.addInsertedLine, b, blo, bhi)

    def fail(self, msg):
        out = sys.stderr.write
        out(msg + "\n\n")
        return 0

    # filename can be a list of lines of the name of a file to open
    def filecompare( self, filename_left, filename_right ):
        if type(filename_left) == type([]):
            lines_left = filename_left
        else:
            try:
                lines_left = wb_read_file.readFileContentsAsUnicode( filename_left ).split('\n')
            except IOError, detail:
                print 'Error opening %s\n%s' % (filename_left, detail)
                return 0

        if type(filename_right) == type([]):
            lines_right = filename_right
        else:
            try:
                lines_right = wb_read_file.readFileContentsAsUnicode( filename_right ).split('\n')
            except IOError, detail:
                print 'Error opening %s\n%s' % (filename_right, detail)
                return 0

        lines_left = [eolRemoval( line ) for line in lines_left]
        lines_right = [eolRemoval( line ) for line in lines_right]



        matcher = difflib.SequenceMatcher( IS_LINE_JUNK, lines_left, lines_right )
        for tag, left_lo, left_hi, right_lo, right_hi in matcher.get_opcodes():
            if tag == 'replace':
                self.fancy_replace( lines_left, left_lo, left_hi, lines_right, right_lo, right_hi )
            elif tag == 'delete':
                self.dump( self.text_body.addDeletedLine, lines_left, left_lo, left_hi )
            elif tag == 'insert':
                self.dump( self.text_body.addInsertedLine, lines_right, right_lo, right_hi )
            elif tag == 'equal':
                self.dump( self.text_body.addNormalLine, lines_left, left_lo, left_hi )
            else:
                raise ValueError, 'unknown tag ' + str( tag )

        self.text_body.addEnd()
        return 1

# need to strip any \n or \r thats on the end of the line
def eolRemoval( line ):
    while line and line[-1] in ['\n','\r']:
            line = line[:-1]
    return line