1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222
|
;;;; "differ.scm" O(NP) Sequence Comparison Algorithm.
;;; Copyright (C) 2001 Aubrey Jaffer
;
;Permission to copy this software, to modify it, to redistribute it,
;to distribute modified versions, and to use it for any purpose is
;granted, subject to the following restrictions and understandings.
;
;1. Any copy made of this software must include this copyright notice
;in full.
;
;2. I have made no warrantee or representation that the operation of
;this software will be error-free, and I am under no obligation to
;provide any services, by way of maintenance, update, or otherwise.
;
;3. In conjunction with products arising from the use of this
;material, there shall be no use of my name in any advertising,
;promotional, or sales literature without prior written consent in
;each case.
;;@noindent
;;This package implements the algorithm:
;;
;;@ifinfo
;;@example
;;S. Wu, E. Myers, U. Manber, and W. Miller,
;; "An O(NP) Sequence Comparison Algorithm,"
;; Information Processing Letters 35, 6 (1990), 317-323.
;; @url{http://www.cs.arizona.edu/people/gene/vita.html}
;;@end example
;;@end ifinfo
;;@ifset html
;;S. Wu, <A HREF="http://www.cs.arizona.edu/people/gene/vita.html">
;;E. Myers,</A> U. Manber, and W. Miller,
;;<A HREF="http://www.cs.arizona.edu/people/gene/PAPERS/np_diff.ps">
;;"An O(NP) Sequence Comparison Algorithm,"</A>
;;Information Processing Letters 35, 6 (1990), 317-323.
;;@end ifset
;;
;;@noindent
;;If the items being sequenced are text lines, then the computed
;;edit-list is equivalent to the output of the @dfn{diff} utility
;;program. If the items being sequenced are words, then it is like the
;;lesser known @dfn{spiff} program.
;;
;;@noindent
;;The values returned by @code{diff:edit-length} can be used to gauge
;;the degree of match between two sequences.
;;
;;@noindent
;;I believe that this algorithm is currently the fastest for these
;;tasks, but genome sequencing applications fuel extensive research in
;;this area.
(require 'array)
(define (fp:compare fp Delta snake len2)
(let loop ((p 0))
(do ((k (- p) (+ 1 k)))
((> k (+ -1 Delta)))
(array-set! fp (snake k (max (+ 1 (array-ref fp (+ -1 k)))
(array-ref fp (+ 1 k))))
k))
(do ((k (+ Delta p) (+ -1 k)))
((< k (+ 1 Delta)))
(array-set! fp (snake k (max (+ 1 (array-ref fp (+ -1 k)))
(array-ref fp (+ 1 k))))
k))
(array-set! fp (snake Delta (max (+ 1 (array-ref fp (+ -1 Delta)))
(array-ref fp (+ 1 Delta))))
Delta)
(if (= (array-ref fp Delta) len2)
(+ Delta (* 2 p))
(loop (+ 1 p)))))
(define (fp->edits fp Delta)
(let loop ((idx (+ -1 Delta))
(ddx (+ 1 Delta))
(edits '()))
(define ivl (array-ref fp idx))
(define dvl (array-ref fp ddx))
(if (not (= -1 dvl)) (set! dvl (- dvl ddx)))
;;(print idx '-> ivl ddx '-> dvl)
(cond ((= ivl -1) edits)
((= dvl -1) (loop (+ -1 idx) ddx (cons (list ivl 'insert) edits)))
((> dvl ivl) (loop idx (+ 1 ddx) (cons (list dvl 'delete) edits)))
(else (loop (+ -1 idx) ddx (cons (list ivl 'insert) edits))))))
(define (fp->lcs fp Delta array1 len)
(define len1 (car (array-dimensions array1)))
(define lcs (make-array #f len))
(define (subarray-copy! array1 start1 end1 array2 start2)
(do ((i start1 (+ i 1))
(j start2 (+ j 1))
(l (- end1 start1) (- l 1)))
((<= l 0))
(array-set! array2 (array-ref array1 i) j)))
(let loop ((ddx (+ 1 Delta))
(pos len1)
(dpos len))
(let* ((dvl (array-ref fp ddx))
(sublen (- pos (- dvl ddx -1))))
(cond ((= dvl -1)
(subarray-copy! array1 0 pos lcs 0)
lcs)
(else
(subarray-copy! array1 (- dvl ddx -1) pos lcs (- dpos sublen))
(loop (+ 1 ddx) (- dvl ddx) (- dpos sublen)))))))
;;@args array1 array2 =?
;;@args array1 array2
;;@1 and @2 are one-dimensional arrays. The procedure @3 is used
;;to compare sequence tokens for equality. @3 defaults to @code{eqv?}.
;;@0 returns a one-dimensional array of length @code{(quotient (- (+
;;len1 len2) (fp:edit-length @1 @2)) 2)} holding the longest sequence
;;common to both @var{array}s.
(define (diff:longest-common-subsequence array1 array2 . =?)
(define len1 (car (array-dimensions array1)))
(define len2 (car (array-dimensions array2)))
(define (snake k y)
(let snloop ((x (- y k))
(y y))
(if (and (< x len1) (< y len2) (=? (array-ref array1 x)
(array-ref array2 y)))
(snloop (+ 1 x) (+ 1 y))
y)))
(set! =? (if (null? =?) eqv? (car =?)))
(if (> len1 len2)
(diff:longest-common-subsequence array2 array1)
(let ((Delta (- len2 len1))
(fp (make-array -1 (list (- (+ 1 len1)) (+ 1 len2)))))
(fp->lcs fp Delta array1
(quotient (- (+ len1 len2) (fp:compare fp Delta snake len2))
2)))))
;;@args array1 array2 =?
;;@args array1 array2
;;@1 and @2 are one-dimensional arrays. The procedure @3 is used
;;to compare sequence tokens for equality. @3 defaults to @code{eqv?}.
;;@0 returns a list of length @code{(fp:edit-length @1 @2)} composed of
;;a shortest sequence of edits transformaing @1 to @2.
;;
;;Each edit is a list of an integer and a symbol:
;;@table @asis
;;@item (@var{j} insert)
;;Inserts @code{(array-ref @1 @var{j})} into the sequence.
;;@item (@var{k} delete)
;;Deletes @code{(array-ref @2 @var{k})} from the sequence.
;;@end table
(define (diff:edits array1 array2 . =?)
(define len1 (car (array-dimensions array1)))
(define len2 (car (array-dimensions array2)))
(define (snake k y)
(let snloop ((x (- y k))
(y y))
(if (and (< x len1) (< y len2) (=? (array-ref array1 x)
(array-ref array2 y)))
(snloop (+ 1 x) (+ 1 y)) y)))
(set! =? (if (null? =?) eqv? (car =?)))
(if (> len1 len2)
(diff:reverse-edits (diff:edits array2 array1))
(let ((Delta (- len2 len1))
(fp (make-array -1 (list (- (+ 1 len1)) (+ 1 len2)))))
(fp:compare fp Delta snake len2)
;;(do ((idx (- -1 len1) (+ 1 idx))) ((>= idx (+ 1 len2)) (newline)) (printf "%3d" idx))
;;(do ((idx (- -1 len1) (+ 1 idx))) ((>= idx (+ 1 len2)) (newline)) (printf "%3d" (array-ref fp idx)))
(fp->edits fp Delta))))
(define (diff:reverse-edits edits)
(map (lambda (edit)
(list (car edit)
(case (cadr edit)
((delete) 'insert)
((insert) 'delete))))
edits))
;;@args array1 array2 =?
;;@args array1 array2
;;@1 and @2 are one-dimensional arrays. The procedure @3 is used
;;to compare sequence tokens for equality. @3 defaults to @code{eqv?}.
;;@0 returns the length of the shortest sequence of edits transformaing
;;@1 to @2.
(define (diff:edit-length array1 array2 . =?)
(define len1 (car (array-dimensions array1)))
(define len2 (car (array-dimensions array2)))
(define (snake k y)
(let snloop ((x (- y k))
(y y))
(if (and (< x len1) (< y len2) (=? (array-ref array1 x)
(array-ref array2 y)))
(snloop (+ 1 x) (+ 1 y))
y)))
(set! =? (if (null? =?) eqv? (car =?)))
(if (> len1 len2)
(diff:edit-length array2 array1)
(let ((Delta (- len2 len1))
(fp (make-array -1 (list (- (+ 1 len1)) (+ 1 len2)))))
(fp:compare fp Delta snake len2))))
;;@example
;;(diff:longest-common-subsequence '#(f g h i e j c k l m)
;; '#(f g e h i j k p q r l m))
;; @result{} #(f g h i j k l m)
;;
;;(diff:edit-length '#(f g h i e j c k l m)
;; '#(f g e h i j k p q r l m))
;;@result{} 6
;;
;;(pretty-print (diff:edits '#(f g h i e j c k l m)
;; '#(f g e h i j k p q r l m)))
;;@print{}
;;((3 insert) ; e
;; (4 delete) ; c
;; (6 delete) ; h
;; (7 insert) ; p
;; (8 insert) ; q
;; (9 insert)) ; r
;;@end example
;; 12 - 10 = 2
;; -11-10 -9 -8 -7 -6 -5 -4 -3 -2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12
;; -1 -1 -1 -1 -1 -1 -1 -1 -1 3 7 8 9 12 9 8 -1 -1 -1 -1 -1 -1 -1 -1
;; edit-distance = 6
|