# Natural Language Toolkit: Windowdiff
#
# Copyright (C) 2001-2009 NLTK Project
# Author: Edward Loper <edloper@gradient.cis.upenn.edu>
# Steven Bird <sb@csse.unimelb.edu.au>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT
##########################################################################
# Windowdiff
# Pevzner, L., and Hearst, M., A Critique and Improvement of
# an Evaluation Metric for Text Segmentation,
# Computational Linguistics, 28 (1), March 2002, pp. 19-36
##########################################################################
def windowdiff(seg1, seg2, k, boundary="1"):
    """
    Compute the windowdiff score for a pair of segmentations.

    A segmentation is any sequence over a vocabulary of two items
    (e.g. "0", "1"), where the specified boundary value is used to mark
    the edge of a segmentation.

    A window spanning C{k + 1} consecutive items is moved over both
    segmentations in lockstep.  For each of the C{len(seg1) - k} window
    positions, the absolute difference between the number of boundaries
    seen in C{seg1} and in C{seg2} is added to the score.  The result is
    the raw sum; it is not normalized by the number of windows.

        >>> s1 = "00000010000000001000000"
        >>> s2 = "00000001000000010000000"
        >>> s3 = "00010000000000000001000"
        >>> windowdiff(s1, s1, 3)
        0
        >>> windowdiff(s1, s2, 3)
        4
        >>> windowdiff(s2, s3, 3)
        16

    @param seg1: a segmentation
    @type seg1: C{string} or C{list}
    @param seg2: a segmentation
    @type seg2: C{string} or C{list}
    @param k: window width
    @type k: C{int}
    @param boundary: boundary value
    @type boundary: C{string} or C{int} or C{bool}
    @rtype: C{int}
    @raise ValueError: if the two segmentations differ in length
    """
    if len(seg1) != len(seg2):
        # Call form of ``raise``: valid on both Python 2 and Python 3,
        # unlike the Python-2-only ``raise ValueError, "..."`` spelling.
        raise ValueError("Segmentations have unequal length")

    # When k >= len(seg1) the range is empty and the score is 0.
    return sum(
        abs(seg1[start:start + k + 1].count(boundary)
            - seg2[start:start + k + 1].count(boundary))
        for start in range(len(seg1) - k)
    )
def demo():
    """
    Print three sample segmentations, followed by the windowdiff score
    (window width 3) for the pairs (s1, s1), (s1, s2) and (s2, s3).
    """
    s1 = "00000010000000001000000"
    s2 = "00000001000000010000000"
    s3 = "00010000000000000001000"

    # Each line is passed to print() as one pre-formatted string, so the
    # call behaves identically as a Python 2 print statement and as the
    # Python 3 print function.  The "%s %s" template reproduces the single
    # space the Python 2 print statement put between comma-separated items.
    print("%s %s" % ("s1:", s1))
    print("%s %s" % ("s2:", s2))
    print("%s %s" % ("s3:", s3))

    print("%s %s" % ("windowdiff(s1, s1, 3) = ", windowdiff(s1, s1, 3)))
    print("%s %s" % ("windowdiff(s1, s2, 3) = ", windowdiff(s1, s2, 3)))
    print("%s %s" % ("windowdiff(s2, s3, 3) = ", windowdiff(s2, s3, 3)))
|