File: WinklerRescorer.java

package info (click to toggle)
libsecondstring-java 0.1~dfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 764 kB
  • sloc: java: 9,592; xml: 114; makefile: 6
file content (57 lines) | stat: -rw-r--r-- 2,094 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
package com.wcohen.ss;

import com.wcohen.ss.api.*;

/**
 * Winkler's reweighting scheme for distance metrics.  In the
 * literature, this was applied to the Jaro metric ('An Application of
 * the Fellegi-Sunter Model of Record Linkage to the 1990
 * U.S. Decennial Census' by William E. Winkler and Yves Thibaudeau.)
 * <p>
 * The rescored value is
 * <code>dist + prefLength * 0.1 * (1 - dist)</code>, where
 * <code>dist</code> is the score of the wrapped distance and
 * <code>prefLength</code> is the length of the common prefix of the
 * two strings, capped at 4 characters.
 */

public class WinklerRescorer extends AbstractStringDistance
{
	/** Longest common prefix (in chars) that contributes to the boost. */
	private static final int MAX_PREFIX_LENGTH = 4;

	/** Weight applied per shared prefix character. */
	private static final double PREFIX_SCALE = 0.1;

	/** The distance whose scores are being rescored. */
	private final StringDistance innerDistance;

	/** Rescore the innerDistance's scores, to account for the
	 * subjectively greater importance of the first few characters.
	 * <p>
	 * Note: the innerDistance must produce scores between 0 and 1.
	 *
	 * @param innerDistance the distance whose scores will be rescored
	 */
	public WinklerRescorer(StringDistance innerDistance) { this.innerDistance = innerDistance; }

	/** @return a description of the form <code>[WinklerRescorer:inner]</code> */
	public String toString() { return "[WinklerRescorer:"+innerDistance+"]"; }

	/**
	 * Compute the inner distance's score and boost it according to the
	 * length of the prefix shared by the two strings.
	 *
	 * @param s first prepared string
	 * @param t second prepared string
	 * @return <code>dist + prefLength * 0.1 * (1 - dist)</code>
	 * @throws IllegalArgumentException if the inner score is below 0 or above 1
	 */
	public double score(StringWrapper s,StringWrapper t)
	{
		double dist = innerDistance.score(s,t);
		if (dist<0 || dist>1) {
			throw new IllegalArgumentException("innerDistance should produce scores between 0 and 1");
		}
		int prefLength = commonPrefixLength(MAX_PREFIX_LENGTH,s.unwrap(),t.unwrap());
		// Move the score toward 1 by a fraction proportional to the shared prefix.
		return dist + prefLength*PREFIX_SCALE * (1 - dist);
	}

	/**
	 * Explain how the score was computed: the inner distance's own
	 * explanation, followed by the prefix length used and the
	 * corrected score.
	 *
	 * @param s first prepared string
	 * @param t second prepared string
	 * @return a multi-line, human-readable explanation
	 * @throws IllegalArgumentException if the inner score is below 0 or above 1
	 */
	public String explainScore(StringWrapper s, StringWrapper t)
	{
		int prefLength = commonPrefixLength(MAX_PREFIX_LENGTH,s.unwrap(),t.unwrap());
		StringBuilder buf = new StringBuilder();
		buf.append("original score using ").append(innerDistance).append(":\n");
		buf.append(innerDistance.explainScore(s,t)).append("\n");
		// The prefix length is capped from above, i.e. it is a min, not a max.
		buf.append("prefLength = min(").append(MAX_PREFIX_LENGTH)
			.append(",commonPrefixLength) = ").append(prefLength).append("\n");
		// Delegate to score() so the reported value and its range check
		// can never drift from what callers actually receive.
		buf.append("Corrected score = dist + ").append(prefLength)
			.append("/10 * (1-dist) = ").append(score(s,t)).append("\n");
		return buf.toString();
	}

	/**
	 * Length of the common prefix of two strings, capped at maxLength.
	 * Characters are compared with <code>charAt</code>, one char at a time.
	 *
	 * @param maxLength upper bound on the returned length
	 * @param common1 first string
	 * @param common2 second string
	 * @return number of leading chars the strings share, at most maxLength
	 */
	private static int commonPrefixLength(int maxLength,String common1,String common2)
	{
		int n = Math.min(maxLength, Math.min(common1.length(), common2.length()) );
		for (int i=0; i<n; i++) {
			if (common1.charAt(i)!=common2.charAt(i)) return i;
		}
		return n; // first n characters are the same
	}

	/**
	 * Prepare a string for scoring by delegating to the inner distance.
	 *
	 * @param s the raw string
	 * @return the wrapper produced by the inner distance's prepare method
	 */
	public StringWrapper prepare(String s) { return innerDistance.prepare(s); }

}