File: Jaro.java

package info (click to toggle)
libsecondstring-java 0.1~dfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 764 kB
  • sloc: java: 9,592; xml: 114; makefile: 6
file content (96 lines) | stat: -rw-r--r-- 2,889 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
package com.wcohen.ss;

import com.wcohen.ss.api.*;

/**
 * Jaro distance metric. From 'An Application of the Fellegi-Sunter
 * Model of Record Linkage to the 1990 U.S. Decennial Census' by
 * William E. Winkler and Yves Thibaudeau.
 */

public class Jaro extends AbstractStringDistance
{
	public Jaro() { }

	public String toString() { return "[Jaro]"; }

	public double score(StringWrapper s,StringWrapper t) 
	{
		String str1 = s.unwrap();
		String str2 = t.unwrap();
		int halflen = halfLengthOfShorter(str1,str2);
		String common1 = commonChars(str1, str2, halflen);
		String common2 = commonChars(str2, str1, halflen);
		if (common1.length()!=common2.length()) return 0;
		if (common1.length()==0 || common2.length()==0) return 0;
		int transpositions = transpositions(common1,common2);
		double dist =
			 ( common1.length()/((double)str1.length()) + 
				 common2.length()/((double)str2.length()) + 
				 (common1.length()-transpositions)/((double)common1.length()) ) / 3.0;
		return dist;
	}

	public String explainScore(StringWrapper s, StringWrapper t)	
	{
		String str1 = s.unwrap();
		String str2 = t.unwrap();
		int halflen = halfLengthOfShorter(str1,str2);
		String common1 = commonChars(str1, str2, halflen);
		String common2 = commonChars(str2, str1, halflen);
		// count transpositions
		if (common1.length()!=common2.length()) 
			return "common1!=common2: '"+common1+"' != '"+common2+"'\nscore: "+score(s,t)+"\n";
		if (common1.length()==0 || common2.length()==0) 
			return "|commoni|=0: common1='"+common1+"' common2='"+common2+"'\nscore: "+score(s,t)+"\n";
		int transpositions = transpositions(common1,common2);
		String explanation  =
			"common1: '"+common1+"'\n" 
			+ "common2: '"+common2+"'\n" 
			+ "transpositions: "+transpositions+"\n";
		return explanation + "score: " + score(s,t)+"\n";
	}

	private int halfLengthOfShorter(String str1,String str2)
	{
		return (str1.length() > str2.length()) ? str2.length()/2 + 1 : str1.length()/2 + 1;
	}

	private String commonChars(String s,String t,int halflen) 
	{
		StringBuffer common = new StringBuffer(); 
		StringBuffer copy = new StringBuffer(t);
		for (int i=0; i<s.length(); i++) {
			char ch = s.charAt(i);
			boolean foundIt = false;
			for (int j=Math.max(0,i-halflen); !foundIt && j<Math.min(i+halflen+1,t.length()); j++) {
				if (copy.charAt(j)==ch) {
					foundIt = true;
					common.append(ch);
					copy.setCharAt(j,'*');
				}
			}
		}
		return common.toString();
	}

	private int transpositions(String common1,String common2)
	{
		int transpositions = 0;
		for (int i=0; i<common1.length(); i++) {
			if (common1.charAt(i)!=common2.charAt(i)) 
				transpositions++;
		}
		transpositions /= 2;
		return transpositions;
	}

	public StringWrapper prepare(String s) 
	{
		return new BasicStringWrapper(s.toLowerCase());	
	}

	static public void main(String[] argv) {
		doMain(new Jaro(), argv);
	}
}