File: SoftTokenFelligiSunter.java

package info (click to toggle)
libsecondstring-java 0.1~dfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 764 kB
  • sloc: java: 9,592; xml: 114; makefile: 6
file content (187 lines) | stat: -rw-r--r-- 5,780 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
package com.wcohen.ss;

import java.util.*;
import com.wcohen.ss.api.*;
import com.wcohen.ss.tokens.*;
import com.wcohen.ss.expt.*;

/**
 * Highly simplified model of Fellegi-Sunter's method 1,
 * applied to tokens.  (The class name retains the historical
 * misspelling "Felligi" for backward compatibility.)
 */

public class SoftTokenFelligiSunter extends AbstractStatisticalTokenDistance
{
	// multiplier for the penalty charged for tokens of s with no exact
	// or approximate match in t
	private double mismatchFactor;

	// distance to use to compare tokens
	private StringDistance tokenDistance;
	// threshold beyond which tokens are considered a match
	private double tokenMatchThreshold;
	// default token distance
	private static final StringDistance DEFAULT_TOKEN_DISTANCE = new JaroWinkler();

	/**
	 * @param tokenizer used to split strings into tokens
	 * @param tokenDistance distance used to compare pairs of tokens
	 * @param tokenMatchThreshold token-pair scores at or above this count as a match
	 * @param mismatchFactor penalty multiplier applied to unmatched tokens
	 */
	public SoftTokenFelligiSunter(Tokenizer tokenizer,StringDistance tokenDistance,
																double tokenMatchThreshold,double mismatchFactor) 
	{
		super(tokenizer);
		this.tokenDistance = tokenDistance;
		this.tokenMatchThreshold = tokenMatchThreshold;
		this.mismatchFactor = mismatchFactor;
	}
	/** Defaults: SimpleTokenizer, JaroWinkler token distance,
	 * match threshold 0.90, mismatch factor 0.5. */
	public SoftTokenFelligiSunter() {
		this(SimpleTokenizer.DEFAULT_TOKENIZER, DEFAULT_TOKEN_DISTANCE, 0.90, 0.5 );
	}
	public void setMismatchFactor(double d) { mismatchFactor=d; }
	public void setMismatchFactor(Double d) { mismatchFactor=d.doubleValue(); }
	public void setTokenMatchThreshold(double d) { tokenMatchThreshold=d; }
	public void setTokenMatchThreshold(Double d) { tokenMatchThreshold=d.doubleValue(); }	

	/** Score s against t.  Each token of s that appears exactly in t adds
	 * the agreement weight -log(df/collectionSize); a token with only an
	 * approximate match (token distance &gt;= tokenMatchThreshold) adds
	 * the same weight computed from its neighborhood document frequency;
	 * a token with no match at all subtracts mismatchFactor times its
	 * agreement weight.
	 */
	public double score(StringWrapper s,StringWrapper t) 
	{
		// lazily build the token-neighbor index used by
		// neighborhoodDocumentFrequency; no-op after the first call
		computeTokenDistances();

		BagOfTokens sBag = (BagOfTokens)s;
		BagOfTokens tBag = (BagOfTokens)t;
		double sim = 0.0;
		for (Iterator i = sBag.tokenIterator(); i.hasNext(); ) {
			Token tok = (Token)i.next();
			double	df = getDocumentFrequency(tok);
			if (tBag.contains(tok)) {
				// exact match: standard agreement weight
				// NOTE(review): if getDocumentFrequency can return 0 here,
				// log(0) makes the weight infinite -- TODO confirm against
				// AbstractStatisticalTokenDistance
				double w = -Math.log( df/collectionSize );
				sim += w;
			} else {
				// find the closest approximate match for tok among t's tokens
				Token matchTok = null;
				double matchScore = tokenMatchThreshold;
				for (Iterator j=tBag.tokenIterator(); j.hasNext(); ) {
					Token tokJ = (Token)j.next();
					double distItoJ = tokenDistance.score( tok.getValue(), tokJ.getValue() );
					if (distItoJ>=matchScore) {
						matchTok = tokJ;
						matchScore = distItoJ;
					}
				}
				if (matchTok!=null) {
					// soft match: use the combined frequency of all tokens at
					// least matchScore-similar to tok
					df = neighborhoodDocumentFrequency(tok,  matchScore);
					double w = -Math.log( df/collectionSize );
					sim += w;
				} else {
					// no match: charge a scaled disagreement penalty
					double w = -Math.log( df/collectionSize );					
					sim -= w * mismatchFactor;
				}
			}
		}
		return sim;
	}
	
	/** Preprocess a string by finding tokens */ 
	public StringWrapper prepare(String s) 
	{
		return new BagOfTokens(s, tokenizer.tokenize(s));
	}
	
	/** Explain how the distance was computed.
	 * The output lists each token common to S and T together with its
	 * weight in T's bag, followed by the overall score.
	 */
	public String explainScore(StringWrapper s, StringWrapper t) 
	{
		BagOfTokens sBag = (BagOfTokens)s;
		BagOfTokens tBag = (BagOfTokens)t;
		StringBuffer buf = new StringBuffer("");
		PrintfFormat fmt = new PrintfFormat("%.3f");
		buf.append("Common tokens: ");
		for (Iterator i = sBag.tokenIterator(); i.hasNext(); ) {
			Token tok = (Token)i.next();
			if (tBag.contains(tok)) {
				buf.append(" "+tok.getValue()+": ");
				buf.append(fmt.sprintf(tBag.getWeight(tok)));
			}
		}
		buf.append("\nscore = "+score(s,t));
		return buf.toString(); 
	}
	public String toString() { return "[SoftTokenFelligiSunter]"; }
	
	//
	// compute pairwise distance between all tokens
	//
	private boolean tokenDistancesComputed = false;
	// maps a token value to a TreeSet of TokenNeighbor, ordered closest-first
	private Map neighborMap;

	/** Build neighborMap: for every pair of similar known tokens (as
	 * proposed by a blocker over the vocabulary), record the pair and
	 * their token distance if it reaches tokenMatchThreshold.
	 * Runs at most once per instance.
	 */
	private void computeTokenDistances() 
	{
		if (tokenDistancesComputed) return;
		// use blocker to compute pairwise distances between similar tokens
		neighborMap = new HashMap();
		MatchData tokenData = new MatchData();
		for (Iterator i=documentFrequency.keySet().iterator(); i.hasNext(); ) {
			Token tok = (Token)i.next();
			tokenData.addInstance("tokens",tok.getValue(),tok.getValue());
		}
		Blocker tokenBlocker = new ClusterNGramBlocker();
		tokenBlocker.block(tokenData);
		for (int i=0; i<tokenBlocker.size(); i++) {
			Blocker.Pair pair = tokenBlocker.getPair(i);
			String s = pair.getA().unwrap();
			String t = pair.getB().unwrap();
			double d = tokenDistance.score( s, t );
			if (d>=tokenMatchThreshold) {
				addNeighbor( s, t, d);
			}
		}
		// never do this again
		tokenDistancesComputed = true;
	}
	/** Record t as a neighbor of s at distance d. */
	private void addNeighbor(String s, String t, double d) {
		TreeSet set = (TreeSet)neighborMap.get(s);
		if (set==null) {
			set = new TreeSet();
			neighborMap.put(s, set);
		}
		set.add( new TokenNeighbor(t, d) );
	}

	/** Encodes a neighboring token, the document frequency of that
	 * neighboring token, and the distance to it. Goal is that an
	 * ordered set of these will let you quickly find the DF of
	 * all tokens closer than a threshold D.  */
	private class TokenNeighbor implements Comparable {
		public String tokVal;
		public int freq;
		public double score;
		public TokenNeighbor(String tokVal,double score) { 
			this.tokVal=tokVal; 
			this.score=score; 
			this.freq = getDocumentFrequency(tokenizer.intern(tokVal));
		}
		// sort by score, closest (highest-scoring) first; ties are broken
		// by token value so that DISTINCT tokens with equal scores are not
		// collapsed into one element by the TreeSet in addNeighbor --
		// returning 0 for them would silently drop neighbors and make
		// neighborhoodDocumentFrequency undercount
		public int compareTo(Object o) {
			TokenNeighbor other = (TokenNeighbor)o;
			if (other.score > score) return +1;
			else if (other.score < score) return -1;
			else return tokVal.compareTo(other.tokVal);
		}
		// equals consistent with compareTo, as sorted sets require
		public boolean equals(Object o) {
			if (!(o instanceof TokenNeighbor)) return false;
			return compareTo(o)==0;
		}
		public int hashCode() {
			// consistent with equals: equal neighbors share the same tokVal
			return tokVal.hashCode();
		}
	}

	/** Total document frequency of tok plus every recorded neighbor of
	 * tok whose token distance is at least d.  Falls back to tok's own
	 * frequency if tok has no recorded neighbors.
	 */
	private int neighborhoodDocumentFrequency(Token tok, double d) {
		int df = getDocumentFrequency(tok);
		String s = tok.getValue();
		TreeSet neighbors = (TreeSet)neighborMap.get(s);
		if (neighbors==null) return df;
		for (Iterator i=neighbors.iterator(); i.hasNext(); ) {
			TokenNeighbor neighbor = (TokenNeighbor)i.next();
			// neighbors are ordered closest-first, so stop at the first
			// one below the threshold
			if (neighbor.score<d) break;
			df += neighbor.freq;
		}
		return df;
	}

	/** Command-line driver; see AbstractStringDistance.doMain. */
	static public void main(String[] argv) {
		doMain(new SoftTokenFelligiSunter(), argv);
	}

}