1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187
|
package com.wcohen.ss;
import java.util.*;
import com.wcohen.ss.api.*;
import com.wcohen.ss.tokens.*;
import com.wcohen.ss.expt.*;
/**
* Highly simplified model of Felligi-Sunter's method 1,
* applied to tokens.
*/
public class SoftTokenFelligiSunter extends AbstractStatisticalTokenDistance
{
private double mismatchFactor;
// distance to use to compare tokens
private StringDistance tokenDistance;
// threshold beyond which tokens are considered a match
private double tokenMatchThreshold;
// default token distance
private static final StringDistance DEFAULT_TOKEN_DISTANCE = new JaroWinkler();
public SoftTokenFelligiSunter(Tokenizer tokenizer,StringDistance tokenDistance,
double tokenMatchThreshold,double mismatchFactor)
{
super(tokenizer);
this.tokenDistance = tokenDistance;
this.tokenMatchThreshold = tokenMatchThreshold;
this.mismatchFactor = mismatchFactor;
}
public SoftTokenFelligiSunter() {
this(SimpleTokenizer.DEFAULT_TOKENIZER, DEFAULT_TOKEN_DISTANCE, 0.90, 0.5 );
}
public void setMismatchFactor(double d) { mismatchFactor=d; }
public void setMismatchFactor(Double d) { mismatchFactor=d.doubleValue(); }
public void setTokenMatchThreshold(double d) { tokenMatchThreshold=d; }
public void setTokenMatchThreshold(Double d) { tokenMatchThreshold=d.doubleValue(); }
public double score(StringWrapper s,StringWrapper t)
{
computeTokenDistances();
BagOfTokens sBag = (BagOfTokens)s;
BagOfTokens tBag = (BagOfTokens)t;
double sim = 0.0;
for (Iterator i = sBag.tokenIterator(); i.hasNext(); ) {
Token tok = (Token)i.next();
double df = getDocumentFrequency(tok);
if (tBag.contains(tok)) {
double w = -Math.log( df/collectionSize );
sim += w;
} else {
Token matchTok = null;
double matchScore = tokenMatchThreshold;
for (Iterator j=tBag.tokenIterator(); j.hasNext(); ) {
Token tokJ = (Token)j.next();
double distItoJ = tokenDistance.score( tok.getValue(), tokJ.getValue() );
if (distItoJ>=matchScore) {
matchTok = tokJ;
matchScore = distItoJ;
}
}
if (matchTok!=null) {
df = neighborhoodDocumentFrequency(tok, matchScore);
double w = -Math.log( df/collectionSize );
sim += w;
} else {
double w = -Math.log( df/collectionSize );
sim -= w * mismatchFactor;
}
}
}
return sim;
}
/** Preprocess a string by finding tokens */
public StringWrapper prepare(String s)
{
return new BagOfTokens(s, tokenizer.tokenize(s));
}
/** Explain how the distance was computed.
* In the output, the tokens in S and T are listed, and the
* common tokens are marked with an asterisk.
*/
public String explainScore(StringWrapper s, StringWrapper t)
{
BagOfTokens sBag = (BagOfTokens)s;
BagOfTokens tBag = (BagOfTokens)t;
StringBuffer buf = new StringBuffer("");
PrintfFormat fmt = new PrintfFormat("%.3f");
buf.append("Common tokens: ");
for (Iterator i = sBag.tokenIterator(); i.hasNext(); ) {
Token tok = (Token)i.next();
if (tBag.contains(tok)) {
buf.append(" "+tok.getValue()+": ");
buf.append(fmt.sprintf(tBag.getWeight(tok)));
}
}
buf.append("\nscore = "+score(s,t));
return buf.toString();
}
public String toString() { return "[SoftTokenFelligiSunter]"; }
//
// compute pairwise distance between all tokens
//
private boolean tokenDistancesComputed = false;
private Map neighborMap;
private void computeTokenDistances()
{
if (tokenDistancesComputed) return;
// use blocker to compute pairwise distances between similar tokens
neighborMap = new HashMap();
MatchData tokenData = new MatchData();
for (Iterator i=documentFrequency.keySet().iterator(); i.hasNext(); ) {
Token tok = (Token)i.next();
tokenData.addInstance("tokens",tok.getValue(),tok.getValue());
}
Blocker tokenBlocker = new ClusterNGramBlocker();
tokenBlocker.block(tokenData);
for (int i=0; i<tokenBlocker.size(); i++) {
Blocker.Pair pair = tokenBlocker.getPair(i);
String s = pair.getA().unwrap();
String t = pair.getB().unwrap();
double d = tokenDistance.score( s, t );
if (d>=tokenMatchThreshold) {
addNeighbor( s, t, d);
}
}
// never do this again
tokenDistancesComputed = true;
}
private void addNeighbor(String s, String t, double d) {
TreeSet set = (TreeSet)neighborMap.get(s);
if (set==null) {
set = new TreeSet();
neighborMap.put(s, set);
}
set.add( new TokenNeighbor(t, d) );
}
/** Encodes a neighboring token, the document frequency of that
* neighboring token, and the distance to it. Goal is that an
* ordered set of these will let you quickly find the DF of
* all tokens closer than a threshold D. */
private class TokenNeighbor implements Comparable {
public String tokVal;
public int freq;
public double score;
public TokenNeighbor(String tokVal,double score) {
this.tokVal=tokVal;
this.score=score;
this.freq = getDocumentFrequency(tokenizer.intern(tokVal));
}
// sort by score, closest first
public int compareTo(Object o) {
TokenNeighbor other = (TokenNeighbor)o;
if (other.score > score) return +1;
else if (other.score < score) return -1;
else return 0;
}
public int hashCode() {
return tokVal.hashCode();
}
}
private int neighborhoodDocumentFrequency(Token tok, double d) {
int df = getDocumentFrequency(tok);
String s = tok.getValue();
TreeSet neighbors = (TreeSet)neighborMap.get(s);
if (neighbors==null) return df;
for (Iterator i=neighbors.iterator(); i.hasNext(); ) {
TokenNeighbor neighbor = (TokenNeighbor)i.next();
if (neighbor.score<d) break;
df += neighbor.freq;
}
return df;
}
static public void main(String[] argv) {
doMain(new SoftTokenFelligiSunter(), argv);
}
}
|