1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118
|
package com.wcohen.ss.expt;
import com.wcohen.ss.api.*;
import com.wcohen.ss.tokens.*;
import java.util.*;
/**
* Finds all pairs that share a not-too-common token.
*/
public class TokenBlocker extends Blocker
{
    // Default cap on the fraction of the smaller source's instances that may
    // contain a token before it is treated as a stopword.  Overridable via
    // the "blockerMaxFraction" system property.
    private static double defaultMaxFraction = 1.0;
    static {
        try {
            String s = System.getProperty("blockerMaxFraction");
            if (s != null) defaultMaxFraction = Double.parseDouble(s);
        } catch (NumberFormatException ignored) {
            // Malformed property value: deliberately keep the built-in default.
        }
    }

    /**
     * Sentinel stored in the token index in place of a posting set once a
     * token becomes too common to be useful for blocking.  Compared by
     * reference; must never be mutated.
     */
    private static final Set STOPWORD_TOKEN_MARKER = new HashSet();

    private ArrayList pairList;       // pairs produced by the most recent block() call
    protected Tokenizer tokenizer;    // splits instance strings into tokens
    private double maxFraction;       // stopword threshold, as a fraction of the smaller source
    private int numCorrectPairs;      // number of correct pairs in the data passed to block()

    /**
     * @param tokenizer   tokenizer used to break instances into tokens
     * @param maxFraction tokens appearing in more than this fraction of the
     *                    smaller source are ignored as stopwords
     */
    public TokenBlocker(Tokenizer tokenizer, double maxFraction) {
        this.tokenizer = tokenizer;
        this.maxFraction = maxFraction;
    }

    /** Uses the default tokenizer and the (property-configurable) default max fraction. */
    public TokenBlocker() {
        this(SimpleTokenizer.DEFAULT_TOKENIZER, defaultMaxFraction);
    }

    public double getMaxFraction() { return maxFraction; }
    public void setMaxFraction(double maxFraction) { this.maxFraction = maxFraction; }

    /**
     * Propose candidate pairs: all pairs of instances (one from each source,
     * or any two distinct instances in clusterMode) that share at least one
     * token occurring in no more than maxFraction of the smaller source.
     *
     * @param data must hold exactly two sources (or one, in clusterMode)
     * @throws IllegalArgumentException if the source count is wrong for the mode
     */
    public void block(MatchData data)
    {
        numCorrectPairs = countCorrectPairs(data);
        pairList = new ArrayList();
        if (!clusterMode && data.numSources() != 2)
            throw new IllegalArgumentException("need exactly two sources out of clusterMode");
        if (clusterMode && data.numSources() != 1)
            throw new IllegalArgumentException("need exactly one source in clusterMode");

        // Index the smaller source so the token index stays small.
        String smallSource = data.getSource(0);
        String bigSource = clusterMode ? data.getSource(0) : data.getSource(1);
        if (data.numInstances(smallSource) > data.numInstances(bigSource)) {
            String tmp = smallSource;
            smallSource = bigSource;
            bigSource = tmp;
        }

        // Build an inverted index over the smaller source:
        // token -> sorted set of instance indices containing that token.
        double maxSetSize = data.numInstances(smallSource) * maxFraction;
        Map index = new TreeMap();
        for (int i = 0; i < data.numInstances(smallSource); i++) {
            Token[] tokens = tokenizer.tokenize(data.getInstance(smallSource, i).unwrap());
            for (int j = 0; j < tokens.length; j++) {
                Set containers = (Set) index.get(tokens[j]);
                if (containers == STOPWORD_TOKEN_MARKER) {
                    // BUG FIX: the original fell through and added i to the
                    // shared sentinel set; stopworded tokens must be skipped.
                    continue;
                }
                if (containers == null) {
                    containers = new TreeSet();
                    index.put(tokens[j], containers);
                }
                containers.add(new Integer(i));
                // Token has grown too common: replace its posting set with the marker.
                if (containers.size() > maxSetSize) {
                    index.put(tokens[j], STOPWORD_TOKEN_MARKER);
                }
            }
        }
        //System.out.println("data:\n"+data); showIndex(index);

        // Scan the larger source, pairing each instance with every indexed
        // instance that shares a non-stopword token.  pairedUpInstances
        // de-duplicates so each (bigInst, smallInst) pair is emitted once.
        Set pairedUpInstances = new TreeSet();
        for (int i = 0; i < data.numInstances(bigSource); i++) {
            MatchData.Instance bigInst = data.getInstance(bigSource, i);
            pairedUpInstances.clear();
            Token[] tokens = tokenizer.tokenize(bigInst.unwrap());
            for (int j = 0; j < tokens.length; j++) {
                Set containers = (Set) index.get(tokens[j]);
                if (containers != null && containers != STOPWORD_TOKEN_MARKER) {
                    for (Iterator k = containers.iterator(); k.hasNext(); ) {
                        Integer smallIndexInteger = (Integer) k.next();
                        int smallIndex = smallIndexInteger.intValue();
                        // Reference comparison is intentional: in clusterMode both
                        // names come from data.getSource(0), so smallSource==bigSource
                        // and smallIndex>i emits each unordered pair exactly once
                        // (and never pairs an instance with itself).
                        if (!pairedUpInstances.contains(smallIndexInteger)
                            && (smallSource != bigSource || smallIndex > i)) {
                            MatchData.Instance smallInst = data.getInstance(smallSource, smallIndex);
                            pairList.add(new Blocker.Pair(bigInst, smallInst, smallInst.sameId(bigInst)));
                            pairedUpInstances.add(smallIndexInteger);
                        }
                    }
                }
            }
        }
    }

    /** @return number of candidate pairs produced by the last block() call */
    public int size() { return pairList.size(); }

    /** @return the i-th candidate pair from the last block() call */
    public Pair getPair(int i) { return (Pair) pairList.get(i); }

    public String toString() {
        return "[TokenBlocker:clusterMode=" + clusterMode + ",maxFraction=" + maxFraction + "]";
    }

    /** @return number of correct pairs in the data last passed to block() */
    public int numCorrectPairs() { return numCorrectPairs; }

    /** Debugging aid: dump the inverted index (token -> instance indices) to stdout. */
    private void showIndex(Map index)
    {
        for (Iterator i = index.keySet().iterator(); i.hasNext(); ) {
            Token tok = (Token) i.next();
            System.out.print(tok.toString());
            Set containers = (Set) index.get(tok);
            for (Iterator j = containers.iterator(); j.hasNext(); ) {
                Integer k = (Integer) j.next();
                System.out.print(" " + k);
            }
            System.out.println();
        }
    }
}
|