1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212
|
package com.wcohen.ss.expt;
import com.wcohen.ss.*;
import com.wcohen.ss.api.*;
import java.util.*;
import java.io.*;
/**
 * Holds data for evaluating a distance metric: a collection of
 * {@link Instance}s grouped by the source relation they were added
 * under. Sources are remembered in the order in which they were
 * first seen, and instances in the order in which they were added.
 */
public class SourcedMatchData
{
    /** Maps a source name (String) to the List of Instances added under that name. */
    private final Map sourceLists;
    /** Source names (Strings), in order of first appearance. */
    private final List sourceNames;
    /** Name of the file the data was read from, or "none" for an initially empty data set. */
    private final String filename;

    /**
     * Read match data from a file. Format should be:
     * sourceRelation TAB instanceID TAB field1 TAB ... fieldn LF
     *
     * Everything after the second TAB (embedded TABs included) becomes
     * the text of the instance.
     *
     * @param filename name of the file to read
     * @throws InputFormatException if a line has fewer than three
     * TAB-separated fields, or if the file cannot be read (in which case
     * the line number is reported as 0 and the underlying IOException is
     * attached as the cause)
     */
    public SourcedMatchData(String filename) throws InputFormatException
    {
        this.filename = filename;
        sourceNames = new ArrayList();
        sourceLists = new HashMap();
        try {
            BufferedReader in = new BufferedReader(new FileReader(filename));
            try {
                String line;
                int lineNum = 0;
                while ((line = in.readLine()) != null) {
                    lineNum++;
                    // A limit of 3 leaves the whole remainder of the line,
                    // tabs included, in tok[2]. split() always yields at
                    // least one element, so a source is always present.
                    String[] tok = line.split("\t", 3);
                    if (tok.length < 2) {
                        throw new InputFormatException(filename, lineNum, "no id");
                    }
                    if (tok.length < 3) {
                        throw new InputFormatException(filename, lineNum, "no text fields");
                    }
                    addInstance(tok[0], tok[1], tok[2]);
                }
            } finally {
                // release the file handle even when a malformed line aborts the read
                in.close();
            }
        } catch (IOException e) {
            InputFormatException wrapped = new InputFormatException(filename, 0, e.toString());
            wrapped.initCause(e);
            throw wrapped;
        }
    }

    /** Create an empty data set whose filename is reported as "none". */
    public SourcedMatchData()
    {
        this.filename = "none";
        sourceNames = new ArrayList();
        sourceLists = new HashMap();
    }

    /**
     * Add a single instance, with given src and id, to the datafile.
     * The instance is filed under src exactly as given, while the
     * Instance itself stores trimmed copies of src and id.
     */
    public void addInstance(String src, String id, String text)
    {
        Instance inst = new Instance(src, id, text);
        List list = (List) sourceLists.get(src);
        if (list == null) {
            // first instance for this source: register the source
            list = new ArrayList();
            sourceLists.put(src, list);
            sourceNames.add(src);
        }
        list.add(inst);
    }

    /** Number of sources in data set */
    public int numSources() {
        return sourceNames.size();
    }

    /** Get string identifier for i-th source */
    public String getSource(int i) {
        return (String) sourceNames.get(i);
    }

    /**
     * Number of records for source with given string id.
     *
     * @return the number of instances added under src, or 0 if no
     * instance was ever added under that name
     */
    public int numInstances(String src) {
        List list = (List) sourceLists.get(src);
        return list == null ? 0 : list.size();
    }

    /**
     * Get the j-th record for the named source.
     *
     * @throws IndexOutOfBoundsException if j is not a valid index for
     * the source, which includes every j when the source is unknown
     */
    public Instance getInstance(String src, int j) {
        List list = (List) sourceLists.get(src);
        if (list == null) {
            throw new IndexOutOfBoundsException("no instances for source '" + src + "'");
        }
        return (Instance) list.get(j);
    }

    /** Get an iterator over all instances, source by source. */
    public StringWrapperIterator getIterator() {
        return new MatchIterator(this);
    }

    /** Name of the file this data was read from, or "none". */
    public String getFilename()
    {
        return filename;
    }

    /** One line per instance, grouped by source in order of first appearance. */
    public String toString()
    {
        StringBuffer buf = new StringBuffer();
        for (int i = 0; i < numSources(); i++) {
            String src = getSource(i);
            int n = numInstances(src);
            for (int j = 0; j < n; j++) {
                buf.append(getInstance(src, j).toString()).append("\n");
            }
        }
        return buf.toString();
    }

    /** A single item (aka record, string, etc) to match against
     * others. An item has an id (for evaluating correctness of a
     * match), a source (which relation it is from), and a text field.
     * Text is stored as a StringWrapper so that it can be
     * preprocessed, if necessary.
     */
    public static class Instance extends BasicStringWrapper implements SourcedStringWrapper
    {
        private final String source;
        private final String id;

        /**
         * @param source name of the source relation; stored trimmed
         * @param id identifier of the instance; stored trimmed
         * @param text the text to wrap
         */
        public Instance(String source, String id, String text) {
            super(text);
            this.source = source.trim();
            this.id = id.trim();
        }

        public String getSource() { return source; }

        public String getId() { return id; }

        /** True if b is non-null and has the same (trimmed) id as this instance. */
        public boolean sameId(Instance b) {
            return b != null && id.equals(b.id);
        }

        public String toString() { return "[src: '"+source+"' id: '"+id+"' unwrapped: '"+unwrap()+"']"; }
    }

    /** Iterates over all stored StringWrappers */
    static public class MatchIterator implements SourcedStringWrapperIterator
    {
        private int sourceCursor, instanceCursor;
        private String src; // caches getSource(sourceCursor); null when there are no sources
        private final SourcedMatchData data;

        public MatchIterator(SourcedMatchData data) {
            this.data = data;
            sourceCursor = 0;
            instanceCursor = 0;
            // an empty data set has no source 0 to cache
            src = data.numSources() > 0 ? data.getSource(sourceCursor) : null;
        }

        /** Not implemented. */
        public void remove() { throw new IllegalStateException("remove not implemented"); }

        /** Return the next StringWrapper. */
        public StringWrapper nextStringWrapper() {
            return (StringWrapper) next();
        }

        /** Return the next StringWrapper. */
        public SourcedStringWrapper nextSourcedStringWrapper() {
            return (SourcedStringWrapper) next();
        }

        public boolean hasNext() {
            return sourceCursor < data.numSources() && instanceCursor < data.numInstances(src);
        }

        /**
         * Returns the next StringWrapper as an object.
         *
         * @throws NoSuchElementException if the iteration has no more elements
         */
        public Object next() {
            if (!hasNext()) {
                throw new NoSuchElementException("no more instances");
            }
            Instance inst = data.getInstance(src, instanceCursor++);
            if (instanceCursor >= data.numInstances(src)) {
                // current source exhausted: advance to the next one, if any
                sourceCursor++;
                instanceCursor = 0;
                if (sourceCursor < data.numSources()) {
                    src = data.getSource(sourceCursor);
                }
            }
            return inst;
        }
    }

    /** Signals an incorrectly formatted MatchData file.
     */
    public static class InputFormatException extends RuntimeException {
        public InputFormatException(String file, int line, String msg) {
            super("line "+line+" of file "+file+": "+msg);
        }
    }

    /**
     * Loads the file named by the first argument, prints the data set,
     * then prints every instance again by iterating over it.
     */
    public static void main(String[] argv)
    {
        if (argv.length < 1) {
            System.err.println("usage: java com.wcohen.ss.expt.SourcedMatchData FILE");
            return;
        }
        try {
            SourcedMatchData md = new SourcedMatchData(argv[0]);
            System.out.println(md.toString());
            System.out.println("iterating!");
            for (Iterator i = md.getIterator(); i.hasNext(); ) {
                System.out.println(i.next().toString());
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
|