File: SourcedMatchData.java

package info (click to toggle)
libsecondstring-java 0.1~dfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 764 kB
  • sloc: java: 9,592; xml: 114; makefile: 6
file content (212 lines) | stat: -rw-r--r-- 6,826 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
package com.wcohen.ss.expt;

import com.wcohen.ss.*;
import com.wcohen.ss.api.*;
import java.util.*;
import java.io.*;

/**
 * Holds data for evaluating a distance metric.
 */

public class SourcedMatchData 
{
    private Map sourceLists;
    private ArrayList sourceNames;
    private String filename;
	
    /**
     * Read match data from a file.  Format should be:
     * sourceRelation TAB instanceID TAB field1 TAB ... fieldn LF
     *
     * <p>Each line is split on TAB characters: the first token is the
     * source, the second is the instance id, and everything after the
     * second TAB (embedded TABs included) becomes the text of the
     * instance, which is stored via addInstance.
     *
     * @param filename path of the file to read
     * @throws InputFormatException if a line has fewer than three
     *   TAB-separated tokens, or if the file cannot be opened or read
     *   (the underlying IOException is then attached as the cause)
     */

    public SourcedMatchData(String filename) throws InputFormatException
    {
        this.filename = filename;
        sourceNames = new ArrayList();
        sourceLists = new HashMap();
        // Declared outside the try block so the catch clause can report how
        // far reading got and the finally clause can release the reader.
        int lineNum = 0;
        BufferedReader in = null;
        try {
            in = new BufferedReader(new FileReader(filename));
            String line;
            while ((line = in.readLine())!=null) {
                lineNum++;
                // A limit of 3 keeps everything after the second TAB (further
                // TABs and trailing empty fields included) in the last token,
                // so the text does not have to be re-joined field by field.
                String tok[] = line.split("\t",3);
                // split() with a non-zero limit always yields at least one
                // token, so tok[0] exists (it is "" for an empty line).
                String src = tok[0];
                if (tok.length < 2)
                    throw new
                        InputFormatException(filename,lineNum,"no id");
                String id = tok[1];
                if (tok.length < 3)
                    throw new
                        InputFormatException(filename,lineNum,"no text fields");
                addInstance(src,id,tok[2]);
            }
        } catch (IOException e) {
            // lineNum is 0 when the file could not be opened at all.
            InputFormatException wrapped =
                new InputFormatException(filename,lineNum,e.toString());
            wrapped.initCause(e);
            throw wrapped;
        } finally {
            // Runs on every exit path, including a format error mid-file.
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing a reader fails.
                }
            }
        }
    }


    /** Create an empty data set that is not read from a file; the
     * filename is recorded as "none".
     */
    public SourcedMatchData()
    {
        sourceLists = new HashMap();
        sourceNames = new ArrayList();
        filename = "none";
    }

    /** Append one record to the data set.  The first record seen for a
     * source also registers that source, so sources are numbered in the
     * order they first appear.
     *
     * NOTE(review): the per-source list is keyed by src exactly as given,
     * while the Instance constructor stores a trimmed copy of it.
     */
    public void addInstance(String src,String id,String text)
    {
        // Build the record first so a bad argument fails before any state changes.
        final Instance record = new Instance(src,id,text);
        ArrayList bucket;
        if (sourceLists.containsKey(src)) {
            bucket = (ArrayList)sourceLists.get(src);
        } else {
            bucket = new ArrayList();
            sourceNames.add(src);
            sourceLists.put(src,bucket);
        }
        bucket.add(record);
    }

    /** How many distinct sources have been registered so far. */
    public int numSources() {
        final int count = sourceNames.size();
        return count;
    }

    /** Name of the i-th source, counting from zero in the order the
     * sources were registered.
     */
    public String getSource(int i) {
        final Object name = sourceNames.get(i);
        return (String)name;
    }

    /** Number of records for source with given string id.
     *
     * @param src source name, as it was passed to addInstance
     * @return how many instances are stored under that source, or 0 when
     *   nothing has been stored for it
     */
    public int numInstances(String src) { 
        ArrayList list = (ArrayList)sourceLists.get(src);
        // A source nobody added records for simply has no records.
        return list==null ? 0 : list.size();
    }

    /** Fetch the record at zero-based position j within the list kept
     * for the named source.
     */
    public Instance getInstance(String src, int j) {
        final ArrayList records = (ArrayList)sourceLists.get(src);
        final Object record = records.get(j);
        return (Instance)record;
    }

    /** Create a fresh MatchIterator over this data set. */
    public StringWrapperIterator getIterator() {
        final MatchIterator cursor = new MatchIterator(this);
        return cursor;
    }

    /** The filename given to the constructor, or "none" for a data set
     * created with the no-argument constructor.
     */
    public String getFilename()
    {
        return this.filename;
    }

    /** Render every stored instance, one per line, grouped by source in
     * source-index order.
     */
    public String toString()
    {
        final StringBuffer out = new StringBuffer();
        int s = 0;
        while (s<numSources()) {
            final String name = getSource(s);
            int k = 0;
            while (k<numInstances(name)) {
                out.append(getInstance(name,k).toString());
                out.append("\n");
                k++;
            }
            s++;
        }
        return out.toString();
    }

	
    /** One record to be matched against others.  The text is handed to
     * the BasicStringWrapper superclass; in addition an instance carries
     * the name of the source (relation) it came from and an id used to
     * judge whether a match is correct.  Source and id are stored with
     * leading and trailing whitespace trimmed.
     */
    public static class Instance extends BasicStringWrapper implements SourcedStringWrapper
    {
        private final String source;
        private final String id;

        /** Wrap the text and remember the trimmed source and id. */
        public Instance(String source, String id, String text) {
            super(text);
            this.source = source.trim();
            this.id = id.trim();
        }

        /** Trimmed name of the source this instance belongs to. */
        public String getSource() {
            return source;
        }

        /** Trimmed identifier of this instance. */
        public String getId() {
            return id;
        }

        /** True when both ids are non-null and equal. */
        public boolean sameId(Instance b) {
            if (id==null || b.id==null) {
                return false;
            }
            return id.equals(b.id);
        }

        /** Debug rendering showing source, id and the unwrapped text. */
        public String toString() {
            StringBuffer text = new StringBuffer();
            text.append("[src: '").append(source);
            text.append("' id: '").append(id);
            text.append("' unwrapped: '").append(unwrap());
            text.append("']");
            return text.toString();
        }
    }
	
    /** Iterates over all stored StringWrappers: sources are visited by
     * increasing source index, and the instances of each source by
     * increasing instance index.
     */
    static public class MatchIterator implements SourcedStringWrapperIterator 
    {
        private int sourceCursor,instanceCursor;
        private String src;  // caches getSource(sourceCursor); null if the data set has no sources
        private SourcedMatchData data;

        /** Start iterating at the first instance of the first source.
         * An empty data set yields an iterator with no elements.
         */
        public MatchIterator(SourcedMatchData data) { 
            this.data = data;
            sourceCursor = 0; 
            instanceCursor = 0; 
            // Only look up the first source name if there is one; asking
            // for source 0 of an empty data set would throw.
            src = data.numSources()>0 ? data.getSource(sourceCursor) : null;
        }

        /** Not implemented. */
        public void remove() { throw new IllegalStateException("remove not implemented"); }

        /** Return the next StringWrapper. */
        public StringWrapper nextStringWrapper() { 
            return (StringWrapper)next(); 
        }

        /** Return the next StringWrapper, typed as a SourcedStringWrapper. */
        public SourcedStringWrapper nextSourcedStringWrapper() { 
            return (SourcedStringWrapper)next(); 
        }

        /** True while there is another instance to return. */
        public boolean hasNext() { 
            // The first test short-circuits, so src is never consulted once
            // all sources are used up (or when there were none).
            return sourceCursor<data.numSources() && instanceCursor<data.numInstances(src); 
        }

        /** Returns the next StringWrapper as an object.
         *
         * @throws NoSuchElementException if the iteration has no more elements
         */
        public Object next() {
            // Without this guard a call past the end would hand out the
            // instances of the last source a second time.
            if (!hasNext())
                throw new NoSuchElementException("no more instances");
            Instance inst = data.getInstance( src, instanceCursor++ );
            if (instanceCursor>=data.numInstances(src)) {
                // Current source is used up: move on to the next one, if any.
                sourceCursor++; instanceCursor=0;
                if (sourceCursor<data.numSources()) 
                    src = data.getSource(sourceCursor);
            }
            return inst;
        }
    }
	

    /** Signals an incorrectly formatted MatchData file.  The message has
     * the form "line LINE of file FILE: MSG".
     */
    public static class InputFormatException extends RuntimeException {
        public InputFormatException(String file, int line, String msg) {
            super(describe(file,line,msg));
        }

        /** Assemble the exception message from the location and the problem. */
        private static String describe(String file, int line, String msg) {
            StringBuffer text = new StringBuffer("line ");
            text.append(line).append(" of file ").append(file);
            text.append(": ").append(msg);
            return text.toString();
        }
    }
	
    /** Command-line check: loads the file named by the first argument,
     * prints the whole data set, then prints each instance again by
     * walking the iterator.  Any failure is reported as a stack trace.
     */
    public static void main(String[] argv) 
    {
        // Explain the expected argument instead of failing with an
        // ArrayIndexOutOfBoundsException stack trace.
        if (argv.length < 1) {
            System.err.println("usage: java com.wcohen.ss.expt.SourcedMatchData <file>");
            return;
        }
        try {
            SourcedMatchData md = new SourcedMatchData(argv[0]);
            System.out.println(md.toString());
            System.out.println("iterating!");
            for (Iterator i = md.getIterator(); i.hasNext(); ) {
                System.out.println(i.next().toString());                
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}