File: AbstractSourcedStatisticalTokenDistance.java

package info (click to toggle)
libsecondstring-java 0.1~dfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 764 kB
  • sloc: java: 9,592; xml: 114; makefile: 6
file content (73 lines) | stat: -rw-r--r-- 2,821 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
package com.wcohen.ss;

import java.util.*;
import com.wcohen.ss.tokens.*;
import com.wcohen.ss.api.*;


/**
 * Abstract token distance metric that uses frequency statistics.
 *
 * <p>Training ({@link #train}) accumulates three statistics over the
 * documents it is given: the number of documents seen, the number of tokens
 * visited, and, for each token, the number of documents in which that token
 * appeared. The statistics are never reset by this class, so repeated calls
 * to {@link #train} add to the existing counts.
 */

abstract public class AbstractSourcedStatisticalTokenDistance extends AbstractSourcedTokenizedStringDistance
{
	// maximum number of "not yet trained" warnings printed per instance
	private static final int MAX_WARNINGS = 10;

	// maps tokens (Token) to document frequency (Integer); left as a raw Map
	// because the field is protected and may be accessed directly by subclasses
	protected Map documentFrequency = new HashMap(); 
	// count number of documents
	protected int collectionSize = 0;
	// count number of tokens
	protected int totalTokenCount = 0;

	// count warnings already printed; never grows beyond MAX_WARNINGS
	private int warningCounter = 0;

	public AbstractSourcedStatisticalTokenDistance(SourcedTokenizer tokenizer) { super(tokenizer); }
	public AbstractSourcedStatisticalTokenDistance() { super(); }
	
	/** Accumulate statistics on how often each token value occurs.
	 *
	 * <p>For every string wrapper produced by the iterator, each token
	 * returned by the bag's token iterator increments
	 * <code>totalTokenCount</code>; the token's document frequency is
	 * incremented at most once per wrapper; and <code>collectionSize</code>
	 * is incremented once per wrapper.
	 *
	 * @param i0 the training data; must be a SourcedStringWrapperIterator
	 * @throws ClassCastException if <code>i0</code> is not a
	 *         SourcedStringWrapperIterator
	 */
	public void train(StringWrapperIterator i0) 
	{
            SourcedStringWrapperIterator i = (SourcedStringWrapperIterator)i0;
            // tokens already counted for the current document
            Set seenTokens = new HashSet();
            while (i.hasNext()) {
                BagOfSourcedTokens bag = asBagOfSourcedTokens(i.nextSourcedStringWrapper());
                seenTokens.clear();
                for (Iterator j=bag.tokenIterator(); j.hasNext(); ) {
                    totalTokenCount++;
                    Token tokj = (Token)j.next();
                    // Set.add returns true only the first time the token is
                    // seen in this document, so each document contributes at
                    // most one to the token's document frequency
                    if (seenTokens.add(tokj)) {
                        Integer df = (Integer)documentFrequency.get(tokj);
                        int previous = (df==null) ? 0 : df.intValue();
                        // Integer.valueOf shares cached instances for small
                        // values, so no hand-rolled constants or identity
                        // comparisons on boxed values are needed
                        documentFrequency.put(tokj, Integer.valueOf(previous+1));
                    }
                }
                collectionSize++;
            }
	}

	/** Print a warning to standard output if no documents have been
	 * trained on yet. At most ten warnings are printed per instance; the
	 * tenth is followed by a notice that it is the last one.
	 *
	 * @param s first string of the comparison being attempted (used only
	 *        in the warning text)
	 * @param t second string of the comparison being attempted (used only
	 *        in the warning text)
	 */
	protected void checkTrainingHasHappened(StringWrapper s, StringWrapper t)
	{
            // stop counting once the limit is reached so the counter can
            // never overflow and re-enable the warnings
            if (collectionSize==0 && warningCounter<MAX_WARNINGS) {
                warningCounter++;
                System.out.println("Warning: "+this.getClass()+" not yet trained for sim('"+s+"','"+t+"')");
                if (warningCounter == MAX_WARNINGS) {
                    System.out.println("(By the way, that's the last warning you'll get about this.)");
                }
            }
	}

	/** Look up the document frequency recorded for a token.
	 *
	 * @param tok the token to look up
	 * @return the value stored for <code>tok</code> in the document
	 *         frequency map, or 0 if the map has no entry for it
	 */
	public int getDocumentFrequency(Token tok) {
            Integer freqInteger = (Integer)documentFrequency.get(tok);
            return (freqInteger==null) ? 0 : freqInteger.intValue();
	}
}