File: AbstractStatisticalTokenDistance.java

package info (click to toggle)
libsecondstring-java 0.1~dfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 764 kB
  • sloc: java: 9,592; xml: 114; makefile: 6
file content (77 lines) | stat: -rw-r--r-- 2,466 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
package com.wcohen.ss;

import java.util.*;
import com.wcohen.ss.tokens.*;
import com.wcohen.ss.api.*;

import org.apache.log4j.Logger;

/**
 * Abstract token distance metric that uses frequency statistics.
 */

abstract public class AbstractStatisticalTokenDistance extends AbstractTokenizedStringDistance
{
	private static Logger log=Logger.getLogger(AbstractTokenizedStringDistance.class);

	// to save space, allocate the small numbers only once in the documentFrequency map
	private static final Integer ONE = new Integer(1);
	private static final Integer TWO = new Integer(2);
	private static final Integer THREE = new Integer(3);

	// maps tokens to document frequency
	protected Map documentFrequency = new HashMap(); 
	// count number of documents
	protected int collectionSize = 0;
	// count number of tokens
	protected int totalTokenCount = 0;

	// count warnings
	private int warningCounter = 0;

	public AbstractStatisticalTokenDistance(Tokenizer tokenizer) { super(tokenizer); }
	public AbstractStatisticalTokenDistance() { super(); }
	
	/** Accumulate statistics on how often each token value occurs 
	 */
	public void train(StringWrapperIterator i) 
	{
		Set seenTokens = new HashSet();
		while (i.hasNext()) {
			BagOfTokens bag = asBagOfTokens(i.nextStringWrapper());
			seenTokens.clear();
			for (Iterator j=bag.tokenIterator(); j.hasNext(); ) {
				totalTokenCount++;
				Token tokj = (Token)j.next();
				if (!seenTokens.contains(tokj)) {
					seenTokens.add(tokj);
					// increment documentFrequency counts
					Integer df = (Integer)documentFrequency.get(tokj);
					if (df==null) documentFrequency.put(tokj,ONE); 
					else if (df==ONE) documentFrequency.put(tokj,TWO);
					else if (df==TWO) documentFrequency.put(tokj,THREE);
					else documentFrequency.put(tokj, new Integer(df.intValue()+1));
				}
			}
			collectionSize++;
		}
	}

	protected void checkTrainingHasHappened(StringWrapper s, StringWrapper t)
	{
		if (collectionSize==0 && ++warningCounter<=10) {
		    log.warn(this.getClass()+" not yet trained for sim('"+s+"','"+t+"')");
		    if (warningCounter == 10) {
			log.warn("(By the way, that's the last warning you'll get about this.)");
		    }
		}
	}

	public int getDocumentFrequency(Token tok) {
		Integer freqInteger = (Integer)documentFrequency.get(tok);
		if (freqInteger==null) return 0;
		else return freqInteger.intValue();
	}
	
	public Iterator tokenIterator() { return documentFrequency.keySet().iterator(); }
}