File: Mixture.java

package info (click to toggle)
libsecondstring-java 0.1~dfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 764 kB
  • sloc: java: 9,592; xml: 114; makefile: 6
file content (67 lines) | stat: -rw-r--r-- 2,080 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
package com.wcohen.ss;

import java.util.*;
import com.wcohen.ss.api.*;
import com.wcohen.ss.tokens.*;

/**
 * Mixture-based distance metric.
 */

public class Mixture extends AbstractStatisticalTokenDistance
{
	private double minChange = 0.01;
	private double maxIterate = 20;

	public Mixture(Tokenizer tokenizer) { super(tokenizer);	}
	public Mixture() { super(); }

	/** Distance is argmax_lambda prod_{w in s} lambda Pr(w|t) * (1-lambda) Pr(w|background). 
	 * This is computed with E/M. */
	public double score(StringWrapper s,StringWrapper t) 
	{
		BagOfTokens sBag = asBagOfTokens(s);
		BagOfTokens tBag = asBagOfTokens(t);
		double lambda = 0.5;
		int iterations = 0;
		while (true) {
			double newLamba = 0.0;
			// E step: compute prob each token is draw from T
			for (Iterator i = sBag.tokenIterator(); i.hasNext(); ) {
				Token tok = (Token)i.next();
				double sWeight = sBag.getWeight(tok);
				double tWeight = tBag.getWeight(tok);
				double probTokGivenT = tWeight/tBag.getTotalWeight();
				double probTokGivenCorpus = ((double)getDocumentFrequency(tok))/totalTokenCount;
				double probDrawnFromT = lambda * probTokGivenT;
				double probDrawnFromCorpus = (1.0-lambda) * probTokGivenCorpus;
				double normalizingConstant = probTokGivenT + probTokGivenCorpus;
				probDrawnFromT /= normalizingConstant;
				probDrawnFromCorpus /= normalizingConstant;
				newLamba += probDrawnFromT * sWeight;
			}
			// M step: find best value of lambda
			newLamba /= sBag.getTotalWeight();
			// halt if converged
			double change = newLamba - lambda;
			if (iterations>maxIterate || (change>= -minChange && change<=minChange)) break;
			else lambda = newLamba;
		}
		return lambda;
	}
	
	/** Explain how the distance was computed. 
	 * In the output, the tokens in S and T are listed, and the
	 * common tokens are marked with an asterisk.
	 */
	public String explainScore(StringWrapper s, StringWrapper t) 
	{
		return "can't explain";
	}

	public String toString() { return "[Mixture]"; }
	
	static public void main(String[] argv) {
		doMain(new Mixture(), argv);
	}
}