File: AbstractSourcedTokenizedStringDistance.java

package info (click to toggle)
libsecondstring-java 0.1~dfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 764 kB
  • sloc: java: 9,592; xml: 114; makefile: 6
file content (45 lines) | stat: -rw-r--r-- 1,571 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
package com.wcohen.ss;

import java.util.*;
import com.wcohen.ss.tokens.*;
import com.wcohen.ss.api.*;


/**
 * Abstract distance metric for strings that are compared as bags of
 * source-tagged tokens.
 *
 * <p>The class holds the {@link SourcedTokenizer} used to split wrappers
 * into tokens and keeps the list of bags built by the most recent call to
 * {@link #prepare}.
 */
public abstract class AbstractSourcedTokenizedStringDistance extends AbstractStringDistance
{
    /** Tokenizer applied to wrappers that are not already bags of sourced tokens. */
    protected SourcedTokenizer tokenizer;

    /** Tokenized wrappers collected by the last call to {@link #prepare}. */
    private List tokenizedWrappers;

    /**
     * Creates a distance that tokenizes with the supplied tokenizer.
     *
     * @param tokenizer tokenizer to use; it is cast to {@link SourcedTokenizer},
     *        so a tokenizer of any other kind fails with a ClassCastException
     */
    public AbstractSourcedTokenizedStringDistance(Tokenizer tokenizer)
    {
        this.tokenizer = (SourcedTokenizer) tokenizer;
    }

    /**
     * Creates a distance that tokenizes with
     * {@code SimpleSourcedTokenizer.DEFAULT_SOURCED_TOKENIZER}.
     */
    public AbstractSourcedTokenizedStringDistance()
    {
        this(SimpleSourcedTokenizer.DEFAULT_SOURCED_TOKENIZER);
    }

    /**
     * Supplies the pool of string wrappers; this simply forwards the
     * iterator to {@link #train}.
     *
     * @param i iterator over the wrappers in the pool
     */
    public final void setStringWrapperPool(StringWrapperIterator i)
    {
        train(i);
    }

    /**
     * Hook invoked by {@link #setStringWrapperPool} with the wrapper pool.
     * What is done with the wrappers is left to the concrete subclass.
     *
     * @param i iterator over the wrappers to train on
     */
    public abstract void train(StringWrapperIterator i);

    /**
     * Converts every wrapper produced by the iterator into a bag of sourced
     * tokens, stores the bags in iteration order, and returns an iterator
     * over the stored bags.
     *
     * @param i0 iterator to read from; it is cast to
     *        {@code SourcedStringWrapperIterator}
     * @return a {@code BasicSourcedStringWrapperIterator} over the stored bags
     */
    public final StringWrapperIterator prepare(StringWrapperIterator i0)
    {
        // The cast is performed before the cache is replaced, so a failed
        // cast leaves the previously stored list in place.
        SourcedStringWrapperIterator wrappers = (SourcedStringWrapperIterator) i0;

        List bags = new ArrayList();
        tokenizedWrappers = bags;
        while (wrappers.hasNext()) {
            SourcedStringWrapper next = wrappers.nextSourcedStringWrapper();
            bags.add(asBagOfSourcedTokens(next));
        }

        Iterator stored = tokenizedWrappers.iterator();
        return new BasicSourcedStringWrapperIterator(stored);
    }

    /**
     * Returns the wrapper as a bag of sourced tokens.
     *
     * @param w wrapper to convert
     * @return {@code w} itself when it already is a {@code BagOfSourcedTokens};
     *         otherwise a new bag built from the wrapper's string and the
     *         tokens the tokenizer produces for that string and its source
     */
    protected final BagOfSourcedTokens asBagOfSourcedTokens(SourcedStringWrapper w)
    {
        if (!(w instanceof BagOfSourcedTokens)) {
            SourcedToken[] tokens = tokenizer.sourcedTokenize(w.unwrap(), w.getSource());
            return new BagOfSourcedTokens(w.unwrap(), tokens);
        }
        // Already tokenized: hand back the same object.
        return (BagOfSourcedTokens) w;
    }
}