File: NGramTokenizer.java

package com.wcohen.ss.tokens;

import java.util.*;
import com.wcohen.ss.api.*;

/**
 * Wraps another tokenizer, and adds all character n-grams computed
 * from each token produced by the inner tokenizer.  A worked example
 * appears at the end of this file.
 */

public class NGramTokenizer implements Tokenizer
{
    private int minNGramSize;
    private int maxNGramSize;
    private boolean keepOldTokens;
    private Tokenizer innerTokenizer; // produces the base tokens from which n-grams are computed
	
    /** Default configuration: n-grams of length 3 to 5, keeping the inner tokenizer's tokens. */
    public static final NGramTokenizer DEFAULT_TOKENIZER = new NGramTokenizer(3,5,true,SimpleTokenizer.DEFAULT_TOKENIZER);

    /**
     * @param minNGramSize smallest n-gram length to emit
     * @param maxNGramSize largest n-gram length to emit
     * @param keepOldTokens if true, each inner token (wrapped in boundary markers) is also emitted
     * @param innerTokenizer the tokenizer whose tokens are split into n-grams
     */
    public NGramTokenizer(int minNGramSize,int maxNGramSize,boolean keepOldTokens,Tokenizer innerTokenizer) {
        this.minNGramSize = minNGramSize;
        this.maxNGramSize = maxNGramSize;
        this.keepOldTokens = keepOldTokens;
        this.innerTokenizer = innerTokenizer;
    }

    /** Return a tokenized version of a string.  The tokens are all
     * character n-grams drawn from tokens produced by the inner
     * tokenizer, plus, if keepOldTokens is set, the inner tokens
     * themselves (wrapped in boundary markers). */
    public Token[] tokenize(String input) 
    {
        Token[] initialTokens = innerTokenizer.tokenize(input);
        List<Token> tokens = new ArrayList<Token>();
        for (int i=0; i<initialTokens.length; i++) {
            Token tok = initialTokens[i];
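            // surround the token with '^' and '$' markers so that n-grams
            // anchored at the start or end of the token stay distinct from
            // interior n-grams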
            String str = "^"+tok.getValue()+"$";
            if (keepOldTokens) tokens.add( intern(str) );
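            // slide windows of every length between minNGramSize and
            // maxNGramSize across the marked string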
            for (int lo=0; lo<str.length(); lo++) {
                for (int len=minNGramSize; len<=maxNGramSize; len++) {
                    if (lo+len<=str.length()) { // inclusive bound, so n-grams ending at the final '$' are emitted
                        tokens.add( innerTokenizer.intern( str.substring(lo,lo+len) )); 
                    }
                }
            }
        }
        return tokens.toArray(new Token[tokens.size()]);
    }
	
    // interning and token bookkeeping are delegated to the inner tokenizer,
    // so token indices are shared with it
    public Token intern(String s) { return innerTokenizer.intern(s); }
    public Iterator tokenIterator() { return innerTokenizer.tokenIterator(); }
    public int maxTokenIndex() { return innerTokenizer.maxTokenIndex(); }

    /** Test routine */
    public static void main(String[] argv) 
    {
        NGramTokenizer tokenizer = NGramTokenizer.DEFAULT_TOKENIZER;
        //NGramTokenizer tokenizer = new NGramTokenizer(1,1,false,SimpleTokenizer.DEFAULT_TOKENIZER);
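        // (the variant above would emit only character unigrams, including the
        // '^' and '$' markers, and would drop the original tokens)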
        int n = 0;
        for (int i=0; i<argv.length; i++) {
            System.out.println("argument "+i+": '"+argv[i]+"'");
            Token[] tokens = tokenizer.tokenize(argv[i]);
            for (int j=0; j<tokens.length; j++) {
                System.out.println("token "+(++n)+":"
                                   +" id="+tokens[j].getIndex()
                                   +" value: '"+tokens[j].getValue()+"'");
            }
        }
    }
}
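
// Worked example (a sketch, assuming SimpleTokenizer.DEFAULT_TOKENIZER yields the
// single token "abc" for the input "abc"):
//
//   NGramTokenizer.DEFAULT_TOKENIZER.tokenize("abc")
//
// marks the token as "^abc$" and, with minNGramSize=3, maxNGramSize=5, and
// keepOldTokens=true, produces the tokens
//
//   "^abc$"                 (the kept original, with boundary markers)
//   "^ab", "^abc", "^abc$"  (n-grams starting at '^')
//   "abc", "abc$"           (n-grams starting at 'a')
//   "bc$"                   (n-gram starting at 'b')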