File: SimpleSourcedTokenizer.java

package info (click to toggle)
libsecondstring-java 0.1~dfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 764 kB
  • sloc: java: 9,592; xml: 114; makefile: 6
file content (53 lines) | stat: -rw-r--r-- 1,966 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
package com.wcohen.ss.tokens;

import java.util.*;
import com.wcohen.ss.api.*;

/**
 * Simple implementation of a SourcedTokenizer.  Strings are split into
 * tokens by the tokenize() method inherited from SimpleTokenizer; each
 * token is then tagged with the source it came from and given an
 * integer id.
 *
 * <p>Ids are assigned per distinct (token value, source) pair, starting
 * at 1 in order of first appearance, and are remembered for the lifetime
 * of the tokenizer instance.  The id table is plain mutable instance
 * state with no synchronization; this includes the table held by the
 * shared {@link #DEFAULT_SOURCED_TOKENIZER}.
 */

public class SimpleSourcedTokenizer extends SimpleTokenizer implements SourcedTokenizer
{
    /** Most recently assigned id; incremented before use, so the first id is 1. */
    private int nextId = 0;
    /** Maps a "value@source" key to the id assigned to that pair. */
    private final Map<String,Integer> tokMap = new TreeMap<String,Integer>();

    /** Shared instance constructed with ignorePunctuation=true and ignoreCase=true. */
    public static final SimpleSourcedTokenizer DEFAULT_SOURCED_TOKENIZER = new SimpleSourcedTokenizer(true,true);

    /**
     * Create a tokenizer with an empty id table.
     *
     * @param ignorePunctuation passed unchanged to the SimpleTokenizer constructor
     * @param ignoreCase passed unchanged to the SimpleTokenizer constructor
     */
    public SimpleSourcedTokenizer(boolean ignorePunctuation,boolean ignoreCase) {
        super(ignorePunctuation,ignoreCase);
    }

    /**
     * Return tokenized version of a string, with every token tagged by
     * its source.
     *
     * @param input the string to tokenize
     * @param source label recorded on every returned token and used,
     *        together with the token value, to choose the token's id
     * @return one SourcedToken per token produced by tokenize(input),
     *         in the same order
     */
    public SourcedToken[] sourcedTokenize(String input,String source)
    {
        Token[] tokens = tokenize(input);
        SourcedToken[] sourcedTokens = new SourcedToken[tokens.length];
        for (int i=0; i<tokens.length; i++) {
            String value = tokens[i].getValue();
            String key = value+"@"+source;
            // Single lookup: reuse the stored id, or allocate the next one
            // the first time this (value, source) pair is seen.
            Integer id = tokMap.get(key);
            if (id==null) {
                id = Integer.valueOf(++nextId);
                tokMap.put(key,id);
            }
            sourcedTokens[i] = new BasicSourcedToken(id.intValue(), value, source);
        }
        return sourcedTokens;
    }

    /**
     * Test routine.  Tokenizes each command-line argument with the
     * default tokenizer, using the argument's position as its source,
     * and prints every resulting token.
     */
    public static void main(String[] argv)
    {
        SimpleSourcedTokenizer tokenizer = DEFAULT_SOURCED_TOKENIZER;
        int n = 0; // running count of tokens printed across all arguments
        for (int i=0; i<argv.length; i++) {
            System.out.println("argument "+i+": '"+argv[i]+"'");
            SourcedToken[] tokens = tokenizer.sourcedTokenize(argv[i],Integer.toString(i));
            for (int j=0; j<tokens.length; j++) {
                System.out.println("token "+(++n)+":"
                                   +" id="+tokens[j].getIndex()
                                   +" value: '"+tokens[j].getValue()
                                   +"' source: '"+tokens[j].getSource()+"'");
            }
        }
    }
}