1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53
|
package com.wcohen.ss.tokens;
import java.util.*;
import com.wcohen.ss.api.*;
/**
 * Tokenizer that labels every token with the source it was read from
 * and with a numeric id.
 *
 * <p>Input is split by the inherited {@code tokenize} method.  Each
 * distinct combination of token value and source string is given an
 * id the first time it is seen; later occurrences of the same
 * combination reuse that id.  Ids start at 1 and increase by one in
 * order of first appearance.
 *
 * <p>The id table is per-instance state that grows with every new
 * (value, source) pair and is never cleared.  This class performs no
 * synchronization, and {@link #DEFAULT_SOURCED_TOKENIZER} is a single
 * shared instance, so all of its users draw ids from the same table.
 */
public class SimpleSourcedTokenizer extends SimpleTokenizer implements SourcedTokenizer
{
    /** Most recently assigned id; incremented before use, so the first id is 1. */
    private int nextId = 0;

    /** Ids already assigned, keyed by {@code value + "@" + source}. */
    private final Map<String,Integer> tokMap = new TreeMap<String,Integer>();

    /** Shared instance constructed with both flags set to {@code true}. */
    public static final SimpleSourcedTokenizer DEFAULT_SOURCED_TOKENIZER = new SimpleSourcedTokenizer(true,true);

    /**
     * Create a tokenizer; both flags are passed unchanged to the
     * {@code SimpleTokenizer} constructor.
     *
     * @param ignorePunctuation forwarded to the superclass
     * @param ignoreCase forwarded to the superclass
     */
    public SimpleSourcedTokenizer(boolean ignorePunctuation,boolean ignoreCase) {
        super(ignorePunctuation,ignoreCase);
    }

    /**
     * Return tokenized version of a string, with every token tagged
     * with the given source and an id.
     *
     * @param input the string to tokenize
     * @param source label recorded on each returned token; it is also
     *        part of the key that decides which id a token receives
     * @return one sourced token per token produced by {@code tokenize},
     *         in the same order
     */
    public SourcedToken[] sourcedTokenize(String input,String source)
    {
        Token[] tokens = tokenize(input);
        SourcedToken[] sourcedTokens = new SourcedToken[tokens.length];
        for (int i=0; i<tokens.length; i++) {
            String value = tokens[i].getValue();
            String key = value+"@"+source;
            // One lookup: reuse the existing id or allocate the next one.
            Integer id = tokMap.get(key);
            if (id==null) {
                id = Integer.valueOf(++nextId);
                tokMap.put(key,id);
            }
            sourcedTokens[i] = new BasicSourcedToken(id.intValue(), value, source);
        }
        return sourcedTokens;
    }

    /**
     * Test routine: tokenizes each command-line argument, using the
     * argument's position as its source, and prints every token.
     */
    public static void main(String[] argv)
    {
        SimpleSourcedTokenizer tokenizer = DEFAULT_SOURCED_TOKENIZER;
        int n = 0; // running count of tokens printed across all arguments
        for (int i=0; i<argv.length; i++) {
            System.out.println("argument "+i+": '"+argv[i]+"'");
            SourcedToken[] tokens = tokenizer.sourcedTokenize(argv[i],Integer.toString(i));
            for (int j=0; j<tokens.length; j++) {
                System.out.println("token "+(++n)+":"
                                   +" id="+tokens[j].getIndex()
                                   +" value: '"+tokens[j].getValue()
                                   +"' source: '"+tokens[j].getSource()+"'");
            }
        }
    }
}
|