File: CharacterTokenizer.java

package info (click to toggle)
libsecondstring-java 0.1~dfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 764 kB
  • sloc: java: 9,592; xml: 114; makefile: 6
file content (89 lines) | stat: -rw-r--r-- 2,708 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
package com.wcohen.ss.tokens;

import java.util.*;
import com.wcohen.ss.api.*;

/**
 * Character tokenizer implementation.  Each token is a single character
 * (one Unicode code point) of the source string.
 *
 * <p>Letters and digits always become tokens.  Whitespace never does.  Any
 * other character becomes a token only when punctuation is not being
 * ignored.  When case is being ignored, token text is lower-cased before it
 * is interned.
 *
 * <p>Each instance keeps its own table of interned tokens, so within one
 * tokenizer the same text always maps to the same {@link Token}.  The class
 * performs no synchronization of its own.
 */
public class CharacterTokenizer implements Tokenizer
{
    /** Shared instance that ignores both punctuation and case. */
    public static final CharacterTokenizer DEFAULT_TOKENIZER = new CharacterTokenizer(true,true);

    private boolean ignorePunctuation = true;
    private boolean ignoreCase = true;

    /**
     * Create a tokenizer.
     *
     * @param ignorePunctuation if true, characters that are neither letters
     *        nor digits never become tokens
     * @param ignoreCase if true, token text is lower-cased before interning
     */
    public CharacterTokenizer(boolean ignorePunctuation,boolean ignoreCase) {
        this.ignorePunctuation = ignorePunctuation;
        this.ignoreCase = ignoreCase;
    }

    // parameter setting

    /** Choose whether non-alphanumeric characters are skipped. */
    public void setIgnorePunctuation(boolean flag)  { ignorePunctuation = flag; }

    /** Choose whether token text is lower-cased before interning. */
    public void setIgnoreCase(boolean flag)  { ignoreCase = flag; }

    /** Describe this tokenizer and its two flags. */
    @Override
    public String toString() { return "[CharacterTokenizer "+ignorePunctuation+";"+ignoreCase+"]"; }

    /**
     * Return tokenized version of a string.  Every letter or digit yields
     * one token; every other non-whitespace character yields one token only
     * if punctuation is not ignored; whitespace yields nothing.
     *
     * <p>The string is walked by Unicode code point rather than by UTF-16
     * unit, so a character encoded as a surrogate pair is classified as a
     * whole and produces a single token instead of being dropped or split
     * into two half-characters.
     *
     * @param input the string to tokenize
     * @return the tokens, in the order their characters occur in the input
     * @throws NullPointerException if {@code input} is null
     */
    public Token[] tokenize(String input)
    {
        List<Token> tokens = new ArrayList<Token>();
        int length = input.length();
        int i = 0;
        while (i < length) {
            int codePoint = input.codePointAt(i);
            // 1 for ordinary characters, 2 for a surrogate pair
            int width = Character.charCount(codePoint);
            if (Character.isLetterOrDigit(codePoint)
                || (!ignorePunctuation && !Character.isWhitespace(codePoint))) {
                tokens.add(internSomething(input.substring(i, i + width)));
            }
            i += width;
        }
        // Allocated as BasicToken[] exactly as before, so the runtime type of
        // the returned array is unchanged for existing callers.
        return tokens.toArray(new BasicToken[tokens.size()]);
    }

    /** Intern the text of one character, lower-casing it first if case is ignored. */
    private Token internSomething(String s)
    {
        // Locale.ROOT keeps the mapping independent of the JVM's default
        // locale (under a Turkish locale "I" would otherwise become a
        // dotless "i" and no longer match tokens produced elsewhere).
        return intern( ignoreCase ? s.toLowerCase(Locale.ROOT) : s );
    }

    //
    // 'interning' strings as tokens
    //
    private int nextId = 0;
    private final Map<String, Token> tokMap = new TreeMap<String, Token>();

    /**
     * Return the token for the given text, creating it on first use.
     * New tokens receive consecutive indices starting at 1.
     *
     * @param s the token text, used as-is (no case folding is applied here)
     * @return the token previously interned for {@code s}, or a new one
     */
    public Token intern(String s)
    {
        Token tok = tokMap.get(s);
        if (tok == null) {
            tok = new BasicToken(++nextId, s);
            tokMap.put(s, tok);
        }
        return tok;
    }

    /** Iterate over all tokens interned so far, ordered by their text. */
    public Iterator<Token> tokenIterator()
    {
        return tokMap.values().iterator();
    }

    /** Return the highest token index assigned so far (0 if none). */
    public int maxTokenIndex()
    {
        return nextId;
    }

    /** Test routine: tokenizes each command-line argument and prints the tokens. */
    public static void main(String[] argv)
    {
        CharacterTokenizer tokenizer = DEFAULT_TOKENIZER;
        int n = 0;
        for (int i=0; i<argv.length; i++) {
            System.out.println("argument "+i+": '"+argv[i]+"'");
            Token[] tokens = tokenizer.tokenize(argv[i]);
            for (int j=0; j<tokens.length; j++) {
                System.out.println("token "+(++n)+":"
                                   +" id="+tokens[j].getIndex()
                                   +" value: '"+tokens[j].getValue()+"'");
            }
        }
    }
}