# Copyright (C) 2010 Peter Teichman

import re
import types

import Stemmer

class MegaHALTokenizer:
    """A traditional MegaHAL style tokenizer. This considers any of these
    to be a token:
      * one or more consecutive alpha characters (plus apostrophe)
      * one or more consecutive numeric characters
      * one or more consecutive punctuation/space characters (not apostrophe)

    This tokenizer ignores differences in capitalization."""

    def split(self, phrase):
        if type(phrase) != types.UnicodeType:
            raise TypeError("Input must be Unicode")

        if len(phrase) == 0:
            return []

        # add ending punctuation if it is missing
        if phrase[-1] not in ".!?":
            phrase = phrase + "."

        words = re.findall("([A-Z']+|[0-9]+|[^A-Z'0-9]+)", phrase.upper(),
                           re.UNICODE)
        return words
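    # A quick sketch of the rules above (hypothetical Python 2 session;
    # note the forced uppercasing and the appended final period):
    #
    #   >>> MegaHALTokenizer().split(u"hi there")
    #   [u'HI', u' ', u'THERE', u'.']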
    def join(self, words):
        """Capitalize the first alpha character in the reply and the
        first alpha character that follows one of [.?!] and a
        space."""
        chars = list(u"".join(words))
        start = True

        for i in xrange(len(chars)):
            char = chars[i]

            if char.isalpha():
                if start:
                    chars[i] = char.upper()
                else:
                    chars[i] = char.lower()

                start = False
            else:
                if i > 2 and chars[i - 1] in ".?!" and char.isspace():
                    start = True

        return u"".join(chars)
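    # For example (hypothetical session; case is normalized and the word
    # after sentence-ending punctuation is capitalized):
    #
    #   >>> MegaHALTokenizer().join([u"HI", u" ", u"THERE", u".", u" ", u"BYE", u"."])
    #   u'Hi there. Bye.'
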
class CobeTokenizer:
    """A tokenizer that is somewhat improved from MegaHAL. These are
    considered tokens:
      * one or more consecutive Unicode word characters (plus apostrophe
        and dash)
      * one or more consecutive Unicode non-word characters, possibly with
        internal whitespace
      * the whitespace between word or non-word tokens
      * a URL: one or more word characters, a colon, and a run of
        non-space characters

    This tokenizer collapses multiple spaces in a whitespace token into a
    single space character.

    It preserves differences in case. foo, Foo, and FOO are different
    tokens."""

    def __init__(self):
        # Add hyphen to the list of possible word characters, so hyphenated
        # words become one token (e.g. hy-phen). But don't remove it from
        # the list of non-word characters, so if it's found entirely within
        # punctuation it's a normal non-word (e.g. :-( )
        self.regex = re.compile(r"(\w+:\S+"               # urls
                                r"|[\w'-]+"               # words
                                r"|[^\w\s][^\w]*[^\w\s]"  # multiple punctuation
                                r"|[^\w\s]"               # a single punctuation character
                                r"|\s+)",                 # whitespace
                                re.UNICODE)
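    # A few illustrative matches for the pattern above (hypothetical
    # session):
    #
    #   >>> t = CobeTokenizer()
    #   >>> t.regex.findall(u"hy-phen")  # hyphen inside a word
    #   [u'hy-phen']
    #   >>> t.regex.findall(u":-(")  # hyphen entirely within punctuation
    #   [u':-(']
    #   >>> t.regex.findall(u"http://example.com/")  # url
    #   [u'http://example.com/']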
    def split(self, phrase):
        if type(phrase) != types.UnicodeType:
            raise TypeError("Input must be Unicode")

        # Strip leading and trailing whitespace. This might not be the
        # correct choice long-term, but in the brain it prevents edges
        # from the root node that have has_space set.
        phrase = phrase.strip()

        if len(phrase) == 0:
            return []

        tokens = self.regex.findall(phrase)

        # collapse runs of whitespace into a single space
        space = u" "
        for i, token in enumerate(tokens):
            if token[0] == " " and len(token) > 1:
                tokens[i] = space

        return tokens
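    # For example (hypothetical session; the run of spaces between the
    # comma and "world" collapses to a single space token):
    #
    #   >>> CobeTokenizer().split(u"Hello,   world!")
    #   [u'Hello', u',', u' ', u'world', u'!']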
    def join(self, words):
        return u"".join(words)
class CobeStemmer:
    def __init__(self, name):
        # use the PyStemmer Snowball stemmer bindings
        self.stemmer = Stemmer.Stemmer(name)

    def stem(self, word):
        # Don't preserve case when stemming, i.e. always create lowercase
        # stems. Words that differ only in case then share a stem, so a
        # reply can reuse an input word while taking its case from the
        # learned context.
        stem = self.stemmer.stemWord(word.lower())

        return stem
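
# A minimal smoke test of the classes above (an illustrative sketch, not
# part of the original module; assumes the third-party PyStemmer package
# provides the "english" Snowball algorithm):
if __name__ == "__main__":
    mega = MegaHALTokenizer()
    print mega.split(u"hi there")  # [u'HI', u' ', u'THERE', u'.']
    print mega.join([u"HI", u" ", u"THERE", u"."])  # Hi there.

    cobe = CobeTokenizer()
    print cobe.split(u"check http://example.com/ out!")
    # [u'check', u' ', u'http://example.com/', u' ', u'out', u'!']

    stemmer = CobeStemmer("english")
    print stemmer.stem(u"Jumping")  # jump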