"""
Aho-Corasick string search algorithm.

Author  : Wojciech Muła, wojciech_mula@poczta.onet.pl
WWW     : http://0x80.pl
License : public domain
"""
import pyahocorasick
# Each case is a dict with three keys:
#   'words'    - the words stored in the trie,
#   'input'    - the text that is searched,
#   'expected' - the exact list the search must produce; in the data below
#                every pair is (matched word, index of its last character
#                in 'input').
test_cases = [
    # example provided by @Ulitochka
    {
        "words": ["alpha", "alpha beta", "gamma", "gamma alpha"],
        "input": (
            "I went to alpha beta the alpha other day gamma alpha "
            "to pick up some spam"
        ),
        "expected": [
            ("alpha beta", 19),
            ("alpha", 29),
            ("gamma alpha", 51),
        ],
    },
    # text continues after the last match
    {
        "words": ["alpha", "alpha beta", "beta gamma", "gamma"],
        "input": "Cats have not idea what alpha beta gamma means",
        "expected": [
            ("alpha beta", 33),
            ("gamma", 39),
        ],
    },
    # same words, but the last match ends exactly at the end of the text
    {
        "words": ["alpha", "alpha beta", "beta gamma", "gamma"],
        "input": "Cats have not idea what alpha beta gamma",
        "expected": [
            ("alpha beta", 33),
            ("gamma", 39),
        ],
    },
]
def test(case):
    """
    Run a single test case against a freshly built automaton.

    A new ``pyahocorasick.Trie`` is filled with every word from
    ``case['words']`` (each word is stored as its own value), converted
    with ``make_automaton()``, and ``iter_long`` is run over
    ``case['input']``. The collected items must be equal to
    ``case['expected']``.

    On a mismatch the actual and the expected lists are printed and
    ``AssertionError`` is raised.
    """
    tree = pyahocorasick.Trie()
    for word in case['words']:
        tree.add_word(word, word)

    tree.make_automaton()

    actual = list(tree.iter_long(case['input']))
    expected = case['expected']

    if actual != expected:
        print("ERROR:")
        print(actual)
        print(expected)
        # Raise explicitly instead of ``assert(False)``: assert statements
        # are removed under ``python -O``, which would let a failing case
        # go unnoticed.
        raise AssertionError("iter_long result differs from expected")
if __name__ == "__main__":
    # Run the cases in order; reaching the final print means that no call
    # to test() raised.
    for data in test_cases:
        test(data)

    print("OK")