File: issue_8.py

package info (click to toggle)
python-pyahocorasick 1.4.1-2
  • links: PTS, VCS
  • area: main
  • in suites: sid, trixie
  • size: 748 kB
  • sloc: ansic: 4,554; python: 2,823; sh: 312; makefile: 242
file content (55 lines) | stat: -rw-r--r-- 1,467 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# -*- coding: utf-8 -*-

"""
    Aho-Corasick string search algorithm.

    Author    : Wojciech Muła, wojciech_mula@poczta.onet.pl
    WWW       : http://0x80.pl
    License   : public domain
"""
import ahocorasick

# Sample phrases mixing ASCII punctuation, Latin words and Cyrillic text.
# Entries that themselves contain double quotes are written with single
# quotes so no escaping is needed.
test_sentences_rus = [
    "!ASM Print",
    "!ASM Print, tyre компания er",
    "!ASM Print, рекламно-производственная компания rr",
    "!Action Pact!",
    "!T.O.O.H.!",
    "!YES, лингвистический центр",
    "!ts, магазин",
    "!ФЕСТ",
    '"100-th" department store',
    '"1000 мелочей"',
    '"1001 мелочь"',
    '"19 отряд Федеральной противопожарной службы по Ленинградской области"',
    '"У Друзей"',
    '"ШТОРЫ и не только..."',
]

# Short Polish words containing non-ASCII letters.
test_sentences_pl = [
    "wąż",   # snake
    "mąż",   # husband (differs from the previous word by one letter)
    "żółć",
    "aż",
    "waży",
]

def create_sutomata_rus():
    """Build an automaton from the words of the last seven Russian samples.

    Each of the last seven entries of ``test_sentences_rus`` is split on
    single spaces; every resulting token is added as a key whose value is
    ``(position_in_sentence, token)``.  The automaton is then finalized
    with ``make_automaton()``.  Nothing is returned; the function only
    exercises the construction path.
    """
    automaton = ahocorasick.Automaton()
    for sentence in test_sentences_rus[-7:]:
        tokens = sentence.split(' ')
        for position, token in enumerate(tokens):
            automaton.add_word(token, (position, token))

    automaton.make_automaton()


def create_and_iter_sutomata_pl():
    """Build an automaton from the Polish samples and print its matches.

    Every entry of ``test_sentences_pl`` is added as a key whose value is
    ``(list_index, word)``.  After ``make_automaton()`` the automaton is
    run over the text "wyważyć" and each item it yields is printed.
    """
    automaton = ahocorasick.Automaton()
    # enumerate() already yields (index, word), which is exactly the value
    # stored for each key.
    for entry in enumerate(test_sentences_pl):
        automaton.add_word(entry[1], entry)

    automaton.make_automaton()

    matches = automaton.iter("wyważyć")
    for match in matches:
        print(match)

# Script entry point: run both scenarios, Russian first, then Polish.
if __name__ == "__main__":
    for scenario in (create_sutomata_rus, create_and_iter_sutomata_pl):
        scenario()