File: Dict.py

package info (click to toggle)
pprocess 0.3.1-1
  • links: PTS, VCS
  • area: main
  • in suites: lenny
  • size: 404 kB
  • ctags: 440
  • sloc: python: 2,048; makefile: 106; sh: 41
file content (118 lines) | stat: -rw-r--r-- 2,859 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/usr/bin/env python

"A simple file indexer."

import codecs
import time

class Parser:

    """
    Read files and send the distinct words found in each one to a channel.

    'filenames' is an iterable of the paths to be read. 'encoding', if not
    None, is passed to codecs.open so that file content is decoded as it is
    read; otherwise the plain built-in open is used. 'delay', if true, is the
    number of seconds to sleep after each file has been processed.
    """

    def __init__(self, filenames, encoding=None, delay=None):
        self.filenames = filenames
        self.encoding = encoding
        self.delay = delay

    def _get_file_content(self, filename):

        "Return the entire content of the file with the given 'filename'."

        if self.encoding is None:
            f = open(filename)
        else:
            f = codecs.open(filename, encoding=self.encoding)

        # Make sure the file is closed even if reading or decoding fails.

        try:
            return f.read()
        finally:
            f.close()

    def send_entries(self, channel):

        """
        Send word entries from each file over the given 'channel'.

        Each entry is a (token, filename) tuple passed to channel.send. Within
        a single file, each distinct token is sent only once, in order of
        first appearance. Tokens containing no alphanumeric characters are
        reduced to the empty string by _strip and are sent as such (at most
        once per file).
        """

        for filename in self.filenames:

            # A set gives constant-time duplicate detection for each token.

            seen = set()
            for token in self._get_file_content(filename).split():
                token = self._strip(token)
                if token not in seen:
                    channel.send((token, filename))
                    seen.add(token)

            # Introduce a delay to simulate hard work.

            if self.delay:
                time.sleep(self.delay)

    def _strip(self, token):

        """
        Return the first run of alphanumeric characters found in 'token'.

        Leading non-alphanumeric characters are skipped and collection stops
        at the first non-alphanumeric character following the run, so that
        "(hello)" yields "hello" but "don't" yields "don". An empty string is
        returned if 'token' contains no alphanumeric characters.
        """

        characters = []
        for c in token:
            if c.isalpha() or c.isdigit():
                characters.append(c)
            elif characters:

                # The alphanumeric run has ended: ignore everything else.

                break
        return "".join(characters)

class Indexer:

    """
    Build an index of words keyed on their successive characters.

    The index is a dictionary mapping a character to a (slot, words) tuple,
    where 'slot' is a dictionary of the same form for the following
    character, and 'words' maps each token ending at that character to the
    list of filenames recorded for it.
    """

    def __init__(self):
        self.index = {}

    def get_index(self):

        "Return the index dictionary (the object itself, not a copy)."

        return self.index

    def add_entry(self, entry):

        """
        Add the given word 'entry' (token, filename) to the index.

        Entries with an empty token are ignored. The filename is appended to
        the token's list each time this method is called, so repeated entries
        are recorded repeatedly.
        """

        token, filename = entry

        if not token:
            return

        # Descend one level per character, creating missing levels. After the
        # loop, 'words' is the word dictionary belonging to the last character.

        slot = self.index
        for c in token:
            if c not in slot:
                slot[c] = {}, {}
            slot, words = slot[c]

        words.setdefault(token, []).append(filename)

class Searcher:

    """
    Search an index of the form produced by Indexer.

    'index' maps a character to a (slot, words) tuple, where 'slot' is a
    nested dictionary of the same form and 'words' maps tokens to lists of
    filenames.
    """

    def __init__(self, index):
        self.index = index

    def find(self, pattern):

        """
        Find words beginning with the given 'pattern'.

        Return a dictionary mapping each matching word to its list of
        filenames. The lists are those held by the index, not copies. An
        empty dictionary is returned if no word begins with 'pattern'; an
        empty 'pattern' matches every word in the index.
        """

        slot = self.index
        words = {}

        for c in pattern:
            if c not in slot:

                # Always return a dictionary so that callers can treat hits
                # and misses in the same way.

                return {}
            slot, words = slot[c]

        # Combine the words ending exactly at the pattern with all the words
        # found deeper in the index.

        results = {}
        results.update(words)
        results.update(self.get_all_words(slot))
        return results

    def get_all_words(self, slot):

        """
        Get all words under the given index 'slot'.

        Return a dictionary mapping each word found at any depth below 'slot'
        to its list of filenames, visiting characters in sorted order.
        """

        all_words = {}
        for c in sorted(slot):
            this_slot, words = slot[c]
            all_words.update(words)
            all_words.update(self.get_all_words(this_slot))
        return all_words

# vim: tabstop=4 expandtab shiftwidth=4