#!/usr/bin/env python
"A simple file indexer."
import codecs
import time
class Parser:
    """Read text files and send their distinct words over a channel.

    Each file is split on whitespace, every token is reduced to its leading
    alphanumeric run (see ``_strip``), and each distinct result is sent once
    per file as a ``(token, filename)`` tuple.
    """

    def __init__(self, filenames, encoding=None, delay=None):
        """Store the parsing configuration.

        filenames -- iterable of paths of the files to parse.
        encoding  -- codec name used to decode the files; when None the
                     files are opened with the built-in ``open`` defaults.
        delay     -- optional number of seconds to sleep after each file.
        """
        self.filenames = filenames
        self.encoding = encoding
        self.delay = delay

    def _get_file_content(self, filename):
        "Return the entire content of the file called 'filename'."
        if self.encoding is None:
            f = open(filename)
        else:
            f = codecs.open(filename, encoding=self.encoding)
        # The context manager closes the file even if reading fails.
        with f:
            return f.read()

    def send_entries(self, channel):
        """Send word entries from the files.

        For every file, each distinct stripped token is passed exactly once,
        in order of first appearance, to ``channel.send`` as a
        ``(token, filename)`` tuple. A token made only of non-alphanumeric
        characters strips to the empty string and is sent like any other.
        """
        for filename in self.filenames:
            tokens = self._get_file_content(filename).split()

            # A set gives constant-time duplicate detection; the order of
            # the sent entries still follows the order of the tokens.
            seen = set()
            for token in tokens:
                token = self._strip(token)
                if token not in seen:
                    channel.send((token, filename))
                    seen.add(token)

            # Introduce a delay to simulate hard work.
            # NOTE(review): applied once per file, after its entries have
            # been sent -- confirm this matches the intended nesting.
            if self.delay:
                time.sleep(self.delay)

    def _strip(self, token):
        """Return the first run of alphanumeric characters in 'token'.

        Leading non-alphanumeric characters are skipped and collection stops
        at the first non-alphanumeric character that follows the run, so
        "(word)" yields "word" and "don't" yields "don". A token without any
        alphanumeric character yields the empty string.
        """
        characters = []
        in_alphanum = False
        for c in token:
            if c.isalpha() or c.isdigit():
                in_alphanum = True
                characters.append(c)
            elif in_alphanum:
                # The run has ended; ignore the rest of the token.
                break
        return "".join(characters)
class Indexer:
    """Build a character trie that maps words to the files containing them.

    The index is a nested dictionary. Every level maps a character to a
    ``(slot, words)`` pair: ``slot`` is the next level of the trie and
    ``words`` maps each word ending at that character to the list of
    filenames recorded for it.
    """

    def __init__(self):
        "Start with an empty index."
        self.index = {}

    def get_index(self):
        "Return the index dictionary itself (not a copy)."
        return self.index

    def add_entry(self, entry):
        """Add the given word 'entry' (token, filename) to the index.

        Entries with an empty token are ignored. The filename is appended
        each time, so adding the same entry twice records it twice.
        """
        token, filename = entry
        if not token:
            return

        # Walk down the trie one character at a time, creating the missing
        # levels; 'words' ends up as the word table of the last character.
        slot = self.index
        for c in token:
            if c not in slot:
                slot[c] = {}, {}
            slot, words = slot[c]

        words.setdefault(token, []).append(filename)
class Searcher:
    """Look up words by prefix in a character-trie index.

    Every level of the index maps a character to a ``(slot, words)`` pair,
    where ``slot`` is the next level and ``words`` maps the words ending at
    that character to their recorded values.
    """

    def __init__(self, index):
        "Remember the 'index' to search."
        self.index = index

    def find(self, pattern):
        """Find words beginning with the given 'pattern'.

        Return a dictionary mapping each matching word to the value stored
        for it in the index. An empty pattern matches every word; when
        nothing matches, an empty dictionary is returned.
        """
        slot = self.index
        words = {}
        for c in pattern:
            if c not in slot:
                # Always answer with a dictionary so that callers can treat
                # hits and misses uniformly.
                return {}
            slot, words = slot[c]

        # The pattern itself may be a word; everything below it matches too.
        results = {}
        results.update(words)
        results.update(self.get_all_words(slot))
        return results

    def get_all_words(self, slot):
        """Get all words under the given index 'slot'.

        Return a dictionary merging the word tables of every level reachable
        from 'slot', visiting the characters of each level in sorted order.
        """
        all_words = {}
        for c in sorted(slot):
            this_slot, words = slot[c]
            all_words.update(words)
            all_words.update(self.get_all_words(this_slot))
        return all_words
# vim: tabstop=4 expandtab shiftwidth=4