File: dictionary.py

package info (click to toggle)
python-whoosh 2.7.4+git6-g9134ad92-5
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 3,656 kB
  • sloc: python: 38,517; makefile: 118
file content (43 lines) | stat: -rw-r--r-- 1,258 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import os.path, gzip

from whoosh import analysis, fields
from whoosh.support.bench import Bench, Spec


class VulgarTongue(Spec):
    """Benchmark spec that indexes the entries of a gzipped dictionary text.

    An entry begins on a line whose first character is alphabetic. The text
    before the first period on that line is the headword; the rest of that
    line, plus every following line that does not start with a letter, forms
    the definition.
    """

    name = "dictionary"
    # Gzipped source text; resolved relative to ``self.options.dir``.
    filename = "dcvgr10.txt.gz"
    headline_field = "head"

    def documents(self):
        """Yield one ``{"head": ..., "body": ...}`` dict per dictionary entry.

        The file is read as bytes and each line is decoded as Latin-1. The
        ``body`` value is the headword followed by the definition text, so the
        headword is searchable through the body field as well.

        Lines that appear before the first headword line and do not start
        with a letter are skipped, since there is no entry to attach them to.
        A headword line without a period becomes an entry whose definition
        starts empty.
        """
        path = os.path.join(self.options.dir, self.filename)

        head = body = None
        # The context manager guarantees the archive is closed, including
        # when the consumer abandons this generator before exhausting it.
        with gzip.GzipFile(path) as f:
            for raw in f:
                line = raw.decode("latin1")
                if line[:1].isalpha():
                    # A new headword line: flush the entry collected so far.
                    if head:
                        yield {"head": head, "body": head + body}
                    # partition() splits on the first period like
                    # split(".", 1) but does not fail when there is none.
                    head, _, body = line.partition(".")
                elif head is not None:
                    # Continuation of the current entry's definition.
                    body += line

        # Flush the final entry, which has no following headword line.
        if head:
            yield {"head": head, "body": head + body}

    def whoosh_schema(self):
        """Return the Whoosh schema: a stored ID headword and stemmed body."""
        ana = analysis.StemmingAnalyzer()
        #ana = analysis.StandardAnalyzer()
        schema = fields.Schema(head=fields.ID(stored=True),
                               body=fields.TEXT(analyzer=ana, stored=True))
        return schema

    def zcatalog_setup(self, cat):
        """Register a field index for ``head`` and a text index for ``body``.

        ``zcatalog`` is imported here rather than at module level so the
        module can be loaded without it.
        """
        from zcatalog import indexes  #@UnresolvedImport
        cat["head"] = indexes.FieldIndex(field_name="head")
        cat["body"] = indexes.TextIndex(field_name="body")


if __name__ == "__main__":
    # Executed as a script: hand the VulgarTongue spec to a Bench runner.
    runner = Bench()
    runner.run(VulgarTongue)