File: Normalizer.py

package info (click to toggle)
zope-textindexng2 1%3A2.0.8-5
  • links: PTS
  • area: main
  • in suites: sarge
  • size: 3,772 kB
  • ctags: 3,537
  • sloc: ansic: 15,956; python: 6,129; xml: 185; makefile: 132; sh: 71
file content (105 lines) | stat: -rw-r--r-- 2,509 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
###########################################################################
#
# TextIndexNG                The next generation TextIndex for Zope
#
# This software is governed by a license. See
# LICENSE.txt for the terms of this license.
#
###########################################################################

"""
Normalizer

$Id: Normalizer.py,v 1.12 2004/02/26 17:50:35 ajung Exp $
"""

import os, re
from types import StringType 

import normalizer
from Products.TextIndexNG2.interfaces.INormalizer import NormalizerInterface

# Directory containing this module. FileNormalizer.readNormalizer() looks
# for table files in the 'normalizers' subdirectory below it.
_dir = os.path.dirname(__file__)


class Normalizer:
    """ Base class for all Normalizer objects.

        Pairs a normalizer.Normalizer instance (built from a list of
        replacement entries) with the language that list belongs to.
    """

    __implements__ = NormalizerInterface

    def __init__(self, language, lst):
        """ 'language' is stored unchanged and reported by getLanguage().
            'lst' is handed as-is to normalizer.Normalizer().
        """
        backend = normalizer.Normalizer(lst)
        self._n = backend
        self._language = language

    def getLanguage(self):
        """ return the language given to the constructor """
        language = self._language
        return language

    def process(self, words):
        """ return the result of the backend's normalize() for 'words' """
        normalized = self._n.normalize(words)
        return normalized

    def __repr__(self):
        """ '<class name> (<language>)' """
        class_name = self.__class__.__name__
        return "%s (%s)" % (class_name, self.getLanguage())

    def getTable(self):
        """ return whatever the backend's getTable() reports """
        table = self._n.getTable()
        return table


# Header lines recognised in a normalizer file.  Both patterns are applied
# with .match(), so the '#' must be the very first character of the line:
#   "# language = <word>"      -> group(1) is the language name
#   "# encoding = <name>"      -> group(1) is the encoding name (may contain '-')
# Raw strings keep the regex escapes (\s, \w, \-) from being interpreted as
# (invalid) string-literal escapes.
lang_reg = re.compile(r'#\s*language\s*=\s*([\w]+)')
enc_reg = re.compile(r'#\s*encoding\s*=\s*([\w\-]+)')

class FileNormalizer(Normalizer):
    """ Normalizer whose replacement list is read from a text file """

    def __init__(self, filename):
        """ 'filename' must be a plain string (StringType); the file is
            located and parsed by readNormalizer().
        """

        assert isinstance(filename, StringType)
        lst, language = self.readNormalizer(filename)
        Normalizer.__init__(self, language, lst)


    def _readLines(self, path):
        """ return all lines of the file 'path'; the file is always closed """

        fp = open(path)
        try:
            return fp.readlines()
        finally:
            fp.close()


    def readNormalizer(self, filename):
        """ read a normalizer file (line-by-line) from disk.
            'filename' is first looked up in the 'normalizers' directory
            next to this module; if it cannot be read from there it is
            opened as given (absolute or relative to the current
            working directory).

            File format:
              - blank lines are skipped
              - '# language = <name>' sets the returned language
              - '# encoding = <name>' sets the encoding used to decode
                all following entries
              - any other line starting with '#' is a comment
              - every remaining line must consist of exactly two
                whitespace separated fields: key and value

            The encoding stays None until an encoding line has been
            seen, so that line is expected before the first entry.

            Returns a tuple (lst, language): 'lst' is a list of
            (key, value) unicode tuples in file order, 'language' is
            the declared language or None.

            Raises IOError if the file can be read from neither place
            and AssertionError for an entry without exactly two fields.
        """

        language = None
        encoding = None

        # Prefer the bundled file; only I/O failures trigger the fallback,
        # whose own error (if any) is propagated to the caller.
        try:
            lines = self._readLines(os.path.join(_dir, 'normalizers', filename))
        except (IOError, OSError):
            lines = self._readLines(filename)

        lst = []

        for l in lines:
            if not l.strip(): continue

            # header lines must be checked before the generic comment test
            mo = lang_reg.match(l)
            if mo:
                language = mo.group(1)
                continue

            mo = enc_reg.match(l)
            if mo:
                encoding = mo.group(1)
                continue

            if l.startswith('#'): continue

            fields = l.split()
            assert len(fields) == 2, \
                'expected "key value" in %s, got: %r' % (filename, l)

            k = unicode(fields[0], encoding)
            v = unicode(fields[1], encoding)

            lst.append( (k,v)  )

        return lst, language