File: ExplicitHyphenator.py

package info (click to toggle)
python-wordaxe 0.3.2-1
  • links: PTS
  • area: main
  • in suites: squeeze
  • size: 1,228 kB
  • ctags: 786
  • sloc: python: 9,814; makefile: 5
file content (199 lines) | stat: -rw-r--r-- 7,751 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
#!/usr/bin/env python
# -*- coding: iso-8859-1 -*-

__license__="""
   Copyright 2004-2008 Henning von Bargen (henning.vonbargen arcor.de)
   This software is dual-licenced under the Apache 2.0 and the
   2-clauses BSD license. For details, see license.txt
"""

__version__=''' $Id: __init__.py,v 1.2 2004/05/31 22:22:12 hvbargen Exp $ '''

import codecs

from wordaxe.hyphen import SHY, HyphenatedWord
from wordaxe.BaseHyphenator import BaseHyphenator
from wordaxe.hyphrules import decodeTrennung

class ExplicitHyphenator(BaseHyphenator):
    """
    Allow to explicitly specify how a word should be hyphenated.
    This is a slight improvement compared to BaseHyphenator.
    
    Usage:
    
    hyphenator = ExplicitHyphenator("DE")
    # Add explicit hyphenation for a single word.
    hyphenator.add_entry(u"analphabet", u"an8alpha5bet")
    # Add several entries
    hyphenator.add_entries({u"urinstinkt":   u"ur8instinkt",
                            u"urinstinkte":  u"ur8instinkte",
                            u"urinstinkten": u"ur8instinkt3en",
                           })
    
    The last entry is probably not correctly hyphenated
    according to the german hyphenation rules, but you don't
    want to read "urinstink" in a text...
    
    The add_entry/add_entries usually expect unicode strings.
    Bytes strings require the encoding argument to be supplied.
    hyphenator.add-entries ("brutigam", "bru5ti5gam", encoding="iso-8859").
    
    Instead of using numbers for defining the quality of a hyphenation
    point, you may use the "~" (tilde) character, corresponding to
    a medium quality hyphenation point: "bru~ti~gam".
    """

    def __init__ (self, 
                  language="DE",
                  minWordLength=4,
                  qHaupt=8,
                  qNeben=5,
                  qVorsilbe=5,
                  qSchlecht=3,
                  hyphenDir=None,
                  **options
                 ):
        BaseHyphenator.__init__(self,language=language,minWordLength=minWordLength,**options)

        # Qualitten fr verschiedene Trennstellen
        self.qHaupt=qHaupt
        self.qNeben=qNeben
        self.qVorsilbe=qVorsilbe
        self.qSchlecht=qSchlecht
        
        # Stammdaten initialisieren
        self.sonderfaelle = []
        
    def add_entry(self, word, trennung, encoding=unicode):
        if not isinstance(word, unicode):
            word = unicode(word, encoding)
        if not isinstance(trennung, unicode):
            trennung = unicode(trennung, encoding)
        # Ignore Case @TODO Umlaute usw.!
        word = word.lower() 
        trennung = trennung.replace(u"~", u"5")
        lenword = len(word)
        for (lae, L) in self.sonderfaelle:
            if lae == lenword:
                L[word] = trennung
                break
        else:
            self.sonderfaelle.append((lenword,{word: trennung}))
            
    def add_entries(self, mapping, encoding=unicode):
        for word, trennung in mapping.items():
            self.add_entry(word, trennung, encoding)
            
    def add_entries_from_file(self, filename, encoding=None):
        """
        Add entries from a text file (interpreting the file
        using the given encoding). If encoding is not given
        or None, try to extract the encoding from a line
        near the start of the file like
        # -*- coding: iso-8859-1 -*-
        """
        if encoding is None:
            import re
            frag = open(filename,"rt").read(1000)
            m = re.search(r"-\*- coding: ([^ ]+) -\*-", frag)
            if m is not None:
                encoding = m.group(1)
            else:
                raise ValueError("Encoding not specified and not found in file")
        fh = codecs.open(filename, "rt", encoding)
        for line in fh:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            word, trennung = line.split()
            self.add_entry(word, trennung)
        fh.close()
        
        
    def hyph(self, word):
        #print "ExplicitHyphenator hyph", word
        lenword = len(word)
        for (lae, L) in self.sonderfaelle:
            if lae == lenword:
                trennung = L.get(word.lower(), None)
                if trennung is not None:
                    hword = HyphenatedWord(word, decodeTrennung(trennung))
                    return hword
                break
        # Wort nicht gefunden
        return None
        
    def i_hyphenate(self, aWord):
        assert isinstance(aWord, unicode)
        return self.stripper.apply_stripped(ExplicitHyphenator.hyph, self, aWord)

    def i_hyphenate_derived(self,aWord):
        """
        You can use this method in classes derived from ExplicitHyphenator.
        It will first split the word using BaseHyphenator,
        then for each "subword" it will call ExplicitHyphenator,
        and only call the derived classes hyph method for the still
        unknown subwords.
        
        TODO: The implementation does not match the docstring
              test: "hohenlimburg.de", "hohenlimburg.de)"
        """
        #print "ExplicitHyphenator.i_hyphenate_derived", aWord
        assert isinstance(aWord, unicode)

        # Helper function
        
        sub_hwords = []
        hword = BaseHyphenator.i_hyphenate(self,aWord)
        #print "BaseHyphenator.i_hyphenate returned %r" % hword
        if hword is None:
            hword = HyphenatedWord(aWord,hyphenations=[])
        base_hyph_points = hword.hyphenations
        last_indx = 0
        nr = 0
        for hpnum, hp in enumerate(base_hyph_points):
            if isinstance(hp, int):
                hp = HyphenationPoint(hp, quality=5, sl=SHY)
            subword = hword[last_indx+nr:hp.indx]
            # handle subword
            if SHY in subword:
                sub_hword = self.stripper.apply_stripped(BaseHyphenator.hyph, self, subword)
            else:
                sub_hword = self.stripper.apply_stripped(ExplicitHyphenator.hyph, self, subword)
            if sub_hword is None:
                sub_hword = self.stripper.apply_stripped(self.hyph, self, subword)
            if sub_hword is None:
                sub_hword = HyphenatedWord(subword, hyphenations=[])
            sub_hwords.append(sub_hword)
            # end handle subword
            last_indx = hp.indx
            nr = hp.nr            
        # Now the last subword
        subword = hword[last_indx:]
        # handle subword
        if SHY in subword:
            sub_hword = self.stripper.apply_stripped(BaseHyphenator.hyph, self, subword)
        else:
            sub_hword = self.stripper.apply_stripped(ExplicitHyphenator.hyph, self, subword)
        if sub_hword is None:
            sub_hword = self.stripper.apply_stripped(self.hyph, self, subword)
        if sub_hword is None:
            sub_hword = HyphenatedWord(subword, hyphenations=[])
        sub_hwords.append(sub_hword)
        #end handle subword
        if len(sub_hwords) > 1:
            return HyphenatedWord.join(sub_hwords)
        else:        
            return sub_hwords[0] # Kann auch None sein.


if __name__=="__main__":
    h = ExplicitHyphenator("DE",5)
    h.add_entry("Brutigam", "Bru5ti5gam", "iso-8859-1")
    h.add_entries({u"Urinstinkt": u"Ur8instinkt",
                   u"Urinstinkte": u"Ur8instinkte",
                   u"Urinstinkten": u"Ur8instinkt3en",
                  }
                 )
    h.test(outfname="ExplicitLearn.html")