File: hyphen.py

package info (click to toggle)
python-wordaxe 0.3.2-1
  • links: PTS
  • area: main
  • in suites: squeeze
  • size: 1,228 kB
  • ctags: 786
  • sloc: python: 9,814; makefile: 5
file content (359 lines) | stat: -rw-r--r-- 13,657 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
# -*- coding: iso-8859-1 -*-

__license__="""
   Copyright 2004-2008 Henning von Bargen (henning.vonbargen arcor.de)
   This software is dual-licenced under the Apache 2.0 and the
   2-clauses BSD license. For details, see license.txt
"""

__version__=''' $Id: __init__.py,v 1.2 2004/05/31 22:22:12 hvbargen Exp $ '''

from copy import copy
# Soft hyphen (U+00AD, decimal 173): the default hyphenation mark.
# Written as a unicode literal instead of decoding a byte string at import time.
SHY = u"\xad"

class HyphenationPoint(object):
    """
    A possible hyphenation point in a HyphenatedWord.

    Attributes:
      indx        : The index where to split the word.
      quality     : The quality of this hyphenation point (0=bad,5=average,9=very good).
      nl,sl,nr,sr : Replacement parameters.

    Description:
      When we split the word at this hyphenation point,
      we can build the two strings left,right as follows:
      left = word[:indx-nl] + sl
      right = sr + word[indx+nr:]

    Some examples (where q is some quality, i.e. q=5, and SHY is the
    soft hyphen character U+00AD, decimal 173):

      The usual case is nl=0,sl=SHY,nr=0,sr="".
      In other words, just add a "shy" character to the left string.
      "Lesen" (to read) can be hyphenated as "le-" "sen":
      HyphenationPoint(2,q,0,SHY,0,"")

      In some cases, it is not necessary to add the shy character:
      "ABC-Buch" (ABC book) can be hyphenated as "ABC-" "Buch":
      HyphenationPoint(4,q,0,"",0,"")

      And - especially using the OLD german rules - the case
      nl>0 or nr>0 can occur:

      The word "backen" (to bake) can be hyphenated between the "c" and the "k";
      however, the hyphenated version would be "bak-" "ken".
      Thus, the one and only hyphenation point in this word is
      HyphenationPoint(3,q,1,"k"+SHY,0,"")

      Another example: According to the old german rules, the word "Schiffahrt"
      is a concatenation of "Schiff" (ship) and "fahrt" (journey).
      The triple "f" is shortened to a double "f".
      But in case of hyphenation, it's three "f"s again: "Schiff-" "fahrt".
      HyphenationPoint(5,q,0,"f"+SHY,0,"")
      This could also be expressed as HyphenationPoint(6,q,0,SHY,0,"f").
    """
    __slots__ = ["indx","quality","nl","sl","nr","sr"]

    def __init__(self,indx,quality,nl=0,sl=u"",nr=0,sr=u""):
        """
        Stores the split index, the quality and the replacement parameters.
        sl and sr are converted to unicode.
        """
        self.indx = indx
        self.quality = quality
        self.nl = nl
        self.sl = unicode(sl)
        self.nr = nr
        self.sr = unicode(sr)

    def __str__(self):
        "Short form showing only index and quality."
        return 'HyphP(%d,%d)' % (self.indx,self.quality)

    def __repr__(self):
        "Full form listing all six attributes in constructor order."
        # %r replaces the deprecated backquote syntax; the output is the same.
        return 'HyphenationPoint(%d,%d,%d,%r,%d,%r)' % (self.indx,self.quality,self.nl,self.sl,self.nr,self.sr)

def _lshift(hyphenations, amt):
    "Moves the hyphenation points left"
    hyph = []
    for h in hyphenations:
        if type(h) is int:
            if h > amt: 
                hyph.append(h-amt)
        else:
            if h.indx > amt:
                hyph.append(HyphenationPoint(h.indx-amt,h.quality,h.nl,h.sl,h.nr,h.sr))
    return hyph

class HyphenatedWord(unicode):
    """
    A hyphenated word.
    
    Attributes:
      word:         the word without hyphenations
      hyphenations: a list containing the possible hyphenation points.
      info:         Information about the hyphenation process.
    
    See also class Hyphenator for an explanation.
    """

    __slots__ = ["hyphenations",]
    
    def __new__(klass, word, hyphenations=None, encoding="utf-8", errors='strict'):
        if isinstance(word, unicode):
            return unicode.__new__(klass, word)
        return unicode.__new__(klass, word, encoding, errors)
    
    def __init__(self, word, hyphenations=None, encoding="utf-8", errors='strict'):
        "Constructor using the string aWord and a list of hyphenation points."
        super(HyphenatedWord, self).__init__(word, encoding, errors)
        if hyphenations is not None:
            self.hyphenations = hyphenations
        elif hasattr(word, "hyphenations"):
            self.hyphenations = word.hyphenations
        else:
            raise ValueError("'hyphenations' Argument is missing")
            
    def __str__(self):
        return self.encode("utf-8")

    def __repr__(self):
        return ("HyphenatedWord(%s)" % super(HyphenatedWord, self).__repr__())

    def __add__(self, other):
        """(other) -> instance of this class
        Like unicode.__add__, but assumes that the other element
        is either unicode or an utf-8 encoded string.
        """
        if not isinstance(other,unicode):
            other = unicode(other, "utf-8")
        return unicode(unicode.__add__(self, other))

    def __radd__(self, other):
        """(other) -> instance of this class
        Like unicode.__add__, but assumes that the other element
        is either unicode or an utf-8 encoded string.
        """
        if isinstance(other, basestring):
            if not isinstance(other,unicode):
                other = unicode(other, "utf-8")
            return unicode(unicode.__add__(other, self))
        else:
            return NotImplemented

    def split(self, hp):
        """Performs a split at the given hyphenation point.
        
           Returns a tuple (left,right)
           where left is a string (the left part, including the hyphenation character)
           and right is a HyphenatedWord describing the rest of the word.
        """
        if type(hp) is int:
            left = self[:hp] + SHY
            hyph = _lshift (self.hyphenations, hp)
            print hyph
            right = self.__class__(self[hp:], hyphenations=hyph)
        else:
            shift = hp.indx-hp.nr+len(hp.sr)
            left = self[:hp.indx-hp.nl] + hp.sl
            hyph = _lshift (self.hyphenations, shift)
            right = self.__class__(hp.sr+self[hp.indx+hp.nr:], hyphenations=hyph)
        assert isinstance(left, unicode)
        assert isinstance(right, self.__class__)
        return (left,right)
        
    def prepend(self, string):
        "Allows adding prefix chars (such as '('), returning a new HyphenatedWord"
        return self.__class__(unicode(string) + self, hyphenations=_lshift(self.hyphenations,-len(string)))

    def append(self, string):
        "Allows adding suffix chars (such as ')'), returning a new HyphenatedWord"
        return self.__class__(self + unicode(string), hyphenations=self.hyphenations)
            
    def showHyphens(self):
        "Returns the possible hyphenations as a string list, for debugging purposes."
        L = []
        for h in self.hyphenations:
            left,right = self.split(h)
            L.append(u"%s %s (%d)" % (left,right, h.quality))
        return L
        
    def get_hyphenations(self):
        "Returns an iteration of the possible hyphenations."
        for hp in self.hyphenations:
            yield self.split(hp)
                
    @staticmethod 
    def join(*hyphwords):
        """
        Create a new hyphenated word from a list of other hyphenated words.
        a = HyphenatedWord("Vogel")    # Vo-gel
        b = HyphenatedWord("grippe")   # grip-pe
        Inserts a good quality hyphenation point at the boundaries.
        c = HyphenatedWord.join(a,b)
        # Vo-gel=grip-pe.
        """
        if len(hyphwords) == 1:
            hyphwords = hyphwords[0]
        for w in hyphwords:
            assert isinstance(w,HyphenatedWord)
        word = u"".join(hyphwords)
        hps = []
        offset = 0
        for w in hyphwords:
            hps += _lshift(w.hyphenations, -offset)
            if w is not hyphwords[-1]:
                #print w.word
                if w.endswith(u"-") or w.endswith(SHY):
                    hps.append(HyphenationPoint(offset+len(w), quality=9))
                else:
                    hps.append(HyphenationPoint(offset+len(w), quality=9, sl=SHY))
                offset += len(w)
        return HyphenatedWord(word, hyphenations=hps)

class Hyphenator:
    """
    Hyphenator serves as the base class for all hyphenation implementation classes.

    Some general thoughts about hyphenation follow.

    Hyphenation is language specific.
    Hyphenation is encoding specific.
    Hyphenation does not use the context of a word.
    Good Hyphenation enables the reader to read fluently,
    bad hyphenation can make a word hard to read.

    Hyphenation is language specific:
    The same word may be valid in several languages,
    and the valid hyphenation points can depend on the language.
    Example: Situation

    Hyphenation is encoding specific:
    This is just an implementation detail really,
    however an important one.
    For example, every hyphenation algorithm uses some internal
    encoding scheme, and it should document this scheme.
    What are the input encoding and the output encoding?

    Hyphenation does not use the context of the word:
    Surely, it could make sense to "understand" the context.
    There may be some words that should be hyphenated differently
    depending on the context.
    But this would make a really BIG overhead;
    and I can't really think of an example. It's not worth thinking about it.

    Good Hyphenation enables the reader to read fluently,
    bad hyphenation can make a word hard to read.
    Some languages, for example german, make frequent use of
    the concatenation of several simple words to build more complex words,
    like "Hilberts Nullstellensatz" (something I remember from Algebra).
    Null = Zero
    Stelle = Place, Location
    Satz = Theorem (math)

    The one famous example for bad german hyphenation is the word "Urinstinkt".
    This is made up of
    Ur = Primal
    Instinkt = Instinct
    Hyphenating this word in a valid, but unfortunate position,
    yields "Urin-stinkt" (urine stinks).

    These thoughts have led to the following interface for hyphenation.
    """

    def __init__ (self, language, minWordLength=4, codec=None, shy=SHY, **options):
        """
        Creates a new hyphenator instance for the given language.
        In this base class, the language argument serves only for
        information purposes.
        Words shorter than minWordLength letters will never be considered
        for hyphenation.
        shy is the hyphenation character and has to be a unicode string.
        The codec argument is accepted, but not used by this base class.
        Additional keyword arguments are stored in self.options.
        """
        self.language = language
        # Use the value given by the caller (this was hard-coded to 4 before).
        self.minWordLength = minWordLength
        assert isinstance(shy, unicode)
        self.shy = shy
        self.options = options

    def getLanguage(self):
        "Returns the language this hyphenator was created for."
        return self.language

    def getMinWordLength(self):
        "Returns the minimum length of words considered for hyphenation."
        return self.minWordLength

    def setMinWordLength(self,nLength):
        """
        Sets the minimum word length.
        nLength has to be an int with 2 < nLength < 100,
        otherwise ValueError is raised.
        """
        if isinstance(nLength, int) and 2 < nLength < 100:
            self.minWordLength = nLength
        else:
            raise ValueError(nLength)

    def __repr__(self):
        return "%s(%s,%d)" % (str(self.__class__),self.language,self.minWordLength)

    def postHyphenate(self,hyphenatedWord):
        """This function is called whenever hyphenate has been called.
           It can be used to do some logging,
           or to add unknown words to a dictionary etc.
           In this base class it only checks the type of the result.
        """
        if hyphenatedWord is not None:
            assert isinstance(hyphenatedWord, HyphenatedWord)
            assert isinstance(hyphenatedWord.hyphenations, list)

    def i_hyphenate(self, aWord):
        """
        This base class does not support any hyphenation!
        Always returns None.
        """
        return None

    def hyphenate(self,aWord):
        """
        Finds possible hyphenation points for aWord (a unicode string),
        returning a HyphenatedWord or None if the hyphenator doesn't
        know the word.
        """
        assert isinstance(aWord,unicode)
        hword = self.i_hyphenate(aWord)
        self.postHyphenate(hword)
        return hword
        
class Cached(Hyphenator):
    """
    A wrapper that remembers the results of another hyphenator's
    hyphenate function. Use it if the hyphenation is too slow.
    """

    def __init__(self, hyphenator, max_entries):
        """
        Wraps hyphenator, keeping at most max_entries results
        of hyphenator.hyphenate in memory.
        Everything else the wrapped hyphenator offers is available
        through the attribute "hyphenator" only.
        """
        assert isinstance(hyphenator, Hyphenator)
        self.hyphenator = hyphenator
        self._max_entries = max_entries
        self.cache = {}

    def hyphenate(self, aWord):
        """
        Returns the cached result for aWord.
        On a miss, the wrapped hyphenator is asked and its answer is
        stored. If the cache is already full, it is emptied first.
        """
        try:
            return self.cache[aWord]
        except KeyError:
            pass
        if len(self.cache) >= self._max_entries:
            # No selective eviction: a full cache is simply thrown away.
            self.cache = {}
        result = self.hyphenator.hyphenate(aWord)
        self.cache[aWord] = result
        return result

    def purge_cache(self):
        """
        Drops all cached results (freeing resources).
        """
        self.cache = {}