File: PyHnjHyphenator.py

package info (click to toggle)
python-wordaxe 0.3.2-1
  • links: PTS
  • area: main
  • in suites: squeeze
  • size: 1,228 kB
  • ctags: 786
  • sloc: python: 9,814; makefile: 5
file content (119 lines) | stat: -rw-r--r-- 4,576 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/usr/bin/env python
# -*- coding: iso-8859-1 -*-

__license__="""
   Copyright 2004-2008 Henning von Bargen (henning.vonbargen arcor.de)
   This software is dual-licenced under the Apache 2.0 and the
   2-clauses BSD license. For details, see license.txt
"""

__version__=''' $Id: __init__.py,v 1.2 2004/05/31 22:22:12 hvbargen Exp $ '''

import os,sys
import sets
import copy

from wordaxe.hyphen import *
from xml.sax.saxutils import escape,quoteattr

from wordaxe.ExplicitHyphenator import ExplicitHyphenator

# Module-level debug switch. It is not read anywhere in the visible part of
# this module. NOTE(review): possibly unused -- confirm no other module
# imports it before removing.
VERBOSE = False

class PyHnjHyphenator(ExplicitHyphenator):
    """
    Hyphenation using pyHnj (Knuth's algorithm).
    The pyHnj/libhnj code does not work if german words contain umlauts.
    As a work-around you can use a pure python version that does
    not use pyHnj/libhnj and should give the same results.

    Which back end is used is selected with the ``purePython``
    constructor argument.
    """

    def __init__ (self,
                  language="EN",
                  minWordLength=4,
                  quality=8,
                  hyphenDir=None,
                  purePython=False,
                  **options
                 ):
        """
        Set up the hyphenator for one pattern dictionary.

        language      -- used to build the dictionary file name
                         "hyph_<language>.dic"; also passed to the base class.
        minWordLength -- passed through to ExplicitHyphenator.
        quality       -- quality value attached to every hyphenation
                         point produced by zerlegeWort.
        hyphenDir     -- directory containing the dictionary file; defaults
                         to the "dict" directory next to this module.
        purePython    -- if true, load the patterns into a plain dict and
                         match them in Python; otherwise import pyHnj and
                         let it load the dictionary.
        options       -- passed through to ExplicitHyphenator.

        Note:
            The purePython version does not use a trie. It simply looks
            up every substring of the word in a dict, which is simpler
            (and slower).
        """
        ExplicitHyphenator.__init__(self,language=language,minWordLength=minWordLength, **options)
        if hyphenDir is None:
            hyphenDir = os.path.join(os.path.split(__file__)[0], "dict")
        self.purePython = purePython
        fname = os.path.join(hyphenDir, "hyph_%s.dic" % language)
        if self.purePython:
            # Read the whole dictionary and make sure the file handle is
            # closed again, even if reading fails.
            dicFile = open(fname)
            try:
                lines = dicFile.read().splitlines()
            finally:
                dicFile.close()
            # The first line is a header (stored as-is in self.characters);
            # all remaining lines are patterns.
            self.characters = lines.pop(0)
            # Maps the pattern letters (unicode) to the string of digits
            # around them, see _splitPattern.
            self.patterns = {}
            for pattern in lines:
                pat, codes = self._splitPattern(pattern)
                self.patterns[pat.decode("iso-8859-1")] = codes
        else:
            import pyHnj
            self.hnj = pyHnj.Hyphen(fname)
        self.quality = quality

    def _splitPattern(self, pattern):
        """
        Split one dictionary pattern into its letters and its digit codes.

        Returns a tuple (letters, codes). ``codes`` has exactly one more
        character than ``letters``: one digit for the position before each
        letter plus one for the position after the last letter. Positions
        without an explicit digit in the pattern get "0".
        Example: "a1bc3" -> ("abc", "0103").
        """
        letters = []
        codes = []
        digit = "0"
        for ch in pattern:
            if "0" <= ch <= "9":
                # Remember the digit; it belongs to the gap before the
                # next letter (or to the end of the pattern).
                digit = ch
            else:
                codes.append(digit)
                letters.append(ch)
                digit = "0"
        codes.append(digit)
        return "".join(letters), "".join(codes)

    # Helper function
    def schiebe(self,offset,L):
        """
        Return a new list with the hyphenation points of L, each one
        moved by ``offset`` positions (all other fields are copied).
        """
        return [HyphenationPoint(h.indx+offset,h.quality,h.nl,h.sl,h.nr,h.sr) for h in L]

    def zerlegeWort(self,zusgWort):
        """
        Compute the hyphenation points of the word ``zusgWort``.

        Returns a list of HyphenationPoint objects. A point is never
        created at the first or the last entry of the code list.
        """
        if self.purePython:
            # Word boundaries are marked with "." so that patterns anchored
            # at the start or end of a word can match.
            word = "." + zusgWort.lower() + "."
            wordLen = len(word)
            # One code per gap: before each character of the dotted word
            # plus one after its last character. The extra slot is needed
            # for patterns that end at the trailing ".".
            codes = ["0"] * (wordLen + 1)
            # Try all substring lengths (minimum: 2) at all start positions,
            # including those that reach the trailing ".". For every known
            # pattern keep the maximum digit seen per gap.
            for patlen in range(2, wordLen + 1):
                for startindx in range(wordLen - patlen + 1):
                    patcode = self.patterns.get(word[startindx:startindx+patlen])
                    if patcode is None:
                        continue
                    for i, digit in enumerate(patcode):
                        if digit > codes[i+startindx]:
                            codes[i+startindx] = digit
            # Keep only the gaps between two letters of the original word:
            # drop the two gaps around the leading "." and the two gaps
            # around the trailing ".".
            codes = codes[2:-2]
        else:
            codes = self.hnj.getCodes(zusgWort.lower())
        hyphPoints = []
        lastIndx = len(codes) - 1
        for i, code in enumerate(codes):
            # We never split off the first or the last character.
            if i == 0 or i == lastIndx:
                continue
            # An odd digit marks a permitted hyphenation position.
            if (ord(code)-ord('0')) % 2:
                hyphPoints.append(HyphenationPoint(i+1,self.quality,0,self.shy,0,""))
        return hyphPoints

    def hyph(self,aWord):
        """
        Return a HyphenatedWord for the unicode string ``aWord`` with the
        hyphenation points computed by zerlegeWort.
        """
        assert isinstance(aWord, unicode)
        hword = HyphenatedWord(aWord, hyphenations=self.zerlegeWort(aWord))
        # None (unknown) cannot occur here, because the algorithm is
        # pattern based and therefore neither "knows" nor "does not know"
        # any particular word.
        return hword

    def i_hyphenate(self, aWord):
        """Delegate to ExplicitHyphenator.i_hyphenate_derived."""
        return ExplicitHyphenator.i_hyphenate_derived(self, aWord)
    
if __name__ == "__main__":
    # Manual self-test: build a pure-Python hyphenator for the de_DE
    # dictionary and write the test report to an HTML file.
    # NOTE(review): test() is not defined in this file; presumably it is
    # inherited from a base class -- confirm.
    hyphenator = PyHnjHyphenator(language="de_DE", minWordLength=5, purePython=True)
    hyphenator.test(outfname="PyHnjLearn.html")