File: hnj.py

package info (click to toggle)
python-wordaxe 0.3.2-1
  • links: PTS
  • area: main
  • in suites: squeeze
  • size: 1,228 kB
  • ctags: 786
  • sloc: python: 9,814; makefile: 5
file content (160 lines) | stat: -rw-r--r-- 6,448 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#!/usr/bin/env python
# -*- coding: iso-8859-1 -*-

__license__="""
   Copyright 2004-2008 Henning von Bargen (henning.vonbargen arcor.de)
   This software is dual-licenced under the Apache 2.0 and the
   2-clauses BSD license. For details, see license.txt
"""

__version__=''' $Id: __init__.py,v 1.2 2004/05/31 22:22:12 hvbargen Exp $ '''

import os,sys

from hyphen import *
from xml.sax.saxutils import escape,quoteattr

from wordaxe.BaseHyphenator import BaseHyphenator

# Module-level verbosity switch; nothing in the code visible here reads it.
VERBOSE = False

class PyHnjHyphenator(BaseHyphenator):
    """
    Pattern based hyphenator using "hyph_<language>.dic" pattern files.

    This class used to delegate to pyHnj.  The matching is now done in pure
    Python: the patterns are kept in a plain dict (pattern text -> digit
    string) and every substring of a word is looked up in that dict.
    @TODO  This is a simple brute-force matcher and not the trie based
           implementation of Knuth's algorithm.
    """

    def __init__ (self, 
                  language="EN",
                  minWordLength=4,
                  quality=8,
                  hyphenDir=None
                 ):
        """
        Load the pattern file for `language`.

        language      -- forwarded to BaseHyphenator and used to build the
                         file name "hyph_<language>.dic".
        minWordLength -- forwarded to BaseHyphenator.
        quality       -- quality value given to every hyphenation point
                         produced by zerlegeWort.
        hyphenDir     -- directory containing the pattern file; defaults to
                         the "dict" directory next to this module.

        Raises IOError if the pattern file cannot be opened and IndexError
        if it is empty.
        """
        BaseHyphenator.__init__(self,language=language,minWordLength=minWordLength)
        if hyphenDir is None:
            hyphenDir = os.path.join (os.path.split(__file__)[0], "dict")
        fname = os.path.join(hyphenDir,"hyph_%s.dic"%language)
        self.quality = quality
        # Close the file explicitly instead of relying on garbage collection.
        # try/finally (rather than "with") keeps old Python 2 releases working.
        f = open(fname)
        try:
            lines = f.read().splitlines()
        finally:
            f.close()
        # The first line is kept verbatim (described by the original author
        # as the set of characters); all other lines are patterns.
        self.characters = lines.pop(0)
        # Note: we do not use a TRIE, we just store the patterns in a dict
        # mapping the pattern text (digits removed) to its digit string.
        self.patterns = {}
        for pattern in lines:
            letters = []
            codes = []
            digit = "0"
            for ch in pattern:
                if "0" <= ch <= "9":
                    # A digit belongs to the gap in front of the next character.
                    digit = ch
                else:
                    codes.append(digit)
                    letters.append(ch)
                    digit = "0"
            # Gap behind the last character, so len(codes) == len(letters)+1.
            codes.append(digit)
            self.patterns["".join(letters).decode("iso-8859-1")] = "".join(codes)
        
    # Helper function
    def schiebe(self,offset,L):
        """
        Return copies of the hyphenation points in L with indx shifted by
        offset; all other attributes are copied unchanged.
        """
        return [HyphenationPoint(h.indx+offset,h.quality,h.nl,h.sl,h.nr,h.sr) for h in L]

    def zerlegeWort(self,zusgWort):
        """
        Compute the hyphenation points of zusgWort from the loaded patterns.

        Returns a list containing exactly one solution, which is a (possibly
        empty) list of HyphenationPoint objects.  A point with indx k stands
        for a break after the first k characters of the word.
        """
        # Both word boundaries are marked with "." so that patterns anchored
        # at the start or the end of a word can match.
        word = "." + zusgWort.lower() + "."
        wordlen = len(word)
        # codes[k] holds the highest digit any matching pattern assigned to
        # the gap in front of word[k]; the extra slot is the gap behind the
        # trailing ".", which a pattern ending at the word end writes to.
        codes = ["0"]*(wordlen+1)
        patterns = self.patterns
        # Try every substring length (minimum: 2, as in the original code)
        # at every start position, including the substrings that end with
        # the trailing "." - those were skipped before, so end-anchored
        # patterns never took effect.
        # NOTE(review): patterns consisting of a single character are still
        # ignored because of the minimum length - confirm this is intended.
        for patlen in range(2,wordlen+1):
            for startindx in range(wordlen-patlen+1):
                patcode = patterns.get(word[startindx:startindx+patlen])
                if patcode is None:
                    continue
                for i,digit in enumerate(patcode):
                    # Single digit characters compare like their values.
                    if digit > codes[i+startindx]:
                        codes[i+startindx] = digit
        # Keep only the gaps between two characters of the original word:
        # drop the two gaps around the leading "." and the two gaps around
        # the trailing ".".
        codes = codes[2:-2]

        hyphPoints = []
        for i,code in enumerate(codes):
            # An odd digit marks an allowed break.
            if (ord(code)-ord('0')) % 2:
                hyphPoints.append(HyphenationPoint(i+1,self.quality,0,self.shy,0,u""))
        return [hyphPoints]
        
    def hyphenate(self,aWord):
        """
        Return a HyphenatedWord for aWord with its hyphenations attribute set.

        aWord must be a unicode string (checked with an assertion).
        The solutions returned by zerlegeWort are combined as follows:
        several solutions -> only the points contained in all of them, each
        with the lowest quality found; one solution -> used as is; no
        solution -> the word is split at the first postfix character, or
        BaseHyphenator.hyphenate is used if there is none.
        """
        assert isinstance(aWord, unicode)
        hword = HyphenatedWord(aWord)
        loesungen = self.zerlegeWort(aWord)
        if len(loesungen)>1:
            # Ambiguous: take only those hyphenation points that occur in
            # all solutions, and use the worst quality for each of them.
            loesung = []
            loesung0, andere = loesungen[0], loesungen[1:]
            for hp in loesung0:
                q = hp.quality
                for a in andere:
                    if q:
                        for hp1 in a:
                            if hp1.indx==hp.indx \
                            and hp1.nl==hp.nl and hp1.sl==hp.sl \
                            and hp1.nr==hp.nr and hp1.sr==hp.sr:
                                q = min(q,hp1.quality)
                                break
                        else:
                            # Hyphenation point is missing in the other solution
                            q = 0
                if q:
                    loesung.append(HyphenationPoint(hp.indx,q,hp.nl,hp.sl,hp.nr,hp.sr))
        elif len(loesungen)==1:
            # Exactly one solution (may be empty: word cannot be hyphenated).
            loesung = loesungen[0]
        else:
            # No solution at all (unknown word).  zerlegeWort as defined in
            # this class always returns one solution, so this branch is only
            # reached if zerlegeWort is overridden.
            loesung = []
            for i in range(1,len(aWord)-1):
                if aWord[i] in self.postfixChars and aWord[i+1] not in "0123456789":
                    # Split into two parts at position i and hyphenate the
                    # parts separately.
                    r = self.shy
                    if aWord[i] in [self.shy,u"-"]:
                        # The separator itself already is a hyphen.
                        r = u""
                    loesung1 = self.hyphenate(aWord[:i])
                    loesung1.hyphenations.append (HyphenationPoint(i+1,9,0,r,0,u""))
                    loesung2 = self.hyphenate(aWord[i+1:])
                    # Merge both parts; the points of the second part are
                    # relative to aWord[i+1:], so shift them by i+1.  If the
                    # second part has no points nothing is added.
                    loesung = loesung1.hyphenations + self.schiebe(i+1,loesung2.hyphenations)
                    break
            else:
                loesung = BaseHyphenator.hyphenate(self,aWord).hyphenations

        hword.hyphenations = loesung
        return hword

if __name__=="__main__":
    # Script entry point: build a hyphenator for the "de_DE" patterns with a
    # minimum word length of 5 and run its test() method, passing
    # "PyHnjLearn.html" as outfname.  test() is not defined in this file -
    # presumably it is inherited from BaseHyphenator.
    hyphenator = PyHnjHyphenator(language="de_DE", minWordLength=5)
    hyphenator.test(outfname="PyHnjLearn.html")