1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
|
#!/usr/bin/env python
# -*- coding: iso-8859-1 -*-
__license__="""
Copyright 2004-2008 Henning von Bargen (henning.vonbargen arcor.de)
This software is dual-licenced under the Apache 2.0 and the
2-clauses BSD license. For details, see license.txt
"""
__version__=''' $Id: __init__.py,v 1.2 2004/05/31 22:22:12 hvbargen Exp $ '''
import os,sys
from hyphen import *
from xml.sax.saxutils import escape,quoteattr
from wordaxe.BaseHyphenator import BaseHyphenator
VERBOSE = False
class PyHnjHyphenator(BaseHyphenator):
    """
    Pattern based hyphenation using pyHnj style pattern files
    (Knuth's / Liang's algorithm).

    @TODO The pattern lookup is NOT the optimised implementation used by
          pyHnj, but a more or less trivial substring scan over a dict.
    """

    def __init__(self,
                 language="EN",
                 minWordLength=4,
                 quality=8,
                 hyphenDir=None
                 ):
        """
        Load the pattern file "hyph_<language>.dic".

        language      -- used to build the pattern file name and passed on
                         to BaseHyphenator.__init__.
        minWordLength -- passed on to BaseHyphenator.__init__.
        quality       -- quality value assigned to every hyphenation point
                         produced by zerlegeWort.
        hyphenDir     -- directory containing the pattern file; defaults to
                         the "dict" directory next to this module.

        Raises IOError if the pattern file cannot be opened and IndexError
        if it contains no lines at all.
        """
        BaseHyphenator.__init__(self, language=language, minWordLength=minWordLength)
        if hyphenDir is None:
            hyphenDir = os.path.join(os.path.split(__file__)[0], "dict")
        # load pattern file
        fname = os.path.join(hyphenDir, "hyph_%s.dic" % language)
        self.quality = quality
        # try/finally (rather than relying on garbage collection) so that the
        # file handle is released even if reading fails.
        patternFile = open(fname)
        try:
            lines = patternFile.read().splitlines()
        finally:
            patternFile.close()
        # first line is set of characters, all other lines are patterns
        # Note: we do not use a TRIE, we just store the patterns in a dict
        #       pattern text (unicode) -> string of digit codes
        self.characters = lines.pop(0)
        self.patterns = {}
        for pattern in lines:
            # Split e.g. "a1bc3d" into the text "abcd" and the codes "01030":
            # one digit for the gap in front of each character plus one for
            # the gap behind the last character; a missing digit means "0".
            letters = []
            codes = []
            digit = "0"
            for ch in pattern:
                if "0" <= ch <= "9":
                    digit = ch
                else:
                    codes.append(digit)
                    letters.append(ch)
                    digit = "0"
            codes.append(digit)
            self.patterns["".join(letters).decode("iso-8859-1")] = "".join(codes)

    # helper function
    def schiebe(self, offset, L):
        """Return copies of the hyphenation points in L with indx shifted by offset."""
        return [HyphenationPoint(h.indx + offset, h.quality, h.nl, h.sl, h.nr, h.sr) for h in L]

    def zerlegeWort(self, zusgWort):
        """
        Compute the hyphenation points of zusgWort from the loaded patterns.

        The word is lower-cased and wrapped in "." markers, every substring
        of length >= 2 is looked up in self.patterns, and for each gap
        between characters the highest digit of all matching patterns is
        kept. An odd digit marks a hyphenation point.

        Returns a list containing exactly one solution (a list of
        HyphenationPoint objects), the same shape hyphenate() expects.
        """
        ### This was the call to pyHnj
        ### codes = self.hnj.getCodes(zusgWort.lower())
        word = "." + zusgWort.lower() + "."
        wordLen = len(word)
        # codes[k] belongs to the gap in front of word[k]; the additional
        # last slot is the gap behind the closing ".", which is needed so
        # that patterns reaching up to the end of the word fit.
        codes = ["0"] * (wordLen + 1)
        # Try all pattern lengths (minimum: 2) up to the complete marked
        # word, at every start position including the one that ends on the
        # closing "." (otherwise word-end patterns could never match).
        for patlen in range(2, wordLen + 1):
            for startindx in range(wordLen - patlen + 1):
                patcode = self.patterns.get(word[startindx:startindx + patlen])
                if patcode is None:
                    continue
                for i, digit in enumerate(patcode):
                    if digit > codes[i + startindx]:
                        codes[i + startindx] = digit
        # Keep only the gaps between two characters of the original word:
        # drop the gaps around the leading "." and around the closing ".".
        codes = codes[2:-2]
        hyphPoints = []
        for i, code in enumerate(codes):
            if (ord(code) - ord('0')) % 2:
                hyphPoints.append(HyphenationPoint(i + 1, self.quality, 0, self.shy, 0, u""))
        return [hyphPoints]

    def hyphenate(self, aWord):
        """
        Return a HyphenatedWord for the unicode string aWord.

        The solutions delivered by zerlegeWort are reduced to a single list
        of hyphenation points, which is stored in the result's
        "hyphenations" attribute:
          * several solutions: only points contained in all of them are
            kept, each with the lowest quality found;
          * exactly one solution: it is used unchanged;
          * no solution: the word is split at the first character found in
            self.postfixChars (not followed by a digit) and both parts are
            hyphenated separately; without such a character the result of
            BaseHyphenator.hyphenate is used.
        """
        assert isinstance(aWord, unicode)
        hword = HyphenatedWord(aWord)
        loesungen = self.zerlegeWort(aWord)
        if len(loesungen) > 1:
            # Ambiguous: take only those hyphenation points that occur in
            # all solutions, and for the quality take the worst one.
            loesung = []
            loesung0, andere = loesungen[0], loesungen[1:]
            for hp in loesung0:
                q = hp.quality
                for a in andere:
                    if q:
                        for hp1 in a:
                            if (hp1.indx == hp.indx
                                    and hp1.nl == hp.nl and hp1.sl == hp.sl
                                    and hp1.nr == hp.nr and hp1.sr == hp.sr):
                                q = min(q, hp1.quality)
                                break
                        else:
                            # hyphenation point not contained in the other solution
                            q = 0
                if q:
                    loesung.append(HyphenationPoint(hp.indx, q, hp.nl, hp.sl, hp.nr, hp.sr))
        elif len(loesungen) == 1:
            # Unique solution (possibly empty, i.e. the word cannot be hyphenated).
            loesung = loesungen[0]
        else:
            # Unknown word.
            loesung = []
            for i in range(1, len(aWord) - 1):
                if aWord[i] in self.postfixChars and aWord[i+1] not in "0123456789":
                    # split into two parts and look at them separately
                    r = self.shy
                    if aWord[i] in [self.shy, u"-"]:
                        # the separator already is a hyphen, do not insert another one
                        r = u""
                    loesung1 = self.hyphenate(aWord[:i])
                    loesung1.hyphenations.append(HyphenationPoint(i + 1, 9, 0, r, 0, u""))
                    loesung2 = self.hyphenate(aWord[i+1:])
                    # TODO these solutions still have to be merged properly.
                    if loesung2.hyphenations == []:
                        # only the first part can be hyphenated
                        loesung = loesung1.hyphenations
                    else:
                        # both parts can be hyphenated; move the points of the
                        # second part behind the separator
                        loesung = loesung1.hyphenations + self.schiebe(i + 1, loesung2.hyphenations)
                    break
            else:
                # no separator character found
                loesung = BaseHyphenator.hyphenate(self, aWord).hyphenations
        hword.hyphenations = loesung
        return hword
if __name__ == "__main__":
    # Script entry point: build a hyphenator for the German patterns and run
    # its test() method (not defined in this class; presumably inherited from
    # BaseHyphenator), directing the output to "PyHnjLearn.html".
    hyphenator = PyHnjHyphenator(language="de_DE", minWordLength=5)
    hyphenator.test(outfname="PyHnjLearn.html")
|