1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199
|
#!/usr/bin/env python
# -*- coding: iso-8859-1 -*-
__license__="""
Copyright 2004-2008 Henning von Bargen (henning.vonbargen arcor.de)
This software is dual-licenced under the Apache 2.0 and the
2-clauses BSD license. For details, see license.txt
"""
__version__=''' $Id: __init__.py,v 1.2 2004/05/31 22:22:12 hvbargen Exp $ '''
import codecs
from wordaxe.hyphen import SHY, HyphenatedWord
from wordaxe.BaseHyphenator import BaseHyphenator
from wordaxe.hyphrules import decodeTrennung
class ExplicitHyphenator(BaseHyphenator):
    """
    Allow to explicitly specify how a word should be hyphenated.
    This is a slight improvement compared to BaseHyphenator.

    Usage:

        hyphenator = ExplicitHyphenator("DE")
        # Add explicit hyphenation for a single word.
        hyphenator.add_entry(u"analphabet", u"an8alpha5bet")
        # Add several entries
        hyphenator.add_entries({u"urinstinkt": u"ur8instinkt",
                                u"urinstinkte": u"ur8instinkte",
                                u"urinstinkten": u"ur8instinkt3en",
                               })

    The last entry is probably not correctly hyphenated
    according to the german hyphenation rules, but you don't
    want to read "urinstink" in a text...

    The add_entry/add_entries methods usually expect unicode strings.
    Byte strings require the encoding argument to be supplied:

        hyphenator.add_entry("br\\xe4utigam", "br\\xe4u5ti5gam",
                             encoding="iso-8859-1")

    Instead of using numbers for defining the quality of a hyphenation
    point, you may use the "~" (tilde) character, corresponding to
    a medium quality hyphenation point: "bru~ti~gam".

    Attributes set by this class:

        sonderfaelle -- list of (word_length, entries) tuples, where
                        entries maps a lower-cased word of that length
                        to its hyphenation pattern.
        qHaupt, qNeben, qVorsilbe, qSchlecht
                     -- quality values, stored as given to __init__.
    """

    def __init__ (self,
                  language="DE",
                  minWordLength=4,
                  qHaupt=8,
                  qNeben=5,
                  qVorsilbe=5,
                  qSchlecht=3,
                  hyphenDir=None,
                  **options
                 ):
        """
        Create a hyphenator without any explicit entries.

        language, minWordLength and any extra keyword options are
        passed on to BaseHyphenator.__init__.  The q* values are only
        stored on the instance; nothing else in this class reads them.
        hyphenDir is accepted but neither stored nor forwarded here.
        """
        BaseHyphenator.__init__(self,language=language,minWordLength=minWordLength,**options)
        # Qualities for the different kinds of hyphenation points
        # (presumably main / secondary / prefix / bad, going by the names).
        self.qHaupt=qHaupt
        self.qNeben=qNeben
        self.qVorsilbe=qVorsilbe
        self.qSchlecht=qSchlecht
        # Initialise the table of explicit entries ("special cases").
        self.sonderfaelle = []

    def add_entry(self, word, trennung, encoding=unicode):
        """
        Register an explicit hyphenation for a single word.

        word     -- the word; it is stored lower-cased.
        trennung -- the hyphenation pattern for the word, for example
                    u"an8alpha5bet".  Every "~" is replaced by "5".
        encoding -- codec name used to decode word/trennung when they
                    are not unicode objects.  The default (the unicode
                    type itself) is just a placeholder and not a codec
                    name, so byte strings need a real encoding to be
                    converted successfully.

        An already existing entry for the same word is overwritten.
        """
        if not isinstance(word, unicode):
            word = unicode(word, encoding)
        if not isinstance(trennung, unicode):
            trennung = unicode(trennung, encoding)
        # Ignore case.  TODO: umlauts etc.!
        word = word.lower()
        trennung = trennung.replace(u"~", u"5")
        lenword = len(word)
        # Entries are bucketed by word length: one dict per length.
        for (lae, entries) in self.sonderfaelle:
            if lae == lenword:
                entries[word] = trennung
                break
        else:
            # No bucket for this length yet.
            self.sonderfaelle.append((lenword,{word: trennung}))

    def add_entries(self, mapping, encoding=unicode):
        """
        Register every (word, pattern) pair of mapping via add_entry,
        using the same encoding for all of them.
        """
        for word, trennung in mapping.items():
            self.add_entry(word, trennung, encoding)

    def add_entries_from_file(self, filename, encoding=None):
        """
        Add entries from a text file (interpreting the file
        using the given encoding). If encoding is not given
        or None, try to extract the encoding from a line
        near the start of the file like
        # -*- coding: iso-8859-1 -*-

        Empty lines and lines starting with "#" are skipped.  Every
        other line must consist of exactly two whitespace separated
        fields: the word and its hyphenation pattern.

        Raises ValueError if no encoding is given and none is found
        within the first 1000 characters of the file, or if a data
        line does not split into exactly two fields.
        """
        if encoding is None:
            import re
            # Close the probe handle explicitly instead of leaving it
            # to the garbage collector.
            probe = open(filename,"rt")
            try:
                frag = probe.read(1000)
            finally:
                probe.close()
            m = re.search(r"-\*- coding: ([^ ]+) -\*-", frag)
            if m is None:
                raise ValueError("Encoding not specified and not found in file")
            encoding = m.group(1)
        fh = codecs.open(filename, "rt", encoding)
        try:
            for line in fh:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                word, trennung = line.split()
                # codecs has already decoded the line, so no encoding
                # argument is needed here.
                self.add_entry(word, trennung)
        finally:
            # Also release the file when a malformed line raises.
            fh.close()

    def hyph(self, word):
        """
        Look up word in the explicit entries, ignoring case.

        Returns a HyphenatedWord built from the word as given and the
        decoded stored pattern, or None if there is no entry for it.
        """
        #print "ExplicitHyphenator hyph", word
        lenword = len(word)
        for (lae, entries) in self.sonderfaelle:
            if lae == lenword:
                trennung = entries.get(word.lower(), None)
                if trennung is not None:
                    return HyphenatedWord(word, decodeTrennung(trennung))
                # add_entry keeps a single bucket per word length,
                # so there is no point in searching any further.
                break
        # Word not found
        return None

    def i_hyphenate(self, aWord):
        """
        Hyphenate aWord (which must be unicode) by applying
        ExplicitHyphenator.hyph through self.stripper.apply_stripped.
        """
        assert isinstance(aWord, unicode)
        return self.stripper.apply_stripped(ExplicitHyphenator.hyph, self, aWord)

    def _hyphenate_subword(self, subword):
        """
        Hyphenate one fragment of a word for i_hyphenate_derived.

        Fragments containing SHY are handled by BaseHyphenator.hyph,
        all others by ExplicitHyphenator.hyph.  If that yields nothing,
        self.hyph is tried, and as a last resort the fragment is
        returned without any hyphenation points.  The result is
        therefore always a HyphenatedWord, never None.
        """
        if SHY in subword:
            sub_hword = self.stripper.apply_stripped(BaseHyphenator.hyph, self, subword)
        else:
            sub_hword = self.stripper.apply_stripped(ExplicitHyphenator.hyph, self, subword)
        # NOTE(review): the indentation of the original was lost; this
        # fallback is assumed to apply to both branches above - confirm.
        if sub_hword is None:
            sub_hword = self.stripper.apply_stripped(self.hyph, self, subword)
        if sub_hword is None:
            sub_hword = HyphenatedWord(subword, hyphenations=[])
        return sub_hword

    def i_hyphenate_derived(self,aWord):
        """
        You can use this method in classes derived from ExplicitHyphenator.
        It will first split the word using BaseHyphenator,
        then for each "subword" it will call ExplicitHyphenator,
        and only call the derived classes hyph method for the still
        unknown subwords.

        Returns a single HyphenatedWord: either the only subword or
        the result of HyphenatedWord.join over all subwords.

        TODO: The implementation does not match the docstring
        test: "hohenlimburg.de", "hohenlimburg.de)"
        """
        #print "ExplicitHyphenator.i_hyphenate_derived", aWord
        assert isinstance(aWord, unicode)
        sub_hwords = []
        hword = BaseHyphenator.i_hyphenate(self,aWord)
        #print "BaseHyphenator.i_hyphenate returned %r" % hword
        if hword is None:
            hword = HyphenatedWord(aWord,hyphenations=[])
        base_hyph_points = hword.hyphenations
        last_indx = 0
        nr = 0
        for hp in base_hyph_points:
            if isinstance(hp, int):
                # HyphenationPoint is not imported at module level;
                # import it here so that this branch does not fail
                # with a NameError.
                from wordaxe.hyphen import HyphenationPoint
                hp = HyphenationPoint(hp, quality=5, sl=SHY)
            # The fragment between the previous point and this one,
            # skipping the nr characters recorded on the previous point.
            subword = hword[last_indx+nr:hp.indx]
            sub_hwords.append(self._hyphenate_subword(subword))
            last_indx = hp.indx
            nr = hp.nr
        # Now the last subword.
        # NOTE(review): unlike in the loop, nr is not added to the
        # start index here - kept as in the original, confirm intent.
        subword = hword[last_indx:]
        sub_hwords.append(self._hyphenate_subword(subword))
        if len(sub_hwords) > 1:
            return HyphenatedWord.join(sub_hwords)
        else:
            # Exactly one subword; _hyphenate_subword never returns None.
            return sub_hwords[0]
if __name__=="__main__":
    # Demo / self-test: register a few explicit entries, then run the
    # hyphenator's test() method (not defined in this file - presumably
    # inherited from BaseHyphenator) writing to "ExplicitLearn.html".
    demo = ExplicitHyphenator(language="DE", minWordLength=5)
    # Byte strings need an explicit encoding.
    # NOTE(review): "Brutigam" looks like a word that lost an umlaut
    # (a-umlaut after "Br") somewhere along the way - confirm.
    demo.add_entry("Brutigam", "Bru5ti5gam", encoding="iso-8859-1")
    urinstinkt_entries = {
        u"Urinstinkt": u"Ur8instinkt",
        u"Urinstinkte": u"Ur8instinkte",
        u"Urinstinkten": u"Ur8instinkt3en",
    }
    demo.add_entries(urinstinkt_entries)
    demo.test(outfname="ExplicitLearn.html")
|