1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96
|
import string, re, os
###
### Fallback syllable counter
###
### This is based on the algorithm in Greg Fast's Perl module
### Lingua::EN::Syllable.
###
# Name of the syllable override file.
syllable_path = "specialSyllables_en"

# Maps a normalized word to its syllable count.
fallback_cache = {}

# Patterns that each remove one syllable from the vowel-group count.
fallback_subsyl = [
    "cial", "tia", "cius", "cious", "gui", "ion", "iou",
    "sia$", ".ely$",
]

# Patterns that each add one syllable to the vowel-group count.
fallback_addsyl = [
    "ia", "riet", "dien", "iu", "io", "ii",
    "[aeiouy]bl$", "mbl$",
    "[aeiou]{3}",
    "^mc", "ism$",
    "(.)(?!\\1)([aeiouy])\\2l$",
    "[^l]llien",
    "^coad.", "^coag.", "^coal.", "^coax.",
    "(.)(?!\\1)[gq]ua(.)(?!\\2)[aeiou]",
    "dnt$",
]

# Compile our regular expressions, replacing each pattern string in place
fallback_subsyl[:] = [re.compile(pattern) for pattern in fallback_subsyl]
fallback_addsyl[:] = [re.compile(pattern) for pattern in fallback_addsyl]
def _normalize_word(word):
return word.strip().lower()
# Read our syllable override file and stash that info in the cache.
# Every non-blank line must hold exactly two whitespace-separated fields:
# a word followed by its integer syllable count.
with open(syllable_path) as in_syll:
    # Iterate the file object directly: xreadlines() does not exist in
    # Python 3, and the `with` block closes the file even if a line is bad.
    for line in in_syll:
        line = line.strip()
        if line:
            toks = line.split()
            assert len(toks) == 2, "malformed syllable override line: %r" % line
            fallback_cache[_normalize_word(toks[0])] = int(toks[1])
def count(word):
    """Estimate the number of syllables in `word` with the fallback heuristic.

    The word is first normalized with _normalize_word(). If fallback_cache
    already holds a positive count for it, that count is returned. Otherwise
    a trailing "e" is dropped, runs of consecutive vowels (a, e, i, o, u, y)
    are counted, one is added for every fallback_addsyl pattern that matches
    and one is subtracted for every fallback_subsyl pattern that matches.
    The result is stored in fallback_cache under the normalized word.

    Returns 0 when the normalized word is empty. The heuristic result is not
    clamped, so a word without vowel groups yields 0.
    """
    word = _normalize_word(word)
    if not word:
        return 0

    # Check for a cached syllable count. Only positive entries short-circuit;
    # anything else falls through and is recomputed.
    cached = fallback_cache.get(word, -1)
    if cached > 0:
        return cached

    # Remove final silent 'e'. Keep `word` untouched so it remains the cache
    # key: caching under the stripped stem would store "bee" as "be" and make
    # later results depend on which words were counted first.
    if word[-1] == "e":
        stem = word[:-1]
    else:
        stem = word

    # Count vowel groups (maximal runs of consecutive vowels)
    total = len(re.findall("[aeiouy]+", stem))

    # Add & subtract syllables
    total += sum(1 for pattern in fallback_addsyl if pattern.search(stem))
    total -= sum(1 for pattern in fallback_subsyl if pattern.search(stem))

    # Cache the syllable count under the word that was actually looked up
    fallback_cache[word] = total
    return total
###
### Phoneme-driven syllable counting
###
def count_decomp(decomp):
    """Return how many units of `decomp` gnoetics.phoneme.is_xstressed() accepts.

    `decomp` is any iterable of units; presumably a phoneme decomposition of
    a word, with each accepted unit standing for one syllable -- TODO confirm
    against callers.
    """
    # NOTE(review): `gnoetics` is not imported in the visible part of this
    # file -- confirm it is in scope wherever this module is loaded.
    return sum(1 for unit in decomp if gnoetics.phoneme.is_xstressed(unit))
|