File: inflect.py

package info (click to toggle)
python-pattern 2.6%2Bgit20150109-3
links: PTS, VCS
area: main
in suites: buster
size: 78,672 kB
sloc: python: 53,865; xml: 11,965; ansic: 2,318; makefile: 94
file content (446 lines) | stat: -rw-r--r-- 16,786 bytes
parent folder | download | duplicates (2)
#### PATTERN | IT | INFLECT ########################################################################
# -*- coding: utf-8 -*-
# Copyright (c) 2013 University of Antwerp, Belgium
# Copyright (c) 2013 St. Lucas University College of Art & Design, Antwerp.
# Author: Tom De Smedt <tom@organisms.be>
# License: BSD (see LICENSE.txt for details).

####################################################################################################
# Regular expressions-based rules for Italian word inflection:
# - pluralization and singularization of nouns,
# - conjugation of verbs,
# - predicative adjectives.

# Accuracy:
# 92% for gender()
# 93% for pluralize()
# 84% for singularize()
# 82% for Verbs.find_lemma()
# 90% for Verbs.find_lexeme()
# 88% for predicative()

import os
import sys
import re

try:
    # Directory that contains this module; also used further down to locate "it-verbs.txt".
    MODULE = os.path.dirname(os.path.realpath(__file__))
except Exception:
    # Best-effort fallback (e.g., __file__ is not defined): use relative paths.
    # Catching Exception instead of a bare except keeps KeyboardInterrupt and
    # SystemExit from being silently swallowed.
    MODULE = ""

# Put the directory four levels above this module first on the import path.
sys.path.insert(0, os.path.join(MODULE, "..", "..", "..", ".."))

# Import Verbs base class and verb tenses.
from pattern.text import Verbs as _Verbs
from pattern.text import (
    INFINITIVE, PRESENT, PAST, FUTURE, CONDITIONAL,
    FIRST, SECOND, THIRD,
    SINGULAR, PLURAL, SG, PL,
    INDICATIVE, IMPERATIVE, SUBJUNCTIVE,
    IMPERFECTIVE, PERFECTIVE, PROGRESSIVE,
    IMPERFECT, PRETERITE,
    PARTICIPLE, GERUND
)

sys.path.pop(0)

# Part-of-speech tags.
VERB = "VB"
NOUN = "NN"
ADJECTIVE = "JJ"
ADVERB = "RB"

# Characters treated as vowels (lowercase only).
VOWELS = "aeiouy"
re_vowel = re.compile(r"a|e|i|o|u|y", re.IGNORECASE)

def is_vowel(ch):
    """ Returns True if the given string occurs in VOWELS.
        The test is case-sensitive; note that the empty string also yields True.
    """
    return ch in VOWELS

#### ARTICLE #######################################################################################

# Inflection gender.
# NOTE: PLURAL and PL are also imported from pattern.text above; they are rebound here.
M, F, N, PL = "m", "f", "n", "p"
MALE, FEMALE, NEUTRAL, PLURAL = M, F, N, PL
MASCULINE, FEMININE, NEUTER = M, F, N

def zs(w):
    """ Returns True if the word starts with z or with s + consonant.
        A falsy word (empty string, None) is returned as-is.
    """
    if not w:
        return w
    first = w[:1]
    return first == "z" or (first == "s" and not is_vowel(w[1:2]))

def definite_article(word, gender=MALE):
    """ Returns the definite article (il, lo, l', la, i, gli, le) for the given word.
        The gender is MALE, FEMALE or a tuple that can also contain PLURAL.
    """
    plural = PLURAL in gender
    male = MALE in gender
    if plural:
        # gli amici, gli studenti, gli zii
        if male and (is_vowel(word[:1]) or zs(word)):
            return "gli"
    else:
        # l'amico, l'amica
        if word and is_vowel(word[:1]):
            return "l'"
        # lo studente, lo zio
        if male and zs(word):
            return "lo"
    if male:
        return "i" if plural else "il"
    if FEMALE in gender:
        return "le" if plural else "la"
    return "il"

def indefinite_article(word, gender=MALE):
    """ Returns the indefinite article (un, uno, un', una) for the given word.
        If the gender contains PLURAL, the partitive form (dei, degli, delle) is returned.
        The gender is MALE, FEMALE or a tuple that can also contain PLURAL.
    """
    plural = PLURAL in gender
    # An empty word must not count as vowel-initial (is_vowel("") is True).
    vowel = bool(word) and is_vowel(word[:1])
    if MALE in gender:
        if plural:
            # Same condition as definite "gli": degli amici, degli studenti, degli zii
            # (but: dei gatti).
            return "degli" if (vowel or zs(word)) else "dei"
        # uno studente, uno zio (but: un amico, un gatto)
        return "uno" if zs(word) else "un"
    if FEMALE in gender:
        if plural:
            return "delle"
        # un'amica (but: una casa)
        return "un'" if vowel else "una"
    return "un"

# Values accepted by the "function" parameter of article().
DEFINITE = "definite"
INDEFINITE = "indefinite"

def article(word, function=INDEFINITE, gender=MALE):
    """ Returns the definite article for the given word if function is DEFINITE,
        otherwise the indefinite article.
    """
    if function == DEFINITE:
        return definite_article(word, gender)
    return indefinite_article(word, gender)

# Alias, so article() stays reachable where a parameter named "article" shadows it.
_article = article

def referenced(word, article=INDEFINITE, gender=MALE):
    """ Returns a string with the article + the word.
        An article that ends in an apostrophe (l', un') is attached without a space,
        any other article is separated from the word by a single space.
    """
    a = _article(word, article, gender)
    # Join directly instead of substituting a "&space;" placeholder afterwards,
    # which would also rewrite a word that happens to contain that placeholder.
    separator = "" if a.endswith("'") else " "
    return "%s%s%s" % (a, separator, word)

#### GENDER #########################################################################################

def gender(word):
    """ Returns the gender for the given word, either:
        MALE, FEMALE, (MALE, FEMALE), (MALE, PLURAL) or (FEMALE, PLURAL).
        The guess is based on the word's suffix; the first matching rule wins.
    """
    w = word.lower()
    rules = (
        # Adjectives ending in -e: cruciale, difficile, ...
        (("ale", "ile", "ese", "nte"), (MALE, FEMALE)),
        # Most nouns ending in -a (-e) are feminine, -o (-i) masculine:
        (("ore", "ista", "mma"), MALE),
        (("a", u"tà", u"tù", "ione", "rice"), FEMALE),
        (("e", "oni"), (FEMALE, PLURAL)),
        (("i",), (MALE, PLURAL)),
    )
    for suffixes, result in rules:
        if w.endswith(suffixes):
            return result
    # Words ending in -o, and anything else, default to masculine.
    return MALE

#### PLURALIZE ######################################################################################

# Nouns ending in -co that keep the hard consonant in the plural: -co => -chi.
# "strascico" => "strascichi"; "strascio" is kept so existing lookups keep working.
plural_co_chi = set((
    "abbaco", "baco", "cuoco", "manico", "rammarico", "stomaco", "strascico", "strascio",
    "valico" # ...
))

# Nouns ending in -go that keep the hard consonant in the plural: -go => -ghi.
plural_go_ghi = set((
    "albergo", "catalogo", "chirurgo", "dialogo", "fungo", "monologo" # ...
))

# Irregular plurals (singular => plural).
# Some nouns have a second plural with a different meaning, noted per entry.
plural_irregular = {
    "braccio": "braccia",  # bracci (arms of a lamp or cross)
    "budello": "budelli",  # budella (intestines)
    "camicia": "camicie",
    "bue": "buoi",
    "dio": "dei",
    "dito": "dita",
    "doccia": "docce",
    "inizio": "inizi",
    "labbro": "labbra",  # labbri (borders)
    "mano": "mani",
    "negozio": "negozi",
    "osso": "ossa",  # ossi (dog bones)
    "uomo": "uomini",
    "uovo": "uova",
}

def pluralize(word, pos=NOUN, custom={}):
    """ Returns the plural of a given word.
        The custom dictionary (word => plural) is consulted first, using the word as given;
        it is only read, never modified.
        All other results are lowercase. Words shorter than 3 characters are not inflected.
        The pos parameter is accepted but not used by these rules.
    """
    if word in custom:
        return custom[word]
    w = word.lower()
    if len(w) < 3:
        return w
    if w in plural_irregular:
        return plural_irregular[w]
    # provincia => province (but: socia => socie)
    if w.endswith(("cia", "gia")) and len(w) > 4 and not is_vowel(w[-4]):
        return w[:-2] + "e"
    # amica => amiche
    if w.endswith(("ca", "ga")):
        return w[:-2] + "he"
    # studentessa => studentesse
    if w.endswith("a"):
        return w[:-1] + "e"
    # studente => studenti
    if w.endswith("e"):
        return w[:-1] + "i"
    # viaggio => viaggi (but: leggìo => leggìi)
    if w.endswith("io"):
        return w[:-2] + "i"
    # abbaco => abbachi, albergo => alberghi
    # Both exception lists are consulted, and the final consonant is kept, so a word
    # gets the right plural regardless of which of the two lists it is filed under.
    if w in plural_co_chi or w in plural_go_ghi:
        return w[:-1] + "hi"
    # amico => amici
    if w.endswith("o"):
        return w[:-1] + "i"
    return w

#### SINGULARIZE ###################################################################################

# (plural suffix, singular suffix) pairs, tried in this order by singularize();
# the first suffix that matches the end of the word is applied.
singular_majority_vote = [
    ("tenti", "tente"),
    ("anti", "ante"),
    ("oni", "one"),
    ("nti", "nto"),
    ("ali", "ale"),
    ("ici", "ico"),
    ("nze", "nza"),
    ("ori", "ore"),
    ("che", "ca"),
    ("ati", "ato"),
    ("ari", "ario"),
    ("tti", "tto"),
    ("eri", "ero"),
    ("chi", "co"),
    ("ani", "ano"),
    ("ure", "ura"),
    (u"ità", u"ità"),
    ("ivi", "ivo"),
    ("ini", "ino"),
    ("iti", "ito"),
    ("emi", "ema"),
    ("ili", "ile"),
    ("oli", "olo"),
    ("esi", "ese"),
    ("ate", "ata"),
    ("ssi", "sso"),
    ("rie", "ria"),
    ("ine", "ina"),
    ("lli", "llo"),
    ("ggi", "ggio"),
    ("tri", "tro"),
    ("imi", "imo"),
]

# Inverse of plural_irregular (plural => singular).
singular_irregular = {plural: singular for singular, plural in plural_irregular.items()}

def singularize(word, pos=NOUN, custom={}):
    """ Returns the singular of a given word.
        The custom dictionary (word => singular) is consulted first, using the word as given;
        it is only read, never modified.
        All other results are lowercase. With pos="DT" the word is treated as an article.
        Words shorter than 3 characters are otherwise not inflected.
    """
    if word in custom:
        return custom[word]
    w = word.lower()
    # Articles: i gatti => il gatto, le case => la casa.
    if pos == "DT":
        if w in ("i", "gli"):
            return "il"
        if w == "le":
            return "la"
        return w
    if len(w) < 3:
        return w
    if w in singular_irregular:
        return singular_irregular[w]
    # Ruleset adds 16% accuracy.
    for a, b in singular_majority_vote:
        if w.endswith(a):
            return w[:-len(a)] + b
    # The rules below only apply to suffixes not already caught by the ruleset above.
    # Probably an adjective ending in -e: cruciale, difficile, ...
    if w.endswith(("ali", "ari", "ili", "esi", "nti")):
        return w[:-1] + "e"
    # realisti => realista
    if w.endswith("isti"):
        return w[:-1] + "a"
    # amiche => amica
    if w.endswith(("che", "ghe")):
        return w[:-2] + "a"
    # alberghi => albergo
    if w.endswith(("chi", "ghi")):
        return w[:-2] + "o"
    # problemi => problema
    if w.endswith("emi"):
        return w[:-1] + "a"
    # case => casa
    if w.endswith("e"):
        return w[:-1] + "a"
    # Ambiguous: both -o and -a pluralize to -i.
    if w.endswith("i"):
        return w[:-1] + "o"
    return w

#### VERB CONJUGATION ##############################################################################
# The verb table was trained on Wiktionary and contains the 1,250 most frequent verbs.

# (inflected suffix, infinitive suffix) pairs, tried in this order by Verbs.find_lemma();
# the first suffix that matches the end of the verb is applied.
verb_majority_vote = [
    ("iresti", "ire"),
    ("ireste", "ire"),
    ("iremmo", "ire"),
    ("irebbe", "ire"),
    ("iranno", "ire"),
    ("ssero", "re"),
    ("ssimo", "re"),
    ("ivate", "ire"),
    ("ivamo", "ire"),
    ("irete", "ire"),
    ("iremo", "ire"),
    ("irono", "ire"),
    ("scano", "re"),
    ("hiamo", "are"),
    ("scono", "re"),
    ("hiate", "are"),
    ("vano", "re"),
    ("vate", "re"),
    ("vamo", "re"),
    ("simo", "e"),
    ("rono", "re"),
    ("isse", "ire"),
    ("isti", "ire"),
    ("tino", "tare"),
    ("tato", "tare"),
    ("irai", "ire"),
    ("tavo", "tare"),
    ("tavi", "tare"),
    ("tava", "tare"),
    ("tate", "tare"),
    ("iste", "ire"),
    ("irei", "ire"),
    ("immo", "ire"),
    (u"rerò", "rare"),
    (u"rerà", "rare"),
    ("iavo", "iare"),
    ("iavi", "iare"),
    ("iava", "iare"),
    ("iato", "iare"),
    ("iare", "iare"),
    ("hino", "are"),
    ("ssi", "re"),
    ("sse", "re"),
    ("ndo", "re"),
    (u"irò", "ire"),
    ("tai", "tare"),
    ("ite", "ire"),
    (u"irà", "ire"),
    ("sco", "re"),
    ("sca", "re"),
    ("iai", "iare"),
    ("ii", "ire"),
    ("hi", "are"),
]

class Verbs(_Verbs):
    """ Italian verbs, backed by "it-verbs.txt" in the module folder.
        find_lemma() and find_lexeme() guess the base form and the inflected forms
        of a verb with suffix rules.
    """
    
    def __init__(self):
        _Verbs.__init__(self, os.path.join(MODULE, "it-verbs.txt"),
            language = "it",
             default = {},
              # Passed through to the base class unchanged.
              # NOTE(review): presumably tense identifiers defined by pattern.text - confirm.
              format = [
                0, 1, 2, 3, 4, 5, 6, 8,     # indicativo presente
                34, 35, 36, 37, 38, 39, 24, # indicativo passato remoto
                17, 18, 19, 20, 21, 22,     # indicativo imperfetto
                40, 41, 42, 43, 44, 45,     # indicativo futuro semplice
                46, 47, 48, 49, 50, 51,     # condizionale presente
                    52, 521,53, 54, 541,    # imperativo
                55, 56, 57, 58, 59, 60,     # congiuntivo presente
                67, 68, 69, 70, 71, 72      # congiuntivo imperfetto
            ])
    
    def find_lemma(self, verb):
        """ Returns the base form of the given inflected verb, using a rule-based approach.
            The result is lowercase. The rules are tried in order and the first match wins.
        """
        v = verb.lower()
        # Probably infinitive if ends in -are, -ere, -ire or reflexive -rsi.
        if v.endswith(("are", "ere", "ire", "rsi")):
            return v
        # Ruleset adds 3% accuracy.
        for a, b in verb_majority_vote:
            if v.endswith(a):
                return v[:-len(a)] + b
        # Spelling adjustments applied to the whole word before the suffix rules below.
        v = v.replace("cha", "ca")
        v = v.replace("che", "ce")
        v = v.replace("gha", "ga")
        v = v.replace("ghe", "ge")
        v = v.replace("ghi", "gi")
        v = v.replace("gge", "ggie")
        # Many verbs end in -ire and have a regular inflection:
        for x in ((
          u"irò", "irai", u"irà", "iremo", "irete", "iranno",         # future
          "irei", "iresti", "irebbe", "iremmo", "ireste", "irebbero", # conditional
          "ascano",                                                   # subjunctive I
          "issi", "isse", "issimo", "iste", "issero",                 # subjunctive II
          "ivo", "ivi", "iva", "ivamo", "ivate", "ivano",             # past imperfective
          "isti", "immo", "iste", "irono", "ito",                     # past perfective
          "isco", "isci", "isce", "ite", "iscono", "indo")):          # present
            if v.endswith(x):
                return v[:-len(x)] + "ire"
        # Many verbs end in -are and have a regular inflection:
        for x in ((
          u"erò", "erai", u"erà", "eremo", "erete", "eranno",         # future
          "erei", "eresti", "erebbe", "eremmo", "ereste", "erebbero", # conditional
          "iamo", "iate", "ino",                                      # subjunctive I
          "assi", "asse", "assimo", "aste", "assero",                 # subjunctive II
          "avo", "avi", "ava", "avamo", "avate", "avano",             # past imperfective
          "ai", "asti", u"ò", "ammo", "aste", "arono", "ato",         # past perfective
          "iamo", "ate", "ano", "ando")):                             # present
            if v.endswith(x):
                return v[:-len(x)] + "are"
        # Many verbs end in -ere and have a regular inflection:
        for x in ((
          "essi", "esse", "essimo", "este", "essero",                 # subjunctive II
          "evo", "evi", "eva", "evamo", "evate", "evano",             # past imperfective
          "ei", "esti", u"è", "emmo", "este", "erono", "eto",         # past perfective
          "ete", "ono", "endo")):                                     # present
            if v.endswith(x):
                return v[:-len(x)] + "ere"
        # Last resort: decide on the final character only.
        if v.endswith(u"à"):
            return v[:-1] + "e"
        if v.endswith(u"ì"):
            return v[:-1] + "ire"
        if v.endswith(u"e"):
            return v[:-1] + "ere"
        if v.endswith(("a", "i", "o")):
            return v[:-1] + "are"
        return v

    def find_lexeme(self, verb):
        """ For a regular verb (base form), returns the forms using a rule-based approach.
            The result is a list of lowercase strings: the verb itself,
            followed by the inflected forms in a fixed order.
        """
        v = verb.lower()
        # Normalize reflexive (-rsi, -rci) and contracted (-rre) infinitives to end in -re.
        v = re.sub(r"rci$", "re", v)
        v = re.sub(r"rsi$", "re", v)
        v = re.sub(r"rre$", "re", v)
        # Stem = infinitive without the last three characters.
        b = v[:-3]
        # Test the normalized infinitive (not the raw argument), so that uppercase
        # and reflexive input (e.g., dimenticarsi) get the same stem adjustments.
        if v.endswith(("care", "gare")):
            b += "h"   # moltiplicare => tu moltiplichi
        if v.endswith(("ciare", "giare")):
            b = b[:-1] # cominciare => tu cominci
        # a1-a7 = vowels inserted between the stem and the endings in the list below.
        if v.endswith("are"):
            # -are = 1st conjugation
            a1, a2, a3, a4, a5, a6, a7 = "a", "a", u"ò", "a", "i", "e", "a"
        elif v.endswith("ere"):
            # -ere = 2nd conjugation
            a1, a2, a3, a4, a5, a6, a7 = "e", "o", u"è", "i", "a", "e", "e"
        elif v.endswith("ire"):
            # -ire = 3rd conjugation
            a1, a2, a3, a4, a5, a6, a7 = "i", "o", "i", "i", "a", "i", "e"
        else:
            # -orre, -urre = use 2nd conjugation
            a1, a2, a3, a4, a5, a6, a7 = "e", "o", u"è", "i", "a", "e", "e"
        # This tests the original spelling, so reflexive -irsi verbs do not get -isc.
        if verb.lower().endswith("ire"):
            # -ire verbs can add -isc between the root and declination.
            isc = "isc"
        else:
            isc = ""
        v = [verb.lower(),
            b+isc+"o", b+isc+"i", b+isc+a7, b+"iamo", b+a1+"te", b+isc+a2+"no", b+a1+"ndo",
            b+a1+"i", b+a1+"sti", b+a3, b+a1+"mmo", b+a1+"ste", b+a1+"rono", b+a1+"to",
            b+a1+"vo", b+a1+"vi", b+a1+"va", b+a1+"vamo", b+a1+"vate", b+a1+"vano",
            b+a6+u"rò", b+a6+"rai", b+a6+u"rà", b+a6+"remo", b+a6+"rete", b+a6+"ranno",
            b+a6+"rei", b+a6+"resti", b+a6+"rebbe", b+a6+"remmo", b+a6+"reste", b+a6+"rebbero",
            b+isc+a4, b+isc+a5, b+"iamo", b+a1+"te", b+isc+a5+"no",
            b+isc+a5, b+isc+a5, b+isc+a5, b+"iamo", b+"iate", b+isc+a5+"no",
            b+a1+"ssi", b+a1+"ssi", b+a1+"sse", b+a1+"ssimo", b+a1+"ste", b+a1+"ssero"
        ]
        # Spelling clean-up: the h added to -care/-gare stems is only kept before e and i.
        for i, x in enumerate(v):
            x = x.replace(  "ii",  "i")
            x = x.replace( "cha",  "ca")
            x = x.replace( "gha",  "ga")
            x = x.replace( "gga",  "ggia")
            x = x.replace( "cho",  "co")
            x = x.replace(u"chò", u"cò")
            # Same as -cho/-chò, for -gare verbs: pagho => pago, paghò => pagò.
            x = x.replace( "gho",  "go")
            x = x.replace(u"ghò", u"gò")
            v[i] = x
        return v

# Module-level Verbs instance; its bound methods are exposed as module functions.
verbs = Verbs()

conjugate = verbs.conjugate
lemma = verbs.lemma
lexeme = verbs.lexeme
tenses = verbs.tenses

#### ATTRIBUTIVE & PREDICATIVE #####################################################################

# Shortened or irregular adjective forms => predicative form, looked up by predicative().
adjective_predicative = {
    "bei": "bello",
    "bel": "bello",
    "bell'": "bello",
    "begli": "bello",
    "buon": "buono",
    "buon'": "buona",
    "gran": "grande",
    "grand'": "grande",
    "grandi": "grande",
    "san": "santo",
    "sant'": "santa",
}

def attributive(adjective):
    """ For a predicative adjective, returns the attributive form.
        Not implemented: always raises NotImplementedError.
    """
    # An implementation must deal with feminine and plural.
    raise NotImplementedError()

def predicative(adjective):
    """ Returns the predicative adjective.
        Forms listed in adjective_predicative are looked up; any other word is reduced
        with suffix rules. Results are lowercase, except that a word matching no rule
        is returned exactly as given.
    """
    w = adjective.lower()
    if w in adjective_predicative:
        return adjective_predicative[w]
    # (suffixes, number of trailing characters to drop, ending to append);
    # the first matching rule wins.
    rules = (
        (("ari",), 0, "o"),
        (("ali", "ili", "esi", "nti", "ori"), 1, "e"),
        (("isti",), 1, "a"),
        (("che", "ghe"), 2, "a"),
        (("chi", "ghi"), 2, "o"),
        (("i",), 1, "o"),
        (("e",), 1, "a"),
    )
    for suffixes, drop, ending in rules:
        if w.endswith(suffixes):
            return w[:len(w) - drop] + ending
    return adjective