File: inflect.py

package info (click to toggle)
python-pattern 2.6%2Bgit20150109-3
links: PTS, VCS
area: main
in suites: buster
size: 78,672 kB
sloc: python: 53,865; xml: 11,965; ansic: 2,318; makefile: 94
file content (300 lines) | stat: -rw-r--r-- 12,879 bytes
parent folder | download | duplicates (2)
#### PATTERN | FR | INFLECT ########################################################################
# -*- coding: utf-8 -*-
# Copyright (c) 2013 University of Antwerp, Belgium
# Author: Tom De Smedt <tom@organisms.be>
# License: BSD (see LICENSE.txt for details).

####################################################################################################
# Regular expressions-based rules for French word inflection:
# - pluralization and singularization of nouns,
# - conjugation of verbs,
# - predicative and attributive of adjectives.

# Accuracy:
# 92% for pluralize()
# 93% for singularize()
# 80% for Verbs.find_lemma() (mixed regular/irregular)
# 86% for Verbs.find_lexeme() (mixed regular/irregular)
# 95% predicative() (measured on Lexique French morphology word forms)

import os
import sys
import re

try:
    MODULE = os.path.dirname(os.path.realpath(__file__))
except:
    MODULE = ""
    
sys.path.insert(0, os.path.join(MODULE, "..", "..", "..", ".."))

from pattern.text import Verbs as _Verbs
from pattern.text import (
    INFINITIVE, PRESENT, PAST, FUTURE,
    FIRST, SECOND, THIRD,
    SINGULAR, PLURAL, SG, PL,
    INDICATIVE, IMPERATIVE, SUBJUNCTIVE, CONDITIONAL,
    IMPERFECTIVE, PERFECTIVE, PROGRESSIVE,
    IMPERFECT, PRETERITE,
    PARTICIPLE, GERUND
)

sys.path.pop(0)

VERB, NOUN, ADJECTIVE, ADVERB = "VB", "NN", "JJ", "RB"

VOWELS = ("a", "e", "i", "o", "u")
re_vowel = re.compile(r"a|e|i|o|u", re.I)
is_vowel = lambda ch: ch in VOWELS

#### PLURALIZE #####################################################################################

plural_irregular = {
       "bleu": "bleus",
       "pneu": "pneus",
    "travail": "travaux",
    "vitrail": "vitraux"
}

def pluralize(word, pos=NOUN, custom={}):
    """ Returns the plural of a given word.
        The custom dictionary is for user-defined replacements.
    """
    if word in custom:
        return custom[word]
    w = word.lower()
    if w in plural_irregular:
        return plural_irregular[w]
    if w.endswith(("ais", "ois")):
        return w + "es"
    if w.endswith(("s", "x")):
        return w
    if w.endswith("al"):
        return w[:-2] + "aux"
    if w.endswith(("au", "eu")):
        return w + "x"
    return w + "s"

#### SINGULARIZE ###################################################################################

def singularize(word, pos=NOUN, custom={}):
    if word in custom:
        return custom[word]
    w = word.lower()
    # Common articles, determiners, pronouns:
    if pos in ("DT", "PRP", "PRP$", "WP", "RB", "IN"):
        if w == "du" : return "de"
        if w == "ces": return "ce"
        if w == "les": return "le"
        if w == "des": return "un"
        if w == "mes": return "mon"
        if w == "ses": return "son"
        if w == "tes": return "ton"
        if w == "nos": return "notre"
        if w == "vos": return "votre"
        if w.endswith(("'", u"’")):
            return w[:-1] + "e"
    if w.endswith("nnes"):  # parisiennes => parisien
        return w[:-3]
    if w.endswith("ntes"):  # passantes => passant
        return w[:-2]
    if w.endswith("euses"): # danseuses => danseur
        return w[:-3] + "r"
    if w.endswith("s"):
        return w[:-1]
    if w.endswith(("aux", "eux", "oux")):
        return w[:-1]  
    if w.endswith("ii"):
        return w[:-1] + "o"
    if w.endswith(("ia", "ma")):
        return w[:-1] + "um"
    if "-" in w:
        return singularize(w.split("-")[0]) + "-" + "-".join(w.split("-")[1:])
    return w

#### VERB CONJUGATION ##############################################################################

verb_inflections = [
    ("issaient", "ir"   ), ("eassions",  "er"   ), ("dissions", "dre" ), (u"çassions", "cer"  ),
    ( "eraient", "er"   ), ( "assions",  "er"   ), ( "issions", "ir"  ), (  "iraient", "ir"   ),
    ( "isaient", "ire"  ), ( "geaient",  "ger"  ), ( "eassent", "er"  ), (  "geasses", "ger"  ),
    ( "eassiez", "er"   ), ( "dissiez",  "dre"  ), ( "dissent", "dre" ), (  "endrons", "endre"),
    ( "endriez", "endre"), ( "endrais",  "endre"), (  "erions", "er"  ), (   "assent", "er"   ),
    (  "assiez", "er"   ), (  "raient",  "re"   ), (  "issent", "ir"  ), (   "issiez", "ir"   ),
    (  "irions", "ir"   ), (  "issons",  "ir"   ), (  "issant", "ir"  ), (   "issait", "ir"   ),
    (  "issais", "ir"   ), (   "aient",  "er"   ), (  u"èrent", "er"  ), (    "erait", "er"   ),
    (   "eront", "er"   ), (   "erons",  "er"   ), (   "eriez", "er"  ), (    "erais", "er"   ),
    (   "asses", "er"   ), (   "rions",  "re"   ), (   "isses", "ir"  ), (    "irent", "ir"   ),
    (   "irait", "ir"   ), (   "irons",  "ir"   ), (   "iriez", "ir"  ), (    "irais", "ir"   ),
    (   "iront", "ir"   ), (   "issez",  "ir"   ), (    "ions", "er"  ), (     "erez", "er"   ),
    (    "eras", "er"   ), (    "erai",  "er"   ), (    "asse", "er"  ), (    u"âtes", "er"   ),
    (   u"âmes", "er"   ), (    "isse",  "ir"   ), (   u"îtes", "ir"  ), (    u"îmes", "ir"   ),
    (    "irez", "ir"   ), (    "iras",  "ir"   ), (    "irai", "ir"  ), (     "ront", "re"   ),
    (     "iez", "er"   ), (     "ent",  "er"   ), (     "ais", "er"  ), (      "ons", "er"   ),
    (     "ait", "er"   ), (     "ant",  "er"   ), (     "era", "er"  ), (      "ira", "ir"   ),
    (      "es", "er"   ), (      "ez",  "er"   ), (      "as", "er"  ), (       "ai", "er"   ),
    (     u"ât", "er"   ), (      "ds",  "dre"  ), (      "is", "ir"  ), (       "it", "ir"   ),
    (     u"ît", "ir"   ), (     u"ïr", u"ïr"   ), (      "nd", "ndre"), (       "nu", "nir"  ),
    (       "e", "er"   ), (      u"é",  "er"   ), (       "a", "er"  ), (        "t", "re"   ),
    (       "s", "re"   ), (       "i",  "ir"   ), (      u"û", "ir"  ), (        "u", "re"   ),          
    (       "d", "dre"  )
]

class Verbs(_Verbs):
    
    def __init__(self):
        _Verbs.__init__(self, os.path.join(MODULE, "fr-verbs.txt"),
            language = "fr",
             default = {},
              format = [
                0, 1, 2, 3, 4, 5, 6, 8, 24, # indicatif présent
                34, 35, 36, 37, 38, 39,     # indicatif passé simple
                17, 18, 19, 20, 21, 22,     # indicatif imparfait
                40, 41, 42, 43, 44, 45,     # indicatif futur simple
                46, 47, 48, 49, 50, 51,     # conditionnel présent
                52, 53, 54,                 # impératif présent
                55, 56, 57, 58, 59, 60,     # subjonctif présent
                67, 68, 69, 70, 71, 72      # subjonctif imparfait
            ])
    
    def find_lemma(self, verb):
        """ Returns the base form of the given inflected verb, using a rule-based approach.
        """
        # French has 20,000+ verbs, ending in -er (majority), -ir, -re.
        v = verb.lower()
        if v.endswith(("er", "ir", "re")):
            return v
        for a, b in verb_inflections:
            if v.endswith(a):
                return v[:-len(a)] + b
        return v

    def find_lexeme(self, verb):
        """ For a regular verb (base form), returns the forms using a rule-based approach.
        """
        v = verb.lower()
        b = v[:-2]
        if v.endswith("ir") and not \
           v.endswith(("couvrir", "cueillir", u"découvrir", "offrir", "ouvrir", "souffrir")):
            # Regular inflection for verbs ending in -ir.
            # Some -ir verbs drop the last letter of the stem: dormir => je dors (not: je dormis).
            if v.endswith(("dormir", "mentir", "partir", "sentir", "servir", "sortir")):
                b0 = b[:-1]
            else:
                b0 = b + "i"
            return [v, 
                b0+"s", b0+"s", b0+"t", b+"issons", b+"issez", b+"issent", b+"issant", b+"i",
                b+"is", b+"is", b+"it", b+u"îmes", b+u"îtes", b+"irent",
                b+"issais", b+"issais", b+"issait", b+"issions", b+"issiez", b+"issaient",
                v+"ai", v+"as", v+"a", v+"ons", v+"ez", v+u"ont",            
                v+"ais", v+"ais", v+"ait", v+"ions", v+"iez", v+"aient",
                b+"is", b+"issons", b+"issez",
                b+"isse", b+"isses", b+"isse", b+"issions", b+"issiez", b+"issent",
                b+"isse", b+"isses", b+u"ît", b+"issions", b+"issiez", b+"issent"
            ]
        elif v.endswith("re"):
            # Regular inflection for verbs ending in -re.
            # Verbs ending in -attre and -ettre drop the -t in the singular form.
            if v.endswith(("ttre")):
                b0 = b1 = b[:-1]
            else:
                b0 = b1 = b
            # Verbs ending in -aindre, -eindre and -oindre drop the -d.
            if v.endswith("indre"):
                b0, b1 = b[:-1], b[:-2] + "gn"
            # Verbs ending in -prendre drop the -d in the plural form.
            if v.endswith("prendre"):
                b0, b1 = b, b[:-1]
            return [v, 
                b0+"s", b0+"s", b0+"", b1+"ons", b1+"ez", b1+"ent", b1+"ant", b+"u",
                b+"is", b+"is", b+"it", b1+u"îmes", b1+u"îtes", b1+"irent",
                b+"ais", b+"ais", b+"ait", b1+"ions", b1+"iez", b1+"aient",
                b+"rai", b+"ras", b+"ra", b+"rons", b+"rez", b+"ront",            
                b+"ais", b+"ais", b+"ait", b1+"ions", b1+"iez", b1+"aient",
                b0+"s", b1+"ons", b1+"ez",
                b+"e", b+"es", b+u"e", b1+"ions", b1+"iez", b1+"ent",
                b+"isse", b+"isses", b+u"ît", b1+"issions", b1+"issiez", b1+"issent"
            ]
        else:
            # Regular inflection for verbs ending in -er.
            # If the stem ends in -g, use -ge before hard vowels -a and -o: manger => mangeons.
            # If the stem ends in -c, use -ç before hard vowels -a and -o: lancer => lançons.
            e = v.endswith("ger") and u"e" or ""
            c = v.endswith("cer") and b[:-1]+u"ç" or b
            return [v, 
                b+"e", b+"es", b+"e", c+e+"ons", b+"ez", b+"ent", c+e+"ant", b+u"é",
                c+e+"ai", c+e+"as", c+e+"a", c+e+u"âmes", c+e+u"âtes", b+u"èrent",
                c+e+"ais", c+e+"ais", c+e+"ait", b+"ions", b+"iez", c+e+"aient",
                v+"ai", v+u"as", v+"a", v+"ons", v+"ez", v+"ont",            
                v+"ais", v+u"ais", v+"ait", v+"ions", v+"iez", v+"aient",
                b+"e", c+e+u"ons", b+"ez",
                b+"e", b+u"es", b+"e", b+"ions", b+"iez", b+"ent",
                c+e+"asse", c+e+"asses", c+e+u"ât", c+e+"assions", c+e+"assiez", c+e+"assent"
            ]

verbs = Verbs()

conjugate, lemma, lexeme, tenses = \
    verbs.conjugate, verbs.lemma, verbs.lexeme, verbs.tenses

#### ATTRIBUTIVE & PREDICATIVE #####################################################################

def attributive(adjective):
    """ For a predicative adjective, returns the attributive form.
    """
    # Must deal with feminine and plural.
    raise NotImplementedError

def predicative(adjective): 
    """ Returns the predicative adjective (lowercase): belles => beau.
    """
    w = adjective.lower()
    if w.endswith(("ais", "ois")):
        return w
    if w.endswith((u"és", u"ée", u"ées")):
        return w.rstrip("es")
    if w.endswith(("que", "ques")):
        return w.rstrip("s")
    if w.endswith(("nts", "nte", "ntes")):
        return w.rstrip("es")
    if w.endswith("eaux"):
        return w.rstrip("x")
    if w.endswith(("aux", "ale", "ales")):
        return w.rstrip("uxles") + "l"
    if w.endswith(("rteuse", "rteuses", "ailleuse")):
        return w.rstrip("es") + "r"
    if w.endswith(("euse", "euses")):
        return w.rstrip("es") + "x"
    if w.endswith(("els", "elle", "elles")):
        return w.rstrip("les") + "el"
    if w.endswith(("ifs", "ive", "ives")):
        return w.rstrip("es")[:-2] + "if"
    if w.endswith(("is", "ie", "ies")):
        return w.rstrip("es")
    if w.endswith(("enne", "ennes")):
        return w.rstrip("nes") + "en"
    if w.endswith(("onne", "onnes")):
        return w.rstrip("nes") + "n"
    if w.endswith(("igne", "ignes", "ingue", "ingues")):
        return w.rstrip("s")
    if w.endswith((u"ène", u"ènes")):
        return w.rstrip("s")
    if w.endswith(("ns", "ne", "nes")):
        return w.rstrip("es")
    if w.endswith(("ite", "ites")):
        return w.rstrip("es")
    if w.endswith(("is", "ise", "ises")):
        return w.rstrip("es") + "s"
    if w.endswith(("rice", "rices")):
        return w.rstrip("rices") + "eur"
    if w.endswith(("iers", u"ière", u"ières")):
        return w.rstrip("es")[:-3] + "ier"
    if w.endswith(("ette", "ettes")):
        return w.rstrip("tes") + "et"
    if w.endswith(("rds", "rde", "rdes")):
        return w.rstrip("es")
    if w.endswith(("nds", "nde", "ndes")):
        return w.rstrip("es")
    if w.endswith(("us", "ue", "ues")):
        return w.rstrip("es")
    return w.rstrip("s")