# This is a set of language-code synonyms for pragmatic use in NLP. It is
# based on working with Tesseract 3.04, but should be applicable elsewhere.
from typing import Iterable
# Groups of language codes that are treated as interchangeable. Each tuple is
# one group. Order matters: lookups scan this list from the top and use the
# first group that contains the requested code.
LANG_SYNONYMS = [
    ("srp", "hbs", "hrv", "bos"),
    # NOTE(review): ISO 639-2 lists Albanian as "sqi"/"alb"; "sli" may be a
    # typo for "sqi" -- confirm before changing.
    ("sli", "alb"),
    ("slk", "slo"),
    ("ron", "rum"),
    ("nor", "non"),
    ("nld", "dut"),
    ("mya", "bur"),
    ("msa", "may"),
    ("mkd", "mac"),
    ("kat", "geo"),
    ("isl", "ice"),
    ("fre", "fra"),
    ("fas", "per"),
    ("eus", "baq"),
    ("ell", "gre"),
    ("ger", "deu"),
    ("wel", "cym"),
    ("chi_sim", "chi_tra", "chi", "zho"),
    ("ces", "cze"),
    ("bod", "tib"),
    ("aze_cyrl", "aze"),
    ("fil", "tgl"),
    ("nep", "npi"),
    # NOTE(review): "bur" and "mya" also appear in the ("mya", "bur") group
    # above. Because the first matching group wins, those two codes resolve to
    # that pair, and this wider group is only reached via its other members.
    # Confirm whether that asymmetry is intended.
    ("bur", "mya", "int", "tvn", "tco", "rki", "rmz"),
]
def expand_synonyms(language: str) -> Iterable[str]:
    """Return the synonym group that contains ``language``.

    The groups in ``LANG_SYNONYMS`` are scanned in order and the first one
    containing ``language`` is returned as stored (the same object, not a
    copy). When no group matches, a one-element list holding ``language``
    itself is returned instead.
    """
    # Lazy scan: stops at the first group that mentions the code.
    matching_groups = (group for group in LANG_SYNONYMS if language in group)
    return next(matching_groups, [language])
|