File: test_data_languages.py

#!/usr/bin/env python3
#
# Copyright 2022 Google LLC All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
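"""Tests for the gflanguages data files.

These checks validate exemplar characters, sample texts, script and
region references, and language-name structure for every language
record shipped with the package.
"""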
from collections import defaultdict, Counter
import regex
import unicodedata

from gflanguages import (
    LoadLanguages,
    languages_public_pb2,
    LoadScripts,
    LoadRegions,
    parse,
)
import pytest
import youseedee


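# Load each data set once at module import; the tests below parametrise
# over these dictionaries.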
LANGUAGES = LoadLanguages()
SCRIPTS = LoadScripts()
REGIONS = LoadRegions()

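# CLDR script display names whose spelling differs from the script names
# in the Unicode Character Database (UCD); script names are mapped through
# this table before being compared against per-character Script properties.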
CLDR_SCRIPT_TO_UCD_SCRIPT = {
    "Bangla": "Bengali",
    "Traditional Han": "Han",
    "Simplified Han": "Han",
    "Korean": "Hangul",
    "Odia": "Oriya",
    "Makasar": "Buginese",
    "Lanna": "Tai Tham",
    "Unified Canadian Aboriginal Syllabics": "Canadian Aboriginal",
    "S-A Cuneiform": "Cuneiform",
    "Pollard Phonetic": "Miao",
    "Egyptian hieroglyphs": "Egyptian Hieroglyphs",
    "Zanabazar": "Zanabazar Square",
    "Nüshu": "Nushu",
    "Mandaean": "Mandaic",
    "N’Ko": "Nko",
    "Varang Kshiti": "Warang Citi",
    "Mende": "Mende Kikakui",
    "Phags-pa": "Phags Pa",
    "Fraser": "Lisu",
    "Georgian Khutsuri": "Georgian",
    "Orkhon": "Old Turkic",
}

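# Languages whose exemplars or sample text legitimately mix scripts,
# keyed by language id; the value is reported as the pytest skip reason.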
SKIP_EXEMPLARS = {
    "ja_Jpan": "Contains multiple scripts",
    "aii_Cyrl": "Does indeed use Latin glyphs while writing Cyrillic",
    "sel_Cyrl": "Does indeed use Latin glyphs while writing Cyrillic",
    "ykg_Cyrl": "Does indeed use Latin glyphs (w) while writing Cyrillic",
    "ady_Cyrl": "Does indeed use Latin glyphs (w) while writing Cyrillic",
    "sla_Latn": "Does indeed use Cyrillic glyphs (ь) when written in Latin",
    "coo_Latn": "Does indeed use Greek glyphs while writing Latin",
    "hur_Latn": "Does indeed use Greek glyphs while writing Latin",
    "kwk_Latn": "Does indeed use Greek glyphs while writing Latin",
    "thp_Latn": "Does indeed use Greek glyphs while writing Latin",
    "dnj_Latn": "Does use future Unicode 16 Latin glyphs",
    "gov_Latn": "Does use future Unicode 16 Latin glyphs",
}

SKIP_REGION = {
    "cpf_Latn": "French-based creole languages is a group of languages.",
    "gem_Latn": "Germanic languages is a group of languages.",
    "sla_Latn": "Slavic languages is a group of languages.",
    "hmn_Latn": "Homnic languages is a group of languages.",
    "ie_Latn": "Interlingue is an artifical language.",
    "io_Latn": "Ido is an artifical language.",
    "jbo_Latn": "Lobjan is an artifical language.",
    "tlh_Latn": "Klingon is an artifical language.",
}

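# Language names are expected to follow "LANGUAGE, MODIFIER (SCRIPT)",
# with the modifier and script parts optional; see
# test_language_name_structure below.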
LANGUAGE_NAME_REGEX = regex.compile(
    r"^[-'’ʼ\p{L} ]+(, [-'’ʼ\p{L}/ ]+)?( [(][-'’ʼ\p{L} ]+[)])?$"
)
# Some scripts have abbreviated names for reference in language names that are
# sufficient in context. If an alternate is listed here, it should be used
# universally and consistently across all language names.
ALTERNATE_SCRIPT_NAMES = {
    "Dupl": "Duployan",
    "Hans": "Simplified",
    "Hant": "Traditional",
}


@pytest.mark.parametrize("lang_code", LANGUAGES)
@pytest.mark.parametrize(
    "exemplar_name", ["base", "auxiliary", "marks", "numerals", "punctuation", "index"]
)
def test_languages_exemplars_canonical_duplicates(lang_code, exemplar_name):
    lang = LANGUAGES[lang_code]
    exemplar = getattr(lang.exemplar_chars, exemplar_name).split()
    normalized = defaultdict(set)

    for g in exemplar:
        if g[0] == "{" and g[-1] == "}":
            g = g.lstrip("{").rstrip("}")
        normalized[unicodedata.normalize("NFC", g)].add(g)

    result = [(len(gs), n) for n, gs in normalized.items()]
    expected = [(1, n) for n, _ in normalized.items()]
    assert result == expected


@pytest.mark.parametrize("lang_code", LANGUAGES)
@pytest.mark.parametrize(
    "exemplar_name", ["base", "auxiliary", "marks", "numerals", "punctuation", "index"]
)
def test_languages_exemplars_duplicates(lang_code, exemplar_name):
    lang = LANGUAGES[lang_code]
    exemplar = getattr(lang.exemplar_chars, exemplar_name).split()
    counter = Counter(exemplar)
    counts = sorted(counter.most_common(), key=lambda pair: exemplar.index(pair[0]))
    assert counts == [(v, 1) for v in exemplar]


@pytest.mark.parametrize("lang_code", LANGUAGES.keys())
@pytest.mark.parametrize(
    "exemplar_name", ["base", "auxiliary", "numerals", "punctuation", "index"]
)
def test_exemplars_bracketed_sequences(lang_code, exemplar_name):
    lang = LANGUAGES[lang_code]
    if lang.script != "Latn":
        return
    exemplar = getattr(lang.exemplar_chars, exemplar_name).split()
    for chars in exemplar:
        if len(chars) > 1:
            assert chars.startswith("{") and chars.endswith("}")
            assert len(chars[1:-1]) > 1


@pytest.mark.parametrize("lang_code", LANGUAGES)
def test_languages_exemplars_marks_in_base(lang_code):
    lang = LANGUAGES[lang_code]
    bases = lang.exemplar_chars.base
    problems = []
    for chars in bases.split():
        if len(chars) > 1:
            chars = chars.lstrip("{").rstrip("}")
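        # U+25CC (DOTTED CIRCLE) is the conventional carrier for showing
        # combining marks in isolation; flag bare marks (category Mn) and
        # any dotted-circle sequences that leaked into the base exemplars.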
        if unicodedata.category(chars[0]) == "Mn":
            problems.append("\u25CC" + chars)
        if "\u25CC" in chars:
            problems.append(chars)
    assert not problems, f"Found marks in base: {problems}"


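# Protobuf message descriptors, used below to iterate over the fields of
# the sample-text and exemplar-character messages.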
SampleText = languages_public_pb2.SampleTextProto().DESCRIPTOR
ExemplarChars = languages_public_pb2.ExemplarCharsProto().DESCRIPTOR


@pytest.mark.parametrize("lang_code", LANGUAGES.keys())
def test_language_samples(lang_code):
    # Although marked as optional in the protobuf file, all
    # SampleText fields (except note) are required, so make
    # sure they are present.
    lang = LANGUAGES[lang_code]
    if not lang.sample_text.ListFields():
        pytest.skip("No sample text for language " + lang_code)
        return

    for field in SampleText.fields:
        if field.name == "note":
            continue
        assert getattr(lang.sample_text, field.name)


@pytest.mark.parametrize("lang_code", LANGUAGES.keys())
def test_script_is_known(lang_code):
    lang = LANGUAGES[lang_code]
    script = lang.script
    assert script in SCRIPTS, f"{lang_code} used unknown script {lang.script}"


@pytest.mark.parametrize("lang_code", LANGUAGES)
def test_region_is_known(lang_code):
    lang = LANGUAGES[lang_code]
    if lang.id in SKIP_REGION:
        pytest.skip(SKIP_REGION[lang.id])
    regions = lang.region
    for region in regions:
        assert region in REGIONS.keys()


@pytest.mark.parametrize("lang_code", LANGUAGES.keys())
def test_exemplars_are_in_script(lang_code):
    lang = LANGUAGES[lang_code]
    script_name = SCRIPTS[lang.script].name
    script_name = CLDR_SCRIPT_TO_UCD_SCRIPT.get(script_name, script_name)
    if not lang.exemplar_chars.ListFields():
        pytest.skip("No exemplars for language " + lang_code)
    if lang.id in SKIP_EXEMPLARS:
        pytest.skip(SKIP_EXEMPLARS[lang.id])
    out_of_script = {}
    for field in ExemplarChars.fields:
        if field.name == "auxiliary" or field.name == "index":
            continue
        exemplars = getattr(lang.exemplar_chars, field.name)
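        # Tokenise into bracketed multi-character sequences ({...}) or
        # whitespace-separated chunks.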
        group_of_chars = regex.findall(r"(\{[^}]+\}|\S+)", exemplars)
        for chars in group_of_chars:
            for char in chars:
                char_script = youseedee.ucd_data(ord(char)).get("Script")
                if char_script == "Common" or char_script == "Inherited":
                    continue
                char_script = char_script.replace("_", " ")
                if char_script != script_name:
                    out_of_script[chars] = char_script
                    break
    assert not out_of_script, (
        f"{lang_code} exemplars contained out-of-script characters"
        f": {', '.join(out_of_script.keys())}"
        f" from scripts {', '.join(set(out_of_script.values()))}"
    )


@pytest.mark.parametrize("lang_code", LANGUAGES.keys())
def test_sample_texts_are_in_script(lang_code):
    if lang_code in [
        "mak_Maka",
        "orv_Cyrl",
        "cu_Cyrl",
        "ff_Adlm",
        "idu_Latn",
        "ban_Bali",
    ]:
        pytest.xfail("These languages have known issues with their sample text")
        return
    lang = LANGUAGES[lang_code]
    script_name = SCRIPTS[lang.script].name
    script_name = CLDR_SCRIPT_TO_UCD_SCRIPT.get(script_name, script_name)
    if not lang.sample_text.ListFields():
        pytest.skip("No sample text for language " + lang_code)
    if lang.id in SKIP_EXEMPLARS:
        pytest.skip(SKIP_EXEMPLARS[lang.id])
    out_of_script = defaultdict(set)
    for field in SampleText.fields:
        if field.name == "note":
            continue
        samples = getattr(lang.sample_text, field.name)
        chars = set(samples)
        for char in chars:
            char_script = (
                youseedee.ucd_data(ord(char)).get("Script", "").replace("_", " ")
            )
            if char_script == "Common" or char_script == "Inherited":
                continue
            if char_script != script_name:
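                # A character may legitimately appear in several scripts;
                # check its Script_Extensions before flagging it.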
                extensions = (
                    youseedee.ucd_data(ord(char))
                    .get("Script_Extensions", "")
                    .split(" ")
                )
                if any(ext == lang.script for ext in extensions):
                    continue
                out_of_script[char_script].add(char)
                break
    msg = []
    for script, chars in out_of_script.items():
        msg.append(f"'{''.join(chars)}' ({script} != {script_name})")
    assert not out_of_script, (
        f"{lang_code} sample text contained out-of-script characters"
        f": {', '.join(msg)}"
    )


def test_exemplar_parser():
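    # parse() is expected to split an exemplar string into individual
    # characters, decomposing bracketed sequences such as {kl} into
    # their component characters.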
    bases = "a A ā Ā {a̍} {A̍} {kl}"
    parsed_bases = parse(bases)
    assert parsed_bases == {
        "a",
        "A",
        "ā",
        "Ā",
        "k",
        "l",
        "̍",
    }


def test_language_uniqueness():
    names = Counter()
    for lang in LANGUAGES.values():
        if lang.preferred_name:
            names[lang.preferred_name] += 1
        else:
            names[lang.name] += 1
    if any(count > 1 for count in names.values()):
        duplicates = {name: count for name, count in names.items() if count > 1}
        pytest.fail(f"Duplicate language names: {duplicates}")


def test_language_name_structure():
    languages_with_bad_name_structure = {}
    for lang in LANGUAGES.values():
        script_name = (
            SCRIPTS[lang.script].name
            if lang.script not in ALTERNATE_SCRIPT_NAMES
            else ALTERNATE_SCRIPT_NAMES[lang.script]
        )
        names = [["name", lang.name]]
        if lang.preferred_name:
            names += [["preferred_name", lang.preferred_name]]
        bad_names = []
        for name_kind, name in names:
            bad_structure = not regex.match(LANGUAGE_NAME_REGEX, name)
            bad_script_suffix = name.endswith(")") and not name.endswith(
                f"({script_name})"
            )
            if bad_structure or bad_script_suffix:
                bad_names.append(name_kind)
        if len(bad_names) > 0:
            languages_with_bad_name_structure[lang.id] = bad_names
    if len(languages_with_bad_name_structure) > 0:
        misstructured_language_names = [
            f"{language_id}" if len(types) == 1 else f"{language_id}: {types}"
            for language_id, types in languages_with_bad_name_structure.items()
            if len(types) > 0
        ]
        pytest.fail(
            f'Languages names without expected structure ("LANGUAGE, MODIFIER (SCRIPT)"): {misstructured_language_names}'
        )


@pytest.mark.parametrize("lang_code", LANGUAGES)
def test_id_well_formed(lang_code):
    if lang_code in ["tw_akuapem_Latn"]:
        pytest.xfail("Well we need to have a conversation about that")
        return
    lang = LANGUAGES[lang_code]
    assert lang.id.startswith(lang.language + "_" + lang.script)