File: detect.py

package info (click to toggle)
utf8-locale 1.0.3-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 404 kB
  • sloc: python: 847; ansic: 486; sh: 121; makefile: 21
file content (250 lines) | stat: -rw-r--r-- 7,844 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
# SPDX-FileCopyrightText: Peter Pentchev <roam@ringlet.net>
# SPDX-License-Identifier: BSD-2-Clause

"""Implement the actual UTF-8 locale detection."""

from __future__ import annotations

import dataclasses
import os
import re
import subprocess
import typing


if typing.TYPE_CHECKING:
    from collections.abc import Iterable


# The version string of this implementation of the UTF-8 locale detection.
VERSION = "1.0.3"

# The languages to look for by default, most preferred first; this is
# the default value of the `languages` parameter of the detection functions.
UTF8_LANGUAGES = ("C", "en", "de", "es", "it")
# The codeset spellings that mark a locale name as a UTF-8-capable one.
UTF8_ENCODINGS = ("UTF-8", "utf8")

# The environment variables examined by `get_preferred_languages()` unless
# the caller supplies its own `names`; earlier entries take precedence.
LOCALE_VARIABLES = (
    "LC_ALL",
    "LANG",
    "LC_MESSAGES",
    "LC_COLLATE",
    "LC_NAME",
    "LC_IDENTIFICATION",
    "LC_CTYPE",
    "LC_NUMERIC",
    "LC_TIME",
    "LC_MONETARY",
    "LC_PAPER",
    "LC_ADDRESS",
    "LC_TELEPHONE",
    "LC_MEASUREMENT",
)

# Parse a whole locale name of the form `lang[_territory][.codeset][@modifier]`
# into the named groups "lang", "territory", "codeset", and "modifier".
# The optional groups are `None` when the corresponding part is absent.
# The pattern is anchored at both ends, so names containing any other
# characters (e.g. spaces) do not match at all.
RE_LOCALE_NAME = re.compile(
    r""" ^
    (?P<lang> [a-zA-Z0-9]+ )
    (?:
        _
        (?P<territory> [a-zA-Z0-9]+ )
    )?
    (?:
        \.
        (?P<codeset> [a-zA-Z0-9-]+ )
    )?
    (?:
        @
        (?P<modifier> [a-zA-Z0-9]+ )
    )?
    $ """,
    re.X,
)


class NoLanguagesError(ValueError):
    """Signal that the list of languages to detect from turned out empty."""

    def __str__(self) -> str:
        """Return a fixed description, regardless of any constructor arguments."""
        message = "No languages specified"
        return message


@dataclasses.dataclass(frozen=True)
class _DetectState:
    """The state of processing consecutive lines of `locale -a` output.

    Holds the best candidate found so far; `detect_utf8_locale()` replaces
    it with a new instance whenever a line with a lower priority value
    (i.e. a more preferred language) is seen.
    """

    # The position of the candidate's language in the preference list;
    # lower values are preferred.
    priority: int
    # The locale name to return if nothing better is found.
    name: str


def detect_utf8_locale(*, languages: Iterable[str] = UTF8_LANGUAGES) -> str:
    """Pick the name of an available locale suitable for UTF-8 output.

    The `detect_utf8_locale()` function runs the external `locale -a` command
    (with `LC_ALL=C` and an empty `LANGUAGE`) to list the locale names that
    the system knows about. Only names whose codeset is one of
    `UTF8_ENCODINGS` are considered; among those, the one whose language
    appears earliest in `languages` wins, and for equally-ranked candidates
    the one listed first by `locale -a` is kept. A locale for the very first
    language in `languages` is returned as soon as it is seen.

    The `languages` parameter is an iterable of bare language names (e.g.
    "en", not "en_US"); locale names that differ only in territory are
    treated as equivalent. Duplicate entries are ignored after their first
    occurrence. If `languages` contains the special `"*"` entry, any language
    that is not listed explicitly is ranked at the position of that entry.
    A program that only expects to parse messages in English may do:

        name = detect_utf8_locale(languages=["C", "en"])

    If no suitable UTF-8 locale is found, the plain "C" locale name is
    returned.

    Raises `NoLanguagesError` if `languages` is empty. Errors from running
    the `locale` command (e.g. `subprocess.CalledProcessError` on a non-zero
    exit status) are propagated to the caller.
    """
    # Map each language to its position among the distinct languages given.
    priorities: dict[str, int] = {}
    for language in languages:
        priorities.setdefault(language, len(priorities))
    if not priorities:
        raise NoLanguagesError

    # One past the last valid position: the rank of "not wanted at all".
    lowest = len(priorities)
    # Languages that are not listed explicitly share the wildcard's rank.
    fallback = priorities.get("*", lowest)

    # Decode as ISO-8859-1 so that any byte sequence in the output is accepted.
    listing = subprocess.check_output(
        ["env", "LC_ALL=C", "LANGUAGE=", "locale", "-a"],
        shell=False,
        encoding="ISO-8859-1",
    )

    best = _DetectState(priority=lowest, name="C")
    for candidate in listing.splitlines():
        parsed = RE_LOCALE_NAME.match(candidate)
        if parsed is None or parsed.group("codeset") not in UTF8_ENCODINGS:
            continue

        rank = priorities.get(parsed.group("lang"), fallback)
        if rank == 0:
            # Nothing can beat the most preferred language; stop right here.
            return candidate
        if rank < best.priority:
            best = _DetectState(priority=rank, name=candidate)

    return best.name


def get_utf8_vars(*, languages: Iterable[str] = UTF8_LANGUAGES) -> dict[str, str]:
    """Build the environment variables that select a UTF-8 locale.

    The `get_utf8_vars()` function invokes `detect_utf8_locale()` and returns
    a new dictionary with exactly two entries: `LC_ALL` set to the detected
    locale name, and `LANGUAGE` set to an empty string so that recent
    versions of the gettext library do not choose a different language to
    output messages in.

    The optional `languages` parameter is passed directly to
    `detect_utf8_locale()`.
    """
    locale_name = detect_utf8_locale(languages=languages)
    return {"LC_ALL": locale_name, "LANGUAGE": ""}


def get_utf8_env(
    env: dict[str, str] | None = None,
    *,
    languages: Iterable[str] = UTF8_LANGUAGES,
) -> dict[str, str]:
    """Build a full environment to run UTF-8-capable subprocesses in.

    The `get_utf8_env()` function returns a new dictionary that starts out
    as a copy of `env` (or of `os.environ` if `env` is not given) and then
    has the variables returned by `get_utf8_vars()` applied on top of it:
    `LC_ALL` set to the detected locale name and `LANGUAGE` set to an empty
    string so that recent versions of the gettext library do not choose
    a different language to output messages in. The dictionary passed as
    `env` is not modified.

    The optional `languages` parameter is passed directly to
    `detect_utf8_locale()`.
    """
    result = dict(os.environ) if env is None else dict(env)
    for key, value in get_utf8_vars(languages=languages).items():
        result[key] = value
    return result


def get_preferred_languages(
    env: dict[str, str] | None = None,
    *,
    names: Iterable[str] = LOCALE_VARIABLES,
) -> list[str]:
    """Determine preferred languages as per the current locale settings.

    The `get_preferred_languages()` function examines either the current
    process environment or the provided `env` dictionary and returns a list
    of the languages specified in the locale variables (`LC_ALL`, `LANG`,
    `LC_MESSAGES`, etc) in order of preference as defined by either
    the `names` parameter passed or by the `LOCALE_VARIABLES` constant.

    Only variables that are set to a well-formed locale name with a UTF-8
    codeset (one of `UTF8_ENCODINGS`) contribute a language; unset variables
    and values such as "en_US" or "en_US.ISO-8859-1" are skipped. Each
    language is listed once, at the position of its first occurrence.

    It may be used by programs to add the user's currently preferred locale
    to their own settings, e.g.:

        name = detect_utf8_locale(languages=get_preferred_languages() + ["en"])

    Note that "C" is always appended to the end of the list if it is not
    already present.
    """
    # The mapping is only read from, so there is no need to copy it.
    source = os.environ if env is None else env

    found: list[str] = []
    for name in names:
        value = source.get(name)
        if value is None:
            continue
        parsed = RE_LOCALE_NAME.match(value)
        if parsed is None or parsed.group("codeset") not in UTF8_ENCODINGS:
            continue

        lang = parsed.group("lang")
        if lang not in found:
            found.append(lang)

    # Make sure "C" is always in the list.
    if "C" not in found:
        found.append("C")
    return found


@dataclasses.dataclass(frozen=True)
class LanguagesDetect:
    """Hold the parameters for a `get_preferred_languages()` invocation."""

    # The environment to examine; `None` is passed through unchanged, in which
    # case `get_preferred_languages()` looks at the process environment.
    env: dict[str, str] | None = None
    # The variable names to examine; `None` selects `LOCALE_VARIABLES`.
    names: Iterable[str] | None = None

    def detect(self) -> list[str]:
        """Return the preferred languages for the configured parameters."""
        if self.names is None:
            return get_preferred_languages(self.env, names=LOCALE_VARIABLES)
        return get_preferred_languages(self.env, names=self.names)


@dataclasses.dataclass(frozen=True)
class UTF8Environment:
    """The parameters for a UTF-8-capable environment.

    Instances are built by `UTF8Detect.detect()`.
    """

    # The full set of environment variables, as returned by `get_utf8_env()`.
    env: dict[str, str]
    # Only the `LC_ALL` and `LANGUAGE` entries taken from `env`.
    env_vars: dict[str, str]
    # The list of languages that the detection was run with.
    languages: list[str]
    # The detected locale name, i.e. the value of `LC_ALL` in `env`.
    locale: str


@dataclasses.dataclass(frozen=True)
class UTF8Detect:
    """Hold the parameters for detecting a UTF-8-capable environment."""

    # The base environment handed to `get_utf8_env()`; `None` is passed
    # through unchanged, making it start from `os.environ`.
    env: dict[str, str] | None = None
    # The languages to look for, most preferred first; `None` selects
    # `UTF8_LANGUAGES`.
    languages: Iterable[str] | None = None

    def detect(self) -> UTF8Environment:
        """Perform the detection and bundle up everything that was found."""
        wanted = UTF8_LANGUAGES if self.languages is None else self.languages
        # Materialize the iterable: it is both passed on and stored in the result.
        languages = list(wanted)

        env = get_utf8_env(self.env, languages=languages)
        locale_name = env["LC_ALL"]
        env_vars = {"LC_ALL": locale_name, "LANGUAGE": env["LANGUAGE"]}
        return UTF8Environment(
            env=env,
            env_vars=env_vars,
            languages=languages,
            locale=locale_name,
        )