1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250
|
# SPDX-FileCopyrightText: Peter Pentchev <roam@ringlet.net>
# SPDX-License-Identifier: BSD-2-Clause
"""Implement the actual UTF-8 locale detection."""
from __future__ import annotations
import dataclasses
import os
import re
import subprocess
import typing
if typing.TYPE_CHECKING:
from collections.abc import Iterable
# Version string exposed by this module.
VERSION = "1.0.3"

# Default language preference order, used whenever a caller does not pass
# its own `languages`; earlier entries are preferred over later ones.
UTF8_LANGUAGES = (
    "C",
    "en",
    "de",
    "es",
    "it",
)

# Codeset spellings that are accepted as denoting UTF-8.
UTF8_ENCODINGS = (
    "UTF-8",
    "utf8",
)

# Locale-related environment variables, in the order in which
# `get_preferred_languages()` examines them by default.
LOCALE_VARIABLES = (
    "LC_ALL",
    "LANG",
    "LC_MESSAGES",
    "LC_COLLATE",
    "LC_NAME",
    "LC_IDENTIFICATION",
    "LC_CTYPE",
    "LC_NUMERIC",
    "LC_TIME",
    "LC_MONETARY",
    "LC_PAPER",
    "LC_ADDRESS",
    "LC_TELEPHONE",
    "LC_MEASUREMENT",
)
# Match a complete locale name of the form
# `lang[_territory][.codeset][@modifier]`.
#
# The pattern is compiled in verbose mode (`re.X`), so the whitespace inside
# it is there for readability only.  Each component is exposed as a named
# group (`lang`, `territory`, `codeset`, `modifier`); the three optional
# components are `None` in the match when they are absent from the name.
# Only ASCII letters and digits are accepted in a component, plus `-` in
# the codeset (e.g. "UTF-8").
RE_LOCALE_NAME = re.compile(
r""" ^
(?P<lang> [a-zA-Z0-9]+ )
(?:
_
(?P<territory> [a-zA-Z0-9]+ )
)?
(?:
\.
(?P<codeset> [a-zA-Z0-9-]+ )
)?
(?:
@
(?P<modifier> [a-zA-Z0-9]+ )
)?
$ """,
re.X,
)
class NoLanguagesError(ValueError):
    """Signal that the list of languages to choose from was empty."""

    def __str__(self) -> str:
        """Return a fixed description, regardless of any constructor arguments."""
        message = "No languages specified"
        return message
@dataclasses.dataclass(frozen=True)
class _DetectState:
    """The best candidate found so far while scanning `locale -a` output."""

    # Rank of the candidate's language; a lower value is more preferred.
    priority: int

    # The locale name of the candidate.
    name: str
def detect_utf8_locale(*, languages: Iterable[str] = UTF8_LANGUAGES) -> str:
    """Pick the name of an installed locale suitable for UTF-8 output.

    The external `locale -a` command is run (with `LC_ALL=C` and an empty
    `LANGUAGE`) to list the locales known to the system.  Only the names
    whose codeset is one of `UTF8_ENCODINGS` are considered, and among
    those the one whose language appears earliest in `languages` wins;
    of several locales for the same language, the first one listed by
    `locale -a` is kept.  A locale for the very first language in
    `languages` is returned as soon as it is seen.

    Only the language part of a locale name is compared, so `languages`
    should contain bare language names (e.g. "en", not "en_US"); locales
    for the same language and different territories are treated as
    equivalent.  The special entry "*" stands for any language that is
    not listed explicitly.  Duplicate entries are ignored after their
    first occurrence.

    By default the predefined `UTF8_LANGUAGES` order is used, which
    prefers the `C` base locale.  A program that only expects to parse
    messages written in English may do:

        name = detect_utf8_locale(languages=["C", "en"])

    Returns the chosen locale name, or "C" if no suitable UTF-8 locale
    was found.

    Raises `NoLanguagesError` if `languages` is empty.  Errors from
    running `locale` (e.g. `subprocess.CalledProcessError`) are not
    caught here.
    """
    # Rank every distinct language by the position of its first occurrence.
    weights = {lang: rank for rank, lang in enumerate(dict.fromkeys(languages))}
    if not weights:
        raise NoLanguagesError

    # One past the worst listed rank: never good enough to replace the
    # initial "C" fallback.
    unlisted = len(weights)
    # The rank of a language that is not listed explicitly.
    wildcard = weights.get("*", unlisted)
    best = _DetectState(unlisted, "C")

    locale_output = subprocess.check_output(
        ["env", "LC_ALL=C", "LANGUAGE=", "locale", "-a"],
        shell=False,
        encoding="ISO-8859-1",
    )
    for candidate in locale_output.splitlines():
        parsed = RE_LOCALE_NAME.match(candidate)
        if parsed is None or parsed.group("codeset") not in UTF8_ENCODINGS:
            continue

        rank = weights.get(parsed.group("lang"), wildcard)
        if rank == 0:
            # Nothing can beat the most preferred language; stop looking.
            return candidate
        if rank < best.priority:
            best = _DetectState(rank, candidate)

    return best.name
def get_utf8_vars(*, languages: Iterable[str] = UTF8_LANGUAGES) -> dict[str, str]:
    """Build the environment variable overrides for UTF-8 output.

    The locale name is obtained from `detect_utf8_locale()`, to which
    the optional `languages` parameter is passed directly.

    Returns a new dictionary with exactly two entries: `LC_ALL` set to
    the detected locale name, and `LANGUAGE` set to an empty string so
    that recent versions of the gettext library do not choose a different
    language to output messages in.
    """
    locale_name = detect_utf8_locale(languages=languages)
    overrides = {"LC_ALL": locale_name, "LANGUAGE": ""}
    return overrides
def get_utf8_env(
    env: dict[str, str] | None = None,
    *,
    languages: Iterable[str] = UTF8_LANGUAGES,
) -> dict[str, str]:
    """Build a full environment for running subprocesses with UTF-8 output.

    The result is a copy of `env` (or of `os.environ` if `env` is not
    given) with the variables returned by `get_utf8_vars()` applied on
    top: `LC_ALL` is set to the detected locale name and `LANGUAGE` to
    an empty string, so that recent versions of the gettext library do
    not choose a different language to output messages in.  The mapping
    used as a base is not modified.

    The optional `languages` parameter is passed on unchanged and ends up
    in `detect_utf8_locale()`.
    """
    if env is None:
        base = os.environ
    else:
        base = env

    merged = dict(base)
    overrides = get_utf8_vars(languages=languages)
    merged.update(overrides)
    return merged
def get_preferred_languages(
    env: dict[str, str] | None = None,
    *,
    names: Iterable[str] = LOCALE_VARIABLES,
) -> list[str]:
    """Determine preferred languages as per the current locale settings.

    The `get_preferred_languages()` function examines either the current
    process environment or the provided `env` dictionary and returns a list
    of the languages specified in the locale variables (`LC_ALL`, `LANG`,
    `LC_MESSAGES`, etc) in order of preference as defined by either
    the `names` parameter passed or by the `LOCALE_VARIABLES` constant.

    A variable only contributes its language if its value is a well-formed
    locale name with a UTF-8 codeset (one of `UTF8_ENCODINGS`); variables
    that are unset, malformed, or that name another codeset or no codeset
    at all (e.g. "en_US" or "en_US.ISO-8859-1") are skipped.  Each language
    is listed once, at the position of its first occurrence.

    It may be used by programs to add the user's currently preferred locale
    to their own settings, e.g.:

        name = detect_utf8_locale(languages=get_preferred_languages() + ["en"])

    Note that "C" is always appended to the end of the list if it is not
    already present, so the result is never empty.
    """
    # Only lookups are performed, so the live environment needs no copying.
    source = os.environ if env is None else env

    res: list[str] = []
    for name in names:
        value = source.get(name)
        if value is None:
            continue

        data = RE_LOCALE_NAME.match(value)
        if data is None or data.group("codeset") not in UTF8_ENCODINGS:
            continue

        lang = data.group("lang")
        if lang not in res:
            res.append(lang)

    # Make sure "C" is always in the list.
    if "C" not in res:
        res.append("C")
    return res
@dataclasses.dataclass(frozen=True)
class LanguagesDetect:
    """Hold the parameters for a preferred languages lookup."""

    # The environment to examine; `None` is passed on as-is to
    # `get_preferred_languages()`.
    env: dict[str, str] | None = None

    # The variables to examine, in order; `None` selects `LOCALE_VARIABLES`.
    names: Iterable[str] | None = None

    def detect(self) -> list[str]:
        """Return the preferred languages via `get_preferred_languages()`."""
        if self.names is None:
            return get_preferred_languages(self.env, names=LOCALE_VARIABLES)
        return get_preferred_languages(self.env, names=self.names)
@dataclasses.dataclass(frozen=True)
class UTF8Environment:
    """The outcome of detecting a UTF-8-capable environment."""

    # The complete set of environment variables.
    env: dict[str, str]

    # Only the variables that were set for UTF-8 output.
    env_vars: dict[str, str]

    # The language preference list the detection was run with.
    languages: list[str]

    # The name of the selected locale.
    locale: str
@dataclasses.dataclass(frozen=True)
class UTF8Detect:
    """Hold the parameters for detecting a UTF-8-capable environment."""

    # The base environment; `None` is passed on as-is to `get_utf8_env()`.
    env: dict[str, str] | None = None

    # The languages in order of preference; `None` selects `UTF8_LANGUAGES`.
    languages: Iterable[str] | None = None

    def detect(self) -> UTF8Environment:
        """Detect the locale and package up everything that was determined."""
        wanted = UTF8_LANGUAGES if self.languages is None else self.languages
        # Materialize the iterable once: it is both used for the detection
        # and stored in the result.
        languages = list(wanted)

        full_env = get_utf8_env(self.env, languages=languages)
        locale_name = full_env["LC_ALL"]
        return UTF8Environment(
            env=full_env,
            env_vars={"LC_ALL": locale_name, "LANGUAGE": full_env["LANGUAGE"]},
            languages=languages,
            locale=locale_name,
        )
|