File: string.py

package info (click to toggle)
python-tatsu 5.17.1%2Bds-1
links: PTS, VCS
area: main
in suites: forky, sid
size: 1,516 kB
sloc: python: 13,185; makefile: 127
file content (238 lines) | stat: -rw-r--r-- 6,870 bytes
# Copyright (c) 2017-2026 Juancarlo Añez (apalala@gmail.com)
# SPDX-License-Identifier: BSD-4-Clause
from __future__ import annotations

import codecs
import hashlib
import re
import sys
import unicodedata
from collections.abc import Iterable
from io import StringIO
from typing import Any

from .common import is_reserved

# Bind one name, `PatternError`, for regex compilation failures: import
# it from `re` on Python 3.13 and later, and alias it to `re.error` on
# older interpreters so `except PatternError` works on both.
if sys.version_info >= (3, 13):
    from re import PatternError
else:
    PatternError = re.error


def unicode_display_len(text: str) -> int:
    # by Gemini 2026/02/17 (with many amendments)
    """
    Return the number of columns ``text`` occupies in a terminal or
    fixed-width font context.

    Characters whose East Asian Width is Wide ('W') or Fullwidth ('F')
    count as two columns; every other character counts as one.
    """
    assert isinstance(text, str), repr(text)

    double_width = ('W', 'F')
    columns = 0
    for char in text:
        if unicodedata.east_asian_width(char) in double_width:
            columns += 2
        else:
            columns += 1
    return columns


def hashsha(text: Any) -> str:
    """
    Return the SHA-256 digest, as a hexadecimal string, of ``str(text)``
    encoded as UTF-8.
    """
    # by Gemini (- 2026-02-08)
    # by [apalala@gmail.com](https://github.com/apalala)

    # hashlib only digests bytes, so stringify and encode first
    payload = str(text).encode('utf-8')
    hasher = hashlib.sha256()
    hasher.update(payload)
    return hasher.hexdigest()


def regexp(text: Any) -> str:
    """
    Return the source text of a Python raw-string literal for a regexp.

    ``text`` is either an object with a ``pattern`` attribute (such as a
    compiled pattern) or any object whose ``str()`` is the pattern. The
    pattern is validated, literal control characters are rewritten as
    regex escapes so the result is printable, and the chosen quote
    delimiter is escaped wherever it appears unescaped. The generated
    literal is evaluated and compiled again before it is returned.

    Raises:
        ValueError: the input does not compile as a regular expression.
        RuntimeError: the generated literal is not valid Python, or its
            value does not compile as a regular expression.
    """
    # by Gemini (2026-02-04 - 2026-02-07)
    # by [apalala@gmail.com](https://github.com/apalala)

    import ast  # local import: only needed for the self-check below

    pattern_text = text.pattern if hasattr(text, "pattern") else str(text)

    try:
        re.compile(pattern_text)
    except PatternError as e:
        raise ValueError(
            f"Invalid regex passed to regexp(): {pattern_text!r}\n{e}",
        ) from e

    ctrl_map: dict[str, str] = {
        "\n": r"\n",
        "\r": r"\r",
        "\t": r"\t",
        "\v": r"\v",
        "\f": r"\f",
        "\a": r"\a",
        # Spelled in hex on purpose: in a regex "\b" is a word boundary
        # outside character classes, and "\0" would merge with following
        # digits into a different octal escape.
        "\b": r"\x08",
        "\0": r"\x00",
    }

    # The control-character mapping never adds or removes quotes, so the
    # delimiter can be chosen from the input text directly.
    use_double = (
        pattern_text.endswith("'")
        or pattern_text.count("'") > pattern_text.count('"')
    )
    quote = '"' if use_double else "'"

    # Scan once while tracking whether the previous character was an
    # unpaired backslash. A look-behind for a single backslash is not
    # enough: in `\\"` the quote follows an *escaped* backslash and must
    # still be escaped, otherwise it would terminate the literal early.
    parts: list[str] = []
    pending_backslash = False
    for c in pattern_text:
        if pending_backslash:
            pending_backslash = False
            # backslash + control char matches that char literally, so
            # the escape sequence alone is the equivalent spelling
            parts.append(ctrl_map.get(c, "\\" + c))
        elif c == "\\":
            pending_backslash = True
        elif c == quote:
            parts.append("\\" + c)
        else:
            parts.append(ctrl_map.get(c, c))
    if pending_backslash:
        # A raw string cannot end in an odd number of backslashes.
        parts.append("\\\\")

    body = "".join(parts)
    output = f"r{quote}{body}{quote}"

    try:
        # literal_eval only accepts literals, so even a malformed result
        # can never execute code taken from the pattern text.
        evaluated = ast.literal_eval(output)
        re.compile(evaluated)
    except SyntaxError as e:
        raise RuntimeError(
            f"regexp() generated invalid Python syntax: {output}\n{e}",
        ) from e
    except PatternError as e:
        raise RuntimeError(
            f"regexp() generated an invalid regex pattern: {output}\n{e}",
        ) from e
    except Exception as e:
        raise RuntimeError(f"Unexpected error evaluating output: {output}\n{e}") from e

    return output


def eval_escapes(s: str | bytes) -> str | bytes:
    """
    Expand backslash escape sequences in ``s`` the way they would be
    evaluated in Python source code. For the list of sequences, see:
    https://docs.python.org/3/reference/lexical_analysis.html

    Only the text matched as an escape sequence is passed through the
    'unicode-escape' codec. Decoding the whole string with that codec is
    not equivalent, because it provides no way to handle non-ASCII
    characters that are literally present in the string.
    """
    # by Rob Speer

    escape_sequence_re: re.Pattern = re.compile(r"""(?ux)
        ( \\U........      # 8-digit Unicode escapes
        | \\u....          # 4-digit Unicode escapes
        | \\x..            # 2-digit Unicode escapes
        | \\[0-7]{1,3}     # Octal character escapes
        | \\N\{[^}]+\}     # Unicode characters by name
        | \\[\\'"abfnrtv]  # Single-character escapes
        )""")

    # Replace each matched escape with its decoded character; all other
    # text passes through unchanged.
    return escape_sequence_re.sub(  # type: ignore[no-matching-overload]
        lambda found: codecs.decode(found.group(0), 'unicode-escape'),
        s,
    )


def trim(text, tabwidth=4):
    """
    Trim text of common, leading whitespace.

    Tabs are expanded to ``tabwidth`` columns first. The first line is
    stripped on both sides. Every following line loses the smallest
    indentation found among the non-blank lines after the first, plus
    any trailing whitespace. Blank lines at the start of the result are
    dropped; falsy input yields ``''``.

    Based on the trim algorithm of PEP 257:
        http://www.python.org/dev/peps/pep-0257/
    """
    if not text:
        return ''
    lines = text.expandtabs(tabwidth).splitlines()
    if not lines:
        return ''
    head, rest = lines[0], lines[1:]

    # Measure indentation on the tab-expanded lines only. Capping it at
    # len(text) is wrong because expanded tabs can make a line's indent
    # longer than the raw input, leaving part of the margin behind.
    margins = [
        len(line) - len(line.lstrip())
        for line in rest
        if line.strip()
    ]
    # With no non-blank continuation line the value is irrelevant: blank
    # lines become '' after rstrip() whatever the slice start is.
    margin = min(margins, default=0)

    trimmed = [head.strip()]
    trimmed.extend(line[margin:].rstrip() for line in rest)

    # Drop leading blank lines only; trailing ones are kept.
    start = 0
    while start < len(trimmed) and not trimmed[start]:
        start += 1
    return '\n'.join(trimmed[start:])


def indent(text, indent=1, multiplier=4):
    """
    Indent every line of ``text`` by ``indent * multiplier`` spaces.

    ``None`` yields ``''``; any other value is converted with ``str()``.
    For a non-negative ``indent`` the lines are re-joined with newlines
    and trailing whitespace is removed from each of them. A negative
    ``indent`` returns the stringified text unchanged.
    """
    if text is None:
        return ''
    text = str(text)
    if indent < 0:
        return text
    padding = ' ' * multiplier * indent
    padded_lines = [(padding + line).rstrip() for line in text.splitlines()]
    return '\n'.join(padded_lines)


def mangle(name: str) -> str:
    """Return ``name`` as rewritten by ``safe_name`` with its default plug."""
    mangled = safe_name(name)
    return mangled


def safe_name(name: str, plug: str = "_") -> str:
    """
    Turn ``name`` into a string usable as a Python identifier.

    Every character matched by the regex ``\\W`` is replaced with
    ``plug``. If the result starts with a digit it is prefixed with
    ``plug``, or with ``_`` when ``plug`` itself starts with a digit.
    ``plug`` is then appended for as long as ``is_reserved`` reports the
    result as reserved.

    Raises ValueError when ``name`` is empty, or when ``plug`` is empty
    or contains anything other than alphanumerics and underscores.

    Generated by Gemini - January 24, 2026
    """

    plug_is_valid = bool(plug) and all(ch == "_" or ch.isalnum() for ch in plug)
    if not plug_is_valid:
        raise ValueError(f"Invalid plug: '{plug}'. Must be non-empty and alphanumeric.")
    if not name:
        raise ValueError("Input string cannot be empty.")

    candidate = re.sub(r"\W", plug, name)

    if candidate[0].isdigit():
        # the prefix itself must not start with a digit
        prefix = "_" if plug[0].isdigit() else plug
        candidate = prefix + candidate

    while is_reserved(candidate):
        candidate += plug

    return candidate


def pythonize_name(name: str) -> str:
    """
    Convert ``name`` to a lowercase, underscore-separated form.

    An empty name is returned as is, and a name for which
    ``str.isupper()`` is true is simply lowercased. Otherwise the first
    character is lowercased and every later uppercase character is
    replaced by an underscore followed by its lowercase form.
    """
    if not name:
        return name
    if name.isupper():
        return name.lower()

    pieces = [name[0].lower()]
    for char in name[1:]:
        if char.isupper():
            pieces.append('_' + char.lower())
        else:
            pieces.append(char)
    return ''.join(pieces)


def prints(*args, **kwargs: Any) -> str:
    """
    Return the text that ``print(*args, **kwargs)`` would write.

    The ``file`` and ``end`` keyword arguments are always overridden:
    output goes to an in-memory buffer and no terminator is appended.
    """
    options = dict(kwargs, file=None, end='')
    with StringIO() as buffer:
        options['file'] = buffer
        print(*args, **options)
        captured = buffer.getvalue()
    return captured


def longest_common_prefix(strs: Iterable[str], suffix: str = '') -> str:
    """
    Return the longest prefix shared by every string in ``strs`` after
    ``suffix`` has been appended to each of them.

    Returns ``''`` when ``strs`` is falsy or yields no items. With a
    single item the result is that item plus ``suffix``. ``strs`` may be
    any iterable, including a one-shot iterator.
    """
    if not strs:
        return ''

    # Materialize first: an iterator is always truthy, so emptiness can
    # only be detected after consuming it.
    candidates = [s + suffix for s in strs]
    if not candidates:
        return ''

    # The prefix common to the smallest and largest strings is common to
    # all of them. The extremes must be taken after the suffix is
    # appended, because appending it can change the relative order.
    low, high = min(candidates), max(candidates)

    for i, (a, b) in enumerate(zip(low, high)):
        if a != b:
            return low[:i]
    # No mismatch within the shorter string, and low <= high, so low is
    # itself a prefix of high.
    return low


def without_common_prefix(strs: Iterable[str], suffix: str = '') -> list[str]:
    """
    Return the strings in ``strs`` with their longest common prefix
    removed, in the original order.

    The prefix is computed by ``longest_common_prefix`` over the strings
    with ``suffix`` appended, so it may be longer than a given string;
    such a string becomes ``''``. Returns ``[]`` when ``strs`` is falsy
    or yields no items.
    """
    if not strs:
        return []

    # Materialize once: the input is walked twice (prefix computation and
    # the result), which would exhaust a one-shot iterator.
    items = list(strs)
    prefix = longest_common_prefix(items, suffix=suffix)
    cut = len(prefix)

    # Slice by length instead of str.lstrip(prefix): lstrip() treats its
    # argument as a set of characters and would keep eating matching
    # characters past the prefix ("abba" with prefix "ab" became "").
    # The startswith() guard leaves a string untouched if the computed
    # prefix does not actually apply to it.
    return [
        s[cut:] if (s + suffix).startswith(prefix) else s
        for s in items
    ]