"""Pygments lexer for CLI usage/help output.
This module provides a custom Pygments lexer for highlighting command-line
usage text typically generated by argparse, getopt, or similar libraries.
"""
from __future__ import annotations
from pygments.lexer import RegexLexer, bygroups, include
from pygments.token import Generic, Name, Operator, Punctuation, Text, Whitespace
class CLIUsageLexer(RegexLexer):
    """Lexer for CLI usage/help text (argparse, etc.).

    Highlights usage patterns including options, arguments, and
    meta-variables.

    Examples
    --------
    >>> from pygments.token import Token
    >>> lexer = CLIUsageLexer()
    >>> tokens = list(lexer.get_tokens("usage: cmd [-h]"))
    >>> tokens[0]
    (Token.Generic.Heading, 'usage:')
    >>> tokens[2]
    (Token.Name.Label, 'cmd')
    """

    name = "CLI Usage"
    aliases = ["cli-usage", "usage"]  # noqa: RUF012
    filenames: list[str] = []  # noqa: RUF012
    mimetypes = ["text/x-cli-usage"]  # noqa: RUF012

    # NOTE: rule order within each state matters. Pygments tries patterns
    # top to bottom, so the more specific forms (option-with-value) must
    # appear before their more general prefixes (bare option).
    tokens = {  # noqa: RUF012
        "root": [
            # "usage:" at start of line
            (r"^(usage:)(\s+)", bygroups(Generic.Heading, Whitespace)),  # type: ignore[no-untyped-call]
            # Continuation lines (leading whitespace for wrapped usage)
            (r"^(\s+)(?=\S)", Whitespace),
            include("inline"),
        ],
        "inline": [
            # Whitespace
            (r"\s+", Whitespace),
            # Long options with = value (e.g., --log-level=VALUE)
            (
                r"(--[a-zA-Z0-9][-a-zA-Z0-9]*)(=)([A-Z][A-Z0-9_]*|[a-z][-a-z0-9]*)",
                bygroups(Name.Tag, Operator, Name.Variable),  # type: ignore[no-untyped-call]
            ),
            # Long options standalone
            (r"--[a-zA-Z0-9][-a-zA-Z0-9]*", Name.Tag),
            # Short options with space-separated value (e.g., -S socket-path)
            (
                r"(-[a-zA-Z0-9])(\s+)([A-Z][A-Z0-9_]*|[a-z][-a-z0-9]*)",
                bygroups(Name.Attribute, Whitespace, Name.Variable),  # type: ignore[no-untyped-call]
            ),
            # Short options standalone
            (r"-[a-zA-Z0-9]", Name.Attribute),
            # UPPERCASE meta-variables (COMMAND, FILE, PATH)
            (r"\b[A-Z][A-Z0-9_]+\b", Name.Constant),
            # Opening bracket - enter optional state
            (r"\[", Punctuation, "optional"),
            # Closing bracket (fallback for unmatched)
            (r"\]", Punctuation),
            # Choice separator (pipe)
            (r"\|", Operator),
            # Parentheses for grouping
            (r"[()]", Punctuation),
            # Positional/command names (lowercase with dashes)
            (r"\b[a-z][-a-z0-9]*\b", Name.Label),
            # Catch-all for any other text
            (r"[^\s\[\]|()]+", Text),
        ],
        "optional": [
            # Nested optional bracket ("#push" re-enters this state so
            # depth is tracked and each "]" pops exactly one level)
            (r"\[", Punctuation, "#push"),
            # End optional
            (r"\]", Punctuation, "#pop"),
            # Contents use inline rules
            include("inline"),
        ],
    }
def tokenize_usage(text: str) -> list[tuple[str, str]]:
    """Run :class:`CLIUsageLexer` over *text* and flatten the result.

    Each Pygments token type is converted to its string form so callers
    do not need to import Pygments token objects.

    Parameters
    ----------
    text : str
        CLI usage text to tokenize.

    Returns
    -------
    list[tuple[str, str]]
        List of (token_type_name, text_value) tuples.

    Examples
    --------
    >>> result = tokenize_usage("usage: cmd [-h]")
    >>> result[0]
    ('Token.Generic.Heading', 'usage:')
    >>> result[2]
    ('Token.Name.Label', 'cmd')
    >>> result[4]
    ('Token.Punctuation', '[')
    >>> result[5]
    ('Token.Name.Attribute', '-h')
    >>> result[6]
    ('Token.Punctuation', ']')
    """
    pairs: list[tuple[str, str]] = []
    for token_type, value in CLIUsageLexer().get_tokens(text):
        pairs.append((str(token_type), value))
    return pairs