"""Pygments lexer for CLI usage/help output.
This module provides a custom Pygments lexer for highlighting command-line
usage text typically generated by argparse, getopt, or similar libraries.
"""
from __future__ import annotations
from pygments.lexer import RegexLexer, bygroups, include
from pygments.token import Generic, Name, Operator, Punctuation, Text, Whitespace
class CLIUsageLexer(RegexLexer):
    """Lexer for CLI usage/help text (argparse, etc.).

    Highlights usage patterns including options, arguments, and
    meta-variables.

    Examples
    --------
    >>> from pygments.token import Token
    >>> lexer = CLIUsageLexer()
    >>> tokens = list(lexer.get_tokens("usage: cmd [-h]"))
    >>> tokens[0]
    (Token.Generic.Heading, 'usage:')
    >>> tokens[2]
    (Token.Name.Label, 'cmd')
    """

    name = "CLI Usage"
    aliases = ["cli-usage", "usage"]  # noqa: RUF012
    filenames: list[str] = []  # noqa: RUF012
    mimetypes = ["text/x-cli-usage"]  # noqa: RUF012

    # NOTE: rule order within each state matters. Pygments tries patterns
    # top to bottom, so the more specific forms (option-with-value) must
    # appear before their more general prefixes (bare option).
    tokens = {  # noqa: RUF012
        "root": [
            # "usage:" at start of line
            (r"^(usage:)(\s+)", bygroups(Generic.Heading, Whitespace)),  # type: ignore[no-untyped-call]
            # Continuation lines (leading whitespace for wrapped usage)
            (r"^(\s+)(?=\S)", Whitespace),
            include("inline"),
        ],
        "inline": [
            # Whitespace
            (r"\s+", Whitespace),
            # Long options with = value (e.g., --log-level=VALUE)
            (
                r"(--[a-zA-Z0-9][-a-zA-Z0-9]*)(=)([A-Z][A-Z0-9_]*|[a-z][-a-z0-9]*)",
                bygroups(Name.Tag, Operator, Name.Variable),  # type: ignore[no-untyped-call]
            ),
            # Long options standalone
            (r"--[a-zA-Z0-9][-a-zA-Z0-9]*", Name.Tag),
            # Short options with space-separated value (e.g., -S socket-path)
            (
                r"(-[a-zA-Z0-9])(\s+)([A-Z][A-Z0-9_]*|[a-z][-a-z0-9]*)",
                bygroups(Name.Attribute, Whitespace, Name.Variable),  # type: ignore[no-untyped-call]
            ),
            # Short options standalone
            (r"-[a-zA-Z0-9]", Name.Attribute),
            # UPPERCASE meta-variables (COMMAND, FILE, PATH)
            (r"\b[A-Z][A-Z0-9_]+\b", Name.Constant),
            # Opening bracket - enter optional state
            (r"\[", Punctuation, "optional"),
            # Closing bracket (fallback for unmatched)
            (r"\]", Punctuation),
            # Choice separator (pipe)
            (r"\|", Operator),
            # Parentheses for grouping
            (r"[()]", Punctuation),
            # Positional/command names (lowercase with dashes)
            (r"\b[a-z][-a-z0-9]*\b", Name.Label),
            # Catch-all for any other text
            (r"[^\s\[\]|()]+", Text),
        ],
        "optional": [
            # Nested optional bracket ("#push" re-enters this state so
            # depth is tracked and each "]" pops exactly one level)
            (r"\[", Punctuation, "#push"),
            # End optional
            (r"\]", Punctuation, "#pop"),
            # Contents use inline rules
            include("inline"),
        ],
    }
def tokenize_usage(text: str) -> list[tuple[str, str]]:
    """Run :class:`CLIUsageLexer` over *text* and flatten the result.

    Each Pygments token type is converted to its string form so callers
    do not need to import Pygments token objects.

    Parameters
    ----------
    text : str
        CLI usage text to tokenize.

    Returns
    -------
    list[tuple[str, str]]
        List of (token_type_name, text_value) tuples.

    Examples
    --------
    >>> result = tokenize_usage("usage: cmd [-h]")
    >>> result[0]
    ('Token.Generic.Heading', 'usage:')
    >>> result[2]
    ('Token.Name.Label', 'cmd')
    >>> result[4]
    ('Token.Punctuation', '[')
    >>> result[5]
    ('Token.Name.Attribute', '-h')
    >>> result[6]
    ('Token.Punctuation', ']')
    """
    pairs: list[tuple[str, str]] = []
    for token_type, value in CLIUsageLexer().get_tokens(text):
        pairs.append((str(token_type), value))
    return pairs