File: 0001-Fix-the-regex-colorizer-666.patch

package info (click to toggle)
pydoctor 22.9.1-4
links: PTS, VCS
area: main
in suites: bookworm
size: 3,976 kB
sloc: python: 22,250; javascript: 2,533; ansic: 57; makefile: 21; sh: 18
file content (371 lines) | stat: -rw-r--r-- 12,698 bytes
From: tristanlatr <tristanlatr@users.noreply.github.com>
Date: Mon, 28 Nov 2022 17:12:32 -0500
Subject: Fix the regex colorizer #666

---
 pydoctor/epydoc/markup/_pyval_repr.py   |   7 +-
 pydoctor/epydoc/sre_constants36.py      | 235 ++++++++++++++++++++++++++++++++
 pydoctor/epydoc/sre_parse36.py          |   2 +-
 pydoctor/test/epydoc/test_pyval_repr.py |  40 +++++-
 4 files changed, 276 insertions(+), 8 deletions(-)
 create mode 100644 pydoctor/epydoc/sre_constants36.py

diff --git a/pydoctor/epydoc/markup/_pyval_repr.py b/pydoctor/epydoc/markup/_pyval_repr.py
index a29f7b9..bf6ab54 100644
--- a/pydoctor/epydoc/markup/_pyval_repr.py
+++ b/pydoctor/epydoc/markup/_pyval_repr.py
@@ -39,7 +39,6 @@ import re
 import ast
 import functools
 import sys
-import sre_constants
 from inspect import signature
 from typing import Any, AnyStr, Union, Callable, Dict, Iterable, Sequence, Optional, List, Tuple, cast
 
@@ -48,7 +47,7 @@ import astor.op_util
 from docutils import nodes, utils
 from twisted.web.template import Tag
 
-from pydoctor.epydoc import sre_parse36
+from pydoctor.epydoc import sre_parse36, sre_constants36 as sre_constants
 from pydoctor.epydoc.markup import DocstringLinker
 from pydoctor.epydoc.markup.restructuredtext import ParsedRstDocstring
 from pydoctor.epydoc.docutils import set_node_attributes, wbr, obj_reference
@@ -717,7 +716,7 @@ class PyvalColorizer:
             # Can raise ValueError or re.error
             # Value of type variable "AnyStr" cannot be "Union[bytes, str]": Yes it can.
             self._colorize_re_pattern_str(pat, state) #type:ignore[type-var]
-        except (ValueError, re.error) as e:
+        except Exception as e:
             # Colorize the ast.Call as any other node if the pattern parsing fails.
             state.restore(mark)
             state.warnings.append(f"Cannot colorize regular expression, error: {str(e)}")
@@ -947,7 +946,7 @@ class PyvalColorizer:
                                         state, False, groups )
                 self._output(']', self.RE_GROUP_TAG, state)
             else:
-                raise RuntimeError(f"Error colorizing regexp, unknown element :{elt}")
+                raise ValueError(f"Unsupported element :{elt}")
         if len(tree) > 1 and not noparen:
             self._output(')', self.RE_GROUP_TAG, state)
 
diff --git a/pydoctor/epydoc/sre_constants36.py b/pydoctor/epydoc/sre_constants36.py
new file mode 100644
index 0000000..32d227f
--- /dev/null
+++ b/pydoctor/epydoc/sre_constants36.py
@@ -0,0 +1,235 @@
+# Code copied from Python 3.6 - Python Software Foundation - GNU General Public License v3.0
+#
+# Secret Labs' Regular Expression Engine
+#
+# various symbols used by the regular expression engine.
+# run this script to update the _sre include files!
+#
+# Copyright (c) 1998-2001 by Secret Labs AB.  All rights reserved.
+#
+# See the sre.py file for information on usage and redistribution.
+#
+
+"""Internal support module for sre"""
+
+# update when constants are added or removed
+
+MAGIC = 20140917
+
+MAXREPEAT = 4294967295
+MAXGROUPS = 2147483647
+
+# SRE standard exception (access as sre.error)
+# should this really be here?
+
+class error(Exception):
+    """Exception raised for invalid regular expressions.
+
+    Attributes:
+
+        msg: The unformatted error message
+        pattern: The regular expression pattern
+        pos: The index in the pattern where compilation failed (may be None)
+        lineno: The line corresponding to pos (may be None)
+        colno: The column corresponding to pos (may be None)
+    """
+
+    def __init__(self, msg, pattern=None, pos=None):
+        self.msg = msg
+        self.pattern = pattern
+        self.pos = pos
+        if pattern is not None and pos is not None:
+            msg = '%s at position %d' % (msg, pos)
+            if isinstance(pattern, str):
+                newline = '\n'
+            else:
+                newline = b'\n'
+            self.lineno = pattern.count(newline, 0, pos) + 1
+            self.colno = pos - pattern.rfind(newline, 0, pos)
+            if newline in pattern:
+                msg = '%s (line %d, column %d)' % (msg, self.lineno, self.colno)
+        else:
+            self.lineno = self.colno = None
+        super().__init__(msg)
+
+
+class _NamedIntConstant(int):
+    def __new__(cls, value, name):
+        self = super(_NamedIntConstant, cls).__new__(cls, value)
+        self.name = name
+        return self
+
+    def __str__(self):
+        return self.name
+
+    __repr__ = __str__
+
+MAXREPEAT = _NamedIntConstant(MAXREPEAT, 'MAXREPEAT')
+
+def _makecodes(names):
+    names = names.strip().split()
+    items = [_NamedIntConstant(i, name) for i, name in enumerate(names)]
+    globals().update({item.name: item for item in items})
+    return items
+
+# operators
+# failure=0 success=1 (just because it looks better that way :-)
+OPCODES = _makecodes("""
+    FAILURE SUCCESS
+
+    ANY ANY_ALL
+    ASSERT ASSERT_NOT
+    AT
+    BRANCH
+    CALL
+    CATEGORY
+    CHARSET BIGCHARSET
+    GROUPREF GROUPREF_EXISTS GROUPREF_IGNORE
+    IN IN_IGNORE
+    INFO
+    JUMP
+    LITERAL LITERAL_IGNORE
+    MARK
+    MAX_UNTIL
+    MIN_UNTIL
+    NOT_LITERAL NOT_LITERAL_IGNORE
+    NEGATE
+    RANGE
+    REPEAT
+    REPEAT_ONE
+    SUBPATTERN
+    MIN_REPEAT_ONE
+    RANGE_IGNORE
+
+    MIN_REPEAT MAX_REPEAT
+""")
+del OPCODES[-2:] # remove MIN_REPEAT and MAX_REPEAT
+
+# positions
+ATCODES = _makecodes("""
+    AT_BEGINNING AT_BEGINNING_LINE AT_BEGINNING_STRING
+    AT_BOUNDARY AT_NON_BOUNDARY
+    AT_END AT_END_LINE AT_END_STRING
+    AT_LOC_BOUNDARY AT_LOC_NON_BOUNDARY
+    AT_UNI_BOUNDARY AT_UNI_NON_BOUNDARY
+""")
+
+# categories
+CHCODES = _makecodes("""
+    CATEGORY_DIGIT CATEGORY_NOT_DIGIT
+    CATEGORY_SPACE CATEGORY_NOT_SPACE
+    CATEGORY_WORD CATEGORY_NOT_WORD
+    CATEGORY_LINEBREAK CATEGORY_NOT_LINEBREAK
+    CATEGORY_LOC_WORD CATEGORY_LOC_NOT_WORD
+    CATEGORY_UNI_DIGIT CATEGORY_UNI_NOT_DIGIT
+    CATEGORY_UNI_SPACE CATEGORY_UNI_NOT_SPACE
+    CATEGORY_UNI_WORD CATEGORY_UNI_NOT_WORD
+    CATEGORY_UNI_LINEBREAK CATEGORY_UNI_NOT_LINEBREAK
+""")
+
+
+# replacement operations for "ignore case" mode
+OP_IGNORE = {
+    GROUPREF: GROUPREF_IGNORE,
+    IN: IN_IGNORE,
+    LITERAL: LITERAL_IGNORE,
+    NOT_LITERAL: NOT_LITERAL_IGNORE,
+    RANGE: RANGE_IGNORE,
+}
+
+AT_MULTILINE = {
+    AT_BEGINNING: AT_BEGINNING_LINE,
+    AT_END: AT_END_LINE
+}
+
+AT_LOCALE = {
+    AT_BOUNDARY: AT_LOC_BOUNDARY,
+    AT_NON_BOUNDARY: AT_LOC_NON_BOUNDARY
+}
+
+AT_UNICODE = {
+    AT_BOUNDARY: AT_UNI_BOUNDARY,
+    AT_NON_BOUNDARY: AT_UNI_NON_BOUNDARY
+}
+
+CH_LOCALE = {
+    CATEGORY_DIGIT: CATEGORY_DIGIT,
+    CATEGORY_NOT_DIGIT: CATEGORY_NOT_DIGIT,
+    CATEGORY_SPACE: CATEGORY_SPACE,
+    CATEGORY_NOT_SPACE: CATEGORY_NOT_SPACE,
+    CATEGORY_WORD: CATEGORY_LOC_WORD,
+    CATEGORY_NOT_WORD: CATEGORY_LOC_NOT_WORD,
+    CATEGORY_LINEBREAK: CATEGORY_LINEBREAK,
+    CATEGORY_NOT_LINEBREAK: CATEGORY_NOT_LINEBREAK
+}
+
+CH_UNICODE = {
+    CATEGORY_DIGIT: CATEGORY_UNI_DIGIT,
+    CATEGORY_NOT_DIGIT: CATEGORY_UNI_NOT_DIGIT,
+    CATEGORY_SPACE: CATEGORY_UNI_SPACE,
+    CATEGORY_NOT_SPACE: CATEGORY_UNI_NOT_SPACE,
+    CATEGORY_WORD: CATEGORY_UNI_WORD,
+    CATEGORY_NOT_WORD: CATEGORY_UNI_NOT_WORD,
+    CATEGORY_LINEBREAK: CATEGORY_UNI_LINEBREAK,
+    CATEGORY_NOT_LINEBREAK: CATEGORY_UNI_NOT_LINEBREAK
+}
+
+# flags
+SRE_FLAG_TEMPLATE = 1 # template mode (disable backtracking)
+SRE_FLAG_IGNORECASE = 2 # case insensitive
+SRE_FLAG_LOCALE = 4 # honour system locale
+SRE_FLAG_MULTILINE = 8 # treat target as multiline string
+SRE_FLAG_DOTALL = 16 # treat target as a single string
+SRE_FLAG_UNICODE = 32 # use unicode "locale"
+SRE_FLAG_VERBOSE = 64 # ignore whitespace and comments
+SRE_FLAG_DEBUG = 128 # debugging
+SRE_FLAG_ASCII = 256 # use ascii "locale"
+
+# flags for INFO primitive
+SRE_INFO_PREFIX = 1 # has prefix
+SRE_INFO_LITERAL = 2 # entire pattern is literal (given by prefix)
+SRE_INFO_CHARSET = 4 # pattern starts with character from given set
+
+if __name__ == "__main__":
+    def dump(f, d, prefix):
+        items = sorted(d)
+        for item in items:
+            f.write("#define %s_%s %d\n" % (prefix, item, item))
+    with open("sre_constants.h", "w") as f:
+        f.write("""\
+/*
+ * Secret Labs' Regular Expression Engine
+ *
+ * regular expression matching engine
+ *
+ * NOTE: This file is generated by sre_constants.py.  If you need
+ * to change anything in here, edit sre_constants.py and run it.
+ *
+ * Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
+ *
+ * See the _sre.c file for information on usage and redistribution.
+ */
+
+""")
+
+        f.write("#define SRE_MAGIC %d\n" % MAGIC)
+
+        dump(f, OPCODES, "SRE_OP")
+        dump(f, ATCODES, "SRE")
+        dump(f, CHCODES, "SRE")
+
+        f.write("#define SRE_FLAG_TEMPLATE %d\n" % SRE_FLAG_TEMPLATE)
+        f.write("#define SRE_FLAG_IGNORECASE %d\n" % SRE_FLAG_IGNORECASE)
+        f.write("#define SRE_FLAG_LOCALE %d\n" % SRE_FLAG_LOCALE)
+        f.write("#define SRE_FLAG_MULTILINE %d\n" % SRE_FLAG_MULTILINE)
+        f.write("#define SRE_FLAG_DOTALL %d\n" % SRE_FLAG_DOTALL)
+        f.write("#define SRE_FLAG_UNICODE %d\n" % SRE_FLAG_UNICODE)
+        f.write("#define SRE_FLAG_VERBOSE %d\n" % SRE_FLAG_VERBOSE)
+        f.write("#define SRE_FLAG_DEBUG %d\n" % SRE_FLAG_DEBUG)
+        f.write("#define SRE_FLAG_ASCII %d\n" % SRE_FLAG_ASCII)
+
+        f.write("#define SRE_INFO_PREFIX %d\n" % SRE_INFO_PREFIX)
+        f.write("#define SRE_INFO_LITERAL %d\n" % SRE_INFO_LITERAL)
+        f.write("#define SRE_INFO_CHARSET %d\n" % SRE_INFO_CHARSET)
+
+    print("done")
diff --git a/pydoctor/epydoc/sre_parse36.py b/pydoctor/epydoc/sre_parse36.py
index 879fd02..f3c7b81 100644
--- a/pydoctor/epydoc/sre_parse36.py
+++ b/pydoctor/epydoc/sre_parse36.py
@@ -74,7 +74,7 @@
 
 # XXX: show string offset and offending character for all errors
 
-from sre_constants import *
+from .sre_constants36 import *
 
 SPECIAL_CHARS = ".\\[{()*+?^$|"
 REPEAT_CHARS = "*+?{"
diff --git a/pydoctor/test/epydoc/test_pyval_repr.py b/pydoctor/test/epydoc/test_pyval_repr.py
index c314f12..15adc27 100644
--- a/pydoctor/test/epydoc/test_pyval_repr.py
+++ b/pydoctor/test/epydoc/test_pyval_repr.py
@@ -1223,17 +1223,30 @@ def color_re(s: Union[bytes, str],
     val = colorizer.colorize(extract_expr(ast.parse(f"re.compile({repr(s)})")))
 
     if check_roundtrip:
-
+        raw_text = ''.join(gettext(val.to_node()))
         re_begin = 13
+        raw_string = True
+
+        if raw_text[12] != 'r':
+            # the regex has failed to be colorized since we can't find the r prefix
+            # meaning the string has been rendered as plaintext instead.
+            raw_string = False
+            re_begin -= 1
+        
         if isinstance(s, bytes):
             re_begin += 1
         re_end = -2
 
-        round_trip: Union[bytes, str] = ''.join(gettext(val.to_node()))[re_begin:re_end]
+        round_trip: Union[bytes, str] = raw_text[re_begin:re_end]
         if isinstance(s, bytes):
             assert isinstance(round_trip, str)
             round_trip = bytes(round_trip, encoding='utf-8')
-        assert round_trip == s, "%s != %s" % (repr(round_trip), repr(s))
+        
+        expected = s
+        if not raw_string:
+            expected = expected.replace('\\', '\\\\')
+        
+        assert round_trip == expected, "%s != %s" % (repr(round_trip), repr(s))
     
     return flatten(val.to_stan(NotFoundLinker()))[17:-8]
 
@@ -1344,6 +1357,27 @@ def test_re_flags() -> None:
      
     assert color_re(r"(?x)This   is   verbose", False) == """r<span class="rst-variable-quote">'</span><span class="rst-re-flags">(?ux)</span>Thisisverbose<span class="rst-variable-quote">'</span>"""
 
+def test_unsupported_regex_features() -> None:
+    """
+    Because pydoctor uses the regex engine of python 3.6, it does not support the
+    latest features introduced in python 3.11 like atomic grouping and possessive quantifiers.
+
+    But still, we should not crash.
+    """
+    regexes = ['e*+e',
+        '(e?){2,4}+a',
+        r"^(\w){1,2}+$",
+        "^x{}+$",
+        r'a++',
+        r'(?:ab)++',
+        r'(?:ab){1,3}+',
+        r'(?>x++)x',
+        r'(?>a{1,3})',
+        r'(?>(?:ab){1,3})',
+        ]
+    for r in regexes:
+        color_re(r)
+
 def test_re_not_literal() -> None:
 
     assert color_re(r"[^0-9]") == """r<span class="rst-variable-quote">'</span><span class="rst-re-group">[</span><span class="rst-re-op">^</span>0<span class="rst-re-op">-</span>9<span class="rst-re-group">]</span><span class="rst-variable-quote">'</span>"""