"""Hypothesis strategies for generating Python source code, somewhat like CSmith.""" import ast import dis import sys from hypothesis import assume, strategies as st from hypothesis.extra.lark import LarkStrategy from lark import Lark from lark.indenter import Indenter # To update this grammar file, run # wget https://raw.githubusercontent.com/lark-parser/lark/master/lark/grammars/python.lark -O src/hypothesmith/python.lark if sys.version_info < (3, 9): # pragma: no cover from importlib.resources import read_text LARK_GRAMMAR = read_text("hypothesmith", "python.lark") else: # pragma: no cover # not on py38, anyway from importlib.resources import files LARK_GRAMMAR = ( files("hypothesmith").joinpath("python.lark").read_text(encoding="utf8") ) COMPILE_MODES = { "eval_input": "eval", "file_input": "exec", "stmt": "single", "simple_stmt": "single", "compound_stmt": "single", } ALLOWED_CHARS = st.characters(codec="utf-8", min_codepoint=1) class PythonIndenter(Indenter): # https://github.com/lark-parser/lark/blob/master/examples/python_parser.py NL_type = "_NEWLINE" OPEN_PAREN_types = ["LPAR", "LSQB", "LBRACE"] CLOSE_PAREN_types = ["RPAR", "RSQB", "RBRACE"] INDENT_type = "_INDENT" DEDENT_type = "_DEDENT" tab_len = 4 class GrammarStrategy(LarkStrategy): def __init__(self, grammar: Lark, start: str, auto_target: bool): explicit_strategies = { PythonIndenter.INDENT_type: st.just(" " * PythonIndenter.tab_len), PythonIndenter.DEDENT_type: st.just(""), "NAME": st.text().filter(str.isidentifier), } super().__init__(grammar, start, explicit_strategies, alphabet=ALLOWED_CHARS) self.auto_target = auto_target and start != "single_input" def do_draw(self, data): # type: ignore result = super().do_draw(data) if self.auto_target: # target larger inputs - the Hypothesis engine will do a multi-objective # hill-climbing search using these scores to generate 'better' examples. nodes = list(ast.walk(ast.parse(result))) uniq_nodes = {type(n) for n in nodes} instructions = list(dis.Bytecode(compile(result, "", "exec"))) targets = data.target_observations for value, label in [ (instructions, "(hypothesmith) instructions in bytecode"), (nodes, "(hypothesmith) total number of ast nodes"), (uniq_nodes, "(hypothesmith) number of unique ast node types"), ]: targets[label] = max(float(len(value)), targets.get(label, 0.0)) return result def draw_symbol(self, data, symbol, draw_state): # type: ignore count = len(draw_state) super().draw_symbol(data, symbol, draw_state) if symbol.name in COMPILE_MODES: try: compile( source="".join(draw_state[count:]), filename="", mode=COMPILE_MODES[symbol.name], ) except SyntaxError: # Python's grammar doesn't actually fully describe the behaviour of the # CPython parser and AST-post-processor, so we just filter out errors. assume(False) except Exception as err: # pragma: no cover # Attempting to compile almost-valid strings has triggered a wide range # of bizzare errors in CPython, especially with the new PEG parser, # and so we maintain this extra clause to ensure that we get a decent # error message out of it. if isinstance(err, SystemError) and ( sys.version_info[:3] == (3, 9, 0) or sys.version_info[:3] >= (3, 9, 8) and str(err) == "Negative size passed to PyUnicode_New" ): # We've triggered https://bugs.python.org/issue42218 - it's been # fixed upstream, so we'll treat it as if it were a SyntaxError. # Or the new https://bugs.python.org/issue45738 which makes me # wish CPython would start running proptests in CI already. assume(False) source_code = ascii("".join(draw_state[count:])) raise type(err)( f"compile({source_code}, '', " f"{COMPILE_MODES[symbol.name]!r}) " f"raised {type(err).__name__}: {str(err)}" ) from err def gen_ignore(self, data, draw_state): # type: ignore # Set a consistent 1/4 chance of generating any ignored tokens (comments, # whitespace, line-continuations) as part of this draw. # See https://github.com/HypothesisWorks/hypothesis/issues/2643 for plans # to do more sophisticated swarm testing for grammars, upstream. if data.draw( st.shared( st.sampled_from([False, True, False, False]), key="hypothesmith_gen_ignored", ) ): super().gen_ignore(data, draw_state) def from_grammar( start: str = "file_input", *, auto_target: bool = True ) -> st.SearchStrategy[str]: """Generate syntactically-valid Python source code based on the grammar. Valid values for ``start`` are ``"single_input"``, ``"file_input"``, or ``"eval_input"``; respectively a single interactive statement, a module or sequence of commands read from a file, and input for the eval() function. If ``auto_target`` is True, this strategy uses ``hypothesis.target()`` internally to drive towards larger and more complex examples. We recommend leaving this enabled, as the grammar is quite complex and only simple examples tend to be generated otherwise. .. warning:: DO NOT EXECUTE CODE GENERATED BY THIS STRATEGY. It could do literally anything that running Python code is able to do, including changing, deleting, or uploading important data. Arbitrary code can be useful, but "arbitrary code execution" can be very, very bad. """ assert start in {"single_input", "file_input", "eval_input"} assert isinstance(auto_target, bool) grammar = Lark(LARK_GRAMMAR, parser="lalr", postlex=PythonIndenter(), start=start) return GrammarStrategy(grammar, start, auto_target)