File: test_regex.py

package info (click to toggle)
python-hypothesis 6.138.0-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 15,272 kB
  • sloc: python: 62,853; ruby: 1,107; sh: 253; makefile: 41; javascript: 6
file content (144 lines) | stat: -rw-r--r-- 4,850 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# This file is part of Hypothesis, which may be found at
# https://github.com/HypothesisWorks/hypothesis/
#
# Copyright the Hypothesis Authors.
# Individual contributors are listed in AUTHORS.rst and the git log.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.

import re
import string
import sys
from functools import reduce

import pytest

from hypothesis import assume, given, reject, strategies as st
from hypothesis.strategies._internal.regex import (
    IncompatibleWithAlphabet,
    base_regex_strategy,
)


@st.composite
def charset(draw):
    """Draw a regex character class such as ``[abc]`` or ``[^abc]``.

    The members are one or more ASCII letters and digits, so nothing inside
    the brackets ever needs escaping.
    """
    # The negation flag is drawn before the members.
    is_negated = draw(st.booleans())
    members = draw(st.text(string.ascii_letters + string.digits, min_size=1))
    prefix = "^" if is_negated else ""
    return f"[{prefix}{members}]"


# Finds two quantifier characters directly next to each other (for example
# ``+?`` or ``**``); used to filter such patterns out of generated regexes.
COMBINED_MATCHER = re.compile(
    "[?+*]{2}",
)


@st.composite
def conservative_regex(draw):
    """Draw a regex source string assembled from a small set of constructs.

    A pattern is one of: the wildcard ``.``, an escaped printable character,
    a character class from ``charset()``, or a recursively drawn pattern that
    is grouped, quantified with ``+``/``?``/``*``, or combined with up to two
    more patterns by alternation or concatenation.

    Candidates are discarded (via ``assume``) when they contain two adjacent
    quantifier characters, more than three quantifier characters in total,
    or the ``I_WITH_DOT`` character.
    """
    alternatives = (
        st.just("."),
        st.sampled_from([re.escape(c) for c in string.printable]),
        charset(),
        CONSERVATIVE_REGEX.map(lambda s: f"({s})"),
        CONSERVATIVE_REGEX.map(lambda s: f"{s}+"),
        CONSERVATIVE_REGEX.map(lambda s: f"{s}?"),
        CONSERVATIVE_REGEX.map(lambda s: f"{s}*"),
        st.lists(CONSERVATIVE_REGEX, min_size=1, max_size=3).map("|".join),
        st.lists(CONSERVATIVE_REGEX, min_size=1, max_size=3).map("".join),
    )
    result = draw(st.one_of(*alternatives))

    # No stacked quantifiers such as "+*" or "??".
    stacked_quantifiers = COMBINED_MATCHER.search(result)
    assume(stacked_quantifiers is None)

    # Cap the number of quantifier characters; this is a plain character
    # count, so escaped literals like "\+" are included in it as well.
    quantifier_chars = sum(map(result.count, "?+*"))
    assume(quantifier_chars <= 3)

    # Known to be weird under case-insensitive matching.
    assume(I_WITH_DOT not in result)
    return result


# Shared strategy instance; conservative_regex() refers back to it by name
# to build nested patterns.
CONSERVATIVE_REGEX = conservative_regex()

# Any subset of these flags, OR-ed together (an empty subset gives 0).
_FLAG_CHOICES = [re.ASCII, re.IGNORECASE, re.MULTILINE, re.DOTALL]
FLAGS = st.sets(st.sampled_from(_FLAG_CHOICES)).map(
    lambda chosen: reduce(int.__or__, chosen, 0)
)


@given(st.data())
def test_conservative_regex_are_correct_by_construction(data):
    """A string drawn for a conservative pattern must be found by that pattern."""
    # The pattern source is drawn first, then the flags.
    source = data.draw(CONSERVATIVE_REGEX)
    flags = data.draw(FLAGS)
    pattern = re.compile(source, flags=flags)

    example = data.draw(base_regex_strategy(pattern, alphabet=st.characters()))

    # Skip "capital I with dot above" and "latin small letter dotless i":
    # both have awful casefolding behaviour.
    troublesome = {"ı", "İ"}
    assume(troublesome.isdisjoint(pattern.pattern + example))

    assert pattern.search(example) is not None


@given(st.data())
def test_fuzz_stuff(data):
    """Fuzz ``from_regex`` with short text, bytes and conservative patterns.

    Patterns that fail to compile are rejected. If the strategy reports
    ``IncompatibleWithAlphabet`` for a ``str`` pattern compiled with
    ``re.ASCII``, the pattern must be non-ASCII and the example is drawn
    again with that flag removed; any other occurrence is re-raised.
    """
    pattern = data.draw(
        st.text(min_size=1, max_size=5)
        | st.binary(min_size=1, max_size=5)
        | CONSERVATIVE_REGEX.filter(bool)
    )
    flags = data.draw(FLAGS)

    try:
        compiled = re.compile(pattern, flags=flags)
    except (re.error, FutureWarning):
        # Possible nested sets, e.g. "[[", trigger a FutureWarning
        reject()

    try:
        example = data.draw(st.from_regex(compiled))
    except IncompatibleWithAlphabet:
        is_ascii_str_pattern = isinstance(pattern, str) and flags & re.ASCII
        if not is_ascii_str_pattern:
            raise
        # The only accepted cause: the pattern text itself is not ASCII.
        with pytest.raises(UnicodeEncodeError):
            pattern.encode("ascii")
        # re.ASCII is known to be set here, so XOR clears exactly that bit.
        compiled = re.compile(pattern, flags=flags ^ re.ASCII)
        example = data.draw(st.from_regex(compiled))

    assert compiled.search(example)


@pytest.mark.skipif(sys.version_info[:2] < (3, 11), reason="new syntax")
@given(st.data())
def test_regex_atomic_group(data):
    """Examples drawn for an atomic-group pattern are found by that pattern."""
    atomic_pattern = "a(?>bc|b)c"
    example = data.draw(st.from_regex(atomic_pattern))
    found = re.search(atomic_pattern, example)
    assert found


@pytest.mark.skipif(sys.version_info[:2] < (3, 11), reason="new syntax")
@given(st.data())
def test_regex_possessive(data):
    """Examples drawn for a possessive-quantifier pattern are found by it."""
    possessive_pattern = '"[^"]*+"'
    example = data.draw(st.from_regex(possessive_pattern))
    found = re.search(possessive_pattern, example)
    assert found


# Groundwork for the tests below. Swapping the case of this one character
# produces a string of length two, and a case-insensitive pattern made from
# the original character still matches at the start of that string.
I_WITH_DOT = "\u0130"
assert I_WITH_DOT.swapcase() == "i\u0307"  # two characters, not one!
assert re.match(I_WITH_DOT, I_WITH_DOT.swapcase(), flags=re.IGNORECASE)


@given(st.data())
def test_case_insensitive_not_literal_never_constructs_multichar_match(data):
    """Individually allowed characters must never assemble a non-match.

    Draws five full matches of a case-insensitive "anything but I_WITH_DOT"
    pattern and checks each one against the pattern.
    """
    pattern = re.compile(f"[^{I_WITH_DOT}]+", flags=re.IGNORECASE)
    strategy = st.from_regex(pattern, fullmatch=True)
    # The characters of the swapped-case form are the same for every draw.
    forbidden = set(I_WITH_DOT.swapcase())

    for _ in range(5):
        drawn = data.draw(strategy)
        assert pattern.fullmatch(drawn) is not None
        # Stronger property, to be on the safe side: none of the characters
        # of the swapped-case form appears in the drawn string at all.
        assert forbidden.isdisjoint(drawn)


@given(
    st.from_regex(
        re.compile(f"[^{I_WITH_DOT}_]", re.IGNORECASE),
        fullmatch=True,
    )
)
def test_no_error_converting_negated_sets_to_strategy(s):
    """Building and drawing from the strategy is the whole test.

    CharactersBuilder no longer triggers an internal error converting sets
    or negated sets to a strategy when multi-char strings are whitelisted.
    """