File: parser.py

""" String Generator pattern parser """

import re

TOKEN_MATCHERS = [
    ("CHARCODE",            r"\\([a-z])"),
    ("ONEOF",               r"\[([a-zA-Z0-9]+)\]"),
    ("RANGE",               r"\[(\w)-(\w)\]"),
    ("QUANTIFIER",          r"\{(\d+)\}"),
    ("QUANTIFIER_RANGE",    r"\{(\d*),(\d+)\}"),
    ("STRING_GROUP",        r"\(([^|]([^)]+))\)"),
    ("NUMBER",              r"\d+"),
    ("STRING",              r"[a-zA-Z]+"),
    ("LITERAL",             r"."),
]
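
# Note: the order of TOKEN_MATCHERS matters, since re.finditer tries the
# alternatives left to right. "[a-z]" is rejected by ONEOF (the "-" is not
# alphanumeric) before RANGE matches it, and LITERAL is the catch-all.
# A rough illustration of how a pattern splits into (group, value) pairs
# (the pattern below is a made-up example, not one from this package):
#
#   r"(foo|bar)-\d{2}"  ->  STRING_GROUP "(foo|bar)", LITERAL "-",
#                           CHARCODE "\d", QUANTIFIER "{2}"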

def strtype(string):
    """ Check string type

    Arguments:
        string: String to check

    Returns:
        Returns one of the following results
        * DIGIT: String is digit [0-9]
        * ALPHA: String is alpha [a-zA-Z]
        * STRING: If no other match is found, it is assumed to be a STRING type
    """
    if string.isdigit():
        return "DIGIT"
    if string.isalpha():
        return "ALPHA"
    return "STRING"

class Token: # pylint: disable=too-few-public-methods
    """ Token class
    Used by a tokenizer to store token data

    Attributes:
        group: Token group type or classification
        value: Token value
        pos: Starting position in input stream where token is found
    """

    def __init__(self, group, value, pos=0):
        self.group = group
        self.value = value
        self.pos = pos


class Tokenizer:
    """ String Generator Tokenizer

    Attributes:
        tokens: List of tokens after successful tokenization
        regex: Combined regular expression built from TOKEN_MATCHERS
    """

    def __init__(self):
        self.tokens = []
        self.regex = "|".join("(?P<%s>%s)" % matcher for matcher in TOKEN_MATCHERS)
        self._cursor = 0

    def run(self, string=None):
        """ Run tokenizer

        Tokenize `string` and store the resulting tokens in self.tokens.
        Running again resets previously stored tokens and the cursor.
        """
        self.tokens = []
        self._cursor = 0
        for match in re.finditer(self.regex, string):
            group = match.lastgroup
            value = match.group()
            self.tokens.append(Token(group, value, match.start()))

    def next(self):
        """ Get next token

        Retrieve next token and move cursor forward.

        Returns:
            A Token instance
        """
        if self._cursor >= len(self.tokens):
            return None

        token = self.tokens[self._cursor]
        self._cursor += 1
        return token

    def peek(self):
        """ Peek at the next token

        Retrieve next token without moving cursor forward.

        Returns:
            A Token instance
        """
        if self._cursor >= len(self.tokens):
            return None

        return self.tokens[self._cursor]
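
# A minimal usage sketch for the tokenizer (the pattern is a made-up
# example): after run(), next() and peek() walk the token stream in order.
#
#   tokenizer = Tokenizer()
#   tokenizer.run("[0-9]{3}")
#   tokenizer.next()   # Token(group="RANGE", value="[0-9]", pos=0)
#   tokenizer.next()   # Token(group="QUANTIFIER", value="{3}", pos=5)
#   tokenizer.next()   # None -- stream exhausted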

class SGParser:
    """ String Generator Pattern parser """

    def __init__(self, tokenizer=None):
        self.tokenizer = tokenizer

    def _repeat(self):
        """ Parse an optional quantifier

        Consume a QUANTIFIER or QUANTIFIER_RANGE token if one follows,
        otherwise default to a single repetition.
        """
        ahead = self.tokenizer.peek()
        if ahead and ahead.group == "QUANTIFIER":
            result = self.parse_QUANTIFIER(self.tokenizer.next())
        elif ahead and ahead.group == "QUANTIFIER_RANGE":
            result = self.parse_QUANTIFIER_RANGE(self.tokenizer.next())
        else:
            result = {"type": "QUANTIFIER", "value": 1}

        return result

    def parse_STRING(self, token): # pylint: disable=invalid-name, no-self-use
        """ Parse a STRING token

        Returns:
            Dictionary representing the AST node structure for string
        """
        return {"type": token.group, "value": token.value}

    def parse_RANGE(self, token): # pylint: disable=invalid-name
        """ Parse a RANGE token

        Returns:
            Dictionary representing the AST node structure for range
        """
        (start, end) = re.search(r"\[([a-zA-Z0-9])-([a-zA-Z0-9])\]", token.value).groups()
        if start > end or strtype(start) != strtype(end):
            raise ValueError("Invalid range %s at position %d" % (token.value, token.pos))

        result = {"type": token.group,
                  "value": {
                      "start": start,
                      "end": end,
                  },
                  "repeat": self._repeat()}

        return result
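
    # For instance (illustrative): parsing "[a-c]{2}" yields the node
    #   {"type": "RANGE", "value": {"start": "a", "end": "c"},
    #    "repeat": {"type": "QUANTIFIER", "value": 2}}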

    def parse_QUANTIFIER(self, token): # pylint: disable=invalid-name, no-self-use
        """ Parse a QUANTIFIER token

        Returns:
            Dictionary representing the AST node structure for quantifier
        """
        value = re.match(r"\{(\d+)\}", token.value).group(1)
        return {"type": token.group, "value": int(value)}

    def parse_CHARCODE(self, token): # pylint: disable=invalid-name
        """ Parse a CHARCODE token

        Returns:
            Dictionary representing the AST node structure for charcode
        """
        value = re.match(r"\\([a-z])", token.value).group(1)
        return {"type": token.group, "value": value, "repeat": self._repeat()}

    def parse_ONEOF(self, token): # pylint: disable=invalid-name
        """ Parse a ONEOF token

        Returns:
            Dictionary representing the AST node structure for oneof
        """
        values = re.findall("[a-zA-Z0-9]", token.value)
        return {"type": token.group, "value": values, "repeat": self._repeat()}

    def parse_QUANTIFIER_RANGE(self, token): # pylint: disable=invalid-name, no-self-use
        """ Parse a QUANTIFIER_RANGE token

        Returns:
            Dictionary representing the AST node structure for quantifier range
        """
        # The token matcher accepts an empty lower bound, e.g. {,5}, so
        # match \d* here as well and default a missing lower bound to 0.
        (start, end) = re.search(r"\{(\d*),(\d+)\}", token.value).groups()
        start = int(start) if start else 0

        if start > int(end):
            raise ValueError("Invalid range %s at position %d" % (token.value, token.pos))

        return {"type": token.group, "value": {"start": start, "end": int(end)}}

    def parse_STRING_GROUP(self, token): # pylint: disable=invalid-name
        """ Parse a STRING_GROUP token

        Returns:
            Dictionary representing the AST node structure for string group
        """
        options = re.match(r"\((.+)\)", token.value).group(1)
        values = re.split(r"\|", options)
        return {"type": token.group, "value": values, "repeat": self._repeat()}

    def parse_NUMBER(self, token): # pylint: disable=invalid-name
        """ Parse a NUMBER token

        Returns:
            Dictionary representing the AST node structure for number
        """
        return {"type": token.group, "value": int(token.value), "repeat": self._repeat()}

    def parse_LITERAL(self, token): # pylint: disable=invalid-name, no-self-use
        """ Parse a LITERAL token

        Returns:
            Dictionary representing the AST node structure for literal
        """
        return {"type": "LITERAL", "value": token.value}

    def parse(self, string):
        """ Main parser method

        Tokenizes the string and parses the resulting tokens into an AST.

        Arguments:
            string: The string to be parsed

        Returns:
            List of AST nodes (dictionaries) representing the String
            Generator pattern
        """
        ast = []
        self.tokenizer.run(string)
        token = self.tokenizer.next()
        while token:
            function_name = "parse_" + token.group
            if not hasattr(self, function_name):
                function_name = "parse_LITERAL"

            func = getattr(self, function_name)
            ast.append(func(token))

            token = self.tokenizer.next()

        return ast
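

if __name__ == "__main__":
    # Minimal end-to-end sketch; the pattern below is a made-up example,
    # not one shipped with the package. Prints one AST node dict per line.
    demo_parser = SGParser(tokenizer=Tokenizer())
    for node in demo_parser.parse(r"[a-z]{3}-\d{2}"):
        print(node)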