File: reparser.py

Package: reparser 1.4.3-4

import enum
import re


# Precompiled regex for matching named groups in regex patterns
group_regex = re.compile(r'\?P<(.+?)>')
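# For example, it matches the '?P<url>' in r'(?P<url>\S+)' and captures 'url'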


class Segment:
    """Segment of parsed text"""
    def __init__(self, text, token=None, match=None, **params):
        self.text = text
        self.params = params
        if token and match:
            self.update_text(token, match)
            self.update_params(token, match)

    def update_text(self, token, match):
        """Update text from results of regex match"""
        if isinstance(self.text, MatchGroup):
            self.text = self.text.get_group_value(token, match)

    def update_params(self, token, match):
        """Update dict of params from results of regex match"""
        for k, v in self.params.items():
            if isinstance(v, MatchGroup):
                self.params[k] = v.get_group_value(token, match)


class Token:
    """Definition of token which should be parsed from text"""
    def __init__(self, name, pattern_start, pattern_end=None, text=None, skip=False, **params):
        self.name = name
        self.group_start = '{}_start'.format(self.name)
        self.group_end = '{}_end'.format(self.name) if pattern_end else None
        self.pattern_start = self.modify_pattern(pattern_start, self.group_start)
        self.pattern_end = self.modify_pattern(pattern_end, self.group_end) if pattern_end else None
        self.text = text
        self.skip = skip
        self.params = params

    def modify_pattern(self, pattern, group):
        """Rename groups in regex pattern and enclose it in named group"""
        pattern = group_regex.sub(r'?P<{}_\1>'.format(self.name), pattern)
        return r'(?P<{}>{})'.format(group, pattern)
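        # Illustrative example: for a Token named 'url',
        #   modify_pattern(r'(?P<url>\S+)', 'url_start')
        # returns r'(?P<url_start>(?P<url_url>\S+))'.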


class MatchGroup:
    """Name of regex group which should be replaced by its value when token is parsed"""
    def __init__(self, group, func=None):
        self.group = group
        self.func = func

    def get_group_value(self, token, match):
        """Return value of regex match for the specified group"""
        try:
            value = match.group('{}_{}'.format(token.name, self.group))
        except IndexError:
            value = ''
        return self.func(value) if callable(self.func) else value
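        # Illustrative example: MatchGroup('url', func=str.lower) resolves to
        # the lower-cased text captured by the '<token>_url' group, or '' if
        # no such group exists in the match.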


class MatchType(enum.Enum):
    """Type of token matched by regex"""
    start = 1
    end = 2
    single = 3


class Parser:
    """Simple regex-based lexer/parser for inline markup"""
    def __init__(self, tokens):
        self.tokens = tokens
        self.regex = self.build_regex(tokens)
        self.groups = self.build_groups(tokens)

    def preprocess(self, text):
        """Preprocess text before parsing (should be reimplemented by subclass)"""
        return text

    def postprocess(self, text):
        """Postprocess text after parsing (should be reimplemented by subclass)"""
        return text

    def build_regex(self, tokens):
        """Build compound regex from list of tokens"""
        patterns = []
        for token in tokens:
            patterns.append(token.pattern_start)
            if token.pattern_end:
                patterns.append(token.pattern_end)
        return re.compile('|'.join(patterns), re.DOTALL)
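        # For a paired token 'bold' and a single token 'url', the compound
        # pattern looks like (illustrative):
        #   (?P<bold_start>...)|(?P<bold_end>...)|(?P<url_start>...)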

    def build_groups(self, tokens):
        """Build dict of groups from list of tokens"""
        groups = {}
        for token in tokens:
            match_type = MatchType.start if token.group_end else MatchType.single
            groups[token.group_start] = (token, match_type)
            if token.group_end:
                groups[token.group_end] = (token, MatchType.end)
        return groups
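        # The resulting mapping looks like (illustrative):
        #   {'bold_start': (bold, MatchType.start),
        #    'bold_end': (bold, MatchType.end),
        #    'url_start': (url, MatchType.single)}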

    def get_matched_token(self, match):
        """Find which token has been matched by compound regex"""
        match_groupdict = match.groupdict()
        for group, (token, match_type) in self.groups.items():
            if match_groupdict[group] is not None:
                return (token, match_type, group)

    def get_params(self, token_stack):
        """Get params from stack of tokens"""
        params = {}
        for token in token_stack:
            params.update(token.params)
        return params

    def remove_token(self, token_stack, token):
        """Remove last occurance of token from stack"""
        token_stack.reverse()
        try:
            token_stack.remove(token)
            retval = True
        except ValueError:
            retval = False
        token_stack.reverse()
        return retval

    def parse(self, text):
        """Parse text to obtain list of Segments"""
        text = self.preprocess(text)
        token_stack = []
        last_pos = 0

        # Iterate through all matched tokens
        for match in self.regex.finditer(text):
            # Find which token has been matched by regex
            token, match_type, group = self.get_matched_token(match)

            # Get params from stack of tokens
            params = self.get_params(token_stack)

            # Should we skip interpreting tokens?
            skip = token_stack[-1].skip if token_stack else False

            # Check for end token first
            if match_type == MatchType.end:
                if not skip or token_stack[-1] == token:
                    # A stray end token (nothing removed) is treated as plain text
                    skip = not self.remove_token(token_stack, token)

            if not skip:
                # Yield the text preceding the matched token
                start_pos = match.start(group)
                if start_pos > last_pos:
                    yield Segment(self.postprocess(text[last_pos:start_pos]), **params)

                # Actions specific for start token or single token
                if match_type == MatchType.start:
                    token_stack.append(token)
                elif match_type == MatchType.single:
                    single_params = params.copy()
                    single_params.update(token.params)
                    single_text = token.text if token.text is not None else match.group(group)
                    yield Segment(single_text, token=token, match=match, **single_params)

                # Move last position pointer to the end of matched token
                last_pos = match.end(group)

        # Yield any text that's left after the last match
        if last_pos < len(text):
            params = self.get_params(token_stack)
            yield Segment(self.postprocess(text[last_pos:]), **params)
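

if __name__ == '__main__':
    # Minimal usage sketch, not part of the original module: the token names,
    # patterns, and params below are illustrative assumptions. The start and
    # end patterns of a paired token must differ, because the compound regex
    # tries its alternatives left to right: here '**' opens a bold span only
    # at a word start and closes it only after a non-space character.
    demo_tokens = [
        Token('bold', r'(?:^|(?<=\s))\*\*(?=\S)', r'(?<=\S)\*\*', is_bold=True),
        # A single (unpaired) token: the inner (?P<url>...) group is renamed
        # to 'url_url' by modify_pattern and read back via MatchGroup.
        Token('url', r'(?P<url>https?://\S+)',
              text=MatchGroup('url'), link_target=MatchGroup('url')),
    ]
    demo_parser = Parser(demo_tokens)
    for segment in demo_parser.parse('Hi **there**, see https://example.com now'):
        print(repr(segment.text), segment.params)
    # Expected output:
    #   'Hi ' {}
    #   'there' {'is_bold': True}
    #   ', see ' {}
    #   'https://example.com' {'link_target': 'https://example.com'}
    #   ' now' {}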