1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245
|
#Copyright ReportLab Europe Ltd. 2000-2012
#see license.txt for license details
#history http://www.reportlab.co.uk/cgi-bin/viewcvs.cgi/public/reportlab/trunk/reportlab/tools/docco/t_parse.py
"""
Template parsing module inspired by REXX (with thanks to Donn Cave for discussion).
Template initialization has the form:
T = Template(template_string, wild_card_marker, single_char_marker,
x = regex_x, y = regex_y, ...)
Parsing has the form
([match1, match2, ..., matchn], lastindex) = T.PARSE(string)
Only the first argument is mandatory.
The resultant object efficiently parses strings that match the template_string,
giving a list of substrings that correspond to each "directive" of the template.
Template directives:
Wildcard:
The template may be initialized with a wildcard that matches any string
up to the string matching the next directive (which may not be a wild
card or single character marker) or the next literal sequence of characters
of the template. The character that represents a wildcard is specified
by the wild_card_marker parameter, which has no default.
For example, using X as the wildcard:
>>> T = Template("prefixXinteriorX", "X")
>>> T.PARSE("prefix this is before interior and this is after")
([' this is before ', ' and this is after'], 48)
>>> T = Template("<X>X<X>", "X")
>>> T.PARSE('<A HREF="index.html">go to index</A>')
(['A HREF="index.html"', 'go to index', '/A'], 36)
Obviously the character used to represent the wildcard must be distinct
from the characters used to represent literals or other directives.
Fixed length character sequences:
The template may have a marker character which indicates a fixed
length field. All adjacent instances of this marker will be matched
by a substring of the same length in the parsed string. For example:
>>> T = Template("NNN-NN-NNNN", single_char_marker="N")
>>> T.PARSE("1-2-34-5-12")
(['1-2', '34', '5-12'], 11)
>>> T.PARSE("111-22-3333")
(['111', '22', '3333'], 11)
>>> T.PARSE("1111-22-3333")
ValueError: literal not found at (3, '-')
A template may have multiple fixed length markers, which allows fixed
length fields to be adjacent, but recognized separately. For example:
>>> T = Template("MMDDYYX", "X", "MDY")
>>> T.PARSE("112489 Somebody's birthday!")
(['11', '24', '89', " Somebody's birthday!"], 27)
Regular expression markers:
The template may have markers associated with regular expressions.
The regular expressions may be either string representations or compiled patterns.
For example:
>>> T = Template("v: s i", v=id, s=str, i=int)
>>> T.PARSE("this_is_an_identifier: 'a string' 12344")
(['this_is_an_identifier', "'a string'", '12344'], 39)
>>>
Here id, str, and int are regular expression conveniences provided by
this module.
Directive markers may be mixed and matched, except that wildcards cannot precede
wildcards or single character markers.
Example:
>>> T = Template("ssnum: NNN-NN-NNNN, fn=X, ln=X, age=I, quote=Q", "X", "N", I=int, Q=str)
>>> T.PARSE("ssnum: 123-45-6789, fn=Aaron, ln=Watters, age=13, quote='do be do be do'")
(['123', '45', '6789', 'Aaron', 'Watters', '13', "'do be do be do'"], 72)
>>>
"""
import re, string
from reportlab.lib.utils import ascii_letters
#
# template parsing
#
# EG: T = Template("(NNN)NNN-NNNN X X", "X", "N")
# ([area, exch, ext, fn, ln], index) = T.PARSE("(908)949-2726 Aaron Watters")
#
class Template:
    """Pre-analysed parsing template that splits matching strings into fields.

    The template string is scanned once at construction time and turned into
    ``parse_seq``, a list of ``(indicator, data)`` pairs:

    * ``(None, text)``                 -- literal text that must appear verbatim
    * ``(wild_card_marker, None)``     -- wildcard directive
    * ``(primary_single_char, count)`` -- fixed-length field of ``count`` chars
    * ``(marker, compiled_regex)``     -- regular-expression directive

    ``PARSE`` then walks that sequence over an input string.
    """

    def __init__(self,
                 template,
                 wild_card_marker=None,
                 single_char_marker=None,
                 **marker_to_regex_dict):
        """Analyse ``template`` and build the parse sequence.

        template -- the template string.
        wild_card_marker -- optional single character standing for a wildcard.
        single_char_marker -- optional string; each character in it is a
            fixed-length field marker (a run of the same character is one field).
        marker_to_regex_dict -- keyword arguments mapping a single-character
            marker to a regular expression (pattern string or compiled pattern).

        Raises ValueError if a marker is longer than one character, if two
        wildcards are adjacent, or if a wildcard directly precedes a
        fixed-length field.
        """
        self.template = template
        self.wild_card = wild_card_marker
        self.char = single_char_marker
        # determine the set of markers for this template
        markers = list(marker_to_regex_dict)
        if wild_card_marker:
            markers.append(wild_card_marker)
        single_char_primary = None
        if single_char_marker:
            # allow multiple single char markers; all of them are recorded in
            # the parse sequence under the first one
            markers.extend(single_char_marker)
            self.char = single_char_primary = single_char_marker[0]
        self.markers = markers
        for mark in markers:
            if len(mark) > 1:
                raise ValueError("Marks must be single characters: "+repr(mark))
        # compile the regular expressions if needed
        # NOTE: the name ``str`` is rebound at module level to a compiled
        # pattern, so the text type is obtained from a literal instead.
        text_type = type("")
        self.marker_dict = marker_dict = {}
        for mark, rgex in marker_to_regex_dict.items():
            if isinstance(rgex, text_type):
                rgex = re.compile(rgex)
            marker_dict[mark] = rgex
        # determine the parse sequence
        parse_seq = []
        # dummy last char
        lastchar = None
        index = 0
        last = len(template)
        # count the number of directives encountered
        ndirectives = 0
        while index < last:
            start = index
            thischar = template[index]
            # only meaningful when a wildcard marker was supplied; without this
            # guard None == None would wrongly report a preceding wildcard at
            # the very start of the template
            after_wild = bool(wild_card_marker) and lastchar == wild_card_marker
            # is it a wildcard?
            if thischar == wild_card_marker:
                if after_wild:
                    raise ValueError("two wild cards in sequence is not allowed")
                parse_seq.append((wild_card_marker, None))
                index = index+1
                ndirectives = ndirectives+1
            # is it a sequence of single character markers?
            elif single_char_marker and thischar in single_char_marker:
                if after_wild:
                    raise ValueError("wild card cannot precede single char marker")
                while index < last and template[index] == thischar:
                    index = index+1
                parse_seq.append((single_char_primary, index-start))
                ndirectives = ndirectives+1
            # is it a literal sequence?
            elif thischar not in markers:
                while index < last and template[index] not in markers:
                    index = index+1
                parse_seq.append((None, template[start:index]))
            # otherwise it must be a re marker
            else:
                rgex = marker_dict[thischar]
                parse_seq.append((thischar, rgex))
                ndirectives = ndirectives+1
                index = index+1
            lastchar = template[index-1]
        self.parse_seq = parse_seq
        self.ndirectives = ndirectives

    def PARSE(self, s, start=0):
        """Match ``s`` against the template, beginning at index ``start``.

        Returns ``(fields, end_index)`` where ``fields`` holds one substring
        per directive, in template order, and ``end_index`` is the position
        reached after the last template element.

        Raises ValueError when a literal, a wildcard terminator or a regular
        expression cannot be matched.  Fixed-length fields are sliced without
        checking that the string is long enough.
        """
        ndirectives = self.ndirectives
        wild_card = self.wild_card
        single_char = self.char
        parse_seq = self.parse_seq
        final_index = len(parse_seq) - 1
        # make a list long enough for substitutions for directives
        result = [None] * ndirectives
        current_directive_index = 0
        currentindex = start
        # scan through the parse sequence, recognizing
        for parse_index, (indicator, data) in enumerate(parse_seq):
            # is it a literal indicator?
            if indicator is None:
                if not s.startswith(data, currentindex):
                    raise ValueError("literal not found at "+repr((currentindex, data)))
                currentindex = currentindex + len(data)
                continue
            # anything else is a directive
            if indicator == wild_card:
                if parse_index == final_index:
                    # the last element of the template matches the rest of the string
                    last = len(s)
                else:
                    # the element after a wildcard is a literal or a re
                    (nextindicator, nextdata) = parse_seq[parse_index+1]
                    if nextindicator is None:
                        # the wildcard ends where the literal begins
                        last = s.find(nextdata, currentindex)
                        if last < currentindex:
                            raise ValueError("couldn't terminate wild with lit "+repr(currentindex))
                    else:
                        # the wildcard ends where the re first matches
                        found = nextdata.search(s, currentindex)
                        if found is None:
                            raise ValueError("couldn't terminate wild with re "+repr(currentindex))
                        last = found.start()
            elif indicator == single_char:
                # data is length to eat
                last = currentindex + data
            else:
                # other directives are always regular expressions, which must
                # match exactly at the current position
                found = data.match(s, currentindex)
                if found is None:
                    raise ValueError("couldn't match re at "+repr(currentindex))
                last = found.end()
            result[current_directive_index] = s[currentindex:last]
            current_directive_index = current_directive_index+1
            currentindex = last
        # sanity check
        if current_directive_index != ndirectives:
            raise SystemError("not enough directives found?")
        return (result, currentindex)
# Regular-expression conveniences provided by this module.
# Identifier: one letter followed by any number of letters, digits or underscores.
USERNAMEREGEX = "".join(
    ["[", ascii_letters, "][", ascii_letters, string.digits, "_]*"]
)
# Single-quoted string literal containing neither a newline nor a single quote.
STRINGLITREGEX = "'[^\n']*'"
# One or more decimal digits.
SIMPLEINTREGEX = "".join(["[", string.digits, "]+"])
# Compiled forms of the three patterns above.  These module-level names
# shadow the builtins id, str and int for the rest of the module.
id, str, int = (
    re.compile(pattern)
    for pattern in (USERNAMEREGEX, STRINGLITREGEX, SIMPLEINTREGEX)
)
def test():
    """Build several sample templates and print what each one parses.

    The first four templates are stored in the module globals T, T1, T2
    and T3; the last one is kept local.
    """
    global T, T1, T2, T3

    def show(tmpl, text):
        # print the (fields, end_index) pair produced for text
        print(tmpl.PARSE(text))

    # fixed-length fields mixed with wildcards
    T = Template("(NNN)NNN-NNNN X X", "X", "N")
    show(T, "(908)949-2726 Aaron Watters")

    # regular-expression directives separated by literals
    T1 = Template("s --> s blah", s=str)
    show(T1, "' <-- a string --> ' --> 'blah blah another string blah' blah")

    # every kind of directive in one template
    T2 = Template("s --> NNNiX", "X", "N", s=str, i=int)
    show(T2, "'A STRING' --> 15964653alpha beta gamma")

    # wildcards terminated by regular expressions
    T3 = Template("XsXi", "X", "N", s=str, i=int)
    show(T3, "prefix'string'interior1234junk not parsed")

    # several distinct single character markers side by side
    birthday = Template("MMDDYYX", "X", "MDY")
    show(birthday, "122961 Somebody's birthday!")


if __name__ == "__main__":
    test()
|