1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127
|
"""
Inline tokenizer for mistletoe.
"""
import html
import re
# Replacement for html._charref which matches only character references
# ending with ';', according to the CommonMark spec. Three alternatives are
# accepted after the leading '&': decimal ('#' + 1-7 digits), hexadecimal
# ('#x'/'#X' + 1-6 hex digits), and a name of 1-32 characters that contains
# no whitespace, '<', '&', '#' or ';'.
_markdown_charref = re.compile(r'&(#[0-9]{1,7};'
r'|#[xX][0-9a-fA-F]{1,6};'
r'|[^\t\n\f <&#;]{1,32};)')
# The standard library's own pattern, captured at import time so that
# tokenize() can put it back after temporarily installing _markdown_charref.
_stdlib_charref = html._charref
def tokenize(string, token_types):
    """
    Split ``string`` into a list of inline tokens.

    Arguments:
        string: the text to tokenize.
        token_types: an iterable of token classes; the last element is the
                     fallback token used for text not claimed by any other
                     token type, the preceding ones are searched for matches.

    Returns the list produced by make_tokens() for the whole string.

    While this function runs, html._charref is replaced with the stricter
    CommonMark pattern; the standard library pattern is always restored on
    exit, even when an exception propagates.
    """
    try:
        html._charref = _markdown_charref
        *span_types, fallback_token = token_types
        candidates = find_tokens(string, span_types, fallback_token)
        accepted = []
        if candidates:
            # Walk the sorted candidates pairwise: ``current`` is the token
            # still being decided; eval_tokens() either keeps it, swaps it,
            # or flushes it into ``accepted``.
            remaining = iter(candidates)
            current = next(remaining)
            for candidate in remaining:
                current = eval_tokens(current, candidate, accepted)
            accepted.append(current)
        return make_tokens(accepted, 0, len(string), string, fallback_token)
    finally:
        html._charref = _stdlib_charref
def find_tokens(string, token_types, fallback_token):
    """
    Collect every match of every token type in ``string``.

    Each match yielded by ``token_type.find(string)`` is wrapped in a
    ParseToken. The returned list is ordered by ParseToken comparison
    (start offset); the sort is stable, so tokens starting at the same
    offset keep the order of ``token_types``.
    """
    found = [
        ParseToken(m.start(), m.end(), m, string, token_type, fallback_token)
        for token_type in token_types
        for m in token_type.find(string)
    ]
    found.sort()
    return found
def eval_tokens(x, y, token_buffer):
    """
    Decide what to do with two consecutive candidate tokens.

    ``x`` is the token currently being held, ``y`` the next candidate.
    Returns the token that should be held next:

    * x precedes y: x is final, so it is pushed onto ``token_buffer`` and
      y becomes the held token;
    * x contains y: y is offered to x as a child and x stays held;
    * x intersects y: the one with the higher class precedence wins
      (x wins ties);
    * otherwise y is ignored and x stays held.
    """
    outcome = relation(x, y)
    if outcome == 0:
        token_buffer.append(x)
        return y
    if outcome == 2:
        x.append_child(y)
    elif outcome == 1 and x.cls.precedence < y.cls.precedence:
        return y
    return x
def eval_new_child(parent, child):
    """
    Fit ``child`` into a parent that already has at least one child.

    The candidate is compared against the parent's most recent child:

    * most recent child precedes it: the candidate is appended;
    * most recent child contains it: the candidate is passed down to that
      child;
    * they intersect: the candidate replaces the most recent child only
      when its class precedence is strictly higher;
    * in every other case the candidate is dropped.
    """
    siblings = parent.children
    newest = siblings[-1]
    outcome = relation(newest, child)
    if outcome == 0:
        siblings.append(child)
    elif outcome == 2:
        newest.append_child(child)
    elif outcome == 1 and newest.cls.precedence < child.cls.precedence:
        siblings[-1] = child
def relation(x, y):
    """
    Classify how token ``y`` is positioned relative to token ``x``.

    Both arguments need ``start``/``end`` attributes; ``x`` additionally
    needs ``parse_start``/``parse_end``.

    Returns:
        0 -- x ends at or before the start of y (x precedes y);
        1 -- x and y intersect;
        2 -- the parse span of x fully contains y;
        3 -- y starts at or after the end of x's parse span while still
             lying within x's full span (y should be ignored).
    """
    if x.end <= y.start:
        return 0
    if x.end < y.end:
        # y runs past the end of x: a plain overlap.
        return 1
    # From here on y ends within the full span of x.
    if x.parse_start <= y.start and y.end <= x.parse_end:
        return 2
    return 3 if x.parse_end <= y.start else 1
def make_tokens(tokens, start, end, string, fallback_token):
    """
    Build the final token list for the slice ``string[start:end]``.

    Arguments:
        tokens: parse tokens ordered by position; each needs ``start`` and
                ``end`` attributes and a ``make()`` method.
        start, end: bounds of the region of ``string`` being rendered.
        string: the full source text.
        fallback_token: callable applied to every stretch of text that is
                        not covered by an entry of ``tokens`` (after HTML
                        character references have been unescaped).

    Returns a list that interleaves fallback tokens and the results of
    ``token.make()``, in source order. Any ``None`` produced by either
    ``fallback_token`` or ``token.make()`` is left out -- including for the
    text that trails the last token, which previously was appended
    unconditionally and could put ``None`` into the result.
    """
    result = []
    prev_end = start
    for token in tokens:
        # Text between the previous token and this one.
        if token.start > prev_end:
            gap = fallback_token(html.unescape(string[prev_end:token.start]))
            if gap is not None:
                result.append(gap)
        made = token.make()
        if made is not None:
            result.append(made)
        prev_end = token.end
    # Text after the last token (or the whole region when there are none).
    if prev_end != end:
        tail = fallback_token(html.unescape(string[prev_end:end]))
        if tail is not None:
            result.append(tail)
    return result
class ParseToken:
    """
    A candidate inline token found in the source string.

    Attributes:
        start, end: span of the whole match.
        parse_start, parse_end: span of the match group named by
                                ``cls.parse_group``; children must lie
                                inside it.
        match: the match object the token was created from.
        string: the full source text.
        cls: the token class to instantiate in make().
        fallback_token: callable used for text between children.
        children: nested ParseToken instances, in source order.
    """

    def __init__(self, start, end, match, string, cls, fallback_token):
        self.start, self.end = start, end
        inner_group = cls.parse_group
        self.parse_start = match.start(inner_group)
        self.parse_end = match.end(inner_group)
        self.match = match
        self.string = string
        self.cls = cls
        self.fallback_token = fallback_token
        self.children = []

    def append_child(self, child):
        """Offer ``child`` as a nested token; ignored unless the class parses its inner content."""
        if not self.cls.parse_inner:
            return
        if self.children:
            # Later children have to be reconciled with the previous one.
            eval_new_child(self, child)
        else:
            self.children.append(child)

    def make(self):
        """Instantiate ``cls`` from the match, attaching built children when the class parses its inner content."""
        if self.cls.parse_inner:
            children = make_tokens(
                self.children,
                self.parse_start,
                self.parse_end,
                self.string,
                self.fallback_token,
            )
            token = self.cls(self.match)
            token.children = children
            return token
        return self.cls(self.match)

    def __lt__(self, other):
        """Order parse tokens by their start offset."""
        return self.start < other.start

    def __repr__(self):
        fields = (
            self.start,
            self.end,
            self.parse_start,
            self.parse_end,
            repr(self.cls.__name__),
            self.children,
        )
        return '<ParseToken span=({},{}) parse_span=({},{}) cls={} children={}>'.format(*fields)
|