1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
|
import re
def split_dollars(text):
    r"""
    Break ``text`` into an ordered list of plain-text and math pieces.

    The result is a list of ``(kind, content)`` tuples, where ``kind`` is
    either ``"text"`` or ``"math"``. Math content is returned without its
    enclosing dollar signs, and pieces that end up empty are left out.

    Example:

    >>> split_dollars(r"The functions $\sin(x)$ and $\cos(x)$.")
    [('text', 'The functions '), ('math', '\\sin(x)'), ('text', ' and '),
    ('math', '\\cos(x)'), ('text', '.')]

    Math is located with a regular expression search. The character right
    after the opening $ must not be a space, which avoids false positives on
    things like shell prompts::

        $ cd ~
        $ ls

    An escaped dollar (\$) is not treated as a math delimiter by that search,
    but escaped dollars are turned into plain dollars in the returned pieces.

    Math may span several lines, since it's assumed the dollars show up in
    places such as docstrings where line wrapping is desired.

    Dollar signs enclosed in a pair of curly braces are left untouched, so
    that nested math environments such as ::

        $f(n) = 0 \text{ if $n$ is prime}$

    are matched in full as a single math piece.
    """
    # A "{... $blah$ ...}" group most likely comes from a nested math
    # environment and must survive verbatim. Each such group is swapped for
    # a unique placeholder here and swapped back once splitting is done.
    stash = {}

    def _stash(match):
        token = "___XXX_REPL_%d___" % len(stash)
        stash[token] = match.group(0)
        return token

    # TODO: This will false positive if the {} are not themselves in math
    text = re.sub(r"({[^{}$]*\$[^{}$]*\$[^{}]*})", _stash, text)

    # matches $...$
    dollars = re.compile(r"(?<!\$)(?<!\\)\$([^\$ ](?:(?<=\\)\$|[^\$])*?)(?<!\\)\$")

    def _restore(fragment):
        # Unescape first, so that the brace groups put back afterwards keep
        # their original contents exactly.
        fragment = fragment.replace(r'\$', '$')
        for token, original in stash.items():
            fragment = fragment.replace(token, original)
        return fragment

    # Walk the matches, recording the text before each one and then the
    # math inside it; whatever follows the last match is trailing text.
    pieces = []
    cursor = 0
    for match in dollars.finditer(text):
        pieces.append(('text', text[cursor:match.start()]))
        pieces.append(('math', match.group(1)))
        cursor = match.end()
    pieces.append(('text', text[cursor:]))

    restored = ((kind, _restore(raw)) for kind, raw in pieces)
    return [(kind, content) for kind, content in restored if content]
|