File: latex2mathml.py

package info (click to toggle)
python-docutils 0.19%2Bdfsg-6
links: PTS, VCS
area: main
in suites: bookworm
size: 9,668 kB
sloc: python: 45,630; lisp: 14,475; xml: 1,789; javascript: 1,032; sh: 130; makefile: 104
file content (1430 lines) | stat: -rw-r--r-- 51,043 bytes
#!/usr/bin/env python3
# :Id: $Id: latex2mathml.py 9029 2022-03-05 23:27:50Z milde $
# :Copyright: © 2005 Jens Jørgen Mortensen [1]_
#             © 2010, 2021 Günter Milde.
#
# :License: Released under the terms of the `2-Clause BSD license`_, in short:
#
#    Copying and distribution of this file, with or without modification,
#    are permitted in any medium without royalty provided the copyright
#    notice and this notice are preserved.
#    This file is offered as-is, without any warranty.
#
# .. _2-Clause BSD license: https://opensource.org/licenses/BSD-2-Clause
#
# .. [1] the original `rst2mathml.py` in `sandbox/jensj/latex_math`

"""Convert LaTex maths code into presentational MathML.

This module is provisional:
the API is not settled and may change with any minor Docutils version.
"""

# Usage:
#
# >>> from latex2mathml import *

import re
import unicodedata

from docutils.utils.math import tex2unichar, toplevel_code


# Character data
# --------------

# LaTeX math macro to Unicode mappings.
# Character categories.

# identifiers -> <mi>

letters = tex2unichar.mathalpha
letters['hbar'] = '\u210F'  # compatibility mapping to ℏ (\hslash).
# (ħ LATIN SMALL LETTER H WITH STROKE is upright)

# special case: Capital Greek letters: (upright in TeX style)
greek_capitals = {
    'Phi': '\u03a6', 'Xi': '\u039e', 'Sigma': '\u03a3',
    'Psi': '\u03a8', 'Delta': '\u0394', 'Theta': '\u0398',
    'Upsilon': '\u03d2', 'Pi': '\u03a0', 'Omega': '\u03a9',
    'Gamma': '\u0393', 'Lambda': '\u039b'}

# functions -> <mi>
functions = {# functions with a space in the name
             'liminf': 'lim\u202finf',
             'limsup': 'lim\u202fsup',
             'injlim': 'inj\u202flim',
             'projlim': 'proj\u202flim',
             # embellished function names (see handle_cmd() below)
             'varlimsup': 'lim',
             'varliminf': 'lim',
             'varprojlim': 'lim',
             'varinjlim': 'lim',
             # custom function name
             'operatorname': None,
            }
functions.update((name, name) for name in
                 ('arccos', 'arcsin', 'arctan', 'arg',  'cos',
                  'cosh',   'cot',    'coth',   'csc',  'deg',
                  'det',    'dim',    'exp',    'gcd',  'hom',
                  'ker',    'lg',     'ln',     'log',  'Pr',
                  'sec',    'sin',    'sinh',   'tan',  'tanh'))
# Function with limits: 'lim', 'sup', 'inf', 'max', 'min':
# use <mo> to allow "movablelimits" attribute (see below).


# math font selection -> <mi mathvariant=...> or <mstyle mathvariant=...>
math_alphabets = {# 'cmdname': 'mathvariant value'  # package
                  'boldsymbol': 'bold',
                  'mathbf':     'bold',
                  'mathit':     'italic',
                  'mathtt':     'monospace',
                  'mathrm':     'normal',
                  'mathsf':     'sans-serif',
                  'mathcal':    'script',
                  'mathbfit':   'bold-italic',            # isomath
                  'mathbb':     'double-struck',          # amssymb
                  'mathfrak':   'fraktur',                # amssymb
                  'mathsfit':   'sans-serif-italic',      # isomath
                  'mathsfbfit': 'sans-serif-bold-italic', # isomath
                  'mathscr':    'script',                 # mathrsfs
                  # unsupported: bold-fraktur
                  #              bold-script
                  #              bold-sans-serif
                 }

# operator, fence, or separator -> <mo>


stretchables = {# extensible delimiters allowed in left/right cmds
                'backslash':   '\\',
                'uparrow':     '\u2191', # ↑ UPWARDS ARROW
                'downarrow':   '\u2193', # ↓ DOWNWARDS ARROW
                'updownarrow': '\u2195', # ↕ UP DOWN ARROW
                'Uparrow':     '\u21d1', # ⇑ UPWARDS DOUBLE ARROW
                'Downarrow':   '\u21d3', # ⇓ DOWNWARDS DOUBLE ARROW
                'Updownarrow': '\u21d5', # ⇕ UP DOWN DOUBLE ARROW
                'lmoustache':  '\u23b0', # ⎰ … CURLY BRACKET SECTION
                'rmoustache':  '\u23b1', # ⎱ … LEFT CURLY BRACKET SECTION
                'arrowvert':   '\u23d0', # ⏐ VERTICAL LINE EXTENSION
                'bracevert':   '\u23aa', # ⎪ CURLY BRACKET EXTENSION
                'lvert':      '|',      # left  |
                'lVert':      '\u2016', # left  ‖
                'rvert':      '|',      # right |
                'rVert':      '\u2016', # right ‖
                'Arrowvert':  '\u2016', # ‖
               }
stretchables.update(tex2unichar.mathfence)
stretchables.update(tex2unichar.mathopen)  # Braces
stretchables.update(tex2unichar.mathclose) # Braces

# >>> print(' '.join(sorted(set(stretchables.values()))))
# [ \ ] { | } ‖ ↑ ↓ ↕ ⇑ ⇓ ⇕ ⌈ ⌉ ⌊ ⌋ ⌜ ⌝ ⌞ ⌟ ⎪ ⎰ ⎱ ⏐ ⟅ ⟆ ⟦ ⟧ ⟨ ⟩ ⟮ ⟯ ⦇ ⦈

operators = {# negated symbols without pre-composed Unicode character
             'nleqq':      '\u2266\u0338', # ≦̸
             'ngeqq':      '\u2267\u0338', # ≧̸
             'nleqslant':  '\u2a7d\u0338', # ⩽̸
             'ngeqslant':  '\u2a7e\u0338', # ⩾̸
             'ngtrless':   '\u2277\u0338', # txfonts
             'nlessgtr':   '\u2276\u0338', # txfonts
             'nsubseteqq': '\u2AC5\u0338', # ⫅̸
             'nsupseteqq': '\u2AC6\u0338', # ⫆̸
             # compatibility definitions:
             'centerdot': '\u2B1D', # BLACK VERY SMALL SQUARE | mathbin
             'varnothing': '\u2300', # ⌀ DIAMETER SIGN | empty set
             'varpropto': '\u221d', # ∝ PROPORTIONAL TO | sans serif
             'triangle': '\u25B3', # WHITE UP-POINTING TRIANGLE | mathord
             'triangledown': '\u25BD', # WHITE DOWN-POINTING TRIANGLE | mathord
             # alias commands:
             'dotsb': '\u22ef', # ⋯ with binary operators/relations
             'dotsc': '\u2026', # … with commas
             'dotsi': '\u22ef', # ⋯ with integrals
             'dotsm': '\u22ef', # ⋯ multiplication dots
             'dotso': '\u2026', # … other dots
             # functions with movable limits (requires <mo>)
             'lim': 'lim',
             'sup': 'sup',
             'inf': 'inf',
             'max': 'max',
             'min': 'min',
            }
operators.update(tex2unichar.mathbin)   # Binary symbols
operators.update(tex2unichar.mathrel)   # Relation symbols, arrow symbols
operators.update(tex2unichar.mathord)   # Miscellaneous symbols
operators.update(tex2unichar.mathpunct) # Punctuation
operators.update(tex2unichar.mathop)    # Variable-sized symbols
operators.update(stretchables)


# special cases

thick_operators = {# style='font-weight: bold;'
                   'thicksim':       '\u223C', # ∼
                   'thickapprox':    '\u2248', # ≈
                  }

small_operators = {# mathsize='75%'
                   'shortmid':       '\u2223', # ∣
                   'shortparallel':  '\u2225', # ∥
                   'nshortmid':      '\u2224', # ∤
                   'nshortparallel': '\u2226', # ∦
                   'smallfrown':     '\u2322', # ⌢ FROWN
                   'smallsmile':     '\u2323', # ⌣ SMILE
                   'smallint':       '\u222b', # ∫ INTEGRAL
                  }

# Operators and functions with limits above/below in display formulas
# and in index position inline (movablelimits=True)
movablelimits = ('bigcap', 'bigcup', 'bigodot', 'bigoplus', 'bigotimes',
                 'bigsqcup', 'biguplus', 'bigvee', 'bigwedge',
                 'coprod', 'intop', 'ointop', 'prod', 'sum',
                 'lim', 'max', 'min', 'sup', 'inf')
# Depending on settings, integrals may also be in this category.
# (e.g. if "amsmath" is loaded with option "intlimits", see
#  http://mirror.ctan.org/macros/latex/required/amsmath/amsldoc.pdf)
# movablelimits.extend(('fint', 'iiiint', 'iiint', 'iint', 'int', 'oiint',
#                       'oint', 'ointctrclockwise', 'sqint',
#                       'varointclockwise',))

# horizontal space -> <mspace>

spaces = {'qquad':         '2em',       # two \quad
          'quad':          '1em',       # 18 mu
          'thickspace':    '0.2778em',  # 5mu = 5/18em
          'medspace':      '0.2222em',  # 4mu = 2/9em
          'thinspace':     '0.1667em',  # 3mu = 1/6em
          'negthinspace':  '-0.1667em', # -3mu = -1/6em
          'negmedspace':   '-0.2222em', # -4mu = -2/9em
          'negthickspace': '-0.2778em', # -5mu = -5/18em
          ' ':             '0.25em',    # inter word space
          ';':             '0.2778em',  # 5mu thickspace
          ':':             '0.2222em',  # 4mu medspace
          ',':             '0.1667em',  # 3mu thinspace
          '!':             '-0.1667em', # negthinspace
          }

# accents -> <mover stretchy="false">
accents = {# TeX:      (spacing, combining)
           'acute':    ('´', '\u0301'),
           'bar':      ('ˉ', '\u0304'),
           'breve':    ('˘', '\u0306'),
           'check':    ('ˇ', '\u030C'),
           'dot':      ('˙', '\u0307'),
           'ddot':     ('¨', '\u0308'),
           'dddot':    ('⋯', '\u20DB'),
           'grave':    ('`', '\u0300'),
           'hat':      ('ˆ', '\u0302'),
           'mathring': ('˚', '\u030A'),
           'tilde':    ('˜', '\u0303'), # tilde ~ or small tilde ˜?
           'vec':      ('→', '\u20d7'), # → too heavy, accents="false"
           # TODO: ddddot
           }

# limits etc. -> <mover> or <munder>
over = {# TeX:                  (char,     offset-correction/em)
        'overbrace':            ('\u23DE', -0.2),  # DejaVu Math -0.6
        'overleftarrow':        ('\u2190', -0.2),
        'overleftrightarrow':   ('\u2194', -0.2),
        'overline':             ('_',      -0.2),  # \u2012 does not stretch
        'overrightarrow':       ('\u2192', -0.2),
        'widehat':              ('^',      -0.5),
        'widetilde':            ('~',      -0.3),
        }
under = {'underbrace':          ('\u23DF',  0.1),  # DejaVu Math -0.7
         'underleftarrow':      ('\u2190', -0.2),
         'underleftrightarrow': ('\u2194', -0.2),
         'underline':           ('_',      -0.8),
         'underrightarrow':     ('\u2192', -0.2),
         }

# Character translations
# ----------------------
# characters with preferred alternative in mathematical use
# cf. https://www.w3.org/TR/MathML3/chapter7.html#chars.anomalous
anomalous_chars = {'-': '\u2212', # HYPHEN-MINUS -> MINUS SIGN
                   ':': '\u2236', # COLON -> RATIO
                   '~': '\u00a0', # NO-BREAK SPACE
                   }

# blackboard bold (Greek characters not working with "mathvariant" (Firefox 78)
mathbb = {'Γ': '\u213E',    # ℾ
          'Π': '\u213F',    # ℿ
          'Σ': '\u2140',    # ⅀
          'γ': '\u213D',    # ℽ
          'π': '\u213C',    # ℼ
          }

# Matrix environments
matrices = {# name:    fences
            'matrix':  ('', ''),
            'smallmatrix':  ('', ''), # smaller, see begin_environment()!
            'pmatrix': ('(', ')'),
            'bmatrix': ('[', ']'),
            'Bmatrix': ('{', '}'),
            'vmatrix': ('|', '|'),
            'Vmatrix': ('\u2016', '\u2016'), # ‖
            'cases':   ('{', ''),
            }

layout_styles = {
    'displaystyle':      {'displaystyle': True,  'scriptlevel': 0},
    'textstyle':         {'displaystyle': False, 'scriptlevel': 0},
    'scriptstyle':       {'displaystyle': False, 'scriptlevel': 1},
    'scriptscriptstyle': {'displaystyle': False, 'scriptlevel': 2},
    }
# See also https://www.w3.org/TR/MathML3/chapter3.html#presm.scriptlevel

fractions = {# name:   style_attrs, frac_attrs
             'frac':   ({}, {}),
             'cfrac':  ({'displaystyle': True,  'scriptlevel': 0,
                         'CLASS': 'cfrac'}, {}), # in LaTeX with padding
             'dfrac':  (layout_styles['displaystyle'], {}),
             'tfrac':  (layout_styles['textstyle'], {}),
             'binom':  ({}, {'linethickness': 0}),
             'dbinom': (layout_styles['displaystyle'], {'linethickness': 0}),
             'tbinom': (layout_styles['textstyle'], {'linethickness': 0}),
             }

delimiter_sizes = ['', '1.2em', '1.623em', '2.047em', '2.470em']
bigdelimiters = {'left':  0,
                 'right': 0,
                 'bigl':  1,
                 'bigr':  1,
                 'Bigl':  2,
                 'Bigr':  2,
                 'biggl': 3,
                 'biggr': 3,
                 'Biggl': 4,
                 'Biggr': 4,
                 }


# MathML element classes
# ----------------------

class math:
    """Base class for MathML elements and root of MathML trees."""

    nchildren = None
    """Expected number of children or None"""
    # cf. https://www.w3.org/TR/MathML3/chapter3.html#id.3.1.3.2
    parent = None
    """Parent node in MathML DOM tree."""
    _level = 0 # indentation level (static class variable)
    xml_entities = { # for invalid and invisible characters
                    ord('<'): '&lt;',
                    ord('>'): '&gt;',
                    ord('&'): '&amp;',
                    0x2061:   '&ApplyFunction;',
                   }
    _boolstrings = {True: 'true', False: 'false'}
    """String representation of boolean MathML attribute values."""

    html_tagname = 'span'
    """Tag name for HTML representation."""

    def __init__(self, *children, **attributes):
        """Set up node with `children` and `attributes`.

        Attributes are downcased: Use CLASS to set "class" value.
        >>> math(mn(3), CLASS='test')
        math(mn(3), class='test')
        >>> math(CLASS='test').toprettyxml()
        '<math class="test">\n</math>'

        """
        self.children = []
        self.extend(children)

        self.attributes = {}
        for key in attributes.keys():
            # Use .lower() to allow argument `CLASS` for attribute `class`
            # (Python keyword). MathML uses only lowercase attributes.
            self.attributes[key.lower()] = attributes[key]

    def __repr__(self):
        content = [repr(item) for item in getattr(self, 'children', [])]
        if hasattr(self, 'data'):
            content.append(repr(self.data))
        if isinstance(self, MathSchema) and self.switch:
            content.append('switch=True')
        content += ["%s=%r"%(k, v) for k, v in self.attributes.items()
                    if v is not None]

        return self.__class__.__name__ + '(%s)' % ', '.join(content)

    def __len__(self):
        return len(self.children)

    # emulate dictionary-like access to attributes
    # see `docutils.nodes.Element` for dict/list interface
    def __getitem__(self, key):
        return self.attributes[key]

    def __setitem__(self, key, item):
        self.attributes[key] = item

    def get(self, *args, **kwargs):
        return self.attributes.get(*args, **kwargs)

    def full(self):
        """Return boolean indicating whether children may be appended."""
        return (self.nchildren is not None
                and len(self) >= self.nchildren)

    def append(self, child):
        """Append child and return self or first non-full parent.

        If self is full, go up the tree and return first non-full node or
        `None`.
        """
        if self.full():
            raise SyntaxError('Node %s already full!' % self)
        self.children.append(child)
        child.parent = self
        if self.full():
            return self.close()
        return self

    def extend(self, children):
        for child in children:
            self.append(child)
        return self

    def close(self):
        """Close element and return first non-full parent or None."""
        parent = self.parent
        while parent is not None and parent.full():
            parent = parent.parent
        return parent

    def toprettyxml(self):
        """Return XML representation of self as string."""
        return ''.join(self._xml())

    def _xml(self, level=0):
        return ([self.xml_starttag()]
                + self._xml_body(level)
                + ['</%s>' % self.__class__.__name__])

    def xml_starttag(self):
        attrs = ('%s="%s"' % (k, str(v).replace('True', 'true').replace('False', 'false'))
                 for k, v in self.attributes.items()
                 if v is not None)
        return '<%s>' % ' '.join((self.__class__.__name__, *attrs))

    def _xml_body(self, level=0):
        xml = []
        for child in self.children:
            xml.extend(['\n', '  ' * (level+1)])
            xml.extend(child._xml(level+1))
        xml.extend(['\n', '  ' * level])
        return xml

# >>> n2 = math(mn(2))
# >>> n2
# math(mn(2))
# >>> n2.toprettyxml()
# '<math>\n  <mn>2</mn>\n</math>'
# >>> len(n2)
# 1
# >>> eq3 = math(id='eq3', display='block')
# >>> eq3
# math(display='block', id='eq3')
# >>> eq3.toprettyxml()
# '<math display="block" id="eq3">\n</math>'
# >>> len(eq3)
# 0
# >>> math(CLASS='bold').xml_starttag()
# '<math class="bold">'


class mtable(math): pass


# >>> mt = mtable(displaystyle=True)
# >>> mt
# mtable(displaystyle=True)
# >>> math(mt).toprettyxml()
# '<math>\n  <mtable displaystyle="true">\n  </mtable>\n</math>'

class mrow(math):
    """Group sub-expressions as a horizontal row."""

    def close(self):
        """Close element and return first non-full parent or None.

        Remove <mrow>, if it is single child and the parent infers an mrow
        or if it has only one child element.
        """
        parent = self.parent
        if isinstance(parent, MathRowSchema) and parent.nchildren == 1:
            parent.nchildren = len(parent.children)
            parent.children = self.children
            for child in self.children:
                child.parent = parent
            return parent.close()
        if len(self) == 1:
            try:
                parent.children[parent.children.index(self)] = self.children[0]
                self.children[0].parent = parent
            except (AttributeError, ValueError):
                return self.children[0]
        return super().close()

# >>> mrow(displaystyle=False)
# mrow(displaystyle=False)


# The elements <msqrt>, <mstyle>, <merror>, <mpadded>, <mphantom>, <menclose>,
# <mtd>, <mscarry>, and <math> treat their contents as a single inferred mrow
# formed from all their children.
class MathRowSchema(math):
    """Base class for elements treating content as a single inferred mrow."""


class mtr(MathRowSchema): pass
class mtd(MathRowSchema): pass


class menclose(MathRowSchema):
    nchildren = 1 # \boxed expects one argument or a group


class mphantom(MathRowSchema):
    nchildren = 1 # \phantom expects one argument or a group


class msqrt(MathRowSchema):
    nchildren = 1 # \sqrt expects one argument or a group


class mstyle(MathRowSchema):
    nchildren = 1 # \mathrm, ... expect one argument or a group


class MathToken(math):
    """Token Element: contains textual data instead of children.

    Base class for mo, mi, and mn.
    """
    nchildren = 0

    def __init__(self, data, **attributes):
        self.data = data
        super().__init__(**attributes)

    def _xml_body(self, level=0):
        return [str(self.data).translate(self.xml_entities)]


class mtext(MathToken): pass
class mi(MathToken): pass
class mo(MathToken): pass
class mn(MathToken): pass


# >>> mo('<')
# mo('<')
# >>> mo('<')._xml()
# ['<mo>', '&lt;', '</mo>']

class MathSchema(math):
    """Base class for schemata expecting 2 or more children.

    The special attribute `switch` indicates that the last two child
    elements are in reversed order and must be switched before XML-export.
    """

    nchildren = 2

    def __init__(self, *children, **kwargs):
        self.switch = kwargs.pop('switch', False)
        math.__init__(self, *children, **kwargs)

    def append(self, child):
        current_node = super().append(child)
        # normalize order if full
        if self.switch and self.full():
            self.children[-1], self.children[-2] = self.children[-2], self.children[-1]
            self.switch = False
        return current_node


class msub(MathSchema): pass
class msup(MathSchema): pass


class msubsup(MathSchema):
    nchildren = 3


# >>> msub(mi('x'), mo('-'))
# msub(mi('x'), mo('-'))
# >>> msubsup(mi('base'), mi('sub'), mi('super'))
# msubsup(mi('base'), mi('sub'), mi('super'))
# >>> msubsup(mi('base'), mi('super'), mi('sub'), switch=True)
# msubsup(mi('base'), mi('sub'), mi('super'))

class munder(msub): pass
class mover(msup): pass


# >>> munder(mi('lim'), mo('-'), accent=False)
# munder(mi('lim'), mo('-'), accent=False)
# >>> mu = munder(mo('-'), accent=False, switch=True)
# >>> mu
# munder(mo('-'), switch=True, accent=False)
# >>> mu.append(mi('lim'))
# >>> mu
# munder(mi('lim'), mo('-'), accent=False)
# >>> mu.append(mi('lim'))
# Traceback (most recent call last):
# SyntaxError: Node munder(mi('lim'), mo('-'), accent=False) already full!
# >>> munder(mo('-'), mi('lim'), accent=False, switch=True).toprettyxml()
# '<munder accent="false">\n  <mi>lim</mi>\n  <mo>-</mo>\n</munder>'

class munderover(msubsup): pass


class mroot(MathSchema):
    nchildren = 2


class mfrac(math):
    nchildren = 2


class mspace(math):
    nchildren = 0


# LaTeX to MathML translation
# ---------------------------

# auxiliary functions
# ~~~~~~~~~~~~~~~~~~~

def tex_cmdname(string):
    """Return leading TeX command name and remainder of `string`.

    >>> tex_cmdname('mymacro2') # up to first non-letter
    ('mymacro', '2')
    >>> tex_cmdname('name 2') # strip trailing whitespace
    ('name', '2')
    >>> tex_cmdname('_2') # single non-letter character
    ('_', '2')

    """
    m = re.match(r'([a-zA-Z]+) *(.*)', string)
    if m is None:
        m = re.match(r'(.?)(.*)', string)
    return m.group(1), m.group(2)


# Test:
#
# >>> tex_cmdname('name_2') # first non-letter terminates
# ('name', '_2')
# >>> tex_cmdname(' next') # leading whitespace is returned
# (' ', 'next')
# >>> tex_cmdname('1 2') # whitespace after non-letter is kept
# ('1', ' 2')
# >>> tex_cmdname('') # empty string
# ('', '')


def tex_number(string):
    """Return leading number literal and remainder of `string`.

    >>> tex_number('123.4')
    ('123.4', '')

    """
    m = re.match(r'([0-9.,]*[0-9]+)(.*)', string)
    if m is None:
        return '', string
    return m.group(1), m.group(2)


# Test:
#
# >>> tex_number(' 23.4b') # leading whitespace -> no number
# ('', ' 23.4b')
# >>> tex_number('23,400/2') # comma separator included
# ('23,400', '/2')
# >>> tex_number('23. 4/2') # trailing separator not included
# ('23', '. 4/2')
# >>> tex_number('4, 2') # trailing separator not included
# ('4', ', 2')
# >>> tex_number('1 000.4')
# ('1', ' 000.4')


def tex_token(string):
    """Return first simple TeX token and remainder of `string`.

    >>> tex_token('\\command{without argument}')
    ('\\command', '{without argument}')
    >>> tex_token('or first character')
    ('o', 'r first character')

    """
    m = re.match(r"""((?P<cmd>\\[a-zA-Z]+)\s* # TeX command, skip whitespace
                      |(?P<chcmd>\\.)          # one-character TeX command
                      |(?P<ch>.?))            # first character (or empty)
                     (?P<remainder>.*$)    # remaining part of string
                 """, string, re.VERBOSE)
    cmd, chcmd, ch, remainder = m.group('cmd', 'chcmd', 'ch', 'remainder')
    return cmd or chcmd or ch, remainder

# Test:
#
# >>> tex_token('{opening bracket of group}')
# ('{', 'opening bracket of group}')
# >>> tex_token('\\skip whitespace after macro name')
# ('\\skip', 'whitespace after macro name')
# >>> tex_token('. but not after single char')
# ('.', ' but not after single char')
# >>> tex_token('') # empty string.
# ('', '')
# >>> tex_token('\{escaped bracket')
# ('\\{', 'escaped bracket')


def tex_group(string):
    """Return first TeX group or token and remainder of `string`.

    >>> tex_group('{first group} returned without brackets')
    ('first group', ' returned without brackets')

    """
    split_index = 0
    nest_level = 0   # level of {{nested} groups}
    escape = False   # the next character is escaped (\)

    if not string.startswith('{'):
        # special case: there is no group, return first token and remainder
        return string[:1], string[1:]
    for c in string:
        split_index += 1
        if escape:
            escape = False
        elif c == '\\':
            escape = True
        elif c == '{':
            nest_level += 1
        elif c == '}':
            nest_level -= 1
        if nest_level == 0:
            break
    else:
        raise SyntaxError('Group without closing bracket')
    return string[1:split_index-1], string[split_index:]


# >>> tex_group('{} empty group')
# ('', ' empty group')
# >>> tex_group('{group with {nested} group} ')
# ('group with {nested} group', ' ')
# >>> tex_group('{group with {nested group}} at the end')
# ('group with {nested group}', ' at the end')
# >>> tex_group('{{group} {with {{complex }nesting}} constructs}')
# ('{group} {with {{complex }nesting}} constructs', '')
# >>> tex_group('{group with \\{escaped\\} brackets}')
# ('group with \\{escaped\\} brackets', '')
# >>> tex_group('{group followed by closing bracket}} from outer group')
# ('group followed by closing bracket', '} from outer group')
# >>> tex_group('No group? Return first character.')
# ('N', 'o group? Return first character.')
# >>> tex_group(' {also whitespace}')
# (' ', '{also whitespace}')


def tex_token_or_group(string):
    """Return first TeX group or token and remainder of `string`.

    >>> tex_token_or_group('\\command{without argument}')
    ('\\command', '{without argument}')
    >>> tex_token_or_group('first character')
    ('f', 'irst character')
    >>> tex_token_or_group(' also whitespace')
    (' ', 'also whitespace')
    >>> tex_token_or_group('{first group} keep rest')
    ('first group', ' keep rest')

    """
    arg, remainder = tex_token(string)
    if arg == '{':
        arg, remainder = tex_group(string.lstrip())
    return arg, remainder

# >>> tex_token_or_group('\{no group but left bracket')
# ('\\{', 'no group but left bracket')


def tex_optarg(string):
    """Return optional argument and remainder.

    >>> tex_optarg('[optional argument] returned without brackets')
    ('optional argument', ' returned without brackets')
    >>> tex_optarg('{empty string, if there is no optional arg}')
    ('', '{empty string, if there is no optional arg}')

    """
    m = re.match(r"""\s*                            # leading whitespace
                 \[(?P<optarg>(\\]|[^\[\]]|\\])*)\] # [group] without nested groups
                 (?P<remainder>.*$)
                 """, string, re.VERBOSE)
    if m is None and not string.startswith('['):
        return '', string
    try:
        return m.group('optarg'), m.group('remainder')
    except AttributeError:
        raise SyntaxError('Could not extract optional argument from %r' % string)

# Test:
# >>> tex_optarg(' [optional argument] after whitespace')
# ('optional argument', ' after whitespace')
# >>> tex_optarg('[missing right bracket')
# Traceback (most recent call last):
# SyntaxError: Could not extract optional argument from '[missing right bracket'
# >>> tex_optarg('[group with [nested group]]')
# Traceback (most recent call last):
# SyntaxError: Could not extract optional argument from '[group with [nested group]]'


def parse_latex_math(node, string):
    """Append MathML conversion of `string` to `node` and return it.

    >>> parse_latex_math(math(), r'\alpha')
    math(mi('α'))
    >>> parse_latex_math(mrow(), r'x_{n}')
    mrow(msub(mi('x'), mi('n')))

    """
    # Normalize white-space:
    string = ' '.join(string.split())
    tree = node

    while len(string) > 0:
        # Take off first character:
        c, string = string[0], string[1:]

        if c == ' ':
            continue  # whitespace is ignored in LaTeX math mode
        if c == '\\': # start of a LaTeX macro
            cmdname, string = tex_cmdname(string)
            node, string = handle_cmd(cmdname, node, string)
        elif c in "_^":
            node = handle_script_or_limit(node, c)
        elif c == '{':
            new_node = mrow()
            node.append(new_node)
            node = new_node
        elif c == '}':
            node = node.close()
        elif c == '&':
            new_node = mtd()
            node.close().append(new_node)
            node = new_node
        elif c.isalpha():
            node = node.append(mi(c))
        elif c.isdigit():
            number, string = tex_number(string)
            node = node.append(mn(c+number))
        elif c in anomalous_chars:
            # characters with a special meaning in LaTeX math mode
            # fix spacing before "unary" minus.
            attributes = {}
            if c == '-' and node.children:
                previous_node = node.children[-1]
                if (getattr(previous_node, 'data', '-') in '([='
                    or previous_node.get('class') == 'mathopen'):
                    attributes['form'] = 'prefix'
            node = node.append(mo(anomalous_chars[c], **attributes))
        elif c in "/()[]|":
            node = node.append(mo(c, stretchy=False))
        elif c in "+*=<>,.!?`';@":
            node = node.append(mo(c))
        else:
            raise SyntaxError('Unsupported character: "%s"' % c)
    return tree

# Test:

# >>> print(parse_latex_math(math(), ''))
# math()
# >>> parse_latex_math(math(), ' \\sqrt{ \\alpha}')
# math(msqrt(mi('α')))
# >>> parse_latex_math(math(), '23.4x')
# math(mn('23.4'), mi('x'))
# >>> parse_latex_math(math(), '\\sqrt 2 \\ne 3')
# math(msqrt(mn('2')), mo('≠'), mn('3'))
# >>> parse_latex_math(math(), '\\sqrt{2 + 3} < 3')
# math(msqrt(mn('2'), mo('+'), mn('3')), mo('<'), mn('3'))
# >>> parse_latex_math(math(), '\\sqrt[3]{2 + 3}')
# math(mroot(mrow(mn('2'), mo('+'), mn('3')), mn('3')))
# >>> parse_latex_math(math(), '\max_x') # function takes limits
# math(munder(mo('max', movablelimits=True), mi('x')))
# >>> parse_latex_math(math(), 'x^j_i') # ensure correct order: base, sub, sup
# math(msubsup(mi('x'), mi('i'), mi('j')))
# >>> parse_latex_math(math(), '\int^j_i') # ensure correct order
# math(msubsup(mo('∫'), mi('i'), mi('j')))
# >>> parse_latex_math(math(), 'x_{\\alpha}')
# math(msub(mi('x'), mi('α')))
# >>> parse_latex_math(math(), 'x_\\text{in}')
# math(msub(mi('x'), mtext('in')))


def handle_cmd(name, node, string): # noqa: C901 TODO make this less complex
    """Process LaTeX command `name` followed by `string`.

    Append result to `node`.
    If needed, parse `string` for command argument.
    Return new current node and remainder of `string`:

    >>> handle_cmd('hbar', math(), r' \frac')
    (math(mi('ℏ')), ' \\frac')
    >>> handle_cmd('hspace', math(), r'{1ex} (x)')
    (math(mspace(width='1ex')), ' (x)')

    """

    # Token elements
    # ==============

    # identifier  ->  <mi>

    if name in letters:
        new_node = mi(letters[name])
        if name in greek_capitals:
            # upright in "TeX style" but MathML sets them italic ("ISO style").
            # CSS styling does not change the font style in Firefox 78.
            # Use 'mathvariant="normal"'?
            new_node['class'] = 'capital-greek'
        node = node.append(new_node)
        return node, string

    if name in functions:
        # use <mi> followed by invisible function applicator character
        # (see https://www.w3.org/TR/MathML3/chapter3.html#presm.mi)
        if name == 'operatorname':
            # custom function name, e.g. ``\operatorname{abs}(x)``
            # TODO: \operatorname* -> with limits
            arg, string = tex_token_or_group(string)
            new_node = mi(arg, mathvariant='normal')
        else:
            new_node = mi(functions[name])
        # embellished function names:
        if name == 'varliminf':    # \underline\lim
            new_node = munder(new_node, mo('_'))
        elif name == 'varlimsup':  # \overline\lim
            new_node = mover(new_node, mo('¯'), accent=False)
        elif name == 'varprojlim': # \underleftarrow\lim
            new_node = munder(new_node, mo('\u2190'))
        elif name == 'varinjlim':  # \underrightarrow\lim
            new_node = munder(new_node, mo('\u2192'))

        node = node.append(new_node)
        # add ApplyFunction when appropriate (not \sin^2(x), say)
        # cf. https://www.w3.org/TR/MathML3/chapter3.html#presm.mi
        if string and string[0] not in ('^', '_'):
            node = node.append(mo('\u2061')) # &ApplyFunction;
        return node, string

    if name in math_alphabets:
        if name == 'boldsymbol':
            attributes = {'class': 'boldsymbol'}
        else:
            attributes = {'mathvariant': math_alphabets[name]}
        if name == 'mathscr':
            attributes['class'] = 'mathscr'
        # Check for single symbol (letter, name, or ⅀)
        arg, remainder = tex_token_or_group(string)
        if arg.startswith('\\'):
            # convert single letters (so the isalpha() test below works).
            # TODO: convert all LICRs in a group (\matrm{\mu\Omega})
            arg = letters.get(arg[1:], arg)
        if name == 'mathbb':
            # mathvariant="double-struck" is ignored for Greek letters
            # (tested in Firefox 78). Use literal Unicode characters.
            arg = mathbb.get(arg, arg)
        if arg.isalpha() or arg == '\u2140':
            node = node.append(mi(arg, **attributes))
            return node, remainder
        # Wrap in <style>
        style = mstyle(**attributes)
        node.append(style)
        return style, string

    # operator, fence, or separator  ->  <mo>

    if name == 'colon': # trailing punctuation, not binary relation
        node = node.append(mo(':', form='postfix', lspace='0', rspace='0.28em'))
        return node, string

    if name == 'idotsint':
        node = parse_latex_math(node, r'\int\dotsi\int')
        return node, string

    if name in thick_operators:
        node = node.append(mo(thick_operators[name], style='font-weight: bold'))
        return node, string

    if name in small_operators:
        node = node.append(mo(small_operators[name], mathsize='75%'))
        return node, string

    if name in operators:
        attributes = {}
        if name in movablelimits and string and string[0] in ' _^':
            attributes['movablelimits'] = True
        elif name in ('lvert', 'lVert'):
            attributes['class'] = 'mathopen'
        node = node.append(mo(operators[name], **attributes))
        return node, string

    if name in bigdelimiters:
        delimiter_attributes = {}
        size = delimiter_sizes[bigdelimiters[name]]
        delimiter, string = tex_token_or_group(string)
        if delimiter not in '()[]/|.':
            try:
                delimiter = stretchables[delimiter.lstrip('\\')]
            except KeyError:
                raise SyntaxError('Unsupported "\\%s" delimiter "%s"!'
                                  % (name, delimiter))
        if size:
            delimiter_attributes['maxsize'] = size
            delimiter_attributes['minsize'] = size
            delimiter_attributes['symmetric'] = True
        if name == 'left' or name.endswith('l'):
            row = mrow()
            node.append(row)
            node = row
        if delimiter != '.': # '.' stands for "empty delimiter"
            node.append(mo(delimiter, **delimiter_attributes))
        if name == 'right' or name.endswith('r'):
            node = node.close()
        return node, string

    if name == 'not':
        arg, string = tex_token(string)
        if arg == '{':
            return node, '{\\not ' + string
        if arg.startswith('\\'): # LaTeX macro
            try:
                arg = operators[arg[1:]]
            except KeyError:
                raise SyntaxError('\\not: Cannot negate: "%s"!'%arg)
        arg = unicodedata.normalize('NFC', arg+'\u0338')
        node = node.append(mo(arg))
        return node, string

    # arbitrary text (usually comments)  ->  <mtext>
    if name in ('text', 'mbox', 'textrm'):
        arg, string = tex_token_or_group(string)
        parts = arg.split('$') # extract inline math
        for i, part in enumerate(parts):
            if i % 2 == 0: # i is even
                part = re.sub('(^ | $)', '\u00a0', part)
                node = node.append(mtext(part))
            else:
                parse_latex_math(node, part)
        return node, string

    # horizontal space -> <mspace>
    if name in spaces:
        node = node.append(mspace(width='%s'%spaces[name]))
        return node, string

    if name in ('hspace', 'mspace'):
        arg, string = tex_group(string)
        if arg.endswith('m'):
            arg = '%sem' % (float(arg[:-2])/18)
        node = node.append(mspace(width='%s'%arg))
        return node, string

    if name == 'phantom':
        new_node = mphantom()
        node.append(new_node)
        return new_node, string

    if name == 'boxed':
        new_node = menclose(notation='box')
        node.append(new_node)
        return new_node, string

    # Complex elements (Layout schemata)
    # ==================================

    if name == 'sqrt':
        radix, string = tex_optarg(string)
        if radix:
            indexnode = mrow()
            new_node = mroot(indexnode, switch=True)
            parse_latex_math(indexnode, radix)
            indexnode.close()
        else:
            new_node = msqrt()
        node.append(new_node)
        return new_node, string

    if name in fractions:
        (style_atts, frac_atts) = fractions[name]
        if name == 'cfrac':
            optarg, string = tex_optarg(string)
            optargs = {'l': 'left', 'r': 'right'}
            if optarg in optargs:
                frac_atts = frac_atts.copy()
                frac_atts['numalign'] = optargs[optarg] # "numalign" is deprecated
                frac_atts['class'] = 'numalign-' + optargs[optarg]
        new_node = frac = mfrac(**frac_atts)
        if name.endswith('binom'):
            new_node = mrow(mo('('), new_node, mo(')'), CLASS='binom')
            new_node.nchildren = 3
        if style_atts:
            new_node = mstyle(new_node, **style_atts)
        node.append(new_node)
        return frac, string

    if name == '\\': # end of a row
        entry = mtd()
        new_node = mtr(entry)
        node.close().close().append(new_node)
        return entry, string

    if name in accents:
        new_node = mover(mo(accents[name][0], stretchy=False), switch=True)
        if name == 'vec':
            new_node.children[0]['accent'] = False # scale down arrow but drop i-dot
        new_node.tex_cmd = name # for HTML export
        node.append(new_node)
        return new_node, string

    if name in over:
        # set "accent" to False (otherwise dots on i and j are dropped)
        # but to True on accent node get "textstyle" (full size) symbols on top
        new_node = mover(mo(over[name][0], accent=True),
                         switch=True, accent=False)
        new_node.tex_cmd = name # for HTML export
        node.append(new_node)
        return new_node, string

    if name == 'overset':
        new_node = mover(switch=True)
        node.append(new_node)
        return new_node, string

    if name in under:
        new_node = munder(mo(under[name][0]), switch=True)
        new_node.tex_cmd = name # for HTML export
        node.append(new_node)
        return new_node, string

    if name == 'underset':
        new_node = munder(switch=True)
        node.append(new_node)
        return new_node, string

    if name in ('xleftarrow', 'xrightarrow'):
        subscript, string = tex_optarg(string)
        base = mo(operators['long'+name[1:]])
        if subscript:
            new_node = munderover(base)
            sub_node = parse_latex_math(mrow(), subscript)
            if len(sub_node) == 1:
                sub_node = sub_node.children[0]
            new_node.append(sub_node)
        else:
            new_node = mover(base)
        node.append(new_node)
        return new_node, string

    if name in layout_styles: # 'displaystyle', 'textstyle', ...
        new_node = mstyle(**layout_styles[name])
        new_node.nchildren = None
        if isinstance(node, mrow) and len(node) == 0:
            # replace node with new_node
            node.parent.children[node.parent.children.index(node)] = new_node
            new_node.parent = node.parent
        elif node.__class__.__name__ == 'math':
            node.append(new_node)
        else:
            raise SyntaxError('Declaration "\\%s" must be first command '
                              'in a group.' % name)
        return new_node, string

    if name.endswith('limits'):
        arg, remainder = tex_token(string)
        if arg in '_^': # else ignore
            string = remainder
            node = handle_script_or_limit(node, arg, limits=name)
        return node, string

    # Environments

    if name == 'begin':
        return begin_environment(node, string)

    if name == 'end':
        return end_environment(node, string)

    raise SyntaxError('Unknown LaTeX command: ' + name)

# >>> handle_cmd('left', math(), '[a\\right]')
# (mrow(mo('[')), 'a\\right]')
# >>> handle_cmd('left', math(), '. a)') # empty \left
# (mrow(), ' a)')
# >>> handle_cmd('left', math(), '\\uparrow a)') # cmd
# (mrow(mo('↑')), 'a)')
# >>> handle_cmd('not', math(), '\\equiv \\alpha)') # cmd
# (math(mo('≢')), '\\alpha)')
# >>> handle_cmd('text', math(), '{ for } i>0') # group
# (math(mtext('\xa0for\xa0')), ' i>0')
# >>> handle_cmd('text', math(), '{B}T') # group
# (math(mtext('B')), 'T')
# >>> handle_cmd('text', math(), '{number of apples}}') # group
# (math(mtext('number of apples')), '}')
# >>> handle_cmd('text', math(), 'i \\sin(x)') # single char
# (math(mtext('i')), ' \\sin(x)')
# >>> handle_cmd('sin', math(), '(\\alpha)')
# (math(mi('sin'), mo('\u2061')), '(\\alpha)')
# >>> handle_cmd('sin', math(), ' \\alpha')
# (math(mi('sin'), mo('\u2061')), ' \\alpha')
# >>> handle_cmd('operatorname', math(), '{abs}(x)')
# (math(mi('abs', mathvariant='normal'), mo('\u2061')), '(x)')
# >>> handle_cmd('mathrm', math(), '\\alpha')
# (math(mi('α', mathvariant='normal')), '')
# >>> handle_cmd('mathrm', math(), '{out} = 3')
# (math(mi('out', mathvariant='normal')), ' = 3')
# >>> handle_cmd('overline', math(), '{981}')
# (mover(mo('¯', accent=True), switch=True, accent=False), '{981}')
# >>> handle_cmd('bar', math(), '{x}')
# (mover(mo('ˉ', stretchy=False), switch=True), '{x}')
# >>> handle_cmd('xleftarrow', math(), r'[\alpha]{10}')
# (munderover(mo('⟵'), mi('α')), '{10}')
# >>> handle_cmd('xleftarrow', math(), r'[\alpha=5]{10}')
# (munderover(mo('⟵'), mrow(mi('α'), mo('='), mn('5'))), '{10}')


def handle_script_or_limit(node, c, limits=''):
    """Append script or limit element to `node`."""
    child = node.children.pop()
    if limits == 'limits':
        child['movablelimits'] = False
    elif (limits == 'movablelimits'
          or getattr(child, 'data', '') in movablelimits):
        child['movablelimits'] = True

    if c == '_':
        if isinstance(child, mover):
            new_node = munderover(*child.children, switch=True)
        elif isinstance(child, msup):
            new_node = msubsup(*child.children, switch=True)
        elif (limits in ('limits', 'movablelimits')
              or limits == '' and child.get('movablelimits', None)):
            new_node = munder(child)
        else:
            new_node = msub(child)
    elif c == '^':
        if isinstance(child, munder):
            new_node = munderover(*child.children)
        elif isinstance(child, msub):
            new_node = msubsup(*child.children)
        elif (limits in ('limits', 'movablelimits')
              or limits == '' and child.get('movablelimits', None)):
            new_node = mover(child)
        else:
            new_node = msup(child)
    node.append(new_node)
    return new_node


def begin_environment(node, string):
    name, string = tex_group(string)
    if name in matrices:
        left_delimiter = matrices[name][0]
        attributes = {}
        if left_delimiter:
            wrapper = mrow(mo(left_delimiter))
            if name == 'cases':
                wrapper = mrow(mo(left_delimiter, rspace='0.17em'))
                attributes['columnalign'] = 'left'
            node.append(wrapper)
            node = wrapper
        elif name == 'smallmatrix':
            attributes['rowspacing'] = '0.02em'
            attributes['columnspacing'] = '0.333em'
            wrapper = mstyle(scriptlevel='1')
            node.append(wrapper)
            node = wrapper
        # TODO: aligned, alignedat
        # take an optional [t], [b] or the default [c]
        entry = mtd()
        node.append(mtable(mtr(entry), **attributes))
        node = entry
    else:
        raise SyntaxError('Environment not supported!')
    return node, string


def end_environment(node, string):
    name, string = tex_group(string)
    if name in matrices:
        node = node.close().close().close() # close: mtd, mdr, mtable
        right_delimiter = matrices[name][1]
        if right_delimiter:
            node = node.append(mo(right_delimiter))
            node = node.close()
        elif name == 'cases':
            node = node.close()
    else:
        raise SyntaxError('Environment not supported!')
    return node, string


# Return the number of "equation_columns" in `code_lines`. cf. "alignat"
# in http://mirror.ctan.org/macros/latex/required/amsmath/amsldoc.pdf
def tex_equation_columns(rows):
    tabs = max(row.count('&') - row.count(r'\&') for row in rows)
    if tabs == 0:
        return 0
    return int(tabs/2 + 1)

# >>> tex_equation_columns(['a = b'])
# 0
# >>> tex_equation_columns(['a &= b'])
# 1
# >>> tex_equation_columns(['a &= b & a \in S'])
# 2
# >>> tex_equation_columns(['a &= b & c &= d'])
# 2


# Return dictionary with attributes to style an <mtable> as align environment:
def align_attributes(rows):
    atts = {'class': 'align',
            'displaystyle': True}
    tabs = max(row.count('&') - row.count(r'\&') for row in rows)
    if tabs:
        aligns = ['right', 'left'] * tabs
        spacing = ['0', '2em'] * tabs
        atts['columnalign'] = ' '.join(aligns[:tabs+1])
        atts['columnspacing'] = ' '.join(spacing[:tabs])
    return atts

# >>> align_attributes(['a = b'])
# {'class': 'align', 'displaystyle': True}
# >>> align_attributes(['a &= b'])
# {'class': 'align', 'displaystyle': True, 'columnalign': 'right left', 'columnspacing': '0'}
# >>> align_attributes(['a &= b & a \in S'])
# {'class': 'align', 'displaystyle': True, 'columnalign': 'right left right', 'columnspacing': '0 2em'}
# >>> align_attributes(['a &= b & c &= d'])
# {'class': 'align', 'displaystyle': True, 'columnalign': 'right left right left', 'columnspacing': '0 2em 0'}


def tex2mathml(tex_math, inline=True):
    """Return string with MathML code corresponding to `tex_math`.

    Set `inline` to False for displayed math.
    """
    # Set up tree
    math_tree = math(xmlns='http://www.w3.org/1998/Math/MathML')
    node = math_tree
    if not inline:
        math_tree['display'] = 'block'
        rows = toplevel_code(tex_math).split(r'\\')
        if len(rows) > 1:
            # emulate align* environment with a math table
            node = mtd()
            math_tree.append(mtable(mtr(node),
                                    **align_attributes(rows)))
    parse_latex_math(node, tex_math)
    return math_tree.toprettyxml()

# >>> print(tex2mathml('3'))
# <math xmlns="http://www.w3.org/1998/Math/MathML">
#   <mn>3</mn>
# </math>
# >>> print(tex2mathml('3', inline=False))
# <math xmlns="http://www.w3.org/1998/Math/MathML" display="block">
#   <mn>3</mn>
# </math>
# >>> print(tex2mathml(r'a & b \\ c & d', inline=False))
# <math xmlns="http://www.w3.org/1998/Math/MathML" display="block">
#   <mtable class="align" columnalign="right left" columnspacing="0" displaystyle="true">
#     <mtr>
#       <mtd>
#         <mi>a</mi>
#       </mtd>
#       <mtd>
#         <mi>b</mi>
#       </mtd>
#     </mtr>
#     <mtr>
#       <mtd>
#         <mi>c</mi>
#       </mtd>
#       <mtd>
#         <mi>d</mi>
#       </mtd>
#     </mtr>
#   </mtable>
# </math>
# >>> print(tex2mathml(r'a \\ b', inline=False))
# <math xmlns="http://www.w3.org/1998/Math/MathML" display="block">
#   <mtable class="align" displaystyle="true">
#     <mtr>
#       <mtd>
#         <mi>a</mi>
#       </mtd>
#     </mtr>
#     <mtr>
#       <mtd>
#         <mi>b</mi>
#       </mtd>
#     </mtr>
#   </mtable>
# </math>


# TODO: look up more symbols from tr25, e.g.
#
#
# Table 2.8 Using Vertical Line or Solidus Overlay
#   some of the negated forms of mathematical relations that can only be
#   encoded by using either U+0338 COMBINING LONG SOLIDUS OVERLAY or U+20D2
#   COMBINING LONG VERTICAL LINE OVERLAY . (For issues with using 0338 in
#   MathML, see Section 3.2.7, Combining Marks.
#
# Table 2.9 Variants of Mathematical Symbols using VS1?
#
# Sequence      Description
# 0030 + VS1    DIGIT ZERO - short diagonal stroke form
# 2205 + VS1    EMPTY SET - zero with long diagonal stroke overlay form
# 2229 + VS1    INTERSECTION - with serifs
# 222A + VS1    UNION - with serifs
# 2268 + VS1    LESS-THAN BUT NOT EQUAL TO - with vertical stroke
# 2269 + VS1    GREATER-THAN BUT NOT EQUAL TO - with vertical stroke
# 2272 + VS1    LESS-THAN OR EQUIVALENT TO - following the slant of the lower leg
# 2273 + VS1    GREATER-THAN OR EQUIVALENT TO - following the slant of the lower leg
# 228A + VS1    SUBSET OF WITH NOT EQUAL TO - variant with stroke through bottom members
# 228B + VS1    SUPERSET OF WITH NOT EQUAL TO - variant with stroke through bottom members
# 2293 + VS1    SQUARE CAP - with serifs
# 2294 + VS1    SQUARE CUP - with serifs
# 2295 + VS1    CIRCLED PLUS - with white rim
# 2297 + VS1    CIRCLED TIMES - with white rim
# 229C + VS1    CIRCLED EQUALS - equal sign inside and touching the circle
# 22DA + VS1    LESS-THAN slanted EQUAL TO OR GREATER-THAN
# 22DB + VS1    GREATER-THAN slanted EQUAL TO OR LESS-THAN
# 2A3C + VS1    INTERIOR PRODUCT - tall variant with narrow foot
# 2A3D + VS1    RIGHTHAND INTERIOR PRODUCT - tall variant with narrow foot
# 2A9D + VS1    SIMILAR OR LESS-THAN - following the slant of the upper leg
# 2A9E + VS1    SIMILAR OR GREATER-THAN - following the slant of the upper leg
# 2AAC + VS1    SMALLER THAN OR slanted EQUAL
# 2AAD + VS1    LARGER THAN OR slanted EQUAL
# 2ACB + VS1    SUBSET OF ABOVE NOT EQUAL TO - variant with stroke through bottom members
# 2ACC + VS1    SUPERSET OF ABOVE NOT EQUAL TO - variant with stroke through bottom members