Please wait. This can take some minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price only 1 $
You can buy this project and download/modify it how often you want.
docutils.utils.math.latex2mathml.py Maven / Gradle / Ivy
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# :Id: $Id: latex2mathml.py 8878 2021-11-05 11:10:44Z milde $
# :Copyright: © 2005 Jens Jørgen Mortensen [1]_
# © 2010, 2021 Günter Milde.
#
# :License: Released under the terms of the `2-Clause BSD license`_, in short:
#
# Copying and distribution of this file, with or without modification,
# are permitted in any medium without royalty provided the copyright
# notice and this notice are preserved.
# This file is offered as-is, without any warranty.
#
# .. _2-Clause BSD license: https://opensource.org/licenses/BSD-2-Clause
#
# .. [1] the original `rst2mathml.py` in `sandbox/jensj/latex_math`
"""Convert LaTex maths code into presentational MathML.
This module is provisional:
the API is not settled and may change with any minor Docutils version.
"""
# Usage:
#
# >>> from latex2mathml import *
import collections
import copy
import re
import sys
import unicodedata
if sys.version_info >= (3, 0):
unicode = str # noqa
from docutils.utils.math import tex2unichar, toplevel_code
# Character data
# --------------
# LaTeX math macro to Unicode mappings.
# Character categories.
# identifiers -> <mi>
# NOTE: `letters` is bound to the very same mapping object as
# `tex2unichar.mathalpha` (no copy is made), so the 'hbar' entry added
# below is stored in the imported mapping as well.
letters = tex2unichar.mathalpha
letters['hbar'] = u'\u210F' # compatibility mapping to ℏ (\hslash).
# (ħ LATIN SMALL LETTER H WITH STROKE is upright)
# special case: Capital Greek letters: (upright in TeX style)
# Maps macro name (without backslash) -> Unicode character.
greek_capitals = {
'Phi':u'\u03a6', 'Xi':u'\u039e', 'Sigma':u'\u03a3',
'Psi':u'\u03a8', 'Delta':u'\u0394', 'Theta':u'\u0398',
'Upsilon':u'\u03d2', 'Pi':u'\u03a0', 'Omega':u'\u03a9',
'Gamma':u'\u0393', 'Lambda':u'\u039b'}
# functions -> <mi>
# Maps macro name (without backslash) -> text shown for the function.
functions = {# functions with a space in the name
# (the two words are separated by U+202F NARROW NO-BREAK SPACE)
'liminf': u'lim\u202finf',
'limsup': u'lim\u202fsup',
'injlim': u'inj\u202flim',
'projlim': u'proj\u202flim',
# embellished function names (see handle_cmd() below)
'varlimsup': 'lim',
'varliminf': 'lim',
'varprojlim': 'lim',
'varinjlim': 'lim',
# custom function name
# (no fixed text: handle_cmd() reads the name from the macro argument)
'operatorname': None,
}
# Plain function names: the displayed text equals the macro name.
functions.update((name, name) for name in
('arccos', 'arcsin', 'arctan', 'arg', 'cos',
'cosh', 'cot', 'coth', 'csc', 'deg',
'det', 'dim', 'exp', 'gcd', 'hom',
'ker', 'lg', 'ln', 'log', 'Pr',
'sec', 'sin', 'sinh', 'tan', 'tanh'))
# Function with limits: 'lim', 'sup', 'inf', 'max', 'min':
# use <mo> to allow "movablelimits" attribute (see below).
# math font selection -> <mi mathvariant=...> or <mstyle mathvariant=...>
# Maps font-switching macro name -> value of the "mathvariant" attribute.
# ('boldsymbol' is special-cased in handle_cmd(): it sets a class instead.)
math_alphabets = {# 'cmdname': 'mathvariant value' # package
'boldsymbol': 'bold',
'mathbf': 'bold',
'mathit': 'italic',
'mathtt': 'monospace',
'mathrm': 'normal',
'mathsf': 'sans-serif',
'mathcal': 'script',
'mathbfit': 'bold-italic', # isomath
'mathbb': 'double-struck', # amssymb
'mathfrak': 'fraktur', # amssymb
'mathsfit': 'sans-serif-italic', # isomath
'mathsfbfit': 'sans-serif-bold-italic', # isomath
'mathscr': 'script', # mathrsfs
# unsupported: bold-fraktur
# bold-script
# bold-sans-serif
}
# operator, fence, or separator -> <mo>
# Maps macro name (without backslash) -> delimiter character.
stretchables = {# extensible delimiters allowed in left/right cmds
'backslash': '\\',
'uparrow': u'\u2191', # ↑ UPWARDS ARROW
'downarrow': u'\u2193', # ↓ DOWNWARDS ARROW
'updownarrow': u'\u2195', # ↕ UP DOWN ARROW
'Uparrow': u'\u21d1', # ⇑ UPWARDS DOUBLE ARROW
'Downarrow': u'\u21d3', # ⇓ DOWNWARDS DOUBLE ARROW
'Updownarrow': u'\u21d5', # ⇕ UP DOWN DOUBLE ARROW
'lmoustache': u'\u23b0', # ⎰ UPPER LEFT OR LOWER RIGHT CURLY BRACKET SECTION
'rmoustache': u'\u23b1', # ⎱ UPPER RIGHT OR LOWER LEFT CURLY BRACKET SECTION
'arrowvert': u'\u23d0', # ⏐ VERTICAL LINE EXTENSION
'bracevert': u'\u23aa', # ⎪ CURLY BRACKET EXTENSION
'lvert': u'|', # left |
'lVert': u'\u2016', # left ‖
'rvert': u'|', # right |
'rVert': u'\u2016', # right ‖
'Arrowvert': u'\u2016', # ‖
}
# Entries from the imported tables replace same-named entries above
# (dict.update() semantics); later calls win over earlier ones.
stretchables.update(tex2unichar.mathfence)
stretchables.update(tex2unichar.mathopen) # Braces
stretchables.update(tex2unichar.mathclose) # Braces
# >>> print(' '.join(sorted(set(stretchables.values()))))
# [ \ ] { | } ‖ ↑ ↓ ↕ ⇑ ⇓ ⇕ ⌈ ⌉ ⌊ ⌋ ⌜ ⌝ ⌞ ⌟ ⎪ ⎰ ⎱ ⏐ ⟅ ⟆ ⟦ ⟧ ⟨ ⟩ ⟮ ⟯ ⦇ ⦈
# Maps macro name (without backslash) -> operator character(s).
operators = {# negated symbols without pre-composed Unicode character
# (base character followed by U+0338 COMBINING LONG SOLIDUS OVERLAY)
'nleqq': u'\u2266\u0338', # ≦̸
'ngeqq': u'\u2267\u0338', # ≧̸
'nleqslant': u'\u2a7d\u0338', # ⩽̸
'ngeqslant': u'\u2a7e\u0338', # ⩾̸
'ngtrless': u'\u2277\u0338', # txfonts
'nlessgtr': u'\u2276\u0338', # txfonts
'nsubseteqq': u'\u2AC5\u0338', # ⫅̸
'nsupseteqq': u'\u2AC6\u0338', # ⫆̸
# compatibility definitions:
'centerdot': u'\u2B1D', # BLACK VERY SMALL SQUARE | mathbin
'varnothing': u'\u2300', # ⌀ DIAMETER SIGN | empty set
'varpropto': u'\u221d', # ∝ PROPORTIONAL TO | sans serif
'triangle': u'\u25B3', # WHITE UP-POINTING TRIANGLE | mathord
'triangledown': u'\u25BD', # WHITE DOWN-POINTING TRIANGLE | mathord
# alias commands:
'dotsb': u'\u22ef', # ⋯ with binary operators/relations
'dotsc': u'\u2026', # … with commas
'dotsi': u'\u22ef', # ⋯ with integrals
'dotsm': u'\u22ef', # ⋯ multiplication dots
'dotso': u'\u2026', # … other dots
# functions with movable limits (requires <mo>)
'lim': 'lim',
'sup': 'sup',
'inf': 'inf',
'max': 'max',
'min': 'min',
}
# Merge the imported symbol tables. On a name clash the table merged
# later replaces the earlier entry (dict.update() semantics).
operators.update(tex2unichar.mathbin) # Binary symbols
operators.update(tex2unichar.mathrel) # Relation symbols, arrow symbols
operators.update(tex2unichar.mathord) # Miscellaneous symbols
operators.update(tex2unichar.mathpunct) # Punctuation
operators.update(tex2unichar.mathop) # Variable-sized symbols
operators.update(stretchables)
# special cases
# Both tables map macro name (without backslash) -> operator character;
# the trailing comment on the opening line names the intended styling.
thick_operators = {# style='font-weight: bold;'
'thicksim': u'\u223C', # ∼
'thickapprox':u'\u2248', # ≈
}
small_operators = {# mathsize='75%'
'shortmid': u'\u2223', # ∣
'shortparallel': u'\u2225', # ∥
'nshortmid': u'\u2224', # ∤
'nshortparallel': u'\u2226', # ∦
'smallfrown': u'\u2322', # ⌢ FROWN
'smallsmile': u'\u2323', # ⌣ SMILE
'smallint': u'\u222b', # ∫ INTEGRAL
}
# Operators and functions with limits above/below in display formulas
# and in index position inline (movablelimits=True)
movablelimits = ('bigcap', 'bigcup', 'bigodot', 'bigoplus', 'bigotimes',
'bigsqcup', 'biguplus', 'bigvee', 'bigwedge',
'coprod', 'intop', 'ointop', 'prod', 'sum',
'lim', 'max', 'min', 'sup', 'inf')
# Depending on settings, integrals may also be in this category.
# (e.g. if "amsmath" is loaded with option "intlimits", see
# http://mirror.ctan.org/macros/latex/required/amsmath/amsldoc.pdf)
# NOTE(review): `movablelimits` is a tuple, which has no extend() method;
# the commented-out snippet below would need a list (or tuple concatenation).
# movablelimits.extend(('fint', 'iiiint', 'iiint', 'iint', 'int', 'oiint',
# 'oint', 'ointctrclockwise', 'sqint',
# 'varointclockwise',))
# horizontal space -> <mspace>
# Maps spacing macro name (without backslash) -> width as a CSS-like length.
spaces = {'qquad': '2em', # two \quad
'quad': '1em', # 18 mu
'thickspace': '0.2778em', # 5mu = 5/18em
'medspace': '0.2222em', # 4mu = 2/9em
'thinspace': '0.1667em', # 3mu = 1/6em
'negthinspace': '-0.1667em', # -3mu = -1/6em
'negmedspace': '-0.2222em', # -4mu = -2/9em
'negthickspace': '-0.2778em', # -5mu = -5/18em
' ': '0.25em', # inter word space
';': '0.2778em', # 5mu thickspace
':': '0.2222em', # 4mu medspace
',': '0.1667em', # 3mu thinspace
'!': '-0.1667em', # negthinspace
}
# accents -> <mover>
# NOTE(review): the element name after "->" was lost from this comment and
# has been reconstructed; the code using `accents` is not visible here.
accents = {# TeX: (spacing, combining)
'acute': (u'´', u'\u0301'),
'bar': (u'ˉ', u'\u0304'),
'breve': (u'˘', u'\u0306'),
'check': (u'ˇ', u'\u030C'),
'dot': (u'˙', u'\u0307'),
'ddot': (u'¨', u'\u0308'),
'dddot': (u'⋯', u'\u20DB'),
'grave': (u'`', u'\u0300'),
'hat': (u'ˆ', u'\u0302'),
'mathring': (u'˚', u'\u030A'),
'tilde': (u'˜', u'\u0303'), # tilde ~ or small tilde ˜?
'vec': (u'→', u'\u20d7'), # → too heavy, accents="false"
# TODO: ddddot
}
# limits etc. -> <mover> or <munder>
# Both tables map macro name -> (character, vertical offset correction in em).
over = {# TeX: (char, offset-correction/em)
'overbrace': (u'\u23DE', -0.2), # DejaVu Math -0.6
'overleftarrow': (u'\u2190', -0.2),
'overleftrightarrow': (u'\u2194', -0.2),
'overline': (u'_', -0.2), # \u2012' FIGURE DASH does not stretch
'overrightarrow': (u'\u2192', -0.2),
'widehat': (u'^', -0.5),
'widetilde': (u'~', -0.3),
}
under = {'underbrace': (u'\u23DF', 0.1), # DejaVu Math -0.7
'underleftarrow': (u'\u2190', -0.2),
'underleftrightarrow': (u'\u2194', -0.2),
'underline': (u'_', -0.8),
'underrightarrow': (u'\u2192', -0.2),
}
# Character translations
# ----------------------
# characters with preferred alternative in mathematical use
# cf. https://www.w3.org/TR/MathML3/chapter7.html#chars.anomalous
# Maps the character typed in the LaTeX source -> character put into <mo>
# (see parse_latex_math()).
anomalous_chars = {'-': u'\u2212', # HYPHEN-MINUS -> MINUS SIGN
':': u'\u2236', # COLON -> RATIO
'~': u'\u00a0', # NO-BREAK SPACE
}
# blackboard bold (Greek characters not working with "mathvariant" (Firefox 78)
# Maps a Greek letter -> its pre-composed double-struck character.
mathbb = {u'Γ': u'\u213E', # ℾ
u'Π': u'\u213F', # ℿ
u'Σ': u'\u2140', # ⅀
u'γ': u'\u213D', # ℽ
u'π': u'\u213C', # ℼ
}
# Matrix environments
# Maps environment name -> (left fence, right fence); '' means no fence.
matrices = {# name: fences
'matrix': ('', ''),
'smallmatrix': ('', ''), # smaller, see begin_environment()!
'pmatrix': ('(', ')'),
'bmatrix': ('[', ']'),
'Bmatrix': ('{', '}'),
'vmatrix': ('|', '|'),
'Vmatrix': (u'\u2016', u'\u2016'), # ‖
'cases': ('{', ''),
}
# Maps style macro name -> MathML attributes selecting that layout style.
layout_styles = {
'displaystyle': {'displaystyle': True, 'scriptlevel': 0},
'textstyle': {'displaystyle': False, 'scriptlevel': 0},
'scriptstyle': {'displaystyle': False, 'scriptlevel': 1},
'scriptscriptstyle': {'displaystyle': False, 'scriptlevel': 2},
}
# See also https://www.w3.org/TR/MathML3/chapter3.html#presm.scriptlevel
# NOTE: the 'dfrac', 'tfrac', 'dbinom' and 'tbinom' entries reference the
# dict objects stored in `layout_styles` directly (no copies are made).
fractions = {# name: style_attrs, frac_attrs
'frac': ({}, {}),
'cfrac': ({'displaystyle': True, 'scriptlevel': 0,
'CLASS': 'cfrac'}, {}), # in LaTeX with padding
'dfrac': (layout_styles['displaystyle'], {}),
'tfrac': (layout_styles['textstyle'], {}),
'binom': ({}, {'linethickness': 0}),
'dbinom': (layout_styles['displaystyle'], {'linethickness': 0}),
'tbinom': (layout_styles['textstyle'], {'linethickness': 0}),
}
# NOTE(review): the values of `bigdelimiters` (0..4) look like indices into
# `delimiter_sizes` (5 entries) -- the code using them is not visible here,
# confirm before relying on it.
delimiter_sizes = ['', '1.2em', '1.623em', '2.047em', '2.470em']
bigdelimiters = {'left': 0,
'right': 0,
'bigl': 1,
'bigr': 1,
'Bigl': 2,
'Bigr': 2,
'biggl': 3,
'biggr': 3,
'Biggl': 4,
'Biggr': 4,
}
# MathML element classes
# ----------------------
class math(object):
    """Base class for MathML elements and root of MathML trees."""

    nchildren = None
    """Expected number of children or None"""
    # cf. https://www.w3.org/TR/MathML3/chapter3.html#id.3.1.3.2
    parent = None
    """Parent node in MathML DOM tree."""
    _level = 0  # indentation level (static class variable)
    # Translation table (code point -> replacement text) applied to the
    # character data of token elements: XML markup characters must be
    # escaped and the invisible function applicator is written as a
    # numeric character reference so that it stays visible in the source.
    xml_entities = {  # for invalid and invisible characters
        ord('<'): u'&lt;',
        ord('>'): u'&gt;',
        ord('&'): u'&amp;',
        0x2061: u'&#x2061;',  # FUNCTION APPLICATION
    }
    _boolstrings = {True: 'true', False: 'false'}
    """String representation of boolean MathML attribute values."""
    html_tagname = 'span'
    """Tag name for HTML representation."""

    def __init__(self, *children, **attributes):
        r"""Set up node with `children` and `attributes`.

        Attributes are downcased: Use CLASS to set "class" value.

        >>> math(mn(3), CLASS='test')
        math(mn(3), class='test')
        >>> math(CLASS='test').toprettyxml()
        '<math class="test">\n</math>'
        """
        self.children = []
        # Store the attributes *before* appending children: if too many
        # children are passed, append() formats `self` for its error
        # message and __repr__() reads `self.attributes`.
        self.attributes = collections.OrderedDict()
        # sort attributes for predictable functional tests
        # as self.attributes.update(attributes) does not keep order
        # in Python < 3.6
        for key in sorted(attributes.keys()):
            # Use .lower() to allow argument `CLASS` for attribute `class`
            # (Python keyword). MathML uses only lowercase attributes.
            self.attributes[key.lower()] = attributes[key]
        self.extend(children)

    def __repr__(self):
        content = [repr(item) for item in getattr(self, 'children', [])]
        if hasattr(self, 'data'):
            content.append(repr(self.data))
        if isinstance(self, MathSchema) and self.switch:
            content.append('switch=True')
        content += ["%s=%r" % (k, v) for k, v in self.attributes.items()
                    if v is not None]
        return self.__class__.__name__ + '(%s)' % ', '.join(content)

    def __len__(self):
        """Return the number of child elements."""
        return len(self.children)

    # emulate dictionary-like access to attributes
    # see `docutils.nodes.Element` for dict/list interface
    def __getitem__(self, key):
        return self.attributes[key]

    def __setitem__(self, key, item):
        self.attributes[key] = item

    def get(self, *args, **kwargs):
        """Return an attribute value (same signature as `dict.get()`)."""
        return self.attributes.get(*args, **kwargs)

    def full(self):
        """Return boolean indicating whether children may be appended."""
        return (self.nchildren is not None
                and len(self) >= self.nchildren)

    def append(self, child):
        """Append child and return self or first non-full parent.

        If self is full, go up the tree and return first non-full node or
        `None`.

        Raise SyntaxError if the node is already full before appending.
        """
        if self.full():
            raise SyntaxError('Node %s already full!' % self)
        self.children.append(child)
        child.parent = self
        if self.full():
            return self.close()
        return self

    def extend(self, children):
        """Append all `children` one by one and return self."""
        for child in children:
            self.append(child)
        return self

    def close(self):
        """Close element and return first non-full parent or None."""
        parent = self.parent
        while parent is not None and parent.full():
            parent = parent.parent
        return parent

    def toprettyxml(self):
        """Return XML representation of self as string."""
        return ''.join(self._xml())

    def _xml(self, level=0):
        """Return list of XML fragments: start tag, body, and end tag."""
        return ([self.xml_starttag()]
                + self._xml_body(level)
                + ['</%s>' % self.__class__.__name__])

    def xml_starttag(self):
        """Return the start tag including all attributes that are not None.

        Boolean values are written as "true"/"false"; every other value
        is written unchanged.
        """
        attrs = []
        for key, value in self.attributes.items():
            if value is None:
                continue
            if isinstance(value, bool):
                # only real booleans are converted, so that strings which
                # merely contain "True"/"False" are not altered
                value = self._boolstrings[value]
            attrs.append('%s="%s"' % (key, value))
        return '<%s>' % ' '.join([self.__class__.__name__] + attrs)

    def _xml_body(self, level=0):
        """Return list of XML fragments for the children, one per line."""
        xml = []
        for child in self.children:
            xml.extend(['\n', ' ' * (level+1)])
            xml.extend(child._xml(level+1))
        xml.extend(['\n', ' ' * level])
        return xml
# >>> n2 = math(mn(2))
# >>> n2
# math(mn(2))
# >>> n2.toprettyxml()
# '<math>\n <mn>2</mn>\n</math>'
# >>> len(n2)
# 1
# >>> eq3 = math(id='eq3', display='block')
# >>> eq3
# math(display='block', id='eq3')
# >>> eq3.toprettyxml()
# '<math display="block" id="eq3">\n</math>'
# >>> len(eq3)
# 0
# >>> math(CLASS='bold').xml_starttag()
# '<math class="bold">'
class mtable(math):
    """MathML <mtable> element; adds nothing to the `math` base class."""
# >>> mt = mtable(displaystyle=True)
# >>> mt
# mtable(displaystyle=True)
# >>> math(mt).toprettyxml()
# '<math>\n <mtable displaystyle="true">\n </mtable>\n</math>'
class mrow(math):
    """Horizontal row grouping several sub-expressions."""

    def close(self):
        """Close the row and return the first non-full ancestor or None.

        The <mrow> wrapper itself is dropped in two situations:

        * the parent is a `MathRowSchema` expecting one child (it treats
          its content as an inferred mrow anyway): the parent adopts the
          row's children;
        * the row holds exactly one child: the child takes the row's
          place in the parent (or is returned, if there is no parent or
          the row is not among the parent's children).
        """
        enclosing = self.parent
        infers_mrow = (isinstance(enclosing, MathRowSchema)
                       and enclosing.nchildren == 1)
        if infers_mrow:
            # Fix the parent's capacity at its current child count, then
            # hand over the children of this row.
            enclosing.nchildren = len(enclosing.children)
            enclosing.children = self.children
            for item in self.children:
                item.parent = enclosing
            return enclosing.close()
        if len(self) == 1:
            only_child = self.children[0]
            try:
                position = enclosing.children.index(self)
                enclosing.children[position] = only_child
                only_child.parent = enclosing
            except (AttributeError, ValueError):
                # no parent, or this row is not listed in the parent
                return only_child
        return super(mrow, self).close()
# >>> mrow(displaystyle=False)
# mrow(displaystyle=False)
# The elements <msqrt>, <mstyle>, <merror>, <mpadded>, <mphantom>, <menclose>,
# <mtd>, <mscarry>, and <math> treat their contents as a single inferred mrow
# formed from all their children.
class MathRowSchema(math):
    """Base class for elements treating content as a single inferred mrow."""


class mtr(MathRowSchema):
    """MathML <mtr> element."""


class mtd(MathRowSchema):
    """MathML <mtd> element."""


class menclose(MathRowSchema):
    """MathML <menclose> element."""

    # \boxed expects one argument or a group
    nchildren = 1


class mphantom(MathRowSchema):
    """MathML <mphantom> element."""

    # \phantom expects one argument or a group
    nchildren = 1


class msqrt(MathRowSchema):
    """MathML <msqrt> element."""

    # \sqrt expects one argument or a group
    nchildren = 1


class mstyle(MathRowSchema):
    """MathML <mstyle> element."""

    # \mathrm, ... expect one argument or a group
    nchildren = 1
class MathToken(math):
    """Token element: carries textual data rather than child elements.

    Base class for mtext, mi, mo, and mn.
    """

    nchildren = 0  # token elements never take children

    def __init__(self, data, **attributes):
        """Store `data` as element content and set up `attributes`."""
        self.data = data
        super(MathToken, self).__init__(**attributes)

    def _xml_body(self, level=0):
        """Return the data, translated with `xml_entities`, in a list."""
        text = unicode(self.data)
        return [text.translate(self.xml_entities)]
class mtext(MathToken):
    """MathML <mtext> token element."""


class mi(MathToken):
    """MathML <mi> token element."""


class mo(MathToken):
    """MathML <mo> token element."""


class mn(MathToken):
    """MathML <mn> token element."""
# >>> mo(u'<')
# mo('<')
# >>> mo(u'<')._xml()
# ['<mo>', '&lt;', '</mo>']
class MathSchema(math):
    """Base class for schemata expecting 2 or more children.

    The special attribute `switch` indicates that the last two child
    elements are in reversed order and must be switched before XML-export.
    """

    nchildren = 2

    def __init__(self, *children, **kwargs):
        """Set up node; the keyword `switch` is not stored as attribute."""
        # `switch` must exist before the base class appends the children,
        # because append() below reads it.
        self.switch = kwargs.pop('switch', False)
        math.__init__(self, *children, **kwargs)

    def append(self, child):
        """Append `child`; restore the child order once the node is full."""
        current_node = super(MathSchema, self).append(child)
        if not (self.switch and self.full()):
            return current_node
        # normalize order: exchange the last two children (done only once)
        last = self.children[-1]
        before_last = self.children[-2]
        self.children[-2] = last
        self.children[-1] = before_last
        self.switch = False
        return current_node
class msub(MathSchema):
    """MathML <msub> element (two children)."""


class msup(MathSchema):
    """MathML <msup> element (two children)."""


class msubsup(MathSchema):
    """MathML <msubsup> element (three children)."""

    nchildren = 3
# >>> msub(mi('x'), mo('-'))
# msub(mi('x'), mo('-'))
# >>> msubsup(mi('base'), mi('sub'), mi('super'))
# msubsup(mi('base'), mi('sub'), mi('super'))
# >>> msubsup(mi('base'), mi('super'), mi('sub'), switch=True)
# msubsup(mi('base'), mi('sub'), mi('super'))
class munder(msub):
    """MathML <munder> element; same child handling as `msub`."""


class mover(msup):
    """MathML <mover> element; same child handling as `msup`."""
# >>> munder(mi('lim'), mo('-'), accent=False)
# munder(mi('lim'), mo('-'), accent=False)
# >>> mu = munder(mo('-'), accent=False, switch=True)
# >>> mu
# munder(mo('-'), switch=True, accent=False)
# >>> mu.append(mi('lim'))
# >>> mu
# munder(mi('lim'), mo('-'), accent=False)
# >>> mu.append(mi('lim'))
# Traceback (most recent call last):
# SyntaxError: Node munder(mi('lim'), mo('-'), accent=False) already full!
# >>> munder(mo('-'), mi('lim'), accent=False, switch=True).toprettyxml()
# '<munder accent="false">\n <mi>lim</mi>\n <mo>-</mo>\n</munder>'
class munderover(msubsup):
    """MathML <munderover> element; same child handling as `msubsup`."""


class mroot(MathSchema):
    """MathML <mroot> element (two children)."""

    nchildren = 2


class mfrac(math):
    """MathML <mfrac> element (two children)."""

    nchildren = 2


class mspace(math):
    """MathML <mspace> element (no children)."""

    nchildren = 0
# LaTeX to MathML translation
# ---------------------------
# auxiliary functions
# ~~~~~~~~~~~~~~~~~~~
def tex_cmdname(string):
    r"""Return leading TeX command name and remainder of `string`.

    The command name is either a run of ASCII letters (spaces following
    it are dropped) or, failing that, the first character of `string`.
    Line breaks in the remainder are kept.

    >>> tex_cmdname('mymacro2') # up to first non-letter
    ('mymacro', '2')
    >>> tex_cmdname('name 2') # strip trailing whitespace
    ('name', '2')
    >>> tex_cmdname('_2') # single non-letter character
    ('_', '2')
    >>> tex_cmdname('alpha\nbeta') # multi-line remainder is kept
    ('alpha', '\nbeta')
    """
    # re.DOTALL: without it, "." stops at a line break and everything
    # from the first newline on would be silently dropped.
    m = re.match(r'([a-zA-Z]+) *(.*)', string, re.DOTALL)
    if m is None:
        # no leading letter: this pattern matches any string (also '')
        m = re.match(r'(.?)(.*)', string, re.DOTALL)
    return m.group(1), m.group(2)
# Test:
#
# >>> tex_cmdname('name_2') # first non-letter terminates
# ('name', '_2')
# >>> tex_cmdname(' next') # leading whitespace is returned
# (' ', 'next')
# >>> tex_cmdname('1 2') # whitespace after non-letter is kept
# ('1', ' 2')
# >>> tex_cmdname('') # empty string
# ('', '')
def tex_number(string):
    r"""Return leading number literal and remainder of `string`.

    The literal consists of digits, "." and ","; it always ends with a
    digit.  If `string` does not start with a number, the literal is
    empty and `string` is returned unchanged as remainder.

    >>> tex_number('123.4')
    ('123.4', '')
    >>> tex_number('12\n+3') # multi-line remainder is kept
    ('12', '\n+3')
    """
    # re.DOTALL: keep the text after a line break in the remainder
    m = re.match(r'([0-9.,]*[0-9]+)(.*)', string, re.DOTALL)
    if m is None:
        return '', string
    return m.group(1), m.group(2)
# Test:
#
# >>> tex_number(' 23.4b') # leading whitespace -> no number
# ('', ' 23.4b')
# >>> tex_number('23,400/2') # comma separator included
# ('23,400', '/2')
# >>> tex_number('23. 4/2') # trailing separator not included
# ('23', '. 4/2')
# >>> tex_number('4, 2') # trailing separator not included
# ('4', ', 2')
# >>> tex_number('1 000.4')
# ('1', ' 000.4')
def tex_token(string):
    r"""Return first simple TeX token and remainder of `string`.

    A token is a TeX command (backslash + letters; following whitespace
    is skipped), a one-character command (backslash + any character),
    or a single character.  For an empty `string`, the token is empty.

    >>> tex_token('\\command{without argument}')
    ('\\command', '{without argument}')
    >>> tex_token('or first character')
    ('o', 'r first character')
    """
    # re.DOTALL lets "." match line breaks, so that multi-line input
    # matches and the complete remainder is returned.
    m = re.match(r"""((?P<cmd>\\[a-zA-Z]+)\s*  # TeX command, skip whitespace
                      |(?P<chcmd>\\.)           # one-character TeX command
                      |(?P<ch>.?))              # first character (or empty)
                     (?P<remainder>.*$)         # remaining part of string
                 """, string, re.VERBOSE | re.DOTALL)
    cmd, chcmd, ch, remainder = m.group('cmd', 'chcmd', 'ch', 'remainder')
    # groups of alternatives that did not take part in the match are None
    return cmd or chcmd or ch, remainder
# Test:
#
# >>> tex_token('{opening bracket of group}')
# ('{', 'opening bracket of group}')
# >>> tex_token('\\skip whitespace after macro name')
# ('\\skip', 'whitespace after macro name')
# >>> tex_token('. but not after single char')
# ('.', ' but not after single char')
# >>> tex_token('') # empty string.
# ('', '')
# >>> tex_token('\{escaped bracket')
# ('\\{', 'escaped bracket')
def tex_group(string):
    r"""Return first TeX group or token and remainder of `string`.

    If `string` starts with "{", the content up to the matching "}" is
    returned without the enclosing brackets.  Nested groups are kept
    verbatim; a character following a backslash never opens or closes a
    group.  Otherwise, the first character is returned.

    Raise SyntaxError if an opened group is not closed.

    >>> tex_group('{first group} returned without brackets')
    ('first group', ' returned without brackets')
    """
    if not string.startswith('{'):
        # special case: there is no group, return first token and remainder
        return string[:1], string[1:]
    depth = 0  # level of {{nested} groups}
    escaped = False  # the current character is escaped (\)
    for position, char in enumerate(string):
        if escaped:
            escaped = False
            continue
        if char == '\\':
            escaped = True
        elif char == '{':
            depth += 1
        elif char == '}':
            depth -= 1
            if depth == 0:
                # `position` is the index of the matching closing bracket
                return string[1:position], string[position+1:]
    raise SyntaxError('Group without closing bracket')
# >>> tex_group('{} empty group')
# ('', ' empty group')
# >>> tex_group('{group with {nested} group} ')
# ('group with {nested} group', ' ')
# >>> tex_group('{group with {nested group}} at the end')
# ('group with {nested group}', ' at the end')
# >>> tex_group('{{group} {with {{complex }nesting}} constructs}')
# ('{group} {with {{complex }nesting}} constructs', '')
# >>> tex_group('{group with \\{escaped\\} brackets}')
# ('group with \\{escaped\\} brackets', '')
# >>> tex_group('{group followed by closing bracket}} from outer group')
# ('group followed by closing bracket', '} from outer group')
# >>> tex_group('No group? Return first character.')
# ('N', 'o group? Return first character.')
# >>> tex_group(' {also whitespace}')
# (' ', '{also whitespace}')
def tex_token_or_group(string):
    r"""Return first TeX group or token and remainder of `string`.

    The first token is fetched with `tex_token()`; if it is an opening
    bracket, the complete group is extracted with `tex_group()` instead.

    >>> tex_token_or_group('\\command{without argument}')
    ('\\command', '{without argument}')
    >>> tex_token_or_group('first character')
    ('f', 'irst character')
    >>> tex_token_or_group(' also whitespace')
    (' ', 'also whitespace')
    >>> tex_token_or_group('{first group} keep rest')
    ('first group', ' keep rest')
    """
    token, rest = tex_token(string)
    if token != '{':
        return token, rest
    return tex_group(string.lstrip())
# >>> tex_token_or_group('\{no group but left bracket')
# ('\\{', 'no group but left bracket')
def tex_optarg(string):
    r"""Return optional argument and remainder.

    The optional argument is a group in square brackets (without nested
    groups), optionally preceded by whitespace.  It is returned without
    the brackets.  If `string` has no optional argument, the argument is
    empty and `string` is returned unchanged as remainder.

    Raise SyntaxError if `string` starts with "[" but no optional
    argument can be extracted.

    >>> tex_optarg('[optional argument] returned without brackets')
    ('optional argument', ' returned without brackets')
    >>> tex_optarg('{empty string, if there is no optional arg}')
    ('', '{empty string, if there is no optional arg}')
    """
    # re.DOTALL: keep a remainder that spans several lines
    m = re.match(r"""\s*                             # leading whitespace
                 \[(?P<optarg>(\\]|[^\[\]])*)\]  # [group] without nested groups
                 (?P<remainder>.*$)
                 """, string, re.VERBOSE | re.DOTALL)
    if m is not None:
        return m.group('optarg'), m.group('remainder')
    if not string.startswith('['):
        return '', string
    raise SyntaxError('Could not extract optional argument from %r' % string)
# Test:
# >>> tex_optarg(' [optional argument] after whitespace')
# ('optional argument', ' after whitespace')
# >>> tex_optarg('[missing right bracket')
# Traceback (most recent call last):
# SyntaxError: Could not extract optional argument from '[missing right bracket'
# >>> tex_optarg('[group with [nested group]]')
# Traceback (most recent call last):
# SyntaxError: Could not extract optional argument from '[group with [nested group]]'
def parse_latex_math(node, string):
    r"""Append MathML conversion of `string` to `node` and return it.

    `string` is consumed from the left, one character (or macro, or
    number) at a time.  The element that receives the next item changes
    while parsing; the value returned is always the `node` passed in.

    Raise SyntaxError for characters that are not supported.

    >>> parse_latex_math(math(), r'\alpha')
    math(mi('α'))
    >>> parse_latex_math(mrow(), r'x_{n}')
    mrow(msub(mi('x'), mi('n')))
    """
    tree = node
    # Normalize white-space:
    string = ' '.join(string.split())
    while string:
        # Take off first character:
        char, string = string[0], string[1:]
        if char == ' ':
            continue  # whitespace is ignored in LaTeX math mode
        if char == '\\':  # start of a LaTeX macro
            cmdname, string = tex_cmdname(string)
            node, string = handle_cmd(cmdname, node, string)
        elif char in "_^":
            node = handle_script_or_limit(node, char)
        elif char == '{':
            # open a group: following items go into a new <mrow>
            group = mrow()
            node.append(group)
            node = group
        elif char == '}':
            node = node.close()
        elif char == '&':
            # next table cell: close the current one, open a new <mtd>
            cell = mtd()
            node.close().append(cell)
            node = cell
        elif char.isalpha():
            node = node.append(mi(char))
        elif char.isdigit():
            # fetch the rest of the number literal
            number, string = tex_number(string)
            node = node.append(mn(char+number))
        elif char in anomalous_chars:
            # characters with a special meaning in LaTeX math mode
            options = {}
            if char == '-' and node.children:
                # fix spacing before "unary" minus.
                preceding = node.children[-1]
                after_opener = getattr(preceding, 'data', '-') in '([='
                if after_opener or preceding.get('class') == 'mathopen':
                    options['form'] = 'prefix'
            node = node.append(mo(anomalous_chars[char], **options))
        elif char in "/()[]|":
            node = node.append(mo(char, stretchy=False))
        elif char in "+*=<>,.!?`';@":
            node = node.append(mo(char))
        else:
            raise SyntaxError(u'Unsupported character: "%s"' % char)
    return tree
# Test:
# >>> print(parse_latex_math(math(), ''))
# math()
# >>> parse_latex_math(math(), ' \\sqrt{ \\alpha}')
# math(msqrt(mi('α')))
# >>> parse_latex_math(math(), '23.4x')
# math(mn('23.4'), mi('x'))
# >>> parse_latex_math(math(), '\\sqrt 2 \\ne 3')
# math(msqrt(mn('2')), mo('≠'), mn('3'))
# >>> parse_latex_math(math(), '\\sqrt{2 + 3} < 3')
# math(msqrt(mn('2'), mo('+'), mn('3')), mo('<'), mn('3'))
# >>> parse_latex_math(math(), '\\sqrt[3]{2 + 3}')
# math(mroot(mrow(mn('2'), mo('+'), mn('3')), mn('3')))
# >>> parse_latex_math(math(), '\max_x') # function takes limits
# math(munder(mo('max', movablelimits=True), mi('x')))
# >>> parse_latex_math(math(), 'x^j_i') # ensure correct order: base, sub, sup
# math(msubsup(mi('x'), mi('i'), mi('j')))
# >>> parse_latex_math(math(), '\int^j_i') # ensure correct order
# math(msubsup(mo('∫'), mi('i'), mi('j')))
# >>> parse_latex_math(math(), 'x_{\\alpha}')
# math(msub(mi('x'), mi('α')))
# >>> parse_latex_math(math(), 'x_\\text{in}')
# math(msub(mi('x'), mtext('in')))
def handle_cmd(name, node, string):
"""Process LaTeX command `name` followed by `string`.
Append result to `node`.
If needed, parse `string` for command argument.
Return new current node and remainder of `string`:
>>> handle_cmd('hbar', math(), r' \frac')
(math(mi('ℏ')), ' \\frac')
>>> handle_cmd('hspace', math(), r'{1ex} (x)')
(math(mspace(width='1ex')), ' (x)')
"""
# Token elements
# ==============
# identifier ->
if name in letters:
new_node = mi(letters[name])
if name in greek_capitals:
# upright in "TeX style" but MathML sets them italic ("ISO style").
# CSS styling does not change the font style in Firefox 78.
# Use 'mathvariant="normal"'?
new_node['class'] = 'capital-greek'
node = node.append(new_node)
return node, string
if name in functions:
# use followed by invisible function applicator character
# (see https://www.w3.org/TR/MathML3/chapter3.html#presm.mi)
if name == 'operatorname':
# custom function name, e.g. ``\operatorname{abs}(x)``
# TODO: \operatorname* -> with limits
arg, string = tex_token_or_group(string)
new_node = mi(arg, mathvariant='normal')
else:
new_node = mi(functions[name])
# embellished function names:
if name == 'varliminf': # \underline\lim
new_node = munder(new_node, mo(u'_'))
elif name == 'varlimsup': # \overline\lim
new_node = mover(new_node, mo(u'¯'), accent=False)
elif name == 'varprojlim': # \underleftarrow\lim
new_node = munder(new_node, mo(u'\u2190'))
elif name == 'varinjlim': # \underrightarrow\lim
new_node = munder(new_node, mo(u'\u2192'))
node = node.append(new_node)
# add ApplyFunction when appropriate (not \sin^2(x), say)
# cf. https://www.w3.org/TR/MathML3/chapter3.html#presm.mi
if string and string[0] not in ('^', '_'):
node = node.append(mo(u'\u2061')) # ⁡
return node, string
if name in math_alphabets:
if name == 'boldsymbol':
attributes = {'class': 'boldsymbol'}
else:
attributes = {'mathvariant': math_alphabets[name]}
if name == 'mathscr':
attributes['class'] = 'mathscr'
# Check for single symbol (letter, name, or ⅀)
arg, remainder = tex_token_or_group(string)
if arg.startswith('\\'):
# convert single letters (so the isalpha() test below works).
# TODO: convert all LICRs in a group (\matrm{\mu\Omega})
arg = letters.get(arg[1:], arg)
if name == 'mathbb':
# mathvariant="double-struck" is ignored for Greek letters
# (tested in Firefox 78). Use literal Unicode characters.
arg = mathbb.get(arg, arg)
if arg.isalpha() or arg == u'\u2140':
node = node.append(mi(arg, **attributes))
return node, remainder
# Wrap in