#!/usr/bin/env python
# -*- coding: utf-8 -*-
# (docutils.utils.math.latex2mathml.py)

# :Id: $Id: latex2mathml.py 8878 2021-11-05 11:10:44Z milde $
# :Copyright: © 2005 Jens Jørgen Mortensen [1]_
#             © 2010, 2021 Günter Milde.
#
# :License: Released under the terms of the `2-Clause BSD license`_, in short:
#
#    Copying and distribution of this file, with or without modification,
#    are permitted in any medium without royalty provided the copyright
#    notice and this notice are preserved.
#    This file is offered as-is, without any warranty.
#
# .. _2-Clause BSD license: https://opensource.org/licenses/BSD-2-Clause
#
# .. [1] the original `rst2mathml.py` in `sandbox/jensj/latex_math`

"""Convert LaTex maths code into presentational MathML.

This module is provisional:
the API is not settled and may change with any minor Docutils version.
"""

# Usage:
#
# >>> from latex2mathml import *

import collections
import copy
import re
import sys
import unicodedata
if sys.version_info >= (3, 0):
    unicode = str  # noqa

from docutils.utils.math import tex2unichar, toplevel_code


# Character data
# --------------

# LaTeX math macro to Unicode mappings.
# Character categories.

# identifiers -> <mi>

# NOTE: `letters` is bound to the very same mapping object as
# `tex2unichar.mathalpha` (plain assignment, no copy), so the entry
# stored below is written into that shared mapping.
letters = tex2unichar.mathalpha
letters.update({'hbar': u'\u210F'}) # compatibility mapping to ℏ (\hslash).
# (ħ LATIN SMALL LETTER H WITH STROKE is upright)

# special case: Capital Greek letters: (upright in TeX style)
greek_capitals = {
    'Phi':     u'\u03a6',  # Φ
    'Xi':      u'\u039e',  # Ξ
    'Sigma':   u'\u03a3',  # Σ
    'Psi':     u'\u03a8',  # Ψ
    'Delta':   u'\u0394',  # Δ
    'Theta':   u'\u0398',  # Θ
    'Upsilon': u'\u03d2',  # ϒ
    'Pi':      u'\u03a0',  # Π
    'Omega':   u'\u03a9',  # Ω
    'Gamma':   u'\u0393',  # Γ
    'Lambda':  u'\u039b',  # Λ
}

# functions -> <mi>
functions = {
    # functions with a space in the name
    'liminf':  u'lim\u202finf',
    'limsup':  u'lim\u202fsup',
    'injlim':  u'inj\u202flim',
    'projlim': u'proj\u202flim',
    # embellished function names (see handle_cmd() below)
    'varlimsup':  'lim',
    'varliminf':  'lim',
    'varprojlim': 'lim',
    'varinjlim':  'lim',
    # custom function name
    'operatorname': None,
}
# The remaining function names map to themselves.
functions.update({fname: fname for fname in (
    'arccos arcsin arctan arg cos '
    'cosh cot coth csc deg '
    'det dim exp gcd hom '
    'ker lg ln log Pr '
    'sec sin sinh tan tanh').split()})
# Function with limits: 'lim', 'sup', 'inf', 'max', 'min':
# use <mo> to allow "movablelimits" attribute (see below).


# math font selection -> <mstyle> or <mi>
math_alphabets = dict(
    # cmdname='mathvariant value'            # package
    boldsymbol='bold',
    mathbf='bold',
    mathit='italic',
    mathtt='monospace',
    mathrm='normal',
    mathsf='sans-serif',
    mathcal='script',
    mathbfit='bold-italic',                  # isomath
    mathbb='double-struck',                  # amssymb
    mathfrak='fraktur',                      # amssymb
    mathsfit='sans-serif-italic',            # isomath
    mathsfbfit='sans-serif-bold-italic',     # isomath
    mathscr='script',                        # mathrsfs
    # unsupported: bold-fraktur
    #              bold-script
    #              bold-sans-serif
)

# operator, fence, or separator -> <mo>


# extensible delimiters allowed in left/right cmds
stretchables = dict(
    backslash='\\',
    uparrow=u'\u2191',      # ↑ UPWARDS ARROW
    downarrow=u'\u2193',    # ↓ DOWNWARDS ARROW
    updownarrow=u'\u2195',  # ↕ UP DOWN ARROW
    Uparrow=u'\u21d1',      # ⇑ UPWARDS DOUBLE ARROW
    Downarrow=u'\u21d3',    # ⇓ DOWNWARDS DOUBLE ARROW
    Updownarrow=u'\u21d5',  # ⇕ UP DOWN DOUBLE ARROW
    lmoustache=u'\u23b0',   # ⎰ UPPER LEFT OR LOWER RIGHT CURLY BRACKET SECTION
    rmoustache=u'\u23b1',   # ⎱ UPPER RIGHT OR LOWER LEFT CURLY BRACKET SECTION
    arrowvert=u'\u23d0',    # ⏐ VERTICAL LINE EXTENSION
    bracevert=u'\u23aa',    # ⎪ CURLY BRACKET EXTENSION
    lvert=u'|',             # left  |
    lVert=u'\u2016',        # left  ‖
    rvert=u'|',             # right |
    rVert=u'\u2016',        # right ‖
    Arrowvert=u'\u2016',    # ‖
)
# Merge in the imported tables; a later update overrides an earlier
# entry with the same key.
stretchables.update(tex2unichar.mathfence)
stretchables.update(tex2unichar.mathopen)  # Braces
stretchables.update(tex2unichar.mathclose) # Braces

# >>> print(' '.join(sorted(set(stretchables.values()))))
# [ \ ] { | } ‖ ↑ ↓ ↕ ⇑ ⇓ ⇕ ⌈ ⌉ ⌊ ⌋ ⌜ ⌝ ⌞ ⌟ ⎪ ⎰ ⎱ ⏐ ⟅ ⟆ ⟦ ⟧ ⟨ ⟩ ⟮ ⟯ ⦇ ⦈

operators = dict(
    # negated symbols without pre-composed Unicode character
    nleqq=u'\u2266\u0338',       # ≦̸
    ngeqq=u'\u2267\u0338',       # ≧̸
    nleqslant=u'\u2a7d\u0338',   # ⩽̸
    ngeqslant=u'\u2a7e\u0338',   # ⩾̸
    ngtrless=u'\u2277\u0338',    # txfonts
    nlessgtr=u'\u2276\u0338',    # txfonts
    nsubseteqq=u'\u2AC5\u0338',  # ⫅̸
    nsupseteqq=u'\u2AC6\u0338',  # ⫆̸
    # compatibility definitions:
    centerdot=u'\u2B1D',     # BLACK VERY SMALL SQUARE | mathbin
    varnothing=u'\u2300',    # ⌀ DIAMETER SIGN | empty set
    varpropto=u'\u221d',     # ∝ PROPORTIONAL TO | sans serif
    triangle=u'\u25B3',      # WHITE UP-POINTING TRIANGLE | mathord
    triangledown=u'\u25BD',  # WHITE DOWN-POINTING TRIANGLE | mathord
    # alias commands:
    dotsb=u'\u22ef',  # ⋯ with binary operators/relations
    dotsc=u'\u2026',  # … with commas
    dotsi=u'\u22ef',  # ⋯ with integrals
    dotsm=u'\u22ef',  # ⋯ multiplication dots
    dotso=u'\u2026',  # … other dots
    # functions with movable limits (requires <mo>)
    lim='lim',
    sup='sup',
    inf='inf',
    max='max',
    min='min',
)
# Merge in the imported symbol tables, then the stretchable delimiters;
# a later update overrides an earlier entry with the same key.
operators.update(tex2unichar.mathbin)   # Binary symbols
operators.update(tex2unichar.mathrel)   # Relation symbols, arrow symbols
operators.update(tex2unichar.mathord)   # Miscellaneous symbols
operators.update(tex2unichar.mathpunct) # Punctuation
operators.update(tex2unichar.mathop)    # Variable-sized symbols
operators.update(stretchables)


# special cases

# rendered with style='font-weight: bold;'
thick_operators = dict(
    thicksim=u'\u223C',     # ∼
    thickapprox=u'\u2248',  # ≈
)

# rendered with mathsize='75%'
small_operators = dict(
    shortmid=u'\u2223',        # ∣
    shortparallel=u'\u2225',   # ∥
    nshortmid=u'\u2224',       # ∤
    nshortparallel=u'\u2226',  # ∦
    smallfrown=u'\u2322',      # ⌢ FROWN
    smallsmile=u'\u2323',      # ⌣ SMILE
    smallint=u'\u222b',        # ∫ INTEGRAL
)

# Operators and functions with limits above/below in display formulas
# and in index position inline (movablelimits=True)
movablelimits = tuple((
    'bigcap bigcup bigodot bigoplus bigotimes '
    'bigsqcup biguplus bigvee bigwedge '
    'coprod intop ointop prod sum '
    'lim max min sup inf').split())
# Depending on settings, integrals may also be in this category.
# (e.g. if "amsmath" is loaded with option "intlimits", see
#  http://mirror.ctan.org/macros/latex/required/amsmath/amsldoc.pdf)
# NOTE: `movablelimits` is a tuple and has no `extend()` method; the
# snippet below would have to use `movablelimits += (...)` if enabled.
# movablelimits.extend(('fint', 'iiiint', 'iiint', 'iint', 'int', 'oiint',
#                       'oint', 'ointctrclockwise', 'sqint',
#                       'varointclockwise',))

# horizontal space -> <mspace>

# (command name or symbol, width) pairs
spaces = dict([
    ('qquad',         '2em'),        # two \quad
    ('quad',          '1em'),        # 18 mu
    ('thickspace',    '0.2778em'),   # 5mu = 5/18em
    ('medspace',      '0.2222em'),   # 4mu = 2/9em
    ('thinspace',     '0.1667em'),   # 3mu = 1/6em
    ('negthinspace',  '-0.1667em'),  # -3mu = -1/6em
    ('negmedspace',   '-0.2222em'),  # -4mu = -2/9em
    ('negthickspace', '-0.2778em'),  # -5mu = -5/18em
    (' ',             '0.25em'),     # inter word space
    (';',             '0.2778em'),   # 5mu thickspace
    (':',             '0.2222em'),   # 4mu medspace
    (',',             '0.1667em'),   # 3mu thinspace
    ('!',             '-0.1667em'),  # negthinspace
])

# accents -> <mover>
accents = dict(
    # TeX=(spacing, combining)
    acute=(u'´', u'\u0301'),
    bar=(u'ˉ', u'\u0304'),
    breve=(u'˘', u'\u0306'),
    check=(u'ˇ', u'\u030C'),
    dot=(u'˙', u'\u0307'),
    ddot=(u'¨', u'\u0308'),
    dddot=(u'⋯', u'\u20DB'),
    grave=(u'`', u'\u0300'),
    hat=(u'ˆ', u'\u0302'),
    mathring=(u'˚', u'\u030A'),
    tilde=(u'˜', u'\u0303'), # tilde ~ or small tilde ˜?
    vec=(u'→', u'\u20d7'), # → too heavy, accents="false"
    # TODO: ddddot
)

# limits etc. -> <mover> or <munder>
over = dict(
    # TeX=(char, offset-correction/em)
    overbrace=(u'\u23DE', -0.2), # DejaVu Math -0.6
    overleftarrow=(u'\u2190', -0.2),
    overleftrightarrow=(u'\u2194', -0.2),
    overline=(u'_', -0.2),   # \u2012' FIGURE DASH does not stretch
    overrightarrow=(u'\u2192', -0.2),
    widehat=(u'^', -0.5),
    widetilde=(u'~', -0.3),
)
under = dict(
    # TeX=(char, offset-correction/em)
    underbrace=(u'\u23DF', 0.1), # DejaVu Math -0.7
    underleftarrow=(u'\u2190', -0.2),
    underleftrightarrow=(u'\u2194', -0.2),
    underline=(u'_', -0.8),
    underrightarrow=(u'\u2192', -0.2),
)

# Character translations
# ----------------------
# characters with preferred alternative in mathematical use
# cf. https://www.w3.org/TR/MathML3/chapter7.html#chars.anomalous
anomalous_chars = dict([
    ('-', u'\u2212'), # HYPHEN-MINUS -> MINUS SIGN
    (':', u'\u2236'), # COLON -> RATIO
    ('~', u'\u00a0'), # NO-BREAK SPACE
])

# blackboard bold (Greek characters not working with "mathvariant"
# in Firefox 78)
mathbb = dict([
    (u'Γ', u'\u213E'),    # ℾ
    (u'Π', u'\u213F'),    # ℿ
    (u'Σ', u'\u2140'),    # ⅀
    (u'γ', u'\u213D'),    # ℽ
    (u'π', u'\u213C'),    # ℼ
])

# Matrix environments
matrices = dict(
    # name=(left fence, right fence)
    matrix=('', ''),
    smallmatrix=('', ''), # smaller, see begin_environment()!
    pmatrix=('(', ')'),
    bmatrix=('[', ']'),
    Bmatrix=('{', '}'),
    vmatrix=('|', '|'),
    Vmatrix=(u'\u2016', u'\u2016'), # ‖
    cases=('{', ''),
)

layout_styles = dict(
    displaystyle=dict(displaystyle=True, scriptlevel=0),
    textstyle=dict(displaystyle=False, scriptlevel=0),
    scriptstyle=dict(displaystyle=False, scriptlevel=1),
    scriptscriptstyle=dict(displaystyle=False, scriptlevel=2),
)
# See also https://www.w3.org/TR/MathML3/chapter3.html#presm.scriptlevel

# name=(style_attrs, frac_attrs)
# The d*/t* variants reference the `layout_styles` entries directly
# (the same dict objects, not copies).
fractions = dict(
    frac=({}, {}),
    cfrac=({'displaystyle': True, 'scriptlevel': 0, 'CLASS': 'cfrac'},
           {}), # in LaTeX with padding
    dfrac=(layout_styles['displaystyle'], {}),
    tfrac=(layout_styles['textstyle'], {}),
    binom=({}, dict(linethickness=0)),
    dbinom=(layout_styles['displaystyle'], dict(linethickness=0)),
    tbinom=(layout_styles['textstyle'], dict(linethickness=0)),
)

# Delimiter sizes; `bigdelimiters` maps each sizing command to an
# index into this list.
delimiter_sizes = [
    '',         # 0
    '1.2em',    # 1
    '1.623em',  # 2
    '2.047em',  # 3
    '2.470em',  # 4
]
bigdelimiters = dict(
    left=0, right=0,
    bigl=1, bigr=1,
    Bigl=2, Bigr=2,
    biggl=3, biggr=3,
    Biggl=4, Biggr=4,
)


# MathML element classes
# ----------------------

class math(object):
    """Base class for MathML elements and root of MathML trees.

    A node holds a list of child nodes (`children`), an ordered mapping
    of XML attributes (`attributes`) and a reference to its `parent`.
    The XML tag name is the name of the node's class.
    """

    nchildren = None
    """Expected number of children or None"""
    # cf. https://www.w3.org/TR/MathML3/chapter3.html#id.3.1.3.2
    parent = None
    """Parent node in MathML DOM tree."""
    _level = 0 # indentation level (static class variable)
    # Keys are code points, so the mapping can be used as a translation
    # table; values are the XML entity references to write instead.
    xml_entities = { # for invalid and invisible characters
                    ord('<'): u'&lt;',
                    ord('>'): u'&gt;',
                    ord('&'): u'&amp;',
                    0x2061:   u'&ApplyFunction;',
                   }
    _boolstrings = {True: 'true', False: 'false'}
    """String representation of boolean MathML attribute values."""

    html_tagname = 'span'
    """Tag name for HTML representation."""

    def __init__(self, *children, **attributes):
        """Set up node with `children` and `attributes`.

        Attributes are downcased: Use CLASS to set "class" value.
        >>> math(mn(3), CLASS='test')
        math(mn(3), class='test')
        >>> math(CLASS='test').toprettyxml()
        '<math class="test">\n</math>'

        """
        self.children = []
        self.extend(children)

        self.attributes = collections.OrderedDict()
        # sort attributes for predictable functional tests
        # as self.attributes.update(attributes) does not keep order in Python < 3.6
        for key in sorted(attributes.keys()):
            # Use .lower() to allow argument `CLASS` for attribute `class`
            # (Python keyword). MathML uses only lowercase attributes.
            self.attributes[key.lower()] = attributes[key]

    def __repr__(self):
        """Return a constructor-like representation of the node."""
        content = [repr(item) for item in getattr(self, 'children', [])]
        if hasattr(self, 'data'):
            content.append(repr(self.data))
        if isinstance(self, MathSchema) and self.switch:
            content.append('switch=True')
        content += ["%s=%r"%(k, v) for k, v in self.attributes.items()
                    if v is not None]

        return self.__class__.__name__ + '(%s)' % ', '.join(content)

    def __len__(self):
        """Return the number of children."""
        return len(self.children)

    # emulate dictionary-like access to attributes
    # see `docutils.nodes.Element` for dict/list interface
    def __getitem__(self, key):
        return self.attributes[key]
    def __setitem__(self, key, item):
        self.attributes[key] = item
    def get(self, *args, **kwargs):
        return self.attributes.get(*args, **kwargs)

    def full(self):
        """Return boolean indicating whether children may be appended."""
        return (self.nchildren is not None
                and len(self) >= self.nchildren)

    def append(self, child):
        """Append child and return self or first non-full parent.

        If self becomes full by appending `child`, go up the tree and
        return the first non-full node or `None`.

        Raise `SyntaxError` if self is already full before appending.
        """
        if self.full():
            raise SyntaxError('Node %s already full!' % self)
        self.children.append(child)
        child.parent = self
        if self.full():
            return self.close()
        return self

    def extend(self, children):
        """Append every item of `children` to self and return self."""
        for child in children:
            self.append(child)
        return self

    def close(self):
        """Close element and return first non-full parent or None."""
        parent = self.parent
        while parent is not None and parent.full():
            parent = parent.parent
        return parent

    def toprettyxml(self):
        """Return XML representation of self as string."""
        return ''.join(self._xml())

    def _xml(self, level=0):
        """Return list of XML fragments for self, indented by `level`."""
        return ([self.xml_starttag()]
                + self._xml_body(level)
                + ['</%s>' % self.__class__.__name__])

    def _attribute_string(self, value):
        """Return the string written for attribute value `value`.

        Only real booleans are converted to 'true'/'false'; any other
        value is passed through `str()` unchanged, so a string that
        merely contains "True" or "False" is not modified.
        """
        # `isinstance` check required: 0 and 1 hash like False and True
        # and would otherwise be looked up successfully in _boolstrings.
        if isinstance(value, bool):
            return self._boolstrings[value]
        return str(value)

    def xml_starttag(self):
        """Return the XML start tag with all attributes that are not None."""
        attrs = ['%s="%s"' % (k, self._attribute_string(v))
                 for k, v in self.attributes.items()
                 if v is not None]
        return '<%s>' % ' '.join([self.__class__.__name__] + attrs)

    def _xml_body(self, level=0):
        """Return list of XML fragments for the children of self.

        Every child starts on a new line, indented one step (two spaces)
        deeper than `level`; the list ends with the line break and
        indentation that precede the end tag of self.
        """
        xml = []
        for child in self.children:
            xml.extend(['\n', '  ' * (level+1)])
            xml.extend(child._xml(level+1))
        xml.extend(['\n', '  ' * level])
        return xml

# >>> n2 = math(mn(2))
# >>> n2
# math(mn(2))
# >>> n2.toprettyxml()
# '<math>\n  <mn>2</mn>\n</math>'
# >>> len(n2)
# 1
# >>> eq3 = math(id='eq3', display='block')
# >>> eq3
# math(display='block', id='eq3')
# >>> eq3.toprettyxml()
# '<math display="block" id="eq3">\n</math>'
# >>> len(eq3)
# 0
# >>> math(CLASS='bold').xml_starttag()
# '<math class="bold">'

class mtable(math):
    """Element with tag name "mtable"; inherits everything from `math`."""

# >>> mt = mtable(displaystyle=True)
# >>> mt
# mtable(displaystyle=True)
# >>> math(mt).toprettyxml()
# '<math>\n  <mtable displaystyle="true">\n  </mtable>\n</math>'

class mrow(math):
    """Group sub-expressions as a horizontal row."""

    def close(self):
        """Close element and return first non-full parent or None.

        Remove <mrow>, if it is single child and the parent infers an mrow
        or if it has only one child element.

        In the first case the children of self are handed over to the
        parent and the result of closing the parent is returned.
        In the second case the only child takes the place of self in the
        parent; if self has no parent (or is not among the parent's
        children), the only child itself is returned.
        """
        container = self.parent
        infers_row = (isinstance(container, MathRowSchema)
                      and container.nchildren == 1)
        if infers_row:
            # The container adopts the children of this row directly.
            container.nchildren = len(container.children)
            container.children = self.children
            for node in self.children:
                node.parent = container
            return container.close()
        if len(self) != 1:
            return super(mrow, self).close()
        only_child = self.children[0]
        try:
            # AttributeError: no parent; ValueError: self not in parent.
            position = container.children.index(self)
            container.children[position] = only_child
            only_child.parent = container
        except (AttributeError, ValueError):
            return only_child
        return super(mrow, self).close()

# >>> mrow(displaystyle=False)
# mrow(displaystyle=False)

# The elements <msqrt>, <mstyle>, <merror>, <mpadded>, <mphantom>, <menclose>,