All downloads are free. Search and download functionality uses the official Maven repository.

com.formulasearchengine.mathmltools.utils.Utility Maven / Gradle / Ivy

package com.formulasearchengine.mathmltools.utils;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.unbescape.html.HtmlEscape;
import org.w3c.dom.Document;

import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.StringWriter;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Static utility class bundling a few string helpers: LaTeX pre-processing,
 * serialising a DOM {@link Document} to a string, and HTML-unescaping variants.
 *
 * <p>All regular expressions are compiled once at class-load time and reused
 * by every call.
 */
public final class Utility {
    public static final String NL = System.lineSeparator();

    // Possible line endings
    public static final char CR = (char) 0x0D;   // Mac (pre-OSX)
    public static final char LF = (char) 0x0A;   // Unix, Mac (OSX)
    public static final String CRLF = "" + CR + LF; // Windows

    // Own logger
    private static final Logger LOG = LogManager.getLogger(Utility.class.getName());

    // An underscore directly followed by a backslash command, e.g. "_\alpha".
    // NOTE(review): inside the character class "\n-+" is parsed as a RANGE from
    // line feed (0x0A) to '+' (0x2B), not as the three characters '\n', '-', '+'.
    // The expression is kept byte-identical because changing it would alter
    // which inputs get wrapped in braces - confirm the intent before touching it.
    private static final Pattern UNDERSCORE_COMMAND = Pattern.compile("_(\\\\[^\\s\\n-+]+)");

    // Trailing / leading whitespace and ",;.!" with an optional "\]" / "\[" in between.
    private static final Pattern ELIMINATE_ENDINGS = Pattern.compile("[\\s,;.!]*(\\\\])?[\\s,;.!]*$");
    private static final Pattern ELIMINATE_STARTS = Pattern.compile("^[\\s,;.!]*(\\\\\\[)?[\\s,;.!]*");
    // Trailing backslashes (plus optional whitespace/punctuation) and
    // leading backslashes followed by at least one whitespace/punctuation char.
    private static final Pattern ELIMINATE_SIMPLE_ENDS = Pattern.compile("\\\\+[\\s,.;!]*$");
    private static final Pattern ELIMINATE_SIMPLE_STARTS = Pattern.compile("^\\\\+[\\s,.;!]+");

    // Commented line breaks in Latex looks like: %\r
    private static final Pattern LATEX_COMMENTED_LINEBREAK =
            Pattern.compile("%\\s*(" + CR + "|" + LF + "|" + CRLF + ")");

    // Entities that safeUnescape must leave untouched. They have to be the
    // escaped forms: guarding the literal characters '<' and '>' would be a
    // no-op, because HTML-unescaping never changes those.
    private static final String ESCAPED_OPEN = "&lt;";
    private static final String ESCAPED_CLOSE = "&gt;";
    private static final String PLACEHOLDER_OPEN = "xxxOxxx";
    private static final String PLACEHOLDER_CLOSED = "xxxCxxx";

    // An ampersand surrounded by whitespace, and the marker that stands in for it
    // while the text is being HTML-unescaped.
    private static final Pattern SINGLE_AND = Pattern.compile("\\s+&\\s+");
    private static final String TMP_SINGLE_AND = "---AND---";

    private static final Pattern ALTTEXT_PATTERN = Pattern.compile("alttext=\"(.*?)\"");

    // static class usage only
    private Utility() {
    }

    /**
     * Pre-processes a mathematical LaTeX expression. The steps run in this order:
     * <ol>
     *   <li>every {@code subarray} is replaced by {@code array};</li>
     *   <li>a backslash command directly after an underscore is wrapped in
     *       braces ({@code _\x} becomes {@code _{\x}});</li>
     *   <li>HTML entities are unescaped, while ampersands surrounded by
     *       whitespace are restored as {@code " & "};</li>
     *   <li>a {@code %} followed by optional whitespace and a line break is removed;</li>
     *   <li>leading/trailing whitespace, {@code , ; . !}, {@code \[}, {@code \]}
     *       and stray backslashes are stripped.</li>
     * </ol>
     *
     * @param latex raw latex input, must not be {@code null}
     * @return pre processed latex string
     * @throws NullPointerException if {@code latex} is {@code null}
     */
    public static String latexPreProcessing(String latex) {
        LOG.debug(" Pre-Processing for:  {}", latex);

        if (latex.contains("subarray")) {
            // Plain literal substitution, no regex needed.
            latex = latex.replace("subarray", "array");
            LOG.trace(" Eval replacement of subarray: {}", latex);
        }

        latex = UNDERSCORE_COMMAND.matcher(latex).replaceAll("_{$1}");
        LOG.trace("Surround underscore:  {}", latex);

        latex = unescapeHtmlKeepingSingleAnd(latex);
        LOG.trace("HTML Unescaped:       {}", latex);

        latex = LATEX_COMMENTED_LINEBREAK.matcher(latex).replaceAll("");
        LOG.trace("Commented linebreaks: {}", latex);

        // Order matters: the bracket-aware patterns run before the simple
        // backslash patterns, exactly as before.
        latex = ELIMINATE_ENDINGS.matcher(latex).replaceAll("");
        latex = ELIMINATE_STARTS.matcher(latex).replaceAll("");
        latex = ELIMINATE_SIMPLE_STARTS.matcher(latex).replaceAll("");
        latex = ELIMINATE_SIMPLE_ENDS.matcher(latex).replaceAll("");
        LOG.trace("Replace bad end/start:{}", latex);
        LOG.debug("Finalize Pre-Processing for POM-Tagger: {}", latex);

        return latex;
    }

    /**
     * Serialises a DOM document to its XML string representation.
     *
     * @param doc    the document to serialise
     * @param indent if {@code true}, the transformer is asked to indent its output
     * @return the serialised XML, or an empty string if any exception occurred
     *         (the exception is logged at error level and not rethrown)
     */
    public static String documentToString(Document doc, boolean indent) {
        try {
            StringWriter sw = new StringWriter();
            TransformerFactory tf = TransformerFactory.newInstance();
            Transformer transer = tf.newTransformer();
            transer.setOutputProperty(OutputKeys.METHOD, "xml");
            if (indent) {
                transer.setOutputProperty(OutputKeys.INDENT, "yes");
            }
            transer.transform(new DOMSource(doc), new StreamResult(sw));
            return sw.toString();
        } catch (Exception e) {
            // Deliberately best-effort: callers get an empty string instead of an exception.
            LOG.error("Cannot convert document into string.", e);
            return "";
        }
    }

    /**
     * HTML-unescapes the given text but keeps the entities {@code &lt;} and
     * {@code &gt;} escaped. Ampersands surrounded by whitespace are restored
     * as {@code " & "}.
     *
     * <p>The entities are swapped for placeholder tokens before unescaping and
     * swapped back afterwards. Only the exact spellings {@code &lt;} and
     * {@code &gt;} are protected; other encodings of the same characters
     * (for example numeric references) are not guarded here. Input that already
     * contains the placeholder tokens {@code xxxOxxx} / {@code xxxCxxx} will have
     * them turned into the entities as well.
     *
     * @param escaped HTML-escaped text, must not be {@code null}
     * @return the unescaped text with {@code &lt;} and {@code &gt;} preserved
     * @throws NullPointerException if {@code escaped} is {@code null}
     */
    public static String safeUnescape(String escaped) {
        String guarded = escaped
                .replace(ESCAPED_OPEN, PLACEHOLDER_OPEN)
                .replace(ESCAPED_CLOSE, PLACEHOLDER_CLOSED);

        String unescaped = unescapeHtmlKeepingSingleAnd(guarded);

        return unescaped
                .replace(PLACEHOLDER_OPEN, ESCAPED_OPEN)
                .replace(PLACEHOLDER_CLOSED, ESCAPED_CLOSE);
    }

    /**
     * HTML-unescapes the value of the first {@code alttext="..."} attribute
     * found in the given string and leaves everything else as it is.
     *
     * <p>Only the first match is inspected; every literal occurrence of that
     * exact attribute text is then replaced. The attribute value is matched
     * lazily up to the next double quote and cannot span line terminators.
     *
     * @param mml the markup to process, must not be {@code null}
     * @return the markup with the matched alttext value unescaped, or the
     *         unchanged input if no alttext attribute was found
     * @throws NullPointerException if {@code mml} is {@code null}
     */
    public static String unescapeJustAlttext(String mml) {
        Matcher matcher = ALTTEXT_PATTERN.matcher(mml);
        if (!matcher.find()) {
            return mml;
        }
        String attribute = matcher.group(0);
        String unescapedValue = HtmlEscape.unescapeHtml(matcher.group(1));
        return mml.replace(attribute, "alttext=\"" + unescapedValue + "\"");
    }

    /**
     * HTML-unescapes {@code text}; ampersands surrounded by whitespace are
     * replaced by a marker beforehand and come back as {@code " & "}.
     */
    private static String unescapeHtmlKeepingSingleAnd(String text) {
        String guarded = SINGLE_AND.matcher(text).replaceAll(TMP_SINGLE_AND);
        String unescaped = HtmlEscape.unescapeHtml(guarded);
        return unescaped.replace(TMP_SINGLE_AND, " & ");
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy