![JAR search and dependency download from the Maven repository](/logo.png)
cz.muni.fi.mir.mathmlcanonicalization.modules.OperatorNormalizer Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of mathml-canonicalizer Show documentation
Show all versions of mathml-canonicalizer Show documentation
MathMLCanonicalizer canonicalizes MathML input. Its modular
architecture allows canonicalization features to be configured according to
the user's needs.
/**
* Copyright 2013 MIR@MU Project
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package cz.muni.fi.mir.mathmlcanonicalization.modules;
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.jdom2.Content;
import org.jdom2.Document;
import org.jdom2.Element;
import org.jdom2.Text;
import org.jdom2.filter.ContentFilter;
import org.jdom2.filter.ElementFilter;
/**
 * Normalizes Unicode text and operator elements in MathML.
 *
 * Input: well-formed MathML, not processed by MrowMinimizer yet.
 *
 * Output: the original code with:
 *
 * - normalized Unicode symbols
 * - unified operators
 * - no redundant operators
 *
 * All behavior is driven by the module properties loaded from
 * {@code OperatorNormalizer.properties}.
 *
 * @author David Formanek
 */
public class OperatorNormalizer extends AbstractModule implements DOMModule {

    /**
     * Path to the property file with module settings.
     */
    private static final String PROPERTIES_FILENAME = "OperatorNormalizer.properties";
    private static final Logger LOGGER = Logger.getLogger(OperatorNormalizer.class.getName());

    // properties key names
    private static final String REMOVE_EMPTY_OPERATORS = "removeempty";
    private static final String OPERATORS_TO_REMOVE = "removeoperators";
    private static final String OPERATOR_REPLACEMENTS = "replaceoperators";
    private static final String COLON_REPLACEMENT = "colonreplacement";
    private static final String NORMALIZATION_FORM = "normalizationform";
    private static final String OPERATORS = "operators";

    /**
     * Creates the module and loads its settings from the property file.
     */
    public OperatorNormalizer() {
        loadProperties(PROPERTIES_FILENAME);
    }

    /**
     * Runs Unicode normalization (if a normalization form is configured) and
     * then operator unification on the whole document.
     *
     * @param doc the document to process in place
     * @throws NullPointerException if {@code doc} is null
     * @throws IllegalArgumentException if the configured normalization form is
     *         not the name of a {@link Normalizer.Form} constant, or if the
     *         operator replacement property is malformed
     */
    @Override
    public void execute(final Document doc) {
        if (doc == null) {
            throw new NullPointerException("doc");
        }
        final Element root = doc.getRootElement();
        // TODO: convert Unicode superscripts (supX entities) to msup etc.
        final String normalizerFormStr = getProperty(NORMALIZATION_FORM);
        if (normalizerFormStr.isEmpty()) {
            LOGGER.fine("Unicode text normalization is switched off");
        } else {
            // Only the parsing of the configured value is guarded, so that an
            // unrelated IllegalArgumentException raised while normalizing is
            // not misreported as a configuration error.
            final Normalizer.Form normalizerForm;
            try {
                normalizerForm = Normalizer.Form.valueOf(normalizerFormStr);
            } catch (IllegalArgumentException ex) {
                throw new IllegalArgumentException("Invalid configuration value: "
                        + NORMALIZATION_FORM, ex);
            }
            normalizeUnicode(root, normalizerForm);
        }
        unifyOperators(root);
    }

    /**
     * Converts bad identifiers to operators, removes redundant operators and
     * replaces operators according to the configured mapping.
     *
     * @param ancestor root of the subtree to process
     */
    private void unifyOperators(final Element ancestor) {
        assert ancestor != null;
        final Set<String> toRemove = getPropertySet(OPERATORS_TO_REMOVE);
        final Map<String, String> replaceMap = getPropertyMap(OPERATOR_REPLACEMENTS);
        // The colon is the key/value separator inside the replacement
        // property, so its own replacement is configured separately.
        if (!getProperty(COLON_REPLACEMENT).isEmpty()) {
            replaceMap.put(":", getProperty(COLON_REPLACEMENT));
        }
        // Every symbol mentioned anywhere in the configuration is treated as
        // an operator when it appears inside an identifier element.
        // NOTE(review): assumes getPropertySet returns a fresh, mutable set - confirm.
        final Set<String> operators = getPropertySet(OPERATORS);
        operators.addAll(toRemove);
        operators.addAll(replaceMap.keySet());
        operators.addAll(replaceMap.values());
        replaceIdentifiers(ancestor, operators);
        if (isEnabled(REMOVE_EMPTY_OPERATORS) || !toRemove.isEmpty()) {
            removeSpareOperators(ancestor, toRemove);
        } else {
            LOGGER.fine("No operators set for removal");
        }
        if (replaceMap.isEmpty()) {
            LOGGER.fine("No operators set to replace");
        } else {
            replaceOperators(ancestor, replaceMap);
        }
    }

    /**
     * Normalizes every text node directly under {@code ancestor} or under any
     * of its descendant elements to the given Unicode normalization form.
     *
     * @param ancestor root of the subtree to process
     * @param form the target normalization form
     */
    private void normalizeUnicode(final Element ancestor, final Normalizer.Form form) {
        assert ancestor != null && form != null;
        // Collect first, modify afterwards, so the tree is not changed while
        // the descendant iterator is still in use.
        final List<Text> texts = new ArrayList<Text>();
        final ContentFilter textFilter = new ContentFilter(ContentFilter.TEXT);
        for (Content text : ancestor.getContent(textFilter)) {
            texts.add((Text) text);
        }
        for (Element element : ancestor.getDescendants(new ElementFilter())) {
            for (Content text : element.getContent(textFilter)) {
                texts.add((Text) text);
            }
        }
        for (Text text : texts) {
            if (Normalizer.isNormalized(text.getText(), form)) {
                continue;
            }
            final String normalizedString = Normalizer.normalize(text.getText(), form);
            LOGGER.log(Level.FINE, "Text ''{0}'' normalized to ''{1}''",
                    new Object[]{text.getText(), normalizedString});
            text.setText(normalizedString);
            assert Normalizer.isNormalized(text.getText(), form);
        }
    }

    /**
     * Recursively detaches spare operator elements from the subtree.
     * Operator elements themselves are not descended into.
     *
     * @param element root of the subtree to process
     * @param spareOperators operator texts to remove; may be empty when only
     *        the removal of empty operators is enabled
     */
    private void removeSpareOperators(final Element element,
            final Collection<String> spareOperators) {
        // spareOperators may legitimately be empty here: the caller also
        // invokes this method when only REMOVE_EMPTY_OPERATORS is enabled.
        assert element != null && spareOperators != null;
        final List<Element> children = element.getChildren();
        int i = 0;
        while (i < children.size()) {
            final Element actual = children.get(i); // actual element
            if (isOperator(actual)) {
                if (isSpareOperator(actual, spareOperators)) {
                    actual.detach();
                    LOGGER.log(Level.FINE, "Operator {0} removed", actual);
                    // the child list is live: after detaching, index i
                    // already refers to the next sibling
                    continue;
                }
            } else {
                removeSpareOperators(actual, spareOperators);
            }
            i++;
        }
    }

    /**
     * Decides whether an operator element should be removed.
     *
     * @param operator the operator element to test
     * @param spareOperators operator texts configured for removal
     * @return true if removal of empty operators is enabled and the operator
     *         text is empty, or if the trimmed operator text is contained in
     *         {@code spareOperators}
     */
    private boolean isSpareOperator(final Element operator,
            final Collection<String> spareOperators) {
        assert operator != null && spareOperators != null && isOperator(operator);
        return (isEnabled(REMOVE_EMPTY_OPERATORS) && operator.getText().isEmpty())
                || (spareOperators.contains(operator.getTextTrim()));
    }

    /**
     * Replaces the text of every descendant operator element whose trimmed
     * text is a key of {@code replacements} by the mapped value.
     *
     * @param element root of the subtree to process
     * @param replacements mapping from old operator text to new operator text
     */
    private void replaceOperators(final Element element,
            final Map<String, String> replacements) {
        assert element != null && replacements != null;
        // Collect first, modify afterwards, so the tree is not changed while
        // the descendant iterator is still in use.
        final List<Element> operatorsToReplace = new ArrayList<Element>();
        for (Element operator : element.getDescendants(new ElementFilter(OPERATOR))) {
            if (replacements.containsKey(operator.getTextTrim())) {
                operatorsToReplace.add(operator);
            }
        }
        for (Element operator : operatorsToReplace) {
            final String oldOperator = operator.getTextTrim();
            final String newOperator = replacements.get(oldOperator);
            operator.setText(newOperator);
            LOGGER.log(Level.FINE, "Operator ''{0}'' was replaced by ''{1}''",
                    new Object[]{oldOperator, newOperator});
        }
    }

    /**
     * Turns every descendant identifier element whose trimmed text is one of
     * the given operators into an operator element.
     *
     * @param ancestor root of the subtree to process
     * @param operators texts that are to be treated as operators
     */
    private void replaceIdentifiers(final Element ancestor, final Set<String> operators) {
        assert ancestor != null && operators != null;
        final List<Element> toReplace = new ArrayList<Element>();
        for (Element element : ancestor.getDescendants(new ElementFilter(IDENTIFIER))) {
            // TODO: control whole ranges of symbols rather than listed ones
            if (operators.contains(element.getTextTrim())) {
                toReplace.add(element);
            }
        }
        for (Element element : toReplace) {
            LOGGER.log(Level.FINE, "Creating an operator from {0}", element.getText());
            replaceElement(element, OPERATOR);
        }
    }

    /**
     * Parses a property of the form {@code key1:value1 key2:value2 ...} into
     * a map. A blank property yields an empty map; surrounding and repeated
     * whitespace between the mappings is ignored.
     *
     * @param property name of the property to parse
     * @return a new mutable map with the parsed mappings
     * @throws IllegalArgumentException if some mapping contains no colon
     */
    private Map<String, String> getPropertyMap(final String property) {
        assert property != null && isProperty(property);
        final Map<String, String> propertyMap = new HashMap<String, String>();
        final String value = getProperty(property).trim();
        if (value.isEmpty()) {
            // "".split(" ") would yield one empty token and be rejected below
            return propertyMap;
        }
        for (final String item : value.split("\\s+")) {
            // limit 2: only the first colon separates key and value
            final String[] mapping = item.split(":", 2);
            if (mapping.length != 2) {
                throw new IllegalArgumentException("property has wrong format");
            }
            propertyMap.put(mapping[0], mapping[1]);
        }
        return propertyMap;
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy