com.formulasearchengine.mathmltools.similarity.distances.Distances Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of mathml-similarity Show documentation
Show all versions of mathml-similarity Show documentation
API to calculate similarities between MathML documents.
package com.formulasearchengine.mathmltools.similarity.distances;
import com.formulasearchengine.mathmltools.helper.XMLHelper;
import com.formulasearchengine.mathmltools.mml.CMMLInfo;
import com.formulasearchengine.mathmltools.similarity.distances.earthmover.EarthMoverDistanceWrapper;
import com.formulasearchengine.mathmltools.similarity.distances.earthmover.JFastEMD;
import com.formulasearchengine.mathmltools.similarity.distances.earthmover.Signature;
import com.formulasearchengine.mathmltools.utils.mml.ValidCSymbols;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import javax.xml.xpath.XPathExpressionException;
import java.text.DecimalFormat;
import java.util.*;
/**
* Created by Felix Hamborg on 05.12.16.
*/
public class Distances {
// Logger for this class, looked up by the fully qualified class name.
private static final Logger LOG = LogManager.getLogger(Distances.class.getName());
// Shared number formatter with pattern "#.###" (at most three fraction digits).
// Its usage is not visible in this part of the file.
// NOTE(review): java.text.DecimalFormat is documented as not synchronized;
// confirm that this shared instance is never used from several threads at once.
private static final DecimalFormat decimalFormat = new DecimalFormat("#.###");
// Private, empty constructor: the visible members of this class are all static,
// so this keeps code outside the class from creating instances.
private Distances() {
}
/**
 * Computes a distance between two histograms by converting each one to a
 * {@link Signature} and handing both to {@link JFastEMD#distance}.
 * <p>
 * Original author's note: this probably only makes sense to compute on CI
 * (presumably the content-identifier histograms; confirm with the callers).
 *
 * @param h1 first histogram, converted via
 *           {@link EarthMoverDistanceWrapper#histogramToSignature}
 * @param h2 second histogram, converted the same way
 * @return the value returned by {@code JFastEMD.distance} for the two
 *         signatures, called with {@code 0.0} as its third argument
 */
public static double computeEarthMoverAbsoluteDistance(Map h1, Map h2) {
    // Arguments are evaluated left to right, so h1 is converted before h2.
    return JFastEMD.distance(
            EarthMoverDistanceWrapper.histogramToSignature(h1),
            EarthMoverDistanceWrapper.histogramToSignature(h2),
            0.0);
}
/**
 * Computes the absolute distance between two histograms relative to the
 * total frequency mass contained in both of them.
 *
 * @param h1 first histogram (element name to frequency); values must not be {@code null}
 * @param h2 second histogram (element name to frequency); values must not be {@code null}
 * @return {@code computeAbsoluteDistance(h1, h2)} divided by the sum of all
 *         frequencies of both histograms, or {@code 0.0} if that sum is zero
 */
public static double computeRelativeDistance(Map<String, Double> h1, Map<String, Double> h2) {
    // Accumulate in double. An int accumulator combined with "+=" narrows every
    // addition back to int, which silently drops fractional frequencies and can
    // overflow for large histograms.
    double totalNumberOfElements = 0.0;
    for (double frequency : h1.values()) {
        totalNumberOfElements += frequency;
    }
    for (double frequency : h2.values()) {
        totalNumberOfElements += frequency;
    }

    // Two empty histograms are identical; also avoids dividing by zero.
    if (totalNumberOfElements == 0.0) {
        return 0.0;
    }

    final double absoluteDistance = computeAbsoluteDistance(h1, h2);
    return absoluteDistance / totalNumberOfElements;
}
/**
 * Compares two histograms and returns the accumulated absolute difference
 * of their frequencies.
 * <p>
 * Every key that occurs in at least one of the histograms is considered. A key
 * that is missing from a histogram, or mapped to {@code null} there, counts
 * as frequency {@code 0.0}.
 *
 * @param h1 first histogram (element name to frequency)
 * @param h2 second histogram (element name to frequency)
 * @return the sum over all keys of {@code |h1[key] - h2[key]|}
 */
public static double computeAbsoluteDistance(Map<String, Double> h1, Map<String, Double> h2) {
    // Union of both key sets, so entries present in only one histogram are counted too.
    final Set<String> keySet = new HashSet<>(h1.keySet());
    keySet.addAll(h2.keySet());

    double distance = 0.0;
    for (String key : keySet) {
        // Look each key up once per map; absent or null entries contribute 0.0.
        final Double f1 = h1.get(key);
        final Double f2 = h2.get(key);
        final double v1 = (f1 == null) ? 0.0 : f1;
        final double v2 = (f2 == null) ? 0.0 : f2;
        distance += Math.abs(v1 - v2);
    }
    return distance;
}
/**
 * Builds a histogram from the given content elements (which could be
 * identifiers, numbers, or operators).
 * <p>
 * The key of each entry is the trimmed text content of a node; the value is
 * the number of nodes in the list that share this text.
 *
 * @param nodes the nodes to count; every node must have non-null text content
 * @return a new, mutable map from element name to its accumulated frequency;
 *         empty if {@code nodes} contains no nodes
 */
public static HashMap<String, Double> contentElementsToHistogram(NodeList nodes) {
    final HashMap<String, Double> histogram = new HashMap<>();
    for (int i = 0; i < nodes.getLength(); i++) {
        final Node node = nodes.item(i);
        final String contentElementName = node.getTextContent().trim();
        // Increment the frequency by 1, starting at 1.0 for a name seen for the first time.
        histogram.merge(contentElementName, 1.0, Double::sum);
    }
    return histogram;
}
/**
* Adds all elements from all histogram
*
* @return
*/
public static Map histogramsPlus(List