
com.intuit.fuzzymatcher.domain.Element Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of fuzzy-matcher Show documentation
Show all versions of fuzzy-matcher Show documentation
A Java library to determine the probability of objects being similar.
package com.intuit.fuzzymatcher.domain;
import org.apache.commons.lang3.StringUtils;
import java.util.AbstractMap;
import java.util.List;
import java.util.function.BiFunction;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import static com.intuit.fuzzymatcher.function.PreProcessFunction.toLowerCase;
import static com.intuit.fuzzymatcher.function.PreProcessFunction.trim;
/**
*
* This class represent the string "value" against which match are run.
*
* Configurable attributes
*
* - type - The ElementType for the value. This determines the functions applied at different steps of the match
* - weight - Used in scoring function to increase the Document score for an Element. Default is 1.0 for all elements
* - threshold - Value above which elements are considered a match, default 0.3
* - neighborhoodRange - Relevant for NEAREST_NEIGHBORS MatchType. Defines how close should the value be, to be considered a match (default 0.9)
* - preProcessFunction - Function to pre-process the value. If this is not set, the function defined in ElementType is used
* - tokenizerFunction - Function to break values into tokens. If this is not set, the function defined in ElementType is used
* - matchType - MatchType used. If this is not set, the type defined in ElementType is used
*
*/
public class Element implements Matchable {
private T value;
private double weight;
private double threshold;
private double neighborhoodRange;
private ElementClassification elementClassification;
private Document document;
private Function preProcessFunction;
private Function, Stream> tokenizerFunction;
private List tokens;
private MatchType matchType;
private T preProcessedValue;
public Element(ElementType type, String variance, T value, double weight, double threshold,
double neighborhoodRange, Function preProcessFunction,
Function, Stream> tokenizerFunction, MatchType matchType) {
this.weight = weight;
this.elementClassification = new ElementClassification(type, variance);
this.value = value;
this.threshold = threshold;
this.preProcessFunction = preProcessFunction == null ? type.getPreProcessFunction() : preProcessFunction;
this.tokenizerFunction = tokenizerFunction == null ? type.getTokenizerFunction() : tokenizerFunction;
this.matchType = matchType == null ? type.getMatchType() : matchType;
this.neighborhoodRange = neighborhoodRange;
}
public ElementClassification getElementClassification() {
return elementClassification;
}
public T getValue() {
return value;
}
@Override
public double getWeight() {
return weight;
}
public double getThreshold() {
return threshold;
}
public double getNeighborhoodRange() {
return neighborhoodRange;
}
public Document getDocument() {
return document;
}
public void setDocument(Document document) {
this.document = document;
}
public void setPreProcessedValue(T preProcessedValue) {
this.preProcessedValue = preProcessedValue;
}
public Function getPreProcessFunction() {
return this.preProcessFunction;
}
public T getPreProcessedValue() {
if (this.preProcessedValue == null) {
if (this.value instanceof String) {
// Default String pre-processing
Function preProcessingFunc = (Function) getPreProcessFunction();
setPreProcessedValue((T) preProcessingFunc.andThen(trim()).andThen(toLowerCase()).apply((String) this.value));
} else {
setPreProcessedValue(getPreProcessFunction().apply(this.value));
}
}
return this.preProcessedValue;
}
public AbstractMap.SimpleEntry getPreprocessedValueWithType() {
return new AbstractMap.SimpleEntry(this.getElementClassification(), this.getPreProcessedValue());
}
public Function, Stream> getTokenizerFunction() {
return this.tokenizerFunction;
}
public MatchType getMatchType() {
return this.matchType;
}
public List getTokens() {
if (this.tokens == null) {
this.tokens = getTokenizerFunction().apply(this).distinct().collect(Collectors.toList());
}
return this.tokens;
}
public double getScore(Integer matchingCount, Element other) {
return ((double)matchingCount / (double) getChildCount(other));
}
/**
* This gets the Max number of tokens present between matching Elements.
* For Elements that do not have a balanced set of tokens, it can push the score down.
*/
@Override
public long getChildCount(Matchable other) {
if (other instanceof Element) {
Element o = (Element) other;
return Math.max(this.getTokens().size(), o.getTokens().size());
}
return 0;
}
@Override
public long getUnmatchedChildCount(Matchable other) {
if (other instanceof Element) {
Element o = (Element) other;
long emptyChildren = this.getTokens().stream()
.filter(token -> token == null || StringUtils.isEmpty(token.getValue().toString()))
.count();
long oEmptyChildren = o.getTokens().stream()
.filter(token -> token == null || StringUtils.isEmpty(token.getValue().toString()))
.count();
return Math.max(emptyChildren, oEmptyChildren);
}
return 0;
}
@Override
public BiFunction, Score> getScoringFunction() {
return null;
}
public static class Builder {
private ElementType type;
private String variance;
private T value;
private double weight = 1.0;
private double threshold = 0.3;
private double neighborhoodRange = 0.9;
private Function preProcessFunction;
private MatchType matchType;
private Function, Stream> tokenizerFunction;
public Builder setType(ElementType type) {
this.type = type;
return this;
}
public Builder setVariance(String variance) {
this.variance = variance;
return this;
}
public Builder setValue(T value) {
this.value = value;
return this;
}
public Builder setWeight(double weight) {
this.weight = weight;
return this;
}
public Builder setThreshold(double threshold) {
this.threshold = threshold;
return this;
}
public Builder setNeighborhoodRange(double neighborhoodRange) {
this.neighborhoodRange = neighborhoodRange;
return this;
}
public Builder setPreProcessingFunction(Function preProcessingFunction) {
this.preProcessFunction = preProcessingFunction;
return this;
}
public Builder setTokenizerFunction(Function, Stream> tokenizerFunction) {
this.tokenizerFunction = tokenizerFunction;
return this;
}
public Builder setMatchType(MatchType matchType) {
this.matchType = matchType;
return this;
}
public Element createElement() {
return new Element(type, variance, value, weight, threshold, neighborhoodRange, preProcessFunction, tokenizerFunction, matchType);
}
}
@Override
public String toString() {
return "{" +
"'" + value + '\'' +
'}';
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
Element element = (Element) o;
if (value != null ? !value.equals(element.value) : element.value != null) return false;
if (elementClassification != null ? !elementClassification.equals(element.elementClassification) : element.elementClassification != null)
return false;
return document != null ? document.equals(element.document) : element.document == null;
}
@Override
public int hashCode() {
int result = value != null ? value.hashCode() : 0;
result = 31 * result + (elementClassification != null ? elementClassification.hashCode() : 0);
result = 31 * result + (document != null ? document.hashCode() : 0);
return result;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy