
/*
 * com.intuit.fuzzymatcher.component.TokenMatch (Maven / Gradle / Ivy artifact: fuzzy-matcher)
 * Artifact page navigation: Go to download | Show more of this group | Show more artifacts with this name
 *                           | Show all versions of fuzzy-matcher | Show documentation
 * A Java library to determine the probability of objects being similar.
 */
package com.intuit.fuzzymatcher.component;
import com.intuit.fuzzymatcher.domain.ElementType;
import com.intuit.fuzzymatcher.domain.Match;
import com.intuit.fuzzymatcher.domain.NGram;
import com.intuit.fuzzymatcher.domain.Token;
import org.apache.commons.lang3.BooleanUtils;
import org.springframework.stereotype.Component;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
*
* Matches at Token level, this class uses the SimilarityMatchFunction to get a score at a Token level
* This class also optimizes which tokens undergo match, by breaking it to NGram and figuring out the Search Groups
*/
@Component
public class TokenMatch {
public Stream> matchTokens(Stream input) {
List tokenList = input.collect(Collectors.toList());
initializeSearchGroups(tokenList);
return tokenList.parallelStream()
.filter(left -> BooleanUtils.isNotFalse(left.getElement().getDocument().isSource()))
.flatMap(
left -> left.getSearchGroups()
.filter(right -> !left.getElement().getDocument().getKey().equals(right.getElement().getDocument().getKey()))
.map(right -> {
double result = left.getElement().getSimilarityMatchFunction().apply(left, right);
return new Match(left, right, result);
})
.filter(match -> match.getResult() >= match.getData().getElement().getThreshold())
);
}
private void initializeSearchGroups(List input) {
Stream nGramStream = input.parallelStream().flatMap(token -> token.getNGrams());
Map> elementTypeNGramMap = nGramStream
.collect(Collectors.groupingBy(ngram -> ngram.getToken().getElement().getType()));
elementTypeNGramMap.entrySet().parallelStream().forEach(entry -> {
List ngramsList = entry.getValue();
Map> stringNGramMap = ngramsList.parallelStream().collect(Collectors.groupingBy(NGram::getValue));
stringNGramMap.entrySet().stream().forEach(stringListEntry -> {
List groups = stringListEntry.getValue().parallelStream()
.map(NGram::getToken)
.distinct()
.collect(Collectors.toList());
groups.parallelStream()
.filter(token -> BooleanUtils.isNotFalse(token.getElement().getDocument().isSource()))
.forEach(token -> token.setSearchGroups(Stream.concat(groups.stream(), token.getSearchGroups())));
});
});
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy