
com.intuit.fuzzymatcher.function.TokenizerFunction Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of fuzzy-matcher Show documentation
Show all versions of fuzzy-matcher Show documentation
A java library to determine probability of objects being similar
package com.intuit.fuzzymatcher.function;
import com.intuit.fuzzymatcher.domain.Element;
import com.intuit.fuzzymatcher.domain.Token;
import com.intuit.fuzzymatcher.exception.MatchException;
import com.intuit.fuzzymatcher.util.Utils;
import org.apache.commons.codec.language.Soundex;
import java.util.Arrays;
import java.util.function.Function;
import java.util.stream.Stream;
/**
* A functional interface to Tokenize Elements
*/
public class TokenizerFunction {
private static final Soundex soundex = new Soundex();
public static Function> valueTokenizer() {
return (element -> Stream.of(new Token(element.getPreProcessedValue(), element)));
}
public static Function, Stream>> wordTokenizer() {
return (element) -> Arrays.stream(element.getPreProcessedValue().split("\\s+"))
.map(token -> new Token(token, element));
}
public static Function, Stream>> wordSoundexEncodeTokenizer() {
return (element) -> Arrays.stream(element.getPreProcessedValue().toString().split("\\s+"))
.map(val -> {
String code = val;
if (!Utils.isNumeric(val)) {
code = soundex.encode(val);
if (code.equals("")) {
code = val;
}
}
// System.out.println(val +"->" + code);
return code;
}).map(token -> new Token(token, element));
}
public static Function, Stream>> triGramTokenizer() {
return (element) -> getNGramTokens(3, element);
}
public static Function, Stream>> decaGramTokenizer() {
return (element) -> getNGramTokens(10, element);
}
public static Stream> getNGramTokens(int size, Element element) {
Object elementValue = element.getPreProcessedValue();
String elementValueStr;
if (elementValue instanceof String) {
elementValueStr = (String) elementValue;
} else {
throw new MatchException("Unsupported data type");
}
return Utils.getNGrams(elementValueStr, size)
.map(str -> new Token(str, element));
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy