All downloads are free. The search and download features use the official Maven repository.

com.intuit.fuzzymatcher.function.TokenizerFunction Maven / Gradle / Ivy

There is a newer version: 1.2.1
Show newest version
package com.intuit.fuzzymatcher.function;

import com.intuit.fuzzymatcher.domain.Element;
import com.intuit.fuzzymatcher.domain.Token;
import com.intuit.fuzzymatcher.util.Utils;

import java.util.Arrays;
import java.util.function.Function;
import java.util.stream.Stream;

/**
 * A functional interface to Tokenize Elements
 */
public interface TokenizerFunction extends Function> {

    static TokenizerFunction valueTokenizer() {
        return (element -> Stream.of(getToken(element, element.getPreProcessedValue(), false)));
    }

    static TokenizerFunction wordTokenizer() {
        return (element) -> Arrays.stream(element.getPreProcessedValue().split("\\s+"))
                .map(token -> getToken(element, token, false));
    }

    static TokenizerFunction triGramTokenizer() {
        return (element) -> getTokens(3, element);
    }

    static TokenizerFunction decaGramTokenizer() {
        return (element) -> getTokens(10, element);
    }


    static Stream getTokens(int size, Element element) {
        String elementValue = element.getPreProcessedValue();
        return Utils.getNGrams(elementValue, size)
                .map(str -> getToken(element, str, true));

    }

    static Token getToken(Element element, String token, boolean nGramTokenized) {
        return new Token(token, element, nGramTokenized);
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy