com.intuit.fuzzymatcher.component.MatchService Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of fuzzy-matcher Show documentation
A java library to determine probability of objects being similar
There is a newer version: 1.2.1
package com.intuit.fuzzymatcher.component;

import com.intuit.fuzzymatcher.domain.Document;
import com.intuit.fuzzymatcher.domain.Match;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.Stream;

/**
 * Entry Point for Fuzzy Matching. This class provides different ways to accept Documents for primarily 3 use case
 * 
 * 1. De-duplication of data - Where for a given list of documents it finds duplicates
 * 2. Check duplicate for a new data - Where it checks for a new Document a duplicate is present in existing list
 * 3. Check duplicates for bulk inserts - Similar to 2, where a list of new Documents is checked against existing
 * 
 * This also has similar implementation to aggregate results in different formats.
 */
@Component
public class MatchService {

    @Autowired
    private DocumentMatch documentMatch;

    /**
     * Use this for De-duplication of data, where for a given list of documents it finds duplicates
     * Data is aggregated by a given Document
     *
     * @param documents the list of documents to match against
     * @return a map containing the grouping of each document and its corresponding matches
     */
    public Map>> applyMatch(List documents) {
        return documentMatch.matchDocuments(documents.parallelStream())
                .collect(Collectors.groupingBy(Match::getData));
    }

    /**
     * Use this to check duplicates for bulk inserts, where a list of new Documents is checked against existing list
     * Data is aggregated by a given Document
     *
     * @param documents the list of documents to match from
     * @param matchWith the list of documents to match against
     * @return a map containing the grouping of each document and its corresponding matches
     */
    public Map>> applyMatch(List documents, List matchWith) {
        return documentMatch.matchDocuments(Stream.concat(
                documents.parallelStream().map(document -> {
                    document.setSource(true);
                    return document;
                }), matchWith.parallelStream().map(document -> {
                    document.setSource(false);
                    return document;
                })))
                .collect(Collectors.groupingBy(Match::getData));
    }

    /**
     * Use this to check duplicate for a new record, where it checks whether a new Document is a duplicate in existing list
     * Data is aggregated by a given Document
     *
     * @param document the document to match
     * @param matchWith the list of documents to match against
     * @return a map containing the grouping of each document and its corresponding matches
     */
    public Map>> applyMatch(Document document, List matchWith) {
        return applyMatch(Arrays.asList(document), matchWith);
    }

    /**
     * Use this to check duplicate for a new record, where it checks whether a new Document is a duplicate in existing list
     * Data is aggregated by a given Document Id
     *
     * @param document the document to match
     * @param matchWith the list of documents to match against
     * @return a map containing the grouping of each document id and its corresponding matches
     */
    public Map>> applyMatchByDocId(Document document, List matchWith) {
        return applyMatchByDocId(Arrays.asList(document), matchWith);
    }

    /**
     * Use this for De-duplication of data, where for a given list of documents it finds duplicates
     * Data is aggregated by a given Document Id
     *
     * @param documents the list of documents to match against
     * @return a map containing the grouping of each document id and its corresponding matches
     */
    public Map>> applyMatchByDocId(List documents) {
        return documentMatch.matchDocuments(documents.parallelStream())
                .collect(Collectors.groupingBy(match -> match.getData().getKey()));
    }

    /**
     * Use this to check duplicates for bulk inserts, where a list of new Documents is checked against existing list
     * Data is aggregated by a given Document Id
     *
     * @param documents the list of documents to match from
     * @param matchWith the list of documents to match against
     * @return a map containing the grouping of each document id and its corresponding matches
     */
    public Map>> applyMatchByDocId(List documents, List matchWith) {
        return documentMatch.matchDocuments(Stream.concat(
                documents.parallelStream().map(document -> {
                    document.setSource(true);
                    return document;
                }), matchWith.parallelStream().map(document -> {
                    document.setSource(false);
                    return document;
                })))
                .collect(Collectors.groupingBy(match -> match.getData().getKey()));
    }
}