/*
 * com.intuit.fuzzymatcher.component.MatchService (Maven / Gradle / Ivy)
 *
 * Go to download
 * Show more of this group. Show more artifacts with this name.
 * Show all versions of fuzzy-matcher. Show documentation.
 * A Java library to determine the probability of objects being similar.
 * There is a newer version: 1.2.1
 */
package com.intuit.fuzzymatcher.component;

import com.intuit.fuzzymatcher.domain.Document;
import com.intuit.fuzzymatcher.domain.Match;
import org.apache.commons.collections4.CollectionUtils;

import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;

/**
 * Entry Point for Fuzzy Matching. This class provides different ways to accept Documents for primarily 3 use case
 * 
 * 1. De-duplication of data - Where for a given list of documents it finds duplicates
 * 2. Check duplicate for a new data - Where it checks for a new Document a duplicate is present in existing list
 * 3. Check duplicates for bulk inserts - Similar to 2, where a list of new Documents is checked against existing
 * 
 * This also has similar implementation to aggregate results in different formats.
 */
public class MatchService {

    /**
     * Use this for De-duplication of data, where for a given list of documents it finds duplicates
     * Data is aggregated by a given Document
     *
     * @param documents the list of documents to match against
     * @return a map containing the grouping of each document and its corresponding matches
     */
    public Map>> applyMatch(List documents) {
        DocumentMatch documentMatch = new DocumentMatch();
        return documentMatch.matchDocuments(documents.stream())
                .collect(Collectors.groupingBy(Match::getData));
    }

    /**
     * Use this to check duplicates for bulk inserts, where a list of new Documents is checked against existing list
     * Data is aggregated by a given Document
     *
     * @param documents the list of documents to match from
     * @param matchWith the list of documents to match against
     * @return a map containing the grouping of each document and its corresponding matches
     */
    public Map>> applyMatch(List documents, List matchWith) {
        DocumentMatch documentMatch = new DocumentMatch();
        return documentMatch.matchDocuments(Stream.concat(
                matchWith.stream().map(document -> {
                    document.setSource(false);
                    return document;
                }),
                documents.stream().map(document -> {
                    document.setSource(true);
                    return document;
                })))
                .collect(Collectors.groupingBy(Match::getData));
    }

    /**
     * Use this to check duplicate for a new record, where it checks whether a new Document is a duplicate in existing list
     * Data is aggregated by a given Document
     *
     * @param document  the document to match
     * @param matchWith the list of documents to match against
     * @return a map containing the grouping of each document and its corresponding matches
     */
    public Map>> applyMatch(Document document, List matchWith) {
        DocumentMatch documentMatch = new DocumentMatch();
        return applyMatch(Arrays.asList(document), matchWith);
    }

    /**
     * Use this to check duplicate for a new record, where it checks whether a new Document is a duplicate in existing list
     * Data is aggregated by a given Document Id
     *
     * @param document  the document to match
     * @param matchWith the list of documents to match against
     * @return a map containing the grouping of each document id and its corresponding matches
     */
    public Map>> applyMatchByDocId(Document document, List matchWith) {
        DocumentMatch documentMatch = new DocumentMatch();
        return applyMatchByDocId(Arrays.asList(document), matchWith);
    }

    /**
     * Use this for De-duplication of data, where for a given list of documents it finds duplicates
     * Data is aggregated by a given Document Id
     *
     * @param documents the list of documents to match against
     * @return a map containing the grouping of each document id and its corresponding matches
     */
    public Map>> applyMatchByDocId(List documents) {
        DocumentMatch documentMatch = new DocumentMatch();
        return documentMatch.matchDocuments(documents.stream())
                .collect(Collectors.groupingBy(match -> match.getData().getKey()));
    }

    /**
     * Use this to check duplicates for bulk inserts, where a list of new Documents is checked against existing list
     * Data is aggregated by a given Document Id
     *
     * @param documents the list of documents to match from
     * @param matchWith the list of documents to match against
     * @return a map containing the grouping of each document id and its corresponding matches
     */
    public Map>> applyMatchByDocId(List documents, List matchWith) {
        DocumentMatch documentMatch = new DocumentMatch();
        return documentMatch.matchDocuments(Stream.concat(
                matchWith.stream().map(document -> {
                    document.setSource(false);
                    return document;
                }), documents.stream().map(document -> {
                    document.setSource(true);
                    return document;
                })))
                .collect(Collectors.groupingBy(match -> match.getData().getKey()));
    }

    /**
     * Use this for De-duplication of data, where for a given list of documents it finds duplicates
     * Data is aggregated by a given Document Id
     *
     * @param documents the list of documents to match against
     * @return a set containing the grouping of all relevant matches. So if A matches B, and B matches C. They will be grouped together
     */
    public Set>> applyMatchByGroups(List documents) {
        DocumentMatch documentMatch = new DocumentMatch();
        Map>> matchByKey = documentMatch.matchDocuments(documents.stream())
                .collect(Collectors.groupingBy(match -> match.getData().getKey()));

        Set docKeys = new HashSet<>(matchByKey.keySet());
        Set>> result = new HashSet<>();

        docKeys.forEach(key -> {
            Set> matchGroups = new HashSet<>();
            groupSimilar(matchByKey, key, matchGroups);
            if (CollectionUtils.isNotEmpty(matchGroups)) {
                result.add(matchGroups);
            }
        });
        return result;
    }

    private void groupSimilar(Map>> matchMap, String key, Set> matchGroups) {
        List> matches = matchMap.get(key);
        if (matches == null) {
            return;
        }
        matchMap.remove(key);

        matches.forEach(match -> {
            if (!containsMatch(matchGroups, match)) {
                matchGroups.add(match);
            }
            String matchedWithKey = match.getMatchedWith().getKey();
            groupSimilar(matchMap, matchedWithKey, matchGroups);
        });
    }

    private boolean containsMatch(Set> matchGroups, Match match) {
        return matchGroups.stream()
                .anyMatch(m -> m.getData().getKey().equals(match.getMatchedWith().getKey())
                        && m.getMatchedWith().getKey().equals(match.getData().getKey())
                );
    }
}