All downloads are free. Search and download functionality uses the official Maven repository.

com.intuit.fuzzymatcher.component.MatchService Maven / Gradle / Ivy

There is a newer version: 1.2.1
Show newest version
package com.intuit.fuzzymatcher.component;

import com.intuit.fuzzymatcher.domain.Document;
import com.intuit.fuzzymatcher.domain.Match;
import org.apache.commons.collections4.CollectionUtils;

import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;

/**
 * Entry Point for Fuzzy Matching. This class provides different ways to accept Documents for primarily 3 use case
 * 

* 1. De-duplication of data - Where for a given list of documents it finds duplicates * 2. Check duplicate for a new data - Where it checks for a new Document a duplicate is present in existing list * 3. Check duplicates for bulk inserts - Similar to 2, where a list of new Documents is checked against existing *

* This also has similar implementation to aggregate results in different formats. */ public class MatchService { /** * Use this for De-duplication of data, where for a given list of documents it finds duplicates * Data is aggregated by a given Document * * @param documents the list of documents to match against * @return a map containing the grouping of each document and its corresponding matches */ public Map>> applyMatch(List documents) { DocumentMatch documentMatch = new DocumentMatch(); return documentMatch.matchDocuments(documents.stream()) .collect(Collectors.groupingBy(Match::getData)); } /** * Use this to check duplicates for bulk inserts, where a list of new Documents is checked against existing list * Data is aggregated by a given Document * * @param documents the list of documents to match from * @param matchWith the list of documents to match against * @return a map containing the grouping of each document and its corresponding matches */ public Map>> applyMatch(List documents, List matchWith) { DocumentMatch documentMatch = new DocumentMatch(); return documentMatch.matchDocuments(Stream.concat( matchWith.stream().map(document -> { document.setSource(false); return document; }), documents.stream().map(document -> { document.setSource(true); return document; }))) .collect(Collectors.groupingBy(Match::getData)); } /** * Use this to check duplicate for a new record, where it checks whether a new Document is a duplicate in existing list * Data is aggregated by a given Document * * @param document the document to match * @param matchWith the list of documents to match against * @return a map containing the grouping of each document and its corresponding matches */ public Map>> applyMatch(Document document, List matchWith) { DocumentMatch documentMatch = new DocumentMatch(); return applyMatch(Arrays.asList(document), matchWith); } /** * Use this to check duplicate for a new record, where it checks whether a new Document is a duplicate in existing list * Data 
is aggregated by a given Document Id * * @param document the document to match * @param matchWith the list of documents to match against * @return a map containing the grouping of each document id and its corresponding matches */ public Map>> applyMatchByDocId(Document document, List matchWith) { DocumentMatch documentMatch = new DocumentMatch(); return applyMatchByDocId(Arrays.asList(document), matchWith); } /** * Use this for De-duplication of data, where for a given list of documents it finds duplicates * Data is aggregated by a given Document Id * * @param documents the list of documents to match against * @return a map containing the grouping of each document id and its corresponding matches */ public Map>> applyMatchByDocId(List documents) { DocumentMatch documentMatch = new DocumentMatch(); return documentMatch.matchDocuments(documents.stream()) .collect(Collectors.groupingBy(match -> match.getData().getKey())); } /** * Use this to check duplicates for bulk inserts, where a list of new Documents is checked against existing list * Data is aggregated by a given Document Id * * @param documents the list of documents to match from * @param matchWith the list of documents to match against * @return a map containing the grouping of each document id and its corresponding matches */ public Map>> applyMatchByDocId(List documents, List matchWith) { DocumentMatch documentMatch = new DocumentMatch(); return documentMatch.matchDocuments(Stream.concat( matchWith.stream().map(document -> { document.setSource(false); return document; }), documents.stream().map(document -> { document.setSource(true); return document; }))) .collect(Collectors.groupingBy(match -> match.getData().getKey())); } /** * Use this for De-duplication of data, where for a given list of documents it finds duplicates * Data is aggregated by a given Document Id * * @param documents the list of documents to match against * @return a set containing the grouping of all relevant matches. 
So if A matches B, and B matches C. They will be grouped together */ public Set>> applyMatchByGroups(List documents) { DocumentMatch documentMatch = new DocumentMatch(); Map>> matchByKey = documentMatch.matchDocuments(documents.stream()) .collect(Collectors.groupingBy(match -> match.getData().getKey())); Set docKeys = new HashSet<>(matchByKey.keySet()); Set>> result = new HashSet<>(); docKeys.forEach(key -> { Set> matchGroups = new HashSet<>(); groupSimilar(matchByKey, key, matchGroups); if (CollectionUtils.isNotEmpty(matchGroups)) { result.add(matchGroups); } }); return result; } private void groupSimilar(Map>> matchMap, String key, Set> matchGroups) { List> matches = matchMap.get(key); if (matches == null) { return; } matchMap.remove(key); matches.forEach(match -> { if (!containsMatch(matchGroups, match)) { matchGroups.add(match); } String matchedWithKey = match.getMatchedWith().getKey(); groupSimilar(matchMap, matchedWithKey, matchGroups); }); } private boolean containsMatch(Set> matchGroups, Match match) { return matchGroups.stream() .anyMatch(m -> m.getData().getKey().equals(match.getMatchedWith().getKey()) && m.getMatchedWith().getKey().equals(match.getData().getKey()) ); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy