All downloads are free. The search and download functionality uses the official Maven repository.

net.projectmonkey.object.mapper.analysis.token.matcher.TokenMatcher Maven / Gradle / Ivy

Go to download

An object-mapping implementation written as an alternative to ModelMapper: it supports inheritance, handles flattening / expanding in a precise way, and is extensible and configurable.

The newest version!
package net.projectmonkey.object.mapper.analysis.token.matcher;

/*
 *
 *  * Copyright 2012 the original author or authors.
 *  *
 *  * Licensed under the Apache License, Version 2.0 (the "License");
 *  * you may not use this file except in compliance with the License.
 *  * You may obtain a copy of the License at
 *  *
 *  *      http://www.apache.org/licenses/LICENSE-2.0
 *  *
 *  * Unless required by applicable law or agreed to in writing, software
 *  * distributed under the License is distributed on an "AS IS" BASIS,
 *  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  * See the License for the specific language governing permissions and
 *  * limitations under the License.
 *
 */

/**
 *
 * @author Andy Moody
 */

import net.projectmonkey.object.mapper.util.CollectionUtil;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class TokenMatcher
{

	public static TokenMatcher INSTANCE = new TokenMatcher();

	private TokenMatcher(){}

	/**
	 * @param sourceProperties
	 * @param destinationProperties
	 * @return a list of MatchingResult objects representing the closest match of destination properties
	 * for each source property in the list followed by a PropertyMatchResult for each unmatched destination property.
	 *
	 * N.B. the algorithm operates in a sequential basis so if sourceProperties[0] has a match with
	 * destinationProperties[1] then sourceProperties[1] will be unable to match with destinationProperties[1]
	 * even if this may provide a closer match.
	 *
	 * I haven't found a real world scenario where this actually creates an issue yet - so if you have one let me know the details
	 * and I'll implement a solution.
	 */
	public List match(List> sourceProperties, List> destinationProperties)
	{
		List> unmatchedTokens = initUnmatchedTokens(destinationProperties);

		List results = matchWithTheClosestMatchingProperties(sourceProperties, destinationProperties, unmatchedTokens);

		matchRemainingSourceTokensWithUmatchedTokensInAnyProperty(sourceProperties, results, unmatchedTokens);

		for(int destinationPropertyPos = 0; destinationPropertyPos < unmatchedTokens.size(); destinationPropertyPos++)
		{
			List unmatchedTokensForDestinationProperty = unmatchedTokens.get(destinationPropertyPos);
			if(!unmatchedTokensForDestinationProperty.isEmpty())
			{
				List unmatchedDestinations = new ArrayList();
				List destinationTokens = destinationProperties.get(destinationPropertyPos);
				for(int unmatchedDestinationTokenPos = 0; unmatchedDestinationTokenPos < unmatchedTokensForDestinationProperty.size(); unmatchedDestinationTokenPos ++)
				{
					String token = unmatchedTokensForDestinationProperty.get(unmatchedDestinationTokenPos);
					int numberOfTokensUnmatched = CollectionUtil.countInstancesOf(token, unmatchedTokensForDestinationProperty);
					int destinationTokenPos = CollectionUtil.nthLastIndexOf(token, destinationTokens, numberOfTokensUnmatched);
					unmatchedDestinations.add(new TokenMatch(MatchStrength.NONE, null, destinationPropertyPos, null, destinationTokenPos));
				}
				if(results.size() > destinationPropertyPos && unmatchedDestinations.size() < destinationTokens.size())
				{
					//it's a partially matched destination - add it to the appropriate source property tally.
					results.get(destinationPropertyPos).getMatches().addAll(unmatchedDestinations);
				}
				else
				{
					results.add(new PropertyMatchResult(unmatchedDestinations));
				}
			}
		}

		return results;
	}

	private void matchRemainingSourceTokensWithUmatchedTokensInAnyProperty(final List> sourceProperties, final List results,
																		   final List> remainingUnmatchedTokens)
	{
		for(int resultPos = 0; resultPos < results.size() ; resultPos++)
		{
			List matches = results.get(resultPos).getMatches();
			List unmatchedSourceTokens = resolveUnmatchedSourceTokens(sourceProperties, resultPos, matches);
			for(int sourceTokenPos = 0; sourceTokenPos < unmatchedSourceTokens.size(); sourceTokenPos ++)
			{
				TokenAndPosition sourceTokenAndPosition = unmatchedSourceTokens.get(sourceTokenPos);
				MatchingSummary bestMatch = locateBestRemainingMatchForSingleTokenInAnyProperty(remainingUnmatchedTokens, sourceTokenAndPosition);
				if(bestMatch != null)
				{
					int matchingDestinationPropertyPosition = bestMatch.getPropertyPosition();
					TokenMatch tokenMatch = bestMatch.getMatches().get(0);
					CollectionUtil.replace(matches, sourceTokenAndPosition.tokenPosition, tokenMatch);
					CollectionUtil.replace(remainingUnmatchedTokens, matchingDestinationPropertyPosition, bestMatch.getUnmatchedDestinationTokens());
				}
			}
		}
	}

	private MatchingSummary locateBestRemainingMatchForSingleTokenInAnyProperty(final List> remainingUnmatchedTokens,
																				final TokenAndPosition sourceTokenAndPosition)
	{
		MatchingSummary bestMatch = null;
		int destinationPropertyPosition = 0;
		for(List unmatchedTokens : remainingUnmatchedTokens)
		{
			if(!unmatchedTokens.isEmpty())
			{
				List newUnmatchedTokens = new ArrayList(unmatchedTokens);
				boolean inCorrectProperty = destinationPropertyPosition == sourceTokenAndPosition.propertyPosition;
				TokenMatch match = matchSingleToken(newUnmatchedTokens, new ArrayList(unmatchedTokens), sourceTokenAndPosition,
													destinationPropertyPosition);
				MatchStrength strengthForToken = match.getStrength();
				if(bestMatch == null && !MatchStrength.NONE.equals(strengthForToken))
				{
					bestMatch = new MatchingSummary(match, newUnmatchedTokens, destinationPropertyPosition);
				}
				else if (bestMatch != null && bestMatch.getMatches().get(0).getStrength().getPriority() > strengthForToken.getPriority())
				{
					bestMatch = new MatchingSummary(match, newUnmatchedTokens, destinationPropertyPosition);
				}
			}
			destinationPropertyPosition ++;
		}
		return bestMatch;
	}

	private List resolveUnmatchedSourceTokens(final List> sourceProperties, final int resultPos,
																final List matches)
	{
		List unmatchedSourceTokens = new ArrayList();
		for(int tokenMatchPos = 0; tokenMatchPos < matches.size(); tokenMatchPos++)
		{
			if(MatchStrength.NONE.equals(matches.get(tokenMatchPos).getStrength()))
			{
				if(sourceProperties.size() > resultPos && sourceProperties.get(resultPos).size() > tokenMatchPos)
				{
					unmatchedSourceTokens.add(new TokenAndPosition(sourceProperties.get(resultPos).get(tokenMatchPos), resultPos, tokenMatchPos));
				}
			}
		}
		return unmatchedSourceTokens;
	}

	private List matchWithTheClosestMatchingProperties(final List> sourceProperties, final List> destinationProperties,
													   final List> unmatchedTokens)
	{
		List results = new ArrayList();

		int sourcePropertyPosition = 0;
		Set matchedDestinationProperties = new HashSet();
		for(List sourceTokens: sourceProperties)
		{
			MatchingSummary bestMatch = null;
			for(int i = 0; i < destinationProperties.size(); i++)
			{
				if(!matchedDestinationProperties.contains(i))
				{
					List destinationTokens = destinationProperties.get(i);
					List unmatchedTokensForDestination = new ArrayList(destinationTokens);
					List matchStrengths = matchTokens(unmatchedTokensForDestination, sourceTokens, destinationTokens, sourcePropertyPosition, i);
					MatchingSummary summary = new MatchingSummary(matchStrengths, unmatchedTokensForDestination, i);
					if(summary.isMatch() && (bestMatch == null || summary.getScore() > bestMatch.getScore()))
					{
						bestMatch = summary;
					}
				}
			}
			if(bestMatch != null)
			{
				results.add(new PropertyMatchResult(bestMatch.getMatches()));
				int destinationPropertyPosition = bestMatch.getPropertyPosition();
				CollectionUtil.replace(unmatchedTokens, destinationPropertyPosition, bestMatch.getUnmatchedDestinationTokens());
				matchedDestinationProperties.add(destinationPropertyPosition);
			}
			else
			{
				//we have more source properties than destination properties - fill up with MatchStrength.NONE
				List tokenMatches = new ArrayList();
				for(int sourceTokenPosition = 0; sourceTokenPosition < sourceTokens.size(); sourceTokenPosition++)
				{
					tokenMatches.add(new TokenMatch(MatchStrength.NONE, sourcePropertyPosition, null, sourceTokenPosition, null));
				}
				results.add(new PropertyMatchResult(tokenMatches));
			}
			sourcePropertyPosition++;
		}
		return results;
	}

	private List> initUnmatchedTokens(final List> destinationProperties)
	{
		List> unmatchedTokens = new ArrayList>(destinationProperties.size());

		for(List destinationTokens: destinationProperties)
		{
			unmatchedTokens.add(new ArrayList(destinationTokens));
		}
		return unmatchedTokens;
	}

	private List> initResult(final String[][] sourceProperties)
	{
		int length = sourceProperties.length;
		List> strengths = new ArrayList>(length);
		for(int i = 0; i < length; i++)
		{
			strengths.add(new ArrayList());
		}
		return strengths;
	}

	private List matchTokens(final List unmatchedTokens, final List sourceTokens, final List destinationTokens,
											final int sourcePropertyPosition, final int destinationPropertyPosition)
	{
		final List matches = new ArrayList();
		int sourceTokenPosition = 0;
		for(String token : sourceTokens)
		{
			TokenMatch strength = matchSingleToken(unmatchedTokens, destinationTokens, new TokenAndPosition(token, sourcePropertyPosition, sourceTokenPosition),
												   destinationPropertyPosition);
			matches.add(strength);
			sourceTokenPosition++;
		}
		return matches;
	}

	private TokenMatch matchSingleToken(final List unmatchedTokens, final List destinationTokens, final TokenAndPosition sourceTokenAndPosition,
										final int destinationPropertyPosition)
	{
		TokenMatch match = null;
		int sourceTokenPosition = sourceTokenAndPosition.tokenPosition;
		String token = sourceTokenAndPosition.token;
		boolean tokenAvailableInDestination = destinationTokens.size() > sourceTokenPosition;
		boolean inCorrectProperty = destinationPropertyPosition == sourceTokenAndPosition.propertyPosition;
		if(tokenAvailableInDestination)
		{
			String destToken = destinationTokens.get(sourceTokenPosition);
			if(destToken.equals(token))
			{
				MatchStrength strength = inCorrectProperty ? MatchStrength.EXACT : MatchStrength.IGNORING_PROPERTY_POSITION;
				match = new TokenMatch(strength, sourceTokenAndPosition.propertyPosition, destinationPropertyPosition, sourceTokenPosition, sourceTokenPosition);
				unmatchedTokens.remove(token);
			}
			else if(destToken.equalsIgnoreCase(token))
			{
				MatchStrength strength = inCorrectProperty ? MatchStrength.IGNORING_CASE : MatchStrength.IGNORING_CASE_AND_PROPERTY_POSITION;
				match = new TokenMatch(strength, sourceTokenAndPosition.propertyPosition, destinationPropertyPosition, sourceTokenPosition, sourceTokenPosition);
				boolean remove = unmatchedTokens.remove(destToken);
				if(!remove)
				{
					/*
					 * Handle the case where we have a token appearing twice with different cases
					 * and the corresponding one has already been matched in unmatched properties list
					 */
					remove(unmatchedTokens, destToken);
				}
			}
			else
			{
				match = manuallyMatch(destinationTokens, unmatchedTokens, sourceTokenAndPosition, destinationPropertyPosition);
			}
		}
		else
		{
			match = manuallyMatch(destinationTokens, unmatchedTokens, sourceTokenAndPosition, destinationPropertyPosition);
		}
		return match;
	}

	private TokenMatch manuallyMatch(final List destinationTokens, final List unmatchedTokens, final TokenAndPosition sourceTokenAndPosition, int destinationPropertyPosition)
	{
		MatchStrength strength = MatchStrength.NONE;
		Integer matchPosition = null;
		Integer unmatchedTokenPosition = null;
		String token = sourceTokenAndPosition.token;
		Integer sourcePropertyPosition = sourceTokenAndPosition.propertyPosition;
		boolean inCorrectProperty = destinationPropertyPosition == sourcePropertyPosition;
		for(int i = 0; i < destinationTokens.size() && matchPosition == null; i++)
		{
			String otherToken = destinationTokens.get(i);
			int numberOfInstancesOfThisTokenInDestinationProperties = CollectionUtil.countInstancesOf(otherToken, destinationTokens, i);
			int numberOfUnmatchedInstancesOfThisToken = CollectionUtil.countInstancesOf(otherToken, unmatchedTokens);
			if(unmatchedTokens.contains(otherToken) && numberOfInstancesOfThisTokenInDestinationProperties == numberOfUnmatchedInstancesOfThisToken)
			{
				if(token.equals(otherToken))
				{
					strength = inCorrectProperty ? MatchStrength.IGNORING_TOKEN_POSITION : MatchStrength.IGNORING_PROPERTY_AND_TOKEN_POSITION;
					matchPosition = i;
					unmatchedTokens.remove(otherToken);
				}
				else if(token.equalsIgnoreCase(otherToken))
				{
					strength = inCorrectProperty ? MatchStrength.IGNORING_CASE_AND_TOKEN_POSITION : MatchStrength.IGNORING_CASE_AND_PROPERTY_AND_TOKEN_POSITION;
					matchPosition = i;
					unmatchedTokens.remove(otherToken);
				}
			}
		}
		Integer reportedDestPropertyPosition = strength == MatchStrength.NONE ? null : destinationPropertyPosition;
		return new TokenMatch(strength, sourcePropertyPosition, reportedDestPropertyPosition, sourceTokenAndPosition.tokenPosition, matchPosition);
	}

	private void remove(final List unmatchedTokens, final String destToken)
	{
		int matchPosition = -1;
		for(int i = 0; i < unmatchedTokens.size(); i++)
		{
			String otherToken = unmatchedTokens.get(i);
			if(destToken.equalsIgnoreCase(otherToken))
			{
				matchPosition = i;
				break;
			}
		}
		unmatchedTokens.remove(matchPosition);
	}

	private static class TokenAndPosition {
		private String token;
		private Integer propertyPosition;
		private Integer tokenPosition;

		private TokenAndPosition(final String token, final Integer propertyPosition, final Integer tokenPosition)
		{
			this.token = token;
			this.propertyPosition = propertyPosition;
			this.tokenPosition = tokenPosition;
		}
	}

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy