ma.glasnost.orika.metadata.ScoringClassMapBuilder Maven / Gradle / Ivy

/*
 * Orika - simpler, better and faster Java bean mapping
 *
 * Copyright (C) 2011-2013 Orika authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package ma.glasnost.orika.metadata;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.PriorityQueue;
import java.util.Set;

import ma.glasnost.orika.DefaultFieldMapper;
import ma.glasnost.orika.MapperFactory;
import ma.glasnost.orika.property.PropertyResolverStrategy;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * ScoringClassMapBuilder is an extension of the basic ClassMapBuilder that
 * attempts to compute a best-fit matching of all properties (at every level
 * of nesting) of one type to another, based on various metrics used to measure
 * a given property match.
 * <p>
 * Since this builder generates mappings based on scoring matches, it cannot always
 * guess the correct mappings; be sure to test and double-check the generated
 * mappings to ensure they match expectations.
 * <p>
 * Note: the Levenshtein distance implementation is pulled from code found in
 * Apache Commons Lang org.apache.commons.lang.StringUtils, which is based on
 * the implementation provided by Chas Emerick 
 * http://www.merriampark.com/ldjava.htm
 * 
 * @author [email protected]
 * @param <A> the first type of the mapping
 * @param <B> the second type of the mapping
 */
public class ScoringClassMapBuilder extends ClassMapBuilder {
    
    /** Logger used by byDefault to trace each candidate match score. */
    private static final Logger LOGGER = LoggerFactory.getLogger(ScoringClassMapBuilder.class);
    
    /** Weights handed to every FieldMatchScore created by this builder. */
    private final PropertyMatchingWeights matchingWeights;
    
    /**
     * PropertyMatchingWeights is a class used to describe how different
     * matching scenarios should be weighted when computing a match
     * score for a set of properties.
     * <p>
     * Every weight must lie between 0.0 and 1.0 (inclusive) and starts out at 0.5.
     * Each setter validates its argument and returns this instance, so calls
     * can be chained.
     * 
     * @author [email protected]
     *
     */
    public static final class PropertyMatchingWeights {
        
        private static final double MIN_WEIGHT = 0.0;
        private static final double MAX_WEIGHT = 1.0;
        
        private double nestedDepth = MAX_WEIGHT / 2.0;
        private double unmatchedWords = MAX_WEIGHT / 2.0;
        private double editDistance = MAX_WEIGHT / 2.0;
        private double containsName = MAX_WEIGHT / 2.0;
        private double typeMatch = MAX_WEIGHT / 2.0;
        private double commonWordCount = MAX_WEIGHT / 2.0;
        private double minimumScore = MAX_WEIGHT / 2.0;
        
        /**
         * @return the weight associated with the number of words found in common
         * between two property expressions
         */
        public double commonWordCount() {
            return commonWordCount;
        }
        
        /**
         * Set the weight associated with the number of words found in common
         * between two property expressions
         * 
         * @param weight the weight associated with the number of words found in common
         * between two property expressions
         * @return this instance of PropertyMatchingWeights
         * @throws IllegalArgumentException if the weight is NaN or outside 0.0 to 1.0
         */
        public PropertyMatchingWeights commonWordCount(double weight) {
            validateWeight(weight);
            this.commonWordCount = weight;
            return this;
        }
        
        /**
         * @return the weight associated with one property containing the
         * entire name of another property
         */
        public double containsName() {
            return containsName;
        }
        
        /**
         * Set the weight associated with one property containing the
         * entire name of another property.
         * 
         * @param weight the weight associated with one property containing the
         * entire name of another property
         * @return this instance of PropertyMatchingWeights
         * @throws IllegalArgumentException if the weight is NaN or outside 0.0 to 1.0
         */
        public PropertyMatchingWeights containsName(double weight) {
            validateWeight(weight);
            this.containsName = weight;
            return this;
        }
        
        /**
         * @return the weight associated with one property matching the type of the other
         */
        public double typeMatch() {
            return typeMatch;
        }
        
        /**
         * Set the weight associated with one property matching the type of the other
         * 
         * @param weight the weight associated with one property matching the type of the other
         * @return this instance of PropertyMatchingWeights
         * @throws IllegalArgumentException if the weight is NaN or outside 0.0 to 1.0
         */
        public PropertyMatchingWeights typeMatch(double weight) {
            validateWeight(weight);
            this.typeMatch = weight;
            return this;
        }
        
        /**
         * @return the weight modifier associated with a property word's edit distance based on
         * its nesting depth
         */
        public double nestedDepth() {
            return nestedDepth;
        }
        
        /**
         * Set the weight modifier associated with a property word's edit distance based on
         * its nesting depth; higher values here cause the matching to be more focused toward
         * the final name of a nested property, lower values focus on the entire name more evenly
         * 
         * @param weight the weight modifier associated with a property word's edit distance based on
         * its nesting depth
         * @return this instance of PropertyMatchingWeights
         * @throws IllegalArgumentException if the weight is NaN or outside 0.0 to 1.0
         */
        public PropertyMatchingWeights nestedDepth(double weight) {
            validateWeight(weight);
            this.nestedDepth = weight;
            return this;
        }
        
        /**
         * @return the weight associated with the number of unmatched words between two property expressions
         */
        public double unmatchedWords() {
            return unmatchedWords;
        }
        
        /**
         * Set the weight associated with the number of unmatched words between two property expressions
         * 
         * @param weight the weight associated with the number of unmatched words between two property expressions
         * @return this instance of PropertyMatchingWeights
         * @throws IllegalArgumentException if the weight is NaN or outside 0.0 to 1.0
         */
        public PropertyMatchingWeights unmatchedWords(double weight) {
            validateWeight(weight);
            this.unmatchedWords = weight;
            return this;
        }
        
        /**
         * @return the weight associated with the edit distance between words in two property expressions
         */
        public double editDistance() {
            return editDistance;
        }
        
        /**
         * Set the weight associated with the edit distance between words in two property expressions
         * 
         * @param weight the weight associated with the edit distance between words in two property expressions
         * @return this instance of PropertyMatchingWeights
         * @throws IllegalArgumentException if the weight is NaN or outside 0.0 to 1.0
         */
        public PropertyMatchingWeights editDistance(double weight) {
            validateWeight(weight);
            this.editDistance = weight;
            return this;
        }
        
        /**
         * @return the weight applied to the minimum score needed to accept a given match
         */
        public double minimumScore() {
            return minimumScore;
        }
        
        /**
         * Set the weight applied to the minimum score needed to accept a given match; setting higher
         * values makes the matching more restrictive, lower scores make matching more lenient.
         * 
         * @param weight the weight applied to the minimum score needed to accept a given match
         * @return this instance of PropertyMatchingWeights
         * @throws IllegalArgumentException if the weight is NaN or outside 0.0 to 1.0
         */
        public PropertyMatchingWeights minimumScore(double weight) {
            validateWeight(weight);
            this.minimumScore = weight;
            return this;
        }
        
        /**
         * Rejects any weight outside the inclusive range MIN_WEIGHT to MAX_WEIGHT.
         * 
         * @param weight the candidate weight
         * @throws IllegalArgumentException if the weight is NaN or out of range
         */
        private static void validateWeight(double weight) {
            // Phrased as a negated in-range test: every comparison with NaN is
            // false, so "weight < MIN || weight > MAX" would let NaN through.
            if (!(weight >= MIN_WEIGHT && weight <= MAX_WEIGHT)) {
                throw new IllegalArgumentException("weights should be between " + MIN_WEIGHT + " and " + MAX_WEIGHT);
            }
        }
    }
    
    
    /**
     * Constructs a new instance of ScoringClassMapBuilder, using the provided PropertyMatchingWeights
     * to adjust the overall scoring of how properties are matched.
     * 
     * @param aType the first type of the mapping; passed through to the ClassMapBuilder constructor
     * @param bType the second type of the mapping; passed through to the ClassMapBuilder constructor
     * @param mapperFactory passed through to the ClassMapBuilder constructor
     * @param propertyResolver passed through to the ClassMapBuilder constructor
     * @param defaults passed through to the ClassMapBuilder constructor
     * @param matchingWeights the weights stored by this builder and handed to each
     *        FieldMatchScore it creates
     */
    protected ScoringClassMapBuilder(Type aType, Type bType, MapperFactory mapperFactory, PropertyResolverStrategy propertyResolver,
            DefaultFieldMapper[] defaults, PropertyMatchingWeights matchingWeights) {
        super(aType, bType, mapperFactory, propertyResolver, defaults);
        this.matchingWeights = matchingWeights;
    }
    
    /**
     * Builds field mappings by scoring every pairing of a type-A property
     * expression with a type-B property expression and accepting pairs from
     * the highest score downward.
     * <p>
     * A pair is mapped only when neither of its expressions has been mapped
     * already and its score meets the configured minimum. Properties of type A
     * that remain unmatched afterwards are offered to the default field
     * mappers. Properties named "class" are never scored.
     * <p>
     * See also {@code ClassMapBuilder#byDefault(DefaultFieldMapper[])}.
     * 
     * @param direction the direction applied to every field map that is added
     * @param withDefaults the default field mappers to consult for unmatched
     *        properties; when empty, the mappers returned by
     *        getDefaultFieldMappers() are used instead
     * @return this builder
     */
    public ClassMapBuilder byDefault(MappingDirection direction, DefaultFieldMapper... withDefaults) {
        
        DefaultFieldMapper[] defaults;
        if (withDefaults.length == 0) {
            defaults = getDefaultFieldMappers();
        } else {
            defaults = withDefaults;
        }
        /*
         * For our custom 'byDefault' method, we're going to try and match
         * fields by their Levenshtein distance
         */
        PriorityQueue<FieldMatchScore> matchScores = new PriorityQueue<FieldMatchScore>();
        
        Map<String, Property> propertiesForA = getPropertyExpressions(getAType());
        Map<String, Property> propertiesForB = getPropertyExpressions(getBType());
        
        for (final Entry<String, Property> propertyA : propertiesForA.entrySet()) {
            if (!propertyA.getValue().getName().equals("class")) {
                for (final Entry<String, Property> propertyB : propertiesForB.entrySet()) {
                    if (!propertyB.getValue().getName().equals("class")) {
                        FieldMatchScore matchScore = new FieldMatchScore(propertyA.getValue(), propertyB.getValue(), matchingWeights);
                        matchScores.add(matchScore);
                    }
                }
            }
        }
        
        Set<String> unmatchedFields = new LinkedHashSet<String>(this.getPropertiesForTypeA());
        unmatchedFields.remove("class");
        
        /*
         * Drain the queue with poll(): PriorityQueue's iterator makes no
         * ordering guarantee, so a for-each loop would not visit the best
         * scores first and a weaker pair could claim a property before its
         * best match is considered.
         */
        FieldMatchScore score;
        while ((score = matchScores.poll()) != null) {
            
            if (!this.getMappedPropertiesForTypeA().contains(score.propertyA.getExpression())
                    && !this.getMappedPropertiesForTypeB().contains(score.propertyB.getExpression())) {
                if (LOGGER.isTraceEnabled()) {
                    LOGGER.trace("\n" + score.toString());
                }
                if (score.meetsMinimumScore()) {
                    fieldMap(score.propertyA.getExpression(), score.propertyB.getExpression()).direction(direction).add();
                    unmatchedFields.remove(score.propertyA.getExpression());
                }
            }
        }
        
        /*
         * Apply any default field mappers to the unmapped fields
         */
        for (String propertyNameA : unmatchedFields) {
            Property prop = resolvePropertyForA(propertyNameA);
            for (DefaultFieldMapper defaulter : defaults) {
                String suggestion = defaulter.suggestMappedField(propertyNameA, prop.getType());
                if (suggestion != null && getPropertiesForTypeB().contains(suggestion)) {
                    if (!getMappedPropertiesForTypeB().contains(suggestion)) {
                        fieldMap(propertyNameA, suggestion).direction(direction).add();
                    }
                }
            }
        }
        
        return this;
    }
    
    /**
     * Factory creates ScoringClassMapBuilder instances which all share the
     * PropertyMatchingWeights held by the factory.
     * 
     * @author mattdeboer
     *
     */
    public static class Factory extends ClassMapBuilderFactory {
        
        private PropertyMatchingWeights matchingWeights;
        
        /**
         * Constructs a new Factory for ScoringClassMapBuilder instances,
         * using a freshly created PropertyMatchingWeights.
         */
        public Factory() {
            this(new PropertyMatchingWeights());
        }
        
        /**
         * Constructs a new Factory for ScoringClassMapBuilder instances
         * 
         * @param matchingWeights the weights used to control the scoring of the
         * ScoringClassMapBuilder instances created by this factory
         */
        public Factory(PropertyMatchingWeights matchingWeights) {
            this.matchingWeights = matchingWeights;
        }
        
        /**
         * Creates a ScoringClassMapBuilder from the given arguments plus the
         * matching weights held by this factory.
         * 
         * @see ma.glasnost.orika.metadata.ClassMapBuilderFactory
         */
        @Override
        protected ClassMapBuilder newClassMapBuilder(Type aType, Type bType, MapperFactory mapperFactory,
                PropertyResolverStrategy propertyResolver, DefaultFieldMapper[] defaults) {
            
            ScoringClassMapBuilder builder = new ScoringClassMapBuilder(aType, bType, mapperFactory, propertyResolver, defaults,
                    this.matchingWeights);
            return builder;
        }
        
    }
    
    /**
     * FieldMatchScore is used to score the match of a pair of property expressions
     * 
     * @author [email protected]
     * 
     */
    public static class FieldMatchScore implements Comparable {
        
        // Filler words intended to be left out of the word comparison; consulted
        // when the constructor prepares the word lists of the two properties.
        private static final List IGNORED_WORDS = Arrays.asList("with","this","that","an","a","of","the");
        /*
         * TODO: static for now; should probably be computed
         */
        // meetsMinimumScore() compares the total score against half of this
        // value multiplied by the minimumScore weight.
        private static final double MAX_POSSIBLE_SCORE = 50.0;
        
        // Weights applied to the individual components of the score.
        private final PropertyMatchingWeights matchingWeights;
        
        // True when one property name contains the other (case-sensitive).
        private boolean contains;
        // True when one property name contains the other, ignoring case.
        private boolean containsIgnoreCase;
        // Raw type-match value assigned by the constructor: 1.0, 0.0 or negative infinity.
        private double typeMatch;
        private Property propertyA;
        private Property propertyB;
        // Hash code computed once by the constructor and returned by hashCode().
        private int hashCode;
        // Number of distinct words the two property expressions have in common.
        private double commonWordCount;
        // Average of the sizes of the two split word lists.
        private double avgWordCount;
        // Result of computeWordMatchScore for the two word lists.
        private double wordMatchScore;
        // Total score: wordMatchScore plus the three weighted components below.
        private double score;
        private double typeMatchScore;
        private double commonWordsScore;
        private double containsScore;
        
        /**
         * Constructs a new FieldMatchScore based on the provided pair of properties, with scoring modified by
         * the provided PropertyMatchingWeights.
         * <p>
         * Both property expressions are split into lower-case words, the ignored
         * filler words are dropped, and the individual score components and the
         * total score are computed immediately.
         * 
         * @param propertyA the property of the first type
         * @param propertyB the property of the second type
         * @param matchingWeights the weights applied to the individual score components
         */
        public FieldMatchScore(Property propertyA, Property propertyB, PropertyMatchingWeights matchingWeights) {
            
            this.matchingWeights = matchingWeights;
            this.propertyA = propertyA;
            this.propertyB = propertyB;
            
            String propertyALower = propertyA.getName().toLowerCase();
            String propertyBLower = propertyB.getName().toLowerCase();
            
            List<List<String>> aWords = splitIntoLowerCaseWords(propertyA.getExpression());
            List<List<String>> bWords = splitIntoLowerCaseWords(propertyB.getExpression());
            
            /*
             * The word lists are nested (one list of words per nesting level),
             * so the ignored words have to be removed from each inner list;
             * calling removeAll on the outer list compares lists against
             * strings and never removes anything.
             */
            removeIgnoredWords(aWords);
            removeIgnoredWords(bWords);
            
            Set<String> commonWords = intersection(aWords, bWords);
            
            this.avgWordCount = (aWords.size() + bWords.size()) / 2.0;
            
            this.commonWordCount = commonWords.size();
            this.wordMatchScore = computeWordMatchScore(aWords, bWords);
            
            this.contains = propertyA.getName().contains(propertyB.getName()) || propertyB.getName().contains(propertyA.getName());
            this.containsIgnoreCase = contains || propertyALower.contains(propertyBLower) || propertyBLower.contains(propertyALower);
            
            if (propertyA.isMultiOccurrence() != propertyB.isMultiOccurrence()) {
                // exactly one side is multi-occurrence: never an acceptable match
                this.typeMatch = Double.NEGATIVE_INFINITY;
            } else if (propertyA.getType().isAssignableFrom(propertyB.getType()) 
                    || propertyB.getType().isAssignableFrom(propertyA.getType())){
                this.typeMatch = 1.0;
            } else {
                this.typeMatch = 0.0;
            }
            
            
            computeOverallScore();
            
            this.hashCode = computeHashCode();
        }
        
        /**
         * Removes the IGNORED_WORDS from every nesting level of the given word lists.
         * <p>
         * When the expression consists of ignored words only, it is left
         * untouched; otherwise a property that is itself named like an ignored
         * word would end up with nothing left to compare.
         * 
         * @param wordsByLevel the words of a property expression, one list per nesting level;
         *        the inner lists are modified in place
         */
        private static void removeIgnoredWords(List<List<String>> wordsByLevel) {
            boolean hasSignificantWord = false;
            for (List<String> words : wordsByLevel) {
                if (!IGNORED_WORDS.containsAll(words)) {
                    hasSignificantWord = true;
                    break;
                }
            }
            if (!hasSignificantWord) {
                return;
            }
            for (List<String> words : wordsByLevel) {
                words.removeAll(IGNORED_WORDS);
            }
        }
        
        /**
         * @return a multi-line description listing both property expressions,
         * each score component and the total score
         */
        @Override
        public String toString() {
            StringBuilder text = new StringBuilder();
            text.append("[").append(propertyA.getExpression()).append(", ").append(propertyB.getExpression()).append("] {\n");
            text.append("   wordMatchScore: ").append(wordMatchScore).append("\n");
            text.append("   commonWordScore: ").append(commonWordsScore).append("\n");
            text.append("   containsScore: ").append(containsScore).append("\n");
            text.append("   typeMatchScore: ").append(typeMatchScore).append("\n");
            text.append("   ------------------- \n");
            text.append("   total: ").append(score).append("\n");
            text.append("}");
            return text.toString();
        }
        
        private  Set intersection(List> setA, List> setB) {
            Set intersection = flatten(setA);
            Set temp = flatten(setB);
            intersection.retainAll(temp);
            return intersection;
        }

        private  Set flatten(List> aWords) {
            Set set = new LinkedHashSet();
            for (List collection: aWords) {
                for (T item: collection) {
                    set.add(item);
                }
            }
            return set;
        }
        
        /**
         * Compares the total score with the acceptance threshold, which is half
         * of MAX_POSSIBLE_SCORE multiplied by the minimumScore weight.
         * 
         * @return true if this match meets the minimum score (determined by the matching weights)
         */
        public boolean meetsMinimumScore() {
            final double halfOfMaximum = MAX_POSSIBLE_SCORE / 2.0;
            final double requiredScore = halfOfMaximum * this.matchingWeights.minimumScore();
            return this.score >= requiredScore;
        }
        
        /**
         * Compute the match score between two properties, broken up into arrays of
         * words at each property divider level.
         * 
         * @param aWords
         * @param bWords
         * @return
         */
        double computeWordMatchScore(List> aWords, List> bWords) {
            
            Set aWordsRemaining = new LinkedHashSet(flatten(aWords));
            Set bWordsRemaining = new LinkedHashSet(flatten(bWords));
            
            PriorityQueue orderedPairs = new PriorityQueue();
            double aDepth = 0;
            for (List aWordList : aWords) {
                ++aDepth;
                for (String aWord : aWordList) {
                    double bDepth = 0;
                    for (List bWordList: bWords) {
                        for (String bWord : bWordList) {
                            ++bDepth;
                            orderedPairs.add(new WordPair(aWord, bWord, (aDepth/aWords.size()), (bDepth/bWords.size()), matchingWeights));
                        }
                    } 
                }
            }
            
            double score = 0.0d;
            for (WordPair w: orderedPairs) {
                if (aWordsRemaining.contains(w.aWord) && bWordsRemaining.contains(w.bWord)) {
                    score += w.score;
                    aWordsRemaining.remove(w.aWord);
                    bWordsRemaining.remove(w.bWord);
                } 
            }
            
            double remains = (aWordsRemaining.size() + bWordsRemaining.size()) / 2.0;
            double initial = (aWords.size() + bWords.size()) / 2.0; 
            double unmatchedWordsCount = (remains - initial) * (matchingWeights.unmatchedWords());
            
            return score + unmatchedWordsCount;
        }
        
        /**
         * Computes the weighted score components (contains, common words, type
         * match) and stores their sum together with the word match score as the
         * total score.
         */
        private void computeOverallScore() {
            
            this.containsScore = this.matchingWeights.containsName() * (this.containsIgnoreCase ? 10 : 0);
            if (this.commonWordCount == 0) {
                this.commonWordsScore = 0.0;
            } else {
                this.commonWordsScore = (this.matchingWeights.commonWordCount()) * (Math.pow(2 * this.commonWordCount, 2.0)*((avgWordCount + commonWordCount)/avgWordCount));
            }
            if (Double.isInfinite(this.typeMatch)) {
                /*
                 * An infinite type match is a veto and must stay infinite:
                 * multiplying it by a weight of 0.0 would produce NaN, and a
                 * NaN score compares as "equal" to every other score, which
                 * breaks the ordering of the scores.
                 */
                this.typeMatchScore = this.typeMatch;
            } else {
                this.typeMatchScore = (this.matchingWeights.typeMatch()) * this.typeMatch;
            }
            this.score =  this.wordMatchScore + commonWordsScore + containsScore + typeMatchScore;
        }
        
        /**
         * WordPair is used to rank a match of a given pair of words based on 
         * word depth, levenshtein distance between the words and word length.
         * <p>
         * The natural ordering places higher scores first; it is not consistent
         * with equals, which also compares the two words.
         */
        private static class WordPair implements Comparable<WordPair> {
            private final String aWord;
            private final String bWord;
            private final double score;
            
            /**
             * @param aWord the word taken from property A
             * @param bWord the word taken from property B
             * @param aWordDepth the relative depth supplied for aWord
             * @param bWordDepth the relative depth supplied for bWord
             * @param matchingWeights supplies the nestedDepth and editDistance weights
             */
            private WordPair(String aWord, String bWord,  double aWordDepth, double bWordDepth, PropertyMatchingWeights matchingWeights) {
                this.aWord = aWord;
                this.bWord = bWord;
                double aDepth = (1.0 + aWordDepth) * matchingWeights.nestedDepth();
                double bDepth = (1.0 + bWordDepth) * matchingWeights.nestedDepth();
                double editDistance = getLevenshteinDistance(aWord, bWord);
                // identical words (distance 0) receive the full editDistance weight
                double distanceWeight =  matchingWeights.editDistance() * (1.0 / (editDistance + 1.0));
                double wordLength = Math.max(aWord.length(), bWord.length());
                double wordLengthWeight = matchingWeights.editDistance() * Math.sqrt(wordLength);
                this.score =  aDepth + bDepth + distanceWeight + wordLengthWeight;
            }
            
            /**
             * Orders pairs by descending score, so that the best pair is the
             * head of a PriorityQueue.
             * 
             * @see java.lang.Comparable#compareTo(java.lang.Object)
             */
            @Override
            public int compareTo(WordPair o) {
                // arguments reversed on purpose: a higher score sorts first
                return Double.compare(o.score, this.score);
            }
            
            @Override
            public String toString() {
                return "[" + aWord + "],[" + bWord + "] = " + score;
            }
            
            /* (non-Javadoc)
             * @see java.lang.Object#hashCode()
             */
            @Override
            public int hashCode() {
                final int prime = 31;
                final long scoreBits = Double.doubleToLongBits(score);
                int result = 1;
                result = prime * result + ((aWord == null) ? 0 : aWord.hashCode());
                result = prime * result + ((bWord == null) ? 0 : bWord.hashCode());
                result = prime * result + (int) (scoreBits ^ (scoreBits >>> 32));
                return result;
            }
            
            /* (non-Javadoc)
             * @see java.lang.Object#equals(java.lang.Object)
             */
            @Override
            public boolean equals(Object obj) {
                if (this == obj) {
                    return true;
                }
                if (obj == null || getClass() != obj.getClass()) {
                    return false;
                }
                WordPair other = (WordPair) obj;
                return (aWord == null ? other.aWord == null : aWord.equals(other.aWord))
                        && (bWord == null ? other.bWord == null : bWord.equals(other.bWord))
                        && Double.doubleToLongBits(score) == Double.doubleToLongBits(other.score);
            }
            
            
        }
        
        /**
         * Computes the levenshtein distance of 2 strings, keeping only two rows
         * of the cost matrix in memory (sized by the shorter string).
         * 
         * @param s the first string, must not be null
         * @param t the second string, must not be null
         * @return the number of single-character insertions, deletions and
         *         substitutions needed to turn one string into the other
         * @throws IllegalArgumentException if either string is null
         */
        private static int getLevenshteinDistance(String s, String t) {
            if (s == null || t == null) {
                throw new IllegalArgumentException("Strings must not be null");
            }
            if (s.length() == 0) {
                return t.length();
            }
            if (t.length() == 0) {
                return s.length();
            }
            
            // the rows are sized by the shorter string to consume less memory
            final boolean swapInputs = s.length() > t.length();
            final String shorter = swapInputs ? t : s;
            final String longer = swapInputs ? s : t;
            final int width = shorter.length();
            
            int[] previousRow = new int[width + 1];
            int[] currentRow = new int[width + 1];
            
            for (int column = 0; column <= width; column++) {
                previousRow[column] = column;
            }
            
            for (int row = 1; row <= longer.length(); row++) {
                final char longerChar = longer.charAt(row - 1);
                currentRow[0] = row;
                
                for (int column = 1; column <= width; column++) {
                    final int fromLeft = currentRow[column - 1] + 1;
                    final int fromAbove = previousRow[column] + 1;
                    final int fromDiagonal = previousRow[column - 1] + (shorter.charAt(column - 1) == longerChar ? 0 : 1);
                    currentRow[column] = Math.min(Math.min(fromLeft, fromAbove), fromDiagonal);
                }
                
                // the row just filled becomes the 'previous row' of the next pass
                final int[] filledRow = currentRow;
                currentRow = previousRow;
                previousRow = filledRow;
            }
            
            // previousRow now holds the most recent cost counts
            return previousRow[width];
        }
        
        /**
         * Pattern is used to split a string into words on camel-case word boundaries
         */
        private static final String WORD_SPLITTER = String.format("%s|%s|%s|%s", 
                "([\\{\\}\\]\\[-_])", "(?<=[A-Z])(?=[A-Z][a-z])", "(?<=[^A-Z])(?=[A-Z])",
                "(?<=[A-Za-z])(?=[^A-Za-z])");
        
        /**
         * Splits a given property expression into arrays of lower-case words;
         * result is returned as a set of String[], which represent a property
         * component split on word boundaries.
         * 
         * @param s
         * @return
         */
        private static List> splitIntoLowerCaseWords(String s) {
            List> results = new ArrayList>();
            for (String property: s.split("[.]")) {
                List words = new LinkedList();
                results.add(words);
                for (String word : property.split(WORD_SPLITTER)) {
                    if (word != null && word.trim().length() > 0) {
                        words.add(word.toLowerCase());
                    }
                }
            }
            return results;
        }
        /**
         * Orders scores so that a higher total score sorts first ("lower").
         * 
         * @param that the score to compare with
         * @return -1 if this score is higher, 1 if it is lower, 0 otherwise
         * @see java.lang.Comparable#compareTo(java.lang.Object)
         */
        public int compareTo(FieldMatchScore that) {
            if (this.score > that.score) {
                return -1;
            }
            return (this.score < that.score) ? 1 : 0;
        }
        
        /**
         * Combines the hash codes of both properties (0 for a null property).
         * 
         * @return the hash code for this pair of properties
         */
        private int computeHashCode() {
            final int hashOfA = (propertyA == null) ? 0 : propertyA.hashCode();
            final int hashOfB = (propertyB == null) ? 0 : propertyB.hashCode();
            // same value as starting at 1 and folding in each hash with the prime 31
            return 31 * (31 + hashOfA) + hashOfB;
        }
        
        /**
         * @return the hash code that was computed once by the constructor
         */
        @Override
        public int hashCode() {
            return this.hashCode;
        }
        
        /**
         * Two scores are equal when they are of the same class and refer to
         * equal properties on both sides; the score value is not compared.
         * 
         * @see java.lang.Object#equals(java.lang.Object)
         */
        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (obj == null || getClass() != obj.getClass()) {
                return false;
            }
            FieldMatchScore other = (FieldMatchScore) obj;
            boolean sameA = (propertyA == null) ? other.propertyA == null : propertyA.equals(other.propertyA);
            if (!sameA) {
                return false;
            }
            return (propertyB == null) ? other.propertyB == null : propertyB.equals(other.propertyB);
        }
        
    }
    
}