All downloads are free. The search and download features use the official Maven repository.

io.micronaut.cli.util.CosineSimilarity.groovy Maven / Gradle / Ivy

There is a newer version: 2.0.0.M2
Show newest version
/*
 * Copyright 2017-2019 original authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.micronaut.cli.util

/**
 * Uses cosine similarity to find matches from a candidate set for a specified input.
 * Based on code from http://www.nearinfinity.com/blogs/seth_schroeder/groovy_cosine_similarity_in_grails.html
 *
 * @author Burt Beckwith
 */
class CosineSimilarity {

    /**
     * Sort the candidates by their similarity to the specified input.
     * Similarity is the cosine of the angle between the character-bigram
     * frequency vectors of the two (lower-cased) strings.
     *
     * @param pattern the input string
     * @param candidates the possible matches
     * @param threshold only candidates scoring strictly above this value are returned
     * @return the matching candidates, most similar first; candidates with equal
     *         scores keep their original relative order
     */
    static List mostSimilar(String pattern, candidates, double threshold = 0) {
        // Collect [score, candidate] pairs rather than keying a map by score:
        // a score-keyed map silently drops every candidate but one when scores tie.
        List scored = []
        for (candidate in candidates) {
            double score = stringSimilarity(pattern, candidate)
            if (score > threshold) {
                scored << [score, candidate]
            }
        }

        // Descending by score; the sort is stable, so ties stay in input order.
        scored.sort { a, b -> b[0] <=> a[0] }
        scored.collect { it[1] }
    }

    /**
     * Case-insensitive similarity of two strings based on n-grams of their characters.
     *
     * @param s1 the first string
     * @param s2 the second string
     * @param degree the n-gram length
     * @return the cosine similarity, or 0 if either string is null
     */
    private static double stringSimilarity(String s1, String s2, int degree = 2) {
        if (s1 == null || s2 == null) {
            return 0
        }
        // Locale.ROOT keeps the lower-casing independent of the JVM's default locale.
        similarity s1.toLowerCase(Locale.ROOT).toCharArray(), s2.toLowerCase(Locale.ROOT).toCharArray(), degree
    }

    /**
     * Cosine similarity of the n-gram frequency vectors of two sequences.
     *
     * @param sequence1 the first sequence
     * @param sequence2 the second sequence
     * @param degree the n-gram length
     * @return the cosine similarity, or 0 if either sequence yields no n-grams
     */
    private static double similarity(sequence1, sequence2, int degree = 2) {
        Map m1 = countNgramFrequency(sequence1, degree)
        Map m2 = countNgramFrequency(sequence2, degree)

        double norm = Math.sqrt(dotProduct(m1, m1) * dotProduct(m2, m2))
        if (norm == 0) {
            // A sequence shorter than 'degree' has no n-grams; avoid dividing by zero.
            return 0
        }

        dotProduct(m1, m2) / norm
    }

    /**
     * Count how often each n-gram (a sub-list of 'degree' consecutive elements)
     * occurs in the sequence.
     *
     * @param sequence the sequence to scan
     * @param degree the n-gram length
     * @return a map from n-gram to its number of occurrences
     */
    private static Map countNgramFrequency(sequence, int degree) {
        Map m = [:]
        int count = sequence.size()

        for (int i = 0; i + degree <= count; i++) {
            List gram = sequence[i..<(i + degree)]
            m[gram] = 1 + (m.get(gram) ?: 0)
        }

        m
    }

    /**
     * Dot product of two sparse frequency vectors; keys missing from m2 count as 0.
     * Neither map is modified.
     *
     * @param m1 the first frequency map
     * @param m2 the second frequency map
     * @return the dot product, 0 if m1 is empty
     */
    private static double dotProduct(Map m1, Map m2) {
        double total = 0
        for (entry in m1) {
            // Plain Map.get: Groovy's get(key, default) would insert the default into m2.
            total += entry.value * (m2.get(entry.key) ?: 0)
        }
        total
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy