All Downloads are FREE. Search and download functionalities are using the official Maven repository.

fr.inria.edelweiss.kgraph.approximate.algorithm.impl.NGram Maven / Gradle / Ivy

The newest version!
package fr.inria.edelweiss.kgraph.approximate.algorithm.impl;

import fr.inria.edelweiss.kgraph.approximate.algorithm.Utils;
import fr.inria.edelweiss.kgraph.approximate.strategy.AlgType;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;

/**
 * N-Gram similarity measurement algorithm.
 *
 * <p>Each input string is split into its lower-cased character n-grams, and
 * the similarity is the ratio between the number of distinct n-grams shared
 * by both strings and the number of distinct n-grams found in either string
 * (a Jaccard ratio over the two n-gram sets).
 *
 * @author Fuqi Song, Wimmics Inria I3S
 * @date 27 August 2015
 */
public class NGram extends BaseAlgorithm {

    /** Default n-gram length used by the no-argument constructor. */
    private static final int NG = 3;

    /** Length of the n-grams produced by this instance; always at least 1. */
    private final int n;

    /**
     * Creates an N-Gram algorithm using the default n-gram length (3).
     */
    public NGram() {
        this(NG);
    }

    /**
     * Creates an N-Gram algorithm using n-grams of the given length.
     *
     * @param n length of each n-gram, must be at least 1
     * @throws IllegalArgumentException if {@code n} is smaller than 1
     */
    public NGram(int n) {
        super(AlgType.ng);
        // A non-positive length yields empty (n == 0) or invalid (n < 0) substrings.
        if (n < 1) {
            throw new IllegalArgumentException("n-gram length must be >= 1, got " + n);
        }
        this.n = n;
    }

    /**
     * Computes the n-gram similarity of two strings and reports it through
     * {@code Utils.msg}.
     *
     * @param s1 first string
     * @param s2 second string
     * @param parameter not used by the computation; only forwarded to {@code Utils.msg}
     * @return the similarity of {@code s1} and {@code s2}
     */
    @Override
    public double calculate(String s1, String s2, String parameter) {
        double sim = this.calculate(s1, s2);
        // Label the trace with the n actually in use, not the class-wide default.
        Utils.msg("N-Gram" + n, s1, s2, parameter, sim);
        return sim;
    }

    /**
     * Computes the similarity as shared distinct n-grams divided by all
     * distinct n-grams of both strings.
     *
     * <p>{@code MAX} is not declared in this class (presumably inherited from
     * {@code BaseAlgorithm}); it is returned when the strings are equal
     * ignoring case, or when both are {@code null}.
     *
     * @param s1 first string, may be {@code null}
     * @param s2 second string, may be {@code null}
     * @return {@code MAX} for equal inputs, {@code 0.0} when exactly one input
     *         is {@code null} or when neither string is long enough to
     *         contain an n-gram, otherwise the shared/total n-gram ratio
     */
    private double calculate(String s1, String s2) {
        if (s1 == null || s2 == null) {
            return (s1 == null && s2 == null) ? MAX : 0.0;
        }
        if (s1.equalsIgnoreCase(s2)) {
            return MAX;
        }

        Map<String, Integer> grams1 = tokenize(s1, n);
        Map<String, Integer> grams2 = tokenize(s2, n);

        int shared = common(grams1, grams2);
        // Inclusion-exclusion: |A union B| = |A| + |B| - |A intersect B|.
        int union = grams1.size() + grams2.size() - shared;
        if (union == 0) {
            // Both strings are shorter than n: avoid 0/0 (NaN) and report no similarity.
            return 0.0;
        }
        return (double) shared / (double) union;
    }

    /**
     * Counts the distinct n-grams that occur in both maps.
     *
     * @param tokens1 n-grams of the first string
     * @param tokens2 n-grams of the second string
     * @return number of keys of {@code tokens1} that are also keys of {@code tokens2}
     */
    private int common(Map<String, Integer> tokens1, Map<String, Integer> tokens2) {
        int res = 0;
        for (String token : tokens1.keySet()) {
            if (tokens2.containsKey(token)) {
                res++;
            }
        }
        return res;
    }

    /**
     * Splits a string into its overlapping, lower-cased n-grams.
     *
     * @param c the string to split, must not be {@code null}
     * @param n length of each n-gram
     * @return map from each distinct n-gram to its number of occurrences in
     *         {@code c}; empty when {@code c} is shorter than {@code n}
     */
    private Map<String, Integer> tokenize(String c, int n) {
        Map<String, Integer> tokens = new HashMap<String, Integer>();

        // The last full n-gram starts at index length - n.
        for (int i = 0; i + n <= c.length(); i++) {
            // Locale.ROOT keeps case folding independent of the JVM default locale.
            String t = c.substring(i, i + n).toLowerCase(Locale.ROOT);
            Integer count = tokens.get(t);
            tokens.put(t, count == null ? 1 : count + 1);
        }
        return tokens;
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy