All Downloads are FREE. Search and download functionalities are using the official Maven repository.

nlp.WNsim Maven / Gradle / Ivy

Go to download

Natural language processing toolbox using Sigma knowledge engineering system.

There is a newer version: 1.1
Show newest version
package nlp;

import com.google.common.collect.HashBiMap;
import com.google.common.collect.Sets;
import com.articulate.sigma.*;

import java.io.File;
import java.io.FileReader;
import java.io.LineNumberReader;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * an implementation of
 * Resnik, P. (1995). Using information content to evaluate semantic similarity in a taxonomy.
 * In Proceedings of the 14th International Joint Conference on Artificial Intelligence, 448–453.
 *
 * Adding a similarity method based on synset subsumption.
 */
public class WNsim {

    // frequency of a given synset
    private static HashMap subsumingFreq = null;

    // for each synset key, the set of hyponyms and instance hyponyms
    // similarity is -log of
    private static HashMap> covered = new HashMap<>();

    // opportunistically cache similarities, keys are words, not synsets
    private static HashMap> cachedSims = new HashMap<>();

    // find maximum values for each POS.  Keys are "1", "2", "3" and "4"
    private static HashMap maxFreqs = new HashMap<>();

    // A bi-directional index of synsets to indexes which are in order of synset number
    private static HashBiMap nounSynsetToIndex = HashBiMap.create();
    private static HashBiMap verbSynsetToIndex = HashBiMap.create();
    private static HashBiMap adjectiveSynsetToIndex = HashBiMap.create();
    private static HashBiMap adverbSynsetToIndex = HashBiMap.create();

    // a bit is true if the synset (in order of byte offset number) is subsumed by the
    // synset in the key
    private static HashMap nouns = new HashMap<>();
    private static HashMap verbs = new HashMap<>();
    private static HashMap adjectives = new HashMap<>();
    private static HashMap adverbs = new HashMap<>();

    // default bitmaps
    private static BitSet nounBits = null;
    private static BitSet verbBits = null;
    private static BitSet adjectiveBits = null;
    private static BitSet adverbBits = null;

    private static boolean ResnikSim = true;
    private static boolean SubsumptionSim = false; // if both are true,
    // subsumption will be used by overwriting the result from Resnik

    /** ***************************************************************
     */
    public static void calcMaxFreqs() {

        maxFreqs.put("1",0);
        maxFreqs.put("2",0);
        maxFreqs.put("3",0);
        maxFreqs.put("4",0);
        for (String s : subsumingFreq.keySet()) {
            if (subsumingFreq.get(s) > maxFreqs.get(s.substring(0,1)))
                maxFreqs.put(s.substring(0, 1), subsumingFreq.get(s));
        }
    }

    /** ***************************************************************
     */
    public static HashMap getSubsumingFreq() {

        if (subsumingFreq == null) {
            subsumingFreq = new HashMap();
            computeSubsumingFreq2(Sets.newHashSet("hyponym", "instance hyponym"),
                    Sets.newHashSet("hypernym", "instance hypernym"));
            calcMaxFreqs();
        }
        return subsumingFreq;
    }

    /** ***************************************************************
     */
    public static HashMap computeSubsumingFreq(HashSet childRels,
                                                               HashSet parentRels)  {

        int safetyCounter = 0;
        HashMap freqs = new HashMap<>();
        TreeSet ts = new TreeSet<>();
        TreeSet seen = new TreeSet<>();
        ts.addAll(WordNetUtilities.findLeavesInTree(childRels));

        int count = 0;
        //System.out.println();
        //System.out.println("====================================");
        for (String s: ts) {
           // System.out.print(WordNet.wn.getWordsFromSynset(s).get(0)+"-" + s + ", ");
            if (count++ > 6) {
                //System.out.println();
                count = 0;
            }
        }
        //System.out.println();
        //System.out.println("====================================");

        //System.out.println("Iterating:");
        while (!ts.isEmpty() && safetyCounter < 50) {
            TreeSet tsnew = new TreeSet<>();
            for (String synset : ts) {
                if (seen.contains(synset))
                    continue;
                seen.add(synset);
                //System.out.println(WordNet.wn.getWordsFromSynset(synset).get(0) + "-" + synset);
                if (!freqs.keySet().contains(synset)) {
                    freqs.put(synset, new Integer(WordNet.wn.senseFrequencies.get(synset)));
                }
                ArrayList wnrels = WordNet.wn.relations.get(synset);
                for (AVPair avp : wnrels) {
                    if (parentRels.contains(avp.attribute)) {
                        tsnew.add(avp.value);
                        if (freqs.keySet().contains(avp.value))
                            freqs.put(avp.value, freqs.get(avp.value) + freqs.get(synset));
                        else
                            freqs.put(avp.value, freqs.get(synset));
                    }
                }
            }
            ts = new TreeSet<>();
            ts.addAll(tsnew);
            //System.out.println("====================================");
            //System.out.println(tsnew);
            //System.out.println("====================================");
            safetyCounter++;
        }
        //System.out.println("WordNet.wn.computeSubsumingFreq() " + safetyCounter);
        return freqs;
    }

    /** ***************************************************************
     */
    public static int computeSubsumingFreq2(HashSet childRels,
                                            HashSet parentRels) {

        int safetyCounter = 0;
        HashMap freqs = new HashMap<>();
        TreeSet ts = new TreeSet<>();
        TreeSet seen = new TreeSet<>();
        ts.addAll(WordNetUtilities.findLeavesInTree(childRels));

        for (String s : ts) {
            // System.out.print(WordNet.wn.getWordsFromSynset(s).get(0)+"-" + s + ", ");
            freqs.put(s, new Integer(1));
        }
        return computeSubsumingFreq2("100001740", childRels, parentRels);
    }

    /** ***************************************************************
     */
    public static int computeSubsumingFreq2(String synset,
            HashSet childRels,
            HashSet parentRels)  {

        ArrayList wnrels = WordNet.wn.relations.get(synset);
        int total = 1;
        for (AVPair avp : wnrels) {
            if (childRels.contains(avp.attribute))
                total += computeSubsumingFreq2(avp.value, childRels,parentRels);
        }
        subsumingFreq.put(synset,total);
        return total;
    }

    /** ***************************************************************
     * Compute similarity of two synsets. Note that currently only synsets of the
     * same part of speech have any similarity score.
     */
    public static float computeSynsetSim(String s1, String s2) {

        if (s1.charAt(0) != s2.charAt(0)) {
            System.out.println("Error in WNsim.computeSynsetSim(): incompatible synsets" + s1 + " " + s2);
            return 0;
        }
        char pos = s1.charAt(0);
        //System.out.println("Info in WNsim.computeSynsetSim(): pos: " + pos);
        int n = maxFreqs.get(Character.toString(pos));
        //System.out.println("Info in WNsim.computeSynsetSim(): n: " + n);
        String parent = WordNetUtilities.lowestCommonParent(s1, s2);
        if (parent == null)
            return (float) 0;
        //System.out.println("Info in WNsim.computeSynsetSim(): parent: " + parent);
        getSubsumingFreq();
        float parFreq = 0;
        if (getSubsumingFreq().containsKey(parent))
            parFreq= getSubsumingFreq().get(parent);
        //System.out.println("Info in WNsim.computeSynsetSim(): probability: " + parFreq / n);
        //System.out.println("Info in WNsim.computeSynsetSim(): score: " + (-Math.log(parFreq / n)));
        return (float) (-Math.log(parFreq / n));
        //return (float) (1 - (parFreq / n));
    }

    /** ***************************************************************
     * Compute similarity of two words as the maximum similarity among
     * their possible synsets. Note that currently only synsets of the
     * same part of speech have any similarity score.
     * @param allSynsets determines whether to take the best fit from
     *                   all possible synsets for the words, or just
     *                   to compare the most frequent synsets for each
     *                   word
     */
    public static float computeWordSim(String s1, String s2, boolean allSynsets) {

        //System.out.println("Info in WNsim.computeWordSim(): " + s1 + " " + s2);
        //if (subsumingFreq == null)
        //    subsumingFreq = computeSubsumingFreq(Sets.newHashSet("hyponym", "instance hyponym"),
        //            Sets.newHashSet("hypernym","instance hypernym"));
        if (cachedSims.containsKey(s1) && cachedSims.get(s1).containsKey(s2))
            return cachedSims.get(s1).get(s2);
        float result = (float) -100.0;
        HashSet syns1 = WordNetUtilities.wordsToSynsets(s1);
        HashSet syns2 = WordNetUtilities.wordsToSynsets(s2);
        if (!allSynsets) {
            syns1 = new HashSet();
            String sense1 = WSD.getBestDefaultSense(s1);
            if (!sense1.isEmpty())
                syns1.add(sense1);
            syns2 = new HashSet();
            String sense2 = WSD.getBestDefaultSense(s2);
            if (!sense2.isEmpty())
                syns2.add(sense2);
        }
        if (syns1 == null && syns2 == null || syns1.size() == 0 || syns2.size() == 0)
            return 0;
        String bestSyn1 = "";
        String bestSyn2 = "";
        if (syns1 == null || syns2 == null)
            return (float) -100.0;
        for (String syn1 : syns1) {
            for (String syn2 : syns2) {
                if (syn1.charAt(0) == syn2.charAt(0)) {
                    float score = 0;
                    if (ResnikSim)
                        score = computeSynsetSim(syn1,syn2);
                    if (SubsumptionSim)
                        score = maxCommonSynsetOverlap(syn1,syn2);
                    //System.out.println("Info in WNsim.computeWordSim(): " + syn1 + " " + syn2 + " " + score);
                    if (score > result) {
                        result = score;
                        bestSyn1 = syn1;
                        bestSyn2 = syn2;
                    }
                }
                else {
                    //System.out.println("Info in WNsim.computeWordSim(): diff. types: " + syn1 + " " + syn2);
                }
            }
        }
        return result;
    }

    /** ***************************************************************
     * Read a space-delimited file of word pairs and their similarity
     * rating.
     */
    public static HashMap> readTestFile(String filename,
                                                                     String regex, boolean removePOS) {

        HashMap> sims = new HashMap<>();
        //System.out.println("INFO in WNsim.readTestFile()");
        LineNumberReader lr = null;
        try {
            String line;

            File testFile = new File(filename);
            FileReader r = new FileReader( testFile );
            lr = new LineNumberReader(r);
            while ((line = lr.readLine()) != null) {
                //System.out.println("INFO in WNsim.readTestFile(): " + line);
                Pattern p = Pattern.compile(regex);
                // Pattern p = Pattern.compile("([^ ]+) ([^ ]+) (.*)");
                Matcher m = p.matcher(line);
                if (m.matches()) {
                    String w1 = m.group(1);
                    String w2 = m.group(2);
                    if (removePOS) {
                        w1 = w1.substring(0, w1.lastIndexOf('-'));
                        w2 = w2.substring(0, w2.lastIndexOf('-'));
                    }
                    String n1 = m.group(3);
                    //System.out.println("INFO in WNsim.readTestFile(): results: " + w1 + " " + w2 + " " + n1);
                    float f = Float.parseFloat(n1);
                    HashMap hm = new HashMap<>();
                    if (sims.containsKey(w1))
                        hm = sims.get(w1);
                    hm.put(w2,f);
                    sims.put(w1,hm);
                    hm = new HashMap<>();
                    if (sims.containsKey(w2))
                        hm = sims.get(w2);
                    hm.put(w1,f);
                    sims.put(w2,hm);
                }
            }
        }
        catch (Exception ex) {
            ex.printStackTrace();
        }
        finally {
            try {
                if (lr != null)
                    lr.close();
            }
            catch (Exception ex) {
            }
        }
        return sims;
    }

    /** ***************************************************************
     */
    public static float mean(HashMap x) {

        float total = 0;
        for (String s : x.keySet()) {
            total += x.get(s);
        }
        return total / (float) x.keySet().size();
    }

    /** ***************************************************************
     */
    public static float deviation(HashMap x, HashMap y,
                                  float meanx, float meany) {

        float dev = 0;
        for (String s : x.keySet()) {
            dev += (x.get(s) - meanx) * (y.get(s) - meany);
        }
        return dev;
    }

    /** ***************************************************************
     */
    public static float sqDeviation(HashMap x, float mean) {

        float dev = 0;
        for (String s : x.keySet()) {
            dev += (x.get(s) - mean) * (x.get(s) - mean);
        }
        return dev;
    }

    /** ***************************************************************
     * determine the sample correlation coefficient
     */
    public static double correlation(HashMap x, HashMap y) {

        float result = 0;
        // find means
        float meanx = mean(x);
        float meany = mean(y);
        float deviation = deviation(x, y, meanx, meany);
        float sqdeviationX = sqDeviation(x, meanx);
        float sqdeviationY = sqDeviation(y, meany);
        return deviation / (Math.sqrt(sqdeviationX * sqdeviationY));
    }

    /** ***************************************************************
     * @return a set of all the immediate children of the given synset from
     * hyponym and instance hyponym links
     */
    public static HashSet getChildList(String s) {

        HashSet result = new HashSet();
        ArrayList rels = WordNet.wn.relations.get(s);
        for (AVPair avp : rels) {
            if (avp.attribute.equals("hyponym") || avp.attribute.equals("instance hyponym"))
                result.add(avp.value);
        }
        return result;
    }

    /** ***************************************************************
     * @return an integer index into the bit vector
     * @param s is a POS-prefixed synset
     */
    public static int getIndex(String s) {

        int index = -1;
        switch (s.charAt(0)) {
            case '1' : index = nounSynsetToIndex.get(s);
                break;
            case '2' : index = verbSynsetToIndex.get(s);
                break;
            case '3' : index = adjectiveSynsetToIndex.get(s);
                break;
            case '4' : index = adverbSynsetToIndex.get(s);
                break;
        }
        return index;
    }

    /** ***************************************************************
     * @return a bit vector comprising set bits for all the children of
     * the given synset
     * @param s is a POS-prefixed synset
     */
    public static BitSet setBits(String s) {

        //System.out.println("INFO in WNsim.setBits(): " + s);
        HashSet children = getChildList(s);
        BitSet posMap = null;
        int index = -1;
        switch (s.charAt(0)) {
            case '1' : posMap = (BitSet) nounBits.clone();
                index = nounSynsetToIndex.get(s);
                break;
            case '2' : posMap = (BitSet) verbBits.clone();
                index = verbSynsetToIndex.get(s);
                break;
            case '3' : posMap = (BitSet) adjectiveBits.clone();
                index = adjectiveSynsetToIndex.get(s);
                break;
            case '4' : posMap = (BitSet) adverbBits.clone();
                index = adverbSynsetToIndex.get(s);
                break;
        }
        for (String c : children) {
            posMap.or(setBits(c));
            posMap.set(getIndex(c));
        }
        //System.out.println("INFO in WNsim.setBits(): s " + posMap.cardinality());
        switch (s.charAt(0)) {
            case '1' : nouns.put(s, posMap);
                break;
            case '2' : verbs.put(s,posMap);
                break;
            case '3' : adjectives.put(s,posMap);
                break;
            case '4' : adverbs.put(s, posMap);
                break;
        }
        return posMap;
    }

    /** ***************************************************************
     */
    public static void createIndexes() {

        int i = 0;
        for (String s : WordNet.wn.nounDocumentationHash.keySet()) {
            nounSynsetToIndex.put("1" + s,i++);
        }
        i = 0;
        for (String s : WordNet.wn.verbDocumentationHash.keySet()) {
            verbSynsetToIndex.put("2" + s,i++);
        }
        i = 0;
        for (String s : WordNet.wn.adverbDocumentationHash.keySet()) {
            adverbSynsetToIndex.put("3" + s,i++);
        }
        i = 0;
        for (String s : WordNet.wn.adjectiveDocumentationHash.keySet()) {
            adjectiveSynsetToIndex.put("4" + s,i++);
        }

        HashSet leaves = WordNetUtilities.findLeavesInTree(Sets.newHashSet("hyponym", "instance hyponym"));

        // a bit is true if the synset (in order of byte offset number) is subsumed by the
        // synset in the key
        nounBits = new BitSet(nounSynsetToIndex.size());
        verbBits = new BitSet(verbSynsetToIndex.size());
        adjectiveBits = new BitSet(adjectiveSynsetToIndex.size());
        adverbBits = new BitSet(adverbSynsetToIndex.size());

        HashSet roots = WordNetUtilities.findLeavesInTree(Sets.newHashSet("hypernym", "instance hypernym"));
        for (String s : roots) {
            setBits(s);
        }
    }

    /** ***************************************************************
     * Compute the number of child synsets shared by the two synsets
     */
    public static int synsetOverlap(String s1, String s2) {

        int result = 0;
        if (s1.charAt(0) != s2.charAt(0))
            return 0;
        BitSet bs1 = null;
        BitSet bs2 = null;
        switch (s1.charAt(0)) {
            case '1' : bs1 = (BitSet) nouns.get(s1).clone();
                bs2 = (BitSet) nouns.get(s2).clone();
                break;
            case '2' : bs1 = (BitSet) verbs.get(s1).clone();
                bs2 = (BitSet) verbs.get(s2).clone();
                break;
            case '3' : bs1 = (BitSet) adjectives.get(s1).clone();
                bs2 = (BitSet) adjectives.get(s2).clone();
                break;
            case '4' : bs1 = (BitSet) adverbs.get(s1).clone();
                bs2 = (BitSet) adverbs.get(s2).clone();
                break;
        }
        //System.out.println("INFO in WNsim.synsetOverlap(): s1: " + s1 + " " + bs1.cardinality());
        //System.out.println("INFO in WNsim.synsetOverlap(): s2: " + s2 + " " + bs2.cardinality());
        bs1.and(bs2);
        return bs1.cardinality();
    }

    /** ***************************************************************
     * Compute the percentage of child synsets of the first synset
     * shared by the second synset
     */
    public static float synsetPercentOverlap(String s1, String s2) {

        int result = 0;
        if (s1.charAt(0) != s2.charAt(0))
            return 0;
        BitSet bs1 = null;
        BitSet bs2 = null;
        switch (s1.charAt(0)) {
            case '1' : bs1 = (BitSet) nouns.get(s1).clone();
                bs2 = (BitSet) nouns.get(s2).clone();
                break;
            case '2' : bs1 = (BitSet) verbs.get(s1).clone();
                bs2 = (BitSet) verbs.get(s2).clone();
                break;
            case '3' : bs1 = (BitSet) adjectives.get(s1).clone();
                bs2 = (BitSet) adjectives.get(s2).clone();
                break;
            case '4' : bs1 = (BitSet) adverbs.get(s1).clone();
                bs2 = (BitSet) adverbs.get(s2).clone();
                break;
        }
        if (bs1.cardinality() == 0)
            return 0;
        //System.out.println("INFO in WNsim.synsetOverlap(): s1: " + s1 + " " + bs1.cardinality());
        //System.out.println("INFO in WNsim.synsetOverlap(): s2: " + s2 + " " + bs2.cardinality());
        BitSet bs1copy = (BitSet) bs1.clone();
        bs1copy.and(bs2);
        return (float) Math.exp(((float) bs1copy.cardinality()) / ((float) bs1.cardinality()));
        //return (float) -Math.log(((float) bs1copy.cardinality()) / ((float) bs1.cardinality()));
        //return (float) ((float) bs1copy.cardinality()) / ((float) bs1.cardinality());
    }

    /** ***************************************************************
     */
    public static float maxCommonSynsetOverlap(String s1, String s2) {

        float result = 0;
        String parent = WordNetUtilities.lowestCommonParent(s1, s2);
        if (parent == null)
            return  0;
        result = synsetPercentOverlap(parent,s1);
        float newval = synsetPercentOverlap(parent, s2);
        if (newval > result)
            result = newval;
        return result;
    }

    /** ***************************************************************
     */
    public static void test() {

        //System.out.println("INFO in WNsim.test() ");
        HashMap> hm = readTestFile("/home/apease/WordSim/scws.csv",
                "([^\\t]+)\\t([^\\t]+)\\t(.*)",false);
        //HashMap> hm = readTestFile("/home/apease/WordSim/wordsim353/combined-noHead.tab",
        //        "([^\\t]+)\\t([^\\t]+)\\t(.*)",false);
        //HashMap> hm = readTestFile("/home/apease/WordSim/rw/rw.txt",
        //        "([^\\t]+)\\t([^\\t]+)\\t([^\\t]+).*",false);
        //HashMap> hm = readTestFile("/home/apease/WordSim/Resnik3.txt",
        //        "([^ ]+) ([^ ]+) (.*)",false);
        //HashMap> hm = readTestFile("/home/apease/WordSim/MEN/MEN_dataset_lemma_form_full",
        //        "([^ ]+) ([^ ]+) (.*)",true);
        HashMap> hm2 = new HashMap<>();
        HashMap results1 = new HashMap<>();
        HashMap results2 = new HashMap<>();
        for (String s1 : hm.keySet()) {
            HashMap m = hm.get(s1);
            if (m != null) {
                for (String s2 : m.keySet()) {
                    boolean allSynsets = false;
                    float factual = computeWordSim(s1,s2,allSynsets);
                    float fexpected = m.get(s2);
                    if (factual > -100) {
                        HashMap h = new HashMap<>();
                        if (hm2.containsKey(s1))
                            h = hm2.get(s1);
                        h.put(s2, factual);
                        h = new HashMap<>();
                        if (hm2.containsKey(s2))
                            h = hm2.get(s2);
                        h.put(s1, factual);
                        //System.out.println("INFO in WNsim.readTestFile(): s1: " + s1 + " s2: " +
                        //        s2 + " factual: " + factual + " fexpected: " + fexpected);
                        //System.out.println(s1 + ", " + s2 + ", " + factual + ", " + fexpected);
                        results1.put(s1 + "-" + s2, factual);
                        results2.put(s1 + "-" + s2, fexpected);
                    }
                }
            }
        }
        System.out.println("Correlation: " + correlation(results1,results2));
    }

    /** ***************************************************************
     *  A main method, used only for testing.  It should not be called
     *  during normal operation.
     */
    public static void main (String[] args) {

        /*
        String line = "explosion stencil 3.000000";
        System.out.println("INFO in WNsim.main(): " + line);
        Pattern p = Pattern.compile("([^ ]+) ([^ ]+) (.*)");
        Matcher m = p.matcher(line);
        if (m.matches()) {
            String w1 = m.group(1);
            String w2 = m.group(2);
            String n1 = m.group(3);
            System.out.println("results: " + w1 + " " + w2 + " " + n1);
        }
*/
        try {
            KBmanager.getMgr().initializeOnce();
            createIndexes();
            System.out.println(synsetOverlap("104254777", "104199027"));
            System.out.println(maxCommonSynsetOverlap("104254777", "104199027"));
            //System.out.println(
            getSubsumingFreq(); //.toString().replace(",", "\n"));
            long start = System.currentTimeMillis();
            test();
            long finish = System.currentTimeMillis();
            System.out.println("Total time in milis: " + (finish - start));
            //System.out.println("Similarity: " + computeSynsetSim("102858304", "102958343"));
            //System.out.println("Similarity of car and boat: " + computeWordSim("car", "boat"));
            //String parent = WordNetUtilities.lowestCommonParent("102858304", "102958343");

            //System.out.println("frequency of " + parent + " : " + getSubsumingFreq().get(parent));
            //System.out.println("Similarity: " +
            //        (-Math.log(getSubsumingFreq().get(parent) / WordNetUtilities.numSynsets('1'))));
            //for (String s: hm.keySet()) {
            //    System.out.print(WordNet.wn.getWordsFromSynset(s).get(0) + "-" + s + ":" + hm.get(s) + ", ");
            //    if (count++ > 6) {
            //        System.out.println();
            //        count = 0;
            //    }
            //}
        }
        catch (Exception e) {
            System.out.println("Error in WordNetUtilities.main(): Exception: " + e.getMessage());
            e.printStackTrace();
        }

    }
}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy