com.articulate.sigma.nlp.WNsim Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of sigma-component Show documentation
Show all versions of sigma-component Show documentation
The Sigma knowledge engineering system is a system for developing, viewing and debugging theories in
first-order logic. It works with the Knowledge Interchange Format (KIF) and is optimized for the Suggested Upper
Merged Ontology (SUMO), www.ontologyportal.org.
The newest version!
package com.articulate.sigma.nlp;
import com.articulate.sigma.*;
import com.google.common.collect.BiMap;
import com.google.common.collect.HashBiMap;
import com.google.common.collect.Sets;
import java.io.File;
import java.io.FileReader;
import java.io.LineNumberReader;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* an implementation of
* Resnik, P. (1995). Using information content to evaluate semantic similarity in a taxonomy.
* In Proceedings of the 14th International Joint Conference on Artificial Intelligence, 448–453.
*
* Adding a similarity method based on synset subsumption.
*/
public class WNsim {
// frequency of a given synset
private static HashMap subsumingFreq = null;
// for each synset key, the set of hyponyms and instance hyponyms
// similarity is -log of
private static HashMap> covered = new HashMap<>();
// opportunistically cache similarities, keys are words, not synsets
private static HashMap> cachedSims = new HashMap<>();
// find maximum values for each POS. Keys are "1", "2", "3" and "4"
private static HashMap maxFreqs = new HashMap<>();
// A bi-directional index of synsets to indexes which are in order of synset number
private static HashBiMap nounSynsetToIndex = HashBiMap.create();
private static HashBiMap verbSynsetToIndex = HashBiMap.create();
private static HashBiMap adjectiveSynsetToIndex = HashBiMap.create();
private static HashBiMap adverbSynsetToIndex = HashBiMap.create();
// a bit is true if the synset (in order of byte offset number) is subsumed by the
// synset in the key
private static HashMap nouns = new HashMap<>();
private static HashMap verbs = new HashMap<>();
private static HashMap adjectives = new HashMap<>();
private static HashMap adverbs = new HashMap<>();
// default bitmaps
private static BitSet nounBits = null;
private static BitSet verbBits = null;
private static BitSet adjectiveBits = null;
private static BitSet adverbBits = null;
private static boolean ResnikSim = true;
private static boolean SubsumptionSim = false; // if both are true,
// subsumption will be used by overwriting the result from Resnik
/** ***************************************************************
 * Reset the per-POS maxima in maxFreqs to zero and then record, for
 * each part of speech, the largest count found in subsumingFreq.
 * The part of speech is the first character of each synset key, so
 * every key is expected to start with "1", "2", "3" or "4".
 * subsumingFreq must already have been created.
 */
public static void calcMaxFreqs() {

    for (String pos : new String[] {"1", "2", "3", "4"})
        maxFreqs.put(pos, 0);
    for (Map.Entry<String,Integer> entry : subsumingFreq.entrySet()) {
        String pos = entry.getKey().substring(0, 1);
        int freq = entry.getValue();
        if (freq > maxFreqs.get(pos))
            maxFreqs.put(pos, freq);
    }
}
/** ***************************************************************
 * Lazily build and return the table of subsuming counts. On first
 * use the table is filled by computeSubsumingFreq2() and the per-POS
 * maxima are derived by calcMaxFreqs(); later calls return the same
 * map. If the build throws, the table is reset so that the next call
 * retries instead of handing out a half-filled map.
 *
 * @return the shared map from POS-prefixed synset id to its count
 */
public static HashMap<String,Integer> getSubsumingFreq() {

    if (subsumingFreq == null) {
        // computeSubsumingFreq2() writes into the field, so it must exist first
        subsumingFreq = new HashMap<>();
        boolean built = false;
        try {
            computeSubsumingFreq2(Sets.newHashSet("hyponym", "instance hyponym"),
                    Sets.newHashSet("hypernym", "instance hypernym"));
            calcMaxFreqs();
            built = true;
        }
        finally {
            if (!built)
                subsumingFreq = null;
        }
    }
    return subsumingFreq;
}
/** ***************************************************************
 * Propagate sense frequencies upward, level by level, starting from
 * the synsets returned by WordNetUtilities.findLeavesInTree(childRels).
 * Each synset is expanded once; when it is expanded its current
 * count is added to every synset it points to through one of the
 * parent relations. A synset's own starting count comes from
 * WordNet.wn.senseFrequencies unless a child already gave it a count.
 * The walk stops after 50 levels as a safety limit.
 *
 * NOTE(review): getSubsumingFreq() uses computeSubsumingFreq2()
 * instead; nothing in this file calls this method.
 *
 * @param childRels  relation names passed to findLeavesInTree()
 * @param parentRels relation names that lead from a synset to its parents
 * @return a new map from synset id to accumulated frequency
 */
public static HashMap<String,Integer> computeSubsumingFreq(HashSet<String> childRels,
                                                           HashSet<String> parentRels) {

    HashMap<String,Integer> freqs = new HashMap<>();
    TreeSet<String> seen = new TreeSet<>();
    TreeSet<String> frontier = new TreeSet<>();
    frontier.addAll(WordNetUtilities.findLeavesInTree(childRels));
    int safetyCounter = 0;
    while (!frontier.isEmpty() && safetyCounter < 50) {
        TreeSet<String> next = new TreeSet<>();
        for (String synset : frontier) {
            if (!seen.add(synset))
                continue; // already expanded on an earlier level
            if (!freqs.containsKey(synset))
                freqs.put(synset, Integer.valueOf(WordNet.wn.senseFrequencies.get(synset)));
            ArrayList<AVPair> wnrels = WordNet.wn.relations.get(synset);
            if (wnrels == null)
                continue; // no relations recorded, so nothing to propagate
            for (AVPair avp : wnrels) {
                if (parentRels.contains(avp.attribute)) {
                    next.add(avp.value);
                    int own = freqs.get(synset);
                    Integer soFar = freqs.get(avp.value);
                    freqs.put(avp.value, soFar == null ? own : soFar + own);
                }
            }
        }
        frontier = next;
        safetyCounter++;
    }
    return freqs;
}
/** ***************************************************************
 * Fill subsumingFreq by counting downward from the hard-coded synset
 * "100001740" (presumably the root of the noun hierarchy - confirm).
 * Because only that one root is walked, only synsets reachable from
 * it through childRels receive a count.
 *
 * subsumingFreq must already be a non-null map; getSubsumingFreq()
 * takes care of that.
 *
 * @param childRels  relation names that lead from a synset to its children
 * @param parentRels passed through unchanged; the recursive overload does not read it
 * @return the count computed for the root synset
 */
public static int computeSubsumingFreq2(HashSet<String> childRels,
                                        HashSet<String> parentRels) {

    return computeSubsumingFreq2("100001740", childRels, parentRels);
}
/** ***************************************************************
 * Recursively count the given synset plus everything below it along
 * the child relations, and store the count in subsumingFreq. A synset
 * reachable over several paths is counted once per path. A synset
 * with no entry in WordNet.wn.relations counts as 1.
 *
 * There is no cycle detection, so a cycle in the child relations
 * would recurse until the stack overflows.
 *
 * @param synset     POS-prefixed synset id to start from
 * @param childRels  relation names that lead from a synset to its children
 * @param parentRels not read here; only passed along
 * @return the count stored for synset
 */
public static int computeSubsumingFreq2(String synset,
                                        HashSet<String> childRels,
                                        HashSet<String> parentRels) {

    int total = 1; // the synset itself
    ArrayList<AVPair> wnrels = WordNet.wn.relations.get(synset);
    if (wnrels != null) {
        for (AVPair avp : wnrels) {
            if (childRels.contains(avp.attribute))
                total += computeSubsumingFreq2(avp.value, childRels, parentRels);
        }
    }
    subsumingFreq.put(synset, total);
    return total;
}
/** ***************************************************************
 * Compute similarity of two synsets. Note that currently only synsets of the
 * same part of speech have any similarity score.
 *
 * The score is -log(count(parent) / maxCount(pos)), where parent is
 * the lowest common parent of the two synsets, count comes from
 * getSubsumingFreq() and maxCount is the largest count recorded for
 * that part of speech.
 *
 * @param s1 POS-prefixed synset id
 * @param s2 POS-prefixed synset id
 * @return the score, or 0 when the parts of speech differ, there is
 *         no common parent, or no positive count is available for
 *         the parent or the part of speech
 */
public static float computeSynsetSim(String s1, String s2) {

    if (s1.charAt(0) != s2.charAt(0)) {
        System.out.println("Error in WNsim.computeSynsetSim(): incompatible synsets" + s1 + " " + s2);
        return 0;
    }
    String parent = WordNetUtilities.lowestCommonParent(s1, s2);
    if (parent == null)
        return (float) 0;
    // Building the table also fills maxFreqs, so it must happen before maxFreqs is read.
    HashMap<String,Integer> freqs = getSubsumingFreq();
    Integer maxFreq = maxFreqs.get(Character.toString(s1.charAt(0)));
    Integer parentFreq = freqs.get(parent);
    // A missing or zero count would turn the logarithm into Infinity or NaN.
    if (maxFreq == null || maxFreq <= 0 || parentFreq == null || parentFreq <= 0)
        return (float) 0;
    return (float) (-Math.log(parentFreq.floatValue() / maxFreq));
}
/** ***************************************************************
 * Compute similarity of two words as the maximum similarity among
 * their possible synsets. Note that currently only synsets of the
 * same part of speech have any similarity score.
 *
 * The measure is chosen by the ResnikSim and SubsumptionSim flags;
 * when both are set the subsumption score replaces the Resnik one.
 *
 * @param s1 the first word
 * @param s2 the second word
 * @param allSynsets determines whether to take the best fit from
 *                   all possible synsets for the words, or just
 *                   to compare the most frequent synsets for each
 *                   word
 * @return the best score found; 0 when either word has no candidate
 *         synsets; -100 when exactly one synset lookup returned null
 *         or no pair of candidates shares a part of speech
 */
public static float computeWordSim(String s1, String s2, boolean allSynsets) {

    // NOTE(review): nothing in this file stores into cachedSims, so this lookup looks like it never hits
    HashMap<String,Float> cachedRow = cachedSims.get(s1);
    if (cachedRow != null && cachedRow.containsKey(s2))
        return cachedRow.get(s2);

    HashSet<String> syns1;
    HashSet<String> syns2;
    if (allSynsets) {
        syns1 = WordNetUtilities.wordsToSynsets(s1);
        syns2 = WordNetUtilities.wordsToSynsets(s2);
    }
    else {
        syns1 = new HashSet<>();
        String sense1 = WSD.getBestDefaultSense(s1);
        if (sense1 != null && !sense1.isEmpty())
            syns1.add(sense1);
        syns2 = new HashSet<>();
        String sense2 = WSD.getBestDefaultSense(s2);
        if (sense2 != null && !sense2.isEmpty())
            syns2.add(sense2);
    }

    // Null checks come before any size() call so a single null set cannot throw.
    if (syns1 == null && syns2 == null)
        return 0;
    if (syns1 == null || syns2 == null)
        return (float) -100.0;
    if (syns1.isEmpty() || syns2.isEmpty())
        return 0;

    float result = (float) -100.0;
    for (String syn1 : syns1) {
        for (String syn2 : syns2) {
            if (syn1.charAt(0) != syn2.charAt(0))
                continue; // different parts of speech are never compared
            float score = 0;
            if (ResnikSim)
                score = computeSynsetSim(syn1, syn2);
            if (SubsumptionSim)
                score = maxCommonSynsetOverlap(syn1, syn2);
            if (score > result)
                result = score;
        }
    }
    return result;
}
/** ***************************************************************
* Read a space-delimited file of word pairs and their similarity
* rating.
*/
public static HashMap> readTestFile(String filename,
String regex, boolean removePOS) {
HashMap> sims = new HashMap<>();
//System.out.println("INFO in WNsim.readTestFile()");
LineNumberReader lr = null;
try {
String line;
File testFile = new File(filename);
FileReader r = new FileReader( testFile );
lr = new LineNumberReader(r);
while ((line = lr.readLine()) != null) {
//System.out.println("INFO in WNsim.readTestFile(): " + line);
Pattern p = Pattern.compile(regex);
// Pattern p = Pattern.compile("([^ ]+) ([^ ]+) (.*)");
Matcher m = p.matcher(line);
if (m.matches()) {
String w1 = m.group(1);
String w2 = m.group(2);
if (removePOS) {
w1 = w1.substring(0, w1.lastIndexOf('-'));
w2 = w2.substring(0, w2.lastIndexOf('-'));
}
String n1 = m.group(3);
//System.out.println("INFO in WNsim.readTestFile(): results: " + w1 + " " + w2 + " " + n1);
float f = Float.parseFloat(n1);
HashMap hm = new HashMap<>();
if (sims.containsKey(w1))
hm = sims.get(w1);
hm.put(w2,f);
sims.put(w1,hm);
hm = new HashMap<>();
if (sims.containsKey(w2))
hm = sims.get(w2);
hm.put(w1,f);
sims.put(w2,hm);
}
}
}
catch (Exception ex) {
ex.printStackTrace();
}
finally {
try {
if (lr != null)
lr.close();
}
catch (Exception ex) {
}
}
return sims;
}
/** ***************************************************************
 * Arithmetic mean of the values in the map; the keys are ignored.
 *
 * @param x the values to average
 * @return the mean, or NaN when the map is empty
 */
public static float mean(HashMap<String,Float> x) {

    float total = 0;
    for (float value : x.values())
        total += value;
    return total / (float) x.size();
}
/** ***************************************************************
 * Sum over the keys of x of (x[k] - meanx) * (y[k] - meany), i.e. the
 * numerator of the sample correlation coefficient. Only the keys of
 * x are visited; extra keys in y are ignored.
 *
 * @param x     first series
 * @param y     second series, which must hold a value for every key of x
 * @param meanx mean of x
 * @param meany mean of y
 * @return the summed products of deviations
 * @throws IllegalArgumentException if y has no value for a key of x
 */
public static float deviation(HashMap<String,Float> x, HashMap<String,Float> y,
                              float meanx, float meany) {

    float dev = 0;
    for (Map.Entry<String,Float> entry : x.entrySet()) {
        Float yValue = y.get(entry.getKey());
        if (yValue == null)
            throw new IllegalArgumentException("Error in WNsim.deviation(): no value for key '" +
                    entry.getKey() + "' in second map");
        dev += (entry.getValue() - meanx) * (yValue - meany);
    }
    return dev;
}
/** ***************************************************************
 * Sum of squared differences between each value in the map and the
 * given mean; the keys are ignored.
 *
 * @param x    the values
 * @param mean the value to measure the differences from
 * @return the sum of squared deviations, 0 for an empty map
 */
public static float sqDeviation(HashMap<String,Float> x, float mean) {

    float dev = 0;
    for (float value : x.values()) {
        float diff = value - mean;
        dev += diff * diff;
    }
    return dev;
}
/** ***************************************************************
 * determine the sample correlation coefficient of two series that
 * share their keys: the summed products of deviations divided by the
 * square root of the product of the two sums of squared deviations.
 *
 * @param x first series
 * @param y second series; deviation() looks up every key of x in it
 * @return the coefficient; not a finite number when either series is
 *         empty or has no variance, since the division is then by zero
 */
public static double correlation(HashMap<String,Float> x, HashMap<String,Float> y) {

    float meanx = mean(x);
    float meany = mean(y);
    float deviation = deviation(x, y, meanx, meany);
    float sqdeviationX = sqDeviation(x, meanx);
    float sqdeviationY = sqDeviation(y, meany);
    // widen before multiplying so the product is not rounded to float precision
    return deviation / Math.sqrt((double) sqdeviationX * sqdeviationY);
}
/** ***************************************************************
 * @return a set of all the immediate children of the given synset from
 * hyponym and instance hyponym links; empty when the synset has no
 * such links or no entry in WordNet.wn.relations
 * @param s is a POS-prefixed synset
 */
public static HashSet<String> getChildList(String s) {

    HashSet<String> result = new HashSet<>();
    ArrayList<AVPair> rels = WordNet.wn.relations.get(s);
    if (rels == null)
        return result;
    for (AVPair avp : rels) {
        if (avp.attribute.equals("hyponym") || avp.attribute.equals("instance hyponym"))
            result.add(avp.value);
    }
    return result;
}
/** ***************************************************************
 * @return an integer index into the bit vector, or -1 when the
 * first character is not '1'..'4' or the synset is not in the index
 * for its part of speech
 * @param s is a POS-prefixed synset
 */
public static int getIndex(String s) {

    HashBiMap<String,Integer> posIndex;
    switch (s.charAt(0)) {
        case '1' : posIndex = nounSynsetToIndex;
            break;
        case '2' : posIndex = verbSynsetToIndex;
            break;
        case '3' : posIndex = adjectiveSynsetToIndex;
            break;
        case '4' : posIndex = adverbSynsetToIndex;
            break;
        default : return -1;
    }
    // look the value up as an object first: unboxing a missing entry would throw
    Integer index = posIndex.get(s);
    return index == null ? -1 : index;
}
/** ***************************************************************
 * Build, store and return the bit vector for a synset. The vector
 * starts as a copy of the empty template for the synset's part of
 * speech, then for every immediate child (see getChildList()) it
 * takes over the child's recursively computed bits and sets the
 * child's own bit. The result is stored in the bitmap table for that
 * part of speech. createIndexes() must have created the templates.
 *
 * @return a bit vector comprising set bits for all the children of
 * the given synset; an empty, unstored vector when the first
 * character is not '1'..'4'
 * @param s is a POS-prefixed synset
 */
public static BitSet setBits(String s) {

    BitSet template;
    HashMap<String,BitSet> store;
    switch (s.charAt(0)) {
        case '1' : template = nounBits;
            store = nouns;
            break;
        case '2' : template = verbBits;
            store = verbs;
            break;
        case '3' : template = adjectiveBits;
            store = adjectives;
            break;
        case '4' : template = adverbBits;
            store = adverbs;
            break;
        default : return new BitSet(); // unknown part of speech: nothing to record
    }
    BitSet posMap = (BitSet) template.clone();
    for (String c : getChildList(s)) {
        posMap.or(setBits(c));
        int childIndex = getIndex(c);
        if (childIndex >= 0) // BitSet.set() rejects negative indexes
            posMap.set(childIndex);
    }
    store.put(s, posMap);
    return posMap;
}
/** ***************************************************************
 * Number the synsets of each part of speech, create the empty bit
 * vector templates, and compute the bit vectors by calling setBits()
 * on every synset returned by findLeavesInTree() for the hypernym
 * relations (presumably the roots of the hierarchies - confirm).
 *
 * Keys carry the POS prefix that getIndex() and setBits() look up:
 * "1" noun, "2" verb, "3" adjective, "4" adverb.
 */
public static void createIndexes() {

    int i = 0;
    for (String s : WordNet.wn.nounDocumentationHash.keySet())
        nounSynsetToIndex.put("1" + s, i++);
    i = 0;
    for (String s : WordNet.wn.verbDocumentationHash.keySet())
        verbSynsetToIndex.put("2" + s, i++);
    i = 0;
    for (String s : WordNet.wn.adjectiveDocumentationHash.keySet())
        adjectiveSynsetToIndex.put("3" + s, i++);
    i = 0;
    for (String s : WordNet.wn.adverbDocumentationHash.keySet())
        adverbSynsetToIndex.put("4" + s, i++);

    // a bit is true if the synset (in order of byte offset number) is subsumed by the
    // synset in the key
    nounBits = new BitSet(nounSynsetToIndex.size());
    verbBits = new BitSet(verbSynsetToIndex.size());
    adjectiveBits = new BitSet(adjectiveSynsetToIndex.size());
    adverbBits = new BitSet(adverbSynsetToIndex.size());

    HashSet<String> roots = WordNetUtilities.findLeavesInTree(Sets.newHashSet("hypernym", "instance hypernym"));
    for (String s : roots)
        setBits(s);
}
/** ***************************************************************
 * Compute the number of child synsets shared by the two synsets,
 * i.e. the number of bits set in both of their stored bit vectors.
 *
 * @param s1 POS-prefixed synset id
 * @param s2 POS-prefixed synset id
 * @return the size of the intersection; 0 when the parts of speech
 *         differ or are unknown, or when either synset has no stored
 *         bit vector
 */
public static int synsetOverlap(String s1, String s2) {

    if (s1.charAt(0) != s2.charAt(0))
        return 0;
    HashMap<String,BitSet> store;
    switch (s1.charAt(0)) {
        case '1' : store = nouns;
            break;
        case '2' : store = verbs;
            break;
        case '3' : store = adjectives;
            break;
        case '4' : store = adverbs;
            break;
        default : return 0;
    }
    BitSet bs1 = store.get(s1);
    BitSet bs2 = store.get(s2);
    if (bs1 == null || bs2 == null)
        return 0;
    // and() modifies its receiver, so intersect on a copy of the stored vector
    BitSet common = (BitSet) bs1.clone();
    common.and(bs2);
    return common.cardinality();
}
/** ***************************************************************
 * Compute a score from the share of child synsets of the first
 * synset that are also children of the second synset. The returned
 * value is exp(shared / total), not the plain fraction, so a
 * non-empty first synset scores between 1 (nothing shared) and e
 * (everything shared).
 *
 * @param s1 POS-prefixed synset id whose children form the denominator
 * @param s2 POS-prefixed synset id
 * @return the score; 0 when the parts of speech differ or are
 *         unknown, when either synset has no stored bit vector, or
 *         when the first synset has no children
 */
public static float synsetPercentOverlap(String s1, String s2) {

    if (s1.charAt(0) != s2.charAt(0))
        return 0;
    HashMap<String,BitSet> store;
    switch (s1.charAt(0)) {
        case '1' : store = nouns;
            break;
        case '2' : store = verbs;
            break;
        case '3' : store = adjectives;
            break;
        case '4' : store = adverbs;
            break;
        default : return 0;
    }
    BitSet bs1 = store.get(s1);
    BitSet bs2 = store.get(s2);
    if (bs1 == null || bs2 == null)
        return 0;
    int total = bs1.cardinality();
    if (total == 0)
        return 0;
    // and() modifies its receiver, so intersect on a copy of the stored vector
    BitSet common = (BitSet) bs1.clone();
    common.and(bs2);
    return (float) Math.exp(((float) common.cardinality()) / ((float) total));
}
/** ***************************************************************
 * Score two synsets through their lowest common parent: compute
 * synsetPercentOverlap(parent, s1) and synsetPercentOverlap(parent, s2)
 * and return the larger of the two.
 *
 * @return the larger overlap score, or 0 when the synsets have no
 *         common parent
 */
public static float maxCommonSynsetOverlap(String s1, String s2) {

    String commonParent = WordNetUtilities.lowestCommonParent(s1, s2);
    if (commonParent == null)
        return 0;
    float overlapWithFirst = synsetPercentOverlap(commonParent, s1);
    float overlapWithSecond = synsetPercentOverlap(commonParent, s2);
    return overlapWithSecond > overlapWithFirst ? overlapWithSecond : overlapWithFirst;
}
/** ***************************************************************
 * Evaluate computeWordSim() against a file of human similarity
 * ratings and print the sample correlation between computed and
 * expected scores. Pairs for which computeWordSim() returns -100 are
 * left out. Only the most frequent sense of each word is compared.
 *
 * The data set is hard-coded. Other sets that have been used with
 * readTestFile(path, regex, removePOS):
 *   /home/apease/WordSim/wordsim353/combined-noHead.tab   "([^\\t]+)\\t([^\\t]+)\\t(.*)"           false
 *   /home/apease/WordSim/rw/rw.txt                        "([^\\t]+)\\t([^\\t]+)\\t([^\\t]+).*"    false
 *   /home/apease/WordSim/Resnik3.txt                      "([^ ]+) ([^ ]+) (.*)"                   false
 *   /home/apease/WordSim/MEN/MEN_dataset_lemma_form_full  "([^ ]+) ([^ ]+) (.*)"                   true
 */
public static void test() {

    HashMap<String,HashMap<String,Float>> hm = readTestFile("/home/apease/WordSim/scws.csv",
            "([^\\t]+)\\t([^\\t]+)\\t(.*)", false);
    HashMap<String,Float> results1 = new HashMap<>(); // computed scores
    HashMap<String,Float> results2 = new HashMap<>(); // expected scores, same keys
    for (String s1 : hm.keySet()) {
        HashMap<String,Float> m = hm.get(s1);
        if (m == null)
            continue;
        for (String s2 : m.keySet()) {
            boolean allSynsets = false;
            float factual = computeWordSim(s1, s2, allSynsets);
            float fexpected = m.get(s2);
            if (factual > -100) {
                results1.put(s1 + "-" + s2, factual);
                results2.put(s1 + "-" + s2, fexpected);
            }
        }
    }
    System.out.println("Correlation: " + correlation(results1, results2));
}
/** ***************************************************************
 * A main method, used only for testing. It should not be called
 * during normal operation.
 *
 * Initializes the knowledge base manager, builds the synset indexes
 * and bit vectors, prints two sample overlap scores, builds the
 * subsuming counts and then runs test(), reporting its elapsed time.
 */
public static void main (String[] args) {

    try {
        KBmanager.getMgr().initializeOnce();
        createIndexes();
        System.out.println(synsetOverlap("104254777", "104199027"));
        System.out.println(maxCommonSynsetOverlap("104254777", "104199027"));
        getSubsumingFreq(); // build the counts up front so test() timing excludes it
        long start = System.currentTimeMillis();
        test();
        long finish = System.currentTimeMillis();
        System.out.println("Total time in milis: " + (finish - start));
    }
    catch (Exception e) {
        System.out.println("Error in WNsim.main(): Exception: " + e.getMessage());
        e.printStackTrace();
    }
}
}