com.articulate.sigma.Mapping Maven / Gradle / Ivy
Show all versions of sigma-component Show documentation
package com.articulate.sigma;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.LineNumberReader;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.TreeMap;
import java.util.TreeSet;
import com.articulate.sigma.KB;
/** This code is copyright Articulate Software (c) 2004.
This software is released under the GNU Public License.
Users of this code also consent, by use of this code, to credit Articulate Software
in any writings, briefings, publications, presentations, or
other representations of any software which incorporates, builds on, or uses this
code. Please cite the following article in any publication with references:
Pease, A., (2003). The Sigma Ontology Development Environment,
in Working Notes of the IJCAI-2003 Workshop on Ontology and Distributed Systems,
August 9, Acapulco, Mexico. See also http://sigmakee.sourceforge.net
This class maps ontologies. It includes embedded subclasses that
implement specific mapping heuristics.
This class also includes utilities for converting other
ad-hoc formats to KIF
*/
public class Mapping {
public static TreeMap> mappings =
new TreeMap>();
public static char termSeparator = '!';
/** *************************************************************
 * Write synonymousExternalConcept expressions (or, for entries that
 * start with "sub_checkbox_", subsumedExternalConcept expressions)
 * for the term pairs given in cbset.  Each element of cbset is a
 * String of the form
 *   [checkbox_|sub_checkbox_][T_]name1 + termSeparator + name2
 * The pair is split at the FIRST occurrence of termSeparator.
 *
 * Output goes to baseDir/kbname1-kbname2-links.kif, or to the first
 * unused baseDir/kbname1-kbname2-linksN.kif (N = 1, 2, ...) when that
 * file already exists.
 *
 * Note carried over from the original comment: there's a known bug
 * when ontology terms contain dashes.
 *
 * Note that a malformed element aborts the run with an error string
 * after the output file has been created, so a partial file may be
 * left behind.
 *
 * @param cbset   the selected checkbox names (Strings)
 * @param kbname1 name of the first KB; used only in the file name
 * @param kbname2 name of the second KB; used in the file name and as
 *                the last argument of every statement written
 * @return "Wrote: " plus the file name on success, otherwise an error
 *         message
 * @throws IOException if the file cannot be opened or written; the
 *         underlying exception is attached as the cause
 */
public static String writeEquivalences(TreeSet cbset, String kbname1, String kbname2) throws IOException {

    System.out.println("INFO in Mapping.writeEquivalences(): size: " + cbset.size());
    FileWriter fw = null;
    PrintWriter pw = null;
    String dir = (String) KBmanager.getMgr().getPref("baseDir");
    String filename = dir + File.separator + kbname1 + "-" + kbname2 + "-links";
    if (mappings.isEmpty())
        return "Error: No mappings found";
    try {
        // Find the first file name that is not taken yet.
        File f = new File(filename + ".kif");
        int fileCounter = 0;
        while (f.exists()) {
            fileCounter++;
            f = new File(filename + fileCounter + ".kif");
        }
        if (fileCounter == 0)
            filename = filename + ".kif";
        else
            filename = filename + fileCounter + ".kif";
        fw = new FileWriter(filename);
        pw = new PrintWriter(fw);

        for (Iterator it = cbset.iterator(); it.hasNext();) {
            String st = (String) it.next();
            boolean subcheckbox = false;
            if (st.startsWith("sub_checkbox_")) {
                st = st.substring(13);
                subcheckbox = true;
            }
            else if (st.startsWith("checkbox_"))
                st = st.substring(9);
            else
                return "Error in Mapping.writeEquivalences(): malformed string " + st;

            // "T_" is the top-score marker added by merge(); it is not part of the term.
            if (st.startsWith("T_"))
                st = st.substring(2);
            int i = st.indexOf(termSeparator);
            if (i < 0)
                return "Error in Mapping.writeEquivalences(): malformed string (no '" + termSeparator + "') " + st;
            String term1 = st.substring(0,i);
            String term2 = st.substring(i+1);
            String relation = subcheckbox ? "subsumedExternalConcept" : "synonymousExternalConcept";
            pw.println("(" + relation + " \"" + term2 +
                       "\" " + term1 + " " + kbname2 + ")");
        }
        // PrintWriter never throws on write; ask it explicitly so that a
        // failed write is not reported as success.
        if (pw.checkError())
            throw new IOException("the output stream reported a write failure");
    }
    catch (java.io.IOException e) {
        IOException wrapped = new IOException("Error writing file " + filename + "\n" + e.getMessage());
        wrapped.initCause(e);       // keep the original stack trace
        throw wrapped;
    }
    finally {
        if (pw != null)
            pw.close();             // flushes and also closes fw
        else if (fw != null)
            fw.close();
    }
    return "Wrote: " + filename;
}
/** *************************************************************
 * Rename terms in KB kbname2 to conform to names in kbname1, then
 * write the modified kbname2 to a file, add that file to kbname1 as
 * a constituent and remove kbname2 from the KB manager.
 *
 * For every (term1, term2) pair in the static mappings table the
 * matching checkbox name is rebuilt exactly as the UI names it
 * ("checkbox_" or "sub_checkbox_", an optional "T_" for the first
 * entry of term1's table, then term1 + termSeparator + term2):
 *  - if the plain checkbox is in cbset and the names differ, term2
 *    is renamed to term1 in kbname2;
 *  - if the sub-checkbox is in cbset, an (instance term2 term1) or
 *    (subclass term2 term1) statement is told to kbname2, depending
 *    on kb2.isInstance(term2).
 *
 * The output file is baseDir/kbname2-merged-kbname1.kif, or the first
 * unused numbered variant of that name.
 *
 * @param cbset   the selected checkbox names (Strings)
 * @param kbname1 name of the KB whose term names win
 * @param kbname2 name of the KB that is renamed and then removed
 * @return a success message, or an error message if there are no
 *         mappings, a KB cannot be found, or the file cannot be written
 */
public static String merge(TreeSet cbset, String kbname1, String kbname2) {

    System.out.println("INFO in Mapping.merge()");
    if (mappings.keySet().size() < 1)
        return "Error: No mappings found";
    KB kb1 = KBmanager.getMgr().getKB(kbname1);
    KB kb2 = KBmanager.getMgr().getKB(kbname2);
    // Report an unknown KB as an error string, in line with the other
    // failure paths of this method, instead of failing with an NPE below.
    if (kb1 == null)
        return "Error: knowledge base " + kbname1 + " not found";
    if (kb2 == null)
        return "Error: knowledge base " + kbname2 + " not found";

    Iterator it = mappings.keySet().iterator();
    while (it.hasNext()) {
        String term1 = (String) it.next();
        TreeMap value = (TreeMap) mappings.get(term1);
        Iterator it2 = value.keySet().iterator();
        int counter = 0;
        while (it2.hasNext()) {
            counter++;
            Integer score = (Integer) it2.next();
            String term2 = (String) value.get(score);
            // The first key of the sorted table carries the "T_" marker.
            String topScoreFlag = (counter == 1) ? "T_" : "";
            String cbName = "checkbox_" + topScoreFlag + term1 + termSeparator + term2;
            String subName = "sub_checkbox_" + topScoreFlag + term1 + termSeparator + term2;
            if (cbset.contains(cbName) && !term2.equals(term1))
                kb2.rename(term2,term1);
            if (cbset.contains(subName)) {
                String relation = kb2.isInstance(term2) ? "instance" : "subclass";
                String statement = "(" + relation + " " + term2 + " " + term1 + ")";
                kb2.tell(statement);
                System.out.println(statement);
            }
        }
    }

    String dir = (String) KBmanager.getMgr().getPref("baseDir");
    String filename = dir + File.separator + kbname2 + "-merged-" + kbname1;
    try {
        // Find the first file name that is not taken yet.
        File f = new File(filename + ".kif");
        int counter = 0;
        while (f.exists()) {
            counter++;
            f = new File(filename + counter + ".kif");
        }
        if (counter == 0)
            filename = filename + ".kif";
        else
            filename = filename + counter + ".kif";
        kb2.writeFile(filename);
        kb1.addConstituent(filename);
        KBmanager.getMgr().removeKB(kbname2);
    }
    catch (java.io.IOException e) {
        return "Error writing file " + filename + "\n" + e.getMessage();
    }
    return "Successful renaming of terms in " + kbname2 + " to those in " + kbname1;
}
/** *************************************************************
 * Convert a YAGO file into KIF.  Every non-blank line is trimmed and
 * split on tab characters; the second and third fields are converted
 * with StringUtil.StringToKIFid() and printed to standard output as
 *   (relName field2 field3)
 * The third field ends at the next tab or, if there is none, at the
 * end of the line.  Lines with fewer than three fields are reported
 * on standard error and skipped.
 *
 * @param file    path of the tab-separated input file
 * @param relName relation name used as the head of every statement
 * @throws IOException if the file cannot be opened or read
 */
public static void convertYAGO(String file, String relName) throws IOException {

    File f = new File(file);
    // new File() never returns null, so test for existence explicitly.
    if (!f.exists()) {
        System.out.println( "INFO in convertYAGO(): "
                            + "The file " + file + " does not exist" );
        return;
    }
    LineNumberReader lr = new LineNumberReader(new FileReader(f));
    try {
        String line = null;
        while ((line = lr.readLine()) != null) {
            line = line.trim();
            if (line.length() == 0)
                continue;
            int tab1 = line.indexOf('\t');
            int tab2 = (tab1 < 0) ? -1 : line.indexOf('\t', tab1 + 1);
            if (tab2 < 0) {
                System.err.println("Warning in convertYAGO(): skipping line "
                                   + lr.getLineNumber() + " of " + file
                                   + " (fewer than three tab-separated fields)");
                continue;
            }
            int tab3 = line.indexOf('\t', tab2 + 1);
            if (tab3 < 0)
                tab3 = line.length();   // third field is the last one on the line
            String term1 = StringUtil.StringToKIFid(line.substring(tab1 + 1, tab2));
            String term2 = StringUtil.StringToKIFid(line.substring(tab2 + 1, tab3));
            System.out.println("(" + relName + " " + term1 + " " + term2 + ")");
        }
    }
    finally {
        lr.close();                     // also closes the underlying FileReader
    }
}
/** *************************************************************
 * Look up the termFormat label of a term.  The KB is queried with
 * askWithRestriction(0,"termFormat",2,term); only the first formula
 * returned is used, and its argument 3 is handed back with the
 * quotes removed by OWLtranslator.removeQuotes().
 *
 * @param kb   the knowledge base to query; may be null
 * @param term the term whose label is wanted
 * @return the first label found, or null when kb is null or the
 *         query yields nothing
 */
public static String getTermFormat(KB kb, String term) {

    if (kb == null)
        return null;
    ArrayList answers = kb.askWithRestriction(0,"termFormat",2,term);
    if (answers == null || answers.size() <= 0)
        return null;
    Formula first = (Formula) answers.get(0);
    String label = first.getArgument(3);
    return OWLtranslator.removeQuotes(label);
}
/** *************************************************************
* @return the minimum of two ints
*/
private static int min(int n1, int n2) {
if (n1 100) {
System.out.print(".");
counter = 0;
}
String term1 = (String) it1.next();
if (isValidTerm(term1)) {
String normTerm1 = normalize(term1);
String normLabel1 = normalize(getTermFormat(kb1,term1));
TreeMap tm = (TreeMap) result.get(term1);
if (tm == null)
tm = new TreeMap();
Iterator it2 = kb2.getTerms().iterator();
while (it2.hasNext()) {
String term2 = (String) it2.next();
if (isValidTerm(term2)) {
String normTerm2 = normalize(term2);
String normLabel2 = normalize(getTermFormat(kb2,term2));
int score = Integer.MAX_VALUE;
score = min(score,stringMatch(normTerm1, normTerm2,matchMethod));
//System.out.println(normTerm1 + " " + normTerm2);
if (normLabel1 != null && isValidTerm(normLabel1))
score = min(score,stringMatch(normLabel1,normTerm2,matchMethod));
if (normLabel2 != null && isValidTerm(normLabel2))
score = min(score,stringMatch(normTerm1, normLabel2,matchMethod));
if (normLabel1 != null && normLabel2 != null &&
isValidTerm(normLabel1) && isValidTerm(normLabel2))
score = min(score,stringMatch(normLabel1, normLabel2,matchMethod));
if (score > 0 && score < Integer.MAX_VALUE) {
if (score < threshold) {
tm.put(new Integer(score), term2);
mapCount++;
}
}
}
}
if (tm.keySet().size() > 0)
result.put(term1,tm);
}
}
}
}
}
else {
if (kb1 == null)
System.out.println(kbName1 + " not found\n");
if (kb2 == null)
System.out.println(kbName2 + " not found
\n");
}
System.out.println();
System.out.println(totalCandidates + " " + " possible mappings checked with " +
mapCount + " mappings found in "
+ ((System.currentTimeMillis() - t1) / 1000.0) + " seconds");
mappings = result;
}
/** *************************************************************
 * Decide whether a term is worth comparing: it must be longer than
 * two characters and must not be a logical operator according to
 * Formula.isLogicalOperator().
 *
 * @param term the term to check; must not be null
 * @return true if the term should take part in matching
 */
public static boolean isValidTerm(String term) {

    if (term.length() <= 2)
        return false;
    return !Formula.isLogicalOperator(term);
}
/** *************************************************************
 * Normalize a string for matching:
 *  - digits and letters that are not upper case are kept as they are
 *    (this includes letters of scripts that have no case);
 *  - an upper-case letter starts a new word: it is lower-cased and
 *    preceded by a single space unless it is at the very start or
 *    already follows a space;
 *  - every run of other characters becomes a single space.
 * The result never starts or ends with a space.
 *
 * Examples: "isSubclassOf" gives "is subclass of",
 * "Philippe_Mex-s" gives "philippe mex s", "SUMO" gives "s u m o".
 *
 * @param s the string to normalize
 * @return the normalized string; null if s is null or empty; the
 *         empty string if s contains no letters or digits at all
 */
public static String normalize(String s) {

    if (s == null || s.length() < 1)
        return null;
    StringBuilder result = new StringBuilder(s.length() + 8);
    for (int i = 0; i < s.length(); i++) {
        char c = s.charAt(i);
        boolean letter = Character.isLetter(c);
        if (Character.isDigit(c) || (letter && !Character.isUpperCase(c))) {
            result.append(c);
            continue;
        }
        // Word boundary: either an upper-case letter or a separator.
        if (result.length() > 0 && result.charAt(result.length() - 1) != ' ')
            result.append(' ');
        if (letter)
            result.append(Character.toLowerCase(c));
    }
    // A trailing separator must not leave a dangling space behind.
    int end = result.length();
    if (end > 0 && result.charAt(end - 1) == ' ')
        result.setLength(end - 1);
    return result.toString();
}
/** *************************************************************
 * Substring Mapping Method.  Returns
 *  - 1 if the two strings are identical;
 *  - if one string contains the other, the offset of the first
 *    occurrence plus the difference in length (so a string and its
 *    prefix that is one character shorter also score 1);
 *  - Integer.MAX_VALUE if neither string contains the other.
 *
 * This approach is based on:
 * John Li, "LOM: A Lexicon-based Ontology Mapping Tool",
 * Proceedings of the Performance Metrics for Intelligent
 * Systems (PerMIS.'04), 2004.
 *
 * *** This is not yet fully implemented here ***
 *
 * @param term1 first string; must not be null
 * @param term2 second string; must not be null
 * @return the score described above; smaller means closer
 */
public static int getSubstringDistance(String term1, String term2) {

    if (term1.equals(term2))
        return 1;

    int offset = term1.indexOf(term2);
    if (offset >= 0)
        return offset + (term1.length() - term2.length());

    offset = term2.indexOf(term1);
    if (offset >= 0)
        return offset + (term2.length() - term1.length());

    return Integer.MAX_VALUE;
}
/** *************************************************************
 * @return the smallest of three ints
 */
private static int minimum(int a, int b, int c) {

    return Math.min(a, Math.min(b, c));
}
/** *************************************************************
 * Levenshtein (edit) distance between two strings: the minimum
 * number of single-character insertions, deletions and
 * substitutions needed to turn s into t.
 *
 * Algorithm courtesy of Wikipedia
 * http://en.wikipedia.org/wiki/Levenshtein_distance
 * int LevenshteinDistance(char s[1..m], char t[1..n])
 *
 * @param s first string; must not be null, may be empty
 * @param t second string; must not be null, may be empty
 * @return the edit distance; 0 for identical strings
 */
public static int getLevenshteinDistance(String s, String t) {

    int m = s.length();
    int n = t.length();
    // d has m+1 rows and n+1 columns: d[i][j] is the distance between
    // the first i characters of s and the first j characters of t.
    int[][] d = new int[m + 1][n + 1];
    for (int i = 0; i <= m; i++)
        d[i][0] = i;                    // i deletions reach the empty prefix
    for (int j = 0; j <= n; j++)
        d[0][j] = j;                    // j insertions build the prefix of t
    for (int j = 1; j <= n; j++) {
        for (int i = 1; i <= m; i++) {
            // Table index i corresponds to character i-1 of the string.
            if (s.charAt(i - 1) == t.charAt(j - 1))
                d[i][j] = d[i-1][j-1];
            else {
                int deletion = d[i-1][j] + 1;
                int insertion = d[i][j-1] + 1;
                int substitution = d[i-1][j-1] + 1;
                d[i][j] = Math.min(deletion, Math.min(insertion, substitution));
            }
        }
    }
    return d[m][n];
}
/** *************************************************************
 * Jaro-Winkler Mapping Method
 * implemented by Gerard de Melo
 *
 * The Jaro-Winkler similarity (between 0 and 1) is turned into a
 * distance on a scale of 0 to 100, so that larger values mean
 * strings that are further apart.  The common-prefix bonus looks at
 * up to 4 leading characters with a weight of 0.1 each.
 *
 * @param s1 first string; must not be null
 * @param s2 second string; must not be null
 * @return 100 if either string is empty or no characters match,
 *         otherwise (int) (100 * (1 - similarity))
 */
public static int getJaroWinklerDistance(String s1, String s2) {

    final int scale = 100;
    final int prefixCap = 4;
    final double prefixWeight = 0.1;

    if (s1.length() == 0 || s2.length() == 0)
        return scale;

    // Work on (longer, shorter); on a tie s1 plays the longer role.
    String longer = s1;
    String shorter = s2;
    if (s1.length() < s2.length()) {
        longer = s2;
        shorter = s1;
    }
    int longLen = longer.length();
    int shortLen = shorter.length();

    // Characters match only if they are at most 'window' positions apart.
    int window = Math.max(0, longLen / 2 - 1);
    boolean[] longUsed = new boolean[longLen];
    boolean[] shortUsed = new boolean[shortLen];
    int matches = 0;
    for (int i = 0; i < shortLen; i++) {
        char c = shorter.charAt(i);
        int from = Math.max(0, i - window);
        int to = Math.min(longLen, i + window + 1);
        for (int j = from; j < to; j++) {
            if (longUsed[j] || longer.charAt(j) != c)
                continue;
            longUsed[j] = true;
            shortUsed[i] = true;
            matches++;
            break;
        }
    }
    if (matches == 0)
        return scale;

    // Walk both strings' matched characters in order and count the
    // positions at which they disagree.
    int outOfOrder = 0;
    int cursor = 0;
    for (int i = 0; i < shortLen; i++) {
        if (!shortUsed[i])
            continue;
        while (!longUsed[cursor])
            cursor++;
        if (shorter.charAt(i) != longer.charAt(cursor))
            outOfOrder++;
        cursor++;
    }
    int halfTranspositions = outOfOrder / 2;

    double jaro = ((double) matches / longLen
                   + (double) matches / shortLen
                   + (double) (matches - halfTranspositions) / matches)
                  / 3.0;

    // Winkler bias: reward a shared prefix.
    int prefixLimit = Math.min(prefixCap, Math.min(longLen, shortLen));
    int prefix = 0;
    while (prefix < prefixLimit && longer.charAt(prefix) == shorter.charAt(prefix))
        prefix++;
    double jaroWinkler = jaro + prefix * prefixWeight * (1.0 - jaro);

    return (int) (scale * (1.0 - jaroWinkler));
}
/** *************************************************************
 * A test method.  Normalizes seven fixed pairs of strings, runs each
 * of the three distance methods 100000 times over all pairs and
 * prints the elapsed time in seconds for each method.
 */
private static void timingTest() {

    String[][] raw = {
        {"sitting", "kitten"},
        {"arm", "Arm"},
        {"alarm", "Arm"},
        {"farm", "Armory"},
        {"hiccup", "Armory"},
        {"isSubclassOf", "subclass"},
        {"subclassOf", "subclass"}
    };
    String[][] pairs = new String[raw.length][2];
    for (int p = 0; p < raw.length; p++) {
        pairs[p][0] = normalize(raw[p][0]);
        pairs[p][1] = normalize(raw[p][1]);
    }
    final int rounds = 100000;

    long start = System.currentTimeMillis();
    for (int i = 0; i < rounds; i++) {
        for (int p = 0; p < pairs.length; p++)
            getJaroWinklerDistance(pairs[p][0], pairs[p][1]);
    }
    System.out.println("Jaro-Winkler: " + ((System.currentTimeMillis() - start) / 1000.0) + " seconds");

    start = System.currentTimeMillis();
    for (int i = 0; i < rounds; i++) {
        for (int p = 0; p < pairs.length; p++)
            getLevenshteinDistance(pairs[p][0], pairs[p][1]);
    }
    System.out.println("Levenshtein: " + ((System.currentTimeMillis() - start) / 1000.0) + " seconds");

    start = System.currentTimeMillis();
    for (int i = 0; i < rounds; i++) {
        for (int p = 0; p < pairs.length; p++)
            getSubstringDistance(pairs[p][0], pairs[p][1]);
    }
    System.out.println("Substring: " + ((System.currentTimeMillis() - start) / 1000.0) + " seconds");
}
/** *************************************************************
 * A test method.  Prints the two strings in quotes, prints them
 * again after normalize(), then prints the Jaro-Winkler, Levenshtein
 * and substring distances of the normalized strings on one line,
 * followed by a blank line.
 */
private static void printTest(String s1, String s2) {

    final String q = "\"";
    System.out.println(q + s1 + q + " " + q + s2 + q);

    String norm1 = normalize(s1);
    String norm2 = normalize(s2);
    System.out.println(q + norm1 + q + " " + q + norm2 + q);

    // Each score is printed before the next one is computed.
    System.out.print(getJaroWinklerDistance(norm1, norm2));
    System.out.print(" ");
    System.out.print(getLevenshteinDistance(norm1, norm2));
    System.out.print(" ");
    System.out.println(getSubstringDistance(norm1, norm2));
    System.out.println();
}
/** *************************************************************
 * A test method.  Prints the three distance scores for a fixed list
 * of string pairs and then runs the timing comparison.  The command
 * line arguments are not used.
 */
public static void main(String args[]) {

    // read(args[0]);
    String[][] samples = {
        {"sitting", "kitten"},
        {"arm", "Arm"},
        {"alarm", "Arm"},
        {"farm", "Armory"},
        {"hiccup", "Armory"},
        {"isSubclassOf", "subclass"},
        {"subclassOf", "subclass"},
        {"supercalafragalisticexpialadotious", "subclass"},
        {"subclass", "supercalafragalisticexpialadotious"},
        {"fix", "arm"}
    };
    for (int i = 0; i < samples.length; i++)
        printTest(samples[i][0], samples[i][1]);
    timingTest();

    // Disabled experiment: map two loaded ontologies.
    //   try {
    //       KBmanager.getMgr().initializeOnce();
    //   }
    //   catch (Exception e ) {
    //       System.out.println(e.getMessage());
    //   }
    //   Mapping.mapOntologies("SUMO","OBO",10);

    //System.out.println(m.normalize("Philippe_Mex-s"));
    //System.out.println(m.normalize("AntiguaAndBarbuda"));
    //System.out.println(m.normalize("SUMO"));

    // Disabled experiment: convert a YAGO extract.
    //   try {
    //       convertYAGO("TypeExtractor.txt","citizen");
    //   }
    //   catch (Exception e ) {
    //       System.out.println(e.getMessage());
    //       e.printStackTrace();
    //   }
}
}