Please wait. This can take a few minutes...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price: only $1
You can buy this project and download or modify it as often as you want.
edu.stanford.nlp.ie.pascal.AcronymModel Maven / Gradle / Ivy
package edu.stanford.nlp.ie.pascal;
import java.util.*;
import java.io.*;
/**
* Scores Pascal challenge workshop information templates.
* This score reflects which fields are present/absent, how well acronyms
* agree with the names and URLs they correspond to.
*
* @author Jamie Nicolson
*/
public class AcronymModel implements RelationalModel {
// Score returned by naiveSimilarity when the acronym letters can be matched, in order,
// against the first letters of the words in the name.
private static final double HIGH_PROB = 1.0;
// Score returned by naiveSimilarity when no such match exists.
private static final double LOW_PROB = 0.0;
// When true, the model writes diagnostic traces to System.err / System.out.
private static boolean DEBUG= false;
// Embedded statistics text that the constructor hands to Prior through a reader.
// The first line lists the six template field names; it is followed by 64 numeric lines.
// NOTE(review): presumably one prior per present/absent combination of the six fields
// (2^6 = 64) -- confirm against the parsing code in Prior.
private static final String acronymStatistics =
"workshopname workshopacronym workshophomepage conferencename conferenceacronym conferencehomepage\n" +
"0.00274725274725275\n" +
"0.00274725274725275\n" +
"0.00274725274725275\n" +
"0.00274725274725275\n" +
"0.00274725274725275\n" +
"0.00274725274725275\n" +
"0.00274725274725275\n" +
"0.00274725274725275\n" +
"0.00274725274725275\n" +
"0.00274725274725275\n" +
"0.00274725274725275\n" +
"0.00274725274725275\n" +
"0.00549450549450549\n" +
"0.00274725274725275\n" +
"0.00274725274725275\n" +
"0.00274725274725275\n" +
"0.00274725274725275\n" +
"0.00274725274725275\n" +
"0.00274725274725275\n" +
"0.00274725274725275\n" +
"0.00274725274725275\n" +
"0.00274725274725275\n" +
"0.00274725274725275\n" +
"0.00274725274725275\n" +
"0.00274725274725275\n" +
"0.00274725274725275\n" +
"0.00274725274725275\n" +
"0.00274725274725275\n" +
"0.00274725274725275\n" +
"0.00274725274725275\n" +
"0.00274725274725275\n" +
"0.00274725274725275\n" +
"0.0521978021978022\n" +
"0.00274725274725275\n" +
"0.0357142857142857\n" +
"0.00549450549450549\n" +
"0.021978021978022\n" +
"0.010989010989011\n" +
"0.0357142857142857\n" +
"0.0302197802197802\n" +
"0.0824175824175824\n" +
"0.00549450549450549\n" +
"0.043956043956044\n" +
"0.010989010989011\n" +
"0.021978021978022\n" +
"0.00549450549450549\n" +
"0.0521978021978022\n" +
"0.0412087912087912\n" +
"0.0467032967032967\n" +
"0.00274725274725275\n" +
"0.010989010989011\n" +
"0.00274725274725275\n" +
"0.00274725274725275\n" +
"0.00274725274725275\n" +
"0.0137362637362637\n" +
"0.00824175824175824\n" +
"0.167582417582418\n" +
"0.00549450549450549\n" +
"0.0494505494505494\n" +
"0.00824175824175824\n" +
"0.0164835164835165\n" +
"0.00549450549450549\n" +
"0.0604395604395604\n" +
"0.0467032967032967\n";
// Prior built from acronymStatistics in the constructor; the private computeProb queries it
// with the set of field names that are present in the template being scored.
private Prior priors;
/**
 * Scores a partial template, reading only its name, acronym and homepage fields
 * for the workshop and the conference.
 *
 * @param temp the {@link InfoTemplate} to be scored.
 * @return the model's score
 */
public double computeProb(InfoTemplate temp){
  String workshopName = temp.wname;
  String workshopAcronym = temp.wacronym;
  String conferenceName = temp.cname;
  String conferenceAcronym = temp.cacronym;
  String workshopUrl = temp.whomepage;
  String conferenceUrl = temp.chomepage;
  return computeProb(workshopName, workshopAcronym, conferenceName, conferenceAcronym,
      workshopUrl, conferenceUrl);
}
/**
 * Scores the {@link PascalTemplate} using the fields it contains which are relevant to the score:
 * the workshop and conference names, acronyms and homepages.
 * (Ignores location and date fields.)
 *
 * @param temp the full {@link PascalTemplate} to be scored
 * @return the model's score
 */
public double computeProb(PascalTemplate temp) {
  String wsname = temp.getValue("workshopname");
  String confname = temp.getValue("conferencename");
  String wsacronym = temp.getValue("workshopacronym");
  String confacronym = temp.getValue("conferenceacronym");
  String wsurl = temp.getValue("workshophomepage");
  String confurl = temp.getValue("conferencehomepage");
  return computeProb(wsname, wsacronym, confname, confacronym, wsurl, confurl);
}
/**
 * Builds the model: creates the {@link Prior} from the embedded statistics text and
 * installs the alignment features with their weights and intercept.
 *
 * @throws IOException if reading the acronym statistics fails, or if the hyphenation
 *         data loaded by {@link SyllableBoundary} cannot be read.
 */
public AcronymModel() throws IOException {
  BufferedReader statisticsReader = new BufferedReader(new StringReader(acronymStatistics));
  priors = new Prior(statisticsReader);

  // The order here must match the order of the weights below.
  features = new Feature[] {
      new AcronymModel.LettersAligned(),
      new AcronymModel.BegWord(),
      new AcronymModel.EndWord(),
      new AcronymModel.AfterAligned(),
      new AcronymModel.AlignedPerWord(),
      new AcronymModel.WordsSkipped(),
      new AcronymModel.SyllableBoundary()
  };

  // Weights from a bunch of training examples. Earlier fits, kept for reference:
  //   -4.1004, 18.4127, 0.1789, 16.3189, 0.8818, -0.0725, -0.6550   (intercept -14.1449)
  //   -12.4082, 18.3893, 2.1826, 18.8487, 0.5042, -0.1231, 1.8876   (intercept -7.4882)
  weights = new double[] {
      -11.8880, 14.4534, -2.6316, 24.1838, -2.2320, -0.2508, 4.3501
  };
  intercept = -2.2062;
}
/**
 * Scores one combination of workshop/conference field values.
 *
 * <p>The score starts from the prior for the set of fields that are present, and is then
 * multiplied by the name/acronym similarity for the workshop and for the conference, and by
 * a factor reflecting whether each acronym occurs in the corresponding homepage URL.
 * If both acronyms are present and identical the score is 0.
 *
 * @param wsname workshop name, may be null
 * @param wsacronym workshop acronym, may be null
 * @param confname conference name, may be null
 * @param confacronym conference acronym, may be null
 * @param wsurl workshop homepage, may be null
 * @param confurl conference homepage, may be null
 * @return the model's score
 */
private double computeProb(String wsname, String wsacronym, String confname,
                           String confacronym, String wsurl, String confurl) {
  HashSet<String> presentFields = new HashSet<String>();
  if (isFieldPresent(wsname)) {
    presentFields.add("workshopname");
  }
  if (isFieldPresent(wsacronym)) {
    presentFields.add("workshopacronym");
  }
  if (isFieldPresent(confname)) {
    presentFields.add("conferencename");
  }
  if (isFieldPresent(confacronym)) {
    presentFields.add("conferenceacronym");
  }
  if (isFieldPresent(wsurl)) {
    presentFields.add("workshophomepage");
  }
  if (isFieldPresent(confurl)) {
    presentFields.add("conferencehomepage");
  }

  // If the workshop and conference have the same acronym we return 0.
  if (presentFields.contains("conferenceacronym")
      && presentFields.contains("workshopacronym")
      && confacronym.equals(wsacronym)) {
    return 0.0;
  }

  double prob = priors.get(presentFields);

  // NOTE(review): the checks below test only for null, whereas the presence test above also
  // rejects "" and the literal string "null". Such values therefore still take part in the
  // similarity and URL factors. Behavior is kept as-is; confirm whether this is intended.
  if (wsname != null && wsacronym != null) {
    if(DEBUG)System.err.println("computing similarity for workshop");
    prob *= similarity(wsname, wsacronym);
  } else {
    if(DEBUG)System.err.println("NOT computing similarity for workshop");
  }

  if (confname != null && confacronym != null) {
    if(DEBUG)System.err.println("computing similarity for conference");
    prob *= similarity(confname, confacronym);
  } else {
    if(DEBUG)System.err.println("NOT computing similarity for conference");
  }

  // The conference factor is applied before the workshop factor, as in the original code,
  // so the floating-point result is unchanged.
  if (confacronym != null && confurl != null) {
    prob *= urlAgreementFactor(confacronym, confurl);
  }
  if (wsacronym != null && wsurl != null) {
    prob *= urlAgreementFactor(wsacronym, wsurl);
  }
  return prob;
}

/** Returns true if the value is non-null, non-empty and not the literal string "null". */
private static boolean isFieldPresent(String value) {
  return value != null && !value.equals("null") && !value.equals("");
}

/** Returns the multiplier for an acronym that does, or does not, occur in its homepage URL. */
private static double urlAgreementFactor(String acronym, String url) {
  return acronymMatchesURL(acronym, url)
      ? probMatchFromAcronymAndURLMatch
      : probMatchFromAcronymAndURLNoMatch;
}
/**
 * Tests whether the letters of the acronym occur, case-insensitively, as a substring of the URL.
 *
 * <p>Lower-casing uses {@link Locale#ROOT} so the result does not depend on the JVM's default
 * locale (under a Turkish locale "I" would otherwise lower-case to a dotless i and never
 * match an "i" in the URL).
 *
 * <p>NOTE(review): an acronym containing no letters strips to the empty string, which is a
 * substring of every URL, so it always "matches". Kept as in the original.
 *
 * @param ac the acronym; non-letter characters are ignored
 * @param url the homepage URL
 * @return true if the stripped, lower-cased acronym is contained in the lower-cased URL
 */
private static boolean acronymMatchesURL(String ac, String url) {
  String lowerURL = url.toLowerCase(Locale.ROOT);
  String strippedAc = (new String(AcronymModel.stripAcronym(ac))).toLowerCase(Locale.ROOT);
  return lowerURL.contains(strippedAc);
}
// Factor the score is multiplied by when acronymMatchesURL reports that the acronym occurs in the URL.
private static final double probMatchFromAcronymAndURLMatch = .23934426;
// Factor the score is multiplied by when the acronym does not occur in the URL.
private static final double probMatchFromAcronymAndURLNoMatch = .052516411378;
/**
 * Finds the longest run of consecutive digits in the acronym. When several runs share the
 * maximal length, the first one is returned.
 *
 * @param acronym the acronym to scan; may be null
 * @return the longest digit run, or the empty string if the acronym is null or has no digits
 */
private static String acronymNumber(String acronym) {
  if (acronym == null) {
    return "";
  }
  String longest = "";
  int runStart = -1; // start index of the digit run being scanned, or -1 when outside a run
  // Iterate one position past the end so that a run touching the end is closed too.
  for (int i = 0; i <= acronym.length(); ++i) {
    boolean isDigit = i < acronym.length() && Character.isDigit(acronym.charAt(i));
    if (isDigit) {
      if (runStart < 0) {
        runStart = i;
      }
    } else if (runStart >= 0) {
      if (i - runStart > longest.length()) {
        longest = acronym.substring(runStart, i);
      }
      runStart = -1;
    }
  }
  return longest;
}
/**
 * Placeholder for a URL/acronym similarity score. It derives the acronym's letters and its
 * number, but does not use them yet.
 *
 * @param URL the homepage URL (currently unused)
 * @param acronym the acronym
 * @return always 0.0
 */
public static double URLSimilarity(String URL, String acronym) {
  char[] acronymLetters = stripAcronym(acronym);
  String lettersOnly = new String(acronymLetters);
  String digits = acronymNumber(acronym);
  return 0.0;
}
/**
 * Default name/acronym similarity; delegates to {@link #RichSimilarity(String, String)}.
 *
 * @param name the long-form name
 * @param acronym the acronym
 * @return the "rich similarity" score
 */
public double similarity(String name, String acronym) {
  final double richScore = RichSimilarity(name, acronym);
  return richScore;
}
/**
 * Scores a name/acronym pair by checking whether the acronym's letters can be found, in
 * order, among the first letters of the name's whitespace-separated words.
 *
 * @param name the long-form name
 * @param acronym the acronym
 * @return the "naive similarity" score: HIGH_PROB on a match, LOW_PROB otherwise
 */
public double naiveSimilarity(String name, String acronym) {
  String[] nameTokens = splitOnWhitespace(name);
  String[] acronymTokens = splitOnWhitespace(acronym);

  // All letters of the acronym, and the leading letter of every word in the name.
  char[] acronymLetters = allLetters(acronymTokens);
  char[] initials = firstLetters(nameTokens);

  final double score = firstLetterInOrderMatch(initials, acronymLetters) ? HIGH_PROB : LOW_PROB;

  if (DEBUG) {
    System.err.println("Similarity between (" + name + ") and (" + acronym + ") is " + score);
  }
  return score;
}
/**
 * Case-insensitive right-to-left match of an acronym against a name.
 *
 * <p>The acronym is scanned from its last character to its first, skipping non-letters.
 * For each acronym letter the name is scanned leftwards from the previous match for the same
 * character. The character at acronym index 0 must additionally match at position 0 of the
 * name or directly after a character that is not a letter or digit.
 *
 * <p>Lower-casing uses {@link Locale#ROOT} so that the comparison does not depend on the
 * JVM's default locale (e.g. "I" versus "i" under a Turkish locale).
 *
 * @param name the long-form name
 * @param acronym the acronym
 * @return the Hearst similarity score: 1.0 if every acronym letter was matched, otherwise 0
 */
public double HearstSimilarity(String name, String acronym) {
  char[] namechars = name.toLowerCase(Locale.ROOT).toCharArray();
  char[] acrochars = acronym.toLowerCase(Locale.ROOT).toCharArray();
  int nindex = namechars.length - 1;
  for (int aindex = acrochars.length - 1; aindex >= 0; --aindex) {
    if (!Character.isLetter(acrochars[aindex])) {
      continue;
    }
    // Move left until the characters match; for the first acronym character also keep
    // moving while the match is not at the beginning of a word.
    while ((nindex >= 0 && namechars[nindex] != acrochars[aindex])
        || (aindex == 0 && nindex > 0 && Character.isLetterOrDigit(namechars[nindex - 1]))) {
      nindex--;
    }
    if (nindex < 0) {
      // Ran off the front of the name without finding this letter.
      return 0.0;
    }
    nindex--;
  }
  return 1.0;
}
/** A real-valued feature computed from a candidate alignment of an acronym to a name. */
public static interface Feature {
  /**
   * @param alignment the alignment to evaluate
   * @return the feature's value for that alignment
   */
  double value(Alignment alignment);

  /** @return the name of the feature */
  String toString();
}
/** Fraction of the alignment's pointers that are aligned, i.e. different from -1. */
public static class LettersAligned implements Feature {
  public String toString() {
    return "LettersAligned";
  }

  public double value(Alignment alignment) {
    int aligned = 0;
    for (int target : alignment.pointers) {
      if (target != -1) {
        aligned++;
      }
    }
    final double fraction = (double) aligned / (double) alignment.pointers.length;
    if (DEBUG) {
      System.out.println("LettersAligned=" + fraction);
    }
    return fraction;
  }
}
/**
 * Number of pointers that land on the beginning of a long-form word, divided by the length
 * of the short form. A pointer is at the beginning of a word when it is 0, or when it points
 * at a letter/digit whose predecessor is not a letter/digit.
 */
public static class BegWord implements Feature {
  public String toString() {
    return "BegWord";
  }

  public double value(Alignment alignment) {
    int atWordStart = 0;
    for (int target : alignment.pointers) {
      if (target == 0) {
        atWordStart++;
        continue;
      }
      if (target < 0) {
        continue; // unaligned
      }
      char here = alignment.longForm[target];
      char before = alignment.longForm[target - 1];
      if (Character.isLetterOrDigit(here) && !Character.isLetterOrDigit(before)) {
        atWordStart++;
      }
    }
    return (double) atWordStart / (double) alignment.shortForm.length;
  }
}
/**
 * Number of pointers that land on the end of a long-form word, divided by the length of the
 * short form. A pointer is at the end of a word when it equals the last long-form index, or
 * when it points at a letter/digit whose successor is not a letter/digit.
 */
public static class EndWord implements Feature {
  public String toString() {
    return "EndWord";
  }

  public double value(Alignment alignment) {
    final int lastPosition = alignment.longForm.length - 1;
    int atWordEnd = 0;
    for (int target : alignment.pointers) {
      if (target == lastPosition) {
        atWordEnd++;
        continue;
      }
      if (target < 0) {
        continue; // unaligned
      }
      char here = alignment.longForm[target];
      char after = alignment.longForm[target + 1];
      if (Character.isLetterOrDigit(here) && !Character.isLetterOrDigit(after)) {
        atWordEnd++;
      }
    }
    return (double) atWordEnd / (double) alignment.shortForm.length;
  }
}
/**
 * Percent of letters aligned immediately after another aligned letter: counts the pointers
 * whose value is exactly one more than the preceding pointer, divided by the short-form length.
 *
 * <p>NOTE(review): an unaligned pointer (-1) followed by a pointer to position 0 also satisfies
 * the "one more than the previous" test and is counted. Confirm whether that is intended
 * before changing it, since the model weights were fitted against this feature as computed.
 */
public static class AfterAligned implements Feature {
  public String toString() {
    return "AfterAligned";
  }

  public double value(Alignment alignment) {
    int consecutive = 0;
    int slot = 1;
    while (slot < alignment.pointers.length) {
      int previousTarget = alignment.pointers[slot - 1];
      if (alignment.pointers[slot] == previousTarget + 1) {
        consecutive++;
      }
      slot++;
    }
    return (double) consecutive / (double) alignment.shortForm.length;
  }
}
/** Incrementally maintained arithmetic mean of the samples added so far. */
private static class RunningAverage {
  // Mean of all samples added so far; 0.0 before the first sample.
  double average;
  // Number of samples added so far.
  int numSamples;

  public RunningAverage() {
    average = 0.0;
    numSamples = 0;
  }

  /**
   * Adds one sample and updates the mean.
   *
   * @param sample the value to add
   */
  public void addSample(double sample) {
    // Rebuild the running sum from the old mean, add the sample, then divide by the new count.
    average = (numSamples * average) + sample;
    numSamples++;
    average /= numSamples;
  }

  /** @return the mean of the samples added so far, or 0.0 if none were added */
  public double getAverage() {
    return average;
  }

  /** @return the number of samples added so far */
  public int getNumSamples() {
    return numSamples;
  }

  /**
   * Misspelled accessor kept for backward compatibility.
   *
   * @return the number of samples added so far, as a double
   * @deprecated use {@link #getNumSamples()}
   */
  @Deprecated
  public double getNumSammples() {
    return numSamples;
  }
}
/**
 * Average number of aligned letters per word: the number of aligned pointers divided by the
 * number of words in the long form, where a word is a maximal run of letters/digits.
 * Returns 0 when the long form contains no word.
 */
public static class AlignedPerWord implements Feature {
  public String toString() {
    return "AlignedPerWord";
  }

  public double value(Alignment alignment) {
    // Count word starts: a letter/digit that follows a non-letter/digit or begins the long form.
    int words = 0;
    boolean previousWasWordChar = false;
    for (char ch : alignment.longForm) {
      boolean wordChar = Character.isLetterOrDigit(ch);
      if (wordChar && !previousWasWordChar) {
        ++words;
      }
      previousWasWordChar = wordChar;
    }

    int aligned = 0;
    for (int target : alignment.pointers) {
      if (target != -1) {
        ++aligned;
      }
    }

    if (words == 0) {
      return 0;
    }
    return (double) aligned / (double) words;
  }
}
/**
 * Number of long-form words (maximal runs of letters/digits) that contain no aligned
 * character. The value is a raw count, not a fraction.
 */
public static class WordsSkipped implements Feature {
  public String toString() {
    return "WordsSkipped";
  }

  public double value(Alignment alignment) {
    // Mark every long-form position that some pointer is aligned to.
    boolean[] aligned = new boolean[alignment.longForm.length];
    for (int target : alignment.pointers) {
      if (target != -1) {
        aligned[target] = true;
      }
    }

    int wordsSkipped = 0;
    int wordsAligned = 0;
    boolean insideWord = false;
    boolean wordHasAlignment = false;
    for (int pos = 0; pos < alignment.longForm.length; ++pos) {
      boolean wordChar = Character.isLetterOrDigit(alignment.longForm[pos]);
      if (insideWord && !wordChar) {
        // Just finished a word.
        if (wordHasAlignment) {
          wordsAligned++;
        } else {
          wordsSkipped++;
        }
        insideWord = false;
      } else if (!insideWord && wordChar) {
        // Just started a word.
        insideWord = true;
        wordHasAlignment = false;
      }
      if (aligned[pos]) {
        wordHasAlignment = true;
      }
    }
    // Account for a word that runs to the end of the long form.
    if (insideWord) {
      if (wordHasAlignment) {
        wordsAligned++;
      } else {
        wordsSkipped++;
      }
    }

    if (DEBUG) {
      System.out.println("Words skipped: " + wordsSkipped + "/" +
          (wordsSkipped + wordsAligned) );
    }
    return wordsSkipped;
  }
}
/**
 * Fraction of pointers that land on a position the hyphenator reports as a break point in
 * the lower-cased long form.
 */
public static class SyllableBoundary implements Feature {
  TeXHyphenator teXHyphenator = new TeXHyphenator();

  /**
   * @throws IOException if the hyphenator's default data cannot be loaded
   */
  public SyllableBoundary() throws IOException {
    teXHyphenator.loadDefault();
  }

  public String toString() {
    return "SyllableBoundary";
  }

  public double value(Alignment alignment) {
    String longFormText = new String(alignment.longForm);
    char[] lowered = longFormText.toLowerCase().toCharArray();
    boolean[] breakPoints = teXHyphenator.findBreakPoints(lowered);

    int onBoundary = 0;
    for (int target : alignment.pointers) {
      if (target != -1 && breakPoints[target]) {
        onBoundary++;
      }
    }
    return (double) onBoundary / (double) alignment.pointers.length;
  }
}
// Alignment features evaluated by RichSimilarity; set once in the constructor.
private final Feature[] features;
// One weight per feature, in the same order as the features array.
private final double[] weights;
// Constant term added to the weighted feature sum before the sigmoid in RichSimilarity.
private final double intercept;
/**
 * Removes every character that is not a letter from the acronym.
 *
 * @param acronym the raw acronym
 * @return a new array holding only the letters of the acronym, in their original order
 */
public static char[] stripAcronym(String acronym) {
  StringBuilder kept = new StringBuilder(acronym.length());
  for (char ch : acronym.toCharArray()) {
    if (Character.isLetter(ch)) {
      kept.append(ch);
    }
  }
  char[] letters = kept.toString().toCharArray();
  if(DEBUG) System.out.println("Converted \"" + acronym + "\" to \"" +
      (new String(letters)) + "\"\n");
  return letters;
}
/**
 * Scores a name/acronym pair with the feature-based model. Every alignment produced by the
 * {@link AlignmentFactory} for the name and the acronym's letters is scored as
 * sigmoid(weights . features + intercept), and the best score is returned.
 *
 * <p>NOTE(review): the sigmoid is computed as exp(x) / (1 + exp(x)), which yields NaN if
 * exp(x) overflows to infinity; a NaN never replaces the current maximum.
 *
 * @param name the long-form name
 * @param acronym the acronym; non-letters are stripped before aligning
 * @return the highest alignment probability, or 0.0 if there are no alignments
 */
public double RichSimilarity(String name, String acronym) {
  AlignmentFactory fact = new AlignmentFactory(
      name.toCharArray(), stripAcronym(acronym) );
  double best = 0.0;
  for (Iterator candidates = fact.getAlignments(); candidates.hasNext(); ) {
    Alignment candidate = (Alignment) candidates.next();

    double[] featureVals = new double[features.length];
    int slot = 0;
    for (Feature feature : features) {
      featureVals[slot++] = feature.value(candidate);
    }

    // Logistic model: dot product plus intercept, squashed through a sigmoid.
    double logit = dotproduct(weights, featureVals) + intercept;
    double odds = Math.exp(logit);
    double prob = odds / (1 + odds);

    if (prob > best) {
      best = prob;
    }
  }
  return best;
}
/**
 * Computes the dot product of two vectors, iterating over the length of the first.
 *
 * @param one the first vector; its length determines how many terms are summed
 * @param two the second vector; must be at least as long as the first
 * @return the sum of the element-wise products
 */
private static double dotproduct(double[] one, double[]two) {
  double total = 0.0;
  int i = 0;
  while (i < one.length) {
    double term = one[i] * two[i];
    if(DEBUG)System.out.println("product: " + term);
    total += term;
    i++;
  }
  if(DEBUG)System.out.println("sum: " + total);
  return total;
}
// Zero-length array passed to List.toArray to select the String[] result type.
private static final String[] stringArrayType = new String[0];

/**
 * Splits the text on runs of whitespace and drops empty tokens (such as the one produced
 * by leading whitespace).
 *
 * @param words the text to split
 * @return the non-empty tokens in order; an empty array if there are none
 */
private static String[] splitOnWhitespace(String words) {
  String[] firstCut = words.split("\\s+");
  // The list is typed so that toArray(String[]) returns String[]; on a raw list the call
  // erases to Object[] and the return statement does not compile.
  List<String> wordList = new ArrayList<String>(firstCut.length);
  for (String token : firstCut) {
    if (token.length() > 0) {
      wordList.add(token);
    }
  }
  return wordList.toArray(stringArrayType);
}
/**
 * Tests whether the acronym letters form a subsequence of the name's first letters, i.e.
 * whether the acronym can be built from the word initials in order, using each initial at
 * most once. The comparison is case-sensitive.
 *
 * @param nameFirstLetters the first letter of each word of the name, in order
 * @param acLetters the letters of the acronym, in order
 * @return true if every acronym letter is matched by a distinct, later-or-equal initial
 */
private static boolean firstLetterInOrderMatch(char[] nameFirstLetters, char[] acLetters) {
  int nameIdx = 0;
  for (char wanted : acLetters) {
    // Skip initials that do not match the acronym letter we are looking for.
    while (nameIdx < nameFirstLetters.length && nameFirstLetters[nameIdx] != wanted) {
      ++nameIdx;
    }
    if (nameIdx == nameFirstLetters.length) {
      return false;
    }
    // Consume the matched initial so it cannot also satisfy the next acronym letter.
    ++nameIdx;
  }
  return true;
}
/**
 * Collects every letter from the given acronym tokens, in order.
 *
 * @param acronym the whitespace-separated pieces of the acronym
 * @return all letter characters of all pieces, concatenated
 */
private static char[] allLetters(String[] acronym) {
  StringBuffer letters = new StringBuffer();
  for (String piece : acronym) {
    for (char ch : piece.toCharArray()) {
      if (Character.isLetter(ch)) {
        letters.append(ch);
      }
    }
  }
  return sbToChars(letters);
}
/**
 * Collects the first character of every word, keeping only those that are letters.
 *
 * @param name the words of the name; each must be non-empty
 * @return the leading letters of the words, in order
 */
private static char[] firstLetters(String[] name) {
  StringBuffer initials = new StringBuffer(name.length);
  for (String word : name) {
    char initial = word.charAt(0);
    if (Character.isLetter(initial)) {
      initials.append(initial);
    }
  }
  return sbToChars(initials);
}
/**
 * Copies the contents of the buffer into a new character array.
 *
 * @param sb the buffer to copy
 * @return a new array holding the buffer's characters
 */
private static char[] sbToChars(StringBuffer sb) {
  return sb.toString().toCharArray();
}
/**
 * Command-line driver: prints the Hearst, naive, rich and default similarity scores for a
 * name/acronym pair.
 *
 * @param args {@code args[0]} is the name, {@code args[1]} is the acronym
 * @throws Exception if the model cannot be constructed
 */
public static void main(String[] args) throws Exception {
  // Fail with a usage message instead of an ArrayIndexOutOfBoundsException.
  if (args.length < 2) {
    System.err.println("Usage: java edu.stanford.nlp.ie.pascal.AcronymModel <name> <acronym>");
    return;
  }
  AcronymModel am = new AcronymModel();
  String s1 = args[0];
  String s2 = args[1];
  System.out.println("Hearst: "+am.HearstSimilarity(s1, s2));
  System.out.println("naive: "+am.naiveSimilarity(s1, s2));
  System.out.println("Rich: "+am.RichSimilarity(s1, s2));
  System.out.println("default: "+am.similarity(s1, s2));
}
}