edu.stanford.nlp.ie.pascal.AcronymModel Maven / Gradle / Ivy

package edu.stanford.nlp.ie.pascal;

import java.util.*;
import java.io.*;
 /**
  * Scores Pascal challenge workshop information templates.
  * This score reflects which fields are present/absent, how well acronyms
  * agree with the names and URLs they correspond to.
  *
  * @author Jamie Nicolson
  */
public class AcronymModel implements RelationalModel {

  
  private static final double HIGH_PROB = 1.0;
  private static final double LOW_PROB = 0.0;
  private static boolean DEBUG= false;

  private static final String acronymStatistics =
    "workshopname workshopacronym workshophomepage conferencename conferenceacronym conferencehomepage\n" +
    "0.00274725274725275\n" +
    "0.00274725274725275\n" +
    "0.00274725274725275\n" +
    "0.00274725274725275\n" +
    "0.00274725274725275\n" +
    "0.00274725274725275\n" +
    "0.00274725274725275\n" +
    "0.00274725274725275\n" +
    "0.00274725274725275\n" +
    "0.00274725274725275\n" +
    "0.00274725274725275\n" +
    "0.00274725274725275\n" +
    "0.00549450549450549\n" +
    "0.00274725274725275\n" +
    "0.00274725274725275\n" +
    "0.00274725274725275\n" +
    "0.00274725274725275\n" +
    "0.00274725274725275\n" +
    "0.00274725274725275\n" +
    "0.00274725274725275\n" +
    "0.00274725274725275\n" +
    "0.00274725274725275\n" +
    "0.00274725274725275\n" +
    "0.00274725274725275\n" +
    "0.00274725274725275\n" +
    "0.00274725274725275\n" +
    "0.00274725274725275\n" +
    "0.00274725274725275\n" +
    "0.00274725274725275\n" +
    "0.00274725274725275\n" +
    "0.00274725274725275\n" +
    "0.00274725274725275\n" +
    "0.0521978021978022\n" +
    "0.00274725274725275\n" +
    "0.0357142857142857\n" +
    "0.00549450549450549\n" +
    "0.021978021978022\n" +
    "0.010989010989011\n" +
    "0.0357142857142857\n" +
    "0.0302197802197802\n" +
    "0.0824175824175824\n" +
    "0.00549450549450549\n" +
    "0.043956043956044\n" +
    "0.010989010989011\n" +
    "0.021978021978022\n" +
    "0.00549450549450549\n" +
    "0.0521978021978022\n" +
    "0.0412087912087912\n" +
    "0.0467032967032967\n" +
    "0.00274725274725275\n" +
    "0.010989010989011\n" +
    "0.00274725274725275\n" +
    "0.00274725274725275\n" +
    "0.00274725274725275\n" +
    "0.0137362637362637\n" +
    "0.00824175824175824\n" +
    "0.167582417582418\n" +
    "0.00549450549450549\n" +
    "0.0494505494505494\n" +
    "0.00824175824175824\n" +
    "0.0164835164835165\n" +
    "0.00549450549450549\n" +
    "0.0604395604395604\n" +
    "0.0467032967032967\n";
                                                                                
  private Prior priors;

  /**
   * Scores the partial template containing only the fields relevant to the score.
   * @param temp the {@link InfoTemplate} to be scored.
   * @return the model's score
   */
  public double computeProb(InfoTemplate temp){
    return computeProb(temp.wname,temp.wacronym,temp.cname,temp.cacronym,
      temp.whomepage, temp.chomepage);
  }
/**
 * Scores the {@link PascalTemplate} using the fields it contains which are relevant to the score.
 * (Ignores location and date fields.)
 * @param temp the full {@link PascalTemplate} to be scored
 * @return the model's score
 */

  public double computeProb(PascalTemplate temp) {
    double prob = 1.0;

    String wsname = temp.getValue("workshopname");
    String confname = temp.getValue("conferencename");
    String wsacronym = temp.getValue("workshopacronym");
    String confacronym = temp.getValue("conferenceacronym");
    String wsurl = temp.getValue("workshophomepage");
    String confurl = temp.getValue("conferencehomepage");
    return computeProb(wsname, wsacronym,confname,confacronym, wsurl, confurl);
  }

   /**
    * @throws IOException if the acronym statistics/weights can't be read from file.
    */
   public AcronymModel() throws IOException {
     priors = new Prior(new BufferedReader(new StringReader(acronymStatistics)));
     features = new Feature[]{new AcronymModel.LettersAligned(), new AcronymModel.BegWord(), new AcronymModel.EndWord(), new AcronymModel.AfterAligned(), new AcronymModel.AlignedPerWord(), new AcronymModel.WordsSkipped(), new AcronymModel.SyllableBoundary()};
     weights = new double[]{// here's weights from a bunch of training examples
       //-4.1004, 18.4127, 0.1789, 16.3189, 0.8818, -0.0725, -0.6550
       //-12.4082, 18.3893, 2.1826, 18.8487, 0.5042, -0.1231, 1.8876
       -11.8880, 14.4534, -2.6316, 24.1838, -2.2320, -0.2508, 4.3501

     };
     //intercept = -14.1449;
     //intercept = -7.4882;
     intercept = -2.2062;
   }

  private double computeProb(String wsname, String wsacronym, String confname,
                            String confacronym, String wsurl, String confurl){

    HashSet presentFields = new HashSet();
    if( wsname != null && !wsname.equals("null") && !wsname.equals("") )
      presentFields.add("workshopname");
    if( wsacronym != null && !wsacronym.equals("null") && !wsacronym.equals(""))
      presentFields.add("workshopacronym");
    if( confname != null && !confname.equals("null")
        && !confname.equals(""))
      presentFields.add("conferencename");
    if( confacronym != null && !confacronym.equals("null")
        && !confacronym.equals(""))
      presentFields.add("conferenceacronym");
    if( wsurl != null && !wsurl.equals("null") && !wsurl.equals(""))
      presentFields.add("workshophomepage");
    if( confurl != null && !confurl.equals("null") && !confurl.equals(""))
      presentFields.add("conferencehomepage");

    //if the workshop and conference have the same acronym we return 0.
    if(presentFields.contains("conferenceacronym") &&
       presentFields.contains("workshopacronym") &&
       confacronym.equals(wsacronym)){

      return 0.0;
    }

    double prob = priors.get(presentFields);
    //System.out.println("Setting prior to " + prob + " based on the following "+
    // "fields being present: " + presentFields.toString());
    if( wsname != null && wsacronym != null ) {
      if(DEBUG)System.err.println("computing similarity for workshop");
      prob *= similarity(wsname, wsacronym);
    } else {
      if(DEBUG)System.err.println("NOT computing similarity for workshop");
    }

    if( confname != null && confacronym != null ) {
      if(DEBUG)System.err.println("computing similarity for conference");
      prob *= similarity(confname, confacronym);
    } else {
      if(DEBUG)System.err.println("NOT computing similarity for conference");
    }

    if( confacronym != null && confurl != null ) {
      if( acronymMatchesURL(confacronym, confurl) ) {
        prob *= probMatchFromAcronymAndURLMatch;
      } else {
        prob *= probMatchFromAcronymAndURLNoMatch;
      }
    }

    if( wsacronym != null && wsurl != null ) {
      if( acronymMatchesURL(wsacronym, wsurl) ) {
        prob *= probMatchFromAcronymAndURLMatch;
      } else {
        prob *= probMatchFromAcronymAndURLNoMatch;
      }
    }
    return prob;
  }

  private static boolean acronymMatchesURL(String ac, String url) {
    String lowerURL = url.toLowerCase();
    String strippedAc = (new String(AcronymModel.stripAcronym(ac))).toLowerCase();

    return lowerURL.indexOf(strippedAc) != -1;
  }

   private static final double probMatchFromAcronymAndURLMatch = .23934426;
   private static final double probMatchFromAcronymAndURLNoMatch = .052516411378;

   /**
    * Finds longest subsequent string of digits. Returns empty string
    * if there aren't any digits.
    */
   private static String acronymNumber(String acronym) {
     return "";
   }

   public static double URLSimilarity(String URL, String acronym) {
     String strippedAc = new String(stripAcronym(acronym));
     String acNumber = acronymNumber(acronym);
     return 0.0;
   }

   /**
    * @return the "rich similarity" score
    */
   public double similarity(String name, String acronym) {
     return RichSimilarity(name, acronym);
   }
     /**
      *
      * @return the "naive similarity" score
      */
   public double naiveSimilarity(String name, String acronym) {
     double similarity = LOW_PROB;
     String[] nameWords = splitOnWhitespace(name);
     String[] acronymWords = splitOnWhitespace(acronym);

     // first put together the letters in the acronym
     char[] acLetters = allLetters(acronymWords);

     // first let's try pulling the first letters from the name, and combining them to get the acronym
     char[] nameFirstLetters = firstLetters(nameWords);

     if (firstLetterInOrderMatch(nameFirstLetters, acLetters)) {
       // the letters in acronym can be constructed from the first letters in the name, in order
       similarity = HIGH_PROB;
     }

     if (DEBUG) {
       System.err.println("Similarity between (" + name + ") and (" + acronym + ") is " + similarity);
     }
     return similarity;
   }

   /**
    *
    * @return the Hearst similarity score
    */
   public double HearstSimilarity(String name, String acronym) {
     char[] namechars = name.toLowerCase().toCharArray();
     char[] acrochars = acronym.toLowerCase().toCharArray();

     int nindex = namechars.length - 1;
     for (int aindex = acrochars.length - 1; aindex >= 0; --aindex) {
       if (!Character.isLetter(acrochars[aindex])) {
         continue;
       }
       while ((nindex >= 0 && namechars[nindex] != acrochars[aindex]) || (aindex == 0 && nindex > 0 && Character.isLetterOrDigit(namechars[nindex - 1]))) {
         nindex--;
       }
       if (nindex < 0) {
         //  System.err.println("\"" + name + "\" does NOT match \"" +
         //     acronym + "\"\n");
         return 0;
       }

       nindex--;
     }

     //System.err.println("\"" + name + "\" matches \"" + acronym + "\"\n");
     return 1.0;
   }

   public static interface Feature {
     public double value(Alignment alignment);

     public String toString();
   }

   public static class LettersAligned implements Feature {
     public String toString() {
       return "LettersAligned";
     };
     public double value(Alignment alignment) {
       int numAligned = 0;
       for (int i = 0; i < alignment.pointers.length; ++i) {
         if (alignment.pointers[i] != -1) {
           numAligned++;
         }
       }
       double pct = (double) numAligned / (double) alignment.pointers.length;
       if (DEBUG)
         System.out.println("LettersAligned=" + pct);
       return pct;
     }
   }

    public static class BegWord implements Feature {
        public String toString() { return "BegWord"; };
        public double value(Alignment alignment) {
            int begAligned = 0;
            for( int s = 0; s < alignment.pointers.length; ++s) {
                int idx = alignment.pointers[s];
                if( idx == 0 ) {
                    begAligned++;
                } else if( idx > 0) {
                    char cur = alignment.longForm[idx];
                    char prev = alignment.longForm[idx-1];
                    if( !Character.isLetterOrDigit(prev) &&
                        Character.isLetterOrDigit(cur) )
                    {
                        begAligned++;
                    }
                }
            }
            return (double)begAligned / (double)alignment.shortForm.length;
        }
    }

    public static class EndWord implements Feature {
        public String toString() { return "EndWord"; };
        public double value(Alignment alignment) {
            int endAligned = 0;
            for( int s = 0; s < alignment.pointers.length; ++s) {
                int idx = alignment.pointers[s];
                if( idx == alignment.longForm.length-1 ) {
                    endAligned++;
                } else if( idx >= 0) {
                    char cur = alignment.longForm[idx];
                    char next = alignment.longForm[idx+1];
                    if( !Character.isLetterOrDigit(next) &&
                        Character.isLetterOrDigit(cur) )
                    {
                        endAligned++;
                    }
                }
            }
            return (double)endAligned / (double)alignment.shortForm.length;
        }
    }

    /**
     * Percent of letters aligned immediately after another aligned letter.
     */
    public static class AfterAligned implements Feature {
      public String toString() { return "AfterAligned"; }

      public double value(Alignment alignment) {
        int numAfter = 0;
        for( int i = 1; i < alignment.pointers.length; ++i) {
          if( alignment.pointers[i] == alignment.pointers[i-1] + 1 ) {
            numAfter++;
          }
        }
        return (double)numAfter / (double)alignment.shortForm.length;
      }
    }

    private static class RunningAverage {
      double average;
      int numSamples;

      public RunningAverage() {
        average = 0.0;
        numSamples = 0;
      }

      public void addSample(double sample) {
        average = (numSamples * average) + sample;
        numSamples++;
        average /= numSamples;
      }

      public double getAverage() {
        return average;
      }

      public double getNumSammples() {
        return numSamples;
      }
    }

    /**
     * Average number of aligned letters per word.
     */
    public static class AlignedPerWord implements Feature {
      public String toString() { return "AlignedPerWord"; }

      public double value(Alignment alignment) {
/*
        RunningAverage alignedPerWord = new RunningAverage();
        boolean inWord = false;
        int alignCount = 0;
        int sidx = 0;
        for(int lidx = 0; lidx < alignment.longForm.length; ++lidx ) {
          char cur = alignment.longForm[lidx];
          if( Character.isLetterOrDigit(cur) && !inWord ) {
            // beginning of word
            inWord = true;
          } else if( inWord && !Character.isLetterOrDigit(cur) ) {
            // end of word
            alignedPerWord.addSample(alignCount);
            alignCount = 0;
            inWord = false;
          }

          while( sidx < alignment.pointers.length &&
                 alignment.pointers[sidx] < lidx )
            sidx++;

          if( sidx < alignment.pointers.length &&
              alignment.pointers[sidx] == lidx && inWord)
          {
            alignCount++;
          }
        }
        if( inWord ) {
          // end of last word
          alignedPerWord.addSample(alignCount);
        }

        return alignedPerWord.getAverage();
*/
        boolean inWord = false;
        int wordCount = 0;
        for(int lidx = 0; lidx < alignment.longForm.length; ++lidx ) {
          char cur = alignment.longForm[lidx];
          if( Character.isLetterOrDigit(cur) && !inWord ) {
            // beginning of word
            ++wordCount;
            inWord = true;
          } else if( inWord && !Character.isLetterOrDigit(cur) ) {
            // end of word
            inWord = false;
          }
        }
        int alignCount = 0;
        for( int sidx = 0; sidx < alignment.pointers.length; ++sidx) {
          if( alignment.pointers[sidx] != -1 ) {
            ++alignCount;
          }
        }
        if( wordCount == 0 ) {
          return 0;
        } else {
          return (double)alignCount / (double)wordCount;
        }
      }
    }

    public static class WordsSkipped implements Feature {
        public String toString() { return "WordsSkipped"; };
        public double value(Alignment alignment) {
            int wordsSkipped = 0;
            int wordsAligned = 0;
            boolean inWord = false;
            boolean gotAlignedChar = false;
            boolean []isAligned = new boolean[alignment.longForm.length];
            for( int s = 0; s < alignment.pointers.length; ++s ) {
                if( alignment.pointers[s] != -1 ) {
                    isAligned[alignment.pointers[s]] = true;
                }
            }
            for( int l = 0; l < alignment.longForm.length; ++l ) {
                char cur = alignment.longForm[l];
                if( inWord ) {
                    if( !Character.isLetterOrDigit(cur)) {
                        // just finished a word
                        if( gotAlignedChar ) {
                            wordsAligned++;
                        } else {
                            wordsSkipped++;
                        }
                        inWord = false;
                    }
                } else {
                    if( Character.isLetterOrDigit(cur)) {
                        inWord = true;
                        gotAlignedChar = false;
                    }
                }
                if( isAligned[l] ) gotAlignedChar = true;
            }
            if( inWord ) {
                if( gotAlignedChar ) {
                    wordsAligned++;
                } else {
                    wordsSkipped++;
                }
            }
            if(DEBUG)System.out.println("Words skipped: " + wordsSkipped + "/" +
                (wordsSkipped + wordsAligned) );
            return wordsSkipped;
        }
    }

    public static class SyllableBoundary implements Feature {
        public String toString() { return "SyllableBoundary"; };
        TeXHyphenator teXHyphenator = new TeXHyphenator();
        public SyllableBoundary() throws IOException {
          teXHyphenator.loadDefault();
        }
        public double value(Alignment alignment) {
          char [] lcLongForm =
            (new String(alignment.longForm)).toLowerCase().toCharArray();
          boolean [] breakPoints = teXHyphenator.findBreakPoints(lcLongForm);
          int numSylAligned = 0;
          for( int i = 0; i < alignment.pointers.length; ++i ) {
            if( alignment.pointers[i] != -1 &&
                breakPoints[alignment.pointers[i]] )
            {
              numSylAligned++;
            }
          }
          return (double)numSylAligned / (double)alignment.pointers.length;
        }
    }

    private final Feature[] features;

    private final double[] weights;
    private final double intercept;

    public static char[] stripAcronym(String acronym) {
        char [] raw = acronym.toCharArray();
        char [] firstTry = new char[raw.length];
        int outIdx = 0;
        for( int inIdx = 0; inIdx < raw.length; ++inIdx) {
            if( Character.isLetter(raw[inIdx]) ) {
                firstTry[outIdx++] = raw[inIdx];
            }
        }
        if( outIdx == firstTry.length ) {
          if(DEBUG)  System.out.println("Converted \"" + acronym + "\" to \"" +
                (new String(firstTry)) + "\"\n");
            return firstTry;
        } else {
            char [] polished = new char[outIdx];
            System.arraycopy(firstTry, 0, polished, 0, outIdx);
          if(DEBUG)  System.out.println("Converted \"" + acronym + "\" to \"" +
                (new String(polished)) + "\"\n");
            return polished;
        }
    }
        

    public double RichSimilarity(String name, String acronym) {
        AlignmentFactory fact = new AlignmentFactory(
            name.toCharArray(), stripAcronym(acronym) );

        double maxprob = 0.0;
        Iterator iter = fact.getAlignments();
        while(iter.hasNext()) {
            Alignment align = (Alignment) iter.next();

            double [] featureVals = new double[features.length];
            for( int f = 0; f < features.length; ++f) {
                featureVals[f] = features[f].value(align);
            }

            // compute dotproduct and sigmoid
            double dotprod = dotproduct(weights, featureVals) + intercept;
            double exp = Math.exp(dotprod);
            double prob = exp / (1 + exp);

            // align.print();
            //System.out.println("Prob: " + prob + "\n-----------\n");

            if( prob > maxprob ){
                maxprob = prob;
            }
        }

        return maxprob;
    }

    private static double dotproduct(double[] one, double[]two) {
        double sum = 0.0;
        for( int i = 0; i < one.length; ++i) {
            double product = one[i] * two[i];
            if(DEBUG)System.out.println("product: " + product);
            sum += product;
        }
        if(DEBUG)System.out.println("sum: " + sum);
        return sum;
    }

    private static final String[] stringArrayType = new String[0];

    private static String[] splitOnWhitespace(String words) {
        String[] firstCut = words.split("\\s+");

        ArrayList wordList = new ArrayList(firstCut.length);
        for( int i = 0; i < firstCut.length; ++i ) {
            if( firstCut[i].length() > 0 ) {
                wordList.add(firstCut[i]);
            }
        }
        return wordList.toArray(stringArrayType);
    }

    private static boolean firstLetterInOrderMatch(char[] nameFirstLetters, char[] acLetters) {
        int nameIdx = 0;
        int acIdx = 0;

        for( ; acIdx < acLetters.length; ++acIdx) {
            while( nameIdx < nameFirstLetters.length && nameFirstLetters[nameIdx] != acLetters[acIdx] ) {
                ++nameIdx;
            }
            if( nameIdx == nameFirstLetters.length ) {
                return false;
            }
        }
        return true;
    }

    private static char[] allLetters(String[] acronym) {
        StringBuffer sb = new StringBuffer();
        for( int s = 0; s < acronym.length; ++s ) {
            String acr = acronym[s];
            for(int c = 0; c < acr.length(); ++c ) {
                char ch = acr.charAt(c);
                if( Character.isLetter( ch ) ) {
                    sb.append(ch);
                }
            }
        }
        return sbToChars(sb);
    }

    private static char[] firstLetters(String[] name) {
        StringBuffer sb = new StringBuffer(name.length);
        for( int s = 0; s < name.length; ++s) {
            char c = name[s].charAt(0);
            if( Character.isLetter(c) ) {
                sb.append(c);
            }
        }
        return sbToChars(sb);
    }

    private static char[] sbToChars(StringBuffer sb) {
        char[] letters = new char[sb.length()];
        sb.getChars(0, sb.length(), letters, 0);
        return letters;
    }

  public static void main(String[] args) throws Exception {

    AcronymModel am = new AcronymModel();
    String s1 = args[0];
    String s2 = args[1];
    System.out.println("Hearst:  "+am.HearstSimilarity(s1, s2));
    System.out.println("naive:   "+am.naiveSimilarity(s1, s2));
    System.out.println("Rich:    "+am.RichSimilarity(s1, s2));
    System.out.println("default: "+am.similarity(s1, s2));

  }

}