edu.berkeley.nlp.PCFGLA.SimpleLexicon Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of berkeleyparser Show documentation
The Berkeley parser analyzes the grammatical structure of natural language using probabilistic context-free grammars (PCFGs).
The newest version!
package edu.berkeley.nlp.PCFGLA;

import edu.berkeley.nlp.PCFGLA.smoothing.Smoother;
import edu.berkeley.nlp.syntax.StateSet;
import edu.berkeley.nlp.syntax.Tree;
import edu.berkeley.nlp.util.ArrayUtil;
import edu.berkeley.nlp.util.Counter;
import edu.berkeley.nlp.util.Indexer;
import edu.berkeley.nlp.util.Numberer;
import edu.berkeley.nlp.util.ScalingTools;

import java.io.Serializable;
import java.util.*;

/**
 * Simple default implementation of a lexicon, 
 * which scores word, tag pairs with P(word|tag)
 *
 * instead of dealing with words as strings we will map them to integers
 * with a wordIndexer. to further simplify things each tag will have 
 * its own mapping from wordIndex to a tag-specific index so that we
 * don't have to deal with unobserved events
 * 
 * assumes that rare words have been replaced with some unknown word token
 */
public class SimpleLexicon implements java.io.Serializable, Lexicon {
	public IntegerIndexer[] tagWordIndexer;
	public double[][][] expectedCounts; // indexed by tag, substate, word 
	public double[][][] scores; // indexed by tag, word, substate, substate
	
	public int[] wordCounter; // how many times each word occured
//	public boolean[] wordIsAmbiguous;
	
  /** A trick to allow loading of saved Lexicons even if the version has changed. */
  private static final long serialVersionUID = 2L;
  /** The number of substates for each state */
  public short[] numSubStates;
  int numStates;
  int nWords;
  
  double threshold;
  boolean isLogarithmMode;
  boolean useVarDP = false;
  
  public Indexer wordIndexer;
  Smoother smoother;
  
  // additions from the stanford parser which are needed for a better 
  // unknown word model...
  /**
   * We cache the last signature looked up, because it asks for the same one
   * many times when an unknown word is encountered!  (Note that under the
   * current scheme, one unknown word, if seen sentence-initially and
   * non-initially, will be parsed with two different signatures....)
   */
  protected transient String lastSignature = "";
  protected transient int lastSentencePosition = -1;
  protected transient String lastWordToSignaturize = "";
  private int unknownLevel = 5; //different modes for unknown words, 5 is english specific

  public void optimize() {  	
  	for (int tag=0; tag();
    this.numStates = numSubStates.length;
    this.isLogarithmMode = false;
    if (Corpus.myTreebank != Corpus.TreeBankType.WSJ || Corpus.myTreebank == Corpus.TreeBankType.BROWN)
    	unknownLevel = 4;

  }

  public double[] score(String word, short tag, int pos, boolean noSmoothing, boolean isSignature) {
  	StateSet stateSet = new StateSet(tag, (short)1, word, (short)pos, (short)(pos+1));
  	stateSet.wordIndex = -2;
  	stateSet.sigIndex = -2;
  	return score(stateSet,tag,noSmoothing,isSignature);
  }
	
	public double[] score(StateSet stateSet, short tag, boolean noSmoothing, boolean isSignature) {
		double[] res = new double[numSubStates[tag]];
		int globalWordIndex = stateSet.wordIndex;
		if (globalWordIndex==-2)
			globalWordIndex = stateSet.wordIndex = wordIndexer.indexOf(stateSet.getWord());
		if (globalWordIndex==-1)
			globalWordIndex = stateSet.sigIndex;
		if (globalWordIndex==-2) 
			globalWordIndex = stateSet.sigIndex = wordIndexer.indexOf(getSignature(stateSet.getWord(), stateSet.from));
		if (globalWordIndex==-1){
			System.out.println("unknown signature for word "+stateSet.getWord());
			Arrays.fill(res, 0.001);
			return res;
		}

		int tagSpecificWordIndex = tagWordIndexer[tag].indexOf(globalWordIndex);
		if (tagSpecificWordIndex==-1){
			if (isLogarithmMode) Arrays.fill(res, Double.NEGATIVE_INFINITY);//-80??Double.NEGATIVE_INFINITY);
//			else Arrays.fill(res, 1e-80);
			return res;
		}
		for (int i=0; i trainTree, double randomness, Lexicon oldLexicon,
  		boolean secondHalf, boolean noSmoothing, int unusedUnkThreshold) {
  	// scan data
    //for all substates that the word's preterminal tag has
    double sentenceScore = 0;
    if (randomness == -1){
      sentenceScore = trainTree.getLabel().getIScore(0);
  		if (sentenceScore==0){
  			System.out.println("Something is wrong with this tree. I will skip it.");
  			return;
  		}
    }
    int sentenceScale = trainTree.getLabel().getIScale();
  	
  	List words = trainTree.getYield();
  	List tags = trainTree.getPreTerminalYield();
  	//for all words in sentence
  	for (int position = 0; position < words.size(); position++) {
  		
  		int nSubStates = tags.get(position).numSubStates();
  		short tag = tags.get(position).getState();
  		
  		String word = words.get(position).getWord();
  		int globalWordIndex = wordIndexer.indexOf(word);
  		int tagSpecificWordIndex = tagWordIndexer[tag].indexOf(globalWordIndex);
  		
  		double[] oldLexiconScores = null;
  		if (randomness==-1)
  			oldLexiconScores = oldLexicon.score(word,tag,position,noSmoothing,false);
  		
  		StateSet currentState = tags.get(position);
      double scale = ScalingTools.calcScaleFactor(currentState.getOScale()-sentenceScale) / sentenceScore;
  		
      for (short substate=0; substate tree : trainTrees){
  		List words = tree.getYield();
  		for (StateSet word : words){
				String sig = word.getWord();
				wordIndexer.add(sig);
  		}
  	}
  	tagWordIndexer = new IntegerIndexer[numStates];
  	for (int tag=0; tag tree : trainTrees){
  		List tags = tree.getPreTerminalYield();
  		List words = tree.getYield();
  		int ind = 0;
  		for (StateSet word : words){
				String sig = word.getWord();
				wordCounter[wordIndexer.indexOf(sig)]++;
				tagWordIndexer[tags.get(ind).getState()].add(wordIndexer.indexOf(sig));
  			ind++;
  		}  		
  	}
  	expectedCounts = new double[numStates][][];
  	scores = new double[numStates][][];
  	for (int tag=0; tag=counts[i]) { 
//				newNumSubStates[i]=numSubStates[i];
//				} 
//			else{
				newNumSubStates[i] = (short)(numSubStates[i] * 2);
//			}
		}
		splitLex.numSubStates = newNumSubStates;
		double[][][] newScores = new double[scores.length][][];
		double[][][] newExpCounts = new double[scores.length][][];
  	for (int tag=0; tag 0) {
            sb.append("-CAPS");
          } else if (hasLower) { // (Character.isLowerCase(ch0)) {
            sb.append("-LC");
          }
          if (hasDigit) {
            sb.append("-NUM");
          }
          if (hasDash) {
            sb.append("-DASH");
          }
          if (lowered.endsWith("s") && wlen >= 3) {
            // here length 3, so you don't miss out on ones like 80s
            char ch2 = lowered.charAt(wlen - 2);
            // not -ess suffixes or greek/latin -us, -is
            if (ch2 != 's' && ch2 != 'i' && ch2 != 'u') {
              sb.append("-s");
            }
          } else if (word.length() >= 5 && !hasDash && !(hasDigit && numCaps > 0)) {
            // don't do for very short words;
            // Implement common discriminating suffixes
/*          	if (Corpus.myLanguage==Corpus.GERMAN){
          		sb.append(lowered.substring(lowered.length()-1));
          	}else{*/
            if (lowered.endsWith("ed")) {
              sb.append("-ed");
            } else if (lowered.endsWith("ing")) {
              sb.append("-ing");
            } else if (lowered.endsWith("ion")) {
              sb.append("-ion");
            } else if (lowered.endsWith("er")) {
              sb.append("-er");
            } else if (lowered.endsWith("est")) {
              sb.append("-est");
            } else if (lowered.endsWith("ly")) {
              sb.append("-ly");
            } else if (lowered.endsWith("ity")) {
              sb.append("-ity");
            } else if (lowered.endsWith("y")) {
              sb.append("-y");
            } else if (lowered.endsWith("al")) {
              sb.append("-al");
              // } else if (lowered.endsWith("ble")) {
              // sb.append("-ble");
              // } else if (lowered.endsWith("e")) {
              // sb.append("-e");
            }
          }
          break;
        }

      case 4:
        {
          boolean hasDigit = false;
          boolean hasNonDigit = false;
          boolean hasLetter = false;
          boolean hasLower = false;
          boolean hasDash = false;
          boolean hasPeriod = false;
          boolean hasComma = false;
          for (int i = 0; i < word.length(); i++) {
            char ch = word.charAt(i);
            if (Character.isDigit(ch)) {
              hasDigit = true;
            } else {
              hasNonDigit = true;
              if (Character.isLetter(ch)) {
                hasLetter = true;
                if (Character.isLowerCase(ch) || Character.isTitleCase(ch)) {
                  hasLower = true;
                }
              } else {
                if (ch == '-') {
                  hasDash = true;
                } else if (ch == '.') {
                  hasPeriod = true;
                } else if (ch == ',') {
                  hasComma = true;
                }
              }
            }
          }
          // 6 way on letters
          if (Character.isUpperCase(word.charAt(0)) || Character.isTitleCase(word.charAt(0))) {
            if (!hasLower) {
              sb.append("-AC");
            } else if (loc == 0) {
              sb.append("-SC");
            } else {
              sb.append("-C");
            }
          } else if (hasLower) {
            sb.append("-L");
          } else if (hasLetter) {
            sb.append("-U");
          } else {
            // no letter
            sb.append("-S");
          }
          // 3 way on number
          if (hasDigit && !hasNonDigit) {
            sb.append("-N");
          } else if (hasDigit) {
            sb.append("-n");
          }
          // binary on period, dash, comma
          if (hasDash) {
            sb.append("-H");
          }
          if (hasPeriod) {
            sb.append("-P");
          }
          if (hasComma) {
            sb.append("-C");
          }
          if (word.length() > 3) {
            // don't do for very short words: "yes" isn't an "-es" word
            // try doing to lower for further densening and skipping digits
            char ch = word.charAt(word.length() - 1);
            if (Character.isLetter(ch)) {
              sb.append("-");
              sb.append(Character.toLowerCase(ch));
            }
          }
          break;
        }

      case 3:
        {
          // This basically works right, except note that 'S' is applied to all
          // capitalized letters in first word of sentence, not just first....
          sb.append("-");
          char lastClass = '-';  // i.e., nothing
          char newClass;
          int num = 0;
          for (int i = 0; i < word.length(); i++) {
            char ch = word.charAt(i);
            if (Character.isUpperCase(ch) || Character.isTitleCase(ch)) {
              if (loc == 0) {
                newClass = 'S';
              } else {
                newClass = 'L';
              }
            } else if (Character.isLetter(ch)) {
              newClass = 'l';
            } else if (Character.isDigit(ch)) {
              newClass = 'd';
            } else if (ch == '-') {
              newClass = 'h';
            } else if (ch == '.') {
              newClass = 'p';
            } else {
              newClass = 's';
            }
            if (newClass != lastClass) {
              lastClass = newClass;
              sb.append(lastClass);
              num = 1;
            } else {
              if (num < 2) {
                sb.append('+');
              }
              num++;
            }
          }
          if (word.length() > 3) {
            // don't do for very short words: "yes" isn't an "-es" word
            // try doing to lower for further densening and skipping digits
            char ch = Character.toLowerCase(word.charAt(word.length() - 1));
            sb.append('-');
            sb.append(ch);
          }
          break;
        }

      case 2:
        {
          // {-ALLC, -INIT, -UC, -LC, zero} +
          // {-DASH, zero} +
          // {-NUM, -DIG, zero} +
          // {lowerLastChar, zeroIfShort}
          boolean hasDigit = false;
          boolean hasNonDigit = false;
          boolean hasLower = false;
          for (int i = 0; i < word.length(); i++) {
            char ch = word.charAt(i);
            if (Character.isDigit(ch)) {
              hasDigit = true;
            } else {
              hasNonDigit = true;
              if (Character.isLetter(ch)) {
                if (Character.isLowerCase(ch) || Character.isTitleCase(ch)) {
                  hasLower = true;
                }
              }
            }
          }
          if (Character.isUpperCase(word.charAt(0)) || Character.isTitleCase(word.charAt(0))) {
            if (!hasLower) {
              sb.append("-ALLC");
            } else if (loc == 0) {
              sb.append("-INIT");
            } else {
              sb.append("-UC");
            }
          } else if (hasLower) {   // if (Character.isLowerCase(word.charAt(0))) {
            sb.append("-LC");
          }
          // no suffix = no (lowercase) letters
          if (word.indexOf('-') >= 0) {
            sb.append("-DASH");
          }
          if (hasDigit) {
            if (!hasNonDigit) {
              sb.append("-NUM");
            } else {
              sb.append("-DIG");
            }
          } else if (word.length() > 3) {
            // don't do for very short words: "yes" isn't an "-es" word
            // try doing to lower for further densening and skipping digits
            char ch = word.charAt(word.length() - 1);
            sb.append(Character.toLowerCase(ch));
          }
          // no suffix = short non-number, non-alphabetic
          break;
        }

      default:
        sb.append("-");
        sb.append(word.substring(Math.max(word.length() - 2, 0), word.length()));
        sb.append("-");
        if (Character.isLowerCase(word.charAt(0))) {
          sb.append("LOWER");
        } else {
          if (Character.isUpperCase(word.charAt(0))) {
            if (loc == 0) {
              sb.append("INIT");
            } else {
              sb.append("UPPER");
            }
          } else {
            sb.append("OTHER");
          }
        }
    } // end switch (unknownLevel)
    // System.err.println("Summarized " + word + " to " + sb.toString());
    return sb.toString();
  } // end getSignature()

  
  public String toString() {
  	StringBuffer sb = new StringBuffer();
  	Numberer tagNumberer = Numberer.getGlobalNumberer("tags");
  	for (int tag=0; tag tree : trainTrees){
  		List words = tree.getYield();
//  		List tags = tree.getPreTerminalYield();
//  		int ind = 0;
  		for (StateSet word : words){
  			word.wordIndex = wordIndexer.indexOf(word.getWord());
  			word.sigIndex = -1;
//  			short tag = tags.get(ind).getState();
////				if (wordIsAmbiguous[word.wordIndex]) {
//					String sig = getSignature(word.getWord(), ind);
//					wordIndexer.add(sig);
//	  			word.sigIndex = (short)wordIndexer.indexOf(sig);
//					tagWordIndexer[tag].add(wordIndexer.indexOf(sig));
////				}
////				else { word.sigIndex = -1; }
//				ind++;
  		}  		
  	}

	}


	/*
	public void clearMapping() {
		toBeIgnored = null;
		linearIndex = null;
	}

*/
	public static class IntegerIndexer implements Serializable{
		private int[] indexTo;
		private int[] indexFrom;
		private int n;
		
		IntegerIndexer(int capacity){
			indexTo = new int[capacity];
			indexFrom = new int[capacity];
			Arrays.fill(indexTo, -1);
			Arrays.fill(indexFrom, -1);
			n = 0;
		}
		
		public void add(int i){
			if (i==-1) return;
			if (indexTo[i]==-1){
				indexTo[i] = n;
				indexFrom[n] = i;
				n++;
			}
		}
		
		public int get(int i){
			if (i < indexFrom.length) return indexFrom[i];
			else return -1;
		}
		
		public int indexOf(int i){
			if (i < indexTo.length) return indexTo[i];
			else return -1;
		}
		
		public int size(){
			return n;
		}
		
		public IntegerIndexer copy(){
			IntegerIndexer copy = new IntegerIndexer(indexFrom.length);
			copy.n = n;
			copy.indexFrom = this.indexFrom.clone();
			copy.indexTo = this.indexTo.clone();
			return copy;
		}
	}


	/* (non-Javadoc)
	 * @see edu.berkeley.nlp.PCFGLA.Lexicon#computeScores()
	 */
	public void explicitlyComputeScores(int finalLevel) {
		// TODO Auto-generated method stub
		
	}

	public Counter getWordCounter() {
		return null;
	}

	public void tieRareWordStats(int threshold) {
		return;
	}

	
}