All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.berkeley.nlp.PCFGLA.SimpleLexicon Maven / Gradle / Ivy

Go to download

The Berkeley parser analyzes the grammatical structure of natural language using probabilistic context-free grammars (PCFGs).

The newest version!
package edu.berkeley.nlp.PCFGLA;

import edu.berkeley.nlp.PCFGLA.smoothing.Smoother;
import edu.berkeley.nlp.syntax.StateSet;
import edu.berkeley.nlp.syntax.Tree;
import edu.berkeley.nlp.util.ArrayUtil;
import edu.berkeley.nlp.util.Counter;
import edu.berkeley.nlp.util.Indexer;
import edu.berkeley.nlp.util.Numberer;
import edu.berkeley.nlp.util.ScalingTools;

import java.io.Serializable;
import java.util.*;

/**
 * Simple default implementation of a lexicon,
 * which scores (word, tag) pairs with P(word|tag).
 *
 * Instead of dealing with words as strings, we map them to integers
 * with a wordIndexer. To further simplify things, each tag has
 * its own mapping from wordIndex to a tag-specific index, so that we
 * don't have to deal with unobserved events.
 *
 * Assumes that rare words have been replaced with some unknown-word token.
 */
public class SimpleLexicon implements java.io.Serializable, Lexicon {
	public IntegerIndexer[] tagWordIndexer;
	public double[][][] expectedCounts; // indexed by tag, substate, word 
	public double[][][] scores; // indexed by tag, word, substate, substate
	
	public int[] wordCounter; // how many times each word occurred
//	public boolean[] wordIsAmbiguous;
	
  /** A trick to allow loading of saved Lexicons even if the version has changed. */
  private static final long serialVersionUID = 2L;
  /** The number of substates for each state */
  public short[] numSubStates;
  int numStates;
  int nWords;
  
  double threshold;
  boolean isLogarithmMode;
  boolean useVarDP = false;
  
  public Indexer wordIndexer;
  Smoother smoother;
  
  // additions from the stanford parser which are needed for a better 
  // unknown word model...
  /**
   * We cache the last signature looked up, because it asks for the same one
   * many times when an unknown word is encountered!  (Note that under the
   * current scheme, one unknown word, if seen sentence-initially and
   * non-initially, will be parsed with two different signatures....)
   */
  protected transient String lastSignature = "";
  protected transient int lastSentencePosition = -1;
  protected transient String lastWordToSignaturize = "";
  private int unknownLevel = 5; //different modes for unknown words, 5 is english specific

  public void optimize() {  	
  	for (int tag=0; tag();
    this.numStates = numSubStates.length;
    this.isLogarithmMode = false;
    if (Corpus.myTreebank != Corpus.TreeBankType.WSJ || Corpus.myTreebank == Corpus.TreeBankType.BROWN)
    	unknownLevel = 4;

  }

  /**
   * Convenience overload that scores a raw word string for a tag.
   *
   * Wraps the word in a freshly built single-substate {@link StateSet} spanning
   * positions {@code [pos, pos+1)} and hands it to the StateSet-based overload.
   *
   * @param word the surface word to score
   * @param tag the preterminal tag
   * @param pos position of the word in the sentence (narrowed to {@code short})
   * @param noSmoothing passed through unchanged to the StateSet overload
   * @param isSignature passed through unchanged to the StateSet overload
   * @return whatever the StateSet overload returns for the wrapped word
   */
  public double[] score(String word, short tag, int pos, boolean noSmoothing, boolean isSignature) {
    final short spanStart = (short) pos;
    final short spanEnd = (short) (pos + 1);
    final StateSet wrapped = new StateSet(tag, (short) 1, word, spanStart, spanEnd);
    // -2 is the value the StateSet overload tests for before looking the
    // word / signature index up in the wordIndexer.
    wrapped.sigIndex = -2;
    wrapped.wordIndex = -2;
    return score(wrapped, tag, noSmoothing, isSignature);
  }
	
	public double[] score(StateSet stateSet, short tag, boolean noSmoothing, boolean isSignature) {
		double[] res = new double[numSubStates[tag]];
		int globalWordIndex = stateSet.wordIndex;
		if (globalWordIndex==-2)
			globalWordIndex = stateSet.wordIndex = wordIndexer.indexOf(stateSet.getWord());
		if (globalWordIndex==-1)
			globalWordIndex = stateSet.sigIndex;
		if (globalWordIndex==-2) 
			globalWordIndex = stateSet.sigIndex = wordIndexer.indexOf(getSignature(stateSet.getWord(), stateSet.from));
		if (globalWordIndex==-1){
			System.out.println("unknown signature for word "+stateSet.getWord());
			Arrays.fill(res, 0.001);
			return res;
		}

		int tagSpecificWordIndex = tagWordIndexer[tag].indexOf(globalWordIndex);
		if (tagSpecificWordIndex==-1){
			if (isLogarithmMode) Arrays.fill(res, Double.NEGATIVE_INFINITY);//-80??Double.NEGATIVE_INFINITY);
//			else Arrays.fill(res, 1e-80);
			return res;
		}
		for (int i=0; i trainTree, double randomness, Lexicon oldLexicon,
  		boolean secondHalf, boolean noSmoothing, int unusedUnkThreshold) {
  	// scan data
    //for all substates that the word's preterminal tag has
    double sentenceScore = 0;
    if (randomness == -1){
      sentenceScore = trainTree.getLabel().getIScore(0);
  		if (sentenceScore==0){
  			System.out.println("Something is wrong with this tree. I will skip it.");
  			return;
  		}
    }
    int sentenceScale = trainTree.getLabel().getIScale();
  	
  	List words = trainTree.getYield();
  	List tags = trainTree.getPreTerminalYield();
  	//for all words in sentence
  	for (int position = 0; position < words.size(); position++) {
  		
  		int nSubStates = tags.get(position).numSubStates();
  		short tag = tags.get(position).getState();
  		
  		String word = words.get(position).getWord();
  		int globalWordIndex = wordIndexer.indexOf(word);
  		int tagSpecificWordIndex = tagWordIndexer[tag].indexOf(globalWordIndex);
  		
  		double[] oldLexiconScores = null;
  		if (randomness==-1)
  			oldLexiconScores = oldLexicon.score(word,tag,position,noSmoothing,false);
  		
  		StateSet currentState = tags.get(position);
      double scale = ScalingTools.calcScaleFactor(currentState.getOScale()-sentenceScale) / sentenceScore;
  		
      for (short substate=0; substate tree : trainTrees){
  		List words = tree.getYield();
  		for (StateSet word : words){
				String sig = word.getWord();
				wordIndexer.add(sig);
  		}
  	}
  	tagWordIndexer = new IntegerIndexer[numStates];
  	for (int tag=0; tag tree : trainTrees){
  		List tags = tree.getPreTerminalYield();
  		List words = tree.getYield();
  		int ind = 0;
  		for (StateSet word : words){
				String sig = word.getWord();
				wordCounter[wordIndexer.indexOf(sig)]++;
				tagWordIndexer[tags.get(ind).getState()].add(wordIndexer.indexOf(sig));
  			ind++;
  		}  		
  	}
  	expectedCounts = new double[numStates][][];
  	scores = new double[numStates][][];
  	for (int tag=0; tag=counts[i]) { 
//				newNumSubStates[i]=numSubStates[i];
//				} 
//			else{
				newNumSubStates[i] = (short)(numSubStates[i] * 2);
//			}
		}
		splitLex.numSubStates = newNumSubStates;
		double[][][] newScores = new double[scores.length][][];
		double[][][] newExpCounts = new double[scores.length][][];
  	for (int tag=0; tag 0) {
            sb.append("-CAPS");
          } else if (hasLower) { // (Character.isLowerCase(ch0)) {
            sb.append("-LC");
          }
          if (hasDigit) {
            sb.append("-NUM");
          }
          if (hasDash) {
            sb.append("-DASH");
          }
          if (lowered.endsWith("s") && wlen >= 3) {
            // here length 3, so you don't miss out on ones like 80s
            char ch2 = lowered.charAt(wlen - 2);
            // not -ess suffixes or greek/latin -us, -is
            if (ch2 != 's' && ch2 != 'i' && ch2 != 'u') {
              sb.append("-s");
            }
          } else if (word.length() >= 5 && !hasDash && !(hasDigit && numCaps > 0)) {
            // don't do for very short words;
            // Implement common discriminating suffixes
/*          	if (Corpus.myLanguage==Corpus.GERMAN){
          		sb.append(lowered.substring(lowered.length()-1));
          	}else{*/
            if (lowered.endsWith("ed")) {
              sb.append("-ed");
            } else if (lowered.endsWith("ing")) {
              sb.append("-ing");
            } else if (lowered.endsWith("ion")) {
              sb.append("-ion");
            } else if (lowered.endsWith("er")) {
              sb.append("-er");
            } else if (lowered.endsWith("est")) {
              sb.append("-est");
            } else if (lowered.endsWith("ly")) {
              sb.append("-ly");
            } else if (lowered.endsWith("ity")) {
              sb.append("-ity");
            } else if (lowered.endsWith("y")) {
              sb.append("-y");
            } else if (lowered.endsWith("al")) {
              sb.append("-al");
              // } else if (lowered.endsWith("ble")) {
              // sb.append("-ble");
              // } else if (lowered.endsWith("e")) {
              // sb.append("-e");
            }
          }
          break;
        }

      case 4:
        {
          boolean hasDigit = false;
          boolean hasNonDigit = false;
          boolean hasLetter = false;
          boolean hasLower = false;
          boolean hasDash = false;
          boolean hasPeriod = false;
          boolean hasComma = false;
          for (int i = 0; i < word.length(); i++) {
            char ch = word.charAt(i);
            if (Character.isDigit(ch)) {
              hasDigit = true;
            } else {
              hasNonDigit = true;
              if (Character.isLetter(ch)) {
                hasLetter = true;
                if (Character.isLowerCase(ch) || Character.isTitleCase(ch)) {
                  hasLower = true;
                }
              } else {
                if (ch == '-') {
                  hasDash = true;
                } else if (ch == '.') {
                  hasPeriod = true;
                } else if (ch == ',') {
                  hasComma = true;
                }
              }
            }
          }
          // 6 way on letters
          if (Character.isUpperCase(word.charAt(0)) || Character.isTitleCase(word.charAt(0))) {
            if (!hasLower) {
              sb.append("-AC");
            } else if (loc == 0) {
              sb.append("-SC");
            } else {
              sb.append("-C");
            }
          } else if (hasLower) {
            sb.append("-L");
          } else if (hasLetter) {
            sb.append("-U");
          } else {
            // no letter
            sb.append("-S");
          }
          // 3 way on number
          if (hasDigit && !hasNonDigit) {
            sb.append("-N");
          } else if (hasDigit) {
            sb.append("-n");
          }
          // binary on period, dash, comma
          if (hasDash) {
            sb.append("-H");
          }
          if (hasPeriod) {
            sb.append("-P");
          }
          if (hasComma) {
            sb.append("-C");
          }
          if (word.length() > 3) {
            // don't do for very short words: "yes" isn't an "-es" word
            // try doing to lower for further densening and skipping digits
            char ch = word.charAt(word.length() - 1);
            if (Character.isLetter(ch)) {
              sb.append("-");
              sb.append(Character.toLowerCase(ch));
            }
          }
          break;
        }

      case 3:
        {
          // This basically works right, except note that 'S' is applied to all
          // capitalized letters in first word of sentence, not just first....
          sb.append("-");
          char lastClass = '-';  // i.e., nothing
          char newClass;
          int num = 0;
          for (int i = 0; i < word.length(); i++) {
            char ch = word.charAt(i);
            if (Character.isUpperCase(ch) || Character.isTitleCase(ch)) {
              if (loc == 0) {
                newClass = 'S';
              } else {
                newClass = 'L';
              }
            } else if (Character.isLetter(ch)) {
              newClass = 'l';
            } else if (Character.isDigit(ch)) {
              newClass = 'd';
            } else if (ch == '-') {
              newClass = 'h';
            } else if (ch == '.') {
              newClass = 'p';
            } else {
              newClass = 's';
            }
            if (newClass != lastClass) {
              lastClass = newClass;
              sb.append(lastClass);
              num = 1;
            } else {
              if (num < 2) {
                sb.append('+');
              }
              num++;
            }
          }
          if (word.length() > 3) {
            // don't do for very short words: "yes" isn't an "-es" word
            // try doing to lower for further densening and skipping digits
            char ch = Character.toLowerCase(word.charAt(word.length() - 1));
            sb.append('-');
            sb.append(ch);
          }
          break;
        }

      case 2:
        {
          // {-ALLC, -INIT, -UC, -LC, zero} +
          // {-DASH, zero} +
          // {-NUM, -DIG, zero} +
          // {lowerLastChar, zeroIfShort}
          boolean hasDigit = false;
          boolean hasNonDigit = false;
          boolean hasLower = false;
          for (int i = 0; i < word.length(); i++) {
            char ch = word.charAt(i);
            if (Character.isDigit(ch)) {
              hasDigit = true;
            } else {
              hasNonDigit = true;
              if (Character.isLetter(ch)) {
                if (Character.isLowerCase(ch) || Character.isTitleCase(ch)) {
                  hasLower = true;
                }
              }
            }
          }
          if (Character.isUpperCase(word.charAt(0)) || Character.isTitleCase(word.charAt(0))) {
            if (!hasLower) {
              sb.append("-ALLC");
            } else if (loc == 0) {
              sb.append("-INIT");
            } else {
              sb.append("-UC");
            }
          } else if (hasLower) {   // if (Character.isLowerCase(word.charAt(0))) {
            sb.append("-LC");
          }
          // no suffix = no (lowercase) letters
          if (word.indexOf('-') >= 0) {
            sb.append("-DASH");
          }
          if (hasDigit) {
            if (!hasNonDigit) {
              sb.append("-NUM");
            } else {
              sb.append("-DIG");
            }
          } else if (word.length() > 3) {
            // don't do for very short words: "yes" isn't an "-es" word
            // try doing to lower for further densening and skipping digits
            char ch = word.charAt(word.length() - 1);
            sb.append(Character.toLowerCase(ch));
          }
          // no suffix = short non-number, non-alphabetic
          break;
        }

      default:
        sb.append("-");
        sb.append(word.substring(Math.max(word.length() - 2, 0), word.length()));
        sb.append("-");
        if (Character.isLowerCase(word.charAt(0))) {
          sb.append("LOWER");
        } else {
          if (Character.isUpperCase(word.charAt(0))) {
            if (loc == 0) {
              sb.append("INIT");
            } else {
              sb.append("UPPER");
            }
          } else {
            sb.append("OTHER");
          }
        }
    } // end switch (unknownLevel)
    // System.err.println("Summarized " + word + " to " + sb.toString());
    return sb.toString();
  } // end getSignature()

  
  public String toString() {
  	StringBuffer sb = new StringBuffer();
  	Numberer tagNumberer = Numberer.getGlobalNumberer("tags");
  	for (int tag=0; tag tree : trainTrees){
  		List words = tree.getYield();
//  		List tags = tree.getPreTerminalYield();
//  		int ind = 0;
  		for (StateSet word : words){
  			word.wordIndex = wordIndexer.indexOf(word.getWord());
  			word.sigIndex = -1;
//  			short tag = tags.get(ind).getState();
////				if (wordIsAmbiguous[word.wordIndex]) {
//					String sig = getSignature(word.getWord(), ind);
//					wordIndexer.add(sig);
//	  			word.sigIndex = (short)wordIndexer.indexOf(sig);
//					tagWordIndexer[tag].add(wordIndexer.indexOf(sig));
////				}
////				else { word.sigIndex = -1; }
//				ind++;
  		}  		
  	}

	}


	/*
	public void clearMapping() {
		toBeIgnored = null;
		linearIndex = null;
	}

*/
	/**
	 * Two-way mapping between sparse non-negative integer ids and dense
	 * indices {@code 0..size()-1}, handed out in insertion order.
	 *
	 * Both directions are backed by fixed-size arrays allocated with the
	 * capacity given at construction time; -1 marks "no mapping" in both
	 * arrays and is also the value returned for any lookup that falls outside
	 * the arrays.
	 */
	public static class IntegerIndexer implements Serializable{
		/** id -> dense index, or -1 if the id has not been added. */
		private int[] indexTo;
		/** dense index -> id, or -1 if the slot is unused. */
		private int[] indexFrom;
		/** Number of ids added so far (also the next dense index to hand out). */
		private int n;
		
		/**
		 * @param capacity size of both lookup arrays; ids passed to
		 *                 {@link #add(int)} must be smaller than this
		 */
		IntegerIndexer(int capacity){
			indexTo = new int[capacity];
			indexFrom = new int[capacity];
			Arrays.fill(indexTo, -1);
			Arrays.fill(indexFrom, -1);
			n = 0;
		}
		
		/**
		 * Registers {@code i} and assigns it the next dense index, unless it
		 * is already present.
		 *
		 * The "not found" sentinel -1 is silently ignored. Any other value
		 * outside {@code [0, capacity)} still raises
		 * {@link ArrayIndexOutOfBoundsException}, as before.
		 *
		 * @param i the id to register
		 */
		public void add(int i){
			if (i==-1) return;
			if (indexTo[i]==-1){
				indexTo[i] = n;
				indexFrom[n] = i;
				n++;
			}
		}
		
		/**
		 * Maps a dense index back to the id it was assigned to.
		 *
		 * @param i dense index
		 * @return the id stored at {@code i}, or -1 if {@code i} is negative,
		 *         beyond the array, or an unused slot
		 */
		public int get(int i){
			// Guard the lower bound as well: -1 is used as a sentinel and
			// must not turn into an ArrayIndexOutOfBoundsException here.
			if (i < 0 || i >= indexFrom.length) return -1;
			return indexFrom[i];
		}
		
		/**
		 * Maps an id to its dense index.
		 *
		 * @param i the id to look up
		 * @return the dense index of {@code i}, or -1 if {@code i} is negative,
		 *         beyond the array, or was never added
		 */
		public int indexOf(int i){
			// Same lower-bound guard as get(): a "not found" id of -1 simply
			// has no dense index.
			if (i < 0 || i >= indexTo.length) return -1;
			return indexTo[i];
		}
		
		/** @return the number of distinct ids added so far */
		public int size(){
			return n;
		}
		
		/**
		 * @return an independent copy; later changes to either indexer do not
		 *         affect the other
		 */
		public IntegerIndexer copy(){
			// Start from an empty shell: both arrays are replaced by clones
			// right away, so allocating and filling full-size ones is wasted.
			IntegerIndexer copy = new IntegerIndexer(0);
			copy.n = n;
			copy.indexFrom = this.indexFrom.clone();
			copy.indexTo = this.indexTo.clone();
			return copy;
		}
	}


	/* (non-Javadoc)
	 * @see edu.berkeley.nlp.PCFGLA.Lexicon#computeScores()
	 *
	 * No-op in this lexicon: the body is empty, so finalLevel is ignored and
	 * no state is touched.
	 * NOTE(review): presumably present only to satisfy the Lexicon interface;
	 * confirm against the interface declaration.
	 */
	public void explicitlyComputeScores(int finalLevel) {
		// TODO Auto-generated method stub
		
	}

	/**
	 * Always returns {@code null}; callers must be prepared for that.
	 *
	 * Note that this does not expose the {@code int[] wordCounter} field of
	 * this class.
	 *
	 * @return {@code null}
	 */
	public Counter getWordCounter() {
		return null;
	}

	/**
	 * No-op in this lexicon: nothing is tied and no state is modified.
	 *
	 * @param threshold ignored
	 */
	public void tieRareWordStats(int threshold) {
		// intentionally empty
	}

	
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy