
edu.berkeley.nlp.PCFGLA.SophisticatedLexicon Maven / Gradle / Ivy


The Berkeley parser analyzes the grammatical structure of natural language using probabilistic context-free grammars (PCFGs).

package edu.berkeley.nlp.PCFGLA;

import edu.berkeley.nlp.PCFGLA.smoothing.Smoother;
import edu.berkeley.nlp.syntax.StateSet;
import edu.berkeley.nlp.syntax.Tree;
import edu.berkeley.nlp.math.SloppyMath;
import edu.berkeley.nlp.util.ArrayUtil;
import edu.berkeley.nlp.util.Counter;
import edu.berkeley.nlp.util.Numberer;
import edu.berkeley.nlp.util.PriorityQueue;
import edu.berkeley.nlp.util.ScalingTools;

import java.io.IOException;
import java.io.PrintWriter;
import java.io.Serializable;
import java.io.Writer;
import java.util.*;

/**
 * Simple default implementation of a lexicon, which scores (word, tag) pairs
 * with a smoothed estimate of P(tag|word)/P(tag).
 * 
 * For simplicity, the lexicon stores words and tags as strings, while the
 * grammar uses integers (see Numberer).
 */
public class SophisticatedLexicon implements java.io.Serializable, Lexicon {
	/** A count of strings with tags. Indexed by state, word, and substate. */
	HashMap<String, double[]>[] wordToTagCounters = null;
	HashMap<String, double[]>[] unseenWordToTagCounters = null;
	double totalWordTypes = 0.0;
	double totalTokens = 0.0;
	double totalUnseenTokens = 0.0;
	double totalWords = 0.0;
	/**
	 * A count of how many different words each full tag has been seen with.
	 * Indexed by state and substate
	 */
	double[][] typeTagCounter;
	/**
	 * A count of tag (state + subState) occurrences. Indexed by state and
	 * substate
	 */
	double[][] tagCounter;
	double[][] unseenTagCounter;
	double[] simpleTagCounter;
	/** The set of preterminal tags */
	Set<Short> allTags = new HashSet<Short>();
	/** The count of how often each word has been seen */
	Counter<String> wordCounter = new Counter<String>();
	/**
	 * A trick to allow loading of saved Lexicons even if the version has
	 * changed.
	 */
	private static final long serialVersionUID = 2L;
	/** The number of substates for each state */
	short[] numSubStates;

	/** Word-tag pairs that occur fewer than smoothingCutoff times are smoothed. */
	int smoothingCutoff;
	/** The default smoothing cutoff. */
	public static int DEFAULT_SMOOTHING_CUTOFF = 10;
	/** Add X smoothing for P(word) */
	double addXSmoothing = 1.0;

	Smoother smoother;
	double threshold;

	boolean isConditional;
	double[][][] conditionalWeights; // wordIndex, tag, substate -> weight
	Numberer wordNumberer;

	// additions from the Stanford parser which are needed for a better
	// unknown word model...
	/**
	 * We cache the last signature looked up, because it asks for the same one
	 * many times when an unknown word is encountered! (Note that under the
	 * current scheme, one unknown word, if seen sentence-initially and
	 * non-initially, will be parsed with two different signatures....)
	 */
	protected transient String lastSignature = "";
	protected transient int lastSentencePosition = -1;
	protected transient String lastWordToSignaturize = "";
	private int unknownLevel = 5; // different modes for unknown words; 5 is
									// English-specific
	/**
	 * A POS tag has to have been attributed to more than this number of word
	 * types before it is regarded as an open-class tag. Unknown words will only
	 * possibly be tagged as open-class tags (unless flexiTag is on).
	 */
	public static int openClassTypesThreshold = 50;

	/**
	 * Start to aggregate signature-tag pairs only for words unseen in the first
	 * this fraction of the data.
	 */
	public static double fractionBeforeUnseenCounting = 0.5; // -> secondHalf
	// protected transient Set sigs=new HashSet();
	/**
	 * Has counts for taggings in terms of unseen signatures. The IntTagWords
	 * are for (tag,sig), (tag,null), (null,sig), (null,null). (None for basic
	 * UNK if there are signatures.)
	 */
	protected static final int nullWord = -1;
	protected static final short nullTag = -1;
	double smoothInUnknownsThreshold = 100;
	double[] smooth = null; // {1.0, 1.0};

	/**
	 * If logarithmMode is true, then all scores are returned as log
	 * probabilities. Otherwise, they are returned as probabilities.
	 */
	boolean logarithmMode = false;

	/** Get the set of preterminal tags */
	public Set<Short> getAllTags() {
		return allTags;
	}

	public boolean isKnown(String word) {
		return wordCounter.keySet().contains(word);
	}
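
	// Added illustration (hypothetical; not part of the original source):
	// how a caller might query per-substate scores for a known preterminal
	// tag. The tag name "NN" and the word are arbitrary examples.
	private void demoScoreUsage() {
		short nn = (short) Numberer.getGlobalNumberer("tags").number("NN");
		// one double per substate of NN: roughly P(word | NN_k)
		double[] scores = score("acquisition", nn, 7, false, false);
		System.out.println(Arrays.toString(scores));
	}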

	public void writeData(Writer w) throws IOException {
		PrintWriter out = new PrintWriter(w);
		Numberer n = Numberer.getGlobalNumberer("tags");

		// word counter (c_W)
		out.print("WORD-COUNTER (c_W):\n");
		PriorityQueue<String> pq = wordCounter.asPriorityQueue();
		while (pq.hasNext()) {
			int priority = (int) Math.round(pq.getPriority());
			String element = pq.next();
			out.print(element + " " + priority + "\n");
		}

		out.print("--------------------------------------------------\n");
		out.print("TAG-COUNTER (c_T):\n");
		for (int state = 0; state < tagCounter.length; state++) {
			String tagState = (String) n.object(state);
			for (int substate = 0; substate < tagCounter[state].length; substate++) {
				double prob = tagCounter[state][substate];
				if (prob == 0)
					continue;
				out.print(tagState + "_" + substate + " " + prob + "\n");
			}
		}

		out.print("--------------------------------------------------\n");
		out.print("UNSEEN-TAG-COUNTER (c_T):\n");
		for (int state = 0; state < unseenTagCounter.length; state++) {
			String tagState = (String) n.object(state);
			for (int substate = 0; substate < unseenTagCounter[state].length; substate++) {
				double prob = unseenTagCounter[state][substate];
				if (prob == 0)
					continue;
				out.print(tagState + "_" + substate + " " + prob + "\n");
			}
		}

		out.print("--------------------------------------------------\n");
		out.print("TAG-AND-WORD-COUNTER (c_TW):\n");
		for (int tag = 0; tag < wordToTagCounters.length; tag++) {
			if (wordToTagCounters[tag] == null)
				continue;
			String tagState = (String) n.object(tag);
			for (String word : wordToTagCounters[tag].keySet()) {
				out.print(tagState + " " + word + " "
						+ Arrays.toString(wordToTagCounters[tag].get(word))
						+ "\n");
			}
		}

		out.print("--------------------------------------------------\n");
		out.print("UNSEEN-TAG-AND-SIGNATURE-COUNTER (c_TW):\n");
		for (int tag = 0; tag < unseenWordToTagCounters.length; tag++) {
			if (unseenWordToTagCounters[tag] == null)
				continue;
			String tagState = (String) n.object(tag);
			for (String word : unseenWordToTagCounters[tag].keySet()) {
				out.print(tagState
						+ " "
						+ word
						+ " "
						+ Arrays.toString(unseenWordToTagCounters[tag]
								.get(word)) + "\n");
			}
		}

		out.flush();
	}

	public String toString() {
		Numberer n = Numberer.getGlobalNumberer("tags");
		StringBuilder sb = new StringBuilder();
		// word counter (c_W)
		/*
		 * sb.append("WORD-COUNTER (c_W):\n");
		 * PriorityQueue<String> pq = wordCounter.asPriorityQueue();
		 * while (pq.hasNext()) {
		 *   int priority = (int) Math.round(pq.getPriority());
		 *   String element = pq.next();
		 *   sb.append(element + " " + priority + "\n");
		 * }
		 * 
		 * sb.append("--------------------------------------------------\n");
		 * sb.append("TAG-COUNTER (c_T):\n");
		 * [the remainder of this commented-out block and the end of
		 * toString() were lost when the listing was extracted]
		 */
		return sb.toString();
	}

	// [A commented-out block that built a HashMap<String, double[]>[]
	// probCounter over all tags, words and substates was garbled beyond
	// recovery here.]

	// [The signature of the following method was also lost in extraction.
	// From its body it returns the per-substate counts of a word (or, if the
	// word is unknown, of its signature) with a tag; the method name and the
	// condition of the first if-statement are plausible reconstructions.]
	public double[] getWordTagCounts(String word, short tag, int loc) {
		double[] resultArray;
		if (wordCounter.getCount(word) > 0) {
			if (wordToTagCounters[tag] != null) { // this is a lexical category
				resultArray = wordToTagCounters[tag].get(word);
				if (resultArray != null) { // we have seen this word with this
											// tag
					return resultArray;
				}
			}
			return new double[numSubStates[tag]];
		}
		String sig = getCachedSignature(word, loc);
		resultArray = wordToTagCounters[tag].get(sig);
		if (resultArray != null) { // we have seen this signature with this tag
			return resultArray;
		}
		// we have never seen the word or its signature
		// System.err.println("We have never seen the word "+word+" or its signature "+sig+" with this tag. Returning prob 0.");
		return new double[numSubStates[tag]];

	}

	/**
	 * This condenses counting arrays into essential statistics. It is used
	 * after all calls to tallyStateSetTree and before any getScore calls.
	 * 
	 * Currently the trees are taken into account immediately, so this does
	 * nothing, but in the future this may contain some precomputation.
	 */
	public void optimize() {
		// make up the set of which tags are preterminal tags
		for (short i = 0; i < wordToTagCounters.length; i++) {
			if (wordToTagCounters[i] != null) {
				allTags.add(i);
			}
		}
		// remove the unlikely ones
		removeUnlikelyTags(threshold, -1.0);
		// // add MMT randomization if necessary
		// if (randomInitializationType==Grammar.RandomInitializationType.INITIALIZE_LIKE_MMT) {
		// Random r = new Random();
		// for (short tag=0; tag<... [the rest of this commented-out block,
		// the end of optimize(), and the signature of the following debugging
		// method were lost in extraction; a plausible reconstruction of the
		// latter follows]
	}

	public void printTagCounter(Numberer tagNumberer) {
		PriorityQueue<String> pq = new PriorityQueue<String>(tagCounter.length);
		for (int i = 0; i < tagCounter.length; i++) {
			pq.add((String) tagNumberer.object(i), tagCounter[i][0]);
			// System.out.println(i+". "+(String)tagNumberer.object(i)+"\t "+symbolCounter.getCount(i,0));
		}
		int i = 0;
		while (pq.hasNext()) {
			i++;
			int p = (int) pq.getPriority();
			System.out.println(i + ". " + pq.next() + "\t " + p);
		}
	}

	/**
	 * Split all substates in two, producing a new lexicon. The new Lexicon
	 * gives the same scores to words under both split versions of the tag.
	 * (Leon says: It may not be okay to use the same scores, but I think that
	 * symmetry is sufficiently broken in Grammar.splitAllStates to ignore the
	 * randomness here.)
	 * 
	 * @param randomness
	 *            , mode (currently ignored)
	 * @return
	 */
	@SuppressWarnings("unchecked")
	public SophisticatedLexicon splitAllStates(int[] counts,
			boolean moreSubstatesThanCounts, int mode) {
		short[] newNumSubStates = new short[numSubStates.length];
		newNumSubStates[0] = 1; // never split ROOT
		for (short i = 1; i < numSubStates.length; i++) {
			// don't split a state into more substates than times it was
			// actually seen
			// if (!moreSubstatesThanCounts && numSubStates[i]>=counts[i]) {
			// newNumSubStates[i]=numSubStates[i];
			// } else {
			newNumSubStates[i] = (short) (numSubStates[i] * 2);
			// }
		}
		SophisticatedLexicon lexicon = new SophisticatedLexicon(
				newNumSubStates, this.smoothingCutoff, smooth, smoother,
				this.threshold);
		// copy and alter all data structures
		lexicon.wordToTagCounters = new HashMap[numSubStates.length];
		lexicon.unseenWordToTagCounters = new HashMap[numSubStates.length];
		for (int tag = 0; tag < wordToTagCounters.length; tag++) {
			if (wordToTagCounters[tag] != null) {
				lexicon.wordToTagCounters[tag] = new HashMap<String, double[]>();
				for (String word : wordToTagCounters[tag].keySet()) {
					lexicon.wordToTagCounters[tag].put(word,
							new double[newNumSubStates[tag]]);
					for (int substate = 0; substate < wordToTagCounters[tag]
							.get(word).length; substate++) {
						int splitFactor = 2;
						if (newNumSubStates[tag] == numSubStates[tag]) {
							splitFactor = 1;
						}
						for (int i = 0; i < splitFactor; i++) {
							lexicon.wordToTagCounters[tag].get(word)[substate
									* splitFactor + i] = (1.f / splitFactor)
									* wordToTagCounters[tag].get(word)[substate];
						}
					}
				}
			}
		}
		for (int tag = 0; tag < unseenWordToTagCounters.length; tag++) {
			if (unseenWordToTagCounters[tag] != null) {
				lexicon.unseenWordToTagCounters[tag] = new HashMap<String, double[]>();
				for (String word : unseenWordToTagCounters[tag].keySet()) {
					lexicon.unseenWordToTagCounters[tag].put(word,
							new double[newNumSubStates[tag]]);
					for (int substate = 0; substate < unseenWordToTagCounters[tag]
							.get(word).length; substate++) {
						int splitFactor = 2;
						if (newNumSubStates[tag] == numSubStates[tag]) {
							splitFactor = 1;
						}
						for (int i = 0; i < splitFactor; i++) {
							lexicon.unseenWordToTagCounters[tag].get(word)[substate
									* splitFactor + i] = (1.f / splitFactor)
									* unseenWordToTagCounters[tag].get(word)[substate];
						}
					}
				}
			}
		}
		lexicon.totalWordTypes = totalWordTypes;
		lexicon.totalTokens = totalTokens;
		lexicon.totalUnseenTokens = totalUnseenTokens;
		lexicon.totalWords = totalWords;
		lexicon.smoother = smoother;
		lexicon.typeTagCounter = new double[typeTagCounter.length][];
		lexicon.tagCounter = new double[tagCounter.length][];
		lexicon.unseenTagCounter = new double[unseenTagCounter.length][];
		lexicon.simpleTagCounter = new double[tagCounter.length];
		for (int tag = 0; tag < typeTagCounter.length; tag++) {
			lexicon.typeTagCounter[tag] = new double[newNumSubStates[tag]];
			lexicon.tagCounter[tag] = new double[newNumSubStates[tag]];
			lexicon.unseenTagCounter[tag] = new double[newNumSubStates[tag]];
			lexicon.simpleTagCounter[tag] = simpleTagCounter[tag];
			for (int substate = 0; substate < typeTagCounter[tag].length; substate++) {
				int splitFactor = 2;
				if (newNumSubStates[tag] == numSubStates[tag]) {
					splitFactor = 1;
				}
				for (int i = 0; i < splitFactor; i++) {
					lexicon.typeTagCounter[tag][substate * splitFactor + i] = (1.f / splitFactor)
							* typeTagCounter[tag][substate];
					lexicon.tagCounter[tag][substate * splitFactor + i] = (1.f / splitFactor)
							* tagCounter[tag][substate];
					lexicon.unseenTagCounter[tag][substate * splitFactor + i] = (1.f / splitFactor)
							* unseenTagCounter[tag][substate];
				}
			}
		}
		lexicon.allTags = new HashSet<Short>(allTags);
		lexicon.wordCounter = new Counter<String>();
		for (String word : wordCounter.keySet()) {
			lexicon.wordCounter.setCount(word, wordCounter.getCount(word));
		}
		lexicon.smoothingCutoff = smoothingCutoff;
		lexicon.addXSmoothing = addXSmoothing;
		lexicon.smoothInUnknownsThreshold = smoothInUnknownsThreshold;
		lexicon.wordNumberer = wordNumberer;
		return lexicon;
	}

	/**
	 * This routine returns a String that is the "signature" of the class of a
	 * word. For example, it might represent whether it is a number or ends in
	 * -s. The strings returned by convention match the pattern UNK-.*, which
	 * is just assumed to not match any real word. Behavior depends on the
	 * unknownLevel (-uwm flag) passed in to the class. The recognized numbers
	 * are 1-5: 5 is fairly English-specific; 4, 3, and 2 look for various word
	 * features (digits, dashes, etc.) which are only vaguely English-specific;
	 * 1 uses the last two characters combined with a simple classification by
	 * capitalization.
	 * 
	 * @param word
	 *            The word to make a signature for
	 * @param loc
	 *            Its position in the sentence (mainly so sentence-initial
	 *            capitalized words can be treated differently)
	 * @return A String that is its signature (equivalence class)
	 */
	public String getSignature(String word, int loc) {
		// int unknownLevel = Options.get().useUnknownWordSignatures;
		StringBuffer sb = new StringBuffer("UNK");
		if (word.length() == 0)
			return sb.toString();
		switch (unknownLevel) {
		case 5: {
			// Reformed Mar 2004 (cdm); hopefully much better now.
			// { -CAPS, -INITC ap, -LC lowercase, 0 } +
			// { -KNOWNLC, 0 } + [only for INITC]
			// { -NUM, 0 } +
			// { -DASH, 0 } +
			// { -last lowered char(s) if known discriminating suffix, 0}
			int wlen = word.length();
			int numCaps = 0;
			boolean hasDigit = false;
			boolean hasDash = false;
			boolean hasLower = false;
			for (int i = 0; i < wlen; i++) {
				char ch = word.charAt(i);
				if (Character.isDigit(ch)) {
					hasDigit = true;
				} else if (ch == '-') {
					hasDash = true;
				} else if (Character.isLetter(ch)) {
					if (Character.isLowerCase(ch)) {
						hasLower = true;
					} else if (Character.isTitleCase(ch)) {
						hasLower = true;
						numCaps++;
					} else {
						numCaps++;
					}
				}
			}
			char ch0 = word.charAt(0);
			String lowered = word.toLowerCase();
			if (Character.isUpperCase(ch0) || Character.isTitleCase(ch0)) {
				if (loc == 0 && numCaps == 1) {
					sb.append("-INITC");
					if (isKnown(lowered)) {
						sb.append("-KNOWNLC");
					}
				} else {
					sb.append("-CAPS");
				}
			} else if (!Character.isLetter(ch0) && numCaps > 0) {
				sb.append("-CAPS");
			} else if (hasLower) { // (Character.isLowerCase(ch0)) {
				sb.append("-LC");
			}
			if (hasDigit) {
				sb.append("-NUM");
			}
			if (hasDash) {
				sb.append("-DASH");
			}
			if (lowered.endsWith("s") && wlen >= 3) {
				// here length 3, so you don't miss out on ones like 80s
				char ch2 = lowered.charAt(wlen - 2);
				// not -ess suffixes or greek/latin -us, -is
				if (ch2 != 's' && ch2 != 'i' && ch2 != 'u') {
					sb.append("-s");
				}
			} else if (word.length() >= 5 && !hasDash
					&& !(hasDigit && numCaps > 0)) {
				// don't do for very short words;
				// Implement common discriminating suffixes
				/*
				 * if (Corpus.myLanguage==Corpus.GERMAN){
				 * sb.append(lowered.substring(lowered.length()-1)); }else{
				 */
				if (lowered.endsWith("ed")) {
					sb.append("-ed");
				} else if (lowered.endsWith("ing")) {
					sb.append("-ing");
				} else if (lowered.endsWith("ion")) {
					sb.append("-ion");
				} else if (lowered.endsWith("er")) {
					sb.append("-er");
				} else if (lowered.endsWith("est")) {
					sb.append("-est");
				} else if (lowered.endsWith("ly")) {
					sb.append("-ly");
				} else if (lowered.endsWith("ity")) {
					sb.append("-ity");
				} else if (lowered.endsWith("y")) {
					sb.append("-y");
				} else if (lowered.endsWith("al")) {
					sb.append("-al");
					// } else if (lowered.endsWith("ble")) {
					// sb.append("-ble");
					// } else if (lowered.endsWith("e")) {
					// sb.append("-e");
				}
			}
			break;
		}
		case 4: {
			boolean hasDigit = false;
			boolean hasNonDigit = false;
			boolean hasLetter = false;
			boolean hasLower = false;
			boolean hasDash = false;
			boolean hasPeriod = false;
			boolean hasComma = false;
			for (int i = 0; i < word.length(); i++) {
				char ch = word.charAt(i);
				if (Character.isDigit(ch)) {
					hasDigit = true;
				} else {
					hasNonDigit = true;
					if (Character.isLetter(ch)) {
						hasLetter = true;
						if (Character.isLowerCase(ch)
								|| Character.isTitleCase(ch)) {
							hasLower = true;
						}
					} else {
						if (ch == '-') {
							hasDash = true;
						} else if (ch == '.') {
							hasPeriod = true;
						} else if (ch == ',') {
							hasComma = true;
						}
					}
				}
			}
			// 6 way on letters
			if (Character.isUpperCase(word.charAt(0))
					|| Character.isTitleCase(word.charAt(0))) {
				if (!hasLower) {
					sb.append("-AC");
				} else if (loc == 0) {
					sb.append("-SC");
				} else {
					sb.append("-C");
				}
			} else if (hasLower) {
				sb.append("-L");
			} else if (hasLetter) {
				sb.append("-U");
			} else {
				// no letter
				sb.append("-S");
			}
			// 3 way on number
			if (hasDigit && !hasNonDigit) {
				sb.append("-N");
			} else if (hasDigit) {
				sb.append("-n");
			}
			// binary on period, dash, comma
			if (hasDash) {
				sb.append("-H");
			}
			if (hasPeriod) {
				sb.append("-P");
			}
			if (hasComma) {
				sb.append("-C");
			}
			if (word.length() > 3) {
				// don't do for very short words: "yes" isn't an "-es" word
				// try doing to lower for further densening and skipping digits
				char ch = word.charAt(word.length() - 1);
				if (Character.isLetter(ch)) {
					sb.append("-");
					sb.append(Character.toLowerCase(ch));
				}
			}
			break;
		}
		case 3: {
			// This basically works right, except note that 'S' is applied to
			// all capitalized letters in first word of sentence, not just
			// first....
			sb.append("-");
			char lastClass = '-'; // i.e., nothing
			char newClass;
			int num = 0;
			for (int i = 0; i < word.length(); i++) {
				char ch = word.charAt(i);
				if (Character.isUpperCase(ch) || Character.isTitleCase(ch)) {
					if (loc == 0) {
						newClass = 'S';
					} else {
						newClass = 'L';
					}
				} else if (Character.isLetter(ch)) {
					newClass = 'l';
				} else if (Character.isDigit(ch)) {
					newClass = 'd';
				} else if (ch == '-') {
					newClass = 'h';
				} else if (ch == '.') {
					newClass = 'p';
				} else {
					newClass = 's';
				}
				if (newClass != lastClass) {
					lastClass = newClass;
					sb.append(lastClass);
					num = 1;
				} else {
					if (num < 2) {
						sb.append('+');
					}
					num++;
				}
			}
			if (word.length() > 3) {
				// don't do for very short words: "yes" isn't an "-es" word
				// try doing to lower for further densening and skipping digits
				char ch = Character.toLowerCase(word.charAt(word.length() - 1));
				sb.append('-');
				sb.append(ch);
			}
			break;
		}
		case 2: {
			// {-ALLC, -INIT, -UC, -LC, zero} +
			// {-DASH, zero} +
			// {-NUM, -DIG, zero} +
			// {lowerLastChar, zeroIfShort}
			boolean hasDigit = false;
			boolean hasNonDigit = false;
			boolean hasLower = false;
			for (int i = 0; i < word.length(); i++) {
				char ch = word.charAt(i);
				if (Character.isDigit(ch)) {
					hasDigit = true;
				} else {
					hasNonDigit = true;
					if (Character.isLetter(ch)) {
						if (Character.isLowerCase(ch)
								|| Character.isTitleCase(ch)) {
							hasLower = true;
						}
					}
				}
			}
			if (Character.isUpperCase(word.charAt(0))
					|| Character.isTitleCase(word.charAt(0))) {
				if (!hasLower) {
					sb.append("-ALLC");
				} else if (loc == 0) {
					sb.append("-INIT");
				} else {
					sb.append("-UC");
				}
			} else if (hasLower) { // if (Character.isLowerCase(word.charAt(0))) {
				sb.append("-LC");
			}
			// no suffix = no (lowercase) letters
			if (word.indexOf('-') >= 0) {
				sb.append("-DASH");
			}
			if (hasDigit) {
				if (!hasNonDigit) {
					sb.append("-NUM");
				} else {
					sb.append("-DIG");
				}
			} else if (word.length() > 3) {
				// don't do for very short words: "yes" isn't an "-es" word
				// try doing to lower for further densening and skipping digits
				char ch = word.charAt(word.length() - 1);
				sb.append(Character.toLowerCase(ch));
			}
			// no suffix = short non-number, non-alphabetic
			break;
		}
		default:
			sb.append("-");
			sb.append(word.substring(Math.max(word.length() - 2, 0),
					word.length()));
			sb.append("-");
			if (Character.isLowerCase(word.charAt(0))) {
				sb.append("LOWER");
			} else {
				if (Character.isUpperCase(word.charAt(0))) {
					if (loc == 0) {
						sb.append("INIT");
					} else {
						sb.append("UPPER");
					}
				} else {
					sb.append("OTHER");
				}
			}
		} // end switch (unknownLevel)
		// System.err.println("Summarized " + word + " to " + sb.toString());
		return sb.toString();
	} // end getSignature()
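
	// Added illustration (hypothetical; not part of the original source):
	// prints level-5 signatures for a few sample unknown words. Expected
	// output, assuming the lowercased forms were unseen in training:
	//   UNK-LC-NUM-DASH, UNK-INITC-s, UNK-LC-ing, UNK-CAPS-NUM-DASH
	private void demoSignatures() {
		System.out.println(getSignature("1982-style", 3));
		System.out.println(getSignature("Bankers", 0));
		System.out.println(getSignature("reorganizing", 5));
		System.out.println(getSignature("FY-1997", 2));
	}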

	public double[] score(StateSet stateSet, short tag, boolean noSmoothing,
			boolean isSignature) {
		return score(stateSet.getWord(), tag, stateSet.from, noSmoothing,
				isSignature);
	}

	/**
	 * Get the score of this word with this tag (as an IntTaggedWord) at this
	 * loc. (Presumably an estimate of P(word | tag).)
	 * 
	 * Implementation documentation. Seen:
	 * c_W = count(W); c_TW = count(T,W); c_T = count(T);
	 * c_Tunseen = count(T) among new words in 2nd half;
	 * total = count(seen words); totalUnseen = count("unseen" words);
	 * p_T_U = Pmle(T|"unseen");
	 * pb_T_W = P(T|W): if (c_W > smoothInUnknownsThreshold) then c_TW/c_W,
	 * else (if not smart mutation) pb_T_W = Bayes prior smooth[1] with p_T_U;
	 * p_T = Pmle(T); p_W = Pmle(W);
	 * pb_W_T = pb_T_W * p_W / p_T [Bayes rule].
	 * Note that this doesn't really properly reserve mass to unknowns.
	 * 
	 * Unseen:
	 * c_TS = count(T,Sig|Unseen); c_S = count(Sig); c_T = count(T|Unseen);
	 * c_U = totalUnseen above; p_T_U = Pmle(T|Unseen);
	 * pb_T_S = Bayes smooth of Pmle(T|S) with P(T|Unseen) [smooth[0]];
	 * pb_W_T = P(W|T) inverted.
	 * 
	 * @param iTW
	 *            An IntTaggedWord pairing a word and POS tag
	 * @param loc
	 *            The position in the sentence. In the default implementation
	 *            this is used only for unknown words to change their
	 *            probability distribution when sentence initial
	 * @return A double valued score, usually P(word|tag)
	 */
	public double[] score(String word, short tag, int loc,
			boolean noSmoothing, boolean isSignature) {
		if (isConditional)
			return scoreConditional(word, tag, loc, noSmoothing, isSignature);
		double c_W = wordCounter.getCount(word);
		double pb_W_T = 0; // always set below
		// simulate no smoothing
		// smooth[0] = 0.0; smooth[1] = 0.0;
		double[] resultArray = new double[numSubStates[tag]];
		for (int substate = 0; substate < numSubStates[tag]; substate++) {
			boolean seen = (c_W > 0.0);
			if (!isSignature && (seen || noSmoothing)) {
				// known word model for P(T|W)
				double c_tag = tagCounter[tag][substate];
				double c_T = c_tag; // seenCounter.getCount(iTW);
				if (c_T == 0)
					continue;
				double c_TW = 0;
				if (wordToTagCounters[tag] != null
						&& wordToTagCounters[tag].get(word) != null) {
					c_TW = wordToTagCounters[tag].get(word)[substate];
				}
				// if (c_TW==0) continue;
				double c_Tunseen = unseenTagCounter[tag][substate];
				double total = totalTokens;
				double totalUnseen = totalUnseenTokens;
				double p_T_U = (totalUnseen == 0) ? 1 : c_Tunseen / totalUnseen;
				double pb_T_W; // always set below
				// System.err.println("c_W is " + c_W + " THRESH is " +
				// smoothInUnknownsThreshold + " mle = " + (c_TW/c_W));
				if (c_W > smoothInUnknownsThreshold || noSmoothing) {
					// we've seen the word enough times to have confidence in
					// its tagging
					if (noSmoothing && c_W == 0)
						pb_T_W = c_TW / 1;
					else
						pb_T_W = (c_TW + 0.0001 * p_T_U) / (c_W + 0.0001);
					// pb_T_W = c_TW / c_W;
					// System.out.println("c_TW "+c_TW+" c_W "+c_W);
				} else {
					// we haven't seen the word enough times to have confidence
					// in its tagging
					pb_T_W = (c_TW + smooth[1] * p_T_U) / (c_W + smooth[1]);
					// System.out.println("smoothed c_TW "+c_TW+" c_W "+c_W);
				}
				if (pb_T_W == 0)
					continue;
				// Sometimes we run up against unknown tags. This should only
				// happen when we're calculating the likelihood for a given
				// tree, not when we're parsing. In that case, return a LL of 0.
				// NO NO NO, this is wrong, slav
				// if (c_T==0) {
				// resultArray[substate] = 1;
				// continue;
				// }
				double p_T = (c_T / total);
				double p_W = (c_W / total);
				pb_W_T = pb_T_W * p_W / p_T;
			} else {
				// test against simple Chinese lexical constants
				if (Corpus.myTreebank == Corpus.TreeBankType.CHINESE) {
					Numberer tagNumberer = Numberer.getGlobalNumberer("tags");
					double prob;
					if (word.matches(ChineseLexicon.dateMatch)) {
						// EncodingPrintWriter.out.println("Date match for " +
						// word,encoding);
						if (tag == tagNumberer.number("NT")) { // (tag.equals("NT")) {
							prob = 1.0;
						} else {
							prob = 0.0;
						}
						Arrays.fill(resultArray, prob);
						return resultArray;
					} else if (word.matches(ChineseLexicon.numberMatch)) {
						// EncodingPrintWriter.out.println("Number match for " +
						// word,encoding);
						if (tag == tagNumberer.number("CD") /* tag.equals("CD") */
								&& (!word.matches(ChineseLexicon.ordinalMatch))) {
							prob = 1.0;
						} else if (tag == tagNumberer.number("OD") /* tag.equals("OD") */
								&& word.matches(ChineseLexicon.ordinalMatch)) {
							prob = 1.0;
						} else {
							prob = 0.0;
						}
						Arrays.fill(resultArray, prob);
						return resultArray;
					} else if (word.matches(ChineseLexicon.properNameMatch)) {
						// EncodingPrintWriter.out.println("Proper name match for "
						// + word,encoding);
						if (tag == tagNumberer.number("NR")) { // tag.equals("NR")) {
							prob = 1.0;
						} else {
							prob = 0.0;
						}
						Arrays.fill(resultArray, prob);
						return resultArray;
					}
				}
				// unknown word model for P(T|S)
				String sig = (isSignature) ? word : getCachedSignature(word,
						loc);
				// iTW.word = sig;
				// double c_TS = unSeenCounter.getCount(iTW);
				double c_TS = 0;
				if (unseenWordToTagCounters[tag] != null
						&& unseenWordToTagCounters[tag].get(sig) != null) {
					c_TS = unseenWordToTagCounters[tag].get(sig)[substate];
				}
				// if (c_TS == 0) continue;
				// how often did we see this signature
				double c_S = wordCounter.getCount(sig);
				double c_U = totalUnseenTokens;
				double total = totalTokens; // seenCounter.getCount(iTW);
				double c_T = unseenTagCounter[tag][substate]; // unSeenCounter.getCount(iTW);
				double c_Tseen = tagCounter[tag][substate]; // seenCounter.getCount(iTW);
				double p_T_U = c_T / c_U;
				if (unknownLevel == 0) {
					c_TS = 0;
					c_S = 0;
				}
				// System.out.println(" sig " + sig
				// +" c_TS "+c_TS+" p_T_U "+p_T_U+" c_S "+c_S);
				// smooth[0]=10;
				double pb_T_S = (c_TS + smooth[0] * p_T_U) / (c_S + smooth[0]);
				double p_T = (c_Tseen / total);
				double p_W = 1.0 / total;
				// if we've never before seen this tag, then just say the
				// probability is 1
				/*
				 * if (p_T == 0) { resultArray[substate] = 1; continue; }
				 */
				pb_W_T = pb_T_S * p_W / p_T;
			}
			// give very low scores when needed, but try to avoid -Infinity
			if (pb_W_T == 0) { // NOT sure whether this is a good idea - slav
				resultArray[substate] = 1e-87;
			} else {
				resultArray[substate] = pb_W_T;
			}
		}
		smoother.smooth(tag, resultArray);
		if (logarithmMode) {
			for (int i = 0; i < resultArray.length; i++) {
				resultArray[i] = Math.log(resultArray[i]);
				if (Double.isNaN(resultArray[i]))
					resultArray[i] = Double.NEGATIVE_INFINITY;
			}
		}
		/*
		 * double power = 1.0; // raise to the power
		 * for (int i=0; i<... [the rest of this commented-out block was lost
		 * in extraction]
		 */
		return resultArray;
	}
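
	// Worked example (added; illustrative numbers only): for a word with
	// c_W = 20 occurrences (below smoothInUnknownsThreshold = 100), of which
	// c_TW = 15 carry this tag substate, with p_T_U = 0.1 and smooth[1] = 0.2:
	//   pb_T_W = (15 + 0.2 * 0.1) / (20 + 0.2) ~= 0.7436
	// and the returned score inverts this with Bayes' rule:
	//   pb_W_T = pb_T_W * p_W / p_T, where p_W = c_W/total and p_T = c_T/total.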
	/*
	 * [Commented-out smoothing-parameter tuning code; its opening lines,
	 * apparently a tune(Collection<Tree<StateSet>> trees) method, were lost
	 * in extraction. The surviving fragment:]
	 * double bestScore = Double.NEGATIVE_INFINITY; double[] bestSmooth = {0.0, 0.0};
	 * for (smooth[0] = 1; smooth[0] <= 1; smooth[0] *= 2.0) {//64
	 * for (smooth[1] = 0.2; smooth[1] <= 0.2; smooth[1] *= 2.0) {//3
	 * //for (smooth[0]=0.5; smooth[0]<=64; smooth[0] *= 2.0) {//64
	 * //for (smooth[1]=0.1; smooth[1]<=12.8; smooth[1] *= 2.0) {//3
	 * double score = 0.0; //score = scoreAll(trees);
	 * if (Test.verbose) { System.out.println("Tuning lexicon: s0 " + smooth[0]
	 * + " s1 " + smooth[1] + " is " + score + " " + trees.size() + " trees."); }
	 * if (score > bestScore) { System.arraycopy(smooth, 0, bestSmooth, 0,
	 * smooth.length); bestScore = score; } } }
	 * System.arraycopy(bestSmooth, 0, smooth, 0, bestSmooth.length);
	 * if (smartMutation) { smooth[0] = 8.0; //smooth[1] = 1.6;
	 * //smooth[0] = 0.5; smooth[1] = 0.1; }
	 * if (Test.unseenSmooth > 0.0) { smooth[0] = Test.unseenSmooth; }
	 * if (Test.verbose) { System.out.println("Tuning selected smoothUnseen " +
	 * smooth[0] + " smoothSeen " + smooth[1] + " at " + bestScore); } }
	 */

	public Counter<String> getWordCounter() {
		return wordCounter;
	}

	public void tieRareWordStats(int threshold) {
		for (int ni = 0; ni < numSubStates.length; ni++) {
			double unseenTagTokens = 0;
			for (int si = 0; si < numSubStates[ni]; si++) {
				unseenTagTokens += unseenTagCounter[ni][si];
			}
			if (unseenTagTokens == 0) {
				continue;
			}
			for (Map.Entry<String, double[]> wordToTagEntry : wordToTagCounters[ni]
					.entrySet()) {
				String word = wordToTagEntry.getKey();
				double[] substateCounter = wordToTagEntry.getValue();
				if (wordCounter.getCount(word) < threshold + 0.5) {
					double wordTagTokens = 0;
					for (int si = 0; si < numSubStates[ni]; si++) {
						wordTagTokens += substateCounter[si];
					}
					for (int si = 0; si < numSubStates[ni]; si++) {
						substateCounter[si] = unseenTagCounter[ni][si]
								* wordTagTokens / unseenTagTokens;
					}
				}
			}
		}
	}
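
	// Example (added for illustration): if tag ni has unseen-substate counts
	// [8.0, 2.0] and a rare word was seen 3 times in total with this tag, its
	// per-substate counts are replaced by [3*8/10, 3*2/10] = [2.4, 0.6],
	// i.e. rare words inherit the substate distribution of unseen words.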
	/**
	 * Trains this lexicon on the Collection of trees.
	 */
	public void trainTree(Tree<StateSet> trainTree, double randomness,
			Lexicon oldLexicon, boolean secondHalf, boolean noSmoothing,
			int threshold) {
		// scan data
		// for all substates that the word's preterminal tag has
		double sentenceScore = 0;
		if (randomness == -1) {
			sentenceScore = trainTree.getLabel().getIScore(0);
			if (sentenceScore == 0) {
				System.out.println("Something is wrong with this tree. I will skip it.");
				return;
			}
		}
		int sentenceScale = trainTree.getLabel().getIScale();
		List<StateSet> words = trainTree.getYield();
		List<StateSet> tags = trainTree.getPreTerminalYield();
		if (words.size() != tags.size()) {
			System.out.println("Yield and preterminal yield do not match!");
			System.out.println(words.toString());
			System.out.println(tags.toString());
		}
		Counter<String> oldWordCounter = null;
		if (oldLexicon != null) {
			oldWordCounter = oldLexicon.getWordCounter();
		}
		// for all words in sentence
		for (int position = 0; position < words.size(); position++) {
			totalWords++;
			String word = words.get(position).getWord();
			int nSubStates = tags.get(position).numSubStates();
			short tag = tags.get(position).getState();
			String sig = getCachedSignature(word, position);
			wordCounter.incrementCount(sig, 0);
			if (unseenWordToTagCounters[tag] == null) {
				unseenWordToTagCounters[tag] = new HashMap<String, double[]>();
			}
			double[] substateCounter2 = unseenWordToTagCounters[tag].get(sig);
			if (substateCounter2 == null) {
				// System.out.print("Sig "+sig+" word "+ word+" pos "+position);
				substateCounter2 = new double[numSubStates[tag]];
				unseenWordToTagCounters[tag].put(sig, substateCounter2);
			}
			// guarantee that the wordToTagCounter element exists so we can
			// tally the combination
			if (wordToTagCounters[tag] == null) {
				wordToTagCounters[tag] = new HashMap<String, double[]>();
			}
			double[] substateCounter = wordToTagCounters[tag].get(word);
			if (substateCounter == null) {
				substateCounter = new double[numSubStates[tag]];
				wordToTagCounters[tag].put(word, substateCounter);
			}
			double[] oldLexiconScores = null;
			if (randomness == -1) {
				oldLexiconScores = oldLexicon.score(word, tag, position,
						noSmoothing, false);
			}
			StateSet currentState = tags.get(position);
			double scale = ScalingTools.calcScaleFactor(currentState
					.getOScale() - sentenceScale) / sentenceScore;
			// double weightSum = 0;
			for (short substate = 0; substate < nSubStates; substate++) {
				double weight = 1;
				if (randomness == -1) {
					// weight by the probability of seeing the tag and word
					// together, given the sentence
					if (!Double.isInfinite(scale)) {
						weight = currentState.getOScore(substate)
								* oldLexiconScores[substate] * scale;
					} else {
						weight = Math.exp(Math.log(ScalingTools.SCALE)
								* (currentState.getOScale() - sentenceScale)
								- Math.log(sentenceScore)
								+ Math.log(currentState.getOScore(substate))
								+ Math.log(oldLexiconScores[substate]));
					}
					// weightSum+=weight;
				} else if (randomness == 0) {
					// for the baseline
					weight = 1;
				} else {
					// add a bit of randomness
					weight = GrammarTrainer.RANDOM.nextDouble() * randomness
							/ 100.0 + 1.0;
				}
				if (weight == 0) {
					continue;
				}
				// tally in the tag with the given weight
				substateCounter[substate] += weight;
				// update the counters
				tagCounter[tag][substate] += weight;
				wordCounter.incrementCount(word, weight);
				totalTokens += weight;
				if (Double.isNaN(totalTokens)) {
					throw new Error("totalTokens is NaN: this would fail if we let it continue!");
				}
				if (oldLexicon != null
						&& oldWordCounter.getCount(word) < threshold + 0.5) {
					wordCounter.incrementCount(sig, weight);
					substateCounter2[substate] += weight;
					unseenTagCounter[tag][substate] += weight;
					totalUnseenTokens += weight;
				}
				// if (secondHalf) {
				// // start doing this once we're halfway through the trees
				// // it's an entirely unknown word
				// if (wordCounter.getCount(word) < 2) {
				// wordCounter.incrementCount(sig, weight);
				//
				// if (unseenWordToTagCounters[tag] == null) {
				// unseenWordToTagCounters[tag] = new HashMap();
				// }
				// substateCounter = unseenWordToTagCounters[tag].get(sig);
				// if (substateCounter == null) {
				// //System.out.print("Sig "+sig+" word "+ word+" pos "+position);
				// substateCounter = new double[numSubStates[tag]];
				// unseenWordToTagCounters[tag].put(sig, substateCounter);
				// }
				//
				// substateCounter[substate] += weight;
				// unseenTagCounter[tag][substate] += weight;
				// totalUnseenTokens += weight;
				// } else {
				// }
				// }
			}
		}
	}
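
	// Note (added): the fractional count tallied above is the posterior
	// weight of substate k given the sentence,
	//   weight_k = outsideScore_k * P(word | tag_k)
	//              * SCALE^(oScale - sentenceScale) / P(sentence),
	// computed in log space when the scale factor over- or underflows.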
	/**
	 * Returns the signature of the given word (the String representation of
	 * its unknown-word features). Caches the last signature returned.
	 */
	protected String getCachedSignature(String word, int sentencePosition) {
		if (word == null)
			return lastWordToSignaturize;
		if (word.equals(lastWordToSignaturize)
				&& sentencePosition == lastSentencePosition) {
			// System.err.println("Signature: cache mapped " + wordIndex +
			// " to " + lastSignatureIndex);
			return lastSignature;
		} else {
			String uwSig = getSignature(word, sentencePosition);
			lastSignature = uwSig;
			lastSentencePosition = sentencePosition;
			lastWordToSignaturize = word;
			return uwSig;
		}
	}

	/**
	 * Merge states, combining information about words we have seen. THIS DOES
	 * NOT UPDATE INFORMATION FOR UNSEEN WORDS! For that, retrain the Lexicon!
	 * 
	 * @param mergeThesePairs
	 * @param mergeWeights
	 */
	public void mergeStates(boolean[][][] mergeThesePairs,
			double[][] mergeWeights) {
		short[] newNumSubStates = new short[numSubStates.length];
		short[][] mapping = new short[numSubStates.length][];
		// invariant: if partners[state][substate][0] == substate, it's the 1st
		// one
		short[][][] partners = new short[numSubStates.length][][];
		Grammar.calculateMergeArrays(mergeThesePairs, newNumSubStates,
				mapping, partners, numSubStates);
		for (int tag = 0; tag < mergeThesePairs.length; tag++) {
			// update wordToTagCounters
			if (wordToTagCounters[tag] != null) {
				for (String word : wordToTagCounters[tag].keySet()) {
					double[] scores = wordToTagCounters[tag].get(word);
					double[] newScores = new double[newNumSubStates[tag]];
					for (int i = 0; i < numSubStates[tag]; i++) {
						short nSplit = (short) partners[tag][i].length;
						if (nSplit == 2) {
							newScores[mapping[tag][i]] = scores[partners[tag][i][0]]
									+ scores[partners[tag][i][1]];
						} else {
							newScores[mapping[tag][i]] = scores[i];
						}
					}
					wordToTagCounters[tag].put(word, newScores);
				}
			}
			// update tag counter
			double[] newTagCounter = new double[newNumSubStates[tag]];
			for (int i = 0; i < numSubStates[tag]; i++) {
				if (partners[tag][i].length == 2) {
					newTagCounter[mapping[tag][i]] = tagCounter[tag][partners[tag][i][0]]
							+ tagCounter[tag][partners[tag][i][1]];
				} else {
					newTagCounter[mapping[tag][i]] = tagCounter[tag][i];
				}
			}
			tagCounter[tag] = newTagCounter;
		}
		numSubStates = newNumSubStates;
	}

	public Map<String, double[][]> getUnseenScores() {
		Map<String, double[][]> map = new HashMap<String, double[][]>();
		for (int tag = 0; tag < unseenWordToTagCounters.length; tag++) {
			if (unseenWordToTagCounters[tag] != null) {
				for (String sig : unseenWordToTagCounters[tag].keySet()) {
					double[][] sigScores = map.get(sig);
					if (sigScores == null) {
						sigScores = new double[numSubStates.length][];
						map.put(sig, sigScores);
					}
					sigScores[tag] = new double[numSubStates[tag]];
					for (int substate = 0; substate < numSubStates[tag]; substate++) {
						double c_TS = 0;
						if (unseenWordToTagCounters[tag].get(sig) != null) {
							c_TS = unseenWordToTagCounters[tag].get(sig)[substate];
						}
						// how often did we see this signature
						double c_S = wordCounter.getCount(sig);
						double c_U = totalUnseenTokens;
						double total = totalTokens; // seenCounter.getCount(iTW);
						double c_T = unseenTagCounter[tag][substate]; // unSeenCounter.getCount(iTW);
						double c_Tseen = tagCounter[tag][substate]; // seenCounter.getCount(iTW);
						double p_T_U = c_T / c_U;
						if (unknownLevel == 0) {
							c_TS = 0;
							c_S = 0;
						}
						double pb_T_S = (c_TS + smooth[0] * p_T_U)
								/ (c_S + smooth[0]);
						double p_T = (c_Tseen / total);
						double p_W = 1.0 / total;
						// if we've never before seen this tag, then just say
						// the probability is 1
						if (p_T == 0) {
							sigScores[tag][substate] = 1;
							continue;
						}
						double pb_W_T = pb_T_S * p_W / p_T;
						sigScores[tag][substate] = pb_W_T;
					}
				}
			}
		}
		return map;
	}

	public void removeUnlikelyTags(double threshold, double exponent) {
		// System.out.print("Removing unlikely tags...");
		if (isLogarithmMode())
			threshold = Math.log(threshold);
		int removed = 0, total = 0;
		if (isConditional) {
			for (int i = 0; i < conditionalWeights.length; i++) {
				for (int j = 0; j < conditionalWeights[i].length; j++) {
					if (conditionalWeights[i][j] == null)
						continue;
					for (int k = 0; k < conditionalWeights[i][j].length; k++) {
						total++;
						if (conditionalWeights[i][j][k] < threshold) {
							conditionalWeights[i][j][k] = 0;
							removed++;
						}
					}
				}
			}
		} else {
			for (int tag = 0; tag < numSubStates.length; tag++) {
				double[] c_TW;
				if (wordToTagCounters[tag] != null) {
					for (String word : wordToTagCounters[tag].keySet()) {
						c_TW = wordToTagCounters[tag].get(word);
						for (int substate = 0; substate < numSubStates[tag]; substate++) {
							total++;
							if (c_TW[substate] < threshold) {
								c_TW[substate] = 0;
								removed++;
							}
						}
					}
				}
			}
			/*
			 * [A commented-out block that pruned unseenWordToTagCounters in
			 * the same way stood here; it and the end of this method were
			 * lost in extraction.]
			 */
		}
	}

	// [The opening of the following projection method was lost in extraction.
	// From its body it is apparently projectLexicon(double[] condProbs,
	// int[][] mapping, int[][] toSubstateMapping); the lost portion
	// constructed newLexicon with newNumSubStates and projected the counters
	// into newTagCounter and newUnseenTagCounter.]
	public SophisticatedLexicon projectLexicon(double[] condProbs,
			int[][] mapping, int[][] toSubstateMapping) {
		if (!isConditional) {
			for (int tag = 0; tag < wordToTagCounters.length; tag++) {
				if (wordToTagCounters[tag] != null) {
					newLexicon.wordToTagCounters[tag] = new HashMap<String, double[]>();
					for (String word : wordToTagCounters[tag].keySet()) {
						double[] scores = wordToTagCounters[tag].get(word);
						double[] newScores = new double[newNumSubStates[tag]];
						for (int i = 0; i < numSubStates[tag]; i++) {
							newScores[toSubstateMapping[tag][i + 1]] += condProbs[mapping[tag][i]]
									* scores[i];
						}
						newLexicon.wordToTagCounters[tag].put(word, newScores);
					}
				}
				// update wordToTagCounters
				if (unseenWordToTagCounters[tag] != null) {
					newLexicon.unseenWordToTagCounters[tag] = new HashMap<String, double[]>();
					for (String word : unseenWordToTagCounters[tag].keySet()) {
						double[] scores = unseenWordToTagCounters[tag]
								.get(word);
						double[] newScores = new double[newNumSubStates[tag]];
						for (int i = 0; i < numSubStates[tag]; i++) {
							newScores[toSubstateMapping[tag][i + 1]] += condProbs[mapping[tag][i]]
									* scores[i];
						}
						newLexicon.unseenWordToTagCounters[tag].put(word,
								newScores);
					}
				}
			}
		} else {
			double[][][] newCondWeights = new double[conditionalWeights.length][conditionalWeights[0].length][];
			for (int w = 0; w < newCondWeights.length; w++) {
				if (conditionalWeights[w] == null)
					continue;
				for (int tag = 0; tag < numSubStates.length; tag++) {
					if (conditionalWeights[w][tag] == null)
						continue;
					newCondWeights[w][tag] = new double[newNumSubStates[tag]];
					for (int substate = 0; substate < numSubStates[tag]; substate++) {
						newCondWeights[w][tag][toSubstateMapping[tag][substate + 1]] += condProbs[mapping[tag][substate]]
								* conditionalWeights[w][tag][substate];
					}
				}
			}
			newLexicon.conditionalWeights = newCondWeights;
			newLexicon.isConditional = true;
		}
		newLexicon.totalWordTypes = totalWordTypes;
		newLexicon.totalTokens = totalTokens;
		newLexicon.totalUnseenTokens = totalUnseenTokens;
		newLexicon.totalWords = totalWords;
		// newLexicon.smoother = smoother;
		newLexicon.allTags = new HashSet<Short>(allTags);
		newLexicon.wordCounter = new Counter<String>();
		for (String word : wordCounter.keySet()) {
			newLexicon.wordCounter.setCount(word, wordCounter.getCount(word));
		}
		newLexicon.smoothingCutoff = smoothingCutoff;
		newLexicon.addXSmoothing = addXSmoothing;
		newLexicon.smoothInUnknownsThreshold = smoothInUnknownsThreshold;
		newLexicon.tagCounter = newTagCounter;
		newLexicon.unseenTagCounter = newUnseenTagCounter;
		newLexicon.numSubStates = newNumSubStates;
		newLexicon.wordNumberer = wordNumberer;
		newLexicon.unknownLevel = unknownLevel;
		return newLexicon;
	}

	public SophisticatedLexicon copyLexicon() {
		short[] newNumSubStates = numSubStates.clone();
		SophisticatedLexicon newLexicon = new SophisticatedLexicon(
				newNumSubStates, this.smoothingCutoff, this.smooth,
				this.smoother, this.threshold);
		double[][] newTagCounter = ArrayUtil.copy(tagCounter);
		double[][] newUnseenTagCounter = ArrayUtil.copy(unseenTagCounter);
		for (int tag = 0; tag < numSubStates.length; tag++) {
			if (wordToTagCounters[tag] != null) {
				newLexicon.wordToTagCounters[tag] = new HashMap<String, double[]>();
				for (String word : wordToTagCounters[tag].keySet()) {
					double[] scores = wordToTagCounters[tag].get(word);
					double[] newScores = scores.clone();
					newLexicon.wordToTagCounters[tag].put(word, newScores);
				}
			}
			// update wordToTagCounters
			if (unseenWordToTagCounters[tag] != null) {
				newLexicon.unseenWordToTagCounters[tag] = new HashMap<String, double[]>();
				for (String word : unseenWordToTagCounters[tag].keySet()) {
					double[] scores = unseenWordToTagCounters[tag].get(word);
					double[] newScores = scores.clone();
					newLexicon.unseenWordToTagCounters[tag].put(word,
							newScores);
				}
			}
		}
		if (conditionalWeights != null)
			newLexicon.conditionalWeights = conditionalWeights.clone();
		newLexicon.isConditional = isConditional;
		newLexicon.totalWordTypes = totalWordTypes;
		newLexicon.totalTokens = totalTokens;
		newLexicon.totalUnseenTokens = totalUnseenTokens;
		newLexicon.totalWords = totalWords;
		newLexicon.smoother = smoother;
		newLexicon.allTags = new HashSet<Short>(allTags);
		newLexicon.wordCounter = new Counter<String>();
		for (String word : wordCounter.keySet()) {
			newLexicon.wordCounter.setCount(word, wordCounter.getCount(word));
		}
		newLexicon.smoothingCutoff = smoothingCutoff;
		newLexicon.addXSmoothing = addXSmoothing;
		newLexicon.smoothInUnknownsThreshold = smoothInUnknownsThreshold;
		newLexicon.tagCounter = newTagCounter;
		newLexicon.unseenTagCounter = newUnseenTagCounter;
		newLexicon.numSubStates = newNumSubStates;
		newLexicon.wordNumberer = this.wordNumberer;
		newLexicon.unknownLevel = this.unknownLevel;
		return newLexicon;
	}

	public int getNumberOfEntries() {
		int nEntries = 0;
		if (conditionalWeights == null) { // indicates first time use:
			for (String word : wordCounter.keySet()) {
				// has all words AND also the signatures
				wordNumberer.number(word);
			}
		}
		for (int tag = 0; tag < wordToTagCounters.length; tag++) {
			if (wordToTagCounters[tag] != null) {
				nEntries += wordToTagCounters[tag].size() * numSubStates[tag];
				if (conditionalWeights == null) {
					for (String word : wordToTagCounters[tag].keySet())
						wordNumberer.number(word);
				}
			}
			if (unseenWordToTagCounters[tag] != null) {
				nEntries += unseenWordToTagCounters[tag].size()
						* numSubStates[tag];
				if (conditionalWeights == null) {
					for (String word : unseenWordToTagCounters[tag].keySet())
						wordNumberer.number(word);
				}
			}
		}
		if (conditionalWeights == null) {
			conditionalWeights = new double[wordNumberer.total()][numSubStates.length][];
		}
		return nEntries;
	}

	// public Pair getLinearizedLexicon(){
	//   return getLinearizedLexicon(getNumberOfEntries());
	// }
	//
	// public Pair getLinearizedLexicon(int n){
	//   if (isConditional) {
	//     System.out.println("Do not have the functionality to linearize a conditional lexicon!");
	//     return new Pair(null,null);
	//   }
	//   double[] probs = new double[n];
	//   int[][] startIndex = new int[wordNumberer.total()][numSubStates.length];
	//   ArrayUtil.fill(startIndex,Integer.MIN_VALUE);
	//   int ind = 0;
	//   for (int tag=0; tag<... [middle of this commented-out block lost in
	//   extraction]
	//   return new Pair(probs,startIndex);
	// }

	public void delinearizeLexicon(double[] probs) {
		int ind = 0;
		// Numberer wordNumberer = Numberer.getGlobalNumberer("words");
		for (int tag = 0; tag < wordToTagCounters.length; tag++) {
			if (wordToTagCounters[tag] != null) {
				for (String word : wordToTagCounters[tag].keySet()) {
					double[] scores = new double[numSubStates[tag]];
					for (int i = 0; i < scores.length; i++) {
						double val = probs[ind++]; // Math.exp(); //probs[ind++]
						val = (val == -1000) ? 0 : Math.exp(val);
						if (SloppyMath.isVeryDangerous(val)) {
							if (Double.isNaN(probs[ind - 1]))
								val = 1.0e-50;
							else
								val = probs[ind - 1];
							// System.out.println("word " +word+" tag "+tag);
							// System.out.println("Optimizer proposed Inf. Setting to probs: " +val);
						}
						scores[i] = val;
					}
					conditionalWeights[wordNumberer.number(word)][tag] = scores;
				}
			}
			if (unseenWordToTagCounters[tag] != null) {
				for (String word : unseenWordToTagCounters[tag].keySet()) {
					double[] scores = new double[numSubStates[tag]];
					for (int i = 0; i < scores.length; i++) {
						double val = probs[ind++]; // Math.exp(); //probs[ind++]
						val = (val == -1000) ? 0 : Math.exp(val);
						if (SloppyMath.isVeryDangerous(val)) {
							if (Double.isNaN(probs[ind - 1]))
								val = 1.0e-50;
							else
								val = probs[ind - 1];
							// System.out.println("word " +word+" tag "+tag);
							// System.out.println("Optimizer proposed Inf. Setting to probs: " +val);
						}
						scores[i] = val;
					}
					conditionalWeights[wordNumberer.number(word)][tag] = scores;
				}
			}
		}
		this.isConditional = true;
	}

	public void setConditional(boolean b) {
		this.isConditional = b;
	}

	public double[] scoreConditional(String word, short tag, int loc,
			boolean noSmoothing, boolean isSignature) {
		if (isSignature)
			return getConditionalSignatureScore(word, tag, noSmoothing);
		else if (!isKnown(word))
			return getConditionalSignatureScore(getCachedSignature(word, loc),
					tag, noSmoothing);
		// else if(!isKnown(word)) return getConditionalSignatureScore("#UNK#",
		// tag, noSmoothing);
		// else if(isKnown(word))return getConditionalSignatureScore(word, tag,
		// noSmoothing);
		double[] resultArray = new double[numSubStates[tag]];
		double[] wordScore = getConditionalWordScore(word, tag, noSmoothing);
		String sig = getCachedSignature(word, loc);
		double[] sigScore = getConditionalSignatureScore(sig, tag, noSmoothing);
		for (int i = 0; i < resultArray.length; i++) {
			resultArray[i] = wordScore[i] + sigScore[i];
		}
		return resultArray;
	}

	public double[] getConditionalSignatureScore(String sig, short tag,
			boolean noSmoothing) {
		double[] resultArray = new double[numSubStates[tag]];
		int ind = wordNumberer.number(sig);
		if (ind >= conditionalWeights.length) {
			System.out.println(" We have a problem! sig " + sig + " ind " + ind);
			return resultArray;
		}
		double[] tmpArray = conditionalWeights[ind][tag];
		if (tmpArray != null) {
			for (int i = 0; i < resultArray.length; i++) {
				resultArray[i] += tmpArray[i];
			}
		}
		if (this.isLogarithmMode()) {
			for (int i = 0; i < resultArray.length; i++) {
				resultArray[i] = Math.log(resultArray[i]);
			}
		}
		return resultArray;
	}

	public double[] getConditionalWordScore(String word, short tag,
			boolean noSmoothing) {
		double[] resultArray = new double[numSubStates[tag]];
		int ind = wordNumberer.number(word);
		double[] tmpArray = conditionalWeights[ind][tag];
		if (tmpArray != null) {
			for (int i = 0; i < resultArray.length; i++) {
				resultArray[i] = tmpArray[i];
			}
		}
		if (this.isLogarithmMode()) {
			for (int i = 0; i < resultArray.length; i++) {
				resultArray[i] = Math.log(resultArray[i]);
			}
		}
		return resultArray;
	}
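
	// Note (added): in conditional mode a known word is scored, per substate,
	// as the sum of its word weight and its signature weight, i.e.
	//   score_k = conditionalWeights[#word][tag][k] + conditionalWeights[#sig][tag][k],
	// as computed in scoreConditional() above.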
	class ChineseLexicon implements Serializable {
		private static final long serialVersionUID = 1L;
		private static final String encoding = "GB18030"; // used only for
															// debugging
		/*
		 * These strings are stored in ascii-style Unicode encoding. To edit
		 * them, either use the Unicode codes or use native2ascii or a similar
		 * program to convert the file into a Chinese encoding, then convert
		 * back.
		 */
		public static final String dateMatch = ".*[\u5e74\u6708\u65e5\u53f7]$";
		public static final String numberMatch = ".*[\uff10\uff11\uff12\uff13\uff14\uff15\uff16\uff17\uff18\uff19\uff11\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e\u5343\u4e07\u4ebf].*";
		public static final String ordinalMatch = "^\u7b2c.*";
		public static final String properNameMatch = ".*\u00b7.*";
	}

	public void setSmoother(Smoother smoother) {
		this.smoother = smoother;
	}

	public Smoother getSmoother() {
		return smoother;
	}

	public double[] getSmoothingParams() {
		return smooth;
	}

	public double getPruningThreshold() {
		return threshold;
	}

	/*
	 * (non-Javadoc)
	 * 
	 * @see edu.berkeley.nlp.PCFGLA.Lexicon#getLinearizedLexicon()
	 */
	public double[] getLinearizedLexicon() {
		// TODO Auto-generated method stub
		return null;
	}

	/*
	 * (non-Javadoc)
	 * 
	 * @see edu.berkeley.nlp.PCFGLA.Lexicon#getLinearIndex(java.lang.String, int)
	 */
	public int getLinearIndex(String word, int tag) {
		// TODO Auto-generated method stub
		return 0;
	}

	/*
	 * (non-Javadoc)
	 * 
	 * @see edu.berkeley.nlp.PCFGLA.Lexicon#clearMapping()
	 */
	public void clearMapping() {
		// TODO Auto-generated method stub
	}

	/*
	 * (non-Javadoc)
	 * 
	 * @see edu.berkeley.nlp.PCFGLA.Lexicon#scoreSignature(java.lang.String, int, int)
	 */
	public double[] scoreSignature(StateSet stateSet, int tag) {
		// TODO Auto-generated method stub
		return null;
	}

	/*
	 * (non-Javadoc)
	 * 
	 * @see edu.berkeley.nlp.PCFGLA.Lexicon#scoreWord(java.lang.String, int)
	 */
	public double[] scoreWord(StateSet stateSet, int tag) {
		// TODO Auto-generated method stub
		return null;
	}

	public void explicitlyComputeScores(int finalLevel) {
		// TODO Auto-generated method stub
	}

	@SuppressWarnings("unchecked")
	public SophisticatedLexicon remapStates(Numberer thisNumberer,
			Numberer newNumberer) {
		SophisticatedLexicon remappedLexicon = copyLexicon();
		remappedLexicon.wordToTagCounters = new HashMap[newNumberer.size()];
		remappedLexicon.unseenWordToTagCounters = new HashMap[newNumberer
				.size()];
		remappedLexicon.typeTagCounter = new double[newNumberer.size()][];
		remappedLexicon.tagCounter = new double[newNumberer.size()][];
		remappedLexicon.unseenTagCounter = new double[newNumberer.size()][];
		remappedLexicon.simpleTagCounter = new double[newNumberer.size()];
		remappedLexicon.allTags = new HashSet<Short>();
		remappedLexicon.numSubStates = new short[newNumberer.size()];
		remappedLexicon.smoother = smoother.remapStates(thisNumberer,
				newNumberer);
		if (conditionalWeights != null) {
			for (int w = 0; w < conditionalWeights.length; w++) {
				remappedLexicon.conditionalWeights[w] = new double[newNumberer
						.size()][];
			}
		}
		for (short s = 0; s < newNumberer.size(); s++) {
			short translatedState = translateState(s, newNumberer, thisNumberer);
			if (translatedState >= 0) {
				remappedLexicon.wordToTagCounters[s] = wordToTagCounters[translatedState];
				remappedLexicon.unseenWordToTagCounters[s] = unseenWordToTagCounters[translatedState];
				remappedLexicon.typeTagCounter[s] = typeTagCounter[translatedState];
				remappedLexicon.tagCounter[s] = tagCounter[translatedState];
				remappedLexicon.unseenTagCounter[s] = unseenTagCounter[translatedState];
				remappedLexicon.simpleTagCounter[s] = simpleTagCounter[translatedState];
				if (allTags.contains(translatedState))
					remappedLexicon.allTags.add(s);
				remappedLexicon.numSubStates[s] = numSubStates[translatedState];
				if (conditionalWeights != null) {
					for (int w = 0; w < conditionalWeights[w].length; w++) {
						remappedLexicon.conditionalWeights[w][s] = conditionalWeights[w][translatedState];
					}
				}
			} else {
				remappedLexicon.wordToTagCounters[s] = new HashMap<String, double[]>();
				remappedLexicon.unseenWordToTagCounters[s] = new HashMap<String, double[]>();
				remappedLexicon.typeTagCounter[s] = new double[1];
				remappedLexicon.tagCounter[s] = new double[1];
				remappedLexicon.unseenTagCounter[s] = new double[1];
				remappedLexicon.numSubStates[s] = 1;
			}
		}
		return remappedLexicon;
	}

	private short translateState(int state, Numberer baseNumberer,
			Numberer translationNumberer) {
		Object object = baseNumberer.object(state);
		if (translationNumberer.hasSeen(object)) {
			return (short) translationNumberer.number(object);
		} else {
			return (short) -1;
		}
	}
}




