package edu.berkeley.nlp.PCFGLA;
import edu.berkeley.nlp.PCFGLA.smoothing.Smoother;
import edu.berkeley.nlp.syntax.StateSet;
import edu.berkeley.nlp.syntax.Tree;
import edu.berkeley.nlp.math.SloppyMath;
import edu.berkeley.nlp.util.ArrayUtil;
import edu.berkeley.nlp.util.Counter;
import edu.berkeley.nlp.util.Numberer;
import edu.berkeley.nlp.util.PriorityQueue;
import edu.berkeley.nlp.util.ScalingTools;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Serializable;
import java.io.Writer;
import java.util.*;
/**
 * Simple default implementation of a lexicon, which scores word/tag pairs
 * with a smoothed estimate of P(tag|word)/P(tag).
 *
 * For simplicity, the lexicon stores words and tags as strings, while the
 * grammar works with integers, mapped via Numberer.
 */
public class SophisticatedLexicon implements java.io.Serializable, Lexicon {
/** A count of strings with tags. Indexed by state, word, and substate. */
HashMap<String, double[]>[] wordToTagCounters = null;
HashMap<String, double[]>[] unseenWordToTagCounters = null;
double totalWordTypes = 0.0;
double totalTokens = 0.0;
double totalUnseenTokens = 0.0;
double totalWords = 0.0;
/**
* A count of how many different words each full tag has been seen with.
* Indexed by state and substate
*/
double[][] typeTagCounter;
/**
* A count of tag (state + subState) occurrences. Indexed by state and
* substate
*/
double[][] tagCounter;
double[][] unseenTagCounter;
double[] simpleTagCounter;
/** The set of preterminal tags */
Set<Short> allTags = new HashSet<Short>();
/** The count of how often each word has been seen */
Counter<String> wordCounter = new Counter<String>();
/**
* A trick to allow loading of saved Lexicons even if the version has
* changed.
*/
private static final long serialVersionUID = 2L;
/** The number of substates for each state */
short[] numSubStates;
/** Word-tag pairs that occur fewer than this many times are smoothed. */
int smoothingCutoff;
/** The default smoothing cutoff. */
public static int DEFAULT_SMOOTHING_CUTOFF = 10;
/** Add X smoothing for P(word) */
double addXSmoothing = 1.0;
Smoother smoother;
double threshold;
boolean isConditional;
double[][][] conditionalWeights; // wordIndex, tag, substate -> weight
Numberer wordNumberer;
// additions from the stanford parser which are needed for a better
// unknown word model...
/**
* We cache the last signature looked up, because it asks for the same one
* many times when an unknown word is encountered! (Note that under the
* current scheme, one unknown word, if seen sentence-initially and
* non-initially, will be parsed with two different signatures....)
*/
protected transient String lastSignature = "";
protected transient int lastSentencePosition = -1;
protected transient String lastWordToSignaturize = "";
private int unknownLevel = 5; // different modes for unknown words; 5 is
// English-specific
/**
* A POS tag has to have been attributed to more than this number of word
* types before it is regarded as an open-class tag. Unknown words will only
* possibly be tagged as open-class tags (unless flexiTag is on).
*/
public static int openClassTypesThreshold = 50;
/**
* Start to aggregate signature-tag pairs only for words unseen in the first
* this fraction of the data.
*/
public static double fractionBeforeUnseenCounting = 0.5; // -> secondHalf
// protected transient Set sigs=new HashSet();
/**
* Has counts for taggings in terms of unseen signatures. The IntTagWords
* are for (tag,sig), (tag,null), (null,sig), (null,null). (None for basic
* UNK if there are signatures.)
*/
protected static final int nullWord = -1;
protected static final short nullTag = -1;
double smoothInUnknownsThreshold = 100;
double[] smooth = null; // {1.0, 1.0};
/**
* If logarithmMode is true, then all scores are returned as log
* probabilities. Otherwise, they are returned as probabilities.
*/
boolean logarithmMode = false;
/** Get the preterminal tags */
public Set getAllTags() {
return allTags;
}
public boolean isKnown(String word) {
return wordCounter.keySet().contains(word);
}
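// A minimal sketch (not part of the original API; all names and numbers
// below are made up) of the core estimate described in the class comment:
// score a word/tag pair by inverting a smoothed P(tag|word) via Bayes' rule.
private static double exampleBayesInversion() {
double c_TW = 3.0; // count(tag, word)
double c_W = 4.0; // count(word)
double c_T = 50.0; // count(tag)
double total = 1000.0; // total number of tokens
double p_T_W = c_TW / c_W; // Pmle(tag | word) = 0.75
double p_W = c_W / total; // Pmle(word) = 0.004
double p_T = c_T / total; // Pmle(tag) = 0.05
return p_T_W * p_W / p_T; // P(word | tag) = 0.06 by Bayes' rule
}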
public void writeData(Writer w) throws IOException {
PrintWriter out = new PrintWriter(w);
Numberer n = Numberer.getGlobalNumberer("tags");
// word counter (c_W)
out.print("WORD-COUNTER (c_W):\n");
PriorityQueue<String> pq = wordCounter.asPriorityQueue();
while (pq.hasNext()) {
int priority = (int) Math.round(pq.getPriority());
String element = pq.next();
out.print(element + " " + priority + "\n");
}
out.print("--------------------------------------------------\n");
out.print("TAG-COUNTER (c_T):\n");
for (int state = 0; state < tagCounter.length; state++) {
String tagState = (String) n.object(state);
for (int substate = 0; substate < tagCounter[state].length; substate++) {
double prob = tagCounter[state][substate];
if (prob == 0)
continue;
out.print(tagState + "_" + substate + " " + prob + "\n");
}
}
out.print("--------------------------------------------------\n");
out.print("UNSEEN-TAG-COUNTER (c_T):\n");
for (int state = 0; state < unseenTagCounter.length; state++) {
String tagState = (String) n.object(state);
for (int substate = 0; substate < unseenTagCounter[state].length; substate++) {
double prob = unseenTagCounter[state][substate];
if (prob == 0)
continue;
out.print(tagState + "_" + substate + " " + prob + "\n");
}
}
out.print("--------------------------------------------------\n");
out.print("TAG-AND-WORD-COUNTER (c_TW):\n");
for (int tag = 0; tag < wordToTagCounters.length; tag++) {
if (wordToTagCounters[tag] == null)
continue;
String tagState = (String) n.object(tag);
for (String word : wordToTagCounters[tag].keySet()) {
out.print(tagState + " " + word + " "
+ Arrays.toString(wordToTagCounters[tag].get(word))
+ "\n");
}
}
out.print("--------------------------------------------------\n");
out.print("UNSEEN-TAG-AND-SIGNATURE-COUNTER (c_TW):\n");
for (int tag = 0; tag < unseenWordToTagCounters.length; tag++) {
if (unseenWordToTagCounters[tag] == null)
continue;
String tagState = (String) n.object(tag);
for (String word : unseenWordToTagCounters[tag].keySet()) {
out.print(tagState
+ " "
+ word
+ " "
+ Arrays.toString(unseenWordToTagCounters[tag]
.get(word)) + "\n");
}
}
out.flush();
}
public String toString() {
Numberer n = Numberer.getGlobalNumberer("tags");
StringBuilder sb = new StringBuilder();
// word counter (c_W)
/*
 * sb.append("WORD-COUNTER (c_W):\n"); PriorityQueue<String> pq =
 * wordCounter.asPriorityQueue(); while (pq.hasNext()) { int priority =
 * (int)Math.round(pq.getPriority()); String element = pq.next();
 * sb.append(element +" "+priority+"\n"); }
 *
 * sb.append("--------------------------------------------------\n");
 * sb.append("TAG-COUNTER (c_T):\n"); for (int state=0;
 * state<tagCounter.length; state++) { ... }
 */
// HashMap<String, double[]>[] probCounter = new
// HashMap[numSubStates.length];
// for (int tag=0; tag<wordToTagCounters.length; tag++){
// probCounter[tag] = new HashMap<String, double[]>();
// for (String word : wordToTagCounters[tag].keySet()){
// double[] probs = wordToTagCounters[tag].get(word);
// for (int substate=0; substate<probs.length; substate++){ ... } } }
return sb.toString();
}
/**
 * Create a blank Lexicon object. Fill it by calling trainTree for each
 * training tree and then calling optimize().
 */
@SuppressWarnings("unchecked")
public SophisticatedLexicon(short[] numSubStates, int smoothingCutoff,
double[] smoothParam, Smoother smoother, double threshold) {
this.numSubStates = numSubStates;
this.smoothingCutoff = smoothingCutoff;
this.smooth = smoothParam;
this.smoother = smoother;
this.threshold = threshold;
wordToTagCounters = new HashMap[numSubStates.length];
unseenWordToTagCounters = new HashMap[numSubStates.length];
tagCounter = new double[numSubStates.length][];
unseenTagCounter = new double[numSubStates.length][];
typeTagCounter = new double[numSubStates.length][];
simpleTagCounter = new double[numSubStates.length];
for (int i = 0; i < numSubStates.length; i++) {
tagCounter[i] = new double[numSubStates[i]];
unseenTagCounter[i] = new double[numSubStates[i]];
typeTagCounter[i] = new double[numSubStates[i]];
}
this.wordNumberer = Numberer.getGlobalNumberer("words");
}
/**
 * Returns the observed counts of this word (or of its signature, if the
 * word itself was never seen) with each substate of the given tag.
 */
public double[] getWordTagCounts(String word, short tag, int loc) {
double[] resultArray;
if (wordCounter.getCount(word) > 0) {
if (wordToTagCounters[tag] != null) { // this is a lexical category
resultArray = wordToTagCounters[tag].get(word);
if (resultArray != null) { // we have seen this word with this
// tag
return resultArray;
}
}
return new double[numSubStates[tag]];
}
String sig = getCachedSignature(word, loc);
resultArray = wordToTagCounters[tag].get(sig);
if (resultArray != null) { // we have seen this signature with this tag
return resultArray;
}
// we have never seen the word or its signature
// System.err.println("We have never seen the word "+word+" or its signature "+sig+" with this tag. Returning prob 0.");
return new double[numSubStates[tag]];
}
/**
 * This condenses counting arrays into essential statistics. It is used
 * after all calls to trainTree and before any score calls.
 *
 * Currently the trees are taken into account immediately, so this only
 * collects the set of preterminal tags and prunes unlikely word-tag pairs;
 * in the future it may contain some precomputation.
 */
public void optimize() {
// make up the set of which tags are preterminal tags
for (short i = 0; i < wordToTagCounters.length; i++) {
if (wordToTagCounters[i] != null) {
allTags.add(i);
}
}
// remove the unlikely ones
removeUnlikelyTags(threshold, -1.0);
// // add MMT randomization if necessary
// if
// (randomInitializationType==Grammar.RandomInitializationType.INITIALIZE_LIKE_MMT)
// {
// Random r = new Random();
// for (short tag=0; tag<numSubStates.length; tag++) { ... }
// }
}
public void printTagCounter(Numberer tagNumberer) {
PriorityQueue<String> pq = new PriorityQueue<String>(tagCounter.length);
for (int i = 0; i < tagCounter.length; i++) {
pq.add((String) tagNumberer.object(i), tagCounter[i][0]);
// System.out.println(i+". "+(String)tagNumberer.object(i)+"\t "+symbolCounter.getCount(i,0));
}
int i = 0;
while (pq.hasNext()) {
i++;
int p = (int) pq.getPriority();
System.out.println(i + ". " + pq.next() + "\t " + p);
}
}
/**
 * Split all substates in two, producing a new lexicon. The new lexicon
 * gives the same scores to words under both split versions of the tag.
 * (Leon says: It may not be okay to use the same scores, but I think that
 * symmetry is sufficiently broken in Grammar.splitAllStates to ignore the
 * randomness here.)
 *
 * @param counts
 *            how many times each state was seen (currently ignored)
 * @param moreSubstatesThanCounts
 *            whether a state may receive more substates than observations
 *            (currently ignored)
 * @param mode
 *            the split mode (currently ignored)
 * @return a new lexicon with twice as many substates per tag (ROOT is
 *         never split)
 */
@SuppressWarnings("unchecked")
public SophisticatedLexicon splitAllStates(int[] counts,
boolean moreSubstatesThanCounts, int mode) {
short[] newNumSubStates = new short[numSubStates.length];
newNumSubStates[0] = 1; // never split ROOT
for (short i = 1; i < numSubStates.length; i++) {
// don't split a state into more substates than the number of times
// it was actually seen
// if (!moreSubstatesThanCounts && numSubStates[i]>=counts[i]) {
// newNumSubStates[i]=numSubStates[i];
// }
// else{
newNumSubStates[i] = (short) (numSubStates[i] * 2);
// }
}
SophisticatedLexicon lexicon = new SophisticatedLexicon(
newNumSubStates, this.smoothingCutoff, smooth, smoother,
this.threshold);
// copy and alter all data structures
lexicon.wordToTagCounters = new HashMap[numSubStates.length];
lexicon.unseenWordToTagCounters = new HashMap[numSubStates.length];
for (int tag = 0; tag < wordToTagCounters.length; tag++) {
if (wordToTagCounters[tag] != null) {
lexicon.wordToTagCounters[tag] = new HashMap();
for (String word : wordToTagCounters[tag].keySet()) {
lexicon.wordToTagCounters[tag].put(word,
new double[newNumSubStates[tag]]);
for (int substate = 0; substate < wordToTagCounters[tag]
.get(word).length; substate++) {
int splitFactor = 2;
if (newNumSubStates[tag] == numSubStates[tag]) {
splitFactor = 1;
}
for (int i = 0; i < splitFactor; i++) {
lexicon.wordToTagCounters[tag].get(word)[substate
* splitFactor + i] = (1.f / splitFactor)
* wordToTagCounters[tag].get(word)[substate];
}
}
}
}
}
for (int tag = 0; tag < unseenWordToTagCounters.length; tag++) {
if (unseenWordToTagCounters[tag] != null) {
lexicon.unseenWordToTagCounters[tag] = new HashMap();
for (String word : unseenWordToTagCounters[tag].keySet()) {
lexicon.unseenWordToTagCounters[tag].put(word,
new double[newNumSubStates[tag]]);
for (int substate = 0; substate < unseenWordToTagCounters[tag]
.get(word).length; substate++) {
int splitFactor = 2;
if (newNumSubStates[tag] == numSubStates[tag]) {
splitFactor = 1;
}
for (int i = 0; i < splitFactor; i++) {
lexicon.unseenWordToTagCounters[tag].get(word)[substate
* splitFactor + i] = (1.f / splitFactor)
* unseenWordToTagCounters[tag].get(word)[substate];
}
}
}
}
}
lexicon.totalWordTypes = totalWordTypes;
lexicon.totalTokens = totalTokens;
lexicon.totalUnseenTokens = totalUnseenTokens;
lexicon.totalWords = totalWords;
lexicon.smoother = smoother;
lexicon.typeTagCounter = new double[typeTagCounter.length][];
lexicon.tagCounter = new double[tagCounter.length][];
lexicon.unseenTagCounter = new double[unseenTagCounter.length][];
lexicon.simpleTagCounter = new double[tagCounter.length];
for (int tag = 0; tag < typeTagCounter.length; tag++) {
lexicon.typeTagCounter[tag] = new double[newNumSubStates[tag]];
lexicon.tagCounter[tag] = new double[newNumSubStates[tag]];
lexicon.unseenTagCounter[tag] = new double[newNumSubStates[tag]];
lexicon.simpleTagCounter[tag] = simpleTagCounter[tag];
for (int substate = 0; substate < typeTagCounter[tag].length; substate++) {
int splitFactor = 2;
if (newNumSubStates[tag] == numSubStates[tag]) {
splitFactor = 1;
}
for (int i = 0; i < splitFactor; i++) {
lexicon.typeTagCounter[tag][substate * splitFactor + i] = (1.f / splitFactor)
* typeTagCounter[tag][substate];
lexicon.tagCounter[tag][substate * splitFactor + i] = (1.f / splitFactor)
* tagCounter[tag][substate];
lexicon.unseenTagCounter[tag][substate * splitFactor + i] = (1.f / splitFactor)
* unseenTagCounter[tag][substate];
}
}
}
lexicon.allTags = new HashSet(allTags);
lexicon.wordCounter = new Counter();
for (String word : wordCounter.keySet()) {
lexicon.wordCounter.setCount(word, wordCounter.getCount(word));
}
lexicon.smoothingCutoff = smoothingCutoff;
lexicon.addXSmoothing = addXSmoothing;
lexicon.smoothInUnknownsThreshold = smoothInUnknownsThreshold;
lexicon.wordNumberer = wordNumberer;
return lexicon;
}
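// A sketch of the splitting arithmetic used above (hypothetical numbers,
// not part of the original class): with splitFactor 2, each substate's
// count is divided evenly between its two offspring.
private static double[] exampleSplitCounts() {
double[] counts = { 6.0, 2.0 }; // substate counts before the split
int splitFactor = 2;
double[] split = new double[counts.length * splitFactor];
for (int substate = 0; substate < counts.length; substate++) {
for (int i = 0; i < splitFactor; i++) {
split[substate * splitFactor + i] = counts[substate] / splitFactor;
}
}
return split; // {3.0, 3.0, 1.0, 1.0}
}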
/**
 * This routine returns a String that is the "signature" of the class of a
 * word. For example, it might represent whether the word contains a digit
 * or ends in -s. The strings returned by convention match the pattern
 * UNK-.*, which is assumed not to match any real word. Behavior depends on the
* unknownLevel (-uwm flag) passed in to the class. The recognized numbers
* are 1-5: 5 is fairly English-specific; 4, 3, and 2 look for various word
* features (digits, dashes, etc.) which are only vaguely English-specific;
* 1 uses the last two characters combined with a simple classification by
* capitalization.
*
* @param word
* The word to make a signature for
* @param loc
* Its position in the sentence (mainly so sentence-initial
* capitalized words can be treated differently)
* @return A String that is its signature (equivalence class)
*/
public String getSignature(String word, int loc) {
// int unknownLevel = Options.get().useUnknownWordSignatures;
StringBuffer sb = new StringBuffer("UNK");
if (word.length() == 0)
return sb.toString();
switch (unknownLevel) {
case 5: {
// Reformed Mar 2004 (cdm); hopefully much better now.
// { -CAPS, -INITC ap, -LC lowercase, 0 } +
// { -KNOWNLC, 0 } + [only for INITC]
// { -NUM, 0 } +
// { -DASH, 0 } +
// { -last lowered char(s) if known discriminating suffix, 0}
int wlen = word.length();
int numCaps = 0;
boolean hasDigit = false;
boolean hasDash = false;
boolean hasLower = false;
for (int i = 0; i < wlen; i++) {
char ch = word.charAt(i);
if (Character.isDigit(ch)) {
hasDigit = true;
} else if (ch == '-') {
hasDash = true;
} else if (Character.isLetter(ch)) {
if (Character.isLowerCase(ch)) {
hasLower = true;
} else if (Character.isTitleCase(ch)) {
hasLower = true;
numCaps++;
} else {
numCaps++;
}
}
}
char ch0 = word.charAt(0);
String lowered = word.toLowerCase();
if (Character.isUpperCase(ch0) || Character.isTitleCase(ch0)) {
if (loc == 0 && numCaps == 1) {
sb.append("-INITC");
if (isKnown(lowered)) {
sb.append("-KNOWNLC");
}
} else {
sb.append("-CAPS");
}
} else if (!Character.isLetter(ch0) && numCaps > 0) {
sb.append("-CAPS");
} else if (hasLower) { // (Character.isLowerCase(ch0)) {
sb.append("-LC");
}
if (hasDigit) {
sb.append("-NUM");
}
if (hasDash) {
sb.append("-DASH");
}
if (lowered.endsWith("s") && wlen >= 3) {
// here length 3, so you don't miss out on ones like 80s
char ch2 = lowered.charAt(wlen - 2);
// not -ess suffixes or greek/latin -us, -is
if (ch2 != 's' && ch2 != 'i' && ch2 != 'u') {
sb.append("-s");
}
} else if (word.length() >= 5 && !hasDash
&& !(hasDigit && numCaps > 0)) {
// don't do for very short words;
// Implement common discriminating suffixes
/*
* if (Corpus.myLanguage==Corpus.GERMAN){
* sb.append(lowered.substring(lowered.length()-1)); }else{
*/
if (lowered.endsWith("ed")) {
sb.append("-ed");
} else if (lowered.endsWith("ing")) {
sb.append("-ing");
} else if (lowered.endsWith("ion")) {
sb.append("-ion");
} else if (lowered.endsWith("er")) {
sb.append("-er");
} else if (lowered.endsWith("est")) {
sb.append("-est");
} else if (lowered.endsWith("ly")) {
sb.append("-ly");
} else if (lowered.endsWith("ity")) {
sb.append("-ity");
} else if (lowered.endsWith("y")) {
sb.append("-y");
} else if (lowered.endsWith("al")) {
sb.append("-al");
// } else if (lowered.endsWith("ble")) {
// sb.append("-ble");
// } else if (lowered.endsWith("e")) {
// sb.append("-e");
}
}
break;
}
case 4: {
boolean hasDigit = false;
boolean hasNonDigit = false;
boolean hasLetter = false;
boolean hasLower = false;
boolean hasDash = false;
boolean hasPeriod = false;
boolean hasComma = false;
for (int i = 0; i < word.length(); i++) {
char ch = word.charAt(i);
if (Character.isDigit(ch)) {
hasDigit = true;
} else {
hasNonDigit = true;
if (Character.isLetter(ch)) {
hasLetter = true;
if (Character.isLowerCase(ch)
|| Character.isTitleCase(ch)) {
hasLower = true;
}
} else {
if (ch == '-') {
hasDash = true;
} else if (ch == '.') {
hasPeriod = true;
} else if (ch == ',') {
hasComma = true;
}
}
}
}
// 6 way on letters
if (Character.isUpperCase(word.charAt(0))
|| Character.isTitleCase(word.charAt(0))) {
if (!hasLower) {
sb.append("-AC");
} else if (loc == 0) {
sb.append("-SC");
} else {
sb.append("-C");
}
} else if (hasLower) {
sb.append("-L");
} else if (hasLetter) {
sb.append("-U");
} else {
// no letter
sb.append("-S");
}
// 3 way on number
if (hasDigit && !hasNonDigit) {
sb.append("-N");
} else if (hasDigit) {
sb.append("-n");
}
// binary on period, dash, comma
if (hasDash) {
sb.append("-H");
}
if (hasPeriod) {
sb.append("-P");
}
if (hasComma) {
sb.append("-C");
}
if (word.length() > 3) {
// don't do for very short words: "yes" isn't an "-es" word
// try doing to lower for further densening and skipping digits
char ch = word.charAt(word.length() - 1);
if (Character.isLetter(ch)) {
sb.append("-");
sb.append(Character.toLowerCase(ch));
}
}
break;
}
case 3: {
// This basically works right, except note that 'S' is applied to all
// capitalized letters in the first word of a sentence, not just the
// first one....
sb.append("-");
char lastClass = '-'; // i.e., nothing
char newClass;
int num = 0;
for (int i = 0; i < word.length(); i++) {
char ch = word.charAt(i);
if (Character.isUpperCase(ch) || Character.isTitleCase(ch)) {
if (loc == 0) {
newClass = 'S';
} else {
newClass = 'L';
}
} else if (Character.isLetter(ch)) {
newClass = 'l';
} else if (Character.isDigit(ch)) {
newClass = 'd';
} else if (ch == '-') {
newClass = 'h';
} else if (ch == '.') {
newClass = 'p';
} else {
newClass = 's';
}
if (newClass != lastClass) {
lastClass = newClass;
sb.append(lastClass);
num = 1;
} else {
if (num < 2) {
sb.append('+');
}
num++;
}
}
if (word.length() > 3) {
// don't do for very short words: "yes" isn't an "-es" word
// try doing to lower for further densening and skipping digits
char ch = Character.toLowerCase(word.charAt(word.length() - 1));
sb.append('-');
sb.append(ch);
}
break;
}
case 2: {
// {-ALLC, -INIT, -UC, -LC, zero} +
// {-DASH, zero} +
// {-NUM, -DIG, zero} +
// {lowerLastChar, zeroIfShort}
boolean hasDigit = false;
boolean hasNonDigit = false;
boolean hasLower = false;
for (int i = 0; i < word.length(); i++) {
char ch = word.charAt(i);
if (Character.isDigit(ch)) {
hasDigit = true;
} else {
hasNonDigit = true;
if (Character.isLetter(ch)) {
if (Character.isLowerCase(ch)
|| Character.isTitleCase(ch)) {
hasLower = true;
}
}
}
}
if (Character.isUpperCase(word.charAt(0))
|| Character.isTitleCase(word.charAt(0))) {
if (!hasLower) {
sb.append("-ALLC");
} else if (loc == 0) {
sb.append("-INIT");
} else {
sb.append("-UC");
}
} else if (hasLower) { // if (Character.isLowerCase(word.charAt(0)))
// {
sb.append("-LC");
}
// no suffix = no (lowercase) letters
if (word.indexOf('-') >= 0) {
sb.append("-DASH");
}
if (hasDigit) {
if (!hasNonDigit) {
sb.append("-NUM");
} else {
sb.append("-DIG");
}
} else if (word.length() > 3) {
// don't do for very short words: "yes" isn't an "-es" word
// try doing to lower for further densening and skipping digits
char ch = word.charAt(word.length() - 1);
sb.append(Character.toLowerCase(ch));
}
// no suffix = short non-number, non-alphabetic
break;
}
default:
sb.append("-");
sb.append(word.substring(Math.max(word.length() - 2, 0), word
.length()));
sb.append("-");
if (Character.isLowerCase(word.charAt(0))) {
sb.append("LOWER");
} else {
if (Character.isUpperCase(word.charAt(0))) {
if (loc == 0) {
sb.append("INIT");
} else {
sb.append("UPPER");
}
} else {
sb.append("OTHER");
}
}
} // end switch (unknownLevel)
// System.err.println("Summarized " + word + " to " + sb.toString());
return sb.toString();
} // end getSignature()
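// Illustrative only (not part of the original class): tracing the level-5
// case above on a few words. Assuming none of the lowercased forms is a
// known word, this prints UNK-INITC-ing, UNK-CAPS-s and UNK-LC-NUM-DASH.
public void printExampleSignatures() {
System.out.println(getSignature("Waving", 0)); // sentence-initial capital
System.out.println(getSignature("Corporates", 3)); // capitalized plural
System.out.println(getSignature("85-year-old", 2)); // digits and dashes
}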
public double[] score(StateSet stateSet, short tag, boolean noSmoothing,
boolean isSignature) {
return score(stateSet.getWord(), tag, stateSet.from, noSmoothing,
isSignature);
}
/**
 * Get the score of this word with this tag at this loc. (Presumably an
 * estimate of P(word | tag).)
 *
 * Implementation documentation:
 * Seen:
 *   c_W = count(W)
 *   c_TW = count(T,W)
 *   c_T = count(T)
 *   c_Tunseen = count(T) among new words in 2nd half
 *   total = count(seen words)
 *   totalUnseen = count("unseen" words)
 *   p_T_U = Pmle(T|"unseen")
 *   pb_T_W = P(T|W): if (c_W > smoothInUnknownsThreshold) then c_TW/c_W,
 *            else (if not smart mutation) a Bayes prior smooth[1] with p_T_U
 *   p_T = Pmle(T)
 *   p_W = Pmle(W)
 *   pb_W_T = pb_T_W * p_W / p_T [Bayes rule]
 * Note that this doesn't really properly reserve mass to unknowns.
 *
 * Unseen:
 *   c_TS = count(T,Sig|Unseen)
 *   c_S = count(Sig)
 *   c_T = count(T|Unseen)
 *   c_U = totalUnseen above
 *   p_T_U = Pmle(T|Unseen)
 *   pb_T_S = Bayes smooth of Pmle(T|S) with P(T|Unseen) [smooth[0]]
 *   pb_W_T = P(W|T) inverted
 *
 * @param word
 *            the word to score
 * @param tag
 *            the POS tag to score it with
 * @param loc
 *            The position in the sentence. In the default implementation
 *            this is used only for unknown words to change their
 *            probability distribution when sentence initial
 * @param noSmoothing
 *            if true, use raw MLE estimates instead of smoothed ones
 * @param isSignature
 *            if true, word is already a signature, not a real word
 * @return an array of scores, one per substate, usually P(word|tag)
 */
public double[] score(String word, short tag, int loc, boolean noSmoothing,
boolean isSignature) {
if (isConditional)
return scoreConditional(word, tag, loc, noSmoothing, isSignature);
double c_W = wordCounter.getCount(word);
double pb_W_T = 0; // always set below
// simulate no smoothing
// smooth[0] = 0.0; smooth[1] = 0.0;
double[] resultArray = new double[numSubStates[tag]];
for (int substate = 0; substate < numSubStates[tag]; substate++) {
boolean seen = (c_W > 0.0);
if (!isSignature && (seen || noSmoothing)) {
// known word model for P(T|W)
double c_tag = tagCounter[tag][substate];
double c_T = c_tag;// seenCounter.getCount(iTW);
if (c_T == 0)
continue;
double c_TW = 0;
if (wordToTagCounters[tag] != null
&& wordToTagCounters[tag].get(word) != null) {
c_TW = wordToTagCounters[tag].get(word)[substate];
}
// if (c_TW==0) continue;
double c_Tunseen = unseenTagCounter[tag][substate];
double total = totalTokens;
double totalUnseen = totalUnseenTokens;
double p_T_U = (totalUnseen == 0) ? 1 : c_Tunseen / totalUnseen;
double pb_T_W; // always set below
// System.err.println("c_W is " + c_W + " THRESH is " +
// smoothInUnknownsThreshold + " mle = " + (c_TW/c_W));
if (c_W > smoothInUnknownsThreshold || noSmoothing) {
// we've seen the word enough times to have confidence in
// its tagging
if (noSmoothing && c_W == 0)
pb_T_W = c_TW / 1;
else
pb_T_W = (c_TW + 0.0001 * p_T_U) / (c_W + 0.0001);
// pb_T_W = c_TW / c_W;
// System.out.println("c_TW "+c_TW+" c_W "+c_W);
} else {
// we haven't seen the word enough times to have confidence
// in its tagging
pb_T_W = (c_TW + smooth[1] * p_T_U) / (c_W + smooth[1]);
// System.out.println("smoothed c_TW "+c_TW+" c_W "+c_W);
}
if (pb_T_W == 0)
continue;
// Sometimes we run up against unknown tags. This should only
// happen
// when we're calculating the likelihood for a given tree, not
// when
// we're parsing. In that case, return a LL of 0.
// NO NO NO, this is wrong, slav
// if (c_T==0) {
// resultArray[substate] = 1;
// continue;
// }
double p_T = (c_T / total);
double p_W = (c_W / total);
pb_W_T = pb_T_W * p_W / p_T;
} else {
// test against simple Chinese lexical constants
if (Corpus.myTreebank == Corpus.TreeBankType.CHINESE) {
Numberer tagNumberer = Numberer.getGlobalNumberer("tags");
double prob;
if (word.matches(ChineseLexicon.dateMatch)) {
// EncodingPrintWriter.out.println("Date match for " +
// word,encoding);
if (tag == tagNumberer.number("NT")) { // (tag.equals("NT"))
// {
prob = 1.0;
} else {
prob = 0.0;
}
Arrays.fill(resultArray, prob);
return resultArray;
} else if (word.matches(ChineseLexicon.numberMatch)) {
// EncodingPrintWriter.out.println("Number match for " +
// word,encoding);
if (tag == tagNumberer.number("CD") /* tag.equals("CD") */
&& (!word.matches(ChineseLexicon.ordinalMatch))) {
prob = 1.0;
} else if (tag == tagNumberer.number("OD") /*
* tag.equals
* ("OD")
*/
&& word.matches(ChineseLexicon.ordinalMatch)) {
prob = 1.0;
} else {
prob = 0.0;
}
Arrays.fill(resultArray, prob);
return resultArray;
} else if (word.matches(ChineseLexicon.properNameMatch)) {
// EncodingPrintWriter.out.println("Proper name match for "
// + word,encoding);
if (tag == tagNumberer.number("NR")) { // tag.equals("NR"))
// {
prob = 1.0;
} else {
prob = 0.0;
}
Arrays.fill(resultArray, prob);
return resultArray;
}
}
// unknown word model for P(T|S)
String sig = (isSignature) ? word : getCachedSignature(word,
loc);
// iTW.word = sig;
// double c_TS = unSeenCounter.getCount(iTW);
double c_TS = 0;
if (unseenWordToTagCounters[tag] != null
&& unseenWordToTagCounters[tag].get(sig) != null) {
c_TS = unseenWordToTagCounters[tag].get(sig)[substate];
}
// if (c_TS == 0) continue;
// how often did we see this signature
double c_S = wordCounter.getCount(sig);
double c_U = totalUnseenTokens;
double total = totalTokens; // seenCounter.getCount(iTW);
double c_T = unseenTagCounter[tag][substate];// unSeenCounter.getCount(iTW);
double c_Tseen = tagCounter[tag][substate]; // seenCounter.getCount(iTW);
double p_T_U = c_T / c_U;
if (unknownLevel == 0) {
c_TS = 0;
c_S = 0;
}
// System.out.println(" sig " + sig
// +" c_TS "+c_TS+" p_T_U "+p_T_U+" c_S "+c_S);
// smooth[0]=10;
double pb_T_S = (c_TS + smooth[0] * p_T_U) / (c_S + smooth[0]);
double p_T = (c_Tseen / total);
double p_W = 1.0 / total;
// if we've never before seen this tag, then just say the
// probability is 1
/*
* if (p_T == 0) { resultArray[substate] = 1; continue; }
 */
pb_W_T = pb_T_S * p_W / p_T;
}
// give very low scores when needed, but try to avoid -Infinity
if (pb_W_T == 0) {// NOT sure whether this is a good idea - slav
resultArray[substate] = 1e-87;
} else {
resultArray[substate] = pb_W_T;
}
}
smoother.smooth(tag, resultArray);
if (logarithmMode) {
for (int i = 0; i < resultArray.length; i++) {
resultArray[i] = Math.log(resultArray[i]);
if (Double.isNaN(resultArray[i]))
resultArray[i] = Double.NEGATIVE_INFINITY;
}
}
/*
 * double power = 1.0; // raise to the power for (int i=0;
 * i<resultArray.length; i++) resultArray[i] =
 * Math.pow(resultArray[i], power);
 */
return resultArray;
}
/*
 * public void tune(Collection<Tree<String>> trees) { double bestScore =
* Double.NEGATIVE_INFINITY; double[] bestSmooth = {0.0, 0.0}; for
* (smooth[0] = 1; smooth[0] <= 1; smooth[0] *= 2.0) {//64 for (smooth[1] =
* 0.2; smooth[1] <= 0.2; smooth[1] *= 2.0) {//3 //for (smooth[0]=0.5;
* smooth[0]<=64; smooth[0] *= 2.0) {//64 //for (smooth[1]=0.1;
* smooth[1]<=12.8; smooth[1] *= 2.0) {//3 double score = 0.0; //score =
* scoreAll(trees); if (Test.verbose) {
* System.out.println("Tuning lexicon: s0 " + smooth[0] + " s1 " + smooth[1]
* + " is " + score + " " + trees.size() + " trees."); } if (score >
* bestScore) { System.arraycopy(smooth, 0, bestSmooth, 0, smooth.length);
* bestScore = score; } } } System.arraycopy(bestSmooth, 0, smooth, 0,
* bestSmooth.length); if (smartMutation) { smooth[0] = 8.0; //smooth[1] =
* 1.6; //smooth[0] = 0.5; smooth[1] = 0.1; } if (Test.unseenSmooth > 0.0) {
* smooth[0] = Test.unseenSmooth; } if (Test.verbose) {
* System.out.println("Tuning selected smoothUnseen " + smooth[0] +
* " smoothSeen " + smooth[1] + " at " + bestScore); } }
*/
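// A worked instance (made-up numbers, not from the original source) of the
// seen-but-rare branch in score() above: the tag-given-word estimate is
// interpolated toward the unseen-word tag prior p_T_U with strength
// smooth[1].
private static double exampleSeenWordSmoothing() {
double c_TW = 2.0; // times the word was seen with this tag substate
double c_W = 3.0; // times the word was seen at all
double p_T_U = 0.05; // P(tag | unseen word)
double smoothSeen = 0.5; // plays the role of smooth[1]
// (2.0 + 0.5 * 0.05) / (3.0 + 0.5) = 2.025 / 3.5 ~ 0.579
return (c_TW + smoothSeen * p_T_U) / (c_W + smoothSeen);
}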
public Counter<String> getWordCounter() {
return wordCounter;
}
public void tieRareWordStats(int threshold) {
for (int ni = 0; ni < numSubStates.length; ni++) {
double unseenTagTokens = 0;
for (int si = 0; si < numSubStates[ni]; si++) {
unseenTagTokens += unseenTagCounter[ni][si];
}
if (unseenTagTokens == 0) {
continue;
}
for (Map.Entry<String, double[]> wordToTagEntry : wordToTagCounters[ni]
.entrySet()) {
String word = wordToTagEntry.getKey();
double[] substateCounter = wordToTagEntry.getValue();
if (wordCounter.getCount(word) < threshold+0.5) {
double wordTagTokens = 0;
for (int si = 0; si < numSubStates[ni]; si++) {
wordTagTokens += substateCounter[si];
}
for (int si = 0; si < numSubStates[ni]; si++) {
substateCounter[si] = unseenTagCounter[ni][si] * wordTagTokens / unseenTagTokens;
}
}
}
}
}
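// A sketch of the tying above (hypothetical numbers): a rare word keeps its
// total count for a tag, but that count is redistributed across substates
// in proportion to the unseen-word substate counts.
private static double[] exampleTiedCounts() {
double[] unseenBySubstate = { 30.0, 10.0 }; // like unseenTagCounter[tag]
double wordTagTokens = 4.0; // the rare word's total count with the tag
double unseenTotal = 40.0; // sum of unseenBySubstate
double[] tied = new double[unseenBySubstate.length];
for (int si = 0; si < tied.length; si++) {
tied[si] = unseenBySubstate[si] * wordTagTokens / unseenTotal;
}
return tied; // {3.0, 1.0}: same 3:1 proportions, same total of 4
}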
/**
 * Trains this lexicon on a single StateSet tree. When randomness is -1,
 * each (tag substate, word) occurrence is weighted by its posterior
 * probability under the old grammar; otherwise occurrences get weight 1,
 * plus optional random noise.
 */
public void trainTree(Tree<StateSet> trainTree, double randomness,
Lexicon oldLexicon, boolean secondHalf, boolean noSmoothing,
int threshold) {
// scan data
//for all substates that the word's preterminal tag has
double sentenceScore = 0;
if (randomness == -1) {
sentenceScore = trainTree.getLabel().getIScore(0);
if (sentenceScore == 0) {
System.out.println("Something is wrong with this tree. I will skip it.");
return;
}
}
int sentenceScale = trainTree.getLabel().getIScale();
List<StateSet> words = trainTree.getYield();
List<StateSet> tags = trainTree.getPreTerminalYield();
if (words.size() != tags.size()) {
System.out.println("Yield and preterminal yield do not match!");
System.out.println(words.toString());
System.out.println(tags.toString());
}
Counter oldWordCounter = null;
if (oldLexicon != null) {
oldWordCounter = oldLexicon.getWordCounter();
}
//for all words in sentence
for (int position = 0; position < words.size(); position++) {
totalWords++;
String word = words.get(position).getWord();
int nSubStates = tags.get(position).numSubStates();
short tag = tags.get(position).getState();
String sig = getCachedSignature(word, position);
wordCounter.incrementCount(sig, 0);
if (unseenWordToTagCounters[tag] == null) {
unseenWordToTagCounters[tag] = new HashMap();
}
double[] substateCounter2 = unseenWordToTagCounters[tag].get(sig);
if (substateCounter2 == null) {
//System.out.print("Sig "+sig+" word "+ word+" pos "+position);
substateCounter2 = new double[numSubStates[tag]];
unseenWordToTagCounters[tag].put(sig, substateCounter2);
}
//guarantee that the wordToTagCounter element exists so we can tally the combination
if (wordToTagCounters[tag] == null) {
wordToTagCounters[tag] = new HashMap();
}
double[] substateCounter = wordToTagCounters[tag].get(word);
if (substateCounter == null) {
substateCounter = new double[numSubStates[tag]];
wordToTagCounters[tag].put(word, substateCounter);
}
double[] oldLexiconScores = null;
if (randomness == -1) {
oldLexiconScores = oldLexicon.score(word, tag, position, noSmoothing, false);
}
StateSet currentState = tags.get(position);
double scale = ScalingTools.calcScaleFactor(currentState.getOScale() - sentenceScale) / sentenceScore;
//double weightSum = 0;
for (short substate = 0; substate < nSubStates; substate++) {
double weight = 1;
if (randomness == -1) {
//weight by the probability of seeing the tag and word together, given the sentence
if (!Double.isInfinite(scale)) {
weight = currentState.getOScore(substate) * oldLexiconScores[substate] * scale;
} else {
weight = Math.exp(Math.log(ScalingTools.SCALE) * (currentState.getOScale() - sentenceScale) - Math.log(sentenceScore) + Math.log(currentState.getOScore(
substate)) + Math.log(oldLexiconScores[substate]));
}
//weightSum+=weight;
} else if (randomness == 0) {
// for the baseline
weight = 1;
} else {
//add a bit of randomness
weight = GrammarTrainer.RANDOM.nextDouble() * randomness / 100.0 + 1.0;
}
if (weight == 0) {
continue;
}
//tally in the tag with the given weight
substateCounter[substate] += weight;
// update the counters
tagCounter[tag][substate] += weight;
wordCounter.incrementCount(word, weight);
totalTokens += weight;
if (Double.isNaN(totalTokens)) {
throw new Error("totalTokens is NaN: this would fail if we let it continue!");
}
if (oldLexicon != null && oldWordCounter.getCount(word) < threshold+0.5) {
wordCounter.incrementCount(sig, weight);
substateCounter2[substate] += weight;
unseenTagCounter[tag][substate] += weight;
totalUnseenTokens += weight;
}
// if (secondHalf) {
// // start doing this once we're halfway through the trees
// // it's an entirely unknown word
// if (wordCounter.getCount(word) < 2) {
// wordCounter.incrementCount(sig, weight);
//
// if (unseenWordToTagCounters[tag] == null) {
// unseenWordToTagCounters[tag] = new HashMap();
// }
// substateCounter = unseenWordToTagCounters[tag].get(sig);
// if (substateCounter == null) {
// //System.out.print("Sig "+sig+" word "+ word+" pos "+position);
// substateCounter = new double[numSubStates[tag]];
// unseenWordToTagCounters[tag].put(sig, substateCounter);
// }
//
// substateCounter[substate] += weight;
// unseenTagCounter[tag][substate] += weight;
// totalUnseenTokens += weight;
// } else {
// }
// }
}
}
}
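// The E-step weighting above, reduced to its essentials (hypothetical
// numbers, ignoring the scaling machinery): the fractional count for a
// (tag substate, word) pair is the outside score times the word score,
// normalized by the sentence probability.
private static double examplePosteriorWeight() {
double outsideScore = 0.2; // P(context of the tag substate)
double wordScore = 0.01; // P(word | tag substate), from the old lexicon
double sentenceScore = 0.002; // total probability of the sentence
return outsideScore * wordScore / sentenceScore; // 1.0 expected count
}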
/**
 * Returns the signature of the given word, i.e. the String representation
 * of its unknown-word features. Caches the last signature computed, since
 * the same word is typically looked up many times in a row.
 */
protected String getCachedSignature(String word, int sentencePosition) {
if (word == null)
return lastWordToSignaturize;
if (word.equals(lastWordToSignaturize)
&& sentencePosition == lastSentencePosition) {
// System.err.println("Signature: cache mapped " + wordIndex +
// " to " + lastSignatureIndex);
return lastSignature;
} else {
String uwSig = getSignature(word, sentencePosition);
lastSignature = uwSig;
lastSentencePosition = sentencePosition;
lastWordToSignaturize = word;
return uwSig;
}
}
/**
* Merge states, combining information about words we have seen. THIS DOES
* NOT UPDATE INFORMATION FOR UNSEEN WORDS! For that, retrain the Lexicon!
*
* @param mergeThesePairs
* @param mergeWeights
*/
public void mergeStates(boolean[][][] mergeThesePairs,
double[][] mergeWeights) {
short[] newNumSubStates = new short[numSubStates.length];
short[][] mapping = new short[numSubStates.length][];
// invariant: if partners[state][substate][0] == substate, it's the 1st
// one
short[][][] partners = new short[numSubStates.length][][];
Grammar.calculateMergeArrays(mergeThesePairs, newNumSubStates, mapping,
partners, numSubStates);
for (int tag = 0; tag < mergeThesePairs.length; tag++) {
// update wordToTagCounters
if (wordToTagCounters[tag] != null) {
for (String word : wordToTagCounters[tag].keySet()) {
double[] scores = wordToTagCounters[tag].get(word);
double[] newScores = new double[newNumSubStates[tag]];
for (int i = 0; i < numSubStates[tag]; i++) {
short nSplit = (short) partners[tag][i].length;
if (nSplit == 2) {
newScores[mapping[tag][i]] = scores[partners[tag][i][0]]
+ scores[partners[tag][i][1]];
} else {
newScores[mapping[tag][i]] = scores[i];
}
}
wordToTagCounters[tag].put(word, newScores);
}
}
// update tag counter
double[] newTagCounter = new double[newNumSubStates[tag]];
for (int i = 0; i < numSubStates[tag]; i++) {
if (partners[tag][i].length == 2) {
newTagCounter[mapping[tag][i]] = tagCounter[tag][partners[tag][i][0]]
+ tagCounter[tag][partners[tag][i][1]];
} else {
newTagCounter[mapping[tag][i]] = tagCounter[tag][i];
}
}
tagCounter[tag] = newTagCounter;
}
numSubStates = newNumSubStates;
}
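// A sketch of the merging arithmetic above (hypothetical numbers): the
// counts of two partner substates are summed into one merged substate,
// while unmerged substates are copied through.
private static double[] exampleMergedCounts() {
double[] counts = { 3.0, 1.0, 2.0 }; // substates 0 and 1 are partners
double[] merged = new double[2];
merged[0] = counts[0] + counts[1]; // 4.0
merged[1] = counts[2]; // 2.0
return merged;
}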
public Map<String, double[][]> getUnseenScores() {
Map<String, double[][]> map = new HashMap<String, double[][]>();
for (int tag = 0; tag < unseenWordToTagCounters.length; tag++) {
if (unseenWordToTagCounters[tag] != null) {
for (String sig : unseenWordToTagCounters[tag].keySet()) {
double[][] sigScores = map.get(sig);
if (sigScores == null) {
sigScores = new double[numSubStates.length][];
map.put(sig, sigScores);
}
sigScores[tag] = new double[numSubStates[tag]];
for (int substate = 0; substate < numSubStates[tag]; substate++) {
double c_TS = 0;
if (unseenWordToTagCounters[tag].get(sig) != null) {
c_TS = unseenWordToTagCounters[tag].get(sig)[substate];
}
// how often did we see this signature
double c_S = wordCounter.getCount(sig);
double c_U = totalUnseenTokens;
double total = totalTokens; // seenCounter.getCount(iTW);
double c_T = unseenTagCounter[tag][substate];// unSeenCounter.getCount(iTW);
double c_Tseen = tagCounter[tag][substate]; // seenCounter.getCount(iTW);
double p_T_U = c_T / c_U;
if (unknownLevel == 0) {
c_TS = 0;
c_S = 0;
}
double pb_T_S = (c_TS + smooth[0] * p_T_U)
/ (c_S + smooth[0]);
double p_T = (c_Tseen / total);
double p_W = 1.0 / total;
// if we've never before seen this tag, then just say
// the probability is 1
if (p_T == 0) {
sigScores[tag][substate] = 1;
continue;
}
double pb_W_T = pb_T_S * p_W / p_T;
sigScores[tag][substate] = pb_W_T;
}
}
}
}
return map;
}
public void removeUnlikelyTags(double threshold, double exponent) {
// System.out.print("Removing unlikely tags...");
if (isLogarithmMode())
threshold = Math.log(threshold);
int removed = 0, total = 0;
if (isConditional) {
for (int i = 0; i < conditionalWeights.length; i++) {
for (int j = 0; j < conditionalWeights[i].length; j++) {
if (conditionalWeights[i][j] == null)
continue;
for (int k = 0; k < conditionalWeights[i][j].length; k++) {
total++;
if (conditionalWeights[i][j][k] < threshold) {
conditionalWeights[i][j][k] = 0;
removed++;
}
}
}
}
} else {
for (int tag = 0; tag < numSubStates.length; tag++) {
double[] c_TW;
if (wordToTagCounters[tag] != null) {
for (String word : wordToTagCounters[tag].keySet()) {
c_TW = wordToTagCounters[tag].get(word);
for (int substate = 0; substate < numSubStates[tag]; substate++) {
total++;
if (c_TW[substate] < threshold) {
c_TW[substate] = 0;
removed++;
}
}
}
}
}
/*
 * if (unseenWordToTagCounters[tag]!=null){ for (String word :
 * unseenWordToTagCounters[tag].keySet()){ c_TW =
 * unseenWordToTagCounters[tag].get(word); for (int substate=0;
 * substate<numSubStates[tag]; substate++){ total++; if
 * (c_TW[substate]<threshold) { c_TW[substate]=0; removed++; } } } }
 */
}
// System.out.print(" done. Removed "+removed+" of "+total+" word tags.");
}
public boolean isLogarithmMode() {
return logarithmMode;
}
/**
 * Projects the lexicon onto a coarser grammar: substate counts are mapped
 * through toSubstateMapping and weighted by the conditional substate
 * probabilities in condProbs.
 */
public SophisticatedLexicon projectLexicon(double[] condProbs,
int[][] mapping, int[][] toSubstateMapping) {
short[] newNumSubStates = new short[numSubStates.length];
for (int state = 0; state < numSubStates.length; state++) {
newNumSubStates[state] = (short) toSubstateMapping[state][0];
}
SophisticatedLexicon newLexicon = new SophisticatedLexicon(
newNumSubStates, this.smoothingCutoff, smooth, smoother,
this.threshold);
double[][] newTagCounter = new double[newNumSubStates.length][];
double[][] newUnseenTagCounter = new double[newNumSubStates.length][];
if (!isConditional) {
for (int tag = 0; tag < numSubStates.length; tag++) {
// project the tag counters
newTagCounter[tag] = new double[newNumSubStates[tag]];
newUnseenTagCounter[tag] = new double[newNumSubStates[tag]];
for (int substate = 0; substate < numSubStates[tag]; substate++) {
newTagCounter[tag][toSubstateMapping[tag][substate + 1]] += condProbs[mapping[tag][substate]]
* tagCounter[tag][substate];
newUnseenTagCounter[tag][toSubstateMapping[tag][substate + 1]] += condProbs[mapping[tag][substate]]
* unseenTagCounter[tag][substate];
}
// update wordToTagCounters
if (wordToTagCounters[tag] != null) {
newLexicon.wordToTagCounters[tag] = new HashMap<String, double[]>();
for (String word : wordToTagCounters[tag].keySet()) {
double[] scores = wordToTagCounters[tag].get(word);
double[] newScores = new double[newNumSubStates[tag]];
for (int i = 0; i < numSubStates[tag]; i++) {
newScores[toSubstateMapping[tag][i + 1]] += condProbs[mapping[tag][i]]
* scores[i];
}
newLexicon.wordToTagCounters[tag].put(word, newScores);
}
}
// update wordToTagCounters
if (unseenWordToTagCounters[tag] != null) {
newLexicon.unseenWordToTagCounters[tag] = new HashMap();
for (String word : unseenWordToTagCounters[tag].keySet()) {
double[] scores = unseenWordToTagCounters[tag]
.get(word);
double[] newScores = new double[newNumSubStates[tag]];
for (int i = 0; i < numSubStates[tag]; i++) {
newScores[toSubstateMapping[tag][i + 1]] += condProbs[mapping[tag][i]]
* scores[i];
}
newLexicon.unseenWordToTagCounters[tag].put(word,
newScores);
}
}
}
} else {
double[][][] newCondWeights = new double[conditionalWeights.length][conditionalWeights[0].length][];
for (int w = 0; w < newCondWeights.length; w++) {
if (conditionalWeights[w] == null)
continue;
for (int tag = 0; tag < numSubStates.length; tag++) {
if (conditionalWeights[w][tag] == null)
continue;
newCondWeights[w][tag] = new double[newNumSubStates[tag]];
for (int substate = 0; substate < numSubStates[tag]; substate++) {
newCondWeights[w][tag][toSubstateMapping[tag][substate + 1]] += condProbs[mapping[tag][substate]]
* conditionalWeights[w][tag][substate];
}
}
}
newLexicon.conditionalWeights = newCondWeights;
newLexicon.isConditional = true;
}
newLexicon.totalWordTypes = totalWordTypes;
newLexicon.totalTokens = totalTokens;
newLexicon.totalUnseenTokens = totalUnseenTokens;
newLexicon.totalWords = totalWords;
// newLexicon.smoother = smoother;
newLexicon.allTags = new HashSet(allTags);
newLexicon.wordCounter = new Counter();
for (String word : wordCounter.keySet()) {
newLexicon.wordCounter.setCount(word, wordCounter.getCount(word));
}
newLexicon.smoothingCutoff = smoothingCutoff;
newLexicon.addXSmoothing = addXSmoothing;
newLexicon.smoothInUnknownsThreshold = smoothInUnknownsThreshold;
newLexicon.tagCounter = newTagCounter;
newLexicon.unseenTagCounter = newUnseenTagCounter;
newLexicon.numSubStates = newNumSubStates;
newLexicon.wordNumberer = wordNumberer;
newLexicon.unknownLevel = unknownLevel;
return newLexicon;
}
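// A sketch of the projection arithmetic above (hypothetical numbers):
// substates collapsing into one projected substate contribute their counts
// weighted by their conditional probabilities.
private static double exampleProjectedCount() {
double[] condProbs = { 0.75, 0.25 }; // P(substate | projected state)
double[] counts = { 8.0, 4.0 }; // counts of the two substates
double projected = 0;
for (int i = 0; i < counts.length; i++) {
projected += condProbs[i] * counts[i];
}
return projected; // 0.75*8 + 0.25*4 = 7.0
}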
public SophisticatedLexicon copyLexicon() {
short[] newNumSubStates = numSubStates.clone();
SophisticatedLexicon newLexicon = new SophisticatedLexicon(
newNumSubStates, this.smoothingCutoff, this.smooth,
this.smoother, this.threshold);
double[][] newTagCounter = ArrayUtil.copy(tagCounter);
double[][] newUnseenTagCounter = ArrayUtil.copy(unseenTagCounter);
for (int tag = 0; tag < numSubStates.length; tag++) {
if (wordToTagCounters[tag] != null) {
newLexicon.wordToTagCounters[tag] = new HashMap();
for (String word : wordToTagCounters[tag].keySet()) {
double[] scores = wordToTagCounters[tag].get(word);
double[] newScores = scores.clone();
newLexicon.wordToTagCounters[tag].put(word, newScores);
}
}
// update wordToTagCounters
if (unseenWordToTagCounters[tag] != null) {
newLexicon.unseenWordToTagCounters[tag] = new HashMap();
for (String word : unseenWordToTagCounters[tag].keySet()) {
double[] scores = unseenWordToTagCounters[tag].get(word);
double[] newScores = scores.clone();
newLexicon.unseenWordToTagCounters[tag]
.put(word, newScores);
}
}
}
if (conditionalWeights != null)
newLexicon.conditionalWeights = conditionalWeights.clone();
newLexicon.isConditional = isConditional;
newLexicon.totalWordTypes = totalWordTypes;
newLexicon.totalTokens = totalTokens;
newLexicon.totalUnseenTokens = totalUnseenTokens;
newLexicon.totalWords = totalWords;
newLexicon.smoother = smoother;
newLexicon.allTags = new HashSet(allTags);
newLexicon.wordCounter = new Counter();
for (String word : wordCounter.keySet()) {
newLexicon.wordCounter.setCount(word, wordCounter.getCount(word));
}
newLexicon.smoothingCutoff = smoothingCutoff;
newLexicon.addXSmoothing = addXSmoothing;
newLexicon.smoothInUnknownsThreshold = smoothInUnknownsThreshold;
newLexicon.tagCounter = newTagCounter;
newLexicon.unseenTagCounter = newUnseenTagCounter;
newLexicon.numSubStates = newNumSubStates;
newLexicon.wordNumberer = this.wordNumberer;
newLexicon.unknownLevel = this.unknownLevel;
return newLexicon;
}
public int getNumberOfEntries() {
int nEntries = 0;
if (conditionalWeights == null) {
// indicates first time use:
for (String word : wordCounter.keySet()) { // has all words AND also
// the signatures
wordNumberer.number(word);
}
}
for (int tag = 0; tag < wordToTagCounters.length; tag++) {
if (wordToTagCounters[tag] != null) {
nEntries += wordToTagCounters[tag].size() * numSubStates[tag];
if (conditionalWeights == null) {
for (String word : wordToTagCounters[tag].keySet())
wordNumberer.number(word);
}
}
if (unseenWordToTagCounters[tag] != null) {
nEntries += unseenWordToTagCounters[tag].size()
* numSubStates[tag];
if (conditionalWeights == null) {
for (String word : unseenWordToTagCounters[tag].keySet())
wordNumberer.number(word);
}
}
}
if (conditionalWeights == null) {
conditionalWeights = new double[wordNumberer.total()][numSubStates.length][];
}
return nEntries;
}
// public Pair<double[], int[][]> getLinearizedLexicon(){
// return getLinearizedLexicon(getNumberOfEntries());
// }
//
// public Pair<double[], int[][]> getLinearizedLexicon(int n){
// if (isConditional) {
// System.out.println("Do not have the functionality to linearize a conditional lexicon!");
// return new Pair<double[], int[][]>(null,null);
// }
// double[] probs = new double[n];
// int[][] startIndex = new int[wordNumberer.total()][numSubStates.length];
// ArrayUtil.fill(startIndex,Integer.MIN_VALUE);
// int ind = 0;
// for (int tag=0; tag<wordToTagCounters.length; tag++){ ... }
// return new Pair<double[], int[][]>(probs,startIndex);
// }
public void delinearizeLexicon(double[] probs) {
int ind = 0;
// Numberer wordNumberer = Numberer.getGlobalNumberer("words");
for (int tag = 0; tag < wordToTagCounters.length; tag++) {
if (wordToTagCounters[tag] != null) {
for (String word : wordToTagCounters[tag].keySet()) {
double[] scores = new double[numSubStates[tag]];
for (int i = 0; i < scores.length; i++) {
double val = probs[ind++];// Math.exp(); //probs[ind++]
val = (val == -1000) ? 0 : Math.exp(val);
if (SloppyMath.isVeryDangerous(val)) {
if (Double.isNaN(probs[ind - 1]))
val = 1.0e-50;
else
val = probs[ind - 1];
// System.out.println("word " +word+" tag "+tag);
// System.out.println("Optimizer proposed Inf. Setting to probs: "
// +val);
}
scores[i] = val;
}
conditionalWeights[wordNumberer.number(word)][tag] = scores;
}
}
if (unseenWordToTagCounters[tag] != null) {
for (String word : unseenWordToTagCounters[tag].keySet()) {
double[] scores = new double[numSubStates[tag]];
for (int i = 0; i < scores.length; i++) {
double val = probs[ind++];// Math.exp(); //probs[ind++]
val = (val == -1000) ? 0 : Math.exp(val);
if (SloppyMath.isVeryDangerous(val)) {
if (Double.isNaN(probs[ind - 1]))
val = 1.0e-50;
else
val = probs[ind - 1];
// System.out.println("word " +word+" tag "+tag);
// System.out.println("Optimizer proposed Inf. Setting to probs: "
// +val);
}
scores[i] = val;
}
conditionalWeights[wordNumberer.number(word)][tag] = scores;
}
}
}
this.isConditional = true;
}
public void setConditional(boolean b) {
this.isConditional = b;
}
public double[] scoreConditional(String word, short tag, int loc,
boolean noSmoothing, boolean isSignature) {
if (isSignature)
return getConditionalSignatureScore(word, tag, noSmoothing);
else if (!isKnown(word))
return getConditionalSignatureScore(getCachedSignature(word, loc),
tag, noSmoothing);
// else if(!isKnown(word)) return getConditionalSignatureScore("#UNK#",
// tag, noSmoothing);
// else if(isKnown(word))return getConditionalSignatureScore(word, tag,
// noSmoothing);
double[] resultArray = new double[numSubStates[tag]];
double[] wordScore = getConditionalWordScore(word, tag, noSmoothing);
String sig = getCachedSignature(word, loc);
double[] sigScore = getConditionalSignatureScore(sig, tag, noSmoothing);
for (int i = 0; i < resultArray.length; i++) {
resultArray[i] = wordScore[i] + sigScore[i];
}
return resultArray;
}
public double[] getConditionalSignatureScore(String sig, short tag,
boolean noSmoothing) {
double[] resultArray = new double[numSubStates[tag]];
int ind = wordNumberer.number(sig);
if (ind >= conditionalWeights.length) {
System.out
.println(" We have a problem! sig " + sig + " ind " + ind);
return resultArray;
}
double[] tmpArray = conditionalWeights[ind][tag];
if (tmpArray != null) {
for (int i = 0; i < resultArray.length; i++) {
resultArray[i] += tmpArray[i];
}
}
if (this.isLogarithmMode()) {
for (int i = 0; i < resultArray.length; i++) {
resultArray[i] = Math.log(resultArray[i]);
}
}
return resultArray;
}
public double[] getConditionalWordScore(String word, short tag,
boolean noSmoothing) {
double[] resultArray = new double[numSubStates[tag]];
int ind = wordNumberer.number(word);
double[] tmpArray = conditionalWeights[ind][tag];
if (tmpArray != null) {
for (int i = 0; i < resultArray.length; i++) {
resultArray[i] = tmpArray[i];
}
}
if (this.isLogarithmMode()) {
for (int i = 0; i < resultArray.length; i++) {
resultArray[i] = Math.log(resultArray[i]);
}
}
return resultArray;
}
class ChineseLexicon implements Serializable {
private static final long serialVersionUID = 1L;
private static final String encoding = "GB18030"; // used only for
// debugging
/*
 * These strings are stored as ASCII-style Unicode escapes. To edit them,
 * either use the Unicode codes directly, or use native2ascii or a similar
 * program to convert the file into a Chinese encoding, then convert back.
 */
public static final String dateMatch = ".*[\u5e74\u6708\u65e5\u53f7]$";
public static final String numberMatch = ".*[\uff10\uff11\uff12\uff13\uff14\uff15\uff16\uff17\uff18\uff19\uff11\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e\u5343\u4e07\u4ebf].*";
public static final String ordinalMatch = "^\u7b2c.*";
public static final String properNameMatch = ".*\u00b7.*";
}
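// Illustrative only (not part of the original class): how the patterns
// above are consulted in score(). "1997\u5e74" (a year expression) ends in
// \u5e74 and so matches dateMatch; "\u7b2c\u4e00" ("first") starts with
// \u7b2c and so matches ordinalMatch.
static boolean exampleChinesePatterns() {
return "1997\u5e74".matches(ChineseLexicon.dateMatch)
&& "\u7b2c\u4e00".matches(ChineseLexicon.ordinalMatch);
}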
public void setSmoother(Smoother smoother) {
this.smoother = smoother;
}
public Smoother getSmoother() {
return smoother;
}
public double[] getSmoothingParams() {
return smooth;
}
public double getPruningThreshold() {
return threshold;
}
/*
* (non-Javadoc)
*
* @see edu.berkeley.nlp.PCFGLA.Lexicon#getLinearizedLexicon()
*/
public double[] getLinearizedLexicon() {
// TODO Auto-generated method stub
return null;
}
/*
* (non-Javadoc)
*
* @see edu.berkeley.nlp.PCFGLA.Lexicon#getLinearIndex(java.lang.String,
* int)
*/
public int getLinearIndex(String word, int tag) {
// TODO Auto-generated method stub
return 0;
}
/*
* (non-Javadoc)
*
* @see edu.berkeley.nlp.PCFGLA.Lexicon#clearMapping()
*/
public void clearMapping() {
// TODO Auto-generated method stub
}
/*
* (non-Javadoc)
*
* @see edu.berkeley.nlp.PCFGLA.Lexicon#scoreSignature(java.lang.String,
* int, int)
*/
public double[] scoreSignature(StateSet stateSet, int tag) {
// TODO Auto-generated method stub
return null;
}
/*
* (non-Javadoc)
*
* @see edu.berkeley.nlp.PCFGLA.Lexicon#scoreWord(java.lang.String, int)
*/
public double[] scoreWord(StateSet stateSet, int tag) {
// TODO Auto-generated method stub
return null;
}
public void explicitlyComputeScores(int finalLevel) {
// TODO Auto-generated method stub
}
@SuppressWarnings("unchecked")
public SophisticatedLexicon remapStates(Numberer thisNumberer,
Numberer newNumberer) {
SophisticatedLexicon remappedLexicon = copyLexicon();
remappedLexicon.wordToTagCounters = new HashMap[newNumberer.size()];
remappedLexicon.unseenWordToTagCounters = new HashMap[newNumberer
.size()];
remappedLexicon.typeTagCounter = new double[newNumberer.size()][];
remappedLexicon.tagCounter = new double[newNumberer.size()][];
remappedLexicon.unseenTagCounter = new double[newNumberer.size()][];
remappedLexicon.simpleTagCounter = new double[newNumberer.size()];
remappedLexicon.allTags = new HashSet();
remappedLexicon.numSubStates = new short[newNumberer.size()];
remappedLexicon.smoother = smoother.remapStates(thisNumberer,
newNumberer);
if (conditionalWeights != null) {
for (int w = 0; w < conditionalWeights.length; w++) {
remappedLexicon.conditionalWeights[w] = new double[newNumberer
.size()][];
}
}
for (short s = 0; s < newNumberer.size(); s++) {
short translatedState = translateState(s, newNumberer, thisNumberer);
if (translatedState >= 0) {
remappedLexicon.wordToTagCounters[s] = wordToTagCounters[translatedState];
remappedLexicon.unseenWordToTagCounters[s] = unseenWordToTagCounters[translatedState];
remappedLexicon.typeTagCounter[s] = typeTagCounter[translatedState];
remappedLexicon.tagCounter[s] = tagCounter[translatedState];
remappedLexicon.unseenTagCounter[s] = unseenTagCounter[translatedState];
remappedLexicon.simpleTagCounter[s] = simpleTagCounter[translatedState];
if (allTags.contains(translatedState))
remappedLexicon.allTags.add(s);
remappedLexicon.numSubStates[s] = numSubStates[translatedState];
if (conditionalWeights != null) {
for (int w = 0; w < conditionalWeights.length; w++) {
remappedLexicon.conditionalWeights[w][s] = conditionalWeights[w][translatedState];
}
}
} else {
remappedLexicon.wordToTagCounters[s] = new HashMap();
remappedLexicon.unseenWordToTagCounters[s] = new HashMap();
remappedLexicon.typeTagCounter[s] = new double[1];
remappedLexicon.tagCounter[s] = new double[1];
remappedLexicon.unseenTagCounter[s] = new double[1];
remappedLexicon.numSubStates[s] = 1;
}
}
return remappedLexicon;
}
private short translateState(int state, Numberer baseNumberer,
Numberer translationNumberer) {
Object object = baseNumberer.object(state);
if (translationNumberer.hasSeen(object)) {
return (short) translationNumberer.number(object);
} else {
return (short) -1;
}
}
}