/*
 * edu.berkeley.nlp.lm.WordIndexer, from the berkeleylm artifact
 * (available via Maven / Gradle / Ivy): an N-gram Language Model Library
 * from UC Berkeley.
 */
package edu.berkeley.nlp.lm;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;

/**
 * Enumerates words in the vocabulary of a language model. Stores a two-way
 * mapping between integers and words.
 * 
 * @author adampauls
 * 
 * @param <W>
 *            A type representing words in the language. Can be a
 *            <code>String</code>, or something more complex if needed
 */
public interface WordIndexer<W> extends Serializable
{

	/**
	 * Gets the index for a word, adding the word to the vocabulary if
	 * necessary.
	 * 
	 * @param word
	 *            the word to look up (and add if it is not yet present)
	 * @return the index of the word
	 */
	public int getOrAddIndex(W word);

	/**
	 * Gets the index for a word given as a string, adding it to the
	 * vocabulary if necessary. How the string is turned into a word object
	 * is up to the implementation.
	 * 
	 * @param word
	 *            string form of the word to look up (and add if it is not
	 *            yet present)
	 * @return the index of the word
	 */
	public int getOrAddIndexFromString(String word);

	/**
	 * Should never add to vocabulary, and should return the index of
	 * {@link #getUnkSymbol()} if the word is not in the vocabulary.
	 * 
	 * @param word
	 *            the word to look up
	 * @return the index of the word, or the index standing for the unk
	 *         symbol if the word is unknown
	 */
	public int getIndexPossiblyUnk(W word);

	/**
	 * Gets the word object for an index.
	 * 
	 * @param index
	 *            an index previously handed out by this indexer
	 * @return the word stored under that index
	 */
	public W getWord(int index);

	/**
	 * Number of words that have been added so far.
	 * 
	 * @return the current vocabulary size
	 */
	public int numWords();

	/**
	 * Returns the start symbol (usually something like {@literal <s>}).
	 * 
	 * @return the start-of-sentence symbol
	 */
	public W getStartSymbol();

	/**
	 * Sets the start symbol.
	 * 
	 * @param sym
	 *            the word object to use as the start symbol
	 */
	public void setStartSymbol(W sym);

	/**
	 * Returns the end symbol (usually something like {@literal </s>}).
	 * 
	 * @return the end-of-sentence symbol
	 */
	public W getEndSymbol();

	/**
	 * Sets the end symbol.
	 * 
	 * @param sym
	 *            the word object to use as the end symbol
	 */
	public void setEndSymbol(W sym);

	/**
	 * Returns the unk symbol (usually something like {@literal <unk>}).
	 * 
	 * @return the symbol standing in for out-of-vocabulary words
	 */
	public W getUnkSymbol();

	/**
	 * Sets the unk symbol.
	 * 
	 * @param sym
	 *            the word object to use as the unk symbol
	 */
	public void setUnkSymbol(W sym);

	/**
	 * Informs the implementation that no more words can be added to the
	 * vocabulary. Implementations may perform some space optimization, and
	 * should trigger an error if an attempt is made to add a word after this
	 * point.
	 */
	public void trimAndLock();

	/**
	 * Static helpers for converting between word sequences and the int
	 * arrays of indices handed out by a {@link WordIndexer}.
	 */
	public static class StaticMethods
	{

		/**
		 * Converts an object representation to an int array. Does not add to
		 * the indexer: every word is looked up with
		 * {@link WordIndexer#getIndexPossiblyUnk(Object)}.
		 * 
		 * @param <T>
		 *            the word type of the indexer
		 * @param wordIndexer
		 *            indexer used to look up each word
		 * @param list
		 *            the words to convert, in order
		 * @return an array of the same length as <code>list</code> holding
		 *         the index returned for each word
		 */
		public static <T> int[] toArray(final WordIndexer<T> wordIndexer, final List<T> list) {
			final int[] ret = new int[list.size()];
			for (int i = 0; i < list.size(); ++i) {
				ret[i] = wordIndexer.getIndexPossiblyUnk(list.get(i));
			}
			return ret;

		}

		/**
		 * Converts a string representation to an int array, adding to the
		 * indexer: every string is passed to
		 * {@link WordIndexer#getOrAddIndexFromString(String)}.
		 * 
		 * @param <T>
		 *            the word type of the indexer
		 * @param wordIndexer
		 *            indexer used to look up (and possibly extend with) each
		 *            word
		 * @param list
		 *            the words to convert, as strings, in order
		 * @return an array of the same length as <code>list</code> holding
		 *         the index returned for each string
		 */
		public static <T> int[] toArrayFromStrings(final WordIndexer<T> wordIndexer, final List<String> list) {
			final int[] ret = new int[list.size()];
			for (int i = 0; i < list.size(); ++i) {
				ret[i] = wordIndexer.getOrAddIndexFromString(list.get(i));
			}
			return ret;

		}

		/**
		 * Converts an int representation of an n-gram to a list. Converts only
		 * the range of the array specified by [startPos,endPos)
		 * 
		 * @param <T>
		 *            the word type of the indexer
		 * @param wordIndexer
		 *            indexer used to map each index back to its word
		 * @param intNgram
		 *            array of word indices
		 * @param startPos
		 *            first position of <code>intNgram</code> to convert
		 *            (inclusive)
		 * @param endPos
		 *            position of <code>intNgram</code> at which to stop
		 *            (exclusive)
		 * @return a new list holding the words for positions
		 *         [startPos,endPos), in order
		 */
		public static <T> List<T> toList(final WordIndexer<T> wordIndexer, final int[] intNgram, final int startPos, final int endPos) {
			final List<T> l = new ArrayList<T>(endPos - startPos);
			for (int i = startPos; i < endPos; ++i) {
				l.add(wordIndexer.getWord(intNgram[i]));
			}
			return l;
		}

		/**
		 * Converts a whole int representation of an n-gram to a list.
		 * 
		 * @param <T>
		 *            the word type of the indexer
		 * @param wordIndexer
		 *            indexer used to map each index back to its word
		 * @param intNgram
		 *            array of word indices
		 * @return a new list holding the word for every position of
		 *         <code>intNgram</code>, in order
		 */
		public static <T> List<T> toList(final WordIndexer<T> wordIndexer, final int[] intNgram) {
			return toList(wordIndexer, intNgram, 0, intNgram.length);
		}
	}
}