All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.berkeley.nlp.lm.phrasetable.MosesPhraseTableReader Maven / Gradle / Ivy

The newest version!
package edu.berkeley.nlp.lm.phrasetable;

import java.io.IOException;
import java.util.Arrays;
import java.util.regex.Pattern;

import edu.berkeley.nlp.lm.WordIndexer;
import edu.berkeley.nlp.lm.collections.Iterators;
import edu.berkeley.nlp.lm.io.IOUtils;
import edu.berkeley.nlp.lm.io.LmReader;
import edu.berkeley.nlp.lm.io.LmReaderCallback;
import edu.berkeley.nlp.lm.util.Logger;

public class MosesPhraseTableReader implements LmReader>
{

	static final String SEP_WORD = "<>";

	private final WordIndexer wordIndexer;

	private final String file;

	public MosesPhraseTableReader(final String file, final WordIndexer wordIndexer) {
		this.file = file;
		this.wordIndexer = wordIndexer;

	}

	@Override
	public void parse(final MosesPhraseTableReaderCallback callback) {
		readFromFiles(callback);
	}

	private void readFromFiles(final LmReaderCallback callback) {
		Logger.startTrack("Reading from file " + file);
		try {
			final Iterable allLinesIterator = Iterators.able(IOUtils.lineIterator(file));
			countPhrases(allLinesIterator, callback);
		} catch (final IOException e) {
			throw new RuntimeException(e);

		}
		Logger.endTrack();

	}

	/**
	 * @param 
	 * @param wordIndexer
	 * @param maxOrder
	 * @param allLinesIterator
	 * @param callback
	 * @param ngrams
	 * @return
	 */
	private void countPhrases(final Iterable allLinesIterator, final LmReaderCallback callback) {
		long numLines = 0;

		for (final String line : allLinesIterator) {
			if (numLines % 10000 == 0) Logger.logs("On line " + numLines);
			numLines++;
			final String[] parts = line.trim().split("\\|\\|\\|");
			if (parts.length != 5 && parts.length != 3) throw new IllegalArgumentException("Bad Moses phrase table file line " + line);
			assert (parts.length == 3 || parts.length == 5);
			// ingore alignments if they exist
			if (parts.length == 5) parts[2] = parts[4];

			final String[] src = parts[0].trim().split("\\s+");
			final int[] srcInts = WordIndexer.StaticMethods.toArrayFromStrings(wordIndexer, Arrays.asList(src));
			final String[] trg = parts[1].trim().split("\\s+");
			final int[] trgInts = WordIndexer.StaticMethods.toArrayFromStrings(wordIndexer, Arrays.asList(trg));

			final int sepIndex = wordIndexer.getOrAddIndexFromString(SEP_WORD);
			final String[] featStrings = parts[2].trim().split("\\s+");
			final float[] features = new float[featStrings.length];
			// we skip the last feature since it is the bias, and is always the same.
			for (int i = 0; i < featStrings.length - 1; i++) {
				try {
					final Float val = Float.parseFloat(featStrings[i]);
					if (val.isInfinite() || val.isNaN()) {
						Logger.warn("Non-finite feature: " + featStrings[i]);
						continue;
					}

					features[i] = (float) -Math.log(val);
				} catch (final NumberFormatException n) {
					throw new RuntimeException("Bad Moses phrase table file line: " + line);
				}
			}

			final int[] concat = new int[srcInts.length + trgInts.length + 1];
			System.arraycopy(srcInts, 0, concat, 0, srcInts.length);
			concat[srcInts.length] = sepIndex;
			System.arraycopy(trgInts, 0, concat, srcInts.length + 1, trgInts.length);
			callback.call(concat, 0, concat.length, new PhraseTableCounts(features), line);

		}
		callback.cleanup();
	}

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy