
edu.berkeley.nlp.lm.phrasetable.MosesPhraseTableReader Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of berkeleylm Show documentation
Show all versions of berkeleylm Show documentation
An N-gram Language Model Library from UC Berkeley
The newest version!
package edu.berkeley.nlp.lm.phrasetable;
import java.io.IOException;
import java.util.Arrays;
import edu.berkeley.nlp.lm.WordIndexer;
import edu.berkeley.nlp.lm.collections.Iterators;
import edu.berkeley.nlp.lm.io.IOUtils;
import edu.berkeley.nlp.lm.io.LmReader;
import edu.berkeley.nlp.lm.io.LmReaderCallback;
import edu.berkeley.nlp.lm.util.Logger;
public class MosesPhraseTableReader implements LmReader>
{
static final String SEP_WORD = "<>";
private final WordIndexer wordIndexer;
private final String file;
/**
 * Creates a reader bound to one phrase table file and one word indexer.
 *
 * @param file
 *            name of the phrase table file that will be read when parsing
 * @param wordIndexer
 *            indexer used to turn source and target words (and the separator
 *            word) into integer ids
 */
public MosesPhraseTableReader(final String file, final WordIndexer wordIndexer) {
    this.wordIndexer = wordIndexer;
    this.file = file;
}
/**
 * Reads the phrase table file given at construction time and hands every
 * parsed entry to the callback.
 *
 * @param callback
 *            receiver of the parsed entries
 */
@Override
public void parse(final MosesPhraseTableReaderCallback callback) {
    this.readFromFiles(callback);
}
/**
 * Opens the phrase table file, parses all of its lines and reports them to
 * the callback, wrapping the whole operation in a logging track.
 *
 * @param callback
 *            receiver of the parsed entries
 * @throws RuntimeException
 *             wrapping any {@link IOException} raised while reading the file
 */
private void readFromFiles(final LmReaderCallback callback) {
    Logger.startTrack("Reading from file " + file);
    try {
        countPhrases(Iterators.able(IOUtils.lineIterator(file)), callback);
    } catch (final IOException e) {
        throw new RuntimeException(e);
    } finally {
        // Always close the track opened above, including when reading or
        // parsing fails with an exception.
        Logger.endTrack();
    }
}
/**
 * Parses every line of a Moses phrase table and reports one entry per line
 * to the callback.
 * <p>
 * A line is split on {@code |||} and must yield either three fields
 * (source, target, features) or five fields, in which case the features are
 * taken from the fifth field. The key passed to the callback is the source
 * word ids, followed by the id of {@link #SEP_WORD}, followed by the target
 * word ids. Each feature except the last is stored as its negative natural
 * logarithm; the last slot of the feature array is left at {@code 0}.
 * After the last line {@code callback.cleanup()} is invoked.
 *
 * @param allLinesIterator
 *            the lines of the phrase table
 * @param callback
 *            receiver of the parsed entries
 * @throws IllegalArgumentException
 *             if a line does not split into three or five fields
 * @throws RuntimeException
 *             if a feature value is not a parseable float; the
 *             {@link NumberFormatException} is attached as the cause
 */
private void countPhrases(final Iterable<String> allLinesIterator, final LmReaderCallback callback) {
    // NOTE(review): the callback is used as a raw type here; its type
    // arguments look like they were lost from this file - confirm and restore.
    long numLines = 0;
    for (final String line : allLinesIterator) {
        if (numLines % 10000 == 0) {
            Logger.logs("On line " + numLines);
        }
        numLines++;

        final String[] parts = line.trim().split("\\|\\|\\|");
        if (parts.length != 5 && parts.length != 3) {
            throw new IllegalArgumentException("Bad Moses phrase table file line " + line);
        }
        // ignore alignments if they exist: with five fields the features are in the last one
        final String featureField = (parts.length == 5) ? parts[4] : parts[2];

        // The indexer is queried in the order source, target, separator.
        // That order is kept as-is (the separator lookup is intentionally
        // not hoisted out of the loop) because getOrAddIndexFromString may
        // add words, so reordering could change which ids are assigned.
        final String[] src = parts[0].trim().split("\\s+");
        final int[] srcInts = WordIndexer.StaticMethods.toArrayFromStrings(wordIndexer, Arrays.asList(src));
        final String[] trg = parts[1].trim().split("\\s+");
        final int[] trgInts = WordIndexer.StaticMethods.toArrayFromStrings(wordIndexer, Arrays.asList(trg));
        final int sepIndex = wordIndexer.getOrAddIndexFromString(SEP_WORD);

        final String[] featStrings = featureField.trim().split("\\s+");
        final float[] features = new float[featStrings.length];
        // we skip the last feature since it is the bias, and is always the same.
        for (int i = 0; i < featStrings.length - 1; i++) {
            final float val;
            try {
                val = Float.parseFloat(featStrings[i]);
            } catch (final NumberFormatException n) {
                // Keep the parse failure as the cause so the offending token is not lost.
                throw new RuntimeException("Bad Moses phrase table file line: " + line, n);
            }
            // The finiteness check applies to the raw score, not to its
            // negative log; a skipped feature keeps the default value 0.
            if (Float.isInfinite(val) || Float.isNaN(val)) {
                Logger.warn("Non-finite feature: " + featStrings[i]);
                continue;
            }
            features[i] = (float) -Math.log(val);
        }

        // Key layout: [source ids..., separator id, target ids...]
        final int[] concat = new int[srcInts.length + trgInts.length + 1];
        System.arraycopy(srcInts, 0, concat, 0, srcInts.length);
        concat[srcInts.length] = sepIndex;
        System.arraycopy(trgInts, 0, concat, srcInts.length + 1, trgInts.length);
        callback.call(concat, 0, concat.length, new PhraseTableCounts(features), line);
    }
    callback.cleanup();
}
}