dist.edu.umd.hooka.alignment.model1.Model1Base Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of cloud9 Show documentation
Show all versions of cloud9 Show documentation
University of Maryland's Hadoop Library
package edu.umd.hooka.alignment.model1;
import java.io.IOException;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.OutputCollector;
import edu.umd.hooka.Int2FloatMap;
import edu.umd.hooka.PhrasePair;
import edu.umd.hooka.alignment.AlignmentModel;
import edu.umd.hooka.alignment.PartialCountContainer;
import edu.umd.hooka.ttables.TTable;
/**
 * Base class for the alignment models in this package. It accumulates partial
 * translation counts, keyed by English word id, in {@link #counts} and exposes
 * a per-sentence-pair lookup table ({@link #tcmap}) that points directly at the
 * count cells for every (English word, foreign word) combination of the
 * sentence pair currently being processed.
 *
 * NOTE(review): the generic type arguments in this class were reconstructed
 * from how the values are used (e.g. {@code p.getKey().get()},
 * {@code p.getValue().getAsIndexedFloatArray()}); confirm them against
 * {@code Int2FloatMap} and {@code PartialCountContainer}.
 */
public abstract class Model1Base extends AlignmentModel {

    // NOTE(review): not referenced anywhere in this class; presumably an upper
    // bound on the foreign vocabulary size used by subclasses -- confirm.
    protected final int maxF = 214350; // TODO: fix

    /** Whether row 0 of the count table is populated with counts for the English null word. */
    protected boolean _includeEnglishNullWord = true;

    /**
     * @param incNullWord if {@code true}, counts are also collected for the
     *                    English null word (id 0) for every sentence pair
     */
    public Model1Base(boolean incNullWord) {
        _includeEnglishNullWord = incNullWord;
    }

    /** Accumulated partial counts: English word id -> (foreign word id -> count). */
    TreeMap<IntWritable, Int2FloatMap> counts =
        new TreeMap<IntWritable, Int2FloatMap>();

    /** Key under which the counts of the English null word are stored. */
    IntWritable nullWord = new IntWritable(0);

    /**
     * Emits one record per English word currently held in {@link #counts} and
     * removes each entry right after it has been emitted, so entries that were
     * already written are gone even if a later {@code collect} call fails.
     *
     * A single {@link PartialCountContainer} instance is reused for all
     * records (presumably the collector serializes the value during
     * {@code collect} -- confirm).
     *
     * @param output collector receiving (English word id, partial counts) pairs
     * @throws IOException propagated from the collector
     */
    public void writePartialCounts(OutputCollector<IntWritable, PartialCountContainer> output)
            throws IOException {
        PartialCountContainer pcc = new PartialCountContainer();
        Iterator<Map.Entry<IntWritable, Int2FloatMap>> i = counts.entrySet().iterator();
        while (i.hasNext()) {
            Map.Entry<IntWritable, Int2FloatMap> p = i.next();
            pcc.setContent(p.getValue().getAsIndexedFloatArray());
            output.collect(p.getKey(), pcc);
            i.remove();
        }
    }

    /**
     * Adds every accumulated count to {@code tcounts} and removes the
     * corresponding entry from {@link #counts} once it has been transferred.
     *
     * @param tcounts table that receives the (English id, foreign id, count) triples
     */
    public void addPartialTranslationCountsToTTable(TTable tcounts) {
        Iterator<Map.Entry<IntWritable, Int2FloatMap>> i = counts.entrySet().iterator();
        while (i.hasNext()) {
            Map.Entry<IntWritable, Int2FloatMap> p = i.next();
            int ei = p.getKey().get();
            // NOTE(review): entry type inferred from usage (key passed as a word id,
            // value read via get()); confirm against Int2FloatMap.entrySet().
            for (Map.Entry<Integer, FloatWritable> f : p.getValue().entrySet()) {
                tcounts.add(ei, f.getKey(), f.getValue().get());
            }
            i.remove();
        }
    }

    /* tcmap is actually a 2D array projected onto a linear space */
    FloatWritable[] tcmap = null;

    /** Row length of {@link #tcmap}: the number of foreign words in the current sentence pair. */
    int width = 0;

    /**
     * Rebuilds {@link #tcmap} for the given sentence pair. The table has
     * {@code (|e| + 1) * |f|} cells: row 0 is reserved for the English null
     * word and rows 1..|e| correspond to the words of the English sentence.
     * Each cell references the count object stored in {@link #counts}, so
     * updating a cell updates the accumulated counts directly.
     *
     * When the null word is disabled, row 0 is left unpopulated (all
     * {@code null}) but still occupies its slots so coordinates stay the same.
     *
     * @param pp sentence pair whose E and F word ids define the table
     */
    protected void initializeCountTableForSentencePair(PhrasePair pp) {
        int[] ew = pp.getE().getWords();
        int[] fw = pp.getF().getWords();
        width = fw.length;
        // one extra row for the null word at the beginning of the e sentence
        tcmap = new FloatWritable[(ew.length + 1) * fw.length];

        int c;
        if (_includeEnglishNullWord) {
            c = fillCountRow(nullWord, fw, 0);
        } else {
            // skip the null-word row; its cells remain null
            c = fw.length;
        }
        for (int ei : ew) {
            c = fillCountRow(new IntWritable(ei), fw, c);
        }
    }

    /**
     * Fills one row of {@link #tcmap}, starting at {@code offset}, with the
     * count cells of {@code eWord} for every foreign word in {@code fw},
     * creating the per-word count map and missing cells as needed.
     *
     * @return the index just past the last cell written
     */
    private int fillCountRow(IntWritable eWord, int[] fw, int offset) {
        Int2FloatMap ecm = counts.get(eWord);
        if (ecm == null) {
            ecm = new Int2FloatMap();
            counts.put(eWord, ecm);
        }
        int c = offset;
        for (int fi : fw) {
            ecm.createIfMissing(fi);
            tcmap[c] = ecm.getFloatWritable(fi);
            c++;
        }
        return c;
    }

    /**
     * Normally, i is a zero based array, but since E
     * may have a null word, in this case, i=0 refers to the
     * null word, i=1 refers to the first word. For j,
     * j=0 refers to the first word.
     *
     * @return the linear index into {@link #tcmap} for row {@code i_plus1}, column {@code j}
     */
    protected final int getTranslationCoord(int i_plus1, int j) {
        return i_plus1 * width + j;
    }

    /**
     * Adds {@code v} to the count cell at row {@code i_plus1}, column {@code j}.
     * A zero increment is a no-op.
     *
     * @throws RuntimeException if the addressed cell was never populated
     *         (e.g. row 0 when the null word is disabled)
     */
    protected final void addTranslationCount(int i_plus1, int j, float v) {
        if (v == 0.0f) return;
        int coord = getTranslationCoord(i_plus1, j);
        if (tcmap[coord] == null) {
            throw new RuntimeException("isNull(" + i_plus1 + "," + j + ")");
        }
        // add v to existing count
        tcmap[coord].set(tcmap[coord].get() + v);
    }

    /**
     * Adds {@code v} to the count cell at the given linear coordinate (as
     * produced by {@link #getTranslationCoord(int, int)}). A zero increment is
     * a no-op.
     *
     * @throws RuntimeException if the addressed cell was never populated
     *         (e.g. a row-0 coordinate when the null word is disabled)
     */
    protected final void addTranslationCount(int coord, float v) {
        if (v == 0.0f) return;
        FloatWritable cell = tcmap[coord];
        if (cell == null) {
            // mirror the diagnostic of the (i, j) overload instead of a bare NPE
            throw new RuntimeException("isNull(" + coord + ")");
        }
        cell.set(cell.get() + v);
    }
}