/*
 * All downloads are free. Search and download functionalities use the official Maven repository.
 *
 * dist.edu.umd.hooka.alignment.model1.Model1Base (Maven / Gradle / Ivy)
 *
 * There is a newer version: 2.0.1
 * Show newest version
 */
package edu.umd.hooka.alignment.model1;

import java.io.IOException;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;

import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.OutputCollector;

import edu.umd.hooka.Int2FloatMap;
import edu.umd.hooka.PhrasePair;
import edu.umd.hooka.alignment.AlignmentModel;
import edu.umd.hooka.alignment.PartialCountContainer;
import edu.umd.hooka.ttables.TTable;

/**
 * Shared partial-count bookkeeping for alignment models in this package.
 *
 * <p>Counts are accumulated in {@link #counts}, keyed by the id of an
 * e-side word; each value maps f-side word ids to a mutable
 * {@link FloatWritable} cell. For the sentence pair currently being
 * processed, {@link #tcmap} holds a flat view of those same cells so that
 * they can be updated by position via {@link #addTranslationCount}.
 */
public abstract class Model1Base extends AlignmentModel {

	protected final int maxF = 214350; // TODO: fix
	/** When true, row 0 of the per-sentence table is bound to the NULL e-word. */
	protected boolean _includeEnglishNullWord = true;

	/**
	 * @param incNullWord whether counts for the NULL e-word (id 0) are
	 *        collected in addition to the real e-words
	 */
	public Model1Base(boolean incNullWord) {
		_includeEnglishNullWord = incNullWord;
	}

	/** Accumulated counts: e-word id -> (f-word id -> count cell). */
	TreeMap<IntWritable, Int2FloatMap> counts =
		new TreeMap<IntWritable, Int2FloatMap>();
	/** Key under which counts for the NULL e-word are stored. */
	IntWritable nullWord = new IntWritable(0);

	/**
	 * Emits every accumulated count row to {@code output} and empties
	 * {@link #counts} while doing so.
	 *
	 * <p>A single {@link PartialCountContainer} instance is reused for all
	 * emitted records; its content is replaced before each
	 * {@code collect} call.
	 *
	 * @param output collector receiving (e-word id, count container) pairs
	 * @throws IOException if {@code output.collect} fails
	 */
	public void writePartialCounts(OutputCollector output) throws IOException
	{
		// The parameter stays raw so the method signature is unchanged;
		// the typed local view keeps the unchecked conversion in one place.
		@SuppressWarnings("unchecked")
		OutputCollector<IntWritable, PartialCountContainer> typedOutput = output;

		PartialCountContainer pcc = new PartialCountContainer();
		Iterator<Map.Entry<IntWritable, Int2FloatMap>> i = counts.entrySet().iterator();
		while (i.hasNext()) {
			Map.Entry<IntWritable, Int2FloatMap> p = i.next();
			pcc.setContent(p.getValue().getAsIndexedFloatArray());
			typedOutput.collect(p.getKey(), pcc);
			// drop the row once emitted so counts is empty when we return
			i.remove();
		}
	}
	
	/**
	 * Adds every accumulated (e, f, count) triple to {@code tcounts} and
	 * empties {@link #counts} while doing so.
	 *
	 * @param tcounts table that receives the counts through
	 *        {@code add(e, f, value)}
	 */
	public void addPartialTranslationCountsToTTable(TTable tcounts) {
		Iterator<Map.Entry<IntWritable, Int2FloatMap>> i = counts.entrySet().iterator();
		while (i.hasNext()) {
			Map.Entry<IntWritable, Int2FloatMap> p = i.next();
			int ei = p.getKey().get();
			// NOTE(review): entry type of Int2FloatMap assumed to be
			// <Integer, FloatWritable> from how it is used here - confirm.
			for (Map.Entry<Integer, FloatWritable> f : p.getValue().entrySet()) {
				tcounts.add(ei, f.getKey(), f.getValue().get());
			}
			i.remove();
		}
	}
	
	/*tcmap is actually a 2D array projected onto a linear space*/	
	FloatWritable[] tcmap = null;
	/** Number of f-words in the current sentence pair, i.e. the row length of tcmap. */
	int width = 0;

	/**
	 * Rebuilds {@link #tcmap} for the given sentence pair.
	 *
	 * <p>The table has one row per e-word plus a leading row for the NULL
	 * word, and one column per f-word. Each slot is bound to the count
	 * cell stored in {@link #counts} for that (e, f) pair, creating the
	 * cell if it does not exist yet. When the NULL word is disabled, row 0
	 * is still reserved but its slots are left {@code null}.
	 *
	 * @param pp sentence pair whose e- and f-side word ids define the table
	 */
	protected void initializeCountTableForSentencePair(PhrasePair pp) {
		int ew[] = pp.getE().getWords();
		int fw[] = pp.getF().getWords();
		width = fw.length;
		// one extra row in front of the e sentence for the null word
		tcmap = new FloatWritable[(ew.length+1) * fw.length];
		int c = 0;
		if (_includeEnglishNullWord) {
			c = bindRow(rowFor(nullWord), fw, c);
		} else {
			// keep the row-major layout: skip the (unbound) null-word row
			c += fw.length;
		}
		for (int ei : ew) {
			c = bindRow(rowFor(new IntWritable(ei)), fw, c);
		}
	}

	/** Returns the count row stored under {@code key}, creating and registering it if absent. */
	private Int2FloatMap rowFor(IntWritable key) {
		Int2FloatMap row = counts.get(key);
		if (row == null) {
			row = new Int2FloatMap();
			counts.put(key, row);
		}
		return row;
	}

	/**
	 * Binds consecutive tcmap slots, starting at {@code start}, to the
	 * cells of {@code row} for each f-word; returns the next free slot.
	 */
	private int bindRow(Int2FloatMap row, int[] fw, int start) {
		int c = start;
		for (int fi : fw) {
			row.createIfMissing(fi);
			tcmap[c] = row.getFloatWritable(fi);
			c++;
		}
		return c;
	}
	
	/**
	 * Maps a (row, column) position to its index in {@link #tcmap}.
	 *
	 * Normally, i is a zero based array, but since E
	 * may have a null word, in this case, i=0 refers to the
	 * null word, i=1 refers to the first word.  For j,
	 * j=0 refers to the first word.
	 *
	 * @param i_plus1 row index (0 = null word, 1 = first e-word, ...)
	 * @param j zero-based f-word position
	 * @return {@code i_plus1 * width + j}
	 */
	protected final int getTranslationCoord(int i_plus1, int j) {
		return i_plus1 * width + j;
	}
	
	/**
	 * Adds {@code v} to the count cell at row {@code i_plus1}, column
	 * {@code j}. A value of exactly zero is ignored.
	 *
	 * @throws RuntimeException if the addressed slot has no cell bound
	 */
	protected final void addTranslationCount(int i_plus1, int j, float v) {
		if (v == 0.0f) return;
		int coord = getTranslationCoord(i_plus1, j);
		if (tcmap[coord] == null) {
			throw new RuntimeException("isNull(" + i_plus1 + "," + j +")");
		}
		//add v to existing count
		tcmap[coord].set(tcmap[coord].get() + v);
	}

	/**
	 * Adds {@code v} to the count cell at linear index {@code coord}
	 * (as produced by {@link #getTranslationCoord}). A value of exactly
	 * zero is ignored.
	 *
	 * @throws RuntimeException if the addressed slot has no cell bound
	 */
	protected final void addTranslationCount(int coord, float v) {
		if (v == 0.0f) return;
		// same guard as the (row, column) overload, instead of a bare NPE
		if (tcmap[coord] == null) {
			throw new RuntimeException("isNull(" + coord + ")");
		}
		tcmap[coord].set(tcmap[coord].get() + v);
	}
}




// © 2015 - 2024 Weber Informatics LLC | Privacy Policy