/*
 * dist.edu.umd.hooka.AlignmentWordPreprocessor (Maven / Gradle / Ivy artifact listing).
 * Artifact: cloud9 - University of Maryland's Hadoop Library.
 * Note: there is a newer version of this artifact: 2.0.1.
 */
package edu.umd.hooka;

import edu.umd.hooka.corpora.Language;
import edu.umd.hooka.corpora.LanguagePair;
import java.util.Locale;
import org.apache.hadoop.conf.Configuration;



/**
 * Common base for the per-language word preprocessors used before alignment.
 * Concrete subclasses implement {@link #preprocessWordsImpl(String[])}; callers use
 * {@link #preprocessWordsForAlignment(String[])}. The input is an array of Strings,
 * generated by splitting the input sentence by space characters.
 *
 * @author ferhanture
 */
public abstract class AlignmentWordPreprocessor {

	/**
	 * Applies the subclass-specific preprocessing to the tokens of one sentence.
	 * When JVM assertions are enabled, this also checks that the result has exactly
	 * as many entries as the input.
	 *
	 * @param arg the tokens to preprocess
	 * @return whatever {@link #preprocessWordsImpl(String[])} produced for {@code arg}
	 */
	public final String[] preprocessWordsForAlignment(String[] arg) {
		final String[] processed = preprocessWordsImpl(arg);
		assert processed.length == arg.length;
		return processed;
	}

	/**
	 * Subclass hook that performs the actual per-token preprocessing.
	 *
	 * @param arg the tokens to preprocess
	 * @return the preprocessed tokens; expected to be the same length as {@code arg}
	 */
	protected abstract String[] preprocessWordsImpl(String[] arg);

	/**
	 * Picks the preprocessor for a language.
	 * A {@code null} language gets a {@link NullPreprocessor}; "de", "ar" and "hu" get
	 * their dedicated truncators; "en" and every other language get the generic
	 * {@link Truncator}.
	 *
	 * @param lp   language pair; not consulted by this method
	 * @param l    language to select a preprocessor for, may be {@code null}
	 * @param conf configuration handed to the chosen preprocessor's constructor
	 * @return a new preprocessor instance, never {@code null}
	 */
	public static AlignmentWordPreprocessor CreatePreprocessor(LanguagePair lp,
			Language l,
			Configuration conf) {
		final AlignmentWordPreprocessor chosen;
		if (l == null) {
			chosen = new NullPreprocessor(conf);
		} else if (l == Language.languageForISO639_1("en")) {
			chosen = new Truncator(conf);
		} else if (l == Language.languageForISO639_1("de")) {
			chosen = new GermanTruncator(conf);
		} else if (l == Language.languageForISO639_1("ar")) {
			chosen = new ArabicRawTruncator(conf);
		} else if (l == Language.languageForISO639_1("hu")) {
			chosen = new HungarianTruncator(conf);
		} else {
			// Fallback for every language without a dedicated truncator.
			chosen = new Truncator(conf);
		}
		return chosen;
	}
}

/**
 * Pass-through preprocessor: hands the token array back without modifying it.
 */
class NullPreprocessor extends AlignmentWordPreprocessor {

	/**
	 * @param c accepted but not read by this class
	 */
	public NullPreprocessor(Configuration c) {
	}

	/**
	 * Returns the very array instance that was passed in.
	 *
	 * @param arg the tokens of one sentence
	 * @return {@code arg} itself, unchanged
	 */
	@Override
	protected String[] preprocessWordsImpl(String[] arg) {
		return arg;
	}
}

/**
 * Truncates every token to a short prefix. Tokens that start with the two letters
 * alef + lam (U+0627 U+0644) keep two extra characters, and tokens that start with a
 * single alef (U+0627) keep one extra character.
 */
class ArabicRawTruncator extends AlignmentWordPreprocessor {

	/** Base number of characters kept from each token. */
	int length = 4;

	static final String AL = "\u0627\u0644";
	static final String A = "\u0627";

	/**
	 * @param conf accepted but not read by this class
	 */
	public ArabicRawTruncator(Configuration conf) {
	}

	/**
	 * Lower-cases and truncates each token.
	 *
	 * @param arg the tokens of one sentence
	 * @return a new array of the same length holding the truncated tokens
	 */
	@Override
	protected String[] preprocessWordsImpl(String[] arg) {
		String[] res = new String[arg.length];
		for (int i = 0; i < arg.length; ++i) {
			// Locale.ROOT keeps the result independent of the JVM default locale,
			// so every machine produces the same tokens for the same input.
			final String cur = arg[i].toLowerCase(Locale.ROOT);
			int keep = length;
			if (cur.startsWith(AL)) {
				keep += 2;
			} else if (cur.startsWith(A)) {
				keep += 1;
			}
			// Tokens shorter than the prefix length are kept whole.
			res[i] = cur.substring(0, Math.min(keep, cur.length()));
		}
		return res;
	}
}


/**
 * Generic truncator: lower-cases every token and keeps only a short prefix of it.
 * Tokens that begin with one of a fixed list of prefixes keep a few extra characters
 * so that the kept part extends past the prefix.
 */
class Truncator extends AlignmentWordPreprocessor {

	/** Base number of characters kept from each token. */
	int length = 4;

	/**
	 * @param conf accepted but not read by this class
	 */
	public Truncator(Configuration conf) {
	}

	/**
	 * Lower-cases and truncates each token.
	 *
	 * @param arg the tokens of one sentence
	 * @return a new array of the same length holding the truncated tokens
	 */
	@Override
	protected String[] preprocessWordsImpl(String[] arg) {
		String[] res = new String[arg.length];
		for (int i = 0; i < arg.length; ++i) {
			// Locale.ROOT: with a Turkish or Azeri default locale, "I" lower-cases to a
			// dotless i and the ASCII prefix checks below would silently stop matching.
			final String cur = arg[i].toLowerCase(Locale.ROOT);
			final int keep = length + extraCharsFor(cur);
			// Tokens shorter than the prefix length are kept whole.
			res[i] = cur.substring(0, Math.min(keep, cur.length()));
		}
		return res;
	}

	/**
	 * Returns how many characters beyond the base length to keep for a token.
	 * The first matching prefix wins, so longer prefixes must be tested before any
	 * shorter prefix they start with (e.g. "con" before "co", "inter" before "in").
	 */
	private static int extraCharsFor(String word) {
		if (word.startsWith("con")) return 2;
		if (word.startsWith("intra")) return 4;
		if (word.startsWith("pro")) return 2;
		if (word.startsWith("anti")) return 3;
		if (word.startsWith("inter")) return 4;
		if (word.startsWith("in")) return 2;
		if (word.startsWith("im")) return 2;
		if (word.startsWith("re")) return 2;
		if (word.startsWith("de")) return 1;
		if (word.startsWith("pre")) return 2;
		if (word.startsWith("un")) return 2;
		if (word.startsWith("co")) return 2;
		if (word.startsWith("qu")) return 1;
		if (word.startsWith("ad")) return 1;
		if (word.startsWith("en")) return 2;
		if (word.startsWith("al-")) return 2;
		if (word.startsWith("sim")) return 2;
		if (word.startsWith("sym")) return 2;
		return 0;
	}
}

/**
 * Truncator used for Hungarian: lower-cases every token and keeps a prefix of it.
 * Tokens starting with "con" or "intra" keep a few extra characters.
 */
class HungarianTruncator extends AlignmentWordPreprocessor {

	/** Base number of characters kept from each token. */
	int length = 6;

	/**
	 * @param conf accepted but not read by this class
	 */
	public HungarianTruncator(Configuration conf) {
	}

	/**
	 * Lower-cases and truncates each token.
	 *
	 * @param arg the tokens of one sentence
	 * @return a new array of the same length holding the truncated tokens
	 */
	@Override
	protected String[] preprocessWordsImpl(String[] arg) {
		String[] res = new String[arg.length];
		for (int i = 0; i < arg.length; ++i) {
			// Locale.ROOT keeps lower-casing (and so the prefix checks below)
			// independent of the JVM default locale.
			final String cur = arg[i].toLowerCase(Locale.ROOT);
			int keep = length;
			if (cur.startsWith("con")) {
				keep += 2;
			} else if (cur.startsWith("intra")) {
				keep += 4;
			}
			// Tokens shorter than the prefix length are kept whole.
			res[i] = cur.substring(0, Math.min(keep, cur.length()));
		}
		return res;
	}
}

/**
 * Truncator used for German. Each token is lower-cased, every "sch" is collapsed to
 * the single character "S", and a short window of the token is kept. Tokens that
 * begin with one of a fixed list of prefixes keep extra characters; a leading "ge"
 * (other than "gegen") is skipped instead.
 */
class GermanTruncator extends AlignmentWordPreprocessor {

	/** Base number of characters kept from each token. */
	int length = 4;

	/**
	 * @param conf accepted but not read by this class
	 */
	public GermanTruncator(Configuration conf) {
	}

	/**
	 * Normalizes and truncates each token.
	 *
	 * @param arg the tokens of one sentence
	 * @return a new array of the same length holding the truncated tokens
	 */
	@Override
	protected String[] preprocessWordsImpl(String[] arg) {
		String[] res = new String[arg.length];
		for (int i = 0; i < arg.length; ++i) {
			// Locale.ROOT keeps lower-casing independent of the JVM default locale.
			// replace() does the same literal substitution as the former replaceAll()
			// without going through the regex engine.
			final String cur = arg[i].toLowerCase(Locale.ROOT).replace("sch", "S");
			int keep = length;
			int start = 0;
			// First match wins: longer prefixes must stay ahead of any shorter
			// prefix they begin with (e.g. "einge" before "ein", "abge" before "ab").
			if (cur.startsWith("gegen"))
				keep += 5;
			else if (cur.startsWith("zusammen"))
				keep += 8;
			else if (cur.startsWith("zuge"))
				keep += 4;
			else if (cur.startsWith("einge"))
				keep += 5;
			else if (cur.startsWith("aufge"))
				keep += 5;
			else if (cur.startsWith("ausge"))
				keep += 5;
			else if (cur.startsWith("hinge"))
				keep += 5;
			else if (cur.startsWith("herge"))
				keep += 5;
			else if (cur.startsWith("ein"))
				keep += 3;
			else if (cur.startsWith("zer"))
				keep += 2;
			else if (cur.startsWith("ver"))
				keep += 3;
			else if (cur.startsWith("ent"))
				keep += 2;
			else if (cur.startsWith("auf"))
				keep += 3;
			else if (cur.startsWith("aus"))
				keep += 3;
			else if (cur.startsWith("abge"))
				keep += 4;
			else if (cur.startsWith("bei"))
				keep += 3;
			else if (cur.startsWith("voran"))
				keep += 5;
			else if (cur.startsWith("vor"))
				keep += 3;
			else if (cur.startsWith("mit"))
				keep += 3;
			else if (cur.startsWith("ab"))
				keep += 2;
			else if (cur.startsWith("be"))
				keep += 1;
			// u-umlaut (U+00FC) written as an escape so the match does not depend on
			// the source file encoding; the literal was previously garbled to "?ber".
			else if (cur.startsWith("\u00fcber"))
				keep += 4;
			else if (cur.startsWith("unter"))
				keep += 5;
			else if (cur.startsWith("ge"))
				start += 2; // skip the prefix instead of widening the window
			else if (cur.startsWith("er"))
				keep += 1;
			else if (cur.startsWith("zu"))
				keep += 2;
			else if (cur.startsWith("ange"))
				keep += 3;
			else if (cur.startsWith("an"))
				keep += 2;
			else if (cur.startsWith("durch"))
				keep += 5;
			else if (cur.startsWith("nieder"))
				keep += 5;
			else if (cur.startsWith("dar"))
				keep += 2;
			// If skipping would consume the whole token (e.g. the token is just "ge"),
			// fall back to starting at the beginning.
			if (start >= cur.length()) start = 0;
			// Clamp the window to the end of the token.
			if (cur.length() < (start + keep)) keep = cur.length() - start;
			res[i] = cur.substring(start, start + keep);
		}
		return res;
	}
}