dist.edu.umd.hooka.AlignmentWordPreprocessor Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of cloud9 Show documentation
Show all versions of cloud9 Show documentation
University of Maryland's Hadoop Library
package edu.umd.hooka;
import java.util.Locale;

import org.apache.hadoop.conf.Configuration;
import edu.umd.hooka.corpora.Language;
import edu.umd.hooka.corpora.LanguagePair;
/**
 * Common base for the per-language word preprocessors (tokenizers) defined in this file.
 *
 * <p>Subclasses supply the actual transformation in {@link #preprocessWordsImpl(String[])};
 * callers go through {@link #preprocessWordsForAlignment(String[])}. The input is an array of
 * Strings obtained by splitting a sentence on space characters.
 *
 * @author ferhanture
 */
public abstract class AlignmentWordPreprocessor {

  /**
   * Runs the language-specific preprocessing over the given words.
   *
   * @param arg the words of one sentence
   * @return the array produced by {@link #preprocessWordsImpl(String[])}
   */
  public final String[] preprocessWordsForAlignment(String[] arg) {
    final String[] processed = preprocessWordsImpl(arg);
    // Only enforced when the JVM runs with assertions enabled (-ea): implementations are
    // expected to produce exactly one output word per input word.
    assert (processed.length == arg.length);
    return processed;
  }

  /**
   * Language-specific transformation of the words of one sentence.
   *
   * @param arg the words of one sentence
   * @return the transformed words; expected to have the same length as {@code arg}
   */
  protected abstract String[] preprocessWordsImpl(String[] arg);

  /**
   * Picks the preprocessor for a language.
   *
   * <p>A {@code null} language yields a pass-through preprocessor; German ("de"), Arabic ("ar")
   * and Hungarian ("hu") get dedicated truncators; English ("en") and every other language get
   * the default {@link Truncator}.
   *
   * @param lp   language pair; not consulted by this method
   * @param l    language to build a preprocessor for, may be {@code null}
   * @param conf configuration handed to the chosen preprocessor's constructor
   * @return a new preprocessor instance, never {@code null}
   */
  public static AlignmentWordPreprocessor CreatePreprocessor(LanguagePair lp,
                                                             Language l,
                                                             Configuration conf) {
    if (l == null) {
      return new NullPreprocessor(conf);
    }

    // NOTE(review): languages are compared by reference, which presumes that
    // Language.languageForISO639_1 hands out one canonical instance per code -- confirm.
    if (l == Language.languageForISO639_1("en")) {
      return new Truncator(conf);
    } else if (l == Language.languageForISO639_1("de")) {
      return new GermanTruncator(conf);
    } else if (l == Language.languageForISO639_1("ar")) {
      return new ArabicRawTruncator(conf);
    } else if (l == Language.languageForISO639_1("hu")) {
      return new HungarianTruncator(conf);
    }

    // Fallback for every language without a dedicated implementation.
    return new Truncator(conf);
  }
}
/**
 * Pass-through preprocessor: hands the input words back untouched.
 */
class NullPreprocessor extends AlignmentWordPreprocessor {

  /**
   * @param c configuration; accepted but not used by this class
   */
  public NullPreprocessor(Configuration c) {
    // Nothing to configure.
  }

  /**
   * Returns the very array instance it was given; no copy is made.
   *
   * @param arg the words of one sentence
   * @return {@code arg} itself
   */
  @Override
  protected String[] preprocessWordsImpl(String[] arg) {
    return arg;
  }
}
/**
 * Preprocessor for raw Arabic text: lower-cases each word and keeps only its leading characters.
 *
 * <p>The kept portion is {@link #length} characters long, extended by two characters when the
 * word starts with alef + lam ({@link #AL}, presumably the definite article) and by one
 * character when it starts with a bare alef ({@link #A}). Words shorter than the resulting limit
 * are kept whole.
 */
class ArabicRawTruncator extends AlignmentWordPreprocessor {
  /** Base number of leading characters kept from each word. */
  int length = 4;
  /** U+0627 ARABIC LETTER ALEF followed by U+0644 ARABIC LETTER LAM. */
  static final String AL = "\u0627\u0644";
  /** U+0627 ARABIC LETTER ALEF. */
  static final String A = "\u0627";

  /**
   * @param conf configuration; accepted but not used by this class
   */
  public ArabicRawTruncator(Configuration conf) {
  }

  /**
   * Truncates every word as described in the class comment.
   *
   * @param arg the words of one sentence; elements must not be {@code null}
   * @return a new array of the same length holding the truncated, lower-cased words
   */
  @Override
  protected String[] preprocessWordsImpl(String[] arg) {
    String[] res = new String[arg.length];
    for (int i = 0; i < arg.length; ++i) {
      // Locale.ROOT makes the result independent of the JVM's default locale (under a Turkish
      // default locale, for instance, ASCII 'I' would otherwise lower-case to a dotless i).
      final String cur = arg[i].toLowerCase(Locale.ROOT);
      int l = length;
      if (cur.startsWith(AL)) {
        l += 2;
      } else if (cur.startsWith(A)) {
        l += 1;
      }
      // Never read past the end of a short word.
      res[i] = cur.substring(0, Math.min(l, cur.length()));
    }
    return res;
  }
}
/**
 * Default preprocessor (used for English and for any language without a dedicated
 * implementation): lower-cases each word and keeps only its leading characters.
 *
 * <p>The kept portion is {@link #length} characters long, extended by a fixed per-prefix amount
 * when the word starts with one of the hard-coded prefixes below. Words shorter than the
 * resulting limit are kept whole.
 */
class Truncator extends AlignmentWordPreprocessor {
  /** Base number of leading characters kept from each word. */
  int length = 4;

  /**
   * @param conf configuration; accepted but not used by this class
   */
  public Truncator(Configuration conf) {
  }

  /**
   * Truncates every word as described in the class comment.
   *
   * @param arg the words of one sentence; elements must not be {@code null}
   * @return a new array of the same length holding the truncated, lower-cased words
   */
  @Override
  protected String[] preprocessWordsImpl(String[] arg) {
    String[] res = new String[arg.length];
    for (int i = 0; i < arg.length; ++i) {
      // Locale.ROOT makes the result independent of the JVM's default locale. With a Turkish
      // default locale, "INTER..." would lower-case to a dotless-i form and miss the
      // "in"/"inter"/"intra"/"im" prefixes below.
      final String cur = arg[i].toLowerCase(Locale.ROOT);
      int l = length;
      // The first matching prefix wins, so a longer prefix must stay ahead of any shorter
      // prefix it begins with ("con" before "co", "intra"/"inter" before "in").
      if (cur.startsWith("con")) {
        l += 2;
      } else if (cur.startsWith("intra")) {
        l += 4;
      } else if (cur.startsWith("pro")) {
        l += 2;
      } else if (cur.startsWith("anti")) {
        l += 3;
      } else if (cur.startsWith("inter")) {
        l += 4;
      } else if (cur.startsWith("in")) {
        l += 2;
      } else if (cur.startsWith("im")) {
        l += 2;
      } else if (cur.startsWith("re")) {
        l += 2;
      } else if (cur.startsWith("de")) {
        l += 1;
      } else if (cur.startsWith("pre")) {
        l += 2;
      } else if (cur.startsWith("un")) {
        l += 2;
      } else if (cur.startsWith("co")) {
        l += 2;
      } else if (cur.startsWith("qu")) {
        l += 1;
      } else if (cur.startsWith("ad")) {
        l += 1;
      } else if (cur.startsWith("en")) {
        l += 2;
      } else if (cur.startsWith("al-")) {
        l += 2;
      } else if (cur.startsWith("sim")) {
        l += 2;
      } else if (cur.startsWith("sym")) {
        l += 2;
      }
      // Never read past the end of a short word.
      res[i] = cur.substring(0, Math.min(l, cur.length()));
    }
    return res;
  }
}
/**
 * Preprocessor for Hungarian: lower-cases each word and keeps only its leading characters.
 *
 * <p>The kept portion is {@link #length} characters long, extended by two characters for words
 * starting with "con" and by four for words starting with "intra". Words shorter than the
 * resulting limit are kept whole.
 */
class HungarianTruncator extends AlignmentWordPreprocessor {
  /** Base number of leading characters kept from each word. */
  int length = 6;

  /**
   * @param conf configuration; accepted but not used by this class
   */
  public HungarianTruncator(Configuration conf) {
  }

  /**
   * Truncates every word as described in the class comment.
   *
   * @param arg the words of one sentence; elements must not be {@code null}
   * @return a new array of the same length holding the truncated, lower-cased words
   */
  @Override
  protected String[] preprocessWordsImpl(String[] arg) {
    String[] res = new String[arg.length];
    for (int i = 0; i < arg.length; ++i) {
      // Locale.ROOT makes the result independent of the JVM's default locale (a Turkish default
      // locale would lower-case "INTRA" to a dotless-i form that misses the prefix test).
      final String cur = arg[i].toLowerCase(Locale.ROOT);
      int l = length;
      if (cur.startsWith("con")) {
        l += 2;
      } else if (cur.startsWith("intra")) {
        l += 4;
      }
      // Never read past the end of a short word.
      res[i] = cur.substring(0, Math.min(l, cur.length()));
    }
    return res;
  }
}
/**
 * Preprocessor for German: lower-cases each word, rewrites every "sch" as the single character
 * 'S', and keeps only a window of the result.
 *
 * <p>The window normally starts at the beginning of the word and is {@link #length} characters
 * long, extended by a fixed per-prefix amount when the word starts with one of the hard-coded
 * prefixes below. A leading "ge" (when no earlier prefix matched) is skipped instead: the window
 * then starts after it. Windows that would run past the end of the word are shortened.
 */
class GermanTruncator extends AlignmentWordPreprocessor {
  /** Base number of characters kept from each word. */
  int length = 4;

  /**
   * @param conf configuration; accepted but not used by this class
   */
  public GermanTruncator(Configuration conf) {
  }

  /**
   * Truncates every word as described in the class comment.
   *
   * @param arg the words of one sentence; elements must not be {@code null}
   * @return a new array of the same length holding the processed words (lower case, except for
   *         the 'S' that stands in for "sch")
   */
  @Override
  protected String[] preprocessWordsImpl(String[] arg) {
    String[] res = new String[arg.length];
    for (int i = 0; i < arg.length; ++i) {
      // Locale.ROOT makes the result independent of the JVM's default locale (a Turkish default
      // locale would lower-case "EIN..." to a dotless-i form and miss the prefix tests).
      // "sch" is collapsed to one character, so it occupies a single slot of the window.
      final String cur = arg[i].toLowerCase(Locale.ROOT).replace("sch", "S");
      int l = length; // window length
      int s = 0;      // window start
      // The first matching prefix wins, so a longer prefix must stay ahead of any shorter
      // prefix it begins with ("einge" before "ein", "zuge" before "zu", "ange" before "an").
      if (cur.startsWith("gegen")) {
        l += 5;
      } else if (cur.startsWith("zusammen")) {
        l += 8;
      } else if (cur.startsWith("zuge")) {
        l += 4;
      } else if (cur.startsWith("einge")) {
        l += 5;
      } else if (cur.startsWith("aufge")) {
        l += 5;
      } else if (cur.startsWith("ausge")) {
        l += 5;
      } else if (cur.startsWith("hinge")) {
        l += 5;
      } else if (cur.startsWith("herge")) {
        l += 5;
      } else if (cur.startsWith("ein")) {
        l += 3;
      } else if (cur.startsWith("zer")) {
        l += 2;
      } else if (cur.startsWith("ver")) {
        l += 3;
      } else if (cur.startsWith("ent")) {
        l += 2;
      } else if (cur.startsWith("auf")) {
        l += 3;
      } else if (cur.startsWith("aus")) {
        l += 3;
      } else if (cur.startsWith("abge")) {
        l += 4;
      } else if (cur.startsWith("bei")) {
        l += 3;
      } else if (cur.startsWith("voran")) {
        l += 5;
      } else if (cur.startsWith("vor")) {
        l += 3;
      } else if (cur.startsWith("mit")) {
        l += 3;
      } else if (cur.startsWith("ab")) {
        l += 2;
      } else if (cur.startsWith("be")) {
        l += 1;
      } else if (cur.startsWith("\u00fcber")) {
        // "\u00fcber" is u-umlaut + "ber". The literal had been corrupted to "?ber" by an
        // encoding round-trip, which made this branch unreachable for real words.
        l += 4;
      } else if (cur.startsWith("unter")) {
        l += 5;
      } else if (cur.startsWith("ge")) {
        // Unlike the other prefixes, "ge" is dropped from the output rather than kept.
        s += 2;
      } else if (cur.startsWith("er")) {
        l += 1;
      } else if (cur.startsWith("zu")) {
        l += 2;
      } else if (cur.startsWith("ange")) {
        l += 3;
      } else if (cur.startsWith("an")) {
        l += 2;
      } else if (cur.startsWith("durch")) {
        l += 5;
      } else if (cur.startsWith("nieder")) {
        l += 5;
      } else if (cur.startsWith("dar")) {
        l += 2;
      }
      // If skipping "ge" would leave nothing (the word is exactly "ge"), keep the word from
      // its start instead.
      if (s >= cur.length()) {
        s = 0;
      }
      // Shorten the window so it never runs past the end of the word.
      if (cur.length() < (s + l)) {
        l = cur.length() - s;
      }
      res[i] = cur.substring(s, s + l);
    }
    return res;
  }
}