// Source listing of edu.umd.hooka.CorpusInfo from the cloud9 artifact
// (University of Maryland's Hadoop Library), as published for Maven / Gradle / Ivy.
package edu.umd.hooka;
import org.apache.hadoop.fs.Path;
/**
 * Describes where the files belonging to one parallel corpus are located.
 *
 * <p>Each concrete corpus supplies two strings: a base directory
 * ({@link #getBasePath()}) and a base file name ({@link #getBaseName()}).
 * Every accessor in this class derives a {@link Path} from those two values
 * by appending a fixed suffix. Instances are obtained through
 * {@link #getCorpus(Corpus)}.
 */
public abstract class CorpusInfo {

  /** Identifiers of the corpora that {@link #getCorpus(Corpus)} can resolve. */
  public enum Corpus {
    HANSARDS,
    ARABIC_SMALL,
    ARABIC_10k,
    ARABIC_50k,
    ARABIC_150k,
    ARABIC_500k,
    ARABIC_1000k,
    ARABIC_1500k,
    ARABIC_5000k,
    ARABIC_LARGE,
    CZECH_WMT08,
    GERMAN_TINY
  }

  /**
   * Creates the {@code CorpusInfo} describing the requested corpus.
   *
   * <p>{@link Corpus#ARABIC_5000k} and {@link Corpus#ARABIC_LARGE} both
   * resolve to the same description ({@code ArabicLarge}).
   *
   * @param corpus the corpus identifier; must not be {@code null}
   * @return a new description object for {@code corpus}, never {@code null}
   * @throws NullPointerException if {@code corpus} is {@code null}
   * @throws IllegalArgumentException if {@code corpus} has no mapping here
   *     (only possible if a constant is added to {@link Corpus} without
   *     extending this method)
   */
  public static CorpusInfo getCorpus(Corpus corpus) {
    // A null argument already failed with a NullPointerException inside the
    // switch; keep the exception type but make the failure self-explanatory.
    if (corpus == null) {
      throw new NullPointerException("corpus must not be null");
    }
    switch (corpus) {
      case HANSARDS:
        return new Hansards();
      case CZECH_WMT08:
        return new CzechWMT08();
      case ARABIC_SMALL:
        return new ArabicSmall();
      case ARABIC_10k:
        return new Arabic10k();
      case ARABIC_50k:
        return new Arabic50k();
      case ARABIC_150k:
        return new Arabic150k();
      case ARABIC_500k:
        return new Arabic500k();
      case ARABIC_1000k:
        return new Arabic1000k();
      case ARABIC_1500k:
        return new Arabic1500k();
      case ARABIC_LARGE:
      case ARABIC_5000k:
        // Both identifiers share one description.
        return new ArabicLarge();
      case GERMAN_TINY:
        return new GermanTiny();
      default:
        // Fail loudly instead of handing callers a null they would only
        // trip over later.
        throw new IllegalArgumentException("Unsupported corpus: " + corpus);
    }
  }

  /**
   * Returns the base directory of this corpus as a string.
   *
   * @return the directory that the non-"local" accessors prepend to file names
   */
  protected abstract String getBasePath();

  /**
   * Returns the base file name of this corpus.
   *
   * @return the file-name stem to which the accessors append their suffixes
   */
  protected abstract String getBaseName();

  /**
   * Builds {@code <basePath><Path.SEPARATOR><fileName>} as a {@link Path}.
   *
   * @param fileName the name to place below the base directory
   * @return the combined path
   */
  private Path inBaseDirectory(String fileName) {
    return new Path(getBasePath() + Path.SEPARATOR + fileName);
  }

  /**
   * @return {@code <basePath>/<baseName>.bitext}
   */
  public Path getBitext() {
    return inBaseDirectory(getBaseName() + ".bitext");
  }

  /**
   * @return {@code <basePath>/<baseName>.bitext-aligned}
   */
  public Path getAlignedBitext() {
    return inBaseDirectory(getBaseName() + ".bitext-aligned");
  }

  /**
   * @return the path from {@link #getLocalTTable()} placed below the base
   *     directory
   */
  public Path getCanonicalTTable() {
    return inBaseDirectory(getLocalTTable().toString());
  }

  /**
   * @param type suffix selecting a particular table variant
   * @return the path from {@link #getLocalTTable(String)} placed below the
   *     base directory
   */
  public Path getCanonicalTTable(String type) {
    return inBaseDirectory(getLocalTTable(type).toString());
  }

  /**
   * @return {@code <baseName>.ttable}, built without the base directory
   */
  public Path getLocalTTable() {
    return new Path(getBaseName() + ".ttable");
  }

  /**
   * @param type suffix selecting a particular table variant
   * @return {@code <baseName>.ttable-<type>}, built without the base directory
   */
  public Path getLocalTTable(String type) {
    return new Path(getBaseName() + ".ttable-" + type);
  }

  /**
   * @return {@code <baseName>.atable}, built without the base directory
   */
  public Path getLocalATable() {
    return new Path(getBaseName() + ".atable");
  }

  /**
   * @return {@code <baseName>.ptable}, built without the base directory
   */
  public Path getLocalPhraseTable() {
    return new Path(getBaseName() + ".ptable");
  }

  /**
   * @return {@code <basePath>/<baseName>.test}
   */
  public Path getTestSubset() {
    return inBaseDirectory(getBaseName() + ".test");
  }

  /** Location of the corpus selected by {@link Corpus#HANSARDS}. */
  static class Hansards extends CorpusInfo {
    @Override
    public String getBasePath() {
      return "/shared/bitexts/hansards.fr-en";
    }

    @Override
    public String getBaseName() {
      return "hansards.aachen";
    }
  }

  /** Location of the corpus selected by {@link Corpus#CZECH_WMT08}. */
  static class CzechWMT08 extends CorpusInfo {
    @Override
    public String getBasePath() {
      return "/shared/bitexts/cs-en.wmt08";
    }

    @Override
    public String getBaseName() {
      return "cs-en";
    }
  }

  /** Location of the corpus selected by {@link Corpus#ARABIC_SMALL}. */
  static class ArabicSmall extends CorpusInfo {
    @Override
    public String getBasePath() {
      return "/shared/bitexts/small.ar-en.ldc";
    }

    @Override
    public String getBaseName() {
      return "small.ar-en";
    }
  }

  /**
   * Location of the corpus selected by both {@link Corpus#ARABIC_LARGE} and
   * {@link Corpus#ARABIC_5000k}.
   */
  static class ArabicLarge extends CorpusInfo {
    @Override
    public String getBasePath() {
      return "/shared/bitexts/large.ar-en.ldc";
    }

    @Override
    public String getBaseName() {
      return "large.ar-en";
    }
  }

  /** Location of the corpus selected by {@link Corpus#ARABIC_10k}. */
  static class Arabic10k extends CorpusInfo {
    @Override
    public String getBasePath() {
      // NOTE(review): the directory ends in "10k2" while the base name and the
      // sibling subsets use the plain size suffix - confirm this is intended.
      return "/shared/bitexts/ar-en.ldc.10k2";
    }

    @Override
    public String getBaseName() {
      return "ar-en.10k";
    }
  }

  /** Location of the corpus selected by {@link Corpus#ARABIC_50k}. */
  static class Arabic50k extends CorpusInfo {
    @Override
    public String getBasePath() {
      return "/shared/bitexts/ar-en.ldc.50k";
    }

    @Override
    public String getBaseName() {
      return "ar-en.50k";
    }
  }

  /** Location of the corpus selected by {@link Corpus#ARABIC_150k}. */
  static class Arabic150k extends CorpusInfo {
    @Override
    public String getBasePath() {
      return "/shared/bitexts/ar-en.ldc.150k";
    }

    @Override
    public String getBaseName() {
      return "ar-en.150k";
    }
  }

  /** Location of the corpus selected by {@link Corpus#ARABIC_500k}. */
  static class Arabic500k extends CorpusInfo {
    @Override
    public String getBasePath() {
      return "/shared/bitexts/ar-en.ldc.500k";
    }

    @Override
    public String getBaseName() {
      return "ar-en.500k";
    }
  }

  /** Location of the corpus selected by {@link Corpus#ARABIC_1500k}. */
  static class Arabic1500k extends CorpusInfo {
    @Override
    public String getBasePath() {
      return "/shared/bitexts/ar-en.ldc.1500k";
    }

    @Override
    public String getBaseName() {
      return "ar-en.1500k";
    }
  }

  /** Location of the corpus selected by {@link Corpus#ARABIC_1000k}. */
  static class Arabic1000k extends CorpusInfo {
    @Override
    public String getBasePath() {
      return "/shared/bitexts/ar-en.ldc.1000k";
    }

    @Override
    public String getBaseName() {
      return "ar-en.1000k";
    }
  }

  /** Location of the corpus selected by {@link Corpus#GERMAN_TINY}. */
  static class GermanTiny extends CorpusInfo {
    @Override
    public String getBasePath() {
      return "/shared/bitexts/tiny.de-en";
    }

    @Override
    public String getBaseName() {
      return "tiny-deen";
    }
  }
}