All Downloads are FREE. Search and download functionality uses the official Maven repository.

dist.edu.umd.hooka.CorpusInfo Maven / Gradle / Ivy

There is a newer version: 2.0.1
Show newest version
package edu.umd.hooka;

import org.apache.hadoop.fs.Path;

/**
 * Describes where the files of one known corpus are located.
 *
 * <p>Each concrete corpus supplies a base path (a directory-like prefix) and a
 * base name (a file-name prefix). Every path returned by this class is built
 * from those two strings plus a fixed suffix such as {@code .bitext} or
 * {@code .ttable}.
 *
 * <p>Methods named {@code getLocal...} return a path made of the base name and
 * suffix only; the remaining path getters prepend the base path and
 * {@link Path#SEPARATOR}.
 */
public abstract class CorpusInfo {
	/** Identifiers of the corpora that {@link #getCorpus(Corpus)} can resolve. */
	public enum Corpus {
		HANSARDS,
		ARABIC_SMALL,
		ARABIC_10k,
		ARABIC_50k,
		ARABIC_150k,
		ARABIC_500k,
		ARABIC_1000k,
		ARABIC_1500k,
		ARABIC_5000k,
		ARABIC_LARGE,
		CZECH_WMT08,
		GERMAN_TINY
	}

	/**
	 * Creates the {@code CorpusInfo} describing the given corpus.
	 *
	 * <p>{@code ARABIC_5000k} and {@code ARABIC_LARGE} both resolve to the same
	 * description ({@code ArabicLarge}).
	 *
	 * @param corpus the corpus identifier to resolve
	 * @return a new {@code CorpusInfo} instance for {@code corpus}; never {@code null}
	 * @throws NullPointerException if {@code corpus} is {@code null}
	 * @throws IllegalArgumentException if {@code corpus} has no description
	 *         registered in this method (only possible if a constant is added to
	 *         {@link Corpus} without a matching case here)
	 */
	public static CorpusInfo getCorpus(Corpus corpus) {
		switch (corpus) {
			case HANSARDS:
				return new Hansards();
			case CZECH_WMT08:
				return new CzechWMT08();
			case ARABIC_SMALL:
				return new ArabicSmall();
			case ARABIC_10k:
				return new Arabic10k();
			case ARABIC_50k:
				return new Arabic50k();
			case ARABIC_150k:
				return new Arabic150k();
			case ARABIC_500k:
				return new Arabic500k();
			case ARABIC_1000k:
				return new Arabic1000k();
			case ARABIC_1500k:
				return new Arabic1500k();
			case ARABIC_LARGE:
			case ARABIC_5000k:
				return new ArabicLarge();
			case GERMAN_TINY:
				return new GermanTiny();
			default:
				// Fail loudly instead of handing callers a null that would only
				// surface later as a NullPointerException far from the cause.
				throw new IllegalArgumentException("No CorpusInfo registered for corpus: " + corpus);
		}
	}

	/** Returns the base path under which this corpus's files are located. */
	protected abstract String getBasePath();

	/** Returns the file-name prefix shared by this corpus's files. */
	protected abstract String getBaseName();

	/**
	 * Joins the base path, {@link Path#SEPARATOR} and {@code leaf} into a path.
	 *
	 * @param leaf the trailing component; converted with string concatenation,
	 *        so a {@link Path} argument contributes its {@code toString()} value
	 * @return the combined path
	 */
	private Path underBasePath(Object leaf) {
		return new Path(getBasePath() + Path.SEPARATOR + leaf);
	}

	/** Returns the path {@code <basePath>/<baseName>.bitext}. */
	public Path getBitext() {
		return underBasePath(getBaseName() + ".bitext");
	}

	/** Returns the path {@code <basePath>/<baseName>.bitext-aligned}. */
	public Path getAlignedBitext() {
		return underBasePath(getBaseName() + ".bitext-aligned");
	}

	/** Returns {@link #getLocalTTable()} placed under the base path. */
	public Path getCanonicalTTable() {
		return underBasePath(getLocalTTable());
	}

	/**
	 * Returns {@link #getLocalTTable(String)} placed under the base path.
	 *
	 * @param type suffix appended after {@code ".ttable-"}
	 */
	public Path getCanonicalTTable(String type) {
		return underBasePath(getLocalTTable(type));
	}

	/** Returns the path {@code <baseName>.ttable} (no base path prefix). */
	public Path getLocalTTable() {
		return new Path(getBaseName() + ".ttable");
	}

	/**
	 * Returns the path {@code <baseName>.ttable-<type>} (no base path prefix).
	 *
	 * @param type suffix appended after {@code ".ttable-"}
	 */
	public Path getLocalTTable(String type) {
		return new Path(getBaseName() + ".ttable-" + type);
	}

	/** Returns the path {@code <baseName>.atable} (no base path prefix). */
	public Path getLocalATable() {
		return new Path(getBaseName() + ".atable");
	}

	/** Returns the path {@code <baseName>.ptable} (no base path prefix). */
	public Path getLocalPhraseTable() {
		return new Path(getBaseName() + ".ptable");
	}

	/** Returns the path {@code <basePath>/<baseName>.test}. */
	public Path getTestSubset() {
		return underBasePath(getBaseName() + ".test");
	}

	/** Location of the {@code HANSARDS} corpus. */
	static class Hansards extends CorpusInfo {
		@Override
		public String getBasePath() {
			return "/shared/bitexts/hansards.fr-en";
		}

		@Override
		public String getBaseName() {
			return "hansards.aachen";
		}
	}

	/** Location of the {@code CZECH_WMT08} corpus. */
	static class CzechWMT08 extends CorpusInfo {
		@Override
		public String getBasePath() {
			return "/shared/bitexts/cs-en.wmt08";
		}

		@Override
		public String getBaseName() {
			return "cs-en";
		}
	}

	/** Location of the {@code ARABIC_SMALL} corpus. */
	static class ArabicSmall extends CorpusInfo {
		@Override
		public String getBasePath() {
			return "/shared/bitexts/small.ar-en.ldc";
		}

		@Override
		public String getBaseName() {
			return "small.ar-en";
		}
	}

	/** Location used for both {@code ARABIC_LARGE} and {@code ARABIC_5000k}. */
	static class ArabicLarge extends CorpusInfo {
		@Override
		public String getBasePath() {
			return "/shared/bitexts/large.ar-en.ldc";
		}

		@Override
		public String getBaseName() {
			return "large.ar-en";
		}
	}

	/** Location of the {@code ARABIC_10k} corpus. */
	static class Arabic10k extends CorpusInfo {
		@Override
		public String getBasePath() {
			return "/shared/bitexts/ar-en.ldc.10k2";
		}

		@Override
		public String getBaseName() {
			return "ar-en.10k";
		}
	}

	/** Location of the {@code ARABIC_50k} corpus. */
	static class Arabic50k extends CorpusInfo {
		@Override
		public String getBasePath() {
			return "/shared/bitexts/ar-en.ldc.50k";
		}

		@Override
		public String getBaseName() {
			return "ar-en.50k";
		}
	}

	/** Location of the {@code ARABIC_150k} corpus. */
	static class Arabic150k extends CorpusInfo {
		@Override
		public String getBasePath() {
			return "/shared/bitexts/ar-en.ldc.150k";
		}

		@Override
		public String getBaseName() {
			return "ar-en.150k";
		}
	}

	/** Location of the {@code ARABIC_500k} corpus. */
	static class Arabic500k extends CorpusInfo {
		@Override
		public String getBasePath() {
			return "/shared/bitexts/ar-en.ldc.500k";
		}

		@Override
		public String getBaseName() {
			return "ar-en.500k";
		}
	}

	/** Location of the {@code ARABIC_1500k} corpus. */
	static class Arabic1500k extends CorpusInfo {
		@Override
		public String getBasePath() {
			return "/shared/bitexts/ar-en.ldc.1500k";
		}

		@Override
		public String getBaseName() {
			return "ar-en.1500k";
		}
	}

	/** Location of the {@code ARABIC_1000k} corpus. */
	static class Arabic1000k extends CorpusInfo {
		@Override
		public String getBasePath() {
			return "/shared/bitexts/ar-en.ldc.1000k";
		}

		@Override
		public String getBaseName() {
			return "ar-en.1000k";
		}
	}

	/** Location of the {@code GERMAN_TINY} corpus. */
	static class GermanTiny extends CorpusInfo {
		@Override
		public String getBasePath() {
			return "/shared/bitexts/tiny.de-en";
		}

		@Override
		public String getBaseName() {
			return "tiny-deen";
		}
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy