//edu.umd.hooka.CorpusVocabNormalizerAndNumberizer, from cloud9 (University of Maryland's Hadoop library)
package edu.umd.hooka;
import java.io.BufferedOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityReducer;
import org.apache.hadoop.streaming.StreamXmlRecordReader;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import edu.umd.hooka.alignment.aer.ReferenceAlignment;
import edu.umd.hooka.corpora.Chunk;
import edu.umd.hooka.corpora.Language;
import edu.umd.hooka.corpora.LanguagePair;
import edu.umd.hooka.corpora.ParallelChunk;
import edu.umd.hooka.corpora.ParallelCorpusReader;
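//Compiles an XML-format parallel corpus (bitext) into numberized PhrasePair records:
//each sentence pair is preprocessed, every token is mapped to an integer id through
//per-language vocabularies, and the result is written out as a SequenceFile for
//downstream alignment jobs. A minimal driver sketch (the HDFS paths are hypothetical):
//  Configuration c = new Configuration();
//  c.set("ha.sourcelang", "de");
//  c.set("ha.targetlang", "en");
//  preprocessAndNumberizeFiles(c, "hdfs:///data/bitext.de-en.xml",
//      new Path("hdfs:///data/comp-bitext"));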
public class CorpusVocabNormalizerAndNumberizer {
static enum BitextCompilerCounters { EN_WORDS, FR_WORDS, CHUNKS, WRONG_LANGUAGE, SRC_TOO_LONG, TGT_TOO_LONG };
private static final Logger sLogger = Logger.getLogger(CorpusVocabNormalizerAndNumberizer.class);
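//job configuration keys holding the ISO 639-1 codes of the source and target languages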
static final String SRC_LANG = "ha.sourcelang";
static final String TGT_LANG = "ha.targetlang";
public static class BitextCompilerMapper extends MapReduceBase
implements Mapper<Text, Text, Text, PhrasePair> {
String outputBase = null;
Path pf = null;
Path pe = null;
Path pa = null;
static Vocab vocE = null;
static Vocab vocF = null;
ParallelCorpusReader pcr = new ParallelCorpusReader();
Language src = null;
Language tgt = null;
AlignmentWordPreprocessor sawp = null;
AlignmentWordPreprocessor tawp = null;
LanguagePair lp = null;
JobConf job_ = null;
public void configure(JobConf job) {
sLogger.setLevel(Level.OFF);
src = Language.languageForISO639_1(job.get(SRC_LANG));
tgt = Language.languageForISO639_1(job.get(TGT_LANG));
sLogger.debug("Source language: "+src.code());
sLogger.debug("Target language: "+tgt.code());
boolean useVocabServer = false;
if (!useVocabServer) {
if (vocE == null) vocE = new VocabularyWritable();
if (vocF == null) vocF = new VocabularyWritable();
} else {
try {
vocE = new VocabServerClient(job.get("ha.vocabserver.host"),
Integer.parseInt(job.get("ha.vocabserver.port1")));
vocF = new VocabServerClient(job.get("ha.vocabserver.host"),
Integer.parseInt(job.get("ha.vocabserver.port2")));
} catch (IOException e) { e.printStackTrace(); throw new RuntimeException(e); }
}
lp = LanguagePair.languageForISO639_1Pair(
src.code() + "-" + tgt.code());
if(job.getBoolean("ha.trunc.use", true)){
sawp = AlignmentWordPreprocessor.CreatePreprocessor(lp, src, job);
tawp = AlignmentWordPreprocessor.CreatePreprocessor(lp, tgt, job);
}else{
sawp = AlignmentWordPreprocessor.CreatePreprocessor(null, null, job);
tawp = AlignmentWordPreprocessor.CreatePreprocessor(null, null, job);
}
job_ = job;
}
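//maps each token to its integer id, adding unseen tokens to the vocabulary as it goes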
public int[] convertStrings(String[] s, Vocab v) {
int[] res = new int[s.length];
for (int i = 0; i < s.length; i++) {
res[i] = v.addOrGet(s[i]);
sLogger.debug(s[i] + "-->" + res[i]);
}
return res;
}
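//reusable output key; set to each chunk's id string in map()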
Text ok = new Text("");
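//called once when the mapper finishes: persist both vocabularies under the job's
//"root" directory so later alignment passes can reuse them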
@Override
public void close() {
System.err.println("Target: " + vocE.size() + " types. Writing to "+job_.get("root",null)+"/vocab.E");
System.err.println("Source: " + vocF.size() + " types .Writing to "+job_.get("root",null)+"/vocab.F");
//write out vocabulary to file
try {
FileSystem fs = FileSystem.get(job_);
DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(fs.create(new Path(job_.get("root",null)+"/vocab.E"))));
((VocabularyWritable) vocE).write(dos);
dos.close();
DataOutputStream dos2 = new DataOutputStream(new BufferedOutputStream(fs.create(new Path(job_.get("root",null)+"/vocab.F"))));
((VocabularyWritable) vocF).write(dos2);
dos2.close();
} catch (IOException e) {
throw new RuntimeException("Vocab couldn't be written to disk.\n"+e.toString());
}
}
//reads the XML-format bitext and emits each instance as a PhrasePair, keyed by a unique string id
public void map(Text key, Text value,
OutputCollector<Text, PhrasePair> oc,
Reporter reporter) throws IOException {
//the key holds one parallel sentence pair plus its alignment, in XML format; the value is ignored
ParallelChunk c = pcr.parseString(key.toString());
ok.set(c.idString());
//a Chunk holds the sentence's tokens, split on whitespace without any further tokenization
Chunk fc = c.getChunk(src);
Chunk ec = c.getChunk(tgt);
if (fc == null || ec == null) {
reporter.incrCounter(BitextCompilerCounters.WRONG_LANGUAGE, 1);
return;
}
if (fc.getLength() > 200) {
reporter.incrCounter(BitextCompilerCounters.SRC_TOO_LONG, 1);
return;
}
if (ec.getLength() > 200) {
reporter.incrCounter(BitextCompilerCounters.TGT_TOO_LONG, 1);
return;
}
//ec,fc: English (target) / French (source) sentences represented as sequences of words
//vocE,vocF: vocabularies for English and French, of type VocabularyWritable
//ee,fe: integer representations of the words in sentences ec and fc
sLogger.debug("Target sentence:");
int[] ee = convertStrings(tawp.preprocessWordsForAlignment(ec.getWords()), vocE);
sLogger.debug("Source sentence:");
int[] fe = convertStrings(sawp.preprocessWordsForAlignment(fc.getWords()), vocF);
//e,f: the whole sentence wrapped as a Phrase
Phrase e = new Phrase(ee, 0);
Phrase f = new Phrase(fe, 1);
PhrasePair b = new PhrasePair(f, e);
ReferenceAlignment ra = c.getReferenceAlignment(lp);
if (ra != null) {
b.setAlignment(ra);
}
reporter.incrCounter(BitextCompilerCounters.EN_WORDS, e.getWords().length);
reporter.incrCounter(BitextCompilerCounters.FR_WORDS, f.getWords().length);
reporter.incrCounter(BitextCompilerCounters.CHUNKS, 1);
oc.collect(ok, b);
}
}
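//InputFormat that hands whole XML elements to the mapper as records, using Hadoop
//Streaming's StreamXmlRecordReader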
public static class XMLInput extends FileInputFormat<Text, Text> {
private CompressionCodecFactory compressionCodecs = null;
public void configure(JobConf conf) {
compressionCodecs = new CompressionCodecFactory(conf);
}
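//compressed files cannot be split, since the XML record reader must seek freely
//within its split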
protected boolean isSplitable(FileSystem fs, Path file) {
if (compressionCodecs == null) return true;
return compressionCodecs.getCodec(file) == null;
}
public RecordReader<Text, Text> getRecordReader(
InputSplit genericSplit, JobConf job,
Reporter reporter)
throws IOException {
reporter.setStatus(genericSplit.toString());
FileSplit split = (FileSplit)genericSplit;
final Path file = split.getPath();
FileSystem fs = file.getFileSystem(job);
FSDataInputStream fileIn = fs.open(split.getPath());
if (compressionCodecs != null && compressionCodecs.getCodec(file) != null)
throw new RuntimeException("Not handling compression!");
return new StreamXmlRecordReader(fileIn, split, reporter, job, FileSystem.get(job));
}
}
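//runs the bitext-compilation job: reads the XML bitext at inputPaths, numberizes it
//with BitextCompilerMapper, and writes PhrasePair SequenceFiles (plus vocab.E and
//vocab.F) under output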
@SuppressWarnings({ "deprecation", "null" })
public static void preprocessAndNumberizeFiles(Configuration c,
String inputPaths, Path output) throws IOException {
sLogger.setLevel(Level.INFO);
JobConf conf = new JobConf(c);
conf.setJobName("bitext.compile");
boolean useVocabServer = false;
Thread vst1 = null;
Thread vst2 = null;
VocabServer vocabServer1 = null;
VocabServer vocabServer2 = null;
try {
//inputPaths = bi-text given as input in main method of HadoopAlign
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(PhrasePair.class);
conf.setMapperClass(BitextCompilerMapper.class);
conf.setReducerClass(IdentityReducer.class);
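//a single map (and reduce) task keeps the in-memory vocabularies globally consistent:
//every sentence passes through the same mapper, so token ids come from one shared table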
conf.setNumMapTasks(1);
conf.setNumReduceTasks(1);
FileInputFormat.setInputPaths(conf, inputPaths);
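//configure StreamXmlRecordReader to deliver one <pchunk> element per record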
conf.set("stream.recordreader.begin", "");
conf.set("stream.recordreader.slowmatch", "false");
conf.set("stream.recordreader.maxrec", "100000");
conf.setInputFormat(XMLInput.class);
FileOutputFormat.setOutputPath(conf, output);
conf.setOutputFormat(SequenceFileOutputFormat.class);
conf.setJarByClass(CorpusVocabNormalizerAndNumberizer.class);
System.out.println("Running job "+conf.getJobName());
System.out.println("Input: " + inputPaths);
System.out.println("Output: "+output);
JobClient.runJob(conf);
} finally {
try {
if (vst1!=null) vocabServer1.stopServer();
if (vst2!=null) vocabServer2.stopServer();
if (vst1!=null) vst1.join();
if (vst2!=null) vst2.join();
} catch (InterruptedException e) {}
}
}
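//example driver; the input and output paths below are developer-specific and should
//be adjusted for your cluster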
public static void main(String[] args) {
Path[] files = new Path[2];
files[0] = new Path("/Users/redpony/bitexts/man-align/deen.ccb_jhu.xml");
files[1] = new Path("/tmp/bar.xml");
try {
Configuration c = new Configuration();
c.set(SRC_LANG, "de");
c.set(TGT_LANG, "en");
// c.set("mapred.job.tracker", "local");
// c.set("fs.default.name", "file:///");
// FileSystem.get(c).delete(new Path("/Users/ferhanture/Documents/work/hadoop-0.20.1/dummy.out"), true);
// preprocessAndNumberizeFiles(c, "/Users/ferhanture/edu/research/programs/hadoop-aligner/training-data.tar/eu-nc-wmt2008.de-en/eu-nc-wmt2008.de-en.xml", new Path("/Users/ferhanture/Documents/work/hadoop-0.20.1/dummy.out"));
preprocessAndNumberizeFiles(c, "/umd-lin/fture/mt/eu-nc-wmt2008.de-en.xml", new Path("/umd-lin/fture/mt/aligner/comp-bitext"));
} catch (Exception e) { e.printStackTrace(); }
}
}