/*
 * edu.umd.hooka.HBitextCompiler
 * Part of Cloud9 — the University of Maryland's Hadoop library
 * (Maven/Gradle/Ivy artifact: cloud9).
 */
package edu.umd.hooka;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.ObjectOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
public class HBitextCompiler {
static enum BitextCompilerCounters { EN_WORDS, FR_WORDS, LINES, ENCODING_ERRORS };
static final String OUTPUT_BASENAME = "bitextcomp.outputbasename";
static final String EN_PATH = "bitextcomp.enpath";
static final String FR_PATH = "bitextcomp.frpath";
static final String AL_PATH = "bitextcomp.alpath";
public static class BitextCompilerMapper extends MapReduceBase
implements Mapper {
String outputBase = null;
Path pf = null;
Path pe = null;
Path pa = null;
public void configure(JobConf job) {
outputBase = job.get((String)OUTPUT_BASENAME);
pe = new Path(job.get((String)EN_PATH));
pf = new Path(job.get((String)FR_PATH));
String alps = job.get((String)AL_PATH);
if (alps != null && alps.compareTo("") != 0)
pa = new Path(alps);
}
public void map(LongWritable key, Text value,
OutputCollector oc,
Reporter reporter) throws IOException {
Path output = new Path(outputBase);
Path pmd = new Path(outputBase + ".metadata");
org.apache.hadoop.conf.Configuration conf = new org.apache.hadoop.conf.Configuration();
FileSystem fileSys = FileSystem.get(conf);
VocabularyWritable vocE = new VocabularyWritable();
VocabularyWritable vocF = new VocabularyWritable();
org.apache.hadoop.io.SequenceFile.Writer sfw =
SequenceFile.createWriter(fileSys, conf, output, IntWritable.class, PhrasePair.class);
//if (true) throw new RuntimeException("here " + pe + " " + pf + " " + pa);
boolean hasAlignment = (pa != null);
BufferedReader rde = new BufferedReader(new InputStreamReader(fileSys.open(pe), "UTF8"));
BufferedReader rdf = new BufferedReader(new InputStreamReader(fileSys.open(pf), "UTF8"));
BufferedReader rda = null;
if (hasAlignment)
rda = new BufferedReader(new InputStreamReader(fileSys.open(pa), "UTF8"));
String es;
IntWritable lci = new IntWritable(0);
int lc = 0;
reporter.incrCounter(BitextCompilerCounters.ENCODING_ERRORS, 0);
while ((es = rde.readLine()) != null) {
lc++;
if (lc % 100 == 0) reporter.progress();
reporter.incrCounter(BitextCompilerCounters.LINES, 1);
String fs = rdf.readLine();
if (fs == null) {
throw new RuntimeException(pf + " has fewer lines than " + pe);
}
try {
Phrase e=Phrase.fromString(0, es, vocE);
Phrase f=Phrase.fromString(1, fs, vocF);
PhrasePair b = new PhrasePair(f,e);
if (hasAlignment) {
Alignment a = new Alignment(f.size(), e.size(),
rda.readLine());
b.setAlignment(a);
}
lci.set(lc);
sfw.append(lci, b);
reporter.incrCounter(BitextCompilerCounters.EN_WORDS, e.getWords().length);
reporter.incrCounter(BitextCompilerCounters.FR_WORDS, f.getWords().length);
reporter.progress();
} catch (Exception e) {
System.err.println("\nAt line "+lc+" caught: "+e);
reporter.incrCounter(BitextCompilerCounters.ENCODING_ERRORS, 1);
}
}
if (rdf.readLine() != null) {
throw new RuntimeException(pf + " has more lines than " + pe);
}
sfw.close();
Path pve = new Path(outputBase + ".voc.e");
DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(fileSys.create(pve)));
vocE.write(dos);
dos.close();
Path pvf = new Path(outputBase + ".voc.f");
dos = new DataOutputStream(new BufferedOutputStream(fileSys.create(pvf)));
vocF.write(dos);
dos.close();
Metadata theMetadata = new Metadata(lc, vocE.size(), vocF.size());
ObjectOutputStream mdstream = new ObjectOutputStream(new BufferedOutputStream(fileSys.create(pmd)));
mdstream.writeObject(theMetadata);
mdstream.close();
oc.collect(new LongWritable(0), new Text("done"));
}
}
/**
* @param args
*/
@SuppressWarnings("deprecation")
public static void main(String[] args) {
JobConf conf = new JobConf(HBitextCompiler.class);
conf.set(OUTPUT_BASENAME, "/shared/bitexts/ep700k+nc.de-en/ep700k+nc");
conf.set(FR_PATH, "filt.lc.de");
conf.set(EN_PATH, "filt.lc.en");
conf.set(AL_PATH, ""); ///user/redpony/model-5M/aligned.grow-diag-final");
conf.setJobName("bitext.compile");
conf.setOutputKeyClass(LongWritable.class);
conf.setOutputValueClass(Text.class);
conf.setMapperClass(BitextCompilerMapper.class);
conf.setNumMapTasks(1);
conf.setNumReduceTasks(0);
FileInputFormat.setInputPaths(conf, new Path("dummy"));
try {
FileSystem.get(conf).delete(new Path("dummy.out"));
FileOutputFormat.setOutputPath(conf, new Path("dummy.out"));
conf.setOutputFormat(SequenceFileOutputFormat.class);
JobClient.runJob(conf);
} catch (IOException e) {
System.err.println("Caught " + e);
e.printStackTrace();
}
}
}