package edu.umd.hooka;

import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.ObjectOutputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;


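/**
 * Compiles a sentence-aligned bitext (one sentence per line in an English
 * file and a foreign-language file, with an optional word-alignment file)
 * into a SequenceFile of (IntWritable line number, PhrasePair) records,
 * plus side files holding the two vocabularies and a serialized Metadata
 * object.
 *
 * <p>A minimal sketch of reading the compiled output back, assuming the
 * same old-style Hadoop API used in this class and that PhrasePair has the
 * usual no-arg Writable constructor:
 *
 * <pre>
 * SequenceFile.Reader in = new SequenceFile.Reader(fileSys, new Path(base), conf);
 * IntWritable lineNo = new IntWritable();
 * PhrasePair pair = new PhrasePair();
 * while (in.next(lineNo, pair)) {
 *     // process one sentence pair
 * }
 * in.close();
 * </pre>
 */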
public class HBitextCompiler {
	enum BitextCompilerCounters { EN_WORDS, FR_WORDS, LINES, ENCODING_ERRORS }

	static final String OUTPUT_BASENAME = "bitextcomp.outputbasename";
	static final String EN_PATH         = "bitextcomp.enpath";
	static final String FR_PATH         = "bitextcomp.frpath";
	static final String AL_PATH         = "bitextcomp.alpath";
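	/**
	 * Mapper that performs the compilation. The output basename and the
	 * paths of the English, foreign, and optional alignment files are read
	 * from the job configuration in {@link #configure(JobConf)}.
	 */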
	public static class BitextCompilerMapper extends MapReduceBase
		implements Mapper<LongWritable, Text, LongWritable, Text> {
		String outputBase = null;
		Path pf = null;
		Path pe = null;
		Path pa = null;
		public void configure(JobConf job) {
			outputBase = job.get(OUTPUT_BASENAME);
			pe = new Path(job.get(EN_PATH));
			pf = new Path(job.get(FR_PATH));
			String alps = job.get(AL_PATH);
			if (alps != null && !alps.isEmpty())
				pa = new Path(alps);
		}
		
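		/**
		 * Ignores its input record, compiles the entire bitext in one pass,
		 * and emits a single "done" record when finished.
		 */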
		public void map(LongWritable key, Text value,
		                    OutputCollector<LongWritable, Text> oc,
		                    Reporter reporter) throws IOException {
			Path output = new Path(outputBase);
			Path pmd = new Path(outputBase + ".metadata");
			
			Configuration conf = new Configuration();
			FileSystem fileSys = FileSystem.get(conf);
			
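			// Per-language vocabularies mapping word strings to integer ids;
			// compiled phrases store the ids rather than the strings.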
			VocabularyWritable vocE = new VocabularyWritable();
			VocabularyWritable vocF = new VocabularyWritable();
			SequenceFile.Writer sfw =
				SequenceFile.createWriter(fileSys, conf, output, IntWritable.class, PhrasePair.class);

			
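			// Word alignments are optional; when supplied, one line of
			// alignment links is expected per sentence pair.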
			boolean hasAlignment = (pa != null);
			BufferedReader rde = new BufferedReader(new InputStreamReader(fileSys.open(pe), "UTF-8"));
			BufferedReader rdf = new BufferedReader(new InputStreamReader(fileSys.open(pf), "UTF-8"));
			BufferedReader rda = null;
			if (hasAlignment)
				rda = new BufferedReader(new InputStreamReader(fileSys.open(pa), "UTF-8"));
			String es;
			IntWritable lci = new IntWritable(0);
			int lc = 0;
			reporter.incrCounter(BitextCompilerCounters.ENCODING_ERRORS, 0); // register the counter so it is reported even when zero
			while ((es = rde.readLine()) != null) {
				lc++;
				if (lc % 100 == 0) reporter.progress();
				reporter.incrCounter(BitextCompilerCounters.LINES, 1);
				String fs = rdf.readLine();
				if (fs == null) {
					throw new RuntimeException(pf + " has fewer lines than " + pe);
				}
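				// Convert both sides to integer-id phrases; note the pair is
				// stored foreign-side first: (f, e).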
				try {
					Phrase e=Phrase.fromString(0, es, vocE);
					Phrase f=Phrase.fromString(1, fs, vocF);
					PhrasePair b = new PhrasePair(f,e);
					if (hasAlignment) {
						Alignment a = new Alignment(f.size(), e.size(),
								rda.readLine());
						b.setAlignment(a);
					}
					lci.set(lc);
					sfw.append(lci, b);
					reporter.incrCounter(BitextCompilerCounters.EN_WORDS, e.getWords().length);
					reporter.incrCounter(BitextCompilerCounters.FR_WORDS, f.getWords().length);
					reporter.progress();
				} catch (Exception ex) {
					System.err.println("\nAt line " + lc + " caught: " + ex);
					reporter.incrCounter(BitextCompilerCounters.ENCODING_ERRORS, 1);
				}
			}
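			// The English file is exhausted; the French side must end here too.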
			if (rdf.readLine() != null) {
				throw new RuntimeException(pf + " has more lines than " + pe);
			}
			rde.close();
			rdf.close();
			if (rda != null) rda.close();
			sfw.close();
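			// Write each vocabulary to its own side file next to the compiled bitext.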
			Path pve = new Path(outputBase + ".voc.e");
			DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(fileSys.create(pve)));
			vocE.write(dos);
			dos.close();
			Path pvf = new Path(outputBase + ".voc.f");
			dos = new DataOutputStream(new BufferedOutputStream(fileSys.create(pvf)));
			vocF.write(dos);
			dos.close();
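			// Record the line count and vocabulary sizes as Java-serialized metadata.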
			Metadata theMetadata = new Metadata(lc, vocE.size(), vocF.size());
			ObjectOutputStream mdstream = new ObjectOutputStream(new BufferedOutputStream(fileSys.create(pmd)));
			mdstream.writeObject(theMetadata);
			mdstream.close();
			oc.collect(new LongWritable(0), new Text("done"));
		}
	}	

	/**
	 * Configures and runs the compilation job. The corpus and output paths
	 * below are hard-coded for a particular cluster layout; edit them
	 * before running.
	 *
	 * @param args unused
	 */
	@SuppressWarnings("deprecation")
	public static void main(String[] args) {
		JobConf conf = new JobConf(HBitextCompiler.class);
		conf.set(OUTPUT_BASENAME, "/shared/bitexts/ep700k+nc.de-en/ep700k+nc");
		conf.set(FR_PATH, "filt.lc.de");
		conf.set(EN_PATH, "filt.lc.en");
		conf.set(AL_PATH, ""); // no alignments; e.g. "/user/redpony/model-5M/aligned.grow-diag-final"
		conf.setJobName("bitext.compile");
		conf.setOutputKeyClass(LongWritable.class);
		conf.setOutputValueClass(Text.class);
		conf.setMapperClass(BitextCompilerMapper.class); 
		conf.setNumMapTasks(1);
		conf.setNumReduceTasks(0);
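		// The job's MapReduce input and output are dummies: the mapper ignores
		// its input record and writes everything via side files, so we only
		// need a placeholder input path (which must exist) and a throwaway
		// output directory.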
		FileInputFormat.setInputPaths(conf, new Path("dummy"));
		try {
			FileSystem.get(conf).delete(new Path("dummy.out"), true);
			FileOutputFormat.setOutputPath(conf, new Path("dummy.out"));
			conf.setOutputFormat(SequenceFileOutputFormat.class);
			JobClient.runJob(conf);
		} catch (IOException e) {
			System.err.println("Caught " + e);
			e.printStackTrace();
		}
	}

}