/**
* Copyright 2013 LinkedIn, Inc
* 
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
* 
* http://www.apache.org/licenses/LICENSE-2.0
* 
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/

package datafu.hourglass.jobs;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapred.AvroValue;
import org.apache.avro.mapreduce.AvroJob;
import org.apache.avro.mapreduce.AvroKeyInputFormat;
import org.apache.avro.mapreduce.AvroKeyOutputFormat;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.log4j.Logger;


import datafu.hourglass.avro.CombinedAvroKeyInputFormat;
import datafu.hourglass.fs.DatePath;
import datafu.hourglass.fs.DateRange;
import datafu.hourglass.fs.PathUtils;

/**
 * Base class for Hadoop jobs that consume time-partitioned data
 * in a non-incremental way.  Typically this is only used for comparing incremental
 * jobs against a non-incremental baseline.
 * It is essentially the same as {@link AbstractPartitionCollapsingIncrementalJob}
 * without all the incremental features.
 * 
 * <p>
 * Jobs extending this class consume input data partitioned according to yyyy/MM/dd.
 * Only a single input path is supported.  The output will be written to a directory
 * in the output path with name format yyyyMMdd derived from the end of the time
 * window that is consumed.
 * </p>
 * 
 * <p>
 * This class has the same configuration and methods as {@link TimeBasedJob}.
 * In addition it also recognizes the following properties:
 * </p>
 * 
 * <ul>
 *   <li><em>combine.inputs</em> - true if inputs should be combined (defaults to false)</li>
 *   <li><em>num.reducers.bytes.per.reducer</em> - number of input bytes per reducer</li>
 * </ul>
 * 
 * <p>
 * When <em>combine.inputs</em> is true, then {@link CombinedAvroKeyInputFormat} is used
 * instead of {@link AvroKeyInputFormat}.  This enables a single map task to consume more
 * than one file.
 * </p>
 * 
 * <p>
 * The <em>num.reducers.bytes.per.reducer</em> property controls the number of reducers to
 * use based on the input size.  The total size of the input files is divided by this number
 * and then rounded up.
 * </p>
 * 
 * @author "Matthew Hayes"
 *
 */
public abstract class AbstractNonIncrementalJob extends TimeBasedJob
{
  private final Logger _log = Logger.getLogger(AbstractNonIncrementalJob.class);

  private boolean _combineInputs;
  private Report _report;

  /**
   * Initializes the job.
   * 
   * @param name job name
   * @param props configuration properties
   * @throws IOException
   */
  public AbstractNonIncrementalJob(String name, Properties props) throws IOException
  {
    super(name, props);

    if (props.containsKey("combine.inputs"))
    {
      setCombineInputs(Boolean.parseBoolean(props.getProperty("combine.inputs")));
    }
  }

  /**
   * Gets whether inputs should be combined.
   * 
   * @return true if inputs are to be combined
   */
  public boolean getCombineInputs()
  {
    return _combineInputs;
  }

  /**
   * Sets whether inputs should be combined.
   * 
   * @param combineInputs true to combine inputs
   */
  public void setCombineInputs(boolean combineInputs)
  {
    _combineInputs = combineInputs;
  }

  /**
   * Gets a report summarizing the run.
   * 
   * @return report
   */
  public Report getReport()
  {
    return _report;
  }

  /**
   * Runs the job.
   * 
   * @throws IOException
   * @throws InterruptedException
   * @throws ClassNotFoundException
   */
  @Override
  public void run() throws IOException, InterruptedException, ClassNotFoundException
  {
    _report = new Report();

    Calendar cal = Calendar.getInstance(PathUtils.timeZone);

    if (!getFileSystem().exists(getOutputPath()))
    {
      getFileSystem().mkdirs(getOutputPath());
    }

    if (getInputPaths().size() > 1)
    {
      throw new RuntimeException("Only a single input is supported");
    }

    List<DatePath> inputs = PathUtils.findNestedDatedPaths(getFileSystem(), getInputPaths().get(0));
    DatePath latestInput = (inputs.size() > 0) ? inputs.get(inputs.size() - 1) : null;

    if (inputs.size() == 0)
    {
      throw new RuntimeException("no input data available");
    }

    List<Date> dates = new ArrayList<Date>();
    for (DatePath dp : inputs)
    {
      dates.add(dp.getDate());
    }

    DateRange dateRange = DateRangePlanner.getDateRange(getStartDate(), getEndDate(), dates, getDaysAgo(), getNumDays());

    Map<Date,DatePath> existingInputs = new HashMap<Date,DatePath>();
    for (DatePath input : inputs)
    {
      existingInputs.put(input.getDate(), input);
    }

    _log.info("Getting schema for input " + latestInput.getPath());
    Schema inputSchema = PathUtils.getSchemaFromPath(getFileSystem(), latestInput.getPath());

    ReduceEstimator estimator = new ReduceEstimator(getFileSystem(), getProperties());

    List<String> inputPaths = new ArrayList<String>();
    for (Date currentDate = dateRange.getBeginDate(); currentDate.compareTo(dateRange.getEndDate()) <= 0; )
    {
      DatePath input = existingInputs.get(currentDate);
      if (input != null)
      {
        _log.info(String.format("Processing %s", input.getPath()));
        inputPaths.add(input.getPath().toString());
        estimator.addInputPath(input.getPath());
        _report.inputFiles.add(input);
        latestInput = input;
      }
      else
      {
        throw new RuntimeException(String.format("Missing input for %s", PathUtils.datedPathFormat.format(currentDate)));
      }

      cal.setTime(currentDate);
      cal.add(Calendar.DAY_OF_MONTH, 1);
      currentDate = cal.getTime();
    }

    Path timestampOutputPath = new Path(getOutputPath(), PathUtils.datedPathFormat.format(latestInput.getDate()));

    final StagedOutputJob job = StagedOutputJob.createStagedJob(
        getConf(),
        getName() + "-" + PathUtils.datedPathFormat.format(latestInput.getDate()),
        inputPaths,
        "/tmp" + timestampOutputPath.toString(),
        timestampOutputPath.toString(),
        _log);

    job.setCountersParentPath(getCountersParentPath());

    if (_combineInputs)
    {
      job.setInputFormatClass(CombinedAvroKeyInputFormat.class);
    }
    else
    {
      job.setInputFormatClass(AvroKeyInputFormat.class);
    }

    job.setOutputFormatClass(AvroKeyOutputFormat.class);

    AvroJob.setInputKeySchema(job, inputSchema);
    AvroJob.setMapOutputKeySchema(job, getMapOutputKeySchema());
    AvroJob.setMapOutputValueSchema(job, getMapOutputValueSchema());
    AvroJob.setOutputKeySchema(job, getReduceOutputSchema());

    int numReducers;
    if (getNumReducers() != null)
    {
      numReducers = getNumReducers();
      _log.info(String.format("Using %d reducers (fixed)", numReducers));
    }
    else
    {
      numReducers = estimator.getNumReducers();
      _log.info(String.format("Using %d reducers (computed)", numReducers));
    }

    job.setNumReduceTasks(numReducers);

    job.setMapperClass(getMapperClass());
    job.setReducerClass(getReducerClass());

    if (isUseCombiner() && getCombinerClass() != null)
    {
      job.setCombinerClass(getCombinerClass());
    }

    config(job.getConfiguration());

    if (!job.waitForCompletion(true))
    {
      _log.error("Job failed! Quitting...");
      throw new RuntimeException("Job failed");
    }

    _report.jobId = job.getJobID().toString();
    _report.jobName = job.getJobName();
    _report.countersPath = job.getCountersPath();
    _report.outputFile = new DatePath(latestInput.getDate(), timestampOutputPath);

    if (getRetentionCount() != null)
    {
      PathUtils.keepLatestDatedPaths(getFileSystem(), getOutputPath(), getRetentionCount());
    }
  }

  /**
   * Gets the key schema for the map output.
   * 
   * @return map output key schema
   */
  protected abstract Schema getMapOutputKeySchema();

  /**
   * Gets the value schema for the map output.
   * 
   * @return map output value schema
   */
  protected abstract Schema getMapOutputValueSchema();

  /**
   * Gets the reduce output schema.
   * 
   * @return reduce output schema
   */
  protected abstract Schema getReduceOutputSchema();

  /**
   * Gets the mapper class.
   * 
   * @return the mapper
   */
  public abstract Class<? extends BaseMapper> getMapperClass();

  /**
   * Gets the reducer class.
   * 
   * @return the reducer
   */
  public abstract Class<? extends BaseReducer> getReducerClass();

  /**
   * Gets the combiner class.
   * 
   * @return the combiner
   */
  public Class<? extends BaseCombiner> getCombinerClass()
  {
    return null;
  }

  /**
   * Mapper base class for {@link AbstractNonIncrementalJob}.
   * 
   * @author "Matthew Hayes"
   *
   */
  public static abstract class BaseMapper extends Mapper<AvroKey<GenericRecord>, NullWritable, AvroKey<GenericRecord>, AvroValue<GenericRecord>>
  {
  }

  /**
   * Combiner base class for {@link AbstractNonIncrementalJob}.
   * 
   * @author "Matthew Hayes"
   *
   */
  public static abstract class BaseCombiner extends Reducer<AvroKey<GenericRecord>, AvroValue<GenericRecord>, AvroKey<GenericRecord>, AvroValue<GenericRecord>>
  {
  }

  /**
   * Reducer base class for {@link AbstractNonIncrementalJob}.
   * 
   * @author "Matthew Hayes"
   *
   */
  public static abstract class BaseReducer extends Reducer<AvroKey<GenericRecord>, AvroValue<GenericRecord>, AvroKey<GenericRecord>, NullWritable>
  {
  }

  /**
   * Reports files created and processed for an iteration of the job.
   * 
   * @author "Matthew Hayes"
   *
   */
  public static class Report
  {
    private String jobName;
    private String jobId;
    private Path countersPath;
    private List<DatePath> inputFiles = new ArrayList<DatePath>();
    private DatePath outputFile;

    /**
     * Gets the job name.
     * 
     * @return job name
     */
    public String getJobName()
    {
      return jobName;
    }

    /**
     * Gets the job ID.
     * 
     * @return job ID
     */
    public String getJobId()
    {
      return jobId;
    }

    /**
     * Gets the path to the counters file, if one was written.
     * 
     * @return counters path
     */
    public Path getCountersPath()
    {
      return countersPath;
    }

    /**
     * Gets input files that were processed.  These are files that are within
     * the desired date range.
     * 
     * @return input files
     */
    public List<DatePath> getInputFiles()
    {
      return Collections.unmodifiableList(inputFiles);
    }

    /**
     * Gets the output file that was produced by the job.
     * 
     * @return output file
     */
    public DatePath getOutputFile()
    {
      return outputFile;
    }
  }
}
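
/*
 * Usage sketch, not part of the original source.  It shows one way a concrete
 * job might extend AbstractNonIncrementalJob: supply the three Avro schemas
 * and point getMapperClass()/getReducerClass() at subclasses of BaseMapper
 * and BaseReducer.  The ExampleCountJob name, the inline schema literals, and
 * the assumption that input records carry a long "id" field are illustrative,
 * not part of the Hourglass API.
 */
class ExampleCountJob extends AbstractNonIncrementalJob
{
  // Hypothetical schemas: the map output key is an id, the map output value is
  // a partial count, and the reduce output pairs the id with its total.
  private static final Schema KEY_SCHEMA = new Schema.Parser().parse(
      "{\"type\":\"record\",\"name\":\"Key\",\"fields\":[{\"name\":\"id\",\"type\":\"long\"}]}");
  private static final Schema VALUE_SCHEMA = new Schema.Parser().parse(
      "{\"type\":\"record\",\"name\":\"Value\",\"fields\":[{\"name\":\"count\",\"type\":\"long\"}]}");
  private static final Schema OUTPUT_SCHEMA = new Schema.Parser().parse(
      "{\"type\":\"record\",\"name\":\"Output\",\"fields\":["
      + "{\"name\":\"id\",\"type\":\"long\"},{\"name\":\"count\",\"type\":\"long\"}]}");

  public ExampleCountJob(String name, Properties props) throws IOException
  {
    super(name, props);
  }

  @Override
  protected Schema getMapOutputKeySchema() { return KEY_SCHEMA; }

  @Override
  protected Schema getMapOutputValueSchema() { return VALUE_SCHEMA; }

  @Override
  protected Schema getReduceOutputSchema() { return OUTPUT_SCHEMA; }

  @Override
  public Class<? extends BaseMapper> getMapperClass() { return CountMapper.class; }

  @Override
  public Class<? extends BaseReducer> getReducerClass() { return CountReducer.class; }

  // Emits (id, 1) for every input record.
  public static class CountMapper extends BaseMapper
  {
    @Override
    protected void map(AvroKey<GenericRecord> input, NullWritable unused, Context context)
        throws IOException, InterruptedException
    {
      GenericRecord key = new org.apache.avro.generic.GenericData.Record(KEY_SCHEMA);
      key.put("id", input.datum().get("id"));
      GenericRecord value = new org.apache.avro.generic.GenericData.Record(VALUE_SCHEMA);
      value.put("count", 1L);
      context.write(new AvroKey<GenericRecord>(key), new AvroValue<GenericRecord>(value));
    }
  }

  // Sums the partial counts for each id and writes a single output record.
  public static class CountReducer extends BaseReducer
  {
    @Override
    protected void reduce(AvroKey<GenericRecord> key, Iterable<AvroValue<GenericRecord>> values, Context context)
        throws IOException, InterruptedException
    {
      long total = 0L;
      for (AvroValue<GenericRecord> value : values)
      {
        total += (Long)value.datum().get("count");
      }
      GenericRecord output = new org.apache.avro.generic.GenericData.Record(OUTPUT_SCHEMA);
      output.put("id", key.datum().get("id"));
      output.put("count", total);
      context.write(new AvroKey<GenericRecord>(output), NullWritable.get());
    }
  }
}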


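/*
 * Configuration sketch, also not part of the original source, wiring up the
 * hypothetical ExampleCountJob above.  The paths and values are made up, and
 * the input.path/output.path/num.days property names follow the TimeBasedJob
 * configuration conventions; only combine.inputs and
 * num.reducers.bytes.per.reducer are documented by AbstractNonIncrementalJob
 * itself.
 */
class ExampleCountJobDriver
{
  public static void main(String[] args) throws Exception
  {
    Properties props = new Properties();
    props.setProperty("input.path", "/data/events");   // input partitioned as yyyy/MM/dd
    props.setProperty("output.path", "/output/counts");
    props.setProperty("num.days", "30");               // consume a 30 day window

    // Combine small daily files so a single map task can read several of them.
    props.setProperty("combine.inputs", "true");

    // One reducer per 256 MB of input, rounded up: 10 GB of input would get
    // ceil(10737418240 / 268435456) = 40 reducers.
    props.setProperty("num.reducers.bytes.per.reducer", "268435456");

    ExampleCountJob job = new ExampleCountJob("example-count", props);
    job.run();
  }
}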

