
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package datafu.hourglass.jobs;
import java.io.IOException;
import java.sql.Date;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.List;
import java.util.Properties;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapreduce.AvroJob;
import org.apache.avro.mapreduce.AvroKeyInputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.log4j.Logger;
import datafu.hourglass.avro.AvroDateRangeMetadata;
import datafu.hourglass.avro.AvroKeyWithMetadataOutputFormat;
import datafu.hourglass.avro.AvroMultipleInputsKeyInputFormat;
import datafu.hourglass.avro.AvroMultipleInputsUtil;
import datafu.hourglass.fs.DatePath;
import datafu.hourglass.fs.DateRange;
import datafu.hourglass.fs.PathUtils;
import datafu.hourglass.mapreduce.AvroKeyValueIdentityMapper;
import datafu.hourglass.mapreduce.CollapsingCombiner;
import datafu.hourglass.mapreduce.CollapsingMapper;
import datafu.hourglass.mapreduce.CollapsingReducer;
import datafu.hourglass.mapreduce.DelegatingCombiner;
import datafu.hourglass.mapreduce.DelegatingMapper;
import datafu.hourglass.mapreduce.DelegatingReducer;
import datafu.hourglass.mapreduce.DistributedCacheHelper;
import datafu.hourglass.mapreduce.Parameters;
import datafu.hourglass.model.Accumulator;
import datafu.hourglass.model.Mapper;
import datafu.hourglass.model.Merger;
import datafu.hourglass.schemas.PartitionCollapsingSchemas;
/**
* An {@link IncrementalJob} that consumes partitioned input data and collapses the
* partitions to produce a single output. This job can be used to process data
* using a sliding window. It is capable of reusing the previous output, which
* means that it can process data more efficiently.
* Only Avro is supported for the input, intermediate, and output data.
*
* Implementations of this class must provide key, intermediate value, and output value schemas.
* The key and intermediate value schemas define the output for the mapper and combiner.
* The key and output value schemas define the output for the reducer.
* These are defined by overriding {@link #getKeySchema()}, {@link #getIntermediateValueSchema()},
* and {@link #getOutputValueSchema()}.
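*
* A minimal sketch of these overrides, using Avro's SchemaBuilder; the record and field
* names are illustrative only:
*
* <pre>
* {@code
* protected Schema getKeySchema() {
*   return SchemaBuilder.record("Key").fields().requiredLong("memberId").endRecord();
* }
*
* protected Schema getIntermediateValueSchema() {
*   return SchemaBuilder.record("Value").fields().requiredLong("count").endRecord();
* }
*
* protected Schema getOutputValueSchema() {
*   return SchemaBuilder.record("Output").fields().requiredLong("count").endRecord();
* }
* }
* </pre>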
*
* Implementations must also provide a mapper by overriding {@link #getMapper()} and an accumulator
* for the reducer by overriding {@link #getReducerAccumulator()}. An optional combiner may be
* provided by overriding {@link #getCombinerAccumulator()}. For the combiner to be used
* the property use.combiner must also be set to true.
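*
* For example, a concrete job might wire these up as follows; CountingMapper and
* CountingAccumulator are hypothetical implementations of the Mapper and Accumulator
* interfaces, defined elsewhere:
*
* <pre>
* {@code
* public Mapper<GenericRecord,GenericRecord,GenericRecord> getMapper() {
*   return new CountingMapper();
* }
*
* public Accumulator<GenericRecord,GenericRecord> getReducerAccumulator() {
*   return new CountingAccumulator();
* }
*
* // Optional combiner; also requires setting use.combiner to true.
* public Accumulator<GenericRecord,GenericRecord> getCombinerAccumulator() {
*   return getReducerAccumulator();
* }
* }
* </pre>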
*
* The input path can be provided either through the property input.path
* or by calling {@link #setInputPaths(List)}. If multiple input paths are provided then
* this implicitly means a join is to be performed. Multiple input paths can be provided via
* properties by prefixing each with input.path., such as input.path.first
* and input.path.second.
* Input data must be partitioned by day according to the naming convention yyyy/MM/dd.
* The output path can be provided either through the property output.path
* or by calling {@link #setOutputPath(Path)}.
* Output data will be written using the naming convention yyyyMMdd, where the date used
* to format the output path is the same as the end of the desired time range to process.
* For example, if the desired time range to process is 2013/01/01 through 2013/01/14,
* then the output will be named 20130114.
* By default the job will fail if any input data in the desired time window is missing. This can be overridden by setting
* fail.on.missing to false.
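*
* For example, configuring the paths through properties might look like the following;
* the paths shown are placeholders:
*
* <pre>
* {@code
* Properties props = new Properties();
* props.setProperty("input.path", "/data/event");            // expects /data/event/yyyy/MM/dd
* // or, for a join over multiple inputs:
* //   input.path.first, input.path.second, ...
* props.setProperty("output.path", "/output/event_count");   // produces /output/event_count/yyyyMMdd
* props.setProperty("fail.on.missing", "false");              // tolerate missing days instead of failing
* }
* </pre>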
*
* The job will not process input if the corresponding output has already been produced. For example, if the desired date
* range is 2013/01/01 through 2013/01/14 and the output 20130114 already exists, then it assumes the work has already
* been completed.
*
* By default only the latest output will be kept. All other outputs will be removed. This can be controlled
* by setting the property retention.count, or by calling {@link #setRetentionCount(Integer)}.
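*
* For example, to keep the last seven outputs (the value is illustrative):
*
* <pre>
* {@code
* setRetentionCount(7);  // or: retention.count=7
* }
* </pre>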
*
* Two types of sliding windows may be used: fixed-length and fixed-start. For a fixed-length
* sliding window, the size of the window is fixed; the start and end move according to the
* availability of input data. For a fixed-start window, the size of the window is flexible;
* the start is fixed and the end moves according to the availability of input data.
*
* A fixed-length sliding window can be defined either by setting the property num.days
* or by calling {@link #setNumDays(Integer)}. This sets how many days of input data will be
* consumed. By default the end of the window will be the same as the date of the latest available
* input data. The start is then determined by the number of days to consume. The end date can
* be moved back relative to the latest input data by setting the days.ago property or
* by calling {@link #setDaysAgo(Integer)}. Since the end date is determined by the availability
* of input data, as new data arrives the window will advance forward.
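*
* A sketch of a fixed-length window covering the 30 most recent days of input, ending one
* day before the latest available data (the values are illustrative):
*
* <pre>
* {@code
* setNumDays(30);  // or: num.days=30
* setDaysAgo(1);   // or: days.ago=1
* }
* </pre>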
*
* A fixed-start sliding window can be defined by setting the property start.date or
* by calling {@link #setStartDate(java.util.Date)}. The end date will be the same as the date of
* the latest available input data. The end date can
* be moved back relative to the latest input data by setting the days.ago property or
* by calling {@link #setDaysAgo(Integer)}.
* Because the end date is determined by the availability of input data, as new data arrives the window
* will grow to include it.
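*
* A sketch of a fixed-start window beginning on 2013/01/01; the date format used here is
* only a way to construct the java.util.Date for the example:
*
* <pre>
* {@code
* setStartDate(new SimpleDateFormat("yyyy/MM/dd").parse("2013/01/01"));
* // or equivalently via the start.date property
* }
* </pre>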
*
* Previous output can be reused by setting the reuse.previous.output property to true, or
* by calling {@link #setReusePreviousOutput(boolean)}. Reusing the previous output is often more efficient
* because only input data outside of the time window covered by the previous output needs to be consumed.
* For example, given a fixed-start sliding window job, if one new day of input data is available since the
* last time the job ran, then the job can reuse the previous output and only read the newest day of data, rather
* than reading all the input data again. Given a fixed-length sliding window in the same scenario, the new output
* can be produced by adding the newest input to the previous output and subtracting the oldest input from the old
* window.
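*
* For example, to enable output reuse:
*
* <pre>
* {@code
* setReusePreviousOutput(true);  // or: reuse.previous.output=true
* }
* </pre>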
*
* For a fixed-start sliding window, if the schemas for the intermediate and output values are the same then no additional
* changes are necessary, as the reducer's accumulator should be capable of adding the new input to the previous output.
* However, if they are different, then a merger must be defined by overriding {@link #getRecordMerger()} so that the previous
* output can be merged with the partial output produced by reducing the new input data.
* For the fixed-length sliding window one must override {@link #getOldRecordMerger()} to reuse the previous output.
* This method essentially unmerges old, partial output data from the current output. For this case as well, if the intermediate
* and output schemas are the same, the {@link #getRecordMerger()} method does not need to be overridden.
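*
* A sketch of providing the mergers when the schemas differ; OutputMerger and
* OutputUnmerger are hypothetical Merger implementations defined elsewhere:
*
* <pre>
* {@code
* public Merger<GenericRecord> getRecordMerger() {
*   return new OutputMerger();    // merges previous output with new partial output
* }
*
* public Merger<GenericRecord> getOldRecordMerger() {
*   return new OutputUnmerger();  // unmerges old partial output from the new output
* }
* }
* </pre>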
*
* The number of reducers to use is automatically determined based on the size of the data to process.
* The total size is computed and then divided by the value of the property num.reducers.bytes.per.reducer, which
* defaults to 256 MB. The result of this division is the number of reducers that will be used. This calculation includes
* the input data as well as any previous output that will be reused. It is also possible to calculate the number of reducers
* separately for the input and previous output through the properties num.reducers.input.bytes.per.reducer
* and num.reducers.previous.bytes.per.reducer. The reducers will be computed separately for the two sets of data
* and then added together. The number of reducers can also be set to a fixed value through the property num.reducers.
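*
* For example, with the default of 256 MB per reducer, roughly 10 GB of data yields about
* 40 reducers. The properties below override the defaults (the values are illustrative):
*
* <pre>
* {@code
* props.setProperty("num.reducers.bytes.per.reducer", "536870912");  // 512 MB per reducer
* // or compute the input and previous output contributions separately:
* props.setProperty("num.reducers.input.bytes.per.reducer", "268435456");
* props.setProperty("num.reducers.previous.bytes.per.reducer", "1073741824");
* // or simply fix the count:
* props.setProperty("num.reducers", "20");
* }
* </pre>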
*
* This type of job is capable of performing its work over multiple iterations if previous output can be reused.
* The number of days to process at a time can be limited by setting the property max.days.to.process,
* or by calling {@link #setMaxToProcess(Integer)}. The default is 90 days.
* This can be useful when there are restrictions on how many tasks
* can be used by a single MapReduce job in the cluster. When this property is set, the job will process no more than
* this many days at a time, and it will perform one or more iterations if necessary to complete the work.
* The number of iterations can be limited by setting the property max.iterations, or by calling {@link #setMaxIterations(Integer)}.
* If the number of iterations is exceeded the job will fail. By default the maximum number of iterations is 20.
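*
* For example (the values are illustrative):
*
* <pre>
* {@code
* setMaxToProcess(30);   // process at most 30 days per iteration (default 90)
* setMaxIterations(10);  // fail if more than 10 iterations are needed (default 20)
* }
* </pre>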
*
* Hadoop configuration may be provided by setting properties prefixed with hadoop-conf.
* For example, mapred.min.split.size can be configured by setting the property
* hadoop-conf.mapred.min.split.size to the desired value.
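*
* For example (the value is illustrative):
*
* <pre>
* {@code
* props.setProperty("hadoop-conf.mapred.min.split.size", "536870912");  // 512 MB
* }
* </pre>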
*
*/
public abstract class AbstractPartitionCollapsingIncrementalJob extends IncrementalJob
{
private final Logger _log = Logger.getLogger(AbstractPartitionCollapsingIncrementalJob.class);
private List<Report> _reports = new ArrayList<Report>();
protected boolean _reusePreviousOutput;
private FileCleaner _garbage;
/**
* Initializes the job.
* @throws IOException IOException
*/
public AbstractPartitionCollapsingIncrementalJob() throws IOException
{
}
/**
* Initializes the job with a job name and properties.
*
* @param name job name
* @param props configuration properties
* @throws IOException IOException
*/
public AbstractPartitionCollapsingIncrementalJob(String name, Properties props) throws IOException
{
super(name,props);
}
/**
* Gets the mapper.
*
* @return mapper
*/
public abstract Mapper<GenericRecord,GenericRecord,GenericRecord> getMapper();
/**
* Gets the accumulator used for the combiner.
*
* @return combiner accumulator
*/
public Accumulator<GenericRecord,GenericRecord> getCombinerAccumulator()
{
return null;
}
/**
* Gets the accumulator used for the reducer.
*
* @return reducer accumulator
*/
public abstract Accumulator<GenericRecord,GenericRecord> getReducerAccumulator();
/**
* Gets the record merger that is capable of merging previous output with a new partial output.
* This is only needed when reusing previous output where the intermediate and output schemas are different.
* New partial output is produced by the reducer from new input that is after the previous output.
*
* @return merger
*/
public Merger<GenericRecord> getRecordMerger()
{
return null;
}
/**
* Gets the record merger that is capable of unmerging old partial output from the new output.
* This is only needed when reusing previous output for a fixed-length sliding window.
* The new output is the result of merging the previous output with the new partial output.
* The old partial output is produced by the reducer from old input data before the time range of
* the previous output.
*
* @return merger
*/
public Merger<GenericRecord> getOldRecordMerger()
{
return null;
}
/**
* Get the name for the reduce output schema.
* By default this is the name of the class with "Output" appended.
*
* @return output schema name
*/
protected String getOutputSchemaName()
{
return this.getClass().getSimpleName() + "Output";
}
/**
* Get the namespace for the reduce output schema.
* By default this is the package of the class.
*
* @return output schema namespace
*/
protected String getOutputSchemaNamespace()
{
return this.getClass().getPackage().getName();
}
@Override
public void setProperties(Properties props)
{
super.setProperties(props);
if (getProperties().get("reuse.previous.output") != null)
{
setReusePreviousOutput(Boolean.parseBoolean((String)getProperties().get("reuse.previous.output")));
}
}
/**
* Get whether previous output should be reused.
*
* @return true if previous output should be reused
*/
public boolean getReusePreviousOutput()
{
return _reusePreviousOutput;
}
/**
* Set whether previous output should be reused.
*
* @param reuse true if previous output should be reused
*/
public void setReusePreviousOutput(boolean reuse)
{
_reusePreviousOutput = reuse;
}
@Override
protected void initialize()
{
_garbage = new FileCleaner(getFileSystem());
if (getMaxIterations() == null)
{
setMaxIterations(20);
}
if (getMaxToProcess() == null)
{
if (getNumDays() != null)
{
setMaxToProcess(getNumDays());
}
else
{
setMaxToProcess(90);
}
}
if (getRetentionCount() == null)
{
setRetentionCount(1);
}
super.initialize();
}
@Override
public void run() throws IOException, InterruptedException, ClassNotFoundException
{
try
{
initialize();
validate();
execute();
}
finally
{
cleanup();
}
}
/**
* Get reports that summarize each of the job iterations.
*
* @return reports
*/
public List<Report> getReports()
{
return Collections.unmodifiableList(_reports);
}
/**
* Execute the job.
*
* @throws IOException
* @throws InterruptedException
* @throws ClassNotFoundException
*/
private void execute() throws IOException, InterruptedException, ClassNotFoundException
{
int iterations = 0;
while (true)
{
PartitionCollapsingExecutionPlanner planner = new PartitionCollapsingExecutionPlanner(getFileSystem(),getProperties());
planner.setInputPaths(getInputPaths());
planner.setOutputPath(getOutputPath());
planner.setStartDate(getStartDate());
planner.setEndDate(getEndDate());
planner.setDaysAgo(getDaysAgo());
planner.setNumDays(getNumDays());
planner.setMaxToProcess(getMaxToProcess());
planner.setReusePreviousOutput(getReusePreviousOutput());
planner.setFailOnMissing(isFailOnMissing());
planner.createPlan();
if (planner.getInputsToProcess().size() == 0)
{
_log.info("Nothing to do");
break;
}
if (iterations >= getMaxIterations())
{
throw new RuntimeException(String.format("Already completed %d iterations but the max is %d and there are still %d inputs to process",
iterations,
getMaxIterations(),
planner.getInputsToProcess().size()));
}
Report report = new Report();
report.inputFiles.addAll(planner.getNewInputsToProcess());
report.oldInputFiles.addAll(planner.getOldInputsToProcess());
if (planner.getPreviousOutputToProcess() != null)
{
report.reusedOutput = planner.getPreviousOutputToProcess();
}
DatePath outputPath = DatePath.createDatedPath(getOutputPath(), planner.getCurrentDateRange().getEndDate());
_log.info("Output path: " + outputPath);
Path tempOutputPath = createRandomTempPath();
_garbage.add(tempOutputPath);
final StagedOutputJob job = StagedOutputJob.createStagedJob(
getConf(),
getName() + "-" + PathUtils.datedPathFormat.format(planner.getCurrentDateRange().getEndDate()),
null, // no input paths specified here, will add multiple inputs down below
tempOutputPath.toString(),
outputPath.getPath().toString(),
_log);
job.setCountersParentPath(getCountersParentPath());
if (planner.getNewInputsToProcess() != null && planner.getNewInputsToProcess().size() > 0)
{
_log.info("*** New Input data:");
for (DatePath inputPath : planner.getNewInputsToProcess())
{
_log.info(inputPath.getPath());
MultipleInputs.addInputPath(job, inputPath.getPath(), AvroMultipleInputsKeyInputFormat.class, DelegatingMapper.class);
}
}
if (planner.getOldInputsToProcess() != null && planner.getOldInputsToProcess().size() > 0)
{
_log.info("*** Old Input data:");
for (DatePath inputPath : planner.getOldInputsToProcess())
{
_log.info(inputPath.getPath());
MultipleInputs.addInputPath(job, inputPath.getPath(), AvroMultipleInputsKeyInputFormat.class, DelegatingMapper.class);
}
}
if (planner.getPreviousOutputToProcess() != null)
{
_log.info("*** Previous output data:");
_log.info(planner.getPreviousOutputToProcess().getPath());
MultipleInputs.addInputPath(job, planner.getPreviousOutputToProcess().getPath(), AvroKeyInputFormat.class, AvroKeyValueIdentityMapper.class);
}
final Configuration conf = job.getConfiguration();
config(conf);
AvroDateRangeMetadata.configureOutputDateRange(conf, planner.getCurrentDateRange());
PartitionCollapsingSchemas spSchemas = new PartitionCollapsingSchemas(getSchemas(), planner.getInputSchemasByPath(), getOutputSchemaName(), getOutputSchemaNamespace());
job.setOutputFormatClass(AvroKeyWithMetadataOutputFormat.class);
_log.info("Setting input path to schema mappings");
for (String path : spSchemas.getMapInputSchemas().keySet())
{
Schema schema = spSchemas.getMapInputSchemas().get(path);
_log.info("*** " + path);
_log.info("*** => " + schema.toString());
AvroMultipleInputsUtil.setInputKeySchemaForPath(job, schema, path);
}
AvroJob.setMapOutputKeySchema(job, spSchemas.getMapOutputKeySchema());
AvroJob.setMapOutputValueSchema(job, spSchemas.getMapOutputValueSchema());
AvroJob.setOutputKeySchema(job, spSchemas.getReduceOutputSchema());
int numReducers;
if (getNumReducers() != null)
{
numReducers = getNumReducers();
_log.info(String.format("Using %d reducers (fixed)",numReducers));
}
else
{
numReducers = planner.getNumReducers();
_log.info(String.format("Using %d reducers (computed)",numReducers));
}
job.setNumReduceTasks(numReducers);
job.setReducerClass(DelegatingReducer.class);
Path mapperPath = new Path(tempOutputPath,".mapper_impl");
Path reducerPath = new Path(tempOutputPath,".reducer_impl");
Path combinerPath = new Path(tempOutputPath,".combiner_impl");
CollapsingMapper mapper = new CollapsingMapper();
CollapsingReducer reducer = new CollapsingReducer();
mapper.setSchemas(spSchemas);
reducer.setSchemas(spSchemas);
mapper.setMapper(getMapper());
reducer.setAccumulator(getReducerAccumulator());
reducer.setRecordMerger(getRecordMerger());
reducer.setOldRecordMerger(getOldRecordMerger());
mapper.setReuseOutput(_reusePreviousOutput);
reducer.setReuseOutput(_reusePreviousOutput);
configureOutputDateRange(job.getConfiguration(),planner.getCurrentDateRange(), reducer);
DistributedCacheHelper.writeObject(conf, mapper, mapperPath);
DistributedCacheHelper.writeObject(conf, reducer, reducerPath);
conf.set(Parameters.REDUCER_IMPL_PATH, reducerPath.toString());
conf.set(Parameters.MAPPER_IMPL_PATH, mapperPath.toString());
if (isUseCombiner())
{
CollapsingCombiner combiner = new CollapsingCombiner();
configureOutputDateRange(job.getConfiguration(),planner.getCurrentDateRange(), combiner);
combiner.setReuseOutput(_reusePreviousOutput);
combiner.setSchemas(spSchemas);
combiner.setAccumulator(getCombinerAccumulator());
conf.set(Parameters.COMBINER_IMPL_PATH, combinerPath.toString());
job.setCombinerClass(DelegatingCombiner.class);
DistributedCacheHelper.writeObject(conf, combiner, combinerPath);
}
if (!job.waitForCompletion(true))
{
_log.error("Job failed! Quitting...");
throw new RuntimeException("Job failed");
}
report.jobId = job.getJobID().toString();
report.jobName = job.getJobName();
report.countersPath = job.getCountersPath();
report.outputPath = outputPath;
_reports.add(report);
applyRetention();
if (!planner.getNeedsAnotherPass())
{
break;
}
cleanup();
iterations++;
}
}
/**
* Removes all but the most recent outputs allowed by the retention count, if one is specified.
*
* @throws IOException
*/
private void applyRetention() throws IOException
{
if (getRetentionCount() != null)
{
PathUtils.keepLatestDatedPaths(getFileSystem(), getOutputPath(), getRetentionCount());
}
}
/**
* Configures the output date range for processing components.
*
* @param conf configuration
* @param dateRange output date range
* @param proc processor
*/
private static void configureOutputDateRange(Configuration conf, DateRange dateRange, DateRangeConfigurable proc)
{
Calendar cal = Calendar.getInstance(PathUtils.timeZone);
long beginTime = 0L;
long endTime = Long.MAX_VALUE;
if (dateRange.getBeginDate() != null)
{
cal.setTime(dateRange.getBeginDate());
beginTime = cal.getTimeInMillis();
}
if (dateRange.getEndDate() != null)
{
cal.setTime(dateRange.getEndDate());
endTime = cal.getTimeInMillis();
}
proc.setOutputDateRange(new DateRange(new Date(beginTime),new Date(endTime)));
}
/**
* Remove all temporary paths.
*
* @throws IOException
*/
private void cleanup() throws IOException
{
if (_garbage != null)
{
_garbage.clean();
}
}
/**
* Reports files created and processed for an iteration of the job.
*
*/
public static class Report
{
private String jobName;
private String jobId;
private Path countersPath;
private DatePath outputPath;
private List<DatePath> inputFiles = new ArrayList<DatePath>();
private List<DatePath> oldInputFiles = new ArrayList<DatePath>();
private DatePath reusedOutput;
/**
* Gets the job name.
*
* @return job name
*/
public String getJobName()
{
return jobName;
}
/**
* Gets the job ID.
*
* @return job ID
*/
public String getJobId()
{
return jobId;
}
/**
* Gets the path to the counters file, if one was written.
*
* @return counters path
*/
public Path getCountersPath()
{
return countersPath;
}
/**
* Gets the path to the output which was produced by the job.
*
* @return output path
*/
public DatePath getOutputPath()
{
return outputPath;
}
/**
* Gets the output that was reused, if one was reused.
*
* @return reused output path
*/
public DatePath getReusedOutput()
{
return reusedOutput;
}
/**
* Gets new input files that were processed. These are files that are within
* the desired date range.
*
* @return input files
*/
public List<DatePath> getInputFiles()
{
return Collections.unmodifiableList(inputFiles);
}
/**
* Gets old input files that were processed. These are files that are before
* the desired date range and were subtracted from the reused output.
*
* @return old input files
*/
public List<DatePath> getOldInputFiles()
{
return Collections.unmodifiableList(oldInputFiles);
}
}
}