/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package datafu.hourglass.jobs;

import java.io.IOException;
import java.sql.Date;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.List;
import java.util.Properties;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapreduce.AvroJob;
import org.apache.avro.mapreduce.AvroKeyInputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.log4j.Logger;


import datafu.hourglass.avro.AvroDateRangeMetadata;
import datafu.hourglass.avro.AvroKeyWithMetadataOutputFormat;
import datafu.hourglass.avro.AvroMultipleInputsKeyInputFormat;
import datafu.hourglass.avro.AvroMultipleInputsUtil;
import datafu.hourglass.fs.DatePath;
import datafu.hourglass.fs.DateRange;
import datafu.hourglass.fs.PathUtils;
import datafu.hourglass.mapreduce.AvroKeyValueIdentityMapper;
import datafu.hourglass.mapreduce.CollapsingCombiner;
import datafu.hourglass.mapreduce.CollapsingMapper;
import datafu.hourglass.mapreduce.CollapsingReducer;
import datafu.hourglass.mapreduce.DelegatingCombiner;
import datafu.hourglass.mapreduce.DelegatingMapper;
import datafu.hourglass.mapreduce.DelegatingReducer;
import datafu.hourglass.mapreduce.DistributedCacheHelper;
import datafu.hourglass.mapreduce.Parameters;
import datafu.hourglass.model.Accumulator;
import datafu.hourglass.model.Mapper;
import datafu.hourglass.model.Merger;
import datafu.hourglass.schemas.PartitionCollapsingSchemas;

/**
 * An {@link IncrementalJob} that consumes partitioned input data and collapses the
 * partitions to produce a single output.  This job can be used to process data
 * using a sliding window.  It is capable of reusing the previous output, which
 * means that it can process data more efficiently.
 * Only Avro is supported for the input, intermediate, and output data.
 *
 * <p>
 * Implementations of this class must provide key, intermediate value, and output value schemas.
 * The key and intermediate value schemas define the output for the mapper and combiner.
 * The key and output value schemas define the output for the reducer.
 * These are defined by overriding {@link #getKeySchema()}, {@link #getIntermediateValueSchema()},
 * and {@link #getOutputValueSchema()}.
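 *
 * <p>
 * As a minimal sketch (not part of this class's documented examples), the three schema overrides
 * might be built with Avro's SchemaBuilder; the record and field names below are hypothetical:
 * <pre>
 * {@code
 * protected Schema getKeySchema() {
 *   return SchemaBuilder.record("Key").fields()
 *       .requiredLong("member_id")   // hypothetical key field
 *       .endRecord();
 * }
 *
 * protected Schema getIntermediateValueSchema() {
 *   return SchemaBuilder.record("Value").fields()
 *       .requiredLong("count")       // hypothetical value field
 *       .endRecord();
 * }
 *
 * protected Schema getOutputValueSchema() {
 *   // here the reducer output has the same shape as the intermediate value
 *   return getIntermediateValueSchema();
 * }
 * }
 * </pre>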
 *
 * <p>
 * Implementations must also provide a mapper by overriding {@link #getMapper()} and an accumulator
 * for the reducer by overriding {@link #getReducerAccumulator()}.  An optional combiner may be
 * provided by overriding {@link #getCombinerAccumulator()}.  For the combiner to be used
 * the property use.combiner must also be set to true.
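 *
 * <p>
 * A rough sketch of a counting job's mapper and reducer accumulator, assuming the Mapper,
 * KeyValueCollector, and Accumulator contracts from datafu.hourglass.model; the "member_id"
 * and "count" fields and the KEY_SCHEMA, VALUE_SCHEMA, and OUTPUT_SCHEMA strings are hypothetical:
 * <pre>
 * {@code
 * public Mapper<GenericRecord,GenericRecord,GenericRecord> getMapper() {
 *   return new Mapper<GenericRecord,GenericRecord,GenericRecord>() {
 *     private transient Schema kSchema;
 *     private transient Schema vSchema;
 *     public void map(GenericRecord input,
 *                     KeyValueCollector<GenericRecord,GenericRecord> collector)
 *         throws IOException, InterruptedException {
 *       if (kSchema == null) kSchema = new Schema.Parser().parse(KEY_SCHEMA);     // hypothetical schema string
 *       if (vSchema == null) vSchema = new Schema.Parser().parse(VALUE_SCHEMA);   // hypothetical schema string
 *       GenericRecord key = new GenericData.Record(kSchema);
 *       key.put("member_id", input.get("member_id"));
 *       GenericRecord value = new GenericData.Record(vSchema);
 *       value.put("count", 1L);
 *       collector.collect(key, value);
 *     }
 *   };
 * }
 *
 * public Accumulator<GenericRecord,GenericRecord> getReducerAccumulator() {
 *   return new Accumulator<GenericRecord,GenericRecord>() {
 *     private transient Schema oSchema;
 *     private long count;
 *     public void accumulate(GenericRecord value) { count += (Long)value.get("count"); }
 *     public GenericRecord getFinal() {
 *       if (oSchema == null) oSchema = new Schema.Parser().parse(OUTPUT_SCHEMA);  // hypothetical schema string
 *       GenericRecord output = new GenericData.Record(oSchema);
 *       output.put("count", count);
 *       return output;
 *     }
 *     public void cleanup() { count = 0L; }
 *   };
 * }
 * }
 * </pre>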
 *
 * <p>
 * The input path can be provided either through the property input.path
 * or by calling {@link #setInputPaths(List)}.  If multiple input paths are provided then
 * this implicitly means a join is to be performed.  Multiple input paths can be provided via
 * properties by prefixing each with input.path., such as input.path.first
 * and input.path.second.
 * Input data must be partitioned by day according to the naming convention yyyy/MM/dd.
 * The output path can be provided either through the property output.path
 * or by calling {@link #setOutputPath(Path)}.
 * Output data will be written using the naming convention yyyyMMdd, where the date used
 * to format the output path is the same as the end of the desired time range to process.
 * For example, if the desired time range to process is 2013/01/01 through 2013/01/14,
 * then the output will be named 20130114.
 * By default the job will fail if any input data in the desired time window is missing.
 * This can be overridden by setting fail.on.missing to false.
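 *
 * <p>
 * A sketch of wiring the paths up through properties and running the job; MemberCountJob is a
 * hypothetical subclass and the paths are made up:
 * <pre>
 * {@code
 * Properties props = new Properties();
 * props.setProperty("input.path.first", "/data/event_first");    // daily partitions: .../yyyy/MM/dd
 * props.setProperty("input.path.second", "/data/event_second");
 * props.setProperty("output.path", "/output/member_count");
 * props.setProperty("fail.on.missing", "false");
 *
 * AbstractPartitionCollapsingIncrementalJob job = new MemberCountJob("MemberCountJob", props);
 * job.run();
 * }
 * </pre>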
 *
 * <p>
 * The job will not process input if the corresponding output has already been produced.  For example,
 * if the desired date range is 2013/01/01 through 2013/01/14 and the output 20130114 already exists,
 * then it assumes the work has already been completed.
 *
 * <p>
 * By default only the latest output will be kept.  All other outputs will be removed.  This can be controlled
 * by setting the property retention.count, or by calling {@link #setRetentionCount(Integer)}.
 *
 * <p>
 * Two types of sliding windows may be used: fixed-length and fixed-start.  For a fixed-length
 * sliding window, the size of the window is fixed; the start and end move according to the
 * availability of input data.  For a fixed-start window, the size of the window is flexible;
 * the start is fixed and the end moves according to the availability of input data.
 *
 * <p>
 * A fixed-length sliding window can be defined either by setting the property num.days
 * or by calling {@link #setNumDays(Integer)}.  This sets how many days of input data will be
 * consumed.  By default the end of the window will be the same as the date of the latest available
 * input data.  The start is then determined by the number of days to consume.  The end date can
 * be moved back relative to the latest input data by setting the days.ago property or
 * by calling {@link #setDaysAgo(Integer)}.  Since the end date is determined by the availability
 * of input data, as new data arrives the window will advance forward.
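 *
 * <p>
 * For example, continuing the sketch above, a 30 day fixed-length window that lags the latest
 * available input by one day could be requested either way:
 * <pre>
 * {@code
 * props.setProperty("num.days", "30");
 * props.setProperty("days.ago", "1");
 * // or, equivalently, on the job instance:
 * job.setNumDays(30);
 * job.setDaysAgo(1);
 * }
 * </pre>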
 *
 * <p>
 * A fixed-start sliding window can be defined by setting the property start.date or
 * by calling {@link #setStartDate(java.util.Date)}.  The end date will be the same as the date of
 * the latest available input data.  The end date can be moved back relative to the latest input data
 * by setting the days.ago property or by calling {@link #setDaysAgo(Integer)}.
 * Because the end date is determined by the availability of input data, as new data arrives the window
 * will grow to include it.
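 *
 * <p>
 * For example, a fixed-start window beginning on 2013/01/01 could be set on the job instance
 * (a sketch; the date itself is arbitrary):
 * <pre>
 * {@code
 * Calendar cal = Calendar.getInstance();
 * cal.set(2013, Calendar.JANUARY, 1);
 * job.setStartDate(cal.getTime());
 * }
 * </pre>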
 *
 * <p>
 * Previous output can be reused by setting the reuse.previous.output property to true, or
 * by calling {@link #setReusePreviousOutput(boolean)}.  Reusing the previous output is often more efficient
 * because only input data outside of the time window covered by the previous output needs to be consumed.
 * For example, given a fixed-start sliding window job, if one new day of input data is available since the
 * last time the job ran, then the job can reuse the previous output and only read the newest day of data, rather
 * than reading all the input data again.  Given a fixed-length sliding window in the same scenario, the new output
 * can be produced by adding the newest input to the previous output and subtracting the oldest input from the old
 * window.
 *
 * <p>
 * For a fixed-start sliding window, if the schemas for the intermediate and output values are the same then no additional
 * changes are necessary, as the reducer's accumulator should be capable of adding the new input to the previous output.
 * However if they are different then a record merger must be provided by overriding {@link #getRecordMerger()} so that the previous
 * output can be merged with the partial output produced by reducing the new input data.
 * For the fixed-length sliding window one must override {@link #getOldRecordMerger()} to reuse the previous output.
 * This method essentially unmerges old, partial output data from the current output.  For this case as well, if the intermediate
 * and output schemas are the same, the {@link #getRecordMerger()} method does not need to be overridden.
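 *
 * <p>
 * A rough sketch of a merger that adds the count from the previous output into the new partial
 * output; this assumes a Merger contract with a single merge(previous, current) method, and the
 * "count" field is hypothetical:
 * <pre>
 * {@code
 * public Merger<GenericRecord> getRecordMerger() {
 *   return new Merger<GenericRecord>() {
 *     public GenericRecord merge(GenericRecord previous, GenericRecord current) {
 *       if (previous == null) return current;
 *       if (current == null) return previous;
 *       current.put("count", (Long)previous.get("count") + (Long)current.get("count"));
 *       return current;
 *     }
 *   };
 * }
 * }
 * </pre>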
 *
 * <p>
 * The number of reducers to use is automatically determined based on the size of the data to process.
 * The total size is computed and then divided by the value of the property num.reducers.bytes.per.reducer,
 * which defaults to 256 MB; the result is the number of reducers that will be used.  This calculation includes
 * the input data as well as previous output that will be reused.  It is also possible to calculate the number of reducers
 * separately for the input and previous output through the properties num.reducers.input.bytes.per.reducer
 * and num.reducers.previous.bytes.per.reducer.  The reducer counts will be computed separately for the two sets of data
 * and then added together.  The number of reducers can also be set to a fixed value through the property num.reducers.
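 *
 * <p>
 * For example, with the default of 256 MB per reducer, roughly 10 GB of input plus reused output
 * works out to 10240 MB / 256 MB = 40 reducers.  The same settings could be made explicit in the
 * properties sketch above:
 * <pre>
 * {@code
 * props.setProperty("num.reducers.bytes.per.reducer", "268435456");  // 256 MB
 * // or pin the count outright:
 * props.setProperty("num.reducers", "40");
 * }
 * </pre>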
 *
 * <p>
 * This type of job is capable of performing its work over multiple iterations if previous output can be reused.
 * The number of days to process at a time can be limited by setting the property max.days.to.process,
 * or by calling {@link #setMaxToProcess(Integer)}.  The default is 90 days.
 * This can be useful when there are restrictions on how many tasks
 * can be used by a single MapReduce job in the cluster.  When this property is set, the job will process no more than
 * this many days at a time, and it will perform one or more iterations if necessary to complete the work.
 * The number of iterations can be limited by setting the property max.iterations, or by calling {@link #setMaxIterations(Integer)}.
 * If the number of iterations is exceeded the job will fail.  By default the maximum number of iterations is 20.
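 *
 * <p>
 * For example, to cap each iteration at 30 days of input and allow at most 10 passes, continuing
 * the properties sketch above:
 * <pre>
 * {@code
 * props.setProperty("max.days.to.process", "30");
 * props.setProperty("max.iterations", "10");
 * }
 * </pre>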
 *
 * <p>
 * Hadoop configuration may be provided by setting a property with the prefix hadoop-conf..
 * For example, mapred.min.split.size can be configured by setting property
 * hadoop-conf.mapred.min.split.size to the desired value.
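 *
 * <p>
 * For example, to pass a minimum split size through to Hadoop in the properties sketch above:
 * <pre>
 * {@code
 * props.setProperty("hadoop-conf.mapred.min.split.size", "536870912");  // 512 MB
 * }
 * </pre>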
 */
public abstract class AbstractPartitionCollapsingIncrementalJob extends IncrementalJob
{
  private final Logger _log = Logger.getLogger(AbstractPartitionCollapsingIncrementalJob.class);
  private List<Report> _reports = new ArrayList<Report>();
  protected boolean _reusePreviousOutput;
  private FileCleaner _garbage;

  /**
   * Initializes the job.
   * @throws IOException IOException
   */
  public AbstractPartitionCollapsingIncrementalJob() throws IOException
  {
  }

  /**
   * Initializes the job with a job name and properties.
   *
   * @param name job name
   * @param props configuration properties
   * @throws IOException IOException
   */
  public AbstractPartitionCollapsingIncrementalJob(String name, Properties props) throws IOException
  {
    super(name,props);
  }

  /**
   * Gets the mapper.
   *
   * @return mapper
   */
  public abstract Mapper<GenericRecord,GenericRecord,GenericRecord> getMapper();

  /**
   * Gets the accumulator used for the combiner.
   *
   * @return combiner accumulator
   */
  public Accumulator<GenericRecord,GenericRecord> getCombinerAccumulator()
  {
    return null;
  }

  /**
   * Gets the accumulator used for the reducer.
   *
   * @return reducer accumulator
   */
  public abstract Accumulator<GenericRecord,GenericRecord> getReducerAccumulator();

  /**
   * Gets the record merger that is capable of merging previous output with a new partial output.
   * This is only needed when reusing previous output where the intermediate and output schemas are different.
   * New partial output is produced by the reducer from new input that is after the previous output.
   *
   * @return merger
   */
  public Merger<GenericRecord> getRecordMerger()
  {
    return null;
  }

  /**
   * Gets the record merger that is capable of unmerging old partial output from the new output.
   * This is only needed when reusing previous output for a fixed-length sliding window.
   * The new output is the result of merging the previous output with the new partial output.
   * The old partial output is produced by the reducer from old input data before the time range of
   * the previous output.
   *
   * @return merger
   */
  public Merger<GenericRecord> getOldRecordMerger()
  {
    return null;
  }

  /**
   * Get the name for the reduce output schema.
   * By default this is the name of the class with "Output" appended.
   *
   * @return output schema name
   */
  protected String getOutputSchemaName()
  {
    return this.getClass().getSimpleName() + "Output";
  }

  /**
   * Get the namespace for the reduce output schema.
   * By default this is the package of the class.
   *
   * @return output schema namespace
   */
  protected String getOutputSchemaNamespace()
  {
    return this.getClass().getPackage().getName();
  }

  @Override
  public void setProperties(Properties props)
  {
    super.setProperties(props);

    if (getProperties().get("reuse.previous.output") != null)
    {
      setReusePreviousOutput(Boolean.parseBoolean((String)getProperties().get("reuse.previous.output")));
    }
  }

  /**
   * Get whether previous output should be reused.
   *
   * @return true if previous output should be reused
   */
  public boolean getReusePreviousOutput()
  {
    return _reusePreviousOutput;
  }

  /**
   * Set whether previous output should be reused.
   *
   * @param reuse true if previous output should be reused
   */
  public void setReusePreviousOutput(boolean reuse)
  {
    _reusePreviousOutput = reuse;
  }

  @Override
  protected void initialize()
  {
    _garbage = new FileCleaner(getFileSystem());

    if (getMaxIterations() == null)
    {
      setMaxIterations(20);
    }

    if (getMaxToProcess() == null)
    {
      if (getNumDays() != null)
      {
        setMaxToProcess(getNumDays());
      }
      else
      {
        setMaxToProcess(90);
      }
    }

    if (getRetentionCount() == null)
    {
      setRetentionCount(1);
    }

    super.initialize();
  }

  @Override
  public void run() throws IOException, InterruptedException, ClassNotFoundException
  {
    try
    {
      initialize();
      validate();
      execute();
    }
    finally
    {
      cleanup();
    }
  }

  /**
   * Get reports that summarize each of the job iterations.
   *
   * @return reports
   */
  public List<Report> getReports()
  {
    return Collections.unmodifiableList(_reports);
  }

  /**
   * Execute the job.
   *
   * @throws IOException
   * @throws InterruptedException
   * @throws ClassNotFoundException
   */
  private void execute() throws IOException, InterruptedException, ClassNotFoundException
  {
    int iterations = 0;

    while (true)
    {
      PartitionCollapsingExecutionPlanner planner = new PartitionCollapsingExecutionPlanner(getFileSystem(),getProperties());
      planner.setInputPaths(getInputPaths());
      planner.setOutputPath(getOutputPath());
      planner.setStartDate(getStartDate());
      planner.setEndDate(getEndDate());
      planner.setDaysAgo(getDaysAgo());
      planner.setNumDays(getNumDays());
      planner.setMaxToProcess(getMaxToProcess());
      planner.setReusePreviousOutput(getReusePreviousOutput());
      planner.setFailOnMissing(isFailOnMissing());
      planner.createPlan();

      if (planner.getInputsToProcess().size() == 0)
      {
        _log.info("Nothing to do");
        break;
      }

      if (iterations >= getMaxIterations())
      {
        throw new RuntimeException(String.format("Already completed %d iterations but the max is %d and there are still %d inputs to process",
                                                 iterations, getMaxIterations(), planner.getInputsToProcess().size()));
      }

      Report report = new Report();

      report.inputFiles.addAll(planner.getNewInputsToProcess());
      report.oldInputFiles.addAll(planner.getOldInputsToProcess());

      if (planner.getPreviousOutputToProcess() != null)
      {
        report.reusedOutput = planner.getPreviousOutputToProcess();
      }

      DatePath outputPath = DatePath.createDatedPath(getOutputPath(), planner.getCurrentDateRange().getEndDate());
      _log.info("Output path: " + outputPath);

      Path tempOutputPath = createRandomTempPath();
      _garbage.add(tempOutputPath);

      final StagedOutputJob job = StagedOutputJob.createStagedJob(
          getConf(),
          getName() + "-" + PathUtils.datedPathFormat.format(planner.getCurrentDateRange().getEndDate()),
          null, // no input paths specified here, will add multiple inputs down below
          tempOutputPath.toString(),
          outputPath.getPath().toString(),
          _log);

      job.setCountersParentPath(getCountersParentPath());

      if (planner.getNewInputsToProcess() != null && planner.getNewInputsToProcess().size() > 0)
      {
        _log.info("*** New Input data:");
        for (DatePath inputPath : planner.getNewInputsToProcess())
        {
          _log.info(inputPath.getPath());
          MultipleInputs.addInputPath(job, inputPath.getPath(), AvroMultipleInputsKeyInputFormat.class, DelegatingMapper.class);
        }
      }

      if (planner.getOldInputsToProcess() != null && planner.getOldInputsToProcess().size() > 0)
      {
        _log.info("*** Old Input data:");
        for (DatePath inputPath : planner.getOldInputsToProcess())
        {
          _log.info(inputPath.getPath());
          MultipleInputs.addInputPath(job, inputPath.getPath(), AvroMultipleInputsKeyInputFormat.class, DelegatingMapper.class);
        }
      }

      if (planner.getPreviousOutputToProcess() != null)
      {
        _log.info("*** Previous output data:");
        _log.info(planner.getPreviousOutputToProcess().getPath());
        MultipleInputs.addInputPath(job, planner.getPreviousOutputToProcess().getPath(), AvroKeyInputFormat.class, AvroKeyValueIdentityMapper.class);
      }

      final Configuration conf = job.getConfiguration();

      config(conf);

      AvroDateRangeMetadata.configureOutputDateRange(conf, planner.getCurrentDateRange());

      PartitionCollapsingSchemas spSchemas = new PartitionCollapsingSchemas(getSchemas(), planner.getInputSchemasByPath(),
                                                                            getOutputSchemaName(), getOutputSchemaNamespace());

      job.setOutputFormatClass(AvroKeyWithMetadataOutputFormat.class);

      _log.info("Setting input path to schema mappings");
      for (String path : spSchemas.getMapInputSchemas().keySet())
      {
        Schema schema = spSchemas.getMapInputSchemas().get(path);
        _log.info("*** " + path);
        _log.info("*** => " + schema.toString());
        AvroMultipleInputsUtil.setInputKeySchemaForPath(job, schema, path);
      }

      AvroJob.setMapOutputKeySchema(job, spSchemas.getMapOutputKeySchema());
      AvroJob.setMapOutputValueSchema(job, spSchemas.getMapOutputValueSchema());
      AvroJob.setOutputKeySchema(job, spSchemas.getReduceOutputSchema());

      int numReducers;

      if (getNumReducers() != null)
      {
        numReducers = getNumReducers();
        _log.info(String.format("Using %d reducers (fixed)",numReducers));
      }
      else
      {
        numReducers = planner.getNumReducers();
        _log.info(String.format("Using %d reducers (computed)",numReducers));
      }

      job.setNumReduceTasks(numReducers);

      job.setReducerClass(DelegatingReducer.class);

      Path mapperPath = new Path(tempOutputPath,".mapper_impl");
      Path reducerPath = new Path(tempOutputPath,".reducer_impl");
      Path combinerPath = new Path(tempOutputPath,".combiner_impl");

      CollapsingMapper mapper = new CollapsingMapper();
      CollapsingReducer reducer = new CollapsingReducer();

      mapper.setSchemas(spSchemas);
      reducer.setSchemas(spSchemas);

      mapper.setMapper(getMapper());
      reducer.setAccumulator(getReducerAccumulator());
      reducer.setRecordMerger(getRecordMerger());
      reducer.setOldRecordMerger(getOldRecordMerger());

      mapper.setReuseOutput(_reusePreviousOutput);
      reducer.setReuseOutput(_reusePreviousOutput);

      configureOutputDateRange(job.getConfiguration(),planner.getCurrentDateRange(), reducer);

      DistributedCacheHelper.writeObject(conf, mapper, mapperPath);
      DistributedCacheHelper.writeObject(conf, reducer, reducerPath);

      conf.set(Parameters.REDUCER_IMPL_PATH, reducerPath.toString());
      conf.set(Parameters.MAPPER_IMPL_PATH, mapperPath.toString());

      if (isUseCombiner())
      {
        CollapsingCombiner combiner = new CollapsingCombiner();
        configureOutputDateRange(job.getConfiguration(),planner.getCurrentDateRange(), combiner);
        combiner.setReuseOutput(_reusePreviousOutput);
        combiner.setSchemas(spSchemas);
        combiner.setAccumulator(getCombinerAccumulator());
        conf.set(Parameters.COMBINER_IMPL_PATH, combinerPath.toString());
        job.setCombinerClass(DelegatingCombiner.class);
        DistributedCacheHelper.writeObject(conf, combiner, combinerPath);
      }

      if (!job.waitForCompletion(true))
      {
        _log.error("Job failed! Quitting...");
        throw new RuntimeException("Job failed");
      }

      report.jobId = job.getJobID().toString();
      report.jobName = job.getJobName();
      report.countersPath = job.getCountersPath();
      report.outputPath = outputPath;

      _reports.add(report);

      applyRetention();

      if (!planner.getNeedsAnotherPass())
      {
        break;
      }

      cleanup();

      iterations++;
    }
  }

  /**
   * Removes all but the most recent outputs that are within the retention count, if one is specified.
   *
   * @throws IOException
   */
  private void applyRetention() throws IOException
  {
    if (getRetentionCount() != null)
    {
      PathUtils.keepLatestDatedPaths(getFileSystem(), getOutputPath(), getRetentionCount());
    }
  }

  /**
   * Configures the output date range for processing components.
   *
   * @param conf configuration
   * @param dateRange output date range
   * @param proc processor
   */
  private static void configureOutputDateRange(Configuration conf, DateRange dateRange, DateRangeConfigurable proc)
  {
    Calendar cal = Calendar.getInstance(PathUtils.timeZone);

    long beginTime = 0L;
    long endTime = Long.MAX_VALUE;

    if (dateRange.getBeginDate() != null)
    {
      cal.setTime(dateRange.getBeginDate());
      beginTime = cal.getTimeInMillis();
    }

    if (dateRange.getEndDate() != null)
    {
      cal.setTime(dateRange.getEndDate());
      endTime = cal.getTimeInMillis();
    }

    proc.setOutputDateRange(new DateRange(new Date(beginTime),new Date(endTime)));
  }

  /**
   * Remove all temporary paths.
   *
   * @throws IOException
   */
  private void cleanup() throws IOException
  {
    if (_garbage != null)
    {
      _garbage.clean();
    }
  }

  /**
   * Reports files created and processed for an iteration of the job.
   *
   */
  public static class Report
  {
    private String jobName;
    private String jobId;
    private Path countersPath;
    private DatePath outputPath;
    private List<DatePath> inputFiles = new ArrayList<DatePath>();
    private List<DatePath> oldInputFiles = new ArrayList<DatePath>();
    private DatePath reusedOutput;

    /**
     * Gets the job name.
     *
     * @return job name
     */
    public String getJobName()
    {
      return jobName;
    }

    /**
     * Gets the job ID.
     *
     * @return job ID
     */
    public String getJobId()
    {
      return jobId;
    }

    /**
     * Gets the path to the counters file, if one was written.
     *
     * @return counters path
     */
    public Path getCountersPath()
    {
      return countersPath;
    }

    /**
     * Gets the path to the output which was produced by the job.
     *
     * @return output path
     */
    public DatePath getOutputPath()
    {
      return outputPath;
    }

    /**
     * Gets the output that was reused, if one was reused.
     *
     * @return reused output path
     */
    public DatePath getReusedOutput()
    {
      return reusedOutput;
    }

    /**
     * Gets new input files that were processed.  These are files that are within
     * the desired date range.
     *
     * @return input files
     */
    public List<DatePath> getInputFiles()
    {
      return Collections.unmodifiableList(inputFiles);
    }

    /**
     * Gets old input files that were processed.  These are files that are before
     * the desired date range and were subtracted from the reused output.
     *
     * @return old input files
     */
    public List<DatePath> getOldInputFiles()
    {
      return Collections.unmodifiableList(oldInputFiles);
    }
  }
}



