
/**
* Copyright 2013 LinkedIn, Inc
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package datafu.hourglass.jobs;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.SortedMap;
import org.apache.avro.Schema;
import org.apache.hadoop.fs.FileSystem;
import org.apache.log4j.Logger;
import datafu.hourglass.avro.AvroDateRangeMetadata;
import datafu.hourglass.fs.DatePath;
import datafu.hourglass.fs.DateRange;
import datafu.hourglass.fs.PathUtils;
/**
* Execution planner used by {@link AbstractPartitionCollapsingIncrementalJob} and its derived classes.
* This creates a plan to process partitioned input data and collapse the partitions into a single output.
*
*
* To use this class, the input and output paths must be specified. In addition the desired input date
* range can be specified through several methods. Then {@link #createPlan()} can be called and the
* execution plan will be created. The inputs to process will be available from {@link #getInputsToProcess()},
* the number of reducers to use will be available from {@link #getNumReducers()}, and the input schemas
* will be available from {@link #getInputSchemas()}.
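*
* For illustration, here is a rough usage sketch. The path setters shown are assumed to be inherited
* from {@link ExecutionPlanner} and are illustrative only; consult that class for the actual
* configuration methods.
*
* <pre>
* {@code
* FileSystem fs = FileSystem.get(new Configuration());
* Properties props = new Properties();
* PartitionCollapsingExecutionPlanner planner = new PartitionCollapsingExecutionPlanner(fs, props);
*
* // assumed setters from the ExecutionPlanner base class, shown for illustration only
* planner.setInputPaths(Collections.singletonList(new Path("/data/event")));
* planner.setOutputPath(new Path("/output/event-collapsed"));
* planner.setReusePreviousOutput(true);
*
* planner.createPlan();
*
* List<DatePath> inputs = planner.getInputsToProcess();
* int numReducers = planner.getNumReducers();
* List<Schema> schemas = planner.getInputSchemas();
* }
* </pre>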
*
*
*
* Previous output may be reused by using {@link #setReusePreviousOutput(boolean)}. If previous output exists
* and it is to be reused then it will be available from {@link #getPreviousOutputToProcess()}. New input data
* to process that is after the previous output time range is available from {@link #getNewInputsToProcess()}.
* Old input data to process that falls before the desired date range but is still reflected in the
* previous output, and therefore should be subtracted from it, is available from {@link #getOldInputsToProcess()}.
*
*
*
* Configuration properties are used to configure a {@link ReduceEstimator} instance. This is used to
* calculate how many reducers should be used.
* The number of reducers to use is based on the input data size and the
* num.reducers.bytes.per.reducer property. This setting can be controlled more granularly
* through num.reducers.input.bytes.per.reducer and num.reducers.previous.bytes.per.reducer.
* Check {@link ReduceEstimator} for more details on how the properties are used.
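*
* As a sketch, the reducer-related properties might be set as follows; the byte values below are
* arbitrary examples, not defaults:
*
* <pre>
* {@code
* Properties props = new Properties();
* // target roughly 256 MB of input per reducer (example value)
* props.setProperty("num.reducers.bytes.per.reducer", Long.toString(256L * 1024 * 1024));
* // optionally use separate targets for new input data and reused previous output
* props.setProperty("num.reducers.input.bytes.per.reducer", Long.toString(256L * 1024 * 1024));
* props.setProperty("num.reducers.previous.bytes.per.reducer", Long.toString(512L * 1024 * 1024));
* }
* </pre>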
*
*
* @author "Matthew Hayes"
*
*/
public class PartitionCollapsingExecutionPlanner extends ExecutionPlanner
{
private final Logger _log = Logger.getLogger(PartitionCollapsingExecutionPlanner.class);
private int _numReducers;
private SortedMap<Date,DatePath> _outputPathsByDate;
private boolean _reusePreviousOutput;
private List<DatePath> _inputsToProcess = new ArrayList<DatePath>();
private List<DatePath> _newInputsToProcess = new ArrayList<DatePath>();
private List<DatePath> _oldInputsToProcess = new ArrayList<DatePath>();
private Map<Date,List<DatePath>> _inputsToProcessByDate = new HashMap<Date,List<DatePath>>();
private DatePath _previousOutputToProcess;
private List<Schema> _inputSchemas = new ArrayList<Schema>();
private boolean _needAnotherPass;
private DateRange _currentDateRange;
private boolean _planExists;
/**
* Initializes the execution planner.
*
* @param fs file system
* @param props configuration properties
*/
public PartitionCollapsingExecutionPlanner(FileSystem fs, Properties props)
{
super(fs, props);
}
/**
* Create the execution plan.
*
* @throws IOException
*/
public void createPlan() throws IOException
{
if (_planExists) throw new RuntimeException("Plan already exists");
_planExists = true;
loadInputData();
loadOutputData();
determineAvailableInputDates();
determineDateRange();
determineInputsToProcess();
determineInputSchemas();
determineNumReducers();
}
/**
* Gets whether previous output should be reused, if it exists.
*
* @return true if previous output should be reused
*/
public boolean getReusePreviousOutput()
{
return _reusePreviousOutput;
}
/**
* Sets whether previous output should be reused, if it exists.
*
* @param reuse true if previous output should be reused
*/
public void setReusePreviousOutput(boolean reuse)
{
_reusePreviousOutput = reuse;
}
/**
* Get the number of reducers to use based on the input and previous output data size.
* Must call {@link #createPlan()} first.
*
* @return number of reducers to use
*/
public int getNumReducers()
{
checkPlanExists();
return _numReducers;
}
/**
* Gets the date range of input data covered by this plan's output.
* This may end earlier than the desired date range when the limit on the number of inputs
* to process forces another pass.
* Must call {@link #createPlan()} first.
*
* @return date range covered by this plan
*/
public DateRange getCurrentDateRange()
{
checkPlanExists();
return _currentDateRange;
}
/**
* Gets the previous output to reuse, or null if no output is being reused.
* Must call {@link #createPlan()} first.
*
* @return previous output to reuse, or null
*/
public DatePath getPreviousOutputToProcess()
{
checkPlanExists();
return _previousOutputToProcess;
}
/**
* Gets all inputs that will be processed. This includes both old and new data.
* Must call {@link #createPlan()} first.
*
* @return inputs to process
*/
public List<DatePath> getInputsToProcess()
{
checkPlanExists();
return _inputsToProcess;
}
/**
* Gets only the new data that will be processed. New data is data that falls within the
* desired date range and is not already covered by the previous output.
* Must call {@link #createPlan()} first.
*
* @return new inputs to process
*/
public List<DatePath> getNewInputsToProcess()
{
checkPlanExists();
return _newInputsToProcess;
}
/**
* Gets only the old data that will be processed. Old data is data that falls before the
* desired date range. It will be subtracted out from the previous output.
* Must call {@link #createPlan()} first.
*
* @return old inputs to process
*/
public List<DatePath> getOldInputsToProcess()
{
checkPlanExists();
return _oldInputsToProcess;
}
/**
* Gets whether another pass will be required. Because there may be a limit on the number of inputs processed
* in a single run, multiple runs may be required to process all data in the desired date range.
* Must call {@link #createPlan()} first.
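*
* A sketch of how a caller might drive multiple passes; {@code runJob} is a hypothetical method
* standing in for whatever executes one MapReduce pass from the plan:
*
* <pre>
* {@code
* boolean done = false;
* while (!done)
* {
*   PartitionCollapsingExecutionPlanner planner = new PartitionCollapsingExecutionPlanner(fs, props);
*   // ... configure paths and date range ...
*   planner.createPlan();
*   runJob(planner); // hypothetical: run one pass using this plan
*   done = !planner.getNeedsAnotherPass();
* }
* }
* </pre>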
*
* @return true if another pass is required
*/
public boolean getNeedsAnotherPass()
{
checkPlanExists();
return _needAnotherPass;
}
/**
* Gets the input schemas. Because multiple inputs are allowed, there may be multiple schemas.
* Must call {@link #createPlan()} first.
*
*
* This does not include the output schema, even though previous output may be fed back as input.
* The reason is that the output schema is determined based on the input schema.
*
*
* @return input schemas
*/
public List<Schema> getInputSchemas()
{
checkPlanExists();
return _inputSchemas;
}
/**
* Determines the number of reducers to use based on the input data size and the previous output,
* if it exists and is being reused.
* The number of reducers to use is based on the input data size and the
* num.reducers.bytes.per.reducer property. This setting can be controlled more granularly
* through num.reducers.input.bytes.per.reducer and num.reducers.previous.bytes.per.reducer.
* See {@link ReduceEstimator} for details on reducer estimation.
*
* @throws IOException
*/
private void determineNumReducers() throws IOException
{
ReduceEstimator estimator = new ReduceEstimator(getFileSystem(),getProps());
List<String> inputPaths = new ArrayList<String>();
for (DatePath input : getInputsToProcess())
{
inputPaths.add(input.getPath().toString());
estimator.addInputPath("input",input.getPath());
}
if (_previousOutputToProcess != null)
{
estimator.addInputPath("previous",_previousOutputToProcess.getPath());
}
_numReducers = estimator.getNumReducers();
}
/**
* Determines the input schemas. There may be multiple input schemas because multiple inputs are allowed.
* The latest available inputs are used to determine the schema, the assumption being that schemas are
* backwards-compatible.
*
* @throws IOException
*/
private void determineInputSchemas() throws IOException
{
List<Date> dates = new ArrayList<Date>(_inputsToProcessByDate.keySet());
if (dates.size() > 0)
{
Collections.sort(dates);
Date lastDate = dates.get(dates.size()-1);
List<DatePath> lastInputs = _inputsToProcessByDate.get(lastDate);
for (DatePath input : lastInputs)
{
_inputSchemas.add(PathUtils.getSchemaFromPath(getFileSystem(),input.getPath()));
}
}
}
/**
* Determines what output data already exists. Previous output may be reused.
*
* @throws IOException
*/
private void loadOutputData() throws IOException
{
_log.info(String.format("Checking output data in " + getOutputPath()));
_outputPathsByDate = getDatedData(getOutputPath());
}
/**
* Determines what input data to process.
*
*
* The input data to consume is determined by the desired date range. If previous output is not reused then the input data to process
* will coincide with the date range. If previous output may be reused and previous output exists, then the input data to process
* will consist of new data and potentially old data. The new input data to process is data that has time after the previous output date range,
* so that it may be added to the previous output.
* The old data to process is data that has time before the desired date range but is covered by the previous output, so that it may be subtracted from the previous output.
*
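*
* For example (illustrative dates, treating range end dates as inclusive per the comparisons below):
* if the previous output covers 2013-01-01 through 2013-01-07 and the desired date range is
* 2013-01-03 through 2013-01-09, then the inputs for 2013-01-01 and 2013-01-02 are old data to be
* subtracted from the previous output, and the inputs for 2013-01-08 and 2013-01-09 are new data
* to be added to it.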
*
*
* If there is a limit on how many days of input data can be processed then it may be the case that not all input data will be processed in
* a single run.
*
*
* @throws IOException
*/
private void determineInputsToProcess() throws IOException
{
Calendar cal = Calendar.getInstance(PathUtils.timeZone);
_inputsToProcess.clear();
_inputsToProcessByDate.clear();
_previousOutputToProcess = null;
DateRange outputDateRange = null;
if (_reusePreviousOutput && _outputPathsByDate.size() > 0)
{
DatePath latestPriorOutput = _outputPathsByDate.get(Collections.max(_outputPathsByDate.keySet()));
_log.info("Have previous output, determining what previous incremental data to difference out");
outputDateRange = AvroDateRangeMetadata.getOutputFileDateRange(getFileSystem(),latestPriorOutput.getPath());
_log.info(String.format("Previous output has date range %s to %s",
PathUtils.datedPathFormat.format(outputDateRange.getBeginDate()),
PathUtils.datedPathFormat.format(outputDateRange.getEndDate())));
for (Date currentDate=outputDateRange.getBeginDate(); currentDate.compareTo(getDateRange().getBeginDate()) < 0;)
{
if (!getAvailableInputsByDate().containsKey(currentDate))
{
throw new RuntimeException(String.format("Missing incremental data for %s, so can't remove it from previous output",PathUtils.datedPathFormat.format(currentDate)));
}
List<DatePath> inputs = getAvailableInputsByDate().get(currentDate);
for (DatePath input : inputs)
{
_log.info(String.format("Input: %s",input.getPath()));
_inputsToProcess.add(input);
_oldInputsToProcess.add(input);
}
_inputsToProcessByDate.put(currentDate, inputs);
cal.setTime(currentDate);
cal.add(Calendar.DAY_OF_MONTH, 1);
currentDate = cal.getTime();
}
_previousOutputToProcess = latestPriorOutput;
_log.info("Including previous output: " + _previousOutputToProcess.getPath());
}
// consume the incremental data and produce the final output
_log.info("Determining what new incremental data to include");
int newDataCount = 0;
Date startDate = getDateRange().getBeginDate();
Date endDate = startDate;
for (Date currentDate=startDate; currentDate.compareTo(getDateRange().getEndDate()) <= 0; )
{
if (getMaxToProcess() != null && newDataCount >= getMaxToProcess())
{
if (!_reusePreviousOutput)
{
throw new RuntimeException(String.format("Amount of input data has exceeded max of %d however output is not being reused so cannot do in multiple passes", getMaxToProcess()));
}
// too much data to process in a single run, will require another pass
_needAnotherPass = true;
break;
}
if (outputDateRange == null || currentDate.compareTo(outputDateRange.getEndDate()) > 0)
{
if (!getAvailableInputsByDate().containsKey(currentDate))
{
if (isFailOnMissing())
{
throw new RuntimeException("missing " + PathUtils.datedPathFormat.format(currentDate));
}
else
{
_log.info("No input data found for " + PathUtils.datedPathFormat.format(currentDate));
}
}
else
{
List<DatePath> inputs = getAvailableInputsByDate().get(currentDate);
for (DatePath input : inputs)
{
_log.info(String.format("Input: %s",input.getPath()));
_inputsToProcess.add(input);
_newInputsToProcess.add(input);
}
_inputsToProcessByDate.put(currentDate, inputs);
newDataCount++;
}
}
cal.setTime(currentDate);
endDate = cal.getTime();
cal.add(Calendar.DAY_OF_MONTH, 1);
currentDate = cal.getTime();
}
_currentDateRange = new DateRange(startDate,endDate);
}
/**
* Throws an exception if the plan hasn't been created.
*/
private void checkPlanExists()
{
if (!_planExists) throw new RuntimeException("Must call createPlan first");
}
}