// JAR search and dependency download from the Maven repository
// datafu.hourglass.jobs.PartitionCollapsingExecutionPlanner (Maven / Gradle / Ivy)
// Artifact: datafu-hourglass-incubating
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package datafu.hourglass.jobs;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Properties;
import java.util.SortedMap;
import org.apache.avro.Schema;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.Logger;
import datafu.hourglass.avro.AvroDateRangeMetadata;
import datafu.hourglass.fs.DatePath;
import datafu.hourglass.fs.DateRange;
import datafu.hourglass.fs.PathUtils;
/**
 * Execution planner used by {@link AbstractPartitionCollapsingIncrementalJob} and its derived classes.
 * This creates a plan to process partitioned input data and collapse the partitions into a single output.
 *
 * <p>
 * To use this class, the input and output paths must be specified. In addition, the desired input date
 * range can be specified through several methods. Then {@link #createPlan()} can be called and the
 * execution plan will be created. The inputs to process will be available from {@link #getInputsToProcess()},
 * the number of reducers to use will be available from {@link #getNumReducers()}, and the input schemas
 * will be available from {@link #getInputSchemas()}.
 * </p>
 *
 * <p>
 * Previous output may be reused by using {@link #setReusePreviousOutput(boolean)}. If previous output exists
 * and it is to be reused then it will be available from {@link #getPreviousOutputToProcess()}. New input data
 * to process that is after the previous output time range is available from {@link #getNewInputsToProcess()}.
 * Old input data to process that is before the previous output time range and should be subtracted from the
 * previous output is available from {@link #getOldInputsToProcess()}.
 * </p>
 *
 * <p>
 * Configuration properties are used to configure a {@link ReduceEstimator} instance. This is used to
 * calculate how many reducers should be used.
 * The number of reducers to use is based on the input data size and the
 * <em>num.reducers.bytes.per.reducer</em> property. This setting can be controlled more granularly
 * through <em>num.reducers.input.bytes.per.reducer</em> and <em>num.reducers.previous.bytes.per.reducer</em>.
 * Check {@link ReduceEstimator} for more details on how the properties are used.
 * </p>
 */
public class PartitionCollapsingExecutionPlanner extends ExecutionPlanner
{
  private final Logger _log = Logger.getLogger(PartitionCollapsingExecutionPlanner.class);

  // existing dated output found under the output path; assigned by loadOutputData()
  private SortedMap<Date,DatePath> _outputPathsByDate;

  // whether previous output should be reused when it exists
  private boolean _reusePreviousOutput;

  // the chosen execution plan; null until createPlan() has completed
  private Plan _plan;

  /**
   * An execution plan. Encapsulates what inputs will be processed.
   *
   * This is an inner (non-static) class because it uses the file system, the
   * configuration properties and the logger of the enclosing planner.
   */
  private class Plan
  {
    // every input consumed by the plan (old and new)
    private final List<DatePath> _inputsToProcess = new ArrayList<DatePath>();
    // inputs registered through addNewInputs
    private final List<DatePath> _newInputsToProcess = new ArrayList<DatePath>();
    // inputs registered through addOldInputs
    private final List<DatePath> _oldInputsToProcess = new ArrayList<DatePath>();
    // nested path root -> most recently registered input path under that root
    private final Map<String,String> _latestInputByPath = new HashMap<String,String>();
    // previous output being reused, or null when none is reused
    private DatePath _previousOutputToProcess;
    private final List<Schema> _inputSchemas = new ArrayList<Schema>();
    // nested path root -> schema loaded from the latest input under that root
    private final Map<String,Schema> _inputSchemasByPath = new HashMap<String,Schema>();
    private boolean _needAnotherPass;
    private DateRange _currentDateRange;
    private int _numReducers;
    // boxed so that plans can be compared with Long.compareTo; set by determineTotalBytes()
    private Long _totalBytes;

    /**
     * Completes the plan once its inputs are known: loads the input schemas,
     * estimates the number of reducers and sums the bytes consumed.
     *
     * @throws IOException IOException
     */
    public void finalizePlan() throws IOException
    {
      determineInputSchemas();
      determineNumReducers();
      determineTotalBytes();
    }

    /**
     * Registers inputs that are to be subtracted from the previous output.
     *
     * @param inputs inputs for a single date
     */
    private void addOldInputs(List<DatePath> inputs)
    {
      addInputs(inputs, _oldInputsToProcess, "Old Input: %s");
    }

    /**
     * Registers inputs that are to be added to the output.
     *
     * @param inputs inputs for a single date
     */
    private void addNewInputs(List<DatePath> inputs)
    {
      addInputs(inputs, _newInputsToProcess, "New Input: %s");
    }

    /**
     * Logs each input, adds it to the list of all inputs and to the given category list,
     * and records it as the latest input seen for its nested path root.
     *
     * @param inputs inputs to register
     * @param category the old or new input list that should also receive the inputs
     * @param logFormat format string, taking the input path, used to log each input
     */
    private void addInputs(List<DatePath> inputs, List<DatePath> category, String logFormat)
    {
      for (DatePath input : inputs)
      {
        _log.info(String.format(logFormat,input.getPath()));
        _inputsToProcess.add(input);
        category.add(input);
        Path root = PathUtils.getNestedPathRoot(input.getPath());
        // a later registration for the same root replaces the earlier one
        _latestInputByPath.put(root.toString(), input.getPath().toString());
      }
    }

    /**
     * Determines the number of bytes that will be consumed by this execution plan.
     * This is used to compare alternative plans so the one with the least bytes
     * consumed can be used.
     *
     * @throws IOException IOException
     */
    private void determineTotalBytes() throws IOException
    {
      // accumulate in a primitive to avoid re-boxing on every iteration
      long totalBytes = 0L;
      for (DatePath input : _inputsToProcess)
      {
        totalBytes += PathUtils.countBytes(getFileSystem(), input.getPath());
      }
      if (_previousOutputToProcess != null)
      {
        totalBytes += PathUtils.countBytes(getFileSystem(), _previousOutputToProcess.getPath());
      }
      _totalBytes = totalBytes;
      _log.info("Total bytes consumed: " + _totalBytes);
    }

    /**
     * Determines the input schemas. There may be multiple input schemas because multiple inputs are allowed.
     * The latest available inputs are used to determine the schema, the assumption being that schemas are
     * backwards-compatible.
     *
     * @throws IOException IOException
     */
    private void determineInputSchemas() throws IOException
    {
      if (_latestInputByPath.isEmpty())
      {
        return;
      }

      _log.info("Determining input schemas");
      for (Entry<String,String> entry : _latestInputByPath.entrySet())
      {
        String root = entry.getKey();
        String input = entry.getValue();
        _log.info("Loading schema for " + input);
        Schema schema = PathUtils.getSchemaFromPath(getFileSystem(),new Path(input));
        _inputSchemas.add(schema);
        _inputSchemasByPath.put(root, schema);
      }
    }

    /**
     * Determines the number of reducers to use based on the input data size and the previous output,
     * if it exists and is being reused.
     * The number of reducers to use is based on the input data size and the
     * num.reducers.bytes.per.reducer property. This setting can be controlled more granularly
     * through num.reducers.input.bytes.per.reducer and num.reducers.previous.bytes.per.reducer.
     * See {@link ReduceEstimator} for details on reducer estimation.
     *
     * @throws IOException IOException
     */
    private void determineNumReducers() throws IOException
    {
      ReduceEstimator estimator = new ReduceEstimator(getFileSystem(),getProps());
      for (DatePath input : _inputsToProcess)
      {
        estimator.addInputPath("input",input.getPath());
      }
      if (_previousOutputToProcess != null)
      {
        estimator.addInputPath("previous",_previousOutputToProcess.getPath());
      }
      _numReducers = estimator.getNumReducers();
    }
  }

  /**
   * Initializes the execution planner.
   *
   * @param fs file system
   * @param props configuration properties
   */
  public PartitionCollapsingExecutionPlanner(FileSystem fs, Properties props)
  {
    super(fs, props);
  }

  /**
   * Create the execution plan.
   *
   * When previous output may be reused, two candidate plans are built: one that does not
   * reuse output and one that does. The candidate consuming the fewest bytes is chosen.
   * Otherwise a single plan that does not reuse output is built.
   *
   * @throws IOException IOException
   * @throws RuntimeException if a plan already exists, or if the amount of input data exceeds
   *         the maximum and output is not being reused
   */
  public void createPlan() throws IOException
  {
    if (_plan != null) throw new RuntimeException("Plan already exists");

    _log.info("Creating execution plan");

    loadInputData();
    loadOutputData();
    determineAvailableInputDates();
    determineDateRange();

    List<Plan> plans = new ArrayList<Plan>();
    Plan plan;

    if (_reusePreviousOutput)
    {
      _log.info("Output may be reused, will create alternative plan that does not reuse output");
      plan = new Plan();
      try
      {
        determineInputsToProcess(false,plan);
        plan.finalizePlan();
        plans.add(plan);
      }
      catch (MaxInputDataExceededException e)
      {
        // the alternative is simply not viable; the plan that reuses output is still built below
        _log.info(e.getMessage());
      }
    }

    _log.info(String.format("Creating plan that %s previous output",(_reusePreviousOutput ? "reuses" : "does not reuse")));
    plan = new Plan();
    try
    {
      determineInputsToProcess(_reusePreviousOutput,plan);
    }
    catch (MaxInputDataExceededException e)
    {
      throw new RuntimeException(e);
    }
    plan.finalizePlan();
    plans.add(plan);

    if (plans.size() > 1)
    {
      _log.info(String.format("There are %d alternative execution plans:",plans.size()));

      for (Plan option : plans)
      {
        _log.info(String.format("* Consume %d new inputs, %d old inputs, %s previous output (%d bytes)",
                                option._newInputsToProcess.size(),
                                option._oldInputsToProcess.size(),
                                option._previousOutputToProcess != null ? "reuse" : "no",
                                option._totalBytes));
      }

      // choose plan with least bytes consumed; the sort is stable, so on a tie the
      // plan added first (the one that does not reuse output) stays in front
      Collections.sort(plans, new Comparator<Plan>() {
        @Override
        public int compare(Plan o1, Plan o2)
        {
          return o1._totalBytes.compareTo(o2._totalBytes);
        }
      });
      _plan = plans.get(0);

      _log.info(String.format("Choosing plan consuming %d bytes",_plan._totalBytes));
    }
    else
    {
      _plan = plans.get(0);
    }
  }

  /**
   * Gets whether previous output should be reused, if it exists.
   *
   * @return true if previous output should be reused
   */
  public boolean getReusePreviousOutput()
  {
    return _reusePreviousOutput;
  }

  /**
   * Sets whether previous output should be reused, if it exists.
   *
   * @param reuse true if previous output should be reused
   */
  public void setReusePreviousOutput(boolean reuse)
  {
    _reusePreviousOutput = reuse;
  }

  /**
   * Get the number of reducers to use based on the input and previous output data size.
   * Must call {@link #createPlan()} first.
   *
   * @return number of reducers to use
   */
  public int getNumReducers()
  {
    return getPlan()._numReducers;
  }

  /**
   * Gets the date range covered by the chosen plan, as computed while determining
   * the inputs to process.
   * Must call {@link #createPlan()} first.
   *
   * @return date range of the chosen plan
   */
  public DateRange getCurrentDateRange()
  {
    return getPlan()._currentDateRange;
  }

  /**
   * Gets the previous output to reuse, or null if no output is being reused.
   * Must call {@link #createPlan()} first.
   *
   * @return previous output to reuse, or null
   */
  public DatePath getPreviousOutputToProcess()
  {
    return getPlan()._previousOutputToProcess;
  }

  /**
   * Gets all inputs that will be processed. This includes both old and new data.
   * Must call {@link #createPlan()} first.
   *
   * @return inputs to process
   */
  public List<DatePath> getInputsToProcess()
  {
    return getPlan()._inputsToProcess;
  }

  /**
   * Gets only the new data that will be processed. New data is data that falls within the
   * desired date range.
   * Must call {@link #createPlan()} first.
   *
   * @return new inputs to process
   */
  public List<DatePath> getNewInputsToProcess()
  {
    return getPlan()._newInputsToProcess;
  }

  /**
   * Gets only the old data that will be processed. Old data is data that falls before the
   * desired date range. It will be subtracted out from the previous output.
   * Must call {@link #createPlan()} first.
   *
   * @return old inputs to process
   */
  public List<DatePath> getOldInputsToProcess()
  {
    return getPlan()._oldInputsToProcess;
  }

  /**
   * Gets whether another pass will be required. Because there may be a limit on the number of inputs processed
   * in a single run, multiple runs may be required to process all data in the desired date range.
   * Must call {@link #createPlan()} first.
   *
   * @return true if another pass is required
   */
  public boolean getNeedsAnotherPass()
  {
    return getPlan()._needAnotherPass;
  }

  /**
   * Gets the input schemas. Because multiple inputs are allowed, there may be multiple schemas.
   * Must call {@link #createPlan()} first.
   *
   * <p>
   * This does not include the output schema, even though previous output may be fed back as input.
   * The reason is that the output schema is determined based on the input schema.
   * </p>
   *
   * @return input schemas
   */
  public List<Schema> getInputSchemas()
  {
    return getPlan()._inputSchemas;
  }

  /**
   * Gets a map from input path to schema. Because multiple inputs are allowed, there may be multiple schemas.
   * Must call {@link #createPlan()} first.
   *
   * @return map from path to input schema
   */
  public Map<String,Schema> getInputSchemasByPath()
  {
    return getPlan()._inputSchemasByPath;
  }

  /**
   * Determines what output data already exists. Previous output may be reused.
   *
   * @throws IOException IOException
   * @throws RuntimeException if no output path has been specified
   */
  private void loadOutputData() throws IOException
  {
    if (getOutputPath() == null)
    {
      throw new RuntimeException("No output path specified");
    }
    // the path is passed as an argument, never as part of the format string,
    // so a '%' in the path cannot be misread as a format specifier
    _log.info(String.format("Searching for existing output data in %s",getOutputPath()));
    _outputPathsByDate = getDatedData(getOutputPath());
    _log.info(String.format("Found %d output paths",_outputPathsByDate.size()));
  }

  /**
   * Determines what input data to process.
   *
   * <p>
   * The input data to consume is determined by the desired date range. If previous output is not reused then the input data to process
   * will coincide with the date range. If previous output may be reused and previous output exists, then the input data to process
   * will consist of new data and potentially old data. The new input data to process is data that has time after the previous output date range,
   * so that it may be added to the previous output.
   * The old data to process is data that has time before the previous output date range, so that it may be subtracted from the previous output.
   * </p>
   *
   * <p>
   * If there is a limit on how many days of input data can be processed then it may be the case that not all input data will be processed in
   * a single run.
   * </p>
   *
   * @param reusePreviousOutput true if previous output should be reused when it exists
   * @param plan the plan to populate
   * @throws IOException IOException
   * @throws MaxInputDataExceededException if the maximum number of inputs is reached and output is not being reused
   */
  private void determineInputsToProcess(boolean reusePreviousOutput, Plan plan) throws IOException, MaxInputDataExceededException
  {
    Calendar cal = Calendar.getInstance(PathUtils.timeZone);

    DateRange outputDateRange = null;

    if (reusePreviousOutput)
    {
      if (_outputPathsByDate.size() > 0)
      {
        outputDateRange = planPreviousOutputReuse(plan, cal);
      }
      else
      {
        _log.info("No previous output to reuse");
      }
    }

    // consume the incremental data and produce the final output

    int newDataCount = 0;
    Date startDate = getDateRange().getBeginDate();
    Date endDate = startDate;
    for (Date currentDate=startDate; currentDate.compareTo(getDateRange().getEndDate()) <= 0; )
    {
      if (getMaxToProcess() != null && newDataCount >= getMaxToProcess())
      {
        if (!reusePreviousOutput)
        {
          throw new MaxInputDataExceededException(String.format("Amount of input data has exceeded max of %d however output is not being reused so cannot do in multiple passes", getMaxToProcess()));
        }

        // too much data to process in a single run, will require another pass
        plan._needAnotherPass = true;
        break;
      }

      // days already covered by the previous output are skipped
      if (outputDateRange == null || currentDate.compareTo(outputDateRange.getEndDate()) > 0)
      {
        if (!getAvailableInputsByDate().containsKey(currentDate))
        {
          if (isFailOnMissing())
          {
            throw new RuntimeException("missing " + PathUtils.datedPathFormat.format(currentDate));
          }
          else
          {
            _log.info("No input data found for " + PathUtils.datedPathFormat.format(currentDate));
          }
        }
        else
        {
          List<DatePath> inputs = getAvailableInputsByDate().get(currentDate);
          plan.addNewInputs(inputs);
          // the limit counts days that have data, not individual input paths
          newDataCount++;
        }
      }

      // endDate tracks the last day visited, then advance one day
      cal.setTime(currentDate);
      endDate = cal.getTime();
      cal.add(Calendar.DAY_OF_MONTH, 1);
      currentDate = cal.getTime();
    }

    plan._currentDateRange = new DateRange(startDate,endDate);
  }

  /**
   * Adds the latest previous output to the plan, along with the old inputs that must be
   * subtracted from it. Old inputs are those for each day from the beginning of the previous
   * output's date range up to, but not including, the beginning of the desired date range
   * (and not past the end of the previous output's date range).
   *
   * Must only be called when at least one previous output exists.
   *
   * @param plan the plan to populate
   * @param cal calendar used to step through days
   * @return date range of the previous output
   * @throws IOException IOException
   * @throws RuntimeException if input data for a day to subtract is not available
   */
  private DateRange planPreviousOutputReuse(Plan plan, Calendar cal) throws IOException
  {
    DatePath latestPriorOutput = _outputPathsByDate.get(Collections.max(_outputPathsByDate.keySet()));
    _log.info("Have previous output, determining what previous incremental data to difference out");
    DateRange outputDateRange = AvroDateRangeMetadata.getOutputFileDateRange(getFileSystem(),latestPriorOutput.getPath());
    _log.info(String.format("Previous output has date range %s to %s",
              PathUtils.datedPathFormat.format(outputDateRange.getBeginDate()),
              PathUtils.datedPathFormat.format(outputDateRange.getEndDate())));

    Date currentDate = outputDateRange.getBeginDate();
    while (currentDate.compareTo(getDateRange().getBeginDate()) < 0
           && currentDate.compareTo(outputDateRange.getEndDate()) <= 0)
    {
      if (!getAvailableInputsByDate().containsKey(currentDate))
      {
        throw new RuntimeException(String.format("Missing incremental data for %s, so can't remove it from previous output",PathUtils.datedPathFormat.format(currentDate)));
      }

      List<DatePath> inputs = getAvailableInputsByDate().get(currentDate);
      plan.addOldInputs(inputs);

      cal.setTime(currentDate);
      cal.add(Calendar.DAY_OF_MONTH, 1);
      currentDate = cal.getTime();
    }

    plan._previousOutputToProcess = latestPriorOutput;
    _log.info("Previous Output: " + plan._previousOutputToProcess.getPath());

    return outputDateRange;
  }

  /**
   * Throws an exception if the plan hasn't been created.
   */
  private void checkPlanExists()
  {
    if (_plan == null) throw new RuntimeException("Must call createPlan first");
  }

  /**
   * Gets the chosen plan, failing if {@link #createPlan()} has not been called yet.
   *
   * @return the chosen plan
   */
  private Plan getPlan()
  {
    checkPlanExists();
    return _plan;
  }
}