/**
* Copyright 2013 LinkedIn, Inc
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package datafu.hourglass.jobs;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeSet;
import java.util.Map.Entry;
import org.apache.avro.Schema;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.Logger;
import datafu.hourglass.fs.DatePath;
import datafu.hourglass.fs.PathUtils;
/**
* Execution planner used by {@link AbstractPartitionPreservingIncrementalJob} and its derived classes.
* This creates a plan to process partitioned input data and produce partitioned output data.
*
* <p>
* To use this class, the input and output paths must be specified. In addition the desired input date
* range can be specified through several methods. Then {@link #createPlan()} can be called and the
* execution plan will be created. The inputs to process will be available from {@link #getInputsToProcess()},
* the number of reducers to use will be available from {@link #getNumReducers()}, and the input schemas
* will be available from {@link #getInputSchemas()}.
* </p>
*
* <p>
* Configuration properties are used to configure a {@link ReduceEstimator} instance. This is used to
* calculate how many reducers should be used.
* The number of reducers to use is based on the input data size and the
* <em>num.reducers.bytes.per.reducer</em> property.
* Check {@link ReduceEstimator} for more details on how the properties are used.
* </p>
*
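* <p>
* For example, a minimal usage sketch (the paths and property value are
* illustrative assumptions, using the input/output path setters inherited from
* {@link ExecutionPlanner}):
* </p>
*
* <pre>{@code
* Properties props = new Properties();
* // hypothetical value: one reducer per 256 MB of input
* props.setProperty("num.reducers.bytes.per.reducer", "268435456");
*
* FileSystem fs = FileSystem.get(new Configuration());
* PartitionPreservingExecutionPlanner planner =
*     new PartitionPreservingExecutionPlanner(fs, props);
* planner.setInputPaths(Arrays.asList(new Path("/data/event")));   // hypothetical input
* planner.setOutputPath(new Path("/output/event"));                // hypothetical output
* planner.createPlan();
*
* List<Schema> schemas = planner.getInputSchemas();
* int numReducers = planner.getNumReducers();
* for (DatePath input : planner.getInputsToProcess())
* {
*   // feed each day of input to the MapReduce job
* }
* }</pre>
*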
* @author "Matthew Hayes"
*
*/
public class PartitionPreservingExecutionPlanner extends ExecutionPlanner
{
private final Logger _log = Logger.getLogger(PartitionPreservingExecutionPlanner.class);
private SortedMap<Date,DatePath> _outputPathsByDate;
private Map<String,String> _latestInputByPath = new HashMap<String,String>();
private List<DatePath> _inputsToProcess = new ArrayList<DatePath>();
private List<Schema> _inputSchemas = new ArrayList<Schema>();
private Map<String,Schema> _inputSchemasByPath = new HashMap<String,Schema>();
private boolean _needAnotherPass;
private int _numReducers;
private boolean _planExists;
/**
* Initializes the execution planner.
*
* @param fs file system
* @param props configuration properties
*/
public PartitionPreservingExecutionPlanner(FileSystem fs, Properties props)
{
super(fs,props);
}
/**
* Create the execution plan.
*
* @throws IOException if an I/O error occurs while examining the input and output data
*/
public void createPlan() throws IOException
{
if (_planExists) throw new RuntimeException("Plan already exists");
_planExists = true;
loadInputData();
loadOutputData();
determineAvailableInputDates();
determineDateRange();
determineInputsToProcess();
determineInputSchemas();
determineNumReducers();
}
/**
* Get the number of reducers to use based on the input data size.
* Must call {@link #createPlan()} first.
*
* @return number of reducers to use
*/
public int getNumReducers()
{
checkPlanExists();
return _numReducers;
}
/**
* Gets the input schemas. Because multiple inputs are allowed, there may be multiple schemas.
* Must call {@link #createPlan()} first.
*
* @return input schemas
*/
public List<Schema> getInputSchemas()
{
checkPlanExists();
return _inputSchemas;
}
/**
* Gets a map from input path to schema. Because multiple inputs are allowed, there may be multiple schemas.
* Must call {@link #createPlan()} first.
*
* @return map from path to input schema
*/
public Map<String,Schema> getInputSchemasByPath()
{
checkPlanExists();
return _inputSchemasByPath;
}
/**
* Gets whether another pass will be required. Because there may be a limit on the number of inputs processed
* in a single run, multiple runs may be required to process all data in the desired date range.
* Must call {@link #createPlan()} first.
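*
* <p>
* For example, a driver might loop as in the following sketch; a new planner is
* created for each pass, since {@link #createPlan()} can only be called once per
* instance:
* </p>
*
* <pre>{@code
* boolean anotherPass = true;
* while (anotherPass)
* {
*   PartitionPreservingExecutionPlanner planner =
*       new PartitionPreservingExecutionPlanner(fs, props);
*   // ... configure input/output paths and date range, then:
*   planner.createPlan();
*   // ... run the job over planner.getInputsToProcess() ...
*   anotherPass = planner.getNeedsAnotherPass();
* }
* }</pre>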
*
* @return true if another pass is required
*/
public boolean getNeedsAnotherPass()
{
checkPlanExists();
return _needAnotherPass;
}
/**
* Gets the inputs which are to be processed.
* Must call {@link #createPlan()} first.
*
* @return inputs to process
*/
public List<DatePath> getInputsToProcess()
{
checkPlanExists();
return _inputsToProcess;
}
/**
* Gets the input dates which are to be processed.
* Must call {@link #createPlan()} first.
*
* @return dates to process
*/
public List<Date> getDatesToProcess()
{
checkPlanExists();
Set<Date> dates = new TreeSet<Date>();
for (DatePath dp : _inputsToProcess)
{
dates.add(dp.getDate());
}
return new ArrayList<Date>(dates);
}
/**
* Determines the number of reducers to use based on the input data size.
* The number of reducers to use is based on the input data size and the
* <em>num.reducers.bytes.per.reducer</em> property. See {@link ReduceEstimator}
* for details on reducer estimation.
*
* @throws IOException if an I/O error occurs while sizing the input data
*/
private void determineNumReducers() throws IOException
{
ReduceEstimator estimator = new ReduceEstimator(getFileSystem(),getProps());
for (DatePath input : getInputsToProcess())
{
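// register each input under a single "input" tag for size estimation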
estimator.addInputPath("input",input.getPath());
}
_numReducers = estimator.getNumReducers();
}
/**
* Determines the input schemas. There may be multiple input schemas because multiple inputs are allowed.
* The latest available inputs are used to determine the schema, the assumption being that schemas are
* backwards-compatible.
*
* @throws IOException if an I/O error occurs while reading the schemas
*/
private void determineInputSchemas() throws IOException
{
if (_latestInputByPath.size() > 0)
{
_log.info("Determining input schemas");
for (Entry<String,String> entry : _latestInputByPath.entrySet())
{
String root = entry.getKey();
String input = entry.getValue();
_log.info("Loading schema for " + input);
Schema schema = PathUtils.getSchemaFromPath(getFileSystem(),new Path(input));
_inputSchemas.add(schema);
_inputSchemasByPath.put(root, schema);
}
}
}
/**
* Determines which input data should be processed. This checks the availability of input data within
* the desired date range and also checks whether the output already exists. Only inputs with no
* corresponding output are processed.
*/
private void determineInputsToProcess()
{
_log.info("Determining inputs to process");
_latestInputByPath.clear();
int newDataCount = 0;
Calendar cal = Calendar.getInstance(PathUtils.timeZone);
for (Date currentDate=getDateRange().getBeginDate(); currentDate.compareTo(getDateRange().getEndDate()) <= 0; )
{
if (!_outputPathsByDate.containsKey(currentDate))
{
List<DatePath> inputs = getAvailableInputsByDate().get(currentDate);
if (inputs != null)
{
if (getMaxToProcess() != null && newDataCount >= getMaxToProcess())
{
// too much data to process in a single run, will require another pass
_needAnotherPass = true;
break;
}
for (DatePath input : inputs)
{
_log.info(String.format("Input: %s",input.getPath()));
_inputsToProcess.add(input);
Path root = PathUtils.getNestedPathRoot(input.getPath());
_latestInputByPath.put(root.toString(), input.getPath().toString());
}
newDataCount++;
}
else
{
throw new RuntimeException("missing input data for " + currentDate);
}
}
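// advance to the next day in the date range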
cal.setTime(currentDate);
cal.add(Calendar.DAY_OF_MONTH, 1);
currentDate = cal.getTime();
}
}
/**
* Determines what output data already exists. Inputs will not be consumed if the output already exists.
*
* @throws IOException if an I/O error occurs while listing the output data
*/
private void loadOutputData() throws IOException
{
_log.info(String.format("Checking output data in %s", getOutputPath()));
_outputPathsByDate = getDailyData(getOutputPath());
}
/**
* Throws an exception if the plan hasn't been created.
*/
private void checkPlanExists()
{
if (!_planExists) throw new RuntimeException("Must call createPlan first");
}
}