/**
* Copyright 2013 LinkedIn, Inc
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package datafu.hourglass.jobs;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeSet;
import java.util.Map.Entry;
import org.apache.avro.Schema;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.Logger;
import datafu.hourglass.fs.DatePath;
import datafu.hourglass.fs.PathUtils;
/**
* Execution planner used by {@link AbstractPartitionPreservingIncrementalJob} and its derived classes.
* This creates a plan to process partitioned input data and produce partitioned output data.
*
* <p>
* To use this class, the input and output paths must be specified. In addition the desired input date
* range can be specified through several methods. Then {@link #createPlan()} can be called and the
* execution plan will be created. The inputs to process will be available from {@link #getInputsToProcess()},
* the number of reducers to use will be available from {@link #getNumReducers()}, and the input schemas
* will be available from {@link #getInputSchemas()}.
* </p>
*
* <p>
* Configuration properties are used to configure a {@link ReduceEstimator} instance. This is used to
* calculate how many reducers should be used.
* The number of reducers to use is based on the input data size and the
* <em>num.reducers.bytes.per.reducer</em> property.
* Check {@link ReduceEstimator} for more details on how the properties are used.
* </p>
*
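* <p>
* For example, a minimal usage sketch (the paths and property value are
* illustrative assumptions, using the input/output path setters inherited from
* {@link ExecutionPlanner}):
* </p>
*
* <pre>{@code
* Properties props = new Properties();
* // hypothetical value: one reducer per 256 MB of input
* props.setProperty("num.reducers.bytes.per.reducer", "268435456");
*
* FileSystem fs = FileSystem.get(new Configuration());
* PartitionPreservingExecutionPlanner planner =
*     new PartitionPreservingExecutionPlanner(fs, props);
* planner.setInputPaths(Arrays.asList(new Path("/data/event")));   // hypothetical input
* planner.setOutputPath(new Path("/output/event"));                // hypothetical output
* planner.createPlan();
*
* List<Schema> schemas = planner.getInputSchemas();
* int numReducers = planner.getNumReducers();
* for (DatePath input : planner.getInputsToProcess())
* {
*   // feed each day of input to the MapReduce job
* }
* }</pre>
*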
* @author "Matthew Hayes"
*
*/
public class PartitionPreservingExecutionPlanner extends ExecutionPlanner
{
private final Logger _log = Logger.getLogger(PartitionPreservingExecutionPlanner.class);
private SortedMap<Date,DatePath> _outputPathsByDate;
private Map<String,String> _latestInputByPath = new HashMap<String,String>();
private List<DatePath> _inputsToProcess = new ArrayList<DatePath>();
private List<Schema> _inputSchemas = new ArrayList<Schema>();
private Map<String,Schema> _inputSchemasByPath = new HashMap<String,Schema>();
private boolean _needAnotherPass;
private int _numReducers;
private boolean _planExists;
/**
* Initializes the execution planner.
*
* @param fs file system
* @param props configuration properties
*/
public PartitionPreservingExecutionPlanner(FileSystem fs, Properties props)
{
super(fs,props);
}
/**
* Create the execution plan.
*
* @throws IOException if an I/O error occurs while examining the input and output data
*/
public void createPlan() throws IOException
{
if (_planExists) throw new RuntimeException("Plan already exists");
_planExists = true;
loadInputData();
loadOutputData();
determineAvailableInputDates();
determineDateRange();
determineInputsToProcess();
determineInputSchemas();
determineNumReducers();
}
/**
* Get the number of reducers to use based on the input data size.
* Must call {@link #createPlan()} first.
*
* @return number of reducers to use
*/
public int getNumReducers()
{
checkPlanExists();
return _numReducers;
}
/**
* Gets the input schemas. Because multiple inputs are allowed, there may be multiple schemas.
* Must call {@link #createPlan()} first.
*
* @return input schemas
*/
public List<Schema> getInputSchemas()
{
checkPlanExists();
return _inputSchemas;
}
/**
* Gets a map from input path to schema. Because multiple inputs are allowed, there may be multiple schemas.
* Must call {@link #createPlan()} first.
*
* @return map from path to input schema
*/
public Map<String,Schema> getInputSchemasByPath()
{
checkPlanExists();
return _inputSchemasByPath;
}
/**
* Gets whether another pass will be required. Because there may be a limit on the number of inputs processed
* in a single run, multiple runs may be required to process all data in the desired date range.
* Must call {@link #createPlan()} first.
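*
* <p>
* For example, a driver might loop as in the following sketch; a new planner is
* created for each pass, since {@link #createPlan()} can only be called once per
* instance:
* </p>
*
* <pre>{@code
* boolean anotherPass = true;
* while (anotherPass)
* {
*   PartitionPreservingExecutionPlanner planner =
*       new PartitionPreservingExecutionPlanner(fs, props);
*   // ... configure input/output paths and date range, then:
*   planner.createPlan();
*   // ... run the job over planner.getInputsToProcess() ...
*   anotherPass = planner.getNeedsAnotherPass();
* }
* }</pre>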
*
* @return true if another pass is required
*/
public boolean getNeedsAnotherPass()
{
checkPlanExists();
return _needAnotherPass;
}
/**
* Gets the inputs which are to be processed.
* Must call {@link #createPlan()} first.
*
* @return inputs to process
*/
public List<DatePath> getInputsToProcess()
{
checkPlanExists();
return _inputsToProcess;
}
/**
* Gets the input dates which are to be processed.
* Must call {@link #createPlan()} first.
*
* @return dates to process
*/
public List<Date> getDatesToProcess()
{
checkPlanExists();
Set<Date> dates = new TreeSet<Date>();
for (DatePath dp : _inputsToProcess)
{
dates.add(dp.getDate());
}
return new ArrayList<Date>(dates);
}
/**
* Determines the number of reducers to use based on the input data size.
* The number of reducers to use is based on the input data size and the
* <em>num.reducers.bytes.per.reducer</em> property. See {@link ReduceEstimator}
* for details on reducer estimation.
*
* @throws IOException if an I/O error occurs while sizing the input data
*/
private void determineNumReducers() throws IOException
{
ReduceEstimator estimator = new ReduceEstimator(getFileSystem(),getProps());
for (DatePath input : getInputsToProcess())
{
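// register each input under a single "input" tag for size estimation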
estimator.addInputPath("input",input.getPath());
}
_numReducers = estimator.getNumReducers();
}
/**
* Determines the input schemas. There may be multiple input schemas because multiple inputs are allowed.
* The latest available inputs are used to determine the schema, the assumption being that schemas are
* backwards-compatible.
*
* @throws IOException if an I/O error occurs while reading the schemas
*/
private void determineInputSchemas() throws IOException
{
if (_latestInputByPath.size() > 0)
{
_log.info("Determining input schemas");
for (Entry<String,String> entry : _latestInputByPath.entrySet())
{
String root = entry.getKey();
String input = entry.getValue();
_log.info("Loading schema for " + input);
Schema schema = PathUtils.getSchemaFromPath(getFileSystem(),new Path(input));
_inputSchemas.add(schema);
_inputSchemasByPath.put(root, schema);
}
}
}
/**
* Determines which input data should be processed. This checks the availability of input data within
* the desired date range and also checks whether the output already exists. Only inputs with no
* corresponding output are processed.
*/
private void determineInputsToProcess()
{
_log.info("Determining inputs to process");
_latestInputByPath.clear();
int newDataCount = 0;
Calendar cal = Calendar.getInstance(PathUtils.timeZone);
for (Date currentDate=getDateRange().getBeginDate(); currentDate.compareTo(getDateRange().getEndDate()) <= 0; )
{
if (!_outputPathsByDate.containsKey(currentDate))
{
List<DatePath> inputs = getAvailableInputsByDate().get(currentDate);
if (inputs != null)
{
if (getMaxToProcess() != null && newDataCount >= getMaxToProcess())
{
// too much data to process in a single run, will require another pass
_needAnotherPass = true;
break;
}
for (DatePath input : inputs)
{
_log.info(String.format("Input: %s",input.getPath()));
_inputsToProcess.add(input);
Path root = PathUtils.getNestedPathRoot(input.getPath());
_latestInputByPath.put(root.toString(), input.getPath().toString());
}
newDataCount++;
}
else
{
throw new RuntimeException("missing input data for " + currentDate);
}
}
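// advance to the next day in the date range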
cal.setTime(currentDate);
cal.add(Calendar.DAY_OF_MONTH, 1);
currentDate = cal.getTime();
}
}
/**
* Determines what output data already exists. Inputs will not be consumed if the output already exists.
*
* @throws IOException if an I/O error occurs while listing the output data
*/
private void loadOutputData() throws IOException
{
_log.info(String.format("Checking output data in %s", getOutputPath()));
_outputPathsByDate = getDailyData(getOutputPath());
}
/**
* Throws an exception if the plan hasn't been created.
*/
private void checkPlanExists()
{
if (!_planExists) throw new RuntimeException("Must call createPlan first");
}
}