
/**
* Copyright 2013 LinkedIn, Inc
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package datafu.hourglass.jobs;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.SortedMap;
import org.apache.avro.Schema;
import org.apache.hadoop.fs.FileSystem;
import org.apache.log4j.Logger;
import datafu.hourglass.avro.AvroDateRangeMetadata;
import datafu.hourglass.fs.DatePath;
import datafu.hourglass.fs.DateRange;
import datafu.hourglass.fs.PathUtils;
/**
* Execution planner used by {@link AbstractPartitionCollapsingIncrementalJob} and its derived classes.
* This creates a plan to process partitioned input data and collapse the partitions into a single output.
*
*
* To use this class, the input and output paths must be specified. In addition the desired input date
* range can be specified through several methods. Then {@link #createPlan()} can be called and the
* execution plan will be created. The inputs to process will be available from {@link #getInputsToProcess()},
* the number of reducers to use will be available from {@link #getNumReducers()}, and the input schemas
* will be available from {@link #getInputSchemas()}.
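*
* For illustration, here is a rough usage sketch. The path setters shown are assumed to be inherited
* from {@link ExecutionPlanner} and are illustrative only; consult that class for the actual
* configuration methods.
*
* <pre>
* {@code
* FileSystem fs = FileSystem.get(new Configuration());
* Properties props = new Properties();
* PartitionCollapsingExecutionPlanner planner = new PartitionCollapsingExecutionPlanner(fs, props);
*
* // assumed setters from the ExecutionPlanner base class, shown for illustration only
* planner.setInputPaths(Collections.singletonList(new Path("/data/event")));
* planner.setOutputPath(new Path("/output/event-collapsed"));
* planner.setReusePreviousOutput(true);
*
* planner.createPlan();
*
* List<DatePath> inputs = planner.getInputsToProcess();
* int numReducers = planner.getNumReducers();
* List<Schema> schemas = planner.getInputSchemas();
* }
* </pre>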
*
*
*
* Previous output may be reused by using {@link #setReusePreviousOutput(boolean)}. If previous output exists
* and it is to be reused then it will be available from {@link #getPreviousOutputToProcess()}. New input data
* to process that is after the previous output time range is available from {@link #getNewInputsToProcess()}.
* Old input data to process that falls before the desired date range but is still reflected in the
* previous output, and therefore should be subtracted from it, is available from {@link #getOldInputsToProcess()}.
*
*
*
* Configuration properties are used to configure a {@link ReduceEstimator} instance. This is used to
* calculate how many reducers should be used.
* The number of reducers to use is based on the input data size and the
* num.reducers.bytes.per.reducer property. This setting can be controlled more granularly
* through num.reducers.input.bytes.per.reducer and num.reducers.previous.bytes.per.reducer.
* Check {@link ReduceEstimator} for more details on how the properties are used.
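*
* As a sketch, the reducer-related properties might be set as follows; the byte values below are
* arbitrary examples, not defaults:
*
* <pre>
* {@code
* Properties props = new Properties();
* // target roughly 256 MB of input per reducer (example value)
* props.setProperty("num.reducers.bytes.per.reducer", Long.toString(256L * 1024 * 1024));
* // optionally use separate targets for new input data and reused previous output
* props.setProperty("num.reducers.input.bytes.per.reducer", Long.toString(256L * 1024 * 1024));
* props.setProperty("num.reducers.previous.bytes.per.reducer", Long.toString(512L * 1024 * 1024));
* }
* </pre>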
*
*
* @author "Matthew Hayes"
*
*/
public class PartitionCollapsingExecutionPlanner extends ExecutionPlanner
{
private final Logger _log = Logger.getLogger(PartitionCollapsingExecutionPlanner.class);
private int _numReducers;
private SortedMap<Date,DatePath> _outputPathsByDate;
private boolean _reusePreviousOutput;
private List<DatePath> _inputsToProcess = new ArrayList<DatePath>();
private List<DatePath> _newInputsToProcess = new ArrayList<DatePath>();
private List<DatePath> _oldInputsToProcess = new ArrayList<DatePath>();
private Map<Date,List<DatePath>> _inputsToProcessByDate = new HashMap<Date,List<DatePath>>();
private DatePath _previousOutputToProcess;
private List<Schema> _inputSchemas = new ArrayList<Schema>();
private boolean _needAnotherPass;
private DateRange _currentDateRange;
private boolean _planExists;
/**
* Initializes the execution planner.
*
* @param fs file system
* @param props configuration properties
*/
public PartitionCollapsingExecutionPlanner(FileSystem fs, Properties props)
{
super(fs, props);
}
/**
* Create the execution plan.
*
* @throws IOException
*/
public void createPlan() throws IOException
{
if (_planExists) throw new RuntimeException("Plan already exists");
_planExists = true;
loadInputData();
loadOutputData();
determineAvailableInputDates();
determineDateRange();
determineInputsToProcess();
determineInputSchemas();
determineNumReducers();
}
/**
* Gets whether previous output should be reused, if it exists.
*
* @return true if previous output should be reused
*/
public boolean getReusePreviousOutput()
{
return _reusePreviousOutput;
}
/**
* Sets whether previous output should be reused, if it exists.
*
* @param reuse true if previous output should be reused
*/
public void setReusePreviousOutput(boolean reuse)
{
_reusePreviousOutput = reuse;
}
/**
* Get the number of reducers to use based on the input and previous output data size.
* Must call {@link #createPlan()} first.
*
* @return number of reducers to use
*/
public int getNumReducers()
{
checkPlanExists();
return _numReducers;
}
/**
* Gets the date range of input data covered by this plan's output.
* This may end earlier than the desired date range when the limit on the number of inputs
* to process forces another pass.
* Must call {@link #createPlan()} first.
*
* @return date range covered by this plan
*/
public DateRange getCurrentDateRange()
{
checkPlanExists();
return _currentDateRange;
}
/**
* Gets the previous output to reuse, or null if no output is being reused.
* Must call {@link #createPlan()} first.
*
* @return previous output to reuse, or null
*/
public DatePath getPreviousOutputToProcess()
{
checkPlanExists();
return _previousOutputToProcess;
}
/**
* Gets all inputs that will be processed. This includes both old and new data.
* Must call {@link #createPlan()} first.
*
* @return inputs to process
*/
public List<DatePath> getInputsToProcess()
{
checkPlanExists();
return _inputsToProcess;
}
/**
* Gets only the new data that will be processed. New data is data that falls within the
* desired date range and is not already covered by the previous output.
* Must call {@link #createPlan()} first.
*
* @return new inputs to process
*/
public List<DatePath> getNewInputsToProcess()
{
checkPlanExists();
return _newInputsToProcess;
}
/**
* Gets only the old data that will be processed. Old data is data that falls before the
* desired date range. It will be subtracted out from the previous output.
* Must call {@link #createPlan()} first.
*
* @return old inputs to process
*/
public List<DatePath> getOldInputsToProcess()
{
checkPlanExists();
return _oldInputsToProcess;
}
/**
* Gets whether another pass will be required. Because there may be a limit on the number of inputs processed
* in a single run, multiple runs may be required to process all data in the desired date range.
* Must call {@link #createPlan()} first.
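*
* A sketch of how a caller might drive multiple passes; {@code runJob} is a hypothetical method
* standing in for whatever executes one MapReduce pass from the plan:
*
* <pre>
* {@code
* boolean done = false;
* while (!done)
* {
*   PartitionCollapsingExecutionPlanner planner = new PartitionCollapsingExecutionPlanner(fs, props);
*   // ... configure paths and date range ...
*   planner.createPlan();
*   runJob(planner); // hypothetical: run one pass using this plan
*   done = !planner.getNeedsAnotherPass();
* }
* }
* </pre>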
*
* @return true if another pass is required
*/
public boolean getNeedsAnotherPass()
{
checkPlanExists();
return _needAnotherPass;
}
/**
* Gets the input schemas. Because multiple inputs are allowed, there may be multiple schemas.
* Must call {@link #createPlan()} first.
*
*
* This does not include the output schema, even though previous output may be fed back as input.
* The reason is that the output schema is determined based on the input schema.
*
*
* @return input schemas
*/
public List<Schema> getInputSchemas()
{
checkPlanExists();
return _inputSchemas;
}
/**
* Determines the number of reducers to use based on the input data size and the previous output,
* if it exists and is being reused.
* The number of reducers to use is based on the input data size and the
* num.reducers.bytes.per.reducer property. This setting can be controlled more granularly
* through num.reducers.input.bytes.per.reducer and num.reducers.previous.bytes.per.reducer.
* See {@link ReduceEstimator} for details on reducer estimation.
*
* @throws IOException
*/
private void determineNumReducers() throws IOException
{
ReduceEstimator estimator = new ReduceEstimator(getFileSystem(),getProps());
List<String> inputPaths = new ArrayList<String>();
for (DatePath input : getInputsToProcess())
{
inputPaths.add(input.getPath().toString());
estimator.addInputPath("input",input.getPath());
}
if (_previousOutputToProcess != null)
{
estimator.addInputPath("previous",_previousOutputToProcess.getPath());
}
_numReducers = estimator.getNumReducers();
}
/**
* Determines the input schemas. There may be multiple input schemas because multiple inputs are allowed.
* The latest available inputs are used to determine the schema, the assumption being that schemas are
* backwards-compatible.
*
* @throws IOException
*/
private void determineInputSchemas() throws IOException
{
List<Date> dates = new ArrayList<Date>(_inputsToProcessByDate.keySet());
if (dates.size() > 0)
{
Collections.sort(dates);
Date lastDate = dates.get(dates.size()-1);
List<DatePath> lastInputs = _inputsToProcessByDate.get(lastDate);
for (DatePath input : lastInputs)
{
_inputSchemas.add(PathUtils.getSchemaFromPath(getFileSystem(),input.getPath()));
}
}
}
/**
* Determines what output data already exists. Previous output may be reused.
*
* @throws IOException
*/
private void loadOutputData() throws IOException
{
_log.info(String.format("Checking output data in " + getOutputPath()));
_outputPathsByDate = getDatedData(getOutputPath());
}
/**
* Determines what input data to process.
*
*
* The input data to consume is determined by the desired date range. If previous output is not reused then the input data to process
* will coincide with the date range. If previous output may be reused and previous output exists, then the input data to process
* will consist of new data and potentially old data. The new input data to process is data that has time after the previous output date range,
* so that it may be added to the previous output.
* The old data to process is data that has time before the desired date range but is covered by the previous output, so that it may be subtracted from the previous output.
*
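*
* For example (illustrative dates, treating range end dates as inclusive per the comparisons below):
* if the previous output covers 2013-01-01 through 2013-01-07 and the desired date range is
* 2013-01-03 through 2013-01-09, then the inputs for 2013-01-01 and 2013-01-02 are old data to be
* subtracted from the previous output, and the inputs for 2013-01-08 and 2013-01-09 are new data
* to be added to it.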
*
*
* If there is a limit on how many days of input data can be processed then it may be the case that not all input data will be processed in
* a single run.
*
*
* @throws IOException
*/
private void determineInputsToProcess() throws IOException
{
Calendar cal = Calendar.getInstance(PathUtils.timeZone);
_inputsToProcess.clear();
_inputsToProcessByDate.clear();
_previousOutputToProcess = null;
DateRange outputDateRange = null;
if (_reusePreviousOutput && _outputPathsByDate.size() > 0)
{
DatePath latestPriorOutput = _outputPathsByDate.get(Collections.max(_outputPathsByDate.keySet()));
_log.info("Have previous output, determining what previous incremental data to difference out");
outputDateRange = AvroDateRangeMetadata.getOutputFileDateRange(getFileSystem(),latestPriorOutput.getPath());
_log.info(String.format("Previous output has date range %s to %s",
PathUtils.datedPathFormat.format(outputDateRange.getBeginDate()),
PathUtils.datedPathFormat.format(outputDateRange.getEndDate())));
for (Date currentDate=outputDateRange.getBeginDate(); currentDate.compareTo(getDateRange().getBeginDate()) < 0;)
{
if (!getAvailableInputsByDate().containsKey(currentDate))
{
throw new RuntimeException(String.format("Missing incremental data for %s, so can't remove it from previous output",PathUtils.datedPathFormat.format(currentDate)));
}
List<DatePath> inputs = getAvailableInputsByDate().get(currentDate);
for (DatePath input : inputs)
{
_log.info(String.format("Input: %s",input.getPath()));
_inputsToProcess.add(input);
_oldInputsToProcess.add(input);
}
_inputsToProcessByDate.put(currentDate, inputs);
cal.setTime(currentDate);
cal.add(Calendar.DAY_OF_MONTH, 1);
currentDate = cal.getTime();
}
_previousOutputToProcess = latestPriorOutput;
_log.info("Including previous output: " + _previousOutputToProcess.getPath());
}
// consume the incremental data and produce the final output
_log.info("Determining what new incremental data to include");
int newDataCount = 0;
Date startDate = getDateRange().getBeginDate();
Date endDate = startDate;
for (Date currentDate=startDate; currentDate.compareTo(getDateRange().getEndDate()) <= 0; )
{
if (getMaxToProcess() != null && newDataCount >= getMaxToProcess())
{
if (!_reusePreviousOutput)
{
throw new RuntimeException(String.format("Amount of input data has exceeded max of %d however output is not being reused so cannot do in multiple passes", getMaxToProcess()));
}
// too much data to process in a single run, will require another pass
_needAnotherPass = true;
break;
}
if (outputDateRange == null || currentDate.compareTo(outputDateRange.getEndDate()) > 0)
{
if (!getAvailableInputsByDate().containsKey(currentDate))
{
if (isFailOnMissing())
{
throw new RuntimeException("missing " + PathUtils.datedPathFormat.format(currentDate));
}
else
{
_log.info("No input data found for " + PathUtils.datedPathFormat.format(currentDate));
}
}
else
{
List<DatePath> inputs = getAvailableInputsByDate().get(currentDate);
for (DatePath input : inputs)
{
_log.info(String.format("Input: %s",input.getPath()));
_inputsToProcess.add(input);
_newInputsToProcess.add(input);
}
_inputsToProcessByDate.put(currentDate, inputs);
newDataCount++;
}
}
cal.setTime(currentDate);
endDate = cal.getTime();
cal.add(Calendar.DAY_OF_MONTH, 1);
currentDate = cal.getTime();
}
_currentDateRange = new DateRange(startDate,endDate);
}
/**
* Throws an exception if the plan hasn't been created.
*/
private void checkPlanExists()
{
if (!_planExists) throw new RuntimeException("Must call createPlan first");
}
}