All Downloads are FREE. Search and download functionalities are using the official Maven repository.

datafu.hourglass.jobs.ExecutionPlanner Maven / Gradle / Ivy

The newest version!
/**
* Copyright 2013 LinkedIn, Inc
* 
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
* 
* http://www.apache.org/licenses/LICENSE-2.0
* 
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/

package datafu.hourglass.jobs;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Properties;
import java.util.SortedMap;
import java.util.TreeMap;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.Logger;

import datafu.hourglass.fs.DatePath;
import datafu.hourglass.fs.DateRange;
import datafu.hourglass.fs.PathUtils;

/**
 * Base class for execution planners.  An execution planner determines which files should be processed
 * for a particular run.
 * 
 * @author "Matthew Hayes"
 *
 */
public abstract class ExecutionPlanner
{
  private final Logger _log = Logger.getLogger(ExecutionPlanner.class);
  
  private FileSystem _fileSystem;
  private Properties _props;
  private Date _startDate;
  private Date _endDate;
  private Integer _daysAgo;
  private Integer _numDays;
  private Integer _maxToProcess;
  private DateRange _range;
  private Path _outputPath;
  private boolean _failOnMissing;
  private List _inputPaths = new ArrayList();
  private List> _inputPathsByDate;
  private Map> _availableInputsByDate = new HashMap>();
  
  /**
   * Initializes the execution planner.
   * 
   * @param fs file system to use
   * @param props configuration properties
   */
  public ExecutionPlanner(FileSystem fs, Properties props)
  {
    _props = props;
    _fileSystem = fs;
  }
  
  /**
   * Gets the output path.
   * 
   * @return output path
   */
  public Path getOutputPath()
  {
    return _outputPath;
  }

  /**
   * Gets the input paths.
   * 
   * @return input paths
   */
  public List getInputPaths()
  {
    return _inputPaths;
  }

  /**
   * Sets the output path.
   * 
   * @param outputPath output path
   */
  public void setOutputPath(Path outputPath)
  {
    this._outputPath = outputPath;
  }

  /**
   * Sets the input paths.
   * 
   * @param inputPaths input paths
   */
  public void setInputPaths(List inputPaths)
  {
    this._inputPaths = inputPaths;
  }
  
  /**
   * Sets the start date.
   * 
   * @param startDate start date
   */
  public void setStartDate(Date startDate)
  {
    this._startDate = startDate;
  }
  
  /**
   * Gets the start date
   * 
   * @return start date
   */
  public Date getStartDate()
  {
    return _startDate;
  }

  /**
   * Sets the end date.
   * 
   * @param endDate end date
   */
  public void setEndDate(Date endDate)
  {
    this._endDate = endDate;
  }
  
  /**
   * Gets the end date
   * 
   * @return end date
   */
  public Date getEndDate()
  {
    return _endDate;
  }
  
  /**
   * Sets the number of days to subtract off the end date. 
   * 
   * @param daysAgo days ago
   */
  public void setDaysAgo(Integer daysAgo)
  {
    this._daysAgo = daysAgo;
  }
  
  /**
   * Gets the number of days to subtract off the end date. 
   * 
   * @return days ago
   */
  public Integer getDaysAgo()
  {
    return _daysAgo;
  }

  /**
   * Sets the number of days to process.
   * 
   * @param numDays number of days to process
   */
  public void setNumDays(Integer numDays)
  {
    this._numDays = numDays;
  }
  
  /**
   * Gets the number of days to process.
   * 
   * @return number of days to process
   */
  public Integer getNumDays()
  {
    return _numDays;
  }

  /**
   * Sets the maximum number of days to process at a time.
   * 
   * @param maxToProcess maximum number of days
   */
  public void setMaxToProcess(Integer maxToProcess)
  {
    this._maxToProcess = maxToProcess;
  }
  
  /**
   * Gets the maximum number of days to process at a time.
   * 
   * @return maximum number of days
   */
  public Integer getMaxToProcess()
  {
    return _maxToProcess;
  }
  
  /**
   * Gets whether the job should fail if data is missing within the desired date range.
   * 
   * @return true if the job should fail on missing data
   */
  public boolean isFailOnMissing()
  {
    return _failOnMissing;
  }

  /**
   * Sets whether the job should fail if data is missing within the desired date range.
   * 
   * @param failOnMissing true if the job should fail on missing data
   */
  public void setFailOnMissing(boolean failOnMissing)
  {
    this._failOnMissing = failOnMissing;
  }
  
  /**
   * Gets the desired input date range to process based on the configuration and available inputs.
   * 
   * @return desired date range
   */
  public DateRange getDateRange()
  {
    return _range;
  }

  /**
   * Gets the file system.
   * 
   * @return file system
   */
  protected FileSystem getFileSystem()
  {
    return _fileSystem;
  }
  
  /**
   * Gets the configuration properties.
   * 
   * @return properties
   */
  protected Properties getProps()
  {
    return _props;
  }
  
  /**
   * Gets a map from date to available input data.
   * 
   * @return map from date to available input data
   */
  protected Map> getAvailableInputsByDate()
  {
    return _availableInputsByDate;
  }
  
  /**
   * Get a map from date to path for all paths matching yyyy/MM/dd under the given path.
   * 
   * @param path path to search under
   * @return map of date to path
   * @throws IOException
   */
  protected SortedMap getDailyData(Path path) throws IOException
  {
    SortedMap data = new TreeMap();
    for (DatePath datePath : PathUtils.findNestedDatedPaths(getFileSystem(),path))
    {
      data.put(datePath.getDate(),datePath);
    }
    return data;
  }
  
  /**
   * Get a map from date to path for all paths matching yyyyMMdd under the given path.
   * 
   * @param path path to search under
   * @return map of date to path
   * @throws IOException
   */
  protected SortedMap getDatedData(Path path) throws IOException
  {
    SortedMap data = new TreeMap();
    for (DatePath datePath : PathUtils.findDatedPaths(getFileSystem(),path))
    {
      data.put(datePath.getDate(),datePath);
    }
    return data;
  }
  
  /**
   * Determine what input data is available.
   * 
   * @throws IOException
   */
  protected void loadInputData() throws IOException
  {
    _inputPathsByDate = new ArrayList>();
    for (Path inputPath : getInputPaths())
    {
      _log.info(String.format("Searching for available input data in " + inputPath));
      _inputPathsByDate.add(getDailyData(inputPath));
    }
  }
  
  /**
   * Determines what input data is available.
   */
  protected void determineAvailableInputDates()
  {
    // first find the latest date available for all inputs
    PriorityQueue dates = new PriorityQueue();
    for (SortedMap pathMap : _inputPathsByDate)
    {
      for (Date date : pathMap.keySet())
      {
        dates.add(date);
      }
    }
    if (dates.size() == 0)
    {
      throw new RuntimeException("No input data!");
    }
    List available = new ArrayList();
    Date currentDate = dates.peek();
    int found = 0;
    int needed = getInputPaths().size();
    while (currentDate != null)
    {
      Date date = dates.poll();
      
      if (date != null && date.equals(currentDate))
      {
        found++;
      }
      else
      {
        if (found == needed)
        {
          available.add(currentDate);
        }
        else if (available.size() > 0)
        {
          _log.info("Did not find all input data for date " + PathUtils.datedPathFormat.format(currentDate));
          _log.info("Paths found for " + PathUtils.datedPathFormat.format(currentDate) + ":");
          // collect what's available for this date
          for (SortedMap pathMap : _inputPathsByDate)
          {
            DatePath path = pathMap.get(currentDate);
            if (path != null)
            {
              _log.info("=> " + path);
            }
          }
          
          if (_failOnMissing)
          {
            throw new RuntimeException("Did not find all input data for date " + PathUtils.datedPathFormat.format(currentDate));
          }
          else
          {
            available.add(currentDate);
          }
        }
        
        found = 0;          
        currentDate = date;
        
        if (currentDate != null)
        {
          found++;
        }
      }        
      
      if (found > needed)
      {
        throw new RuntimeException("found more than needed");
      }        
    }
    
    _availableInputsByDate.clear();
    
    for (Date date : available)
    {
      List paths = new ArrayList();
      for (SortedMap map : _inputPathsByDate)
      {
        DatePath path = map.get(date);
        if (path != null)
        {
          paths.add(path);
        }
      }
      _availableInputsByDate.put(date, paths);
    }
  }
  
  /**
   * Determine the date range for inputs to process based on the configuration and available inputs.
   */
  protected void determineDateRange()
  {
    _log.info("Determining range of input data to consume");
    _range = DateRangePlanner.getDateRange(getStartDate(), getEndDate(), getAvailableInputsByDate().keySet(), getDaysAgo(), getNumDays());
    if (_range.getBeginDate() == null || _range.getEndDate() == null)
    {
      throw new RuntimeException("Expected start and end date");
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy