
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package datafu.hourglass.jobs;
import java.io.IOException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapreduce.AvroJob;
import org.apache.avro.mapreduce.AvroKeyOutputFormat;
import org.apache.avro.mapreduce.AvroMultipleOutputs;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.log4j.Logger;
import datafu.hourglass.avro.AvroMultipleInputsKeyInputFormat;
import datafu.hourglass.avro.AvroMultipleInputsUtil;
import datafu.hourglass.fs.DatePath;
import datafu.hourglass.fs.PathUtils;
import datafu.hourglass.mapreduce.DelegatingCombiner;
import datafu.hourglass.mapreduce.DelegatingMapper;
import datafu.hourglass.mapreduce.DelegatingReducer;
import datafu.hourglass.mapreduce.DistributedCacheHelper;
import datafu.hourglass.mapreduce.ObjectMapper;
import datafu.hourglass.mapreduce.ObjectReducer;
import datafu.hourglass.mapreduce.Parameters;
import datafu.hourglass.mapreduce.PartitioningCombiner;
import datafu.hourglass.mapreduce.PartitioningMapper;
import datafu.hourglass.mapreduce.PartitioningReducer;
import datafu.hourglass.model.Accumulator;
import datafu.hourglass.model.Mapper;
import datafu.hourglass.schemas.PartitionPreservingSchemas;
/**
* An {@link IncrementalJob} that consumes partitioned input data and produces
* output data having the same partitions.
* Typically this is used in conjunction with {@link AbstractPartitionCollapsingIncrementalJob}
* when computing aggregates over sliding windows. A partition-preserving job can perform
* initial aggregation per-day, which can then be consumed by a partition-collapsing job to
* produce the final aggregates over the time window.
* Only Avro is supported for the input, intermediate, and output data.
* <p>
* Implementations of this class must provide key, intermediate value, and output value schemas.
* The key and intermediate value schemas define the output for the mapper and combiner.
* The key and output value schemas define the output for the reducer.
* These are defined by overriding {@link #getKeySchema()}, {@link #getIntermediateValueSchema()},
* and {@link #getOutputValueSchema()}.
*
*
*
* Implementations must also provide a mapper by overriding {@link #getMapper()} and an accumulator
* for the reducer by overriding {@link #getReducerAccumulator()}. An optional combiner may be
* provided by overriding {@link #getCombinerAccumulator()}. For the combiner to be used
* the property use.combiner must also be set to true.
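* <p>
* As a rough sketch only, a concrete subclass counting events per member per day might look like the
* following. The class, record, and field names and the Avro schema strings are hypothetical; the
* {@link datafu.hourglass.model.Mapper} and {@link datafu.hourglass.model.Accumulator} implementations
* are written as static classes so they can be serialized without capturing the enclosing job.
* </p>
* <pre>{@code
* import java.io.IOException;
* import java.util.Properties;
* import org.apache.avro.Schema;
* import org.apache.avro.generic.GenericData;
* import org.apache.avro.generic.GenericRecord;
* import datafu.hourglass.jobs.AbstractPartitionPreservingIncrementalJob;
* import datafu.hourglass.model.Accumulator;
* import datafu.hourglass.model.KeyValueCollector;
* import datafu.hourglass.model.Mapper;
*
* public class MemberEventCountJob extends AbstractPartitionPreservingIncrementalJob
* {
*   private static final String KEY_SCHEMA =
*     "{\"type\":\"record\",\"name\":\"Key\",\"fields\":[{\"name\":\"member_id\",\"type\":\"long\"}]}";
*   private static final String VALUE_SCHEMA =
*     "{\"type\":\"record\",\"name\":\"Value\",\"fields\":[{\"name\":\"count\",\"type\":\"long\"}]}";
*
*   public MemberEventCountJob(String name, Properties props) throws IOException
*   {
*     super(name, props);
*   }
*
*   public Schema getKeySchema() { return new Schema.Parser().parse(KEY_SCHEMA); }
*   public Schema getIntermediateValueSchema() { return new Schema.Parser().parse(VALUE_SCHEMA); }
*   public Schema getOutputValueSchema() { return new Schema.Parser().parse(VALUE_SCHEMA); }
*
*   public Mapper<GenericRecord,GenericRecord,GenericRecord> getMapper()
*   {
*     return new CountMapper();
*   }
*
*   public Accumulator<GenericRecord,GenericRecord> getReducerAccumulator()
*   {
*     return new CountAccumulator();
*   }
*
*   // emits (member_id, count=1) for each input record; schemas are parsed lazily because the
*   // mapper instance is serialized before the tasks run
*   static class CountMapper implements Mapper<GenericRecord,GenericRecord,GenericRecord>
*   {
*     private transient Schema kSchema;
*     private transient Schema vSchema;
*
*     public void map(GenericRecord input, KeyValueCollector<GenericRecord,GenericRecord> collector)
*       throws IOException, InterruptedException
*     {
*       if (kSchema == null) kSchema = new Schema.Parser().parse(KEY_SCHEMA);
*       if (vSchema == null) vSchema = new Schema.Parser().parse(VALUE_SCHEMA);
*       GenericRecord key = new GenericData.Record(kSchema);
*       key.put("member_id", input.get("member_id"));
*       GenericRecord value = new GenericData.Record(vSchema);
*       value.put("count", 1L);
*       collector.collect(key, value);
*     }
*   }
*
*   // sums the counts accumulated for each (key, day) pair in the reducer
*   static class CountAccumulator implements Accumulator<GenericRecord,GenericRecord>
*   {
*     private transient long count;
*     private transient Schema vSchema;
*
*     public void accumulate(GenericRecord value) { count += (Long)value.get("count"); }
*
*     public GenericRecord getFinal()
*     {
*       if (vSchema == null) vSchema = new Schema.Parser().parse(VALUE_SCHEMA);
*       GenericRecord output = new GenericData.Record(vSchema);
*       output.put("count", count);
*       return output;
*     }
*
*     public void cleanup() { count = 0L; }
*   }
* }
* }</pre>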
* <p>
* The distinguishing feature of this type of job is that the input partitioning is preserved in the output.
* The data from each partition is processed independently of other partitions and then output separately.
* For example, input that is partitioned by day can be aggregated by day and then output by day.
* This is achieved by attaching a long value to each key, which represents the partition, so that the reducer
* receives data grouped by the key and partition together. Multiple outputs are then used so that the output
* will have the same partitions as the input.
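* For example, records read from the input for 2013/01/05 are grouped under the 2013/01/05 partition in the
* reducer and written to the 2013/01/05 output.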
* <p>
* The input path can be provided either through the property input.path
* or by calling {@link #setInputPaths(List)}. If multiple input paths are provided then
* this implicitly means a join is to be performed. Multiple input paths can be provided via
* properties by prefixing each with input.path., such as input.path.first
* and input.path.second.
* Input data must be partitioned by day according to the naming convention yyyy/MM/dd.
* The output path can be provided either through the property output.path
* or by calling {@link #setOutputPath(Path)}.
* Output data will be written using the same naming convention as the input, namely yyyy/MM/dd, where the date used
* to format the output path is the same as the date of the input it was derived from.
* For example, if the desired time range to process is 2013/01/01 through 2013/01/14,
* then the output will be named 2013/01/01 through 2013/01/14.
* By default the job will fail if any input data in the desired time window is missing. This can be overridden by setting
* fail.on.missing to false.
* <p>
* The job will not process input for which a corresponding output already exists. For example, if the desired date
* range is 2013/01/01 through 2013/01/14 and the outputs 2013/01/01 through 2013/01/12 exist, then only
* 2013/01/13 and 2013/01/14 will be processed and only 2013/01/13 and 2013/01/14 will be produced.
* <p>
* The number of paths in the output to retain can be configured through the property retention.count,
* or by calling {@link #setRetentionCount(Integer)}. When this property is set only the latest paths in the output
* will be kept; the remainder will be removed. By default there is no retention count set so all output paths are kept.
* <p>
* The inputs to process can be controlled by defining a desired date range. By default the job will process all input
* data available. To limit the number of days of input to process one can set the property num.days
* or call {@link #setNumDays(Integer)}. This would define a processing window with the same number of days,
* where the end date of the window is the latest available input and the start date is num.days ago.
* Only inputs within this window would be processed.
* Because the end date is the same as the latest available input, as new input data becomes available the end of the
* window will advance forward to include it. The end date can be adjusted backwards relative to the latest input
* through the property days.ago, or by calling {@link #setDaysAgo(Integer)}. This subtracts the given number of days
* from the latest available input date to determine the end date. The start date or end date can also be fixed
* by setting the properties start.date or end.date, or by calling {@link #setStartDate(Date)}
* or {@link #setEndDate(Date)}.
* <p>
* The number of reducers to use is automatically determined based on the size of the data to process.
* The total size is computed and then divided by the value of the property num.reducers.bytes.per.reducer, which
* defaults to 256 MB; the resulting quotient is the number of reducers that will be used.
* The number of reducers can also be set to a fixed value through the property num.reducers.
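* For example, if the inputs to be processed total 10 GB and the default of 256 MB per reducer is used,
* roughly 10240 / 256 = 40 reducers would be requested.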
* <p>
* This type of job is capable of performing its work over multiple iterations.
* The number of days to process at a time can be limited by setting the property max.days.to.process,
* or by calling {@link #setMaxToProcess(Integer)}. The default is 90 days.
* This can be useful when there are restrictions on how many tasks
* can be used by a single MapReduce job in the cluster. When this property is set, the job will process no more than
* this many days at a time, and it will perform one or more iterations if necessary to complete the work.
* The number of iterations can be limited by setting the property max.iterations, or by calling {@link #setMaxIterations(Integer)}.
* If the number of iterations is exceeded the job will fail. By default the maximum number of iterations is 20.
* <p>
* Hadoop configuration may be provided by setting a property with the prefix hadoop-conf..
* For example, mapred.min.split.size can be configured by setting property
* hadoop-conf.mapred.min.split.size to the desired value.
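* <p>
* As a rough end-to-end sketch, a driver could configure the job entirely through the properties
* described above and then run it (the job class, paths, and values below are hypothetical):
* </p>
* <pre>{@code
* Properties props = new Properties();
* props.setProperty("input.path", "/data/event");
* props.setProperty("output.path", "/output/event_count");
* props.setProperty("num.days", "14");                // process a 14 day window ending at the latest input
* props.setProperty("retention.count", "30");         // keep only the 30 most recent output partitions
* props.setProperty("use.combiner", "true");          // requires getCombinerAccumulator() to be overridden
* props.setProperty("max.days.to.process", "7");      // at most 7 days per MapReduce pass
* props.setProperty("hadoop-conf.mapred.min.split.size", "536870912");
*
* MemberEventCountJob job = new MemberEventCountJob("member-event-count", props);
* job.run();
* List<Report> reports = job.getReports();            // one report per iteration
* }</pre>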
*
*
*/
public abstract class AbstractPartitionPreservingIncrementalJob extends IncrementalJob
{
private final Logger _log = Logger.getLogger(AbstractPartitionPreservingIncrementalJob.class);
private List<Report> _reports = new ArrayList<Report>();
private PartitioningMapper _mapper;
private PartitioningCombiner _combiner;
private PartitioningReducer _reducer;
private FileCleaner _garbage;
/**
* Initializes the job.
* @throws IOException IOException
*/
public AbstractPartitionPreservingIncrementalJob() throws IOException
{
}
/**
* Initializes the job with a job name and properties.
*
* @param name job name
* @param props configuration properties
* @throws IOException IOException
*/
public AbstractPartitionPreservingIncrementalJob(String name, Properties props) throws IOException
{
super(name,props);
}
/**
* Gets the mapper.
*
* @return mapper
*/
public abstract Mapper<GenericRecord,GenericRecord,GenericRecord> getMapper();
/**
* Gets the accumulator used for the combiner.
*
* @return combiner accumulator
*/
public Accumulator<GenericRecord,GenericRecord> getCombinerAccumulator()
{
return null;
}
/**
* Gets the accumulator used for the reducer.
*
* @return reducer accumulator
*/
public abstract Accumulator<GenericRecord,GenericRecord> getReducerAccumulator();
/**
* Run the job.
*
* @throws IOException IOException
* @throws InterruptedException InterruptedException
* @throws ClassNotFoundException ClassNotFoundException
*/
@Override
public void run() throws IOException, InterruptedException, ClassNotFoundException
{
try
{
initialize();
validate();
execute();
}
finally
{
cleanup();
}
}
/**
* Get reports that summarize each of the job iterations.
*
* @return reports
*/
public List<Report> getReports()
{
return Collections.unmodifiableList(_reports);
}
@Override
protected void initialize()
{
_garbage = new FileCleaner(getFileSystem());
if (getMaxIterations() == null)
{
setMaxIterations(20);
}
if (getMaxToProcess() == null)
{
if (getNumDays() != null)
{
setMaxToProcess(getNumDays());
}
else
{
setMaxToProcess(90);
}
}
super.initialize();
}
/**
* Get the name for the reduce output schema.
* By default this is the name of the class with "Output" appended.
*
* @return output schema name
*/
protected String getOutputSchemaName()
{
return this.getClass().getSimpleName() + "Output";
}
/**
* Get the namespace for the reduce output schema.
* By default this is the package of the class.
*
* @return output schema namespace
*/
protected String getOutputSchemaNamespace()
{
return this.getClass().getPackage().getName();
}
protected ObjectMapper getMapProcessor()
{
return _mapper;
}
protected ObjectReducer getCombineProcessor()
{
return _combiner;
}
protected ObjectReducer getReduceProcessor()
{
return _reducer;
}
/**
* Execute the job.
*
* @throws IOException
* @throws InterruptedException
* @throws ClassNotFoundException
*/
private void execute() throws IOException, InterruptedException, ClassNotFoundException
{
int iterations = 0;
while(true)
{
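// plan this iteration: determine which dated inputs still need to be processed and how many can be handled in one pass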
PartitionPreservingExecutionPlanner planner = new PartitionPreservingExecutionPlanner(getFileSystem(),getProperties());
planner.setInputPaths(getInputPaths());
planner.setOutputPath(getOutputPath());
planner.setStartDate(getStartDate());
planner.setEndDate(getEndDate());
planner.setDaysAgo(getDaysAgo());
planner.setNumDays(getNumDays());
planner.setMaxToProcess(getMaxToProcess());
planner.setFailOnMissing(isFailOnMissing());
planner.createPlan();
if (planner.getInputsToProcess().size() == 0)
{
_log.info("Found all necessary incremental data");
break;
}
if (iterations >= getMaxIterations())
{
throw new RuntimeException(String.format("Already completed %d iterations but the max is %d and there are still %d inputs to process",
iterations,
getMaxIterations(),
planner.getInputsToProcess().size()));
}
Path jobTempPath = createRandomTempPath();
_garbage.add(jobTempPath);
ensurePath(getOutputPath());
Path incrementalStagingPath = ensurePath(new Path(jobTempPath,".incremental-staging"));
Path incrementalStagingTmpPath = ensurePath(new Path(jobTempPath,".incremental-staging-tmp"));
Report report = new Report();
// create input paths for job
List<String> inputPaths = new ArrayList<String>();
for (DatePath input : planner.getInputsToProcess())
{
inputPaths.add(input.getPath().toString());
report.inputFiles.add(input);
}
_log.info("Staging path: " + incrementalStagingPath);
final StagedOutputJob job = StagedOutputJob.createStagedJob(
getConf(),
getName() + "-" + "incremental",
inputPaths,
incrementalStagingTmpPath.toString(),
incrementalStagingPath.toString(),
_log);
job.setCountersParentPath(getCountersParentPath());
final Configuration conf = job.getConfiguration();
config(conf);
PartitionPreservingSchemas fpSchemas = new PartitionPreservingSchemas(getSchemas(), planner.getInputSchemasByPath(), getOutputSchemaName(), getOutputSchemaNamespace() );
job.setInputFormatClass(AvroMultipleInputsKeyInputFormat.class);
job.setOutputFormatClass(AvroKeyOutputFormat.class);
_log.info("Setting input path to schema mappings");
for (String path : fpSchemas.getMapInputSchemas().keySet())
{
Schema schema = fpSchemas.getMapInputSchemas().get(path);
_log.info("*** " + path);
_log.info("*** => " + schema.toString());
AvroMultipleInputsUtil.setInputKeySchemaForPath(job, schema, path);
}
AvroJob.setMapOutputKeySchema(job, fpSchemas.getMapOutputKeySchema());
AvroJob.setMapOutputValueSchema(job, fpSchemas.getMapOutputValueSchema());
AvroJob.setOutputKeySchema(job, fpSchemas.getReduceOutputSchema());
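// register one named output per input date so that each partition is written to its own dated output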
StringBuilder inputTimesJoined = new StringBuilder();
for (Date input : planner.getDatesToProcess())
{
String namedOutput = PathUtils.datedPathFormat.format(input);
_log.info(String.format("Adding named output %s",namedOutput));
AvroMultipleOutputs.addNamedOutput(job,
namedOutput,
AvroKeyOutputFormat.class,
fpSchemas.getReduceOutputSchema());
inputTimesJoined.append(Long.toString(input.getTime()));
inputTimesJoined.append(",");
}
int numReducers;
if (getNumReducers() != null)
{
numReducers = getNumReducers();
_log.info(String.format("Using %d reducers (fixed)",numReducers));
}
else
{
numReducers = planner.getNumReducers();
_log.info(String.format("Using %d reducers (computed)",numReducers));
}
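// spread the reducers across the input dates; TimePartitioner uses these settings to route each date's data to its own group of reducers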
int avgReducersPerInput = (int)Math.ceil(numReducers/(double)planner.getDatesToProcess().size());
_log.info(String.format("Reducers per input path: %d", avgReducersPerInput));
// counters for multiple outputs
// conf.set("mo.counters", "true");
conf.set(TimePartitioner.REDUCERS_PER_INPUT, Integer.toString(avgReducersPerInput));
conf.set(TimePartitioner.INPUT_TIMES, inputTimesJoined.substring(0,inputTimesJoined.length()-1));
job.setNumReduceTasks(numReducers);
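// serialize the mapper, reducer, and (optional) combiner implementations; the delegating classes load them from the distributed cache at task runtime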
Path mapperPath = new Path(incrementalStagingPath,".mapper_impl");
Path reducerPath = new Path(incrementalStagingPath,".reducer_impl");
Path combinerPath = new Path(incrementalStagingPath,".combiner_impl");
conf.set(Parameters.REDUCER_IMPL_PATH, reducerPath.toString());
conf.set(Parameters.MAPPER_IMPL_PATH, mapperPath.toString());
_mapper = new PartitioningMapper();
_mapper.setSchemas(fpSchemas);
_mapper.setMapper(getMapper());
_reducer = new PartitioningReducer();
_reducer.setSchemas(fpSchemas);
_reducer.setAccumulator(getReducerAccumulator());
DistributedCacheHelper.writeObject(conf, getMapProcessor(), mapperPath);
DistributedCacheHelper.writeObject(conf, getReduceProcessor(), reducerPath);
job.setMapperClass(DelegatingMapper.class);
job.setReducerClass(DelegatingReducer.class);
if (isUseCombiner())
{
_combiner = new PartitioningCombiner();
_combiner.setAccumulator(getCombinerAccumulator());
conf.set(Parameters.COMBINER_IMPL_PATH, combinerPath.toString());
job.setCombinerClass(DelegatingCombiner.class);
DistributedCacheHelper.writeObject(conf, getCombineProcessor(), combinerPath);
}
job.setPartitionerClass(TimePartitioner.class);
if (!job.waitForCompletion(true))
{
_log.error("Job failed! Quitting...");
throw new RuntimeException("Job failed");
}
report.jobName = job.getJobName();
report.jobId = job.getJobID().toString();
moveStagedFiles(report,incrementalStagingPath);
if (getCountersParentPath() == null && job.getCountersPath() != null)
{
// save the counters in the target path, for lack of a better place to put it
Path counters = job.getCountersPath();
if (getFileSystem().exists(counters))
{
Path target = new Path(getOutputPath(),counters.getName());
if (getFileSystem().exists(target))
{
_log.info(String.format("Removing old counters at %s",target));
getFileSystem().delete(target, true);
}
_log.info(String.format("Moving %s to %s",counters.getName(),getOutputPath()));
getFileSystem().rename(counters, target);
report.countersPath = target;
}
else
{
_log.error("Could not find counters at " + counters);
}
}
applyRetention();
_reports.add(report);
if (!planner.getNeedsAnotherPass())
{
break;
}
cleanup();
iterations++;
}
}
/**
* Remove all temporary paths.
*
* @throws IOException
*/
private void cleanup() throws IOException
{
if (_garbage != null)
{
_garbage.clean();
}
}
/**
* Removes all but the most recent days from the output, according to the retention count, if one is specified.
*
* @throws IOException
*/
private void applyRetention() throws IOException
{
if (getRetentionCount() != null)
{
PathUtils.keepLatestNestedDatedPaths(getFileSystem(), getOutputPath(), getRetentionCount());
}
}
/**
* Moves files from the staging path to the final output path.
*
* @param report report to update with output paths
* @param sourcePath source of data to move
* @throws IOException
*/
private void moveStagedFiles(Report report, Path sourcePath) throws IOException
{
_log.info("Following files produced in staging path:");
for (FileStatus stat : getFileSystem().globStatus(new Path(sourcePath,"*.avro")))
{
_log.info(String.format("* %s (%d bytes)",stat.getPath(),stat.getLen()));
}
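// part files written through the named outputs are prefixed with the partition date, so collect those whose leading segment parses as a number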
FileStatus[] incrementalParts = getFileSystem().globStatus(new Path(sourcePath,"*"), new PathFilter() {
@Override
public boolean accept(Path path)
{
String[] pathParts = path.getName().split("-");
try
{
Long.parseLong(pathParts[0]);
return true;
}
catch (NumberFormatException e)
{
return false;
}
}
});
// collect the new incremental data from the temp folder and move to subfolders
Map<String,Path> incrementalTargetPaths = new HashMap<String,Path>();
for (FileStatus stat : incrementalParts)
{
String[] pathParts = stat.getPath().getName().split("-");
try
{
String timestamp = pathParts[0];
if (!incrementalTargetPaths.containsKey(timestamp))
{
Path parent = new Path(sourcePath,timestamp);
if (!getFileSystem().exists(parent))
{
getFileSystem().mkdirs(parent);
}
else
{
throw new RuntimeException("already exists: " + parent.toString());
}
incrementalTargetPaths.put(timestamp,parent);
}
Path parent = incrementalTargetPaths.get(timestamp);
_log.info(String.format("Moving %s to %s",stat.getPath().getName(),parent.toString()));
getFileSystem().rename(stat.getPath(), new Path(parent,stat.getPath().getName()));
}
catch (NumberFormatException e)
{
throw new RuntimeException(e);
}
}
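// move each dated folder into the nested yyyy/MM/dd layout under the output path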
for (Path src : incrementalTargetPaths.values())
{
Date srcDate;
try
{
srcDate = PathUtils.datedPathFormat.parse(src.getName());
}
catch (ParseException e)
{
throw new RuntimeException(e);
}
Path target = new Path(getOutputPath(),PathUtils.nestedDatedPathFormat.format(srcDate));
_log.info(String.format("Moving %s to %s",src.getName(),target));
getFileSystem().mkdirs(target.getParent());
if (!getFileSystem().rename(src, target))
{
throw new RuntimeException("Failed to rename " + src + " to " + target);
}
report.outputFiles.add(new DatePath(srcDate,target));
}
}
/**
* Reports files created and processed for an iteration of the job.
*
*/
public static class Report
{
private String jobName;
private String jobId;
private Path countersPath;
private List<DatePath> inputFiles = new ArrayList<DatePath>();
private List<DatePath> outputFiles = new ArrayList<DatePath>();
/**
* Gets the job name.
*
* @return job name
*/
public String getJobName()
{
return jobName;
}
/**
* Gets the job ID.
*
* @return job ID
*/
public String getJobId()
{
return jobId;
}
/**
* Gets the path to the counters file, if one was written.
*
* @return counters path
*/
public Path getCountersPath()
{
return countersPath;
}
/**
* Gets input files that were processed. These are files that are within
* the desired date range.
*
* @return input files
*/
public List<DatePath> getInputFiles()
{
return Collections.unmodifiableList(inputFiles);
}
/**
* Gets the output files that were produced by the job.
*
* @return output files
*/
public List<DatePath> getOutputFiles()
{
return Collections.unmodifiableList(outputFiles);
}
}
}