datafu.hourglass.jobs.IncrementalJob Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of datafu-hourglass-incubating Show documentation
Libraries that make it easier to solve data problems using Hadoop and higher-level languages based on it.
There is a newer version: 1.3.3
Show newest version
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package datafu.hourglass.jobs;

import java.util.Properties;

import org.apache.avro.Schema;

import datafu.hourglass.schemas.TaskSchemas;

/**
 * Base class for incremental jobs.  Incremental jobs consume day-partitioned input data.  
 * 
 * 
 * Implementations of this class must provide key, intermediate value, and output value schemas.
 * The key and intermediate value schemas define the output for the mapper and combiner.
 * The key and output value schemas define the output for the reducer.
 * 
 * 
 * 
 * This class has the same configuration and methods as {@link TimeBasedJob}.
 * In addition it also recognizes the following properties:
 * 
 * 
 * 
 * <ul>
 *   <li><em>max.iterations</em> - maximum number of iterations for the job</li>
 *   <li><em>max.days.to.process</em> - maximum number of days of input data to process in a single run</li>
 *   <li><em>fail.on.missing</em> - whether the job should fail if input data within the desired range is missing</li>
 * </ul>
 * 
 * 
 */
public abstract class IncrementalJob extends TimeBasedJob
{
  // Property keys recognized by this class in addition to those of the parent.
  private static final String MAX_ITERATIONS_KEY = "max.iterations";
  private static final String MAX_DAYS_TO_PROCESS_KEY = "max.days.to.process";
  private static final String FAIL_ON_MISSING_KEY = "fail.on.missing";

  // Remains null until set explicitly or via the "max.days.to.process" property.
  private Integer _maxToProcess;
  // Remains null until set explicitly or via the "max.iterations" property.
  private Integer _maxIterations;
  // Defaults to false until set explicitly or via the "fail.on.missing" property.
  private boolean _failOnMissing;
  // Built in initialize(); null before that point.
  private TaskSchemas _schemas;
  
  /**
   * Initializes the job.
   */
  public IncrementalJob()
  {    
  }

  /**
   * Initializes the job with a job name and properties.
   * 
   * @param name job name
   * @param props configuration properties
   */
  public IncrementalJob(String name, Properties props)
  {        
    super(name,props);
  }
  
  /**
   * Sets the configuration properties for the job.
   * 
   * After delegating to the parent class, the following properties are applied
   * when present: <em>max.iterations</em>, <em>max.days.to.process</em>, and
   * <em>fail.on.missing</em>.  Properties that are absent leave the corresponding
   * setting untouched.
   * 
   * @param props configuration properties
   * @throws NumberFormatException if <em>max.iterations</em> or 
   *         <em>max.days.to.process</em> is present but is not a parsable integer
   */
  public void setProperties(Properties props)
  {
    super.setProperties(props);
        
    String maxIterations = lookup(MAX_ITERATIONS_KEY);
    if (maxIterations != null)
    {
      setMaxIterations(Integer.parseInt(maxIterations));
    }
    
    String maxToProcess = lookup(MAX_DAYS_TO_PROCESS_KEY);
    if (maxToProcess != null)
    {
      setMaxToProcess(Integer.parseInt(maxToProcess));
    }
    
    // The value must be read from "fail.on.missing" itself.  Reading it from
    // "max.days.to.process" (as was done previously) always yielded false.
    String failOnMissing = lookup(FAIL_ON_MISSING_KEY);
    if (failOnMissing != null)
    {
      setFailOnMissing(Boolean.parseBoolean(failOnMissing));
    }
  }
  
  /**
   * Looks up a raw property value from the job's current properties.
   * 
   * @param key property name
   * @return the value stored under the key, or null if there is none
   */
  private String lookup(String key)
  {
    return (String)getProperties().get(key);
  }
  
  /**
   * Initializes the job after delegating to the parent class.
   * 
   * Verifies that the key, intermediate value, and output value schemas have
   * all been provided and then assembles them into the {@link TaskSchemas}
   * returned by {@link #getSchemas()}.
   * 
   * @throws RuntimeException if any of the three schemas is null
   */
  protected void initialize()
  {
    super.initialize();
    
    if (getKeySchema() == null)
    {
      throw new RuntimeException("Key schema not specified");
    }

    if (getIntermediateValueSchema() == null)
    {
      throw new RuntimeException("Intermediate schema not specified");
    }

    if (getOutputValueSchema() == null)
    {
      throw new RuntimeException("Output schema not specified");
    }
    
    _schemas = new TaskSchemas.Builder()
      .setKeySchema(getKeySchema())
      .setIntermediateValueSchema(getIntermediateValueSchema())
      .setOutputValueSchema(getOutputValueSchema())
      .build();
  }
  
  /**
   * Gets the Avro schema for the key.
   * 
   * This is also used as the key for the map output.
   * 
   * @return key schema.
   */
  protected abstract Schema getKeySchema();
  
  /**
   * Gets the Avro schema for the intermediate value.
   * 
   * This is also used for the value for the map output.
   * 
   * @return intermediate value schema
   */
  protected abstract Schema getIntermediateValueSchema();
  
  /**
   * Gets the Avro schema for the output data.
   * 
   * @return output data schema
   */
  protected abstract Schema getOutputValueSchema();
  
  /**
   * Gets the schemas.  These are assembled by {@link #initialize()}, so this
   * returns null if initialization has not yet happened.
   * 
   * @return schemas
   */
  protected TaskSchemas getSchemas()
  {
    return _schemas;
  }
  
  /**
   * Gets the maximum number of days of input data to process in a single run.
   * 
   * @return maximum number of days to process, or null if none has been set
   */
  public Integer getMaxToProcess()
  {
    return _maxToProcess;
  }

  /**
   * Sets the maximum number of days of input data to process in a single run.
   * 
   * @param maxToProcess maximum number of days to process
   */
  public void setMaxToProcess(Integer maxToProcess)
  {
    _maxToProcess = maxToProcess;
  }

  /**
   * Gets the maximum number of iterations for the job.  Multiple iterations will only occur
   * when there is a maximum set for the number of days to process in a single run.
   * An error should be thrown if this number will be exceeded.
   * 
   * @return maximum number of iterations, or null if none has been set
   */
  public Integer getMaxIterations()
  {
    return _maxIterations;
  }

  /**
   * Sets the maximum number of iterations for the job.  Multiple iterations will only occur
   * when there is a maximum set for the number of days to process in a single run.
   * An error should be thrown if this number will be exceeded.
   * 
   * @param maxIterations maximum number of iterations
   */
  public void setMaxIterations(Integer maxIterations)
  {
    _maxIterations = maxIterations;
  }

  /**
   * Gets whether the job should fail if input data within the desired range is missing. 
   * 
   * @return true if the job should fail on missing data
   */
  public boolean isFailOnMissing()
  {
    return _failOnMissing;
  }

  /**
   * Sets whether the job should fail if input data within the desired range is missing. 
   * 
   * @param failOnMissing true if the job should fail on missing data
   */
  public void setFailOnMissing(boolean failOnMissing)
  {
    _failOnMissing = failOnMissing;
  }
}