datafu.hourglass.jobs.IncrementalJob Maven / Gradle / Ivy
Show all versions of datafu-hourglass Show documentation
/**
* Copyright 2013 LinkedIn, Inc
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package datafu.hourglass.jobs;
import java.util.Properties;
import org.apache.avro.Schema;
import datafu.hourglass.schemas.TaskSchemas;
/**
 * Base class for incremental jobs. Incremental jobs consume day-partitioned input data.
 *
 * <p>
 * Implementations of this class must provide key, intermediate value, and output value schemas.
 * The key and intermediate value schemas define the output for the mapper and combiner.
 * The key and output value schemas define the output for the reducer.
 * </p>
 *
 * <p>
 * This class has the same configuration and methods as {@link TimeBasedJob}.
 * In addition it also recognizes the following properties:
 * </p>
 *
 * <ul>
 *   <li><em>max.iterations</em> - maximum number of iterations for the job</li>
 *   <li><em>max.days.to.process</em> - maximum number of days of input data to process in a single run</li>
 *   <li><em>fail.on.missing</em> - whether the job should fail if input data within the desired range is missing</li>
 * </ul>
 *
 * @author "Matthew Hayes"
 *
 */
public abstract class IncrementalJob extends TimeBasedJob
{
  /** Property key for the maximum number of iterations. */
  private static final String MAX_ITERATIONS_PROPERTY = "max.iterations";

  /** Property key for the maximum number of days of input to process in a single run. */
  private static final String MAX_DAYS_TO_PROCESS_PROPERTY = "max.days.to.process";

  /** Property key for whether missing input data should fail the job. */
  private static final String FAIL_ON_MISSING_PROPERTY = "fail.on.missing";

  // Remains null until set through setMaxToProcess or the max.days.to.process property.
  private Integer _maxToProcess;

  // Remains null until set through setMaxIterations or the max.iterations property.
  private Integer _maxIterations;

  // Remains false until set through setFailOnMissing or the fail.on.missing property.
  private boolean _failOnMissing;

  // Built by initialize() from the three schemas supplied by the subclass.
  private TaskSchemas _schemas;

  /**
   * Initializes the job.
   */
  public IncrementalJob()
  {
  }

  /**
   * Initializes the job with a job name and properties.
   *
   * @param name job name
   * @param props configuration properties
   */
  public IncrementalJob(String name, Properties props)
  {
    super(name,props);
  }

  /**
   * Sets the configuration properties for the job.
   *
   * After delegating to the superclass, this applies the <em>max.iterations</em>,
   * <em>max.days.to.process</em>, and <em>fail.on.missing</em> properties through the
   * corresponding setters. A property that is absent leaves the current value untouched.
   *
   * @param props configuration properties
   * @throws NumberFormatException if max.iterations or max.days.to.process is present
   *         but is not a valid integer
   */
  public void setProperties(Properties props)
  {
    super.setProperties(props);

    String maxIterations = readTrimmedSetting(MAX_ITERATIONS_PROPERTY);
    if (maxIterations != null)
    {
      setMaxIterations(Integer.parseInt(maxIterations));
    }

    String maxDaysToProcess = readTrimmedSetting(MAX_DAYS_TO_PROCESS_PROPERTY);
    if (maxDaysToProcess != null)
    {
      setMaxToProcess(Integer.parseInt(maxDaysToProcess));
    }

    String failOnMissing = readTrimmedSetting(FAIL_ON_MISSING_PROPERTY);
    if (failOnMissing != null)
    {
      // The flag must be parsed from its own property. Parsing the value of
      // max.days.to.process here (a number) would always yield false.
      setFailOnMissing(Boolean.parseBoolean(failOnMissing));
    }
  }

  /**
   * Validates the schemas supplied by the subclass and assembles them into the
   * {@link TaskSchemas} returned by {@link #getSchemas()}.
   *
   * @throws RuntimeException if the key, intermediate value, or output value schema is null
   */
  protected void initialize()
  {
    super.initialize();

    if (getKeySchema() == null)
    {
      throw new RuntimeException("Key schema not specified");
    }

    if (getIntermediateValueSchema() == null)
    {
      throw new RuntimeException("Intermediate schema not specified");
    }

    if (getOutputValueSchema() == null)
    {
      throw new RuntimeException("Output schema not specified");
    }

    _schemas = new TaskSchemas.Builder()
      .setKeySchema(getKeySchema())
      .setIntermediateValueSchema(getIntermediateValueSchema())
      .setOutputValueSchema(getOutputValueSchema())
      .build();
  }

  /**
   * Gets the Avro schema for the key.
   *
   * This is also used as the key for the map output.
   *
   * @return key schema.
   */
  protected abstract Schema getKeySchema();

  /**
   * Gets the Avro schema for the intermediate value.
   *
   * This is also used for the value for the map output.
   *
   * @return intermediate value schema
   */
  protected abstract Schema getIntermediateValueSchema();

  /**
   * Gets the Avro schema for the output data.
   *
   * @return output data schema
   */
  protected abstract Schema getOutputValueSchema();

  /**
   * Gets the schemas.
   *
   * @return schemas, or null if {@link #initialize()} has not yet assigned them
   */
  protected TaskSchemas getSchemas()
  {
    return _schemas;
  }

  /**
   * Gets the maximum number of days of input data to process in a single run.
   *
   * @return maximum number of days to process
   */
  public Integer getMaxToProcess()
  {
    return _maxToProcess;
  }

  /**
   * Sets the maximum number of days of input data to process in a single run.
   *
   * @param maxToProcess maximum number of days to process
   */
  public void setMaxToProcess(Integer maxToProcess)
  {
    _maxToProcess = maxToProcess;
  }

  /**
   * Gets the maximum number of iterations for the job. Multiple iterations will only occur
   * when there is a maximum set for the number of days to process in a single run.
   * An error should be thrown if this number will be exceeded.
   *
   * @return maximum number of iterations
   */
  public Integer getMaxIterations()
  {
    return _maxIterations;
  }

  /**
   * Sets the maximum number of iterations for the job. Multiple iterations will only occur
   * when there is a maximum set for the number of days to process in a single run.
   * An error should be thrown if this number will be exceeded.
   *
   * @param maxIterations maximum number of iterations
   */
  public void setMaxIterations(Integer maxIterations)
  {
    _maxIterations = maxIterations;
  }

  /**
   * Gets whether the job should fail if input data within the desired range is missing.
   *
   * @return true if the job should fail on missing data
   */
  public boolean isFailOnMissing()
  {
    return _failOnMissing;
  }

  /**
   * Sets whether the job should fail if input data within the desired range is missing.
   *
   * @param failOnMissing true if the job should fail on missing data
   */
  public void setFailOnMissing(boolean failOnMissing)
  {
    _failOnMissing = failOnMissing;
  }

  /**
   * Reads a configuration value as a whitespace-trimmed string.
   *
   * Uses toString() rather than a String cast so that a non-String value stored in the
   * properties does not cause a ClassCastException.
   *
   * @param name property key to read
   * @return the trimmed value, or null if the property is not present
   */
  private String readTrimmedSetting(String name)
  {
    Object raw = getProperties().get(name);
    return raw == null ? null : raw.toString().trim();
  }
}