/**
* Copyright 2013 LinkedIn, Inc
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package datafu.hourglass.jobs;
import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.UUID;
import org.apache.commons.codec.binary.Base64;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
/**
* Base class for Hadoop jobs.
*
* <p>
* This class defines a set of common methods and configuration shared by Hadoop jobs.
* Jobs can be configured either by providing properties or by calling setters.
* Each property has a corresponding setter.
* </p>
*
* This class recognizes the following properties:
*
* <ul>
* <li><em>input.path</em> - Input path the job will read from</li>
* <li><em>output.path</em> - Output path the job will write to</li>
* <li><em>temp.path</em> - Temporary path under which intermediate files are stored</li>
* <li><em>retention.count</em> - Number of days to retain in the output directory</li>
* <li><em>num.reducers</em> - Number of reducers to use</li>
* <li><em>use.combiner</em> - Whether to use a combiner or not</li>
* <li><em>counters.path</em> - Path to store job counters in</li>
* </ul>
*
* <p>
* The <em>input.path</em> property may be a comma-separated list of paths. When there is more
* than one it implies a join is to be performed. Alternatively the paths may be listed separately.
* For example, <em>input.path.first</em> and <em>input.path.second</em> define two separate input
* paths.
* </p>
*
* <p>
* The <em>num.reducers</em> property fixes the number of reducers. When it is not set, the number
* of reducers is computed based on the input size.
* </p>
*
* <p>
* The <em>temp.path</em> property defines the parent directory for temporary paths, not the
* temporary path itself. Temporary paths are created under this directory with an <em>hourglass-</em>
* prefix followed by a GUID.
* </p>
*
* <p>
* The input and output paths are the only required parameters. The rest are optional.
* </p>
*
* <p>
* Hadoop configuration may be provided by setting a property with the prefix <em>hadoop-conf.</em>
* For example, <em>mapred.min.split.size</em> can be configured by setting the property
* <em>hadoop-conf.mapred.min.split.size</em> to the desired value.
* </p>
*
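* <p>
* For illustration, a hypothetical subclass (called {@code MyJob} here) could be configured
* and run as sketched below; the property names match the list above, while the paths and
* values are made-up examples:
* </p>
*
* <pre>{@code
* Properties props = new Properties();
* props.setProperty("input.path", "/data/event");           // required
* props.setProperty("output.path", "/output/event-count");  // required
* props.setProperty("temp.path", "/tmp/hourglass");         // optional, defaults to /tmp
* props.setProperty("num.reducers", "10");                  // optional
* props.setProperty("use.combiner", "true");                // optional
* // pass-through Hadoop configuration via the hadoop-conf. prefix
* props.setProperty("hadoop-conf.mapred.min.split.size", "536870912");
*
* AbstractJob job = new MyJob("event-count", props);  // MyJob is hypothetical
* job.run();
* }</pre>
*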
* @author "Matthew Hayes"
*
*/
public abstract class AbstractJob extends Configured
{
private static final String HADOOP_PREFIX = "hadoop-conf.";
private Properties _props;
private String _name;
private boolean _useCombiner;
private Path _countersParentPath;
private Integer _numReducers;
private Integer _retentionCount;
private List<Path> _inputPaths;
private Path _outputPath;
private Path _tempPath = new Path("/tmp");
private FileSystem _fs;
/**
* Initializes the job.
*/
public AbstractJob()
{
setConf(new Configuration());
}
/**
* Initializes the job with a job name and properties.
*
* @param name Job name
* @param props Configuration properties
*/
public AbstractJob(String name, Properties props)
{
this();
setName(name);
setProperties(props);
}
/**
* Gets the job name
*
* @return Job name
*/
public String getName()
{
return _name;
}
/**
* Sets the job name
*
* @param name Job name
*/
public void setName(String name)
{
_name = name;
}
/**
* Gets the configuration properties.
*
* @return Configuration properties
*/
public Properties getProperties()
{
return _props;
}
/**
* Sets the configuration properties.
*
* @param props Properties
*/
public void setProperties(Properties props)
{
_props = props;
updateConfigurationFromProps(_props);
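// "input.path" may be a single path or a comma-separated list of paths; split it here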
if (_props.get("input.path") != null)
{
String[] pathSplit = ((String)_props.get("input.path")).split(",");
List<Path> paths = new ArrayList<Path>();
for (String path : pathSplit)
{
if (path != null && path.length() > 0)
{
path = path.trim();
if (path.length() > 0)
{
paths.add(new Path(path));
}
}
}
if (paths.size() > 0)
{
setInputPaths(paths);
}
else
{
throw new RuntimeException("Could not extract input paths from: " + _props.get("input.path"));
}
}
else
{
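// otherwise, gather inputs listed as separate properties, e.g. "input.path.first" and "input.path.second"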
List<Path> inputPaths = new ArrayList<Path>();
for (Object o : _props.keySet())
{
String prop = o.toString();
if (prop.startsWith("input.path."))
{
inputPaths.add(new Path(_props.getProperty(prop)));
}
}
if (inputPaths.size() > 0)
{
setInputPaths(inputPaths);
}
}
if (_props.get("output.path") != null)
{
setOutputPath(new Path((String)_props.get("output.path")));
}
if (_props.get("temp.path") != null)
{
setTempPath(new Path((String)_props.get("temp.path")));
}
if (_props.get("retention.count") != null)
{
setRetentionCount(Integer.parseInt((String)_props.get("retention.count")));
}
if (_props.get("num.reducers") != null)
{
setNumReducers(Integer.parseInt((String)_props.get("num.reducers")));
}
if (_props.get("use.combiner") != null)
{
setUseCombiner(Boolean.parseBoolean((String)_props.get("use.combiner")));
}
if (_props.get("counters.path") != null)
{
setCountersParentPath(new Path((String)_props.get("counters.path")));
}
}
/**
* Override this method to provide custom configuration before the job starts.
*
* @param conf Hadoop configuration
*/
public void config(Configuration conf)
{
}
/**
* Gets the number of reducers to use.
*
* @return Number of reducers
*/
public Integer getNumReducers()
{
return _numReducers;
}
/**
* Sets the number of reducers to use. Can also be set with num.reducers property.
*
* @param numReducers Number of reducers to use
*/
public void setNumReducers(Integer numReducers)
{
this._numReducers = numReducers;
}
/**
* Gets whether the combiner should be used.
*
* @return True if combiner should be used, otherwise false.
*/
public boolean isUseCombiner()
{
return _useCombiner;
}
/**
* Sets whether the combiner should be used. Can also be set with use.combiner.
*
* @param useCombiner True if a combiner should be used, otherwise false.
*/
public void setUseCombiner(boolean useCombiner)
{
this._useCombiner = useCombiner;
}
/**
* Gets the path where counters will be stored.
*
* @return Counters path
*/
public Path getCountersParentPath()
{
return _countersParentPath;
}
/**
* Sets the path where counters will be stored. Can also be set with counters.path.
*
* @param countersParentPath Counters path
*/
public void setCountersParentPath(Path countersParentPath)
{
this._countersParentPath = countersParentPath;
}
/**
* Gets the number of days of data which will be retained in the output path.
* Only the latest will be kept. Older paths will be removed.
*
* @return retention count
*/
public Integer getRetentionCount()
{
return _retentionCount;
}
/**
* Sets the number of days of data which will be retained in the output path.
* Only the latest will be kept. Older paths will be removed.
* Can also be set with retention.count.
*
* @param retentionCount Number of days of data to retain
*/
public void setRetentionCount(Integer retentionCount)
{
this._retentionCount = retentionCount;
}
/**
* Gets the input paths. Multiple input paths imply a join is to be performed.
*
* @return input paths
*/
public List<Path> getInputPaths()
{
return _inputPaths;
}
/**
* Sets the input paths. Multiple input paths imply a join is to be performed.
* Can also be set with <em>input.path</em>, or with several properties starting with the prefix
* <em>input.path.</em> (for example, <em>input.path.first</em> and <em>input.path.second</em>).
*
* @param inputPaths input paths
*/
public void setInputPaths(List<Path> inputPaths)
{
this._inputPaths = inputPaths;
}
/**
* Gets the output path.
*
* @return output path
*/
public Path getOutputPath()
{
return _outputPath;
}
/**
* Sets the output path. Can also be set with output.path.
*
* @param outputPath output path
*/
public void setOutputPath(Path outputPath)
{
this._outputPath = outputPath;
}
/**
* Gets the temporary path under which intermediate files will be stored. Defaults to /tmp.
*
* @return Temporary path
*/
public Path getTempPath()
{
return _tempPath;
}
/**
* Sets the temporary path where intermediate files will be stored. Defaults to /tmp.
*
* @param tempPath Temporary path
*/
public void setTempPath(Path tempPath)
{
this._tempPath = tempPath;
}
/**
* Gets the file system.
*
* @return File system
*/
protected FileSystem getFileSystem()
{
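// lazily obtain the FileSystem from this job's configuration the first time it is needed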
if (_fs == null)
{
try
{
_fs = FileSystem.get(getConf());
}
catch (IOException e)
{
throw new RuntimeException(e);
}
}
return _fs;
}
/**
* Generates a random temporary path within the file system. This does not create the path.
*
* @return Random temporary path
*/
protected Path randomTempPath()
{
return new Path(_tempPath,String.format("hourglass-%s",UUID.randomUUID()));
}
/**
* Creates a random temporary path within the file system.
*
* @return Random temporary path
* @throws IOException
*/
protected Path createRandomTempPath() throws IOException
{
return ensurePath(randomTempPath());
}
/**
* Creates a path, if it does not already exist.
*
* @param path Path to create
* @return The same path that was provided
* @throws IOException
*/
protected Path ensurePath(Path path) throws IOException
{
if (!getFileSystem().exists(path))
{
getFileSystem().mkdirs(path);
}
return path;
}
/**
* Validation required before running job.
*/
protected void validate()
{
if (_inputPaths == null || _inputPaths.size() == 0)
{
throw new IllegalArgumentException("Input path is not specified.");
}
if (_outputPath == null)
{
throw new IllegalArgumentException("Output path is not specified.");
}
}
/**
* Initialization required before running job.
*/
protected void initialize()
{
}
/**
* Run the job.
*
* @throws IOException
* @throws InterruptedException
* @throws ClassNotFoundException
*/
public abstract void run() throws IOException, InterruptedException, ClassNotFoundException;
/**
* Updates the Hadoop configuration using the provided properties. Properties prefixed with
* hadoop-conf. are copied into the configuration with the prefix removed.
*
* @param props Configuration properties
*/
private void updateConfigurationFromProps(Properties props)
{
Configuration config = getConf();
if (config == null)
{
config = new Configuration();
}
// to enable unit tests to inject configuration
if (props.containsKey("test.conf"))
{
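// "test.conf" is expected to hold a Base64-encoded Configuration serialized in Hadoop's
// Writable format; decode it and read the fields directly into this job's configuration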
try
{
byte[] decoded = Base64.decodeBase64(props.getProperty("test.conf"));
ByteArrayInputStream byteInput = new ByteArrayInputStream(decoded);
DataInputStream inputStream = new DataInputStream(byteInput);
config.readFields(inputStream);
}
catch (IOException e)
{
throw new RuntimeException(e);
}
}
else
{
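// copy properties prefixed with "hadoop-conf." into the configuration, stripping the prefix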
for (String key : props.stringPropertyNames())
{
String newKey = key;
String value = props.getProperty(key);
if (key.toLowerCase().startsWith(HADOOP_PREFIX))
{
newKey = key.substring(HADOOP_PREFIX.length());
config.set(newKey, value);
}
}
}
}
}