/**
* Copyright 2013 LinkedIn, Inc
* 
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
* 
* http://www.apache.org/licenses/LICENSE-2.0
* 
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/

package datafu.hourglass.jobs;

import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.UUID;

import org.apache.commons.codec.binary.Base64;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

/**
 * Base class for Hadoop jobs.
 * 
 * <p>
 * This class defines a set of common methods and configuration shared by Hadoop jobs.
 * Jobs can be configured either by providing properties or by calling setters.
 * Each property has a corresponding setter.
 * </p>
 * 
 * This class recognizes the following properties:
 * 
 * <ul>
 *   <li><em>input.path</em> - Input path job will read from</li>
 *   <li><em>output.path</em> - Output path job will write to</li>
 *   <li><em>temp.path</em> - Temporary path under which intermediate files are stored</li>
 *   <li><em>retention.count</em> - Number of days to retain in output directory</li>
 *   <li><em>num.reducers</em> - Number of reducers to use</li>
 *   <li><em>use.combiner</em> - Whether to use a combiner or not</li>
 *   <li><em>counters.path</em> - Path to store job counters in</li>
 * </ul>
 * 
 * <p>
 * The <em>input.path</em> property may be a comma-separated list of paths.  When there is more
 * than one it implies a join is to be performed.  Alternatively the paths may be listed separately.
 * For example, <em>input.path.first</em> and <em>input.path.second</em> define two separate input
 * paths.
 * </p>
 * 
 * <p>
 * The <em>num.reducers</em> property fixes the number of reducers.  When not set, the number of
 * reducers is computed based on the input size.
 * </p>
 * 
 * <p>
 * The <em>temp.path</em> property defines the parent directory for temporary paths, not the
 * temporary path itself.  Temporary paths are created under this directory with an
 * <em>hourglass-</em> prefix followed by a GUID.
 * </p>
 * 
 * <p>
 * The input and output paths are the only required parameters.  The rest are optional.
 * </p>
 * 
 * <p>
 * Hadoop configuration may be provided by setting a property with the prefix <em>hadoop-conf.</em>.
 * For example, <em>mapred.min.split.size</em> can be configured by setting property
 * <em>hadoop-conf.mapred.min.split.size</em> to the desired value.
 * </p>
 * 
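 * <p>
 * As a minimal sketch, a concrete job could be configured as shown below.  The job class name
 * (MyAggregationJob) and the paths are only illustrative and not part of this library; any
 * subclass that passes a name and properties to this class behaves the same way.
 * </p>
 * 
 * <pre>
 * Properties props = new Properties();
 * props.setProperty("input.path", "/data/event-a,/data/event-b"); // two paths imply a join
 * props.setProperty("output.path", "/output/joined");
 * props.setProperty("temp.path", "/tmp/hourglass-work");          // parent dir for hourglass-* temp paths
 * props.setProperty("num.reducers", "10");
 * props.setProperty("use.combiner", "true");
 * props.setProperty("hadoop-conf.mapred.min.split.size", "536870912");
 * 
 * AbstractJob job = new MyAggregationJob("my-aggregation-job", props);
 * job.run();
 * </pre>
 * 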
 * @author "Matthew Hayes"
 *
 */
public abstract class AbstractJob extends Configured
{
  private static String HADOOP_PREFIX = "hadoop-conf.";

  private Properties _props;
  private String _name;
  private boolean _useCombiner;
  private Path _countersParentPath;
  private Integer _numReducers;
  private Integer _retentionCount;
  private List<Path> _inputPaths;
  private Path _outputPath;
  private Path _tempPath = new Path("/tmp");
  private FileSystem _fs;

  /**
   * Initializes the job.
   */
  public AbstractJob()
  {
    setConf(new Configuration());
  }

  /**
   * Initializes the job with a job name and properties.
   *
   * @param name Job name
   * @param props Configuration properties
   */
  public AbstractJob(String name, Properties props)
  {
    this();
    setName(name);
    setProperties(props);
  }

  /**
   * Gets the job name.
   *
   * @return Job name
   */
  public String getName()
  {
    return _name;
  }

  /**
   * Sets the job name.
   *
   * @param name Job name
   */
  public void setName(String name)
  {
    _name = name;
  }

  /**
   * Gets the configuration properties.
   *
   * @return Configuration properties
   */
  public Properties getProperties()
  {
    return _props;
  }

  /**
   * Sets the configuration properties.
   *
   * @param props Properties
   */
  public void setProperties(Properties props)
  {
    _props = props;
    updateConfigurationFromProps(_props);

    if (_props.get("input.path") != null)
    {
      String[] pathSplit = ((String)_props.get("input.path")).split(",");
      List<Path> paths = new ArrayList<Path>();
      for (String path : pathSplit)
      {
        if (path != null && path.length() > 0)
        {
          path = path.trim();
          if (path.length() > 0)
          {
            paths.add(new Path(path));
          }
        }
      }
      if (paths.size() > 0)
      {
        setInputPaths(paths);
      }
      else
      {
        throw new RuntimeException("Could not extract input paths from: " + _props.get("input.path"));
      }
    }
    else
    {
      List<Path> inputPaths = new ArrayList<Path>();
      for (Object o : _props.keySet())
      {
        String prop = o.toString();
        if (prop.startsWith("input.path."))
        {
          inputPaths.add(new Path(_props.getProperty(prop)));
        }
      }
      if (inputPaths.size() > 0)
      {
        setInputPaths(inputPaths);
      }
    }

    if (_props.get("output.path") != null)
    {
      setOutputPath(new Path((String)_props.get("output.path")));
    }

    if (_props.get("temp.path") != null)
    {
      setTempPath(new Path((String)_props.get("temp.path")));
    }

    if (_props.get("retention.count") != null)
    {
      setRetentionCount(Integer.parseInt((String)_props.get("retention.count")));
    }

    if (_props.get("num.reducers") != null)
    {
      setNumReducers(Integer.parseInt((String)_props.get("num.reducers")));
    }

    if (_props.get("use.combiner") != null)
    {
      setUseCombiner(Boolean.parseBoolean((String)_props.get("use.combiner")));
    }

    if (_props.get("counters.path") != null)
    {
      setCountersParentPath(new Path((String)_props.get("counters.path")));
    }
  }

  /**
   * Overridden to provide custom configuration before the job starts.
   *
   * @param conf Hadoop configuration
   */
  public void config(Configuration conf)
  {
  }

  /**
   * Gets the number of reducers to use.
   *
   * @return Number of reducers
   */
  public Integer getNumReducers()
  {
    return _numReducers;
  }

  /**
   * Sets the number of reducers to use.  Can also be set with the <em>num.reducers</em> property.
   *
   * @param numReducers Number of reducers to use
   */
  public void setNumReducers(Integer numReducers)
  {
    this._numReducers = numReducers;
  }

  /**
   * Gets whether the combiner should be used.
   *
   * @return True if combiner should be used, otherwise false.
   */
  public boolean isUseCombiner()
  {
    return _useCombiner;
  }

  /**
   * Sets whether the combiner should be used.  Can also be set with <em>use.combiner</em>.
   *
   * @param useCombiner True if a combiner should be used, otherwise false.
   */
  public void setUseCombiner(boolean useCombiner)
  {
    this._useCombiner = useCombiner;
  }

  /**
   * Gets the path where counters will be stored.
   *
   * @return Counters path
   */
  public Path getCountersParentPath()
  {
    return _countersParentPath;
  }

  /**
   * Sets the path where counters will be stored.  Can also be set with <em>counters.path</em>.
   *
   * @param countersParentPath Counters path
   */
  public void setCountersParentPath(Path countersParentPath)
  {
    this._countersParentPath = countersParentPath;
  }

  /**
   * Gets the number of days of data which will be retained in the output path.
   * Only the latest will be kept.  Older paths will be removed.
   *
   * @return Retention count
   */
  public Integer getRetentionCount()
  {
    return _retentionCount;
  }

  /**
   * Sets the number of days of data which will be retained in the output path.
   * Only the latest will be kept.  Older paths will be removed.
   * Can also be set with <em>retention.count</em>.
   *
   * @param retentionCount Retention count
   */
  public void setRetentionCount(Integer retentionCount)
  {
    this._retentionCount = retentionCount;
  }

  /**
   * Gets the input paths.  Multiple input paths imply a join is to be performed.
   *
   * @return Input paths
   */
  public List<Path> getInputPaths()
  {
    return _inputPaths;
  }

  /**
   * Sets the input paths.  Multiple input paths imply a join is to be performed.
   * Can also be set with <em>input.path</em> or several properties starting with
   * <em>input.path.</em>.
   *
   * @param inputPaths Input paths
   */
  public void setInputPaths(List<Path> inputPaths)
  {
    this._inputPaths = inputPaths;
  }

  /**
   * Gets the output path.
   *
   * @return Output path
   */
  public Path getOutputPath()
  {
    return _outputPath;
  }

  /**
   * Sets the output path.  Can also be set with <em>output.path</em>.
   *
   * @param outputPath Output path
   */
  public void setOutputPath(Path outputPath)
  {
    this._outputPath = outputPath;
  }

  /**
   * Gets the temporary path under which intermediate files will be stored.  Defaults to /tmp.
   *
   * @return Temporary path
   */
  public Path getTempPath()
  {
    return _tempPath;
  }

  /**
   * Sets the temporary path where intermediate files will be stored.  Defaults to /tmp.
   *
   * @param tempPath Temporary path
   */
  public void setTempPath(Path tempPath)
  {
    this._tempPath = tempPath;
  }

  /**
   * Gets the file system.
   *
   * @return File system
   */
  protected FileSystem getFileSystem()
  {
    if (_fs == null)
    {
      try
      {
        _fs = FileSystem.get(getConf());
      }
      catch (IOException e)
      {
        throw new RuntimeException(e);
      }
    }
    return _fs;
  }

  /**
   * Generates a random temporary path within the file system.  This does not create the path.
   *
   * @return Random temporary path
   */
  protected Path randomTempPath()
  {
    return new Path(_tempPath, String.format("hourglass-%s", UUID.randomUUID()));
  }

  /**
   * Creates a random temporary path within the file system.
   *
   * @return Random temporary path
   * @throws IOException
   */
  protected Path createRandomTempPath() throws IOException
  {
    return ensurePath(randomTempPath());
  }

  /**
   * Creates a path, if it does not already exist.
   *
   * @param path Path to create
   * @return The same path that was provided
   * @throws IOException
   */
  protected Path ensurePath(Path path) throws IOException
  {
    if (!getFileSystem().exists(path))
    {
      getFileSystem().mkdirs(path);
    }
    return path;
  }

  /**
   * Validation required before running job.
   */
  protected void validate()
  {
    if (_inputPaths == null || _inputPaths.size() == 0)
    {
      throw new IllegalArgumentException("Input path is not specified.");
    }

    if (_outputPath == null)
    {
      throw new IllegalArgumentException("Output path is not specified.");
    }
  }

  /**
   * Initialization required before running job.
   */
  protected void initialize()
  {
  }

  /**
   * Run the job.
   *
   * @throws IOException
   * @throws InterruptedException
   * @throws ClassNotFoundException
   */
  public abstract void run() throws IOException, InterruptedException, ClassNotFoundException;

  /**
   * Updates the Hadoop configuration using the provided properties.
   *
   * @param props Configuration properties
   */
  private void updateConfigurationFromProps(Properties props)
  {
    Configuration config = getConf();

    if (config == null)
    {
      config = new Configuration();
    }

    // to enable unit tests to inject configuration
    if (props.containsKey("test.conf"))
    {
      try
      {
        byte[] decoded = Base64.decodeBase64(props.getProperty("test.conf"));
        ByteArrayInputStream byteInput = new ByteArrayInputStream(decoded);
        DataInputStream inputStream = new DataInputStream(byteInput);
        config.readFields(inputStream);
      }
      catch (IOException e)
      {
        throw new RuntimeException(e);
      }
    }
    else
    {
      for (String key : props.stringPropertyNames())
      {
        String newKey = key;
        String value = props.getProperty(key);

        if (key.toLowerCase().startsWith(HADOOP_PREFIX))
        {
          newKey = key.substring(HADOOP_PREFIX.length());
          config.set(newKey, value);
        }
      }
    }
  }
}
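The "test.conf" hook in updateConfigurationFromProps() lets unit tests hand the job a fully
populated Hadoop Configuration as a Base64-encoded string. The helper below is a minimal
sketch of how a test might build such a property set; the class name TestConfExample and the
paths are hypothetical and not part of this library, and it assumes commons-codec 1.4+ for
Base64.encodeBase64String.

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.Properties;

import org.apache.commons.codec.binary.Base64;
import org.apache.hadoop.conf.Configuration;

// Hypothetical test helper: serializes a Configuration into the "test.conf" property
// that AbstractJob.updateConfigurationFromProps() decodes and reads back.
public class TestConfExample
{
  public static Properties propsWithTestConf(Configuration conf) throws IOException
  {
    // Serialize the Configuration with its Writable implementation, then Base64-encode
    // the bytes so they can travel as an ordinary string property.
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    conf.write(new DataOutputStream(bytes));

    Properties props = new Properties();
    props.setProperty("test.conf", Base64.encodeBase64String(bytes.toByteArray()));
    props.setProperty("input.path", "/data/events");
    props.setProperty("output.path", "/output/events-aggregated");
    return props;
  }
}

A test would then pass these properties to a job subclass's constructor; because "test.conf"
is present, the job decodes it and reads the fields into its own Configuration instead of
scanning for hadoop-conf.-prefixed properties.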



