All Downloads are FREE. Search and download functionalities are using the official Maven repository.

datafu.hourglass.jobs.ReduceEstimator Maven / Gradle / Ivy

/**
* Copyright 2013 LinkedIn, Inc
* 
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
* 
* http://www.apache.org/licenses/LICENSE-2.0
* 
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/

package datafu.hourglass.jobs;

import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.Logger;

import datafu.hourglass.fs.PathUtils;

/**
 * Estimates the number of reducers needed based on input size.
 * 
 * 

* This sums the size of the inputs and uses bytes-per-reducer * settings to compute the number of reducers. By default, * the bytes-per-reducer is 256 MB. This means that if the * total input size is 1 GB, the total number of reducers * computed will be 4. *

* *

* The bytes-per-reducer can be configured through properties * provided in the constructor. The default bytes-per-reducer * can be overriden by setting num.reducers.bytes.per.reducer. * For example, if 536870912 (512 MB) is used for this setting, * then 2 reducers would be used for 1 GB. *

* *

* The bytes-per-reducer can also be configured separately for * different types of inputs. Inputs can be identified by a tag. * For example, if an input is tagged with mydata, then * the reducers for this input data can be configured with * num.reducers.mydata.bytes.per.reducer. *

* * @author "Matthew Hayes" * */ public class ReduceEstimator { private final Logger _log = Logger.getLogger(ReduceEstimator.class); private final Set inputPaths = new HashSet(); private final Map pathToTag = new HashMap(); private final Map tagToBytesPerReducer = new HashMap(); private final FileSystem fs; private final static String DEFAULT = "default"; private final static Long DEFAULT_BYTES_PER_REDUCER = 256L*1024L*1024L; // 256 MB public ReduceEstimator(FileSystem fs, Properties props) { this.fs = fs; if (props != null) { for (Object o : props.keySet()) { String key = (String)o; if (key.startsWith("num.reducers.")) { if (key.equals("num.reducers.bytes.per.reducer")) { tagToBytesPerReducer.put(DEFAULT, Long.parseLong(props.getProperty(key))); } else { Pattern p = Pattern.compile("num\\.reducers\\.([a-z]+)\\.bytes\\.per\\.reducer"); Matcher m = p.matcher(key); if (m.matches()) { String tag = m.group(1); tagToBytesPerReducer.put(tag, Long.parseLong(props.getProperty(key))); } else { throw new RuntimeException("Property not recognized: " + key); } } } } } if (!tagToBytesPerReducer.containsKey(DEFAULT)) { long defaultValue = DEFAULT_BYTES_PER_REDUCER; _log.info(String.format("No default bytes per reducer set, using %.2f MB",toMB(defaultValue))); tagToBytesPerReducer.put(DEFAULT, defaultValue); } } public void addInputPath(Path input) { addInputPath(DEFAULT,input); } public void addInputPath(String tag, Path input) { if (!inputPaths.contains(input)) { inputPaths.add(input); pathToTag.put(input, tag); } else { throw new RuntimeException("Already added input: " + input); } } public int getNumReducers() throws IOException { Map bytesPerTag = getTagToInputBytes(); double numReducers = 0.0; for (String tag : bytesPerTag.keySet()) { long bytes = bytesPerTag.get(tag); _log.info(String.format("Found %d bytes (%.2f GB) for inputs tagged with '%s'",bytes,toGB(bytes),tag)); Long bytesPerReducer = tagToBytesPerReducer.get(tag); if (bytesPerReducer == null) { bytesPerReducer = tagToBytesPerReducer.get(DEFAULT); if (bytesPerReducer == null) { throw new RuntimeException("Could not determine bytes per reducer"); } _log.info(String.format("No configured bytes per reducer for '%s', using default value of %.2f MB",tag,toMB(bytesPerReducer))); } else { _log.info(String.format("Using configured bytes per reducer for '%s' of %.2f MB",tag,toMB(bytesPerReducer))); } double partialNumReducers = bytes/(double)bytesPerReducer; _log.info(String.format("Reducers computed for '%s' is %.2f",tag,partialNumReducers)); numReducers += bytes/(double)bytesPerReducer; } int finalNumReducers = Math.max(1, (int)Math.ceil(numReducers)); _log.info(String.format("Final computed reducers is: %d",finalNumReducers)); return finalNumReducers; } private static double toGB(long bytes) { return bytes/(1024.0*1024.0*1024.0); } private static double toMB(long bytes) { return bytes/(1024.0*1024.0); } /** * Gets the total number of bytes per tag. * * @return Map from tag to total bytes * @throws IOException */ private Map getTagToInputBytes() throws IOException { Map result = new HashMap(); for (Path input : inputPaths) { long bytes = PathUtils.countBytes(fs, input); String tag = pathToTag.get(input); if (tag == null) throw new RuntimeException("Could not find tag for input: " + input); Long current = result.get(tag); if (current == null) current = 0L; current += bytes; result.put(tag, current); } return result; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy