
// Source: datafu.hourglass.jobs.ReduceEstimator (Maven / Gradle / Ivy)
/**
* Copyright 2013 LinkedIn, Inc
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package datafu.hourglass.jobs;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.Logger;
import datafu.hourglass.fs.PathUtils;
/**
 * Estimates the number of reducers needed based on input size.
 *
 * <p>
 * This sums the size of the inputs and uses bytes-per-reducer
 * settings to compute the number of reducers.  By default,
 * the bytes-per-reducer is 256 MB.  This means that if the
 * total input size is 1 GB, the total number of reducers
 * computed will be 4.
 * </p>
 *
 * <p>
 * The bytes-per-reducer can be configured through properties
 * provided in the constructor.  The default bytes-per-reducer
 * can be overridden by setting <em>num.reducers.bytes.per.reducer</em>.
 * For example, if 536870912 (512 MB) is used for this setting,
 * then 2 reducers would be used for 1 GB.
 * </p>
 *
 * <p>
 * The bytes-per-reducer can also be configured separately for
 * different types of inputs.  Inputs can be identified by a tag.
 * For example, if an input is tagged with <em>mydata</em>, then
 * the reducers for this input data can be configured with
 * <em>num.reducers.mydata.bytes.per.reducer</em>.  Only tags made up
 * of lowercase letters (a-z) can be configured this way; any other
 * property starting with <em>num.reducers.</em> is rejected.
 * </p>
 *
 * @author "Matthew Hayes"
 *
 */
public class ReduceEstimator
{
  private static final Logger _log = Logger.getLogger(ReduceEstimator.class);

  /** Tag used for inputs added without an explicit tag and for the fallback setting. */
  private static final String DEFAULT = "default";

  private static final long DEFAULT_BYTES_PER_REDUCER = 256L*1024L*1024L; // 256 MB

  /** Only properties starting with this prefix are inspected by the constructor. */
  private static final String PROPERTY_PREFIX = "num.reducers.";

  /** Property which overrides the default bytes-per-reducer. */
  private static final String DEFAULT_BYTES_PROPERTY = "num.reducers.bytes.per.reducer";

  /** Matches a tag-specific bytes-per-reducer property; group 1 is the tag. */
  private static final Pattern TAGGED_BYTES_PROPERTY =
      Pattern.compile("num\\.reducers\\.([a-z]+)\\.bytes\\.per\\.reducer");

  private final Set<Path> inputPaths = new HashSet<Path>();
  private final Map<Path,String> pathToTag = new HashMap<Path,String>();
  private final Map<String,Long> tagToBytesPerReducer = new HashMap<String,Long>();
  private final FileSystem fs;

  /**
   * Creates an estimator, reading bytes-per-reducer settings from the given properties.
   *
   * <p>
   * Properties which do not start with <em>num.reducers.</em> are ignored.  Property
   * names are enumerated with {@link Properties#stringPropertyNames()}, so entries from
   * a defaults list are considered and entries whose key or value is not a string are skipped.
   * </p>
   *
   * @param fs file system used to measure the size of the inputs
   * @param props configuration properties, may be null in which case only the built-in default applies
   * @throws IllegalArgumentException if a <em>num.reducers.</em> property is not recognized,
   *         or if a configured bytes-per-reducer is not a positive number
   *         (a non-numeric value surfaces as {@link NumberFormatException})
   */
  public ReduceEstimator(FileSystem fs, Properties props)
  {
    this.fs = fs;

    if (props != null)
    {
      for (String key : props.stringPropertyNames())
      {
        if (!key.startsWith(PROPERTY_PREFIX))
        {
          continue;
        }

        String tag;
        if (key.equals(DEFAULT_BYTES_PROPERTY))
        {
          tag = DEFAULT;
        }
        else
        {
          Matcher m = TAGGED_BYTES_PROPERTY.matcher(key);
          if (!m.matches())
          {
            throw new IllegalArgumentException("Property not recognized: " + key);
          }
          tag = m.group(1);
        }

        tagToBytesPerReducer.put(tag, parseBytesPerReducer(key, props.getProperty(key)));
      }
    }

    if (!tagToBytesPerReducer.containsKey(DEFAULT))
    {
      long defaultValue = DEFAULT_BYTES_PER_REDUCER;
      _log.info(String.format("No default bytes per reducer set, using %.2f MB",toMB(defaultValue)));
      tagToBytesPerReducer.put(DEFAULT, defaultValue);
    }
  }

  /**
   * Adds an input path using the default tag.
   *
   * @param input path to include in the estimate
   * @throws IllegalArgumentException if the path was already added
   */
  public void addInputPath(Path input)
  {
    addInputPath(DEFAULT,input);
  }

  /**
   * Adds an input path with the given tag.  A path may only be added once,
   * regardless of the tag it was added with.
   *
   * @param tag tag identifying the type of input, used to look up the bytes-per-reducer setting
   * @param input path to include in the estimate
   * @throws IllegalArgumentException if the path was already added
   */
  public void addInputPath(String tag, Path input)
  {
    // Set.add reports whether the element was new, so no separate contains() check is needed.
    if (!inputPaths.add(input))
    {
      throw new IllegalArgumentException("Already added input: " + input);
    }
    pathToTag.put(input, tag);
  }

  /**
   * Computes the number of reducers for the inputs added so far.
   *
   * <p>
   * For each tag the total input bytes are divided by the bytes-per-reducer configured
   * for that tag, falling back to the default setting when the tag has none.  The
   * fractional results are summed across tags and rounded up.
   * </p>
   *
   * @return the estimated number of reducers, always at least 1
   * @throws IOException if the input sizes cannot be determined
   */
  public int getNumReducers() throws IOException
  {
    double numReducers = 0.0;

    for (Map.Entry<String,Long> entry : getTagToInputBytes().entrySet())
    {
      String tag = entry.getKey();
      long bytes = entry.getValue();

      _log.info(String.format("Found %d bytes (%.2f GB) for inputs tagged with '%s'",bytes,toGB(bytes),tag));

      Long bytesPerReducer = tagToBytesPerReducer.get(tag);
      if (bytesPerReducer == null)
      {
        bytesPerReducer = tagToBytesPerReducer.get(DEFAULT);
        if (bytesPerReducer == null)
        {
          // The constructor always registers a default, so this guards against future regressions.
          throw new IllegalStateException("Could not determine bytes per reducer");
        }
        _log.info(String.format("No configured bytes per reducer for '%s', using default value of %.2f MB",tag,toMB(bytesPerReducer)));
      }
      else
      {
        _log.info(String.format("Using configured bytes per reducer for '%s' of %.2f MB",tag,toMB(bytesPerReducer)));
      }

      double partialNumReducers = bytes/(double)bytesPerReducer;
      _log.info(String.format("Reducers computed for '%s' is %.2f",tag,partialNumReducers));
      numReducers += partialNumReducers;
    }

    // Round up once over the sum (not per tag); the cast saturates at Integer.MAX_VALUE for huge values.
    int finalNumReducers = Math.max(1, (int)Math.ceil(numReducers));
    _log.info(String.format("Final computed reducers is: %d",finalNumReducers));
    return finalNumReducers;
  }

  /**
   * Parses a configured bytes-per-reducer value and checks that it is usable as a divisor.
   *
   * @param key name of the property the value came from, used for error reporting
   * @param value raw property value
   * @return the parsed, strictly positive number of bytes
   * @throws NumberFormatException if the value is not a valid long
   * @throws IllegalArgumentException if the value is zero or negative
   */
  private static long parseBytesPerReducer(String key, String value)
  {
    long parsed = Long.parseLong(value.trim());
    if (parsed <= 0L)
    {
      // Zero would make the reducer count infinite; a negative value would subtract reducers.
      throw new IllegalArgumentException("Bytes per reducer must be positive for " + key + ", but was: " + parsed);
    }
    return parsed;
  }

  private static double toGB(long bytes)
  {
    return bytes/(1024.0*1024.0*1024.0);
  }

  private static double toMB(long bytes)
  {
    return bytes/(1024.0*1024.0);
  }

  /**
   * Gets the total number of bytes per tag, using {@link PathUtils#countBytes}
   * to measure each input path.
   *
   * @return Map from tag to total bytes
   * @throws IOException if the size of an input cannot be determined
   */
  private Map<String,Long> getTagToInputBytes() throws IOException
  {
    Map<String,Long> result = new HashMap<String,Long>();
    for (Path input : inputPaths)
    {
      long bytes = PathUtils.countBytes(fs, input);
      String tag = pathToTag.get(input);
      if (tag == null)
      {
        throw new IllegalStateException("Could not find tag for input: " + input);
      }
      Long current = result.get(tag);
      result.put(tag, current == null ? bytes : current + bytes);
    }
    return result;
  }
}