// ml.shifu.guagua.hadoop.GuaguaMRUnitDriver (Maven / Gradle / Ivy artifact listing)
/*
* Copyright [2013-2014] PayPal Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ml.shifu.guagua.hadoop;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import ml.shifu.guagua.io.Bytable;
import ml.shifu.guagua.io.GuaguaFileSplit;
import ml.shifu.guagua.unit.GuaguaUnitDriver;
import ml.shifu.guagua.yarn.GuaguaYarnConstants;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.util.StringUtils;
/**
 * {@link GuaguaMRUnitDriver} is used to run an in-memory guagua application by using hadoop MapReduce splits.
 *
 * @param <MASTER_RESULT>
 *            master result for computation in each iteration.
 * @param <WORKER_RESULT>
 *            worker result for computation in each iteration.
 *
 * @see ml.shifu.guagua.mapreduce.example.sum.SumTest in guagua-mapreduce-examples project.
 */
public class GuaguaMRUnitDriver<MASTER_RESULT extends Bytable, WORKER_RESULT extends Bytable> extends
        GuaguaUnitDriver<MASTER_RESULT, WORKER_RESULT> {

    /**
     * The only constructor here for a local in-memory guagua job.
     *
     * @param props
     *            all the configurations like input, output and so on.
     *
     * @see ml.shifu.guagua.mapreduce.example.sum.SumTest in guagua-mapreduce-examples project.
     */
    public GuaguaMRUnitDriver(Properties props) {
        super(props);
    }

    /**
     * Whether the path is a pig or hadoop meta output file (hadoop _SUCCESS flag, pig header or pig schema file).
     * Such files contain no real records and are skipped when generating splits.
     */
    private boolean isPigOrHadoopMetaFile(Path path) {
        // compute the string form once instead of three times
        String pathStr = path.toString();
        return pathStr.contains(GuaguaYarnConstants.HADOOP_SUCCESS)
                || pathStr.contains(GuaguaYarnConstants.PIG_HEADER)
                || pathStr.contains(GuaguaYarnConstants.PIG_SCHEMA);
    }

    /**
     * Check whether a file is splitable in HDFS: bzip2 files are splitable, any other recognized compression codec
     * is not, and uncompressed files (no codec found) are splitable.
     */
    private boolean isSplitable(Configuration conf, Path file) {
        // bzip2 can be split.
        if(file.getName().endsWith(GuaguaYarnConstants.BZ2)) {
            return true;
        }
        // other compression can not be split, maybe for lzo I should add it to split list.
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(file);
        return codec == null;
    }

    /**
     * Generate worker splits from the given input paths: splitable files are cut into block-sized chunks
    * (with a slop factor so the tail chunk is not tiny); non-splitable, non-empty files become one split each.
     *
     * @param inputs
     *            comma-separated input paths (files or directories).
     * @return list of splits; each element is a one-element array wrapping a {@link GuaguaFileSplit}.
     * @throws IOException
     *             if input paths are missing or cannot be listed.
     */
    @Override
    public List<GuaguaFileSplit[]> generateWorkerSplits(String inputs) throws IOException {
        List<GuaguaFileSplit[]> splits = new ArrayList<GuaguaFileSplit[]>();
        Configuration conf = new Configuration();
        // generate splits
        List<FileStatus> files = listStatus(conf, inputs);
        for(FileStatus file: files) {
            Path path = file.getPath();
            // skip hadoop/pig meta output files like _SUCCESS, .pig_header, .pig_schema
            if(isPigOrHadoopMetaFile(path)) {
                continue;
            }
            long length = file.getLen();
            if((length != 0) && isSplitable(conf, path)) {
                long splitSize = file.getBlockSize();
                long bytesRemaining = length;
                // emit full block-sized splits while the remainder is more than SPLIT_SLOP blocks
                while(((double) bytesRemaining) / splitSize > GuaguaYarnConstants.SPLIT_SLOP) {
                    splits.add(new GuaguaFileSplit[] { new GuaguaFileSplit(path.toString(), length - bytesRemaining,
                            splitSize) });
                    bytesRemaining -= splitSize;
                }
                // the tail split carries whatever is left (may be up to SPLIT_SLOP * splitSize bytes)
                if(bytesRemaining != 0) {
                    splits.add(new GuaguaFileSplit[] { new GuaguaFileSplit(path.toString(), length - bytesRemaining,
                            bytesRemaining) });
                }
            } else if(length != 0) {
                // non-splitable (compressed) file: one split covering the whole file
                splits.add(new GuaguaFileSplit[] { new GuaguaFileSplit(path.toString(), 0, length) });
            }
            // zero-length files produce no split at all
        }
        return splits;
    }

    /**
     * Get the list of input {@link Path}s for the map-reduce job from a comma-separated (escaped) string.
     */
    private static Path[] getInputPaths(String inputs) {
        String[] list = StringUtils.split(inputs);
        Path[] result = new Path[list.length];
        for(int i = 0; i < list.length; i++) {
            result[i] = new Path(StringUtils.unEscapeString(list[i]));
        }
        return result;
    }

    /** Rejects hidden files and hadoop/pig meta files whose names start with '_' or '.'. */
    private static final PathFilter hiddenFileFilter = new PathFilter() {
        @Override
        public boolean accept(Path p) {
            String name = p.getName();
            return !name.startsWith("_") && !name.startsWith(".");
        }
    };

    /**
     * Proxy PathFilter that accepts a path only if all filters given in the constructor do. Used by the listPaths() to
     * apply the built-in hiddenFileFilter together with a user provided one (if any).
     */
    private static class MultiPathFilter implements PathFilter {

        private List<PathFilter> filters;

        public MultiPathFilter(List<PathFilter> filters) {
            this.filters = filters;
        }

        @Override
        public boolean accept(Path path) {
            for(PathFilter filter: filters) {
                if(!filter.accept(path)) {
                    return false;
                }
            }
            return true;
        }
    }

    /**
     * List input files under the given input paths, expanding globs and one level of directories.
     * Subclasses may override to, e.g., select only files matching a regular expression.
     *
     * @param conf
     *            hadoop configuration used to resolve the file system.
     * @param input
     *            comma-separated input paths (files, directories or globs).
     * @return list of FileStatus objects for all matched, non-hidden files.
     * @throws IOException
     *             if no input paths are specified, or if any input path does not exist or matches zero files.
     */
    protected List<FileStatus> listStatus(Configuration conf, String input) throws IOException {
        List<FileStatus> result = new ArrayList<FileStatus>();
        Path[] dirs = getInputPaths(input);
        if(dirs.length == 0) {
            throw new IOException("No input paths specified in job");
        }

        // collect all path errors first so the exception reports every bad input at once
        List<IOException> errors = new ArrayList<IOException>();

        // creates a MultiPathFilter with the hiddenFileFilter and the
        // user provided one (if any).
        List<PathFilter> filters = new ArrayList<PathFilter>();
        filters.add(hiddenFileFilter);
        PathFilter inputFilter = new MultiPathFilter(filters);

        for(int i = 0; i < dirs.length; ++i) {
            Path p = dirs[i];
            FileSystem fs = p.getFileSystem(conf);
            FileStatus[] matches = fs.globStatus(p, inputFilter);
            if(matches == null) {
                errors.add(new IOException("Input path does not exist: " + p));
            } else if(matches.length == 0) {
                errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
            } else {
                for(FileStatus globStat: matches) {
                    if(globStat.isDirectory()) {
                        // expand one level: take the directory's (filtered) children
                        for(FileStatus stat: fs.listStatus(globStat.getPath(), inputFilter)) {
                            result.add(stat);
                        }
                    } else {
                        result.add(globStat);
                    }
                }
            }
        }

        if(!errors.isEmpty()) {
            throw new IOException(errors.toString());
        }
        return result;
    }
}