/*
* Copyright [2013-2014] PayPal Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ml.shifu.guagua.yarn.util;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import ml.shifu.guagua.GuaguaRuntimeException;
import ml.shifu.guagua.hadoop.io.GuaguaInputSplit;
import ml.shifu.guagua.yarn.GuaguaYarnConstants;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Helper class to get input splits.
*/
public final class InputSplitUtils {
private static final Logger LOG = LoggerFactory.getLogger(InputSplitUtils.class);
// Private constructor to prevent instantiation of this utility class.
private InputSplitUtils() {
}
/**
* Combine small splits into {@link GuaguaInputSplit}s. Adapted from the Pig implementation
* (see PIG-1518); the logic should still be double-checked.
*/
public static List<InputSplit> getFinalCombineGuaguaSplits(List<InputSplit> newSplits, long combineSize)
throws IOException {
List<List<InputSplit>> combinePigSplits;
try {
combinePigSplits = getCombineGuaguaSplits(newSplits, combineSize);
} catch (InterruptedException e) {
throw new GuaguaRuntimeException(e);
}
newSplits = new ArrayList<InputSplit>();
for(List<InputSplit> inputSplits: combinePigSplits) {
FileSplit[] fss = new FileSplit[inputSplits.size()];
for(int i = 0; i < inputSplits.size(); i++) {
fss[i] = (FileSplit) (inputSplits.get(i));
}
newSplits.add(new GuaguaInputSplit(false, fss));
}
return newSplits;
}
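// Illustrative usage sketch (not part of the original source; the input path and the
// split/combine sizes are assumptions): build raw file splits, then combine them into
// GuaguaInputSplits capped at roughly 256 MB each.
//
// Configuration conf = new Configuration();
// conf.set(GuaguaYarnConstants.GUAGUA_YARN_INPUT_DIR, "/user/demo/input");
// List<InputSplit> raw = InputSplitUtils.getFileSplits(conf, 128L * 1024 * 1024);
// List<InputSplit> combined =
//         InputSplitUtils.getFinalCombineGuaguaSplits(raw, 256L * 1024 * 1024);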
/**
* List all input files, excluding Pig and Hadoop meta files. Ideally this should follow
* FileInputFormat#listStatus.
*/
public static FileStatus[] listStatus(Configuration conf) throws IOException {
String newPath = expandInputFolder(conf);
// Get all files except pig or hadoop meta
FileStatus[] fileStatus = FileSystem.get(conf).globStatus(new Path(newPath), new PathFilter() {
@Override
public boolean accept(Path path) {
return !isPigOrHadoopMetaFile(path);
}
});
return fileStatus;
}
/**
* Expand an input folder into a glob over all files it contains, so that every file in the
* folder is used as input.
*/
public static String expandInputFolder(Configuration conf) throws IOException {
Path path = new Path(conf.get(GuaguaYarnConstants.GUAGUA_YARN_INPUT_DIR));
String newPath = path.toString();
return FileSystem.get(conf).getFileStatus(path).isDirectory() ? newPath + Path.SEPARATOR + "*" : newPath;
}
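// For example (hypothetical path): a directory input "/user/demo/input" expands to the
// glob "/user/demo/input/*", while a plain file path is returned unchanged.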
/**
* Generate the list of files and make them into FileSplits.
*/
public static List<InputSplit> getFileSplits(Configuration conf, long splitSize) throws IOException {
// generate splits
List<InputSplit> splits = new ArrayList<InputSplit>();
FileStatus[] files = listStatus(conf);
for(FileStatus file: files) {
Path path = file.getPath();
if(isPigOrHadoopMetaFile(path)) {
continue;
}
FileSystem fs = path.getFileSystem(conf);
long length = file.getLen();
BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
if((length != 0) && isSplitable(conf, path)) {
long bytesRemaining = length;
while(((double) bytesRemaining) / splitSize > GuaguaYarnConstants.SPLIT_SLOP) {
int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
splits.add(new FileSplit(path, length - bytesRemaining, splitSize, blkLocations[blkIndex]
.getHosts()));
bytesRemaining -= splitSize;
}
if(bytesRemaining != 0) {
splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
blkLocations[blkLocations.length - 1].getHosts()));
}
} else if(length != 0) {
splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
} else {
// Create empty hosts array for zero length files
splits.add(new FileSplit(path, 0, length, new String[0]));
}
}
LOG.debug("Total # of splits: {}", splits.size());
return splits;
}
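// Worked example (illustrative; assumes SPLIT_SLOP is 1.1 as in Hadoop's FileInputFormat):
// for a 200 MB file with splitSize = 64 MB, the loop emits 64 MB splits at offsets 0, 64
// and 128 MB (since 200/64, 136/64 and 72/64 all exceed 1.1), and the remaining 8 MB
// becomes a final tail split, i.e. four splits of 64, 64, 64 and 8 MB.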
/**
* Generate the list of files and wrap each resulting FileSplit into a {@link GuaguaInputSplit}.
*/
public static List<InputSplit> getGuaguaSplits(Configuration conf, long splitSize) throws IOException {
// generate splits
List<InputSplit> splits = new ArrayList<InputSplit>();
FileStatus[] files = listStatus(conf);
for(FileStatus file: files) {
Path path = file.getPath();
if(isPigOrHadoopMetaFile(path)) {
continue;
}
FileSystem fs = path.getFileSystem(conf);
long length = file.getLen();
BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
if((length != 0) && isSplitable(conf, path)) {
long bytesRemaining = length;
while(((double) bytesRemaining) / splitSize > GuaguaYarnConstants.SPLIT_SLOP) {
int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
splits.add(new GuaguaInputSplit(false, new FileSplit[] { new FileSplit(path, length
- bytesRemaining, splitSize, blkLocations[blkIndex].getHosts()) }));
bytesRemaining -= splitSize;
}
if(bytesRemaining != 0) {
splits.add(new GuaguaInputSplit(false, new FileSplit[] { new FileSplit(path, length
- bytesRemaining, bytesRemaining, blkLocations[blkLocations.length - 1].getHosts()) }));
}
} else if(length != 0) {
splits.add(new GuaguaInputSplit(false, new FileSplit[] { new FileSplit(path, 0, length, blkLocations[0]
.getHosts()) }));
} else {
// Create empty hosts array for zero length files
splits.add(new GuaguaInputSplit(false,
new FileSplit[] { new FileSplit(path, 0, length, new String[0]) }));
}
}
LOG.debug("Total # of splits: {}", splits.size());
return splits;
}
public static int getBlockIndex(BlockLocation[] blkLocations, long offset) {
for(int i = 0; i < blkLocations.length; i++) {
// is the offset inside this block?
if((blkLocations[i].getOffset() <= offset)
&& (offset < blkLocations[i].getOffset() + blkLocations[i].getLength())) {
return i;
}
}
BlockLocation last = blkLocations[blkLocations.length - 1];
long fileLength = last.getOffset() + last.getLength() - 1;
throw new IllegalArgumentException("Offset " + offset + " is outside of file (0.." + fileLength + ")");
}
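// Example (illustrative): with two 128 MB blocks at offsets 0 and 134217728, an offset of
// 200000000 falls inside the second block, so getBlockIndex returns 1; an offset beyond
// the end of the last block throws IllegalArgumentException.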
private static final class ComparableSplit implements Comparable<ComparableSplit> {
private InputSplit rawInputSplit;
private HashSet<Node> nodes;
// id used as a tie-breaker when two splits are of equal size.
private long id;
ComparableSplit(InputSplit split, long id) {
rawInputSplit = split;
nodes = new HashSet<Node>();
this.id = id;
}
void add(Node node) {
nodes.add(node);
}
void removeFromNodes() {
for(Node node: nodes)
node.remove(this);
}
public InputSplit getSplit() {
return rawInputSplit;
}
@Override
public boolean equals(Object other) {
if(other == null || !(other instanceof ComparableSplit))
return false;
return (compareTo((ComparableSplit) other) == 0);
}
@Override
public int hashCode() {
// Constant hash code: equality is defined by compareTo, so equal objects trivially hash alike.
return 41;
}
@Override
public int compareTo(ComparableSplit other) {
try {
long cmp = rawInputSplit.getLength() - other.rawInputSplit.getLength();
// in descending order
return cmp == 0 ? (id == other.id ? 0 : id < other.id ? -1 : 1) : cmp < 0 ? 1 : -1;
} catch (IOException e) {
throw new RuntimeException(e);
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
}
}
// Dummy split used only as a binary-search key on split length.
private static class DummySplit extends InputSplit {
private long length;
@Override
public String[] getLocations() {
return null;
}
@Override
public long getLength() {
return length;
}
public void setLength(long length) {
this.length = length;
}
}
private static class Node {
// Number of splits attached to this node (host).
private long length = 0;
private ArrayList<ComparableSplit> splits;
private boolean sorted;
Node() throws IOException, InterruptedException {
length = 0;
splits = new ArrayList<ComparableSplit>();
sorted = false;
}
void add(ComparableSplit split) throws IOException, InterruptedException {
splits.add(split);
length++;
}
void remove(ComparableSplit split) {
if(!sorted)
sort();
int index = Collections.binarySearch(splits, split);
if(index >= 0) {
splits.remove(index);
length--;
}
}
void sort() {
if(!sorted) {
Collections.sort(splits);
sorted = true;
}
}
ArrayList<ComparableSplit> getSplits() {
return splits;
}
@SuppressWarnings("unused")
public long getLength() {
return length;
}
}
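// Note (explanatory, not in the original source): Node models one host; each
// ComparableSplit registers itself with every Node where a replica of its block lives,
// so once a split is chosen, removeFromNodes() drops it from all other hosts'
// candidate lists as well.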
public static List<List<InputSplit>> getCombineGuaguaSplits(List<InputSplit> oneInputSplits,
long maxCombinedSplitSize) throws IOException, InterruptedException {
ArrayList<Node> nodes = new ArrayList<Node>();
HashMap<String, Node> nodeMap = new HashMap<String, Node>();
List<List<InputSplit>> result = new ArrayList<List<InputSplit>>();
List<Long> resultLengths = new ArrayList<Long>();
long comparableSplitId = 0;
int size = 0, nSplits = oneInputSplits.size();
InputSplit lastSplit = null;
int emptyCnt = 0;
for(InputSplit split: oneInputSplits) {
if(split.getLength() == 0) {
emptyCnt++;
continue;
}
if(split.getLength() >= maxCombinedSplitSize) {
comparableSplitId++;
ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
combinedSplits.add(split);
result.add(combinedSplits);
resultLengths.add(split.getLength());
} else {
ComparableSplit csplit = new ComparableSplit(split, comparableSplitId++);
String[] locations = split.getLocations();
// sort the locations to stabilize the number of maps: PIG-1757
Arrays.sort(locations);
HashSet<String> locationSeen = new HashSet<String>();
for(String location: locations) {
if(!locationSeen.contains(location)) {
Node node = nodeMap.get(location);
if(node == null) {
node = new Node();
nodes.add(node);
nodeMap.put(location, node);
}
node.add(csplit);
csplit.add(node);
locationSeen.add(location);
}
}
lastSplit = split;
size++;
}
}
if(nSplits > 0 && emptyCnt == nSplits) {
// If all splits are empty, add a single empty split, since an empty directory is
// currently not handled properly downstream.
ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
combinedSplits.add(oneInputSplits.get(0));
result.add(combinedSplits);
} else if(size == 1) {
ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
combinedSplits.add(lastSplit);
result.add(combinedSplits);
} else if(size > 1) {
// combine small splits
Collections.sort(nodes, nodeComparator);
DummySplit dummy = new DummySplit();
// dummy is used to search for next split of suitable size to be combined
ComparableSplit dummyComparableSplit = new ComparableSplit(dummy, -1);
for(Node node: nodes) {
// sort the splits on this node in descending order
node.sort();
long totalSize = 0;
ArrayList<ComparableSplit> splits = node.getSplits();
int idx;
int lenSplits;
ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
ArrayList<ComparableSplit> combinedComparableSplits = new ArrayList<ComparableSplit>();
while(!splits.isEmpty()) {
combinedSplits.add(splits.get(0).getSplit());
combinedComparableSplits.add(splits.get(0));
int startIdx = 1;
lenSplits = splits.size();
totalSize += splits.get(0).getSplit().getLength();
long spaceLeft = maxCombinedSplitSize - totalSize;
dummy.setLength(spaceLeft);
idx = Collections.binarySearch(node.getSplits().subList(startIdx, lenSplits), dummyComparableSplit);
idx = -idx - 1 + startIdx;
while(idx < lenSplits) {
long thisLen = splits.get(idx).getSplit().getLength();
combinedSplits.add(splits.get(idx).getSplit());
combinedComparableSplits.add(splits.get(idx));
totalSize += thisLen;
spaceLeft -= thisLen;
if(spaceLeft <= 0)
break;
// find next combinable chunk
startIdx = idx + 1;
if(startIdx >= lenSplits)
break;
dummy.setLength(spaceLeft);
idx = Collections.binarySearch(node.getSplits().subList(startIdx, lenSplits),
dummyComparableSplit);
idx = -idx - 1 + startIdx;
}
if(totalSize > maxCombinedSplitSize / 2) {
result.add(combinedSplits);
resultLengths.add(totalSize);
removeSplits(combinedComparableSplits);
totalSize = 0;
combinedSplits = new ArrayList<InputSplit>();
combinedComparableSplits.clear();
splits = node.getSplits();
} else {
if(combinedSplits.size() != lenSplits)
throw new AssertionError("Combined split logic error!");
break;
}
}
}
// handle leftovers
ArrayList<ComparableSplit> leftoverSplits = new ArrayList<ComparableSplit>();
HashSet<InputSplit> seen = new HashSet<InputSplit>();
for(Node node: nodes) {
for(ComparableSplit split: node.getSplits()) {
if(!seen.contains(split.getSplit())) {
// Remove duplicates. The set has to hold the raw input split, not the
// comparable split, since the latter overrides compareTo and thus changes
// the equality semantics, which is not what we want here.
seen.add(split.getSplit());
leftoverSplits.add(split);
}
}
}
if(!leftoverSplits.isEmpty()) {
long totalSize = 0;
ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
ArrayList<ComparableSplit> combinedComparableSplits = new ArrayList<ComparableSplit>();
int splitLen = leftoverSplits.size();
for(int i = 0; i < splitLen; i++) {
ComparableSplit split = leftoverSplits.get(i);
long thisLen = split.getSplit().getLength();
if(totalSize + thisLen >= maxCombinedSplitSize) {
removeSplits(combinedComparableSplits);
result.add(combinedSplits);
resultLengths.add(totalSize);
combinedSplits = new ArrayList<InputSplit>();
combinedComparableSplits.clear();
totalSize = 0;
}
combinedSplits.add(split.getSplit());
combinedComparableSplits.add(split);
totalSize += split.getSplit().getLength();
if(i == splitLen - 1) {
// Last piece: it could be very small; try to squeeze it into an existing combined split.
for(int j = 0; j < result.size(); j++) {
if(resultLengths.get(j) + totalSize <= maxCombinedSplitSize) {
List<InputSplit> isList = result.get(j);
for(InputSplit csplit: combinedSplits) {
isList.add(csplit);
}
removeSplits(combinedComparableSplits);
combinedSplits.clear();
break;
}
}
if(!combinedSplits.isEmpty()) {
// The last piece cannot be squeezed in; create a new combined split for it.
removeSplits(combinedComparableSplits);
result.add(combinedSplits);
}
}
}
}
}
LOG.info("Total input paths (combined) to process : {}", result.size());
return result;
}
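// Worked example (illustrative, values are assumptions): with maxCombinedSplitSize =
// 128 MB and splits of 60, 50, 40 and 30 MB on a single node, the algorithm starts from
// the largest split (60), binary-searches for the largest remaining split that still fits
// (50, leaving 18 MB of room, which neither 40 nor 30 fits into), and emits [60, 50]
// because 110 MB exceeds half the cap; the pass then continues and emits [40, 30] as the
// next combined split.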
/*
* The following code supports split combination; see PIG-1518.
*/
private static Comparator<Node> nodeComparator = new Comparator<Node>() {
@Override
public int compare(Node o1, Node o2) {
long cmp = o1.length - o2.length;
return cmp == 0 ? 0 : cmp < 0 ? -1 : 1;
}
};
private static void removeSplits(List<ComparableSplit> splits) {
for(ComparableSplit split: splits)
split.removeFromNodes();
}
/**
* Whether the given path is a Pig or Hadoop meta file (success marker, Pig header or schema).
*/
private static boolean isPigOrHadoopMetaFile(Path path) {
return path.toString().indexOf(GuaguaYarnConstants.HADOOP_SUCCESS) >= 0
|| path.toString().indexOf(GuaguaYarnConstants.PIG_HEADER) >= 0
|| path.toString().indexOf(GuaguaYarnConstants.PIG_SCHEMA) >= 0;
}
private static boolean isSplitable(Configuration conf, Path file) {
// Compressed files cannot be split; LZO, which supports splittable compression via an
// index, could be added to the splittable list later.
CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(file);
return codec == null;
}
}