com.twitter.elephantbird.util.SplitUtil Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of elephant-bird-core Show documentation
Core utilities.
There is a newer version: 4.17
package com.twitter.elephantbird.util;

import com.twitter.elephantbird.mapreduce.input.combine.CompositeInputSplit;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.serializer.Deserializer;
import org.apache.hadoop.io.serializer.SerializationFactory;
import org.apache.hadoop.io.serializer.Serializer;
import org.apache.hadoop.mapreduce.InputSplit;

import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.*;

import org.apache.hadoop.util.ReflectionUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * This code facilitates combining InputSplits, managing the locality vs
 * input size skew tradeoff. This code is adapted and cleaned up from
 * Apache Pig. It was copied to avoid having to depend on all of Pig for
 * this utility.
 *
 * @author Jonathan Coveney
 */
public class SplitUtil {
  private static final Logger LOG = LoggerFactory.getLogger(SplitUtil.class);

  public static final  String COMBINE_SPLIT_SIZE = "elephantbird.combine.split.size";

  private static long getCombinedSplitSize(Configuration conf) throws IOException {
    long splitSize = conf.getLong(COMBINE_SPLIT_SIZE, -1);
    if (splitSize == -1) {
      splitSize = FileSystem.get(conf).getDefaultBlockSize(new Path("."));
    }
    return splitSize;
  }

  private static class Node {
    private long length = 0;
    private List splits;
    private boolean sorted;

    public Node() throws IOException, InterruptedException {
      length = 0;
      splits = new ArrayList();
      sorted = false;
    }

    public void add(ComparableSplit split) throws IOException, InterruptedException {
      splits.add(split);
      length++;
    }

    public void remove(ComparableSplit split) {
      if (!sorted) {
        sort();
      }
      int index = Collections.binarySearch(splits, split);
      if (index >= 0) {
        splits.remove(index);
        length--;
      }
    }

    public void sort() {
      if (!sorted) {
        Collections.sort(splits);
        sorted = true;
      }
    }

    public List getSplits() {
      return splits;
    }

    public long getLength() {
      return length;
    }
  }

  private static Comparator nodeComparator = new Comparator() {
    @Override
    public int compare(Node o1, Node o2) {
      return Long.signum(o1.length - o2.length);
    }
  };

  private static final class ComparableSplit implements Comparable {
    private InputSplit rawInputSplit;
    private Set nodes;
    // id used as a tie-breaker when two splits are of equal size.
    private long id;
    public ComparableSplit(InputSplit split, long id) {
      rawInputSplit = split;
      nodes = new HashSet();
      this.id = id;
    }

    public void add(Node node) {
      nodes.add(node);
    }

    public void removeFromNodes() {
      for (Node node : nodes) {
        node.remove(this);
      }
    }

    public InputSplit getSplit() {
      return rawInputSplit;
    }

    @Override
    public boolean equals(Object other) {
      if (other == null || !(other instanceof ComparableSplit))
        return false;
      return (compareTo((ComparableSplit) other) == 0);
    }

    @Override
    public int hashCode() {
      int hashCode = 31 + rawInputSplit.hashCode();
      for (Node node : nodes) {
        hashCode = 17 * hashCode + node.hashCode();
      }
      return hashCode;
    }

    @Override
    public int compareTo(ComparableSplit other) {
      try {
        int cmp = -Long.signum(rawInputSplit.getLength() - other.rawInputSplit.getLength());
        // in descending order
        return cmp == 0 ? Long.signum(id - other.id) : cmp;
      } catch (IOException e) {
        throw new RuntimeException(e);
      } catch (InterruptedException e) {
        throw new RuntimeException(e);
      }
    }
  }

  private static class DummySplit extends InputSplit {
    private long length;

    @Override
    public String[] getLocations() {
      return null;
    }

    @Override
    public long getLength() {
      return length;
    }

    public void setLength(long length) {
      this.length = length;
    }
  }

  private static void removeSplits(List splits) {
    for (ComparableSplit split: splits) {
      split.removeFromNodes();
    }
  }

  public static List> getCombinedSplits(
          List oneInputSplits, long maxCombinedSplitSize, Configuration conf)
          throws IOException, InterruptedException {
    List nodes = new ArrayList();
    Map nodeMap = new HashMap();
    List> result = new ArrayList>();
    List resultLengths = new ArrayList();
    long comparableSplitId = 0;

    int size = 0, nSplits = oneInputSplits.size();
    InputSplit lastSplit = null;
    int emptyCnt = 0;
    for (InputSplit split : oneInputSplits) {
      if (split.getLength() == 0) {
        emptyCnt++;
        continue;
      }
      if (split.getLength() >= maxCombinedSplitSize) {
        comparableSplitId++;
        List combinedSplits = new ArrayList();
        combinedSplits.add(split);
        result.add(combinedSplits);
        resultLengths.add(split.getLength());
      } else {
        ComparableSplit csplit = new ComparableSplit(split, comparableSplitId++);
        String[] locations = split.getLocations();
        // sort the locations to stabilize the number of maps: PIG-1757
        Arrays.sort(locations);
        Set locationSeen = new HashSet();
        for (String location : locations) {
          if (!locationSeen.contains(location)) {
            Node node = nodeMap.get(location);
            if (node == null) {
              node = new Node();
              nodes.add(node);
              nodeMap.put(location, node);
            }
            node.add(csplit);
            csplit.add(node);
            locationSeen.add(location);
          }
        }
        lastSplit = split;
        size++;
      }
    }

    if (nSplits > 0 && emptyCnt == nSplits) {
      // if all splits are empty, add a single empty split as currently an empty directory is
      // not properly handled somewhere
      List combinedSplits = new ArrayList();
      combinedSplits.add(oneInputSplits.get(0));
      result.add(combinedSplits);
    } else if (size == 1) {
      List combinedSplits = new ArrayList();
      combinedSplits.add(lastSplit);
      result.add(combinedSplits);
    } else if (size > 1) {
      // combine small splits
      Collections.sort(nodes, nodeComparator);
      DummySplit dummy = new DummySplit();
      // dummy is used to search for next split of suitable size to be combine
      ComparableSplit dummyComparableSplit = new ComparableSplit(dummy, -1);
      for (Node node : nodes) {
        // sort the splits on this node in descending order
        node.sort();
        long totalSize = 0;
        List splits = node.getSplits();
        int idx;
        int lenSplits;
        List combinedSplits = new ArrayList();
        List combinedComparableSplits = new ArrayList();
        while (!splits.isEmpty()) {
          combinedSplits.add(splits.get(0).getSplit());
          combinedComparableSplits.add(splits.get(0));
          int startIdx = 1;
          lenSplits = splits.size();
          totalSize += splits.get(0).getSplit().getLength();
          long spaceLeft = maxCombinedSplitSize - totalSize;
          dummy.setLength(spaceLeft);
          idx = Collections.binarySearch(node.getSplits().subList(startIdx, lenSplits), dummyComparableSplit);
          idx = -idx-1+startIdx;
          while (idx < lenSplits) {
            long thisLen = splits.get(idx).getSplit().getLength();
            combinedSplits.add(splits.get(idx).getSplit());
            combinedComparableSplits.add(splits.get(idx));
            totalSize += thisLen;
            spaceLeft -= thisLen;
            if (spaceLeft <= 0)
              break;
            // find next combinable chunk
            startIdx = idx + 1;
            if (startIdx >= lenSplits)
              break;
            dummy.setLength(spaceLeft);
            idx = Collections.binarySearch(node.getSplits().subList(startIdx, lenSplits), dummyComparableSplit);
            idx = -idx-1+startIdx;
          }
          if (totalSize > maxCombinedSplitSize/2) {
            result.add(combinedSplits);
            resultLengths.add(totalSize);
            removeSplits(combinedComparableSplits);
            totalSize = 0;
            combinedSplits = new ArrayList();
            combinedComparableSplits.clear();
            splits = node.getSplits();
          } else {
            if (combinedSplits.size() != lenSplits)
              throw new AssertionError("Combined split logic error!");
            break;
          }
        }
      }
      // handle leftovers
      List leftoverSplits = new ArrayList();
      Set seen = new HashSet();
      for (Node node : nodes) {
        for (ComparableSplit split : node.getSplits()) {
          if (!seen.contains(split.getSplit())) {
            // remove duplicates. The set has to be on the raw input split not the
            // comparable input split as the latter overrides the compareTo method
            // so its equality semantics is changed and not we want here
            seen.add(split.getSplit());
            leftoverSplits.add(split);
          }
        }
      }

      if (!leftoverSplits.isEmpty()) {
        long totalSize = 0;
        List combinedSplits = new ArrayList();
        List combinedComparableSplits = new ArrayList();

        int splitLen = leftoverSplits.size();
        for (int i = 0; i < splitLen; i++) {
          ComparableSplit split = leftoverSplits.get(i);
          long thisLen = split.getSplit().getLength();
          if (totalSize + thisLen >= maxCombinedSplitSize) {
            removeSplits(combinedComparableSplits);
            result.add(combinedSplits);
            resultLengths.add(totalSize);
            combinedSplits = new ArrayList();
            combinedComparableSplits.clear();
            totalSize = 0;
          }
          combinedSplits.add(split.getSplit());
          combinedComparableSplits.add(split);
          totalSize += split.getSplit().getLength();
          if (i == splitLen - 1) {
            // last piece: it could be very small, try to see it can be squeezed into any existing splits
            for (int j =0; j < result.size(); j++) {
              if (resultLengths.get(j) + totalSize <= maxCombinedSplitSize) {
                List isList = result.get(j);
                for (InputSplit csplit : combinedSplits) {
                  isList.add(csplit);
                }
                removeSplits(combinedComparableSplits);
                combinedSplits.clear();
                break;
              }
            }
            if (!combinedSplits.isEmpty()) {
              // last piece can not be squeezed in, create a new combine split for them.
              removeSplits(combinedComparableSplits);
              result.add(combinedSplits);
            }
          }
        }
      }
    }
    LOG.info("Original input paths (" + oneInputSplits.size() + ") combine into (" + result.size() + ")");
    return result;
  }

  public static List getCombinedCompositeSplits(
          List oneInputSplits, long maxCombinedSplitSize, Configuration conf)
          throws IOException, InterruptedException {
    List compositeInputSplits = new ArrayList(oneInputSplits.size());
    for (List inputSplits : getCombinedSplits(oneInputSplits, maxCombinedSplitSize, conf)) {
      compositeInputSplits.add(new CompositeInputSplit(inputSplits));
    }
    return compositeInputSplits;
  }

  public static List getCombinedCompositeSplits(
          List oneInputSplits, Configuration conf)
          throws IOException, InterruptedException {
    return getCombinedCompositeSplits(oneInputSplits, getCombinedSplitSize(conf), conf);
  }

  public static void serializeInputSplit(Configuration conf, DataOutputStream out, InputSplit split)
      throws IOException {
    Class clazz = split.getClass().asSubclass(InputSplit.class);
    Text.writeString(out, clazz.getName());
    SerializationFactory factory = new SerializationFactory(conf);
    Serializer serializer = factory.getSerializer(clazz);
    serializer.open(out instanceof UncloseableDataOutputStream ? out : new UncloseableDataOutputStream(out));
    serializer.serialize(split);
  }

  public static InputSplit deserializeInputSplit(Configuration conf, DataInputStream in) throws IOException {
    String name = Text.readString(in);
    Class clazz;
    try {
      clazz = conf.getClassByName(name).asSubclass(InputSplit.class);
    } catch (ClassNotFoundException e) {
      throw new IOException("Could not find class for deserialized class name: " + name, e);
    }
    return deserializeInputSplitInternal(
        conf, in instanceof UncloseableDataInputStream ? in : new UncloseableDataInputStream(in), clazz);
  }

  private static  T deserializeInputSplitInternal(
      Configuration conf, DataInputStream in, Class clazz) throws IOException {
    T split = ReflectionUtils.newInstance(clazz, conf);
    SerializationFactory factory = new SerializationFactory(conf);
    Deserializer deserializer = factory.getDeserializer(clazz);
    deserializer.open(in instanceof UncloseableDataInputStream ? in : new UncloseableDataInputStream(in));
    return deserializer.deserialize(split);
  }

  private static class UncloseableDataOutputStream extends DataOutputStream {
    public UncloseableDataOutputStream(DataOutputStream os) {
      super(os);
    }

    @Override
    public void close() {
      // We don't want classes given this stream to close it
    }
  }

  private static class UncloseableDataInputStream extends DataInputStream {
    public UncloseableDataInputStream(DataInputStream is) {
      super(is);
    }

    @Override
    public void close() {
      // We don't want classes given this stream to close it
    }
  }

}