// org.apache.hadoop.hive.ql.exec.spark.SortByShuffler Maven / Gradle / Ivy
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.exec.spark;
import org.apache.hadoop.hive.ql.io.HiveKey;
import org.apache.hadoop.io.BytesWritable;
import org.apache.spark.HashPartitioner;
import org.apache.spark.Partitioner;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import scala.Tuple2;
import java.util.*;
public class SortByShuffler implements SparkShuffler {
private final boolean totalOrder;
/**
* @param totalOrder whether this shuffler provides total order shuffle.
*/
public SortByShuffler(boolean totalOrder) {
this.totalOrder = totalOrder;
}
@Override
public JavaPairRDD> shuffle(
JavaPairRDD input, int numPartitions) {
JavaPairRDD rdd;
if (totalOrder) {
if (numPartitions > 0) {
rdd = input.sortByKey(true, numPartitions);
} else {
rdd = input.sortByKey(true);
}
} else {
Partitioner partitioner = new HashPartitioner(numPartitions);
rdd = input.repartitionAndSortWithinPartitions(partitioner);
}
return rdd.mapPartitionsToPair(new ShuffleFunction());
}
private static class ShuffleFunction implements
PairFlatMapFunction>,
HiveKey, Iterable> {
// make eclipse happy
private static final long serialVersionUID = 1L;
@Override
public Iterable>> call(
final Iterator> it) throws Exception {
// Use input iterator to back returned iterable object.
final Iterator>> resultIt =
new Iterator>>() {
HiveKey curKey = null;
List curValues = new ArrayList();
@Override
public boolean hasNext() {
return it.hasNext() || curKey != null;
}
@Override
public Tuple2> next() {
// TODO: implement this by accumulating rows with the same key into a list.
// Note that this list needs to improved to prevent excessive memory usage, but this
// can be done in later phase.
while (it.hasNext()) {
Tuple2 pair = it.next();
if (curKey != null && !curKey.equals(pair._1())) {
HiveKey key = curKey;
List values = curValues;
curKey = pair._1();
curValues = new ArrayList();
curValues.add(pair._2());
return new Tuple2>(key, values);
}
curKey = pair._1();
curValues.add(pair._2());
}
if (curKey == null) {
throw new NoSuchElementException();
}
// if we get here, this should be the last element we have
HiveKey key = curKey;
curKey = null;
return new Tuple2>(key, curValues);
}
@Override
public void remove() {
// Not implemented.
// throw Unsupported Method Invocation Exception.
throw new UnsupportedOperationException();
}
};
return new Iterable>>() {
@Override
public Iterator>> iterator() {
return resultIt;
}
};
}
}
}