org.apache.hadoop.hive.ql.exec.ReduceSinkOperator (from the hive-exec artifact)
Hive is a data warehouse infrastructure built on top of Hadoop; see http://wiki.apache.org/hadoop/Hive
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.exec;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Random;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.io.HiveKey;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.plan.api.OperatorType;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.Serializer;
import org.apache.hadoop.hive.serde2.objectinspector.InspectableObject;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.objectinspector.StandardUnionObjectInspector.StandardUnion;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.UnionObjectInspector;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
/**
* Reduce Sink Operator sends output to the reduce stage.
**/
public class ReduceSinkOperator extends TerminalOperator<ReduceSinkDesc>
implements Serializable {
private static final long serialVersionUID = 1L;
/**
* The evaluators for the key columns. Key columns decide the sort order on
* the reducer side. Key columns are passed to the reducer in the "key".
*/
protected transient ExprNodeEvaluator[] keyEval;
/**
* The evaluators for the value columns. Value columns are passed to the
* reducer in the "value".
*/
protected transient ExprNodeEvaluator[] valueEval;
/**
* The evaluators for the partition columns (CLUSTER BY or DISTRIBUTE BY in
* Hive language). Partition columns decide which reducer the current row
* goes to. Partition columns are not passed to the reducer.
*/
protected transient ExprNodeEvaluator[] partitionEval;
// TODO: we use MetadataTypedColumnsetSerDe for now, till DynamicSerDe is
// ready
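// Serializers turn the evaluated key/value rows into Writables for the
// shuffle. keyIsText caches whether the key serializer emits Text (rather
// than BytesWritable) so processOp can avoid a per-row type check.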
transient Serializer keySerializer;
transient boolean keyIsText;
transient Serializer valueSerializer;
transient int tag;
transient byte[] tagByte = new byte[1];
protected transient int numDistributionKeys;
protected transient int numDistinctExprs;
protected transient boolean optimizeSkew;
transient String inputAlias; // input alias of this RS for join (used for PPD)
public void setInputAlias(String inputAlias) {
this.inputAlias = inputAlias;
}
public String getInputAlias() {
return inputAlias;
}
@Override
protected void initializeOp(Configuration hconf) throws HiveException {
try {
keyEval = new ExprNodeEvaluator[conf.getKeyCols().size()];
int i = 0;
for (ExprNodeDesc e : conf.getKeyCols()) {
keyEval[i++] = ExprNodeEvaluatorFactory.get(e);
}
numDistributionKeys = conf.getNumDistributionKeys();
distinctColIndices = conf.getDistinctColumnIndices();
numDistinctExprs = distinctColIndices.size();
optimizeSkew = conf.isOptimizeSkew();
valueEval = new ExprNodeEvaluator[conf.getValueCols().size()];
i = 0;
for (ExprNodeDesc e : conf.getValueCols()) {
valueEval[i++] = ExprNodeEvaluatorFactory.get(e);
}
partitionEval = new ExprNodeEvaluator[conf.getPartitionCols().size()];
i = 0;
for (ExprNodeDesc e : conf.getPartitionCols()) {
partitionEval[i++] = ExprNodeEvaluatorFactory.get(e);
}
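// The single-byte tag is appended to every serialized key so the reducer
// can tell which parent operator the row came from (e.g., which side of a
// join). A tag of -1 means no tag byte is written.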
tag = conf.getTag();
tagByte[0] = (byte) tag;
LOG.info("Using tag = " + tag);
TableDesc keyTableDesc = conf.getKeySerializeInfo();
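// The configured serde implements both Serializer and Deserializer, so the
// class obtained via getDeserializerClass() can be instantiated and cast
// to Serializer here.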
keySerializer = (Serializer) keyTableDesc.getDeserializerClass()
.newInstance();
keySerializer.initialize(null, keyTableDesc.getProperties());
keyIsText = keySerializer.getSerializedClass().equals(Text.class);
TableDesc valueTableDesc = conf.getValueSerializeInfo();
valueSerializer = (Serializer) valueTableDesc.getDeserializerClass()
.newInstance();
valueSerializer.initialize(null, valueTableDesc.getProperties());
firstRow = true;
initializeChildren(hconf);
} catch (Exception e) {
e.printStackTrace();
throw new RuntimeException(e);
}
}
transient InspectableObject tempInspectableObject = new InspectableObject();
transient HiveKey keyWritable = new HiveKey();
transient Writable value;
transient StructObjectInspector keyObjectInspector;
transient StructObjectInspector valueObjectInspector;
transient ObjectInspector[] partitionObjectInspectors;
transient ObjectInspector[] keyColObjectInspectors;
transient Object[][] cachedKeys;
transient Object[] cachedValues;
transient List<List<Integer>> distinctColIndices;
boolean firstRow;
transient Random random;
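// Example of the cached key layout for a hypothetical query: for
// SELECT key, count(DISTINCT c1), count(DISTINCT c2) FROM t GROUP BY key
// numDistributionKeys is 1 and numDistinctExprs is 2, so every input row is
// expanded into two keys:
// cachedKeys[0] = { key, StandardUnion(tag=0, {c1}) }
// cachedKeys[1] = { key, StandardUnion(tag=1, {c2}) }
// Both share the same distribution prefix, so (skew optimization aside)
// they hash to the same reducer.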
/**
* Initializes the array of ExprNodeEvaluators and adds a union field that
* wraps the distinct column indices for GROUP BY. The evaluators' return
* types are collected into a StructObjectInspector keyed by the output
* column names.
*
* If distinctColIndices is empty, the resulting object inspector is the
* same as the one returned by
* {@link Operator#initEvaluatorsAndReturnStruct(ExprNodeEvaluator[], List, ObjectInspector)}.
*/
protected static StructObjectInspector initEvaluatorsAndReturnStruct(
ExprNodeEvaluator[] evals, List<List<Integer>> distinctColIndices,
List<String> outputColNames,
int length, ObjectInspector rowInspector, boolean optimizeSkew,
ObjectInspector[] keyColObjectInspectors)
throws HiveException {
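// When distinct expressions are present, evals.length exceeds length and
// the returned struct carries one extra field: a union over the
// distinct-column structs built below.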
int inspectorLen = evals.length > length ? length + 1 : evals.length;
List<ObjectInspector> sois = new ArrayList<ObjectInspector>(inspectorLen);
// keys
ObjectInspector[] fieldObjectInspectors = initEvaluators(evals, 0, length, rowInspector);
sois.addAll(Arrays.asList(fieldObjectInspectors));
if (optimizeSkew) {
System.arraycopy(fieldObjectInspectors, 0, keyColObjectInspectors, 0, length);
}
if (outputColNames.size() > length) {
// union keys
List<ObjectInspector> uois = new ArrayList<ObjectInspector>();
for (List<Integer> distinctCols : distinctColIndices) {
List<String> names = new ArrayList<String>();
List<ObjectInspector> eois = new ArrayList<ObjectInspector>();
int numExprs = 0;
for (int i : distinctCols) {
names.add(HiveConf.getColumnInternalName(numExprs));
ObjectInspector oi = evals[i].initialize(rowInspector);
eois.add(oi);
if (optimizeSkew && keyColObjectInspectors[i] == null) {
keyColObjectInspectors[i] = oi;
}
numExprs++;
}
uois.add(ObjectInspectorFactory.getStandardStructObjectInspector(names, eois));
}
UnionObjectInspector uoi =
ObjectInspectorFactory.getStandardUnionObjectInspector(uois);
sois.add(uoi);
}
return ObjectInspectorFactory.getStandardStructObjectInspector(outputColNames, sois);
}
protected static StructObjectInspector initEvaluatorsAndReturnStruct(
ExprNodeEvaluator[] evals, List<List<Integer>> distinctColIndices,
List<String> outputColNames,
int length, ObjectInspector rowInspector) throws HiveException {
return initEvaluatorsAndReturnStruct(evals, distinctColIndices, outputColNames, length,
rowInspector, false, null);
}
@Override
public void processOp(Object row, int tag) throws HiveException {
try {
ObjectInspector rowInspector = inputObjInspectors[tag];
if (firstRow) {
firstRow = false;
if (optimizeSkew) {
keyColObjectInspectors = new ObjectInspector[keyEval.length];
}
keyObjectInspector = initEvaluatorsAndReturnStruct(keyEval,
distinctColIndices,
conf.getOutputKeyColumnNames(), numDistributionKeys, rowInspector,
optimizeSkew, keyColObjectInspectors);
valueObjectInspector = initEvaluatorsAndReturnStruct(valueEval, conf
.getOutputValueColumnNames(), rowInspector);
partitionObjectInspectors = initEvaluators(partitionEval, rowInspector);
//keyColObjectInspectors = initEvaluators(keyEval, rowInspector);
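// One cached key per distinct expression (a single key when there are
// none); the extra trailing slot holds the distinct-column union.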
int numKeys = numDistinctExprs > 0 ? numDistinctExprs : 1;
int keyLen = numDistinctExprs > 0 ? numDistributionKeys + 1 :
numDistributionKeys;
cachedKeys = new Object[numKeys][keyLen];
cachedValues = new Object[valueEval.length];
}
// Evaluate the value
for (int i = 0; i < valueEval.length; i++) {
cachedValues[i] = valueEval[i].evaluate(row);
}
// Serialize the value
value = valueSerializer.serialize(cachedValues, valueObjectInspector);
// Evaluate the keys
Object[] distributionKeys = new Object[numDistributionKeys];
for (int i = 0; i < numDistributionKeys; i++) {
distributionKeys[i] = keyEval[i].evaluate(row);
}
// Evaluate the HashCode
int currentHashCode = 0;
int[] keyHashCode = new int[numDistinctExprs > 0 ? numDistinctExprs : 1];
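// The hash decides which reducer receives the row: it is attached to the
// key via HiveKey.setHashCode() below, and the partitioner routes on it.
// With skew optimization, each distinct branch also mixes its own distinct
// columns into the hash so hot keys spread across reducers.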
if (partitionEval.length == 0 && numDistinctExprs == 0) {
// If there are no partition columns, distribute the data uniformly to get
// better load balance. If a single reducer is required, the number of
// reducers should be set to 1 instead.
// Use a constant seed to make the code deterministic.
if (random == null) {
random = new Random(12345);
}
currentHashCode = random.nextInt();
} else {
for (int i = 0; i < partitionEval.length; i++) {
Object o = partitionEval[i].evaluate(row);
currentHashCode = currentHashCode * 31
+ ObjectInspectorUtils.hashCode(o, partitionObjectInspectors[i]);
}
}
if (numDistinctExprs > 0) {
// with distinct key(s)
for (int i = 0; i < numDistinctExprs; i++) {
System.arraycopy(distributionKeys, 0, cachedKeys[i], 0, numDistributionKeys);
Object[] distinctParameters =
new Object[distinctColIndices.get(i).size()];
keyHashCode[i] = currentHashCode;
for (int j = 0; j < distinctParameters.length; j++) {
distinctParameters[j] = keyEval[distinctColIndices.get(i).get(j)].evaluate(row);
if (optimizeSkew) {
keyHashCode[i] = keyHashCode[i] * 31
+ ObjectInspectorUtils.hashCode(distinctParameters[j],
keyColObjectInspectors[distinctColIndices.get(i).get(j)]);
}
}
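// Tag the union with the distinct-expression index so the reducer can
// tell which aggregate this copy of the row belongs to.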
cachedKeys[i][numDistributionKeys] =
new StandardUnion((byte)i, distinctParameters);
}
} else {
// no distinct key
keyHashCode[0] = currentHashCode;
System.arraycopy(distributionKeys, 0, cachedKeys[0], 0, numDistributionKeys);
}
// Serialize the keys and append the tag
for (int i = 0; i < cachedKeys.length; i++) {
if (keyIsText) {
Text key = (Text) keySerializer.serialize(cachedKeys[i],
keyObjectInspector);
if (tag == -1) {
keyWritable.set(key.getBytes(), 0, key.getLength());
} else {
int keyLength = key.getLength();
keyWritable.setSize(keyLength + 1);
System.arraycopy(key.getBytes(), 0, keyWritable.get(), 0, keyLength);
keyWritable.get()[keyLength] = tagByte[0];
}
} else {
// Must be BytesWritable
BytesWritable key = (BytesWritable) keySerializer.serialize(
cachedKeys[i], keyObjectInspector);
if (tag == -1) {
keyWritable.set(key.getBytes(), 0, key.getLength());
} else {
int keyLength = key.getLength();
keyWritable.setSize(keyLength + 1);
System.arraycopy(key.getBytes(), 0, keyWritable.get(), 0, keyLength);
keyWritable.get()[keyLength] = tagByte[0];
}
}
keyWritable.setHashCode(keyHashCode[i]);
if (out != null) {
out.collect(keyWritable, value);
// Since this is a terminal operator, update counters explicitly -
// forward is not called
if (counterNameToEnum != null) {
++outputRows;
if (outputRows % 1000 == 0) {
incrCounter(numOutputRowsCntr, outputRows);
outputRows = 0;
}
}
}
}
} catch (SerDeException e) {
throw new HiveException(e);
} catch (IOException e) {
throw new HiveException(e);
}
}
/**
* @return the name of the operator
*/
@Override
public String getName() {
return getOperatorName();
}
public static String getOperatorName() {
return "RS";
}
@Override
public OperatorType getType() {
return OperatorType.REDUCESINK;
}
@Override
public boolean opAllowedBeforeMapJoin() {
return false;
}
}