// org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinInnerGenerateResultOperator Maven / Gradle / Ivy
// The newest version!
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.exec.vector.mapjoin;
import java.io.IOException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.ql.exec.JoinUtil;
import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression;
import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMap;
import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTableResult;
import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMapResult;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
/**
* This class has methods for generating vectorized join results for inner joins.
*
* Inner joins use a hash map to lookup the 1 or more small table values.
*
* One vector inner join optimization is projecting inner keys. When a key appears
* in the small table results area, instead of copying or referencing key we just include
* that key again in the output projection.
*
* Another optimization is when an inner join does not have any small table columns in the
* join result, we use a different variation called inner big only. That variation uses
* a hash multi-set instead of hash map since there are no values (just a count).
*/
public abstract class VectorMapJoinInnerGenerateResultOperator
    extends VectorMapJoinGenerateResultOperator {

  private static final long serialVersionUID = 1L;
  private static final Log LOG = LogFactory.getLog(VectorMapJoinInnerGenerateResultOperator.class.getName());

  //---------------------------------------------------------------------------
  // Inner join specific members.
  //

  // An array of hash map results so we can do lookups on the whole batch before output result
  // generation. Allocated and populated in commonSetup.
  protected transient VectorMapJoinHashMapResult[] hashMapResults;

  // Pre-allocated member for storing the (physical) batch index of matching rows (single- or
  // multi-small-table-valued) during a process call.
  protected transient int[] allMatchs;

  /*
   * Pre-allocated members for storing information about equal key series for small-table
   * matches. The four arrays are parallel: entry i of each describes the same series.
   *
   * ~HashMapResultIndices
   *                Index into the hashMapResults array for the match.
   * ~AllMatchIndices
   *                (Logical) indices into allMatchs to the first row of a match of a
   *                possible series of duplicate keys.
   * ~IsSingleValue
   *                Whether there is 1 or multiple small table values.
   * ~DuplicateCounts
   *                The duplicate count for each matched key.
   *
   */
  protected transient int[] equalKeySeriesHashMapResultIndices;
  protected transient int[] equalKeySeriesAllMatchIndices;
  protected transient boolean[] equalKeySeriesIsSingleValue;
  protected transient int[] equalKeySeriesDuplicateCounts;

  // Pre-allocated member for storing the (physical) batch index of rows that need to be spilled.
  protected transient int[] spills;

  // Pre-allocated member for storing index into the hashMapResults for each spilled row.
  protected transient int[] spillHashMapResultIndices;

  /**
   * No-argument constructor; only delegates to the superclass constructor.
   */
  public VectorMapJoinInnerGenerateResultOperator() {
    super();
  }

  /**
   * Constructor that forwards the vectorization context and operator descriptor to the
   * superclass.
   *
   * @param vContext
   *          The vectorization context, passed through to the superclass.
   * @param conf
   *          The operator descriptor, passed through to the superclass.
   * @throws HiveException
   *           Declared because the superclass constructor call may throw it.
   */
  public VectorMapJoinInnerGenerateResultOperator(VectorizationContext vContext, OperatorDesc conf)
      throws HiveException {
    super(vContext, conf);
  }

  /*
   * Setup our inner join specific members.
   *
   * Runs the superclass setup first, then allocates every inner join work array with
   * VectorizedRowBatch.DEFAULT_SIZE entries and fills hashMapResults with result objects
   * created by the hash map.
   */
  protected void commonSetup(VectorizedRowBatch batch) throws HiveException {
    super.commonSetup(batch);

    // Inner join specific.
    VectorMapJoinHashMap baseHashMap = (VectorMapJoinHashMap) vectorMapJoinHashTable;

    // DEFAULT_SIZE is a static constant, so read it through the class rather than through
    // the batch instance.
    final int capacity = VectorizedRowBatch.DEFAULT_SIZE;

    hashMapResults = new VectorMapJoinHashMapResult[capacity];
    for (int i = 0; i < hashMapResults.length; i++) {
      hashMapResults[i] = baseHashMap.createHashMapResult();
    }

    allMatchs = new int[capacity];

    equalKeySeriesHashMapResultIndices = new int[capacity];
    equalKeySeriesAllMatchIndices = new int[capacity];
    equalKeySeriesIsSingleValue = new boolean[capacity];
    equalKeySeriesDuplicateCounts = new int[capacity];

    spills = new int[capacity];
    spillHashMapResultIndices = new int[capacity];
  }

  /*
   * Inner join (hash map).
   */

  /**
   * Do the per-batch setup for an inner join.
   *
   * @param batch
   *          The batch whose small table output columns are reset.
   */
  protected void innerPerBatchSetup(VectorizedRowBatch batch) {

    // For join operators that can generate small table results, reset their
    // (target) scratch columns.
    for (int column : smallTableOutputVectorColumns) {
      ColumnVector smallTableColumn = batch.cols[column];
      smallTableColumn.reset();
    }
  }

  /**
   * Generate the inner join output results for one vectorized row batch.
   *
   * On return, batch.size holds the selected-row count accumulated from the single value
   * result generation calls and batch.selectedInUse is true.
   *
   * @param batch
   *          The big table batch with any matching and any non matching rows both as
   *          selected in use.
   * @param allMatchCount
   *          Number of matches in allMatchs.
   * @param equalKeySeriesCount
   *          Number of equal key series, i.e. the number of valid entries in each of the
   *          equalKeySeries* arrays (single value and multi-value series alike).
   * @param spillCount
   *          Number of spills in spills.
   * @param hashMapResultCount
   *          Number of entries in hashMapResults. Not read by this method.
   * @throws HiveException
   *           Declared for the result generation and spill calls made here.
   * @throws IOException
   *           Declared for the result generation and spill calls made here.
   */
  protected void finishInner(VectorizedRowBatch batch,
      int allMatchCount, int equalKeySeriesCount, int spillCount, int hashMapResultCount)
          throws HiveException, IOException {

    int numSel = 0;

    /*
     * Optimize by running value expressions only over the matched rows.
     */
    if (allMatchCount > 0 && bigTableValueExpressions != null) {
      performValueExpressions(batch, allMatchs, allMatchCount);
    }

    for (int i = 0; i < equalKeySeriesCount; i++) {
      int hashMapResultIndex = equalKeySeriesHashMapResultIndices[i];
      VectorMapJoinHashMapResult hashMapResult = hashMapResults[hashMapResultIndex];
      int allMatchesIndex = equalKeySeriesAllMatchIndices[i];
      boolean isSingleValue = equalKeySeriesIsSingleValue[i];
      int duplicateCount = equalKeySeriesDuplicateCounts[i];

      if (isSingleValue) {
        // Only the single value path feeds the selected-row count.
        numSel = generateHashMapResultSingleValue(
            batch, hashMapResult, allMatchs, allMatchesIndex, duplicateCount, numSel);
      } else {
        // The multi-value call's result (if any) is not folded into numSel.
        generateHashMapResultMultiValue(
            batch, hashMapResult, allMatchs, allMatchesIndex, duplicateCount);
      }
    }

    if (spillCount > 0) {
      spillHashMapBatch(batch, (VectorMapJoinHashTableResult[]) hashMapResults,
          spills, spillHashMapResultIndices, spillCount);
    }

    batch.size = numSel;
    batch.selectedInUse = true;
  }

  /**
   * Finish an inner join batch for which a single join result applies to every row.
   *
   * MATCH evaluates the big table value expressions (when present) over the whole batch and
   * then generates the repeated result from hashMapResults[0]. SPILL spills the whole batch
   * using hashMapResults[0] and sets batch.size to 0. NOMATCH sets batch.size to 0.
   *
   * @param batch
   *          The big table batch.
   * @param joinResult
   *          The join result that applies to the whole batch.
   * @param hashMapResult
   *          Not read by this method; hashMapResults[0] is used instead.
   *          NOTE(review): presumably callers pass hashMapResults[0] here -- confirm.
   * @throws HiveException
   *           Declared for the expression evaluation, generation and spill calls made here.
   * @throws IOException
   *           Declared for the generation and spill calls made here.
   */
  protected void finishInnerRepeated(VectorizedRowBatch batch, JoinUtil.JoinResult joinResult,
      VectorMapJoinHashTableResult hashMapResult) throws HiveException, IOException {

    switch (joinResult) {
    case MATCH:

      if (bigTableValueExpressions != null) {
        // Run our value expressions over whole batch.
        for (VectorExpression ve : bigTableValueExpressions) {
          ve.evaluate(batch);
        }
      }

      // Generate special repeated case.
      generateHashMapResultRepeatedAll(batch, hashMapResults[0]);
      break;

    case SPILL:
      // Whole batch is spilled.
      spillBatchRepeated(batch, (VectorMapJoinHashTableResult) hashMapResults[0]);
      batch.size = 0;
      break;

    case NOMATCH:
      // No match for entire batch.
      batch.size = 0;
      break;
    }
  }
}