// org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinLeftSemiGenerateResultOperator (Maven / Gradle / Ivy)
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.exec.vector.mapjoin;
import java.io.IOException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.ql.exec.JoinUtil;
import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression;
import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashSet;
import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashSetResult;
import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTableResult;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
/**
* This class has methods for generating vectorized join results for left semi joins.
*
* The big difference between inner joins and left semi joins is existence testing.
*
* Inner joins use a hash map to lookup the 1 or more small table values.
*
* Left semi joins are a specialized join for outputting big table rows whose key exists
* in the small table.
*
* No small table values are needed for left semi join since they would be empty. So,
* we use a hash set as the hash table. Hash sets just report whether a key exists. This
* is a big performance optimization.
*/
public abstract class VectorMapJoinLeftSemiGenerateResultOperator
extends VectorMapJoinGenerateResultOperator {
private static final long serialVersionUID = 1L;
private static final Log LOG = LogFactory.getLog(VectorMapJoinLeftSemiGenerateResultOperator.class.getName());
//---------------------------------------------------------------------------
// Semi join specific members.
//
// An array of hash set results so we can do lookups on the whole batch before output result
// generation.
protected transient VectorMapJoinHashSetResult hashSetResults[];
// Pre-allocated member for storing the (physical) batch index of matching row (single- or
// multi-small-table-valued) indexes during a process call.
protected transient int[] allMatchs;
// Pre-allocated member for storing the (physical) batch index of rows that need to be spilled.
protected transient int[] spills;
// Pre-allocated member for storing index into the hashSetResults for each spilled row.
protected transient int[] spillHashMapResultIndices;
public VectorMapJoinLeftSemiGenerateResultOperator() {
super();
}
public VectorMapJoinLeftSemiGenerateResultOperator(VectorizationContext vContext, OperatorDesc conf)
throws HiveException {
super(vContext, conf);
}
/*
* Setup our left semi join specific members.
*/
protected void commonSetup(VectorizedRowBatch batch) throws HiveException {
super.commonSetup(batch);
// Semi join specific.
VectorMapJoinHashSet baseHashSet = (VectorMapJoinHashSet) vectorMapJoinHashTable;
hashSetResults = new VectorMapJoinHashSetResult[batch.DEFAULT_SIZE];
for (int i = 0; i < hashSetResults.length; i++) {
hashSetResults[i] = baseHashSet.createHashSetResult();
}
allMatchs = new int[batch.DEFAULT_SIZE];
spills = new int[batch.DEFAULT_SIZE];
spillHashMapResultIndices = new int[batch.DEFAULT_SIZE];
}
//-----------------------------------------------------------------------------------------------
/*
* Left semi join (hash set).
*/
/**
* Generate the left semi join output results for one vectorized row batch.
*
* @param batch
* The big table batch with any matching and any non matching rows both as
* selected in use.
* @param allMatchCount
* Number of matches in allMatchs.
* @param spillCount
* Number of spills in spills.
* @param hashTableResults
* The array of all hash table results for the batch. We need the
* VectorMapJoinHashTableResult for the spill information.
*/
protected void finishLeftSemi(VectorizedRowBatch batch,
int allMatchCount, int spillCount,
VectorMapJoinHashTableResult[] hashTableResults) throws HiveException, IOException {
// Get rid of spills before we start modifying the batch.
if (spillCount > 0) {
spillHashMapBatch(batch, hashTableResults,
spills, spillHashMapResultIndices, spillCount);
}
/*
* Optimize by running value expressions only over the matched rows.
*/
if (allMatchCount > 0 && bigTableValueExpressions != null) {
performValueExpressions(batch, allMatchs, allMatchCount);
}
int numSel = generateHashSetResults(batch, allMatchs, allMatchCount);
batch.size = numSel;
batch.selectedInUse = true;
}
/**
* Generate the matching left semi join output results of a vectorized row batch.
*
* @param batch
* The big table batch.
* @param allMatchs
* A subset of the rows of the batch that are matches.
* @param allMatchCount
* Number of matches in allMatchs.
*/
private int generateHashSetResults(VectorizedRowBatch batch,
int[] allMatchs, int allMatchCount)
throws HiveException, IOException {
int numSel = 0;
// Generate result within big table batch itself.
for (int i = 0; i < allMatchCount; i++) {
int batchIndex = allMatchs[i];
// Use the big table row as output.
batch.selected[numSel++] = batchIndex;
}
return numSel;
}
/**
* Generate the left semi join output results for one vectorized row batch with a repeated key.
*
* @param batch
* The big table batch whose repeated key matches.
*/
protected int generateHashSetResultRepeatedAll(VectorizedRowBatch batch) throws HiveException {
if (batch.selectedInUse) {
// The selected array is already filled in as we want it.
} else {
int[] selected = batch.selected;
for (int i = 0; i < batch.size; i++) {
selected[i] = i;
}
batch.selectedInUse = true;
}
return batch.size;
}
protected void finishLeftSemiRepeated(VectorizedRowBatch batch, JoinUtil.JoinResult joinResult,
VectorMapJoinHashTableResult hashSetResult) throws HiveException, IOException {
switch (joinResult) {
case MATCH:
if (bigTableValueExpressions != null) {
// Run our value expressions over whole batch.
for(VectorExpression ve: bigTableValueExpressions) {
ve.evaluate(batch);
}
}
// Generate special repeated case.
int numSel = generateHashSetResultRepeatedAll(batch);
batch.size = numSel;
batch.selectedInUse = true;
break;
case SPILL:
// Whole batch is spilled.
spillBatchRepeated(batch, (VectorMapJoinHashTableResult) hashSetResult);
batch.size = 0;
break;
case NOMATCH:
// No match for entire batch.
batch.size = 0;
break;
}
}
}