// Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price: only $1.
// You can buy this project and download/modify it as often as you want.
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.exec.vector.mapjoin;
import java.io.IOException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.ql.exec.JoinUtil;
import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression;
import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMultiSet;
import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTableResult;
import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMultiSetResult;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
/**
* This class has methods for generating vectorized join results for the big table only
* variation of inner joins.
*
* When an inner join does not have any small table columns in the join result, we use this
* variation we call inner big only. This variation uses a hash multi-set instead of hash map
* since there are no values (just a count).
*
* Note that if an inner key appears in the small table results area, we use the inner join
* projection optimization and are still able to use this variation.
*/
public abstract class VectorMapJoinInnerBigOnlyGenerateResultOperator
    extends VectorMapJoinGenerateResultOperator {

  private static final long serialVersionUID = 1L;
  private static final Log LOG = LogFactory.getLog(VectorMapJoinInnerBigOnlyGenerateResultOperator.class.getName());

  //---------------------------------------------------------------------------
  // Inner big-table only join specific members.
  //

  // An array of hash multi-set results so we can do lookups on the whole batch before output
  // result generation.
  protected transient VectorMapJoinHashMultiSetResult[] hashMultiSetResults;

  // Pre-allocated member for storing the (physical) batch index of matching row (single- or
  // multi-small-table-valued) indexes during a process call.
  protected transient int[] allMatchs;

  /*
   * Pre-allocated members for storing information on single- and multi-valued-small-table
   * matches. The three arrays are parallel: entry i of each describes the i-th equal key series.
   *
   * ~ValueCounts
   *                Number of (empty) small table values.
   * ~AllMatchIndices
   *                (Logical) indices into allMatchs to the first row of a match of a
   *                possible series of duplicate keys.
   * ~DuplicateCounts
   *                The duplicate count for each matched key.
   */
  protected transient long[] equalKeySeriesValueCounts;
  protected transient int[] equalKeySeriesAllMatchIndices;
  protected transient int[] equalKeySeriesDuplicateCounts;

  // Pre-allocated member for storing the (physical) batch index of rows that need to be spilled.
  protected transient int[] spills;

  // Pre-allocated member for storing index into the hashMultiSetResults for each spilled row.
  protected transient int[] spillHashMapResultIndices;

  public VectorMapJoinInnerBigOnlyGenerateResultOperator() {
    super();
  }

  public VectorMapJoinInnerBigOnlyGenerateResultOperator(VectorizationContext vContext, OperatorDesc conf)
      throws HiveException {
    super(vContext, conf);
  }

  /**
   * Setup our inner big table only join specific members.
   *
   * Runs the parent setup first, then allocates every per-batch scratch array with
   * {@code VectorizedRowBatch.DEFAULT_SIZE} entries and fills {@link #hashMultiSetResults}
   * with one result object per slot, created by the hash multi-set.
   *
   * @param batch
   *          The batch handed to the parent setup.
   * @throws HiveException
   *          Propagated from the parent setup.
   */
  protected void commonSetup(VectorizedRowBatch batch) throws HiveException {
    super.commonSetup(batch);

    // DEFAULT_SIZE is a class-level constant; read it through the class rather than through
    // the batch instance so it is not mistaken for a per-batch capacity.
    final int batchCapacity = VectorizedRowBatch.DEFAULT_SIZE;

    // Inner big-table only join specific.
    VectorMapJoinHashMultiSet baseHashMultiSet = (VectorMapJoinHashMultiSet) vectorMapJoinHashTable;

    hashMultiSetResults = new VectorMapJoinHashMultiSetResult[batchCapacity];
    for (int i = 0; i < hashMultiSetResults.length; i++) {
      hashMultiSetResults[i] = baseHashMultiSet.createHashMultiSetResult();
    }

    allMatchs = new int[batchCapacity];

    equalKeySeriesValueCounts = new long[batchCapacity];
    equalKeySeriesAllMatchIndices = new int[batchCapacity];
    equalKeySeriesDuplicateCounts = new int[batchCapacity];

    spills = new int[batchCapacity];
    spillHashMapResultIndices = new int[batchCapacity];
  }

  //-----------------------------------------------------------------------------------------------

  /*
   * Inner big table only join (hash multi-set).
   */

  /**
   * Generate the inner big table only join output results for one vectorized row batch.
   *
   * Spilled rows are handed off first, then the big table value expressions are run over the
   * matched rows only, and finally each equal key series is turned into output. On return the
   * batch's selected array holds the rows of the series whose value count is 1; rows of all
   * other series have been copied into the overflow batch instead.
   *
   * @param batch
   *          The big table batch with any matching and any non matching rows both as
   *          selected in use.
   * @param allMatchCount
   *          Number of matches in allMatchs.
   * @param equalKeySeriesCount
   *          Number of equal key series, i.e. the number of valid entries in the
   *          equalKeySeries* arrays.
   * @param spillCount
   *          Number of spills in spills.
   * @param hashTableResults
   *          The array of all hash table results for the batch. We need the
   *          VectorMapJoinHashTableResult for the spill information.
   * @param hashMapResultCount
   *          Number of entries in hashTableResults. Not referenced by this method.
   * @throws HiveException
   *          Propagated from the helper routines.
   * @throws IOException
   *          Propagated from the helper routines.
   */
  protected void finishInnerBigOnly(VectorizedRowBatch batch,
      int allMatchCount, int equalKeySeriesCount, int spillCount,
      VectorMapJoinHashTableResult[] hashTableResults, int hashMapResultCount)
          throws HiveException, IOException {

    // Get rid of spills before we start modifying the batch.
    if (spillCount > 0) {
      spillHashMapBatch(batch, hashTableResults,
          spills, spillHashMapResultIndices, spillCount);
    }

    /*
     * Optimize by running value expressions only over the matched rows.
     */
    if (allMatchCount > 0 && bigTableValueExpressions != null) {
      performValueExpressions(batch, allMatchs, allMatchCount);
    }

    // Rebuild the selected array from scratch; only single-value series contribute to it.
    int numSel = 0;
    for (int i = 0; i < equalKeySeriesCount; i++) {
      long count = equalKeySeriesValueCounts[i];
      int allMatchesIndex = equalKeySeriesAllMatchIndices[i];
      int duplicateCount = equalKeySeriesDuplicateCounts[i];

      if (count == 1) {
        numSel = generateHashMultiSetResultSingleValue(
            batch, allMatchs, allMatchesIndex, duplicateCount, numSel);
      } else {
        generateHashMultiSetResultMultiValue(batch,
            allMatchs, allMatchesIndex,
            duplicateCount, count);
      }
    }

    batch.size = numSel;
    batch.selectedInUse = true;
  }

  /**
   * Generate the single value match inner big table only join output results for a match.
   *
   * The result is produced within the big table batch itself: the physical batch index of
   * every row in the series is appended to the batch's selected array.
   *
   * @param batch
   *          The big table batch.
   * @param allMatchs
   *          A subset of the rows of the batch that are matches.
   * @param allMatchesIndex
   *          The logical index into allMatchs of the first equal key.
   * @param duplicateCount
   *          The number of duplicates or equal keys.
   * @param numSel
   *          The current count of rows in the rebuilding of the selected array.
   *
   * @return
   *          The new count of selected rows.
   */
  private int generateHashMultiSetResultSingleValue(VectorizedRowBatch batch,
      int[] allMatchs, int allMatchesIndex, int duplicateCount, int numSel)
          throws HiveException, IOException {

    for (int i = 0; i < duplicateCount; i++) {
      int batchIndex = allMatchs[allMatchesIndex + i];

      // Use the big table row as output.
      batch.selected[numSel++] = batchIndex;
    }

    return numSel;
  }

  /**
   * Generate results for a N x M cross product.
   *
   * Each of the duplicateCount big table rows of the series is emitted count times into the
   * overflow batch, which is forwarded every time it fills up.
   *
   * @param batch
   *          The big table batch.
   * @param allMatchs
   *          The all match selected array that contains (physical) batch indices.
   * @param allMatchesIndex
   *          The (logical) index into allMatchs of the first row of the series.
   * @param duplicateCount
   *          Number of equal key rows.
   * @param count
   *          Value count.
   */
  private void generateHashMultiSetResultMultiValue(VectorizedRowBatch batch,
      int[] allMatchs, int allMatchesIndex,
      int duplicateCount, long count) throws HiveException, IOException {

    // TODO: Look at repeating optimizations...

    for (int i = 0; i < duplicateCount; i++) {
      int batchIndex = allMatchs[allMatchesIndex + i];

      for (long l = 0; l < count; l++) {

        // Copy the BigTable values into the overflow batch. Since the overflow batch may
        // not get flushed here, we must copy by value.
        if (bigTableRetainedVectorCopy != null) {
          bigTableRetainedVectorCopy.copyByValue(batch, batchIndex,
              overflowBatch, overflowBatch.size);
        }

        overflowBatch.size++;
        if (overflowBatch.size == VectorizedRowBatch.DEFAULT_SIZE) {
          forwardOverflow();
        }
      }
    }
  }

  /**
   * Generate the inner big table only join output results for one vectorized row batch with
   * a repeated key.
   *
   * The whole batch is forwarded once per unit of the multi-set count, and at least once.
   *
   * @param batch
   *          The big table batch with any matching and any non matching rows both as
   *          selected in use.
   * @param hashMultiSetResult
   *          The hash multi-set results for the batch.
   *
   * @return
   *          Always 0, because the batch has already been forwarded by this method.
   */
  protected int generateHashMultiSetResultRepeatedAll(VectorizedRowBatch batch,
          VectorMapJoinHashMultiSetResult hashMultiSetResult) throws HiveException {

    long count = hashMultiSetResult.count();

    // If the selected array is already in use it is filled in as we want it; otherwise
    // populate it with the identity mapping over the current batch size.
    if (!batch.selectedInUse) {
      int[] selected = batch.selected;
      for (int i = 0; i < batch.size; i++) {
        selected[i] = i;
      }
      batch.selectedInUse = true;
    }

    do {
      forwardBigTableBatch(batch);
      count--;
    } while (count > 0);

    // We forwarded the batch in this method.
    return 0;
  }

  /**
   * Finish a batch whose key is repeated, so a single lookup result applies to every row.
   *
   * On MATCH the big table value expressions (if any) are evaluated over the whole batch and
   * the batch is forwarded by {@link #generateHashMultiSetResultRepeatedAll}; on SPILL the whole
   * batch is spilled; on NOMATCH nothing is produced. In all three cases the batch size ends
   * up as 0.
   *
   * @param batch
   *          The big table batch.
   * @param joinResult
   *          The outcome of the single hash multi-set lookup for the repeated key.
   * @param hashMultiSetResult
   *          The hash multi-set result of that lookup.
   * @throws HiveException
   *          Propagated from the helper routines.
   * @throws IOException
   *          Propagated from the helper routines.
   */
  protected void finishInnerBigOnlyRepeated(VectorizedRowBatch batch, JoinUtil.JoinResult joinResult,
      VectorMapJoinHashMultiSetResult hashMultiSetResult) throws HiveException, IOException {

    switch (joinResult) {
    case MATCH:

      if (bigTableValueExpressions != null) {
        // Run our value expressions over whole batch.
        for (VectorExpression ve : bigTableValueExpressions) {
          ve.evaluate(batch);
        }
      }

      // Generate special repeated case.
      int numSel = generateHashMultiSetResultRepeatedAll(batch, hashMultiSetResult);
      batch.size = numSel;
      batch.selectedInUse = true;
      break;

    case SPILL:
      // Whole batch is spilled.
      spillBatchRepeated(batch, (VectorMapJoinHashTableResult) hashMultiSetResult);
      batch.size = 0;
      break;

    case NOMATCH:
      // No match for entire batch.
      batch.size = 0;
      break;
    }
  }
}