/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.exec.vector;
import java.lang.management.ManagementFactory;
import java.lang.management.MemoryMXBean;
import java.lang.ref.SoftReference;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Future;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.KeyWrapper;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression;
import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpressionWriter;
import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpressionWriterFactory;
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.VectorAggregateExpression;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.AggregationDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.GroupByDesc;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.api.OperatorType;
import org.apache.hadoop.hive.ql.util.JavaDataModel;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.io.DataOutputBuffer;
/**
* Vectorized GROUP BY operator implementation. Consumes the vectorized input and
* stores the aggregate operators' intermediate states. Emits row mode output.
*
*/
public class VectorGroupByOperator extends Operator implements
VectorizationContextRegion {
private static final Log LOG = LogFactory.getLog(
VectorGroupByOperator.class.getName());
/**
* This is the vector of aggregators. They are stateless and only implement
* the algorithm of how to compute the aggregation. state is kept in the
* aggregation buffers and is our responsibility to match the proper state for each key.
*/
private VectorAggregateExpression[] aggregators;
/**
* Key vector expressions.
*/
private VectorExpression[] keyExpressions;
// Number of key columns in the emitted output — presumably set during
// operator initialization; TODO confirm against the initialization code.
private int outputKeyLength;
// Presumably true when this operator emits vectorized batches instead of
// row-mode output — verify against the forwarding code (not visible here).
private boolean isVectorOutput;
// Create a new outgoing vectorization context because column name map will change.
private VectorizationContext vOutContext = null;
// The above members are initialized by the constructor and must not be
// transient.
//---------------------------------------------------------------------------
// Writers converting vectorized key columns back to row-mode objects
// (see VectorExpressionWriterFactory import above).
private transient VectorExpressionWriter[] keyOutputWriters;
/**
* The aggregation buffers to use for the current batch.
*/
private transient VectorAggregationBufferBatch aggregationBatchInfo;
/**
* The current batch key wrappers.
* The very same instance gets reused for all batches.
*/
private transient VectorHashKeyWrapperBatch keyWrappersBatch;
// NOTE(review): looks like a reusable scratch array for forwarding output
// rows — confirm against the forward path (not visible in this chunk).
private transient Object[] forwardCache;
// Output batch state — presumably used only when isVectorOutput is set;
// TODO confirm.
private transient VectorizedRowBatch outputBatch;
private transient VectorizedRowBatchCtx vrbCtx;
private transient VectorAssignRowSameBatch vectorAssignRowSameBatch;
// Hash-mode bookkeeping: hashtable entry count and memory ceilings.
private transient int numEntriesHashTable;
private transient long maxHashTblMemory;
private transient long maxMemory;
// NOTE(review): unlike the runtime fields above this is not transient —
// verify whether that is intentional or an oversight.
private float memoryThreshold;
/**
* Interface for processing mode: global, hash, unsorted streaming, or group batch
*/
private static interface IProcessingMode {
public void initialize(Configuration hconf) throws HiveException;
public void startGroup() throws HiveException;
public void endGroup() throws HiveException;
public void processBatch(VectorizedRowBatch batch) throws HiveException;
public void close(boolean aborted) throws HiveException;
}
/**
* Base class for all processing modes
*/
/**
 * Common functionality shared by all processing modes.
 */
private abstract class ProcessingModeBase implements IProcessingMode {

  // Overridden and used in sorted reduce group batch processing mode;
  // no-ops for every other mode.
  @Override
  public void startGroup() throws HiveException {
    // No-op by default.
  }

  @Override
  public void endGroup() throws HiveException {
    // No-op by default.
  }

  /**
   * Runs every aggregator over the current batch.
   * The aggregationBatchInfo must have been prepared beforehand by calling
   * {@link #prepareBatchAggregationBufferSets} first.
   */
  protected void processAggregators(VectorizedRowBatch batch) throws HiveException {
    VectorAggregationBufferRow[] bufferSets =
        aggregationBatchInfo.getAggregationBuffers();
    boolean singleDistinctKey = aggregationBatchInfo.getDistinctBufferSetCount() == 1;
    if (singleDistinctKey) {
      // Fast path: the whole batch shares one key, so each aggregator can
      // consume the batch directly via aggregateInput.
      VectorAggregateExpression.AggregationBuffer[] buffers =
          bufferSets[0].getAggregationBuffers();
      for (int idx = 0; idx < aggregators.length; idx++) {
        aggregators[idx].aggregateInput(buffers[idx], batch);
      }
    } else {
      // General path: per-row selection of the matching buffer set.
      for (int idx = 0; idx < aggregators.length; idx++) {
        aggregators[idx].aggregateInputSelection(bufferSets, idx, batch);
      }
    }
  }

  /**
   * Allocates a new aggregation buffer set: one freshly reset buffer
   * per aggregator.
   */
  protected VectorAggregationBufferRow allocateAggregationBuffer() throws HiveException {
    VectorAggregateExpression.AggregationBuffer[] buffers =
        new VectorAggregateExpression.AggregationBuffer[aggregators.length];
    for (int idx = 0; idx < aggregators.length; idx++) {
      buffers[idx] = aggregators[idx].getNewAggregationBuffer();
      aggregators[idx].reset(buffers[idx]);
    }
    return new VectorAggregationBufferRow(buffers);
  }
}
/**
* Global aggregates (no GROUP BY clause, no keys)
* This mode is very simple, there are no keys to consider, and only flushes one row at closing
* The one row must flush even if no input was seen (NULLs)
*/
/**
 * Global aggregation (no GROUP BY clause, no keys).
 * The simplest mode: a single aggregation buffer set, flushed as exactly one
 * row at close time — even when no input was seen (NULL aggregates).
 */
private class ProcessingModeGlobalAggregate extends ProcessingModeBase {

  // The one and only aggregation buffer set used for the entire query.
  private VectorAggregationBufferRow aggregationBuffers;

  @Override
  public void initialize(Configuration hconf) throws HiveException {
    aggregationBuffers = allocateAggregationBuffer();
    LOG.info("using global aggregation processing mode");
  }

  @Override
  public void processBatch(VectorizedRowBatch batch) throws HiveException {
    int idx = 0;
    while (idx < aggregators.length) {
      aggregators[idx].aggregateInput(aggregationBuffers.getAggregationBuffer(idx), batch);
      idx++;
    }
  }

  @Override
  public void close(boolean aborted) throws HiveException {
    if (aborted) {
      return;
    }
    // Emit the single result row; the key is null since there are no keys.
    writeSingleRow(null, aggregationBuffers);
  }
}
/**
* Hash Aggregate mode processing
*/
private class ProcessingModeHashAggregate extends ProcessingModeBase {
/**
* The global key-aggregation hash map.
*/
private Map mapKeysAggregationBuffers;
/**
* Total per hashtable entry fixed memory (does not depend on key/agg values).
*/
private int fixedHashEntrySize;
/**
* Average per hashtable entry variable size memory (depends on key/agg value).
*/
private int avgVariableSize;
/**
* Number of entries added to the hashtable since the last check if it should flush.
*/
private int numEntriesSinceCheck;
/**
* Sum of batch size processed (ie. rows).
*/
private long sumBatchSize;
/**
* Max number of entries in the vector group by aggregation hashtables.
* Exceeding this will trigger a flush irrelevant of memory pressure condition.
*/
private int maxHtEntries = 1000000;
/**
* The number of new entries that must be added to the hashtable before a memory size check.
*/
private int checkInterval = 10000;
/**
* Percent of entries to flush when memory threshold exceeded.
*/
private float percentEntriesToFlush = 0.1f;
/**
* A soft reference used to detect memory pressure
*/
private SoftReference