org.apache.hadoop.hive.ql.exec.vector.VectorMapJoinOperator

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.exec.vector;

import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Future;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluator;
import org.apache.hadoop.hive.ql.exec.JoinUtil;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer.ReusableGetAdaptor;
import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression;
import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpressionWriter;
import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpressionWriterFactory;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.MapJoinDesc;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;

/**
 * The vectorized version of the MapJoinOperator. It consumes VectorizedRowBatch
 * input and delegates the per-row join logic to the row-mode superclass
 * implementation (see the comments in process() below).
 */
public class VectorMapJoinOperator extends VectorMapJoinBaseOperator {

  private static final long serialVersionUID = 1L;

  private static final Log LOG = LogFactory.getLog(
      VectorMapJoinOperator.class.getName());

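  // Vectorized expressions evaluated against the big-table batch: the join
  // keys, any residual filter predicates, and the value (payload) columns.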
  protected VectorExpression[] keyExpressions;

  protected VectorExpression[] bigTableFilterExpressions;
  protected VectorExpression[] bigTableValueExpressions;

  // The above members are initialized by the constructor and must not be
  // transient.
  //---------------------------------------------------------------------------


  private transient VectorExpressionWriter[] valueWriters;

  // These members are used as out-of-band parameters for the inner-loop
  // super.process callbacks: batchIndex is the cursor into the current batch,
  // and keyValues holds the evaluated join key for each row.
  //
  private transient int batchIndex;
  private transient VectorHashKeyWrapper[] keyValues;
  private transient VectorHashKeyWrapperBatch keyWrapperBatch;
  private transient VectorExpressionWriter[] keyOutputWriters;

  private VectorExpressionWriter[] rowWriters;  // Writers for producing a row-mode row from the input batch
  protected transient Object[] singleRow;       // Reusable scratch row populated by getRowObject()

  public VectorMapJoinOperator() {
    super();
  }


  public VectorMapJoinOperator (VectorizationContext vContext, OperatorDesc conf)
    throws HiveException {

    super(vContext, conf);

    MapJoinDesc desc = (MapJoinDesc) conf;

    Map<Byte, List<ExprNodeDesc>> filterExpressions = desc.getFilters();
    bigTableFilterExpressions = vContext.getVectorExpressions(filterExpressions.get(posBigTable),
        VectorExpressionDescriptor.Mode.FILTER);

    List<ExprNodeDesc> keyDesc = desc.getKeys().get(posBigTable);
    keyExpressions = vContext.getVectorExpressions(keyDesc);

    // We're only going to evaluate the big table vectorized expressions.
    Map<Byte, List<ExprNodeDesc>> exprs = desc.getExprs();
    bigTableValueExpressions = vContext.getVectorExpressions(exprs.get(posBigTable));
  }

  @Override
  public Collection<Future<?>> initializeOp(Configuration hconf) throws HiveException {

    // Use a final variable to properly parameterize the processVectorInspector closure.
    // Using a member variable in the closure will not do the right thing...
    final int parameterizePosBigTable = conf.getPosBigTable();

    // Code borrowed from VectorReduceSinkOperator.initializeOp
    VectorExpressionWriterFactory.processVectorInspector(
        (StructObjectInspector) inputObjInspectors[parameterizePosBigTable],
        new VectorExpressionWriterFactory.SingleOIDClosure() {
          @Override
          public void assign(VectorExpressionWriter[] writers,
                             ObjectInspector objectInspector) {
            rowWriters = writers;
            inputObjInspectors[parameterizePosBigTable] = objectInspector;
          }
        });
    singleRow = new Object[rowWriters.length];

    Collection<Future<?>> result = super.initializeOp(hconf);

    List<ExprNodeDesc> keyDesc = conf.getKeys().get(posBigTable);
    keyOutputWriters = VectorExpressionWriterFactory.getExpressionWriters(keyDesc);

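    // Compile the key expressions into a reusable batch of hash-key wrappers;
    // process() evaluates it once per input batch and setMapJoinKey() reads
    // the per-row wrapper through the batchIndex cursor.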
    keyWrapperBatch = VectorHashKeyWrapperBatch.compileKeyWrapperBatch(keyExpressions);

    Map<Byte, List<ExprNodeDesc>> valueExpressions = conf.getExprs();
    List<ExprNodeDesc> bigTableExpressions = valueExpressions.get(posBigTable);

    VectorExpressionWriterFactory.processVectorExpressions(
        bigTableExpressions,
        new VectorExpressionWriterFactory.ListOIDClosure() {
          @Override
          public void assign(VectorExpressionWriter[] writers, List<ObjectInspector> oids) {
            valueWriters = writers;
            joinValuesObjectInspectors[posBigTable] = oids;
          }
        });

    // We're hijacking the big table evaluators and replacing them with our own custom
    // ones which are going to return values from the input batch vector expressions.
    List<ExprNodeEvaluator> vectorNodeEvaluators = new ArrayList<ExprNodeEvaluator>(bigTableExpressions.size());

    for(int i=0; i<bigTableExpressions.size(); ++i) {
      ExprNodeDesc desc = bigTableExpressions.get(i);
      VectorExpression vectorExpr = bigTableValueExpressions[i];

      // This is a vectorization-aware evaluator: it ignores the expression tree and
      // instead reads the already-evaluated value from the input batch column at the
      // row indicated by the batchIndex cursor.
      ExprNodeEvaluator eval = new ExprNodeEvaluator<ExprNodeDesc>(desc) {
        int columnIndex;
        int writerIndex;

        public ExprNodeEvaluator initVectorExpr(int columnIndex, int writerIndex) {
          this.columnIndex = columnIndex;
          this.writerIndex = writerIndex;
          return this;
        }

        @Override
        public ObjectInspector initialize(ObjectInspector rowInspector) throws HiveException {
          throw new HiveException("should never reach here");
        }

        @Override
        protected Object _evaluate(Object row, int version) throws HiveException {
          VectorizedRowBatch inBatch = (VectorizedRowBatch) row;
          int rowIndex = inBatch.selectedInUse ? inBatch.selected[batchIndex] : batchIndex;
          return valueWriters[writerIndex].writeValue(inBatch.cols[columnIndex], rowIndex);
        }
      }.initVectorExpr(vectorExpr.getOutputColumn(), i);
      vectorNodeEvaluators.add(eval);
    }
    // Now replace the old evaluators with our own
    joinValues[posBigTable] = vectorNodeEvaluators;

    // Filtering is handled in the input batch processing
    filterMaps[posBigTable] = null;

    return result;
  }

  @Override
  protected JoinUtil.JoinResult setMapJoinKey(ReusableGetAdaptor dest, Object row, byte alias)
      throws HiveException {
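    // The row argument is the whole VectorizedRowBatch; the key for the current
    // row was already evaluated in process() and is read via the batchIndex cursor.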
    return dest.setFromVector(keyValues[batchIndex], keyOutputWriters, keyWrapperBatch);
  }

  @Override
  public void process(Object row, int tag) throws HiveException {

    VectorizedRowBatch inBatch = (VectorizedRowBatch) row;

    // Preparation for hybrid grace hash join
    this.tag = tag;
    if (scratchBatch == null) {
      scratchBatch = VectorizedBatchUtil.makeLike(inBatch);
    }

    if (null != bigTableFilterExpressions) {
      for(VectorExpression ve:bigTableFilterExpressions) {
        ve.evaluate(inBatch);
      }
    }

    if (null != bigTableValueExpressions) {
      for(VectorExpression ve: bigTableValueExpressions) {
        ve.evaluate(inBatch);
      }
    }

    keyWrapperBatch.evaluateBatch(inBatch);
    keyValues = keyWrapperBatch.getVectorHashKeyWrappers();

    // This implementation of vectorized JOIN delegates all the work to the
    // row-mode implementation by hijacking the big table node evaluators and
    // calling the row-mode join process() for each row in the input batch.
    // Since the JOIN operator is not fully vectorized anyway at the moment
    // (due to the use of row-mode small-tables), this is a reasonable trade-off.
    //
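    // Example: with inBatch.selectedInUse == true, selected = {0, 3, 7} and
    // size == 3, batchIndex runs 0..2 while the hijacked evaluators read
    // physical rows 0, 3 and 7 from the column vectors.
    //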
    for(batchIndex=0; batchIndex < inBatch.size; ++batchIndex) {
      super.process(row, tag);
    }

    // Set these two to invalid values so any attempt to use them
    // outside the inner loop results in NPE/OutOfBounds errors
    batchIndex = -1;
    keyValues = null;
  }

  @Override
  protected void spillBigTableRow(MapJoinTableContainer hybridHtContainer, Object row)
      throws HiveException {
    // Extract the actual row from row batch
    VectorizedRowBatch inBatch = (VectorizedRowBatch) row;
    Object[] actualRow = getRowObject(inBatch, batchIndex);
    super.spillBigTableRow(hybridHtContainer, actualRow);
  }

  // Code borrowed from VectorReduceSinkOperator
  private Object[] getRowObject(VectorizedRowBatch vrb, int rowIndex) throws HiveException {
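    // Map the logical row position through the selected[] vector (when in use),
    // then write each projected column value into the reusable singleRow.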
    int batchIndex = rowIndex;
    if (vrb.selectedInUse) {
      batchIndex = vrb.selected[rowIndex];
    }
    for (int i = 0; i < vrb.projectionSize; i++) {
      ColumnVector vectorColumn = vrb.cols[vrb.projectedColumns[i]];
      if (vectorColumn != null) {
        singleRow[i] = rowWriters[i].writeValue(vectorColumn, batchIndex);
      } else {
        // Some columns from tables are not used.
        singleRow[i] = null;
      }
    }
    return singleRow;
  }
}