/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.exec;

import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.ObjectPair;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.CompilationOpContext;
import org.apache.hadoop.hive.ql.exec.persistence.RowContainer;
import org.apache.hadoop.hive.ql.io.AcidUtils;
import org.apache.hadoop.hive.ql.io.HiveInputFormat;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.BucketMapJoinContext;
import org.apache.hadoop.hive.ql.plan.FetchWork;
import org.apache.hadoop.hive.ql.plan.MapJoinDesc;
import org.apache.hadoop.hive.ql.plan.MapredLocalWork;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.SMBJoinDesc;
import org.apache.hadoop.hive.ql.plan.api.OperatorType;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.hive.serde2.objectinspector.InspectableObject;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.util.PriorityQueue;
import org.apache.hive.common.util.ReflectionUtil;

/**
 * Sorted Merge Map Join Operator.
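 *
 * Joins tables that are bucketed and sorted on the join keys: the big table is streamed
 * through process(), while the small tables are read bucket file by bucket file via
 * FetchOperators (wrapped in MergeQueue) and advanced in lock-step, so matching key
 * groups can be joined without buffering an entire table in memory.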
 */
public class SMBMapJoinOperator extends AbstractMapJoinOperator<SMBJoinDesc> implements
    Serializable {

  private static final long serialVersionUID = 1L;

  private static final Logger LOG = LoggerFactory.getLogger(SMBMapJoinOperator.class
      .getName());

  private MapredLocalWork localWork = null;
  private Map<String, MergeQueue> aliasToMergeQueue = Collections.emptyMap();

  transient List<Object>[] keyWritables;
  transient List<Object>[] nextKeyWritables;
  RowContainer<List<Object>>[] nextGroupStorage;
  RowContainer<List<Object>>[] candidateStorage;

  transient String[] tagToAlias;
  private transient boolean[] fetchDone;
  private transient boolean[] foundNextKeyGroup;
  transient boolean firstFetchHappened = false;
  private transient boolean inputFileChanged = false;
  transient boolean localWorkInited = false;
  transient boolean initDone = false;

  // This join has been converted to an SMB join by the Hive optimizer. The user did not
  // give a mapjoin hint in the query. The optimizer determined that the join can be
  // performed as an SMB join, based on all the tables/partitions being joined.
  private transient boolean convertedAutomaticallySMBJoin = false;

  /** Kryo ctor. */
  protected SMBMapJoinOperator() {
    super();
  }

  public SMBMapJoinOperator(CompilationOpContext ctx) {
    super(ctx);
  }

  public SMBMapJoinOperator(AbstractMapJoinOperator<? extends MapJoinDesc> mapJoinOp) {
    super(mapJoinOp);
  }

  @Override
  protected void initializeOp(Configuration hconf) throws HiveException {

    // If a sort-merge join is followed by a regular join, this SMBMapJoinOperator may not
    // get initialized at all. Consider the following query:
    // A SMB B JOIN C
    // For the mapper processing C, the SMB join is never initialized, so there is no need
    // to close it either.
    initDone = true;

    super.initializeOp(hconf);

    closeCalled = false;

    this.firstFetchHappened = false;
    this.inputFileChanged = false;

    // get the largest table alias from order
    int maxAlias = 0;
    for (byte pos = 0; pos < order.length; pos++) {
      if (pos > maxAlias) {
        maxAlias = pos;
      }
    }
    maxAlias += 1;

    nextGroupStorage = new RowContainer[maxAlias];
    candidateStorage = new RowContainer[maxAlias];
    keyWritables = new ArrayList[maxAlias];
    nextKeyWritables = new ArrayList[maxAlias];
    fetchDone = new boolean[maxAlias];
    foundNextKeyGroup = new boolean[maxAlias];

    int bucketSize;

    // For backwards compatibility we honor the older HIVEMAPJOINBUCKETCACHESIZE
    // if it is set to a non-default value; otherwise HIVESMBJOINCACHEROWS is used.
    // This fallback should be removed by Hive 0.13.
    int oldVar = HiveConf.getIntVar(hconf, HiveConf.ConfVars.HIVEMAPJOINBUCKETCACHESIZE);
    if (oldVar != 100) {
      bucketSize = oldVar;
    } else {
      bucketSize = HiveConf.getIntVar(hconf, HiveConf.ConfVars.HIVESMBJOINCACHEROWS);
    }

    for (byte pos = 0; pos < order.length; pos++) {
      RowContainer<List<Object>> rc = JoinUtil.getRowContainer(hconf,
          rowContainerStandardObjectInspectors[pos],
          pos, bucketSize, spillTableDesc, conf, !hasFilter(pos),
          reporter);
      nextGroupStorage[pos] = rc;
      RowContainer<List<Object>> candidateRC = JoinUtil.getRowContainer(hconf,
          rowContainerStandardObjectInspectors[pos],
          pos, bucketSize, spillTableDesc, conf, !hasFilter(pos),
          reporter);
      candidateStorage[pos] = candidateRC;
    }
    tagToAlias = conf.convertToArray(conf.getTagToAlias(), String.class);

    for (byte pos = 0; pos < order.length; pos++) {
      if (pos != posBigTable) {
        fetchDone[pos] = false;
      }
      foundNextKeyGroup[pos] = false;
    }
  }

  @Override
  public void initializeLocalWork(Configuration hconf) throws HiveException {
    initializeMapredLocalWork(this.getConf(), hconf, this.getConf().getLocalWork(), LOG);
    super.initializeLocalWork(hconf);
  }

  public void initializeMapredLocalWork(MapJoinDesc mjConf, Configuration hconf,
      MapredLocalWork localWork, Logger l4j) throws HiveException {
    if (localWork == null || localWorkInited) {
      return;
    }
    localWorkInited = true;
    this.localWork = localWork;
    aliasToMergeQueue = new HashMap<String, MergeQueue>();

    // create map local operators
    Map<String, FetchWork> aliasToFetchWork = localWork.getAliasToFetchWork();
    Map<String, Operator<? extends OperatorDesc>> aliasToWork = localWork.getAliasToWork();
    Map<String, DummyStoreOperator> aliasToSinkWork = conf.getAliasToSink();

    // The operator tree up to the sink operator needs to be processed while fetching
    // the next row from the priority queue (which may merge multiple files of the small
    // table for a single file of the big table). The remaining tree is processed while
    // processing the join.
    // Look at comments in DummyStoreOperator for additional explanation.
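    //
    // Per small-table alias the local pipeline set up below looks like:
    //   FetchOperator (bucket files) -> TableScanOperator -> ... -> DummyStoreOperator (sink)
    // MergeQueue drives the part up to the sink; fetchOneRow() later pushes the row
    // buffered at the sink through the sink's child operators down to this join.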
    for (Map.Entry<String, FetchWork> entry : aliasToFetchWork.entrySet()) {
      String alias = entry.getKey();
      FetchWork fetchWork = entry.getValue();

      JobConf jobClone = new JobConf(hconf);
      if (UserGroupInformation.isSecurityEnabled()) {
        String hadoopAuthToken = System.getenv(UserGroupInformation.HADOOP_TOKEN_FILE_LOCATION);
        if(hadoopAuthToken != null){
          jobClone.set("mapreduce.job.credentials.binary", hadoopAuthToken);
        }
      }

      TableScanOperator ts = (TableScanOperator)aliasToWork.get(alias);
      // push down projections
      ColumnProjectionUtils.appendReadColumns(
          jobClone, ts.getNeededColumnIDs(), ts.getNeededColumns(), ts.getNeededNestedColumnPaths());
      // push down filters
      HiveInputFormat.pushFilters(jobClone, ts, null);

      AcidUtils.setAcidOperationalProperties(jobClone, ts.getConf().isTranscationalTable(),
          ts.getConf().getAcidOperationalProperties());
      AcidUtils.setValidWriteIdList(jobClone, ts.getConf());

      ts.passExecContext(getExecContext());

      FetchOperator fetchOp = new FetchOperator(fetchWork, jobClone);
      ts.initialize(jobClone, new ObjectInspector[]{fetchOp.getOutputObjectInspector()});
      fetchOp.clearFetchContext();

      DummyStoreOperator sinkOp = aliasToSinkWork.get(alias);

      MergeQueue mergeQueue = new MergeQueue(alias, fetchWork, jobClone, ts, sinkOp);

      aliasToMergeQueue.put(alias, mergeQueue);
      l4j.info("fetch operators for " + alias + " initialized");
    }
  }

  private byte tagForAlias(String alias) {
    for (byte tag = 0; tag < tagToAlias.length; tag++) {
      if (alias.equals(tagToAlias[tag])) {
        return tag;
      }
    }
    return -1;
  }

  // The input file has changed - load the correct hash bucket
  @Override
  public void cleanUpInputFileChangedOp() throws HiveException {
    inputFileChanged = true;
  }

  protected List<Object> smbJoinComputeKeys(Object row, byte alias) throws HiveException {
    return JoinUtil.computeKeys(row, joinKeys[alias],
          joinKeysObjectInspectors[alias]);
  }

  @Override
  public void process(Object row, int tag) throws HiveException {

    if (tag == posBigTable) {
      if (inputFileChanged) {
        if (firstFetchHappened) {
          // we need to first join and flush out data left by the previous file.
          joinFinalLeftData();
        }
        // set up the fetch operator for the new input file.
        for (Map.Entry<String, MergeQueue> entry : aliasToMergeQueue.entrySet()) {
          String alias = entry.getKey();
          MergeQueue mergeQueue = entry.getValue();
          setUpFetchContexts(alias, mergeQueue);
        }
        firstFetchHappened = false;
        inputFileChanged = false;
      }
    }

    if (!firstFetchHappened) {
      firstFetchHappened = true;
      // fetch the first group for all small table aliases
      for (byte pos = 0; pos < order.length; pos++) {
        if (pos != posBigTable) {
          fetchNextGroup(pos);
        }
      }
    }

    byte alias = (byte) tag;

    // compute keys and values as StandardObjects
    List<Object> key = smbJoinComputeKeys(row, alias);

    List<Object> value = getFilteredValue(alias, row);


    //have we reached a new key group?
    boolean nextKeyGroup = processKey(alias, key);
    if (nextKeyGroup) {
      //assert this.nextGroupStorage[alias].size() == 0;
      this.nextGroupStorage[alias].addRow(value);
      foundNextKeyGroup[tag] = true;
      if (tag != posBigTable) {
        return;
      }
    }

    reportProgress();
    numMapRowsRead++;

    // the big table has reached a new key group. try to let the small tables
    // catch up with the big table.
    if (nextKeyGroup) {
      assert tag == posBigTable;
      List<Byte> smallestPos = null;
      do {
        smallestPos = joinOneGroup();
        // jump out of the loop if we need input from the big table
      } while (smallestPos != null && smallestPos.size() > 0
          && !smallestPos.contains(this.posBigTable));

      return;
    }

    assert !nextKeyGroup;
    candidateStorage[tag].addRow(value);
  }

  /*
   * This happens either when the input file of the big table changes or in closeOp.
   * It fetches all the remaining data from the small tables and tries to join it.
   */
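  // The flush proceeds in three phases: (1) while the big table still has a buffered key
  // group and the small tables are not exhausted, call joinOneGroup() so they catch up;
  // (2) once the big table is drained, keep joining the remaining small-table groups until
  // every fetch is done; (3) promote any pending next-groups and join whatever is still
  // sitting in candidate storage so no buffered rows are lost.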
  private void joinFinalLeftData() throws HiveException {
    RowContainer bigTblRowContainer = this.candidateStorage[this.posBigTable];

    boolean allFetchDone = allFetchDone();
    // while the big table still has buffered rows and the small tables are not yet
    // exhausted, let the small tables catch up with the big table
    while (bigTblRowContainer != null && bigTblRowContainer.rowCount() > 0
        && !allFetchDone) {
      joinOneGroup();
      bigTblRowContainer = this.candidateStorage[this.posBigTable];
      allFetchDone = allFetchDone();
    }

    while (!allFetchDone) {
      List<Byte> ret = joinOneGroup();
      if (ret == null || ret.size() == 0) {
        break;
      }
      reportProgress();
      numMapRowsRead++;
      allFetchDone = allFetchDone();
    }

    boolean dataInCache = true;
    while (dataInCache) {
      for (byte pos = 0; pos < order.length; pos++) {
        if (this.foundNextKeyGroup[pos]
            && this.nextKeyWritables[pos] != null) {
          promoteNextGroupToCandidate(pos);
        }
      }
      joinOneGroup();
      dataInCache = false;
      for (byte pos = 0; pos < order.length; pos++) {
        if (this.candidateStorage[pos] != null && this.candidateStorage[pos].hasRows()) {
          dataInCache = true;
          break;
        }
      }
    }
  }

  private boolean allFetchDone() {
    boolean allFetchDone = true;
    for (byte pos = 0; pos < order.length; pos++) {
      if (pos == posBigTable) {
        continue;
      }
      allFetchDone = allFetchDone && fetchDone[pos];
    }
    return allFetchDone;
  }

  private List<Byte> joinOneGroup() throws HiveException {
    int[] smallestPos = findSmallestKey();
    List<Byte> listOfNeedFetchNext = null;
    if(smallestPos != null) {
      listOfNeedFetchNext = joinObject(smallestPos);
      if (listOfNeedFetchNext.size() > 0) {
        // listOfNeedFetchNext contains all tables whose candidateStorage rows were just
        // joined; clear their candidate storage, promote their nextGroupStorage to
        // candidateStorage, and fetch data until a new group is reached.
        for (Byte b : listOfNeedFetchNext) {
          fetchNextGroup(b);
        }
      }
    }
    return listOfNeedFetchNext;
  }

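  // smallestPos comes from findSmallestKey(): smallestPos[i] compares table i's current key
  // group against the running minimum. Walking the array backwards, entries <= 0 belong to
  // the smallest key group and join from their candidateStorage; entries > 0 (or tables
  // with no current key) get a dummy/empty row list. A strictly negative entry means all
  // lower-indexed tables have larger keys, so the loop breaks and fills the rest with
  // dummies.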
  private List<Byte> joinObject(int[] smallestPos) throws HiveException {
    List<Byte> needFetchList = new ArrayList<Byte>();
    byte index = (byte) (smallestPos.length - 1);
    for (; index >= 0; index--) {
      if (smallestPos[index] > 0 || keyWritables[index] == null) {
        putDummyOrEmpty(index);
        continue;
      }
      storage[index] = candidateStorage[index];
      needFetchList.add(index);
      if (smallestPos[index] < 0) {
        break;
      }
    }
    for (index--; index >= 0; index--) {
      putDummyOrEmpty(index);
    }
    checkAndGenObject();
    for (Byte pos : needFetchList) {
      this.candidateStorage[pos].clearRows();
      this.keyWritables[pos] = null;
    }
    return needFetchList;
  }

  private void fetchNextGroup(Byte t) throws HiveException {
    if (foundNextKeyGroup[t]) {
      // first promote the next group to be the current group if we reached a
      // new group in the previous fetch
      if (this.nextKeyWritables[t] != null) {
        promoteNextGroupToCandidate(t);
      } else {
        this.keyWritables[t] = null;
        this.candidateStorage[t] = null;
        this.nextGroupStorage[t] = null;
      }
      foundNextKeyGroup[t] = false;
    }
    //for the big table, we only need to promote the next group to the current group.
    if(t == posBigTable) {
      return;
    }

    // for tables other than the big table, we need to fetch more data until we reach a new group or are done.
    while (!foundNextKeyGroup[t]) {
      if (fetchDone[t]) {
        break;
      }
      fetchOneRow(t);
    }
    if (!foundNextKeyGroup[t] && fetchDone[t]) {
      this.nextKeyWritables[t] = null;
    }
  }

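  // Swap the candidate and next-group RowContainers instead of allocating new ones: the
  // rows buffered as the "next" group become the current candidate group, and the cleared
  // old candidate container is reused to buffer the group after that.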
  private void promoteNextGroupToCandidate(Byte t) throws HiveException {
    this.keyWritables[t] = this.nextKeyWritables[t];
    this.nextKeyWritables[t] = null;
    RowContainer<List<Object>> oldRowContainer = this.candidateStorage[t];
    oldRowContainer.clearRows();
    this.candidateStorage[t] = this.nextGroupStorage[t];
    this.nextGroupStorage[t] = oldRowContainer;
  }

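  // Compares two join keys column by column using Hadoop's WritableComparator. A shorter
  // key list sorts first; a null column sorts before a non-null one, and two null columns
  // compare equal only when the join is null-safe for that column. Example with a single
  // int column: [1] < [2], [null] < [1], and [null] == [null] only under a null-safe
  // (<=>) join condition.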
  private int compareKeys (List k1, List k2) {
    int ret = 0;

    // join keys have different sizes?
    ret = k1.size() - k2.size();
    if (ret != 0) {
      return ret;
    }

    for (int i = 0; i < k1.size(); i++) {
      WritableComparable key_1 = (WritableComparable) k1.get(i);
      WritableComparable key_2 = (WritableComparable) k2.get(i);
      if (key_1 == null && key_2 == null) {
        return nullsafes != null && nullsafes[i] ? 0 : -1; // treat k1 as smaller than k2
      } else if (key_1 == null) {
        return -1;
      } else if (key_2 == null) {
        return 1;
      }
      ret = WritableComparator.get(key_1.getClass()).compare(key_1, key_2);
      if(ret != 0) {
        return ret;
      }
    }
    return ret;
  }

  private void putDummyOrEmpty(Byte i) {
    // put an empty list (no outer join) or the dummy row used for outer-join padding
    if (noOuterJoin) {
      storage[i] = emptyList;
    } else {
      storage[i] = dummyObjVectors[i];
    }
  }

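  // Scans the current key of every table and returns, per table, its comparison against the
  // running minimum: -1 marks the first table that has a key (the initial reference),
  // negative values mark a new minimum, 0 a tie and positive values a larger key. Returns
  // null when no table has a current key. joinObject() interprets this array.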
  private int[] findSmallestKey() {
    int[] result = new int[order.length];
    List<Object> smallestOne = null;

    for (byte pos = 0; pos < order.length; pos++) {
      List<Object> key = keyWritables[pos];
      if (key == null) {
        continue;
      }
      if (smallestOne == null) {
        smallestOne = key;
        result[pos] = -1;
        continue;
      }
      result[pos] = compareKeys(key, smallestOne);
      if (result[pos] < 0) {
        smallestOne = key;
      }
    }
    return smallestOne == null ? null : result;
  }

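  // Returns true when the incoming key differs from the current key group of this alias,
  // i.e. a new key group starts; the new key is parked in nextKeyWritables until the group
  // is promoted. Returns false for the very first key and for keys that continue the
  // current group.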
  private boolean processKey(byte alias, List<Object> key)
      throws HiveException {
    List<Object> keyWritable = keyWritables[alias];
    if (keyWritable == null) {
      //the first group.
      keyWritables[alias] = key;
      return false;
    } else {
      int cmp = compareKeys(key, keyWritable);
      if (cmp != 0) {
        nextKeyWritables[alias] = key;
        return true;
      }
      return false;
    }
  }

  private void setUpFetchContexts(String alias, MergeQueue mergeQueue) throws HiveException {
    mergeQueue.clearFetchContext();

    Path currentInputPath = getExecContext().getCurrentInputPath();

    BucketMapJoinContext bucketMatcherCxt = localWork.getBucketMapjoinContext();
    Class<? extends BucketMatcher> bucketMatcherCls = bucketMatcherCxt.getBucketMatcherClass();
    BucketMatcher bucketMatcher = ReflectionUtil.newInstance(bucketMatcherCls, null);

    getExecContext().setFileId(bucketMatcherCxt.createFileId(currentInputPath.toString()));
    if (LOG.isInfoEnabled()) {
      LOG.info("set task id: " + getExecContext().getFileId());
    }

    bucketMatcher.setAliasBucketFileNameMapping(bucketMatcherCxt
        .getAliasBucketFileNameMapping());

    List<Path> aliasFiles = bucketMatcher.getAliasBucketFiles(currentInputPath.toString(),
        bucketMatcherCxt.getMapJoinBigTableAlias(), alias);

    mergeQueue.setupContext(aliasFiles);
  }

  private void fetchOneRow(byte tag) {
    String table = tagToAlias[tag];
    MergeQueue mergeQueue = aliasToMergeQueue.get(table);

    // The operator tree till the sink operator has already been processed while
    // fetching the next row to fetch from the priority queue (possibly containing
    // multiple files in the small table given a file in the big table). Now, process
    // the remaining tree. Look at comments in DummyStoreOperator for additional
    // explanation.
    Operator<? extends OperatorDesc> forwardOp =
        conf.getAliasToSink().get(table).getChildOperators().get(0);
    try {
      InspectableObject row = mergeQueue.getNextRow();
      if (row == null) {
        fetchDone[tag] = true;
        return;
      }
      forwardOp.process(row.o, tag);
      // check if any operator had a fatal error or early exit during
      // execution
      if (forwardOp.getDone()) {
        fetchDone[tag] = true;
      }
    } catch (Throwable e) {
      if (e instanceof OutOfMemoryError) {
        // Don't create a new object if we are already out of memory
        throw (OutOfMemoryError) e;
      } else {
        throw new RuntimeException("Map local work failed", e);
      }
    }
  }

  transient boolean closeCalled = false;
  @Override
  public void closeOp(boolean abort) throws HiveException {
    if(closeCalled) {
      return;
    }
    closeCalled = true;

    // If a sort-merge join is followed by a regular join, this SMBMapJoinOperator may not
    // get initialized at all. Consider the following query:
    // A SMB B JOIN C
    // For the mapper processing C, the SMB join was never initialized, so there is nothing
    // to close.
    if (!initDone) {
      return;
    }


    if (inputFileChanged || !firstFetchHappened) {
      //set up the fetch operator for the new input file.
      for (Map.Entry<String, MergeQueue> entry : aliasToMergeQueue.entrySet()) {
        String alias = entry.getKey();
        MergeQueue mergeQueue = entry.getValue();
        setUpFetchContexts(alias, mergeQueue);
      }
      firstFetchHappened = true;
      for (byte pos = 0; pos < order.length; pos++) {
        if (pos != posBigTable) {
          fetchNextGroup(pos);
        }
      }
      inputFileChanged = false;
    }

    joinFinalLeftData();

    //clean up
    for (int pos = 0; pos < order.length; pos++) {
      if (pos != posBigTable) {
        fetchDone[pos] = false;
      }
      foundNextKeyGroup[pos] = false;
    }

    localWorkInited = false;

    super.closeOp(abort);
    for (Map.Entry<String, MergeQueue> entry : aliasToMergeQueue.entrySet()) {
      String alias = entry.getKey();
      MergeQueue mergeQueue = entry.getValue();
      Operator<? extends OperatorDesc> forwardOp = localWork.getAliasToWork().get(alias);
      forwardOp.close(abort);
      mergeQueue.clearFetchContext();
    }
  }

  @Override
  protected boolean allInitializedParentsAreClosed() {
    return true;
  }

  /**
   * Implements the getName function for the Node Interface.
   *
   * @return the name of the operator
   */
  @Override
  public String getName() {
    return getOperatorName();
  }

  static public String getOperatorName() {
    return "MAPJOIN";
  }

  @Override
  public OperatorType getType() {
    return OperatorType.MAPJOIN;
  }

  public boolean isConvertedAutomaticallySMBJoin() {
    return convertedAutomaticallySMBJoin;
  }

  public void setConvertedAutomaticallySMBJoin(boolean convertedAutomaticallySMBJoin) {
    this.convertedAutomaticallySMBJoin = convertedAutomaticallySMBJoin;
  }

  // returns rows from possibly multiple bucket files of the small table in ascending key
  // order by utilizing a priority queue (borrowed from Hadoop);
  // elements of the queue (Integer) are indexes into FetchOperator[] (segments)
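  //
  // Example with two bucket files of the small table, each sorted on the join key:
  //   segment 0: keys 1, 4, 7      segment 1: keys 2, 4, 9
  // getNextRow() repeatedly returns the row whose key is at the top of the heap,
  // i.e. 1, 2, 4, 4, 7, 9 -- a single sorted stream over all segments.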
  private class MergeQueue extends PriorityQueue<Integer> {

    private final String alias;
    private final FetchWork fetchWork;
    private final JobConf jobConf;

    // for keeping track of the number of elements read. just for debugging
    transient int counter;

    transient FetchOperator[] segments;
    transient List keyFields;
    transient List keyFieldOIs;
    transient Operator forwardOp;
    transient DummyStoreOperator sinkOp;

    // index of the FetchOperator that is currently providing the smallest key
    transient Integer currentMinSegment;
    transient ObjectPair<List<Object>, InspectableObject>[] keys;

    public MergeQueue(String alias, FetchWork fetchWork, JobConf jobConf,
        Operator forwardOp,
        DummyStoreOperator sinkOp) {
      this.alias = alias;
      this.fetchWork = fetchWork;
      this.jobConf = jobConf;
      this.forwardOp = forwardOp;
      this.sinkOp = sinkOp;
    }

    // paths = bucket files of the small table for the current bucket file of the big table
    // initializes a FetchOperator for each file in paths, reusing FetchOperators if possible
    // currently the number of paths is always the same (the bucket count is the same across
    // all partitions of a table), but if hive ever supports a per-partition bucket count,
    // this can vary
    public void setupContext(List<Path> paths) throws HiveException {
      int segmentLen = paths.size();
      FetchOperator.setFetchOperatorContext(jobConf, fetchWork.getPartDir());
      FetchOperator[] segments = segmentsForSize(segmentLen);
      for (int i = 0 ; i < segmentLen; i++) {
        Path path = paths.get(i);
        if (segments[i] == null) {
          segments[i] = new FetchOperator(fetchWork, new JobConf(jobConf));
        }
        segments[i].setupContext(Arrays.asList(path));
      }
      initialize(segmentLen);
      for (int i = 0; i < segmentLen; i++) {
        if (nextHive(i)) {
          put(i);
        }
      }
      counter = 0;
    }

    @SuppressWarnings("unchecked")
    private FetchOperator[] segmentsForSize(int segmentLen) {
      if (segments == null || segments.length < segmentLen) {
        FetchOperator[] newSegments = new FetchOperator[segmentLen];
        ObjectPair<List<Object>, InspectableObject>[] newKeys = new ObjectPair[segmentLen];
        if (segments != null) {
          System.arraycopy(segments, 0, newSegments, 0, segments.length);
          System.arraycopy(keys, 0, newKeys, 0, keys.length);
        }
        segments = newSegments;
        keys = newKeys;
      }
      return segments;
    }

    public void clearFetchContext() throws HiveException {
      if (segments != null) {
        for (FetchOperator op : segments) {
          if (op != null) {
            op.clearFetchContext();
          }
        }
      }
    }

    @Override
    protected boolean lessThan(Object a, Object b) {
      return compareKeys(keys[(Integer) a].getFirst(), keys[(Integer)b].getFirst()) < 0;
    }

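    // Returns rows in ascending key order across all segments: the segment that supplied
    // the previous row is advanced first (adjustPriorityQueue re-heapifies it, or pops it
    // when it is exhausted), then the segment at the top of the heap supplies the next row.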
    public final InspectableObject getNextRow() throws IOException {
      if (currentMinSegment != null) {
        adjustPriorityQueue(currentMinSegment);
      }
      Integer current = top();
      if (current == null) {
        if (LOG.isInfoEnabled()) {
          LOG.info("MergeQueue forwarded " + counter + " rows");
        }
        return null;
      }
      counter++;
      return keys[currentMinSegment = current].getSecond();
    }

    private void adjustPriorityQueue(Integer current) throws IOException {
      if (nextIO(current)) {
        adjustTop();  // sort
      } else {
        pop();
      }
    }

    // wrapping for exception handling
    private boolean nextHive(Integer current) throws HiveException {
      try {
        return next(current);
      } catch (IOException e) {
        throw new HiveException(e);
      }
    }

    // wrapping for exception handling
    private boolean nextIO(Integer current) throws IOException {
      try {
        return next(current);
      } catch (HiveException e) {
        throw new IOException(e);
      }
    }

    // return true if the current min segment (FetchOperator) has a next row
    private boolean next(Integer current) throws IOException, HiveException {
      if (keyFields == null) {
        byte tag = tagForAlias(alias);
        // joinKeys/joinKeysOI are initialized after making merge queue, so setup lazily at runtime
        keyFields = joinKeys[tag];
        keyFieldOIs = joinKeysObjectInspectors[tag];
      }
      InspectableObject nextRow = segments[current].getNextRow();
      while (nextRow != null) {
        sinkOp.reset();
        if (keys[current] == null) {
          keys[current] = new ObjectPair<List<Object>, InspectableObject>();
        }

        // Pass the row through the operator tree. It is guaranteed that no more than one
        // row can be produced from an input row.
        forwardOp.process(nextRow.o, 0);
        nextRow = sinkOp.getResult();

        // It is possible that the row got absorbed in the operator tree.
        if (nextRow.o != null) {
          // todo this should be changed to be evaluated lazily, especially for single segment case
          keys[current].setFirst(JoinUtil.computeKeys(nextRow.o, keyFields, keyFieldOIs));
          keys[current].setSecond(nextRow);
          return true;
        }
        nextRow = segments[current].getNextRow();
      }
      keys[current] = null;
      return false;
    }
  }

  @Override
  public boolean opAllowedConvertMapJoin() {
    return false;
  }
}