org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast.VectorMapJoinFastValueStore Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of hive-exec
There is a newer version: 4.0.0
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMapResult;
import org.apache.hadoop.hive.serde2.WriteBuffers;
import org.apache.hadoop.hive.serde2.WriteBuffers.ByteSegmentRef;
import org.apache.hadoop.hive.serde2.WriteBuffers.Position;;


// Supports random access.

public class VectorMapJoinFastValueStore {

  private static final Log LOG = LogFactory.getLog(VectorMapJoinFastValueStore.class.getName());

  private WriteBuffers writeBuffers;


  /**
   * A store for "lists" of arbitrary length values in memory.
   *
   * The memory is an "infinite" byte array or WriteBuffers object.
   *
   * We give the client a 64-bit (long) value reference to keep that has the offset within
   * the "infinite" byte array of the last value inserted in a "list".
   *
   * We optimize the common case when "list"s are 1 element and values are short and store the
   * value length in the value reference word.
   *
   * We also support keeping a count (up to a limit or cap) to help with join result generation
   * algorithms.
   *
   * If the last value is big, the big length will be encoded as an integer at the beginning
   * of the value followed by the big value bytes.
   *
   * Due to optimizing by keeping the last value's length in the value reference, when we have
   * more than one value, a new value will need to keep the small value length of the next
   * value.
   *
   * So, values after the first value have 4 parts: a relative offset word with flags, an
   * optional next value length if the next value is small, an optional length if the current
   * value is big, and the value bytes.
   *
   * Cases:
   *  1) One element, small:
   *
   *    Value Reference -------------
   *                                 |
   *                                 |
   *                                 v
   *                                 {Small Value Bytes}
   *
   *  2) One element, big:
   *
   *   Value Reference --------------
   *                                 |
   *                                 |
   *                                 v
   *                                 {Big Value Len} {Big Value Bytes}
   *
   *  3) Multiple elements:
   *
   *    Value Reference ----------------
   *                                    |
   *                                    |    //  Last value added.
   *                                    |
   *                                    v
   *                                    {Rel Offset Word} [Next Value Small Len] [Big Value Len] {Value Bytes}
   *                                             |               optional           optional
   *                                             |
   *                                             |
   *                                --- . . . ---
   *                               |
   *                               |      // 0 or more
   *                               |
   *                               v
   *                              {Rel Offset Word} [Next Value Small Len] [Big Value Len] {Value Bytes}
   *                                         |             optional           optional
   *                                         |
   *                                         |
   *                     --------------------
   *                    |
   *                    |
   *                    v
   *                   [Big Value Length] {Value Bytes}
   *                       optional
   *
   *                   // First value added without Relative Offset Word, etc.
   */

  /**
   * Exposes the buffers in which this store keeps all value bytes and chain metadata.
   *
   * @return the WriteBuffers instance backing this store (the same object on every call)
   */
  public WriteBuffers writeBuffers() {
    return this.writeBuffers;
  }

  /**
   * Cursor over the chain of values that a value reference word points at.
   *
   * A reference word (as returned by addFirst / addMore) addresses the most recently added
   * value.  Every value except the first one added is stored by addMore as
   *
   *   {Rel Offset Word} [Next Value Small Len] [Big Value Len] {Value Bytes}
   *
   * where "next" means the value added just before it.  The first value added is stored by
   * addFirst as
   *
   *   [Big Value Len] {Value Bytes}
   *
   * first() / next() walk that chain from the newest value back to the oldest one and hand out
   * the same reusable ByteSegmentRef on every call.
   */
  public static class HashMapResult extends VectorMapJoinHashMapResult {

    private VectorMapJoinFastValueStore valueStore;

    // Decoded from the value reference word by set().
    private boolean hasRows;
    private long valueRefWord;
    private boolean isSingleRow;
    private int cappedCount;

    // Number of values handed out since the last set() / first().
    private int readIndex;
    // True once the final value of the chain has been handed out.
    private boolean isEof;

    // Look-ahead state describing the value that the following read will return.
    private boolean isNextEof;
    private boolean isNextLast;
    long nextAbsoluteValueOffset;
    boolean isNextValueLengthSmall;
    int nextSmallValueLength;

    // Reused for every read so that iterating does not allocate.
    private final ByteSegmentRef byteSegmentRef;
    private final Position readPos;

    public HashMapResult() {
      super();
      // -1 has the invalid bit (bit 63) on; it is the "no value" marker.
      valueRefWord = -1;
      hasRows = false;
      byteSegmentRef = new ByteSegmentRef();
      readPos = new Position();
    }

    /**
     * Points this result at a chain of values and rewinds it to the beginning.
     *
     * @param valueStore the store that produced valueRefWord
     * @param valueRefWord the value reference word returned by addFirst or addMore
     */
    public void set(VectorMapJoinFastValueStore valueStore, long valueRefWord) {
      // LOG.debug("VectorMapJoinFastValueStore set valueRefWord " + Long.toHexString(valueRefWord));

      this.valueStore = valueStore;
      this.valueRefWord = valueRefWord;

      hasRows = true;
      isSingleRow = ((valueRefWord & IsLastFlag.flagOnMask) != 0);
      cappedCount =
          (int) ((valueRefWord & CappedCount.bitMask) >> CappedCount.bitShift);

      // Position to beginning.
      readIndex = 0;
      isEof = false;
    }

    @Override
    public boolean hasRows() {
      return hasRows;
    }

    @Override
    public boolean isSingleRow() {
      return hasRows && isSingleRow;
    }

    @Override
    public boolean isCappedCountAvailable() {
      return true;
    }

    @Override
    public int cappedCount() {
      return hasRows ? cappedCount : 0;
    }

    /**
     * Rewinds to the most recently added value and returns it.
     *
     * @return the newest value, or null when no reference has been set
     */
    @Override
    public ByteSegmentRef first() {
      if (!hasRows) {
        return null;
      }

      // Position to beginning.
      readIndex = 0;
      isEof = false;

      return internalRead();
    }

    /**
     * Returns the value that follows the one returned by the previous first() / next() call.
     *
     * @return the following (older) value, or null when the chain is exhausted or no reference
     *         has been set
     */
    @Override
    public ByteSegmentRef next() {
      if (!hasRows) {
        return null;
      }

      return internalRead();
    }

    /**
     * Reads the value at the current chain position and advances the look-ahead state.
     *
     * The fields of a record are consumed in exactly the order addMore writes them: relative
     * offset word, the next value's small length (if flagged), this value's big length (if this
     * value is big), and then the value bytes.
     *
     * @return the shared ByteSegmentRef positioned on the value bytes, or null at end of chain
     */
    public ByteSegmentRef internalRead() {

      final WriteBuffers buffers = valueStore.writeBuffers;
      final long absoluteValueOffset;
      final int valueLength;

      if (readIndex == 0) {
        /*
         * Extract information from reference word from slot table.
         */
        absoluteValueOffset = (valueRefWord & AbsoluteValueOffset.bitMask);

        // Position before the last written value.
        buffers.setReadPoint(absoluteValueOffset, readPos);

        final int refValueLength =
            (int) ((valueRefWord & SmallValueLength.bitMask) >> SmallValueLength.bitShift);
        final boolean isValueLengthSmall = (refValueLength != SmallValueLength.allBitsOn);

        if (isSingleRow) {
          // The only value was written by addFirst: no relative offset word and no
          // next-value information precede it, so the look-ahead fields are not consulted.
          isNextEof = true;
        } else {
          isNextEof = false;

          // 2nd and beyond records have a relative offset word at the beginning.
          readNextValueInfo(buffers, absoluteValueOffset);
        }

        // The big length, when present, is the last field before the value bytes and must be
        // read exactly once.
        valueLength = isValueLengthSmall ? refValueLength : buffers.readVInt(readPos);

      } else {
        if (isNextEof) {
          isEof = true;
          return null;
        }

        absoluteValueOffset = nextAbsoluteValueOffset;

        // Position before the value we are about to return.
        buffers.setReadPoint(absoluteValueOffset, readPos);

        // What the previous record told us about this value.  Capture it before the look-ahead
        // fields are overwritten with information about the value after this one.
        final boolean isValueLengthSmall = isNextValueLengthSmall;
        final int smallValueLength = nextSmallValueLength;

        if (isNextLast) {
          // This is the value written by addFirst; nothing follows it.
          isNextEof = true;
        } else {
          isNextEof = false;

          // 2nd and beyond records have a relative offset word at the beginning.
          readNextValueInfo(buffers, absoluteValueOffset);
        }

        valueLength = isValueLengthSmall ? smallValueLength : buffers.readVInt(readPos);
      }

      // Our reading is positioned to the value.
      buffers.getByteSegmentRefToCurrent(byteSegmentRef, valueLength, readPos);

      readIndex++;
      isEof = isNextEof;
      return byteSegmentRef;
    }

    /**
     * Reads the relative offset word at readPos, and the next value's small length when the
     * word flags one, storing the result in the look-ahead fields.
     *
     * @param buffers the buffers being read
     * @param absoluteValueOffset absolute offset of the record readPos is currently inside
     */
    private void readNextValueInfo(WriteBuffers buffers, long absoluteValueOffset) {
      final long relativeOffsetWord = buffers.readVLong(readPos);

      final long relativeOffset =
          (relativeOffsetWord & NextRelativeValueOffset.bitMask) >> NextRelativeValueOffset.bitShift;

      // Offsets are relative *backwards*: the next value was written earlier.
      nextAbsoluteValueOffset = absoluteValueOffset - relativeOffset;

      isNextLast = ((relativeOffsetWord & IsNextValueLastFlag.flagOnMask) != 0);
      isNextValueLengthSmall =
          ((relativeOffsetWord & IsNextValueLengthSmallFlag.flagOnMask) != 0);

      // addMore stores the next value's small length right after the relative offset word.
      nextSmallValueLength = isNextValueLengthSmall ? buffers.readVInt(readPos) : -1;
    }

    /**
     * Reports whether the chain has been fully read.
     *
     * NOTE(review): the base class contract for isEof() is not visible from this file.  This
     * implementation returns true when no reference is set, and otherwise once the final value
     * of the chain has been returned by first() / next() -- confirm that matches the contract.
     */
    @Override
    public boolean isEof() {
      if (!hasRows) {
        return true;
      }
      return isEof;
    }

    // Clears nothing in this class.
    // NOTE(review): confirm whether hasRows should be reset here or super.forget() called.
    @Override
    public void forget() {
    }

    @Override
    public String toString() {
      StringBuilder sb = new StringBuilder();
      sb.append("(").append(super.toString()).append(", ");
      sb.append("cappedCount ").append(cappedCount()).append(")");
      return sb.toString();
    }
  }

  /**
   * Bit-length fields within a 64-bit (long) value reference.
   *
   * Lowest field: The absolute byte offset of the value in the WriteBuffers.
   *
   * 2nd field: For short values, the length of the value.  Otherwise, a special constant
   * indicating a big value whose length is stored with the value.
   *
   * 3rd field: A value count, up to a limit (a cap).  Having a count helps the join result
   * algorithms determine which optimization to use for M x N result cross products.
   * A special constant indicates if the value count is >= the cap.
   *
   * Last field: a bit indicating whether there is only one value.
   */

  // Lowest field.
  // Constants for the absolute byte offset stored in the low bits of a value reference word.
  private final class AbsoluteValueOffset {
    // Number of low-order bits of the reference word that hold the offset.
    private static final int bitLength = 40;
    private static final long allBitsOn = (1L << bitLength) - 1;
    // The field starts at bit 0, so the mask needs no shift.
    private static final long bitMask = allBitsOn;

    // Make it a power of 2.
    // Passed to the WriteBuffers constructor by the VectorMapJoinFastValueStore constructor.
    private static final long maxSize = 1L << (bitLength - 2);
  }

  // 2nd field: the length of a small value, or the all-bits-on marker meaning the value is
  // "big" and its length is stored as a VInt in front of the value bytes.
  private final class SmallValueLength {
    private static final int bitLength = 10;
    // Doubles as the "big value" marker, so it is never a valid small length.
    private static final int allBitsOn = (1 << bitLength) - 1;
    // Values with length < threshold are treated as small by addFirst and addMore.
    private static final int threshold = allBitsOn;  // Lower this for big value testing.
    // The field sits directly above the absolute value offset field.
    private static final int bitShift = AbsoluteValueOffset.bitLength;
    private static final long bitMask = ((long) allBitsOn) << bitShift;
    // The "big value" marker already shifted into field position (same bits as bitMask).
    private static final long allBitsOnBitShifted = ((long) allBitsOn) << bitShift;
  }

  // 3rd field: the number of values in the chain, which stops increasing once it reaches limit.
  private final class CappedCount {
    private static final int bitLength = 10;
    private static final int allBitsOn = (1 << bitLength) - 1;
    // addMore only increments the count while it is below this value.
    private static final int limit = allBitsOn;
    // The field sits directly above the small value length field.
    private static final int bitShift =  SmallValueLength.bitShift + SmallValueLength.bitLength;
    private static final long bitMask = ((long) allBitsOn) << bitShift;
  }

  // Last field: a single bit that addFirst turns on and addMore leaves off.  A reference word
  // with this bit on addresses the first value added, which has no value after it in the chain.
  private final class IsLastFlag {
    // The bit sits directly above the capped count field.
    private static final int bitShift = CappedCount.bitShift + CappedCount.bitLength;;
    private static final long flagOnMask = 1L << bitShift;
  }

  // This bit should not be on for valid value references.  We use -1 for a no value marker.
  // addMore throws when it is handed a reference word with this bit on.
  private final class IsInvalidFlag {
    // The sign bit of the 64-bit reference word.
    private static final int bitShift = 63;
    private static final long flagOnMask = 1L << bitShift;
  }

  /**
   * Relative Offset Word stored at the beginning of every value except the first one added
   * (which is the last value in the chain).  It holds a relative offset and 2 flags.
   *
   * We put the flags at the low end of the word so the variable length integer will
   * encode smaller.
   *
   * First bit is a flag indicating if the next value (not the current value) has a small length.
   * When the first value is added and it has a small length, that length is stored in the
   * value reference and not with the value.  So, when we have multiple values, we need a way to
   * know to keep the next value's small length with the current value.
   *
   * Second bit is a flag indicating if the next value (not the current value) is the last value.
   *
   * The remaining bits are the relative offset *backwards* to the next value.
   */

  // Lowest bit of the relative offset word: on when the next value's length is small, in which
  // case addMore writes that length right after the relative offset word.
  private final class IsNextValueLengthSmallFlag {
    private static final int bitLength = 1;
    // Bit 0, so no shift is needed.
    private static final long flagOnMask = 1L;
  }

  // Second bit of the relative offset word: on when the next value is the one written by
  // addFirst, i.e. the end of the chain.
  private final class IsNextValueLastFlag {
    private static final int bitLength = 1;
    // Sits directly above the "next value length is small" flag.
    private static final int bitShift = IsNextValueLengthSmallFlag.bitLength;
    private static final long flagOnMask = 1L << bitShift;
  }

  // Remaining bits of the relative offset word: how many bytes *before* the current record the
  // next value's record starts (addMore computes it as new offset minus old offset).
  private final class NextRelativeValueOffset {
    // Same width as the absolute value offset field of the value reference word.
    private static final int bitLength = 40;
    private static final long allBitsOn = (1L << bitLength) - 1;
    // The offset sits directly above the two flag bits.
    private static final int bitShift = IsNextValueLastFlag.bitShift + IsNextValueLastFlag.bitLength;
    private static final long bitMask = allBitsOn << bitShift;
  }

  /**
   * Stores the first value of a new chain and returns the value reference word for it.
   *
   * The first value carries none of the chain metadata (relative offset word, next value
   * length, flags).  A small value is stored as its bytes only, with the length kept in the
   * returned reference word; an empty small value writes nothing at all.  A big value is stored
   * as a VInt length followed by its bytes, and the reference word carries the "big" marker.
   *
   * @param valueBytes array holding the value
   * @param valueStart index of the first value byte within valueBytes
   * @param valueLength number of value bytes
   * @return the reference word: "is last" flag on, a count of 1, the length field and the
   *         absolute offset at which the value was written
   */
  public long addFirst(byte[] valueBytes, int valueStart, int valueLength) {

    // Whatever gets written below starts here; for an empty value this is simply the current
    // write point, which keeps a later relative offset small.
    final long startOffset = writeBuffers.getWritePoint();

    final long lengthField;
    if (valueLength >= SmallValueLength.threshold) {

      // Big case: the length precedes the bytes and the reference word only says "big".
      writeBuffers.writeVInt(valueLength);
      writeBuffers.write(valueBytes, valueStart, valueLength);
      lengthField = SmallValueLength.allBitsOnBitShifted;
    } else {

      // Small case: only the bytes are written; the caller's reference word keeps the length.
      if (valueLength != 0) {
        writeBuffers.write(valueBytes, valueStart, valueLength);
      }
      lengthField = ((long) valueLength) << SmallValueLength.bitShift;
    }

    // LOG.debug("VectorMapJoinFastValueStore addFirst valueLength " + valueLength + " newAbsoluteOffset " + startOffset);

    final long countOfOne = ((long) 1 << CappedCount.bitShift);

    // The lower bits are the absolute value offset.
    return IsLastFlag.flagOnMask | countOfOne | lengthField | startOffset;
  }

  /**
   * Adds another value in front of an existing chain and returns the new value reference word.
   *
   * The record is written as: relative offset word (VLong), the old value's small length
   * (VInt, only when the old value is small), the new value's length (VInt, only when the new
   * value is big), and finally the new value's bytes.
   *
   * @param oldValueRef reference word currently addressing the chain
   * @param valueBytes array holding the new value
   * @param valueStart index of the first value byte within valueBytes
   * @param valueLength number of value bytes
   * @return the reference word addressing the new value; its "is last" flag is off
   * @throws RuntimeException if oldValueRef has the invalid bit on
   */
  public long addMore(long oldValueRef, byte[] valueBytes, int valueStart, int valueLength) {

    if ((oldValueRef & IsInvalidFlag.flagOnMask) != 0) {
      throw new RuntimeException("Invalid optimized hash table reference");
    }

    /*
     * Take the old reference word apart.  The old value becomes the "next" value of the record
     * we are about to write.
     */
    final long prevOffset = (oldValueRef & AbsoluteValueOffset.bitMask);
    final int prevSmallLength =
        (int) ((oldValueRef & SmallValueLength.bitMask) >> SmallValueLength.bitShift);
    final boolean prevIsSmall = (prevSmallLength != SmallValueLength.allBitsOn);
    final boolean prevIsLast = ((oldValueRef & IsLastFlag.flagOnMask) != 0);
    final int prevCount =
        (int) ((oldValueRef & CappedCount.bitMask) >> CappedCount.bitShift);

    // LOG.debug("VectorMapJoinFastValueStore addMore isOldValueLast " + prevIsLast + " oldSmallValueLength " + prevSmallLength + " oldAbsoluteValueOffset " + prevOffset + " oldValueRef " + Long.toHexString(oldValueRef));

    final long recordOffset = writeBuffers.getWritePoint();

    /*
     * Link word: backwards distance to the old value plus the two flags describing it.
     */
    long linkWord = (recordOffset - prevOffset) << NextRelativeValueOffset.bitShift;
    if (prevIsSmall) {
      linkWord |= IsNextValueLengthSmallFlag.flagOnMask;
    }
    if (prevIsLast) {
      linkWord |= IsNextValueLastFlag.flagOnMask;
    }
    writeBuffers.writeVLong(linkWord);

    // A small length lived only in the old reference word, which the caller is about to
    // replace, so it has to be preserved inside this record.
    if (prevIsSmall) {
      writeBuffers.writeVInt(prevSmallLength);
    }

    /*
     * Everything about the old value is recorded; now the new value itself.
     */
    final int count = (prevCount < CappedCount.limit) ? prevCount + 1 : prevCount;

    // The lower bits are the absolute value offset.
    long newValueRef = (((long) count) << CappedCount.bitShift) | recordOffset;

    if (valueLength < SmallValueLength.threshold) {
      // Caller must remember small value length.
      newValueRef |= ((long) valueLength) << SmallValueLength.bitShift;
    } else {
      // Magic length marks the value as big; its real length goes in front of the bytes.
      newValueRef |= ((long) SmallValueLength.allBitsOn << SmallValueLength.bitShift);
      writeBuffers.writeVInt(valueLength);
    }
    writeBuffers.write(valueBytes, valueStart, valueLength);

    // LOG.debug("VectorMapJoinFastValueStore addMore valueLength " + valueLength + " newAbsoluteOffset " + recordOffset + " newValueRef " + Long.toHexString(newValueRef));

    return newValueRef;
  }

  /**
   * Creates an empty value store backed by a new WriteBuffers.
   *
   * @param writeBuffersSize handed to the WriteBuffers constructor as its first argument; the
   *        second argument is always AbsoluteValueOffset.maxSize
   */
  public VectorMapJoinFastValueStore(int writeBuffersSize) {
    this.writeBuffers = new WriteBuffers(writeBuffersSize, AbsoluteValueOffset.maxSize);
  }
}