/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.io;
import com.google.common.base.Preconditions;
/**
* This class makes sense of {@link RecordIdentifier#getBucketProperty()}. Up until ASF Hive 3.0 this
* field was simply the bucket ID. Since 3.0 it bit-packs several things:
* top 3 bits - version describing the format (so at most 8 versions are possible).
* The rest is version specific - see below.
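*
* <p>An illustrative round trip (values chosen for illustration only):
* <pre>{@code
* int bucketProperty = (1 << 29) | (1 << 16) | 2;  // V1 encoding of bucket 1, statement 2
* BucketCodec codec = BucketCodec.determineVersion(bucketProperty);  // V1
* int bucketId = codec.decodeWriterId(bucketProperty);               // 1
* int stmtId = codec.decodeStatementId(bucketProperty);              // 2
* }</pre>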
*/
public enum BucketCodec {
/**
* This is the "legacy" version. The whole {@code bucket} value just has the bucket ID in it.
* The numeric code for this version is 0. (This assumes the bucket ID takes fewer than 29 bits,
* which implies the top 3 bits are 000, so data written before Hive 3.0 is readable with this scheme.)
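*
* <p>For example (illustrative value): {@code V0.decodeWriterId(7) == 7} and
* {@code V0.decodeStatementId(7) == 0}.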
*/
V0(0) {
@Override
public int decodeWriterId(int bucketProperty) {
return bucketProperty;
}
@Override
public int decodeStatementId(int bucketProperty) {
return 0;
}
@Override
public int encode(AcidOutputFormat.Options options) {
return options.getBucketId();
}
},
/**
* Represents the format of the "bucket" property in Hive 3.0.
* top 3 bits - version code.
* next 1 bit - reserved for future
* next 12 bits - the bucket ID
* next 4 bits - reserved for future
* remaining 12 bits - the statement ID - 0-based numbering of all statements within a
* transaction. Each leg of a multi-insert statement gets a separate statement ID.
* The reserved bits align the fields so that the value is easier to interpret in hex.
*
* Constructs like MERGE and multi-insert may have multiple tasks writing data that belongs to
* the same physical bucket file: for example, a MERGE statement with update and insert clauses
* (and split update enabled, which should be the default in 3.0). A task on behalf of the insert
* clause may be writing a row into bucket 0 while another task in the update branch writes an
* insert event into bucket 0. Each of these tasks writes to a different delta directory,
* distinguished by statement ID. By including both the bucket ID and the statement ID in
* {@link RecordIdentifier} we ensure that {@link RecordIdentifier} is unique.
*
* The intent is that sorting rows by {@link RecordIdentifier} groups rows in the same physical
* bucket next to each other.
* For any row created by a given version of Hive, the top 3 bits are constant. The next
* most significant bits are the bucket ID, then the statement ID. This ensures that
* {@link org.apache.hadoop.hive.ql.optimizer.SortedDynPartitionOptimizer} works: it is
* designed so that each task only needs to keep one writer open at a time. A single writer
* may be configured to see data for multiple buckets, so it must group data by bucket ID
* (and then sort within each bucket as required). This grouping is achieved by sorting on
* {@link RecordIdentifier}, which includes {@link RecordIdentifier#getBucketProperty()},
* which has the actual bucket ID in its high-order bits. This scheme also ensures that
* {@link org.apache.hadoop.hive.ql.exec.FileSinkOperator#process(Object, int)} works when
* numBuckets > numReducers. (The latter could be fixed by changing how writers are
* initialized in "if (fpaths.acidLastBucket != bucketNum) {".)
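*
* <p>For example (illustrative values), bucket ID 1 with statement ID 2 encodes, reading
* left to right (version, reserved, bucket ID, reserved, statement ID), as:
* <pre>{@code
* 001 0 000000000001 0000 000000000010  ==  0x20010002
* }</pre>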
*/
V1(1) {
@Override
public int decodeWriterId(int bucketProperty) {
return (bucketProperty & 0b0000_1111_1111_1111_0000_0000_0000_0000) >>> 16;
}
@Override
public int decodeStatementId(int bucketProperty) {
return (bucketProperty & 0b0000_0000_0000_0000_0000_1111_1111_1111);
}
@Override
public int encode(AcidOutputFormat.Options options) {
int statementId = Math.max(0, options.getStatementId());
int bucketId = options.getBucketId();
int maxStatementId = options.getMaxStmtId();
if (maxStatementId < 0) { // uninitialized, use the default
maxStatementId = MAX_STATEMENT_ID;
} else if (maxStatementId == 0) { // single-statement txn: IDs start from zero, so set this to one to make the logic below work
maxStatementId = 1;
}
Preconditions.checkArgument(maxStatementId >= 0 && maxStatementId <= MAX_STATEMENT_ID,
"Max Statement ID out of range: " + maxStatementId);
Preconditions.checkArgument(statementId >= 0 && statementId <= MAX_STATEMENT_ID,
"Statement ID out of range: " + statementId);
Preconditions.checkArgument(bucketId >= 0, "Bucket ID out of range: " + bucketId);
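// If the bucket ID needs more than NUM_BUCKET_ID_BITS bits, borrow the otherwise unused
// high bits of the statement ID field. Illustrative numbers: with maxStatementId = 1
// (1 bit needed, 11 bits spare) and bucketId = 5000 (13 bits), the overflowing high bit
// is shifted into the statement ID field while the low 12 bits stay in the bucket field.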
if (bucketId > MAX_BUCKET_ID) {
int extraBits = NUM_STATEMENT_ID_BITS - (32 - Integer.numberOfLeadingZeros(maxStatementId)); // statement ID bits not needed for maxStatementId
int overflowedParts = bucketId >>> NUM_BUCKET_ID_BITS; // the part that doesn't fit into 12 bits; moved into the statement ID field
int maxBucketId = (1 << (NUM_BUCKET_ID_BITS + extraBits)) - 1; // the max we can handle using 12 bits plus the extra bits borrowed from the statement ID
Preconditions.checkArgument(bucketId >= 0 && bucketId <= maxBucketId, "Bucket ID out of range: " + bucketId + " max: " + maxBucketId);
statementId = (overflowedParts << (NUM_STATEMENT_ID_BITS - extraBits)) | statementId;
bucketId = bucketId & MAX_BUCKET_ID;
}
return this.version << (1 + NUM_BUCKET_ID_BITS + 4 + NUM_STATEMENT_ID_BITS)
| bucketId << (4 + NUM_STATEMENT_ID_BITS) | Math.max(0, statementId);
}
};
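// V1 bit layout (32 bits): VVV R BBBBBBBBBBBB RRRR SSSSSSSSSSSS
// V = version, R = reserved, B = bucket ID, S = statement ID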
private static final int TOP3BITS_MASK = 0b1110_0000_0000_0000_0000_0000_0000_0000;
private static final int NUM_VERSION_BITS = 3;
private static final int NUM_BUCKET_ID_BITS = 12;
private static final int NUM_STATEMENT_ID_BITS = 12;
public static final int MAX_VERSION = (1 << NUM_VERSION_BITS) - 1;
public static final int MAX_BUCKET_ID = (1 << NUM_BUCKET_ID_BITS) - 1;
public static final int MAX_STATEMENT_ID = (1 << NUM_STATEMENT_ID_BITS) - 1;
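/**
* Reads the top 3 bits of {@code bucket} and returns the codec they name.
* For example (illustrative value), {@code determineVersion(0x20010002)} returns {@link #V1}.
*/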
public static BucketCodec determineVersion(int bucket) {
try {
// look at top 3 bits and return appropriate enum
return getCodec((BucketCodec.TOP3BITS_MASK & bucket) >>> 29);
} catch (IllegalArgumentException iae) {
throw new IllegalArgumentException("Cannot decode version from bucket number: " + Integer.toHexString(bucket),
iae);
}
}
public static BucketCodec getCodec(int version) {
switch (version) {
case 0:
return BucketCodec.V0;
case 1:
return BucketCodec.V1;
default:
throw new IllegalArgumentException("Illegal 'bucket' format. Version=" + version);
}
}
final int version;
BucketCodec(int version) {
Preconditions.checkPositionIndex(version, MAX_VERSION, "Version out of range: " + version);
this.version = version;
}
/**
* For bucketed tables this is the bucket ID; otherwise it is the writer ID.
*/
public abstract int decodeWriterId(int bucketProperty);
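/**
* Extracts the statement ID from {@code bucketProperty}; always 0 for {@link #V0}.
*/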
public abstract int decodeStatementId(int bucketProperty);
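/**
* Packs the fields of {@code options} (for {@link #V1}, the bucket ID and statement ID)
* into a single int suitable for {@link RecordIdentifier#getBucketProperty()}.
*/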
public abstract int encode(AcidOutputFormat.Options options);
public int getVersion() {
return version;
}
}