/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.spark.functions;
import java.math.BigDecimal;
import java.nio.ByteBuffer;
import java.util.Set;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet;
import org.apache.iceberg.util.BucketUtil;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.connector.catalog.functions.BoundFunction;
import org.apache.spark.sql.connector.catalog.functions.ScalarFunction;
import org.apache.spark.sql.connector.catalog.functions.UnboundFunction;
import org.apache.spark.sql.types.BinaryType;
import org.apache.spark.sql.types.ByteType;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.DateType;
import org.apache.spark.sql.types.Decimal;
import org.apache.spark.sql.types.DecimalType;
import org.apache.spark.sql.types.IntegerType;
import org.apache.spark.sql.types.LongType;
import org.apache.spark.sql.types.ShortType;
import org.apache.spark.sql.types.StringType;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.sql.types.TimestampType;
import org.apache.spark.unsafe.types.UTF8String;
/**
* A Spark function implementation for the Iceberg bucket transform.
*
* <p>Example usage: {@code SELECT system.bucket(128, 'abc')}, which returns the bucket 122.
*
* <p>Note that for performance reasons, the given input number of buckets is not validated in the
* implementations used in code-gen. The number of buckets must be positive to give meaningful
* results.
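*
* <p>The static {@code invoke} methods below are the entry points used by generated code and can
* also be called directly, e.g. {@code BucketString.invoke(128, UTF8String.fromString("abc"))}
* returns the same bucket value, 122.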
*/
public class BucketFunction implements UnboundFunction {
private static final int NUM_BUCKETS_ORDINAL = 0;
private static final int VALUE_ORDINAL = 1;
private static final Set<DataType> SUPPORTED_NUM_BUCKETS_TYPES =
ImmutableSet.of(DataTypes.ByteType, DataTypes.ShortType, DataTypes.IntegerType);
@Override
public BoundFunction bind(StructType inputType) {
if (inputType.size() != 2) {
throw new UnsupportedOperationException(
"Wrong number of inputs (expected numBuckets and value)");
}
StructField numBucketsField = inputType.fields()[NUM_BUCKETS_ORDINAL];
StructField valueField = inputType.fields()[VALUE_ORDINAL];
if (!SUPPORTED_NUM_BUCKETS_TYPES.contains(numBucketsField.dataType())) {
throw new UnsupportedOperationException(
"Expected number of buckets to be tinyint, shortint or int");
}
DataType type = valueField.dataType();
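// Dates reuse the int implementation directly: Spark's InternalRow stores DateType as an int
// count of days since the Unix epoch, which is the value the Iceberg spec hashes for date bucketing.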
if (type instanceof DateType) {
return new BucketInt(type);
} else if (type instanceof ByteType
|| type instanceof ShortType
|| type instanceof IntegerType) {
return new BucketInt(DataTypes.IntegerType);
} else if (type instanceof LongType) {
return new BucketLong(type);
} else if (type instanceof TimestampType) {
return new BucketLong(type);
} else if (type instanceof DecimalType) {
return new BucketDecimal(type);
} else if (type instanceof StringType) {
return new BucketString();
} else if (type instanceof BinaryType) {
return new BucketBinary();
} else {
throw new UnsupportedOperationException(
"Expected column to be date, tinyint, smallint, int, bigint, decimal, timestamp, string, or binary");
}
}
@Override
public String description() {
return name()
+ "(numBuckets, col) - Call Iceberg's bucket transform\n"
+ " numBuckets :: number of buckets to divide the rows into, e.g. bucket(100, 34) -> 79 (must be a tinyint, smallint, or int)\n"
+ " col :: column to bucket (must be a date, integer, long, timestamp, decimal, string, or binary)";
}
@Override
public String name() {
return "bucket";
}
public abstract static class BucketBase implements ScalarFunction<Integer> {
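// Maps a 32-bit hash to a bucket ordinal: mask off the sign bit so the value is non-negative,
// then reduce it modulo numBuckets, giving a result in [0, numBuckets) for a positive bucket count.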
public static int apply(int numBuckets, int hashedValue) {
return (hashedValue & Integer.MAX_VALUE) % numBuckets;
}
@Override
public String name() {
return "bucket";
}
@Override
public DataType resultType() {
return DataTypes.IntegerType;
}
}
// Used for both int and date - tinyint and smallint are upcast to int by Spark.
public static class BucketInt extends BucketBase {
private final DataType sqlType;
// magic method used in codegen
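// (Spark's ScalarFunction contract looks for a method named "invoke" whose parameter types
// match inputTypes() and calls it from generated code; produceResult is the interpreted fallback.)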
public static int invoke(int numBuckets, int value) {
return apply(numBuckets, hash(value));
}
// Visible for testing
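// Delegates to BucketUtil, which applies the 32-bit Murmur3 hash required by the Iceberg spec.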
public static int hash(int value) {
return BucketUtil.hash(value);
}
public BucketInt(DataType sqlType) {
this.sqlType = sqlType;
}
@Override
public DataType[] inputTypes() {
return new DataType[] {DataTypes.IntegerType, sqlType};
}
@Override
public String canonicalName() {
return String.format("iceberg.bucket(%s)", sqlType.catalogString());
}
@Override
public Integer produceResult(InternalRow input) {
// return null for null input to match what Spark does in the code-generated versions.
if (input.isNullAt(NUM_BUCKETS_ORDINAL) || input.isNullAt(VALUE_ORDINAL)) {
return null;
} else {
return invoke(input.getInt(NUM_BUCKETS_ORDINAL), input.getInt(VALUE_ORDINAL));
}
}
}
// Used for both BigInt and Timestamp
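// Spark stores TimestampType as a long count of microseconds since the epoch, which is the same
// long value Iceberg hashes for timestamps, so one long-based implementation covers both types.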
public static class BucketLong extends BucketBase {
private final DataType sqlType;
// magic function for usage with codegen - needs to be static
public static int invoke(int numBuckets, long value) {
return apply(numBuckets, hash(value));
}
// Visible for testing
public static int hash(long value) {
return BucketUtil.hash(value);
}
public BucketLong(DataType sqlType) {
this.sqlType = sqlType;
}
@Override
public DataType[] inputTypes() {
return new DataType[] {DataTypes.IntegerType, sqlType};
}
@Override
public String canonicalName() {
return String.format("iceberg.bucket(%s)", sqlType.catalogString());
}
@Override
public Integer produceResult(InternalRow input) {
if (input.isNullAt(NUM_BUCKETS_ORDINAL) || input.isNullAt(VALUE_ORDINAL)) {
return null;
} else {
return invoke(input.getInt(NUM_BUCKETS_ORDINAL), input.getLong(VALUE_ORDINAL));
}
}
}
public static class BucketString extends BucketBase {
// magic function for usage with codegen
public static Integer invoke(int numBuckets, UTF8String value) {
if (value == null) {
return null;
}
// TODO - We can probably hash the bytes directly given they're already UTF-8 input.
return apply(numBuckets, hash(value.toString()));
}
// Visible for testing
public static int hash(String value) {
return BucketUtil.hash(value);
}
@Override
public DataType[] inputTypes() {
return new DataType[] {DataTypes.IntegerType, DataTypes.StringType};
}
@Override
public String canonicalName() {
return "iceberg.bucket(string)";
}
@Override
public Integer produceResult(InternalRow input) {
if (input.isNullAt(NUM_BUCKETS_ORDINAL) || input.isNullAt(VALUE_ORDINAL)) {
return null;
} else {
return invoke(input.getInt(NUM_BUCKETS_ORDINAL), input.getUTF8String(VALUE_ORDINAL));
}
}
}
public static class BucketBinary extends BucketBase {
public static Integer invoke(int numBuckets, byte[] value) {
if (value == null) {
return null;
}
return apply(numBuckets, hash(ByteBuffer.wrap(value)));
}
// Visible for testing
public static int hash(ByteBuffer value) {
return BucketUtil.hash(value);
}
@Override
public DataType[] inputTypes() {
return new DataType[] {DataTypes.IntegerType, DataTypes.BinaryType};
}
@Override
public Integer produceResult(InternalRow input) {
if (input.isNullAt(NUM_BUCKETS_ORDINAL) || input.isNullAt(VALUE_ORDINAL)) {
return null;
} else {
return invoke(input.getInt(NUM_BUCKETS_ORDINAL), input.getBinary(VALUE_ORDINAL));
}
}
@Override
public String canonicalName() {
return "iceberg.bucket(binary)";
}
}
public static class BucketDecimal extends BucketBase {
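// The bound type's precision and scale are captured up front because InternalRow#getDecimal
// needs them to read the Decimal value back in produceResult.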
private final DataType sqlType;
private final int precision;
private final int scale;
// magic method used in codegen
public static Integer invoke(int numBuckets, Decimal value) {
if (value == null) {
return null;
}
return apply(numBuckets, hash(value.toJavaBigDecimal()));
}
// Visible for testing
public static int hash(BigDecimal value) {
return BucketUtil.hash(value);
}
public BucketDecimal(DataType sqlType) {
this.sqlType = sqlType;
this.precision = ((DecimalType) sqlType).precision();
this.scale = ((DecimalType) sqlType).scale();
}
@Override
public DataType[] inputTypes() {
return new DataType[] {DataTypes.IntegerType, sqlType};
}
@Override
public Integer produceResult(InternalRow input) {
if (input.isNullAt(NUM_BUCKETS_ORDINAL) || input.isNullAt(VALUE_ORDINAL)) {
return null;
} else {
int numBuckets = input.getInt(NUM_BUCKETS_ORDINAL);
Decimal value = input.getDecimal(VALUE_ORDINAL, precision, scale);
return invoke(numBuckets, value);
}
}
@Override
public String canonicalName() {
return "iceberg.bucket(decimal)";
}
}
}