org.apache.iceberg.spark.actions.SparkZOrderDataRewriter Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of iceberg-spark-3.3_2.13 Show documentation
A table format for huge analytic datasets
The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.iceberg.spark.actions;

import static org.apache.spark.sql.functions.array;

import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.NullOrder;
import org.apache.iceberg.Schema;
import org.apache.iceberg.SortDirection;
import org.apache.iceberg.SortOrder;
import org.apache.iceberg.Table;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.spark.SparkUtil;
import org.apache.iceberg.types.Types;
import org.apache.iceberg.util.PropertyUtil;
import org.apache.iceberg.util.ZOrderByteUtils;
import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Shuffling data rewriter that sorts rows by a Z-order value computed from a set of columns.
 *
 * <p>A temporary binary column ({@code ICEZVALUE}) is added to the incoming data by interleaving
 * the bytes of the lexicographically sortable representations of the configured columns. The data
 * is sorted by that column and the column is dropped again before the result is returned.
 */
class SparkZOrderDataRewriter extends SparkShufflingDataRewriter {

  private static final Logger LOG = LoggerFactory.getLogger(SparkZOrderDataRewriter.class);

  /** Name of the temporary column holding the interleaved Z-order value. */
  private static final String Z_COLUMN = "ICEZVALUE";

  /** Single-column schema describing the temporary Z-order column. */
  private static final Schema Z_SCHEMA =
      new Schema(Types.NestedField.required(0, Z_COLUMN, Types.BinaryType.get()));

  /** Sort order over the temporary Z-order column: ascending, nulls last. */
  private static final SortOrder Z_SORT_ORDER =
      SortOrder.builderFor(Z_SCHEMA)
          .sortBy(Z_COLUMN, SortDirection.ASC, NullOrder.NULLS_LAST)
          .build();

  /**
   * Controls the amount of bytes interleaved in the ZOrder algorithm. Default is all bytes being
   * interleaved.
   */
  public static final String MAX_OUTPUT_SIZE = "max-output-size";

  public static final int MAX_OUTPUT_SIZE_DEFAULT = Integer.MAX_VALUE;

  /**
   * Controls the number of bytes considered from an input column of a type with variable length
   * (String, Binary).
   *
   * <p>Default is to use the same size as primitives {@link ZOrderByteUtils#PRIMITIVE_BUFFER_SIZE}.
   */
  public static final String VAR_LENGTH_CONTRIBUTION = "var-length-contribution";

  public static final int VAR_LENGTH_CONTRIBUTION_DEFAULT = ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE;

  /** Validated column names that contribute to the Z-order value. */
  private final List<String> zOrderColNames;

  // Both values are assigned in init(Map) and read when the Z-order column is built.
  private int maxOutputSize;
  private int varLengthContribution;

  /**
   * Creates a rewriter that Z-orders by the given columns.
   *
   * @param spark the Spark session, also consulted for case sensitivity of column resolution
   * @param table the table being rewritten
   * @param zOrderColNames the requested Z-order columns; must be non-null and non-empty
   * @throws IllegalArgumentException if no columns are given, a column cannot be found in the table
   *     schema, or every given column is filtered out as an identity partition column
   */
  SparkZOrderDataRewriter(SparkSession spark, Table table, List<String> zOrderColNames) {
    super(spark, table);
    this.zOrderColNames = validZOrderColNames(spark, table, zOrderColNames);
  }

  @Override
  public String description() {
    return "Z-ORDER";
  }

  /**
   * Returns the options accepted by the parent rewriter plus {@link #MAX_OUTPUT_SIZE} and {@link
   * #VAR_LENGTH_CONTRIBUTION}.
   */
  @Override
  public Set<String> validOptions() {
    // The explicit type witness is required: target typing does not flow through the call chain.
    return ImmutableSet.<String>builder()
        .addAll(super.validOptions())
        .add(MAX_OUTPUT_SIZE)
        .add(VAR_LENGTH_CONTRIBUTION)
        .build();
  }

  /**
   * Initializes the parent rewriter and then reads the Z-order specific options.
   *
   * @param options the rewrite options
   * @throws IllegalArgumentException if {@link #MAX_OUTPUT_SIZE} or {@link
   *     #VAR_LENGTH_CONTRIBUTION} is set to a value less than 1
   */
  @Override
  public void init(Map<String, String> options) {
    super.init(options);
    this.maxOutputSize = maxOutputSize(options);
    this.varLengthContribution = varLengthContribution(options);
  }

  @Override
  protected SortOrder sortOrder() {
    return Z_SORT_ORDER;
  }

  /**
   * Overrides the sortSchema method to include columns from Z_SCHEMA.
   *
   * <p>This method generates a new Schema object which consists of columns from the original table
   * schema and Z_SCHEMA.
   */
  @Override
  protected Schema sortSchema() {
    return new Schema(
        new ImmutableList.Builder<Types.NestedField>()
            .addAll(table().schema().columns())
            .addAll(Z_SCHEMA.columns())
            .build());
  }

  /**
   * Sorts the given data by its Z-order value.
   *
   * @param df the data to sort
   * @param group the file scan tasks being rewritten, passed through to {@code outputSortOrder}
   * @return the sorted data with the temporary Z-order column removed
   */
  @Override
  protected Dataset<Row> sortedDF(Dataset<Row> df, List<FileScanTask> group) {
    Dataset<Row> zValueDF = df.withColumn(Z_COLUMN, zValue(df));
    Dataset<Row> sortedDF = sort(zValueDF, outputSortOrder(group, Z_SORT_ORDER));
    // The Z-order column only exists to drive the sort and must not appear in the output.
    return sortedDF.drop(Z_COLUMN);
  }

  /**
   * Builds the column expression that produces the interleaved Z-order value for each row of the
   * given data.
   */
  private Column zValue(Dataset<Row> df) {
    SparkZOrderUDF zOrderUDF =
        new SparkZOrderUDF(zOrderColNames.size(), varLengthContribution, maxOutputSize);

    // Resolve each configured name against the DataFrame schema, then convert the column using
    // the UDF helper chosen by its Spark data type.
    Column[] zOrderCols =
        zOrderColNames.stream()
            .map(df.schema()::apply)
            .map(col -> zOrderUDF.sortedLexicographically(df.col(col.name()), col.dataType()))
            .toArray(Column[]::new);

    return zOrderUDF.interleaveBytes(array(zOrderCols));
  }

  /**
   * Reads {@link #VAR_LENGTH_CONTRIBUTION} from the options, falling back to {@link
   * #VAR_LENGTH_CONTRIBUTION_DEFAULT}, and rejects values less than 1.
   */
  private int varLengthContribution(Map<String, String> options) {
    int value =
        PropertyUtil.propertyAsInt(
            options, VAR_LENGTH_CONTRIBUTION, VAR_LENGTH_CONTRIBUTION_DEFAULT);
    Preconditions.checkArgument(
        value > 0,
        "Cannot use less than 1 byte for variable length types with ZOrder, '%s' was set to %s",
        VAR_LENGTH_CONTRIBUTION,
        value);
    return value;
  }

  /**
   * Reads {@link #MAX_OUTPUT_SIZE} from the options, falling back to {@link
   * #MAX_OUTPUT_SIZE_DEFAULT}, and rejects values less than 1.
   */
  private int maxOutputSize(Map<String, String> options) {
    int value = PropertyUtil.propertyAsInt(options, MAX_OUTPUT_SIZE, MAX_OUTPUT_SIZE_DEFAULT);
    Preconditions.checkArgument(
        value > 0,
        "Cannot have the interleaved ZOrder value use less than 1 byte, '%s' was set to %s",
        MAX_OUTPUT_SIZE,
        value);
    return value;
  }

  /**
   * Validates the requested Z-order columns against the table schema.
   *
   * <p>Every name must resolve to a field of the table schema, using the session's case
   * sensitivity. Columns whose field ID is among the partition spec's identity source IDs are
   * skipped with a warning, since such values are constant within a partition.
   *
   * @param spark the Spark session used to determine case sensitivity
   * @param table the table whose schema and partition spec are checked
   * @param inputZOrderColNames the requested column names
   * @return the names that remain after dropping identity partition columns, in input order
   * @throws IllegalArgumentException if the input is null or empty, a name cannot be resolved, or
   *     no columns remain
   */
  private List<String> validZOrderColNames(
      SparkSession spark, Table table, List<String> inputZOrderColNames) {

    Preconditions.checkArgument(
        inputZOrderColNames != null && !inputZOrderColNames.isEmpty(),
        "Cannot ZOrder when no columns are specified");

    Schema schema = table.schema();
    Set<Integer> identityPartitionFieldIds = table.spec().identitySourceIds();
    boolean caseSensitive = SparkUtil.caseSensitive(spark);

    List<String> validZOrderColNames = Lists.newArrayList();

    for (String colName : inputZOrderColNames) {
      Types.NestedField field =
          caseSensitive ? schema.findField(colName) : schema.caseInsensitiveFindField(colName);
      Preconditions.checkArgument(
          field != null,
          "Cannot find column '%s' in table schema (case sensitive = %s): %s",
          colName,
          caseSensitive,
          schema.asStruct());

      if (identityPartitionFieldIds.contains(field.fieldId())) {
        LOG.warn("Ignoring '{}' as such values are constant within a partition", colName);
      } else {
        validZOrderColNames.add(colName);
      }
    }

    Preconditions.checkArgument(
        !validZOrderColNames.isEmpty(),
        "Cannot ZOrder, all columns provided were identity partition columns and cannot be used");

    return validZOrderColNames;
  }
}