
org.apache.iceberg.spark.SparkSchemaUtil Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.iceberg.spark;

import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.function.Function;
import java.util.stream.Collectors;
import org.apache.iceberg.MetadataColumns;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.exceptions.ValidationException;
import org.apache.iceberg.expressions.Binder;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.relocated.com.google.common.base.Splitter;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.relocated.com.google.common.math.LongMath;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.TypeUtil;
import org.apache.iceberg.types.Types;
import org.apache.spark.sql.AnalysisException;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.catalog.Column;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.StructType;

/** Helper methods for working with Spark/Hive metadata. */
public class SparkSchemaUtil {
  private SparkSchemaUtil() {}

  /**
   * Returns a {@link Schema} for the given table with fresh field ids.
   *
   * <p>This creates a Schema for an existing table by looking up the table's schema with Spark and
   * converting that schema. Spark/Hive partition columns are included in the schema.
   *
   * @param spark a Spark session
   * @param name a table name and (optional) database
   * @return a Schema for the table, if found
   */
  public static Schema schemaForTable(SparkSession spark, String name) {
    return convert(spark.table(name).schema());
  }
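  // Illustrative usage, not part of the original source: assumes a live
  // SparkSession named "spark" and an existing table "db.events".
  //
  //   Schema schema = SparkSchemaUtil.schemaForTable(spark, "db.events");
  //   System.out.println(schema.asStruct());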
  /**
   * Returns a {@link PartitionSpec} for the given table.
   *
   * <p>This creates a partition spec for an existing table by looking up the table's schema and
   * creating a spec with identity partitions for each partition column.
   *
   * @param spark a Spark session
   * @param name a table name and (optional) database
   * @return a PartitionSpec for the table
   * @throws AnalysisException if thrown by the Spark catalog
   */
  public static PartitionSpec specForTable(SparkSession spark, String name)
      throws AnalysisException {
    List<String> parts = Lists.newArrayList(Splitter.on('.').limit(2).split(name));
    String db = parts.size() == 1 ? "default" : parts.get(0);
    String table = parts.get(parts.size() == 1 ? 0 : 1);

    PartitionSpec spec =
        identitySpec(
            schemaForTable(spark, name), spark.catalog().listColumns(db, table).collectAsList());
    return spec == null ? PartitionSpec.unpartitioned() : spec;
  }

  /**
   * Convert a {@link Schema} to a {@link DataType Spark type}.
   *
   * @param schema a Schema
   * @return the equivalent Spark type
   * @throws IllegalArgumentException if the type cannot be converted to Spark
   */
  public static StructType convert(Schema schema) {
    return (StructType) TypeUtil.visit(schema, new TypeToSparkType());
  }

  /**
   * Convert a {@link Type} to a {@link DataType Spark type}.
   *
   * @param type a Type
   * @return the equivalent Spark type
   * @throws IllegalArgumentException if the type cannot be converted to Spark
   */
  public static DataType convert(Type type) {
    return TypeUtil.visit(type, new TypeToSparkType());
  }
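  // Illustrative usage, not part of the original source: for a table
  // partitioned by a hypothetical "dt" column, this yields an identity spec on
  // "dt"; an unpartitioned table yields PartitionSpec.unpartitioned().
  //
  //   PartitionSpec spec = SparkSchemaUtil.specForTable(spark, "db.events");
  //   StructType sparkType = SparkSchemaUtil.convert(SparkSchemaUtil.schemaForTable(spark, "db.events"));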
  /**
   * Convert a Spark {@link StructType struct} to a {@link Schema} with new field ids.
   *
   * <p>This conversion assigns fresh ids.
   *
   * <p>Some data types are represented as the same Spark type. These are converted to a default
   * type.
   *
   * <p>To convert using a reference schema for field ids and ambiguous types, use {@link
   * #convert(Schema, StructType)}.
   *
   * @param sparkType a Spark StructType
   * @return the equivalent Schema
   * @throws IllegalArgumentException if the type cannot be converted
   */
  public static Schema convert(StructType sparkType) {
    return convert(sparkType, false);
  }
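  // Illustrative usage, not part of the original source: convert a hand-built
  // Spark struct to an Iceberg schema with fresh ids (DataTypes is
  // org.apache.spark.sql.types.DataTypes).
  //
  //   StructType sparkType =
  //       new StructType()
  //           .add("id", DataTypes.LongType, false)
  //           .add("data", DataTypes.StringType, true);
  //   Schema schema = SparkSchemaUtil.convert(sparkType);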
  /**
   * Convert a Spark {@link StructType struct} to a {@link Schema} with new field ids.
   *
   * <p>This conversion assigns fresh ids.
   *
   * <p>Some data types are represented as the same Spark type. These are converted to a default
   * type.
   *
   * <p>To convert using a reference schema for field ids and ambiguous types, use {@link
   * #convert(Schema, StructType)}.
   *
   * @param sparkType a Spark StructType
   * @param useTimestampWithoutZone when true, timestamps are stored without time zone
   * @return the equivalent Schema
   * @throws IllegalArgumentException if the type cannot be converted
   */
  public static Schema convert(StructType sparkType, boolean useTimestampWithoutZone) {
    Type converted = SparkTypeVisitor.visit(sparkType, new SparkTypeToType(sparkType));
    Schema schema = new Schema(converted.asNestedType().asStructType().fields());
    if (useTimestampWithoutZone) {
      schema = SparkFixupTimestampType.fixup(schema);
    }
    return schema;
  }
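  // Illustrative usage, not part of the original source: Spark's TimestampType
  // normally converts to Iceberg's timestamp with zone; setting the flag
  // rewrites those fields to timestamp without zone.
  //
  //   Schema tsSchema = SparkSchemaUtil.convert(sparkType, true /* useTimestampWithoutZone */);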
  /**
   * Convert a Spark {@link DataType} to a {@link Type} with new field ids.
   *
   * <p>This conversion assigns fresh ids.
   *
   * <p>Some data types are represented as the same Spark type. These are converted to a default
   * type.
   *
   * <p>To convert using a reference schema for field ids and ambiguous types, use {@link
   * #convert(Schema, StructType)}.
   *
   * @param sparkType a Spark DataType
   * @return the equivalent Type
   * @throws IllegalArgumentException if the type cannot be converted
   */
  public static Type convert(DataType sparkType) {
    return SparkTypeVisitor.visit(sparkType, new SparkTypeToType());
  }
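  // Illustrative usage, not part of the original source: single-type
  // conversions (DataTypes is org.apache.spark.sql.types.DataTypes).
  //
  //   Type longType = SparkSchemaUtil.convert(DataTypes.LongType);    // Types.LongType
  //   Type strType = SparkSchemaUtil.convert(DataTypes.StringType);   // Types.StringType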
  /**
   * Convert a Spark {@link StructType struct} to a {@link Schema} based on the given schema.
   *
   * <p>This conversion does not assign new ids; it uses ids from the base schema.
   *
   * <p>Data types, field order, and nullability will match the Spark type. This conversion may
   * return a schema that is not compatible with the base schema.
   *
   * @param baseSchema a Schema on which conversion is based
   * @param sparkType a Spark StructType
   * @return the equivalent Schema
   * @throws IllegalArgumentException if the type cannot be converted or there are missing ids
   */
  public static Schema convert(Schema baseSchema, StructType sparkType) {
    return convert(baseSchema, sparkType, true);
  }
  /**
   * Convert a Spark {@link StructType struct} to a {@link Schema} based on the given schema.
   *
   * <p>This conversion does not assign new ids; it uses ids from the base schema.
   *
   * <p>Data types, field order, and nullability will match the Spark type. This conversion may
   * return a schema that is not compatible with the base schema.
   *
   * @param baseSchema a Schema on which conversion is based
   * @param sparkType a Spark StructType
   * @param caseSensitive when false, the case of schema fields is ignored
   * @return the equivalent Schema
   * @throws IllegalArgumentException if the type cannot be converted or there are missing ids
   */
  public static Schema convert(Schema baseSchema, StructType sparkType, boolean caseSensitive) {
    // convert to a type with fresh ids
    Types.StructType struct =
        SparkTypeVisitor.visit(sparkType, new SparkTypeToType(sparkType)).asStructType();
    // reassign ids to match the base schema
    Schema schema = TypeUtil.reassignIds(new Schema(struct.fields()), baseSchema, caseSensitive);
    // fix types that can't be represented in Spark (UUID and Fixed)
    return SparkFixupTypes.fixup(schema, baseSchema);
  }
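  // Illustrative usage, not part of the original source: keep the field ids of
  // an existing Iceberg table schema while adopting a Spark DataFrame's shape
  // (the "table" and "df" variables are hypothetical).
  //
  //   Schema withTableIds = SparkSchemaUtil.convert(table.schema(), df.schema(), false);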
  /**
   * Convert a Spark {@link StructType struct} to a {@link Schema} based on the given schema.
   *
   * <p>This conversion will assign new ids for fields that are not found in the base schema.
   *
   * <p>Data types, field order, and nullability will match the Spark type. This conversion may
   * return a schema that is not compatible with the base schema.
   *
   * @param baseSchema a Schema on which conversion is based
   * @param sparkType a Spark StructType
   * @return the equivalent Schema
   * @throws IllegalArgumentException if the type cannot be converted or there are missing ids
   */
  public static Schema convertWithFreshIds(Schema baseSchema, StructType sparkType) {
    return convertWithFreshIds(baseSchema, sparkType, true);
  }
  /**
   * Convert a Spark {@link StructType struct} to a {@link Schema} based on the given schema.
   *
   * <p>This conversion will assign new ids for fields that are not found in the base schema.
   *
   * <p>Data types, field order, and nullability will match the Spark type. This conversion may
   * return a schema that is not compatible with the base schema.
   *
   * @param baseSchema a Schema on which conversion is based
   * @param sparkType a Spark StructType
   * @param caseSensitive when false, case of field names in schema is ignored
   * @return the equivalent Schema
   * @throws IllegalArgumentException if the type cannot be converted or there are missing ids
   */
  public static Schema convertWithFreshIds(
      Schema baseSchema, StructType sparkType, boolean caseSensitive) {
    // convert to a type with fresh ids
    Types.StructType struct =
        SparkTypeVisitor.visit(sparkType, new SparkTypeToType(sparkType)).asStructType();
    // reassign ids to match the base schema
    Schema schema =
        TypeUtil.reassignOrRefreshIds(new Schema(struct.fields()), baseSchema, caseSensitive);
    // fix types that can't be represented in Spark (UUID and Fixed)
    return SparkFixupTypes.fixup(schema, baseSchema);
  }
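  // Illustrative usage, not part of the original source: like
  // convert(Schema, StructType), but a column present only in the Spark struct
  // receives a fresh id instead of causing a missing-id failure.
  //
  //   Schema evolved = SparkSchemaUtil.convertWithFreshIds(table.schema(), df.schema());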
  /**
   * Prune columns from a {@link Schema} using a {@link StructType Spark type} projection.
   *
   * <p>This requires that the Spark type is a projection of the Schema. Nullability and types must
   * match.
   *
   * @param schema a Schema
   * @param requestedType a projection of the Spark representation of the Schema
   * @return a Schema corresponding to the Spark projection
   * @throws IllegalArgumentException if the Spark type does not match the Schema
   */
  public static Schema prune(Schema schema, StructType requestedType) {
    return new Schema(
        TypeUtil.visit(schema, new PruneColumnsWithoutReordering(requestedType, ImmutableSet.of()))
            .asNestedType()
            .asStructType()
            .fields());
  }
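  // Illustrative usage, not part of the original source: project only "id" out
  // of the two-column example schema above (nullability must match).
  //
  //   StructType projection = new StructType().add("id", DataTypes.LongType, false);
  //   Schema pruned = SparkSchemaUtil.prune(schema, projection); // contains only "id"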
  /**
   * Prune columns from a {@link Schema} using a {@link StructType Spark type} projection.
   *
   * <p>This requires that the Spark type is a projection of the Schema. Nullability and types must
   * match.
   *
   * <p>The filters list of {@link Expression} is used to ensure that columns referenced by filters
   * are projected.
   *
   * @param schema a Schema
   * @param requestedType a projection of the Spark representation of the Schema
   * @param filters a list of filters
   * @return a Schema corresponding to the Spark projection
   * @throws IllegalArgumentException if the Spark type does not match the Schema
   */
  public static Schema prune(Schema schema, StructType requestedType, List<Expression> filters) {
    Set<Integer> filterRefs = Binder.boundReferences(schema.asStruct(), filters, true);
    return new Schema(
        TypeUtil.visit(schema, new PruneColumnsWithoutReordering(requestedType, filterRefs))
            .asNestedType()
            .asStructType()
            .fields());
  }
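  // Illustrative usage, not part of the original source: a filter on "data"
  // keeps that column in the pruned schema even when the projection omits it
  // (Expressions is org.apache.iceberg.expressions.Expressions).
  //
  //   Schema pruned =
  //       SparkSchemaUtil.prune(
  //           schema, projection, Collections.singletonList(Expressions.equal("data", "a")));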
  /**
   * Prune columns from a {@link Schema} using a {@link StructType Spark type} projection.
   *
   * <p>This requires that the Spark type is a projection of the Schema. Nullability and types must
   * match.
   *
   * <p>The filter {@link Expression} is used to ensure that columns referenced by the filter are
   * projected.
   *
   * @param schema a Schema
   * @param requestedType a projection of the Spark representation of the Schema
   * @param filter a filter
   * @param caseSensitive when false, the case of schema fields is ignored
   * @return a Schema corresponding to the Spark projection
   * @throws IllegalArgumentException if the Spark type does not match the Schema
   */
  public static Schema prune(
      Schema schema, StructType requestedType, Expression filter, boolean caseSensitive) {
    Set<Integer> filterRefs =
        Binder.boundReferences(schema.asStruct(), Collections.singletonList(filter), caseSensitive);

    return new Schema(
        TypeUtil.visit(schema, new PruneColumnsWithoutReordering(requestedType, filterRefs))
            .asNestedType()
            .asStructType()
            .fields());
  }

  private static PartitionSpec identitySpec(Schema schema, Collection<Column> columns) {
    List<String> names = Lists.newArrayList();
    for (Column column : columns) {
      if (column.isPartition()) {
        names.add(column.name());
      }
    }

    return identitySpec(schema, names);
  }

  private static PartitionSpec identitySpec(Schema schema, List<String> partitionNames) {
    if (partitionNames == null || partitionNames.isEmpty()) {
      return null;
    }

    PartitionSpec.Builder builder = PartitionSpec.builderFor(schema);
    for (String partitionName : partitionNames) {
      builder.identity(partitionName);
    }

    return builder.build();
  }

  /**
   * Estimate approximate table size based on Spark schema and total records.
   *
   * @param tableSchema Spark schema
   * @param totalRecords total records in the table
   * @return approximate size based on table schema
   */
  public static long estimateSize(StructType tableSchema, long totalRecords) {
    if (totalRecords == Long.MAX_VALUE) {
      return totalRecords;
    }

    long result;
    try {
      result = LongMath.checkedMultiply(tableSchema.defaultSize(), totalRecords);
    } catch (ArithmeticException e) {
      result = Long.MAX_VALUE;
    }
    return result;
  }

  public static void validateMetadataColumnReferences(Schema tableSchema, Schema readSchema) {
    List<String> conflictingColumnNames =
        readSchema.columns().stream()
            .map(Types.NestedField::name)
            .filter(
                name ->
                    MetadataColumns.isMetadataColumn(name) && tableSchema.findField(name) != null)
            .collect(Collectors.toList());

    ValidationException.check(
        conflictingColumnNames.isEmpty(),
        "Table column names conflict with names reserved for Iceberg metadata columns: %s.\n"
            + "Please, use ALTER TABLE statements to rename the conflicting table columns.",
        conflictingColumnNames);
  }

  public static Map<Integer, String> indexQuotedNameById(Schema schema) {
    Function<String, String> quotingFunc = name -> String.format("`%s`", name.replace("`", "``"));
    return TypeUtil.indexQuotedNameById(schema.asStruct(), quotingFunc);
  }
}
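// Illustrative usage of the size and name helpers, not part of the original
// source ("schema" is the two-column example schema from above):
//
//   // the estimate is tableSchema.defaultSize() * totalRecords, where
//   // defaultSize() sums Spark's per-type default sizes; overflow caps the
//   // result at Long.MAX_VALUE
//   long approxBytes = SparkSchemaUtil.estimateSize(SparkSchemaUtil.convert(schema), 1000L);
//
//   // maps each field id to its backtick-quoted name, e.g. 1 -> `id`, 2 -> `data`
//   Map<Integer, String> names = SparkSchemaUtil.indexQuotedNameById(schema);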