/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.iceberg.spark.procedures;

import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.hadoop.fs.Path;
import org.apache.iceberg.PartitionField;
import org.apache.iceberg.Snapshot;
import org.apache.iceberg.SnapshotSummary;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableProperties;
import org.apache.iceberg.mapping.MappingUtil;
import org.apache.iceberg.mapping.NameMapping;
import org.apache.iceberg.mapping.NameMappingParser;
import org.apache.iceberg.relocated.com.google.common.base.Joiner;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.iceberg.spark.Spark3Util;
import org.apache.iceberg.spark.SparkTableUtil;
import org.apache.iceberg.spark.SparkTableUtil.SparkPartition;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.TableIdentifier;
import org.apache.spark.sql.connector.catalog.CatalogPlugin;
import org.apache.spark.sql.connector.catalog.Identifier;
import org.apache.spark.sql.connector.catalog.TableCatalog;
import org.apache.spark.sql.connector.iceberg.catalog.ProcedureParameter;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import scala.runtime.BoxedUnit;

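/**
 * A procedure that adds existing data files to an Iceberg table without rewriting them.
 *
 * <p>The source may be a table tracked by the session catalog or a path-based table whose
 * identifier names the file format, such as {@code `parquet`.`path`}, {@code `orc`.`path`},
 * or {@code `avro`.`path`}.
 *
 * <p>Illustrative invocation (catalog, table, and path names below are placeholders):
 * <pre>
 *   CALL spark_catalog.system.add_files(
 *     table => 'db.target',
 *     source_table => '`parquet`.`path/to/data`',
 *     partition_filter => map('part_col', 'value'))
 * </pre>
 */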
class AddFilesProcedure extends BaseProcedure {

  private static final Joiner.MapJoiner MAP_JOINER = Joiner.on(",").withKeyValueSeparator("=");

  private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[]{
      ProcedureParameter.required("table", DataTypes.StringType),
      ProcedureParameter.required("source_table", DataTypes.StringType),
      ProcedureParameter.optional("partition_filter", STRING_MAP),
      ProcedureParameter.optional("check_duplicate_files", DataTypes.BooleanType)
  };

  private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{
      new StructField("added_files_count", DataTypes.LongType, false, Metadata.empty())
  });

  private AddFilesProcedure(TableCatalog tableCatalog) {
    super(tableCatalog);
  }

  public static SparkProcedures.ProcedureBuilder builder() {
    return new BaseProcedure.Builder() {
      @Override
      protected AddFilesProcedure doBuild() {
        return new AddFilesProcedure(tableCatalog());
      }
    };
  }

  @Override
  public ProcedureParameter[] parameters() {
    return PARAMETERS;
  }

  @Override
  public StructType outputType() {
    return OUTPUT_TYPE;
  }

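  // Resolves the target Iceberg table and the source table identifier (the latter against the
  // session catalog), collects the optional partition filter into a map, defaults
  // check_duplicate_files to true when not supplied, and returns the number of files added.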
  @Override
  public InternalRow[] call(InternalRow args) {
    Identifier tableIdent = toIdentifier(args.getString(0), PARAMETERS[0].name());

    CatalogPlugin sessionCat = spark().sessionState().catalogManager().v2SessionCatalog();
    Identifier sourceIdent = toCatalogAndIdentifier(args.getString(1), PARAMETERS[1].name(), sessionCat).identifier();

    Map<String, String> partitionFilter = Maps.newHashMap();
    if (!args.isNullAt(2)) {
      args.getMap(2).foreach(DataTypes.StringType, DataTypes.StringType,
          (k, v) -> {
            partitionFilter.put(k.toString(), v.toString());
            return BoxedUnit.UNIT;
          });
    }

    boolean checkDuplicateFiles;
    if (args.isNullAt(3)) {
      checkDuplicateFiles = true;
    } else {
      checkDuplicateFiles = args.getBoolean(3);
    }

    long addedFilesCount = importToIceberg(tableIdent, sourceIdent, partitionFilter, checkDuplicateFiles);
    return new InternalRow[]{newInternalRow(addedFilesCount)};
  }

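  // An identifier whose single namespace element is a file format (orc, parquet, avro) refers
  // to a path-based source such as `parquet`.`/path/to/data` rather than a catalog table.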
  private boolean isFileIdentifier(Identifier ident) {
    String[] namespace = ident.namespace();
    return namespace.length == 1 &&
        (namespace[0].equalsIgnoreCase("orc") ||
            namespace[0].equalsIgnoreCase("parquet") ||
            namespace[0].equalsIgnoreCase("avro"));
  }

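  // Validates the partition filter against the target spec, ensures a name mapping exists for
  // name-based column resolution, imports from either a path-based or a catalog source, and
  // reads the added-file count from the summary of the resulting snapshot.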
  private long importToIceberg(Identifier destIdent, Identifier sourceIdent, Map<String, String> partitionFilter,
                               boolean checkDuplicateFiles) {
    return modifyIcebergTable(destIdent, table -> {

      validatePartitionSpec(table, partitionFilter);
      ensureNameMappingPresent(table);

      if (isFileIdentifier(sourceIdent)) {
        Path sourcePath = new Path(sourceIdent.name());
        String format = sourceIdent.namespace()[0];
        importFileTable(table, sourcePath, format, partitionFilter, checkDuplicateFiles);
      } else {
        importCatalogTable(table, sourceIdent, partitionFilter, checkDuplicateFiles);
      }

      Snapshot snapshot = table.currentSnapshot();
      return Long.parseLong(snapshot.summary().getOrDefault(SnapshotSummary.ADDED_FILES_PROP, "0"));
    });
  }

  private static void ensureNameMappingPresent(Table table) {
    if (table.properties().get(TableProperties.DEFAULT_NAME_MAPPING) == null) {
      // Forces Name based resolution instead of position based resolution
      NameMapping mapping = MappingUtil.create(table.schema());
      String mappingJson = NameMappingParser.toJson(mapping);
      table.updateProperties()
          .set(TableProperties.DEFAULT_NAME_MAPPING, mappingJson)
          .commit();
    }
  }

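  // Imports a path-based source: lists its partitions through Spark, then imports either the
  // whole location (for an unpartitioned target) or the partitions matching the filter.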
  private void importFileTable(Table table, Path tableLocation, String format, Map<String, String> partitionFilter,
                               boolean checkDuplicateFiles) {
    // List Partitions via Spark InMemory file search interface
    List<SparkPartition> partitions = Spark3Util.getPartitions(spark(), tableLocation, format);

    if (table.spec().isUnpartitioned()) {
      Preconditions.checkArgument(partitions.isEmpty(), "Cannot add partitioned files to an unpartitioned table");
      Preconditions.checkArgument(partitionFilter.isEmpty(), "Cannot use a partition filter when importing " +
          "to an unpartitioned table");

      // Build a Global Partition for the source
      SparkPartition partition = new SparkPartition(Collections.emptyMap(), tableLocation.toString(), format);
      importPartitions(table, ImmutableList.of(partition), checkDuplicateFiles);
    } else {
      Preconditions.checkArgument(!partitions.isEmpty(),
          "Cannot find any partitions in table %s", table.name());
      List<SparkPartition> filteredPartitions = SparkTableUtil.filterPartitions(partitions, partitionFilter);
      Preconditions.checkArgument(!filteredPartitions.isEmpty(),
          "Cannot find any partitions which match the given filter. Partition filter is %s",
          MAP_JOINER.join(partitionFilter));
      importPartitions(table, filteredPartitions, checkDuplicateFiles);
    }
  }

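  // Imports a catalog-tracked source table by delegating to SparkTableUtil.importSparkTable.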
  private void importCatalogTable(Table table, Identifier sourceIdent, Map<String, String> partitionFilter,
                                  boolean checkDuplicateFiles) {
    String stagingLocation = getMetadataLocation(table);
    TableIdentifier sourceTableIdentifier = Spark3Util.toV1TableIdentifier(sourceIdent);
    SparkTableUtil.importSparkTable(spark(), sourceTableIdentifier, table, stagingLocation, partitionFilter,
        checkDuplicateFiles);
  }

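  // Imports the given source partitions into the target table using its current partition spec.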
  private void importPartitions(Table table, List<SparkPartition> partitions,
                                boolean checkDuplicateFiles) {
    String stagingLocation = getMetadataLocation(table);
    SparkTableUtil.importSparkPartitions(spark(), partitions, table, table.spec(), stagingLocation,
        checkDuplicateFiles);
  }

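  // Metadata written during import is staged under the table's configured write metadata
  // location, falling back to <table location>/metadata.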
  private String getMetadataLocation(Table table) {
    String defaultValue = table.location() + "/metadata";
    return table.properties().getOrDefault(TableProperties.WRITE_METADATA_LOCATION, defaultValue);
  }

  @Override
  public String description() {
    return "AddFiles";
  }

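  // Rejects targets with non-identity partition transforms and verifies that a provided
  // partition filter references only existing partition columns and no more columns than the
  // table's partition spec defines.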
  private void validatePartitionSpec(Table table, Map<String, String> partitionFilter) {
    List<PartitionField> partitionFields = table.spec().fields();
    Set<String> partitionNames = table.spec().fields().stream().map(PartitionField::name).collect(Collectors.toSet());

    boolean tablePartitioned = !partitionFields.isEmpty();
    boolean partitionSpecPassed = !partitionFilter.isEmpty();

    // Check for any non-identity partition columns
    List<PartitionField> nonIdentityFields = partitionFields.stream()
        .filter(x -> !x.transform().isIdentity())
        .collect(Collectors.toList());
    Preconditions.checkArgument(nonIdentityFields.isEmpty(),
        "Cannot add data files to target table %s because that table is partitioned and contains non-identity " +
            "partition transforms which will not be compatible. Found non-identity fields %s",
        table.name(), nonIdentityFields);

    if (tablePartitioned && partitionSpecPassed) {
      // Check that there are sufficient partition columns to satisfy the filter
      Preconditions.checkArgument(partitionFields.size() >= partitionFilter.size(),
          "Cannot add data files to target table %s because that table is partitioned, " +
              "but the number of columns in the provided partition filter (%s) " +
              "is greater than the number of partition columns in the table (%s)",
          table.name(), partitionFilter.size(), partitionFields.size());

      // Check for any filters on non-existent columns
      List<String> unMatchedFilters = partitionFilter.keySet().stream()
          .filter(filterName -> !partitionNames.contains(filterName))
          .collect(Collectors.toList());
      Preconditions.checkArgument(unMatchedFilters.isEmpty(),
          "Cannot add files to target table %s. %s is partitioned but the specified partition filter " +
              "refers to columns that are not partitioned: '%s'. Valid partition columns are %s",
          table.name(), table.name(), unMatchedFilters, String.join(",", partitionNames));
    } else {
      Preconditions.checkArgument(!partitionSpecPassed,
          "Cannot use partition filter with an unpartitioned table %s",
          table.name());
    }
  }
}



