org.apache.iceberg.actions.Spark3MigrateAction Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of iceberg-spark3 Show documentation
A table format for huge analytic datasets
There is a newer version: 0.11.1.33
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.iceberg.actions;

import java.util.Map;
import org.apache.iceberg.Snapshot;
import org.apache.iceberg.SnapshotSummary;
import org.apache.iceberg.Table;
import org.apache.iceberg.exceptions.AlreadyExistsException;
import org.apache.iceberg.exceptions.NoSuchTableException;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.iceberg.spark.SparkSessionCatalog;
import org.apache.iceberg.spark.SparkTableUtil;
import org.apache.iceberg.spark.source.StagedSparkTable;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.catalyst.TableIdentifier;
import org.apache.spark.sql.connector.catalog.CatalogPlugin;
import org.apache.spark.sql.connector.catalog.Identifier;
import org.apache.spark.sql.connector.catalog.TableCatalog;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Some;
import scala.collection.JavaConverters;

/**
 * Takes a Spark table in the sourceCatalog and attempts to transform it into an Iceberg
 * Table in the same location with the same identifier. Once complete the identifier which
 * previously referred to a non-iceberg table will refer to the newly migrated iceberg
 * table.
 * <p>
 * The source table is first renamed to a backup name, an Iceberg table is staged under the
 * original identifier, {@link SparkTableUtil#importSparkTable} is invoked with the backup table
 * as its source, and finally the staged table is committed. If any step after the rename fails,
 * the backup is renamed back to the original identifier and the staged table is aborted.
 */
public class Spark3MigrateAction extends Spark3CreateAction {
  private static final Logger LOG = LoggerFactory.getLogger(Spark3MigrateAction.class);
  private static final String BACKUP_SUFFIX = "_BACKUP_";

  /**
   * Creates a migrate action. The same catalog and identifier are passed to the parent as both
   * the source and the destination.
   *
   * @param spark the active Spark session
   * @param sourceCatalog the catalog holding the table to migrate
   * @param sourceTableName the identifier of the table to migrate
   */
  public Spark3MigrateAction(SparkSession spark, CatalogPlugin sourceCatalog, Identifier sourceTableName) {
    super(spark, sourceCatalog, sourceTableName, sourceCatalog, sourceTableName);
  }

  /**
   * Runs the migration.
   *
   * @return the value of the {@link SnapshotSummary#TOTAL_DATA_FILES_PROP} entry in the summary of
   *         the migrated table's current snapshot
   * @throws NoSuchTableException if the source table cannot be found when renaming it to the backup name
   * @throws AlreadyExistsException if a table with the backup name already exists
   */
  @Override
  public Long execute() {
    // Move source table to a new name, halting all modifications and allowing us to stage
    // the creation of a new Iceberg table in its place
    String backupName = sourceTableIdent().name() + BACKUP_SUFFIX;
    Identifier backupIdentifier = Identifier.of(sourceTableIdent().namespace(), backupName);
    try {
      destCatalog().renameTable(sourceTableIdent(), backupIdentifier);
    } catch (org.apache.spark.sql.catalyst.analysis.NoSuchTableException e) {
      throw new NoSuchTableException("Cannot find table '%s' to migrate", sourceTableIdent());
    } catch (org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException e) {
      throw new AlreadyExistsException("Cannot rename migration source '%s' to backup name '%s'." +
          " Backup table already exists.", sourceTableIdent(), backupIdentifier);
    }

    StagedSparkTable stagedTable = null;
    Table icebergTable;
    boolean threw = true;
    try {
      stagedTable = stageDestTable();
      icebergTable = stagedTable.table();

      ensureNameMappingPresent(icebergTable);

      String stagingLocation = getMetadataLocation(icebergTable);

      LOG.info("Beginning migration of {} using metadata location {}", sourceTableIdent(), stagingLocation);
      // Only the first namespace element is used to build the v1 TableIdentifier.
      // NOTE(review): this assumes a non-empty, single-level namespace -- confirm against callers.
      Some<String> backupNamespace = Some.apply(backupIdentifier.namespace()[0]);
      TableIdentifier v1BackupIdentifier = new TableIdentifier(backupIdentifier.name(), backupNamespace);
      SparkTableUtil.importSparkTable(spark(), v1BackupIdentifier, icebergTable, stagingLocation);

      stagedTable.commitStagedChanges();
      threw = false;
    } finally {
      if (threw) {
        LOG.error("Error when attempting perform migration changes, aborting table creation and restoring backup.");

        // Both cleanup steps log instead of throwing so that the original failure is the one
        // that propagates to the caller and one failed cleanup step cannot skip the other.
        restoreSourceFromBackup(backupIdentifier);
        abortStagedTableQuietly(stagedTable);
      }
    }

    Snapshot snapshot = icebergTable.currentSnapshot();
    long numMigratedFiles = Long.parseLong(snapshot.summary().get(SnapshotSummary.TOTAL_DATA_FILES_PROP));
    LOG.info("Successfully loaded Iceberg metadata for {} files", numMigratedFiles);
    return numMigratedFiles;
  }

  /**
   * Builds the properties for the migrated table: the source table's properties minus
   * {@code EXCLUDED_PROPERTIES}, then the provider and "migrated" markers, then any additional
   * properties, and finally the source location unless a location was already set.
   *
   * @return a new mutable map of table properties
   */
  @Override
  protected Map<String, String> targetTableProps() {
    Map<String, String> properties = Maps.newHashMap();

    properties.putAll(JavaConverters.mapAsJavaMapConverter(v1SourceTable().properties()).asJava());
    EXCLUDED_PROPERTIES.forEach(properties::remove);

    properties.put(TableCatalog.PROP_PROVIDER, "iceberg");
    properties.put("migrated", "true");
    // Additional properties are applied after the markers, so they can override them.
    properties.putAll(additionalProperties());
    properties.putIfAbsent(LOCATION, sourceTableLocation());

    return properties;
  }

  /**
   * Validates that the source catalog is a {@link SparkSessionCatalog}.
   *
   * @param catalog the catalog to validate
   * @return the same catalog, cast to {@link TableCatalog}
   * @throws IllegalArgumentException if the catalog is not a {@link SparkSessionCatalog}
   */
  @Override
  protected TableCatalog checkSourceCatalog(CatalogPlugin catalog) {
    // Currently the Import code relies on being able to look up the table in the session catalog
    Preconditions.checkArgument(catalog instanceof SparkSessionCatalog,
        "Cannot migrate a table from a non-Iceberg Spark Session Catalog. Found %s of class %s as the source catalog.",
        catalog.name(), catalog.getClass().getName());

    return (TableCatalog) catalog;
  }

  /** Renames the backup table back to the source identifier, logging rather than throwing on failure. */
  private void restoreSourceFromBackup(Identifier backupIdentifier) {
    try {
      destCatalog().renameTable(backupIdentifier, sourceTableIdent());
    } catch (org.apache.spark.sql.catalyst.analysis.NoSuchTableException nstException) {
      LOG.error("Cannot restore backup '{}', the backup cannot be found", backupIdentifier, nstException);
    } catch (org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException taeException) {
      LOG.error("Cannot restore backup, a table with the original name " +
          "exists. The backup can be found with the name '{}'", backupIdentifier, taeException);
    } catch (RuntimeException unexpected) {
      // An exception escaping the caller's finally block would replace the original migration failure.
      LOG.error("Cannot restore backup '{}', renaming it back to the original name failed",
          backupIdentifier, unexpected);
    }
  }

  /** Aborts the staged table if one was created, logging rather than throwing on failure. */
  private static void abortStagedTableQuietly(StagedSparkTable stagedTable) {
    if (stagedTable == null) {
      // Staging itself failed, so there is nothing to abort.
      return;
    }

    try {
      stagedTable.abortStagedChanges();
    } catch (Exception abortException) {
      LOG.error("Cannot abort staged changes", abortException);
    }
  }
}