
org.projectnessie.versioned.gc.actions.GcActions Maven / Gradle / Ivy

/*
 * Copyright (C) 2020 Dremio
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.projectnessie.versioned.gc.actions;

import static org.apache.iceberg.types.Types.NestedField.required;

import com.google.common.collect.ImmutableMap;
import com.google.protobuf.ByteString;
import java.net.URI;
import java.net.URISyntaxException;
import java.sql.Timestamp;
import java.time.Clock;
import java.time.Instant;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.TimeUnit;
import java.util.function.Supplier;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.catalog.TableIdentifier;
import org.apache.iceberg.spark.Spark3Util;
import org.apache.iceberg.spark.SparkSchemaUtil;
import org.apache.iceberg.types.Types;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.AnalysisException;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.catalyst.analysis.NoSuchNamespaceException;
import org.apache.spark.sql.catalyst.analysis.NoSuchTableException;
import org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException;
import org.apache.spark.sql.catalyst.encoders.RowEncoder;
import org.apache.spark.sql.connector.catalog.CatalogPlugin;
import org.apache.spark.sql.connector.catalog.Identifier;
import org.apache.spark.sql.connector.catalog.TableCatalog;
import org.apache.spark.sql.connector.expressions.Transform;
import org.apache.spark.sql.functions;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.util.SerializableConfiguration;
import org.projectnessie.model.Contents;
import org.projectnessie.server.store.TableCommitMetaStoreWorker;
import org.projectnessie.versioned.dynamodb.DynamoStore;
import org.projectnessie.versioned.dynamodb.DynamoStoreConfig;
import org.projectnessie.versioned.gc.AssetKeySerializer;
import org.projectnessie.versioned.gc.CategorizedValue;
import org.projectnessie.versioned.gc.IcebergAssetKey;
import org.projectnessie.versioned.gc.IcebergAssetKeyConverter;
import org.projectnessie.versioned.gc.IdentifyUnreferencedAssets;
import org.projectnessie.versioned.gc.ValueTypeFilter;
import org.projectnessie.versioned.store.Store;
import org.projectnessie.versioned.tiered.gc.GcOptions;
import org.projectnessie.versioned.tiered.gc.IdentifyUnreferencedValues;
import software.amazon.awssdk.regions.Region;

/** Identify unreferenced assets and update the gc table. */
public class GcActions {
  public static final TableIdentifier DEFAULT_TABLE_IDENTIFIER =
      TableIdentifier.parse("gc.identified_tables");
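  // Schema of the gc table: one row per identified unreferenced asset, tagged with the run id
  // of the GC invocation that produced it and (see createTable) partitioned by tableName.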
  private static final StructType SCHEMA =
      SparkSchemaUtil.convert(
          new Schema(
              Types.StructType.of(
                      required(1, "tableName", Types.StringType.get()),
                      required(2, "timestamp", Types.TimestampType.withZone()),
                      required(3, "asset", Types.BinaryType.get()),
                      required(4, "snapshotId", Types.StringType.get()),
                      required(5, "assetType", Types.StringType.get()),
                      required(6, "path", Types.StringType.get()),
                      required(7, "name", Types.StringType.get()),
                      required(8, "runid", Types.LongType.get()))
                  .fields()));

  private final TableCommitMetaStoreWorker worker = new TableCommitMetaStoreWorker();
  private final Clock clock = Clock.systemUTC();
  private final SparkSession spark;
  private final AssetKeySerializer assetKeySerializer;
  private final IcebergAssetKeyConverter assetKeyConverter;
  private final GcActionsConfig actionsConfig;
  private final GcOptions gcConfig;
  private final TableIdentifier table;

  private GcActions(
      SparkSession spark,
      GcActionsConfig actionsConfig,
      GcOptions gcConfig,
      TableIdentifier table) {
    this.spark = spark;
    this.actionsConfig = actionsConfig;
    this.gcConfig = gcConfig;
    this.table = table;
    SparkConf conf = new SparkConf();
    conf.setAll(spark.sessionState().conf().getAllConfs());
    Configuration hadoopConfig = spark.sessionState().newHadoopConf();
    SerializableConfiguration configuration = new SerializableConfiguration(hadoopConfig);
    this.assetKeySerializer = new AssetKeySerializer(configuration);
    this.assetKeyConverter = new IcebergAssetKeyConverter(configuration);
    createTable(table);
  }

  private SparkSession spark() {
    return spark;
  }

  /** Build Spark dataset of unreferenced assets. This is in the schema of the gc table. */
  public Dataset<Row> identifyUnreferencedAssets() throws AnalysisException {
    IdentifyUnreferencedValues<Contents> values =
        new IdentifyUnreferencedValues<>(worker, store(actionsConfig), spark(), gcConfig, clock);
    Dataset<CategorizedValue> unreferencedValues = values.identify();
    IdentifyUnreferencedAssets<Contents, IcebergAssetKey> assets =
        new IdentifyUnreferencedAssets<>(
            worker.getValueSerializer(),
            assetKeySerializer,
            assetKeyConverter,
            new ValueTypeFilter(worker.getValueSerializer()),
            spark());
    Dataset<IdentifyUnreferencedAssets.UnreferencedItem> unreferencedAssets =
        assets.identify(unreferencedValues);
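    // Tag every row produced by this run with a run id one higher than the current maximum in
    // the gc table, so assets identified by different GC runs can be distinguished.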
    long currentRunId = GcActionUtils.getMaxRunId(spark, table.toString()) + 1;
    return unreferencedAssets
        .map(new ConvertToTableFunction(assetKeySerializer), RowEncoder.apply(SCHEMA))
        .withColumn("runid", functions.lit(currentRunId));
  }

  /** Append dataset to the gc table. Dataset is already in the correct schema. */
  public void updateUnreferencedAssetTable(Dataset<Row> unreferencedAssets) {
    // sort because of https://issues.apache.org/jira/browse/SPARK-23889
    unreferencedAssets
        .repartition(unreferencedAssets.col("tableName"))
        .sortWithinPartitions()
        .write()
        .format("iceberg")
        .mode("append")
        .save(table.toString());
  }

  private static class ConvertToTableFunction
      implements MapFunction<IdentifyUnreferencedAssets.UnreferencedItem, Row> {
    private final AssetKeySerializer assetKeySerializer;

    private ConvertToTableFunction(AssetKeySerializer assetKeySerializer) {
      this.assetKeySerializer = assetKeySerializer;
    }

    @Override
    public Row call(IdentifyUnreferencedAssets.UnreferencedItem value) throws Exception {
      IcebergAssetKey assetKey =
          (IcebergAssetKey)
              assetKeySerializer.fromBytes(ByteString.copyFrom((byte[]) value.getAsset()));
      List<String> key = value.getKey();
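      // getTimestamp() is in microseconds; split it into whole seconds plus the leftover
      // nanoseconds so it can be represented as a java.sql.Timestamp.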
      long microTimestamp = value.getTimestamp();
      long secondTimestamp = TimeUnit.MICROSECONDS.toSeconds(microTimestamp);
      long nanos = microTimestamp * 1_000 - secondTimestamp * 1_000_000_000;
      Timestamp timestamp = Timestamp.from(Instant.ofEpochSecond(secondTimestamp, nanos));

      return RowFactory.create(
          String.join(".", key),
          timestamp,
          value.getAsset(),
          assetKey.getSnapshotId(),
          assetKey.getType().toString(),
          assetKey.getPath(),
          String.join(".", assetKey.toReportableName()));
    }
  }

  static DynamoStore createStore(GcActionsConfig config) {
    return new DynamoStore(
        DynamoStoreConfig.builder()
            .endpoint(
                Optional.ofNullable(config.getDynamoEndpoint())
                    .map(
                        e -> {
                          try {
                            return new URI(e);
                          } catch (URISyntaxException ex) {
                            throw new RuntimeException(ex);
                          }
                        }))
            .region(Region.of(config.getDynamoRegion()))
            .build());
  }

  private Supplier<Store> store(GcActionsConfig config) {
    if (config.getStoreType() != GcActionsConfig.StoreType.DYNAMO) {
      throw new UnsupportedOperationException("Only the Dynamo tiered store is supported");
    }
    Store store = createStore(config);
    store.start();
    return () -> store;
  }

  private void createTable(TableIdentifier tableIdentifier) {
    CatalogPlugin catalog = spark.sessionState().catalogManager().currentCatalog();
    Identifier ident = Identifier.of(tableIdentifier.namespace().levels(), tableIdentifier.name());
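    // Identity-partition the gc table by tableName; updateUnreferencedAssetTable repartitions
    // its appends by the same column before writing.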
    Transform[] partitions =
        Spark3Util.toTransforms(
            PartitionSpec.builderFor(SparkSchemaUtil.convert(SCHEMA))
                .identity("tableName")
                .build());
    try {
      ((TableCatalog) catalog).createTable(ident, SCHEMA, partitions, ImmutableMap.of());
    } catch (TableAlreadyExistsException e) {
      // table already exists. Does it have the same schema?
      try {
        if (!((TableCatalog) catalog).loadTable(ident).schema().equals(SCHEMA)) {
          throw new RuntimeException(
              String.format(
                  "Cannot create table %s. Table with different schema already exists", ident),
              e);
        }
      } catch (NoSuchTableException noSuchTableException) {
        // can't happen
      }
    } catch (NoSuchNamespaceException e) {
      // this can't happen when using a Nessie Catalog as namespaces are implicit. If this happens
      // you are likely not using a Nessie catalog
      throw new RuntimeException(
          String.format(
              "Cannot create table. Are you using a Nessie Catalog. Catalog is %s",
              catalog.getClass().getName()),
          e);
    }
  }

  public static class Builder {
    private final SparkSession spark;
    private GcActionsConfig actionsConfig;
    private GcOptions gcOptions;
    private TableIdentifier table;

    public Builder(SparkSession spark) {
      this.spark = spark;
    }

    public Builder setActionsConfig(GcActionsConfig actionsConfig) {
      this.actionsConfig = actionsConfig;
      return this;
    }

    public Builder setGcOptions(GcOptions gcOptions) {
      this.gcOptions = gcOptions;
      return this;
    }

    public Builder setTable(TableIdentifier table) {
      this.table = table;
      return this;
    }

    public GcActions build() {
      return new GcActions(spark, actionsConfig, gcOptions, table);
    }
  }
}
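A minimal usage sketch (not part of the original source) showing how the class above is intended to be driven: build a GcActions instance via its Builder, identify unreferenced assets, and append them to the gc table. The GcActionsConfig and GcOptions arguments are assumed to be constructed elsewhere; their builders are not part of this listing.

import org.apache.spark.sql.AnalysisException;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.projectnessie.versioned.gc.actions.GcActions;
import org.projectnessie.versioned.gc.actions.GcActionsConfig;
import org.projectnessie.versioned.tiered.gc.GcOptions;

public class GcActionsExample {

  /** Hypothetical driver; the config objects are assumed to be built by the caller. */
  static void runGcCycle(SparkSession spark, GcActionsConfig actionsConfig, GcOptions gcOptions)
      throws AnalysisException {
    GcActions actions =
        new GcActions.Builder(spark)
            .setActionsConfig(actionsConfig)
            .setGcOptions(gcOptions)
            .setTable(GcActions.DEFAULT_TABLE_IDENTIFIER)
            .build();
    // One GC run: compute the dataset of unreferenced assets (already in the gc table schema)
    // and append it to the gc.identified_tables Iceberg table.
    Dataset<Row> unreferencedAssets = actions.identifyUnreferencedAssets();
    actions.updateUnreferencedAssetTable(unreferencedAssets);
  }
}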



