All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.projectnessie.versioned.gc.IcebergAssetKeyConverter Maven / Gradle / Ivy

There is a newer version: 0.9.2
Show newest version
/*
 * Copyright (C) 2020 Dremio
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.projectnessie.versioned.gc;

import java.io.Serializable;
import java.util.stream.Stream;

import org.apache.hadoop.fs.Path;
import org.apache.iceberg.DataFileCollector;
import org.apache.iceberg.ManifestFile;
import org.apache.iceberg.TableMetadata;
import org.apache.iceberg.TableMetadataParser;
import org.apache.iceberg.hadoop.HadoopFileIO;
import org.apache.spark.util.SerializableConfiguration;
import org.projectnessie.model.Contents;
import org.projectnessie.model.IcebergTable;

import com.google.common.base.Preconditions;


public class IcebergAssetKeyConverter implements AssetKeyConverter, Serializable {

  private final SerializableConfiguration hadoopConfig;

  public IcebergAssetKeyConverter(SerializableConfiguration configuration) {
    hadoopConfig = configuration;
  }

  private Stream allFiles(TableMetadata metadata) {
    //note the namespace is not stored here so we can't know for sure what the full table path is
    final String tableName = new Path(metadata.location()).getName();
    final long snapshotId = metadata.currentSnapshot() == null ? -1L : metadata.currentSnapshot().snapshotId();
    Stream metadataFiles = Stream.of(
      new IcebergAssetKey(metadata.metadataFileLocation(), hadoopConfig, IcebergAssetKey.AssetKeyType.ICEBERG_METADATA, snapshotId,
          tableName), new IcebergAssetKey(metadata.location(), hadoopConfig, IcebergAssetKey.AssetKeyType.TABLE, 0, tableName));

    return Stream.concat(metadataFiles, metadata.snapshots().stream().flatMap(snapshot -> {
      final long thisSnapshotId = snapshot.snapshotId();

      Stream manifestLists = Stream.of(new IcebergAssetKey(snapshot.manifestListLocation(), hadoopConfig,
          IcebergAssetKey.AssetKeyType.ICEBERG_MANIFEST_LIST, thisSnapshotId, tableName));

      Stream manifestsStream = snapshot.allManifests().stream().map(ManifestFile::path)
          .map(x -> new IcebergAssetKey(x, hadoopConfig, IcebergAssetKey.AssetKeyType.ICEBERG_MANIFEST, thisSnapshotId, tableName));

      Stream files = DataFileCollector.dataFiles(new HadoopFileIO(hadoopConfig.value()), snapshot.allManifests())
          .map(x -> new IcebergAssetKey(x, hadoopConfig, IcebergAssetKey.AssetKeyType.DATA_FILE, thisSnapshotId, tableName));
      return Stream.concat(Stream.concat(manifestsStream, files), manifestLists);
    }));
  }


  @Override
  public Stream apply(Contents contents) {
    Preconditions.checkArgument(contents instanceof IcebergTable,
        String.format("Cannot process contents of type %s, only Iceberg Tables", contents.getClass().getName()));
    TableMetadata metadata = TableMetadataParser.read(new HadoopFileIO(hadoopConfig.value()),
        ((IcebergTable) contents).getMetadataLocation());
    return allFiles(metadata);
  }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy