All downloads are free. The search and download functionality uses the official Maven repository.

org.projectnessie.gc.iceberg.IcebergContentToFiles Maven / Gradle / Ivy

/*
 * Copyright (C) 2022 Dremio
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.projectnessie.gc.iceberg;

import static org.projectnessie.model.Content.Type.ICEBERG_TABLE;
import static org.projectnessie.model.Content.Type.ICEBERG_VIEW;
import static org.projectnessie.storage.uri.StorageUri.SCHEME_FILE;

import com.google.common.base.Preconditions;
import com.google.errorprone.annotations.CanIgnoreReturnValue;
import com.google.errorprone.annotations.MustBeClosed;
import jakarta.annotation.Nonnull;
import java.io.IOException;
import java.net.URI;
import java.util.Map;
import java.util.Objects;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import org.apache.iceberg.ManifestFile;
import org.apache.iceberg.ManifestReaderUtil;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Snapshot;
import org.apache.iceberg.TableMetadata;
import org.apache.iceberg.TableMetadataParser;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.io.FileIO;
import org.apache.iceberg.view.ViewMetadata;
import org.apache.iceberg.view.ViewMetadataParser;
import org.immutables.value.Value;
import org.projectnessie.gc.contents.ContentReference;
import org.projectnessie.gc.expire.ContentToFiles;
import org.projectnessie.gc.files.FileReference;
import org.projectnessie.model.Content;
import org.projectnessie.storage.uri.StorageUri;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Provides functionality to extract information about files and base locations from Iceberg
 * content, which is the one Iceberg table snapshot referred to by a Nessie commit.
 */
@Value.Immutable
public abstract class IcebergContentToFiles implements ContentToFiles {

  private static final Logger LOGGER = LoggerFactory.getLogger(IcebergContentToFiles.class);

  // "Not found" conditions are recognized by the fully qualified class name of the exception
  // (and, for GCS/ADLS, by the exception message), see handleNotFound().
  static final String ICEBERG_NOT_FOUND_EXCEPTION =
      "org.apache.iceberg.exceptions.NotFoundException";
  static final String S3_KEY_NOT_FOUND_EXCEPTION =
      "software.amazon.awssdk.services.s3.model.NoSuchKeyException";
  static final String GCS_STORAGE_EXCEPTION = "com.google.cloud.storage.StorageException";
  static final String ADLS_BLOB_STORAGE_EXCEPTION =
      "com.azure.storage.blob.models.BlobStorageException";

  /** Message prefix of a GCS storage exception that is treated as "object not found". */
  static final String GCS_NOT_FOUND_START = "404 Not Found";

  /** Substrings of an ADLS blob storage exception message that are treated as "not found". */
  static final String ADLS_PATH_NOT_FOUND_CODE = "PathNotFound";

  static final String ADLS_BLOB_NOT_FOUND_CODE = "BlobNotFound";

  /** Returns a new builder for {@link IcebergContentToFiles} instances. */
  public static Builder builder() {
    return ImmutableIcebergContentToFiles.builder();
  }

  /** Builder for {@link IcebergContentToFiles}. */
  public interface Builder {
    /** Sets the {@link FileIO} used to read Iceberg metadata and manifests. */
    @CanIgnoreReturnValue
    Builder io(FileIO io);

    IcebergContentToFiles build();
  }

  /** The {@link FileIO} used to read Iceberg metadata and manifests. */
  abstract FileIO io();

  /**
   * Provides a {@link Stream} with the {@link FileReference}s referencing the table-metadata, the
   * {@link Snapshot#manifestListLocation() manifest-list}, all {@link ManifestFile manifest-files}
   * and all {@link org.apache.iceberg.DataFile data files}. For Iceberg views, only the view
   * metadata file is returned.
   *
   * @param contentReference reference to the Iceberg table or view content
   * @return stream of file references, relative to the table's or view's base location; the stream
   *     is empty if the metadata file is detected as missing. The stream must be closed.
   * @throws IllegalArgumentException if the content type is neither {@code ICEBERG_TABLE} nor
   *     {@code ICEBERG_VIEW}
   * @throws RuntimeException if the metadata cannot be read for a reason other than "not found"
   */
  @Override
  @MustBeClosed
  public Stream<FileReference> extractFiles(ContentReference contentReference) {
    Content.Type contentType = contentReference.contentType();
    if (contentType.equals(ICEBERG_TABLE)) {
      return extractTableFiles(contentReference);
    }
    if (contentType.equals(ICEBERG_VIEW)) {
      return extractViewFiles(contentReference);
    }
    throw new IllegalArgumentException(
        "Expect ICEBERG_TABLE/ICEBERG_VIEW, but got " + contentType);
  }

  /** Reads the view metadata and returns a reference to the view metadata file only. */
  private Stream<FileReference> extractViewFiles(ContentReference contentReference) {
    FileIO io = io();

    ViewMetadata viewMetadata;
    try {
      viewMetadata = ViewMetadataParser.read(io.newInputFile(contentReference.metadataLocation()));
    } catch (Exception notFoundCandidate) {
      return handleNotFound(contentReference, notFoundCandidate, "View");
    }

    Stream<StorageUri> allFiles = Stream.of(metadataStorageUri(contentReference));

    StorageUri baseUri = baseUri(viewMetadata, contentReference);

    return extractFilesRelativize(allFiles, baseUri);
  }

  /**
   * Reads the table metadata and returns references to the metadata file, the manifest list (if
   * present) and, lazily, all manifest files plus the data and delete files they reference.
   */
  private Stream<FileReference> extractTableFiles(ContentReference contentReference) {
    FileIO io = io();

    TableMetadata tableMetadata;
    try {
      tableMetadata = TableMetadataParser.read(io, contentReference.metadataLocation());
    } catch (Exception notFoundCandidate) {
      return handleNotFound(contentReference, notFoundCandidate, "Table");
    }

    long snapshotId =
        Objects.requireNonNull(
            contentReference.snapshotId(),
            "Iceberg content is expected to have a non-null snapshot-ID");

    // A negative snapshot ID selects the table's current snapshot.
    Snapshot snapshot =
        snapshotId < 0L ? tableMetadata.currentSnapshot() : tableMetadata.snapshot(snapshotId);

    Map<Integer, PartitionSpec> specsById = tableMetadata.specsById();

    Stream<StorageUri> allFiles = elementaryUrisFromSnapshot(snapshot, contentReference);

    if (snapshot != null) {
      allFiles =
          Stream.concat(
              allFiles,
              Stream.of("")
                  // .flatMap() for lazy loading
                  .flatMap(
                      x -> {
                        try {
                          @SuppressWarnings("MustBeClosedChecker")
                          Stream<StorageUri> r =
                              allManifestsAndDataFiles(io, snapshot, specsById, contentReference);
                          return r;
                        } catch (Exception e) {
                          String msg =
                              describeFailure("Failed to get manifest files for ", contentReference);
                          LOGGER.error("{}", msg, e);
                          throw new RuntimeException(msg, e);
                        }
                      }));
    }

    StorageUri baseUri = baseUri(tableMetadata, contentReference);

    return extractFilesRelativize(allFiles, baseUri);
  }

  /**
   * Relativizes all given URIs against {@code baseUri} and wraps them as {@link FileReference}s
   * with a modification time of {@code -1}.
   */
  private static Stream<FileReference> extractFilesRelativize(
      Stream<StorageUri> allFiles, StorageUri baseUri) {
    return allFiles.map(baseUri::relativize).map(u -> FileReference.of(u, baseUri, -1L));
  }

  /**
   * Builds the error message used when processing the given content fails.
   *
   * @param prefix leading text of the message, including the trailing space
   * @param contentReference content that was being processed
   */
  private static String describeFailure(String prefix, ContentReference contentReference) {
    return prefix
        + contentReference.contentType()
        + " "
        + contentReference.contentKey()
        + ", content-ID "
        + contentReference.contentId()
        + " at commit "
        + contentReference.commitId()
        + " via "
        + contentReference.metadataLocation();
  }

  /**
   * Checks whether a single throwable represents a "not found" condition of one of the known
   * object stores. Exceptions without a message never match the message-based checks.
   */
  private static boolean isNotFound(Throwable throwable) {
    switch (throwable.getClass().getName()) {
      case ICEBERG_NOT_FOUND_EXCEPTION:
      case S3_KEY_NOT_FOUND_EXCEPTION:
        return true;
      case GCS_STORAGE_EXCEPTION:
        {
          // Throwable.getMessage() may return null
          String message = throwable.getMessage();
          return message != null && message.startsWith(GCS_NOT_FOUND_START);
        }
      case ADLS_BLOB_STORAGE_EXCEPTION:
        {
          String message = throwable.getMessage();
          return message != null
              && (message.contains(ADLS_PATH_NOT_FOUND_CODE)
                  || message.contains(ADLS_BLOB_NOT_FOUND_CODE));
        }
      default:
        return false;
    }
  }

  /**
   * Handles a failure to read table or view metadata: if the exception or one of its causes
   * indicates that the metadata file does not exist, an empty stream is returned, otherwise the
   * failure is logged and rethrown.
   *
   * @param contentReference content whose metadata could not be read
   * @param notFoundCandidate the exception thrown while reading the metadata
   * @param kind human-readable kind of the content, used in the log message
   * @return an empty stream, if the metadata is considered missing
   * @throws RuntimeException wrapping {@code notFoundCandidate}, if it is not a "not found" error
   */
  private static Stream<FileReference> handleNotFound(
      ContentReference contentReference, Exception notFoundCandidate, String kind) {
    boolean notFound = false;
    for (Throwable c = notFoundCandidate; c != null; c = c.getCause()) {
      if (isNotFound(c)) {
        notFound = true;
        break;
      }
      // Guard against self-referencing causes.
      if (c == c.getCause()) {
        break;
      }
    }

    if (notFound) {
      // It is safe to assume that a missing table-metadata means no referenced files.
      // A table-metadata can be missing, because a previous Nessie GC "sweep" phase deleted it.
      LOGGER.info(
          "{} metadata {} for snapshot ID {} for content-key {} at Nessie commit {} does not exist, probably already deleted, assuming no files",
          kind,
          contentReference.metadataLocation(),
          contentReference.snapshotId(),
          contentReference.contentKey(),
          contentReference.commitId());
      return Stream.empty();
    }

    String msg = describeFailure("Failed to extract content of ", contentReference);
    LOGGER.error("{}", msg, notFoundCandidate);
    throw new RuntimeException(msg, notFoundCandidate);
  }

  /**
   * For the given {@link Snapshot}, provide a {@link Stream} of all manifest files with {@link
   * #allDataAndDeleteFiles(FileIO, Map, ManifestFile, ContentReference) all included data and
   * delete files}.
   */
  @MustBeClosed
  static Stream<StorageUri> allManifestsAndDataFiles(
      FileIO io,
      Snapshot snapshot,
      Map<Integer, PartitionSpec> specsById,
      ContentReference contentReference) {
    return allManifests(io, specsById, snapshot)
        .flatMap(
            mf -> {
              StorageUri manifestFileLoc = manifestFileUri(mf, contentReference);
              @SuppressWarnings("MustBeClosedChecker")
              Stream<StorageUri> allDataAndDeleteFiles =
                  allDataAndDeleteFiles(io, specsById, mf, contentReference);
              return Stream.concat(Stream.of(manifestFileLoc), allDataAndDeleteFiles);
            });
  }

  /**
   * Provide all {@link ManifestFile}s for the given {@link Snapshot}.
   *
   * <p>The {@code specsById} parameter is currently not used by this method.
   */
  static Stream<ManifestFile> allManifests(
      FileIO io, Map<Integer, PartitionSpec> specsById, Snapshot snapshot) {
    return snapshot.allManifests(io).stream();
  }

  /**
   * For the given {@link ManifestFile}, provide a {@link Stream} of all data and delete
   * files, means including all {@link org.apache.iceberg.ManifestEntry}s of every status ({@code
   * EXISTING}, {@code ADDED}, {@code DELETED}).
   *
   * <p>Closing the returned stream closes the underlying manifest reader.
   *
   * @throws RuntimeException if the paths cannot be read from the manifest file
   */
  @MustBeClosed
  static Stream<StorageUri> allDataAndDeleteFiles(
      FileIO io,
      Map<Integer, PartitionSpec> specsById,
      ManifestFile manifestFile,
      ContentReference contentReference) {
    CloseableIterable<String> iter;
    try {
      iter = ManifestReaderUtil.readPathsFromManifest(manifestFile, specsById, io);
    } catch (Exception e) {
      throw new RuntimeException(
          "Failed to get paths from manifest file " + manifestFile.path(), e);
    }
    return StreamSupport.stream(iter.spliterator(), false)
        .map(dataFilePath -> dataFileUri(dataFilePath, contentReference))
        .onClose(
            () -> {
              try {
                iter.close();
              } catch (IOException e) {
                throw new RuntimeException(e);
              }
            });
  }

  /**
   * All processed {@link StorageUri}s must have a scheme part and, if the scheme is {@code file},
   * the path must be an absolute path.
   *
   * <p>Locations without a scheme are accepted only if they start with {@code /}; those are
   * converted to {@code file://} URIs.
   *
   * @param type human-readable kind of the location, used in error messages
   * @param location the location to check
   * @param contentReference content the location belongs to, used in error messages
   * @return the validated location as a {@link StorageUri}
   * @throws IllegalArgumentException if the location is not supported
   */
  static StorageUri checkUri(String type, String location, ContentReference contentReference) {
    StorageUri loc = StorageUri.of(location);
    if (loc.scheme() == null) {
      Preconditions.checkArgument(
          location.startsWith("/"),
          "Iceberg content reference points to the %s URI '%s' as content-key %s on commit %s without a scheme and with a relative path, which is not supported.",
          type,
          location,
          contentReference.contentKey(),
          contentReference.commitId());
      return StorageUri.of("file://" + location);
    }

    if (SCHEME_FILE.equals(loc.scheme())) {
      URI uri = URI.create(location);
      Preconditions.checkArgument(
          uri.getSchemeSpecificPart().startsWith("/"),
          "Iceberg content reference points to the %s URI '%s' as content-key %s on commit %s with a non-absolute scheme-specific-part %s, which is not supported.",
          type,
          uri,
          contentReference.contentKey(),
          contentReference.commitId(),
          uri.getSchemeSpecificPart());

      // file URIs with a host part are rejected
      Preconditions.checkArgument(
          uri.getHost() == null,
          "Iceberg content reference points to the host-specific %s URI '%s' as content-key %s on commit %s without a scheme, which is not supported.",
          type,
          location,
          contentReference.contentKey(),
          contentReference.commitId());
    }
    return loc;
  }

  /**
   * Provides the metadata location and, if the snapshot has one, the manifest list location.
   *
   * @param snapshot the snapshot to inspect, may be {@code null}
   * @param contentReference content the snapshot belongs to
   */
  static Stream<StorageUri> elementaryUrisFromSnapshot(
      Snapshot snapshot, ContentReference contentReference) {
    StorageUri metadataLoc = metadataStorageUri(contentReference);

    if (snapshot == null) {
      return Stream.of(metadataLoc);
    }

    String manifestListLocation = snapshot.manifestListLocation();
    if (manifestListLocation == null) {
      // Iceberg spec v1 has the manifest files embedded in the table-metadata, Iceberg spec v2
      // has a separate manifest list file.
      return Stream.of(metadataLoc);
    }

    StorageUri manifestListLoc = checkUri("manifest list", manifestListLocation, contentReference);

    return Stream.of(metadataLoc, manifestListLoc);
  }

  /** Validates and returns the metadata location of the given content reference. */
  private static StorageUri metadataStorageUri(ContentReference contentReference) {
    String metadataLocation =
        Preconditions.checkNotNull(
            contentReference.metadataLocation(),
            "Iceberg content is expected to have a non-null metadata-location for content-key %s on commit %s",
            contentReference.contentKey(),
            contentReference.commitId());
    return checkUri("metadata", metadataLocation, contentReference);
  }

  /** Base location of a table, see {@link #baseUri(String, ContentReference)}. */
  static StorageUri baseUri(
      @Nonnull TableMetadata tableMetadata, @Nonnull ContentReference contentReference) {
    return baseUri(tableMetadata.location(), contentReference);
  }

  /** Base location of a view, see {@link #baseUri(String, ContentReference)}. */
  static StorageUri baseUri(
      @Nonnull ViewMetadata viewMetadata, @Nonnull ContentReference contentReference) {
    return baseUri(viewMetadata.location(), contentReference);
  }

  /** Validates the given location as a base URI, a trailing {@code /} is appended if missing. */
  static StorageUri baseUri(@Nonnull String location, @Nonnull ContentReference contentReference) {
    String loc = location.endsWith("/") ? location : (location + "/");
    return checkUri("location", loc, contentReference);
  }

  /** Validates and returns the location of the given manifest file. */
  static StorageUri manifestFileUri(
      @Nonnull ManifestFile mf, @Nonnull ContentReference contentReference) {
    String manifestFilePath =
        Preconditions.checkNotNull(
            mf.path(),
            "Iceberg manifest file is expected to have a non-null path for content-key %s on commit %s",
            contentReference.contentKey(),
            contentReference.commitId());
    return checkUri("manifest file", manifestFilePath, contentReference);
  }

  /** Validates and returns the location of a data or delete file. */
  static StorageUri dataFileUri(
      @Nonnull String dataFilePath, @Nonnull ContentReference contentReference) {
    return checkUri("data file", dataFilePath, contentReference);
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy