/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.iceberg.aws.s3;

import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.stream.Collectors;
import org.apache.iceberg.aws.AwsClientFactories;
import org.apache.iceberg.aws.AwsClientFactory;
import org.apache.iceberg.aws.AwsProperties;
import org.apache.iceberg.common.DynConstructors;
import org.apache.iceberg.io.BulkDeletionFailureException;
import org.apache.iceberg.io.CredentialSupplier;
import org.apache.iceberg.io.FileIO;
import org.apache.iceberg.io.FileInfo;
import org.apache.iceberg.io.InputFile;
import org.apache.iceberg.io.OutputFile;
import org.apache.iceberg.io.SupportsBulkOperations;
import org.apache.iceberg.io.SupportsPrefixOperations;
import org.apache.iceberg.metrics.MetricsContext;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.iceberg.relocated.com.google.common.collect.Multimaps;
import org.apache.iceberg.relocated.com.google.common.collect.SetMultimap;
import org.apache.iceberg.relocated.com.google.common.collect.Sets;
import org.apache.iceberg.relocated.com.google.common.collect.Streams;
import org.apache.iceberg.util.SerializableMap;
import org.apache.iceberg.util.SerializableSupplier;
import org.apache.iceberg.util.Tasks;
import org.apache.iceberg.util.ThreadPools;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import software.amazon.awssdk.services.s3.S3Client;
import software.amazon.awssdk.services.s3.model.Delete;
import software.amazon.awssdk.services.s3.model.DeleteObjectRequest;
import software.amazon.awssdk.services.s3.model.DeleteObjectsRequest;
import software.amazon.awssdk.services.s3.model.DeleteObjectsResponse;
import software.amazon.awssdk.services.s3.model.GetObjectTaggingRequest;
import software.amazon.awssdk.services.s3.model.GetObjectTaggingResponse;
import software.amazon.awssdk.services.s3.model.ListObjectsV2Request;
import software.amazon.awssdk.services.s3.model.ObjectIdentifier;
import software.amazon.awssdk.services.s3.model.PutObjectTaggingRequest;
import software.amazon.awssdk.services.s3.model.S3Exception;
import software.amazon.awssdk.services.s3.model.Tag;
import software.amazon.awssdk.services.s3.model.Tagging;

/**
 * FileIO implementation backed by S3.
 *
 * <p>Locations used must follow the conventions for S3 URIs (e.g. s3://bucket/path...). URIs with
 * schemes s3a, s3n, https are also treated as s3 file paths. Using this FileIO with other schemes
 * will result in {@link org.apache.iceberg.exceptions.ValidationException}.
 */
public class S3FileIO
    implements FileIO, SupportsBulkOperations, SupportsPrefixOperations, CredentialSupplier {
  private static final Logger LOG = LoggerFactory.getLogger(S3FileIO.class);
  private static final String DEFAULT_METRICS_IMPL =
      "org.apache.iceberg.hadoop.HadoopMetricsContext";
  private static volatile ExecutorService executorService;

  private String credential = null;
  private SerializableSupplier<S3Client> s3;
  private AwsProperties awsProperties;
  private SerializableMap<String, String> properties = null;
  private transient volatile S3Client client;
  private MetricsContext metrics = MetricsContext.nullMetrics();
  private final AtomicBoolean isResourceClosed = new AtomicBoolean(false);

  /**
   * No-arg constructor to load the FileIO dynamically.
   *
   * <p>All fields are initialized by calling {@link S3FileIO#initialize(Map)} later.
   */
  public S3FileIO() {}
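  // Illustrative note (not in the original source): the no-arg constructor is the path taken
  // when Iceberg loads this FileIO reflectively from a catalog's "io-impl" property, roughly:
  //
  //   FileIO io = CatalogUtil.loadFileIO(
  //       "org.apache.iceberg.aws.s3.S3FileIO", catalogProperties, null /* hadoopConf */);
  //
  // which invokes this constructor and then calls initialize(catalogProperties).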

  /**
   * Constructor with custom s3 supplier and default AWS properties.
   *
   * <p>Calling {@link S3FileIO#initialize(Map)} will overwrite information set in this
   * constructor.
   *
   * @param s3 s3 supplier
   */
  public S3FileIO(SerializableSupplier<S3Client> s3) {
    this(s3, new AwsProperties());
  }
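  // Illustrative example (not in the original source): a caller can inject a pre-configured
  // client, e.g. one pointing at a local S3-compatible endpoint (the URL below is hypothetical):
  //
  //   S3FileIO io = new S3FileIO(
  //       () -> S3Client.builder()
  //           .endpointOverride(URI.create("http://localhost:9000"))
  //           .build());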

  /**
   * Constructor with custom s3 supplier and AWS properties.
   *
   * <p>Calling {@link S3FileIO#initialize(Map)} will overwrite information set in this
   * constructor.
   *
   * @param s3 s3 supplier
   * @param awsProperties aws properties
   */
  public S3FileIO(SerializableSupplier<S3Client> s3, AwsProperties awsProperties) {
    this.s3 = s3;
    this.awsProperties = awsProperties;
  }

  @Override
  public InputFile newInputFile(String path) {
    return S3InputFile.fromLocation(path, client(), awsProperties, metrics);
  }

  @Override
  public InputFile newInputFile(String path, long length) {
    return S3InputFile.fromLocation(path, length, client(), awsProperties, metrics);
  }

  @Override
  public OutputFile newOutputFile(String path) {
    return S3OutputFile.fromLocation(path, client(), awsProperties, metrics);
  }

  @Override
  public void deleteFile(String path) {
    if (awsProperties.s3DeleteTags() != null && !awsProperties.s3DeleteTags().isEmpty()) {
      try {
        tagFileToDelete(path, awsProperties.s3DeleteTags());
      } catch (S3Exception e) {
        LOG.warn("Failed to add delete tags: {} to {}", awsProperties.s3DeleteTags(), path, e);
      }
    }

    if (!awsProperties.isS3DeleteEnabled()) {
      return;
    }

    S3URI location = new S3URI(path, awsProperties.s3BucketToAccessPointMapping());
    DeleteObjectRequest deleteRequest =
        DeleteObjectRequest.builder().bucket(location.bucket()).key(location.key()).build();

    client().deleteObject(deleteRequest);
  }

  @Override
  public Map<String, String> properties() {
    return properties.immutableMap();
  }
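  // Illustrative note (not in the original source): when delete tags are configured and S3
  // deletes are disabled (via the "s3.delete.tags.<key>" and "s3.delete-enabled" catalog
  // properties, assuming those property names), deleteFile() only tags the object, e.g. so an
  // S3 lifecycle rule can expire it later; no DeleteObject request is sent.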

  /**
   * Deletes the given paths in a batched manner.
   *
   * <p>The paths are grouped by bucket, and deletion is triggered when we either reach the
   * configured batch size or have a final remainder batch for each bucket.
   *
   * @param paths paths to delete
   */
  @Override
  public void deleteFiles(Iterable<String> paths) throws BulkDeletionFailureException {
    if (awsProperties.s3DeleteTags() != null && !awsProperties.s3DeleteTags().isEmpty()) {
      Tasks.foreach(paths)
          .noRetry()
          .executeWith(executorService())
          .suppressFailureWhenFinished()
          .onFailure(
              (path, exc) ->
                  LOG.warn(
                      "Failed to add delete tags: {} to {}",
                      awsProperties.s3DeleteTags(),
                      path,
                      exc))
          .run(path -> tagFileToDelete(path, awsProperties.s3DeleteTags()));
    }

    if (!awsProperties.isS3DeleteEnabled()) {
      return;
    }

    SetMultimap<String, String> bucketToObjects =
        Multimaps.newSetMultimap(Maps.newHashMap(), Sets::newHashSet);
    int numberOfFailedDeletions = 0;
    for (String path : paths) {
      S3URI location = new S3URI(path, awsProperties.s3BucketToAccessPointMapping());
      String bucket = location.bucket();
      String objectKey = location.key();
      Set<String> objectsInBucket = bucketToObjects.get(bucket);
      if (objectsInBucket.size() == awsProperties.s3FileIoDeleteBatchSize()) {
        List<String> failedDeletionsForBatch = deleteObjectsInBucket(bucket, objectsInBucket);
        numberOfFailedDeletions += failedDeletionsForBatch.size();
        failedDeletionsForBatch.forEach(
            failedPath -> LOG.warn("Failed to delete object at path {}", failedPath));
        bucketToObjects.removeAll(bucket);
      }
      bucketToObjects.get(bucket).add(objectKey);
    }

    // Delete the remainder
    for (Map.Entry<String, Collection<String>> bucketToObjectsEntry :
        bucketToObjects.asMap().entrySet()) {
      final String bucket = bucketToObjectsEntry.getKey();
      final Collection<String> objects = bucketToObjectsEntry.getValue();
      List<String> failedDeletions = deleteObjectsInBucket(bucket, objects);
      failedDeletions.forEach(
          failedPath -> LOG.warn("Failed to delete object at path {}", failedPath));
      numberOfFailedDeletions += failedDeletions.size();
    }

    if (numberOfFailedDeletions > 0) {
      throw new BulkDeletionFailureException(numberOfFailedDeletions);
    }
  }

  private void tagFileToDelete(String path, Set<Tag> deleteTags) throws S3Exception {
    S3URI location = new S3URI(path, awsProperties.s3BucketToAccessPointMapping());
    String bucket = location.bucket();
    String objectKey = location.key();
    GetObjectTaggingRequest getObjectTaggingRequest =
        GetObjectTaggingRequest.builder().bucket(bucket).key(objectKey).build();
    GetObjectTaggingResponse getObjectTaggingResponse =
        client().getObjectTagging(getObjectTaggingRequest);
    // Get existing tags, if any and then add the delete tags
    Set<Tag> tags = Sets.newHashSet();
    if (getObjectTaggingResponse.hasTagSet()) {
      tags.addAll(getObjectTaggingResponse.tagSet());
    }

    tags.addAll(deleteTags);
    PutObjectTaggingRequest putObjectTaggingRequest =
        PutObjectTaggingRequest.builder()
            .bucket(bucket)
            .key(objectKey)
            .tagging(Tagging.builder().tagSet(tags).build())
            .build();
    client().putObjectTagging(putObjectTaggingRequest);
  }

  private List<String> deleteObjectsInBucket(String bucket, Collection<String> objects) {
    if (!objects.isEmpty()) {
      List<ObjectIdentifier> objectIds =
          objects.stream()
              .map(objectKey -> ObjectIdentifier.builder().key(objectKey).build())
              .collect(Collectors.toList());
      DeleteObjectsRequest deleteObjectsRequest =
          DeleteObjectsRequest.builder()
              .bucket(bucket)
              .delete(Delete.builder().objects(objectIds).build())
              .build();
      DeleteObjectsResponse response = client().deleteObjects(deleteObjectsRequest);
      if (response.hasErrors()) {
        return response.errors().stream()
            .map(error -> String.format("s3://%s/%s", bucket, error.key()))
            .collect(Collectors.toList());
      }
    }

    return Lists.newArrayList();
  }
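  // Illustrative example (not in the original source): because keys are grouped per bucket, a
  // call such as
  //
  //   io.deleteFiles(Arrays.asList(
  //       "s3://bucket-a/data/f1.parquet",
  //       "s3://bucket-a/data/f2.parquet",
  //       "s3://bucket-b/data/f3.parquet"));
  //
  // issues one DeleteObjects request per bucket per full batch (the batch size comes from
  // AwsProperties; the S3 API itself caps a DeleteObjects request at 1000 keys).
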
  @Override
  public Iterable<FileInfo> listPrefix(String prefix) {
    S3URI s3uri = new S3URI(prefix, awsProperties.s3BucketToAccessPointMapping());
    ListObjectsV2Request request =
        ListObjectsV2Request.builder().bucket(s3uri.bucket()).prefix(s3uri.key()).build();

    return () ->
        client().listObjectsV2Paginator(request).stream()
            .flatMap(r -> r.contents().stream())
            .map(
                o ->
                    new FileInfo(
                        String.format("%s://%s/%s", s3uri.scheme(), s3uri.bucket(), o.key()),
                        o.size(),
                        o.lastModified().toEpochMilli()))
            .iterator();
  }
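  // Illustrative example (not in the original source): the returned Iterable is lazy; each
  // iteration pages through ListObjectsV2 results on demand:
  //
  //   for (FileInfo info : io.listPrefix("s3://bucket/warehouse/db/table/metadata")) {
  //     LOG.info("{} ({} bytes)", info.location(), info.size());
  //   }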

  /**
   * This method provides a "best-effort" to delete all objects under the given prefix.
   *
   * <p>Bulk delete operations are used and no reattempt is made for deletes if they fail, but will
   * log any individual objects that are not deleted as part of the bulk operation.
   *
   * @param prefix prefix to delete
   */
  @Override
  public void deletePrefix(String prefix) {
    deleteFiles(() -> Streams.stream(listPrefix(prefix)).map(FileInfo::location).iterator());
  }

  private S3Client client() {
    if (client == null) {
      synchronized (this) {
        if (client == null) {
          client = s3.get();
        }
      }
    }
    return client;
  }

  private ExecutorService executorService() {
    if (executorService == null) {
      synchronized (S3FileIO.class) {
        if (executorService == null) {
          executorService =
              ThreadPools.newWorkerPool(
                  "iceberg-s3fileio-delete", awsProperties.s3FileIoDeleteThreads());
        }
      }
    }

    return executorService;
  }

  @Override
  public String getCredential() {
    return credential;
  }

  @Override
  public void initialize(Map<String, String> props) {
    this.properties = SerializableMap.copyOf(props);
    this.awsProperties = new AwsProperties(properties);

    // Do not override s3 client if it was provided
    if (s3 == null) {
      AwsClientFactory clientFactory = AwsClientFactories.from(props);
      if (clientFactory instanceof CredentialSupplier) {
        this.credential = ((CredentialSupplier) clientFactory).getCredential();
      }
      this.s3 = clientFactory::s3;
    }

    // Report Hadoop metrics if Hadoop is available
    try {
      DynConstructors.Ctor<MetricsContext> ctor =
          DynConstructors.builder(MetricsContext.class)
              .loader(S3FileIO.class.getClassLoader())
              .hiddenImpl(DEFAULT_METRICS_IMPL, String.class)
              .buildChecked();
      MetricsContext context = ctor.newInstance("s3");
      context.initialize(properties);
      this.metrics = context;
    } catch (NoClassDefFoundError | NoSuchMethodException | ClassCastException e) {
      LOG.warn(
          "Unable to load metrics class: '{}', falling back to null metrics",
          DEFAULT_METRICS_IMPL,
          e);
    }
  }

  @Override
  public void close() {
    // handles concurrent calls to close()
    if (isResourceClosed.compareAndSet(false, true)) {
      if (client != null) {
        client.close();
      }
    }
  }
}
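// End-to-end usage sketch (illustrative, not part of the original file; the path and property
// value below are examples only):
//
//   S3FileIO io = new S3FileIO();
//   io.initialize(Collections.singletonMap("s3.delete.batch-size", "250"));
//   OutputFile out = io.newOutputFile("s3://bucket/path/data.txt");
//   try (PositionOutputStream stream = out.createOrOverwrite()) {
//     stream.write("hello".getBytes(StandardCharsets.UTF_8));
//   }
//   io.close();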