All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hadoop.fs.s3a.impl.DeleteOperation Maven / Gradle / Ivy

Go to download

This module contains code to support integration with Amazon Web Services. It also declares the dependencies needed to work with AWS services.

There is a newer version: 3.4.1
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.fs.s3a.impl;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.CompletableFuture;
import java.util.stream.Collectors;

import com.amazonaws.services.s3.model.DeleteObjectsRequest;
import org.apache.hadoop.thirdparty.com.google.common.util.concurrent.ListeningExecutorService;
import org.apache.hadoop.thirdparty.com.google.common.util.concurrent.MoreExecutors;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathIsNotEmptyDirectoryException;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.fs.s3a.Invoker;
import org.apache.hadoop.fs.s3a.Retries;
import org.apache.hadoop.fs.s3a.S3AFileStatus;
import org.apache.hadoop.fs.s3a.S3ALocatedFileStatus;
import org.apache.hadoop.fs.s3a.Tristate;
import org.apache.hadoop.util.DurationInfo;

import static org.apache.hadoop.fs.store.audit.AuditingFunctions.callableWithinAuditSpan;
import static org.apache.hadoop.thirdparty.com.google.common.base.Preconditions.checkArgument;
import static org.apache.hadoop.fs.s3a.impl.CallableSupplier.maybeAwaitCompletion;
import static org.apache.hadoop.fs.s3a.impl.CallableSupplier.submit;

/**
 * Implementation of the delete() operation.
 * This issues only one bulk delete at a time,
 * intending to update S3Guard after every request succeeded.
 * Now that S3Guard has been removed, it
 * would be possible to issue multiple delete calls
 * in parallel.
 * If this is done, then it may be good to experiment with different
 * page sizes. The default value is
 * {@link InternalConstants#MAX_ENTRIES_TO_DELETE}, the maximum a single
 * POST permits.
 * 

* Smaller pages executed in parallel may have different * performance characteristics when deleting very large directories. * Any exploration of options here MUST be done with performance * measurements taken from test runs in EC2 against local S3 stores, * so as to ensure network latencies do not skew the results. */ public class DeleteOperation extends ExecutingStoreOperation { private static final Logger LOG = LoggerFactory.getLogger( DeleteOperation.class); /** * Pre-fetched source status. */ private final S3AFileStatus status; /** * Recursive delete? */ private final boolean recursive; /** * Callback provider. */ private final OperationCallbacks callbacks; /** * Number of entries in a page. */ private final int pageSize; /** * Executor for async operations. */ private final ListeningExecutorService executor; /** * List of keys built up for the next delete batch. */ private List keys; /** * The single async delete operation, or null. */ private CompletableFuture deleteFuture; /** * Counter of deleted files. */ private long filesDeleted; /** * Constructor. * @param context store context * @param status pre-fetched source status * @param recursive recursive delete? * @param callbacks callback provider * @param pageSize size of delete pages */ public DeleteOperation(final StoreContext context, final S3AFileStatus status, final boolean recursive, final OperationCallbacks callbacks, final int pageSize) { super(context); this.status = status; this.recursive = recursive; this.callbacks = callbacks; checkArgument(pageSize > 0 && pageSize <= InternalConstants.MAX_ENTRIES_TO_DELETE, "page size out of range: %s", pageSize); this.pageSize = pageSize; executor = MoreExecutors.listeningDecorator( context.createThrottledExecutor(1)); } public long getFilesDeleted() { return filesDeleted; } /** * Delete a file or directory tree. *

* This call does not create any fake parent directory; that is * left to the caller. * The actual delete call is done in a separate thread. * Only one delete at a time is submitted, however, to reduce the * complexity of recovering from failures. *

* With S3Guard removed, the problem of updating any * DynamoDB store has gone away -delete calls could now be issued * in parallel. * However, rate limiting may be required to keep write load * below the throttling point. Every entry in a single * bulk delete call counts as a single write request -overloading * an S3 partition with delete calls has been a problem in * the past. * * @return true, except in the corner cases of root directory deletion * @throws PathIsNotEmptyDirectoryException if the path is a dir and this * is not a recursive delete. * @throws IOException list failures or an inability to delete a file. */ @Retries.RetryTranslated public Boolean execute() throws IOException { executeOnlyOnce(); StoreContext context = getStoreContext(); Path path = status.getPath(); LOG.debug("Delete path {} - recursive {}", path, recursive); LOG.debug("Type = {}", status.isFile() ? "File" : (status.isEmptyDirectory() == Tristate.TRUE ? "Empty Directory" : "Directory")); String key = context.pathToKey(path); if (status.isDirectory()) { LOG.debug("delete: Path is a directory: {}", path); checkArgument( status.isEmptyDirectory() != Tristate.UNKNOWN, "File status must have directory emptiness computed"); if (!key.endsWith("/")) { key = key + "/"; } if ("/".equals(key)) { LOG.error("S3A: Cannot delete the root directory." + " Path: {}. Recursive: {}", status.getPath(), recursive); return false; } if (!recursive && status.isEmptyDirectory() == Tristate.FALSE) { throw new PathIsNotEmptyDirectoryException(path.toString()); } if (status.isEmptyDirectory() == Tristate.TRUE) { LOG.debug("deleting empty directory {}", path); deleteObjectAtPath(path, key, false); } else { deleteDirectoryTree(path, key); } } else { // simple file. LOG.debug("deleting simple file {}", path); deleteObjectAtPath(path, key, true); } LOG.debug("Deleted {} objects", filesDeleted); return true; } /** * Delete a directory tree. *

* This is done by asking the filesystem for a list of all objects under * the directory path. *

* Once the first {@link #pageSize} worth of objects has been listed, a batch * delete is queued for execution in a separate thread; subsequent batches * block waiting for the first call to complete or fail before again, * being deleted in the separate thread. *

* After all listed objects are queued for deletion, * @param path directory path * @param dirKey directory key * @throws IOException failure */ @Retries.RetryTranslated protected void deleteDirectoryTree(final Path path, final String dirKey) throws IOException { try (DurationInfo ignored = new DurationInfo(LOG, false, "deleting %s", dirKey)) { // init the lists of keys and paths to delete resetDeleteList(); deleteFuture = null; // list files LOG.debug("Getting objects for directory prefix {} to delete", dirKey); final RemoteIterator locatedFiles = callbacks.listFilesAndDirectoryMarkers(path, status, true); // iterate through and delete. The next() call will block when a new S3 // page is required; this any active delete submitted to the executor // will run in parallel with this. while (locatedFiles.hasNext()) { // get the next entry in the listing. S3AFileStatus child = locatedFiles.next().toS3AFileStatus(); queueForDeletion(child); } LOG.debug("Deleting final batch of listed files"); submitNextBatch(); maybeAwaitCompletion(deleteFuture); } LOG.debug("Delete \"{}\" completed; deleted {} objects", path, filesDeleted); } /** * Build an S3 key for a delete request, * possibly adding a "/" if it represents directory and it does * not have a trailing slash already. * @param stat status to build the key from * @return a key for a delete request */ private String deletionKey(final S3AFileStatus stat) { return getStoreContext().fullKey(stat); } /** * Queue for deletion. * @param stat status to queue * @throws IOException failure of the previous batch of deletions. */ private void queueForDeletion( final S3AFileStatus stat) throws IOException { queueForDeletion(deletionKey(stat), stat.isDirectory()); } /** * Queue keys for deletion. * Once a page of keys are ready to delete this * call is submitted to the executor, after waiting for the previous run to * complete. * * @param key key to delete * @param isDirMarker is the entry a directory? * @throws IOException failure of the previous batch of deletions. */ private void queueForDeletion(final String key, boolean isDirMarker) throws IOException { LOG.debug("Adding object to delete: \"{}\"", key); keys.add(new DeleteEntry(key, isDirMarker)); if (keys.size() == pageSize) { submitNextBatch(); } } /** * Wait for the previous batch to finish then submit this page. * The lists of keys and pages are reset here. * * @throws IOException failure of the previous batch of deletions. */ private void submitNextBatch() throws IOException { // delete a single page of keys and the metadata. // block for any previous batch. maybeAwaitCompletion(deleteFuture); // delete the current page of keys and paths deleteFuture = submitDelete(keys); // reset the references so a new list can be built up. resetDeleteList(); } /** * Reset the lists of keys and paths so that a new batch of * entries can built up. */ private void resetDeleteList() { keys = new ArrayList<>(pageSize); } /** * Delete a file or directory marker. * @param path path * @param key key * @param isFile is this a file? * @throws IOException failure */ @Retries.RetryTranslated private void deleteObjectAtPath( final Path path, final String key, final boolean isFile) throws IOException { LOG.debug("delete: {} {}", (isFile ? "file" : "dir marker"), key); filesDeleted++; callbacks.deleteObjectAtPath(path, key, isFile); } /** * Delete a single page of keys. * If the list is empty no work is submitted and null is returned. * * @param keyList keys to delete. * @return the submitted future or null */ private CompletableFuture submitDelete( final List keyList) { if (keyList.isEmpty()) { return null; } filesDeleted += keyList.size(); return submit(executor, callableWithinAuditSpan( getAuditSpan(), () -> { asyncDeleteAction( keyList); return null; })); } /** * The action called in the asynchronous thread to delete * the keys from S3 and paths from S3Guard. * * @param keyList keys to delete. * entries logged? * @throws IOException failure */ @Retries.RetryTranslated private void asyncDeleteAction( final List keyList) throws IOException { try (DurationInfo ignored = new DurationInfo(LOG, false, "Delete page of %d keys", keyList.size())) { if (!keyList.isEmpty()) { // first delete the files. List files = keyList.stream() .filter(e -> !e.isDirMarker) .map(e -> e.keyVersion) .collect(Collectors.toList()); LOG.debug("Deleting of {} file objects", files.size()); Invoker.once("Remove S3 Files", status.getPath().toString(), () -> callbacks.removeKeys( files, false )); // now the dirs List dirs = keyList.stream() .filter(e -> e.isDirMarker) .map(e -> e.keyVersion) .collect(Collectors.toList()); LOG.debug("Deleting of {} directory markers", dirs.size()); // This is invoked with deleteFakeDir. Invoker.once("Remove S3 Dir Markers", status.getPath().toString(), () -> callbacks.removeKeys( dirs, true )); } } } /** * Deletion entry; dir marker state is tracked to allow * delete requests to be split into file * and marker delete phases. * Without S3Guard, the split is only used * to choose which statistics to update. */ private static final class DeleteEntry { private final DeleteObjectsRequest.KeyVersion keyVersion; private final boolean isDirMarker; private DeleteEntry(final String key, final boolean isDirMarker) { this.keyVersion = new DeleteObjectsRequest.KeyVersion(key); this.isDirMarker = isDirMarker; } public String getKey() { return keyVersion.getKey(); } @Override public String toString() { return "DeleteEntry{" + "key='" + getKey() + '\'' + ", isDirMarker=" + isDirMarker + '}'; } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy