/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.fs.s3a.impl;
import javax.annotation.Nullable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.CompletableFuture;
import java.util.stream.Collectors;
import com.amazonaws.services.s3.model.DeleteObjectsRequest;
import com.amazonaws.services.s3.model.DeleteObjectsResult;
import org.apache.hadoop.thirdparty.com.google.common.util.concurrent.ListeningExecutorService;
import org.apache.hadoop.thirdparty.com.google.common.util.concurrent.MoreExecutors;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathIsNotEmptyDirectoryException;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.fs.s3a.Invoker;
import org.apache.hadoop.fs.s3a.Retries;
import org.apache.hadoop.fs.s3a.S3AFileStatus;
import org.apache.hadoop.fs.s3a.S3ALocatedFileStatus;
import org.apache.hadoop.fs.s3a.Tristate;
import org.apache.hadoop.fs.s3a.s3guard.BulkOperationState;
import org.apache.hadoop.fs.s3a.s3guard.MetadataStore;
import org.apache.hadoop.fs.s3a.s3guard.S3Guard;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.util.DurationInfo;
import static org.apache.hadoop.fs.store.audit.AuditingFunctions.callableWithinAuditSpan;
import static org.apache.hadoop.thirdparty.com.google.common.base.Preconditions.checkArgument;
import static org.apache.hadoop.fs.s3a.impl.CallableSupplier.maybeAwaitCompletion;
import static org.apache.hadoop.fs.s3a.impl.CallableSupplier.submit;
/**
* Implementation of the delete() operation.
*
 * How S3Guard/Store inconsistency is handled:
 * <ol>
 *   <li>
 *     The list operation does not ask for tombstone markers; objects
 *     under tombstones will be found and deleted.
 *     The {@code extraFilesDeleted} counter will be incremented here.
 *   </li>
 *   <li>
 *     That may result in recently deleted files being found and
 *     duplicate delete requests issued. This is mostly harmless.
 *   </li>
 *   <li>
 *     If a path is considered authoritative on the client, so only S3Guard
 *     is used for listings, we wrap up the delete with a scan of raw S3.
 *     This will find and eliminate OOB additions.
 *   </li>
 *   <li>
 *     Exception 1: simple directory markers of the form PATH + "/".
 *     These are treated as a signal that there are no children; no
 *     listing is made.
 *   </li>
 *   <li>
 *     Exception 2: delete(path, true) where path has a tombstone in S3Guard.
 *     Here the delete is downgraded to a no-op even before this operation
 *     is created. Thus: no listings of S3.
 *   </li>
 * </ol>
 * <p>
 * If this class is logged at debug, requests will be audited:
 * the response to a bulk delete call will be reviewed to see if there
 * were fewer files deleted than requested; that will be printed
 * at WARN level. This is independent of handling rejected delete
 * requests, which raise exceptions; those are processed lower down.
 * <p>
 * Performance tuning:
 * <p>
 * The operation to POST a delete request (or issue many individual
 * DELETE calls) then update the S3Guard table is done in an async
 * operation so that it can overlap with the LIST calls for data.
 * However, only a single operation is queued at a time.
 * <p>
 * Executing more than one batch delete is possible, but it adds
 * complexity in terms of error handling as well as in the data
 * structures used to track outstanding operations.
 * If this is done, then it may be good to experiment with different
 * page sizes. The default value is
 * {@link InternalConstants#MAX_ENTRIES_TO_DELETE}, the maximum a single
 * POST permits.
*
 * <ol>
 *   <li>
 *     Smaller pages executed in parallel may have different
 *     performance characteristics when deleting very large directories,
 *     because it will be the DynamoDB calls which come to dominate.
 *     Any exploration of options here MUST be done with performance
 *     measurements taken from test runs in EC2 against local DDB and S3
 *     stores, so as to ensure network latencies do not skew the results.
 *   </li>
 *   <li>
 *     Note that as the DDB thread/connection pools will be shared across
 *     all active delete operations, speedups will be minimal unless
 *     those pools are large enough to cope with the extra load.
 *   </li>
 * </ol>
 * <p>
* There are also some opportunities to explore in
* {@code DynamoDBMetadataStore} with batching delete requests
* in the DDB APIs.
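 * <p>
 * A usage sketch; the argument values are illustrative, since this
 * operation is normally constructed and invoked inside S3AFileSystem:
 * <pre>{@code
 * DeleteOperation op = new DeleteOperation(
 *     storeContext,        // StoreContext of the filesystem
 *     sourceStatus,        // pre-fetched S3AFileStatus of the path
 *     true,                // recursive delete
 *     operationCallbacks,  // OperationCallbacks implementation
 *     InternalConstants.MAX_ENTRIES_TO_DELETE); // page size
 * boolean deleted = op.execute();
 * }</pre>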
*/
public class DeleteOperation extends ExecutingStoreOperation<Boolean> {
private static final Logger LOG = LoggerFactory.getLogger(
DeleteOperation.class);
/**
* Pre-fetched source status.
*/
private final S3AFileStatus status;
/**
* Recursive delete?
*/
private final boolean recursive;
/**
* Callback provider.
*/
private final OperationCallbacks callbacks;
/**
* Number of entries in a page.
*/
private final int pageSize;
/**
   * Metastore; never null, but may be the NullMetadataStore.
*/
private final MetadataStore metadataStore;
/**
* Executor for async operations.
*/
private final ListeningExecutorService executor;
/**
* List of keys built up for the next delete batch.
*/
  private List<DeleteEntry> keys;
/**
* List of paths built up for incremental deletion on tree delete.
* At the end of the entire delete the full tree is scanned in S3Guard
* and tombstones added. For this reason this list of paths must not
* include directory markers, as that will break the scan.
*/
  private List<Path> paths;
/**
* The single async delete operation, or null.
*/
  private CompletableFuture<Void> deleteFuture;
/**
* Bulk Operation state if this is a bulk operation.
*/
private BulkOperationState operationState;
/**
* Counter of deleted files.
*/
private long filesDeleted;
/**
* Counter of files found in the S3 Store during a raw scan of the store
* after the previous listing was in auth-mode.
*/
private long extraFilesDeleted;
/**
* Constructor.
* @param context store context
* @param status pre-fetched source status
* @param recursive recursive delete?
* @param callbacks callback provider
* @param pageSize size of delete pages
*/
public DeleteOperation(final StoreContext context,
final S3AFileStatus status,
final boolean recursive,
final OperationCallbacks callbacks,
final int pageSize) {
super(context);
this.status = status;
this.recursive = recursive;
this.callbacks = callbacks;
checkArgument(pageSize > 0
&& pageSize <= InternalConstants.MAX_ENTRIES_TO_DELETE,
"page size out of range: %s", pageSize);
this.pageSize = pageSize;
metadataStore = context.getMetadataStore();
executor = MoreExecutors.listeningDecorator(
context.createThrottledExecutor(1));
}
public long getFilesDeleted() {
return filesDeleted;
}
public long getExtraFilesDeleted() {
return extraFilesDeleted;
}
/**
* Delete a file or directory tree.
*
* This call does not create any fake parent directory; that is
* left to the caller.
* The actual delete call is done in a separate thread.
* Only one delete at a time is submitted, however, to reduce the
* complexity of recovering from failures.
*
   * The DynamoDB store deletes paths in parallel itself, so the
   * potentially slow part of the process is somewhat sped up.
* The extra parallelization here is to list files from the store/DDB while
* that delete operation is in progress.
*
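   * A sketch of the possible outcomes; each case assumes a freshly
   * constructed operation, as an instance may only be executed once:
   * <pre>{@code
   * // file or empty directory:         single DELETE, returns true
   * // non-empty directory, recursive:  paged bulk delete, returns true
   * // non-empty directory, !recursive: throws
   * //     PathIsNotEmptyDirectoryException
   * // root directory:                  error logged, returns false
   * boolean deleted = deleteOperation.execute();
   * }</pre>
   *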
* @return true, except in the corner cases of root directory deletion
* @throws PathIsNotEmptyDirectoryException if the path is a dir and this
* is not a recursive delete.
* @throws IOException list failures or an inability to delete a file.
*/
@Retries.RetryTranslated
public Boolean execute() throws IOException {
executeOnlyOnce();
StoreContext context = getStoreContext();
Path path = status.getPath();
LOG.debug("Delete path {} - recursive {}", path, recursive);
LOG.debug("Type = {}",
status.isFile() ? "File"
: (status.isEmptyDirectory() == Tristate.TRUE
? "Empty Directory"
: "Directory"));
String key = context.pathToKey(path);
if (status.isDirectory()) {
LOG.debug("delete: Path is a directory: {}", path);
checkArgument(
status.isEmptyDirectory() != Tristate.UNKNOWN,
"File status must have directory emptiness computed");
if (!key.endsWith("/")) {
key = key + "/";
}
if ("/".equals(key)) {
LOG.error("S3A: Cannot delete the root directory."
+ " Path: {}. Recursive: {}",
status.getPath(), recursive);
return false;
}
if (!recursive && status.isEmptyDirectory() == Tristate.FALSE) {
throw new PathIsNotEmptyDirectoryException(path.toString());
}
if (status.isEmptyDirectory() == Tristate.TRUE) {
LOG.debug("deleting empty directory {}", path);
deleteObjectAtPath(path, key, false);
} else {
deleteDirectoryTree(path, key);
}
} else {
// simple file.
LOG.debug("deleting simple file {}", path);
deleteObjectAtPath(path, key, true);
}
LOG.debug("Deleted {} objects", filesDeleted);
return true;
}
/**
* Delete a directory tree.
*
* This is done by asking the filesystem for a list of all objects under
* the directory path, without using any S3Guard tombstone markers to hide
* objects which may be returned in S3 listings but which are considered
* deleted.
*
   * Once the first {@link #pageSize} worth of objects has been listed, a batch
   * delete is queued for execution in a separate thread; subsequent batches
   * block waiting for the previous call to complete or fail before they are,
   * in turn, deleted in that separate thread.
*
* After all listed objects are queued for deletion,
* if the path is considered authoritative in the client, a final scan
* of S3 without S3Guard is executed, so as to find and delete
* any out-of-band objects in the tree.
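   * <p>
   * A paraphrased sketch of the control flow, in terms of the private
   * helpers defined below:
   * <pre>{@code
   * resetDeleteList();
   * while (listing.hasNext()) {
   *   // queueForDeletion() submits a full page asynchronously,
   *   // after awaiting any batch already in flight.
   *   queueForDeletion(listing.next().toS3AFileStatus());
   * }
   * submitNextBatch();                  // flush the final partial page
   * maybeAwaitCompletion(deleteFuture); // block for the last delete
   * }</pre>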
* @param path directory path
* @param dirKey directory key
* @throws IOException failure
*/
@Retries.RetryTranslated
protected void deleteDirectoryTree(final Path path,
final String dirKey) throws IOException {
// create an operation state so that the store can manage the bulk
// operation if it needs to
operationState = S3Guard.initiateBulkWrite(
metadataStore,
BulkOperationState.OperationType.Delete,
path);
try (DurationInfo ignored =
new DurationInfo(LOG, false, "deleting %s", dirKey)) {
// init the lists of keys and paths to delete
resetDeleteList();
deleteFuture = null;
// list files including any under tombstones through S3Guard
LOG.debug("Getting objects for directory prefix {} to delete", dirKey);
      final RemoteIterator<S3ALocatedFileStatus> locatedFiles =
          callbacks.listFilesAndDirectoryMarkers(path, status,
              false, true);
      // iterate through and delete. The next() call will block when a new S3
      // page is required; thus any active delete submitted to the executor
      // will run in parallel with this.
while (locatedFiles.hasNext()) {
// get the next entry in the listing.
S3AFileStatus child = locatedFiles.next().toS3AFileStatus();
queueForDeletion(child);
}
LOG.debug("Deleting final batch of listed files");
submitNextBatch();
maybeAwaitCompletion(deleteFuture);
      // If S3Guard is authoritative, follow up with a bulk list and
      // delete process on S3; this helps recover from any situation where
      // S3 and S3Guard have become inconsistent.
// This is only needed for auth paths; by performing the previous listing
// without tombstone filtering, any files returned by the non-auth
// S3 list which were hidden under tombstones will have been found
// and deleted.
if (callbacks.allowAuthoritative(path)) {
LOG.debug("Path is authoritatively guarded;"
+ " listing files on S3 for completeness");
// let the ongoing delete finish to avoid duplicates
        final RemoteIterator<S3AFileStatus> objects =
            callbacks.listObjects(path, dirKey);
        // iterate through and delete. The next() call will block when a new S3
        // page is required; thus any active delete submitted to the executor
        // will run in parallel with this.
while (objects.hasNext()) {
// get the next entry in the listing.
extraFilesDeleted++;
S3AFileStatus next = objects.next();
LOG.debug("Found Unlisted entry {}", next);
queueForDeletion(deletionKey(next), null,
next.isDirectory());
}
if (extraFilesDeleted > 0) {
LOG.debug("Raw S3 Scan found {} extra file(s) to delete",
extraFilesDeleted);
// there is no more data:
// await any ongoing operation
submitNextBatch();
maybeAwaitCompletion(deleteFuture);
}
}
// final cleanup of the directory tree in the metastore, including the
// directory entry itself.
try (DurationInfo ignored2 =
new DurationInfo(LOG, false, "Delete metastore")) {
metadataStore.deleteSubtree(path, operationState);
}
} finally {
IOUtils.cleanupWithLogger(LOG, operationState);
}
LOG.debug("Delete \"{}\" completed; deleted {} objects", path,
filesDeleted);
}
/**
* Build an S3 key for a delete request,
   * possibly adding a "/" if it represents a directory and does
* not have a trailing slash already.
* @param stat status to build the key from
* @return a key for a delete request
*/
private String deletionKey(final S3AFileStatus stat) {
return getStoreContext().fullKey(stat);
}
/**
* Queue for deletion.
* @param stat status to queue
* @throws IOException failure of the previous batch of deletions.
*/
private void queueForDeletion(
final S3AFileStatus stat) throws IOException {
queueForDeletion(deletionKey(stat), stat.getPath(), stat.isDirectory());
}
/**
* Queue keys for deletion.
   * Once a page of keys is ready to delete, the batch
   * is submitted to the executor, after waiting for the previous run to
* complete.
*
* @param key key to delete
* @param deletePath nullable path of the key
* @param isDirMarker is the entry a directory?
* @throws IOException failure of the previous batch of deletions.
*/
private void queueForDeletion(final String key,
@Nullable final Path deletePath,
boolean isDirMarker) throws IOException {
LOG.debug("Adding object to delete: \"{}\"", key);
keys.add(new DeleteEntry(key, isDirMarker));
if (deletePath != null) {
if (!isDirMarker) {
paths.add(deletePath);
}
}
if (keys.size() == pageSize) {
submitNextBatch();
}
}
/**
   * Wait for the previous batch to finish, then submit this page.
   * The lists of keys and paths are reset here.
*
* @throws IOException failure of the previous batch of deletions.
*/
private void submitNextBatch()
throws IOException {
// delete a single page of keys and the metadata.
// block for any previous batch.
maybeAwaitCompletion(deleteFuture);
// delete the current page of keys and paths
deleteFuture = submitDelete(keys, paths);
// reset the references so a new list can be built up.
resetDeleteList();
}
/**
* Reset the lists of keys and paths so that a new batch of
   * entries can be built up.
*/
private void resetDeleteList() {
keys = new ArrayList<>(pageSize);
paths = new ArrayList<>(pageSize);
}
/**
* Delete a file or directory marker.
* @param path path
* @param key key
* @param isFile is this a file?
* @throws IOException failure
*/
@Retries.RetryTranslated
private void deleteObjectAtPath(
final Path path,
final String key,
final boolean isFile)
throws IOException {
LOG.debug("delete: {} {}", (isFile ? "file" : "dir marker"), key);
filesDeleted++;
callbacks.deleteObjectAtPath(path, key, isFile, operationState);
}
/**
* Delete a single page of keys and optionally the metadata.
* For a large page, it is the metadata size which dominates.
   * It's possible to invoke this with empty lists of keys or paths.
* If both lists are empty no work is submitted and null is returned.
*
* @param keyList keys to delete.
* @param pathList paths to update the metastore with.
* @return the submitted future or null
*/
  private CompletableFuture<Void> submitDelete(
      final List<DeleteEntry> keyList,
      final List<Path> pathList) {
if (keyList.isEmpty() && pathList.isEmpty()) {
return null;
}
filesDeleted += keyList.size();
return submit(executor,
callableWithinAuditSpan(
getAuditSpan(), () -> {
asyncDeleteAction(operationState,
keyList,
pathList,
LOG.isDebugEnabled());
return null;
}));
}
/**
* The action called in the asynchronous thread to delete
* the keys from S3 and paths from S3Guard.
*
* @param state ongoing operation state
* @param keyList keys to delete.
* @param pathList paths to update the metastore with.
* @param auditDeletedKeys should the results be audited and undeleted
* entries logged?
* @throws IOException failure
*/
@Retries.RetryTranslated
private void asyncDeleteAction(
final BulkOperationState state,
      final List<DeleteEntry> keyList,
      final List<Path> pathList,
final boolean auditDeletedKeys)
throws IOException {
    List<DeleteObjectsResult.DeletedObject> deletedObjects = new ArrayList<>();
try (DurationInfo ignored =
new DurationInfo(LOG, false,
"Delete page of %d keys", keyList.size())) {
DeleteObjectsResult result = null;
      List<Path> undeletedObjects = new ArrayList<>();
if (!keyList.isEmpty()) {
// first delete the files.
        List<DeleteObjectsRequest.KeyVersion> files = keyList.stream()
.filter(e -> !e.isDirMarker)
.map(e -> e.keyVersion)
.collect(Collectors.toList());
LOG.debug("Deleting of {} file objects", files.size());
result = Invoker.once("Remove S3 Files",
status.getPath().toString(),
() -> callbacks.removeKeys(
files,
false,
undeletedObjects,
state,
!auditDeletedKeys));
if (result != null) {
deletedObjects.addAll(result.getDeletedObjects());
}
// now the dirs
        List<DeleteObjectsRequest.KeyVersion> dirs = keyList.stream()
.filter(e -> e.isDirMarker)
.map(e -> e.keyVersion)
.collect(Collectors.toList());
LOG.debug("Deleting of {} directory markers", dirs.size());
// This is invoked with deleteFakeDir = true, so
// S3Guard is not updated.
result = Invoker.once("Remove S3 Dir Markers",
status.getPath().toString(),
() -> callbacks.removeKeys(
dirs,
true,
undeletedObjects,
state,
!auditDeletedKeys));
if (result != null) {
deletedObjects.addAll(result.getDeletedObjects());
}
}
if (!pathList.isEmpty()) {
// delete file paths only. This stops tombstones
// being added until the final directory cleanup
// (HADOOP-17244)
metadataStore.deletePaths(pathList, state);
}
if (auditDeletedKeys) {
// audit the deleted keys
if (deletedObjects.size() != keyList.size()) {
// size mismatch
LOG.warn("Size mismatch in deletion operation. "
+ "Expected count of deleted files: {}; "
+ "actual: {}",
keyList.size(), deletedObjects.size());
// strip out the deleted keys
for (DeleteObjectsResult.DeletedObject del : deletedObjects) {
keyList.removeIf(kv -> kv.getKey().equals(del.getKey()));
}
for (DeleteEntry kv : keyList) {
LOG.debug("{}", kv.getKey());
}
}
}
}
}
/**
* Deletion entry; dir marker state is tracked to control S3Guard
* update policy.
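   * <p>
   * For example:
   * <pre>{@code
   * // a directory marker: removed from S3, no S3Guard tombstone added
   * DeleteEntry marker = new DeleteEntry("dir/path/", true);
   * // a file: removed from S3 and its path passed to the metastore
   * DeleteEntry file = new DeleteEntry("dir/path/file", false);
   * }</pre>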
*/
private static final class DeleteEntry {
private final DeleteObjectsRequest.KeyVersion keyVersion;
private final boolean isDirMarker;
private DeleteEntry(final String key, final boolean isDirMarker) {
this.keyVersion = new DeleteObjectsRequest.KeyVersion(key);
this.isDirMarker = isDirMarker;
}
public String getKey() {
return keyVersion.getKey();
}
@Override
public String toString() {
return "DeleteEntry{" +
"key='" + getKey() + '\'' +
", isDirMarker=" + isDirMarker +
'}';
}
}
}