/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.fs.s3a.s3guard;
import javax.annotation.Nullable;
import java.io.IOException;
import java.io.InterruptedIOException;
import java.io.UncheckedIOException;
import java.net.URI;
import java.nio.file.AccessDeniedException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;
import java.util.stream.Collectors;
import com.amazonaws.AmazonServiceException;
import com.amazonaws.auth.AWSCredentialsProvider;
import com.amazonaws.services.dynamodbv2.AmazonDynamoDB;
import com.amazonaws.services.dynamodbv2.document.BatchWriteItemOutcome;
import com.amazonaws.services.dynamodbv2.document.DynamoDB;
import com.amazonaws.services.dynamodbv2.document.Item;
import com.amazonaws.services.dynamodbv2.document.ItemCollection;
import com.amazonaws.services.dynamodbv2.document.PrimaryKey;
import com.amazonaws.services.dynamodbv2.document.PutItemOutcome;
import com.amazonaws.services.dynamodbv2.document.QueryOutcome;
import com.amazonaws.services.dynamodbv2.document.ScanOutcome;
import com.amazonaws.services.dynamodbv2.document.Table;
import com.amazonaws.services.dynamodbv2.document.TableWriteItems;
import com.amazonaws.services.dynamodbv2.document.internal.IteratorSupport;
import com.amazonaws.services.dynamodbv2.document.spec.GetItemSpec;
import com.amazonaws.services.dynamodbv2.document.spec.QuerySpec;
import com.amazonaws.services.dynamodbv2.document.utils.ValueMap;
import com.amazonaws.services.dynamodbv2.model.AmazonDynamoDBException;
import com.amazonaws.services.dynamodbv2.model.ProvisionedThroughputDescription;
import com.amazonaws.services.dynamodbv2.model.TableDescription;
import com.amazonaws.services.dynamodbv2.model.WriteRequest;
import org.apache.hadoop.thirdparty.com.google.common.annotations.VisibleForTesting;
import org.apache.hadoop.thirdparty.com.google.common.base.Preconditions;
import org.apache.hadoop.thirdparty.com.google.common.collect.Lists;
import org.apache.hadoop.thirdparty.com.google.common.util.concurrent.ListeningExecutorService;
import org.apache.hadoop.thirdparty.com.google.common.util.concurrent.MoreExecutors;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathIOException;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.util.functional.CallableRaisingIOE;
import org.apache.hadoop.util.functional.RemoteIterators;
import org.apache.hadoop.fs.s3a.AWSCredentialProviderList;
import org.apache.hadoop.fs.s3a.AWSServiceThrottledException;
import org.apache.hadoop.fs.s3a.Constants;
import org.apache.hadoop.fs.s3a.Invoker;
import org.apache.hadoop.fs.s3a.Retries;
import org.apache.hadoop.fs.s3a.S3AFileStatus;
import org.apache.hadoop.fs.s3a.S3AFileSystem;
import org.apache.hadoop.fs.s3a.S3AUtils;
import org.apache.hadoop.fs.s3a.Tristate;
import org.apache.hadoop.fs.s3a.auth.RoleModel;
import org.apache.hadoop.fs.s3a.auth.RolePolicies;
import org.apache.hadoop.fs.s3a.auth.delegation.AWSPolicyProvider;
import org.apache.hadoop.fs.s3a.impl.StoreContext;
import org.apache.hadoop.io.retry.RetryPolicies;
import org.apache.hadoop.io.retry.RetryPolicy;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.util.BlockingThreadPoolExecutorService;
import org.apache.hadoop.util.DurationInfo;
import org.apache.hadoop.util.ReflectionUtils;
import static org.apache.hadoop.fs.s3a.Constants.*;
import static org.apache.hadoop.fs.s3a.S3AUtils.*;
import static org.apache.hadoop.fs.s3a.auth.RolePolicies.allowAllDynamoDBOperations;
import static org.apache.hadoop.fs.s3a.auth.RolePolicies.allowS3GuardClientOperations;
import static org.apache.hadoop.fs.s3a.impl.CallableSupplier.submit;
import static org.apache.hadoop.fs.s3a.impl.CallableSupplier.waitForCompletion;
import static org.apache.hadoop.fs.s3a.s3guard.PathMetadataDynamoDBTranslation.*;
import static org.apache.hadoop.fs.s3a.s3guard.PathOrderComparators.TOPMOST_PM_LAST;
import static org.apache.hadoop.fs.s3a.s3guard.S3Guard.*;
/**
* DynamoDBMetadataStore is a {@link MetadataStore} that persists
* file system metadata to DynamoDB.
*
* The current implementation uses a schema consisting of a single table. The
* name of the table can be configured by config key
* {@link org.apache.hadoop.fs.s3a.Constants#S3GUARD_DDB_TABLE_NAME_KEY}.
* By default, it matches the name of the S3 bucket. Each item in the table
* represents a single directory or file. Its path is split into separate table
* attributes:
*
* - parent (absolute path of the parent, with bucket name inserted as
* first path component).
* - child (path of that specific child, relative to parent).
* - optional boolean attribute tracking whether the path is a directory.
* Absence or a false value indicates the path is a file.
* - optional long attribute revealing modification time of file.
* This attribute is meaningful only to file items.
* - optional long attribute revealing file length.
* This attribute is meaningful only to file items.
* - optional long attribute revealing block size of the file.
* This attribute is meaningful only to file items.
* - optional string attribute tracking the s3 eTag of the file.
* May be absent if the metadata was entered with a version of S3Guard
* before this was tracked.
* This attribute is meaningful only to file items.
* - optional string attribute tracking the s3 versionId of the file.
* May be absent if the metadata was entered with a version of S3Guard
* before this was tracked.
* This attribute is meaningful only to file items.
*
*
* The DynamoDB partition key is the parent, and the range key is the child.
*
* To allow multiple buckets to share the same DynamoDB table, the bucket
* name is treated as the root directory.
*
* For example, assume the consistent store contains metadata representing this
* file system structure:
*
*
* s3a://bucket/dir1
* |-- dir2
* | |-- file1
* | `-- file2
* `-- dir3
* |-- dir4
* | `-- file3
* |-- dir5
* | `-- file4
* `-- dir6
*
*
* This is persisted to a single DynamoDB table as:
*
*
* ====================================================================================
* | parent | child | is_dir | mod_time | len | etag | ver_id | ... |
* ====================================================================================
* | /bucket | dir1 | true | | | | | |
* | /bucket/dir1 | dir2 | true | | | | | |
* | /bucket/dir1 | dir3 | true | | | | | |
* | /bucket/dir1/dir2 | file1 | | 100 | 111 | abc | mno | |
* | /bucket/dir1/dir2 | file2 | | 200 | 222 | def | pqr | |
* | /bucket/dir1/dir3 | dir4 | true | | | | | |
* | /bucket/dir1/dir3 | dir5 | true | | | | | |
* | /bucket/dir1/dir3/dir4 | file3 | | 300 | 333 | ghi | stu | |
* | /bucket/dir1/dir3/dir5 | file4 | | 400 | 444 | jkl | vwx | |
* | /bucket/dir1/dir3 | dir6 | true | | | | | |
* ====================================================================================
*
*
* This choice of schema is efficient for read access patterns.
* {@link #get(Path)} can be served from a single item lookup.
* {@link #listChildren(Path)} can be served from a query against all rows
* matching the parent (the partition key) and the returned list is guaranteed
* to be sorted by child (the range key). Tracking whether or not a path is a
* directory helps prevent unnecessary queries during traversal of an entire
* sub-tree.
*
* Some mutating operations, notably
* {@link MetadataStore#deleteSubtree(Path, BulkOperationState)} and
* {@link MetadataStore#move(Collection, Collection, BulkOperationState)}
* are less efficient with this schema.
* They require mutating multiple items in the DynamoDB table.
*
* By default, DynamoDB access is performed within the same AWS region as
* the S3 bucket that hosts the S3A instance. During initialization, it checks
* the location of the S3 bucket and creates a DynamoDB client connected to the
* same region. The region may also be set explicitly by setting the config
* parameter {@code fs.s3a.s3guard.ddb.region} to the corresponding region.
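*
* As a purely illustrative sketch (the region and table values below are
* assumptions, not defaults), both settings can also be applied
* programmatically before the store is initialized:
* <pre>{@code
* Configuration conf = new Configuration();
* // both keys are defined in org.apache.hadoop.fs.s3a.Constants
* conf.set("fs.s3a.s3guard.ddb.region", "eu-west-1");       // example region
* conf.set("fs.s3a.s3guard.ddb.table", "s3guard-metadata"); // example table
* }</pre>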
*/
@SuppressWarnings("SynchronizationOnLocalVariableOrMethodParameter")
@InterfaceAudience.Private
@InterfaceStability.Evolving
public class DynamoDBMetadataStore implements MetadataStore,
AWSPolicyProvider {
public static final Logger LOG = LoggerFactory.getLogger(
DynamoDBMetadataStore.class);
/**
* Name of the operations log.
*/
public static final String OPERATIONS_LOG_NAME =
"org.apache.hadoop.fs.s3a.s3guard.Operations";
/**
* A log of all state changing operations to the store;
* only updated at debug level.
*/
public static final Logger OPERATIONS_LOG = LoggerFactory.getLogger(
OPERATIONS_LOG_NAME);
/** parent/child name to use in the version marker. */
public static final String VERSION_MARKER_ITEM_NAME = "../VERSION";
/** parent/child name to use in the version marker. */
public static final String VERSION_MARKER_TAG_NAME = "s3guard_version";
/** Current version number. */
public static final int VERSION = 100;
@VisibleForTesting
static final String BILLING_MODE
= "billing-mode";
@VisibleForTesting
static final String BILLING_MODE_PER_REQUEST
= "per-request";
@VisibleForTesting
static final String BILLING_MODE_PROVISIONED
= "provisioned";
@VisibleForTesting
static final String DESCRIPTION
= "S3Guard metadata store in DynamoDB";
@VisibleForTesting
static final String READ_CAPACITY = "read-capacity";
@VisibleForTesting
static final String WRITE_CAPACITY = "write-capacity";
@VisibleForTesting
static final String STATUS = "status";
@VisibleForTesting
static final String TABLE = "table";
@VisibleForTesting
static final String HINT_DDB_IOPS_TOO_LOW
= " This may be because the write threshold of DynamoDB is set too low.";
@VisibleForTesting
static final String THROTTLING = "Throttling";
public static final String E_ON_DEMAND_NO_SET_CAPACITY
= "Neither ReadCapacityUnits nor WriteCapacityUnits can be specified when BillingMode is PAY_PER_REQUEST";
@VisibleForTesting
static final String E_INCONSISTENT_UPDATE
= "Duplicate and inconsistent entry in update operation";
private static final ValueMap DELETE_TRACKING_VALUE_MAP =
new ValueMap().withBoolean(":false", false);
/**
* The maximum number of outstanding operations to submit
* before blocking to await completion of all the executors.
* Paging work like this is less efficient, but it ensures that
* failures (auth, network, etc.) are picked up before many more
* operations are submitted.
*
* Arbitrary Choice.
* Value: {@value}.
*/
private static final int S3GUARD_DDB_SUBMITTED_TASK_LIMIT = 50;
private AmazonDynamoDB amazonDynamoDB;
private DynamoDB dynamoDB;
private AWSCredentialProviderList credentials;
private String region;
private Table table;
private String tableName;
private Configuration conf;
private String username;
/**
* This policy is mostly for batched writes, not for processing
* exceptions in invoke() calls.
* It also plays a role in
* {@link DynamoDBMetadataStoreTableManager#getVersionMarkerItem()};
* look at that method for the details.
*/
private RetryPolicy batchWriteRetryPolicy;
/**
* The instrumentation is never null; if/when bound to an owner file system,
* that filesystem's statistics will be updated as appropriate.
*/
private MetastoreInstrumentation instrumentation
= new MetastoreInstrumentationImpl();
/** Owner FS: only valid if configured with an owner FS. */
private S3AFileSystem owner;
/** Invoker for IO. Until configured properly, use try-once. */
private Invoker invoker = new Invoker(RetryPolicies.TRY_ONCE_THEN_FAIL,
Invoker.NO_OP
);
/** Invoker for read operations. */
private Invoker readOp;
/** Invoker for write operations. */
private Invoker writeOp;
/** Invoker for scan operations. */
private Invoker scanOp;
private final AtomicLong readThrottleEvents = new AtomicLong(0);
private final AtomicLong writeThrottleEvents = new AtomicLong(0);
private final AtomicLong scanThrottleEvents = new AtomicLong(0);
private final AtomicLong batchWriteCapacityExceededEvents = new AtomicLong(0);
/**
* Total limit on the number of throttle events after which
* we stop warning in the log. Keeps the noise down.
*/
private static final int THROTTLE_EVENT_LOG_LIMIT = 100;
/**
* Count of the total number of throttle events; used to crank back logging.
*/
private AtomicInteger throttleEventCount = new AtomicInteger(0);
/**
* Executor for submitting operations.
*/
private ListeningExecutorService executor;
/**
* Time source. This is used during writes when parent
* entries need to be created.
*/
private ITtlTimeProvider ttlTimeProvider;
private DynamoDBMetadataStoreTableManager tableHandler;
/**
* A utility function to create DynamoDB instance.
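*
* The client factory implementation is chosen via
* {@link org.apache.hadoop.fs.s3a.Constants#S3GUARD_DDB_CLIENT_FACTORY_IMPL}.
* A hedged sketch of pinning the factory explicitly (shown for illustration
* only, reusing the default factory class already referenced below):
* <pre>{@code
* conf.setClass(S3GUARD_DDB_CLIENT_FACTORY_IMPL,
*     S3GUARD_DDB_CLIENT_FACTORY_IMPL_DEFAULT, DynamoDBClientFactory.class);
* }</pre>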
* @param conf the file system configuration
* @param s3Region region of the associated S3 bucket (if any).
* @param bucket Optional bucket to use to look up per-bucket proxy secrets
* @param credentials credentials.
* @return DynamoDB instance.
* @throws IOException I/O error.
*/
private DynamoDB createDynamoDB(
final Configuration conf,
final String s3Region,
final String bucket,
final AWSCredentialsProvider credentials)
throws IOException {
if (amazonDynamoDB == null) {
Preconditions.checkNotNull(conf);
final Class<? extends DynamoDBClientFactory> cls =
conf.getClass(S3GUARD_DDB_CLIENT_FACTORY_IMPL,
S3GUARD_DDB_CLIENT_FACTORY_IMPL_DEFAULT, DynamoDBClientFactory.class);
LOG.debug("Creating DynamoDB client {} with S3 region {}", cls, s3Region);
amazonDynamoDB = ReflectionUtils.newInstance(cls, conf)
.createDynamoDBClient(s3Region, bucket, credentials);
}
return new DynamoDB(amazonDynamoDB);
}
/**
* {@inheritDoc}.
* The credentials for authenticating with S3 are requested from the
* FS via {@link S3AFileSystem#shareCredentials(String)}; this will
* increment the reference counter of these credentials.
* @param fs {@code S3AFileSystem} associated with the MetadataStore
* @param ttlTp the time provider to use for metadata expiry
* @throws IOException on a failure
*/
@Override
@Retries.OnceRaw
public void initialize(FileSystem fs, ITtlTimeProvider ttlTp)
throws IOException {
Preconditions.checkNotNull(fs, "Null filesystem");
Preconditions.checkArgument(fs instanceof S3AFileSystem,
"DynamoDBMetadataStore only supports S3A filesystem - not %s",
fs);
bindToOwnerFilesystem((S3AFileSystem) fs);
final String bucket = owner.getBucket();
String confRegion = conf.getTrimmed(S3GUARD_DDB_REGION_KEY);
if (!StringUtils.isEmpty(confRegion)) {
region = confRegion;
LOG.debug("Overriding S3 region with configured DynamoDB region: {}",
region);
} else {
try {
region = owner.getBucketLocation();
} catch (AccessDeniedException e) {
// access denied here == can't call getBucket. Report meaningfully
URI uri = owner.getUri();
String message =
"Failed to get bucket location as client lacks permission "
+ RolePolicies.S3_GET_BUCKET_LOCATION + " for " + uri;
LOG.error(message);
throw (IOException)new AccessDeniedException(message).initCause(e);
}
LOG.debug("Inferring DynamoDB region from S3 bucket: {}", region);
}
credentials = owner.shareCredentials("s3guard");
dynamoDB = createDynamoDB(conf, region, bucket, credentials);
// use the bucket as the DynamoDB table name if not specified in config
tableName = conf.getTrimmed(S3GUARD_DDB_TABLE_NAME_KEY, bucket);
initDataAccessRetries(conf);
this.ttlTimeProvider = ttlTp;
tableHandler = new DynamoDBMetadataStoreTableManager(
dynamoDB, tableName, region, amazonDynamoDB, conf, readOp,
batchWriteRetryPolicy);
this.table = tableHandler.initTable();
instrumentation.initialized();
}
/**
* Declare that this table is owned by the specific S3A FS instance.
* This will bind some fields to the values provided by the owner,
* including wiring up the instrumentation.
* @param fs owner filesystem
*/
@VisibleForTesting
void bindToOwnerFilesystem(final S3AFileSystem fs) {
owner = fs;
conf = owner.getConf();
StoreContext context = owner.createStoreContext();
instrumentation = context.getInstrumentation()
.getS3GuardInstrumentation();
username = context.getUsername();
executor = MoreExecutors.listeningDecorator(
context.createThrottledExecutor());
ttlTimeProvider = Preconditions.checkNotNull(
context.getTimeProvider(),
"ttlTimeProvider must not be null");
}
/**
* Performs one-time initialization of the metadata store via configuration.
*
* This initialization depends on the configuration object to get AWS
* credentials, DynamoDBFactory implementation class, DynamoDB endpoints,
* DynamoDB table names etc. After initialization, this metadata store does
* not explicitly relate to any S3 bucket, which may be nonexistent.
*
* This is used to operate the metadata store directly beyond the scope of the
* S3AFileSystem integration, e.g. command line tools.
* Generally, callers should use
* {@link MetadataStore#initialize(FileSystem, ITtlTimeProvider)}
* with an initialized {@code S3AFileSystem} instance.
*
* Without a filesystem to act as a reference point, the configuration itself
* must declare the table name and region in the
* {@link Constants#S3GUARD_DDB_TABLE_NAME_KEY} and
* {@link Constants#S3GUARD_DDB_REGION_KEY} respectively.
* It also creates a new credential provider list from the configuration,
* using the base fs.s3a.* options, as there is no bucket to infer per-bucket
* settings from.
*
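* A minimal usage sketch (values are illustrative; {@code timeProvider} is
* assumed to be any {@link ITtlTimeProvider} implementation supplied by the
* caller):
* <pre>{@code
* Configuration conf = new Configuration();
* conf.set(Constants.S3GUARD_DDB_TABLE_NAME_KEY, "s3guard-metadata");
* conf.set(Constants.S3GUARD_DDB_REGION_KEY, "eu-west-1");
* DynamoDBMetadataStore store = new DynamoDBMetadataStore();
* store.initialize(conf, timeProvider);
* }</pre>
*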
* @see MetadataStore#initialize(FileSystem, ITtlTimeProvider)
* @throws IOException if there is an error
* @throws IllegalArgumentException if the configuration is incomplete
*/
@Override
@Retries.OnceRaw
public void initialize(Configuration config,
ITtlTimeProvider ttlTp) throws IOException {
conf = config;
// use the bucket as the DynamoDB table name if not specified in config
tableName = conf.getTrimmed(S3GUARD_DDB_TABLE_NAME_KEY);
Preconditions.checkArgument(!StringUtils.isEmpty(tableName),
"No DynamoDB table name configured");
region = conf.getTrimmed(S3GUARD_DDB_REGION_KEY);
Preconditions.checkArgument(!StringUtils.isEmpty(region),
"No DynamoDB region configured");
// there's no URI here, which complicates life: you cannot
// create AWS providers here which require one.
credentials = createAWSCredentialProviderSet(null, conf);
dynamoDB = createDynamoDB(conf, region, null, credentials);
username = UserGroupInformation.getCurrentUser().getShortUserName();
// without an executor from the owner FS, create one using
// the executor capacity for work.
int executorCapacity = intOption(conf,
EXECUTOR_CAPACITY, DEFAULT_EXECUTOR_CAPACITY, 1);
executor = MoreExecutors.listeningDecorator(
BlockingThreadPoolExecutorService.newInstance(
executorCapacity,
executorCapacity * 2,
longOption(conf, KEEPALIVE_TIME,
DEFAULT_KEEPALIVE_TIME, 0),
TimeUnit.SECONDS,
"s3a-ddb-" + tableName));
initDataAccessRetries(conf);
this.ttlTimeProvider = ttlTp;
tableHandler = new DynamoDBMetadataStoreTableManager(
dynamoDB, tableName, region, amazonDynamoDB, conf, readOp,
batchWriteRetryPolicy);
this.table = tableHandler.initTable();
}
/**
* Set retry policy. This is driven by the value of
* {@link Constants#S3GUARD_DDB_MAX_RETRIES} with an exponential backoff
* between each attempt of {@link Constants#S3GUARD_DDB_THROTTLE_RETRY_INTERVAL}
* milliseconds.
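*
* A hedged example of tuning these values (the numbers are illustrative
* only, not recommendations):
* <pre>{@code
* conf.setInt(S3GUARD_DDB_MAX_RETRIES, 9);
* conf.setTimeDuration(S3GUARD_DDB_THROTTLE_RETRY_INTERVAL,
*     100, TimeUnit.MILLISECONDS);
* }</pre>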
* @param config configuration for data access
*/
private void initDataAccessRetries(Configuration config) {
batchWriteRetryPolicy = RetryPolicies
.exponentialBackoffRetry(
config.getInt(S3GUARD_DDB_MAX_RETRIES,
S3GUARD_DDB_MAX_RETRIES_DEFAULT),
conf.getTimeDuration(S3GUARD_DDB_THROTTLE_RETRY_INTERVAL,
S3GUARD_DDB_THROTTLE_RETRY_INTERVAL_DEFAULT,
TimeUnit.MILLISECONDS),
TimeUnit.MILLISECONDS);
final RetryPolicy throttledRetryRetryPolicy
= new S3GuardDataAccessRetryPolicy(config);
readOp = new Invoker(throttledRetryRetryPolicy, this::readRetryEvent);
writeOp = new Invoker(throttledRetryRetryPolicy, this::writeRetryEvent);
scanOp = new Invoker(throttledRetryRetryPolicy, this::scanRetryEvent);
}
@Override
@Retries.RetryTranslated
public void delete(Path path,
final BulkOperationState operationState)
throws IOException {
innerDelete(path, true,
extractOrCreate(operationState,
BulkOperationState.OperationType.Delete));
}
@Override
@Retries.RetryTranslated
public void forgetMetadata(Path path) throws IOException {
LOG.debug("Forget metadata for {}", path);
innerDelete(path, false, null);
}
/**
* Inner delete operation; the action taken depends on the {@code tombstone} flag.
* No tombstone: delete the entry. Tombstone: create a tombstone entry.
* There is no check as to whether the entry exists in the table first.
* @param path path to delete
* @param tombstone flag to create a tombstone marker
* @param ancestorState ancestor state for context.
* @throws IOException I/O error.
*/
@Retries.RetryTranslated
private void innerDelete(final Path path,
final boolean tombstone,
final AncestorState ancestorState)
throws IOException {
checkPath(path);
LOG.debug("Deleting from table {} in region {}: {}",
tableName, region, path);
// deleting nonexistent item consumes 1 write capacity; skip it
if (path.isRoot()) {
LOG.debug("Skip deleting root directory as it does not exist in table");
return;
}
// whether repeated delete operations are considered idempotent is based
// on the policy of S3A itself
boolean idempotent = S3AFileSystem.DELETE_CONSIDERED_IDEMPOTENT;
if (tombstone) {
Preconditions.checkArgument(ttlTimeProvider != null, "ttlTimeProvider "
+ "must not be null");
final PathMetadata pmTombstone = PathMetadata.tombstone(path,
ttlTimeProvider.getNow());
Item item = PathMetadataDynamoDBTranslation.pathMetadataToItem(
new DDBPathMetadata(pmTombstone));
writeOp.retry(
"Put tombstone",
path.toString(),
idempotent,
() -> {
logPut(ancestorState, item);
recordsWritten(1);
table.putItem(item);
});
} else {
PrimaryKey key = pathToKey(path);
writeOp.retry(
"Delete key",
path.toString(),
idempotent,
() -> {
// record the attempt so even on retry the counter goes up.
logDelete(ancestorState, key);
recordsDeleted(1);
table.deleteItem(key);
});
}
}
@Override
@Retries.RetryTranslated
public void deleteSubtree(Path path,
final BulkOperationState operationState)
throws IOException {
checkPath(path);
LOG.debug("Deleting subtree from table {} in region {}: {}",
tableName, region, path);
final PathMetadata meta = get(path);
if (meta == null) {
LOG.debug("Subtree path {} does not exist; this will be a no-op", path);
return;
}
if (meta.isDeleted()) {
LOG.debug("Subtree path {} is deleted; this will be a no-op", path);
return;
}
deleteEntries(RemoteIterators.mappingRemoteIterator(
new DescendantsIterator(this, meta),
FileStatus::getPath),
operationState);
}
@Override
@Retries.RetryTranslated
public void deletePaths(Collection<Path> paths,
final BulkOperationState operationState)
throws IOException {
deleteEntries(RemoteIterators.remoteIteratorFromIterable(paths),
operationState);
}
/**
* Delete the entries under an iterator.
* There's no attempt to order the paths: they are
* deleted in the order passed in.
* @param entries entries to delete.
* @param operationState Nullable operation state
* @throws IOException failure
*/
@Retries.RetryTranslated
private void deleteEntries(RemoteIterator<Path> entries,
final BulkOperationState operationState)
throws IOException {
final List<CompletableFuture<Void>> futures = new ArrayList<>();
AncestorState state = extractOrCreate(operationState,
BulkOperationState.OperationType.Delete);
while (entries.hasNext()) {
final Path pathToDelete = entries.next();
futures.add(submit(executor, () -> {
innerDelete(pathToDelete, true, state);
return null;
}));
if (futures.size() > S3GUARD_DDB_SUBMITTED_TASK_LIMIT) {
// first batch done; block for completion.
waitForCompletion(futures);
futures.clear();
}
}
// now wait for the final set.
waitForCompletion(futures);
}
/**
* Get a consistent view of an item.
* @param path path to look up in the database
* @return the result
* @throws IOException failure
*/
@Retries.RetryTranslated
private Item getConsistentItem(final Path path) throws IOException {
PrimaryKey key = pathToKey(path);
final GetItemSpec spec = new GetItemSpec()
.withPrimaryKey(key)
.withConsistentRead(true); // strictly consistent read
return readOp.retry("get",
path.toString(),
true,
() -> {
recordsRead(1);
return table.getItem(spec);
});
}
@Override
@Retries.RetryTranslated
public DDBPathMetadata get(Path path) throws IOException {
return get(path, false);
}
@Override
@Retries.RetryTranslated
public DDBPathMetadata get(Path path, boolean wantEmptyDirectoryFlag)
throws IOException {
checkPath(path);
LOG.debug("Get from table {} in region {}: {} ; wantEmptyDirectory={}",
tableName, region, path, wantEmptyDirectoryFlag);
DDBPathMetadata result = innerGet(path, wantEmptyDirectoryFlag);
LOG.debug("result of get {} is: {}", path, result);
return result;
}
/**
* Inner get operation, as invoked in the retry logic.
* @param path the path to get
* @param wantEmptyDirectoryFlag Set to true to give a hint to the
* MetadataStore that it should try to compute the empty directory flag.
* @return metadata for {@code path}, {@code null} if not found
* @throws IOException IO problem
*/
@Retries.RetryTranslated
private DDBPathMetadata innerGet(Path path, boolean wantEmptyDirectoryFlag)
throws IOException {
final DDBPathMetadata meta;
if (path.isRoot()) {
// Root does not persist in the table
meta =
new DDBPathMetadata(makeDirStatus(username, path));
} else {
final Item item = getConsistentItem(path);
meta = itemToPathMetadata(item, username);
LOG.debug("Get from table {} in region {} returning for {}: {}",
tableName, region, path, meta);
}
if (wantEmptyDirectoryFlag && meta != null && !meta.isDeleted()) {
final FileStatus status = meta.getFileStatus();
// for a non-deleted directory, we query its direct undeleted children
// to determine the isEmpty bit. There's no TTL checking going on here.
if (status.isDirectory()) {
final QuerySpec spec = new QuerySpec()
.withHashKey(pathToParentKeyAttribute(path))
.withConsistentRead(true)
.withFilterExpression(IS_DELETED + " = :false")
.withValueMap(DELETE_TRACKING_VALUE_MAP);
boolean hasChildren = readOp.retry("get/hasChildren",
path.toString(),
true,
() -> {
// issue the query
final IteratorSupport<Item, QueryOutcome> it = table.query(
spec).iterator();
// if non empty, log the result to aid with some debugging
if (it.hasNext()) {
if (LOG.isDebugEnabled()) {
LOG.debug("Dir {} is non-empty", status.getPath());
while(it.hasNext()) {
LOG.debug("{}", itemToPathMetadata(it.next(), username));
}
}
return true;
} else {
return false;
}
});
// If directory is authoritative, we can set the empty directory flag
// to TRUE or FALSE. Otherwise FALSE, or UNKNOWN.
if (meta.isAuthoritativeDir()) {
meta.setIsEmptyDirectory(
hasChildren ? Tristate.FALSE : Tristate.TRUE);
} else {
meta.setIsEmptyDirectory(
hasChildren ? Tristate.FALSE : Tristate.UNKNOWN);
}
}
}
return meta;
}
/**
* Make a S3AFileStatus object for a directory at given path.
* The FileStatus only contains what S3A needs, and omits mod time
* since S3A uses its own implementation which returns current system time.
* @param dirOwner username of owner
* @param path path to dir
* @return new S3AFileStatus
*/
private S3AFileStatus makeDirStatus(String dirOwner, Path path) {
return new S3AFileStatus(Tristate.UNKNOWN, path, dirOwner);
}
@Override
@Retries.RetryTranslated
public DirListingMetadata listChildren(final Path path) throws IOException {
checkPath(path);
LOG.debug("Listing table {} in region {}: {}", tableName, region, path);
final QuerySpec spec = new QuerySpec()
.withHashKey(pathToParentKeyAttribute(path))
.withConsistentRead(true); // strictly consistent read
final List<PathMetadata> metas = new ArrayList<>();
// find the children in the table
final ItemCollection<QueryOutcome> items = scanOp.retry(
"listChildren",
path.toString(),
true,
() -> table.query(spec));
// now wrap the result with retry logic
try {
for (Item item : wrapWithRetries(items)) {
metas.add(itemToPathMetadata(item, username));
}
} catch (UncheckedIOException e) {
// failure in the iterators; unwrap.
throw e.getCause();
}
// Minor race condition here - if the path is deleted between
// getting the list of items and the directory metadata we might
// get a null in DDBPathMetadata.
return getDirListingMetadataFromDirMetaAndList(path, metas,
get(path));
}
DirListingMetadata getDirListingMetadataFromDirMetaAndList(Path path,
List<PathMetadata> metas, DDBPathMetadata dirPathMeta) {
boolean isAuthoritative = false;
if (dirPathMeta != null) {
isAuthoritative = dirPathMeta.isAuthoritativeDir();
}
LOG.trace("Listing table {} in region {} for {} returning {}",
tableName, region, path, metas);
if (!metas.isEmpty() && dirPathMeta == null) {
// We handle this case as if the directory had been deleted.
LOG.warn("Directory marker is deleted, but the list of the directory "
+ "elements is not empty: {}. This case is handled as if the "
+ "directory was deleted.", metas);
return null;
}
if(metas.isEmpty() && dirPathMeta == null) {
return null;
}
return new DirListingMetadata(path, metas, isAuthoritative,
dirPathMeta.getLastUpdated());
}
/**
* Origin of entries in the ancestor map built up in
* {@link #completeAncestry(Collection, AncestorState)}.
* This is done to stop generated ancestor entries from overwriting those
* in the store, while allowing those requested in the API call to do so.
*/
private enum EntryOrigin {
Requested, // requested in method call
Retrieved, // retrieved from DDB: do not resubmit
Generated // generated ancestor.
}
/**
* Build the list of all parent entries.
*
* Thread safety: none; callers are required to synchronize on
* {@code ancestorState}.
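*
* For example (paths are illustrative), if the request contains only
* {@code /bucket/dir1/dir2/file} and neither {@code dir1} nor {@code dir2}
* is known to the ancestor state or the store, directory entries for
* {@code /bucket/dir1} and {@code /bucket/dir1/dir2} are generated and
* returned alongside the requested file entry.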
* @param pathsToCreate paths to create
* @param ancestorState ongoing ancestor state.
* @return the full ancestry paths
*/
private Collection<DDBPathMetadata> completeAncestry(
final Collection<DDBPathMetadata> pathsToCreate,
final AncestorState ancestorState) throws IOException {
// Key on path to allow fast lookup
Map<Path, Pair<EntryOrigin, DDBPathMetadata>> ancestry = new HashMap<>();
LOG.debug("Completing ancestry for {} paths", pathsToCreate.size());
// we sort the inputs to guarantee that the topmost entries come first.
// that way if the put request contains both parents and children
// then the existing parents will not be re-created -they will just
// be added to the ancestor list first.
List<DDBPathMetadata> sortedPaths = new ArrayList<>(pathsToCreate);
sortedPaths.sort(PathOrderComparators.TOPMOST_PM_FIRST);
// iterate through the paths.
for (DDBPathMetadata entry : sortedPaths) {
Preconditions.checkArgument(entry != null);
Path path = entry.getFileStatus().getPath();
LOG.debug("Adding entry {}", path);
if (path.isRoot()) {
// this is a root entry: do not add it.
break;
}
// add it to the ancestor state, failing if it is already there and
// of a different type.
DDBPathMetadata oldEntry = ancestorState.put(path, entry);
boolean addAncestors = true;
if (oldEntry != null) {
// check for and warn if the existing bulk operation has an inconsistent
// entry.
// two directories or two files are both allowed.
// file-over-file can happen in multipart uploaders when the same
// uploader is overwriting file entries to the same destination as
// part of its bulk operation.
boolean oldWasDir = oldEntry.getFileStatus().isDirectory();
boolean newIsDir = entry.getFileStatus().isDirectory();
if ((oldWasDir && !newIsDir)
|| (!oldWasDir && newIsDir)) {
LOG.warn("Overwriting a S3Guard file created in the operation: {}",
oldEntry);
LOG.warn("With new entry: {}", entry);
// restore the old state
ancestorState.put(path, oldEntry);
// then raise an exception
throw new PathIOException(path.toString(),
String.format("%s old %s new %s",
E_INCONSISTENT_UPDATE,
oldEntry,
entry));
} else {
// a directory is already present. Log and continue.
LOG.debug("Directory at {} being updated with value {}",
path, entry);
// and we skip the subsequent parent scan as we've already been
// here
addAncestors = false;
}
}
// add the entry to the ancestry map as an explicitly requested entry.
ancestry.put(path, Pair.of(EntryOrigin.Requested, entry));
// now scan up the ancestor tree to see if there are any
// immediately missing entries.
Path parent = path.getParent();
while (addAncestors
&& !parent.isRoot() && !ancestry.containsKey(parent)) {
if (!ancestorState.findEntry(parent, true)) {
// there is no entry in the ancestor state.
// look in the store
DDBPathMetadata md;
Pair<EntryOrigin, DDBPathMetadata> newEntry;
final Item item = getConsistentItem(parent);
if (item != null && !itemToPathMetadata(item, username).isDeleted()) {
// This is an undeleted entry found in the database.
// register it in ancestor state and in the map of entries to create
// as a retrieved entry
md = itemToPathMetadata(item, username);
LOG.debug("Found existing entry for parent: {}", md);
newEntry = Pair.of(EntryOrigin.Retrieved, md);
// and we break, assuming that if there is an entry, its parents
// are valid too.
addAncestors = false;
} else {
// A directory entry was not found in the DB. Create one.
LOG.debug("auto-create ancestor path {} for child path {}",
parent, path);
final S3AFileStatus status = makeDirStatus(parent, username);
md = new DDBPathMetadata(status, Tristate.FALSE,
false, false, ttlTimeProvider.getNow());
// declare to be a generated entry
newEntry = Pair.of(EntryOrigin.Generated, md);
}
// insert into the ancestor state to avoid further checks
ancestorState.put(parent, md);
ancestry.put(parent, newEntry);
}
parent = parent.getParent();
}
}
// we now have a list of entries which were not in the operation state.
// Filter out those which were retrieved, to produce a list of those
// which must be written to the database.
// TODO sort in reverse order of existence
return ancestry.values().stream()
.filter(p -> p.getLeft() != EntryOrigin.Retrieved)
.map(Pair::getRight)
.collect(Collectors.toList());
}
/**
* {@inheritDoc}
*
* The implementation scans up the directory tree and does a get()
* for each entry; at each level where one is found it is added to the
* ancestor state.
*
* The original implementation would stop on finding the first non-empty
* parent. This (re) implementation issues a GET for every parent entry
* and so detects and recovers from a tombstone marker further up the tree
* (i.e. an inconsistent store is corrected for).
*
* If {@code operationState} is not null, when this method returns the
* operation state will be updated with all new entries created.
* This ensures that subsequent operations with the same store will not
* trigger new updates.
* @param qualifiedPath path to update
* @param operationState (nullable) operational state for a bulk update
* @throws IOException on failure.
*/
@SuppressWarnings("SynchronizationOnLocalVariableOrMethodParameter")
@Override
@Retries.RetryTranslated
public void addAncestors(final Path qualifiedPath,
@Nullable final BulkOperationState operationState) throws IOException {
Collection<DDBPathMetadata> newDirs = new ArrayList<>();
final AncestorState ancestorState = extractOrCreate(operationState,
BulkOperationState.OperationType.Put);
Path parent = qualifiedPath.getParent();
boolean entryFound = false;
// Iterate up the parents.
// note that only ancestorState get/set operations are synchronized;
// the DDB read between them is not. As a result, more than one
// thread may probe the state, find the entry missing, do the database
// query and add the entry.
// This is done to avoid making the remote dynamo query part of the
// synchronized block.
// If a race does occur, the cost is simply one extra GET and potentially
// one extra PUT.
while (!parent.isRoot()) {
synchronized (ancestorState) {
if (ancestorState.contains(parent)) {
// the ancestry map contains the key, so no need to even look for it.
break;
}
}
// we don't worry about tombstone expiry here as expired or not,
// a directory entry will go in.
PathMetadata directory = get(parent);
if (directory == null || directory.isDeleted()) {
if (entryFound) {
LOG.warn("Inconsistent S3Guard table: adding directory {}", parent);
}
S3AFileStatus status = makeDirStatus(username, parent);
LOG.debug("Adding new ancestor entry {}", status);
DDBPathMetadata meta = new DDBPathMetadata(status, Tristate.FALSE,
false, ttlTimeProvider.getNow());
newDirs.add(meta);
// Do not update ancestor state here, as it
// will happen in the innerPut() call. Were we to add it
// here that put operation would actually (mistakenly) skip
// creating the entry.
} else {
// an entry was found. Check its type
entryFound = true;
if (directory.getFileStatus().isFile()) {
throw new PathIOException(parent.toString(),
"Cannot overwrite parent file: metastore is"
+ " in an inconsistent state");
}
// the directory exists. Add it to the ancestor state for next time.
synchronized (ancestorState) {
ancestorState.put(parent, new DDBPathMetadata(directory));
}
}
parent = parent.getParent();
}
// the listing of directories to put is all those parents which we know
// are not in the store or BulkOperationState.
if (!newDirs.isEmpty()) {
// patch up the time.
patchLastUpdated(newDirs, ttlTimeProvider);
innerPut(newDirs, operationState);
}
}
/**
* {@inheritDoc}.
*
* The DDB implementation sorts all the paths such that new items
* are ordered highest level entry first; deleted items are ordered
* lowest entry first.
*
* This is to ensure that if a client failed partway through the update,
* there will be no entries in the table which lack parent entries.
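*
* For example (paths are illustrative), a rename of {@code /bucket/src} to
* {@code /bucket/dest} writes the entry for {@code dest} before the one for
* {@code dest/file}, and writes the tombstone for {@code src/file} before
* the tombstone for {@code src}.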
* @param pathsToDelete Collection of all paths that were removed from the
* source directory tree of the move.
* @param pathsToCreate Collection of all PathMetadata for the new paths
* that were created at the destination of the rename
* ().
* @param operationState Any ongoing state supplied to the rename tracker
* which is to be passed in with each move operation.
* @throws IOException if there is an error
*/
@Override
@Retries.RetryTranslated
public void move(@Nullable Collection<Path> pathsToDelete,
@Nullable Collection<PathMetadata> pathsToCreate,
@Nullable final BulkOperationState operationState) throws IOException {
if (pathsToDelete == null && pathsToCreate == null) {
return;
}
LOG.debug("Moving paths of table {} in region {}: {} paths to delete and {}"
+ " paths to create", tableName, region,
pathsToDelete == null ? 0 : pathsToDelete.size(),
pathsToCreate == null ? 0 : pathsToCreate.size());
LOG.trace("move: pathsToDelete = {}, pathsToCreate = {}", pathsToDelete,
pathsToCreate);
// In DynamoDBMetadataStore implementation, we assume that if a path
// exists, all its ancestors will also exist in the table.
// Following code is to maintain this invariant by putting all ancestor
// directories of the paths to create.
// ancestor paths that are not explicitly added to paths to create
AncestorState ancestorState = extractOrCreate(operationState,
BulkOperationState.OperationType.Rename);
List<DDBPathMetadata> newItems = new ArrayList<>();
if (pathsToCreate != null) {
// create all parent entries.
// this is synchronized on the move state so that across both serialized
// and parallelized renames, duplicate ancestor entries are not created.
synchronized (ancestorState) {
newItems.addAll(
completeAncestry(
pathMetaToDDBPathMeta(pathsToCreate),
ancestorState));
}
}
// sort all the new items topmost first.
newItems.sort(PathOrderComparators.TOPMOST_PM_FIRST);
// now process the deletions.
if (pathsToDelete != null) {
List<DDBPathMetadata> tombstones = new ArrayList<>(pathsToDelete.size());
for (Path meta : pathsToDelete) {
Preconditions.checkArgument(ttlTimeProvider != null, "ttlTimeProvider"
+ " must not be null");
final PathMetadata pmTombstone = PathMetadata.tombstone(meta,
ttlTimeProvider.getNow());
tombstones.add(new DDBPathMetadata(pmTombstone));
}
// sort all the tombstones lowest first.
tombstones.sort(TOPMOST_PM_LAST);
newItems.addAll(tombstones);
}
processBatchWriteRequest(ancestorState,
null, pathMetadataToItem(newItems));
}
/**
* Helper method to issue a batch write request to DynamoDB.
*
* - Keys to delete are processed ahead of writing new items.
* - No attempt is made to sort the input: the caller must do that.
*
* As well as retrying on the operation invocation, incomplete
* batches are retried until all have been processed.
*
* @param ancestorState ancestor state for logging
* @param keysToDelete primary keys to be deleted; can be null
* @param itemsToPut new items to be put; can be null
* @return the number of iterations needed to complete the call.
*/
@Retries.RetryTranslated("Outstanding batch items are updated with backoff")
private int processBatchWriteRequest(
@Nullable AncestorState ancestorState,
PrimaryKey[] keysToDelete,
Item[] itemsToPut) throws IOException {
final int totalToDelete = (keysToDelete == null ? 0 : keysToDelete.length);
final int totalToPut = (itemsToPut == null ? 0 : itemsToPut.length);
if (totalToPut == 0 && totalToDelete == 0) {
LOG.debug("Ignoring empty batch write request");
return 0;
}
int count = 0;
int batches = 0;
while (count < totalToDelete + totalToPut) {
final TableWriteItems writeItems = new TableWriteItems(tableName);
int numToDelete = 0;
if (keysToDelete != null
&& count < totalToDelete) {
numToDelete = Math.min(S3GUARD_DDB_BATCH_WRITE_REQUEST_LIMIT,
totalToDelete - count);
PrimaryKey[] toDelete = Arrays.copyOfRange(keysToDelete,
count, count + numToDelete);
LOG.debug("Deleting {} entries: {}", toDelete.length, toDelete);
writeItems.withPrimaryKeysToDelete(toDelete);
count += numToDelete;
}
if (numToDelete < S3GUARD_DDB_BATCH_WRITE_REQUEST_LIMIT
&& itemsToPut != null
&& count < totalToDelete + totalToPut) {
final int numToPut = Math.min(
S3GUARD_DDB_BATCH_WRITE_REQUEST_LIMIT - numToDelete,
totalToDelete + totalToPut - count);
final int index = count - totalToDelete;
writeItems.withItemsToPut(
Arrays.copyOfRange(itemsToPut, index, index + numToPut));
count += numToPut;
}
// if there's a retry and another process updates things then it's not
// quite idempotent, but this was the case anyway
batches++;
BatchWriteItemOutcome res = writeOp.retry(
"batch write",
"",
true,
() -> dynamoDB.batchWriteItem(writeItems));
// Check for unprocessed keys in case of exceeding provisioned throughput
Map<String, List<WriteRequest>> unprocessed = res.getUnprocessedItems();
int retryCount = 0;
while (!unprocessed.isEmpty()) {
batchWriteCapacityExceededEvents.incrementAndGet();
batches++;
retryBackoffOnBatchWrite(retryCount++);
// use a different reference to keep the compiler quiet
final Map<String, List<WriteRequest>> upx = unprocessed;
res = writeOp.retry(
"batch write",
"",
true,
() -> dynamoDB.batchWriteItemUnprocessed(upx));
unprocessed = res.getUnprocessedItems();
}
}
if (itemsToPut != null) {
recordsWritten(itemsToPut.length);
logPut(ancestorState, itemsToPut);
}
if (keysToDelete != null) {
recordsDeleted(keysToDelete.length);
logDelete(ancestorState, keysToDelete);
}
return batches;
}
/**
* Put the current thread to sleep to implement exponential backoff
* depending on retryCount. If max retries are exceeded, throws an
* exception instead.
*
* @param retryCount number of retries so far
* @throws IOException when max retryCount is exceeded.
*/
private void retryBackoffOnBatchWrite(int retryCount) throws IOException {
try {
// Our RetryPolicy ignores everything but retryCount here.
RetryPolicy.RetryAction action = batchWriteRetryPolicy.shouldRetry(
null,
retryCount, 0, true);
if (action.action == RetryPolicy.RetryAction.RetryDecision.FAIL) {
// Create an AWSServiceThrottledException, with a fake inner cause
// which we fill in to look like a real exception so
// error messages look sensible
AmazonServiceException cause = new AmazonServiceException(
"Throttling");
cause.setServiceName("S3Guard");
cause.setStatusCode(AWSServiceThrottledException.STATUS_CODE);
cause.setErrorCode(THROTTLING); // used in real AWS errors
cause.setErrorType(AmazonServiceException.ErrorType.Service);
cause.setErrorMessage(THROTTLING);
cause.setRequestId("n/a");
throw new AWSServiceThrottledException(
String.format("Max retries during batch write exceeded"
+ " (%d) for DynamoDB."
+ HINT_DDB_IOPS_TOO_LOW,
retryCount),
cause);
} else {
LOG.debug("Sleeping {} msec before next retry", action.delayMillis);
Thread.sleep(action.delayMillis);
}
} catch (InterruptedException e) {
throw (IOException)new InterruptedIOException(e.toString()).initCause(e);
} catch (IOException e) {
throw e;
} catch (Exception e) {
throw new IOException("Unexpected exception " + e, e);
}
}
@Override
@Retries.RetryTranslated
public void put(final PathMetadata meta) throws IOException {
put(meta, null);
}
@Override
@Retries.RetryTranslated
public void put(
final PathMetadata meta,
@Nullable final BulkOperationState operationState) throws IOException {
// For a deeply nested path, this method will automatically create the full
// ancestry and save respective item in DynamoDB table.
// So after put operation, we maintain the invariant that if a path exists,
// all its ancestors will also exist in the table.
// For performance purposes, we generate the full paths to put and use batch
// write item request to save the items.
LOG.debug("Saving to table {} in region {}: {}", tableName, region, meta);
Collection<PathMetadata> wrapper = new ArrayList<>(1);
wrapper.add(meta);
put(wrapper, operationState);
}
@Override
@Retries.RetryTranslated
public void put(
final Collection<? extends PathMetadata> metas,
@Nullable final BulkOperationState operationState) throws IOException {
innerPut(pathMetaToDDBPathMeta(metas), operationState);
}
/**
* Internal put operation.
*
* The ancestors to all entries are added to the set of entries to write,
* provided they are not already stored in any supplied operation state.
* Both the supplied metadata entries and ancestor entries are sorted
* so that the topmost entries are written first.
* This is to ensure that a failure partway through the operation will not
* create entries in the table without parents.
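*
* A hedged sketch of the public bulk-put pattern which funnels into this
* method ({@code store}, {@code dir} and {@code entries} are assumed
* caller-side variables, not part of this class):
* <pre>{@code
* try (BulkOperationState state =
*          store.initiateBulkWrite(BulkOperationState.OperationType.Put, dir)) {
*   for (PathMetadata entry : entries) {
*     store.put(entry, state);
*   }
* }
* }</pre>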
* @param metas metadata entries to write.
* @param operationState (nullable) operational state for a bulk update
* @throws IOException failure.
*/
@SuppressWarnings("SynchronizationOnLocalVariableOrMethodParameter")
@Retries.RetryTranslated
private void innerPut(
final Collection<DDBPathMetadata> metas,
@Nullable final BulkOperationState operationState) throws IOException {
if (metas.isEmpty()) {
// Happens when someone calls put() with an empty list.
LOG.debug("Ignoring empty list of entries to put");
return;
}
// always create or retrieve an ancestor state instance, so it can
// always be used for synchronization.
final AncestorState ancestorState = extractOrCreate(operationState,
BulkOperationState.OperationType.Put);
Item[] items;
synchronized (ancestorState) {
items = pathMetadataToItem(
completeAncestry(metas, ancestorState));
}
LOG.debug("Saving batch of {} items to table {}, region {}", items.length,
tableName, region);
processBatchWriteRequest(ancestorState, null, items);
}
/**
* Get the full paths of ancestors that are nonexistent in the table.
*
* This queries DDB when looking for parents which are not in
* any supplied ongoing operation state.
* Updates the operation state with found entries to reduce further checks.
*
* @param meta metadata to put
* @param operationState ongoing bulk state
* @return a possibly empty list of entries to put.
* @throws IOException failure
*/
@SuppressWarnings("SynchronizationOnLocalVariableOrMethodParameter")
@VisibleForTesting
@Retries.RetryTranslated
List<DDBPathMetadata> fullPathsToPut(DDBPathMetadata meta,
@Nullable BulkOperationState operationState)
throws IOException {
checkPathMetadata(meta);
final List<DDBPathMetadata> metasToPut = new ArrayList<>();
// root path is not persisted
if (!meta.getFileStatus().getPath().isRoot()) {
metasToPut.add(meta);
}
// put all its ancestors if not present; as an optimization we stop at the
// first existing ancestor
final AncestorState ancestorState = extractOrCreate(operationState,
BulkOperationState.OperationType.Put);
Path path = meta.getFileStatus().getPath().getParent();
while (path != null && !path.isRoot()) {
synchronized (ancestorState) {
if (ancestorState.findEntry(path, true)) {
break;
}
}
final Item item = getConsistentItem(path);
if (!itemExists(item)) {
final S3AFileStatus status = makeDirStatus(path, username);
metasToPut.add(new DDBPathMetadata(status, Tristate.FALSE, false,
meta.isAuthoritativeDir(), meta.getLastUpdated()));
path = path.getParent();
} else {
// found the entry in the table, so add it to the ancestor state
synchronized (ancestorState) {
ancestorState.put(path, itemToPathMetadata(item, username));
}
// then break out of the loop.
break;
}
}
return metasToPut;
}
/**
* Does an item represent an object which exists?
* @param item item retrieved in a query.
* @return true iff the item isn't null and, if there is an is_deleted
* column, that its value is false.
*/
private static boolean itemExists(Item item) {
if (item == null) {
return false;
}
if (item.hasAttribute(IS_DELETED) &&
item.getBoolean(IS_DELETED)) {
return false;
}
return true;
}
/**
* Get the value of an optional boolean attribute, falling back to the
* default value if the attribute is absent.
* @param item Item
* @param attrName Attribute name
* @param defVal Default value
* @return The value or the default
*/
private static boolean getBoolAttribute(Item item,
String attrName,
boolean defVal) {
return item.hasAttribute(attrName) ? item.getBoolean(attrName) : defVal;
}
/** Create a directory FileStatus using 0 for the lastUpdated time. */
static S3AFileStatus makeDirStatus(Path f, String owner) {
return new S3AFileStatus(Tristate.UNKNOWN, f, owner);
}
/**
* {@inheritDoc}.
* There is retry around building the list of paths to update, but
* the call to
* {@link #processBatchWriteRequest(DynamoDBMetadataStore.AncestorState, PrimaryKey[], Item[])}
* is only tried once.
* @param meta Directory listing metadata.
* @param unchangedEntries unchanged child entry paths
* @param operationState operational state for a bulk update
* @throws IOException IO problem
*/
@Override
@Retries.RetryTranslated
public void put(
final DirListingMetadata meta,
final List<Path> unchangedEntries,
@Nullable final BulkOperationState operationState) throws IOException {
LOG.debug("Saving {} dir meta for {} to table {} in region {}: {}",
meta.isAuthoritative() ? "auth" : "nonauth",
meta.getPath(),
tableName, region, meta);
// directory path
Path path = meta.getPath();
DDBPathMetadata ddbPathMeta =
new DDBPathMetadata(makeDirStatus(path, username), meta.isEmpty(),
false, meta.isAuthoritative(), meta.getLastUpdated());
// put all its ancestors if not present
final AncestorState ancestorState = extractOrCreate(operationState,
BulkOperationState.OperationType.Put);
// First add any missing ancestors...
final List<DDBPathMetadata> metasToPut = fullPathsToPut(ddbPathMeta,
ancestorState);
// next add all changed children of the directory
// ones that came from the previous listing are left as-is
final Collection<PathMetadata> children = meta.getListing()
.stream()
.filter(e -> !unchangedEntries.contains(e.getFileStatus().getPath()))
.collect(Collectors.toList());
metasToPut.addAll(pathMetaToDDBPathMeta(children));
// sort so highest-level entries are written to the store first.
// if a sequence fails, no orphan entries will have been written.
metasToPut.sort(PathOrderComparators.TOPMOST_PM_FIRST);
processBatchWriteRequest(ancestorState,
null,
pathMetadataToItem(metasToPut));
// and add the ancestors
synchronized (ancestorState) {
metasToPut.forEach(ancestorState::put);
}
}
@Override
public synchronized void close() {
instrumentation.storeClosed();
try {
if (dynamoDB != null) {
LOG.debug("Shutting down {}", this);
dynamoDB.shutdown();
dynamoDB = null;
}
} finally {
closeAutocloseables(LOG, credentials);
credentials = null;
}
}
@Override
@Retries.RetryTranslated
public void destroy() throws IOException {
tableHandler.destroy();
}
@Retries.RetryTranslated
private ItemCollection<ScanOutcome> expiredFiles(PruneMode pruneMode,
long cutoff, String keyPrefix) throws IOException {
String filterExpression;
String projectionExpression;
ValueMap map;
switch (pruneMode) {
case ALL_BY_MODTIME:
// filter all files under the given parent older than the modtime.
// this implicitly skips directories, because they lack a modtime field.
// however we explicitly exclude directories to make clear that
// directories are to be excluded and avoid any confusion
// see: HADOOP-16725.
// note: files lack the is_dir field entirely, so we use a `not` to
// filter out the directories.
filterExpression =
"mod_time < :mod_time and begins_with(parent, :parent)"
+ " and not is_dir = :is_dir";
projectionExpression = "parent,child";
map = new ValueMap()
.withLong(":mod_time", cutoff)
.withString(":parent", keyPrefix)
.withBoolean(":is_dir", true);
break;
case TOMBSTONES_BY_LASTUPDATED:
filterExpression =
"last_updated < :last_updated and begins_with(parent, :parent) "
+ "and is_deleted = :is_deleted";
projectionExpression = "parent,child,is_deleted";
map = new ValueMap()
.withLong(":last_updated", cutoff)
.withString(":parent", keyPrefix)
.withBoolean(":is_deleted", true);
break;
default:
throw new UnsupportedOperationException("Unsupported prune mode: "
+ pruneMode);
}
return readOp.retry(
"scan",
keyPrefix,
true,
() -> table.scan(filterExpression, projectionExpression, null, map));
}
@Override
@Retries.RetryTranslated
public void prune(PruneMode pruneMode, long cutoff) throws IOException {
prune(pruneMode, cutoff, "/");
}
/**
* Prune files, in batches. There's optionally a sleep between each batch.
*
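* A hedged example of throttling the deletion batches (the 25ms value is
* illustrative only):
* <pre>{@code
* conf.setTimeDuration(S3GUARD_DDB_BACKGROUND_SLEEP_MSEC_KEY,
*     25, TimeUnit.MILLISECONDS);
* }</pre>
*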
* @param pruneMode The mode of operation for the prune. For details see
* {@link MetadataStore#prune(PruneMode, long)}
* @param cutoff Oldest modification time to allow
* @param keyPrefix The prefix for the keys that should be removed
* @throws IOException Any IO/DDB failure.
* @throws InterruptedIOException if the prune was interrupted
* @return count of pruned items.
*/
@Override
@Retries.RetryTranslated
public long prune(PruneMode pruneMode, long cutoff, String keyPrefix)
throws IOException {
LOG.debug("Prune {} under {} with age {}",
pruneMode == PruneMode.ALL_BY_MODTIME
? "files and tombstones" : "tombstones",
keyPrefix, cutoff);
final ItemCollection<ScanOutcome> items =
expiredFiles(pruneMode, cutoff, keyPrefix);
return innerPrune(pruneMode, cutoff, keyPrefix, items);
}
/**
* Prune files, in batches. There's optionally a sleep between each batch.
*
* @param pruneMode The mode of operation for the prune. For details see
* {@link MetadataStore#prune(PruneMode, long)}
* @param cutoff Oldest modification time to allow
* @param keyPrefix The prefix for the keys that should be removed
* @param items expired items
* @return count of pruned items.
* @throws IOException Any IO/DDB failure.
* @throws InterruptedIOException if the prune was interrupted
*/
private int innerPrune(
final PruneMode pruneMode, final long cutoff, final String keyPrefix,
final ItemCollection<ScanOutcome> items)
throws IOException {
int itemCount = 0;
try (AncestorState state = initiateBulkWrite(
BulkOperationState.OperationType.Prune, null);
DurationInfo ignored =
new DurationInfo(LOG, "Pruning DynamoDB Store")) {
ArrayList<Path> deletionBatch =
new ArrayList<>(S3GUARD_DDB_BATCH_WRITE_REQUEST_LIMIT);
long delay = conf.getTimeDuration(
S3GUARD_DDB_BACKGROUND_SLEEP_MSEC_KEY,
S3GUARD_DDB_BACKGROUND_SLEEP_MSEC_DEFAULT,
TimeUnit.MILLISECONDS);
Set<Path> parentPathSet = new HashSet<>();
Set<Path> clearedParentPathSet = new HashSet<>();
// declare the operation to delete a batch as a function so
// as to keep the code consistent across multiple uses.
CallableRaisingIOE<Void> deleteBatchOperation =
() -> {
// lowest path entries get deleted first.
deletionBatch.sort(PathOrderComparators.TOPMOST_PATH_LAST);
processBatchWriteRequest(state, pathToKey(deletionBatch), null);
// set authoritative false for each pruned dir listing
// if at least one entry was not a tombstone
removeAuthoritativeDirFlag(parentPathSet, state);
// already cleared parent paths.
clearedParentPathSet.addAll(parentPathSet);
parentPathSet.clear();
return null;
};
for (Item item : items) {
DDBPathMetadata md = PathMetadataDynamoDBTranslation
.itemToPathMetadata(item, username);
Path path = md.getFileStatus().getPath();
boolean tombstone = md.isDeleted();
LOG.debug("Prune entry {}", path);
deletionBatch.add(path);
// add parent path of item so it can be marked as non-auth.
// this is only done if
// * it has not already been processed
// * the entry pruned is not a tombstone (no need to update)
// * the file is not in the root dir
Path parentPath = path.getParent();
if (!tombstone
&& parentPath != null
&& !parentPath.isRoot()
&& !clearedParentPathSet.contains(parentPath)) {
parentPathSet.add(parentPath);
}
itemCount++;
if (deletionBatch.size() == S3GUARD_DDB_BATCH_WRITE_REQUEST_LIMIT) {
deleteBatchOperation.apply();
deletionBatch.clear();
if (delay > 0) {
Thread.sleep(delay);
}
}
}
// final batch of deletes
if (!deletionBatch.isEmpty()) {
deleteBatchOperation.apply();
}
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
throw new InterruptedIOException("Pruning was interrupted");
} catch (AmazonDynamoDBException e) {
throw translateDynamoDBException(keyPrefix,
"Prune of " + keyPrefix + " failed", e);
}
LOG.info("Finished pruning {} items in batches of {}", itemCount,
S3GUARD_DDB_BATCH_WRITE_REQUEST_LIMIT);
return itemCount;
}
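/*
 * Minimal configuration sketch for the batching behaviour used by
 * innerPrune(): deletes are issued in batches of
 * S3GUARD_DDB_BATCH_WRITE_REQUEST_LIMIT, with an optional sleep between
 * batches to avoid exhausting provisioned IO capacity. The 25 ms value
 * below is purely illustrative.
 *
 *   Configuration conf = new Configuration();
 *   conf.setTimeDuration(S3GUARD_DDB_BACKGROUND_SLEEP_MSEC_KEY,
 *       25, TimeUnit.MILLISECONDS);
 */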
/**
* Remove the Authoritative Directory Marker from a set of paths, if
* those paths are in the store.
*
* This operation is only for pruning; it does not raise an error
* if, during the prune phase, the table appears inconsistent.
* This is not unusual as it can happen in a number of ways:
*
* - The state of the table changes during a slow prune operation which
* deliberately inserts pauses to avoid overloading prepaid IO capacity.
*
* - Tombstone markers have been left in the table after many other
* operations have taken place, including deleting/replacing
* parents.
*
* If an exception is raised in the get/update process, then the exception
* is caught and only rethrown after all the other paths are processed.
* This is to ensure a best-effort attempt to update the store.
* @param pathSet set of paths.
* @param state ongoing operation state.
* @throws IOException only after a best effort is made to update the store.
*/
private void removeAuthoritativeDirFlag(
final Set<Path> pathSet,
final AncestorState state) throws IOException {
AtomicReference<IOException> rIOException = new AtomicReference<>();
Set<DDBPathMetadata> metas = pathSet.stream().map(path -> {
try {
if (path.isRoot()) {
LOG.debug("ignoring root path");
return null;
}
if (state != null && state.get(path) != null) {
// there's already an entry for this path
LOG.debug("Ignoring update of entry already in the state map");
return null;
}
DDBPathMetadata ddbPathMetadata = get(path);
if (ddbPathMetadata == null) {
// there is no entry.
LOG.debug("No parent {}; skipping", path);
return null;
}
if (ddbPathMetadata.isDeleted()) {
// the parent itself is deleted
LOG.debug("Parent has been deleted {}; skipping", path);
return null;
}
if (!ddbPathMetadata.getFileStatus().isDirectory()) {
// the parent is not a directory; skip it
LOG.debug("Parent is not a directory {}; skipping", path);
return null;
}
LOG.debug("Setting isAuthoritativeDir==false on {}", ddbPathMetadata);
ddbPathMetadata.setAuthoritativeDir(false);
ddbPathMetadata.setLastUpdated(ttlTimeProvider.getNow());
return ddbPathMetadata;
} catch (IOException e) {
String msg = String.format("IOException while getting PathMetadata "
+ "on path: %s.", path);
LOG.error(msg, e);
rIOException.set(e);
return null;
}
}).filter(Objects::nonNull).collect(Collectors.toSet());
try {
LOG.debug("innerPut on metas: {}", metas);
if (!metas.isEmpty()) {
innerPut(metas, state);
}
} catch (IOException e) {
String msg = String.format("IOException while setting false "
+ "authoritative directory flag on: %s.", metas);
LOG.error(msg, e);
rIOException.set(e);
}
if (rIOException.get() != null) {
throw rIOException.get();
}
}
@VisibleForTesting
public AmazonDynamoDB getAmazonDynamoDB() {
return amazonDynamoDB;
}
@Override
public String toString() {
return getClass().getSimpleName() + '{'
+ "region=" + region
+ ", tableName=" + tableName
+ ", tableArn=" + tableHandler.getTableArn()
+ '}';
}
/**
* The administrative policy includes all DDB table operations;
* application access is restricted to the operations which S3Guard
* requires when working with data in a guarded bucket.
* @param access access level desired.
* @return a possibly empty list of statements.
*/
@Override
public List<RoleModel.Statement> listAWSPolicyRules(
final Set<AccessLevel> access) {
Preconditions.checkState(tableHandler.getTableArn() != null,
"TableARN not known");
if (access.isEmpty()) {
return Collections.emptyList();
}
RoleModel.Statement stat;
if (access.contains(AccessLevel.ADMIN)) {
stat = allowAllDynamoDBOperations(tableHandler.getTableArn());
} else {
stat = allowS3GuardClientOperations(tableHandler.getTableArn());
}
return Lists.newArrayList(stat);
}
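/*
 * Usage sketch for listAWSPolicyRules(); the EnumSet usage and the
 * variable names are illustrative assumptions. Any non-empty access set
 * without ADMIN yields the restricted client statements instead.
 *
 *   List<RoleModel.Statement> admin =
 *       store.listAWSPolicyRules(EnumSet.of(AccessLevel.ADMIN));
 */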
/**
* PUT a single item to the table.
* @param item item to put
* @return the outcome.
*/
@Retries.OnceRaw
private PutItemOutcome putItem(Item item) {
LOG.debug("Putting item {}", item);
return table.putItem(item);
}
@VisibleForTesting
Table getTable() {
return table;
}
String getRegion() {
return region;
}
@VisibleForTesting
public String getTableName() {
return tableName;
}
@VisibleForTesting
DynamoDB getDynamoDB() {
return dynamoDB;
}
/**
* Validates a path object; it must be absolute, have an s3a:// scheme
* and contain a host (bucket) component.
* @param path path to check
* @return the path passed in
*/
private Path checkPath(Path path) {
Preconditions.checkNotNull(path);
Preconditions.checkArgument(path.isAbsolute(), "Path %s is not absolute",
path);
URI uri = path.toUri();
Preconditions.checkNotNull(uri.getScheme(), "Path %s missing scheme", path);
Preconditions.checkArgument(uri.getScheme().equals(Constants.FS_S3A),
"Path %s scheme must be %s", path, Constants.FS_S3A);
Preconditions.checkArgument(!StringUtils.isEmpty(uri.getHost()), "Path %s" +
" is missing bucket.", path);
return path;
}
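/*
 * Examples of what checkPath() accepts and rejects (illustrative only;
 * the bucket and path names are made up):
 *
 *   checkPath(new Path("s3a://example-bucket/dir/file"));  // valid
 *   checkPath(new Path("/local/dir/file"));     // rejected: no scheme
 *   checkPath(new Path("hdfs://nn:8020/dir"));  // rejected: wrong scheme
 *   checkPath(new Path("s3a:///dir"));          // rejected: no bucket
 */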
/**
* Validates a path meta-data object.
*/
private static void checkPathMetadata(PathMetadata meta) {
Preconditions.checkNotNull(meta);
Preconditions.checkNotNull(meta.getFileStatus());
Preconditions.checkNotNull(meta.getFileStatus().getPath());
}
@Override
@Retries.OnceRaw
public Map<String, String> getDiagnostics() throws IOException {
Map<String, String> map = new TreeMap<>();
if (table != null) {
TableDescription desc = getTableDescription(true);
map.put("name", desc.getTableName());
map.put(STATUS, desc.getTableStatus());
map.put("ARN", desc.getTableArn());
map.put("size", desc.getTableSizeBytes().toString());
map.put(TABLE, desc.toString());
ProvisionedThroughputDescription throughput
= desc.getProvisionedThroughput();
map.put(READ_CAPACITY, throughput.getReadCapacityUnits().toString());
map.put(WRITE_CAPACITY, throughput.getWriteCapacityUnits().toString());
map.put(BILLING_MODE,
throughput.getWriteCapacityUnits() == 0
? BILLING_MODE_PER_REQUEST
: BILLING_MODE_PROVISIONED);
map.put("sse", desc.getSSEDescription() == null
? "DISABLED"
: desc.getSSEDescription().toString());
map.put(MetadataStoreCapabilities.PERSISTS_AUTHORITATIVE_BIT,
Boolean.toString(true));
} else {
map.put("name", "DynamoDB Metadata Store");
map.put(TABLE, "none");
map.put(STATUS, "undefined");
}
map.put("description", DESCRIPTION);
map.put("region", region);
if (batchWriteRetryPolicy != null) {
map.put("retryPolicy", batchWriteRetryPolicy.toString());
}
return map;
}
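/*
 * Sketch of dumping the diagnostics (illustrative; the store variable is
 * an assumption):
 *
 *   for (Map.Entry<String, String> e : store.getDiagnostics().entrySet()) {
 *     System.out.println(e.getKey() + " = " + e.getValue());
 *   }
 *
 * With an initialized table this includes the table name, ARN, status,
 * size, read/write capacity, billing mode, SSE state and the
 * persists-authoritative-bit capability, plus description, region and
 * the batch write retry policy.
 */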
@Retries.OnceRaw
private TableDescription getTableDescription(boolean forceUpdate) {
TableDescription desc = table.getDescription();
if (desc == null || forceUpdate) {
desc = table.describe();
}
return desc;
}
@Override
@Retries.OnceRaw
public void updateParameters(Map<String, String> parameters)
throws IOException {
Preconditions.checkNotNull(table, "Not initialized");
TableDescription desc = getTableDescription(true);
ProvisionedThroughputDescription current
= desc.getProvisionedThroughput();
long currentRead = current.getReadCapacityUnits();
long newRead = getLongParam(parameters,
S3GUARD_DDB_TABLE_CAPACITY_READ_KEY,
currentRead);
long currentWrite = current.getWriteCapacityUnits();
long newWrite = getLongParam(parameters,
S3GUARD_DDB_TABLE_CAPACITY_WRITE_KEY,
currentWrite);
if (currentRead == 0 || currentWrite == 0) {
// table is pay on demand
throw new IOException(E_ON_DEMAND_NO_SET_CAPACITY);
}
if (newRead != currentRead || newWrite != currentWrite) {
LOG.info("Current table capacity is read: {}, write: {}",
currentRead, currentWrite);
LOG.info("Changing capacity of table to read: {}, write: {}",
newRead, newWrite);
tableHandler.provisionTableBlocking(newRead, newWrite);
} else {
LOG.info("Table capacity unchanged at read: {}, write: {}",
newRead, newWrite);
}
}
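/*
 * Usage sketch for updateParameters(); the capacity numbers are
 * illustrative. For a pay-per-request (on-demand) table this throws an
 * IOException (E_ON_DEMAND_NO_SET_CAPACITY) rather than changing capacity.
 *
 *   Map<String, String> params = new HashMap<>();
 *   params.put(S3GUARD_DDB_TABLE_CAPACITY_READ_KEY, "20");
 *   params.put(S3GUARD_DDB_TABLE_CAPACITY_WRITE_KEY, "10");
 *   store.updateParameters(params);
 */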
private long getLongParam(Map<String, String> parameters,
String key,
long defVal) {
String k = parameters.get(key);
if (k != null) {
return Long.parseLong(k);
} else {
return defVal;
}
}
/**
* Callback on a read operation retried.
* @param text text of the operation
* @param ex exception
* @param attempts number of attempts
* @param idempotent is the method idempotent (this is assumed to be true)
*/
void readRetryEvent(
String text,
IOException ex,
int attempts,
boolean idempotent) {
readThrottleEvents.incrementAndGet();
retryEvent(text, ex, attempts, true);
}
/**
* Callback on a write operation retried.
* @param text text of the operation
* @param ex exception
* @param attempts number of attempts
* @param idempotent is the method idempotent (this is assumed to be true)
*/
void writeRetryEvent(
String text,
IOException ex,
int attempts,
boolean idempotent) {
writeThrottleEvents.incrementAndGet();
retryEvent(text, ex, attempts, idempotent);
}
/**
* Callback on a scan operation retried.
* @param text text of the operation
* @param ex exception
* @param attempts number of attempts
* @param idempotent is the method idempotent (this is assumed to be true)
*/
void scanRetryEvent(
String text,
IOException ex,
int attempts,
boolean idempotent) {
scanThrottleEvents.incrementAndGet();
retryEvent(text, ex, attempts, idempotent);
}
/**
* Callback from {@link Invoker} when an operation is retried.
* @param text text of the operation
* @param ex exception
* @param attempts number of attempts
* @param idempotent is the method idempotent
*/
void retryEvent(
String text,
IOException ex,
int attempts,
boolean idempotent) {
if (S3AUtils.isThrottleException(ex)) {
// throttled
instrumentation.throttled();
int eventCount = throttleEventCount.addAndGet(1);
if (attempts == 1 && eventCount < THROTTLE_EVENT_LOG_LIMIT) {
LOG.warn("DynamoDB IO limits reached in {};"
+ " consider increasing capacity: {}", text, ex.toString());
LOG.debug("Throttled", ex);
} else {
// user has been warned already, log at debug only.
LOG.debug("DynamoDB IO limits reached in {};"
+ " consider increasing capacity: {}", text, ex.toString());
}
} else if (attempts == 1) {
// not throttled. Log on the first attempt only
LOG.info("Retrying {}: {}", text, ex.toString());
LOG.debug("Retrying {}", text, ex);
}
// note a retry
instrumentation.retrying();
if (owner != null) {
owner.metastoreOperationRetried(ex, attempts, idempotent);
}
}
/**
* Get the count of read throttle events.
* @return the current count of read throttle events.
*/
@VisibleForTesting
public long getReadThrottleEventCount() {
return readThrottleEvents.get();
}
/**
* Get the count of write throttle events.
* @return the current count of write throttle events.
*/
@VisibleForTesting
public long getWriteThrottleEventCount() {
return writeThrottleEvents.get();
}
/**
* Get the count of scan throttle events.
* @return the current count of scan throttle events.
*/
@VisibleForTesting
public long getScanThrottleEventCount() {
return scanThrottleEvents.get();
}
@VisibleForTesting
public long getBatchWriteCapacityExceededCount() {
return batchWriteCapacityExceededEvents.get();
}
/**
* Get the operation invoker for write operations.
* @return an invoker for retrying mutating operations on a store.
*/
public Invoker getInvoker() {
return writeOp;
}
/**
* Wrap an iterable returned from any scan with a retrying one.
* This includes throttle handling.
* Retries will update the relevant counters/metrics for scan operations.
* @param source source iterator
* @return a retrying iterator.
*/
public <T> Iterable<T> wrapWithRetries(
final Iterable<T> source) {
return new RetryingCollection<>("scan dynamoDB table", scanOp, source);
}
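/*
 * Usage sketch for wrapWithRetries(): wrap a scan result so that each
 * iteration is retried through the scan invoker. The scanResults variable
 * and the process() call are placeholders, not part of this class.
 *
 *   Iterable<DDBPathMetadata> scanResults = ...;
 *   for (DDBPathMetadata md : store.wrapWithRetries(scanResults)) {
 *     process(md);
 *   }
 */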
/**
* Record the number of records written.
* @param count count of records.
*/
private void recordsWritten(final int count) {
instrumentation.recordsWritten(count);
}
/**
* Record the number of records read.
* @param count count of records.
*/
private void recordsRead(final int count) {
instrumentation.recordsRead(count);
}
/**
* Record the number of records deleted.
* @param count count of records.
*/
private void recordsDeleted(final int count) {
instrumentation.recordsDeleted(count);
}
/**
* Initiate the rename operation by creating the tracker for the filesystem
* to keep up to date with state changes in the S3A bucket.
* @param storeContext store context.
* @param source source path
* @param sourceStatus status of the source file/dir
* @param dest destination path.
* @return the rename tracker
*/
@Override
public RenameTracker initiateRenameOperation(
final StoreContext storeContext,
final Path source,
final S3AFileStatus sourceStatus,
final Path dest) {
return new ProgressiveRenameTracker(storeContext, this, source, dest,
new AncestorState(this, BulkOperationState.OperationType.Rename, dest));
}
/**
* Mark the directories instantiated under the destination path
* as authoritative. That is: all entries in the
* operationState (which must be an AncestorState instance),
* that are under the destination path.
*
* The database update is synchronized on the operationState, so all other
* threads trying to update that state will be blocked until completion.
*
* This operation is only used in import and at the end of a rename,
* so this is not considered an issue.
* @param dest destination path.
* @param operationState active state.
* @throws IOException failure.
* @return the number of directories marked.
*/
@Override
public int markAsAuthoritative(
final Path dest,
final BulkOperationState operationState) throws IOException {
if (operationState == null) {
return 0;
}
Preconditions.checkArgument(operationState instanceof AncestorState,
"Not an AncestorState %s", operationState);
final AncestorState state = (AncestorState)operationState;
// only mark paths under the dest as auth
final String simpleDestKey = pathToParentKey(dest);
final String destPathKey = simpleDestKey + "/";
final String opId = AncestorState.stateAsString(state);
LOG.debug("{}: marking directories under {} as authoritative",
opId, destPathKey);
// the list of dirs to build up.
final List<DDBPathMetadata> dirsToUpdate = new ArrayList<>();
synchronized (state) {
for (Map.Entry<Path, DDBPathMetadata> entry :
state.getAncestry().entrySet()) {
final Path path = entry.getKey();
final DDBPathMetadata md = entry.getValue();
final String key = pathToParentKey(path);
if (md.getFileStatus().isDirectory()
&& (key.equals(simpleDestKey) || key.startsWith(destPathKey))) {
// the updated entry is under the destination.
md.setAuthoritativeDir(true);
md.setLastUpdated(ttlTimeProvider.getNow());
LOG.debug("{}: added {}", opId, key);
dirsToUpdate.add(md);
}
}
processBatchWriteRequest(state,
null, pathMetadataToItem(dirsToUpdate));
}
return dirsToUpdate.size();
}
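/*
 * Key-matching sketch for markAsAuthoritative(), assuming parent keys of
 * the form "/bucket/path" (the exact key format here is an assumption):
 *
 *   dest          = s3a://example-bucket/warehouse/out
 *   simpleDestKey = "/example-bucket/warehouse/out"
 *   destPathKey   = "/example-bucket/warehouse/out/"
 *
 *   // a directory entry keyed "/example-bucket/warehouse/out/part-0"
 *   // starts with destPathKey, so it is marked authoritative;
 *   // "/example-bucket/warehouse/other" does not match and is skipped.
 */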
@Override
public AncestorState initiateBulkWrite(
final BulkOperationState.OperationType operation,
final Path dest) {
return new AncestorState(this, operation, dest);
}
@Override
public void setTtlTimeProvider(ITtlTimeProvider ttlTimeProvider) {
this.ttlTimeProvider = ttlTimeProvider;
}
/**
* Username.
* @return the current username
*/
String getUsername() {
return username;
}
/**
* Log a PUT into the operations log at debug level.
* @param state optional ancestor state.
* @param items items which have been PUT
*/
private static void logPut(
@Nullable AncestorState state,
Item[] items) {
if (OPERATIONS_LOG.isDebugEnabled()) {
// log the operations
String stateStr = AncestorState.stateAsString(state);
for (Item item : items) {
boolean tombstone = !itemExists(item);
boolean isDir = getBoolAttribute(item, IS_DIR, false);
boolean auth = getBoolAttribute(item, IS_AUTHORITATIVE, false);
OPERATIONS_LOG.debug("{} {} {}{}{}",
stateStr,
tombstone ? "TOMBSTONE" : "PUT",
itemPrimaryKeyToString(item),
auth ? " [auth]" : "",
isDir ? " directory" : "");
}
}
}
/**
* Log a PUT into the operations log at debug level.
* @param state optional ancestor state.
* @param item item PUT.
*/
private static void logPut(
@Nullable AncestorState state,
Item item) {
if (OPERATIONS_LOG.isDebugEnabled()) {
// log the operations
logPut(state, new Item[]{item});
}
}
/**
* Log a DELETE into the operations log at debug level.
* @param state optional ancestor state.
* @param keysDeleted keys which were deleted.
*/
private static void logDelete(
@Nullable AncestorState state,
PrimaryKey[] keysDeleted) {
if (OPERATIONS_LOG.isDebugEnabled()) {
// log the operations
String stateStr = AncestorState.stateAsString(state);
for (PrimaryKey key : keysDeleted) {
OPERATIONS_LOG.debug("{} DELETE {}",
stateStr, primaryKeyToString(key));
}
}
}
/**
* Log a DELETE into the operations log at debug level.
* @param state optional ancestor state.
* @param key Deleted key
*/
private static void logDelete(
@Nullable AncestorState state,
PrimaryKey key) {
if (OPERATIONS_LOG.isDebugEnabled()) {
logDelete(state, new PrimaryKey[]{key});
}
}
/**
* Get the move state passed in; create a new one if needed.
* @param state state.
* @param operation the type of the operation to use if the state is created.
* @return the cast or created state.
*/
private AncestorState extractOrCreate(@Nullable BulkOperationState state,
BulkOperationState.OperationType operation) {
if (state != null) {
return (AncestorState) state;
} else {
return new AncestorState(this, operation, null);
}
}
@Override
public MetastoreInstrumentation getInstrumentation() {
return instrumentation;
}
/**
* This tracks all the ancestors created,
* across multiple move/write operations.
* This is to avoid duplicate creation of ancestors during bulk commits
* and rename operations managed by a rename tracker.
*
* There is no thread safety: callers must synchronize as appropriate.
*/
@VisibleForTesting
static final class AncestorState extends BulkOperationState {
/**
* Counter of IDs issued.
*/
private static final AtomicLong ID_COUNTER = new AtomicLong(0);
/** Owning store. */
private final DynamoDBMetadataStore store;
/** The ID of the state; for logging. */
private final long id;
/**
* Map of ancestors.
*/
private final Map<Path, DDBPathMetadata> ancestry = new HashMap<>();
/**
* Destination path.
*/
private final Path dest;
/**
* Create the state.
* @param store the store, for use in validation.
* If null: no validation (test only operation)
* @param operation the type of the operation.
* @param dest destination path.
*/
AncestorState(
@Nullable final DynamoDBMetadataStore store,
final OperationType operation,
@Nullable final Path dest) {
super(operation);
this.store = store;
this.dest = dest;
this.id = ID_COUNTER.addAndGet(1);
}
int size() {
return ancestry.size();
}
/**
* Get the ancestry. Not thread safe.
* @return the map of ancestors.
*/
Map<Path, DDBPathMetadata> getAncestry() {
return ancestry;
}
public Path getDest() {
return dest;
}
long getId() {
return id;
}
@Override
public String toString() {
final StringBuilder sb = new StringBuilder(
"AncestorState{");
sb.append("operation=").append(getOperation());
sb.append("id=").append(id);
sb.append("; dest=").append(dest);
sb.append("; size=").append(size());
sb.append("; paths={")
.append(StringUtils.join(ancestry.keySet(), " "))
.append('}');
sb.append('}');
return sb.toString();
}
/**
* Does the ancestor state contain a path?
* @param p path to check
* @return true if the state has an entry
*/
boolean contains(Path p) {
return get(p) != null;
}
DDBPathMetadata put(Path p, DDBPathMetadata md) {
return ancestry.put(p, md);
}
DDBPathMetadata put(DDBPathMetadata md) {
return ancestry.put(md.getFileStatus().getPath(), md);
}
DDBPathMetadata get(Path p) {
return ancestry.get(p);
}
/**
* Find an entry in the ancestor state, warning and optionally
* raising an exception if there is a file at the path.
* @param path path to look up
* @param failOnFile fail if a file was found.
* @return true if an entry was found in the ancestor state.
* @throws PathIOException if there was a file at the path.
*/
boolean findEntry(
final Path path,
final boolean failOnFile) throws PathIOException {
final DDBPathMetadata ancestor = get(path);
if (ancestor != null) {
// there's an entry in the ancestor state
if (!ancestor.getFileStatus().isDirectory()) {
// but: it's a file, which means this update is now inconsistent.
final String message = E_INCONSISTENT_UPDATE + " entry is " + ancestor
.getFileStatus();
LOG.error(message);
if (failOnFile) {
// errors trigger failure
throw new PathIOException(path.toString(), message);
}
}
return true;
} else {
return false;
}
}
/**
* If debug logging is enabled, this does an audit of the store state.
* It only logs the findings; the messages are written so that they
* could be turned into exception messages later.
* Audit failures are not turned into IOEs because rename operations
* delete the source entry, which then remains in the ancestor state
* as present.
* @throws IOException failure
*/
@Override
public void close() throws IOException {
if (LOG.isDebugEnabled() && store != null) {
LOG.debug("Auditing {}", stateAsString(this));
for (Map.Entry<Path, DDBPathMetadata> entry : ancestry
.entrySet()) {
Path path = entry.getKey();
DDBPathMetadata expected = entry.getValue();
if (expected.isDeleted()) {
// file was deleted in bulk op; we don't care about it
// any more
continue;
}
DDBPathMetadata actual;
try {
actual = store.get(path);
} catch (IOException e) {
LOG.debug("Retrieving {}", path, e);
// this is for debug; don't be ambitious
return;
}
if (actual == null || actual.isDeleted()) {
String message = "Metastore entry for path "
+ path + " deleted during bulk "
+ getOperation() + " operation";
LOG.debug(message);
} else {
if (actual.getFileStatus().isDirectory() !=
expected.getFileStatus().isDirectory()) {
// the type of the entry has changed
String message = "Metastore entry for path "
+ path + " changed during bulk "
+ getOperation() + " operation"
+ " from " + expected
+ " to " + actual;
LOG.debug(message);
}
}
}
}
}
/**
* Create a string from the state including operation and ID.
* @param state state to use -may be null
* @return a string for logging.
*/
private static String stateAsString(@Nullable AncestorState state) {
String stateStr;
if (state != null) {
stateStr = String.format("#(%s-%04d)",
state.getOperation(),
state.getId());
} else {
stateStr = "#()";
}
return stateStr;
}
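/*
 * Example of the generated prefix (illustrative): a Rename state with
 * id 17 renders as "#(Rename-0017)"; a null state renders as "#()".
 */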
}
protected DynamoDBMetadataStoreTableManager getTableHandler() {
Preconditions.checkNotNull(tableHandler, "Not initialized");
return tableHandler;
}
}