org.apache.hadoop.fs.s3a.s3guard.DynamoDBMetadataStore

This module contains code to support integration with Amazon Web Services. It also declares the dependencies needed to work with AWS services.
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.fs.s3a.s3guard;

import javax.annotation.Nullable;
import java.io.IOException;
import java.io.InterruptedIOException;
import java.net.URI;
import java.nio.file.AccessDeniedException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;
import java.util.stream.Collectors;

import com.amazonaws.AmazonServiceException;
import com.amazonaws.auth.AWSCredentialsProvider;
import com.amazonaws.services.dynamodbv2.AmazonDynamoDB;
import com.amazonaws.services.dynamodbv2.document.BatchWriteItemOutcome;
import com.amazonaws.services.dynamodbv2.document.DynamoDB;
import com.amazonaws.services.dynamodbv2.document.Item;
import com.amazonaws.services.dynamodbv2.document.ItemCollection;
import com.amazonaws.services.dynamodbv2.document.PrimaryKey;
import com.amazonaws.services.dynamodbv2.document.PutItemOutcome;
import com.amazonaws.services.dynamodbv2.document.QueryOutcome;
import com.amazonaws.services.dynamodbv2.document.ScanOutcome;
import com.amazonaws.services.dynamodbv2.document.Table;
import com.amazonaws.services.dynamodbv2.document.TableWriteItems;
import com.amazonaws.services.dynamodbv2.document.internal.IteratorSupport;
import com.amazonaws.services.dynamodbv2.document.spec.GetItemSpec;
import com.amazonaws.services.dynamodbv2.document.spec.QuerySpec;
import com.amazonaws.services.dynamodbv2.document.utils.ValueMap;
import com.amazonaws.services.dynamodbv2.model.AmazonDynamoDBException;
import com.amazonaws.services.dynamodbv2.model.ProvisionedThroughputDescription;
import com.amazonaws.services.dynamodbv2.model.TableDescription;
import com.amazonaws.services.dynamodbv2.model.WriteRequest;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.util.concurrent.ListeningExecutorService;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathIOException;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.fs.impl.FunctionsRaisingIOE;
import org.apache.hadoop.fs.impl.WrappedIOException;
import org.apache.hadoop.fs.s3a.AWSCredentialProviderList;
import org.apache.hadoop.fs.s3a.AWSServiceThrottledException;
import org.apache.hadoop.fs.s3a.Constants;
import org.apache.hadoop.fs.s3a.Invoker;
import org.apache.hadoop.fs.s3a.Retries;
import org.apache.hadoop.fs.s3a.S3AFileStatus;
import org.apache.hadoop.fs.s3a.S3AFileSystem;
import org.apache.hadoop.fs.s3a.S3AUtils;
import org.apache.hadoop.fs.s3a.Tristate;
import org.apache.hadoop.fs.s3a.auth.RoleModel;
import org.apache.hadoop.fs.s3a.auth.RolePolicies;
import org.apache.hadoop.fs.s3a.auth.delegation.AWSPolicyProvider;
import org.apache.hadoop.fs.s3a.impl.StoreContext;
import org.apache.hadoop.io.retry.RetryPolicies;
import org.apache.hadoop.io.retry.RetryPolicy;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.util.BlockingThreadPoolExecutorService;
import org.apache.hadoop.util.DurationInfo;
import org.apache.hadoop.util.ReflectionUtils;

import static org.apache.hadoop.fs.s3a.Constants.*;
import static org.apache.hadoop.fs.s3a.S3AUtils.*;
import static org.apache.hadoop.fs.s3a.auth.RolePolicies.allowAllDynamoDBOperations;
import static org.apache.hadoop.fs.s3a.auth.RolePolicies.allowS3GuardClientOperations;
import static org.apache.hadoop.fs.s3a.impl.CallableSupplier.submit;
import static org.apache.hadoop.fs.s3a.impl.CallableSupplier.waitForCompletion;
import static org.apache.hadoop.fs.s3a.s3guard.PathMetadataDynamoDBTranslation.*;
import static org.apache.hadoop.fs.s3a.s3guard.PathOrderComparators.TOPMOST_PM_LAST;
import static org.apache.hadoop.fs.s3a.s3guard.S3Guard.*;

/**
 * DynamoDBMetadataStore is a {@link MetadataStore} that persists
 * file system metadata to DynamoDB.
 *
 * The current implementation uses a schema consisting of a single table.  The
 * name of the table can be configured by config key
 * {@link org.apache.hadoop.fs.s3a.Constants#S3GUARD_DDB_TABLE_NAME_KEY}.
 * By default, it matches the name of the S3 bucket.  Each item in the table
 * represents a single directory or file.  Its path is split into separate table
 * attributes:
 * <ul>
 * <li> parent (absolute path of the parent, with bucket name inserted as
 *      first path component). </li>
 * <li> child (path of that specific child, relative to parent). </li>
 * <li> optional boolean attribute tracking whether the path is a directory.
 *      Absence or a false value indicates the path is a file. </li>
 * <li> optional long attribute revealing modification time of file.
 *      This attribute is meaningful only to file items.</li>
 * <li> optional long attribute revealing file length.
 *      This attribute is meaningful only to file items.</li>
 * <li> optional long attribute revealing block size of the file.
 *      This attribute is meaningful only to file items.</li>
 * <li> optional string attribute tracking the s3 eTag of the file.
 *      May be absent if the metadata was entered with a version of S3Guard
 *      before this was tracked.
 *      This attribute is meaningful only to file items.</li>
 * <li> optional string attribute tracking the s3 versionId of the file.
 *      May be absent if the metadata was entered with a version of S3Guard
 *      before this was tracked.
 *      This attribute is meaningful only to file items.</li>
 * </ul>
 *
 * The DynamoDB partition key is the parent, and the range key is the child.
 *
 * To allow multiple buckets to share the same DynamoDB table, the bucket
 * name is treated as the root directory.
 *
 * For example, assume the consistent store contains metadata representing this
 * file system structure:
 *
 * <pre>
 * s3a://bucket/dir1
 * |-- dir2
 * |   |-- file1
 * |   `-- file2
 * `-- dir3
 *     |-- dir4
 *     |   `-- file3
 *     |-- dir5
 *     |   `-- file4
 *     `-- dir6
 * </pre>
 *
 * This is persisted to a single DynamoDB table as:
 *
 * <pre>
 * ====================================================================================
 * | parent                 | child | is_dir | mod_time | len | etag | ver_id |  ...  |
 * ====================================================================================
 * | /bucket                | dir1  | true   |          |     |      |        |       |
 * | /bucket/dir1           | dir2  | true   |          |     |      |        |       |
 * | /bucket/dir1           | dir3  | true   |          |     |      |        |       |
 * | /bucket/dir1/dir2      | file1 |        |   100    | 111 | abc  |  mno   |       |
 * | /bucket/dir1/dir2      | file2 |        |   200    | 222 | def  |  pqr   |       |
 * | /bucket/dir1/dir3      | dir4  | true   |          |     |      |        |       |
 * | /bucket/dir1/dir3      | dir5  | true   |          |     |      |        |       |
 * | /bucket/dir1/dir3/dir4 | file3 |        |   300    | 333 | ghi  |  stu   |       |
 * | /bucket/dir1/dir3/dir5 | file4 |        |   400    | 444 | jkl  |  vwx   |       |
 * | /bucket/dir1/dir3      | dir6  | true   |          |     |      |        |       |
 * ====================================================================================
 * </pre>
 *
 * This choice of schema is efficient for read access patterns.
 * {@link #get(Path)} can be served from a single item lookup.
 * {@link #listChildren(Path)} can be served from a query against all rows
 * matching the parent (the partition key) and the returned list is guaranteed
 * to be sorted by child (the range key). Tracking whether or not a path is a
 * directory helps prevent unnecessary queries during traversal of an entire
 * sub-tree.
 *
 * Some mutating operations, notably
 * {@link MetadataStore#deleteSubtree(Path, BulkOperationState)} and
 * {@link MetadataStore#move(Collection, Collection, BulkOperationState)},
 * are less efficient with this schema.
 * They require mutating multiple items in the DynamoDB table.
 *
 * By default, DynamoDB access is performed within the same AWS region as
 * the S3 bucket that hosts the S3A instance. During initialization, the store
 * checks the location of the S3 bucket and creates a DynamoDB client
 * connected to the same region. The region may also be set explicitly by
 * setting the config parameter {@code fs.s3a.s3guard.ddb.region} to the
 * corresponding region.
 */
@SuppressWarnings("SynchronizationOnLocalVariableOrMethodParameter")
@InterfaceAudience.Private
@InterfaceStability.Evolving
public class DynamoDBMetadataStore implements MetadataStore,
    AWSPolicyProvider {

  public static final Logger LOG = LoggerFactory.getLogger(
      DynamoDBMetadataStore.class);

  /**
   * Name of the operations log.
   */
  public static final String OPERATIONS_LOG_NAME =
      "org.apache.hadoop.fs.s3a.s3guard.Operations";

  /**
   * A log of all state changing operations to the store;
   * only updated at debug level.
   */
  public static final Logger OPERATIONS_LOG = LoggerFactory.getLogger(
      OPERATIONS_LOG_NAME);

  /** parent/child name to use in the version marker. */
  public static final String VERSION_MARKER_ITEM_NAME = "../VERSION";

  /** Tag name used for the version marker. */
  public static final String VERSION_MARKER_TAG_NAME = "s3guard_version";

  /** Current version number. */
  public static final int VERSION = 100;

  @VisibleForTesting
  static final String BILLING_MODE = "billing-mode";

  @VisibleForTesting
  static final String BILLING_MODE_PER_REQUEST = "per-request";

  @VisibleForTesting
  static final String BILLING_MODE_PROVISIONED = "provisioned";

  @VisibleForTesting
  static final String DESCRIPTION
      = "S3Guard metadata store in DynamoDB";

  @VisibleForTesting
  static final String READ_CAPACITY = "read-capacity";

  @VisibleForTesting
  static final String WRITE_CAPACITY = "write-capacity";

  @VisibleForTesting
  static final String STATUS = "status";

  @VisibleForTesting
  static final String TABLE = "table";

  @VisibleForTesting
  static final String HINT_DDB_IOPS_TOO_LOW
      = " This may be because the write threshold of DynamoDB is set too low.";

  @VisibleForTesting
  static final String THROTTLING = "Throttling";

  public static final String E_ON_DEMAND_NO_SET_CAPACITY
      = "Neither ReadCapacityUnits nor WriteCapacityUnits can be specified when BillingMode is PAY_PER_REQUEST";

  @VisibleForTesting
  static final String E_INCONSISTENT_UPDATE
      = "Duplicate and inconsistent entry in update operation";

  private static final ValueMap DELETE_TRACKING_VALUE_MAP =
      new ValueMap().withBoolean(":false", false);

  /**
   * The maximum number of outstanding operations to submit
   * before blocking to await completion of all the executors.
   * Paging work like this is less efficient, but it ensures that
   * failure (auth, network, etc) are picked up before many more
   * operations are submitted.
   *
   * Arbitrary Choice.
   * Value: {@value}.
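   *
   * For example (illustrative): with this limit of 50, deleting 120
   * entries submits 51 delete tasks, blocks for their completion,
   * submits the next 51, blocks again, then waits for the final 18.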
*/ private static final int S3GUARD_DDB_SUBMITTED_TASK_LIMIT = 50; private AmazonDynamoDB amazonDynamoDB; private DynamoDB dynamoDB; private AWSCredentialProviderList credentials; private String region; private Table table; private String tableName; private Configuration conf; private String username; /** * This policy is mostly for batched writes, not for processing * exceptions in invoke() calls. * It also has a role purpose in * {@link DynamoDBMetadataStoreTableManager#getVersionMarkerItem()}; * look at that method for the details. */ private RetryPolicy batchWriteRetryPolicy; /** * The instrumentation is never null -if/when bound to an owner file system * That filesystem statistics will be updated as appropriate. */ private MetastoreInstrumentation instrumentation = new MetastoreInstrumentationImpl(); /** Owner FS: only valid if configured with an owner FS. */ private S3AFileSystem owner; /** Invoker for IO. Until configured properly, use try-once. */ private Invoker invoker = new Invoker(RetryPolicies.TRY_ONCE_THEN_FAIL, Invoker.NO_OP ); /** Invoker for read operations. */ private Invoker readOp; /** Invoker for write operations. */ private Invoker writeOp; /** Invoker for scan operations. */ private Invoker scanOp; private final AtomicLong readThrottleEvents = new AtomicLong(0); private final AtomicLong writeThrottleEvents = new AtomicLong(0); private final AtomicLong scanThrottleEvents = new AtomicLong(0); private final AtomicLong batchWriteCapacityExceededEvents = new AtomicLong(0); /** * Total limit on the number of throttle events after which * we stop warning in the log. Keeps the noise down. */ private static final int THROTTLE_EVENT_LOG_LIMIT = 100; /** * Count of the total number of throttle events; used to crank back logging. */ private AtomicInteger throttleEventCount = new AtomicInteger(0); /** * Executor for submitting operations. */ private ListeningExecutorService executor; /** * Time source. This is used during writes when parent * entries need to be created. */ private ITtlTimeProvider ttlTimeProvider; private DynamoDBMetadataStoreTableManager tableHandler; /** * A utility function to create DynamoDB instance. * @param conf the file system configuration * @param s3Region region of the associated S3 bucket (if any). * @param bucket Optional bucket to use to look up per-bucket proxy secrets * @param credentials credentials. * @return DynamoDB instance. * @throws IOException I/O error. */ private DynamoDB createDynamoDB( final Configuration conf, final String s3Region, final String bucket, final AWSCredentialsProvider credentials) throws IOException { if (amazonDynamoDB == null) { Preconditions.checkNotNull(conf); final Class cls = conf.getClass(S3GUARD_DDB_CLIENT_FACTORY_IMPL, S3GUARD_DDB_CLIENT_FACTORY_IMPL_DEFAULT, DynamoDBClientFactory.class); LOG.debug("Creating DynamoDB client {} with S3 region {}", cls, s3Region); amazonDynamoDB = ReflectionUtils.newInstance(cls, conf) .createDynamoDBClient(s3Region, bucket, credentials); } return new DynamoDB(amazonDynamoDB); } /** * {@inheritDoc}. * The credentials for authenticating with S3 are requested from the * FS via {@link S3AFileSystem#shareCredentials(String)}; this will * increment the reference counter of these credentials. 
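 * <p>
 * A minimal usage sketch (illustrative only), assuming {@code fs} is an
 * initialized {@code S3AFileSystem} bound to a guarded bucket and
 * {@code ttlTp} is an {@code ITtlTimeProvider}:
 * <pre>
 *   MetadataStore ms = new DynamoDBMetadataStore();
 *   ms.initialize(fs, ttlTp);
 *   PathMetadata meta = ms.get(new Path("s3a://bucket/dir1/file1"));
 *   ms.close();
 * </pre>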
* @param fs {@code S3AFileSystem} associated with the MetadataStore * @param ttlTp the time provider to use for metadata expiry * @throws IOException on a failure */ @Override @Retries.OnceRaw public void initialize(FileSystem fs, ITtlTimeProvider ttlTp) throws IOException { Preconditions.checkNotNull(fs, "Null filesystem"); Preconditions.checkArgument(fs instanceof S3AFileSystem, "DynamoDBMetadataStore only supports S3A filesystem - not %s", fs); bindToOwnerFilesystem((S3AFileSystem) fs); final String bucket = owner.getBucket(); String confRegion = conf.getTrimmed(S3GUARD_DDB_REGION_KEY); if (!StringUtils.isEmpty(confRegion)) { region = confRegion; LOG.debug("Overriding S3 region with configured DynamoDB region: {}", region); } else { try { region = owner.getBucketLocation(); } catch (AccessDeniedException e) { // access denied here == can't call getBucket. Report meaningfully URI uri = owner.getUri(); String message = "Failed to get bucket location as client lacks permission " + RolePolicies.S3_GET_BUCKET_LOCATION + " for " + uri; LOG.error(message); throw (IOException)new AccessDeniedException(message).initCause(e); } LOG.debug("Inferring DynamoDB region from S3 bucket: {}", region); } credentials = owner.shareCredentials("s3guard"); dynamoDB = createDynamoDB(conf, region, bucket, credentials); // use the bucket as the DynamoDB table name if not specified in config tableName = conf.getTrimmed(S3GUARD_DDB_TABLE_NAME_KEY, bucket); initDataAccessRetries(conf); this.ttlTimeProvider = ttlTp; tableHandler = new DynamoDBMetadataStoreTableManager( dynamoDB, tableName, region, amazonDynamoDB, conf, readOp, batchWriteRetryPolicy); this.table = tableHandler.initTable(); instrumentation.initialized(); } /** * Declare that this table is owned by the specific S3A FS instance. * This will bind some fields to the values provided by the owner, * including wiring up the instrumentation. * @param fs owner filesystem */ @VisibleForTesting void bindToOwnerFilesystem(final S3AFileSystem fs) { owner = fs; conf = owner.getConf(); StoreContext context = owner.createStoreContext(); instrumentation = context.getInstrumentation().getS3GuardInstrumentation(); username = context.getUsername(); executor = context.createThrottledExecutor(); ttlTimeProvider = Preconditions.checkNotNull( context.getTimeProvider(), "ttlTimeProvider must not be null"); } /** * Performs one-time initialization of the metadata store via configuration. * * This initialization depends on the configuration object to get AWS * credentials, DynamoDBFactory implementation class, DynamoDB endpoints, * DynamoDB table names etc. After initialization, this metadata store does * not explicitly relate to any S3 bucket, which be nonexistent. * * This is used to operate the metadata store directly beyond the scope of the * S3AFileSystem integration, e.g. command line tools. * Generally, callers should use * {@link MetadataStore#initialize(FileSystem, ITtlTimeProvider)} * with an initialized {@code S3AFileSystem} instance. * * Without a filesystem to act as a reference point, the configuration itself * must declare the table name and region in the * {@link Constants#S3GUARD_DDB_TABLE_NAME_KEY} and * {@link Constants#S3GUARD_DDB_REGION_KEY} respectively. * It also creates a new credential provider list from the configuration, * using the base fs.s3a.* options, as there is no bucket to infer per-bucket * settings from. 
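 * <p>
 * A sketch of such direct use (illustrative only; the key names are the
 * real constants, the values are examples, and {@code ttlTp} is assumed
 * to be an {@code ITtlTimeProvider}):
 * <pre>
 *   Configuration conf = new Configuration();
 *   conf.set(S3GUARD_DDB_TABLE_NAME_KEY, "my-s3guard-table");
 *   conf.set(S3GUARD_DDB_REGION_KEY, "eu-west-1");
 *   MetadataStore ms = new DynamoDBMetadataStore();
 *   ms.initialize(conf, ttlTp);
 * </pre>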
* * @see MetadataStore#initialize(FileSystem, ITtlTimeProvider) * @throws IOException if there is an error * @throws IllegalArgumentException if the configuration is incomplete */ @Override @Retries.OnceRaw public void initialize(Configuration config, ITtlTimeProvider ttlTp) throws IOException { conf = config; // use the bucket as the DynamoDB table name if not specified in config tableName = conf.getTrimmed(S3GUARD_DDB_TABLE_NAME_KEY); Preconditions.checkArgument(!StringUtils.isEmpty(tableName), "No DynamoDB table name configured"); region = conf.getTrimmed(S3GUARD_DDB_REGION_KEY); Preconditions.checkArgument(!StringUtils.isEmpty(region), "No DynamoDB region configured"); // there's no URI here, which complicates life: you cannot // create AWS providers here which require one. credentials = createAWSCredentialProviderSet(null, conf); dynamoDB = createDynamoDB(conf, region, null, credentials); username = UserGroupInformation.getCurrentUser().getShortUserName(); // without an executor from the owner FS, create one using // the executor capacity for work. int executorCapacity = intOption(conf, EXECUTOR_CAPACITY, DEFAULT_EXECUTOR_CAPACITY, 1); executor = BlockingThreadPoolExecutorService.newInstance( executorCapacity, executorCapacity * 2, longOption(conf, KEEPALIVE_TIME, DEFAULT_KEEPALIVE_TIME, 0), TimeUnit.SECONDS, "s3a-ddb-" + tableName); initDataAccessRetries(conf); this.ttlTimeProvider = ttlTp; tableHandler = new DynamoDBMetadataStoreTableManager( dynamoDB, tableName, region, amazonDynamoDB, conf, readOp, batchWriteRetryPolicy); this.table = tableHandler.initTable(); } /** * Set retry policy. This is driven by the value of * {@link Constants#S3GUARD_DDB_MAX_RETRIES} with an exponential backoff * between each attempt of {@link Constants#S3GUARD_DDB_THROTTLE_RETRY_INTERVAL} * milliseconds. * @param config configuration for data access */ private void initDataAccessRetries(Configuration config) { batchWriteRetryPolicy = RetryPolicies .exponentialBackoffRetry( config.getInt(S3GUARD_DDB_MAX_RETRIES, S3GUARD_DDB_MAX_RETRIES_DEFAULT), conf.getTimeDuration(S3GUARD_DDB_THROTTLE_RETRY_INTERVAL, S3GUARD_DDB_THROTTLE_RETRY_INTERVAL_DEFAULT, TimeUnit.MILLISECONDS), TimeUnit.MILLISECONDS); final RetryPolicy throttledRetryRetryPolicy = new S3GuardDataAccessRetryPolicy(config); readOp = new Invoker(throttledRetryRetryPolicy, this::readRetryEvent); writeOp = new Invoker(throttledRetryRetryPolicy, this::writeRetryEvent); scanOp = new Invoker(throttledRetryRetryPolicy, this::scanRetryEvent); } @Override @Retries.RetryTranslated public void delete(Path path, final BulkOperationState operationState) throws IOException { innerDelete(path, true, extractOrCreate(operationState, BulkOperationState.OperationType.Delete)); } @Override @Retries.RetryTranslated public void forgetMetadata(Path path) throws IOException { LOG.debug("Forget metadata for {}", path); innerDelete(path, false, null); } /** * Inner delete option, action based on the {@code tombstone} flag. * No tombstone: delete the entry. Tombstone: create a tombstone entry. * There is no check as to whether the entry exists in the table first. * @param path path to delete * @param tombstone flag to create a tombstone marker * @param ancestorState ancestor state for context. * @throws IOException I/O error. 
*/ @Retries.RetryTranslated private void innerDelete(final Path path, final boolean tombstone, final AncestorState ancestorState) throws IOException { checkPath(path); LOG.debug("Deleting from table {} in region {}: {}", tableName, region, path); // deleting nonexistent item consumes 1 write capacity; skip it if (path.isRoot()) { LOG.debug("Skip deleting root directory as it does not exist in table"); return; } // the policy on whether repeating delete operations is based // on that of S3A itself boolean idempotent = S3AFileSystem.DELETE_CONSIDERED_IDEMPOTENT; if (tombstone) { Preconditions.checkArgument(ttlTimeProvider != null, "ttlTimeProvider " + "must not be null"); final PathMetadata pmTombstone = PathMetadata.tombstone(path, ttlTimeProvider.getNow()); Item item = PathMetadataDynamoDBTranslation.pathMetadataToItem( new DDBPathMetadata(pmTombstone)); writeOp.retry( "Put tombstone", path.toString(), idempotent, () -> { logPut(ancestorState, item); recordsWritten(1); table.putItem(item); }); } else { PrimaryKey key = pathToKey(path); writeOp.retry( "Delete key", path.toString(), idempotent, () -> { // record the attempt so even on retry the counter goes up. logDelete(ancestorState, key); recordsDeleted(1); table.deleteItem(key); }); } } @Override @Retries.RetryTranslated public void deleteSubtree(Path path, final BulkOperationState operationState) throws IOException { checkPath(path); LOG.debug("Deleting subtree from table {} in region {}: {}", tableName, region, path); final PathMetadata meta = get(path); if (meta == null) { LOG.debug("Subtree path {} does not exist; this will be a no-op", path); return; } if (meta.isDeleted()) { LOG.debug("Subtree path {} is deleted; this will be a no-op", path); return; } deleteEntries(new InternalIterators.PathFromRemoteStatusIterator( new DescendantsIterator(this, meta)), operationState); } @Override @Retries.RetryTranslated public void deletePaths(Collection paths, final BulkOperationState operationState) throws IOException { deleteEntries( new InternalIterators.RemoteIteratorFromIterator<>(paths.iterator()), operationState); } /** * Delete the entries under an iterator. * There's no attempt to order the paths: they are * deleted in the order passed in. * @param entries entries to delete. * @param operationState Nullable operation state * @throws IOException failure */ @Retries.RetryTranslated private void deleteEntries(RemoteIterator entries, final BulkOperationState operationState) throws IOException { final List> futures = new ArrayList<>(); AncestorState state = extractOrCreate(operationState, BulkOperationState.OperationType.Delete); while (entries.hasNext()) { final Path pathToDelete = entries.next(); futures.add(submit(executor, () -> { innerDelete(pathToDelete, true, state); return null; })); if (futures.size() > S3GUARD_DDB_SUBMITTED_TASK_LIMIT) { // first batch done; block for completion. waitForCompletion(futures); futures.clear(); } } // now wait for the final set. waitForCompletion(futures); } /** * Get a consistent view of an item. 
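 * <p>
 * DynamoDB reads are eventually consistent unless a strongly consistent
 * read is requested explicitly, so this lookup forces one; a stale read
 * here could, for example, miss a recently written tombstone.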
* @param path path to look up in the database * @return the result * @throws IOException failure */ @Retries.RetryTranslated private Item getConsistentItem(final Path path) throws IOException { PrimaryKey key = pathToKey(path); final GetItemSpec spec = new GetItemSpec() .withPrimaryKey(key) .withConsistentRead(true); // strictly consistent read return readOp.retry("get", path.toString(), true, () -> { recordsRead(1); return table.getItem(spec); }); } @Override @Retries.RetryTranslated public DDBPathMetadata get(Path path) throws IOException { return get(path, false); } @Override @Retries.RetryTranslated public DDBPathMetadata get(Path path, boolean wantEmptyDirectoryFlag) throws IOException { checkPath(path); LOG.debug("Get from table {} in region {}: {}. wantEmptyDirectory={}", tableName, region, path, wantEmptyDirectoryFlag); DDBPathMetadata result = innerGet(path, wantEmptyDirectoryFlag); LOG.debug("result of get {} is: {}", path, result); return result; } /** * Inner get operation, as invoked in the retry logic. * @param path the path to get * @param wantEmptyDirectoryFlag Set to true to give a hint to the * MetadataStore that it should try to compute the empty directory flag. * @return metadata for {@code path}, {@code null} if not found * @throws IOException IO problem */ @Retries.RetryTranslated private DDBPathMetadata innerGet(Path path, boolean wantEmptyDirectoryFlag) throws IOException { final DDBPathMetadata meta; if (path.isRoot()) { // Root does not persist in the table meta = new DDBPathMetadata(makeDirStatus(username, path)); } else { final Item item = getConsistentItem(path); meta = itemToPathMetadata(item, username); LOG.debug("Get from table {} in region {} returning for {}: {}", tableName, region, path, meta); } if (wantEmptyDirectoryFlag && meta != null && !meta.isDeleted()) { final FileStatus status = meta.getFileStatus(); // for a non-deleted directory, we query its direct undeleted children // to determine the isEmpty bit. There's no TTL checking going on here. if (status.isDirectory()) { final QuerySpec spec = new QuerySpec() .withHashKey(pathToParentKeyAttribute(path)) .withConsistentRead(true) .withFilterExpression(IS_DELETED + " = :false") .withValueMap(DELETE_TRACKING_VALUE_MAP); boolean hasChildren = readOp.retry("get/hasChildren", path.toString(), true, () -> { // issue the query final IteratorSupport it = table.query( spec).iterator(); // if non empty, log the result to aid with some debugging if (it.hasNext()) { if (LOG.isDebugEnabled()) { LOG.debug("Dir {} is non-empty", status.getPath()); while(it.hasNext()) { LOG.debug("{}", itemToPathMetadata(it.next(), username)); } } return true; } else { return false; } }); // If directory is authoritative, we can set the empty directory flag // to TRUE or FALSE. Otherwise FALSE, or UNKNOWN. if (meta.isAuthoritativeDir()) { meta.setIsEmptyDirectory( hasChildren ? Tristate.FALSE : Tristate.TRUE); } else { meta.setIsEmptyDirectory( hasChildren ? Tristate.FALSE : Tristate.UNKNOWN); } } } return meta; } /** * Make a S3AFileStatus object for a directory at given path. * The FileStatus only contains what S3A needs, and omits mod time * since S3A uses its own implementation which returns current system time. 
* @param dirOwner username of owner * @param path path to dir * @return new S3AFileStatus */ private S3AFileStatus makeDirStatus(String dirOwner, Path path) { return new S3AFileStatus(Tristate.UNKNOWN, path, dirOwner); } @Override @Retries.RetryTranslated public DirListingMetadata listChildren(final Path path) throws IOException { checkPath(path); LOG.debug("Listing table {} in region {}: {}", tableName, region, path); final QuerySpec spec = new QuerySpec() .withHashKey(pathToParentKeyAttribute(path)) .withConsistentRead(true); // strictly consistent read final List metas = new ArrayList<>(); // find the children in the table final ItemCollection items = scanOp.retry( "listChildren", path.toString(), true, () -> table.query(spec)); // now wrap the result with retry logic try { for (Item item : wrapWithRetries(items)) { metas.add(itemToPathMetadata(item, username)); } } catch (WrappedIOException e) { // failure in the iterators; unwrap. throw e.getCause(); } // Minor race condition here - if the path is deleted between // getting the list of items and the directory metadata we might // get a null in DDBPathMetadata. return getDirListingMetadataFromDirMetaAndList(path, metas, get(path)); } DirListingMetadata getDirListingMetadataFromDirMetaAndList(Path path, List metas, DDBPathMetadata dirPathMeta) { boolean isAuthoritative = false; if (dirPathMeta != null) { isAuthoritative = dirPathMeta.isAuthoritativeDir(); } LOG.trace("Listing table {} in region {} for {} returning {}", tableName, region, path, metas); if (!metas.isEmpty() && dirPathMeta == null) { // We handle this case as the directory is deleted. LOG.warn("Directory marker is deleted, but the list of the directory " + "elements is not empty: {}. This case is handled as if the " + "directory was deleted.", metas); return null; } if(metas.isEmpty() && dirPathMeta == null) { return null; } return new DirListingMetadata(path, metas, isAuthoritative, dirPathMeta.getLastUpdated()); } /** * Origin of entries in the ancestor map built up in * {@link #completeAncestry(Collection, AncestorState)}. * This is done to stop generated ancestor entries to overwriting those * in the store, while allowing those requested in the API call to do this. */ private enum EntryOrigin { Requested, // requested in method call Retrieved, // retrieved from DDB: do not resubmit Generated // generated ancestor. } /** * Build the list of all parent entries. *
 * <p>
 * Thread safety: none. Callers must synchronize access.
 * <p>
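 * For example (illustrative): an input containing only an entry for
 * {@code /bucket/a/b/file} generates directory entries for
 * {@code /bucket/a} and {@code /bucket/a/b}, unless those already exist
 * in the store or in the ancestor state.
 * <p>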
* Callers are required to synchronize on ancestorState. * @param pathsToCreate paths to create * @param ancestorState ongoing ancestor state. * @return the full ancestry paths */ private Collection completeAncestry( final Collection pathsToCreate, final AncestorState ancestorState) throws IOException { // Key on path to allow fast lookup Map> ancestry = new HashMap<>(); LOG.debug("Completing ancestry for {} paths", pathsToCreate.size()); // we sort the inputs to guarantee that the topmost entries come first. // that way if the put request contains both parents and children // then the existing parents will not be re-created -they will just // be added to the ancestor list first. List sortedPaths = new ArrayList<>(pathsToCreate); sortedPaths.sort(PathOrderComparators.TOPMOST_PM_FIRST); // iterate through the paths. for (DDBPathMetadata entry : sortedPaths) { Preconditions.checkArgument(entry != null); Path path = entry.getFileStatus().getPath(); LOG.debug("Adding entry {}", path); if (path.isRoot()) { // this is a root entry: do not add it. break; } // add it to the ancestor state, failing if it is already there and // of a different type. DDBPathMetadata oldEntry = ancestorState.put(path, entry); boolean addAncestors = true; if (oldEntry != null) { if (!oldEntry.getFileStatus().isDirectory() || !entry.getFileStatus().isDirectory()) { // check for and warn if the existing bulk operation overwrote it. // this should never occur outside tests explicitly creating it LOG.warn("Overwriting a S3Guard file created in the operation: {}", oldEntry); LOG.warn("With new entry: {}", entry); // restore the old state ancestorState.put(path, oldEntry); // then raise an exception throw new PathIOException(path.toString(), E_INCONSISTENT_UPDATE); } else { // a directory is already present. Log and continue. LOG.debug("Directory at {} being updated with value {}", path, entry); // and we skip the the subsequent parent scan as we've already been // here addAncestors = false; } } // add the entry to the ancestry map as an explicitly requested entry. ancestry.put(path, Pair.of(EntryOrigin.Requested, entry)); // now scan up the ancestor tree to see if there are any // immediately missing entries. Path parent = path.getParent(); while (addAncestors && !parent.isRoot() && !ancestry.containsKey(parent)) { if (!ancestorState.findEntry(parent, true)) { // there is no entry in the ancestor state. // look in the store DDBPathMetadata md; Pair newEntry; final Item item = getConsistentItem(parent); if (item != null && !itemToPathMetadata(item, username).isDeleted()) { // This is an undeleted entry found in the database. // register it in ancestor state and in the map of entries to create // as a retrieved entry md = itemToPathMetadata(item, username); LOG.debug("Found existing entry for parent: {}", md); newEntry = Pair.of(EntryOrigin.Retrieved, md); // and we break, assuming that if there is an entry, its parents // are valid too. addAncestors = false; } else { // A directory entry was not found in the DB. Create one. 
LOG.debug("auto-create ancestor path {} for child path {}", parent, path); final S3AFileStatus status = makeDirStatus(parent, username); md = new DDBPathMetadata(status, Tristate.FALSE, false, false, ttlTimeProvider.getNow()); // declare to be a generated entry newEntry = Pair.of(EntryOrigin.Generated, md); } // insert into the ancestor state to avoid further checks ancestorState.put(parent, md); ancestry.put(parent, newEntry); } parent = parent.getParent(); } } // we now have a list of entries which were not in the operation state. // Filter out those which were retrieved, to produce a list of those // which must be written to the database. // TODO sort in reverse order of existence return ancestry.values().stream() .filter(p -> p.getLeft() != EntryOrigin.Retrieved) .map(Pair::getRight) .collect(Collectors.toList()); } /** * {@inheritDoc} *
 * <p>
 * The implementation scans all the way up the directory tree, doing a
 * get() for each entry; once an entry is found at a level, it is added
 * to the ancestor state.
 * <p>
 * The original implementation would stop on finding the first non-empty
 * parent. This (re)implementation issues a GET for every parent entry
 * and so detects and recovers from a tombstone marker further up the tree
 * (i.e. an inconsistent store is corrected for).
 * <p>
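 * For example (illustrative): if {@code /bucket/a} carries a tombstone
 * while {@code /bucket/a/b} still has an entry, adding the ancestors of
 * {@code /bucket/a/b/c/file} recreates the directory entry for
 * {@code /bucket/a} rather than stopping at the first entry found.
 * <p>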
* if {@code operationState} is not null, when this method returns the * operation state will be updated with all new entries created. * This ensures that subsequent operations with the same store will not * trigger new updates. * @param qualifiedPath path to update * @param operationState (nullable) operational state for a bulk update * @throws IOException on failure. */ @SuppressWarnings("SynchronizationOnLocalVariableOrMethodParameter") @Override @Retries.RetryTranslated public void addAncestors(final Path qualifiedPath, @Nullable final BulkOperationState operationState) throws IOException { Collection newDirs = new ArrayList<>(); final AncestorState ancestorState = extractOrCreate(operationState, BulkOperationState.OperationType.Put); Path parent = qualifiedPath.getParent(); boolean entryFound = false; // Iterate up the parents. // note that only ancestorState get/set operations are synchronized; // the DDB read between them is not. As a result, more than one // thread may probe the state, find the entry missing, do the database // query and add the entry. // This is done to avoid making the remote dynamo query part of the // synchronized block. // If a race does occur, the cost is simply one extra GET and potentially // one extra PUT. while (!parent.isRoot()) { synchronized (ancestorState) { if (ancestorState.contains(parent)) { // the ancestry map contains the key, so no need to even look for it. break; } } // we don't worry about tombstone expiry here as expired or not, // a directory entry will go in. PathMetadata directory = get(parent); if (directory == null || directory.isDeleted()) { if (entryFound) { LOG.warn("Inconsistent S3Guard table: adding directory {}", parent); } S3AFileStatus status = makeDirStatus(username, parent); LOG.debug("Adding new ancestor entry {}", status); DDBPathMetadata meta = new DDBPathMetadata(status, Tristate.FALSE, false, ttlTimeProvider.getNow()); newDirs.add(meta); // Do not update ancestor state here, as it // will happen in the innerPut() call. Were we to add it // here that put operation would actually (mistakenly) skip // creating the entry. } else { // an entry was found. Check its type entryFound = true; if (directory.getFileStatus().isFile()) { throw new PathIOException(parent.toString(), "Cannot overwrite parent file: metastore is" + " in an inconsistent state"); } // the directory exists. Add it to the ancestor state for next time. synchronized (ancestorState) { ancestorState.put(parent, new DDBPathMetadata(directory)); } } parent = parent.getParent(); } // the listing of directories to put is all those parents which we know // are not in the store or BulkOperationState. if (!newDirs.isEmpty()) { // patch up the time. patchLastUpdated(newDirs, ttlTimeProvider); innerPut(newDirs, operationState); } } /** * {@inheritDoc}. * * The DDB implementation sorts all the paths such that new items * are ordered highest level entry first; deleted items are ordered * lowest entry first. * * This is to ensure that if a client failed partway through the update, * there will no entries in the table which lack parent entries. * @param pathsToDelete Collection of all paths that were removed from the * source directory tree of the move. * @param pathsToCreate Collection of all PathMetadata for the new paths * that were created at the destination of the rename * (). * @param operationState Any ongoing state supplied to the rename tracker * which is to be passed in with each move operation. 
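 * <p>
 * For example (illustrative): a rename to {@code /bucket/dest/child}
 * writes the entry for {@code dest} before {@code dest/child}, while the
 * tombstone for {@code src/child} is written before the one for
 * {@code src}, so an interrupted move never leaves a child entry
 * without a parent.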
* @throws IOException if there is an error */ @Override @Retries.RetryTranslated public void move(@Nullable Collection pathsToDelete, @Nullable Collection pathsToCreate, @Nullable final BulkOperationState operationState) throws IOException { if (pathsToDelete == null && pathsToCreate == null) { return; } LOG.debug("Moving paths of table {} in region {}: {} paths to delete and {}" + " paths to create", tableName, region, pathsToDelete == null ? 0 : pathsToDelete.size(), pathsToCreate == null ? 0 : pathsToCreate.size()); LOG.trace("move: pathsToDelete = {}, pathsToCreate = {}", pathsToDelete, pathsToCreate); // In DynamoDBMetadataStore implementation, we assume that if a path // exists, all its ancestors will also exist in the table. // Following code is to maintain this invariant by putting all ancestor // directories of the paths to create. // ancestor paths that are not explicitly added to paths to create AncestorState ancestorState = extractOrCreate(operationState, BulkOperationState.OperationType.Rename); List newItems = new ArrayList<>(); if (pathsToCreate != null) { // create all parent entries. // this is synchronized on the move state so that across both serialized // and parallelized renames, duplicate ancestor entries are not created. synchronized (ancestorState) { newItems.addAll( completeAncestry( pathMetaToDDBPathMeta(pathsToCreate), ancestorState)); } } // sort all the new items topmost first. newItems.sort(PathOrderComparators.TOPMOST_PM_FIRST); // now process the deletions. if (pathsToDelete != null) { List tombstones = new ArrayList<>(pathsToDelete.size()); for (Path meta : pathsToDelete) { Preconditions.checkArgument(ttlTimeProvider != null, "ttlTimeProvider" + " must not be null"); final PathMetadata pmTombstone = PathMetadata.tombstone(meta, ttlTimeProvider.getNow()); tombstones.add(new DDBPathMetadata(pmTombstone)); } // sort all the tombstones lowest first. tombstones.sort(TOPMOST_PM_LAST); newItems.addAll(tombstones); } processBatchWriteRequest(ancestorState, null, pathMetadataToItem(newItems)); } /** * Helper method to issue a batch write request to DynamoDB. *
 * <ol>
 *   <li>Keys to delete are processed ahead of writing new items.</li>
 *   <li>No attempt is made to sort the input: the caller must do that.</li>
 * </ol>
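 * <p>
 * For example (illustrative, using DynamoDB's limit of 25 write requests
 * per batch): deleting 30 keys while putting 10 items issues two batches,
 * the first holding 25 deletes, the second the remaining 5 deletes plus
 * the 10 puts.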
* As well as retrying on the operation invocation, incomplete * batches are retried until all have been processed. * * @param ancestorState ancestor state for logging * @param keysToDelete primary keys to be deleted; can be null * @param itemsToPut new items to be put; can be null * @return the number of iterations needed to complete the call. */ @Retries.RetryTranslated("Outstanding batch items are updated with backoff") private int processBatchWriteRequest( @Nullable AncestorState ancestorState, PrimaryKey[] keysToDelete, Item[] itemsToPut) throws IOException { final int totalToDelete = (keysToDelete == null ? 0 : keysToDelete.length); final int totalToPut = (itemsToPut == null ? 0 : itemsToPut.length); if (totalToPut == 0 && totalToDelete == 0) { LOG.debug("Ignoring empty batch write request"); return 0; } int count = 0; int batches = 0; while (count < totalToDelete + totalToPut) { final TableWriteItems writeItems = new TableWriteItems(tableName); int numToDelete = 0; if (keysToDelete != null && count < totalToDelete) { numToDelete = Math.min(S3GUARD_DDB_BATCH_WRITE_REQUEST_LIMIT, totalToDelete - count); PrimaryKey[] toDelete = Arrays.copyOfRange(keysToDelete, count, count + numToDelete); LOG.debug("Deleting {} entries: {}", toDelete.length, toDelete); writeItems.withPrimaryKeysToDelete(toDelete); count += numToDelete; } if (numToDelete < S3GUARD_DDB_BATCH_WRITE_REQUEST_LIMIT && itemsToPut != null && count < totalToDelete + totalToPut) { final int numToPut = Math.min( S3GUARD_DDB_BATCH_WRITE_REQUEST_LIMIT - numToDelete, totalToDelete + totalToPut - count); final int index = count - totalToDelete; writeItems.withItemsToPut( Arrays.copyOfRange(itemsToPut, index, index + numToPut)); count += numToPut; } // if there's a retry and another process updates things then it's not // quite idempotent, but this was the case anyway batches++; BatchWriteItemOutcome res = writeOp.retry( "batch write", "", true, () -> dynamoDB.batchWriteItem(writeItems)); // Check for unprocessed keys in case of exceeding provisioned throughput Map> unprocessed = res.getUnprocessedItems(); int retryCount = 0; while (!unprocessed.isEmpty()) { batchWriteCapacityExceededEvents.incrementAndGet(); batches++; retryBackoffOnBatchWrite(retryCount++); // use a different reference to keep the compiler quiet final Map> upx = unprocessed; res = writeOp.retry( "batch write", "", true, () -> dynamoDB.batchWriteItemUnprocessed(upx)); unprocessed = res.getUnprocessedItems(); } } if (itemsToPut != null) { recordsWritten(itemsToPut.length); logPut(ancestorState, itemsToPut); } if (keysToDelete != null) { recordsDeleted(keysToDelete.length); logDelete(ancestorState, keysToDelete); } return batches; } /** * Put the current thread to sleep to implement exponential backoff * depending on retryCount. If max retries are exceeded, throws an * exception instead. * * @param retryCount number of retries so far * @throws IOException when max retryCount is exceeded. */ private void retryBackoffOnBatchWrite(int retryCount) throws IOException { try { // Our RetryPolicy ignores everything but retryCount here. 
RetryPolicy.RetryAction action = batchWriteRetryPolicy.shouldRetry( null, retryCount, 0, true); if (action.action == RetryPolicy.RetryAction.RetryDecision.FAIL) { // Create an AWSServiceThrottledException, with a fake inner cause // which we fill in to look like a real exception so // error messages look sensible AmazonServiceException cause = new AmazonServiceException( "Throttling"); cause.setServiceName("S3Guard"); cause.setStatusCode(AWSServiceThrottledException.STATUS_CODE); cause.setErrorCode(THROTTLING); // used in real AWS errors cause.setErrorType(AmazonServiceException.ErrorType.Service); cause.setErrorMessage(THROTTLING); cause.setRequestId("n/a"); throw new AWSServiceThrottledException( String.format("Max retries during batch write exceeded" + " (%d) for DynamoDB." + HINT_DDB_IOPS_TOO_LOW, retryCount), cause); } else { LOG.debug("Sleeping {} msec before next retry", action.delayMillis); Thread.sleep(action.delayMillis); } } catch (InterruptedException e) { throw (IOException)new InterruptedIOException(e.toString()).initCause(e); } catch (IOException e) { throw e; } catch (Exception e) { throw new IOException("Unexpected exception " + e, e); } } @Override @Retries.RetryTranslated public void put(final PathMetadata meta) throws IOException { put(meta, null); } @Override @Retries.RetryTranslated public void put( final PathMetadata meta, @Nullable final BulkOperationState operationState) throws IOException { // For a deeply nested path, this method will automatically create the full // ancestry and save respective item in DynamoDB table. // So after put operation, we maintain the invariant that if a path exists, // all its ancestors will also exist in the table. // For performance purpose, we generate the full paths to put and use batch // write item request to save the items. LOG.debug("Saving to table {} in region {}: {}", tableName, region, meta); Collection wrapper = new ArrayList<>(1); wrapper.add(meta); put(wrapper, operationState); } @Override @Retries.RetryTranslated public void put( final Collection metas, @Nullable final BulkOperationState operationState) throws IOException { innerPut(pathMetaToDDBPathMeta(metas), operationState); } /** * Internal put operation. *
 * <p>
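 * For example (illustrative): a put of the single new entry
 * {@code /bucket/a/b/file} is expanded so that {@code a} is written
 * before {@code a/b}, which is written before the file entry itself.
 * <p>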
* The ancestors to all entries are added to the set of entries to write, * provided they are not already stored in any supplied operation state. * Both the supplied metadata entries and ancestor entries are sorted * so that the topmost entries are written first. * This is to ensure that a failure partway through the operation will not * create entries in the table without parents. * @param metas metadata entries to write. * @param operationState (nullable) operational state for a bulk update * @throws IOException failure. */ @SuppressWarnings("SynchronizationOnLocalVariableOrMethodParameter") @Retries.RetryTranslated private void innerPut( final Collection metas, @Nullable final BulkOperationState operationState) throws IOException { if (metas.isEmpty()) { // Happens when someone calls put() with an empty list. LOG.debug("Ignoring empty list of entries to put"); return; } // always create or retrieve an ancestor state instance, so it can // always be used for synchronization. final AncestorState ancestorState = extractOrCreate(operationState, BulkOperationState.OperationType.Put); Item[] items; synchronized (ancestorState) { items = pathMetadataToItem( completeAncestry(metas, ancestorState)); } LOG.debug("Saving batch of {} items to table {}, region {}", items.length, tableName, region); processBatchWriteRequest(ancestorState, null, items); } /** * Get full path of ancestors that are nonexistent in table. * * This queries DDB when looking for parents which are not in * any supplied ongoing operation state. * Updates the operation state with found entries to reduce further checks. * * @param meta metadata to put * @param operationState ongoing bulk state * @return a possibly empty list of entries to put. * @throws IOException failure */ @SuppressWarnings("SynchronizationOnLocalVariableOrMethodParameter") @VisibleForTesting @Retries.RetryTranslated List fullPathsToPut(DDBPathMetadata meta, @Nullable BulkOperationState operationState) throws IOException { checkPathMetadata(meta); final List metasToPut = new ArrayList<>(); // root path is not persisted if (!meta.getFileStatus().getPath().isRoot()) { metasToPut.add(meta); } // put all its ancestors if not present; as an optimization we return at its // first existent ancestor final AncestorState ancestorState = extractOrCreate(operationState, BulkOperationState.OperationType.Put); Path path = meta.getFileStatus().getPath().getParent(); while (path != null && !path.isRoot()) { synchronized (ancestorState) { if (ancestorState.findEntry(path, true)) { break; } } final Item item = getConsistentItem(path); if (!itemExists(item)) { final S3AFileStatus status = makeDirStatus(path, username); metasToPut.add(new DDBPathMetadata(status, Tristate.FALSE, false, meta.isAuthoritativeDir(), meta.getLastUpdated())); path = path.getParent(); } else { // found the entry in the table, so add it to the ancestor state synchronized (ancestorState) { ancestorState.put(path, itemToPathMetadata(item, username)); } // then break out of the loop. break; } } return metasToPut; } /** * Does an item represent an object which exists? * @param item item retrieved in a query. * @return true iff the item isn't null and, if there is an is_deleted * column, that its value is false. */ private static boolean itemExists(Item item) { if (item == null) { return false; } if (item.hasAttribute(IS_DELETED) && item.getBoolean(IS_DELETED)) { return false; } return true; } /** * Get the value of an optional boolean attribute, falling back to the * default value if the attribute is absent. 
* @param item Item * @param attrName Attribute name * @param defVal Default value * @return The value or the default */ private static boolean getBoolAttribute(Item item, String attrName, boolean defVal) { return item.hasAttribute(attrName) ? item.getBoolean(attrName) : defVal; } /** Create a directory FileStatus using 0 for the lastUpdated time. */ static S3AFileStatus makeDirStatus(Path f, String owner) { return new S3AFileStatus(Tristate.UNKNOWN, f, owner); } /** * {@inheritDoc}. * There is retry around building the list of paths to update, but * the call to * {@link #processBatchWriteRequest(DynamoDBMetadataStore.AncestorState, PrimaryKey[], Item[])} * is only tried once. * @param meta Directory listing metadata. * @param unchangedEntries unchanged child entry paths * @param operationState operational state for a bulk update * @throws IOException IO problem */ @Override @Retries.RetryTranslated public void put( final DirListingMetadata meta, final List unchangedEntries, @Nullable final BulkOperationState operationState) throws IOException { LOG.debug("Saving {} dir meta for {} to table {} in region {}: {}", meta.isAuthoritative() ? "auth" : "nonauth", meta.getPath(), tableName, region, meta); // directory path Path path = meta.getPath(); DDBPathMetadata ddbPathMeta = new DDBPathMetadata(makeDirStatus(path, username), meta.isEmpty(), false, meta.isAuthoritative(), meta.getLastUpdated()); // put all its ancestors if not present final AncestorState ancestorState = extractOrCreate(operationState, BulkOperationState.OperationType.Put); // First add any missing ancestors... final List metasToPut = fullPathsToPut(ddbPathMeta, ancestorState); // next add all changed children of the directory // ones that came from the previous listing are left as-is final Collection children = meta.getListing() .stream() .filter(e -> !unchangedEntries.contains(e.getFileStatus().getPath())) .collect(Collectors.toList()); metasToPut.addAll(pathMetaToDDBPathMeta(children)); // sort so highest-level entries are written to the store first. // if a sequence fails, no orphan entries will have been written. metasToPut.sort(PathOrderComparators.TOPMOST_PM_FIRST); processBatchWriteRequest(ancestorState, null, pathMetadataToItem(metasToPut)); // and add the ancestors synchronized (ancestorState) { metasToPut.forEach(ancestorState::put); } } @Override public synchronized void close() { instrumentation.storeClosed(); try { if (dynamoDB != null) { LOG.debug("Shutting down {}", this); dynamoDB.shutdown(); dynamoDB = null; } } finally { closeAutocloseables(LOG, credentials); credentials = null; } } @Override @Retries.RetryTranslated public void destroy() throws IOException { tableHandler.destroy(); } @Retries.RetryTranslated private ItemCollection expiredFiles(PruneMode pruneMode, long cutoff, String keyPrefix) throws IOException { String filterExpression; String projectionExpression; ValueMap map; switch (pruneMode) { case ALL_BY_MODTIME: // filter all files under the given parent older than the modtime. // this implicitly skips directories, because they lack a modtime field. // however we explicitly exclude directories to make clear that // directories are to be excluded and avoid any confusion // see: HADOOP-16725. // note: files lack the is_dir field entirely, so we use a `not` to // filter out the directories. 
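        // (illustrative) the effective scan filter below is:
        //   mod_time < CUTOFF AND begins_with(parent, PREFIX)
        //     AND NOT is_dir = true
        // an item with no is_dir attribute passes the NOT clause, because
        // in DynamoDB a comparison against a missing attribute evaluates
        // as false.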
filterExpression = "mod_time < :mod_time and begins_with(parent, :parent)" + " and not is_dir = :is_dir"; projectionExpression = "parent,child"; map = new ValueMap() .withLong(":mod_time", cutoff) .withString(":parent", keyPrefix) .withBoolean(":is_dir", true); break; case TOMBSTONES_BY_LASTUPDATED: filterExpression = "last_updated < :last_updated and begins_with(parent, :parent) " + "and is_deleted = :is_deleted"; projectionExpression = "parent,child,is_deleted"; map = new ValueMap() .withLong(":last_updated", cutoff) .withString(":parent", keyPrefix) .withBoolean(":is_deleted", true); break; default: throw new UnsupportedOperationException("Unsupported prune mode: " + pruneMode); } return readOp.retry( "scan", keyPrefix, true, () -> table.scan(filterExpression, projectionExpression, null, map)); } @Override @Retries.RetryTranslated public void prune(PruneMode pruneMode, long cutoff) throws IOException { prune(pruneMode, cutoff, "/"); } /** * Prune files, in batches. There's optionally a sleep between each batch. * * @param pruneMode The mode of operation for the prune For details see * {@link MetadataStore#prune(PruneMode, long)} * @param cutoff Oldest modification time to allow * @param keyPrefix The prefix for the keys that should be removed * @throws IOException Any IO/DDB failure. * @throws InterruptedIOException if the prune was interrupted * @return count of pruned items. */ @Override @Retries.RetryTranslated public long prune(PruneMode pruneMode, long cutoff, String keyPrefix) throws IOException { LOG.debug("Prune {} under {} with age {}", pruneMode == PruneMode.ALL_BY_MODTIME ? "files and tombstones" : "tombstones", keyPrefix, cutoff); final ItemCollection items = expiredFiles(pruneMode, cutoff, keyPrefix); return innerPrune(pruneMode, cutoff, keyPrefix, items); } /** * Prune files, in batches. There's optionally a sleep between each batch. * * @param pruneMode The mode of operation for the prune For details see * {@link MetadataStore#prune(PruneMode, long)} * @param cutoff Oldest modification time to allow * @param keyPrefix The prefix for the keys that should be removed * @param items expired items * @return count of pruned items. * @throws IOException Any IO/DDB failure. * @throws InterruptedIOException if the prune was interrupted */ private int innerPrune( final PruneMode pruneMode, final long cutoff, final String keyPrefix, final ItemCollection items) throws IOException { int itemCount = 0; try (AncestorState state = initiateBulkWrite( BulkOperationState.OperationType.Prune, null); DurationInfo ignored = new DurationInfo(LOG, "Pruning DynamoDB Store")) { ArrayList deletionBatch = new ArrayList<>(S3GUARD_DDB_BATCH_WRITE_REQUEST_LIMIT); long delay = conf.getTimeDuration( S3GUARD_DDB_BACKGROUND_SLEEP_MSEC_KEY, S3GUARD_DDB_BACKGROUND_SLEEP_MSEC_DEFAULT, TimeUnit.MILLISECONDS); Set parentPathSet = new HashSet<>(); Set clearedParentPathSet = new HashSet<>(); // declare the operation to delete a batch as a function so // as to keep the code consistent across multiple uses. FunctionsRaisingIOE.CallableRaisingIOE deleteBatchOperation = () -> { // lowest path entries get deleted first. deletionBatch.sort(PathOrderComparators.TOPMOST_PATH_LAST); processBatchWriteRequest(state, pathToKey(deletionBatch), null); // set authoritative false for each pruned dir listing // if at least one entry was not a tombstone removeAuthoritativeDirFlag(parentPathSet, state); // already cleared parent paths. 
  /**
   * Prune files, in batches. There's optionally a sleep between each batch.
   *
   * @param pruneMode The mode of operation for the prune. For details see
   *                  {@link MetadataStore#prune(PruneMode, long)}
   * @param cutoff Oldest modification time to allow
   * @param keyPrefix The prefix for the keys that should be removed
   * @param items expired items
   * @return count of pruned items.
   * @throws IOException Any IO/DDB failure.
   * @throws InterruptedIOException if the prune was interrupted
   */
  private int innerPrune(
      final PruneMode pruneMode, final long cutoff, final String keyPrefix,
      final ItemCollection<ScanOutcome> items)
      throws IOException {
    int itemCount = 0;
    try (AncestorState state = initiateBulkWrite(
        BulkOperationState.OperationType.Prune, null);
         DurationInfo ignored =
             new DurationInfo(LOG, "Pruning DynamoDB Store")) {
      ArrayList<Path> deletionBatch =
          new ArrayList<>(S3GUARD_DDB_BATCH_WRITE_REQUEST_LIMIT);
      long delay = conf.getTimeDuration(
          S3GUARD_DDB_BACKGROUND_SLEEP_MSEC_KEY,
          S3GUARD_DDB_BACKGROUND_SLEEP_MSEC_DEFAULT,
          TimeUnit.MILLISECONDS);
      Set<Path> parentPathSet = new HashSet<>();
      Set<Path> clearedParentPathSet = new HashSet<>();
      // declare the operation to delete a batch as a function so
      // as to keep the code consistent across multiple uses.
      FunctionsRaisingIOE.CallableRaisingIOE<Void> deleteBatchOperation =
          () -> {
            // lowest path entries get deleted first.
            deletionBatch.sort(PathOrderComparators.TOPMOST_PATH_LAST);
            processBatchWriteRequest(state, pathToKey(deletionBatch), null);

            // set authoritative false for each pruned dir listing
            // if at least one entry was not a tombstone
            removeAuthoritativeDirFlag(parentPathSet, state);
            // already cleared parent paths.
            clearedParentPathSet.addAll(parentPathSet);
            parentPathSet.clear();
            return null;
          };
      for (Item item : items) {
        DDBPathMetadata md = PathMetadataDynamoDBTranslation
            .itemToPathMetadata(item, username);
        Path path = md.getFileStatus().getPath();
        boolean tombstone = md.isDeleted();
        LOG.debug("Prune entry {}", path);
        deletionBatch.add(path);

        // add parent path of item so it can be marked as non-auth.
        // this is only done if
        // * it has not already been processed
        // * the entry pruned is not a tombstone (no need to update)
        // * the file is not in the root dir
        Path parentPath = path.getParent();
        if (!tombstone
            && parentPath != null
            && !parentPath.isRoot()
            && !clearedParentPathSet.contains(parentPath)) {
          parentPathSet.add(parentPath);
        }

        itemCount++;
        if (deletionBatch.size() == S3GUARD_DDB_BATCH_WRITE_REQUEST_LIMIT) {
          deleteBatchOperation.apply();
          deletionBatch.clear();
          if (delay > 0) {
            Thread.sleep(delay);
          }
        }
      }
      // final batch of deletes
      if (!deletionBatch.isEmpty()) {
        deleteBatchOperation.apply();
      }
    } catch (InterruptedException e) {
      Thread.currentThread().interrupt();
      throw new InterruptedIOException("Pruning was interrupted");
    } catch (AmazonDynamoDBException e) {
      throw translateDynamoDBException(keyPrefix,
          "Prune of " + keyPrefix + " failed", e);
    }
    LOG.info("Finished pruning {} items in batches of {}", itemCount,
        S3GUARD_DDB_BATCH_WRITE_REQUEST_LIMIT);
    return itemCount;
  }
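  /*
   * Configuration sketch (illustrative; the key is the s3a constant used
   * above, the value hypothetical): lengthen the pause between prune
   * batches to reduce pressure on provisioned IO capacity:
   *
   *   conf.setTimeDuration(S3GUARD_DDB_BACKGROUND_SLEEP_MSEC_KEY,
   *       250, TimeUnit.MILLISECONDS);
   */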

  /**
   * Remove the Authoritative Directory Marker from a set of paths, if
   * those paths are in the store.
   * <p>
   * This operation is <i>only</i> for pruning; it does not raise an error
   * if, during the prune phase, the table appears inconsistent.
   * This is not unusual as it can happen in a number of ways
   * <ol>
   *   <li>The state of the table changes during a slow prune operation
   *   which deliberately inserts pauses to avoid overloading prepaid
   *   IO capacity.</li>
   *   <li>Tombstone markers have been left in the table after many other
   *   operations have taken place, including deleting/replacing
   *   parents.</li>
   * </ol>
   * <p>
   * If an exception is raised in the get/update process, then the exception
   * is caught and only rethrown after all the other paths are processed.
   * This is to ensure a best-effort attempt to update the store.
   * @param pathSet set of paths.
   * @param state ongoing operation state.
   * @throws IOException only after a best effort is made to update the store.
   */
  private void removeAuthoritativeDirFlag(
      final Set<Path> pathSet,
      final AncestorState state) throws IOException {

    AtomicReference<IOException> rIOException = new AtomicReference<>();

    Set<DDBPathMetadata> metas = pathSet.stream().map(path -> {
      try {
        if (path.isRoot()) {
          LOG.debug("ignoring root path");
          return null;
        }
        if (state != null && state.get(path) != null) {
          // there's already an entry for this path
          LOG.debug("Ignoring update of entry already in the state map");
          return null;
        }
        DDBPathMetadata ddbPathMetadata = get(path);
        if (ddbPathMetadata == null) {
          // there is no entry.
          LOG.debug("No parent {}; skipping", path);
          return null;
        }
        if (ddbPathMetadata.isDeleted()) {
          // the parent itself is deleted
          LOG.debug("Parent has been deleted {}; skipping", path);
          return null;
        }
        if (!ddbPathMetadata.getFileStatus().isDirectory()) {
          // the parent is not a directory
          LOG.debug("Parent is not a directory {}; skipping", path);
          return null;
        }
        LOG.debug("Setting isAuthoritativeDir==false on {}", ddbPathMetadata);
        ddbPathMetadata.setAuthoritativeDir(false);
        ddbPathMetadata.setLastUpdated(ttlTimeProvider.getNow());
        return ddbPathMetadata;
      } catch (IOException e) {
        String msg = String.format("IOException while getting PathMetadata "
            + "on path: %s.", path);
        LOG.error(msg, e);
        rIOException.set(e);
        return null;
      }
    }).filter(Objects::nonNull).collect(Collectors.toSet());

    try {
      LOG.debug("innerPut on metas: {}", metas);
      if (!metas.isEmpty()) {
        innerPut(metas, state);
      }
    } catch (IOException e) {
      String msg = String.format("IOException while setting false "
          + "authoritative directory flag on: %s.", metas);
      LOG.error(msg, e);
      rIOException.set(e);
    }

    if (rIOException.get() != null) {
      throw rIOException.get();
    }
  }

  @VisibleForTesting
  public AmazonDynamoDB getAmazonDynamoDB() {
    return amazonDynamoDB;
  }

  @Override
  public String toString() {
    return getClass().getSimpleName() + '{'
        + "region=" + region
        + ", tableName=" + tableName
        + ", tableArn=" + tableHandler.getTableArn()
        + '}';
  }

  /**
   * The administrative policy includes all DDB table operations;
   * application access is restricted to those operations S3Guard operations
   * require when working with data in a guarded bucket.
   * @param access access level desired.
   * @return a possibly empty list of statements.
   */
  @Override
  public List<RoleModel.Statement> listAWSPolicyRules(
      final Set<AccessLevel> access) {
    Preconditions.checkState(tableHandler.getTableArn() != null,
        "TableARN not known");
    if (access.isEmpty()) {
      return Collections.emptyList();
    }
    RoleModel.Statement stat;
    if (access.contains(AccessLevel.ADMIN)) {
      stat = allowAllDynamoDBOperations(tableHandler.getTableArn());
    } else {
      stat = allowS3GuardClientOperations(tableHandler.getTableArn());
    }
    return Lists.newArrayList(stat);
  }
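  /*
   * Usage sketch (illustrative only; the EnumSet construction is an
   * assumption about the caller, not code from this class): ask for the
   * statements a read/write client policy needs:
   *
   *   Set<AccessLevel> access =
   *       EnumSet.of(AccessLevel.READ, AccessLevel.WRITE);
   *   List<RoleModel.Statement> rules = store.listAWSPolicyRules(access);
   */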
  /**
   * PUT a single item to the table.
   * @param item item to put
   * @return the outcome.
   */
  @Retries.OnceRaw
  private PutItemOutcome putItem(Item item) {
    LOG.debug("Putting item {}", item);
    return table.putItem(item);
  }

  @VisibleForTesting
  Table getTable() {
    return table;
  }

  String getRegion() {
    return region;
  }

  @VisibleForTesting
  public String getTableName() {
    return tableName;
  }

  @VisibleForTesting
  DynamoDB getDynamoDB() {
    return dynamoDB;
  }

  /**
   * Validates a path object; it must be absolute, have an s3a:// scheme
   * and contain a host (bucket) component.
   * @param path path to check
   * @return the path passed in
   */
  private Path checkPath(Path path) {
    Preconditions.checkNotNull(path);
    Preconditions.checkArgument(path.isAbsolute(),
        "Path %s is not absolute", path);
    URI uri = path.toUri();
    Preconditions.checkNotNull(uri.getScheme(), "Path %s missing scheme", path);
    Preconditions.checkArgument(uri.getScheme().equals(Constants.FS_S3A),
        "Path %s scheme must be %s", path, Constants.FS_S3A);
    Preconditions.checkArgument(!StringUtils.isEmpty(uri.getHost()),
        "Path %s is missing bucket.", path);
    return path;
  }

  /**
   * Validates a path meta-data object.
   */
  private static void checkPathMetadata(PathMetadata meta) {
    Preconditions.checkNotNull(meta);
    Preconditions.checkNotNull(meta.getFileStatus());
    Preconditions.checkNotNull(meta.getFileStatus().getPath());
  }
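  /*
   * Illustrative behaviour of checkPath (bucket and paths hypothetical):
   *
   *   checkPath(new Path("s3a://example-bucket/dir/file"));  // accepted
   *   checkPath(new Path("/dir/file"));      // rejected: missing scheme
   *   checkPath(new Path("hdfs://nn/dir"));  // rejected: wrong scheme
   *   checkPath(new Path("s3a:///dir"));     // rejected: missing bucket
   */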
"DISABLED" : desc.getSSEDescription().toString()); map.put(MetadataStoreCapabilities.PERSISTS_AUTHORITATIVE_BIT, Boolean.toString(true)); } else { map.put("name", "DynamoDB Metadata Store"); map.put(TABLE, "none"); map.put(STATUS, "undefined"); } map.put("description", DESCRIPTION); map.put("region", region); if (batchWriteRetryPolicy != null) { map.put("retryPolicy", batchWriteRetryPolicy.toString()); } return map; } @Retries.OnceRaw private TableDescription getTableDescription(boolean forceUpdate) { TableDescription desc = table.getDescription(); if (desc == null || forceUpdate) { desc = table.describe(); } return desc; } @Override @Retries.OnceRaw public void updateParameters(Map parameters) throws IOException { Preconditions.checkNotNull(table, "Not initialized"); TableDescription desc = getTableDescription(true); ProvisionedThroughputDescription current = desc.getProvisionedThroughput(); long currentRead = current.getReadCapacityUnits(); long newRead = getLongParam(parameters, S3GUARD_DDB_TABLE_CAPACITY_READ_KEY, currentRead); long currentWrite = current.getWriteCapacityUnits(); long newWrite = getLongParam(parameters, S3GUARD_DDB_TABLE_CAPACITY_WRITE_KEY, currentWrite); if (currentRead == 0 || currentWrite == 0) { // table is pay on demand throw new IOException(E_ON_DEMAND_NO_SET_CAPACITY); } if (newRead != currentRead || newWrite != currentWrite) { LOG.info("Current table capacity is read: {}, write: {}", currentRead, currentWrite); LOG.info("Changing capacity of table to read: {}, write: {}", newRead, newWrite); tableHandler.provisionTableBlocking(newRead, newWrite); } else { LOG.info("Table capacity unchanged at read: {}, write: {}", newRead, newWrite); } } private long getLongParam(Map parameters, String key, long defVal) { String k = parameters.get(key); if (k != null) { return Long.parseLong(k); } else { return defVal; } } /** * Callback on a read operation retried. * @param text text of the operation * @param ex exception * @param attempts number of attempts * @param idempotent is the method idempotent (this is assumed to be true) */ void readRetryEvent( String text, IOException ex, int attempts, boolean idempotent) { readThrottleEvents.incrementAndGet(); retryEvent(text, ex, attempts, true); } /** * Callback on a write operation retried. * @param text text of the operation * @param ex exception * @param attempts number of attempts * @param idempotent is the method idempotent (this is assumed to be true) */ void writeRetryEvent( String text, IOException ex, int attempts, boolean idempotent) { writeThrottleEvents.incrementAndGet(); retryEvent(text, ex, attempts, idempotent); } /** * Callback on a scan operation retried. * @param text text of the operation * @param ex exception * @param attempts number of attempts * @param idempotent is the method idempotent (this is assumed to be true) */ void scanRetryEvent( String text, IOException ex, int attempts, boolean idempotent) { scanThrottleEvents.incrementAndGet(); retryEvent(text, ex, attempts, idempotent); } /** * Callback from {@link Invoker} when an operation is retried. 
  /**
   * Callback on a read operation retried.
   * @param text text of the operation
   * @param ex exception
   * @param attempts number of attempts
   * @param idempotent is the method idempotent (this is assumed to be true)
   */
  void readRetryEvent(
      String text,
      IOException ex,
      int attempts,
      boolean idempotent) {
    readThrottleEvents.incrementAndGet();
    retryEvent(text, ex, attempts, true);
  }

  /**
   * Callback on a write operation retried.
   * @param text text of the operation
   * @param ex exception
   * @param attempts number of attempts
   * @param idempotent is the method idempotent (this is assumed to be true)
   */
  void writeRetryEvent(
      String text,
      IOException ex,
      int attempts,
      boolean idempotent) {
    writeThrottleEvents.incrementAndGet();
    retryEvent(text, ex, attempts, idempotent);
  }

  /**
   * Callback on a scan operation retried.
   * @param text text of the operation
   * @param ex exception
   * @param attempts number of attempts
   * @param idempotent is the method idempotent (this is assumed to be true)
   */
  void scanRetryEvent(
      String text,
      IOException ex,
      int attempts,
      boolean idempotent) {
    scanThrottleEvents.incrementAndGet();
    retryEvent(text, ex, attempts, idempotent);
  }

  /**
   * Callback from {@link Invoker} when an operation is retried.
   * @param text text of the operation
   * @param ex exception
   * @param attempts number of attempts
   * @param idempotent is the method idempotent
   */
  void retryEvent(
      String text,
      IOException ex,
      int attempts,
      boolean idempotent) {
    if (S3AUtils.isThrottleException(ex)) {
      // throttled
      instrumentation.throttled();
      int eventCount = throttleEventCount.addAndGet(1);
      if (attempts == 1 && eventCount < THROTTLE_EVENT_LOG_LIMIT) {
        LOG.warn("DynamoDB IO limits reached in {};"
                + " consider increasing capacity: {}", text, ex.toString());
        LOG.debug("Throttled", ex);
      } else {
        // user has been warned already, log at debug only.
        LOG.debug("DynamoDB IO limits reached in {};"
            + " consider increasing capacity: {}", text, ex.toString());
      }
    } else if (attempts == 1) {
      // not throttled. Log on the first attempt only
      LOG.info("Retrying {}: {}", text, ex.toString());
      LOG.debug("Retrying {}", text, ex);
    }

    // note a retry
    instrumentation.retrying();
    if (owner != null) {
      owner.metastoreOperationRetried(ex, attempts, idempotent);
    }
  }

  /**
   * Get the count of read throttle events.
   * @return the current count of read throttle events.
   */
  @VisibleForTesting
  public long getReadThrottleEventCount() {
    return readThrottleEvents.get();
  }

  /**
   * Get the count of write throttle events.
   * @return the current count of write throttle events.
   */
  @VisibleForTesting
  public long getWriteThrottleEventCount() {
    return writeThrottleEvents.get();
  }

  /**
   * Get the count of scan throttle events.
   * @return the current count of scan throttle events.
   */
  @VisibleForTesting
  public long getScanThrottleEventCount() {
    return scanThrottleEvents.get();
  }

  @VisibleForTesting
  public long getBatchWriteCapacityExceededCount() {
    return batchWriteCapacityExceededEvents.get();
  }

  /**
   * Get the operation invoker for write operations.
   * @return an invoker for retrying mutating operations on a store.
   */
  public Invoker getInvoker() {
    return writeOp;
  }

  /**
   * Wrap an iterator returned from any scan with a retrying one.
   * This includes throttle handling.
   * Retries will update the relevant counters/metrics for scan operations.
   * @param source source iterator
   * @return a retrying iterator.
   */
  public <T> Iterable<T> wrapWithRetries(
      final Iterable<T> source) {
    return new RetryingCollection<>("scan dynamoDB table", scanOp, source);
  }
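  /*
   * Usage sketch (illustrative; the filter arguments are assumed to come
   * from an expiredFiles-style scan): wrap a raw table scan so each
   * iteration step is retried and counted against the scan metrics:
   *
   *   Iterable<Item> scan = wrapWithRetries(
   *       table.scan(filterExpression, projectionExpression, null, map));
   *   for (Item item : scan) {
   *     // each next()/hasNext() is retried on throttling
   *   }
   */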
  /**
   * Record the number of records written.
   * @param count count of records.
   */
  private void recordsWritten(final int count) {
    instrumentation.recordsWritten(count);
  }

  /**
   * Record the number of records read.
   * @param count count of records.
   */
  private void recordsRead(final int count) {
    instrumentation.recordsRead(count);
  }

  /**
   * Record the number of records deleted.
   * @param count count of records.
   */
  private void recordsDeleted(final int count) {
    instrumentation.recordsDeleted(count);
  }

  /**
   * Initiate the rename operation by creating the tracker for the filesystem
   * to keep up to date with state changes in the S3A bucket.
   * @param storeContext store context.
   * @param source source path
   * @param sourceStatus status of the source file/dir
   * @param dest destination path.
   * @return the rename tracker
   */
  @Override
  public RenameTracker initiateRenameOperation(
      final StoreContext storeContext,
      final Path source,
      final S3AFileStatus sourceStatus,
      final Path dest) {
    return new ProgressiveRenameTracker(storeContext, this, source, dest,
        new AncestorState(this, BulkOperationState.OperationType.Rename,
            dest));
  }

  /**
   * Mark the directories instantiated under the destination path
   * as authoritative. That is: all entries in the
   * operationState (which must be an AncestorState instance),
   * that are under the destination path.
   *
   * The database update is synchronized on the operationState, so all other
   * threads trying to update that state will be blocked until completion.
   *
   * This operation is only used in import and at the end of a rename,
   * so this is not considered an issue.
   * @param dest destination path.
   * @param operationState active state.
   * @throws IOException failure.
   * @return the number of directories marked.
   */
  @Override
  public int markAsAuthoritative(
      final Path dest,
      final BulkOperationState operationState) throws IOException {
    if (operationState == null) {
      return 0;
    }
    Preconditions.checkArgument(operationState instanceof AncestorState,
        "Not an AncestorState %s", operationState);
    final AncestorState state = (AncestorState)operationState;
    // only mark paths under the dest as auth
    final String simpleDestKey = pathToParentKey(dest);
    final String destPathKey = simpleDestKey + "/";
    final String opId = AncestorState.stateAsString(state);
    LOG.debug("{}: marking directories under {} as authoritative",
        opId, destPathKey);

    // the list of dirs to build up.
    final List<DDBPathMetadata> dirsToUpdate = new ArrayList<>();
    synchronized (state) {
      for (Map.Entry<Path, DDBPathMetadata> entry :
          state.getAncestry().entrySet()) {
        final Path path = entry.getKey();
        final DDBPathMetadata md = entry.getValue();
        final String key = pathToParentKey(path);
        if (md.getFileStatus().isDirectory()
            && (key.equals(simpleDestKey) || key.startsWith(destPathKey))) {
          // the updated entry is under the destination.
          md.setAuthoritativeDir(true);
          md.setLastUpdated(ttlTimeProvider.getNow());
          LOG.debug("{}: added {}", opId, key);
          dirsToUpdate.add(md);
        }
      }
      processBatchWriteRequest(state,
          null,
          pathMetadataToItem(dirsToUpdate));
    }
    return dirsToUpdate.size();
  }

  @Override
  public AncestorState initiateBulkWrite(
      final BulkOperationState.OperationType operation,
      final Path dest) {
    return new AncestorState(this, operation, dest);
  }

  @Override
  public void setTtlTimeProvider(ITtlTimeProvider ttlTimeProvider) {
    this.ttlTimeProvider = ttlTimeProvider;
  }

  /**
   * Username.
   * @return the current username
   */
  String getUsername() {
    return username;
  }

  /**
   * Log a PUT into the operations log at debug level.
   * @param state optional ancestor state.
   * @param items items which have been PUT
   */
  private static void logPut(
      @Nullable AncestorState state,
      Item[] items) {
    if (OPERATIONS_LOG.isDebugEnabled()) {
      // log the operations
      String stateStr = AncestorState.stateAsString(state);
      for (Item item : items) {
        boolean tombstone = !itemExists(item);
        boolean isDir = getBoolAttribute(item, IS_DIR, false);
        boolean auth = getBoolAttribute(item, IS_AUTHORITATIVE, false);
        OPERATIONS_LOG.debug("{} {} {}{}{}",
            stateStr,
            tombstone ? "TOMBSTONE" : "PUT",
            itemPrimaryKeyToString(item),
            auth ? " [auth]" : "",
            isDir ? " directory" : "");
      }
    }
  }

  /**
   * Log a PUT into the operations log at debug level.
   * @param state optional ancestor state.
   * @param item item PUT.
   */
  private static void logPut(
      @Nullable AncestorState state,
      Item item) {
    if (OPERATIONS_LOG.isDebugEnabled()) {
      // log the operations
      logPut(state, new Item[]{item});
    }
  }
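  /*
   * Illustrative note (shape of the output is an approximation, values
   * hypothetical): with DEBUG enabled on OPERATIONS_LOG, the helpers
   * above and below emit lines of roughly this form, where the prefix
   * comes from AncestorState.stateAsString():
   *
   *   #(Put-0007) PUT <key> [auth] directory
   *   #(Prune-0009) DELETE <key>
   */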
  /**
   * Log a DELETE into the operations log at debug level.
   * @param state optional ancestor state.
   * @param keysDeleted keys which were deleted.
   */
  private static void logDelete(
      @Nullable AncestorState state,
      PrimaryKey[] keysDeleted) {
    if (OPERATIONS_LOG.isDebugEnabled()) {
      // log the operations
      String stateStr = AncestorState.stateAsString(state);
      for (PrimaryKey key : keysDeleted) {
        OPERATIONS_LOG.debug("{} DELETE {}",
            stateStr, primaryKeyToString(key));
      }
    }
  }

  /**
   * Log a DELETE into the operations log at debug level.
   * @param state optional ancestor state.
   * @param key Deleted key
   */
  private static void logDelete(
      @Nullable AncestorState state,
      PrimaryKey key) {
    if (OPERATIONS_LOG.isDebugEnabled()) {
      logDelete(state, new PrimaryKey[]{key});
    }
  }

  /**
   * Get the move state passed in; create a new one if needed.
   * @param state state.
   * @param operation the type of the operation to use if the state is created.
   * @return the cast or created state.
   */
  private AncestorState extractOrCreate(@Nullable BulkOperationState state,
      BulkOperationState.OperationType operation) {
    if (state != null) {
      return (AncestorState) state;
    } else {
      return new AncestorState(this, operation, null);
    }
  }

  @Override
  public MetastoreInstrumentation getInstrumentation() {
    return instrumentation;
  }

  /**
   * This tracks all the ancestors created,
   * across multiple move/write operations.
   * This is to avoid duplicate creation of ancestors during bulk commits
   * and rename operations managed by a rename tracker.
   *
   * There is no thread safety: callers must synchronize as appropriate.
   */
  @VisibleForTesting
  static final class AncestorState extends BulkOperationState {

    /**
     * Counter of IDs issued.
     */
    private static final AtomicLong ID_COUNTER = new AtomicLong(0);

    /** Owning store. */
    private final DynamoDBMetadataStore store;

    /** The ID of the state; for logging. */
    private final long id;

    /**
     * Map of ancestors.
     */
    private final Map<Path, DDBPathMetadata> ancestry = new HashMap<>();

    /**
     * Destination path.
     */
    private final Path dest;

    /**
     * Create the state.
     * @param store the store, for use in validation.
     * If null: no validation (test only operation)
     * @param operation the type of the operation.
     * @param dest destination path.
     */
    AncestorState(
        @Nullable final DynamoDBMetadataStore store,
        final OperationType operation,
        @Nullable final Path dest) {
      super(operation);
      this.store = store;
      this.dest = dest;
      this.id = ID_COUNTER.addAndGet(1);
    }

    int size() {
      return ancestry.size();
    }

    /**
     * Get the ancestry. Not thread safe.
     * @return the map of ancestors.
     */
    Map<Path, DDBPathMetadata> getAncestry() {
      return ancestry;
    }

    public Path getDest() {
      return dest;
    }

    long getId() {
      return id;
    }

    @Override
    public String toString() {
      final StringBuilder sb = new StringBuilder(
          "AncestorState{");
      sb.append("operation=").append(getOperation());
      sb.append("; id=").append(id);
      sb.append("; dest=").append(dest);
      sb.append("; size=").append(size());
      sb.append("; paths={")
          .append(StringUtils.join(ancestry.keySet(), " "))
          .append('}');
      sb.append('}');
      return sb.toString();
    }

    /**
     * Does the ancestor state contain a path?
     * @param p path to check
     * @return true if the state has an entry
     */
    boolean contains(Path p) {
      return get(p) != null;
    }

    DDBPathMetadata put(Path p, DDBPathMetadata md) {
      return ancestry.put(p, md);
    }

    DDBPathMetadata put(DDBPathMetadata md) {
      return ancestry.put(md.getFileStatus().getPath(), md);
    }

    DDBPathMetadata get(Path p) {
      return ancestry.get(p);
    }
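    /*
     * Illustrative sketch (hypothetical caller and variables): the
     * ancestry map is what lets a bulk operation write each ancestor
     * entry at most once:
     *
     *   AncestorState s =
     *       store.initiateBulkWrite(OperationType.Rename, dest);
     *   if (!s.contains(parentPath)) {
     *     // first sighting: write the parent entry, then record it
     *     s.put(parentMeta);
     *   }
     */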
    /**
     * Find an entry in the ancestor state, warning and optionally
     * raising an exception if there is a file at the path.
     * @param path path to look up
     * @param failOnFile fail if a file was found.
     * @return true iff a directory was found in the ancestor state.
     * @throws PathIOException if there was a file at the path.
     */
    boolean findEntry(
        final Path path,
        final boolean failOnFile) throws PathIOException {
      final DDBPathMetadata ancestor = get(path);
      if (ancestor != null) {
        // there's an entry in the ancestor state
        if (!ancestor.getFileStatus().isDirectory()) {
          // but: it's a file, which means this update is now inconsistent.
          final String message = E_INCONSISTENT_UPDATE + " entry is "
              + ancestor.getFileStatus();
          LOG.error(message);
          if (failOnFile) {
            // errors trigger failure
            throw new PathIOException(path.toString(), message);
          }
        }
        return true;
      } else {
        return false;
      }
    }

    /**
     * If debug logging is enabled, this does an audit of the store state.
     * It only logs the findings; the messages are constructed so that they
     * could be turned into exception texts.
     * Audit failures are not turned into IOEs because rename operations
     * delete the source entry, which then shows up in the ancestor state
     * as present.
     * @throws IOException failure
     */
    @Override
    public void close() throws IOException {
      if (LOG.isDebugEnabled() && store != null) {
        LOG.debug("Auditing {}", stateAsString(this));
        for (Map.Entry<Path, DDBPathMetadata> entry : ancestry
            .entrySet()) {
          Path path = entry.getKey();
          DDBPathMetadata expected = entry.getValue();
          if (expected.isDeleted()) {
            // file was deleted in bulk op; we don't care about it
            // any more
            continue;
          }
          DDBPathMetadata actual;
          try {
            actual = store.get(path);
          } catch (IOException e) {
            LOG.debug("Retrieving {}", path, e);
            // this is for debug; don't be ambitious
            return;
          }
          if (actual == null || actual.isDeleted()) {
            String message = "Metastore entry for path "
                + path + " deleted during bulk "
                + getOperation() + " operation";
            LOG.debug(message);
          } else {
            if (actual.getFileStatus().isDirectory() !=
                expected.getFileStatus().isDirectory()) {
              // the type of the entry has changed
              String message = "Metastore entry for path "
                  + path + " changed during bulk "
                  + getOperation() + " operation"
                  + " from " + expected
                  + " to " + actual;
              LOG.debug(message);
            }
          }
        }
      }
    }

    /**
     * Create a string from the state including operation and ID.
     * @param state state to use - may be null
     * @return a string for logging.
     */
    private static String stateAsString(@Nullable AncestorState state) {
      String stateStr;
      if (state != null) {
        stateStr = String.format("#(%s-%04d)",
            state.getOperation(),
            state.getId());
      } else {
        stateStr = "#()";
      }
      return stateStr;
    }
  }

  protected DynamoDBMetadataStoreTableManager getTableHandler() {
    Preconditions.checkNotNull(tableHandler, "Not initialized");
    return tableHandler;
  }
}