// com.amazonaws.athena.connectors.dynamodb.DynamoDBMetadataHandler (Maven / Gradle / Ivy artifact page header)
/*-
* #%L
* athena-dynamodb
* %%
* Copyright (C) 2023 Amazon Web Services
* %%
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* #L%
*/
package com.amazonaws.athena.connectors.dynamodb;
import com.amazonaws.athena.connector.lambda.QueryStatusChecker;
import com.amazonaws.athena.connector.lambda.ThrottlingInvoker;
import com.amazonaws.athena.connector.lambda.data.Block;
import com.amazonaws.athena.connector.lambda.data.BlockAllocator;
import com.amazonaws.athena.connector.lambda.data.BlockWriter;
import com.amazonaws.athena.connector.lambda.data.SchemaBuilder;
import com.amazonaws.athena.connector.lambda.domain.Split;
import com.amazonaws.athena.connector.lambda.domain.TableName;
import com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet;
import com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation;
import com.amazonaws.athena.connector.lambda.exceptions.AthenaConnectorException;
import com.amazonaws.athena.connector.lambda.handlers.GlueMetadataHandler;
import com.amazonaws.athena.connector.lambda.metadata.GetDataSourceCapabilitiesRequest;
import com.amazonaws.athena.connector.lambda.metadata.GetDataSourceCapabilitiesResponse;
import com.amazonaws.athena.connector.lambda.metadata.GetSplitsRequest;
import com.amazonaws.athena.connector.lambda.metadata.GetSplitsResponse;
import com.amazonaws.athena.connector.lambda.metadata.GetTableLayoutRequest;
import com.amazonaws.athena.connector.lambda.metadata.GetTableRequest;
import com.amazonaws.athena.connector.lambda.metadata.GetTableResponse;
import com.amazonaws.athena.connector.lambda.metadata.ListSchemasRequest;
import com.amazonaws.athena.connector.lambda.metadata.ListSchemasResponse;
import com.amazonaws.athena.connector.lambda.metadata.ListTablesRequest;
import com.amazonaws.athena.connector.lambda.metadata.ListTablesResponse;
import com.amazonaws.athena.connector.lambda.metadata.glue.GlueFieldLexer;
import com.amazonaws.athena.connector.lambda.metadata.optimizations.OptimizationSubType;
import com.amazonaws.athena.connector.lambda.security.EncryptionKeyFactory;
import com.amazonaws.athena.connectors.dynamodb.constants.DynamoDBConstants;
import com.amazonaws.athena.connectors.dynamodb.credentials.CrossAccountCredentialsProviderV2;
import com.amazonaws.athena.connectors.dynamodb.model.DynamoDBIndex;
import com.amazonaws.athena.connectors.dynamodb.model.DynamoDBPaginatedTables;
import com.amazonaws.athena.connectors.dynamodb.model.DynamoDBTable;
import com.amazonaws.athena.connectors.dynamodb.qpt.DDBQueryPassthrough;
import com.amazonaws.athena.connectors.dynamodb.resolver.DynamoDBTableResolver;
import com.amazonaws.athena.connectors.dynamodb.util.DDBPredicateUtils;
import com.amazonaws.athena.connectors.dynamodb.util.DDBRecordMetadata;
import com.amazonaws.athena.connectors.dynamodb.util.DDBTableUtils;
import com.amazonaws.athena.connectors.dynamodb.util.DDBTypeUtils;
import com.amazonaws.athena.connectors.dynamodb.util.IncrementingValueNameProducer;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableMap;
import org.apache.arrow.vector.complex.reader.FieldReader;
import org.apache.arrow.vector.types.Types;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.Schema;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import software.amazon.awssdk.enhanced.dynamodb.document.EnhancedDocument;
import software.amazon.awssdk.services.athena.AthenaClient;
import software.amazon.awssdk.services.dynamodb.DynamoDbClient;
import software.amazon.awssdk.services.dynamodb.model.AttributeValue;
import software.amazon.awssdk.services.dynamodb.model.ExecuteStatementRequest;
import software.amazon.awssdk.services.dynamodb.model.ExecuteStatementResponse;
import software.amazon.awssdk.services.glue.GlueClient;
import software.amazon.awssdk.services.glue.model.Database;
import software.amazon.awssdk.services.glue.model.ErrorDetails;
import software.amazon.awssdk.services.glue.model.FederationSourceErrorCode;
import software.amazon.awssdk.services.glue.model.Table;
import software.amazon.awssdk.services.secretsmanager.SecretsManagerClient;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.TimeoutException;
import java.util.stream.Collectors;
import static com.amazonaws.athena.connector.lambda.metadata.ListTablesRequest.UNLIMITED_PAGE_SIZE_VALUE;
import static com.amazonaws.athena.connectors.dynamodb.constants.DynamoDBConstants.DEFAULT_SCHEMA;
import static com.amazonaws.athena.connectors.dynamodb.constants.DynamoDBConstants.EXPRESSION_NAMES_METADATA;
import static com.amazonaws.athena.connectors.dynamodb.constants.DynamoDBConstants.EXPRESSION_VALUES_METADATA;
import static com.amazonaws.athena.connectors.dynamodb.constants.DynamoDBConstants.HASH_KEY_NAME_METADATA;
import static com.amazonaws.athena.connectors.dynamodb.constants.DynamoDBConstants.INDEX_METADATA;
import static com.amazonaws.athena.connectors.dynamodb.constants.DynamoDBConstants.NON_KEY_FILTER_METADATA;
import static com.amazonaws.athena.connectors.dynamodb.constants.DynamoDBConstants.PARTITION_TYPE_METADATA;
import static com.amazonaws.athena.connectors.dynamodb.constants.DynamoDBConstants.QUERY_PARTITION_TYPE;
import static com.amazonaws.athena.connectors.dynamodb.constants.DynamoDBConstants.RANGE_KEY_FILTER_METADATA;
import static com.amazonaws.athena.connectors.dynamodb.constants.DynamoDBConstants.RANGE_KEY_NAME_METADATA;
import static com.amazonaws.athena.connectors.dynamodb.constants.DynamoDBConstants.SCAN_PARTITION_TYPE;
import static com.amazonaws.athena.connectors.dynamodb.constants.DynamoDBConstants.SEGMENT_COUNT_METADATA;
import static com.amazonaws.athena.connectors.dynamodb.constants.DynamoDBConstants.SEGMENT_ID_PROPERTY;
import static com.amazonaws.athena.connectors.dynamodb.constants.DynamoDBConstants.TABLE_METADATA;
import static com.amazonaws.athena.connectors.dynamodb.throttling.DynamoDBExceptionFilter.EXCEPTION_FILTER;
import static com.amazonaws.athena.connectors.dynamodb.util.DDBTableUtils.SCHEMA_INFERENCE_NUM_RECORDS;
/**
* Handles metadata requests for the Athena DynamoDB Connector.
*
* For more detail, please see the module's README.md, some notable characteristics of this class include:
*
* 1. Glue DataCatalog is used for schema information by default unless disabled. If disabled or the table
* is not found, it falls back to doing a small table scan and derives a schema from that.
* 2. Determines if the data splits will need to perform DDB Queries or Scans.
* 3. Splits up the hash key into distinct Query splits if possible, otherwise falls back to creating Scan splits.
* 4. Also determines the best index to use (if available) if the available predicates align with Key Attributes.
* 5. Creates scan splits that support Parallel Scan and tries to choose the optimal number of splits.
* 6. Pushes down all other predicates into ready-to-use filter expressions to pass to DDB.
*/
public class DynamoDBMetadataHandler
extends GlueMetadataHandler
{
@VisibleForTesting
// upper bound on the number of splits returned per GetSplits invocation; the remainder is paginated via a continuation token
static final int MAX_SPLITS_PER_REQUEST = 1000;
private static final Logger logger = LoggerFactory.getLogger(DynamoDBMetadataHandler.class);
// marker value looked for in Glue table metadata (classification / storage location) to identify DDB-backed tables
static final String DYNAMODB = "dynamodb";
// identifies this connector as the source of a split/spill for the Athena federation SDK
private static final String SOURCE_TYPE = "ddb";
// defines the value that should be present in the Glue Database URI to enable the DB for DynamoDB.
static final String DYNAMO_DB_FLAG = "dynamo-db-flag";
// used to filter out Glue tables which lack indications of being used for DDB.
private static final TableFilter TABLE_FILTER = (Table table) -> table.storageDescriptor().location().contains(DYNAMODB)
|| (table.parameters() != null && DYNAMODB.equals(table.parameters().get("classification")))
|| (table.storageDescriptor().parameters() != null && DYNAMODB.equals(table.storageDescriptor().parameters().get("classification")));
// used to filter out Glue databases which lack the DYNAMO_DB_FLAG in the URI.
private static final DatabaseFilter DB_FILTER = (Database database) -> (database.locationUri() != null && database.locationUri().contains(DYNAMO_DB_FLAG));
// retries DynamoDB calls that fail with throttling-related exceptions (see EXCEPTION_FILTER)
private final ThrottlingInvoker invoker;
private final DynamoDbClient ddbClient;
// may be null when Glue is disabled; all Glue lookups below are guarded on this
private final GlueClient glueClient;
// resolves case-insensitive table names and infers schemas directly from DynamoDB
private final DynamoDBTableResolver tableResolver;
// handles the PartiQL query-passthrough capability
private final DDBQueryPassthrough queryPassthrough;
/**
 * Constructor used by the Lambda runtime. Builds a DynamoDB client (assuming a cross-account
 * role if one is configured) and wires up throttling, table resolution, and query passthrough.
 *
 * <p>Restores the {@code Map<String, String>} type argument that was stripped from the
 * published source; the parent {@code GlueMetadataHandler} constructor expects it.</p>
 *
 * @param configOptions the connector's configuration options (typically the Lambda environment)
 */
public DynamoDBMetadataHandler(java.util.Map<String, String> configOptions)
{
    super(SOURCE_TYPE, configOptions);
    this.ddbClient = DynamoDbClient.builder()
            .credentialsProvider(CrossAccountCredentialsProviderV2.getCrossAccountCredentialsIfPresent(configOptions, "DynamoDBMetadataHandler_CrossAccountRoleSession"))
            .build();
    this.glueClient = getAwsGlue();
    // retry DDB calls that fail due to throttling, per EXCEPTION_FILTER
    this.invoker = ThrottlingInvoker.newDefaultBuilder(EXCEPTION_FILTER, configOptions).build();
    this.tableResolver = new DynamoDBTableResolver(invoker, ddbClient);
    this.queryPassthrough = new DDBQueryPassthrough();
}
/**
 * Testing constructor allowing every dependency to be injected.
 *
 * <p>Restores the {@code Map<String, String>} type argument that was stripped from the
 * published source.</p>
 *
 * @param keyFactory supplies encryption keys for spilled data
 * @param secretsManager used by the parent handler to resolve secrets
 * @param athena used by the parent handler to check query status
 * @param spillBucket S3 bucket used for spilled results
 * @param spillPrefix S3 prefix used for spilled results
 * @param ddbClient injected DynamoDB client
 * @param glueClient injected Glue client (may be null to disable Glue)
 * @param configOptions the connector's configuration options
 */
@VisibleForTesting
DynamoDBMetadataHandler(
        EncryptionKeyFactory keyFactory,
        SecretsManagerClient secretsManager,
        AthenaClient athena,
        String spillBucket,
        String spillPrefix,
        DynamoDbClient ddbClient,
        GlueClient glueClient,
        java.util.Map<String, String> configOptions)
{
    super(glueClient, keyFactory, secretsManager, athena, SOURCE_TYPE, spillBucket, spillPrefix, configOptions);
    this.glueClient = glueClient;
    this.ddbClient = ddbClient;
    // retry DDB calls that fail due to throttling, per EXCEPTION_FILTER
    this.invoker = ThrottlingInvoker.newDefaultBuilder(EXCEPTION_FILTER, configOptions).build();
    this.tableResolver = new DynamoDBTableResolver(invoker, ddbClient);
    this.queryPassthrough = new DDBQueryPassthrough();
}
/**
 * Reports the optional capabilities this connector supports (currently only PartiQL
 * query passthrough, when enabled via configuration).
 *
 * <p>Fix: the published source shows {@code ImmutableMap.Builder>}, a garbled raw type —
 * the builder is keyed by capability name mapping to its supported optimization subtypes.</p>
 *
 * @param allocator unused; part of the handler interface
 * @param request identifies the catalog being queried
 * @return the set of capabilities keyed by capability name
 */
@Override
public GetDataSourceCapabilitiesResponse doGetDataSourceCapabilities(BlockAllocator allocator, GetDataSourceCapabilitiesRequest request)
{
    ImmutableMap.Builder<String, List<OptimizationSubType>> capabilities = ImmutableMap.builder();
    this.queryPassthrough.addQueryPassthroughCapabilityIfEnabled(capabilities, this.configOptions);
    return new GetDataSourceCapabilitiesResponse(request.getCatalogName(), capabilities.build());
}
/**
 * Since DynamoDB does not have "schemas" or "databases", this lists all the Glue databases (if not
 * disabled) that contain {@value #DYNAMO_DB_FLAG} in their URIs. Otherwise returns just a "default" schema.
 *
 * <p>Fix: restores the {@code Set<String>} type argument stripped from the published source.
 * A Glue failure is deliberately non-fatal — the connector degrades to the default schema.</p>
 *
 * @see GlueMetadataHandler
 */
@Override
public ListSchemasResponse doListSchemaNames(BlockAllocator allocator, ListSchemasRequest request)
        throws Exception
{
    // LinkedHashSet: Glue-discovered schemas first, then the default schema, without duplicates
    Set<String> combinedSchemas = new LinkedHashSet<>();
    if (glueClient != null) {
        try {
            combinedSchemas.addAll(super.doListSchemaNames(allocator, request, DB_FILTER).getSchemas());
        }
        catch (RuntimeException e) {
            // best effort: fall through to the default schema rather than failing the request
            logger.warn("doListSchemaNames: Unable to retrieve schemas from AWSGlue.", e);
        }
    }
    combinedSchemas.add(DEFAULT_SCHEMA);
    return new ListSchemasResponse(request.getCatalogName(), combinedSchemas);
}
/**
 * Lists all Glue tables (if not disabled) in the schema specified that indicate use for DynamoDB metadata.
 * Indications for DynamoDB use in Glue are:
 * 1. The top level table properties/parameters contains a key called "classification" with value {@value #DYNAMODB}.
 * 2. Or the storage descriptor's location field contains {@value #DYNAMODB}.
 * 3. Or the storage descriptor has a parameter called "classification" with value {@value #DYNAMODB}.
 *
 * If the specified schema is "default", this also returns an intersection with actual tables in DynamoDB.
 * Pagination only implemented for DynamoDBTableResolver.listTables()
 *
 * <p>Fix: restores the {@code Set<TableName>} / {@code List<TableName>} type arguments
 * stripped from the published source.</p>
 *
 * @see GlueMetadataHandler
 */
@Override
public ListTablesResponse doListTables(BlockAllocator allocator, ListTablesRequest request)
        throws Exception
{
    // LinkedHashSet for consistent ordering
    Set<TableName> combinedTables = new LinkedHashSet<>();
    String token = request.getNextToken();
    if (token == null && glueClient != null) { // first invocation will get ALL glue tables in one shot
        try {
            // does not validate that the tables are actually DDB tables
            combinedTables.addAll(super.doListTables(allocator, new ListTablesRequest(request.getIdentity(), request.getQueryId(), request.getCatalogName(),
                    request.getSchemaName(), null, UNLIMITED_PAGE_SIZE_VALUE), TABLE_FILTER).getTables());
        }
        catch (RuntimeException e) {
            // best effort: Glue unavailability should not block listing real DDB tables below
            logger.warn("doListTables: Unable to retrieve tables from AWSGlue in database/schema {}", request.getSchemaName(), e);
        }
    }
    // future invocations will paginate on default ddb schema
    // add tables that may not be in Glue (if listing the default schema)
    if (DynamoDBConstants.DEFAULT_SCHEMA.equals(request.getSchemaName())) {
        DynamoDBPaginatedTables ddbPaginatedResponse = tableResolver.listTables(request.getNextToken(), request.getPageSize());
        List<TableName> tableNames = ddbPaginatedResponse.getTables().stream()
                .map(table -> table.toLowerCase(Locale.ENGLISH)) // lowercase for compatibility
                .map(table -> new TableName(DEFAULT_SCHEMA, table))
                .collect(Collectors.toList());
        token = ddbPaginatedResponse.getToken();
        combinedTables.addAll(tableNames);
    }
    return new ListTablesResponse(request.getCatalogName(), new ArrayList<>(combinedTables), token);
}
/**
 * Infers the schema for a PartiQL query-passthrough request by executing the statement with a
 * small record limit and building a schema from the returned items.
 *
 * <p>Fix: the original error message concatenated an SLF4J-style {@code {}} placeholder with
 * the request ({@code "No Query passed through [{}]" + request}), producing
 * {@code "No Query passed through [{}]<request>"}; the placeholder is replaced with proper
 * concatenation.</p>
 *
 * @param allocator unused; part of the handler interface
 * @param request must carry query-passthrough arguments including the PartiQL statement
 * @return the inferred schema for the passthrough query
 * @throws AthenaConnectorException if the request is not a query-passthrough request
 * @throws Exception if statement verification or execution fails
 */
@Override
public GetTableResponse doGetQueryPassthroughSchema(BlockAllocator allocator, GetTableRequest request) throws Exception
{
    if (!request.isQueryPassthrough()) {
        String message = "No Query passed through [" + request + "]";
        throw new AthenaConnectorException(message, ErrorDetails.builder().errorCode(FederationSourceErrorCode.INVALID_INPUT_EXCEPTION.toString()).errorMessage(message).build());
    }
    queryPassthrough.verify(request.getQueryPassthroughArguments());
    String partiQLStatement = request.getQueryPassthroughArguments().get(DDBQueryPassthrough.QUERY);
    ExecuteStatementRequest executeStatementRequest =
            ExecuteStatementRequest.builder()
                    .statement(partiQLStatement)
                    .limit(SCHEMA_INFERENCE_NUM_RECORDS)
                    .build();
    //PartiQL on DynamoDB Doesn't allow a dry run; therefore, we look "Peek" over the first few records
    ExecuteStatementResponse response = ddbClient.executeStatement(executeStatementRequest);
    SchemaBuilder schemaBuilder = DDBTableUtils.buildSchemaFromItems(response.items());
    return new GetTableResponse(request.getCatalogName(), request.getTableName(), schemaBuilder.build(), Collections.emptySet());
}
/**
 * Resolves a table's schema, preferring the Glue DataCatalog when available and falling back
 * to inferring a schema from a small sample of the table's items when Glue is disabled or the
 * lookup fails.
 *
 * @param allocator unused; part of the handler interface
 * @param request identifies the catalog/schema/table being described
 * @return the table's schema, from Glue or inferred from DynamoDB
 * @throws Exception if schema inference against DynamoDB fails
 * @see GlueMetadataHandler
 */
@Override
public GetTableResponse doGetTable(BlockAllocator allocator, GetTableRequest request)
        throws Exception
{
    TableName tableName = request.getTableName();
    if (glueClient != null) {
        try {
            // does not validate that the table is actually a DDB table
            return super.doGetTable(allocator, request);
        }
        catch (RuntimeException e) {
            logger.warn("doGetTable: Unable to retrieve table {} from AWSGlue in database/schema {}. " +
                    "Falling back to schema inference. If inferred schema is incorrect, create " +
                    "a matching table in Glue to define schema (see README)",
                    tableName.getTableName(), tableName.getSchemaName(), e);
        }
    }
    // ignore database/schema name since there are no databases/schemas in DDB
    Schema inferredSchema = tableResolver.getTableSchema(tableName.getTableName());
    return new GetTableResponse(request.getCatalogName(), tableName, inferredSchema);
}
/**
* Generates a partition schema with metadata derived from available predicates. This metadata will be
* copied to splits in the #doGetSplits call. At this point it is determined whether we can partition
* by hash key or fall back to a full table scan.
*
* @see GlueMetadataHandler
*/
@Override
public void enhancePartitionSchema(SchemaBuilder partitionSchemaBuilder, GetTableLayoutRequest request)
{
if (request.getTableName().getQualifiedTableName().equalsIgnoreCase(queryPassthrough.getFunctionSignature())) {
//Query passthrough does not support partition
return;
}
// use the source table name from the schema if available (in case Glue table name != actual table name)
String tableName = getSourceTableName(request.getSchema());
if (tableName == null) {
tableName = request.getTableName().getTableName();
}
DynamoDBTable table = null;
try {
table = tableResolver.getTableMetadata(tableName);
}
catch (TimeoutException e) {
throw new AthenaConnectorException(e.getMessage(), ErrorDetails.builder().errorCode(FederationSourceErrorCode.OPERATION_TIMEOUT_EXCEPTION.toString()).errorMessage(e.getMessage()).build());
}
// add table name so we don't have to do case insensitive resolution again
partitionSchemaBuilder.addMetadata(TABLE_METADATA, table.getName());
Map summary = request.getConstraints().getSummary();
List requestedCols = request.getSchema().getFields().stream().map(Field::getName).collect(Collectors.toList());
DynamoDBIndex index = DDBPredicateUtils.getBestIndexForPredicates(table, requestedCols, summary);
logger.info("using index: {}", index.getName());
String hashKeyName = index.getHashKey();
ValueSet hashKeyValueSet = summary.get(hashKeyName);
List