// io.kestra.plugin.aws.athena.Query (Maven / Gradle / Ivy source listing)
// The newest version!
package io.kestra.plugin.aws.athena;
import io.kestra.core.exceptions.IllegalVariableEvaluationException;
import io.kestra.core.models.annotations.Example;
import io.kestra.core.models.annotations.Plugin;
import io.kestra.core.models.annotations.PluginProperty;
import io.kestra.core.models.executions.metrics.Counter;
import io.kestra.core.models.tasks.Output;
import io.kestra.core.models.tasks.RunnableTask;
import io.kestra.core.models.tasks.common.FetchType;
import io.kestra.core.runners.RunContext;
import io.kestra.core.serializers.FileSerde;
import io.kestra.plugin.aws.AbstractConnection;
import io.kestra.plugin.aws.ConnectionUtils;
import io.swagger.v3.oas.annotations.media.Schema;
import jakarta.validation.constraints.NotNull;
import lombok.*;
import lombok.experimental.SuperBuilder;
import org.apache.commons.lang3.tuple.Pair;
import reactor.core.publisher.Flux;
import reactor.core.publisher.Mono;
import software.amazon.awssdk.services.athena.AthenaClient;
import software.amazon.awssdk.services.athena.model.*;
import java.io.*;
import java.math.BigDecimal;
import java.net.URI;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicLong;
import static io.kestra.core.utils.Rethrow.throwConsumer;
/**
* This Query task is built with the Athena SDK; more information can be found at
* https://docs.aws.amazon.com/athena/latest/ug/code-samples.html.
* A JDBC driver also exists.
*/
@SuperBuilder
@ToString
@EqualsAndHashCode
@Getter
@NoArgsConstructor
// User-facing task documentation. The property that controls waiting/fetching is `fetchType`
// (declared in this class), so the description must refer to it by that exact name.
@Schema(
    title = "Query an Athena table.",
    description = """
        The query will wait for completion, except if fetchType is set to `NONE`, and will output converted rows.
        Row conversion is based on the types listed [here](https://docs.aws.amazon.com/athena/latest/ug/data-types.html).
        Complex data types like array, map and struct will be converted to a string."""
)
@Plugin(
examples = {
@Example(
full = true,
code = {
"""
id: aws_athena_query
namespace: company.team
tasks:
- id: query
type: io.kestra.plugin.aws.athena.Query
accessKeyId: ""
secretKeyId: ""
region: "eu-central-1"
database: my_database
outputLocation: s3://some-s3-bucket
query: |
select * from cloudfront_logs limit 10
"""
}
)
}
)
public class Query extends AbstractConnection implements RunnableTask {
/**
* Athena catalog for the query. Optional: when null, no catalog is set on the query
* execution context; otherwise the value is rendered through the run context first.
*/
@Schema(title = "Athena catalog.")
@PluginProperty(dynamic = true)
private String catalog;
/**
* Athena database for the query. Rendered through the run context and set on the
* query execution context.
*/
@Schema(title = "Athena database.")
@NotNull
@PluginProperty(dynamic = true)
private String database;
/**
* Location where Athena writes the query results. Rendered through the run context and
* passed as the output location of the result configuration.
*/
@Schema(
title = "Athena output location.",
description = "The query results will be stored in this output location. Must be an existing S3 bucket."
)
@NotNull
@PluginProperty(dynamic = true)
private String outputLocation;
/**
* SQL text submitted to Athena. Rendered through the run context and used as the
* query string of the start-query-execution request.
*/
@Schema(title = "Athena SQL query.")
@NotNull
@PluginProperty(dynamic = true)
private String query;
/**
* How the result rows are exposed by the task. Defaults to {@code FetchType.STORE}.
* With {@code FetchType.NONE}, {@code run} returns right after submitting the query,
* with only the query execution identifier set on the output.
*/
@Schema(
title = "The way you want to store the data.",
description = "FETCH_ONE outputs the first row, "
+ "FETCH outputs all the rows, "
+ "STORE stores all rows in a file, "
+ "NONE does nothing — in this case, the task submits the query without waiting for its completion."
)
@NotNull
@PluginProperty
@Builder.Default
private FetchType fetchType = FetchType.STORE;
/**
* Whether to drop the first returned row before processing. Defaults to {@code true}.
* When enabled and the result list is non-empty, {@code run} removes the first row.
* NOTE(review): {@code @NotNull} on a primitive {@code boolean} can never be violated;
* confirm whether it is kept on purpose (e.g. for schema generation).
*/
@Schema(title = "Whether to skip the first row which is usually the header.")
@NotNull
@PluginProperty
@Builder.Default
private boolean skipHeader = true;
/**
* Shared formatter for the {@code yyyy-MM-dd} date pattern.
* Declared {@code final} so the class-wide instance cannot be reassigned; a
* {@link DateTimeFormatter} is itself immutable and thread-safe, so sharing it is safe.
* NOTE(review): presumably used to convert Athena date cells; the usage is outside this view.
*/
private static final DateTimeFormatter dateFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd");
/**
* Shared formatter for the {@code yyyy-MM-dd HH:mm:ss.SSSSSS} timestamp pattern
* (six fractional-second digits). Declared {@code final} for the same reason as
* {@code dateFormatter}.
* NOTE(review): presumably used to convert Athena timestamp cells; the usage is outside this view.
*/
private static final DateTimeFormatter timestampFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss.SSSSSS");
@Override
public QueryOutput run(RunContext runContext) throws Exception {
// The QueryExecutionContext allows us to set the database.
var queryExecutionContext = QueryExecutionContext.builder()
.catalog(catalog != null ? runContext.render(catalog) : null)
.database(runContext.render(database))
.build();
// The result configuration specifies where the results of the query should go.
var resultConfiguration = ResultConfiguration.builder()
.outputLocation(runContext.render(outputLocation))
.build();
var startQueryExecutionRequest = StartQueryExecutionRequest.builder()
.queryString(runContext.render(query))
.queryExecutionContext(queryExecutionContext)
.resultConfiguration(resultConfiguration)
.build();
try (var client = client(runContext)) {
var startQueryExecution = client.startQueryExecution(startQueryExecutionRequest);
runContext.logger().info("Query created with Athena execution identifier {}", startQueryExecution.queryExecutionId());
if (fetchType == FetchType.NONE) {
return QueryOutput.builder().queryExecutionId(startQueryExecution.queryExecutionId()).build();
}
var statistics = waitForQueryToComplete(client, startQueryExecution.queryExecutionId());
if (statistics != null) {
if (statistics.dataScannedInBytes() != null) {
runContext.metric(Counter.of("data.scanned.bytes", statistics.dataScannedInBytes()));
}
if (statistics.engineExecutionTimeInMillis() != null) {
runContext.metric(Counter.of("engine.execution.duration", statistics.engineExecutionTimeInMillis()));
}
if (statistics.queryPlanningTimeInMillis() != null) {
runContext.metric(Counter.of("query.planning.duration", statistics.queryPlanningTimeInMillis()));
}
if (statistics.queryQueueTimeInMillis() != null) {
runContext.metric(Counter.of("query.queue.duration", statistics.queryQueueTimeInMillis()));
}
if (statistics.serviceProcessingTimeInMillis() != null) {
runContext.metric(Counter.of("service.processing.duration", statistics.serviceProcessingTimeInMillis()));
}
if (statistics.totalExecutionTimeInMillis() != null) {
runContext.metric(Counter.of("total.execution.duration", statistics.totalExecutionTimeInMillis()));
}
}
var getQueryResult = GetQueryResultsRequest.builder()
.queryExecutionId(startQueryExecution.queryExecutionId())
.build();
var getQueryResultsResults = client.getQueryResults(getQueryResult);
List results = getQueryResultsResults.resultSet().rows();
if (skipHeader && results != null && !results.isEmpty()) {
// we skip the first row, this is usually needed as by default Athena returns the header as the first row
results = results.subList(1, results.size());
}
if (results != null) {
runContext.metric(Counter.of("total.rows", results.size()));
}
List columnInfo = getQueryResultsResults.resultSet().resultSetMetadata().columnInfo();
QueryOutput output = null;
if (fetchType == FetchType.FETCH_ONE) {
Map row = fetchOne(columnInfo, results);
output = QueryOutput.builder().row(row).size(row == null ? 0L : 1L).build();
}
else if (fetchType == FetchType.FETCH) {
List
// © 2015 - 2024 Weber Informatics LLC | Privacy Policy