package io.kestra.plugin.aws.athena;

import io.kestra.core.exceptions.IllegalVariableEvaluationException;
import io.kestra.core.models.annotations.Example;
import io.kestra.core.models.annotations.Plugin;
import io.kestra.core.models.annotations.PluginProperty;
import io.kestra.core.models.executions.metrics.Counter;
import io.kestra.core.models.tasks.Output;
import io.kestra.core.models.tasks.RunnableTask;
import io.kestra.core.models.tasks.common.FetchType;
import io.kestra.core.runners.RunContext;
import io.kestra.core.serializers.FileSerde;
import io.kestra.plugin.aws.AbstractConnection;
import io.kestra.plugin.aws.ConnectionUtils;
import io.swagger.v3.oas.annotations.media.Schema;
import jakarta.validation.constraints.NotNull;
import lombok.*;
import lombok.experimental.SuperBuilder;
import org.apache.commons.lang3.tuple.Pair;
import reactor.core.publisher.Flux;
import reactor.core.publisher.Mono;
import software.amazon.awssdk.services.athena.AthenaClient;
import software.amazon.awssdk.services.athena.model.*;

import java.io.*;
import java.math.BigDecimal;
import java.net.URI;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicLong;

import static io.kestra.core.utils.Rethrow.throwConsumer;

/**
 * This Query task is built with the Athena SDK; more information can be found at https://docs.aws.amazon.com/athena/latest/ug/code-samples.html.
 * A JDBC driver also exists.
 */
@SuperBuilder
@ToString
@EqualsAndHashCode
@Getter
@NoArgsConstructor
@Schema(
    title = "Query an Athena table.",
    description = """
        The query will wait for completion, except when `fetchType` is set to `NONE`, and will output converted rows.
        Row conversion is based on the types listed [here](https://docs.aws.amazon.com/athena/latest/ug/data-types.html).
        Complex data types like array, map and struct will be converted to a string."""
)
@Plugin(
    examples = {
        @Example(
            full = true,
            code = {
                """
                id: aws_athena_query
                namespace: company.team

                tasks:
                  - id: query
                    type: io.kestra.plugin.aws.athena.Query
                    accessKeyId: ""
                    secretKeyId: ""
                    region: "eu-central-1"
                    database: my_database
                    outputLocation: s3://some-s3-bucket
                    query: |
                      select * from cloudfront_logs limit 10
                """
            }
        )
    }
)
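// Illustrative only, not one of the plugin's registered examples: with `fetchType: FETCH` the
// converted rows are returned directly in the task output and can be read by downstream tasks.
// The downstream Log task type and the flow layout below are assumptions for illustration;
// credentials and region are omitted for brevity.
//
//   tasks:
//     - id: query
//       type: io.kestra.plugin.aws.athena.Query
//       database: my_database
//       outputLocation: s3://some-s3-bucket
//       fetchType: FETCH
//       query: |
//         select * from cloudfront_logs limit 10
//     - id: log_rows
//       type: io.kestra.plugin.core.log.Log
//       message: "{{ outputs.query.rows }}"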
public class Query extends AbstractConnection implements RunnableTask<Query.QueryOutput> {
    @Schema(title = "Athena catalog.")
    @PluginProperty(dynamic = true)
    private String catalog;

    @Schema(title = "Athena database.")
    @NotNull
    @PluginProperty(dynamic = true)
    private String database;

    @Schema(
        title = "Athena output location.",
        description = "The query results will be stored in this output location. Must be an existing S3 bucket."
    )
    @NotNull
    @PluginProperty(dynamic = true)
    private String outputLocation;

    @Schema(title = "Athena SQL query.")
    @NotNull
    @PluginProperty(dynamic = true)
    private String query;

    @Schema(
        title = "The way you want to store the data.",
        description = "FETCH_ONE outputs the first row, "
            + "FETCH outputs all the rows, "
            + "STORE stores all rows in a file, "
            + "NONE does nothing — in this case, the task submits the query without waiting for its completion."
    )
    @NotNull
    @PluginProperty
    @Builder.Default
    private FetchType fetchType = FetchType.STORE;

    @Schema(title = "Whether to skip the first row, which is usually the header.")
    @NotNull
    @PluginProperty
    @Builder.Default
    private boolean skipHeader = true;


    private static final DateTimeFormatter dateFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd");
    private static final DateTimeFormatter timestampFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss.SSSSSS");

    @Override
    public QueryOutput run(RunContext runContext) throws Exception {
        // The QueryExecutionContext allows us to set the database.
        var queryExecutionContext = QueryExecutionContext.builder()
            .catalog(catalog != null ? runContext.render(catalog) : null)
            .database(runContext.render(database))
            .build();

        // The result configuration specifies where the results of the query should go.
        var resultConfiguration = ResultConfiguration.builder()
            .outputLocation(runContext.render(outputLocation))
            .build();

        var startQueryExecutionRequest = StartQueryExecutionRequest.builder()
            .queryString(runContext.render(query))
            .queryExecutionContext(queryExecutionContext)
            .resultConfiguration(resultConfiguration)
            .build();

        try (var client = client(runContext)) {
            var startQueryExecution = client.startQueryExecution(startQueryExecutionRequest);
            runContext.logger().info("Query created with Athena execution identifier {}", startQueryExecution.queryExecutionId());
            if (fetchType == FetchType.NONE) {
                return QueryOutput.builder().queryExecutionId(startQueryExecution.queryExecutionId()).build();
            }

            var statistics = waitForQueryToComplete(client, startQueryExecution.queryExecutionId());
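            // Expose Athena's query execution statistics as Kestra metrics when they are available.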
            if (statistics != null) {
                if (statistics.dataScannedInBytes() != null) {
                    runContext.metric(Counter.of("data.scanned.bytes", statistics.dataScannedInBytes()));
                }

                if (statistics.engineExecutionTimeInMillis() != null) {
                    runContext.metric(Counter.of("engine.execution.duration", statistics.engineExecutionTimeInMillis()));
                }

                if (statistics.queryPlanningTimeInMillis() != null) {
                    runContext.metric(Counter.of("query.planning.duration", statistics.queryPlanningTimeInMillis()));
                }

                if (statistics.queryQueueTimeInMillis() != null) {
                    runContext.metric(Counter.of("query.queue.duration", statistics.queryQueueTimeInMillis()));
                }

                if (statistics.serviceProcessingTimeInMillis() != null) {
                    runContext.metric(Counter.of("service.processing.duration", statistics.serviceProcessingTimeInMillis()));
                }

                if (statistics.totalExecutionTimeInMillis() != null) {
                    runContext.metric(Counter.of("total.execution.duration", statistics.totalExecutionTimeInMillis()));
                }
            }

            var getQueryResult = GetQueryResultsRequest.builder()
                .queryExecutionId(startQueryExecution.queryExecutionId())
                .build();
            var getQueryResultsResults = client.getQueryResults(getQueryResult);
            List<Row> results = getQueryResultsResults.resultSet().rows();
            if (skipHeader && results != null && !results.isEmpty()) {
                // Skip the first row: by default, Athena returns the column headers as the first row of the result set.
                results = results.subList(1, results.size());
            }

            if (results != null) {
                runContext.metric(Counter.of("total.rows", results.size()));
            }

            List<ColumnInfo> columnInfo = getQueryResultsResults.resultSet().resultSetMetadata().columnInfo();
            QueryOutput output = null;
            if (fetchType == FetchType.FETCH_ONE) {
                Map<String, Object> row = fetchOne(columnInfo, results);
                output = QueryOutput.builder().row(row).size(row == null ? 0L : 1L).build();
            }
            else if (fetchType == FetchType.FETCH) {
                List<Object> rows = fetch(columnInfo, results);
                output = QueryOutput.builder().rows(rows).size((long) rows.size()).build();
            }
            else if (fetchType == FetchType.STORE) {
                Pair<URI, Long> pair = store(columnInfo, results, runContext);
                output = QueryOutput.builder().uri(pair.getLeft()).size(pair.getRight()).build();
            }

            if (output != null) {
                runContext.metric(Counter.of("fetch.rows", output.getSize()));
            }
            return output;
        }
    }

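    // Builds a synchronous AthenaClient from this task's AWS connection configuration,
    // rendered against the current run context.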
    private AthenaClient client(final RunContext runContext) throws IllegalVariableEvaluationException {
        AwsClientConfig clientConfig = awsClientConfig(runContext);
        return ConnectionUtils.configureSyncClient(clientConfig, AthenaClient.builder()).build();
    }

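    // Polls GetQueryExecution until the query reaches a terminal state: it sleeps 500 ms between
    // polls, returns the execution statistics once the query has SUCCEEDED, and throws if the
    // query FAILED, was CANCELLED, or reports a state unknown to this SDK version.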
    public QueryExecutionStatistics waitForQueryToComplete(AthenaClient client, String queryExecutionId) throws InterruptedException {
        var getQueryExecutionRequest = GetQueryExecutionRequest.builder()
            .queryExecutionId(queryExecutionId)
            .build();

        QueryExecution queryExecution;
        do {
            var getQueryExecution = client.getQueryExecution(getQueryExecutionRequest);
            queryExecution = getQueryExecution.queryExecution();
            switch (queryExecution.status().state()) {
                case FAILED -> throw new RuntimeException("Amazon Athena query failed to run with error message: " +
                    getQueryExecution.queryExecution().status().stateChangeReason());
                case CANCELLED -> throw new RuntimeException("Amazon Athena query was cancelled.");
                case UNKNOWN_TO_SDK_VERSION -> throw new RuntimeException("Amazon Athena failed for an unknown reason.");
                case SUCCEEDED -> {}
                default -> Thread.sleep(500);
            }
        }
        while (queryExecution.status().state() != QueryExecutionState.SUCCEEDED);

        return queryExecution != null ? queryExecution.statistics() : null;
    }

    private Map<String, Object> fetchOne(List<ColumnInfo> columnInfo, List<Row> results) {
        if (results == null || results.isEmpty()) {
            return null;
        }

        Row row = results.get(0);
        return map(columnInfo, row);
    }

    private List<Object> fetch(List<ColumnInfo> columnInfo, List<Row> results) {
        if (results == null || results.isEmpty()) {
            return Collections.emptyList();
        }

        return results.stream().map(row -> (Object) map(columnInfo, row)).toList();
    }

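    // Converts each row, serializes the result to an Ion file with FileSerde, and uploads the file
    // to Kestra's internal storage, returning the storage URI together with the number of rows written.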
    private Pair<URI, Long> store(List<ColumnInfo> columnInfo, List<Row> results, RunContext runContext) throws IOException {
        if (results == null || results.isEmpty()) {
            return Pair.of(null, 0L);
        }

        File tempFile = runContext.workingDir().createTempFile(".ion").toFile();

        try (var output = new BufferedWriter(new FileWriter(tempFile), FileSerde.BUFFER_SIZE)) {
            Long count = FileSerde.writeAll(output, Flux.fromIterable(results).map(row -> map(columnInfo, row))).block();
            return Pair.of(
                runContext.storage().putFile(tempFile),
                count
            );
        }
    }

    private Map<String, Object> map(List<ColumnInfo> columnInfo, Row row) {
        if (!row.hasData()) {
            return null;
        }

        Map<String, Object> data = new HashMap<>();
        for (int i = 0; i < columnInfo.size(); i++) {
            data.put(columnInfo.get(i).name(), mapCell(columnInfo.get(i), row.data().get(i)));
        }
        return data;
    }

    private Object mapCell(ColumnInfo columnInfo, Datum datum) {
        // We try our best to convert the result to a precise type as all data comes as a varchar.
        // See https://docs.aws.amazon.com/athena/latest/ug/data-types.html for the list of supported types.
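        // For example, an "int" column whose Datum carries the varchar value "42" is returned as the Integer 42.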
        return switch (columnInfo.type()) {
            case "boolean" -> Boolean.valueOf(datum.varCharValue());
            case "tinyint", "smallint", "int", "integer" -> Integer.valueOf(datum.varCharValue());
            case "bigint" -> Long.valueOf(datum.varCharValue());
            case "double" -> Double.valueOf(datum.varCharValue());
            case "float" -> Float.valueOf(datum.varCharValue());
            case "decimal" -> new BigDecimal(datum.varCharValue());
            case "date" -> LocalDate.parse(datum.varCharValue(), dateFormatter);
            case "timestamp" -> LocalDateTime.parse(datum.varCharValue(), timestampFormatter);
            // The default case covers the char, varchar, string, binary, array, map, and struct types.
            default -> datum.varCharValue();
        };
    }

    @Builder
    @Getter
    public static class QueryOutput implements Output {

        @Schema(title = "The query execution identifier.")
        private String queryExecutionId;

        @Schema(
            title = "List containing the fetched data.",
            description = "Only populated if `fetchType=FETCH`."
        )
        private List<Object> rows;

        @Schema(
            title = "Map containing the first row of fetched data.",
            description = "Only populated if `fetchType=FETCH_ONE`."
        )
        private Map<String, Object> row;

        @Schema(
            title = "The URI of stored data.",
            description = "Only populated if `fetchType=STORE`."
        )
        private URI uri;

        @Schema(
            title = "The number of fetched rows."
        )
        private Long size;
    }
}