
io.kestra.plugin.gcp.bigquery.Query (from the plugin-gcp artifact)
Integrate Google Cloud Platform services with Kestra data workflows.
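To consume this artifact from a build, declare a dependency on plugin-gcp. A minimal Gradle (Kotlin DSL) sketch; the io.kestra.plugin group id is inferred from this artifact listing, and the version placeholder must be replaced with a published release:
// build.gradle.kts (sketch): coordinates inferred from this listing;
// substitute a published plugin-gcp version for the placeholder.
dependencies {
    implementation("io.kestra.plugin:plugin-gcp:<version>")
}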
package io.kestra.plugin.gcp.bigquery;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.google.cloud.bigquery.*;
import com.google.common.collect.ImmutableMap;
import io.kestra.core.serializers.JacksonMapper;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.*;
import lombok.experimental.SuperBuilder;
import io.kestra.core.exceptions.IllegalVariableEvaluationException;
import io.kestra.core.models.annotations.Example;
import io.kestra.core.models.annotations.Metric;
import io.kestra.core.models.annotations.Plugin;
import io.kestra.core.models.annotations.PluginProperty;
import io.kestra.core.models.executions.metrics.Counter;
import io.kestra.core.models.executions.metrics.Timer;
import io.kestra.core.models.tasks.RunnableTask;
import io.kestra.core.runners.RunContext;
import io.kestra.core.serializers.FileSerde;
import org.slf4j.Logger;
import reactor.core.publisher.Flux;
import reactor.core.publisher.FluxSink;
import reactor.core.publisher.Mono;
import java.io.*;
import java.net.URI;
import java.time.Duration;
import java.time.Instant;
import java.time.LocalDate;
import java.time.LocalTime;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
import static io.kestra.core.utils.Rethrow.throwConsumer;
import static io.kestra.core.utils.Rethrow.throwFunction;
@SuperBuilder
@ToString
@EqualsAndHashCode
@Getter
@NoArgsConstructor
@Plugin(
examples = {
@Example(
title = "Create a table with a custom query.",
full = true,
code = """
id: gcp_bq_query
namespace: company.team
tasks:
- id: query
type: io.kestra.plugin.gcp.bigquery.Query
destinationTable: "my_project.my_dataset.my_table"
writeDisposition: WRITE_APPEND
sql: |
SELECT
"hello" as string,
NULL AS `nullable`,
1 as int,
1.25 AS float,
DATE("2008-12-25") AS date,
DATETIME "2008-12-25 15:30:00.123456" AS datetime,
TIME(DATETIME "2008-12-25 15:30:00.123456") AS time,
TIMESTAMP("2008-12-25 15:30:00.123456") AS timestamp,
ST_GEOGPOINT(50.6833, 2.9) AS geopoint,
ARRAY(SELECT 1 UNION ALL SELECT 2 UNION ALL SELECT 3) AS `array`,
STRUCT(4 AS x, 0 AS y, ARRAY(SELECT 1 UNION ALL SELECT 2 UNION ALL SELECT 3) AS z) AS `struct`
"""
),
@Example(
title = "Execute a query and fetch results sets on another task.",
full = true,
code = """
id: gcp_bq_query
namespace: company.team
tasks:
- id: fetch
type: io.kestra.plugin.gcp.bigquery.Query
fetch: true
sql: |
SELECT 1 as id, "John" as name
UNION ALL
SELECT 2 as id, "Doe" as name
- id: use_fetched_data
type: io.kestra.plugin.core.debug.Return
format: |
{% for row in outputs.fetch.rows %}
id : {{ row.id }}, name: {{ row.name }}
{% endfor %}
"""
)
},
metrics = {
@Metric(name = "cache.hit", type = Counter.TYPE, description= "Whether the query result was fetched from the query cache."),
@Metric(name = "duration", type = Timer.TYPE, description = "The time it took for the query to run."),
@Metric(name = "estimated.bytes.processed", type = Counter.TYPE, unit = "bytes", description = "The original estimate of bytes processed for the query."),
@Metric(name = "total.bytes.billed", type = Counter.TYPE, unit = "bytes", description = "The total number of bytes billed for the query."),
@Metric(name = "total.bytes.processed", type = Counter.TYPE, unit = "bytes", description = "The total number of bytes processed by the query."),
@Metric(name = "total.partitions.processed", type = Counter.TYPE, unit = "partitions", description = "The totla number of partitions processed from all partitioned tables referenced in the job."),
@Metric(name = "total.slot.ms", type = Counter.TYPE, description = "The slot-milliseconds consumed by the query."),
@Metric(name = "num.dml.affected.rows", type = Counter.TYPE, unit = "records", description="The number of rows affected by a DML statement. Present only for DML statements INSERT, UPDATE or DELETE."),
@Metric(name = "referenced.tables", type = Counter.TYPE, description="The number of tables referenced by the query."),
@Metric(name = "num.child.jobs", type = Counter.TYPE, description="The number of child jobs executed by the query."),
}
)
@Schema(
title = "Execute BigQuery SQL query in a specific BigQuery database."
)
@StoreFetchValidation
@StoreFetchDestinationValidation
public class Query extends AbstractJob implements RunnableTask<Query.Output>, QueryInterface {
private String sql;
@Builder.Default
private boolean legacySql = false;
@Builder.Default
private boolean fetch = false;
@Builder.Default
private boolean store = false;
@Builder.Default
private boolean fetchOne = false;
// private List positionalParameters;
// private Map namedParameters;
@Schema(
title = "The clustering specification for the destination table."
)
@PluginProperty(dynamic = true)
private List<String> clusteringFields;
@Schema(
title = "[Experimental] Options allowing the schema of the destination table to be updated as a side effect of the query job.",
description = " Schema update options are supported in two cases: " +
"* when writeDisposition is WRITE_APPEND; \n" +
"* when writeDisposition is WRITE_TRUNCATE and the destination" +
" table is a partition of a table, specified by partition decorators. " +
"\n" +
" For normal tables, WRITE_TRUNCATE will always overwrite the schema."
)
@PluginProperty
private List<JobInfo.SchemaUpdateOption> schemaUpdateOptions;
@Schema(
title = "The time partitioning field for the destination table."
)
@PluginProperty(dynamic = true)
private String timePartitioningField;
@Schema(
title = "The time partitioning type specification."
)
@PluginProperty(dynamic = true)
@Builder.Default
private TimePartitioning.Type timePartitioningType = TimePartitioning.Type.DAY;
@Schema(
title = "Range partitioning field for the destination table."
)
@PluginProperty(dynamic = true)
private String rangePartitioningField;
@Schema(
title = "The start of range partitioning, inclusive."
)
@PluginProperty(dynamic = true)
private Long rangePartitioningStart;
@Schema(
title = "The end range partitioning, inclusive."
)
@PluginProperty(dynamic = true)
private Long rangePartitioningEnd;
@Schema(
title = "The width of each interval."
)
@PluginProperty(dynamic = true)
private Long rangePartitioningInterval;
@Schema(
title = "Sets the default dataset.",
description = "This dataset is used for all unqualified table names used in the query."
)
@PluginProperty(dynamic = true)
private String defaultDataset;
@Schema(
title = "Sets a priority for the query."
)
@PluginProperty
@Builder.Default
private QueryJobConfiguration.Priority priority = QueryJobConfiguration.Priority.INTERACTIVE;
@Schema(
title = "Sets whether the job is enabled to create arbitrarily large results.",
description = "If `true` the query is allowed to create large results at a slight cost in performance. " +
"`destinationTable` must be provided."
)
@PluginProperty
private Boolean allowLargeResults;
@Schema(
title = "Sets whether to look for the result in the query cache.",
description = "The query cache is a best-effort cache that will be flushed whenever tables in the query are " +
"modified. Moreover, the query cache is only available when `destinationTable` is not set "
)
@PluginProperty
private Boolean useQueryCache;
@Schema(
title = "Sets whether nested and repeated fields should be flattened.",
description = "If set to `false`, allowLargeResults must be `true`."
)
@PluginProperty
@Builder.Default
private Boolean flattenResults = true;
@Schema(
title = "Sets whether to use BigQuery's legacy SQL dialect for this query.",
description = " A valid query will return a mostly empty response with some processing statistics, " +
"while an invalid query will return the same error it would if it wasn't a dry run."
)
@PluginProperty
@Builder.Default
private Boolean useLegacySql = false;
@Schema(
title = "Limits the billing tier for this job.",
description = "Queries that have resource usage beyond this tier will fail (without incurring a charge). " +
"If unspecified, this will be set to your project default."
)
@PluginProperty
private Integer maximumBillingTier;
@Schema(
title = "Limits the bytes billed for this job.",
description = "Queries that will have bytes billed beyond this limit will fail (without incurring a charge). " +
"If unspecified, this will be set to your project default."
)
@PluginProperty
private Long maximumBytesBilled;
@Schema(
title = "This is only supported in the fast query path.",
description = "The maximum number of rows of data " +
"to return per page of results. Setting this flag to a small value such as 1000 and then " +
"paging through results might improve reliability when the query result set is large. In " +
"addition to this limit, responses are also limited to 10 MB. By default, there is no maximum " +
"row count, and only the byte limit applies."
)
@PluginProperty
private Long maxResults;
@Override
public Query.Output run(RunContext runContext) throws Exception {
BigQuery connection = this.connection(runContext);
Logger logger = runContext.logger();
QueryJobConfiguration jobConfiguration = this.jobConfiguration(runContext);
logger.debug("Starting query: {}", jobConfiguration.getQuery());
Job queryJob = this.waitForJob(
logger,
() -> connection
.create(JobInfo.newBuilder(jobConfiguration)
.setJobId(BigQueryService.jobId(runContext, this))
.build()
),
this.dryRun
);
JobStatistics.QueryStatistics queryJobStatistics = queryJob.getStatistics();
QueryJobConfiguration config = queryJob.getConfiguration();
TableId tableIdentity = config.getDestinationTable();
if (tableIdentity != null) {
logger.debug("Query loaded in: {}", tableIdentity.getDataset() + "." + tableIdentity.getTable());
}
this.metrics(runContext, queryJobStatistics, queryJob);
Output.OutputBuilder output = Output.builder()
.jobId(queryJob.getJobId().getJob());
if (this.fetch || this.fetchOne || this.store) {
TableResult result = queryJob.getQueryResults();
String[] tags = this.tags(queryJobStatistics, queryJob);
runContext.metric(Counter.of("total.rows", result.getTotalRows(), tags));
if (this.store) {
Map.Entry<URI, Long> store = this.storeResult(result, runContext);
runContext.metric(Counter.of("fetch.rows", store.getValue(), tags));
output
.uri(store.getKey())
.size(store.getValue());
} else {
// NOTE: the page capture was truncated here mid-statement ("List").
// The lines below are a reconstruction of the non-store branch based on
// the fields and output handling above, not a verbatim copy of the source:
// rows are fetched in memory and exposed on the task output.
List<Map<String, Object>> fetch = this.fetchResult(result);
runContext.metric(Counter.of("fetch.rows", fetch.size(), tags));
output.size((long) fetch.size());
if (this.fetch) {
output.rows(fetch);
} else {
output.row(!fetch.isEmpty() ? fetch.get(0) : ImmutableMap.of());
}
}
}
return output.build();
}
// (remaining members, including the Output class and the fetchResult,
// storeResult, metrics and tags helpers, are truncated in this capture)
}
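For reference, a minimal sketch of driving this task from JVM code, as a plugin test might. This is an illustration rather than Kestra documentation: the builder comes from the @SuperBuilder annotation above, runContextFactory stands in for the injected test fixture Kestra plugin tests use to obtain a RunContext, and getRows() assumes the Lombok-generated accessor on the (truncated) Output class.
package io.kestra.plugin.gcp.bigquery;

import java.util.Map;

import io.kestra.core.runners.RunContext;
import io.kestra.core.runners.RunContextFactory;

class QueryUsageSketch {
    // Injected by the test framework in a real Kestra plugin test;
    // here it is only an assumed collaborator.
    private RunContextFactory runContextFactory;

    void runQueryTask() throws Exception {
        // Configure the task as a flow would; id and type are required
        // on every Kestra task.
        Query task = Query.builder()
            .id("query")
            .type(Query.class.getName())
            .sql("SELECT 1 AS id, \"John\" AS name")
            .fetch(true)
            .build();

        RunContext runContext = runContextFactory.of(Map.of());
        Query.Output output = task.run(runContext);

        // With fetch=true, rows are materialized on the output and exposed
        // to downstream tasks as {{ outputs.query.rows }}.
        output.getRows().forEach(System.out::println);
    }
}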