/*
* Copyright © 2022 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package io.cdap.plugin.gcp.bigquery.sqlengine;
import com.google.cloud.bigquery.BigQuery;
import com.google.cloud.bigquery.BigQueryException;
import com.google.cloud.bigquery.CopyJobConfiguration;
import com.google.cloud.bigquery.Dataset;
import com.google.cloud.bigquery.DatasetId;
import com.google.cloud.bigquery.Job;
import com.google.cloud.bigquery.JobConfiguration;
import com.google.cloud.bigquery.JobId;
import com.google.cloud.bigquery.JobInfo;
import com.google.cloud.bigquery.JobStatistics;
import com.google.cloud.bigquery.QueryJobConfiguration;
import com.google.cloud.bigquery.StandardTableDefinition;
import com.google.cloud.bigquery.Table;
import com.google.cloud.bigquery.TableDefinition.Type;
import com.google.cloud.bigquery.TableId;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Strings;
import com.google.common.reflect.TypeToken;
import com.google.gson.Gson;
import io.cdap.cdap.api.data.schema.Schema;
import io.cdap.cdap.api.metrics.Metrics;
import io.cdap.cdap.etl.api.engine.sql.dataset.SQLDataset;
import io.cdap.cdap.etl.api.engine.sql.request.SQLReadRequest;
import io.cdap.cdap.etl.api.engine.sql.request.SQLReadResult;
import io.cdap.plugin.gcp.bigquery.source.BigQuerySourceConfig;
import io.cdap.plugin.gcp.bigquery.sqlengine.util.BigQuerySQLEngineUtils;
import io.cdap.plugin.gcp.bigquery.util.BigQueryUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.time.Instant;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicReference;
import javax.annotation.Nullable;
/**
* SQL Read Dataset implementation for BigQuery backed datasets.
*/
public class BigQueryReadDataset implements SQLDataset, BigQuerySQLDataset {
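// Job flavors this dataset can execute. Only QUERY is exercised by the read path in this class;
// COPY and COPY_SNAPSHOT correspond to table-copy jobs (note the CopyJobConfiguration import)
// and are not used here.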
private enum BigQueryJobType { QUERY, COPY, COPY_SNAPSHOT };
private static final Logger LOG = LoggerFactory.getLogger(BigQueryReadDataset.class);
private static final Gson GSON = new Gson();
public static final String SQL_INPUT_CONFIG = "config";
public static final String SQL_INPUT_FIELDS = "fields";
public static final String SQL_INPUT_SCHEMA = "schema";
private static final java.lang.reflect.Type LIST_OF_STRINGS_TYPE = new TypeToken<List<String>>() { }.getType();
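// Gson cannot see generic type arguments at runtime because of erasure, so the anonymous
// TypeToken subclass above captures List<String> reflectively. For example (hypothetical input),
// GSON.fromJson("[\"a\",\"b\"]", LIST_OF_STRINGS_TYPE) yields a List<String> rather than a raw List.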
private static final String BQ_PUSHDOWN_OPERATION_TAG = "read";
private final BigQuerySQLEngineConfig sqlEngineConfig;
private final BigQuery bigQuery;
private final String datasetName;
private final SQLReadRequest readRequest;
private final TableId destinationTableId;
private final String jobId;
private Schema schema;
private Long numRows;
private Metrics metrics;
private BigQueryReadDataset(String datasetName,
BigQuerySQLEngineConfig sqlEngineConfig,
BigQuery bigQuery,
SQLReadRequest readRequest,
TableId destinationTableId,
String jobId,
Metrics metrics) {
this.datasetName = datasetName;
this.sqlEngineConfig = sqlEngineConfig;
this.bigQuery = bigQuery;
this.readRequest = readRequest;
this.destinationTableId = destinationTableId;
this.jobId = jobId;
this.metrics = metrics;
}
public static BigQueryReadDataset getInstance(String datasetName,
BigQuerySQLEngineConfig sqlEngineConfig,
BigQuery bigQuery,
SQLReadRequest readRequest,
TableId destinationTableId,
Metrics metrics) {
// Get a new job ID for this read operation
String jobId = BigQuerySQLEngineUtils.newIdentifier();
return new BigQueryReadDataset(datasetName,
sqlEngineConfig,
bigQuery,
readRequest,
destinationTableId,
jobId,
metrics);
}
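// Minimal usage sketch (hypothetical values; in practice the SQL engine supplies these arguments):
//   BigQueryReadDataset dataset = BigQueryReadDataset.getInstance(
//       "users", engineConfig, bigQuery, readRequest,
//       TableId.of("my-project", "tmp_dataset", "tmp_table"), metrics);
//   SQLReadResult result = dataset.read();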
public SQLReadResult read() {
SQLReadResult result = null;
// We use this atomic reference to delete a new table if it was created for this execution.
AtomicReference<TableId> newSourceTable = new AtomicReference<>(null);
try {
result = readInternal(readRequest, newSourceTable);
} catch (InterruptedException e) {
LOG.error("Interrupted exception during BigQuery read operation.", e);
} catch (BigQueryException bqe) {
LOG.error("BigQuery exception during BigQuery read operation.", bqe);
} catch (Exception e) {
LOG.error("Exception during BigQuery read operation.", e);
}
// If a new table was created for this execution, but the execution failed for any reason,
// delete the created table so the standard sink workflow can succeed.
if (result == null || !result.isSuccessful()) {
tryDeleteTable(destinationTableId);
}
// Report a failure if the operation threw an exception before producing a result.
return result != null ? result : SQLReadResult.failure(readRequest.getDatasetName());
}
private SQLReadResult readInternal(SQLReadRequest readRequest,
AtomicReference<TableId> newSourceTable)
throws BigQueryException, InterruptedException {
// Check if this output matches the expected engine.
String datasetName = readRequest.getDatasetName();
if (!BigQuerySQLEngine.class.getName().equals(readRequest.getInput().getSqlEngineClassName())) {
LOG.debug("Got output for another SQL engine {}, skipping", readRequest.getInput().getSqlEngineClassName());
return SQLReadResult.unsupported(datasetName);
}
// Get configuration properties from read request arguments
Map<String, String> arguments = readRequest.getInput().getArguments();
BigQuerySourceConfig sourceConfig = GSON.fromJson(arguments.get(SQL_INPUT_CONFIG), BigQuerySourceConfig.class);
schema = GSON.fromJson(arguments.get(SQL_INPUT_SCHEMA), Schema.class);
List<String> fields = GSON.fromJson(arguments.get(SQL_INPUT_FIELDS), LIST_OF_STRINGS_TYPE);
// Get source table information
String sourceProject = sourceConfig.getDatasetProject();
String sourceDataset = sourceConfig.getDataset();
String sourceTableName = sourceConfig.getTable();
TableId sourceTableId = TableId.of(sourceProject, sourceDataset, sourceTableName);
// Check if both datasets are in the same Location. If not, the direct read operation cannot be performed.
DatasetId sourceDatasetId = DatasetId.of(sourceTableId.getProject(), sourceTableId.getDataset());
DatasetId destinationDatasetId = DatasetId.of(destinationTableId.getProject(), destinationTableId.getDataset());
Dataset srcDataset = bigQuery.getDataset(sourceDatasetId);
Dataset destDataset = bigQuery.getDataset(destinationDatasetId);
// Ensure datasets exist before proceeding
if (srcDataset == null || destDataset == null) {
LOG.warn("Direct table read is not supported when the datasets are not created.");
return SQLReadResult.unsupported(datasetName);
}
// Ensure both datasets are in the same location.
if (!Objects.equals(srcDataset.getLocation(), destDataset.getLocation())) {
LOG.error("Direct table read is only supported if both datasets are in the same location. "
+ "'{}' is '{}' , '{}' is '{}' .",
sourceDatasetId.getDataset(), srcDataset.getLocation(),
destinationDatasetId.getDataset(), destDataset.getLocation());
return SQLReadResult.unsupported(datasetName);
}
String jobLocation = srcDataset.getLocation();
Table sourceTable;
try {
sourceTable = bigQuery.getTable(sourceTableId);
} catch (BigQueryException e) {
throw new IllegalArgumentException("Unable to get details about the BigQuery table: " + e.getMessage(), e);
}
Long tableTTL = -1L;
// Calculate TTL for table if needed.
if (!sqlEngineConfig.shouldRetainTables() && sqlEngineConfig.getTempTableTTLHours() > 0) {
long ttlMillis = TimeUnit.MILLISECONDS.convert(sqlEngineConfig.getTempTableTTLHours(), TimeUnit.HOURS);
tableTTL = Instant.now().toEpochMilli() + ttlMillis;
}
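// For example (assumed configuration): with a 3-hour TTL and a current epoch of
// 1,700,000,000,000 ms, the table expires at 1,700,000,000,000 + 3 * 3,600,000 = 1,700,010,800,000.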
// Create configuration for the query job used to read rows into the destination table
JobConfiguration queryConfig = getBQQueryJobConfiguration(sourceTable,
sourceTableId,
sourceConfig.getFilter(),
sourceConfig.getPartitionFrom(),
sourceConfig.getPartitionTo(),
tableTTL);
return executeBigQueryJob(queryConfig, sourceTable, sourceTableId, BigQueryJobType.QUERY, jobLocation);
}
private SQLReadResult executeBigQueryJob(JobConfiguration jobConfiguration,
Table sourceTable,
TableId sourceTableId,
BigQueryJobType bigQueryJobType,
String jobLocation)
throws InterruptedException {
// Create a job ID so that we can safely retry.
JobId bqJobId = JobId.newBuilder()
.setJob(jobId)
.setLocation(jobLocation)
.setProject(sqlEngineConfig.getProject())
.build();
Job bqJob = bigQuery.create(JobInfo.newBuilder(jobConfiguration).setJobId(bqJobId).build());
// Wait for the query to complete.
bqJob = bqJob.waitFor();
// Check for errors
if (bqJob.getStatus().getError() != null) {
BigQuerySQLEngineUtils.logJobMetrics(bqJob, metrics);
LOG.error("Error executing BigQuery Job of type {} : '{}' in Project '{}', Dataset '{}': {}",
bigQueryJobType, jobId, sqlEngineConfig.getProject(), sqlEngineConfig.getDatasetProject(),
bqJob.getStatus().getError().toString());
return SQLReadResult.failure(datasetName);
}
// Number of rows is taken from the job statistics if available.
// If not, we use the number of source table records. (This is also the case for snapshot copy)
long numRows = sourceTable.getNumRows().longValue();
if (bigQueryJobType.equals(BigQueryJobType.QUERY)) {
JobStatistics.QueryStatistics queryJobStats = bqJob.getStatistics();
numRows = queryJobStats != null && queryJobStats.getNumDmlAffectedRows() != null ?
queryJobStats.getNumDmlAffectedRows() : numRows;
}
LOG.info("Executed read operation for {} records from {}.{}.{} into {}.{}.{}", numRows,
sourceTableId.getProject(), sourceTableId.getDataset(), sourceTableId.getTable(),
destinationTableId.getProject(), destinationTableId.getDataset(), destinationTableId.getTable());
BigQuerySQLEngineUtils.logJobMetrics(bqJob, metrics);
return SQLReadResult.success(datasetName, this);
}
private JobConfiguration getBQQueryJobConfiguration(Table sourceTable,
TableId sourceTableId,
String filter,
String partitionFromDate,
String partitionToDate,
Long tableTTL) {
BigQuerySQLEngineUtils.createEmptyTableWithSourceConfig(bigQuery, destinationTableId.getProject(),
destinationTableId.getDataset(), destinationTableId.getTable(),
sourceTable, tableTTL);
// Select all fields from source table into destination table
String query = String.format("SELECT * FROM `%s.%s.%s`",
sourceTableId.getProject(),
sourceTableId.getDataset(),
sourceTableId.getTable());
StringBuilder condition = new StringBuilder();
// Add a time-partition condition depending on the type of the source table.
StandardTableDefinition tableDefinition = Objects.requireNonNull(sourceTable).getDefinition();
Type type = tableDefinition.getType();
if (!(type == Type.VIEW || type == Type.MATERIALIZED_VIEW || type == Type.EXTERNAL)) {
condition.append(
BigQueryUtil.generateTimePartitionCondition(tableDefinition, partitionFromDate, partitionToDate));
}
// If a filter is present, add it.
if (!Strings.isNullOrEmpty(filter)) {
if (condition.length() == 0) {
condition.append(filter);
} else {
condition.append(" and (").append(filter).append(")");
}
}
if (condition.length() > 0) {
query = String.format("%s WHERE %s", query, condition);
}
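// For example (hypothetical inputs): with a partition condition P generated by
// BigQueryUtil.generateTimePartitionCondition and a filter of "id > 10", the statement becomes
//   SELECT * FROM `project.dataset.table` WHERE P and (id > 10)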
LOG.info("Reading data from `{}.{}.{}` to `{}.{}.{}` using SQL statement: {} ",
sourceTableId.getProject(), sourceTableId.getDataset(), sourceTableId.getTable(),
destinationTableId.getProject(), destinationTableId.getDataset(), destinationTableId.getTable(),
query);
QueryJobConfiguration.Builder queryConfigBuilder = QueryJobConfiguration.newBuilder(query)
.setDestinationTable(destinationTableId)
.setCreateDisposition(JobInfo.CreateDisposition.CREATE_NEVER)
.setWriteDisposition(JobInfo.WriteDisposition.WRITE_APPEND)
.setPriority(sqlEngineConfig.getJobPriority())
.setLabels(BigQuerySQLEngineUtils.getJobTags(BQ_PUSHDOWN_OPERATION_TAG));
return queryConfigBuilder.build();
}
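// Test-only variant of the query builder: unlike getBQQueryJobConfiguration, it projects an
// explicit field list instead of SELECT * and does not pre-create the destination table.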
@VisibleForTesting
QueryJobConfiguration.Builder getQueryBuilder(Table sourceTable, TableId sourceTableId,
TableId destinationTableId,
List<String> fields,
String filter,
String partitionFromDate,
String partitionToDate) {
String query = String.format("SELECT %s FROM `%s.%s.%s`",
String.join(",", fields),
sourceTableId.getProject(),
sourceTableId.getDataset(),
sourceTableId.getTable());
StringBuilder condition = new StringBuilder();
// Add a time-partition condition depending on the type of the source table.
StandardTableDefinition tableDefinition = Objects.requireNonNull(sourceTable).getDefinition();
Type type = tableDefinition.getType();
if (!(type == Type.VIEW || type == Type.MATERIALIZED_VIEW || type == Type.EXTERNAL)) {
condition.append(
BigQueryUtil.generateTimePartitionCondition(tableDefinition, partitionFromDate, partitionToDate));
}
// If a filter is present, add it.
if (!Strings.isNullOrEmpty(filter)) {
if (condition.length() == 0) {
condition.append(filter);
} else {
condition.append(" and (").append(filter).append(")");
}
}
if (condition.length() > 0) {
query = String.format("%s WHERE %s", query, condition);
}
LOG.info("Reading data from `{}.{}.{}` to `{}.{}.{}` using SQL statement: {} ",
sourceTableId.getProject(), sourceTableId.getDataset(), sourceTableId.getTable(),
destinationTableId.getProject(), destinationTableId.getDataset(), destinationTableId.getTable(),
query);
return QueryJobConfiguration.newBuilder(query)
.setDestinationTable(destinationTableId)
.setCreateDisposition(JobInfo.CreateDisposition.CREATE_NEVER)
.setWriteDisposition(JobInfo.WriteDisposition.WRITE_APPEND)
.setPriority(sqlEngineConfig.getJobPriority())
.setLabels(BigQuerySQLEngineUtils.getJobTags(BQ_PUSHDOWN_OPERATION_TAG));
}
/**
* Try to delete the given table, logging any BigQuery exception instead of propagating it.
*
* @param table the table identifier of the table we want to delete.
*/
protected void tryDeleteTable(TableId table) {
try {
bigQuery.delete(table);
} catch (BigQueryException bqe) {
LOG.error("Unable to delete table {}.{}.{}. This may cause the pipeline to fail",
table.getProject(), table.getDataset(), table.getTable(), bqe);
}
}
@Override
public String getBigQueryProject() {
return destinationTableId.getProject();
}
@Override
public String getBigQueryDataset() {
return destinationTableId.getDataset();
}
@Override
public String getBigQueryTable() {
return destinationTableId.getTable();
}
@Nullable
@Override
public String getJobId() {
return jobId;
}
@Nullable
@Override
public String getGCSPath() {
return null;
}
@Override
public long getNumRows() {
// Get the number of rows from BQ if not known at this time.
if (numRows == null) {
numRows = BigQuerySQLEngineUtils.getNumRows(bigQuery,
DatasetId.of(destinationTableId.getProject(),
destinationTableId.getDataset()),
destinationTableId.getTable());
}
return numRows;
}
@Override
public String getDatasetName() {
return datasetName;
}
@Override
public Schema getSchema() {
return schema;
}
}