// Source listing for io.cdap.plugin.gcp.bigquery.sqlengine.BigQueryPullDataset (google-cloud artifact).
// Plugins for Google BigQuery.
/*
* Copyright © 2021 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package io.cdap.plugin.gcp.bigquery.sqlengine;
import com.google.cloud.bigquery.BigQuery;
import com.google.cloud.bigquery.DatasetId;
import io.cdap.cdap.api.data.format.StructuredRecord;
import io.cdap.cdap.api.data.schema.Schema;
import io.cdap.cdap.api.dataset.lib.KeyValue;
import io.cdap.cdap.etl.api.Transform;
import io.cdap.cdap.etl.api.engine.sql.dataset.SQLPullDataset;
import io.cdap.cdap.etl.api.engine.sql.request.SQLPullRequest;
import io.cdap.plugin.gcp.bigquery.source.BigQueryInputFormatProvider;
import io.cdap.plugin.gcp.bigquery.source.BigQuerySourceUtils;
import io.cdap.plugin.gcp.bigquery.sqlengine.transform.PullTransform;
import io.cdap.plugin.gcp.bigquery.sqlengine.util.BigQuerySQLEngineUtils;
import org.apache.avro.generic.GenericData;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import java.io.IOException;
import javax.annotation.Nullable;
/**
* SQL Pull Dataset implementation for BigQuery backed datasets.
*/
public class BigQueryPullDataset extends BigQueryInputFormatProvider
implements SQLPullDataset, BigQuerySQLDataset {
private final BigQuery bigQuery;
private final String datasetName;
private final Schema schema;
private final DatasetId bqDataset;
private final String bqTable;
private final String gcsPath;
private Long numRows;
private BigQueryPullDataset(Configuration configuration,
String datasetName,
Schema schema,
BigQuery bigQuery,
DatasetId bqDataset,
String bqTable,
String gcsPath) {
super(configuration);
this.datasetName = datasetName;
this.schema = schema;
this.bigQuery = bigQuery;
this.bqDataset = bqDataset;
this.bqTable = bqTable;
this.gcsPath = gcsPath;
}
public static BigQueryPullDataset getInstance(SQLPullRequest pullRequest,
Configuration baseConfiguration,
BigQuery bigQuery,
DatasetId bqDataset,
String bqTable,
String bucket,
String runId) throws IOException {
// Clone configuration object
Configuration configuration = new Configuration(baseConfiguration);
// Configure BigQuery input format.
String gcsPath = BigQuerySQLEngineUtils.getGCSPath(bucket, runId, bqTable);
BigQuerySourceUtils.configureBigQueryInput(configuration, bqDataset, bqTable, gcsPath);
return new BigQueryPullDataset(configuration,
pullRequest.getDatasetName(),
pullRequest.getDatasetSchema(),
bigQuery,
bqDataset,
bqTable,
gcsPath);
}
@Override
public Transform, StructuredRecord> fromKeyValue() {
return new PullTransform(schema);
}
@Override
public String getDatasetName() {
return datasetName;
}
@Override
public Schema getSchema() {
return schema;
}
@Override
public long getNumRows() {
// Get the number of rows from BQ if not known at this time.
if (numRows == null) {
numRows = BigQuerySQLEngineUtils.getNumRows(bigQuery, bqDataset, bqTable);
}
return numRows;
}
@Override
public String getBigQueryProject() {
return bqDataset.getProject();
}
@Override
public String getBigQueryDataset() {
return bqDataset.getDataset();
}
@Override
public String getBigQueryTable() {
return bqTable;
}
@Override
@Nullable
public String getJobId() {
return null;
}
@Override
public String getGCSPath() {
return gcsPath;
}
}