
/*
* Copyright 2017 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the
* License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.cloud.hadoop.io.bigquery;
import com.google.api.services.bigquery.Bigquery;
import com.google.api.services.bigquery.Bigquery.Jobs.Insert;
import com.google.api.services.bigquery.model.Dataset;
import com.google.api.services.bigquery.model.EncryptionConfiguration;
import com.google.api.services.bigquery.model.ExternalDataConfiguration;
import com.google.api.services.bigquery.model.Job;
import com.google.api.services.bigquery.model.JobConfiguration;
import com.google.api.services.bigquery.model.JobConfigurationExtract;
import com.google.api.services.bigquery.model.JobConfigurationLoad;
import com.google.api.services.bigquery.model.JobReference;
import com.google.api.services.bigquery.model.Table;
import com.google.api.services.bigquery.model.TableReference;
import com.google.api.services.bigquery.model.TableSchema;
import com.google.cloud.hadoop.util.ApiErrorExtractor;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.base.Strings;
import com.google.common.flogger.GoogleLogger;
import java.io.IOException;
import java.util.List;
import java.util.UUID;
import javax.annotation.Nullable;
import org.apache.hadoop.util.Progressable;
/**
* Wrapper for the BigQuery API.
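*
* <p>Example usage (a minimal sketch; constructing and authorizing the {@link Bigquery} client
* is assumed and not shown here, and all project/dataset/table names are placeholders):
*
* <pre>{@code
* Bigquery bigquery = ...; // client built with your credentials and transport of choice
* BigQueryHelper helper = new BigQueryHelper(bigquery);
* TableReference tableRef = new TableReference()
*     .setProjectId("my-project").setDatasetId("my_dataset").setTableId("my_table");
* boolean exists = helper.tableExists(tableRef);
* }</pre>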
*/
public class BigQueryHelper {
// BigQuery job_ids must match this pattern.
public static final String BIGQUERY_JOB_ID_PATTERN = "[a-zA-Z0-9_-]+";
// Maximum number of characters in a BigQuery job_id.
public static final int BIGQUERY_JOB_ID_MAX_LENGTH = 1024;
private static final GoogleLogger logger = GoogleLogger.forEnclosingClass();
// Used for specialized handling of various API-defined exceptions.
private ApiErrorExtractor errorExtractor = ApiErrorExtractor.INSTANCE;
private Bigquery service;
/** A progressable that does nothing. */
private static final Progressable NOP_PROGRESSABLE =
new Progressable() {
@Override
public void progress() {}
};
public BigQueryHelper(Bigquery service) {
this.service = service;
}
/**
* Returns the underlying Bigquery instance used for communicating with the BigQuery API.
*/
public Bigquery getRawBigquery() {
return service;
}
/**
* Performs a federated import on data from GCS into BigQuery via a table insert.
*
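* <p>Example (a hedged sketch; {@code helper} is a {@link BigQueryHelper} and the bucket and
* table names are placeholders):
*
* <pre>{@code
* helper.importFederatedFromGcs(
*     "my-project",
*     tableRef,
*     null, // no schema given, so it will be auto-detected
*     BigQueryFileFormat.NEWLINE_DELIMITED_JSON,
*     Arrays.asList("gs://my-bucket/data/*.json"));
* }</pre>
*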
* @param projectId the project on whose behalf to perform the load.
* @param tableRef the reference to the destination table.
* @param schema the schema of the source data to populate the destination table by.
* @param sourceFormat the file format of the source data.
* @param gcsPaths the location of the source data in GCS.
* @throws IOException on IO error.
*/
public void importFederatedFromGcs(
String projectId,
TableReference tableRef,
@Nullable TableSchema schema,
BigQueryFileFormat sourceFormat,
List<String> gcsPaths)
throws IOException {
logger.atInfo().log(
"Importing into federated table '%s' from %s paths; path[0] is '%s'",
BigQueryStrings.toString(tableRef),
gcsPaths.size(),
gcsPaths.isEmpty() ? "(empty)" : gcsPaths.get(0));
ExternalDataConfiguration externalConf = new ExternalDataConfiguration();
externalConf.setSchema(schema);
externalConf.setSourceUris(gcsPaths);
externalConf.setSourceFormat(sourceFormat.getFormatIdentifier());
// Auto-detect the schema if we're not given one; otherwise use the passed schema.
if (schema == null) {
logger.atInfo().log("No federated import schema provided, auto detecting schema.");
externalConf.setAutodetect(true);
} else {
logger.atInfo().log("Using provided federated import schema '%s'.", schema.toString());
}
Table table = new Table();
table.setTableReference(tableRef);
table.setExternalDataConfiguration(externalConf);
service.tables().insert(projectId, tableRef.getDatasetId(), table).execute();
}
/**
* Imports data from GCS into BigQuery via a load job. Optionally polls for completion before
* returning.
*
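* <p>Example (a hedged sketch): load CSV files and block until the job finishes. The bucket and
* table names are placeholders:
*
* <pre>{@code
* helper.importFromGcs(
*     "my-project",
*     tableRef,
*     schema,            // or null to auto-detect
*     null,              // no Cloud KMS key; use BigQuery-managed encryption
*     BigQueryFileFormat.CSV,
*     "WRITE_TRUNCATE",  // replace any existing contents of the destination table
*     Arrays.asList("gs://my-bucket/data/*.csv"),
*     true);             // awaitCompletion
* }</pre>
*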
* @param projectId the project on whose behalf to perform the load.
* @param tableRef the reference to the destination table.
* @param schema the schema of the source data to populate the destination table by.
* @param kmsKeyName the Cloud KMS encryption key used to protect the output table.
* @param sourceFormat the file format of the source data.
* @param writeDisposition the write disposition of the output table.
* @param gcsPaths the location of the source data in GCS.
* @param awaitCompletion if true, block and poll until job completes, otherwise return as soon as
* the job has been successfully dispatched.
* @throws IOException on IO error.
* @throws InterruptedException if interrupted while waiting for job completion.
*/
public void importFromGcs(
String projectId,
TableReference tableRef,
@Nullable TableSchema schema,
@Nullable String kmsKeyName,
BigQueryFileFormat sourceFormat,
String writeDisposition,
List<String> gcsPaths,
boolean awaitCompletion)
throws IOException, InterruptedException {
logger.atInfo().log(
"Importing into table '%s' from %s paths; path[0] is '%s'; awaitCompletion: %s",
BigQueryStrings.toString(tableRef),
gcsPaths.size(),
gcsPaths.isEmpty() ? "(empty)" : gcsPaths.get(0),
awaitCompletion);
// Create load conf with minimal requirements.
JobConfigurationLoad loadConfig = new JobConfigurationLoad();
loadConfig.setSchema(schema);
loadConfig.setSourceFormat(sourceFormat.getFormatIdentifier());
loadConfig.setSourceUris(gcsPaths);
loadConfig.setDestinationTable(tableRef);
loadConfig.setWriteDisposition(writeDisposition);
if (!Strings.isNullOrEmpty(kmsKeyName)) {
loadConfig.setDestinationEncryptionConfiguration(
new EncryptionConfiguration().setKmsKeyName(kmsKeyName));
}
// Auto-detect the schema if we're not given one; otherwise use the passed schema.
if (schema == null) {
logger.atInfo().log("No import schema provided, auto detecting schema.");
loadConfig.setAutodetect(true);
} else {
logger.atInfo().log("Using provided import schema '%s'.", schema.toString());
}
JobConfiguration config = new JobConfiguration();
config.setLoad(loadConfig);
// Get the dataset to determine the location
Dataset dataset =
service.datasets().get(tableRef.getProjectId(), tableRef.getDatasetId()).execute();
JobReference jobReference =
createJobReference(projectId, "direct-bigqueryhelper-import", dataset.getLocation());
Job job = new Job();
job.setConfiguration(config);
job.setJobReference(jobReference);
// Insert and run job.
insertJobOrFetchDuplicate(projectId, job);
if (awaitCompletion) {
// Poll until job is complete.
BigQueryUtils.waitForJobCompletion(
getRawBigquery(), projectId, jobReference, NOP_PROGRESSABLE);
}
}
/**
* Exports BigQuery results into GCS, polls for completion before returning.
*
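* <p>Example (a hedged sketch; the export is written as sharded newline-delimited JSON, which is
* the format this method requests):
*
* <pre>{@code
* helper.exportBigQueryToGcs(
*     "my-project",
*     tableRef,
*     Arrays.asList("gs://my-bucket/export/shard-*.json"),
*     true); // block until the extract job completes
* }</pre>
*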
* @param projectId the project on whose behalf to perform the export.
* @param tableRef the table to export.
* @param gcsPaths the GCS paths to export to.
* @param awaitCompletion if true, block and poll until job completes, otherwise return as soon as
* the job has been successfully dispatched.
*
* @throws IOException on IO error.
* @throws InterruptedException on interrupt.
*/
public void exportBigQueryToGcs(String projectId, TableReference tableRef, List<String> gcsPaths,
boolean awaitCompletion) throws IOException, InterruptedException {
logger.atFine().log(
"exportBigQueryToGcs(bigquery, '%s', '%s', '%s', '%s')",
projectId, BigQueryStrings.toString(tableRef), gcsPaths, awaitCompletion);
logger.atInfo().log(
"Exporting table '%s' to %s paths; path[0] is '%s'; awaitCompletion: %s",
BigQueryStrings.toString(tableRef),
gcsPaths.size(),
gcsPaths.isEmpty() ? "(empty)" : gcsPaths.get(0),
awaitCompletion);
// Create job and configuration.
JobConfigurationExtract extractConfig = new JobConfigurationExtract();
// Set source.
extractConfig.setSourceTable(tableRef);
// Set destination.
extractConfig.setDestinationUris(gcsPaths);
extractConfig.setDestinationFormat("NEWLINE_DELIMITED_JSON");
JobConfiguration config = new JobConfiguration();
config.setExtract(extractConfig);
// Get the table to determine the location
Table table = getTable(tableRef);
JobReference jobReference =
createJobReference(projectId, "direct-bigqueryhelper-export", table.getLocation());
Job job = new Job();
job.setConfiguration(config);
job.setJobReference(jobReference);
// Insert and run job.
insertJobOrFetchDuplicate(projectId, job);
if (awaitCompletion) {
// Poll until job is complete.
BigQueryUtils.waitForJobCompletion(service, projectId, jobReference, NOP_PROGRESSABLE);
}
}
/**
* Returns true if the table exists, or false if not.
*/
public boolean tableExists(TableReference tableRef) throws IOException {
try {
Table fetchedTable = service.tables().get(
tableRef.getProjectId(), tableRef.getDatasetId(), tableRef.getTableId()).execute();
logger.atFine().log(
"Successfully fetched table '%s' for tableRef '%s'", fetchedTable, tableRef);
return true;
} catch (IOException ioe) {
if (errorExtractor.itemNotFound(ioe)) {
return false;
} else {
// Unhandled exceptions should just propagate up.
throw ioe;
}
}
}
/**
* Gets the specified table resource by table ID. This method does not return the data in the
* table; it only returns the table resource, which describes the structure of the table.
*
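* <p>Example (a hedged sketch): read the schema of an existing table:
*
* <pre>{@code
* Table table = helper.getTable(tableRef);
* TableSchema schema = table.getSchema();
* }</pre>
*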
* @param tableRef The BigQuery table reference.
* @return The table resource, which describes the structure of this table.
* @throws IOException on IO error.
*/
public Table getTable(TableReference tableRef)
throws IOException {
Bigquery.Tables.Get getTablesReply = service.tables().get(
tableRef.getProjectId(), tableRef.getDatasetId(), tableRef.getTableId());
return getTablesReply.execute();
}
/**
* Creates a new JobReference with a unique jobId generated from {@code jobIdPrefix} plus a
* randomly generated UUID String.
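*
* <p>For example, {@code createJobReference("my-project", "my-import", "US")} might produce a
* jobId like {@code my-import-6f2c...} (the UUID suffix is random, so the exact value varies).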
*/
public JobReference createJobReference(
String projectId, String jobIdPrefix, @Nullable String location) {
Preconditions.checkArgument(projectId != null, "projectId must not be null.");
Preconditions.checkArgument(jobIdPrefix != null, "jobIdPrefix must not be null.");
Preconditions.checkArgument(jobIdPrefix.matches(BIGQUERY_JOB_ID_PATTERN),
"jobIdPrefix '%s' must match pattern '%s'", jobIdPrefix, BIGQUERY_JOB_ID_PATTERN);
String fullJobId = String.format("%s-%s", jobIdPrefix, UUID.randomUUID().toString());
Preconditions.checkArgument(fullJobId.length() <= BIGQUERY_JOB_ID_MAX_LENGTH,
"fullJobId '%s' has length '%s'; must be less than or equal to %s",
fullJobId, fullJobId.length(), BIGQUERY_JOB_ID_MAX_LENGTH);
return new JobReference().setProjectId(projectId).setJobId(fullJobId).setLocation(location);
}
/**
* Helper to check for non-null Job.getJobReference().getJobId() and equality of the jobIds
* between {@code expected} and {@code actual}, using Preconditions.checkState.
*/
public void checkJobIdEquality(Job expected, Job actual) {
Preconditions.checkState(actual.getJobReference() != null
&& actual.getJobReference().getJobId() != null
&& expected.getJobReference() != null
&& expected.getJobReference().getJobId() != null
&& actual.getJobReference().getJobId().equals(expected.getJobReference().getJobId()),
"jobIds must match in '[expected|actual].getJobReference()' (got '%s' vs '%s')",
expected.getJobReference(), actual.getJobReference());
}
/**
* Tries to run jobs().insert(...) with the provided {@code projectId} and {@code job}, returning
* the resulting {@code Job} under normal operation. If the insert fails with a "409 Conflict"
* (the jobId already exists), a separate jobs().get(...) request is issued and the result of
* that fetch is returned instead. Other exceptions propagate out as normal.
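*
* <p>This makes job insertion effectively idempotent: retrying an insert with the same jobId
* after a transient failure yields the already-created job rather than a duplicate or an error.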
*/
public Job insertJobOrFetchDuplicate(String projectId, Job job) throws IOException {
Preconditions.checkArgument(
job.getJobReference() != null && job.getJobReference().getJobId() != null,
"Require non-null JobReference and JobId inside; getJobReference() == '%s'",
job.getJobReference());
Insert insert = service.jobs().insert(projectId, job);
Job response = null;
try {
response = insert.execute();
logger.atFine().log("Successfully inserted job '%s'. Response: '%s'", job, response);
} catch (IOException ioe) {
if (errorExtractor.itemAlreadyExists(ioe)) {
logger.atInfo().withCause(ioe).log(
"Fetching existing job after catching exception for duplicate jobId '%s'",
job.getJobReference().getJobId());
response = service.jobs().get(projectId, job.getJobReference().getJobId()).execute();
} else {
throw new IOException(
String.format("Unhandled exception trying to insert job '%s'", job), ioe);
}
}
checkJobIdEquality(job, response);
return response;
}
@VisibleForTesting
void setErrorExtractor(ApiErrorExtractor errorExtractor) {
this.errorExtractor = errorExtractor;
}
}