package com.google.cloud.hadoop.io.bigquery;
import com.google.api.services.bigquery.Bigquery;
import com.google.api.services.bigquery.model.Dataset;
import com.google.api.services.bigquery.model.DatasetReference;
import com.google.api.services.bigquery.model.Job;
import com.google.api.services.bigquery.model.JobConfiguration;
import com.google.api.services.bigquery.model.JobConfigurationTableCopy;
import com.google.api.services.bigquery.model.JobReference;
import com.google.api.services.bigquery.model.TableReference;
import com.google.cloud.hadoop.util.ApiErrorExtractor;
import com.google.cloud.hadoop.util.HadoopToStringUtil;
import com.google.common.annotations.VisibleForTesting;
import java.io.IOException;
import java.security.GeneralSecurityException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* An OutputCommitter that commits tables specified in the job's output dataset in BigQuery. It is
* invoked before job start, after task and job completion, and on task or job abort.
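*
* <p>A minimal construction sketch (the committer is normally obtained from the connector's output
* format rather than constructed directly; the project, dataset, and table names below are
* placeholders):
*
* <pre>{@code
* Configuration conf = new Configuration();
* TableReference tempTable = new TableReference()
*     .setProjectId("my-project").setDatasetId("temp_dataset").setTableId("attempt_0_table");
* TableReference finalTable = new TableReference()
*     .setProjectId("my-project").setDatasetId("output_dataset").setTableId("output_table");
* OutputCommitter committer =
*     new BigQueryOutputCommitter("my-project", tempTable, finalTable, conf);
* }</pre>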
*/
public class BigQueryOutputCommitter
extends OutputCommitter {
// Logger.
protected static final Logger LOG = LoggerFactory.getLogger(BigQueryOutputCommitter.class);
// Used for specialized handling of various API-defined exceptions.
private ApiErrorExtractor errorExtractor = new ApiErrorExtractor();
// Id of the project under which all connector operations occur.
private String projectId;
// Fully-qualified id of the temporary table the connector writes into.
private TableReference tempTableRef;
// Fully-qualified id of the final destination table for the job output.
private TableReference finalTableRef;
// Wrapper around some Bigquery API methods and convenience methods.
private BigQueryHelper bigQueryHelper;
/**
* Creates a BigQuery output committer.
*
* @param projectId the job's project id.
* @param tempTableRef the fully-qualified temp table to write to.
* @param finalTableRef the fully-qualified destination table on commit.
* @param configuration the task's configuration.
* @throws IOException on IO Error.
*/
public BigQueryOutputCommitter(
String projectId, TableReference tempTableRef,
TableReference finalTableRef, Configuration configuration)
throws IOException {
this.projectId = projectId;
this.tempTableRef = tempTableRef;
this.finalTableRef = finalTableRef;
// Get Bigquery.
try {
BigQueryFactory bigQueryFactory = new BigQueryFactory();
this.bigQueryHelper = bigQueryFactory.getBigQueryHelper(configuration);
} catch (GeneralSecurityException e) {
LOG.error("Could not get Bigquery", e);
throw new IOException("Could not get Bigquery", e);
}
}
/**
* Creates the temporary dataset that will contain all of the task work tables.
*
* @param context the job's context.
* @throws IOException on IO Error.
*/
@Override
public void setupJob(JobContext context)
throws IOException {
if (LOG.isDebugEnabled()) {
LOG.debug("setupJob({})", HadoopToStringUtil.toString(context));
}
// Create dataset.
DatasetReference datasetReference = new DatasetReference();
datasetReference.setProjectId(tempTableRef.getProjectId());
datasetReference.setDatasetId(tempTableRef.getDatasetId());
Configuration config = context.getConfiguration();
Dataset tempDataset = new Dataset();
tempDataset.setDatasetReference(datasetReference);
tempDataset.setLocation(config.get(BigQueryConfiguration.DATA_LOCATION_KEY,
BigQueryConfiguration.DATA_LOCATION_DEFAULT));
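// Note: BigQuery requires the source and destination of a table copy to be in the same
// location, so the temporary dataset's location should match that of the final output dataset.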
// Insert dataset into Bigquery.
Bigquery.Datasets datasets = bigQueryHelper.getRawBigquery().datasets();
// TODO(user): Maybe allow the dataset to exist already instead of throwing 409 here.
LOG.debug("Creating temporary dataset '{}' for project '{}'",
tempTableRef.getDatasetId(), tempTableRef.getProjectId());
// NB: Even though this "insert" makes it look like we can specify a different projectId than
// the one which owns the dataset, it actually has to match.
datasets.insert(tempTableRef.getProjectId(), tempDataset).execute();
}
/**
* Deletes the temporary dataset, including all of the work tables.
*
* @param context the job's context.
* @throws IOException on IO Error.
*/
@Override
public void cleanupJob(JobContext context)
throws IOException {
if (LOG.isDebugEnabled()) {
LOG.debug("cleanupJob({})", HadoopToStringUtil.toString(context));
}
Bigquery.Datasets datasets = bigQueryHelper.getRawBigquery().datasets();
Configuration config = context.getConfiguration();
try {
LOG.debug("cleanupJob: Deleting dataset '{}' from project '{}'",
tempTableRef.getDatasetId(), tempTableRef.getProjectId());
datasets.delete(tempTableRef.getProjectId(), tempTableRef.getDatasetId())
.setDeleteContents(true)
.execute();
} catch (IOException e) {
// The error is swallowed because the job has already completed successfully and the only
// failure is in deleting temporary data. This matches the FileOutputCommitter pattern.
LOG.warn("Could not delete dataset. Temporary data not cleaned up.", e);
}
}
/**
* For cleaning up the job's output after job failure.
*
* @param jobContext Context of the job whose output is being written.
* @param status Final run state of the job, should be JobStatus.KILLED or JobStatus.FAILED.
* @throws IOException on IO Error.
*/
public void abortJob(JobContext jobContext, int status)
throws IOException {
if (LOG.isDebugEnabled()) {
LOG.debug("abortJob({}, {})", HadoopToStringUtil.toString(jobContext), status);
}
cleanupJob(jobContext);
}
/**
* For committing the job's output after successful job completion. Note that this is only
* invoked for jobs whose final run state is JobStatus.SUCCEEDED.
*
* @param jobContext Context of the job whose output is being written.
* @throws IOException on IO Error.
*/
@Override
public void commitJob(JobContext jobContext)
throws IOException {
if (LOG.isDebugEnabled()) {
LOG.debug("commitJob({})", HadoopToStringUtil.toString(jobContext));
}
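// By the time the job commits, each successful task has already copied its temporary table
// into the final table in commitTask(), so all that remains is deleting the temporary dataset.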
cleanupJob(jobContext);
}
/**
* No task setup required.
*
* @throws IOException on IO Error.
*/
@Override
public void setupTask(TaskAttemptContext context)
throws IOException {
if (LOG.isDebugEnabled()) {
LOG.debug("setupTask({})", HadoopToStringUtil.toString(context));
}
// BigQueryOutputCommitter's setupTask doesn't do anything, because the per-task temporary
// table is created on demand when the task writes its output.
}
/**
* Moves the files from the working dataset to the job output table.
*
* @param context the task context.
* @throws IOException on IO Error.
*/
@Override
public void commitTask(TaskAttemptContext context)
throws IOException {
if (LOG.isDebugEnabled()) {
LOG.debug("commitTask({})", HadoopToStringUtil.toString(context));
}
// Create a table copy request object.
JobConfigurationTableCopy copyTableConfig = new JobConfigurationTableCopy();
// Set the table to get results from.
copyTableConfig.setSourceTable(tempTableRef);
// Set the table to put results into.
copyTableConfig.setDestinationTable(finalTableRef);
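// Append rows to the destination table so that per-task copies accumulate rather than
// overwrite each other (BigQuery's WRITE_APPEND disposition).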
copyTableConfig.setWriteDisposition("WRITE_APPEND");
JobConfiguration config = new JobConfiguration();
config.setCopy(copyTableConfig);
JobReference jobReference = bigQueryHelper.createJobReference(
projectId, context.getTaskAttemptID().toString());
Job job = new Job();
job.setConfiguration(config);
job.setJobReference(jobReference);
// Run the job.
LOG.debug("commitTask: Running table copy from {} to {}",
BigQueryStrings.toString(tempTableRef), BigQueryStrings.toString(finalTableRef));
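// If a job with this JobReference was already inserted (for example by a retried attempt of
// this task), the existing job is fetched instead of failing, keeping the commit idempotent.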
Job response = bigQueryHelper.insertJobOrFetchDuplicate(projectId, job);
LOG.debug("Got response '{}'", response);
// Poll until job is complete.
try {
BigQueryUtils.waitForJobCompletion(
bigQueryHelper.getRawBigquery(), projectId, jobReference, context);
} catch (InterruptedException e) {
LOG.error("Could not check if results of task were transfered.", e);
throw new IOException("Could not check if results of task were transfered.", e);
}
LOG.info("Saved output of task to table '{}' using project '{}'",
BigQueryStrings.toString(finalTableRef), projectId);
}
/**
* Deletes the work table.
*
* @param context the task's context.
*/
@Override
public void abortTask(TaskAttemptContext context) {
if (LOG.isDebugEnabled()) {
LOG.debug("abortTask({})", HadoopToStringUtil.toString(context));
}
// Cleanup of per-task temporary tables will be performed at job cleanup time.
}
/**
* Did this task write any files into the working dataset?
*
* @param context the task's context.
* @throws IOException on IO Error.
*/
@Override
public boolean needsTaskCommit(TaskAttemptContext context)
throws IOException {
return needsTaskCommit(context.getTaskAttemptID());
}
/**
* Did this task write any files into the working dataset?
*
* @param attemptId the ID of the task attempt.
* @throws IOException on IO Error.
*/
@VisibleForTesting
public boolean needsTaskCommit(TaskAttemptID attemptId) throws IOException {
if (LOG.isDebugEnabled()) {
LOG.debug("needsTaskCommit({}) - tempTableRef: '{}'",
attemptId,
BigQueryStrings.toString(tempTableRef));
}
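// The per-task temporary table is only created once the task actually writes records, so its
// absence means there is nothing to copy and the commit can be skipped.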
boolean tableExists = bigQueryHelper.tableExists(tempTableRef);
LOG.debug("needsTaskCommit -> {}", tableExists);
return tableExists;
}
/**
* Sets the BigQueryHelper for testing purposes.
*/
@VisibleForTesting
void setBigQueryHelper(BigQueryHelper helper) {
this.bigQueryHelper = helper;
}
@VisibleForTesting
void setErrorExtractor(ApiErrorExtractor errorExtractor) {
this.errorExtractor = errorExtractor;
}
}