package com.google.cloud.hadoop.io.bigquery;
import com.google.api.services.bigquery.Bigquery;
import com.google.api.services.bigquery.Bigquery.Jobs.Insert;
import com.google.api.services.bigquery.model.Dataset;
import com.google.api.services.bigquery.model.DatasetReference;
import com.google.api.services.bigquery.model.Job;
import com.google.api.services.bigquery.model.JobConfiguration;
import com.google.api.services.bigquery.model.JobConfigurationTableCopy;
import com.google.api.services.bigquery.model.JobReference;
import com.google.api.services.bigquery.model.TableReference;
import com.google.cloud.hadoop.util.ApiErrorExtractor;
import com.google.cloud.hadoop.util.HadoopToStringUtil;
import com.google.cloud.hadoop.util.LogUtil;
import com.google.common.annotations.VisibleForTesting;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import java.io.IOException;
import java.security.GeneralSecurityException;
/**
* An OutputCommitter that commits the job's output tables to BigQuery. Its methods are invoked
* before the job starts, after task completion, after job completion, and when a task or the job
* is aborted.
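*
* <p>The MapReduce framework (not user code) normally drives these calls. A simplified sketch of
* the lifecycle, assuming the connector's output format has already configured the temporary and
* final tables:
*
* <pre>{@code
* committer.setupJob(jobContext);       // creates the temporary dataset
* committer.setupTask(taskContext);     // no-op; task tables are created on demand
* // ... the task writes its records into a per-task temporary table ...
* if (committer.needsTaskCommit(taskContext)) {
*   committer.commitTask(taskContext);  // copies the task's temporary table into the final table
* }
* committer.commitJob(jobContext);      // deletes the temporary dataset
* }</pre>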
*/
public class BigQueryOutputCommitter
extends OutputCommitter {
// Logger.
protected static final LogUtil log = new LogUtil(BigQueryOutputCommitter.class);
// Used for specialized handling of various API-defined exceptions.
private ApiErrorExtractor errorExtractor = new ApiErrorExtractor();
// Id of the project under which all connector operations occur.
private String projectId;
// Fully-qualified id of the temporary table the connector writes into.
private TableReference tempTableRef;
// Fully-qualified id of the final destination table we desire the output to go to.
private TableReference finalTableRef;
// Bigquery connection.
private Bigquery bigquery;
/**
* Creates a BigQuery output committer.
*
* @param projectId the job's project id.
* @param tempTableRef the fully-qualified temp table to write to.
* @param finalTableRef the fully-qualified destination table on commit.
* @param configuration the task's configuration
* @throws IOException on IO Error.
*/
public BigQueryOutputCommitter(
String projectId, TableReference tempTableRef,
TableReference finalTableRef, Configuration configuration)
throws IOException {
this.projectId = projectId;
this.tempTableRef = tempTableRef;
this.finalTableRef = finalTableRef;
// Get Bigquery.
try {
BigQueryFactory bigQueryFactory = new BigQueryFactory();
this.bigquery = bigQueryFactory.getBigQuery(configuration);
} catch (GeneralSecurityException e) {
log.error("Could not get Bigquery", e);
throw new IOException("Could not get Bigquery", e);
}
}
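// Illustrative construction only (in practice the connector's output format creates the
// committer). The table names are hypothetical, and BigQueryStrings.parseTableReference is
// assumed to accept the "project:dataset.table" form used elsewhere in this package:
//   TableReference temp = BigQueryStrings.parseTableReference("my-project:temp_dataset.attempt_1");
//   TableReference dest = BigQueryStrings.parseTableReference("my-project:output_dataset.results");
//   OutputCommitter committer =
//       new BigQueryOutputCommitter("my-project", temp, dest, new Configuration());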
/**
* Creates the temporary dataset that will contain all of the task work tables.
*
* @param context the job's context.
* @throws IOException on IO Error.
*/
@Override
public void setupJob(JobContext context)
throws IOException {
if (log.isDebugEnabled()) {
log.debug("setupJob(%s)", HadoopToStringUtil.toString(context));
}
// Create dataset.
DatasetReference datasetReference = new DatasetReference();
datasetReference.setProjectId(tempTableRef.getProjectId());
datasetReference.setDatasetId(tempTableRef.getDatasetId());
Dataset tempDataset = new Dataset();
tempDataset.setDatasetReference(datasetReference);
// Insert dataset into Bigquery.
Bigquery.Datasets datasets = bigquery.datasets();
// TODO(user): Maybe allow the dataset to exist already instead of throwing 409 here; a possible
// approach is sketched in the comment after this method.
log.debug("Creating temporary dataset '%s' for project '%s'",
tempTableRef.getDatasetId(), tempTableRef.getProjectId());
// NB: Even though this "insert" makes it look like we can specify a different projectId than
// the one which owns the dataset, it actually has to match.
datasets.insert(tempTableRef.getProjectId(), tempDataset).execute();
}
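// A possible shape for the TODO in setupJob (untested sketch): tolerate a dataset that already
// exists by classifying the insert failure with ApiErrorExtractor, assuming itemAlreadyExists()
// recognizes the 409 returned by datasets.insert:
//   try {
//     datasets.insert(tempTableRef.getProjectId(), tempDataset).execute();
//   } catch (IOException e) {
//     if (!errorExtractor.itemAlreadyExists(e)) {
//       throw e;
//     }
//   }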
/**
* Deletes the temporary dataset, including all of the work tables.
*
* @param context the job's context.
* @throws IOException on IO Error.
*/
@Override
public void cleanupJob(JobContext context)
throws IOException {
if (log.isDebugEnabled()) {
log.debug("cleanupJob(%s)", HadoopToStringUtil.toString(context));
}
Bigquery.Datasets datasets = bigquery.datasets();
try {
log.debug("cleanupJob: Deleting dataset '%s' from project '%s'",
tempTableRef.getDatasetId(), tempTableRef.getProjectId());
datasets.delete(tempTableRef.getProjectId(), tempTableRef.getDatasetId())
.setDeleteContents(true)
.execute();
} catch (IOException e) {
// The error is swallowed because the job outcome has already been decided and the only failure
// here is deleting temporary data. This matches the FileOutputCommitter pattern.
log.warn("Could not delete dataset. Temporary data not cleaned up.", e);
}
}
/**
* For cleaning up the job's output after job failure.
*
* @param jobContext Context of the job whose output is being written.
* @param status Final run state of the job, should be JobStatus.KILLED or JobStatus.FAILED.
* @throws IOException on IO Error.
*/
public void abortJob(JobContext jobContext, int status)
throws IOException {
if (log.isDebugEnabled()) {
log.debug("abortJob(%s, %d)", HadoopToStringUtil.toString(jobContext), status);
}
cleanupJob(jobContext);
}
/**
* For committing job's output after successful job completion. Note that this is invoked for jobs
* with final run state as JobStatus.SUCCEEDED.
*
* @param jobContext Context of the job whose output is being written.
* @throws IOException on IO Error.
*/
@Override
public void commitJob(JobContext jobContext)
throws IOException {
if (log.isDebugEnabled()) {
log.debug("commitJob(%s)", HadoopToStringUtil.toString(jobContext));
}
cleanupJob(jobContext);
}
/**
* No task setup required.
*
* @throws IOException on IO Error.
*/
@Override
public void setupTask(TaskAttemptContext context)
throws IOException {
if (log.isDebugEnabled()) {
log.debug("setupTask(%s)", HadoopToStringUtil.toString(context));
}
// BigQueryOutputCommitter's setupTask doesn't do anything, because the temporary task table is
// created on demand when the task writes its output.
}
/**
* Copies this task's temporary table from the working dataset into the job's final output table.
*
* @param context the task context.
* @throws IOException on IO Error.
*/
@Override
public void commitTask(TaskAttemptContext context)
throws IOException {
if (log.isDebugEnabled()) {
log.debug("commitTask(%s)", HadoopToStringUtil.toString(context));
}
// Create a table copy request object.
JobConfigurationTableCopy copyTableConfig = new JobConfigurationTableCopy();
// Set the table to get results from.
copyTableConfig.setSourceTable(tempTableRef);
// Set the table to put results into.
copyTableConfig.setDestinationTable(finalTableRef);
copyTableConfig.setWriteDisposition("WRITE_APPEND");
JobConfiguration config = new JobConfiguration();
config.setCopy(copyTableConfig);
Job job = new Job();
job.setConfiguration(config);
// Run the job.
log.debug("commitTask: Running table copy from %s to %s",
BigQueryStrings.toString(tempTableRef), BigQueryStrings.toString(finalTableRef));
Insert insert = bigquery.jobs().insert(projectId, job);
JobReference jobReference = insert.execute().getJobReference();
// Poll until job is complete.
try {
BigQueryUtils.waitForJobCompletion(bigquery, projectId, jobReference, context);
} catch (InterruptedException e) {
log.error("Could not check if results of task were transfered.", e);
throw new IOException("Could not check if results of task were transfered.", e);
}
log.info("Saved output of task to table '%s' using project '%s'",
BigQueryStrings.toString(finalTableRef), projectId);
}
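// For reference, the copy request assembled above corresponds to a BigQuery jobs.insert body of
// roughly this shape (illustrative values):
//   {
//     "configuration": {
//       "copy": {
//         "sourceTable":      {"projectId": "...", "datasetId": "temp_dataset", "tableId": "task_0"},
//         "destinationTable": {"projectId": "...", "datasetId": "output_dataset", "tableId": "results"},
//         "writeDisposition": "WRITE_APPEND"
//       }
//     }
//   }
// WRITE_APPEND appends rows to the destination table; the API also accepts WRITE_TRUNCATE and
// WRITE_EMPTY, but appending is what lets multiple tasks contribute to the same output table.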
/**
* Aborts the task. Cleanup of the task's work table is deferred to job cleanup.
*
* @param context the task's context.
*/
@Override
public void abortTask(TaskAttemptContext context) {
if (log.isDebugEnabled()) {
log.debug("abortTask(%s)", HadoopToStringUtil.toString(context));
}
// Cleanup of per-task temporary tables will be performed at job cleanup time.
}
/**
* Did this task write any data into its temporary table in the working dataset?
*
* @param context the task's context.
* @throws IOException on IO Error.
*/
@Override
public boolean needsTaskCommit(TaskAttemptContext context)
throws IOException {
return needsTaskCommit(context.getTaskAttemptID());
}
/**
* Did this task write any data into its temporary table in the working dataset?
*
* @param attemptId the ID of the task attempt to check.
* @throws IOException on IO Error.
*/
@VisibleForTesting
public boolean needsTaskCommit(TaskAttemptID attemptId) throws IOException {
if (log.isDebugEnabled()) {
log.debug("needsTaskCommit(%s) - tempTableRef: '%s'",
attemptId,
BigQueryStrings.toString(tempTableRef));
}
// TODO(user): Remove ApiErrorExtractor from this class once we push all the bigquery
// interactions down into BigQueryHelper exclusively.
BigQueryHelper bigqueryHelper = new BigQueryHelper(bigquery);
bigqueryHelper.setErrorExtractor(errorExtractor);
boolean tableExists = bigqueryHelper.tableExists(tempTableRef);
log.debug("needsTaskCommit -> %s", tableExists);
return tableExists;
}
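// BigQueryHelper.tableExists amounts to a tables().get() call that maps a "not found" error to
// false. Roughly (sketch, assuming ApiErrorExtractor.itemNotFound() recognizes the 404):
//   try {
//     bigquery.tables().get(
//         tempTableRef.getProjectId(), tempTableRef.getDatasetId(), tempTableRef.getTableId())
//         .execute();
//     return true;
//   } catch (IOException e) {
//     if (errorExtractor.itemNotFound(e)) {
//       return false;
//     }
//     throw e;
//   }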
/**
* Sets Bigquery for testing purposes.
*/
@VisibleForTesting
void setBigquery(Bigquery bigquery) {
this.bigquery = bigquery;
}
@VisibleForTesting
void setErrorExtractor(ApiErrorExtractor errorExtractor) {
this.errorExtractor = errorExtractor;
}
}