com.google.cloud.hadoop.io.bigquery.AbstractBigQueryInputFormat
bigquery-connector: Hadoop 2 MapReduce input and output formats for use with BigQuery
/*
* Copyright 2017 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the
* License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.cloud.hadoop.io.bigquery;
import static com.google.common.flogger.LazyArgs.lazy;
import com.google.api.services.bigquery.Bigquery;
import com.google.api.services.bigquery.model.Table;
import com.google.api.services.bigquery.model.TableReference;
import com.google.cloud.hadoop.util.ConfigurationUtil;
import com.google.cloud.hadoop.util.HadoopToStringUtil;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.flogger.GoogleLogger;
import java.io.IOException;
import java.security.GeneralSecurityException;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.JobID;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
/**
 * Abstract base class for BigQuery input formats. This class takes care of performing BigQuery
 * exports to temporary tables, exporting BigQuery tables to GCS, and cleaning up any files or
 * tables that either of those processes creates.
 *
 * @param <K> Key type
 * @param <V> Value type
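 *
 * <p>A minimal driver sketch (illustrative only): the project, dataset, table, and GCS path below
 * are placeholders, and the concrete subclass (here {@link GsonBigQueryInputFormat}) determines
 * the actual key and value types.
 *
 * <pre>{@code
 * Job job = Job.getInstance();
 * Configuration conf = job.getConfiguration();
 * AbstractBigQueryInputFormat.setInputTable(conf, "my-project", "my_dataset", "my_table");
 * AbstractBigQueryInputFormat.setTemporaryCloudStorageDirectory(conf, "gs://my-bucket/tmp");
 * job.setInputFormatClass(GsonBigQueryInputFormat.class);
 * // ... configure mapper, output, etc., then run the job ...
 * job.waitForCompletion(true);
 * AbstractBigQueryInputFormat.cleanupJob(conf, job.getJobID());
 * }</pre>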
*/
public abstract class AbstractBigQueryInputFormat<K, V>
    extends InputFormat<K, V> implements DelegateRecordReaderFactory<K, V> {
private static final GoogleLogger logger = GoogleLogger.forEnclosingClass();
/**
* Configuration key for InputFormat class name.
*/
public static final String INPUT_FORMAT_CLASS_KEY = "mapreduce.inputformat.class";
/**
 * The table type value that BigQuery reports for tables whose data is stored externally.
*/
public static final String EXTERNAL_TABLE_TYPE = "EXTERNAL";
// Used by UnshardedExportToCloudStorage
  private InputFormat<LongWritable, Text> delegateInputFormat;
/**
* Configure the BigQuery input table for a job
*/
public static void setInputTable(
Configuration configuration, String projectId, String datasetId, String tableId)
throws IOException {
BigQueryConfiguration.configureBigQueryInput(configuration, projectId, datasetId, tableId);
}
/**
* Configure the BigQuery input table for a job
*/
public static void setInputTable(Configuration configuration, TableReference tableReference)
throws IOException {
setInputTable(
configuration,
tableReference.getProjectId(),
tableReference.getDatasetId(),
tableReference.getTableId());
}
/**
* Configure a directory to which we will export BigQuery data
*/
public static void setTemporaryCloudStorageDirectory(Configuration configuration, String path) {
configuration.set(BigQueryConfiguration.TEMP_GCS_PATH_KEY, path);
}
/** Get the ExportFileFormat that this input format supports. */
public abstract ExportFileFormat getExportFileFormat();
@SuppressWarnings("unchecked")
protected static ExportFileFormat getExportFileFormat(Configuration configuration) {
    Class<? extends AbstractBigQueryInputFormat<?, ?>> clazz =
        (Class<? extends AbstractBigQueryInputFormat<?, ?>>)
            configuration.getClass(INPUT_FORMAT_CLASS_KEY, AbstractBigQueryInputFormat.class);
Preconditions.checkState(
AbstractBigQueryInputFormat.class.isAssignableFrom(clazz),
"Expected input format to derive from AbstractBigQueryInputFormat");
return getExportFileFormat(clazz);
}
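  // Reflectively instantiates the given input format class in order to ask it which export file
  // format it produces.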
  protected static ExportFileFormat getExportFileFormat(
      Class<? extends AbstractBigQueryInputFormat<?, ?>> clazz) {
    try {
      AbstractBigQueryInputFormat<?, ?> format = clazz.getConstructor().newInstance();
return format.getExportFileFormat();
} catch (ReflectiveOperationException e) {
throw new RuntimeException(e);
}
}
@Override
  public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
logger.atFine().log("getSplits(%s)", lazy(() -> HadoopToStringUtil.toString(context)));
final Configuration configuration = context.getConfiguration();
BigQueryHelper bigQueryHelper;
try {
bigQueryHelper = getBigQueryHelper(configuration);
} catch (GeneralSecurityException gse) {
throw new IOException("Failed to create BigQuery client", gse);
}
String exportPath =
BigQueryConfiguration.getTemporaryPathRoot(configuration, context.getJobID());
configuration.set(BigQueryConfiguration.TEMP_GCS_PATH_KEY, exportPath);
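    // Build the export: a no-op pass-through for external (federated) tables, otherwise an
    // unsharded export of the table into the temporary GCS path computed above.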
Export export = constructExport(
configuration,
getExportFileFormat(),
exportPath,
bigQueryHelper,
delegateInputFormat);
export.prepare();
// Invoke the export, maybe wait for it to complete.
try {
export.beginExport();
export.waitForUsableMapReduceInput();
} catch (IOException ie) {
throw new IOException("Error while exporting: " + HadoopToStringUtil.toString(context), ie);
}
    List<InputSplit> splits = export.getSplits(context);
if (logger.atFine().isEnabled()) {
      // Stringifying a really big list of splits can be expensive, so we guard with
      // atFine().isEnabled().
logger.atFine().log("getSplits -> %s", HadoopToStringUtil.toString(splits));
}
return splits;
}
@Override
  public RecordReader<K, V> createRecordReader(
InputSplit inputSplit, TaskAttemptContext taskAttemptContext)
throws IOException, InterruptedException {
return createRecordReader(inputSplit, taskAttemptContext.getConfiguration());
}
  public RecordReader<K, V> createRecordReader(
InputSplit inputSplit, Configuration configuration)
throws IOException, InterruptedException {
Preconditions.checkArgument(
inputSplit instanceof UnshardedInputSplit,
"Split should be instance of UnshardedInputSplit.");
logger.atFine().log("createRecordReader -> createDelegateRecordReader()");
return createDelegateRecordReader(inputSplit, configuration);
}
private static Export constructExport(
Configuration configuration,
ExportFileFormat format,
String exportPath,
BigQueryHelper bigQueryHelper,
      InputFormat<LongWritable, Text> delegateInputFormat)
throws IOException {
logger.atFine().log("constructExport() with export path %s", exportPath);
// Extract relevant configuration settings.
    Map<String, String> mandatoryConfig = ConfigurationUtil.getMandatoryConfig(
configuration, BigQueryConfiguration.MANDATORY_CONFIG_PROPERTIES_INPUT);
String jobProjectId = mandatoryConfig.get(BigQueryConfiguration.PROJECT_ID_KEY);
String inputProjectId = mandatoryConfig.get(BigQueryConfiguration.INPUT_PROJECT_ID_KEY);
String datasetId = mandatoryConfig.get(BigQueryConfiguration.INPUT_DATASET_ID_KEY);
String tableName = mandatoryConfig.get(BigQueryConfiguration.INPUT_TABLE_ID_KEY);
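    // The table being read may live in a different project (inputProjectId) than the project
    // under which the export job itself runs (jobProjectId).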
TableReference exportTableReference = new TableReference()
.setDatasetId(datasetId)
.setProjectId(inputProjectId)
.setTableId(tableName);
Table table = bigQueryHelper.getTable(exportTableReference);
if (EXTERNAL_TABLE_TYPE.equals(table.getType())) {
logger.atInfo().log("Table is already external, so skipping export");
return new NoopFederatedExportToCloudStorage(
configuration, format, bigQueryHelper, jobProjectId, table, delegateInputFormat);
}
return new UnshardedExportToCloudStorage(
configuration,
exportPath,
format,
bigQueryHelper,
jobProjectId,
table,
delegateInputFormat);
}
/**
 * Cleans up the temporary resources associated with a job that used the GsonBigQueryInputFormat;
 * this should be called explicitly after the entire job completes. If the input was configured
 * with a BigQuery "query" (and therefore an intermediate export table), that table is cleaned up
 * as well, along with the GCS directory into which BigQuery exported its files for reading.
*/
public static void cleanupJob(Configuration configuration, JobID jobId) throws IOException {
String exportPathRoot = BigQueryConfiguration.getTemporaryPathRoot(configuration, jobId);
configuration.set(BigQueryConfiguration.TEMP_GCS_PATH_KEY, exportPathRoot);
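    // Create a BigQuery client from the job configuration and delegate to the helper-based
    // overload below, which performs the actual cleanup.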
Bigquery bigquery = null;
try {
bigquery = new BigQueryFactory().getBigQuery(configuration);
} catch (GeneralSecurityException gse) {
throw new IOException("Failed to create Bigquery client", gse);
}
cleanupJob(new BigQueryHelper(bigquery), configuration);
}
/**
* Similar to {@link #cleanupJob(Configuration, JobID)}, but allows specifying the Bigquery
* instance to use.
*
* @param bigQueryHelper The Bigquery API-client helper instance to use.
* @param config The job Configuration object which contains settings such as whether sharded
* export was enabled, which GCS directory the export was performed in, etc.
*/
public static void cleanupJob(BigQueryHelper bigQueryHelper, Configuration config)
throws IOException {
logger.atFine().log("cleanupJob(Bigquery, Configuration)");
String gcsPath = ConfigurationUtil.getMandatoryConfig(
config, BigQueryConfiguration.TEMP_GCS_PATH_KEY);
Export export = constructExport(
config, getExportFileFormat(config), gcsPath, bigQueryHelper, null);
try {
export.cleanupExport();
} catch (IOException ioe) {
      // The error is swallowed because the job has already completed successfully and the only
      // failure is in deleting temporary data. This matches the FileOutputCommitter pattern.
logger.atWarning().withCause(ioe).log(
"Could not delete intermediate data from BigQuery export");
}
}
/**
* Helper method to override for testing.
*
* @return Bigquery.
* @throws IOException on IO Error.
* @throws GeneralSecurityException on security exception.
*/
protected Bigquery getBigQuery(Configuration config)
throws GeneralSecurityException, IOException {
BigQueryFactory factory = new BigQueryFactory();
return factory.getBigQuery(config);
}
/**
* Helper method to override for testing.
*/
protected BigQueryHelper getBigQueryHelper(Configuration config)
throws GeneralSecurityException, IOException {
BigQueryFactory factory = new BigQueryFactory();
return factory.getBigQueryHelper(config);
}
@VisibleForTesting
  void setDelegateInputFormat(InputFormat<LongWritable, Text> inputFormat) {
delegateInputFormat = inputFormat;
}
}
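// A minimal subclass sketch (not part of this library): it shows what the two abstract hooks,
// getExportFileFormat() and createDelegateRecordReader(), are responsible for. The use of
// ExportFileFormat.LINE_DELIMITED_JSON and Hadoop's LineRecordReader as the delegate reader is an
// assumption for illustration only; real subclasses such as GsonBigQueryInputFormat parse each
// exported line into a richer value type.
class LineBasedBigQueryInputFormat extends AbstractBigQueryInputFormat<LongWritable, Text> {
  @Override
  public ExportFileFormat getExportFileFormat() {
    // Export the table as newline-delimited JSON files on GCS (assumed enum constant).
    return ExportFileFormat.LINE_DELIMITED_JSON;
  }

  @Override
  public RecordReader<LongWritable, Text> createDelegateRecordReader(
      InputSplit split, Configuration configuration) throws IOException, InterruptedException {
    // Hand each exported file to a plain line reader; the value is the raw exported line.
    return new org.apache.hadoop.mapreduce.lib.input.LineRecordReader();
  }
}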