/*
* Copyright (C) 2015 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package com.google.cloud.dataflow.sdk.util;
import com.google.api.client.util.BackOff;
import com.google.api.client.util.BackOffUtils;
import com.google.api.client.util.ExponentialBackOff;
import com.google.api.client.util.Sleeper;
import com.google.api.services.bigquery.Bigquery;
import com.google.api.services.bigquery.model.Table;
import com.google.api.services.bigquery.model.TableDataInsertAllRequest;
import com.google.api.services.bigquery.model.TableDataInsertAllResponse;
import com.google.api.services.bigquery.model.TableDataList;
import com.google.api.services.bigquery.model.TableReference;
import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
import com.google.cloud.dataflow.sdk.io.BigQueryIO;
import com.google.cloud.dataflow.sdk.io.BigQueryIO.Write.CreateDisposition;
import com.google.cloud.dataflow.sdk.io.BigQueryIO.Write.WriteDisposition;
import com.google.cloud.dataflow.sdk.transforms.Aggregator;
import com.google.cloud.hadoop.util.ApiErrorExtractor;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.base.Throwables;
import com.google.common.util.concurrent.MoreExecutors;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import javax.annotation.Nullable;
/**
* Inserts rows into BigQuery.
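 *
 * <p>A minimal usage sketch (the client construction and the table coordinates
 * below are illustrative assumptions, not prescribed by this class):
 *
 * <pre>{@code
 * Bigquery client = ...; // an authorized Bigquery client
 * TableReference ref = new TableReference()
 *     .setProjectId("my-project")
 *     .setDatasetId("my_dataset")
 *     .setTableId("my_table");
 * BigQueryTableInserter inserter = new BigQueryTableInserter(client);
 * inserter.insertAll(ref, Arrays.asList(new TableRow().set("name", "Alice")));
 * }</pre>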
*/
public class BigQueryTableInserter {
private static final Logger LOG = LoggerFactory.getLogger(BigQueryTableInserter.class);
// Approximate amount of table data to upload per InsertAll request.
private static final long UPLOAD_BATCH_SIZE_BYTES = 64 * 1024;
// The maximum number of rows to upload per InsertAll request.
private static final long MAX_ROWS_PER_BATCH = 500;
// The maximum number of times to retry inserting rows into BigQuery.
private static final int MAX_INSERT_ATTEMPTS = 5;
// The initial backoff after a failure inserting rows into BigQuery.
private static final long INITIAL_INSERT_BACKOFF_INTERVAL_MS = 200L;
private final Bigquery client;
private final TableReference defaultRef;
private final long maxRowsPerBatch;
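  // Shared thread pool for concurrent insertAll batches; the exiting wrapper
  // shuts the pool down on JVM exit, waiting up to 10 seconds for in-flight tasks.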
private static final ExecutorService executor = MoreExecutors.getExitingExecutorService(
(ThreadPoolExecutor) Executors.newFixedThreadPool(100), 10, TimeUnit.SECONDS);
/**
* Constructs a new row inserter.
*
* @param client a BigQuery client
*/
public BigQueryTableInserter(Bigquery client) {
this.client = client;
this.defaultRef = null;
this.maxRowsPerBatch = MAX_ROWS_PER_BATCH;
}
/**
* Constructs a new row inserter.
*
* @param client a BigQuery client
* @param defaultRef identifies the table to insert into
* @deprecated replaced by {@link #BigQueryTableInserter(Bigquery)}
*/
@Deprecated
public BigQueryTableInserter(Bigquery client, TableReference defaultRef) {
this.client = client;
this.defaultRef = defaultRef;
this.maxRowsPerBatch = MAX_ROWS_PER_BATCH;
}
/**
* Constructs a new row inserter.
*
   * @param client a BigQuery client
   * @param maxRowsPerBatch the maximum number of rows to insert per call to BigQuery
*/
public BigQueryTableInserter(Bigquery client, int maxRowsPerBatch) {
this.client = client;
this.defaultRef = null;
this.maxRowsPerBatch = maxRowsPerBatch;
}
/**
* Constructs a new row inserter.
*
* @param client a BigQuery client
   * @param defaultRef identifies the default table to insert into
   * @param maxRowsPerBatch the maximum number of rows to insert per call to BigQuery
* @deprecated replaced by {@link #BigQueryTableInserter(Bigquery, int)}
*/
@Deprecated
public BigQueryTableInserter(Bigquery client, TableReference defaultRef, int maxRowsPerBatch) {
this.client = client;
this.defaultRef = defaultRef;
this.maxRowsPerBatch = maxRowsPerBatch;
}
/**
* Insert all rows from the given list.
*
* @deprecated replaced by {@link #insertAll(TableReference, List)}
*/
@Deprecated
  public void insertAll(List<TableRow> rowList) throws IOException {
insertAll(defaultRef, rowList, null, null);
}
/**
* Insert all rows from the given list using specified insertIds if not null.
*
* @deprecated replaced by {@link #insertAll(TableReference, List, List)}
*/
@Deprecated
  public void insertAll(List<TableRow> rowList,
      @Nullable List<String> insertIdList) throws IOException {
insertAll(defaultRef, rowList, insertIdList, null);
}
/**
* Insert all rows from the given list.
*/
  public void insertAll(TableReference ref, List<TableRow> rowList) throws IOException {
insertAll(ref, rowList, null, null);
}
/**
* Insert all rows from the given list using specified insertIds if not null. Track count of
* bytes written with the Aggregator.
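   *
   * <p>A sketch of supplying per-row insert ids for BigQuery's best-effort
   * de-duplication (the ids here are illustrative; {@code ref} is assumed to
   * be defined by the caller):
   *
   * <pre>{@code
   * List<TableRow> rows = Arrays.asList(
   *     new TableRow().set("k", 1), new TableRow().set("k", 2));
   * List<String> ids = Arrays.asList("id-1", "id-2"); // one id per row
   * inserter.insertAll(ref, rows, ids, null); // null: no byte-count aggregator
   * }</pre>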
*/
  public void insertAll(TableReference ref, List<TableRow> rowList,
      @Nullable List<String> insertIdList, Aggregator<Long, Long> byteCountAggregator)
throws IOException {
Preconditions.checkNotNull(ref, "ref");
if (insertIdList != null && rowList.size() != insertIdList.size()) {
      throw new AssertionError("If insertIdList is not null it needs to have the same "
          + "number of elements as rowList");
}
AttemptBoundedExponentialBackOff backoff = new AttemptBoundedExponentialBackOff(
MAX_INSERT_ATTEMPTS,
INITIAL_INSERT_BACKOFF_INTERVAL_MS);
    List<TableDataInsertAllResponse.InsertErrors> allErrors = new ArrayList<>();
    // These lists contain the rows to publish. Initially they contain the entire list.
    // If there are failures, they will contain only the failed rows to be retried.
    List<TableRow> rowsToPublish = rowList;
    List<String> idsToPublish = insertIdList;
while (true) {
      List<TableRow> retryRows = new ArrayList<>();
      List<String> retryIds = (idsToPublish != null) ? new ArrayList<String>() : null;
int strideIndex = 0;
// Upload in batches.
      List<TableDataInsertAllRequest.Rows> rows = new LinkedList<>();
int dataSize = 0;
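      // Each future holds the per-row insert errors for one batch. strideIndices
      // records the absolute offset of each batch's first row, so batch-relative
      // error indices can be mapped back to positions in rowsToPublish.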
      List<Future<List<TableDataInsertAllResponse.InsertErrors>>> futures = new ArrayList<>();
      List<Integer> strideIndices = new ArrayList<>();
for (int i = 0; i < rowsToPublish.size(); ++i) {
TableRow row = rowsToPublish.get(i);
TableDataInsertAllRequest.Rows out = new TableDataInsertAllRequest.Rows();
if (idsToPublish != null) {
out.setInsertId(idsToPublish.get(i));
}
out.setJson(row.getUnknownKeys());
rows.add(out);
dataSize += row.toString().length();
if (dataSize >= UPLOAD_BATCH_SIZE_BYTES || rows.size() >= maxRowsPerBatch ||
i == rowsToPublish.size() - 1) {
TableDataInsertAllRequest content = new TableDataInsertAllRequest();
content.setRows(rows);
final Bigquery.Tabledata.InsertAll insert = client.tabledata()
.insertAll(ref.getProjectId(), ref.getDatasetId(), ref.getTableId(),
content);
futures.add(
              executor.submit(new Callable<List<TableDataInsertAllResponse.InsertErrors>>() {
@Override
                public List<TableDataInsertAllResponse.InsertErrors> call() throws IOException {
return insert.execute().getInsertErrors();
}
}));
strideIndices.add(strideIndex);
if (byteCountAggregator != null) {
byteCountAggregator.addValue(Long.valueOf(dataSize));
}
dataSize = 0;
strideIndex = i + 1;
rows = new LinkedList<>();
}
}
try {
for (int i = 0; i < futures.size(); i++) {
          List<TableDataInsertAllResponse.InsertErrors> errors = futures.get(i).get();
if (errors != null) {
for (TableDataInsertAllResponse.InsertErrors error : errors) {
allErrors.add(error);
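            // An error without a row index applies to the whole request and
            // cannot be retried row-by-row, so fail immediately.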
if (error.getIndex() == null) {
throw new IOException("Insert failed: " + allErrors);
}
int errorIndex = error.getIndex().intValue() + strideIndices.get(i);
retryRows.add(rowsToPublish.get(errorIndex));
if (retryIds != null) {
retryIds.add(idsToPublish.get(errorIndex));
}
}
}
}
} catch (InterruptedException e) {
throw new IOException("Interrupted while inserting " + rowsToPublish);
} catch (ExecutionException e) {
Throwables.propagate(e.getCause());
}
if (!allErrors.isEmpty() && !backoff.atMaxAttempts()) {
try {
Thread.sleep(backoff.nextBackOffMillis());
} catch (InterruptedException e) {
throw new IOException("Interrupted while waiting before retrying insert of " + retryRows);
}
LOG.info("Retrying failed inserts to BigQuery");
rowsToPublish = retryRows;
idsToPublish = retryIds;
allErrors.clear();
} else {
break;
}
}
if (!allErrors.isEmpty()) {
throw new IOException("Insert failed: " + allErrors);
}
}
  /**
   * Retrieves or creates the table.
   *
   * <p>The table is checked to conform to insertion requirements as specified
   * by WriteDisposition and CreateDisposition.
   *
   * <p>If table truncation is requested (WriteDisposition.WRITE_TRUNCATE), then
   * this will re-create the table if necessary to ensure it is empty.
   *
   * <p>If an empty table is required (WriteDisposition.WRITE_EMPTY), then this
   * will fail if the table exists and is not empty.
   *
   * <p>When constructing a table, a {@code TableSchema} must be available. If a
   * schema is provided, then it will be used. If no schema is provided, but
   * an existing table is being cleared (WRITE_TRUNCATE option above), then
   * the existing schema will be re-used. If no schema is available, then an
   * {@code IOException} is thrown.
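   *
   * <p>A minimal usage sketch (the dispositions shown are one common combination;
   * {@code ref} and {@code schema} are assumed to be supplied by the caller):
   *
   * <pre>{@code
   * Table table = inserter.getOrCreateTable(
   *     ref,
   *     WriteDisposition.WRITE_APPEND,
   *     CreateDisposition.CREATE_IF_NEEDED,
   *     schema);
   * }</pre>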
*/
public Table getOrCreateTable(
TableReference ref,
WriteDisposition writeDisposition,
CreateDisposition createDisposition,
@Nullable TableSchema schema) throws IOException {
// Check if table already exists.
Bigquery.Tables.Get get = client.tables()
.get(ref.getProjectId(), ref.getDatasetId(), ref.getTableId());
Table table = null;
try {
table = get.execute();
} catch (IOException e) {
ApiErrorExtractor errorExtractor = new ApiErrorExtractor();
if (!errorExtractor.itemNotFound(e) ||
createDisposition != CreateDisposition.CREATE_IF_NEEDED) {
// Rethrow.
throw e;
}
}
// If we want an empty table, and it isn't, then delete it first.
if (table != null) {
if (writeDisposition == WriteDisposition.WRITE_APPEND) {
return table;
}
boolean empty = isEmpty(ref);
if (empty) {
if (writeDisposition == WriteDisposition.WRITE_TRUNCATE) {
LOG.info("Empty table found, not removing {}", BigQueryIO.toTableSpec(ref));
}
return table;
} else if (writeDisposition == WriteDisposition.WRITE_EMPTY) {
throw new IOException("WriteDisposition is WRITE_EMPTY, "
+ "but table is not empty");
}
// Reuse the existing schema if none was provided.
if (schema == null) {
schema = table.getSchema();
}
// Delete table and fall through to re-creating it below.
LOG.info("Deleting table {}", BigQueryIO.toTableSpec(ref));
Bigquery.Tables.Delete delete = client.tables()
.delete(ref.getProjectId(), ref.getDatasetId(), ref.getTableId());
delete.execute();
}
if (schema == null) {
throw new IllegalArgumentException(
"Table schema required for new table.");
}
// Create the table.
return tryCreateTable(ref, schema);
}
/**
* Checks if a table is empty.
*/
public boolean isEmpty(TableReference ref) throws IOException {
Bigquery.Tabledata.List list = client.tabledata()
.list(ref.getProjectId(), ref.getDatasetId(), ref.getTableId());
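    // A single row is enough to decide emptiness, so cap the page size at 1.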
list.setMaxResults(1L);
TableDataList dataList = list.execute();
return dataList.getRows() == null || dataList.getRows().isEmpty();
}
/**
* Retry table creation up to 5 minutes (with exponential backoff) when this user is near the
* quota for table creation. This relatively innocuous behavior can happen when BigQueryIO is
* configured with a table spec function to use different tables for each window.
*/
private static final int RETRY_CREATE_TABLE_DURATION_MILLIS = (int) TimeUnit.MINUTES.toMillis(5);
/**
* Tries to create the BigQuery table.
* If a table with the same name already exists in the dataset, the table
* creation fails, and the function returns null. In such a case,
* the existing table doesn't necessarily have the same schema as specified
* by the parameter.
*
* @param schema Schema of the new BigQuery table.
* @return The newly created BigQuery table information, or null if the table
* with the same name already exists.
   * @throws IOException if an error other than the table already existing occurs.
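   *
   * <p>For example (a sketch; {@code ref} and {@code schema} are assumed to be
   * defined by the caller):
   *
   * <pre>{@code
   * Table created = inserter.tryCreateTable(ref, schema);
   * if (created == null) {
   *   // A table with this name already exists; its schema may differ.
   * }
   * }</pre>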
*/
@Nullable
public Table tryCreateTable(TableReference ref, TableSchema schema) throws IOException {
LOG.info("Trying to create BigQuery table: {}", BigQueryIO.toTableSpec(ref));
BackOff backoff =
new ExponentialBackOff.Builder()
.setMaxElapsedTimeMillis(RETRY_CREATE_TABLE_DURATION_MILLIS)
.build();
Table table = new Table().setTableReference(ref).setSchema(schema);
return tryCreateTable(table, ref.getProjectId(), ref.getDatasetId(), backoff, Sleeper.DEFAULT);
}
@VisibleForTesting
@Nullable
Table tryCreateTable(
Table table, String projectId, String datasetId, BackOff backoff, Sleeper sleeper)
throws IOException {
boolean retry = false;
while (true) {
try {
return client.tables().insert(projectId, datasetId, table).execute();
} catch (IOException e) {
ApiErrorExtractor extractor = new ApiErrorExtractor();
if (extractor.itemAlreadyExists(e)) {
// The table already exists, nothing to return.
return null;
} else if (extractor.rateLimited(e)) {
// The request failed because we hit a temporary quota. Back off and try again.
try {
if (BackOffUtils.next(sleeper, backoff)) {
if (!retry) {
LOG.info(
"Quota limit reached when creating table {}:{}.{}, retrying up to {} minutes",
projectId,
datasetId,
table.getTableReference().getTableId(),
TimeUnit.MILLISECONDS.toSeconds(RETRY_CREATE_TABLE_DURATION_MILLIS) / 60.0);
retry = true;
}
continue;
}
} catch (InterruptedException e1) {
// Restore interrupted state and throw the last failure.
Thread.currentThread().interrupt();
throw e;
}
}
throw e;
}
}
}
}