com.google.cloud.dataflow.sdk.util.BigQueryTableRowIterator Maven / Gradle / Ivy

The Google Cloud Dataflow Java SDK provides a simple, Java-based interface for processing data of virtually any size using Google Cloud resources. This artifact includes the entire Dataflow Java SDK.

/*
 * Copyright (C) 2015 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package com.google.cloud.dataflow.sdk.util;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Preconditions.checkState;

import com.google.api.client.googleapis.services.AbstractGoogleClientRequest;
import com.google.api.client.util.BackOff;
import com.google.api.client.util.BackOffUtils;
import com.google.api.client.util.ClassInfo;
import com.google.api.client.util.Data;
import com.google.api.client.util.Sleeper;
import com.google.api.services.bigquery.Bigquery;
import com.google.api.services.bigquery.model.Dataset;
import com.google.api.services.bigquery.model.DatasetReference;
import com.google.api.services.bigquery.model.ErrorProto;
import com.google.api.services.bigquery.model.Job;
import com.google.api.services.bigquery.model.JobConfiguration;
import com.google.api.services.bigquery.model.JobConfigurationQuery;
import com.google.api.services.bigquery.model.JobReference;
import com.google.api.services.bigquery.model.JobStatistics;
import com.google.api.services.bigquery.model.JobStatus;
import com.google.api.services.bigquery.model.Table;
import com.google.api.services.bigquery.model.TableCell;
import com.google.api.services.bigquery.model.TableDataList;
import com.google.api.services.bigquery.model.TableFieldSchema;
import com.google.api.services.bigquery.model.TableReference;
import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
import com.google.common.base.MoreObjects;
import com.google.common.collect.ImmutableList;
import com.google.common.util.concurrent.Uninterruptibles;

import org.joda.time.Duration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Objects;
import java.util.Random;
import java.util.concurrent.TimeUnit;

import javax.annotation.Nullable;

/**
 * Iterates over all rows in a table.
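 *
 * <p>A typical read loop, as a minimal sketch ({@code bigquery} and {@code tableRef} are
 * placeholders for an authorized {@link Bigquery} client and an existing table):
 *
 * <pre>{@code
 * try (BigQueryTableRowIterator iterator =
 *     BigQueryTableRowIterator.fromTable(tableRef, bigquery)) {
 *   iterator.open();
 *   while (iterator.advance()) {
 *     TableRow row = iterator.getCurrent();
 *     // ... process row ...
 *   }
 * }
 * }</pre>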
 */
public class BigQueryTableRowIterator implements AutoCloseable {
  private static final Logger LOG = LoggerFactory.getLogger(BigQueryTableRowIterator.class);

  @Nullable private TableReference ref;
  @Nullable private final String projectId;
  @Nullable private TableSchema schema;
  private final Bigquery client;
  private String pageToken;
  private Iterator<TableRow> iteratorOverCurrentBatch;
  private TableRow current;
  // Set true when the final page is seen from the service.
  private boolean lastPage = false;

  // The maximum number of times a BigQuery request will be retried
  private static final int MAX_RETRIES = 3;
  // Initial wait time for the backoff implementation
  private static final Duration INITIAL_BACKOFF_TIME = Duration.standardSeconds(1);
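  // Taken together, these constants mean each request is attempted at most MAX_RETRIES + 1
  // times, with delays between attempts that start at INITIAL_BACKOFF_TIME and grow roughly
  // exponentially (see executeWithBackOff below).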

  // After submitting a query to the BigQuery service, we poll the service at this interval to
  // check the status of the query's execution job.
  private static final Duration QUERY_COMPLETION_POLL_TIME = Duration.standardSeconds(1);

  private final String query;
  // Whether to flatten query results.
  private final boolean flattenResults;
  // Whether to use the BigQuery legacy SQL dialect.
  private final boolean useLegacySql;
  // Temporary dataset used to store query results.
  private String temporaryDatasetId = null;
  // Temporary table used to store query results.
  private String temporaryTableId = null;

  private BigQueryTableRowIterator(
      @Nullable TableReference ref, @Nullable String query, @Nullable String projectId,
      Bigquery client, boolean flattenResults, boolean useLegacySql) {
    this.ref = ref;
    this.query = query;
    this.projectId = projectId;
    this.client = checkNotNull(client, "client");
    this.flattenResults = flattenResults;
    this.useLegacySql = useLegacySql;
  }

  /**
   * Constructs a {@code BigQueryTableRowIterator} that reads from the specified table.
   */
  public static BigQueryTableRowIterator fromTable(TableReference ref, Bigquery client) {
    checkNotNull(ref, "ref");
    checkNotNull(client, "client");
    return new BigQueryTableRowIterator(ref, null, ref.getProjectId(), client, true, true);
  }

  /**
   * Constructs a {@code BigQueryTableRowIterator} that reads from the results of executing the
   * specified query in the specified project, with {@code useLegacySql} set to {@code true}.
   *
   * @deprecated use {@link #fromQuery(String, String, Bigquery, Boolean, Boolean)}.
   */
  @Deprecated
  public static BigQueryTableRowIterator fromQuery(
      String query, String projectId, Bigquery client, @Nullable Boolean flattenResults) {
    return fromQuery(query, projectId, client, flattenResults, null /* useLegacySql */);
  }

  /**
   * Constructs a {@code BigQueryTableRowIterator} that reads from the results of executing the
   * specified query in the specified project.
   */
  public static BigQueryTableRowIterator fromQuery(
      String query, String projectId, Bigquery client, @Nullable Boolean flattenResults,
      @Nullable Boolean useLegacySql) {
    checkNotNull(query, "query");
    checkNotNull(projectId, "projectId");
    checkNotNull(client, "client");
    return new BigQueryTableRowIterator(null, query, projectId, client,
        MoreObjects.firstNonNull(flattenResults, Boolean.TRUE),
        MoreObjects.firstNonNull(useLegacySql, Boolean.TRUE));
  }
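
  // Example, as a rough sketch ("client" stands for an authorized Bigquery instance and
  // "my-project-id" for a real project ID):
  //
  //   BigQueryTableRowIterator iterator = BigQueryTableRowIterator.fromQuery(
  //       "SELECT word, word_count FROM [publicdata:samples.shakespeare]",
  //       "my-project-id", client, null /* flattenResults */, null /* useLegacySql */);
  //
  // Passing null for either Boolean selects the default (true).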

  /**
   * Opens the table for reading. If this iterator was constructed from a query, the query is
   * executed first and its results are read from the temporary table that holds them.
   * @throws IOException on failure
   */
  public void open() throws IOException, InterruptedException {
    if (query != null) {
      ref = executeQueryAndWaitForCompletion();
    }
    // Get table schema.
    schema = getTable(ref).getSchema();
  }

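  /**
   * Advances to the next row, fetching the next page of results from the service when the
   * current page is exhausted.
   *
   * @return {@code true} if a new row is available via {@link #getCurrent}, or {@code false}
   *     once the final page has been consumed
   */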
  public boolean advance() throws IOException, InterruptedException {
    while (true) {
      if (iteratorOverCurrentBatch != null && iteratorOverCurrentBatch.hasNext()) {
        // Embed schema information into the raw row, so that values have an
        // associated key.  This matches how rows are read when using the
        // DataflowPipelineRunner.
        current = getTypedTableRow(schema.getFields(), iteratorOverCurrentBatch.next());
        return true;
      }
      if (lastPage) {
        return false;
      }

      Bigquery.Tabledata.List list =
          client.tabledata().list(ref.getProjectId(), ref.getDatasetId(), ref.getTableId());
      if (pageToken != null) {
        list.setPageToken(pageToken);
      }

      TableDataList result = executeWithBackOff(
          list,
          String.format(
              "Error reading from BigQuery table %s of dataset %s.",
              ref.getTableId(), ref.getDatasetId()));

      pageToken = result.getPageToken();
      iteratorOverCurrentBatch =
          result.getRows() != null
              ? result.getRows().iterator()
              : Collections.<TableRow>emptyIterator();

      // The server may return a page token indefinitely on a zero-length table.
      if (pageToken == null || result.getTotalRows() != null && result.getTotalRows() == 0) {
        lastPage = true;
      }
    }
  }

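  /**
   * Returns the row produced by the most recent successful call to {@link #advance}.
   *
   * @throws NoSuchElementException if {@link #advance} has never returned {@code true}
   */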
  public TableRow getCurrent() {
    if (current == null) {
      throw new NoSuchElementException();
    }
    return current;
  }

  /**
   * Adjusts a field returned from the BigQuery API to match what we will receive when running
   * BigQuery's export-to-GCS and parallel read, which is the efficient parallel implementation
   * used for batch jobs executed on the Cloud Dataflow service.
   *
   * <p>The following is the relationship between BigQuery schema and Java types:
   *
   * <ul>
   *   <li>Nulls are {@code null}.
   *   <li>Repeated fields are {@code List} of objects.
   *   <li>Record columns are {@link TableRow} objects.
   *   <li>{@code BOOLEAN} columns are JSON booleans, hence Java {@code Boolean} objects.
   *   <li>{@code FLOAT} columns are JSON floats, hence Java {@code Double} objects.
   *   <li>{@code TIMESTAMP} columns are {@code String} objects that are of the format
   *       {@code yyyy-MM-dd HH:mm:ss[.SSSSSS] UTC}, where the {@code .SSSSSS} has no trailing
   *       zeros and can be 1 to 6 digits long.
   *   <li>Every other atomic type is a {@code String}.
   * </ul>
   *
   * <p>Note that integers are encoded as strings to match BigQuery's exported JSON format.
   *
   * <p>Finally, values are stored in the {@link TableRow} as {"field name": value} pairs
   * and are not accessible through the {@link TableRow#getF} function.
   */
  @Nullable
  private Object getTypedCellValue(TableFieldSchema fieldSchema, Object v) {
    if (Data.isNull(v)) {
      return null;
    }

    if (Objects.equals(fieldSchema.getMode(), "REPEATED")) {
      TableFieldSchema elementSchema = fieldSchema.clone().setMode("REQUIRED");
      @SuppressWarnings("unchecked")
      List<Map<String, Object>> rawCells = (List<Map<String, Object>>) v;
      ImmutableList.Builder<Object> values = ImmutableList.builder();
      for (Map<String, Object> element : rawCells) {
        values.add(getTypedCellValue(elementSchema, element.get("v")));
      }
      return values.build();
    }

    if (fieldSchema.getType().equals("RECORD")) {
      @SuppressWarnings("unchecked")
      Map<String, Object> typedV = (Map<String, Object>) v;
      return getTypedTableRow(fieldSchema.getFields(), typedV);
    }

    if (fieldSchema.getType().equals("FLOAT")) {
      return Double.parseDouble((String) v);
    }

    if (fieldSchema.getType().equals("BOOLEAN")) {
      return Boolean.parseBoolean((String) v);
    }

    if (fieldSchema.getType().equals("TIMESTAMP")) {
      return AvroUtils.formatTimestamp((String) v);
    }

    // Returns the original value for:
    // 1. String, 2. base64 encoded BYTES, 3. DATE, DATETIME, TIME strings.
    return v;
  }

  /**
   * A list of the field names that cannot be used in BigQuery tables processed by Dataflow,
   * because they are reserved keywords in {@link TableRow}.
   */
  // TODO: This limitation is unfortunate. We need to give users a way to use BigQueryIO that does
  // not indirect through our broken use of {@link TableRow}.
  //     See discussion: https://github.com/GoogleCloudPlatform/DataflowJavaSDK/pull/41
  private static final Collection<String> RESERVED_FIELD_NAMES =
      ClassInfo.of(TableRow.class).getNames();

  /**
   * Converts a row returned from the BigQuery JSON API as a {@code Map<String, Object>} into a
   * Java {@link TableRow} with nested {@link TableCell TableCells}. The {@code Object} values in
   * the cells are converted to Java types according to the provided field schemas.
   *
   * <p>See {@link #getTypedCellValue(TableFieldSchema, Object)} for details on how BigQuery
   * types are mapped to Java types.
   */
  private TableRow getTypedTableRow(List<TableFieldSchema> fields, Map<String, Object> rawRow) {
    // If rawRow is a TableRow, use it. If not, create a new one.
    TableRow row;
    List<? extends Map<String, Object>> cells;
    if (rawRow instanceof TableRow) {
      // Since rawRow is a TableRow it already has TableCell objects in setF. We do not need to do
      // any type conversion, but extract the cells for cell-wise processing below.
      row = (TableRow) rawRow;
      cells = row.getF();
      // Clear the cells from the row, so that row.getF() will return null. This matches the
      // behavior of rows produced by the BigQuery export API used on the service.
      row.setF(null);
    } else {
      row = new TableRow();

      // Since rawRow is a Map<String, Object> we use Map.get("f") instead of TableRow.getF() to
      // get its cells. Similarly, when rawCell is a Map<String, Object> instead of a TableCell,
      // we will use Map.get("v") instead of TableCell.getV() to get its value.
      @SuppressWarnings("unchecked")
      List<Map<String, Object>> rawCells = (List<Map<String, Object>>) rawRow.get("f");
      cells = rawCells;
    }

    checkState(cells.size() == fields.size(),
        "Expected that the row has the same number of cells %s as fields in the schema %s",
        cells.size(), fields.size());

    // Loop through all the fields in the row, normalizing their types with the TableFieldSchema
    // and storing the normalized values by field name in the Map<String, Object> that
    // underlies the TableRow.
    Iterator<? extends Map<String, Object>> cellIt = cells.iterator();
    Iterator<TableFieldSchema> fieldIt = fields.iterator();
    while (cellIt.hasNext()) {
      Map<String, Object> cell = cellIt.next();
      TableFieldSchema fieldSchema = fieldIt.next();

      // Convert the object in this cell to the Java type corresponding to its type in the schema.
      Object convertedValue = getTypedCellValue(fieldSchema, cell.get("v"));

      String fieldName = fieldSchema.getName();
      checkArgument(!RESERVED_FIELD_NAMES.contains(fieldName),
          "BigQueryIO does not support records with columns named %s", fieldName);

      if (convertedValue == null) {
        // BigQuery does not include null values when the export operation (to JSON) is used.
        // To match that behavior, BigQueryTableRowIterator and the DirectPipelineRunner
        // intentionally omit columns with null values.
        continue;
      }

      row.set(fieldName, convertedValue);
    }
    return row;
  }

  // Get the BigQuery table.
  private Table getTable(TableReference ref) throws IOException, InterruptedException {
    Bigquery.Tables.Get get =
        client.tables().get(ref.getProjectId(), ref.getDatasetId(), ref.getTableId());

    return executeWithBackOff(
        get,
        String.format(
            "Error opening BigQuery table %s of dataset %s.",
            ref.getTableId(), ref.getDatasetId()));
  }

  // Create a new BigQuery dataset.
  private void createDataset(String datasetId, @Nullable String location)
      throws IOException, InterruptedException {
    Dataset dataset = new Dataset();
    DatasetReference reference = new DatasetReference();
    reference.setProjectId(projectId);
    reference.setDatasetId(datasetId);
    dataset.setDatasetReference(reference);
    if (location != null) {
      dataset.setLocation(location);
    }

    executeWithBackOff(
        client.datasets().insert(projectId, dataset),
        String.format(
            "Error when trying to create the temporary dataset %s in project %s.",
            datasetId, projectId));
  }

  // Delete the given table that is available in the given dataset.
  private void deleteTable(String datasetId, String tableId)
      throws IOException, InterruptedException {
    executeWithBackOff(
        client.tables().delete(projectId, datasetId, tableId),
        String.format(
            "Error when trying to delete the temporary table %s in dataset %s of project %s. "
                + "Manual deletion may be required.",
            tableId, datasetId, projectId));
  }

  // Delete the given dataset. This will fail if the given dataset has any tables.
  private void deleteDataset(String datasetId) throws IOException, InterruptedException {
    executeWithBackOff(
        client.datasets().delete(projectId, datasetId),
        String.format(
            "Error when trying to delete the temporary dataset %s in project %s. "
                + "Manual deletion may be required.",
            datasetId, projectId));
  }

  /**
   * Executes the specified query and returns a reference to the temporary BigQuery table created
   * to hold the results.
   *
   * @throws IOException if the query fails.
   */
  private TableReference executeQueryAndWaitForCompletion()
      throws IOException, InterruptedException {
    // Dry run the query to get the location of the tables it reads from.
    Job dryRunJob = new Job()
        .setConfiguration(new JobConfiguration()
            .setQuery(new JobConfigurationQuery()
                .setQuery(query))
            .setDryRun(true));
    JobStatistics jobStats = executeWithBackOff(
        client.jobs().insert(projectId, dryRunJob),
        String.format("Error when trying to dry run query %s.", query)).getStatistics();

    // Let BigQuery pick a default location if the query does not read any tables.
    String location = null;
    @Nullable List<TableReference> tables = jobStats.getQuery().getReferencedTables();
    if (tables != null && !tables.isEmpty()) {
      Table table = getTable(tables.get(0));
      location = table.getLocation();
    }

    // Create a temporary dataset to store results.
    // Starting the dataset name with an "_" so that it is hidden.
    Random rnd = new Random(System.currentTimeMillis());
    temporaryDatasetId = "_dataflow_temporary_dataset_" + rnd.nextInt(1000000);
    temporaryTableId = "dataflow_temporary_table_" + rnd.nextInt(1000000);

    createDataset(temporaryDatasetId, location);
    Job job = new Job();
    JobConfiguration config = new JobConfiguration();
    JobConfigurationQuery queryConfig = new JobConfigurationQuery();
    config.setQuery(queryConfig);
    job.setConfiguration(config);
    queryConfig.setQuery(query);
    queryConfig.setAllowLargeResults(true);
    queryConfig.setFlattenResults(flattenResults);
    queryConfig.setUseLegacySql(useLegacySql);

    TableReference destinationTable = new TableReference();
    destinationTable.setProjectId(projectId);
    destinationTable.setDatasetId(temporaryDatasetId);
    destinationTable.setTableId(temporaryTableId);
    queryConfig.setDestinationTable(destinationTable);

    Job queryJob = executeWithBackOff(
        client.jobs().insert(projectId, job),
        String.format("Error when trying to execute the job for query %s.", query));
    JobReference jobId = queryJob.getJobReference();

    while (true) {
      Job pollJob = executeWithBackOff(
          client.jobs().get(projectId, jobId.getJobId()),
          String.format("Error when trying to get status of the job for query %s.", query));
      JobStatus status = pollJob.getStatus();
      if (status.getState().equals("DONE")) {
        // The job is DONE, but did not necessarily succeed.
        ErrorProto error = status.getErrorResult();
        if (error == null) {
          return pollJob.getConfiguration().getQuery().getDestinationTable();
        } else {
          // There will be no temporary table to delete, so null out the reference.
          temporaryTableId = null;
          throw new IOException("Executing query " + query + " failed: " + error.getMessage());
        }
      }
      Uninterruptibles.sleepUninterruptibly(
          QUERY_COMPLETION_POLL_TIME.getMillis(), TimeUnit.MILLISECONDS);
    }
  }

  /**
   * Executes a BigQuery request with exponential backoff and returns the result.
   *
   * @deprecated use {@link #executeWithBackOff(AbstractGoogleClientRequest, String)}.
   */
  @Deprecated
  public static <T> T executeWithBackOff(AbstractGoogleClientRequest<T> client, String error,
      Object... errorArgs) throws IOException, InterruptedException {
    return executeWithBackOff(client, String.format(error, errorArgs));
  }

  // Executes a BigQuery request with exponential backoff and returns the result.
  // client - the BigQuery request to be executed
  // error - message to log if a request fails
  public static <T> T executeWithBackOff(AbstractGoogleClientRequest<T> client, String error)
      throws IOException, InterruptedException {
    Sleeper sleeper = Sleeper.DEFAULT;
    BackOff backOff =
        FluentBackoff.DEFAULT
            .withMaxRetries(MAX_RETRIES).withInitialBackoff(INITIAL_BACKOFF_TIME).backoff();

    T result = null;
    while (true) {
      try {
        result = client.execute();
        break;
      } catch (IOException e) {
        LOG.error("{}", error, e);
        if (!BackOffUtils.next(sleeper, backOff)) {
          String errorMessage = String.format(
              "%s Failing to execute job after %d attempts.", error, MAX_RETRIES + 1);
          LOG.error("{}", errorMessage, e);
          throw new IOException(errorMessage, e);
        }
      }
    }
    return result;
  }

  @Override
  public void close() {
    // Prevent any further requests.
    lastPage = true;

    try {
      // Delete the temporary table and dataset that were created to hold query results.
      if (temporaryDatasetId != null) {
        if (temporaryTableId != null) {
          deleteTable(temporaryDatasetId, temporaryTableId);
        }
        deleteDataset(temporaryDatasetId);
      }
    } catch (IOException | InterruptedException e) {
      if (e instanceof InterruptedException) {
        Thread.currentThread().interrupt();
      }
      throw new RuntimeException(e);
    }
  }
}