com.google.cloud.dataflow.sdk.util.BigQueryTableRowIterator Maven / Gradle / Ivy

The Google Cloud Dataflow Java SDK provides a simple, Java-based interface for processing data of virtually any size using Google Cloud resources. This artifact includes the entire Dataflow Java SDK.

/*
 * Copyright (C) 2015 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package com.google.cloud.dataflow.sdk.util;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Preconditions.checkState;

import com.google.api.client.googleapis.services.AbstractGoogleClientRequest;
import com.google.api.client.util.BackOff;
import com.google.api.client.util.BackOffUtils;
import com.google.api.client.util.ClassInfo;
import com.google.api.client.util.Data;
import com.google.api.client.util.Sleeper;
import com.google.api.services.bigquery.Bigquery;
import com.google.api.services.bigquery.model.Dataset;
import com.google.api.services.bigquery.model.DatasetReference;
import com.google.api.services.bigquery.model.ErrorProto;
import com.google.api.services.bigquery.model.Job;
import com.google.api.services.bigquery.model.JobConfiguration;
import com.google.api.services.bigquery.model.JobConfigurationQuery;
import com.google.api.services.bigquery.model.JobReference;
import com.google.api.services.bigquery.model.JobStatistics;
import com.google.api.services.bigquery.model.JobStatus;
import com.google.api.services.bigquery.model.Table;
import com.google.api.services.bigquery.model.TableCell;
import com.google.api.services.bigquery.model.TableDataList;
import com.google.api.services.bigquery.model.TableFieldSchema;
import com.google.api.services.bigquery.model.TableReference;
import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
import com.google.common.base.MoreObjects;
import com.google.common.collect.ImmutableList;
import com.google.common.util.concurrent.Uninterruptibles;

import org.joda.time.Duration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Objects;
import java.util.Random;
import java.util.concurrent.TimeUnit;

import javax.annotation.Nullable;

/**
 * Iterates over all rows in a table.
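 *
 * <p>A typical read loop, as a minimal sketch ({@code bigquery} and {@code tableRef} are
 * placeholders for an authorized {@link Bigquery} client and an existing table):
 *
 * <pre>{@code
 * try (BigQueryTableRowIterator iterator =
 *     BigQueryTableRowIterator.fromTable(tableRef, bigquery)) {
 *   iterator.open();
 *   while (iterator.advance()) {
 *     TableRow row = iterator.getCurrent();
 *     // ... process row ...
 *   }
 * }
 * }</pre>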
 */
public class BigQueryTableRowIterator implements AutoCloseable {
  private static final Logger LOG = LoggerFactory.getLogger(BigQueryTableRowIterator.class);

  @Nullable private TableReference ref;
  @Nullable private final String projectId;
  @Nullable private TableSchema schema;
  private final Bigquery client;
  private String pageToken;
  private Iterator<TableRow> iteratorOverCurrentBatch;
  private TableRow current;
  // Set true when the final page is seen from the service.
  private boolean lastPage = false;

  // The maximum number of times a BigQuery request will be retried
  private static final int MAX_RETRIES = 3;
  // Initial wait time for the backoff implementation
  private static final Duration INITIAL_BACKOFF_TIME = Duration.standardSeconds(1);
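  // Taken together, these constants mean each request is attempted at most MAX_RETRIES + 1
  // times, with delays between attempts that start at INITIAL_BACKOFF_TIME and grow roughly
  // exponentially (see executeWithBackOff below).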

  // After submitting a query to the BigQuery service, we poll the service at this interval to
  // check the status of the query's execution job.
  private static final Duration QUERY_COMPLETION_POLL_TIME = Duration.standardSeconds(1);

  private final String query;
  // Whether to flatten query results.
  private final boolean flattenResults;
  // Whether to use the BigQuery legacy SQL dialect.
  private final boolean useLegacySql;
  // Temporary dataset used to store query results.
  private String temporaryDatasetId = null;
  // Temporary table used to store query results.
  private String temporaryTableId = null;

  private BigQueryTableRowIterator(
      @Nullable TableReference ref, @Nullable String query, @Nullable String projectId,
      Bigquery client, boolean flattenResults, boolean useLegacySql) {
    this.ref = ref;
    this.query = query;
    this.projectId = projectId;
    this.client = checkNotNull(client, "client");
    this.flattenResults = flattenResults;
    this.useLegacySql = useLegacySql;
  }

  /**
   * Constructs a {@code BigQueryTableRowIterator} that reads from the specified table.
   */
  public static BigQueryTableRowIterator fromTable(TableReference ref, Bigquery client) {
    checkNotNull(ref, "ref");
    checkNotNull(client, "client");
    return new BigQueryTableRowIterator(ref, null, ref.getProjectId(), client, true, true);
  }

  /**
   * Constructs a {@code BigQueryTableRowIterator} that reads from the results of executing the
   * specified query in the specified project, with {@code useLegacySql} set to {@code true}.
   *
   * @deprecated use {@link #fromQuery(String, String, Bigquery, Boolean, Boolean)}.
   */
  @Deprecated
  public static BigQueryTableRowIterator fromQuery(
      String query, String projectId, Bigquery client, @Nullable Boolean flattenResults) {
    return fromQuery(query, projectId, client, flattenResults, null /* useLegacySql */);
  }

  /**
   * Constructs a {@code BigQueryTableRowIterator} that reads from the results of executing the
   * specified query in the specified project.
   */
  public static BigQueryTableRowIterator fromQuery(
      String query, String projectId, Bigquery client, @Nullable Boolean flattenResults,
      @Nullable Boolean useLegacySql) {
    checkNotNull(query, "query");
    checkNotNull(projectId, "projectId");
    checkNotNull(client, "client");
    return new BigQueryTableRowIterator(null, query, projectId, client,
        MoreObjects.firstNonNull(flattenResults, Boolean.TRUE),
        MoreObjects.firstNonNull(useLegacySql, Boolean.TRUE));
  }
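
  // Example, as a rough sketch ("client" stands for an authorized Bigquery instance and
  // "my-project-id" for a real project ID):
  //
  //   BigQueryTableRowIterator iterator = BigQueryTableRowIterator.fromQuery(
  //       "SELECT word, word_count FROM [publicdata:samples.shakespeare]",
  //       "my-project-id", client, null /* flattenResults */, null /* useLegacySql */);
  //
  // Passing null for either Boolean selects the default (true).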

  /**
   * Opens the table for reading. If this iterator was constructed from a query, the query is
   * executed first and its results are read from the temporary table that holds them.
   * @throws IOException on failure
   */
  public void open() throws IOException, InterruptedException {
    if (query != null) {
      ref = executeQueryAndWaitForCompletion();
    }
    // Get table schema.
    schema = getTable(ref).getSchema();
  }

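  /**
   * Advances to the next row, fetching the next page of results from the service when the
   * current page is exhausted.
   *
   * @return {@code true} if a new row is available via {@link #getCurrent}, or {@code false}
   *     once the final page has been consumed
   */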
  public boolean advance() throws IOException, InterruptedException {
    while (true) {
      if (iteratorOverCurrentBatch != null && iteratorOverCurrentBatch.hasNext()) {
        // Embed schema information into the raw row, so that values have an
        // associated key.  This matches how rows are read when using the
        // DataflowPipelineRunner.
        current = getTypedTableRow(schema.getFields(), iteratorOverCurrentBatch.next());
        return true;
      }
      if (lastPage) {
        return false;
      }

      Bigquery.Tabledata.List list =
          client.tabledata().list(ref.getProjectId(), ref.getDatasetId(), ref.getTableId());
      if (pageToken != null) {
        list.setPageToken(pageToken);
      }

      TableDataList result = executeWithBackOff(
          list,
          String.format(
              "Error reading from BigQuery table %s of dataset %s.",
              ref.getTableId(), ref.getDatasetId()));

      pageToken = result.getPageToken();
      iteratorOverCurrentBatch =
          result.getRows() != null
              ? result.getRows().iterator()
              : Collections.<TableRow>emptyIterator();

      // The server may return a page token indefinitely on a zero-length table.
      if (pageToken == null || result.getTotalRows() != null && result.getTotalRows() == 0) {
        lastPage = true;
      }
    }
  }

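  /**
   * Returns the row produced by the most recent successful call to {@link #advance}.
   *
   * @throws NoSuchElementException if {@link #advance} has never returned {@code true}
   */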
  public TableRow getCurrent() {
    if (current == null) {
      throw new NoSuchElementException();
    }
    return current;
  }

  /**
   * Adjusts a field returned from the BigQuery API to match what we will receive when running
   * BigQuery's export-to-GCS and parallel read, which is the efficient parallel implementation
   * used for batch jobs executed on the Cloud Dataflow service.
   *
   * <p>The following is the relationship between BigQuery schema and Java types:
   *
   * <ul>
   *   <li>Nulls are {@code null}.
   *   <li>Repeated fields are {@code List} of objects.
   *   <li>Record columns are {@link TableRow} objects.
   *   <li>{@code BOOLEAN} columns are JSON booleans, hence Java {@code Boolean} objects.
   *   <li>{@code FLOAT} columns are JSON floats, hence Java {@code Double} objects.
   *   <li>{@code TIMESTAMP} columns are {@code String} objects that are of the format
   *       {@code yyyy-MM-dd HH:mm:ss[.SSSSSS] UTC}, where the {@code .SSSSSS} has no trailing
   *       zeros and can be 1 to 6 digits long.
   *   <li>Every other atomic type is a {@code String}.
   * </ul>
   *
   * <p>Note that integers are encoded as strings to match BigQuery's exported JSON format.
   *
   * <p>Finally, values are stored in the {@link TableRow} as {"field name": value} pairs
   * and are not accessible through the {@link TableRow#getF} function.
   */
  @Nullable
  private Object getTypedCellValue(TableFieldSchema fieldSchema, Object v) {
    if (Data.isNull(v)) {
      return null;
    }

    if (Objects.equals(fieldSchema.getMode(), "REPEATED")) {
      TableFieldSchema elementSchema = fieldSchema.clone().setMode("REQUIRED");
      @SuppressWarnings("unchecked")
      List<Map<String, Object>> rawCells = (List<Map<String, Object>>) v;
      ImmutableList.Builder<Object> values = ImmutableList.builder();
      for (Map<String, Object> element : rawCells) {
        values.add(getTypedCellValue(elementSchema, element.get("v")));
      }
      return values.build();
    }

    if (fieldSchema.getType().equals("RECORD")) {
      @SuppressWarnings("unchecked")
      Map<String, Object> typedV = (Map<String, Object>) v;
      return getTypedTableRow(fieldSchema.getFields(), typedV);
    }

    if (fieldSchema.getType().equals("FLOAT")) {
      return Double.parseDouble((String) v);
    }

    if (fieldSchema.getType().equals("BOOLEAN")) {
      return Boolean.parseBoolean((String) v);
    }

    if (fieldSchema.getType().equals("TIMESTAMP")) {
      return AvroUtils.formatTimestamp((String) v);
    }

    // Returns the original value for:
    // 1. String, 2. base64 encoded BYTES, 3. DATE, DATETIME, TIME strings.
    return v;
  }

  /**
   * A list of the field names that cannot be used in BigQuery tables processed by Dataflow,
   * because they are reserved keywords in {@link TableRow}.
   */
  // TODO: This limitation is unfortunate. We need to give users a way to use BigQueryIO that does
  // not indirect through our broken use of {@link TableRow}.
  //     See discussion: https://github.com/GoogleCloudPlatform/DataflowJavaSDK/pull/41
  private static final Collection<String> RESERVED_FIELD_NAMES =
      ClassInfo.of(TableRow.class).getNames();

  /**
   * Converts a row returned from the BigQuery JSON API as a {@code Map<String, Object>} into a
   * Java {@link TableRow} with nested {@link TableCell TableCells}. The {@code Object} values in
   * the cells are converted to Java types according to the provided field schemas.
   *
   * <p>See {@link #getTypedCellValue(TableFieldSchema, Object)} for details on how BigQuery
   * types are mapped to Java types.
   */
  private TableRow getTypedTableRow(List<TableFieldSchema> fields, Map<String, Object> rawRow) {
    // If rawRow is a TableRow, use it. If not, create a new one.
    TableRow row;
    List<? extends Map<String, Object>> cells;
    if (rawRow instanceof TableRow) {
      // Since rawRow is a TableRow it already has TableCell objects in setF. We do not need to do
      // any type conversion, but extract the cells for cell-wise processing below.
      row = (TableRow) rawRow;
      cells = row.getF();
      // Clear the cells from the row, so that row.getF() will return null. This matches the
      // behavior of rows produced by the BigQuery export API used on the service.
      row.setF(null);
    } else {
      row = new TableRow();

      // Since rawRow is a Map<String, Object> we use Map.get("f") instead of TableRow.getF() to
      // get its cells. Similarly, when rawCell is a Map<String, Object> instead of a TableCell,
      // we will use Map.get("v") instead of TableCell.getV() to get its value.
      @SuppressWarnings("unchecked")
      List<Map<String, Object>> rawCells = (List<Map<String, Object>>) rawRow.get("f");
      cells = rawCells;
    }

    checkState(cells.size() == fields.size(),
        "Expected that the row has the same number of cells %s as fields in the schema %s",
        cells.size(), fields.size());

    // Loop through all the fields in the row, normalizing their types with the TableFieldSchema
    // and storing the normalized values by field name in the Map<String, Object> that
    // underlies the TableRow.
    Iterator<? extends Map<String, Object>> cellIt = cells.iterator();
    Iterator<TableFieldSchema> fieldIt = fields.iterator();
    while (cellIt.hasNext()) {
      Map<String, Object> cell = cellIt.next();
      TableFieldSchema fieldSchema = fieldIt.next();

      // Convert the object in this cell to the Java type corresponding to its type in the schema.
      Object convertedValue = getTypedCellValue(fieldSchema, cell.get("v"));

      String fieldName = fieldSchema.getName();
      checkArgument(!RESERVED_FIELD_NAMES.contains(fieldName),
          "BigQueryIO does not support records with columns named %s", fieldName);

      if (convertedValue == null) {
        // BigQuery does not include null values when the export operation (to JSON) is used.
        // To match that behavior, BigQueryTableRowIterator and the DirectPipelineRunner
        // intentionally omit columns with null values.
        continue;
      }

      row.set(fieldName, convertedValue);
    }
    return row;
  }

  // Get the BigQuery table.
  private Table getTable(TableReference ref) throws IOException, InterruptedException {
    Bigquery.Tables.Get get =
        client.tables().get(ref.getProjectId(), ref.getDatasetId(), ref.getTableId());

    return executeWithBackOff(
        get,
        String.format(
            "Error opening BigQuery table %s of dataset %s.",
            ref.getTableId(), ref.getDatasetId()));
  }

  // Create a new BigQuery dataset.
  private void createDataset(String datasetId, @Nullable String location)
      throws IOException, InterruptedException {
    Dataset dataset = new Dataset();
    DatasetReference reference = new DatasetReference();
    reference.setProjectId(projectId);
    reference.setDatasetId(datasetId);
    dataset.setDatasetReference(reference);
    if (location != null) {
      dataset.setLocation(location);
    }

    executeWithBackOff(
        client.datasets().insert(projectId, dataset),
        String.format(
            "Error when trying to create the temporary dataset %s in project %s.",
            datasetId, projectId));
  }

  // Delete the given table that is available in the given dataset.
  private void deleteTable(String datasetId, String tableId)
      throws IOException, InterruptedException {
    executeWithBackOff(
        client.tables().delete(projectId, datasetId, tableId),
        String.format(
            "Error when trying to delete the temporary table %s in dataset %s of project %s. "
                + "Manual deletion may be required.",
            tableId, datasetId, projectId));
  }

  // Delete the given dataset. This will fail if the given dataset has any tables.
  private void deleteDataset(String datasetId) throws IOException, InterruptedException {
    executeWithBackOff(
        client.datasets().delete(projectId, datasetId),
        String.format(
            "Error when trying to delete the temporary dataset %s in project %s. "
                + "Manual deletion may be required.",
            datasetId, projectId));
  }

  /**
   * Executes the specified query and returns a reference to the temporary BigQuery table created
   * to hold the results.
   *
   * @throws IOException if the query fails.
   */
  private TableReference executeQueryAndWaitForCompletion()
      throws IOException, InterruptedException {
    // Dry run the query to get the location of the tables it reads from.
    Job dryRunJob = new Job()
        .setConfiguration(new JobConfiguration()
            .setQuery(new JobConfigurationQuery()
                .setQuery(query))
            .setDryRun(true));
    JobStatistics jobStats = executeWithBackOff(
        client.jobs().insert(projectId, dryRunJob),
        String.format("Error when trying to dry run query %s.", query)).getStatistics();

    // Let BigQuery pick a default location if the query does not read any tables.
    String location = null;
    @Nullable List<TableReference> tables = jobStats.getQuery().getReferencedTables();
    if (tables != null && !tables.isEmpty()) {
      Table table = getTable(tables.get(0));
      location = table.getLocation();
    }

    // Create a temporary dataset to store results.
    // Starting the dataset name with an "_" so that it is hidden.
    Random rnd = new Random(System.currentTimeMillis());
    temporaryDatasetId = "_dataflow_temporary_dataset_" + rnd.nextInt(1000000);
    temporaryTableId = "dataflow_temporary_table_" + rnd.nextInt(1000000);

    createDataset(temporaryDatasetId, location);
    Job job = new Job();
    JobConfiguration config = new JobConfiguration();
    JobConfigurationQuery queryConfig = new JobConfigurationQuery();
    config.setQuery(queryConfig);
    job.setConfiguration(config);
    queryConfig.setQuery(query);
    queryConfig.setAllowLargeResults(true);
    queryConfig.setFlattenResults(flattenResults);
    queryConfig.setUseLegacySql(useLegacySql);

    TableReference destinationTable = new TableReference();
    destinationTable.setProjectId(projectId);
    destinationTable.setDatasetId(temporaryDatasetId);
    destinationTable.setTableId(temporaryTableId);
    queryConfig.setDestinationTable(destinationTable);

    Job queryJob = executeWithBackOff(
        client.jobs().insert(projectId, job),
        String.format("Error when trying to execute the job for query %s.", query));
    JobReference jobId = queryJob.getJobReference();

    while (true) {
      Job pollJob = executeWithBackOff(
          client.jobs().get(projectId, jobId.getJobId()),
          String.format("Error when trying to get status of the job for query %s.", query));
      JobStatus status = pollJob.getStatus();
      if (status.getState().equals("DONE")) {
        // The job is DONE, but did not necessarily succeed.
        ErrorProto error = status.getErrorResult();
        if (error == null) {
          return pollJob.getConfiguration().getQuery().getDestinationTable();
        } else {
          // There will be no temporary table to delete, so null out the reference.
          temporaryTableId = null;
          throw new IOException("Executing query " + query + " failed: " + error.getMessage());
        }
      }
      Uninterruptibles.sleepUninterruptibly(
          QUERY_COMPLETION_POLL_TIME.getMillis(), TimeUnit.MILLISECONDS);
    }
  }

  /**
   * Executes a BigQuery request with exponential backoff and returns the result.
   *
   * @deprecated use {@link #executeWithBackOff(AbstractGoogleClientRequest, String)}.
   */
  @Deprecated
  public static <T> T executeWithBackOff(AbstractGoogleClientRequest<T> client, String error,
      Object... errorArgs) throws IOException, InterruptedException {
    return executeWithBackOff(client, String.format(error, errorArgs));
  }

  // Executes a BigQuery request with exponential backoff and returns the result.
  // client - the BigQuery request to be executed
  // error - message to log if a request fails
  public static <T> T executeWithBackOff(AbstractGoogleClientRequest<T> client, String error)
      throws IOException, InterruptedException {
    Sleeper sleeper = Sleeper.DEFAULT;
    BackOff backOff =
        FluentBackoff.DEFAULT
            .withMaxRetries(MAX_RETRIES).withInitialBackoff(INITIAL_BACKOFF_TIME).backoff();

    T result = null;
    while (true) {
      try {
        result = client.execute();
        break;
      } catch (IOException e) {
        LOG.error("{}", error, e);
        if (!BackOffUtils.next(sleeper, backOff)) {
          String errorMessage = String.format(
              "%s Failing to execute job after %d attempts.", error, MAX_RETRIES + 1);
          LOG.error("{}", errorMessage, e);
          throw new IOException(errorMessage, e);
        }
      }
    }
    return result;
  }

  @Override
  public void close() {
    // Prevent any further requests.
    lastPage = true;

    try {
      // Delete the temporary table and dataset that were created to hold query results.
      if (temporaryDatasetId != null) {
        if (temporaryTableId != null) {
          deleteTable(temporaryDatasetId, temporaryTableId);
        }
        deleteDataset(temporaryDatasetId);
      }
    } catch (IOException | InterruptedException e) {
      if (e instanceof InterruptedException) {
        Thread.currentThread().interrupt();
      }
      throw new RuntimeException(e);
    }
  }
}