com.google.cloud.dataflow.sdk.util.BigQueryTableRowIterator Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of google-cloud-dataflow-java-sdk-all Show documentation
Google Cloud Dataflow Java SDK provides a simple, Java-based
interface for processing virtually any size data using Google cloud
resources. This artifact includes entire Dataflow Java SDK.
/*
* Copyright (C) 2015 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package com.google.cloud.dataflow.sdk.util;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Preconditions.checkState;
import com.google.api.client.googleapis.services.AbstractGoogleClientRequest;
import com.google.api.client.util.BackOff;
import com.google.api.client.util.BackOffUtils;
import com.google.api.client.util.ClassInfo;
import com.google.api.client.util.Data;
import com.google.api.client.util.Sleeper;
import com.google.api.services.bigquery.Bigquery;
import com.google.api.services.bigquery.model.Dataset;
import com.google.api.services.bigquery.model.DatasetReference;
import com.google.api.services.bigquery.model.ErrorProto;
import com.google.api.services.bigquery.model.Job;
import com.google.api.services.bigquery.model.JobConfiguration;
import com.google.api.services.bigquery.model.JobConfigurationQuery;
import com.google.api.services.bigquery.model.JobReference;
import com.google.api.services.bigquery.model.JobStatistics;
import com.google.api.services.bigquery.model.JobStatus;
import com.google.api.services.bigquery.model.Table;
import com.google.api.services.bigquery.model.TableCell;
import com.google.api.services.bigquery.model.TableDataList;
import com.google.api.services.bigquery.model.TableFieldSchema;
import com.google.api.services.bigquery.model.TableReference;
import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
import com.google.common.base.MoreObjects;
import com.google.common.collect.ImmutableList;
import com.google.common.util.concurrent.Uninterruptibles;
import org.joda.time.Duration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Objects;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import javax.annotation.Nullable;
/**
* Iterates over all rows in a table.
*/
public class BigQueryTableRowIterator implements AutoCloseable {
private static final Logger LOG = LoggerFactory.getLogger(BigQueryTableRowIterator.class);

// Table being read. For query-based iterators this starts null and is set by open()
// to the temporary table holding the query results.
@Nullable private TableReference ref;
// Project billed for the query; null when reading a table directly.
@Nullable private final String projectId;
// Schema of the table being read; populated by open().
@Nullable private TableSchema schema;
private final Bigquery client;
// Pagination cursor returned by tabledata.list; null before the first page.
private String pageToken;
// Rows of the most recently fetched page. (Parameterized type restored: the raw
// Iterator would not compile where advance() passes next() as a TableRow.)
private Iterator<TableRow> iteratorOverCurrentBatch;
// Row returned by getCurrent(); null until advance() first succeeds.
private TableRow current;
// Set true when the final page is seen from the service.
private boolean lastPage = false;
// The maximum number of times a BigQuery request will be retried
private static final int MAX_RETRIES = 3;
// Initial wait time for the backoff implementation
private static final Duration INITIAL_BACKOFF_TIME = Duration.standardSeconds(1);
// After sending a query to BQ service we will be polling the BQ service to check the status with
// following interval to check the status of query execution job
private static final Duration QUERY_COMPLETION_POLL_TIME = Duration.standardSeconds(1);
// Query to execute, or null when reading a table directly.
private final String query;
// Whether to flatten query results.
private final boolean flattenResults;
// Whether to use the BigQuery legacy SQL dialect.
private final boolean useLegacySql;
// Temporary dataset used to store query results.
private String temporaryDatasetId = null;
// Temporary table used to store query results.
private String temporaryTableId = null;
/**
 * Creates an iterator that reads either {@code ref} directly (when {@code query} is null)
 * or the results of {@code query} executed in {@code projectId}.
 *
 * @param client the BigQuery client; must not be null
 */
private BigQueryTableRowIterator(
    @Nullable TableReference ref, @Nullable String query, @Nullable String projectId,
    Bigquery client, boolean flattenResults, boolean useLegacySql) {
  // Validate the only required argument first; the rest are stored as given.
  this.client = checkNotNull(client, "client");
  this.ref = ref;
  this.query = query;
  this.projectId = projectId;
  this.flattenResults = flattenResults;
  this.useLegacySql = useLegacySql;
}
/**
 * Constructs a {@code BigQueryTableRowIterator} that reads from the specified table.
 *
 * @param ref the table to read; must not be null
 * @param client the BigQuery client; must not be null
 */
public static BigQueryTableRowIterator fromTable(TableReference ref, Bigquery client) {
  checkNotNull(ref, "ref");
  checkNotNull(client, "client");
  // Direct table reads use the table's own project and the default dialect settings.
  return new BigQueryTableRowIterator(ref, null, ref.getProjectId(), client, true, true);
}
/**
 * Constructs a {@code BigQueryTableRowIterator} that reads from the results of executing the
 * specified query in the specified project, with legacy SQL enabled by default.
 *
 * @deprecated use {@link #fromQuery(String, String, Bigquery, Boolean, Boolean)}.
 */
@Deprecated
public static BigQueryTableRowIterator fromQuery(
    String query, String projectId, Bigquery client, @Nullable Boolean flattenResults) {
  // Delegate with useLegacySql unset so the replacement applies its default (true).
  return fromQuery(query, projectId, client, flattenResults, null /* useLegacySql */);
}
/**
 * Constructs a {@code BigQueryTableRowIterator} that reads from the results of executing the
 * specified query in the specified project.
 *
 * @param flattenResults whether to flatten query results; defaults to {@code true} when null
 * @param useLegacySql whether to use the legacy SQL dialect; defaults to {@code true} when null
 */
public static BigQueryTableRowIterator fromQuery(
    String query, String projectId, Bigquery client, @Nullable Boolean flattenResults,
    @Nullable Boolean useLegacySql) {
  checkNotNull(query, "query");
  checkNotNull(projectId, "projectId");
  checkNotNull(client, "client");
  // Both dialect flags default to true when the caller leaves them unset.
  boolean flatten = MoreObjects.firstNonNull(flattenResults, Boolean.TRUE);
  boolean legacy = MoreObjects.firstNonNull(useLegacySql, Boolean.TRUE);
  return new BigQueryTableRowIterator(null, query, projectId, client, flatten, legacy);
}
/**
 * Opens the table for read. For query-based iterators, this first executes the query and
 * points {@code ref} at the temporary results table; it then fetches the table's schema.
 *
 * @throws IOException on failure
 * @throws InterruptedException if interrupted while waiting for the query to complete
 */
public void open() throws IOException, InterruptedException {
  if (query != null) {
    // Run the query; the results land in a temporary table that we then read.
    ref = executeQueryAndWaitForCompletion();
  }
  Table table = getTable(ref);
  schema = table.getSchema();
}
/**
 * Advances to the next row, fetching additional pages from the tabledata.list API as needed.
 *
 * @return {@code true} if a row is available via {@link #getCurrent()}, {@code false} when the
 *     table is exhausted
 * @throws IOException on a non-retryable service failure
 * @throws InterruptedException if interrupted while backing off between retries
 */
public boolean advance() throws IOException, InterruptedException {
while (true) {
// Serve from the current page first, if any rows remain.
if (iteratorOverCurrentBatch != null && iteratorOverCurrentBatch.hasNext()) {
// Embed schema information into the raw row, so that values have an
// associated key. This matches how rows are read when using the
// DataflowPipelineRunner.
current = getTypedTableRow(schema.getFields(), iteratorOverCurrentBatch.next());
return true;
}
// Current page exhausted; stop if the service already signaled the final page.
if (lastPage) {
return false;
}
// Fetch the next page, resuming from the saved page token when one exists.
Bigquery.Tabledata.List list =
client.tabledata().list(ref.getProjectId(), ref.getDatasetId(), ref.getTableId());
if (pageToken != null) {
list.setPageToken(pageToken);
}
TableDataList result = executeWithBackOff(
list,
String.format(
"Error reading from BigQuery table %s of dataset %s.",
ref.getTableId(), ref.getDatasetId()));
pageToken = result.getPageToken();
// An absent row list means an empty page; use an empty iterator so the loop re-checks state.
iteratorOverCurrentBatch =
result.getRows() != null
? result.getRows().iterator()
: Collections.emptyIterator();
// The server may return a page token indefinitely on a zero-length table.
if (pageToken == null || result.getTotalRows() != null && result.getTotalRows() == 0) {
lastPage = true;
}
}
}
/**
 * Returns the row produced by the most recent successful {@link #advance()}.
 *
 * @throws NoSuchElementException if no row is available
 */
public TableRow getCurrent() {
  TableRow row = current;
  if (row == null) {
    throw new NoSuchElementException();
  }
  return row;
}
/**
 * Adjusts a field returned from the BigQuery API to match what we will receive when running
 * BigQuery's export-to-GCS and parallel read, which is the efficient parallel implementation
 * used for batch jobs executed on the Cloud Dataflow service.
 *
 * <p>The following is the relationship between BigQuery schema and Java types:
 *
 * <ul>
 *   <li>Nulls are {@code null}.
 *   <li>Repeated fields are {@code List} of objects.
 *   <li>Record columns are {@link TableRow} objects.
 *   <li>{@code BOOLEAN} columns are JSON booleans, hence Java {@code Boolean} objects.
 *   <li>{@code FLOAT} columns are JSON floats, hence Java {@code Double} objects.
 *   <li>{@code TIMESTAMP} columns are {@code String} objects that are of the format
 *       {@code yyyy-MM-dd HH:mm:ss[.SSSSSS] UTC}, where the {@code .SSSSSS} has no trailing
 *       zeros and can be 1 to 6 digits long.
 *   <li>Every other atomic type is a {@code String}.
 * </ul>
 *
 * <p>Note that integers are encoded as strings to match BigQuery's exported JSON format.
 *
 * <p>Finally, values are stored in the {@link TableRow} as {"field name": value} pairs
 * and are not accessible through the {@link TableRow#getF} function.
 */
@Nullable private Object getTypedCellValue(TableFieldSchema fieldSchema, Object v) {
if (Data.isNull(v)) {
return null;
}
if (Objects.equals(fieldSchema.getMode(), "REPEATED")) {
TableFieldSchema elementSchema = fieldSchema.clone().setMode("REQUIRED");
@SuppressWarnings("unchecked")
List