/*
* Copyright (C) 2015 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package com.google.cloud.dataflow.sdk.io;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Preconditions.checkState;
import com.google.api.client.json.JsonFactory;
import com.google.api.services.bigquery.Bigquery;
import com.google.api.services.bigquery.model.Job;
import com.google.api.services.bigquery.model.JobConfigurationExtract;
import com.google.api.services.bigquery.model.JobConfigurationLoad;
import com.google.api.services.bigquery.model.JobConfigurationQuery;
import com.google.api.services.bigquery.model.JobConfigurationTableCopy;
import com.google.api.services.bigquery.model.JobReference;
import com.google.api.services.bigquery.model.JobStatistics;
import com.google.api.services.bigquery.model.JobStatus;
import com.google.api.services.bigquery.model.QueryRequest;
import com.google.api.services.bigquery.model.TableReference;
import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.coders.AtomicCoder;
import com.google.cloud.dataflow.sdk.coders.Coder;
import com.google.cloud.dataflow.sdk.coders.Coder.Context;
import com.google.cloud.dataflow.sdk.coders.CoderException;
import com.google.cloud.dataflow.sdk.coders.KvCoder;
import com.google.cloud.dataflow.sdk.coders.StandardCoder;
import com.google.cloud.dataflow.sdk.coders.StringUtf8Coder;
import com.google.cloud.dataflow.sdk.coders.TableRowJsonCoder;
import com.google.cloud.dataflow.sdk.coders.VarIntCoder;
import com.google.cloud.dataflow.sdk.coders.VoidCoder;
import com.google.cloud.dataflow.sdk.options.BigQueryOptions;
import com.google.cloud.dataflow.sdk.options.GcpOptions;
import com.google.cloud.dataflow.sdk.options.PipelineOptions;
import com.google.cloud.dataflow.sdk.options.ValueProvider;
import com.google.cloud.dataflow.sdk.options.ValueProvider.NestedValueProvider;
import com.google.cloud.dataflow.sdk.options.ValueProvider.StaticValueProvider;
import com.google.cloud.dataflow.sdk.runners.PipelineRunner;
import com.google.cloud.dataflow.sdk.transforms.Aggregator;
import com.google.cloud.dataflow.sdk.transforms.Create;
import com.google.cloud.dataflow.sdk.transforms.DoFn;
import com.google.cloud.dataflow.sdk.transforms.GroupByKey;
import com.google.cloud.dataflow.sdk.transforms.PTransform;
import com.google.cloud.dataflow.sdk.transforms.ParDo;
import com.google.cloud.dataflow.sdk.transforms.SerializableFunction;
import com.google.cloud.dataflow.sdk.transforms.Sum;
import com.google.cloud.dataflow.sdk.transforms.View;
import com.google.cloud.dataflow.sdk.transforms.display.DisplayData;
import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
import com.google.cloud.dataflow.sdk.transforms.windowing.DefaultTrigger;
import com.google.cloud.dataflow.sdk.transforms.windowing.GlobalWindows;
import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
import com.google.cloud.dataflow.sdk.util.AvroUtils;
import com.google.cloud.dataflow.sdk.util.BigQueryServices;
import com.google.cloud.dataflow.sdk.util.BigQueryServices.DatasetService;
import com.google.cloud.dataflow.sdk.util.BigQueryServices.JobService;
import com.google.cloud.dataflow.sdk.util.BigQueryServicesImpl;
import com.google.cloud.dataflow.sdk.util.BigQueryTableInserter;
import com.google.cloud.dataflow.sdk.util.BigQueryTableRowIterator;
import com.google.cloud.dataflow.sdk.util.FileIOChannelFactory;
import com.google.cloud.dataflow.sdk.util.GcsIOChannelFactory;
import com.google.cloud.dataflow.sdk.util.GcsUtil;
import com.google.cloud.dataflow.sdk.util.GcsUtil.GcsUtilFactory;
import com.google.cloud.dataflow.sdk.util.IOChannelFactory;
import com.google.cloud.dataflow.sdk.util.IOChannelUtils;
import com.google.cloud.dataflow.sdk.util.MimeTypes;
import com.google.cloud.dataflow.sdk.util.PropertyNames;
import com.google.cloud.dataflow.sdk.util.Reshuffle;
import com.google.cloud.dataflow.sdk.util.SystemDoFnInternal;
import com.google.cloud.dataflow.sdk.util.Transport;
import com.google.cloud.dataflow.sdk.util.gcsfs.GcsPath;
import com.google.cloud.dataflow.sdk.values.KV;
import com.google.cloud.dataflow.sdk.values.PCollection;
import com.google.cloud.dataflow.sdk.values.PCollectionTuple;
import com.google.cloud.dataflow.sdk.values.PCollectionView;
import com.google.cloud.dataflow.sdk.values.PDone;
import com.google.cloud.dataflow.sdk.values.PInput;
import com.google.cloud.dataflow.sdk.values.TupleTag;
import com.google.cloud.dataflow.sdk.values.TupleTagList;
import com.google.cloud.hadoop.util.ApiErrorExtractor;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Function;
import com.google.common.base.MoreObjects;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import com.google.common.io.CountingOutputStream;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import org.apache.avro.generic.GenericRecord;
import org.joda.time.Duration;
import org.joda.time.Instant;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.OutputStream;
import java.io.Serializable;
import java.nio.channels.Channels;
import java.nio.channels.WritableByteChannel;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.atomic.AtomicReference;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.annotation.Nullable;
/**
* {@link PTransform}s for reading and writing
* BigQuery tables.
*
* Table References
*
* A fully-qualified BigQuery table name consists of three components:
*
 * - {@code projectId}: the Cloud project id (defaults to
 *   {@link GcpOptions#getProject()}).
 * - {@code datasetId}: the BigQuery dataset id, unique within a project.
 * - {@code tableId}: a table id, unique within a dataset.
 *
*
* BigQuery table references are stored as a {@link TableReference}, which comes
* from the
* BigQuery Java Client API.
* Tables can be referred to as Strings, with or without the {@code projectId}.
* A helper function is provided ({@link BigQueryIO#parseTableSpec(String)})
* that parses the following string forms into a {@link TableReference}:
*
*
 * - [{@code project_id}]:[{@code dataset_id}].[{@code table_id}]
 * - [{@code dataset_id}].[{@code table_id}]
*
*
* Reading
*
* To read from a BigQuery table, apply a {@link BigQueryIO.Read} transformation.
* This produces a {@link PCollection} of {@link TableRow TableRows} as output:
*
 * {@code
 * PCollection<TableRow> weatherData = pipeline.apply(
* BigQueryIO.Read.named("Read")
* .from("clouddataflow-readonly:samples.weather_stations"));
* }
*
* See {@link TableRow} for more information on the {@link TableRow} object.
*
*
 * Users may provide a query to read from rather than reading all of a BigQuery table. If
* specified, the result obtained by executing the specified query will be used as the data of the
* input transform.
*
*
 * {@code
 * PCollection<TableRow> meanTemperatureData = pipeline.apply(
* BigQueryIO.Read.named("Read")
* .fromQuery("SELECT year, mean_temp FROM [samples.weather_stations]"));
* }
*
* When creating a BigQuery input transform, users should provide either a query or a table.
* Pipeline construction will fail with a validation error if neither or both are specified.
*
*
 * Writing
*
* To write to a BigQuery table, apply a {@link BigQueryIO.Write} transformation.
* This consumes a {@link PCollection} of {@link TableRow TableRows} as input.
*
 * {@code
 * PCollection<TableRow> quotes = ...
 *
 * List<TableFieldSchema> fields = new ArrayList<>();
* fields.add(new TableFieldSchema().setName("source").setType("STRING"));
* fields.add(new TableFieldSchema().setName("quote").setType("STRING"));
* TableSchema schema = new TableSchema().setFields(fields);
*
* quotes.apply(BigQueryIO.Write
* .named("Write")
* .to("my-project:output.output_table")
* .withSchema(schema)
* .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE));
* }
*
* See {@link BigQueryIO.Write} for details on how to specify if a write should
* append to an existing table, replace the table, or verify that the table is
* empty. Note that the dataset being written to must already exist. Write
* dispositions are not supported in streaming mode.
*
*
 * Sharding BigQuery output tables
*
* A common use case is to dynamically generate BigQuery table names based on
* the current window. To support this,
* {@link BigQueryIO.Write#to(SerializableFunction)}
* accepts a function mapping the current window to a tablespec. For example,
* here's code that outputs daily tables to BigQuery:
*
 * {@code
 * PCollection<TableRow> quotes = ...
* quotes.apply(Window.into(CalendarWindows.days(1)))
* .apply(BigQueryIO.Write
* .named("Write")
* .withSchema(schema)
 * .to(new SerializableFunction<BoundedWindow, String>() {
* public String apply(BoundedWindow window) {
* // The cast below is safe because CalendarWindows.days(1) produces IntervalWindows.
* String dayString = DateTimeFormat.forPattern("yyyy_MM_dd")
* .withZone(DateTimeZone.UTC)
* .print(((IntervalWindow) window).start());
* return "my-project:output.output_table_" + dayString;
* }
* }));
* }
*
* Per-window tables are not yet supported in batch mode.
*
*
 * Permissions
*
* Permission requirements depend on the {@link PipelineRunner} that is used to execute the
* Dataflow job. Please refer to the documentation of corresponding {@link PipelineRunner}s for
* more details.
*
*
 * Please see BigQuery Access Control
* for security and permission related information specific to BigQuery.
*/
public class BigQueryIO {
private static final Logger LOG = LoggerFactory.getLogger(BigQueryIO.class);
/**
* Singleton instance of the JSON factory used to read and write JSON
* formatted rows.
*/
private static final JsonFactory JSON_FACTORY = Transport.getJsonFactory();
/**
* Project IDs must contain 6-63 lowercase letters, digits, or dashes.
* IDs must start with a letter and may not end with a dash.
* This regex isn't exact - this allows for patterns that would be rejected by
* the service, but this is sufficient for basic parsing of table references.
*/
private static final String PROJECT_ID_REGEXP = "[a-z][-a-z0-9:.]{4,61}[a-z0-9]";
/**
* Regular expression that matches Dataset IDs.
*/
private static final String DATASET_REGEXP = "[-\\w.]{1,1024}";
/**
* Regular expression that matches Table IDs.
*/
private static final String TABLE_REGEXP = "[-\\w$@]{1,1024}";
/**
* Matches table specifications in the form {@code "[project_id]:[dataset_id].[table_id]"} or
* {@code "[dataset_id].[table_id]"}.
*/
private static final String DATASET_TABLE_REGEXP =
String.format("((?<PROJECT>%s):)?(?<DATASET>%s)\\.(?<TABLE>%s)", PROJECT_ID_REGEXP,
DATASET_REGEXP, TABLE_REGEXP);
private static final Pattern TABLE_SPEC = Pattern.compile(DATASET_TABLE_REGEXP);
@Deprecated // unused.
public static final String SET_PROJECT_FROM_OPTIONS_WARNING =
"No project specified for BigQuery table \"%1$s.%2$s\". Assuming it is in \"%3$s\". If the"
+ " table is in a different project please specify it as a part of the BigQuery table"
+ " definition.";
private static final String RESOURCE_NOT_FOUND_ERROR =
"BigQuery %1$s not found for table \"%2$s\" . Please create the %1$s before pipeline"
+ " execution. If the %1$s is created by an earlier stage of the pipeline, this"
+ " validation can be disabled using #withoutValidation.";
private static final String UNABLE_TO_CONFIRM_PRESENCE_OF_RESOURCE_ERROR =
"Unable to confirm BigQuery %1$s presence for table \"%2$s\". If the %1$s is created by"
+ " an earlier stage of the pipeline, this validation can be disabled using"
+ " #withoutValidation.";
/**
* Parse a table specification in the form
* {@code "[project_id]:[dataset_id].[table_id]"} or {@code "[dataset_id].[table_id]"}.
*
* If the project id is omitted, the default project id is used.
*/
public static TableReference parseTableSpec(String tableSpec) {
Matcher match = TABLE_SPEC.matcher(tableSpec);
if (!match.matches()) {
throw new IllegalArgumentException(
"Table reference is not in [project_id]:[dataset_id].[table_id] "
+ "format: " + tableSpec);
}
TableReference ref = new TableReference();
ref.setProjectId(match.group("PROJECT"));
return ref.setDatasetId(match.group("DATASET")).setTableId(match.group("TABLE"));
}
/**
* Returns a canonical string representation of the {@link TableReference}.
*/
public static String toTableSpec(TableReference ref) {
StringBuilder sb = new StringBuilder();
if (ref.getProjectId() != null) {
sb.append(ref.getProjectId());
sb.append(":");
}
sb.append(ref.getDatasetId()).append('.').append(ref.getTableId());
return sb.toString();
}
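// Illustrative sketch (not part of the original source): round-tripping a table spec
// through the two helpers above. The spec string is made up.
//
//   TableReference ref = BigQueryIO.parseTableSpec("my-project:my_dataset.my_table");
//   // ref.getProjectId() -> "my-project", ref.getDatasetId() -> "my_dataset",
//   // ref.getTableId()   -> "my_table"
//   String spec = BigQueryIO.toTableSpec(ref);  // "my-project:my_dataset.my_table"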
@VisibleForTesting
static class JsonSchemaToTableSchema
implements SerializableFunction<String, TableSchema> {
@Override
public TableSchema apply(String from) {
return fromJsonString(from, TableSchema.class);
}
}
private static class TableSchemaToJsonSchema
implements SerializableFunction<TableSchema, String> {
@Override
public String apply(TableSchema from) {
return toJsonString(from);
}
}
private static class JsonTableRefToTableRef
implements SerializableFunction<String, TableReference> {
@Override
public TableReference apply(String from) {
return fromJsonString(from, TableReference.class);
}
}
private static class TableRefToTableSpec
implements SerializableFunction<TableReference, String> {
@Override
public String apply(TableReference from) {
return toTableSpec(from);
}
}
private static class TableRefToJson
implements SerializableFunction<TableReference, String> {
@Override
public String apply(TableReference from) {
return toJsonString(from);
}
}
private static class TableRefToProjectId
implements SerializableFunction<TableReference, String> {
@Override
public String apply(TableReference from) {
return from.getProjectId();
}
}
@VisibleForTesting
static class TableSpecToTableRef
implements SerializableFunction<String, TableReference> {
@Override
public TableReference apply(String from) {
return parseTableSpec(from);
}
}
@Nullable
private static ValueProvider<String> displayTable(
@Nullable ValueProvider<TableReference> table) {
if (table == null) {
return null;
}
return NestedValueProvider.of(table, new TableRefToTableSpec());
}
/**
* A {@link PTransform} that reads from a BigQuery table and returns a
* {@link PCollection} of {@link TableRow TableRows} containing each of the rows of the table.
*
* Each {@link TableRow} contains values indexed by column name. Here is a
* sample processing function that processes a "line" column from rows:
*
 * {@code
 * static class ExtractWordsFn extends DoFn<TableRow, String> {
* public void processElement(ProcessContext c) {
* // Get the "line" field of the TableRow object, split it into words, and emit them.
* TableRow row = c.element();
* String[] words = row.get("line").toString().split("[^a-zA-Z']+");
* for (String word : words) {
* if (!word.isEmpty()) {
* c.output(word);
* }
* }
* }
* }}
*/
public static class Read {
/**
* Returns a {@link Read.Bound} with the given name. The BigQuery table or query to be read
* from has not yet been configured.
*/
public static Bound named(String name) {
return new Bound().named(name);
}
/**
* Reads a BigQuery table specified as {@code "[project_id]:[dataset_id].[table_id]"} or
* {@code "[dataset_id].[table_id]"} for tables within the current project.
*/
public static Bound from(String tableSpec) {
return new Bound().from(StaticValueProvider.of(tableSpec));
}
/**
* Same as {@code from(String)}, but with a {@link ValueProvider}.
*/
public static Bound from(ValueProvider<String> tableSpec) {
return new Bound().from(tableSpec);
}
/**
* Reads results received after executing the given query.
*/
public static Bound fromQuery(String query) {
return new Bound().fromQuery(StaticValueProvider.of(query));
}
/**
 * Same as {@code fromQuery(String)}, but with a {@link ValueProvider}.
 */
public static Bound fromQuery(ValueProvider<String> query) {
return new Bound().fromQuery(query);
}
/**
* Reads a BigQuery table specified as a {@link TableReference} object.
*/
public static Bound from(TableReference table) {
return new Bound().from(table);
}
/**
* Disables BigQuery table validation, which is enabled by default.
*/
public static Bound withoutValidation() {
return new Bound().withoutValidation();
}
/**
* A {@link PTransform} that reads from a BigQuery table and returns a bounded
* {@link PCollection} of {@link TableRow TableRows}.
*/
public static class Bound extends PTransform<PInput, PCollection<TableRow>> {
@Nullable final ValueProvider<String> jsonTableRef;
@Nullable final ValueProvider<String> query;
final boolean validate;
@Nullable final Boolean flattenResults;
@Nullable final Boolean useLegacySql;
@Nullable BigQueryServices bigQueryServices;
private static final String QUERY_VALIDATION_FAILURE_ERROR =
"Validation of query \"%1$s\" failed. If the query depends on an earlier stage of the"
+ " pipeline, This validation can be disabled using #withoutValidation.";
private Bound() {
this(
null /* name */,
null /* query */,
null /* jsonTableRef */,
true /* validate */,
null /* flattenResults */,
null /* useLegacySql */,
null /* bigQueryServices */);
}
private Bound(
String name, @Nullable ValueProvider<String> query,
@Nullable ValueProvider<String> jsonTableRef, boolean validate,
@Nullable Boolean flattenResults, @Nullable Boolean useLegacySql,
@Nullable BigQueryServices bigQueryServices) {
super(name);
this.jsonTableRef = jsonTableRef;
this.query = query;
this.validate = validate;
this.flattenResults = flattenResults;
this.useLegacySql = useLegacySql;
this.bigQueryServices = bigQueryServices;
}
/**
 * Returns a copy of this transform, but with the specified transform name.
*
* Does not modify this object.
*/
public Bound named(String name) {
return new Bound(
name, query, jsonTableRef, validate, flattenResults, useLegacySql,
bigQueryServices);
}
/**
* Returns a copy of this transform that reads from the specified table. Refer to
* {@link #parseTableSpec(String)} for the specification format.
*
*
 * Does not modify this object.
*/
public Bound from(String tableSpec) {
return from(StaticValueProvider.of(tableSpec));
}
/**
* Returns a copy of this transform that reads from the specified table. Refer to
* {@link #parseTableSpec(String)} for the specification format.
*
*
 * Does not modify this object.
*/
public Bound from(ValueProvider<String> tableSpec) {
return new Bound(
name, query,
NestedValueProvider.of(
NestedValueProvider.of(
tableSpec, new TableSpecToTableRef()),
new TableRefToJson()),
validate, flattenResults, useLegacySql, bigQueryServices);
}
/**
* Returns a copy of this transform that reads from the specified table.
*
* Does not modify this object.
*/
public Bound from(TableReference table) {
return from(StaticValueProvider.of(toTableSpec(table)));
}
/**
* Returns a copy of this transform that reads the results of the specified query.
*
*
 * Does not modify this object.
*
*
 * By default, the query results will be flattened -- see
* "flattenResults" in the
* Jobs documentation for more information. To disable flattening, use
* {@link BigQueryIO.Read.Bound#withoutResultFlattening}.
*
*
 * By default, the query will use BigQuery's legacy SQL dialect. To use the BigQuery
* Standard SQL dialect, use {@link BigQueryIO.Read.Bound#usingStandardSql}.
*/
public Bound fromQuery(String query) {
return fromQuery(StaticValueProvider.of(query));
}
/**
* Like {@link #fromQuery(String)}, but from a {@link ValueProvider}.
*/
public Bound fromQuery(ValueProvider<String> query) {
return new Bound(name, query, jsonTableRef, validate,
MoreObjects.firstNonNull(flattenResults, Boolean.TRUE),
MoreObjects.firstNonNull(useLegacySql, Boolean.TRUE),
bigQueryServices);
}
/**
* Disable table validation.
*/
public Bound withoutValidation() {
return new Bound(
name, query, jsonTableRef, false /* validate */, flattenResults, useLegacySql,
bigQueryServices);
}
/**
 * Disable flattening of query results.
*
* Only valid when a query is used ({@link #fromQuery}). Setting this option when reading
* from a table will cause an error during validation.
*/
public Bound withoutResultFlattening() {
return new Bound(
name, query, jsonTableRef, validate, false /* flattenResults */, useLegacySql,
bigQueryServices);
}
/**
* Enables BigQuery's Standard SQL dialect when reading from a query.
*
*
 * Only valid when a query is used ({@link #fromQuery}). Setting this option when reading
* from a table will cause an error during validation.
*/
public Bound usingStandardSql() {
return new Bound(
name, query, jsonTableRef, validate, flattenResults, false /* useLegacySql */,
bigQueryServices);
}
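// A hedged usage sketch (not part of the original source) showing how the query-related
// options above chain together; the query text and table are illustrative.
//
//   PCollection<TableRow> rows = pipeline.apply(
//       BigQueryIO.Read.named("ReadQuery")
//           .fromQuery("SELECT year, mean_temp FROM `samples.weather_stations`")
//           .usingStandardSql()
//           .withoutValidation());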
@VisibleForTesting
Bound withTestServices(BigQueryServices testServices) {
return new Bound(
name, query, jsonTableRef, validate, flattenResults, useLegacySql, testServices);
}
@Override
public void validate(PInput input) {
if (!validate) {
// Note that a table or query check can fail if the table or dataset are created by
// earlier stages of the pipeline or if a query depends on earlier stages of a pipeline.
// For these cases the withoutValidation method can be used to disable the check.
return;
}
BigQueryOptions bqOptions = input.getPipeline().getOptions().as(BigQueryOptions.class);
String tempLocation = bqOptions.getTempLocation();
checkArgument(
!Strings.isNullOrEmpty(tempLocation),
"BigQueryIO.Read needs a GCS temp location to store temp files.");
if (bigQueryServices == null) {
try {
GcsPath.fromUri(tempLocation);
} catch (IllegalArgumentException e) {
throw new IllegalArgumentException(
String.format(
"BigQuery temp location expected a valid 'gs://' path, but was given '%s'",
tempLocation),
e);
}
}
ValueProvider<TableReference> table = getTableWithDefaultProject(bqOptions);
checkState(
table == null || query == null,
"Invalid BigQueryIO.Read: table reference and query may not both be set");
checkState(
table != null || query != null,
"Invalid BigQueryIO.Read: one of table reference and query must be set");
if (table != null) {
checkState(
flattenResults == null,
"Invalid BigQueryIO.Read: Specifies a table with a result flattening"
+ " preference, which only applies to queries");
checkState(
useLegacySql == null,
"Invalid BigQueryIO.Read: Specifies a table with a SQL dialect"
+ " preference, which only applies to queries");
checkState(table.isAccessible(), "Cannot call validate if table is dynamically set.");
// Check for source table presence for early failure notification.
verifyDatasetPresence(bqOptions, table.get());
verifyTablePresence(bqOptions, table.get());
} else /* query != null */ {
checkState(query.isAccessible(), "Cannot call validate if query is dynamically set.");
checkState(flattenResults != null, "flattenResults should not be null if query is set");
checkState(useLegacySql != null, "useLegacySql should not be null if query is set");
dryRunQuery(bqOptions, query.get(), useLegacySql);
}
}
private static void dryRunQuery(
BigQueryOptions options, String query, boolean useLegacySql) {
Bigquery client = Transport.newBigQueryClient(options).build();
QueryRequest request = new QueryRequest();
request.setQuery(query);
request.setDryRun(true);
request.setUseLegacySql(useLegacySql);
String queryValidationErrorMsg = String.format(QUERY_VALIDATION_FAILURE_ERROR, query);
try {
BigQueryTableRowIterator.executeWithBackOff(
client.jobs().query(options.getProject(), request),
queryValidationErrorMsg);
} catch (Exception e) {
throw new IllegalArgumentException(queryValidationErrorMsg, e);
}
}
@Override
public PCollection<TableRow> apply(PInput input) {
String uuid = randomUUIDString();
final String jobIdToken = "beam_job_" + uuid;
BigQueryOptions bqOptions = input.getPipeline().getOptions().as(BigQueryOptions.class);
BoundedSource<TableRow> source;
final BigQueryServices bqServices = getBigQueryServices();
final String extractDestinationDir;
String tempLocation = bqOptions.getTempLocation();
try {
IOChannelFactory factory = IOChannelUtils.getFactory(tempLocation);
extractDestinationDir = factory.resolve(tempLocation, uuid);
} catch (IOException e) {
throw new RuntimeException(
String.format("Failed to resolve extract destination directory in %s", tempLocation), e);
}
final String executingProject = bqOptions.getProject();
if (query != null && (!query.isAccessible() || !Strings.isNullOrEmpty(query.get()))) {
String queryTempDatasetId = "temp_dataset_" + uuid;
String queryTempTableId = "temp_table_" + uuid;
TableReference queryTempTableRef = new TableReference()
.setProjectId(executingProject)
.setDatasetId(queryTempDatasetId)
.setTableId(queryTempTableId);
String jsonTableRef = toJsonString(queryTempTableRef);
source = BigQueryQuerySource.create(
jobIdToken, query, NestedValueProvider.of(
StaticValueProvider.of(jsonTableRef), new JsonTableRefToTableRef()),
flattenResults, useLegacySql, extractDestinationDir, bqServices);
} else {
ValueProvider<TableReference> inputTable = getTableWithDefaultProject(bqOptions);
source = BigQueryTableSource.create(
jobIdToken, inputTable, extractDestinationDir, bqServices,
StaticValueProvider.of(executingProject));
}
PassThroughThenCleanup.CleanupOperation cleanupOperation =
new PassThroughThenCleanup.CleanupOperation() {
@Override
void cleanup(PipelineOptions options) throws Exception {
BigQueryOptions bqOptions = options.as(BigQueryOptions.class);
JobReference jobRef = new JobReference()
.setProjectId(executingProject)
.setJobId(getExtractJobId(jobIdToken));
Job extractJob = bqServices.getJobService(bqOptions)
.getJob(jobRef);
Collection<String> extractFiles = null;
if (extractJob != null) {
extractFiles = getExtractFilePaths(extractDestinationDir, extractJob);
} else {
IOChannelFactory factory = IOChannelUtils.getFactory(extractDestinationDir);
Collection<String> dirMatch = factory.match(extractDestinationDir);
if (!dirMatch.isEmpty()) {
extractFiles = factory.match(factory.resolve(extractDestinationDir, "*"));
}
}
if (extractFiles != null && !extractFiles.isEmpty()) {
new GcsUtilFactory().create(options).remove(extractFiles);
}
}};
return input.getPipeline()
.apply(com.google.cloud.dataflow.sdk.io.Read.from(source))
.setCoder(getDefaultOutputCoder())
.apply(new PassThroughThenCleanup(cleanupOperation));
}
@Override
protected Coder<TableRow> getDefaultOutputCoder() {
return TableRowJsonCoder.of();
}
@Override
public void populateDisplayData(DisplayData.Builder builder) {
super.populateDisplayData(builder);
builder
.addIfNotNull(DisplayData.item("table", displayTable(getTableProvider()))
.withLabel("Table"))
.addIfNotNull(DisplayData.item("query", query)
.withLabel("Query"))
.addIfNotNull(DisplayData.item("flattenResults", flattenResults)
.withLabel("Flatten Query Results"))
.addIfNotNull(DisplayData.item("useLegacySql", useLegacySql)
.withLabel("Use Legacy SQL Dialect"))
.addIfNotDefault(DisplayData.item("validation", validate)
.withLabel("Validation Enabled"),
true);
}
/**
* Returns the table to read, or {@code null} if reading from a query instead.
*
* If the table's project is not specified, use the executing project.
*/
@Nullable private ValueProvider<TableReference> getTableWithDefaultProject(
BigQueryOptions bqOptions) {
ValueProvider<TableReference> table = getTableProvider();
if (table == null) {
return table;
}
if (!table.isAccessible()) {
LOG.info("Using a dynamic value for table input. This must contain a project"
+ " in the table reference: {}", table);
return table;
}
if (Strings.isNullOrEmpty(table.get().getProjectId())) {
// If user does not specify a project we assume the table to be located in
// the default project.
TableReference tableRef = table.get();
tableRef.setProjectId(bqOptions.getProject());
return NestedValueProvider.of(StaticValueProvider.of(
toJsonString(tableRef)), new JsonTableRefToTableRef());
}
return table;
}
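// Illustrative sketch of the defaulting above (values made up): with --project=my-project,
// a table given without a project is filled in before the export job is created.
//
//   TableReference ref = BigQueryIO.parseTableSpec("samples.weather_stations");
//   // ref.getProjectId() == null, so the read resolves it to
//   // "my-project:samples.weather_stations" using BigQueryOptions.getProject().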
/**
* Returns the table to read, or {@code null} if reading from a query instead.
*/
@Nullable
public ValueProvider<TableReference> getTableProvider() {
return jsonTableRef == null
? null : NestedValueProvider.of(jsonTableRef, new JsonTableRefToTableRef());
}
/**
* Returns the table to read, or {@code null} if reading from a query instead.
*/
@Nullable
public TableReference getTable() {
ValueProvider<TableReference> provider = getTableProvider();
return provider == null ? null : provider.get();
}
/**
* Returns the query to be read, or {@code null} if reading from a table instead.
*/
@Nullable
public String getQuery() {
return query == null ? null : query.get();
}
/**
* Returns the query to be read, or {@code null} if reading from a table instead.
*/
@Nullable
public ValueProvider<String> getQueryProvider() {
return query;
}
/**
* Returns true if table validation is enabled.
*/
public boolean getValidate() {
return validate;
}
/**
* Returns true/false if result flattening is enabled/disabled, or null if not applicable.
*/
public Boolean getFlattenResults() {
return flattenResults;
}
/**
* Returns true (false) if the query will (will not) use BigQuery's legacy SQL mode, or null
* if not applicable.
*/
@Nullable
public Boolean getUseLegacySql() {
return useLegacySql;
}
private BigQueryServices getBigQueryServices() {
if (bigQueryServices == null) {
bigQueryServices = new BigQueryServicesImpl();
}
return bigQueryServices;
}
}
/** Disallow construction of utility class. */
private Read() {}
}
/**
* A {@link PTransform} that invokes {@link CleanupOperation} after the input {@link PCollection}
* has been processed.
*/
@VisibleForTesting
static class PassThroughThenCleanup<T> extends PTransform<PCollection<T>, PCollection<T>> {
private CleanupOperation cleanupOperation;
PassThroughThenCleanup(CleanupOperation cleanupOperation) {
this.cleanupOperation = cleanupOperation;
}
@Override
public PCollection<T> apply(PCollection<T> input) {
TupleTag<T> mainOutput = new TupleTag<>();
TupleTag<Void> cleanupSignal = new TupleTag<>();
PCollectionTuple outputs = input.apply(ParDo.of(new IdentityFn<T>())
.withOutputTags(mainOutput, TupleTagList.of(cleanupSignal)));
PCollectionView<Void> cleanupSignalView = outputs.get(cleanupSignal)
.setCoder(VoidCoder.of())
.apply(View.asSingleton().withDefaultValue(null));
input.getPipeline()
.apply("Create(CleanupOperation)", Create.of(cleanupOperation))
.apply("Cleanup", ParDo.of(
new DoFn<CleanupOperation, Void>() {
@Override
public void processElement(ProcessContext c)
throws Exception {
c.element().cleanup(c.getPipelineOptions());
}
}).withSideInputs(cleanupSignalView));
return outputs.get(mainOutput);
}
private static class IdentityFn<T> extends DoFn<T, T> {
@Override
public void processElement(ProcessContext c) {
c.output(c.element());
}
}
abstract static class CleanupOperation implements Serializable {
abstract void cleanup(PipelineOptions options) throws Exception;
}
}
/**
* A {@link BigQuerySourceBase} for reading BigQuery tables.
*/
@VisibleForTesting
static class BigQueryTableSource extends BigQuerySourceBase {
static BigQueryTableSource create(
String jobIdToken,
ValueProvider<TableReference> table,
String extractDestinationDir,
BigQueryServices bqServices,
ValueProvider<String> executingProject) {
return new BigQueryTableSource(
jobIdToken, table, extractDestinationDir, bqServices, executingProject);
}
private final ValueProvider<String> jsonTable;
private final AtomicReference<Long> tableSizeBytes;
private BigQueryTableSource(
String jobIdToken,
ValueProvider<TableReference> table,
String extractDestinationDir,
BigQueryServices bqServices,
ValueProvider<String> executingProject) {
super(jobIdToken, extractDestinationDir, bqServices, executingProject);
this.jsonTable = NestedValueProvider.of(checkNotNull(table, "table"), new TableRefToJson());
this.tableSizeBytes = new AtomicReference<>();
}
@Override
protected TableReference getTableToExtract(BigQueryOptions bqOptions) throws IOException {
checkState(jsonTable.isAccessible());
return JSON_FACTORY.fromString(jsonTable.get(), TableReference.class);
}
@Override
public BoundedReader<TableRow> createReader(PipelineOptions options) throws IOException {
BigQueryOptions bqOptions = options.as(BigQueryOptions.class);
checkState(jsonTable.isAccessible());
TableReference tableRef = JSON_FACTORY.fromString(jsonTable.get(), TableReference.class);
return new BigQueryReader(this, bqServices.getReaderFromTable(bqOptions, tableRef));
}
@Override
public synchronized long getEstimatedSizeBytes(PipelineOptions options) throws Exception {
if (tableSizeBytes.get() == null) {
TableReference table = JSON_FACTORY.fromString(jsonTable.get(), TableReference.class);
Long numBytes = bqServices.getDatasetService(options.as(BigQueryOptions.class))
.getTable(table.getProjectId(), table.getDatasetId(), table.getTableId())
.getNumBytes();
tableSizeBytes.compareAndSet(null, numBytes);
}
return tableSizeBytes.get();
}
@Override
protected void cleanupTempResource(BigQueryOptions bqOptions) throws Exception {
// Do nothing.
}
@Override
public void populateDisplayData(DisplayData.Builder builder) {
super.populateDisplayData(builder);
builder.add(DisplayData.item("table", jsonTable));
}
}
/**
* A {@link BigQuerySourceBase} for querying BigQuery tables.
*/
@VisibleForTesting
static class BigQueryQuerySource extends BigQuerySourceBase {
static BigQueryQuerySource create(
String jobIdToken,
ValueProvider<String> query,
ValueProvider<TableReference> queryTempTableRef,
Boolean flattenResults,
Boolean useLegacySql,
String extractDestinationDir,
BigQueryServices bqServices) {
return new BigQueryQuerySource(
jobIdToken,
query,
queryTempTableRef,
flattenResults,
useLegacySql,
extractDestinationDir,
bqServices);
}
private final ValueProvider<String> query;
private final ValueProvider<String> jsonQueryTempTable;
private final Boolean flattenResults;
private final Boolean useLegacySql;
private transient AtomicReference<JobStatistics> dryRunJobStats;
private BigQueryQuerySource(
String jobIdToken,
ValueProvider<String> query,
ValueProvider<TableReference> queryTempTableRef,
Boolean flattenResults,
Boolean useLegacySql,
String extractDestinationDir,
BigQueryServices bqServices) {
super(jobIdToken, extractDestinationDir, bqServices,
NestedValueProvider.of(
checkNotNull(queryTempTableRef, "queryTempTableRef"), new TableRefToProjectId()));
this.query = checkNotNull(query, "query");
this.jsonQueryTempTable = NestedValueProvider.of(
queryTempTableRef, new TableRefToJson());
this.flattenResults = checkNotNull(flattenResults, "flattenResults");
this.useLegacySql = checkNotNull(useLegacySql, "useLegacySql");
this.dryRunJobStats = new AtomicReference<>();
}
@Override
public long getEstimatedSizeBytes(PipelineOptions options) throws Exception {
BigQueryOptions bqOptions = options.as(BigQueryOptions.class);
return dryRunQueryIfNeeded(bqOptions).getTotalBytesProcessed();
}
@Override
public BoundedReader<TableRow> createReader(PipelineOptions options) throws IOException {
BigQueryOptions bqOptions = options.as(BigQueryOptions.class);
return new BigQueryReader(this, bqServices.getReaderFromQuery(
bqOptions, query.get(), executingProject.get(), flattenResults, useLegacySql));
}
@Override
protected TableReference getTableToExtract(BigQueryOptions bqOptions)
throws IOException, InterruptedException {
// 1. Find the location of the query.
String location = null;
List<TableReference> referencedTables =
dryRunQueryIfNeeded(bqOptions).getQuery().getReferencedTables();
DatasetService tableService = bqServices.getDatasetService(bqOptions);
if (referencedTables != null && !referencedTables.isEmpty()) {
TableReference queryTable = referencedTables.get(0);
location = tableService.getTable(
queryTable.getProjectId(),
queryTable.getDatasetId(),
queryTable.getTableId()).getLocation();
}
// 2. Create the temporary dataset in the query location.
TableReference tableToExtract =
JSON_FACTORY.fromString(jsonQueryTempTable.get(), TableReference.class);
tableService.createDataset(
tableToExtract.getProjectId(),
tableToExtract.getDatasetId(),
location,
"Dataset for BigQuery query job temporary table");
// 3. Execute the query.
String queryJobId = jobIdToken + "-query";
executeQuery(
executingProject.get(),
queryJobId,
tableToExtract,
bqServices.getJobService(bqOptions));
return tableToExtract;
}
@Override
protected void cleanupTempResource(BigQueryOptions bqOptions) throws Exception {
checkState(jsonQueryTempTable.isAccessible());
TableReference tableToRemove =
JSON_FACTORY.fromString(jsonQueryTempTable.get(), TableReference.class);
DatasetService tableService = bqServices.getDatasetService(bqOptions);
tableService.deleteTable(
tableToRemove.getProjectId(),
tableToRemove.getDatasetId(),
tableToRemove.getTableId());
tableService.deleteDataset(tableToRemove.getProjectId(), tableToRemove.getDatasetId());
}
@Override
public void populateDisplayData(DisplayData.Builder builder) {
super.populateDisplayData(builder);
builder.add(DisplayData.item("query", query));
}
private synchronized JobStatistics dryRunQueryIfNeeded(BigQueryOptions bqOptions)
throws InterruptedException, IOException {
if (dryRunJobStats.get() == null) {
JobStatistics jobStats = bqServices.getJobService(bqOptions).dryRunQuery(
executingProject.get(), createBasicQueryConfig());
dryRunJobStats.compareAndSet(null, jobStats);
}
return dryRunJobStats.get();
}
private void executeQuery(
String executingProject,
String jobId,
TableReference destinationTable,
JobService jobService) throws IOException, InterruptedException {
JobReference jobRef = new JobReference()
.setProjectId(executingProject)
.setJobId(jobId);
JobConfigurationQuery queryConfig = createBasicQueryConfig()
.setAllowLargeResults(true)
.setCreateDisposition("CREATE_IF_NEEDED")
.setDestinationTable(destinationTable)
.setPriority("BATCH")
.setWriteDisposition("WRITE_EMPTY");
jobService.startQueryJob(jobRef, queryConfig);
Job job = jobService.pollJob(jobRef, JOB_POLL_MAX_RETRIES);
if (parseStatus(job) != Status.SUCCEEDED) {
throw new IOException("Query job failed: " + jobId);
}
}
private JobConfigurationQuery createBasicQueryConfig() {
return new JobConfigurationQuery()
.setQuery(query.get())
.setFlattenResults(flattenResults)
.setUseLegacySql(useLegacySql);
}
private void readObject(ObjectInputStream in) throws ClassNotFoundException, IOException {
in.defaultReadObject();
dryRunJobStats = new AtomicReference<>();
}
}
/**
* An abstract {@link BoundedSource} to read a table from BigQuery.
*
* This source uses a BigQuery export job to take a snapshot of the table on GCS, and then
* reads in parallel from each produced file. It is implemented by {@link BigQueryTableSource},
* and {@link BigQueryQuerySource}, depending on the configuration of the read.
* Specifically,
*
* - {@link BigQueryTableSource} is for reading BigQuery tables
* - {@link BigQueryQuerySource} is for querying BigQuery tables
*
* ...
*/
private abstract static class BigQuerySourceBase extends BoundedSource<TableRow> {
// The maximum number of retries to verify temp files.
private static final int MAX_FILES_VERIFY_RETRIES = 9;
// The maximum number of retries to poll a BigQuery job.
protected static final int JOB_POLL_MAX_RETRIES = Integer.MAX_VALUE;
// The initial backoff for verifying temp files.
private static final Duration INITIAL_FILES_VERIFY_BACKOFF = Duration.standardSeconds(1);
protected final String jobIdToken;
protected final String extractDestinationDir;
protected final BigQueryServices bqServices;
protected final ValueProvider<String> executingProject;
private BigQuerySourceBase(
String jobIdToken,
String extractDestinationDir,
BigQueryServices bqServices,
ValueProvider<String> executingProject) {
this.jobIdToken = checkNotNull(jobIdToken, "jobIdToken");
this.extractDestinationDir = checkNotNull(extractDestinationDir, "extractDestinationDir");
this.bqServices = checkNotNull(bqServices, "bqServices");
this.executingProject = checkNotNull(executingProject, "executingProject");
}
@Override
public List<BoundedSource<TableRow>> splitIntoBundles(
long desiredBundleSizeBytes, PipelineOptions options) throws Exception {
BigQueryOptions bqOptions = options.as(BigQueryOptions.class);
TableReference tableToExtract = getTableToExtract(bqOptions);
JobService jobService = bqServices.getJobService(bqOptions);
String extractJobId = getExtractJobId(jobIdToken);
List<String> tempFiles = executeExtract(extractJobId, tableToExtract, jobService);
TableSchema tableSchema = bqServices.getDatasetService(bqOptions).getTable(
tableToExtract.getProjectId(),
tableToExtract.getDatasetId(),
tableToExtract.getTableId()).getSchema();
cleanupTempResource(bqOptions);
return createSources(tempFiles, tableSchema);
}
protected abstract TableReference getTableToExtract(BigQueryOptions bqOptions) throws Exception;
protected abstract void cleanupTempResource(BigQueryOptions bqOptions) throws Exception;
@Override
public boolean producesSortedKeys(PipelineOptions options) throws Exception {
return false;
}
@Override
public void validate() {
// Do nothing, validation is done in BigQuery.Read.
}
@Override
public Coder<TableRow> getDefaultOutputCoder() {
return TableRowJsonCoder.of();
}
private List<String> executeExtract(
String jobId, TableReference table, JobService jobService)
throws InterruptedException, IOException {
JobReference jobRef = new JobReference()
.setProjectId(executingProject.get())
.setJobId(jobId);
String destinationUri = getExtractDestinationUri(extractDestinationDir);
JobConfigurationExtract extract = new JobConfigurationExtract()
.setSourceTable(table)
.setDestinationFormat("AVRO")
.setDestinationUris(ImmutableList.of(destinationUri));
LOG.info("Starting BigQuery extract job: {}", jobId);
jobService.startExtractJob(jobRef, extract);
Job extractJob =
jobService.pollJob(jobRef, JOB_POLL_MAX_RETRIES);
if (parseStatus(extractJob) != Status.SUCCEEDED) {
throw new IOException(String.format(
"Extract job %s failed, status: %s",
extractJob.getJobReference().getJobId(), extractJob.getStatus()));
}
List<String> tempFiles = getExtractFilePaths(extractDestinationDir, extractJob);
return ImmutableList.copyOf(tempFiles);
}
private List<BoundedSource<TableRow>> createSources(
List<String> files, TableSchema tableSchema) throws IOException, InterruptedException {
final String jsonSchema = JSON_FACTORY.toString(tableSchema);
SerializableFunction<GenericRecord, TableRow> function =
new SerializableFunction<GenericRecord, TableRow>() {
@Override
public TableRow apply(GenericRecord input) {
try {
return AvroUtils.convertGenericRecordToTableRow(
input, JSON_FACTORY.fromString(jsonSchema, TableSchema.class));
} catch (IOException e) {
throw new RuntimeException("Failed to convert GenericRecord to TableRow", e);
}
}};
List<BoundedSource<TableRow>> avroSources = Lists.newArrayList();
for (String fileName : files) {
avroSources.add(new TransformingSource<>(
AvroSource.from(fileName), function, getDefaultOutputCoder()));
}
return ImmutableList.copyOf(avroSources);
}
protected static class BigQueryReader extends BoundedSource.BoundedReader<TableRow> {
private final BigQuerySourceBase source;
private final BigQueryServices.BigQueryJsonReader reader;
private BigQueryReader(
BigQuerySourceBase source, BigQueryServices.BigQueryJsonReader reader) {
this.source = source;
this.reader = reader;
}
@Override
public BoundedSource<TableRow> getCurrentSource() {
return source;
}
@Override
public boolean start() throws IOException {
return reader.start();
}
@Override
public boolean advance() throws IOException {
return reader.advance();
}
@Override
public TableRow getCurrent() throws NoSuchElementException {
return reader.getCurrent();
}
@Override
public void close() throws IOException {
reader.close();
}
}
}
/**
 * A {@link BoundedSource} that reads from a {@code BoundedSource<T>}
 * and transforms elements to type {@code V}.
 */
@VisibleForTesting
static class TransformingSource<T, V> extends BoundedSource<V> {
private final BoundedSource<T> boundedSource;
private final SerializableFunction<T, V> function;
private final Coder<V> outputCoder;
TransformingSource(
BoundedSource<T> boundedSource,
SerializableFunction<T, V> function,
Coder<V> outputCoder) {
this.boundedSource = checkNotNull(boundedSource, "boundedSource");
this.function = checkNotNull(function, "function");
this.outputCoder = checkNotNull(outputCoder, "outputCoder");
}
@Override
public List<? extends BoundedSource<V>> splitIntoBundles(
long desiredBundleSizeBytes, PipelineOptions options) throws Exception {
return Lists.transform(
boundedSource.splitIntoBundles(desiredBundleSizeBytes, options),
new Function<BoundedSource<T>, BoundedSource<V>>() {
@Override
public BoundedSource<V> apply(BoundedSource<T> input) {
return new TransformingSource<>(input, function, outputCoder);
}
});
}
@Override
public long getEstimatedSizeBytes(PipelineOptions options) throws Exception {
return boundedSource.getEstimatedSizeBytes(options);
}
@Override
public boolean producesSortedKeys(PipelineOptions options) throws Exception {
return boundedSource.producesSortedKeys(options);
}
@Override
public BoundedReader<V> createReader(PipelineOptions options) throws IOException {
return new TransformingReader(boundedSource.createReader(options));
}
@Override
public void validate() {
boundedSource.validate();
}
@Override
public Coder<V> getDefaultOutputCoder() {
return outputCoder;
}
private class TransformingReader extends BoundedReader<V> {
private final BoundedReader<T> boundedReader;
private TransformingReader(BoundedReader<T> boundedReader) {
this.boundedReader = checkNotNull(boundedReader, "boundedReader");
}
@Override
public synchronized BoundedSource<V> getCurrentSource() {
return new TransformingSource<>(boundedReader.getCurrentSource(), function, outputCoder);
}
@Override
public boolean start() throws IOException {
return boundedReader.start();
}
@Override
public boolean advance() throws IOException {
return boundedReader.advance();
}
@Override
public V getCurrent() throws NoSuchElementException {
T current = boundedReader.getCurrent();
return function.apply(current);
}
@Override
public void close() throws IOException {
boundedReader.close();
}
@Override
public synchronized BoundedSource<V> splitAtFraction(double fraction) {
BoundedSource<T> split = boundedReader.splitAtFraction(fraction);
return split == null ? null : new TransformingSource<>(split, function, outputCoder);
}
@Override
public Double getFractionConsumed() {
return boundedReader.getFractionConsumed();
}
@Override
public Instant getCurrentTimestamp() throws NoSuchElementException {
return boundedReader.getCurrentTimestamp();
}
}
}
private static String getExtractJobId(String jobIdToken) {
return jobIdToken + "-extract";
}
private static String getExtractDestinationUri(String extractDestinationDir) {
return String.format("%s/%s", extractDestinationDir, "*.avro");
}
private static List<String> getExtractFilePaths(String extractDestinationDir, Job extractJob)
throws IOException {
JobStatistics jobStats = extractJob.getStatistics();
List<Long> counts = jobStats.getExtract().getDestinationUriFileCounts();
if (counts.size() != 1) {
String errorMessage = (counts.size() == 0
? "No destination uri file count received."
: String.format("More than one destination uri file count received. First two are %s, %s",
counts.get(0), counts.get(1)));
throw new RuntimeException(errorMessage);
}
long filesCount = counts.get(0);
ImmutableList.Builder<String> paths = ImmutableList.builder();
IOChannelFactory factory = IOChannelUtils.getFactory(extractDestinationDir);
for (long i = 0; i < filesCount; ++i) {
String filePath =
factory.resolve(extractDestinationDir, String.format("%012d%s", i, ".avro"));
paths.add(filePath);
}
return paths.build();
}
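// Worked example (illustrative paths) of how the helpers above fit together: for an
// extract destination directory of gs://my-bucket/tmp/<uuid>, the extract job writes to
//   gs://my-bucket/tmp/<uuid>/*.avro
// and, if the job reports a destinationUriFileCount of 3, getExtractFilePaths returns
//   gs://my-bucket/tmp/<uuid>/000000000000.avro
//   gs://my-bucket/tmp/<uuid>/000000000001.avro
//   gs://my-bucket/tmp/<uuid>/000000000002.avro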
/////////////////////////////////////////////////////////////////////////////
/**
* A {@link PTransform} that writes a {@link PCollection} containing {@link TableRow TableRows}
* to a BigQuery table.
*
 * In BigQuery, each table has an enclosing dataset. The dataset being written must already
* exist.
*
*
 * By default, tables will be created if they do not exist, which corresponds to a
* {@link CreateDisposition#CREATE_IF_NEEDED} disposition that matches the default of BigQuery's
* Jobs API. A schema must be provided (via {@link BigQueryIO.Write#withSchema(TableSchema)}),
* or else the transform may fail at runtime with an {@link IllegalArgumentException}.
*
*
 * By default, writes require an empty table, which corresponds to
* a {@link WriteDisposition#WRITE_EMPTY} disposition that matches the
* default of BigQuery's Jobs API.
*
*
 * Here is a sample transform that produces TableRow values containing
* "word" and "count" columns:
*
 * {@code
 * static class FormatCountsFn extends DoFn<KV<String, Long>, TableRow> {
* public void processElement(ProcessContext c) {
* TableRow row = new TableRow()
* .set("word", c.element().getKey())
* .set("count", c.element().getValue().intValue());
* c.output(row);
* }
* }}
*/
public static class Write {
/**
* An enumeration type for the BigQuery create disposition strings.
*
* @see
* configuration.query.createDisposition
 * in the BigQuery Jobs API
*/
public enum CreateDisposition {
/**
 * Specifies that tables should not be created.
*
* If the output table does not exist, the write fails.
*/
CREATE_NEVER,
/**
* Specifies that tables should be created if needed. This is the default
* behavior.
*
*
 * Requires that a table schema is provided via {@link BigQueryIO.Write#withSchema}.
* This precondition is checked before starting a job. The schema is
* not required to match an existing table's schema.
*
*
 * When this transformation is executed, if the output table does not
* exist, the table is created from the provided schema. Note that even if
* the table exists, it may be recreated if necessary when paired with a
* {@link WriteDisposition#WRITE_TRUNCATE}.
*/
CREATE_IF_NEEDED
}
/**
* An enumeration type for the BigQuery write disposition strings.
*
* @see
* configuration.query.writeDisposition
 * in the BigQuery Jobs API
*/
public enum WriteDisposition {
/**
* Specifies that write should replace a table.
*
*
 * The replacement may occur in multiple steps - for instance by first
* removing the existing table, then creating a replacement, then filling
* it in. This is not an atomic operation, and external programs may
* see the table in any of these intermediate steps.
*/
WRITE_TRUNCATE,
/**
* Specifies that rows may be appended to an existing table.
*/
WRITE_APPEND,
/**
* Specifies that the output table must be empty. This is the default
* behavior.
*
*
 * If the output table is not empty, the write fails at runtime.
*
*
 * This check may occur long before data is written, and does not
* guarantee exclusive access to the table. If two programs are run
* concurrently, each specifying the same output table and
* a {@link WriteDisposition} of {@link WriteDisposition#WRITE_EMPTY}, it is possible
* for both to succeed.
*/
WRITE_EMPTY
}
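// A hedged usage sketch (not part of the original source) combining the two dispositions
// above; the table name and schema are illustrative.
//
//   quotes.apply(BigQueryIO.Write
//       .named("AppendQuotes")
//       .to("my-project:output.output_table")
//       .withSchema(schema)
//       .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
//       .withWriteDisposition(WriteDisposition.WRITE_APPEND));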
/**
* Creates a write transformation with the given transform name. The BigQuery table to be
* written has not yet been configured.
*/
public static Bound named(String name) {
return new Bound().named(name);
}
/**
* Creates a write transformation for the given table specification.
*
*
 * Refer to {@link #parseTableSpec(String)} for the specification format.
*/
public static Bound to(String tableSpec) {
return new Bound().to(tableSpec);
}
/** Creates a write transformation for the given table. */
public static Bound to(ValueProvider<String> tableSpec) {
return new Bound().to(tableSpec);
}
/** Creates a write transformation for the given table. */
public static Bound to(TableReference table) {
return new Bound().to(table);
}
/**
* Creates a write transformation from a function that maps windows to table specifications.
* Each time a new window is encountered, this function will be called and the resulting table
* will be created. Records within that window will be written to the associated table.
*
* See {@link #parseTableSpec(String)} for the format that {@code tableSpecFunction} should
* return.
*
*
 * {@code tableSpecFunction} should be deterministic. When given the same window, it should
* always return the same table specification.
*/
public static Bound to(SerializableFunction<BoundedWindow, String> tableSpecFunction) {
return new Bound().to(tableSpecFunction);
}
/**
* Creates a write transformation from a function that maps windows to {@link TableReference}
* objects.
*
* {@code tableRefFunction} should be deterministic. When given the same window, it should
* always return the same table reference.
*/
public static Bound toTableReference(
SerializableFunction<BoundedWindow, TableReference> tableRefFunction) {
return new Bound().toTableReference(tableRefFunction);
}
/**
* Creates a write transformation with the specified schema to use in table creation.
*
* The schema is required only if writing to a table that does not already
* exist, and {@link CreateDisposition} is set to
* {@link CreateDisposition#CREATE_IF_NEEDED}.
*/
public static Bound withSchema(TableSchema schema) {
return new Bound().withSchema(schema);
}
/**
* Like {@link #withSchema(TableSchema)}, but with a {@link ValueProvider}.
*/
public static Bound withSchema(ValueProvider<TableSchema> schema) {
return new Bound().withSchema(schema);
}
/** Creates a write transformation with the specified options for creating the table. */
public static Bound withCreateDisposition(CreateDisposition disposition) {
return new Bound().withCreateDisposition(disposition);
}
/** Creates a write transformation with the specified options for writing to the table. */
public static Bound withWriteDisposition(WriteDisposition disposition) {
return new Bound().withWriteDisposition(disposition);
}
/**
* Creates a write transformation with BigQuery table validation disabled.
*/
public static Bound withoutValidation() {
return new Bound().withoutValidation();
}
/**
* A {@link PTransform} that can write either a bounded or unbounded
* {@link PCollection} of {@link TableRow TableRows} to a BigQuery table.
*/
public static class Bound extends PTransform<PCollection<TableRow>, PDone> {
// Maximum number of files in a single partition.
static final int MAX_NUM_FILES = 10000;
// Maximum number of bytes in a single partition -- 11 TiB just under BQ's 12 TiB limit.
static final long MAX_SIZE_BYTES = 11 * (1L << 40);
// The maximum number of retry jobs.
static final int MAX_RETRY_JOBS = 3;
// The maximum number of retries to poll the status of a job.
// It is set to {@code Integer.MAX_VALUE} to block until the BigQuery job finishes.
static final int LOAD_JOB_POLL_MAX_RETRIES = Integer.MAX_VALUE;
@Nullable final ValueProvider<String> jsonTableRef;
@Nullable final SerializableFunction<BoundedWindow, TableReference> tableRefFunction;
// Table schema. The schema is required only if the table does not exist.
@Nullable final ValueProvider<String> jsonSchema;
// Options for creating the table. Valid values are CREATE_IF_NEEDED and
// CREATE_NEVER.
final CreateDisposition createDisposition;
// Options for writing to the table. Valid values are WRITE_TRUNCATE,
// WRITE_APPEND and WRITE_EMPTY.
final WriteDisposition writeDisposition;
// An option to indicate if table validation is desired. Default is true.
final boolean validate;
@Nullable private BigQueryServices bigQueryServices;
private static class TranslateTableSpecFunction implements
SerializableFunction<BoundedWindow, TableReference> {
private SerializableFunction<BoundedWindow, String> tableSpecFunction;
TranslateTableSpecFunction(SerializableFunction<BoundedWindow, String> tableSpecFunction) {
this.tableSpecFunction = tableSpecFunction;
}
@Override
public TableReference apply(BoundedWindow value) {
return parseTableSpec(tableSpecFunction.apply(value));
}
}
/**
* @deprecated Should be private. Instead, use one of the factory methods in
* {@link BigQueryIO.Write}, such as {@link BigQueryIO.Write#to(String)}, to create an
* instance of this class.
*/
@Deprecated
public Bound() {
this(
null /* name */,
null /* jsonTableRef */,
null /* tableRefFunction */,
null /* jsonSchema */,
CreateDisposition.CREATE_IF_NEEDED,
WriteDisposition.WRITE_EMPTY,
true /* validate */,
null /* bigQueryServices */);
}
private Bound(String name, @Nullable ValueProvider<String> jsonTableRef,
@Nullable SerializableFunction<BoundedWindow, TableReference> tableRefFunction,
@Nullable ValueProvider<String> jsonSchema,
CreateDisposition createDisposition, WriteDisposition writeDisposition, boolean validate,
@Nullable BigQueryServices bigQueryServices) {
super(name);
this.jsonTableRef = jsonTableRef;
this.tableRefFunction = tableRefFunction;
this.jsonSchema = jsonSchema;
this.createDisposition = checkNotNull(createDisposition, "createDisposition");
this.writeDisposition = checkNotNull(writeDisposition, "writeDisposition");
this.validate = validate;
this.bigQueryServices = bigQueryServices;
}
/**
* Returns a copy of this write transformation, but with the specified transform name.
*
* Does not modify this object.
*/
public Bound named(String name) {
return new Bound(name, jsonTableRef, tableRefFunction, jsonSchema, createDisposition,
writeDisposition, validate, bigQueryServices);
}
/**
* Returns a copy of this write transformation, but writing to the specified table. Refer to
* {@link #parseTableSpec(String)} for the specification format.
*
* Does not modify this object.
*/
public Bound to(String tableSpec) {
return toTableRef(NestedValueProvider.of(
StaticValueProvider.of(tableSpec), new TableSpecToTableRef()));
}
/**
* Returns a copy of this write transformation, but writing to the specified table.
*
* Does not modify this object.
*/
public Bound to(TableReference table) {
return to(StaticValueProvider.of(toTableSpec(table)));
}
/**
* Returns a copy of this write transformation, but using the specified function to determine
* which table to write to for each window.
*
* Does not modify this object.
*
* {@code tableSpecFunction} should be deterministic. When given the same window, it
* should always return the same table specification.
*/
public Bound to(
SerializableFunction<BoundedWindow, String> tableSpecFunction) {
return toTableReference(new TranslateTableSpecFunction(tableSpecFunction));
}
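// Illustrative only (not in the original source): a per-window table function
// passed to the overload above, chained onto a Bound write. The project,
// dataset, and table prefix are hypothetical; DateTimeFormat is org.joda.time.
//
//   .to(new SerializableFunction<BoundedWindow, String>() {
//     @Override
//     public String apply(BoundedWindow window) {
//       String day = DateTimeFormat.forPattern("yyyyMMdd")
//           .withZoneUTC().print(window.maxTimestamp());
//       return "my-project:my_dataset.events_" + day;
//     }
//   })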
/**
* Returns a copy of this write transformation, but writing to the specified table. Refer to
* {@link #parseTableSpec(String)} for the specification format.
*
* Does not modify this object.
*/
public Bound to(ValueProvider<String> tableSpec) {
return toTableRef(NestedValueProvider.of(tableSpec, new TableSpecToTableRef()));
}
/**
* Returns a copy of this write transformation, but writing to the specified table.
*
* Does not modify this object.
*/
private Bound toTableRef(ValueProvider<TableReference> table) {
return new Bound(name,
NestedValueProvider.of(table, new TableRefToJson()),
tableRefFunction, jsonSchema, createDisposition,
writeDisposition, validate, bigQueryServices);
}
/**
* Returns a copy of this write transformation, but using the specified function to determine
* which table to write to for each window.
*
* Does not modify this object.
*
* {@code tableRefFunction} should be deterministic. When given the same window, it should
* always return the same table reference.
*/
public Bound toTableReference(
SerializableFunction<BoundedWindow, TableReference> tableRefFunction) {
return new Bound(name, jsonTableRef, tableRefFunction, jsonSchema, createDisposition,
writeDisposition, validate, bigQueryServices);
}
/**
* Returns a copy of this write transformation, but using the specified schema for rows
* to be written.
*
* Does not modify this object.
*/
public Bound withSchema(TableSchema schema) {
return new Bound(name, jsonTableRef, tableRefFunction,
StaticValueProvider.of(toJsonString(schema)),
createDisposition, writeDisposition, validate, bigQueryServices);
}
/**
* Like {@link #withSchema(TableSchema)}, but with a {@link ValueProvider}.
*/
public Bound withSchema(ValueProvider<TableSchema> schema) {
return new Bound(name, jsonTableRef, tableRefFunction,
NestedValueProvider.of(schema, new TableSchemaToJsonSchema()),
createDisposition, writeDisposition, validate, bigQueryServices);
}
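// Illustrative only (not in the original source): building a TableSchema for
// the withSchema overloads above. Field names and types are hypothetical;
// TableFieldSchema comes from the BigQuery API model classes.
//
//   TableSchema schema = new TableSchema().setFields(Arrays.asList(
//       new TableFieldSchema().setName("word").setType("STRING"),
//       new TableFieldSchema().setName("count").setType("INTEGER")));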
/**
* Returns a copy of this write transformation, but using the specified create disposition.
*
* Does not modify this object.
*/
public Bound withCreateDisposition(CreateDisposition createDisposition) {
return new Bound(name, jsonTableRef, tableRefFunction, jsonSchema, createDisposition,
writeDisposition, validate, bigQueryServices);
}
/**
* Returns a copy of this write transformation, but using the specified write disposition.
*
* Does not modify this object.
*/
public Bound withWriteDisposition(WriteDisposition writeDisposition) {
return new Bound(name, jsonTableRef, tableRefFunction, jsonSchema, createDisposition,
writeDisposition, validate, bigQueryServices);
}
/**
* Returns a copy of this write transformation, but without BigQuery table validation.
*
* Does not modify this object.
*/
public Bound withoutValidation() {
return new Bound(name, jsonTableRef, tableRefFunction, jsonSchema, createDisposition,
writeDisposition, false, bigQueryServices);
}
@VisibleForTesting
Bound withTestServices(BigQueryServices testServices) {
return new Bound(name, jsonTableRef, tableRefFunction, jsonSchema, createDisposition,
writeDisposition, validate, testServices);
}
private static void verifyTableEmpty(
BigQueryOptions options,
TableReference table) {
try {
Bigquery client = Transport.newBigQueryClient(options).build();
BigQueryTableInserter inserter = new BigQueryTableInserter(client);
if (!inserter.isEmpty(table)) {
throw new IllegalArgumentException(
"BigQuery table is not empty: " + BigQueryIO.toTableSpec(table));
}
} catch (IOException e) {
ApiErrorExtractor errorExtractor = new ApiErrorExtractor();
if (errorExtractor.itemNotFound(e)) {
// Nothing to do. If the table does not exist, it is considered empty.
} else {
throw new RuntimeException(
"unable to confirm BigQuery table emptiness for table "
+ BigQueryIO.toTableSpec(table), e);
}
}
}
@Override
public void validate(PCollection<TableRow> input) {
BigQueryOptions options = input.getPipeline().getOptions().as(BigQueryOptions.class);
// Exactly one of the table reference and the table reference function must be configured.
checkState(
jsonTableRef != null || tableRefFunction != null,
"must set the table reference of a BigQueryIO.Write transform");
checkState(
jsonTableRef == null || tableRefFunction == null,
"Cannot set both a table reference and a table function for a BigQueryIO.Write"
+ " transform");
// Require a schema if creating one or more tables.
checkArgument(
createDisposition != CreateDisposition.CREATE_IF_NEEDED || jsonSchema != null,
"CreateDisposition is CREATE_IF_NEEDED, however no schema was provided.");
// The user specified a table.
if (jsonTableRef != null && validate) {
TableReference table = getTableWithDefaultProject(options).get();
// Check for destination table presence and emptiness for early failure notification.
// Note that a presence check can fail when the table or dataset is created by an earlier
// stage of the pipeline. For these cases the #withoutValidation method can be used to
// disable the check.
verifyDatasetPresence(options, table);
if (getCreateDisposition() == BigQueryIO.Write.CreateDisposition.CREATE_NEVER) {
verifyTablePresence(options, table);
}
if (getWriteDisposition() == BigQueryIO.Write.WriteDisposition.WRITE_EMPTY) {
verifyTableEmpty(options, table);
}
}
if (options.isStreaming() || tableRefFunction != null) {
// We will use BigQuery's streaming write API -- validate supported dispositions.
checkArgument(
createDisposition != CreateDisposition.CREATE_NEVER,
"CreateDisposition.CREATE_NEVER is not supported for an unbounded PCollection or when"
+ " using a tablespec function.");
checkArgument(
writeDisposition != WriteDisposition.WRITE_TRUNCATE,
"WriteDisposition.WRITE_TRUNCATE is not supported for an unbounded PCollection or"
+ " when using a tablespec function.");
} else {
// We will use a BigQuery load job -- validate the temp location.
String tempLocation = options.getTempLocation();
checkArgument(
!Strings.isNullOrEmpty(tempLocation),
"BigQueryIO.Write needs a GCS temp location to store temp files.");
if (bigQueryServices == null) {
try {
GcsPath.fromUri(tempLocation);
} catch (IllegalArgumentException e) {
throw new IllegalArgumentException(
String.format(
"BigQuery temp location expected a valid 'gs://' path, but was given '%s'",
tempLocation),
e);
}
}
}
}
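// Note (not in the original source): for the batch load path validated above,
// the GCS temp location is typically supplied through pipeline options, for
// example --tempLocation=gs://my-bucket/tmp on the command line or
// options.setTempLocation("gs://my-bucket/tmp") in code; the bucket name here
// is hypothetical.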
@Override
public PDone apply(PCollection<TableRow> input) {
Pipeline p = input.getPipeline();
BigQueryOptions options = p.getOptions().as(BigQueryOptions.class);
BigQueryServices bqServices = getBigQueryServices();
// In a streaming job, or when a tablespec function is defined, we use StreamWithDeDup
// and BigQuery's streaming import API.
if (options.isStreaming() || tableRefFunction != null) {
return input.apply(new StreamWithDeDup(getTable(), tableRefFunction,
NestedValueProvider.of(jsonSchema, new JsonSchemaToTableSchema())));
}
ValueProvider<TableReference> table = getTableWithDefaultProject(options);
String jobIdToken = "beam_job_" + randomUUIDString();
String tempLocation = options.getTempLocation();
String tempFilePrefix;
try {
IOChannelFactory factory = IOChannelUtils.getFactory(tempLocation);
tempFilePrefix = factory.resolve(
factory.resolve(tempLocation, "BigQueryWriteTemp"),
jobIdToken);
} catch (IOException e) {
throw new RuntimeException(
String.format("Failed to resolve BigQuery temp location in %s", tempLocation),
e);
}
PCollection<String> singleton = p.apply("Create", Create.of(tempFilePrefix));
PCollection<TableRow> inputInGlobalWindow =
input.apply(
Window.<TableRow>into(new GlobalWindows())
.triggering(DefaultTrigger.of())
.discardingFiredPanes());
PCollection<KV<String, Long>> results = inputInGlobalWindow
.apply("WriteBundles",
ParDo.of(new WriteBundles(tempFilePrefix)));
TupleTag<KV<Long, List<String>>> multiPartitionsTag =
new TupleTag<KV<Long, List<String>>>("multiPartitionsTag") {};
TupleTag<KV<Long, List<String>>> singlePartitionTag =
new TupleTag<KV<Long, List<String>>>("singlePartitionTag") {};
PCollectionView<Iterable<KV<String, Long>>> resultsView = results
.apply("ResultsView", View.<KV<String, Long>>asIterable());
PCollectionTuple partitions = singleton.apply(ParDo
.of(new WritePartition(
resultsView,
multiPartitionsTag,
singlePartitionTag))
.withSideInputs(resultsView)
.withOutputTags(multiPartitionsTag, TupleTagList.of(singlePartitionTag)));
// Write multiple partitions to separate temporary tables
PCollection<String> tempTables = partitions.get(multiPartitionsTag)
.apply("MultiPartitionsGroupByKey", GroupByKey.<Long, List<String>>create())
.apply("MultiPartitionsWriteTables", ParDo.of(new WriteTables(
false,
bqServices,
jobIdToken,
tempFilePrefix,
NestedValueProvider.of(table, new TableRefToJson()),
jsonSchema,
WriteDisposition.WRITE_EMPTY,
CreateDisposition.CREATE_IF_NEEDED)));
PCollectionView<Iterable<String>> tempTablesView = tempTables
.apply("TempTablesView", View.<String>asIterable());
singleton.apply(ParDo
.of(new WriteRename(
bqServices,
jobIdToken,
NestedValueProvider.of(table, new TableRefToJson()),
writeDisposition,
createDisposition,
tempTablesView))
.withSideInputs(tempTablesView));
// Write single partition to final table
partitions.get(singlePartitionTag)
.apply("SinglePartitionGroupByKey", GroupByKey.>create())
.apply("SinglePartitionWriteTables", ParDo.of(new WriteTables(
true,
bqServices,
jobIdToken,
tempFilePrefix,
NestedValueProvider.of(table, new TableRefToJson()),
jsonSchema,
writeDisposition,
createDisposition)));
return PDone.in(input.getPipeline());
}
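// Summary of the batch write path assembled above (descriptive comment only):
// 1. WriteBundles writes each bundle of input rows to a temp file under
//    tempFilePrefix.
// 2. WritePartition groups the resulting files into partitions bounded by
//    MAX_NUM_FILES and MAX_SIZE_BYTES.
// 3. WriteTables runs one BigQuery load job per partition, either directly into
//    the destination table (single partition) or into temporary tables.
// 4. WriteRename copies the temporary tables into the destination table and
//    then deletes them.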
private class WriteBundles extends DoFn<TableRow, KV<String, Long>> {
private TableRowWriter writer = null;
private final String tempFilePrefix;
WriteBundles(String tempFilePrefix) {
this.tempFilePrefix = tempFilePrefix;
}
@Override
public void processElement(ProcessContext c) throws Exception {
if (writer == null) {
writer = new TableRowWriter(tempFilePrefix);
writer.open(UUID.randomUUID().toString());
LOG.debug("Done opening writer {}", writer);
}
try {
writer.write(c.element());
} catch (Exception e) {
// Discard the write result and close the writer.
try {
writer.close();
// The writer does not need to be reset, as this DoFn cannot be reused.
} catch (Exception closeException) {
// Do not mask the exception that caused the write to fail.
e.addSuppressed(closeException);
}
throw e;
}
}
@Override
public void finishBundle(Context c) throws Exception {
if (writer != null) {
c.output(writer.close());
writer = null;
}
}
@Override
public void populateDisplayData(DisplayData.Builder builder) {
super.populateDisplayData(builder);
builder
.addIfNotNull(DisplayData.item("tempFilePrefix", tempFilePrefix)
.withLabel("Temporary File Prefix"));
}
}
@Override
protected Coder<Void> getDefaultOutputCoder() {
return VoidCoder.of();
}
@Override
public void populateDisplayData(DisplayData.Builder builder) {
super.populateDisplayData(builder);
builder
.addIfNotNull(DisplayData.item("table", jsonTableRef)
.withLabel("Table Reference"))
.addIfNotNull(DisplayData.item("schema", jsonSchema)
.withLabel("Table Schema"));
if (tableRefFunction != null) {
builder.add(DisplayData.item("tableFn", tableRefFunction.getClass())
.withLabel("Table Reference Function"));
}
builder
.add(DisplayData.item("createDisposition", createDisposition.toString())
.withLabel("Table CreateDisposition"))
.add(DisplayData.item("writeDisposition", writeDisposition.toString())
.withLabel("Table WriteDisposition"))
.addIfNotDefault(DisplayData.item("validation", validate)
.withLabel("Validation Enabled"), true);
}
/** Returns the create disposition. */
public CreateDisposition getCreateDisposition() {
return createDisposition;
}
/** Returns the write disposition. */
public WriteDisposition getWriteDisposition() {
return writeDisposition;
}
/** Returns the table schema. */
public TableSchema getSchema() {
return fromJsonString(
jsonSchema == null ? null : jsonSchema.get(), TableSchema.class);
}
/**
* Returns the table to write, or {@code null} if writing with {@code tableRefFunction}.
*
* If the table's project is not specified, use the executing project.
*/
@Nullable private ValueProvider<TableReference> getTableWithDefaultProject(
BigQueryOptions bqOptions) {
ValueProvider<TableReference> table = getTable();
if (table == null) {
return table;
}
if (!table.isAccessible()) {
LOG.info("Using a dynamic value for table input. This must contain a project"
+ " in the table reference: {}", table);
return table;
}
if (Strings.isNullOrEmpty(table.get().getProjectId())) {
// If user does not specify a project we assume the table to be located in
// the default project.
TableReference tableRef = table.get();
tableRef.setProjectId(bqOptions.getProject());
return NestedValueProvider.of(StaticValueProvider.of(
toJsonString(tableRef)), new JsonTableRefToTableRef());
}
return table;
}
/** Returns the table reference, or {@code null}. */
@Nullable
public ValueProvider<TableReference> getTable() {
return jsonTableRef == null
? null : NestedValueProvider.of(jsonTableRef, new JsonTableRefToTableRef());
}
/** Returns {@code true} if table validation is enabled. */
public boolean getValidate() {
return validate;
}
private BigQueryServices getBigQueryServices() {
if (bigQueryServices == null) {
bigQueryServices = new BigQueryServicesImpl();
}
return bigQueryServices;
}
}
static class TableRowWriter {
private static final Coder<TableRow> CODER = TableRowJsonCoder.of();
private static final byte[] NEWLINE = "\n".getBytes(StandardCharsets.UTF_8);
private final String tempFilePrefix;
private String id;
private String fileName;
private WritableByteChannel channel;
protected String mimeType = MimeTypes.TEXT;
private CountingOutputStream out;
TableRowWriter(String basename) {
this.tempFilePrefix = basename;
}
public final void open(String uId) throws Exception {
id = uId;
fileName = tempFilePrefix + id;
LOG.debug("Opening {}.", fileName);
channel = IOChannelUtils.create(fileName, mimeType);
try {
out = new CountingOutputStream(Channels.newOutputStream(channel));
LOG.debug("Writing header to {}.", fileName);
} catch (Exception e) {
try {
LOG.error("Writing header to {} failed, closing channel.", fileName);
channel.close();
} catch (IOException closeException) {
LOG.error("Closing channel for {} failed", fileName);
}
throw e;
}
LOG.debug("Starting write of bundle {} to {}.", this.id, fileName);
}
public void write(TableRow value) throws Exception {
CODER.encode(value, out, Context.OUTER);
out.write(NEWLINE);
}
public final KV<String, Long> close() throws IOException {
channel.close();
return KV.of(fileName, out.getCount());
}
}
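// Descriptive note (not in the original source): TableRowWriter emits one
// JSON-encoded TableRow per line, so each temp file is NEWLINE_DELIMITED_JSON,
// matching the source format used by the load jobs below. Example contents
// (values hypothetical):
//   {"word":"hello","count":3}
//   {"word":"world","count":7}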
/**
* Partitions temporary files based on number of files and file sizes.
*/
static class WritePartition extends DoFn<String, KV<Long, List<String>>> {
private final PCollectionView<Iterable<KV<String, Long>>> resultsView;
private TupleTag<KV<Long, List<String>>> multiPartitionsTag;
private TupleTag<KV<Long, List<String>>> singlePartitionTag;
public WritePartition(
PCollectionView<Iterable<KV<String, Long>>> resultsView,
TupleTag<KV<Long, List<String>>> multiPartitionsTag,
TupleTag<KV<Long, List<String>>> singlePartitionTag) {
this.resultsView = resultsView;
this.multiPartitionsTag = multiPartitionsTag;
this.singlePartitionTag = singlePartitionTag;
}
@Override
public void processElement(ProcessContext c) throws Exception {
List<KV<String, Long>> results = Lists.newArrayList(c.sideInput(resultsView));
if (results.isEmpty()) {
TableRowWriter writer = new TableRowWriter(c.element());
writer.open(UUID.randomUUID().toString());
results.add(writer.close());
}
long partitionId = 0;
int currNumFiles = 0;
long currSizeBytes = 0;
List<String> currResults = Lists.newArrayList();
for (int i = 0; i < results.size(); ++i) {
KV<String, Long> fileResult = results.get(i);
if (currNumFiles + 1 > Bound.MAX_NUM_FILES
|| currSizeBytes + fileResult.getValue() > Bound.MAX_SIZE_BYTES) {
c.sideOutput(multiPartitionsTag, KV.of(++partitionId, currResults));
currResults = Lists.newArrayList();
currNumFiles = 0;
currSizeBytes = 0;
}
++currNumFiles;
currSizeBytes += fileResult.getValue();
currResults.add(fileResult.getKey());
}
if (partitionId == 0) {
c.sideOutput(singlePartitionTag, KV.of(++partitionId, currResults));
} else {
c.sideOutput(multiPartitionsTag, KV.of(++partitionId, currResults));
}
}
@Override
public void populateDisplayData(DisplayData.Builder builder) {
super.populateDisplayData(builder);
}
}
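// Illustrative example of the partitioning above (numbers hypothetical): with
// MAX_NUM_FILES = 10000, 25000 small temp files are emitted as three elements
// on multiPartitionsTag holding 10000, 10000 and 5000 file names, each loaded
// into its own temporary table; a result set that fits within the limits is
// emitted on singlePartitionTag and loaded straight into the destination table.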
/**
* Writes partitions to BigQuery tables.
*/
static class WriteTables extends DoFn<KV<Long, Iterable<List<String>>>, String> {
private final boolean singlePartition;
private final BigQueryServices bqServices;
private final String jobIdToken;
private final String tempFilePrefix;
private final ValueProvider<String> jsonTableRef;
private final ValueProvider<String> jsonSchema;
private final WriteDisposition writeDisposition;
private final CreateDisposition createDisposition;
public WriteTables(
boolean singlePartition,
BigQueryServices bqServices,
String jobIdToken,
String tempFilePrefix,
ValueProvider<String> jsonTableRef,
ValueProvider<String> jsonSchema,
WriteDisposition writeDisposition,
CreateDisposition createDisposition) {
this.singlePartition = singlePartition;
this.bqServices = bqServices;
this.jobIdToken = jobIdToken;
this.tempFilePrefix = tempFilePrefix;
this.jsonTableRef = jsonTableRef;
this.jsonSchema = jsonSchema;
this.writeDisposition = writeDisposition;
this.createDisposition = createDisposition;
}
@Override
public void processElement(ProcessContext c) throws Exception {
List<String> partition = Lists.newArrayList(c.element().getValue()).get(0);
String jobIdPrefix = String.format(jobIdToken + "_%05d", c.element().getKey());
TableReference ref = fromJsonString(jsonTableRef.get(), TableReference.class);
if (!singlePartition) {
ref.setTableId(jobIdPrefix);
}
load(
bqServices.getJobService(c.getPipelineOptions().as(BigQueryOptions.class)),
jobIdPrefix,
ref,
fromJsonString(
jsonSchema == null ? null : jsonSchema.get(), TableSchema.class),
partition,
writeDisposition,
createDisposition);
c.output(toJsonString(ref));
removeTemporaryFiles(c.getPipelineOptions(), tempFilePrefix, partition);
}
private void load(
JobService jobService,
String jobIdPrefix,
TableReference ref,
@Nullable TableSchema schema,
List<String> gcsUris,
WriteDisposition writeDisposition,
CreateDisposition createDisposition) throws InterruptedException, IOException {
JobConfigurationLoad loadConfig = new JobConfigurationLoad()
.setDestinationTable(ref)
.setSchema(schema)
.setSourceUris(gcsUris)
.setWriteDisposition(writeDisposition.name())
.setCreateDisposition(createDisposition.name())
.setSourceFormat("NEWLINE_DELIMITED_JSON");
String projectId = ref.getProjectId();
for (int i = 0; i < Bound.MAX_RETRY_JOBS; ++i) {
String jobId = jobIdPrefix + "-" + i;
LOG.info("Starting BigQuery load job {}: try {}/{}", jobId, i, Bound.MAX_RETRY_JOBS);
JobReference jobRef = new JobReference()
.setProjectId(projectId)
.setJobId(jobId);
jobService.startLoadJob(jobRef, loadConfig);
Status jobStatus =
parseStatus(jobService.pollJob(jobRef, Bound.LOAD_JOB_POLL_MAX_RETRIES));
switch (jobStatus) {
case SUCCEEDED:
return;
case UNKNOWN:
throw new RuntimeException("Failed to poll the load job status of job " + jobId);
case FAILED:
LOG.info("BigQuery load job failed: {}", jobId);
continue;
default:
throw new IllegalStateException(String.format("Unexpected job status: %s of job %s",
jobStatus, jobId));
}
}
throw new RuntimeException(String.format("Failed to create the load job %s, reached max "
+ "retries: %d", jobIdPrefix, Bound.MAX_RETRY_JOBS));
}
static void removeTemporaryFiles(
PipelineOptions options,
String tempFilePrefix,
Collection<String> files)
throws IOException {
IOChannelFactory factory = IOChannelUtils.getFactory(tempFilePrefix);
if (factory instanceof GcsIOChannelFactory) {
GcsUtil gcsUtil = new GcsUtil.GcsUtilFactory().create(options);
gcsUtil.remove(files);
} else if (factory instanceof FileIOChannelFactory) {
for (String filename : files) {
LOG.debug("Removing file {}", filename);
boolean exists = Files.deleteIfExists(Paths.get(filename));
if (!exists) {
LOG.debug("{} does not exist.", filename);
}
}
} else {
throw new IOException("Unrecognized file system.");
}
}
@Override
public void populateDisplayData(DisplayData.Builder builder) {
super.populateDisplayData(builder);
builder
.addIfNotNull(DisplayData.item("jobIdToken", jobIdToken)
.withLabel("Job ID Token"))
.addIfNotNull(DisplayData.item("tempFilePrefix", tempFilePrefix)
.withLabel("Temporary File Prefix"))
.addIfNotNull(DisplayData.item("jsonTableRef", jsonTableRef)
.withLabel("Table Reference"))
.addIfNotNull(DisplayData.item("jsonSchema", jsonSchema)
.withLabel("Table Schema"));
}
}
/**
* Copies temporary tables to destination table.
*/
static class WriteRename extends DoFn<String, Void> {
private final BigQueryServices bqServices;
private final String jobIdToken;
private final ValueProvider<String> jsonTableRef;
private final WriteDisposition writeDisposition;
private final CreateDisposition createDisposition;
private final PCollectionView<Iterable<String>> tempTablesView;
public WriteRename(
BigQueryServices bqServices,
String jobIdToken,
ValueProvider<String> jsonTableRef,
WriteDisposition writeDisposition,
CreateDisposition createDisposition,
PCollectionView<Iterable<String>> tempTablesView) {
this.bqServices = bqServices;
this.jobIdToken = jobIdToken;
this.jsonTableRef = jsonTableRef;
this.writeDisposition = writeDisposition;
this.createDisposition = createDisposition;
this.tempTablesView = tempTablesView;
}
@Override
public void processElement(ProcessContext c) throws Exception {
List<String> tempTablesJson = Lists.newArrayList(c.sideInput(tempTablesView));
// Do not copy if no temp tables are provided
if (tempTablesJson.size() == 0) {
return;
}
List<TableReference> tempTables = Lists.newArrayList();
for (String table : tempTablesJson) {
tempTables.add(fromJsonString(table, TableReference.class));
}
copy(
bqServices.getJobService(c.getPipelineOptions().as(BigQueryOptions.class)),
jobIdToken,
fromJsonString(jsonTableRef.get(), TableReference.class),
tempTables,
writeDisposition,
createDisposition);
DatasetService tableService =
bqServices.getDatasetService(c.getPipelineOptions().as(BigQueryOptions.class));
removeTemporaryTables(tableService, tempTables);
}
private void copy(
JobService jobService,
String jobIdPrefix,
TableReference ref,
List<TableReference> tempTables,
WriteDisposition writeDisposition,
CreateDisposition createDisposition) throws InterruptedException, IOException {
JobConfigurationTableCopy copyConfig = new JobConfigurationTableCopy()
.setSourceTables(tempTables)
.setDestinationTable(ref)
.setWriteDisposition(writeDisposition.name())
.setCreateDisposition(createDisposition.name());
String projectId = ref.getProjectId();
for (int i = 0; i < Bound.MAX_RETRY_JOBS; ++i) {
String jobId = jobIdPrefix + "-" + i;
LOG.info("Starting BigQuery copy job {}: try {}/{}", jobId, i, Bound.MAX_RETRY_JOBS);
JobReference jobRef = new JobReference()
.setProjectId(projectId)
.setJobId(jobId);
jobService.startCopyJob(jobRef, copyConfig);
Status jobStatus =
parseStatus(jobService.pollJob(jobRef, Bound.LOAD_JOB_POLL_MAX_RETRIES));
switch (jobStatus) {
case SUCCEEDED:
return;
case UNKNOWN:
throw new RuntimeException("Failed to poll the copy job status of job " + jobId);
case FAILED:
LOG.info("BigQuery copy job failed: {}", jobId);
continue;
default:
throw new IllegalStateException(String.format("Unexpected job status: %s of job %s",
jobStatus, jobId));
}
}
throw new RuntimeException(String.format("Failed to create the copy job %s, reached max "
+ "retries: %d", jobIdPrefix, Bound.MAX_RETRY_JOBS));
}
static void removeTemporaryTables(DatasetService tableService,
List<TableReference> tempTables) throws Exception {
for (TableReference tableRef : tempTables) {
try {
LOG.debug("Deleting table {}", toJsonString(tableRef));
tableService.deleteTable(
tableRef.getProjectId(),
tableRef.getDatasetId(),
tableRef.getTableId());
} catch (Exception e) {
LOG.warn("Failed to delete the table {}", toJsonString(tableRef), e);
}
}
}
@Override
public void populateDisplayData(DisplayData.Builder builder) {
super.populateDisplayData(builder);
builder
.addIfNotNull(DisplayData.item("jobIdToken", jobIdToken)
.withLabel("Job ID Token"))
.addIfNotNull(DisplayData.item("jsonTableRef", jsonTableRef)
.withLabel("Table Reference"))
.add(DisplayData.item("writeDisposition", writeDisposition.toString())
.withLabel("Write Disposition"))
.add(DisplayData.item("createDisposition", createDisposition.toString())
.withLabel("Create Disposition"));
}
}
/** Disallow construction of utility class. */
private Write() {}
}
private static void verifyDatasetPresence(BigQueryOptions options, TableReference table) {
String resourceNotFoundMsg =
String.format(RESOURCE_NOT_FOUND_ERROR, "dataset", BigQueryIO.toTableSpec(table));
try {
Bigquery client = Transport.newBigQueryClient(options).build();
BigQueryTableRowIterator.executeWithBackOff(
client.datasets().get(table.getProjectId(), table.getDatasetId()),
resourceNotFoundMsg);
} catch (Exception e) {
ApiErrorExtractor errorExtractor = new ApiErrorExtractor();
if ((e instanceof IOException) && errorExtractor.itemNotFound((IOException) e)) {
throw new IllegalArgumentException(resourceNotFoundMsg, e);
} else {
throw new RuntimeException(
String.format(UNABLE_TO_CONFIRM_PRESENCE_OF_RESOURCE_ERROR, "dataset",
BigQueryIO.toTableSpec(table)),
e);
}
}
}
private static void verifyTablePresence(BigQueryOptions options, TableReference table) {
String resourceNotFoundMsg =
String.format(RESOURCE_NOT_FOUND_ERROR, "table", BigQueryIO.toTableSpec(table));
try {
Bigquery client = Transport.newBigQueryClient(options).build();
BigQueryTableRowIterator.executeWithBackOff(
client.tables().get(table.getProjectId(), table.getDatasetId(), table.getTableId()),
resourceNotFoundMsg);
} catch (Exception e) {
ApiErrorExtractor errorExtractor = new ApiErrorExtractor();
if ((e instanceof IOException) && errorExtractor.itemNotFound((IOException) e)) {
throw new IllegalArgumentException(resourceNotFoundMsg, e);
} else {
throw new RuntimeException(
String.format(UNABLE_TO_CONFIRM_PRESENCE_OF_RESOURCE_ERROR, "table",
BigQueryIO.toTableSpec(table)),
e);
}
}
}
/////////////////////////////////////////////////////////////////////////////
/**
* Implementation of DoFn to perform streaming BigQuery write.
*/
@SystemDoFnInternal
private static class StreamingWriteFn
extends DoFn<KV<ShardedKey<String>, TableRowInfo>, Void> {
/** TableSchema in JSON. Use String to make the class Serializable. */
private final ValueProvider<String> jsonTableSchema;
/** JsonTableRows to accumulate BigQuery rows in order to batch writes. */
private transient Map<String, List<TableRow>> tableRows;
/** The list of unique ids for each BigQuery table row. */
private transient Map<String, List<String>> uniqueIdsForTableRows;
/** The list of tables created so far, so we don't try the creation each time. */
private static Set<String> createdTables =
Collections.newSetFromMap(new ConcurrentHashMap<String, Boolean>());
/** Tracks bytes written, exposed as "ByteCount" Counter. */
private Aggregator<Long, Long> byteCountAggregator =
createAggregator("ByteCount", new Sum.SumLongFn());
/** Constructor. */
StreamingWriteFn(ValueProvider<TableSchema> schema) {
this.jsonTableSchema =
NestedValueProvider.of(schema, new TableSchemaToJsonSchema());
}
/** Prepares a target BigQuery table. */
@Override
public void startBundle(Context context) {
tableRows = new HashMap<>();
uniqueIdsForTableRows = new HashMap<>();
}
/** Accumulates the input into JsonTableRows and uniqueIdsForTableRows. */
@Override
public void processElement(ProcessContext context) {
String tableSpec = context.element().getKey().getKey();
List<TableRow> rows = getOrCreateMapListValue(tableRows, tableSpec);
List<String> uniqueIds = getOrCreateMapListValue(uniqueIdsForTableRows, tableSpec);
rows.add(context.element().getValue().tableRow);
uniqueIds.add(context.element().getValue().uniqueId);
}
/** Writes the accumulated rows into BigQuery with streaming API. */
@Override
public void finishBundle(Context context) throws Exception {
BigQueryOptions options = context.getPipelineOptions().as(BigQueryOptions.class);
Bigquery client = Transport.newBigQueryClient(options).build();
for (String tableSpec : tableRows.keySet()) {
TableReference tableReference = getOrCreateTable(options, tableSpec);
flushRows(client, tableReference, tableRows.get(tableSpec),
uniqueIdsForTableRows.get(tableSpec));
}
tableRows.clear();
uniqueIdsForTableRows.clear();
}
@Override
public void populateDisplayData(DisplayData.Builder builder) {
super.populateDisplayData(builder);
builder.addIfNotNull(DisplayData.item("schema", jsonTableSchema)
.withLabel("Table Schema"));
}
public TableReference getOrCreateTable(BigQueryOptions options, String tableSpec)
throws IOException {
TableReference tableReference = parseTableSpec(tableSpec);
if (!createdTables.contains(tableSpec)) {
synchronized (createdTables) {
// Another thread may have succeeded in creating the table in the meanwhile, so
// check again. This check isn't needed for correctness, but we add it to prevent
// every thread from attempting a create and overwhelming our BigQuery quota.
if (!createdTables.contains(tableSpec)) {
TableSchema tableSchema = JSON_FACTORY.fromString(
jsonTableSchema.get(), TableSchema.class);
Bigquery client = Transport.newBigQueryClient(options).build();
BigQueryTableInserter inserter = new BigQueryTableInserter(client);
inserter.getOrCreateTable(tableReference, Write.WriteDisposition.WRITE_APPEND,
Write.CreateDisposition.CREATE_IF_NEEDED, tableSchema);
createdTables.add(tableSpec);
}
}
}
return tableReference;
}
/** Writes the accumulated rows into BigQuery with streaming API. */
private void flushRows(Bigquery client, TableReference tableReference,
List<TableRow> tableRows, List<String> uniqueIds) {
if (!tableRows.isEmpty()) {
try {
BigQueryTableInserter inserter = new BigQueryTableInserter(client);
inserter.insertAll(tableReference, tableRows, uniqueIds, byteCountAggregator);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
}
}
private static class ShardedKey<K> {
private final K key;
private final int shardNumber;
public static <K> ShardedKey<K> of(K key, int shardNumber) {
return new ShardedKey<>(key, shardNumber);
}
private ShardedKey(K key, int shardNumber) {
this.key = key;
this.shardNumber = shardNumber;
}
public K getKey() {
return key;
}
public int getShardNumber() {
return shardNumber;
}
}
/**
* A {@link Coder} for {@link ShardedKey}, using a wrapped key {@link Coder}.
*/
private static class ShardedKeyCoder<KeyT>
extends StandardCoder<ShardedKey<KeyT>> {
public static <KeyT> ShardedKeyCoder<KeyT> of(Coder<KeyT> keyCoder) {
return new ShardedKeyCoder<>(keyCoder);
}
@JsonCreator
public static ShardedKeyCoder<?> of(
@JsonProperty(PropertyNames.COMPONENT_ENCODINGS)
List<Coder<?>> components) {
checkArgument(components.size() == 1, "Expecting 1 component, got %s", components.size());
return of(components.get(0));
}
protected ShardedKeyCoder(Coder<KeyT> keyCoder) {
this.keyCoder = keyCoder;
this.shardNumberCoder = VarIntCoder.of();
}
@Override
public List<? extends Coder<?>> getCoderArguments() {
return Arrays.asList(keyCoder);
}
@Override
public void encode(ShardedKey<KeyT> key, OutputStream outStream, Context context)
throws IOException {
keyCoder.encode(key.getKey(), outStream, context.nested());
shardNumberCoder.encode(key.getShardNumber(), outStream, context);
}
@Override
public ShardedKey<KeyT> decode(InputStream inStream, Context context)
throws IOException {
return new ShardedKey<>(
keyCoder.decode(inStream, context.nested()),
shardNumberCoder.decode(inStream, context));
}
@Override
public void verifyDeterministic() throws NonDeterministicException {
keyCoder.verifyDeterministic();
}
Coder<KeyT> keyCoder;
VarIntCoder shardNumberCoder;
}
private static class TableRowInfoCoder extends AtomicCoder<TableRowInfo> {
private static final TableRowInfoCoder INSTANCE = new TableRowInfoCoder();
@JsonCreator
public static TableRowInfoCoder of() {
return INSTANCE;
}
@Override
public void encode(TableRowInfo value, OutputStream outStream, Context context)
throws IOException {
if (value == null) {
throw new CoderException("cannot encode a null value");
}
tableRowCoder.encode(value.tableRow, outStream, context.nested());
idCoder.encode(value.uniqueId, outStream, context.nested());
}
@Override
public TableRowInfo decode(InputStream inStream, Context context)
throws IOException {
return new TableRowInfo(
tableRowCoder.decode(inStream, context.nested()),
idCoder.decode(inStream, context.nested()));
}
@Override
public void verifyDeterministic() throws NonDeterministicException {
throw new NonDeterministicException(this, "TableRows are not deterministic.");
}
TableRowJsonCoder tableRowCoder = TableRowJsonCoder.of();
StringUtf8Coder idCoder = StringUtf8Coder.of();
}
private static class TableRowInfo {
TableRowInfo(TableRow tableRow, String uniqueId) {
this.tableRow = tableRow;
this.uniqueId = uniqueId;
}
final TableRow tableRow;
final String uniqueId;
}
/////////////////////////////////////////////////////////////////////////////
/**
* Fn that tags each table row with a unique id and destination table.
* To avoid calling UUID.randomUUID() for each element, which can be costly,
* a randomUUID is generated only once per bucket of data. The actual unique
* id is created by concatenating this randomUUID with a sequential number.
*/
@VisibleForTesting
static class TagWithUniqueIdsAndTable
extends DoFn<TableRow, KV<ShardedKey<String>, TableRowInfo>>
implements DoFn.RequiresWindowAccess {
/** TableSpec to write to. */
private final ValueProvider<String> tableSpec;
/** User function mapping windows to {@link TableReference} in JSON. */
private final SerializableFunction<BoundedWindow, TableReference> tableRefFunction;
private transient String randomUUID;
private transient long sequenceNo = 0L;
TagWithUniqueIdsAndTable(BigQueryOptions options,
ValueProvider<TableReference> table,
SerializableFunction<BoundedWindow, TableReference> tableRefFunction) {
checkArgument(table == null ^ tableRefFunction == null,
"Exactly one of table or tableRefFunction should be set");
if (table != null) {
if (table.isAccessible() && Strings.isNullOrEmpty(table.get().getProjectId())) {
TableReference tableRef = table.get()
.setProjectId(options.as(BigQueryOptions.class).getProject());
table = NestedValueProvider.of(
StaticValueProvider.of(toJsonString(tableRef)),
new JsonTableRefToTableRef());
}
this.tableSpec = NestedValueProvider.of(table, new TableRefToTableSpec());
} else {
tableSpec = null;
}
this.tableRefFunction = tableRefFunction;
}
@Override
public void startBundle(Context context) {
randomUUID = UUID.randomUUID().toString();
}
/** Tag the input with a unique id. */
@Override
public void processElement(ProcessContext context) throws IOException {
String uniqueId = randomUUID + sequenceNo++;
ThreadLocalRandom randomGenerator = ThreadLocalRandom.current();
String tableSpec = tableSpecFromWindow(
context.getPipelineOptions().as(BigQueryOptions.class), context.window());
// We output on keys 0-50 to ensure that there's enough batching for
// BigQuery.
context.output(KV.of(ShardedKey.of(tableSpec, randomGenerator.nextInt(0, 50)),
new TableRowInfo(context.element(), uniqueId)));
}
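// Descriptive note (not in the original source): for a bundle whose randomUUID
// is, say, "a1b2c3", successive elements receive unique ids "a1b2c30",
// "a1b2c31", ..., and each is keyed by its destination table spec plus a random
// shard in [0, 50) so StreamingWriteFn can batch rows per table without hot keys.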
@Override
public void populateDisplayData(DisplayData.Builder builder) {
super.populateDisplayData(builder);
builder.addIfNotNull(DisplayData.item("table", tableSpec));
if (tableRefFunction != null) {
builder.add(DisplayData.item("tableFn", tableRefFunction.getClass())
.withLabel("Table Reference Function"));
}
}
@VisibleForTesting
ValueProvider<String> getTableSpec() {
return tableSpec;
}
private String tableSpecFromWindow(BigQueryOptions options, BoundedWindow window) {
if (tableSpec != null) {
return tableSpec.get();
} else {
TableReference table = tableRefFunction.apply(window);
if (table.getProjectId() == null) {
table.setProjectId(options.getProject());
}
return toTableSpec(table);
}
}
}
/////////////////////////////////////////////////////////////////////////////
/**
* PTransform that performs a streaming BigQuery write. To increase consistency,
* it leverages BigQuery's best-effort de-duplication mechanism.
*/
private static class StreamWithDeDup extends PTransform<PCollection<TableRow>, PDone> {
private final transient ValueProvider<TableReference> tableReference;
private final SerializableFunction<BoundedWindow, TableReference> tableRefFunction;
private final transient ValueProvider<TableSchema> tableSchema;
/** Constructor. */
StreamWithDeDup(ValueProvider<TableReference> tableReference,
SerializableFunction<BoundedWindow, TableReference> tableRefFunction,
ValueProvider<TableSchema> tableSchema) {
this.tableReference = tableReference;
this.tableRefFunction = tableRefFunction;
this.tableSchema = tableSchema;
}
@Override
protected Coder<Void> getDefaultOutputCoder() {
return VoidCoder.of();
}
@Override
public PDone apply(PCollection<TableRow> input) {
// A naive implementation would be to simply stream data directly to BigQuery.
// However, this could occasionally lead to duplicated data, e.g., when
// a VM that runs this code is restarted and the code is re-run.
// The above risk is mitigated in this implementation by relying on
// BigQuery's built-in best-effort de-duplication mechanism.
// To use this mechanism, each input TableRow is tagged with a generated
// unique id, which is then passed to BigQuery and used to ignore duplicates.
PCollection<KV<ShardedKey<String>, TableRowInfo>> tagged = input.apply(ParDo.of(
new TagWithUniqueIdsAndTable(input.getPipeline().getOptions().as(BigQueryOptions.class),
tableReference, tableRefFunction)));
// To prevent having the same TableRow processed more than once with regenerated
// different unique ids, this implementation relies on "checkpointing", which is
// achieved as a side effect of having StreamingWriteFn immediately follow a GBK,
// performed by Reshuffle.
tagged
.setCoder(KvCoder.of(ShardedKeyCoder.of(StringUtf8Coder.of()), TableRowInfoCoder.of()))
.apply(Reshuffle.<ShardedKey<String>, TableRowInfo>of())
.apply(ParDo.of(new StreamingWriteFn(tableSchema)));
// Note that the implementation to return PDone here breaks the
// implicit assumption about the job execution order. If a user
// implements a PTransform that takes PDone returned here as its
// input, the transform may not necessarily be executed after
// the BigQueryIO.Write.
return PDone.in(input.getPipeline());
}
}
/**
* Status of a BigQuery job or request.
*/
enum Status {
SUCCEEDED,
FAILED,
UNKNOWN,
}
private static Status parseStatus(@Nullable Job job) {
if (job == null) {
return Status.UNKNOWN;
}
JobStatus status = job.getStatus();
if (status.getErrorResult() != null) {
return Status.FAILED;
} else if (status.getErrors() != null && !status.getErrors().isEmpty()) {
return Status.FAILED;
} else {
return Status.SUCCEEDED;
}
}
@VisibleForTesting
static String toJsonString(Object item) {
if (item == null) {
return null;
}
try {
return JSON_FACTORY.toString(item);
} catch (IOException e) {
throw new RuntimeException(
String.format("Cannot serialize %s to a JSON string.", item.getClass().getSimpleName()),
e);
}
}
@VisibleForTesting
static <T> T fromJsonString(String json, Class<T> clazz) {
if (json == null) {
return null;
}
try {
return JSON_FACTORY.fromString(json, clazz);
} catch (IOException e) {
throw new RuntimeException(
String.format("Cannot deserialize %s from a JSON string: %s.", clazz, json),
e);
}
}
/**
* Returns a randomUUID string.
*
* {@code '-'} is removed because BigQuery doesn't allow it in dataset id.
*/
private static String randomUUIDString() {
return UUID.randomUUID().toString().replaceAll("-", "");
}
/////////////////////////////////////////////////////////////////////////////
/** Disallow construction of utility class. */
private BigQueryIO() {}
private static <K, V> List<V> getOrCreateMapListValue(Map<K, List<V>> map, K key) {
List<V> value = map.get(key);
if (value == null) {
value = new ArrayList<>();
map.put(key, value);
}
return value;
}
}