
com.google.cloud.dataflow.sdk.io.BigQueryIO Maven / Gradle / Ivy


Google Cloud Dataflow Java SDK provides a simple, Java-based interface for processing virtually any size of data using Google Cloud resources. This artifact includes the entire Dataflow Java SDK.

/*
 * Copyright (C) 2015 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package com.google.cloud.dataflow.sdk.io;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Preconditions.checkState;

import com.google.api.client.json.JsonFactory;
import com.google.api.services.bigquery.Bigquery;
import com.google.api.services.bigquery.model.Job;
import com.google.api.services.bigquery.model.JobConfigurationExtract;
import com.google.api.services.bigquery.model.JobConfigurationLoad;
import com.google.api.services.bigquery.model.JobConfigurationQuery;
import com.google.api.services.bigquery.model.JobConfigurationTableCopy;
import com.google.api.services.bigquery.model.JobReference;
import com.google.api.services.bigquery.model.JobStatistics;
import com.google.api.services.bigquery.model.JobStatus;
import com.google.api.services.bigquery.model.QueryRequest;
import com.google.api.services.bigquery.model.TableReference;
import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.coders.AtomicCoder;
import com.google.cloud.dataflow.sdk.coders.Coder;
import com.google.cloud.dataflow.sdk.coders.Coder.Context;
import com.google.cloud.dataflow.sdk.coders.CoderException;
import com.google.cloud.dataflow.sdk.coders.KvCoder;
import com.google.cloud.dataflow.sdk.coders.StandardCoder;
import com.google.cloud.dataflow.sdk.coders.StringUtf8Coder;
import com.google.cloud.dataflow.sdk.coders.TableRowJsonCoder;
import com.google.cloud.dataflow.sdk.coders.VarIntCoder;
import com.google.cloud.dataflow.sdk.coders.VoidCoder;
import com.google.cloud.dataflow.sdk.options.BigQueryOptions;
import com.google.cloud.dataflow.sdk.options.GcpOptions;
import com.google.cloud.dataflow.sdk.options.PipelineOptions;
import com.google.cloud.dataflow.sdk.options.ValueProvider;
import com.google.cloud.dataflow.sdk.options.ValueProvider.NestedValueProvider;
import com.google.cloud.dataflow.sdk.options.ValueProvider.StaticValueProvider;
import com.google.cloud.dataflow.sdk.runners.PipelineRunner;
import com.google.cloud.dataflow.sdk.transforms.Aggregator;
import com.google.cloud.dataflow.sdk.transforms.Create;
import com.google.cloud.dataflow.sdk.transforms.DoFn;
import com.google.cloud.dataflow.sdk.transforms.GroupByKey;
import com.google.cloud.dataflow.sdk.transforms.PTransform;
import com.google.cloud.dataflow.sdk.transforms.ParDo;
import com.google.cloud.dataflow.sdk.transforms.SerializableFunction;
import com.google.cloud.dataflow.sdk.transforms.Sum;
import com.google.cloud.dataflow.sdk.transforms.View;
import com.google.cloud.dataflow.sdk.transforms.display.DisplayData;
import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
import com.google.cloud.dataflow.sdk.transforms.windowing.DefaultTrigger;
import com.google.cloud.dataflow.sdk.transforms.windowing.GlobalWindows;
import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
import com.google.cloud.dataflow.sdk.util.AvroUtils;
import com.google.cloud.dataflow.sdk.util.BigQueryServices;
import com.google.cloud.dataflow.sdk.util.BigQueryServices.DatasetService;
import com.google.cloud.dataflow.sdk.util.BigQueryServices.JobService;
import com.google.cloud.dataflow.sdk.util.BigQueryServicesImpl;
import com.google.cloud.dataflow.sdk.util.BigQueryTableInserter;
import com.google.cloud.dataflow.sdk.util.BigQueryTableRowIterator;
import com.google.cloud.dataflow.sdk.util.FileIOChannelFactory;
import com.google.cloud.dataflow.sdk.util.GcsIOChannelFactory;
import com.google.cloud.dataflow.sdk.util.GcsUtil;
import com.google.cloud.dataflow.sdk.util.GcsUtil.GcsUtilFactory;
import com.google.cloud.dataflow.sdk.util.IOChannelFactory;
import com.google.cloud.dataflow.sdk.util.IOChannelUtils;
import com.google.cloud.dataflow.sdk.util.MimeTypes;
import com.google.cloud.dataflow.sdk.util.PropertyNames;
import com.google.cloud.dataflow.sdk.util.Reshuffle;
import com.google.cloud.dataflow.sdk.util.SystemDoFnInternal;
import com.google.cloud.dataflow.sdk.util.Transport;
import com.google.cloud.dataflow.sdk.util.gcsfs.GcsPath;
import com.google.cloud.dataflow.sdk.values.KV;
import com.google.cloud.dataflow.sdk.values.PCollection;
import com.google.cloud.dataflow.sdk.values.PCollectionTuple;
import com.google.cloud.dataflow.sdk.values.PCollectionView;
import com.google.cloud.dataflow.sdk.values.PDone;
import com.google.cloud.dataflow.sdk.values.PInput;
import com.google.cloud.dataflow.sdk.values.TupleTag;
import com.google.cloud.dataflow.sdk.values.TupleTagList;
import com.google.cloud.hadoop.util.ApiErrorExtractor;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Function;
import com.google.common.base.MoreObjects;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import com.google.common.io.CountingOutputStream;

import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;

import org.apache.avro.generic.GenericRecord;
import org.joda.time.Duration;
import org.joda.time.Instant;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.OutputStream;
import java.io.Serializable;
import java.nio.channels.Channels;
import java.nio.channels.WritableByteChannel;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.atomic.AtomicReference;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.annotation.Nullable;

/**
 * {@link PTransform}s for reading and writing
 * BigQuery tables.
 *
 * <h3>Table References</h3>
 *
 * <p>A fully-qualified BigQuery table name consists of three components:
 * <ul>
 *   <li>{@code projectId}: the Cloud project id (defaults to
 *       {@link GcpOptions#getProject()}).
 *   <li>{@code datasetId}: the BigQuery dataset id, unique within a project.
 *   <li>{@code tableId}: a table id, unique within a dataset.
 * </ul>
 *
 * <p>BigQuery table references are stored as a {@link TableReference}, which comes
 * from the BigQuery Java Client API.
 * Tables can be referred to as Strings, with or without the {@code projectId}.
 * A helper function is provided ({@link BigQueryIO#parseTableSpec(String)})
 * that parses the following string forms into a {@link TableReference}
 * (a short parsing sketch follows the list):
 *
 * <ul>
 *   <li>[{@code project_id}]:[{@code dataset_id}].[{@code table_id}]
 *   <li>[{@code dataset_id}].[{@code table_id}]
 * </ul>
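 *
 * <p>For example, here is a minimal parsing sketch (not part of the original documentation;
 * the table names are hypothetical placeholders):
 *
 * <pre>{@code
 * // Fully-qualified form: project, dataset, and table.
 * TableReference fullRef = BigQueryIO.parseTableSpec("my-project:my_dataset.my_table");
 * // Short form: dataset and table only; the default project id is used for the project.
 * TableReference shortRef = BigQueryIO.parseTableSpec("my_dataset.my_table");
 * }</pre>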

 * <h3>Reading</h3>
 *
 * <p>To read from a BigQuery table, apply a {@link BigQueryIO.Read} transformation.
 * This produces a {@link PCollection} of {@link TableRow TableRows} as output:
 *
 * <pre>{@code
 * PCollection<TableRow> weatherData = pipeline.apply(
 *     BigQueryIO.Read.named("Read")
 *                    .from("clouddataflow-readonly:samples.weather_stations"));
 * }</pre>
 *
 * <p>See {@link TableRow} for more information on the {@link TableRow} object.
 *
 * <p>Users may provide a query to read from rather than reading all of a BigQuery table. If
 * specified, the result obtained by executing the specified query will be used as the data of
 * the input transform.
 *
 * <pre>{@code
 * PCollection<TableRow> meanTemperatureData = pipeline.apply(
 *     BigQueryIO.Read.named("Read")
 *                    .fromQuery("SELECT year, mean_temp FROM [samples.weather_stations]"));
 * }</pre>
 *
 * <p>When creating a BigQuery input transform, users should provide either a query or a table.
 * Pipeline construction will fail with a validation error if neither or both are specified.
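 *
 * <p>A query read defaults to BigQuery's legacy SQL dialect and to flattened results; the
 * {@link BigQueryIO.Read.Bound#usingStandardSql} and
 * {@link BigQueryIO.Read.Bound#withoutResultFlattening} options change those defaults. A
 * minimal sketch, not part of the original documentation (the query text is a hypothetical
 * placeholder):
 *
 * <pre>{@code
 * PCollection<TableRow> rows = pipeline.apply(
 *     BigQueryIO.Read.named("ReadStandardSql")
 *                    .fromQuery("SELECT year, mean_temp FROM `my_dataset.my_table`")
 *                    .usingStandardSql());
 * }</pre>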

 * <h3>Writing</h3>
 *
 * <p>To write to a BigQuery table, apply a {@link BigQueryIO.Write} transformation.
 * This consumes a {@link PCollection} of {@link TableRow TableRows} as input.
 *
 * <pre>{@code
 * PCollection<TableRow> quotes = ...
 *
 * List<TableFieldSchema> fields = new ArrayList<>();
 * fields.add(new TableFieldSchema().setName("source").setType("STRING"));
 * fields.add(new TableFieldSchema().setName("quote").setType("STRING"));
 * TableSchema schema = new TableSchema().setFields(fields);
 *
 * quotes.apply(BigQueryIO.Write
 *     .named("Write")
 *     .to("my-project:output.output_table")
 *     .withSchema(schema)
 *     .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE));
 * }</pre>
 *
 * <p>See {@link BigQueryIO.Write} for details on how to specify if a write should
 * append to an existing table, replace the table, or verify that the table is
 * empty. Note that the dataset being written to must already exist. Write
 * dispositions are not supported in streaming mode.
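 *
 * <p>As a sketch (not part of the original documentation), the same {@code quotes} and
 * {@code schema} can be written with an explicit create disposition and an appending write
 * disposition:
 *
 * <pre>{@code
 * quotes.apply(BigQueryIO.Write
 *     .named("AppendWrite")
 *     .to("my-project:output.output_table")
 *     .withSchema(schema)
 *     .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
 *     .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND));
 * }</pre>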

 * <h3>Sharding BigQuery output tables</h3>
 *
 * <p>A common use case is to dynamically generate BigQuery table names based on
 * the current window. To support this,
 * {@link BigQueryIO.Write#to(SerializableFunction)}
 * accepts a function mapping the current window to a tablespec. For example,
 * here's code that outputs daily tables to BigQuery:
 *
 * <pre>{@code
 * PCollection<TableRow> quotes = ...
 * quotes.apply(Window.into(CalendarWindows.days(1)))
 *       .apply(BigQueryIO.Write
 *         .named("Write")
 *         .withSchema(schema)
 *         .to(new SerializableFunction<BoundedWindow, String>() {
 *           public String apply(BoundedWindow window) {
 *             // The cast below is safe because CalendarWindows.days(1) produces IntervalWindows.
 *             String dayString = DateTimeFormat.forPattern("yyyy_MM_dd")
 *                  .withZone(DateTimeZone.UTC)
 *                  .print(((IntervalWindow) window).start());
 *             return "my-project:output.output_table_" + dayString;
 *           }
 *         }));
 * }</pre>
 *
 * <p>Per-window tables are not yet supported in batch mode.

 * <h3>Permissions</h3>
 *
 * <p>Permission requirements depend on the {@link PipelineRunner} that is used to execute the
 * Dataflow job. Please refer to the documentation of corresponding {@link PipelineRunner}s for
 * more details.
 *
 * <p>Please see BigQuery Access Control for security and permission related information
 * specific to BigQuery.
 */
public class BigQueryIO {
  private static final Logger LOG = LoggerFactory.getLogger(BigQueryIO.class);

  /**
   * Singleton instance of the JSON factory used to read and write JSON
   * formatted rows.
   */
  private static final JsonFactory JSON_FACTORY = Transport.getJsonFactory();

  /**
   * Project IDs must contain 6-63 lowercase letters, digits, or dashes.
   * IDs must start with a letter and may not end with a dash.
   * This regex isn't exact - this allows for patterns that would be rejected by
   * the service, but this is sufficient for basic parsing of table references.
   */
  private static final String PROJECT_ID_REGEXP = "[a-z][-a-z0-9:.]{4,61}[a-z0-9]";

  /**
   * Regular expression that matches Dataset IDs.
   */
  private static final String DATASET_REGEXP = "[-\\w.]{1,1024}";

  /**
   * Regular expression that matches Table IDs.
   */
  private static final String TABLE_REGEXP = "[-\\w$@]{1,1024}";

  /**
   * Matches table specifications in the form {@code "[project_id]:[dataset_id].[table_id]"} or
   * {@code "[dataset_id].[table_id]"}.
   */
  private static final String DATASET_TABLE_REGEXP =
      String.format("((?<PROJECT>%s):)?(?<DATASET>%s)\\.(?<TABLE>%s)",
          PROJECT_ID_REGEXP, DATASET_REGEXP, TABLE_REGEXP);

  private static final Pattern TABLE_SPEC = Pattern.compile(DATASET_TABLE_REGEXP);

  @Deprecated // unused.
  public static final String SET_PROJECT_FROM_OPTIONS_WARNING =
      "No project specified for BigQuery table \"%1$s.%2$s\". Assuming it is in \"%3$s\". If the"
      + " table is in a different project please specify it as a part of the BigQuery table"
      + " definition.";

  private static final String RESOURCE_NOT_FOUND_ERROR =
      "BigQuery %1$s not found for table \"%2$s\" . Please create the %1$s before pipeline"
      + " execution. If the %1$s is created by an earlier stage of the pipeline, this"
      + " validation can be disabled using #withoutValidation.";

  private static final String UNABLE_TO_CONFIRM_PRESENCE_OF_RESOURCE_ERROR =
      "Unable to confirm BigQuery %1$s presence for table \"%2$s\". If the %1$s is created by"
      + " an earlier stage of the pipeline, this validation can be disabled using"
      + " #withoutValidation.";

  /**
   * Parse a table specification in the form
   * {@code "[project_id]:[dataset_id].[table_id]"} or {@code "[dataset_id].[table_id]"}.
   *
   * <p>If the project id is omitted, the default project id is used.
   */
  public static TableReference parseTableSpec(String tableSpec) {
    Matcher match = TABLE_SPEC.matcher(tableSpec);
    if (!match.matches()) {
      throw new IllegalArgumentException(
          "Table reference is not in [project_id]:[dataset_id].[table_id] "
          + "format: " + tableSpec);
    }

    TableReference ref = new TableReference();
    ref.setProjectId(match.group("PROJECT"));

    return ref.setDatasetId(match.group("DATASET")).setTableId(match.group("TABLE"));
  }

  /**
   * Returns a canonical string representation of the {@link TableReference}.
   */
  public static String toTableSpec(TableReference ref) {
    StringBuilder sb = new StringBuilder();
    if (ref.getProjectId() != null) {
      sb.append(ref.getProjectId());
      sb.append(":");
    }

    sb.append(ref.getDatasetId()).append('.').append(ref.getTableId());
    return sb.toString();
  }

  @VisibleForTesting
  static class JsonSchemaToTableSchema
      implements SerializableFunction<String, TableSchema> {
    @Override
    public TableSchema apply(String from) {
      return fromJsonString(from, TableSchema.class);
    }
  }

  private static class TableSchemaToJsonSchema
      implements SerializableFunction<TableSchema, String> {
    @Override
    public String apply(TableSchema from) {
      return toJsonString(from);
    }
  }

  private static class JsonTableRefToTableRef
      implements SerializableFunction<String, TableReference> {
    @Override
    public TableReference apply(String from) {
      return fromJsonString(from, TableReference.class);
    }
  }

  private static class TableRefToTableSpec
      implements SerializableFunction<TableReference, String> {
    @Override
    public String apply(TableReference from) {
      return toTableSpec(from);
    }
  }

  private static class TableRefToJson
      implements SerializableFunction<TableReference, String> {
    @Override
    public String apply(TableReference from) {
      return toJsonString(from);
    }
  }

  private static class TableRefToProjectId
      implements SerializableFunction<TableReference, String> {
    @Override
    public String apply(TableReference from) {
      return from.getProjectId();
    }
  }

  @VisibleForTesting
  static class TableSpecToTableRef
      implements SerializableFunction<String, TableReference> {
    @Override
    public TableReference apply(String from) {
      return parseTableSpec(from);
    }
  }

  @Nullable
  private static ValueProvider<String> displayTable(
      @Nullable ValueProvider<TableReference> table) {
    if (table == null) {
      return null;
    }
    return NestedValueProvider.of(table, new TableRefToTableSpec());
  }

  /**
   * A {@link PTransform} that reads from a BigQuery table and returns a
   * {@link PCollection} of {@link TableRow TableRows} containing each of the rows of the table.
   *
   * <p>Each {@link TableRow} contains values indexed by column name. Here is a
   * sample processing function that processes a "line" column from rows:
   *
   * <pre>{@code
   * static class ExtractWordsFn extends DoFn<TableRow, String> {
   *   public void processElement(ProcessContext c) {
   *     // Get the "line" field of the TableRow object, split it into words, and emit them.
   *     TableRow row = c.element();
   *     String[] words = row.get("line").toString().split("[^a-zA-Z']+");
   *     for (String word : words) {
   *       if (!word.isEmpty()) {
   *         c.output(word);
   *       }
   *     }
   *   }
   * }}</pre>
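   *
   * <p>As a sketch (not part of the original documentation), such a function could be wired to
   * a read as follows; the table spec {@code "my-project:my_dataset.lines"} is a hypothetical
   * placeholder for a table with a {@code line} column:
   *
   * <pre>{@code
   * PCollection<String> words = pipeline
   *     .apply(BigQueryIO.Read.named("ReadLines")
   *                           .from("my-project:my_dataset.lines"))
   *     .apply(ParDo.of(new ExtractWordsFn()));
   * }</pre>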
*/ public static class Read { /** * Returns a {@link Read.Bound} with the given name. The BigQuery table or query to be read * from has not yet been configured. */ public static Bound named(String name) { return new Bound().named(name); } /** * Reads a BigQuery table specified as {@code "[project_id]:[dataset_id].[table_id]"} or * {@code "[dataset_id].[table_id]"} for tables within the current project. */ public static Bound from(String tableSpec) { return new Bound().from(StaticValueProvider.of(tableSpec)); } /** * Same as {@code from(String)}, but with a {@link ValueProvider}. */ public static Bound from(ValueProvider tableSpec) { return new Bound().from(tableSpec); } /** * Reads results received after executing the given query. */ public static Bound fromQuery(String query) { return new Bound().fromQuery(StaticValueProvider.of(query)); } /** * Same as {@code from(String)}, but with a {@link ValueProvider}. */ public static Bound fromQuery(ValueProvider query) { return new Bound().fromQuery(query); } /** * Reads a BigQuery table specified as a {@link TableReference} object. */ public static Bound from(TableReference table) { return new Bound().from(table); } /** * Disables BigQuery table validation, which is enabled by default. */ public static Bound withoutValidation() { return new Bound().withoutValidation(); } /** * A {@link PTransform} that reads from a BigQuery table and returns a bounded * {@link PCollection} of {@link TableRow TableRows}. */ public static class Bound extends PTransform> { @Nullable final ValueProvider jsonTableRef; @Nullable final ValueProvider query; final boolean validate; @Nullable final Boolean flattenResults; @Nullable final Boolean useLegacySql; @Nullable BigQueryServices bigQueryServices; private static final String QUERY_VALIDATION_FAILURE_ERROR = "Validation of query \"%1$s\" failed. If the query depends on an earlier stage of the" + " pipeline, This validation can be disabled using #withoutValidation."; private Bound() { this( null /* name */, null /* query */, null /* jsonTableRef */, true /* validate */, null /* flattenResults */, null /* useLegacySql */, null /* bigQueryServices */); } private Bound( String name, @Nullable ValueProvider query, @Nullable ValueProvider jsonTableRef, boolean validate, @Nullable Boolean flattenResults, @Nullable Boolean useLegacySql, @Nullable BigQueryServices bigQueryServices) { super(name); this.jsonTableRef = jsonTableRef; this.query = query; this.validate = validate; this.flattenResults = flattenResults; this.useLegacySql = useLegacySql; this.bigQueryServices = bigQueryServices; } /** * Returns a copy of this transform using the name associated with this transformation. * *

Does not modify this object. */ public Bound named(String name) { return new Bound( name, query, jsonTableRef, validate, flattenResults, useLegacySql, bigQueryServices); } /** * Returns a copy of this transform that reads from the specified table. Refer to * {@link #parseTableSpec(String)} for the specification format. * *

Does not modify this object. */ public Bound from(String tableSpec) { return from(StaticValueProvider.of(tableSpec)); } /** * Returns a copy of this transform that reads from the specified table. Refer to * {@link #parseTableSpec(String)} for the specification format. * *

Does not modify this object. */ public Bound from(ValueProvider tableSpec) { return new Bound( name, query, NestedValueProvider.of( NestedValueProvider.of( tableSpec, new TableSpecToTableRef()), new TableRefToJson()), validate, flattenResults, useLegacySql, bigQueryServices); } /** * Returns a copy of this transform that reads from the specified table. * *

Does not modify this object. */ public Bound from(TableReference table) { return from(StaticValueProvider.of(toTableSpec(table))); } /** * Returns a copy of this transform that reads the results of the specified query. * *

Does not modify this object. * *

By default, the query results will be flattened -- see * "flattenResults" in the * Jobs documentation for more information. To disable flattening, use * {@link BigQueryIO.Read.Bound#withoutResultFlattening}. * *

By default, the query will use BigQuery's legacy SQL dialect. To use the BigQuery * Standard SQL dialect, use {@link BigQueryIO.Read.Bound#usingStandardSql}. */ public Bound fromQuery(String query) { return fromQuery(StaticValueProvider.of(query)); } /** * Like {@link #fromQuery(String)}, but from a {@link ValueProvider}. */ public Bound fromQuery(ValueProvider query) { return new Bound(name, query, jsonTableRef, validate, MoreObjects.firstNonNull(flattenResults, Boolean.TRUE), MoreObjects.firstNonNull(useLegacySql, Boolean.TRUE), bigQueryServices); } /** * Disable table validation. */ public Bound withoutValidation() { return new Bound( name, query, jsonTableRef, false /* validate */, flattenResults, useLegacySql, bigQueryServices); } /** * Disable * flattening of query results. * *

Only valid when a query is used ({@link #fromQuery}). Setting this option when reading * from a table will cause an error during validation. */ public Bound withoutResultFlattening() { return new Bound( name, query, jsonTableRef, validate, false /* flattenResults */, useLegacySql, bigQueryServices); } /** * Enables BigQuery's Standard SQL dialect when reading from a query. * *

Only valid when a query is used ({@link #fromQuery}). Setting this option when reading * from a table will cause an error during validation. */ public Bound usingStandardSql() { return new Bound( name, query, jsonTableRef, validate, flattenResults, false /* useLegacySql */, bigQueryServices); } @VisibleForTesting Bound withTestServices(BigQueryServices testServices) { return new Bound( name, query, jsonTableRef, validate, flattenResults, useLegacySql, testServices); } @Override public void validate(PInput input) { if (!validate) { // Note that a table or query check can fail if the table or dataset are created by // earlier stages of the pipeline or if a query depends on earlier stages of a pipeline. // For these cases the withoutValidation method can be used to disable the check. return; } BigQueryOptions bqOptions = input.getPipeline().getOptions().as(BigQueryOptions.class); String tempLocation = bqOptions.getTempLocation(); checkArgument( !Strings.isNullOrEmpty(tempLocation), "BigQueryIO.Read needs a GCS temp location to store temp files."); if (bigQueryServices == null) { try { GcsPath.fromUri(tempLocation); } catch (IllegalArgumentException e) { throw new IllegalArgumentException( String.format( "BigQuery temp location expected a valid 'gs://' path, but was given '%s'", tempLocation), e); } } ValueProvider table = getTableWithDefaultProject(bqOptions); checkState( table == null || query == null, "Invalid BigQueryIO.Read: table reference and query may not both be set"); checkState( table != null || query != null, "Invalid BigQueryIO.Read: one of table reference and query must be set"); if (table != null) { checkState( flattenResults == null, "Invalid BigQueryIO.Read: Specifies a table with a result flattening" + " preference, which only applies to queries"); checkState( useLegacySql == null, "Invalid BigQueryIO.Read: Specifies a table with a SQL dialect" + " preference, which only applies to queries"); checkState(table.isAccessible(), "Cannot call validate if table is dynamically set."); // Check for source table presence for early failure notification. 
verifyDatasetPresence(bqOptions, table.get()); verifyTablePresence(bqOptions, table.get()); } else /* query != null */ { checkState(query.isAccessible(), "Cannot call validate if query is dynamically set."); checkState(flattenResults != null, "flattenResults should not be null if query is set"); checkState(useLegacySql != null, "useLegacySql should not be null if query is set"); dryRunQuery(bqOptions, query.get(), useLegacySql); } } private static void dryRunQuery( BigQueryOptions options, String query, boolean useLegacySql) { Bigquery client = Transport.newBigQueryClient(options).build(); QueryRequest request = new QueryRequest(); request.setQuery(query); request.setDryRun(true); request.setUseLegacySql(useLegacySql); String queryValidationErrorMsg = String.format(QUERY_VALIDATION_FAILURE_ERROR, query); try { BigQueryTableRowIterator.executeWithBackOff( client.jobs().query(options.getProject(), request), queryValidationErrorMsg); } catch (Exception e) { throw new IllegalArgumentException(queryValidationErrorMsg, e); } } @Override public PCollection apply(PInput input) { String uuid = randomUUIDString(); final String jobIdToken = "beam_job_" + uuid; BigQueryOptions bqOptions = input.getPipeline().getOptions().as(BigQueryOptions.class); BoundedSource source; final BigQueryServices bqServices = getBigQueryServices(); final String extractDestinationDir; String tempLocation = bqOptions.getTempLocation(); try { IOChannelFactory factory = IOChannelUtils.getFactory(tempLocation); extractDestinationDir = factory.resolve(tempLocation, uuid); } catch (IOException e) { throw new RuntimeException( String.format("Failed to resolve extract destination directory in %s", tempLocation)); } final String executingProject = bqOptions.getProject(); if (query != null && (!query.isAccessible() || !Strings.isNullOrEmpty(query.get()))) { String queryTempDatasetId = "temp_dataset_" + uuid; String queryTempTableId = "temp_table_" + uuid; TableReference queryTempTableRef = new TableReference() .setProjectId(executingProject) .setDatasetId(queryTempDatasetId) .setTableId(queryTempTableId); String jsonTableRef = toJsonString(queryTempTableRef); source = BigQueryQuerySource.create( jobIdToken, query, NestedValueProvider.of( StaticValueProvider.of(jsonTableRef), new JsonTableRefToTableRef()), flattenResults, useLegacySql, extractDestinationDir, bqServices); } else { ValueProvider inputTable = getTableWithDefaultProject(bqOptions); source = BigQueryTableSource.create( jobIdToken, inputTable, extractDestinationDir, bqServices, StaticValueProvider.of(executingProject)); } PassThroughThenCleanup.CleanupOperation cleanupOperation = new PassThroughThenCleanup.CleanupOperation() { @Override void cleanup(PipelineOptions options) throws Exception { BigQueryOptions bqOptions = options.as(BigQueryOptions.class); JobReference jobRef = new JobReference() .setProjectId(executingProject) .setJobId(getExtractJobId(jobIdToken)); Job extractJob = bqServices.getJobService(bqOptions) .getJob(jobRef); Collection extractFiles = null; if (extractJob != null) { extractFiles = getExtractFilePaths(extractDestinationDir, extractJob); } else { IOChannelFactory factory = IOChannelUtils.getFactory(extractDestinationDir); Collection dirMatch = factory.match(extractDestinationDir); if (!dirMatch.isEmpty()) { extractFiles = factory.match(factory.resolve(extractDestinationDir, "*")); } } if (extractFiles != null && !extractFiles.isEmpty()) { new GcsUtilFactory().create(options).remove(extractFiles); } }}; return input.getPipeline() 
.apply(com.google.cloud.dataflow.sdk.io.Read.from(source)) .setCoder(getDefaultOutputCoder()) .apply(new PassThroughThenCleanup(cleanupOperation)); } @Override protected Coder getDefaultOutputCoder() { return TableRowJsonCoder.of(); } @Override public void populateDisplayData(DisplayData.Builder builder) { super.populateDisplayData(builder); builder .addIfNotNull(DisplayData.item("table", displayTable(getTableProvider())) .withLabel("Table")) .addIfNotNull(DisplayData.item("query", query) .withLabel("Query")) .addIfNotNull(DisplayData.item("flattenResults", flattenResults) .withLabel("Flatten Query Results")) .addIfNotNull(DisplayData.item("useLegacySql", useLegacySql) .withLabel("Use Legacy SQL Dialect")) .addIfNotDefault(DisplayData.item("validation", validate) .withLabel("Validation Enabled"), true); } /** * Returns the table to read, or {@code null} if reading from a query instead. * *

If the table's project is not specified, use the executing project. */ @Nullable private ValueProvider getTableWithDefaultProject( BigQueryOptions bqOptions) { ValueProvider table = getTableProvider(); if (table == null) { return table; } if (!table.isAccessible()) { LOG.info("Using a dynamic value for table input. This must contain a project" + " in the table reference: {}", table); return table; } if (Strings.isNullOrEmpty(table.get().getProjectId())) { // If user does not specify a project we assume the table to be located in // the default project. TableReference tableRef = table.get(); tableRef.setProjectId(bqOptions.getProject()); return NestedValueProvider.of(StaticValueProvider.of( toJsonString(tableRef)), new JsonTableRefToTableRef()); } return table; } /** * Returns the table to read, or {@code null} if reading from a query instead. */ @Nullable public ValueProvider getTableProvider() { return jsonTableRef == null ? null : NestedValueProvider.of(jsonTableRef, new JsonTableRefToTableRef()); } /** * Returns the table to read, or {@code null} if reading from a query instead. */ @Nullable public TableReference getTable() { ValueProvider provider = getTableProvider(); return provider == null ? null : provider.get(); } /** * Returns the query to be read, or {@code null} if reading from a table instead. */ @Nullable public String getQuery() { return query == null ? null : query.get(); } /** * Returns the query to be read, or {@code null} if reading from a table instead. */ @Nullable public ValueProvider getQueryProvider() { return query; } /** * Returns true if table validation is enabled. */ public boolean getValidate() { return validate; } /** * Returns true/false if result flattening is enabled/disabled, or null if not applicable. */ public Boolean getFlattenResults() { return flattenResults; } /** * Returns true (false) if the query will (will not) use BigQuery's legacy SQL mode, or null * if not applicable. */ @Nullable public Boolean getUseLegacySql() { return useLegacySql; } private BigQueryServices getBigQueryServices() { if (bigQueryServices == null) { bigQueryServices = new BigQueryServicesImpl(); } return bigQueryServices; } } /** Disallow construction of utility class. */ private Read() {} } /** * A {@link PTransform} that invokes {@link CleanupOperation} after the input {@link PCollection} * has been processed. 
*/ @VisibleForTesting static class PassThroughThenCleanup extends PTransform, PCollection> { private CleanupOperation cleanupOperation; PassThroughThenCleanup(CleanupOperation cleanupOperation) { this.cleanupOperation = cleanupOperation; } @Override public PCollection apply(PCollection input) { TupleTag mainOutput = new TupleTag<>(); TupleTag cleanupSignal = new TupleTag<>(); PCollectionTuple outputs = input.apply(ParDo.of(new IdentityFn()) .withOutputTags(mainOutput, TupleTagList.of(cleanupSignal))); PCollectionView cleanupSignalView = outputs.get(cleanupSignal) .setCoder(VoidCoder.of()) .apply(View.asSingleton().withDefaultValue(null)); input.getPipeline() .apply("Create(CleanupOperation)", Create.of(cleanupOperation)) .apply("Cleanup", ParDo.of( new DoFn() { @Override public void processElement(ProcessContext c) throws Exception { c.element().cleanup(c.getPipelineOptions()); } }).withSideInputs(cleanupSignalView)); return outputs.get(mainOutput); } private static class IdentityFn extends DoFn { @Override public void processElement(ProcessContext c) { c.output(c.element()); } } abstract static class CleanupOperation implements Serializable { abstract void cleanup(PipelineOptions options) throws Exception; } } /** * A {@link BigQuerySourceBase} for reading BigQuery tables. */ @VisibleForTesting static class BigQueryTableSource extends BigQuerySourceBase { static BigQueryTableSource create( String jobIdToken, ValueProvider table, String extractDestinationDir, BigQueryServices bqServices, ValueProvider executingProject) { return new BigQueryTableSource( jobIdToken, table, extractDestinationDir, bqServices, executingProject); } private final ValueProvider jsonTable; private final AtomicReference tableSizeBytes; private BigQueryTableSource( String jobIdToken, ValueProvider table, String extractDestinationDir, BigQueryServices bqServices, ValueProvider executingProject) { super(jobIdToken, extractDestinationDir, bqServices, executingProject); this.jsonTable = NestedValueProvider.of(checkNotNull(table, "table"), new TableRefToJson()); this.tableSizeBytes = new AtomicReference<>(); } @Override protected TableReference getTableToExtract(BigQueryOptions bqOptions) throws IOException { checkState(jsonTable.isAccessible()); return JSON_FACTORY.fromString(jsonTable.get(), TableReference.class); } @Override public BoundedReader createReader(PipelineOptions options) throws IOException { BigQueryOptions bqOptions = options.as(BigQueryOptions.class); checkState(jsonTable.isAccessible()); TableReference tableRef = JSON_FACTORY.fromString(jsonTable.get(), TableReference.class); return new BigQueryReader(this, bqServices.getReaderFromTable(bqOptions, tableRef)); } @Override public synchronized long getEstimatedSizeBytes(PipelineOptions options) throws Exception { if (tableSizeBytes.get() == null) { TableReference table = JSON_FACTORY.fromString(jsonTable.get(), TableReference.class); Long numBytes = bqServices.getDatasetService(options.as(BigQueryOptions.class)) .getTable(table.getProjectId(), table.getDatasetId(), table.getTableId()) .getNumBytes(); tableSizeBytes.compareAndSet(null, numBytes); } return tableSizeBytes.get(); } @Override protected void cleanupTempResource(BigQueryOptions bqOptions) throws Exception { // Do nothing. } @Override public void populateDisplayData(DisplayData.Builder builder) { super.populateDisplayData(builder); builder.add(DisplayData.item("table", jsonTable)); } } /** * A {@link BigQuerySourceBase} for querying BigQuery tables. 
*/ @VisibleForTesting static class BigQueryQuerySource extends BigQuerySourceBase { static BigQueryQuerySource create( String jobIdToken, ValueProvider query, ValueProvider queryTempTableRef, Boolean flattenResults, Boolean useLegacySql, String extractDestinationDir, BigQueryServices bqServices) { return new BigQueryQuerySource( jobIdToken, query, queryTempTableRef, flattenResults, useLegacySql, extractDestinationDir, bqServices); } private final ValueProvider query; private final ValueProvider jsonQueryTempTable; private final Boolean flattenResults; private final Boolean useLegacySql; private transient AtomicReference dryRunJobStats; private BigQueryQuerySource( String jobIdToken, ValueProvider query, ValueProvider queryTempTableRef, Boolean flattenResults, Boolean useLegacySql, String extractDestinationDir, BigQueryServices bqServices) { super(jobIdToken, extractDestinationDir, bqServices, NestedValueProvider.of( checkNotNull(queryTempTableRef, "queryTempTableRef"), new TableRefToProjectId())); this.query = checkNotNull(query, "query"); this.jsonQueryTempTable = NestedValueProvider.of( queryTempTableRef, new TableRefToJson()); this.flattenResults = checkNotNull(flattenResults, "flattenResults"); this.useLegacySql = checkNotNull(useLegacySql, "useLegacySql"); this.dryRunJobStats = new AtomicReference<>(); } @Override public long getEstimatedSizeBytes(PipelineOptions options) throws Exception { BigQueryOptions bqOptions = options.as(BigQueryOptions.class); return dryRunQueryIfNeeded(bqOptions).getTotalBytesProcessed(); } @Override public BoundedReader createReader(PipelineOptions options) throws IOException { BigQueryOptions bqOptions = options.as(BigQueryOptions.class); return new BigQueryReader(this, bqServices.getReaderFromQuery( bqOptions, query.get(), executingProject.get(), flattenResults, useLegacySql)); } @Override protected TableReference getTableToExtract(BigQueryOptions bqOptions) throws IOException, InterruptedException { // 1. Find the location of the query. String location = null; List referencedTables = dryRunQueryIfNeeded(bqOptions).getQuery().getReferencedTables(); DatasetService tableService = bqServices.getDatasetService(bqOptions); if (referencedTables != null && !referencedTables.isEmpty()) { TableReference queryTable = referencedTables.get(0); location = tableService.getTable( queryTable.getProjectId(), queryTable.getDatasetId(), queryTable.getTableId()).getLocation(); } // 2. Create the temporary dataset in the query location. TableReference tableToExtract = JSON_FACTORY.fromString(jsonQueryTempTable.get(), TableReference.class); tableService.createDataset( tableToExtract.getProjectId(), tableToExtract.getDatasetId(), location, "Dataset for BigQuery query job temporary table"); // 3. Execute the query. 
String queryJobId = jobIdToken + "-query"; executeQuery( executingProject.get(), queryJobId, tableToExtract, bqServices.getJobService(bqOptions)); return tableToExtract; } @Override protected void cleanupTempResource(BigQueryOptions bqOptions) throws Exception { checkState(jsonQueryTempTable.isAccessible()); TableReference tableToRemove = JSON_FACTORY.fromString(jsonQueryTempTable.get(), TableReference.class); DatasetService tableService = bqServices.getDatasetService(bqOptions); tableService.deleteTable( tableToRemove.getProjectId(), tableToRemove.getDatasetId(), tableToRemove.getTableId()); tableService.deleteDataset(tableToRemove.getProjectId(), tableToRemove.getDatasetId()); } @Override public void populateDisplayData(DisplayData.Builder builder) { super.populateDisplayData(builder); builder.add(DisplayData.item("query", query)); } private synchronized JobStatistics dryRunQueryIfNeeded(BigQueryOptions bqOptions) throws InterruptedException, IOException { if (dryRunJobStats.get() == null) { JobStatistics jobStats = bqServices.getJobService(bqOptions).dryRunQuery( executingProject.get(), createBasicQueryConfig()); dryRunJobStats.compareAndSet(null, jobStats); } return dryRunJobStats.get(); } private void executeQuery( String executingProject, String jobId, TableReference destinationTable, JobService jobService) throws IOException, InterruptedException { JobReference jobRef = new JobReference() .setProjectId(executingProject) .setJobId(jobId); JobConfigurationQuery queryConfig = createBasicQueryConfig() .setAllowLargeResults(true) .setCreateDisposition("CREATE_IF_NEEDED") .setDestinationTable(destinationTable) .setPriority("BATCH") .setWriteDisposition("WRITE_EMPTY"); jobService.startQueryJob(jobRef, queryConfig); Job job = jobService.pollJob(jobRef, JOB_POLL_MAX_RETRIES); if (parseStatus(job) != Status.SUCCEEDED) { throw new IOException("Query job failed: " + jobId); } } private JobConfigurationQuery createBasicQueryConfig() { return new JobConfigurationQuery() .setQuery(query.get()) .setFlattenResults(flattenResults) .setUseLegacySql(useLegacySql); } private void readObject(ObjectInputStream in) throws ClassNotFoundException, IOException { in.defaultReadObject(); dryRunJobStats = new AtomicReference<>(); } } /** * An abstract {@link BoundedSource} to read a table from BigQuery. * *

   * <p>This source uses a BigQuery export job to take a snapshot of the table on GCS, and then
   * reads in parallel from each produced file. It is implemented by {@link BigQueryTableSource},
   * and {@link BigQueryQuerySource}, depending on the configuration of the read.
   * Specifically,
   *
   * <ul>
   *   <li>{@link BigQueryTableSource} is for reading BigQuery tables</li>
   *   <li>{@link BigQueryQuerySource} is for querying BigQuery tables</li>
   * </ul>
* ... */ private abstract static class BigQuerySourceBase extends BoundedSource { // The maximum number of retries to verify temp files. private static final int MAX_FILES_VERIFY_RETRIES = 9; // The maximum number of retries to poll a BigQuery job. protected static final int JOB_POLL_MAX_RETRIES = Integer.MAX_VALUE; // The initial backoff for verifying temp files. private static final Duration INITIAL_FILES_VERIFY_BACKOFF = Duration.standardSeconds(1); protected final String jobIdToken; protected final String extractDestinationDir; protected final BigQueryServices bqServices; protected final ValueProvider executingProject; private BigQuerySourceBase( String jobIdToken, String extractDestinationDir, BigQueryServices bqServices, ValueProvider executingProject) { this.jobIdToken = checkNotNull(jobIdToken, "jobIdToken"); this.extractDestinationDir = checkNotNull(extractDestinationDir, "extractDestinationDir"); this.bqServices = checkNotNull(bqServices, "bqServices"); this.executingProject = checkNotNull(executingProject, "executingProject"); } @Override public List> splitIntoBundles( long desiredBundleSizeBytes, PipelineOptions options) throws Exception { BigQueryOptions bqOptions = options.as(BigQueryOptions.class); TableReference tableToExtract = getTableToExtract(bqOptions); JobService jobService = bqServices.getJobService(bqOptions); String extractJobId = getExtractJobId(jobIdToken); List tempFiles = executeExtract(extractJobId, tableToExtract, jobService); TableSchema tableSchema = bqServices.getDatasetService(bqOptions).getTable( tableToExtract.getProjectId(), tableToExtract.getDatasetId(), tableToExtract.getTableId()).getSchema(); cleanupTempResource(bqOptions); return createSources(tempFiles, tableSchema); } protected abstract TableReference getTableToExtract(BigQueryOptions bqOptions) throws Exception; protected abstract void cleanupTempResource(BigQueryOptions bqOptions) throws Exception; @Override public boolean producesSortedKeys(PipelineOptions options) throws Exception { return false; } @Override public void validate() { // Do nothing, validation is done in BigQuery.Read. 
} @Override public Coder getDefaultOutputCoder() { return TableRowJsonCoder.of(); } private List executeExtract( String jobId, TableReference table, JobService jobService) throws InterruptedException, IOException { JobReference jobRef = new JobReference() .setProjectId(executingProject.get()) .setJobId(jobId); String destinationUri = getExtractDestinationUri(extractDestinationDir); JobConfigurationExtract extract = new JobConfigurationExtract() .setSourceTable(table) .setDestinationFormat("AVRO") .setDestinationUris(ImmutableList.of(destinationUri)); LOG.info("Starting BigQuery extract job: {}", jobId); jobService.startExtractJob(jobRef, extract); Job extractJob = jobService.pollJob(jobRef, JOB_POLL_MAX_RETRIES); if (parseStatus(extractJob) != Status.SUCCEEDED) { throw new IOException(String.format( "Extract job %s failed, status: %s", extractJob.getJobReference().getJobId(), extractJob.getStatus())); } List tempFiles = getExtractFilePaths(extractDestinationDir, extractJob); return ImmutableList.copyOf(tempFiles); } private List> createSources( List files, TableSchema tableSchema) throws IOException, InterruptedException { final String jsonSchema = JSON_FACTORY.toString(tableSchema); SerializableFunction function = new SerializableFunction() { @Override public TableRow apply(GenericRecord input) { try { return AvroUtils.convertGenericRecordToTableRow( input, JSON_FACTORY.fromString(jsonSchema, TableSchema.class)); } catch (IOException e) { throw new RuntimeException("Failed to convert GenericRecord to TableRow", e); } }}; List> avroSources = Lists.newArrayList(); for (String fileName : files) { avroSources.add(new TransformingSource<>( AvroSource.from(fileName), function, getDefaultOutputCoder())); } return ImmutableList.copyOf(avroSources); } protected static class BigQueryReader extends BoundedSource.BoundedReader { private final BigQuerySourceBase source; private final BigQueryServices.BigQueryJsonReader reader; private BigQueryReader( BigQuerySourceBase source, BigQueryServices.BigQueryJsonReader reader) { this.source = source; this.reader = reader; } @Override public BoundedSource getCurrentSource() { return source; } @Override public boolean start() throws IOException { return reader.start(); } @Override public boolean advance() throws IOException { return reader.advance(); } @Override public TableRow getCurrent() throws NoSuchElementException { return reader.getCurrent(); } @Override public void close() throws IOException { reader.close(); } } } /** * A {@link BoundedSource} that reads from {@code BoundedSource} * and transforms elements to type {@code V}. 
*/ @VisibleForTesting static class TransformingSource extends BoundedSource { private final BoundedSource boundedSource; private final SerializableFunction function; private final Coder outputCoder; TransformingSource( BoundedSource boundedSource, SerializableFunction function, Coder outputCoder) { this.boundedSource = checkNotNull(boundedSource, "boundedSource"); this.function = checkNotNull(function, "function"); this.outputCoder = checkNotNull(outputCoder, "outputCoder"); } @Override public List> splitIntoBundles( long desiredBundleSizeBytes, PipelineOptions options) throws Exception { return Lists.transform( boundedSource.splitIntoBundles(desiredBundleSizeBytes, options), new Function, BoundedSource>() { @Override public BoundedSource apply(BoundedSource input) { return new TransformingSource<>(input, function, outputCoder); } }); } @Override public long getEstimatedSizeBytes(PipelineOptions options) throws Exception { return boundedSource.getEstimatedSizeBytes(options); } @Override public boolean producesSortedKeys(PipelineOptions options) throws Exception { return boundedSource.producesSortedKeys(options); } @Override public BoundedReader createReader(PipelineOptions options) throws IOException { return new TransformingReader(boundedSource.createReader(options)); } @Override public void validate() { boundedSource.validate(); } @Override public Coder getDefaultOutputCoder() { return outputCoder; } private class TransformingReader extends BoundedReader { private final BoundedReader boundedReader; private TransformingReader(BoundedReader boundedReader) { this.boundedReader = checkNotNull(boundedReader, "boundedReader"); } @Override public synchronized BoundedSource getCurrentSource() { return new TransformingSource<>(boundedReader.getCurrentSource(), function, outputCoder); } @Override public boolean start() throws IOException { return boundedReader.start(); } @Override public boolean advance() throws IOException { return boundedReader.advance(); } @Override public V getCurrent() throws NoSuchElementException { T current = boundedReader.getCurrent(); return function.apply(current); } @Override public void close() throws IOException { boundedReader.close(); } @Override public synchronized BoundedSource splitAtFraction(double fraction) { BoundedSource split = boundedReader.splitAtFraction(fraction); return split == null ? null : new TransformingSource<>(split, function, outputCoder); } @Override public Double getFractionConsumed() { return boundedReader.getFractionConsumed(); } @Override public Instant getCurrentTimestamp() throws NoSuchElementException { return boundedReader.getCurrentTimestamp(); } } } private static String getExtractJobId(String jobIdToken) { return jobIdToken + "-extract"; } private static String getExtractDestinationUri(String extractDestinationDir) { return String.format("%s/%s", extractDestinationDir, "*.avro"); } private static List getExtractFilePaths(String extractDestinationDir, Job extractJob) throws IOException { JobStatistics jobStats = extractJob.getStatistics(); List counts = jobStats.getExtract().getDestinationUriFileCounts(); if (counts.size() != 1) { String errorMessage = (counts.size() == 0 ? "No destination uri file count received." : String.format("More than one destination uri file count received. 
First two are %s, %s", counts.get(0), counts.get(1))); throw new RuntimeException(errorMessage); } long filesCount = counts.get(0); ImmutableList.Builder paths = ImmutableList.builder(); IOChannelFactory factory = IOChannelUtils.getFactory(extractDestinationDir); for (long i = 0; i < filesCount; ++i) { String filePath = factory.resolve(extractDestinationDir, String.format("%012d%s", i, ".avro")); paths.add(filePath); } return paths.build(); } ///////////////////////////////////////////////////////////////////////////// /** * A {@link PTransform} that writes a {@link PCollection} containing {@link TableRow TableRows} * to a BigQuery table. * *

   * <p>In BigQuery, each table has an enclosing dataset. The dataset being written must already
   * exist.
   *
   * <p>By default, tables will be created if they do not exist, which corresponds to a
   * {@link CreateDisposition#CREATE_IF_NEEDED} disposition that matches the default of BigQuery's
   * Jobs API. A schema must be provided (via {@link BigQueryIO.Write#withSchema(TableSchema)}),
   * or else the transform may fail at runtime with an {@link IllegalArgumentException}.
   *
   * <p>By default, writes require an empty table, which corresponds to
   * a {@link WriteDisposition#WRITE_EMPTY} disposition that matches the
   * default of BigQuery's Jobs API.
   *
   * <p>Here is a sample transform that produces TableRow values containing
   * "word" and "count" columns:
   *
   * <pre>{@code
   * static class FormatCountsFn extends DoFn<KV<String, Long>, TableRow> {
   *   public void processElement(ProcessContext c) {
   *     TableRow row = new TableRow()
   *         .set("word", c.element().getKey())
   *         .set("count", c.element().getValue().intValue());
   *     c.output(row);
   *   }
   * }}</pre>
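   *
   * <p>As a sketch (not part of the original documentation), the function above could feed a
   * write; {@code wordCounts} is assumed to be a {@code PCollection<KV<String, Long>>} and
   * {@code schema} a matching {@link TableSchema} with "word" and "count" fields:
   *
   * <pre>{@code
   * wordCounts
   *     .apply(ParDo.of(new FormatCountsFn()))
   *     .apply(BigQueryIO.Write
   *         .named("WriteCounts")
   *         .to("my-project:output.word_counts")
   *         .withSchema(schema));
   * }</pre>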
*/ public static class Write { /** * An enumeration type for the BigQuery create disposition strings. * * @see * configuration.query.createDisposition in the BigQuery Jobs API */ public enum CreateDisposition { /** * Specifics that tables should not be created. * *

If the output table does not exist, the write fails. */ CREATE_NEVER, /** * Specifies that tables should be created if needed. This is the default * behavior. * *

Requires that a table schema is provided via {@link BigQueryIO.Write#withSchema}. * This precondition is checked before starting a job. The schema is * not required to match an existing table's schema. * *

When this transformation is executed, if the output table does not * exist, the table is created from the provided schema. Note that even if * the table exists, it may be recreated if necessary when paired with a * {@link WriteDisposition#WRITE_TRUNCATE}. */ CREATE_IF_NEEDED } /** * An enumeration type for the BigQuery write disposition strings. * * @see * configuration.query.writeDisposition in the BigQuery Jobs API */ public enum WriteDisposition { /** * Specifies that write should replace a table. * *

The replacement may occur in multiple steps - for instance by first * removing the existing table, then creating a replacement, then filling * it in. This is not an atomic operation, and external programs may * see the table in any of these intermediate steps. */ WRITE_TRUNCATE, /** * Specifies that rows may be appended to an existing table. */ WRITE_APPEND, /** * Specifies that the output table must be empty. This is the default * behavior. * *

If the output table is not empty, the write fails at runtime. * *

This check may occur long before data is written, and does not * guarantee exclusive access to the table. If two programs are run * concurrently, each specifying the same output table and * a {@link WriteDisposition} of {@link WriteDisposition#WRITE_EMPTY}, it is possible * for both to succeed. */ WRITE_EMPTY } /** * Creates a write transformation with the given transform name. The BigQuery table to be * written has not yet been configured. */ public static Bound named(String name) { return new Bound().named(name); } /** * Creates a write transformation for the given table specification. * *

Refer to {@link #parseTableSpec(String)} for the specification format. */ public static Bound to(String tableSpec) { return new Bound().to(tableSpec); } /** Creates a write transformation for the given table. */ public static Bound to(ValueProvider tableSpec) { return new Bound().to(tableSpec); } /** Creates a write transformation for the given table. */ public static Bound to(TableReference table) { return new Bound().to(table); } /** * Creates a write transformation from a function that maps windows to table specifications. * Each time a new window is encountered, this function will be called and the resulting table * will be created. Records within that window will be written to the associated table. * *

See {@link #parseTableSpec(String)} for the format that {@code tableSpecFunction} should * return. * *

{@code tableSpecFunction} should be deterministic. When given the same window, it should * always return the same table specification. */ public static Bound to(SerializableFunction tableSpecFunction) { return new Bound().to(tableSpecFunction); } /** * Creates a write transformation from a function that maps windows to {@link TableReference} * objects. * *

{@code tableRefFunction} should be deterministic. When given the same window, it should * always return the same table reference. */ public static Bound toTableReference( SerializableFunction tableRefFunction) { return new Bound().toTableReference(tableRefFunction); } /** * Creates a write transformation with the specified schema to use in table creation. * *

The schema is required only if writing to a table that does not already * exist, and {@link CreateDisposition} is set to * {@link CreateDisposition#CREATE_IF_NEEDED}. */ public static Bound withSchema(TableSchema schema) { return new Bound().withSchema(schema); } /** * Like {@link #withSchema(TableSchema)}, but with a {@link ValueProvider}. */ public static Bound withSchema(ValueProvider schema) { return new Bound().withSchema(schema); } /** Creates a write transformation with the specified options for creating the table. */ public static Bound withCreateDisposition(CreateDisposition disposition) { return new Bound().withCreateDisposition(disposition); } /** Creates a write transformation with the specified options for writing to the table. */ public static Bound withWriteDisposition(WriteDisposition disposition) { return new Bound().withWriteDisposition(disposition); } /** * Creates a write transformation with BigQuery table validation disabled. */ public static Bound withoutValidation() { return new Bound().withoutValidation(); } /** * A {@link PTransform} that can write either a bounded or unbounded * {@link PCollection} of {@link TableRow TableRows} to a BigQuery table. */ public static class Bound extends PTransform, PDone> { // Maximum number of files in a single partition. static final int MAX_NUM_FILES = 10000; // Maximum number of bytes in a single partition -- 11 TiB just under BQ's 12 TiB limit. static final long MAX_SIZE_BYTES = 11 * (1L << 40); // The maximum number of retry jobs. static final int MAX_RETRY_JOBS = 3; // The maximum number of retries to poll the status of a job. // It sets to {@code Integer.MAX_VALUE} to block until the BigQuery job finishes. static final int LOAD_JOB_POLL_MAX_RETRIES = Integer.MAX_VALUE; @Nullable final ValueProvider jsonTableRef; @Nullable final SerializableFunction tableRefFunction; // Table schema. The schema is required only if the table does not exist. @Nullable final ValueProvider jsonSchema; // Options for creating the table. Valid values are CREATE_IF_NEEDED and // CREATE_NEVER. final CreateDisposition createDisposition; // Options for writing to the table. Valid values are WRITE_TRUNCATE, // WRITE_APPEND and WRITE_EMPTY. final WriteDisposition writeDisposition; // An option to indicate if table validation is desired. Default is true. final boolean validate; @Nullable private BigQueryServices bigQueryServices; private static class TranslateTableSpecFunction implements SerializableFunction { private SerializableFunction tableSpecFunction; TranslateTableSpecFunction(SerializableFunction tableSpecFunction) { this.tableSpecFunction = tableSpecFunction; } @Override public TableReference apply(BoundedWindow value) { return parseTableSpec(tableSpecFunction.apply(value)); } } /** * @deprecated Should be private. Instead, use one of the factory methods in * {@link BigQueryIO.Write}, such as {@link BigQueryIO.Write#to(String)}, to create an * instance of this class. 
*/ @Deprecated public Bound() { this( null /* name */, null /* jsonTableRef */, null /* tableRefFunction */, null /* jsonSchema */, CreateDisposition.CREATE_IF_NEEDED, WriteDisposition.WRITE_EMPTY, true /* validate */, null /* bigQueryServices */); } private Bound(String name, @Nullable ValueProvider jsonTableRef, @Nullable SerializableFunction tableRefFunction, @Nullable ValueProvider jsonSchema, CreateDisposition createDisposition, WriteDisposition writeDisposition, boolean validate, @Nullable BigQueryServices bigQueryServices) { super(name); this.jsonTableRef = jsonTableRef; this.tableRefFunction = tableRefFunction; this.jsonSchema = jsonSchema; this.createDisposition = checkNotNull(createDisposition, "createDisposition"); this.writeDisposition = checkNotNull(writeDisposition, "writeDisposition"); this.validate = validate; this.bigQueryServices = bigQueryServices; } /** * Returns a copy of this write transformation, but with the specified transform name. * *

Does not modify this object. */ public Bound named(String name) { return new Bound(name, jsonTableRef, tableRefFunction, jsonSchema, createDisposition, writeDisposition, validate, bigQueryServices); } /** * Returns a copy of this write transformation, but writing to the specified table. Refer to * {@link #parseTableSpec(String)} for the specification format. * *

Does not modify this object. */ public Bound to(String tableSpec) { return toTableRef(NestedValueProvider.of( StaticValueProvider.of(tableSpec), new TableSpecToTableRef())); } /** * Returns a copy of this write transformation, but writing to the specified table. * *

Does not modify this object. */ public Bound to(TableReference table) { return to(StaticValueProvider.of(toTableSpec(table))); } /** * Returns a copy of this write transformation, but using the specified function to determine * which table to write to for each window. * *

Does not modify this object. * *

<p>{@code tableSpecFunction} should be deterministic. When given the same window, it
     * should always return the same table specification.
     */
    public Bound to(
        SerializableFunction<BoundedWindow, String> tableSpecFunction) {
      return toTableReference(new TranslateTableSpecFunction(tableSpecFunction));
    }

    /**
     * Returns a copy of this write transformation, but writing to the specified table. Refer to
     * {@link #parseTableSpec(String)} for the specification format.
     *

<p>Does not modify this object.
     */
    public Bound to(ValueProvider<String> tableSpec) {
      return toTableRef(NestedValueProvider.of(tableSpec, new TableSpecToTableRef()));
    }

    /**
     * Returns a copy of this write transformation, but writing to the specified table.
     *

<p>Does not modify this object.
     */
    private Bound toTableRef(ValueProvider<TableReference> table) {
      return new Bound(name,
          NestedValueProvider.of(table, new TableRefToJson()),
          tableRefFunction, jsonSchema, createDisposition, writeDisposition, validate,
          bigQueryServices);
    }

    /**
     * Returns a copy of this write transformation, but using the specified function to determine
     * which table to write to for each window.
     *

Does not modify this object. * *

<p>{@code tableRefFunction} should be deterministic. When given the same window, it should
     * always return the same table reference.
     */
    public Bound toTableReference(
        SerializableFunction<BoundedWindow, TableReference> tableRefFunction) {
      return new Bound(name, jsonTableRef, tableRefFunction, jsonSchema, createDisposition,
          writeDisposition, validate, bigQueryServices);
    }

    /**
     * Returns a copy of this write transformation, but using the specified schema for rows
     * to be written.
     *

<p>Does not modify this object.
     */
    public Bound withSchema(TableSchema schema) {
      return new Bound(name, jsonTableRef, tableRefFunction,
          StaticValueProvider.of(toJsonString(schema)),
          createDisposition, writeDisposition, validate, bigQueryServices);
    }

    /**
     * Like {@link #withSchema(TableSchema)}, but with a {@link ValueProvider}.
     */
    public Bound withSchema(ValueProvider<TableSchema> schema) {
      return new Bound(name, jsonTableRef, tableRefFunction,
          NestedValueProvider.of(schema, new TableSchemaToJsonSchema()),
          createDisposition, writeDisposition, validate, bigQueryServices);
    }

    /**
     * Returns a copy of this write transformation, but using the specified create disposition.
     *

Does not modify this object. */ public Bound withCreateDisposition(CreateDisposition createDisposition) { return new Bound(name, jsonTableRef, tableRefFunction, jsonSchema, createDisposition, writeDisposition, validate, bigQueryServices); } /** * Returns a copy of this write transformation, but using the specified write disposition. * *

Does not modify this object. */ public Bound withWriteDisposition(WriteDisposition writeDisposition) { return new Bound(name, jsonTableRef, tableRefFunction, jsonSchema, createDisposition, writeDisposition, validate, bigQueryServices); } /** * Returns a copy of this write transformation, but without BigQuery table validation. * *

Does not modify this object. */ public Bound withoutValidation() { return new Bound(name, jsonTableRef, tableRefFunction, jsonSchema, createDisposition, writeDisposition, false, bigQueryServices); } @VisibleForTesting Bound withTestServices(BigQueryServices testServices) { return new Bound(name, jsonTableRef, tableRefFunction, jsonSchema, createDisposition, writeDisposition, validate, testServices); } private static void verifyTableEmpty( BigQueryOptions options, TableReference table) { try { Bigquery client = Transport.newBigQueryClient(options).build(); BigQueryTableInserter inserter = new BigQueryTableInserter(client); if (!inserter.isEmpty(table)) { throw new IllegalArgumentException( "BigQuery table is not empty: " + BigQueryIO.toTableSpec(table)); } } catch (IOException e) { ApiErrorExtractor errorExtractor = new ApiErrorExtractor(); if (errorExtractor.itemNotFound(e)) { // Nothing to do. If the table does not exist, it is considered empty. } else { throw new RuntimeException( "unable to confirm BigQuery table emptiness for table " + BigQueryIO.toTableSpec(table), e); } } } @Override public void validate(PCollection input) { BigQueryOptions options = input.getPipeline().getOptions().as(BigQueryOptions.class); // Exactly one of the table and table reference can be configured. checkState( jsonTableRef != null || tableRefFunction != null, "must set the table reference of a BigQueryIO.Write transform"); checkState( jsonTableRef == null || tableRefFunction == null, "Cannot set both a table reference and a table function for a BigQueryIO.Write" + " transform"); // Require a schema if creating one or more tables. checkArgument( createDisposition != CreateDisposition.CREATE_IF_NEEDED || jsonSchema != null, "CreateDisposition is CREATE_IF_NEEDED, however no schema was provided."); // The user specified a table. if (jsonTableRef != null && validate) { TableReference table = getTableWithDefaultProject(options).get(); // Check for destination table presence and emptiness for early failure notification. // Note that a presence check can fail when the table or dataset is created by an earlier // stage of the pipeline. For these cases the #withoutValidation method can be used to // disable the check. verifyDatasetPresence(options, table); if (getCreateDisposition() == BigQueryIO.Write.CreateDisposition.CREATE_NEVER) { verifyTablePresence(options, table); } if (getWriteDisposition() == BigQueryIO.Write.WriteDisposition.WRITE_EMPTY) { verifyTableEmpty(options, table); } } if (options.isStreaming() || tableRefFunction != null) { // We will use BigQuery's streaming write API -- validate supported dispositions. checkArgument( createDisposition != CreateDisposition.CREATE_NEVER, "CreateDisposition.CREATE_NEVER is not supported for an unbounded PCollection or when" + " using a tablespec function."); checkArgument( writeDisposition != WriteDisposition.WRITE_TRUNCATE, "WriteDisposition.WRITE_TRUNCATE is not supported for an unbounded PCollection or" + " when using a tablespec function."); } else { // We will use a BigQuery load job -- validate the temp location. 
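      // For example, a batch run would typically be launched with a flag along the lines of
      // --tempLocation=gs://some-bucket/temp (the bucket name here is illustrative only);
      // anything that is not a gs:// path is rejected below unless test services were
      // injected through withTestServices.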
String tempLocation = options.getTempLocation(); checkArgument( !Strings.isNullOrEmpty(tempLocation), "BigQueryIO.Write needs a GCS temp location to store temp files."); if (bigQueryServices == null) { try { GcsPath.fromUri(tempLocation); } catch (IllegalArgumentException e) { throw new IllegalArgumentException( String.format( "BigQuery temp location expected a valid 'gs://' path, but was given '%s'", tempLocation), e); } } } } @Override public PDone apply(PCollection input) { Pipeline p = input.getPipeline(); BigQueryOptions options = p.getOptions().as(BigQueryOptions.class); BigQueryServices bqServices = getBigQueryServices(); // In a streaming job, or when a tablespec function is defined, we use StreamWithDeDup // and BigQuery's streaming import API. if (options.isStreaming() || tableRefFunction != null) { return input.apply(new StreamWithDeDup(getTable(), tableRefFunction, NestedValueProvider.of(jsonSchema, new JsonSchemaToTableSchema()))); } ValueProvider table = getTableWithDefaultProject(options); String jobIdToken = "beam_job_" + randomUUIDString(); String tempLocation = options.getTempLocation(); String tempFilePrefix; try { IOChannelFactory factory = IOChannelUtils.getFactory(tempLocation); tempFilePrefix = factory.resolve( factory.resolve(tempLocation, "BigQueryWriteTemp"), jobIdToken); } catch (IOException e) { throw new RuntimeException( String.format("Failed to resolve BigQuery temp location in %s", tempLocation), e); } PCollection singleton = p.apply("Create", Create.of(tempFilePrefix)); PCollection inputInGlobalWindow = input.apply( Window.into(new GlobalWindows()) .triggering(DefaultTrigger.of()) .discardingFiredPanes()); PCollection> results = inputInGlobalWindow .apply("WriteBundles", ParDo.of(new WriteBundles(tempFilePrefix))); TupleTag>> multiPartitionsTag = new TupleTag>>("multiPartitionsTag") {}; TupleTag>> singlePartitionTag = new TupleTag>>("singlePartitionTag") {}; PCollectionView>> resultsView = results .apply("ResultsView", View.>asIterable()); PCollectionTuple partitions = singleton.apply(ParDo .of(new WritePartition( resultsView, multiPartitionsTag, singlePartitionTag)) .withSideInputs(resultsView) .withOutputTags(multiPartitionsTag, TupleTagList.of(singlePartitionTag))); // Write multiple partitions to separate temporary tables PCollection tempTables = partitions.get(multiPartitionsTag) .apply("MultiPartitionsGroupByKey", GroupByKey.>create()) .apply("MultiPartitionsWriteTables", ParDo.of(new WriteTables( false, bqServices, jobIdToken, tempFilePrefix, NestedValueProvider.of(table, new TableRefToJson()), jsonSchema, WriteDisposition.WRITE_EMPTY, CreateDisposition.CREATE_IF_NEEDED))); PCollectionView> tempTablesView = tempTables .apply("TempTablesView", View.asIterable()); singleton.apply(ParDo .of(new WriteRename( bqServices, jobIdToken, NestedValueProvider.of(table, new TableRefToJson()), writeDisposition, createDisposition, tempTablesView)) .withSideInputs(tempTablesView)); // Write single partition to final table partitions.get(singlePartitionTag) .apply("SinglePartitionGroupByKey", GroupByKey.>create()) .apply("SinglePartitionWriteTables", ParDo.of(new WriteTables( true, bqServices, jobIdToken, tempFilePrefix, NestedValueProvider.of(table, new TableRefToJson()), jsonSchema, writeDisposition, createDisposition))); return PDone.in(input.getPipeline()); } private class WriteBundles extends DoFn> { private TableRowWriter writer = null; private final String tempFilePrefix; WriteBundles(String tempFilePrefix) { this.tempFilePrefix = tempFilePrefix; } @Override 
public void processElement(ProcessContext c) throws Exception { if (writer == null) { writer = new TableRowWriter(tempFilePrefix); writer.open(UUID.randomUUID().toString()); LOG.debug("Done opening writer {}", writer); } try { writer.write(c.element()); } catch (Exception e) { // Discard write result and close the write. try { writer.close(); // The writer does not need to be reset, as this OldDoFn cannot be reused. } catch (Exception closeException) { // Do not mask the exception that caused the write to fail. e.addSuppressed(closeException); } throw e; } } @Override public void finishBundle(Context c) throws Exception { if (writer != null) { c.output(writer.close()); writer = null; } } @Override public void populateDisplayData(DisplayData.Builder builder) { super.populateDisplayData(builder); builder .addIfNotNull(DisplayData.item("tempFilePrefix", tempFilePrefix) .withLabel("Temporary File Prefix")); } } @Override protected Coder getDefaultOutputCoder() { return VoidCoder.of(); } @Override public void populateDisplayData(DisplayData.Builder builder) { super.populateDisplayData(builder); builder .addIfNotNull(DisplayData.item("table", jsonTableRef) .withLabel("Table Reference")) .addIfNotNull(DisplayData.item("schema", jsonSchema) .withLabel("Table Schema")); if (tableRefFunction != null) { builder.add(DisplayData.item("tableFn", tableRefFunction.getClass()) .withLabel("Table Reference Function")); } builder .add(DisplayData.item("createDisposition", createDisposition.toString()) .withLabel("Table CreateDisposition")) .add(DisplayData.item("writeDisposition", writeDisposition.toString()) .withLabel("Table WriteDisposition")) .addIfNotDefault(DisplayData.item("validation", validate) .withLabel("Validation Enabled"), true); } /** Returns the create disposition. */ public CreateDisposition getCreateDisposition() { return createDisposition; } /** Returns the write disposition. */ public WriteDisposition getWriteDisposition() { return writeDisposition; } /** Returns the table schema. */ public TableSchema getSchema() { return fromJsonString( jsonSchema == null ? null : jsonSchema.get(), TableSchema.class); } /** * Returns the table to write, or {@code null} if writing with {@code tableRefFunction}. * *

If the table's project is not specified, use the executing project. */ @Nullable private ValueProvider getTableWithDefaultProject( BigQueryOptions bqOptions) { ValueProvider table = getTable(); if (table == null) { return table; } if (!table.isAccessible()) { LOG.info("Using a dynamic value for table input. This must contain a project" + " in the table reference: {}", table); return table; } if (Strings.isNullOrEmpty(table.get().getProjectId())) { // If user does not specify a project we assume the table to be located in // the default project. TableReference tableRef = table.get(); tableRef.setProjectId(bqOptions.getProject()); return NestedValueProvider.of(StaticValueProvider.of( toJsonString(tableRef)), new JsonTableRefToTableRef()); } return table; } /** Returns the table reference, or {@code null}. */ @Nullable public ValueProvider getTable() { return jsonTableRef == null ? null : NestedValueProvider.of(jsonTableRef, new JsonTableRefToTableRef()); } /** Returns {@code true} if table validation is enabled. */ public boolean getValidate() { return validate; } private BigQueryServices getBigQueryServices() { if (bigQueryServices == null) { bigQueryServices = new BigQueryServicesImpl(); } return bigQueryServices; } } static class TableRowWriter { private static final Coder CODER = TableRowJsonCoder.of(); private static final byte[] NEWLINE = "\n".getBytes(StandardCharsets.UTF_8); private final String tempFilePrefix; private String id; private String fileName; private WritableByteChannel channel; protected String mimeType = MimeTypes.TEXT; private CountingOutputStream out; TableRowWriter(String basename) { this.tempFilePrefix = basename; } public final void open(String uId) throws Exception { id = uId; fileName = tempFilePrefix + id; LOG.debug("Opening {}.", fileName); channel = IOChannelUtils.create(fileName, mimeType); try { out = new CountingOutputStream(Channels.newOutputStream(channel)); LOG.debug("Writing header to {}.", fileName); } catch (Exception e) { try { LOG.error("Writing header to {} failed, closing channel.", fileName); channel.close(); } catch (IOException closeException) { LOG.error("Closing channel for {} failed", fileName); } throw e; } LOG.debug("Starting write of bundle {} to {}.", this.id, fileName); } public void write(TableRow value) throws Exception { CODER.encode(value, out, Context.OUTER); out.write(NEWLINE); } public final KV close() throws IOException { channel.close(); return KV.of(fileName, out.getCount()); } } /** * Partitions temporary files based on number of files and file sizes. 
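   *
   * <p>A rough sketch of the behaviour below: a new partition is started whenever adding the
   * next file would push a partition past {@code Bound.MAX_NUM_FILES} (10,000 files) or
   * {@code Bound.MAX_SIZE_BYTES} (11 TiB), so, for example, 25,000 small temp files end up in
   * three partitions. If everything fits in a single partition it is tagged for one load
   * straight into the destination table; otherwise each partition is loaded into its own
   * temporary table and later merged by {@link WriteRename}.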
*/ static class WritePartition extends DoFn>> { private final PCollectionView>> resultsView; private TupleTag>> multiPartitionsTag; private TupleTag>> singlePartitionTag; public WritePartition( PCollectionView>> resultsView, TupleTag>> multiPartitionsTag, TupleTag>> singlePartitionTag) { this.resultsView = resultsView; this.multiPartitionsTag = multiPartitionsTag; this.singlePartitionTag = singlePartitionTag; } @Override public void processElement(ProcessContext c) throws Exception { List> results = Lists.newArrayList(c.sideInput(resultsView)); if (results.isEmpty()) { TableRowWriter writer = new TableRowWriter(c.element()); writer.open(UUID.randomUUID().toString()); results.add(writer.close()); } long partitionId = 0; int currNumFiles = 0; long currSizeBytes = 0; List currResults = Lists.newArrayList(); for (int i = 0; i < results.size(); ++i) { KV fileResult = results.get(i); if (currNumFiles + 1 > Bound.MAX_NUM_FILES || currSizeBytes + fileResult.getValue() > Bound.MAX_SIZE_BYTES) { c.sideOutput(multiPartitionsTag, KV.of(++partitionId, currResults)); currResults = Lists.newArrayList(); currNumFiles = 0; currSizeBytes = 0; } ++currNumFiles; currSizeBytes += fileResult.getValue(); currResults.add(fileResult.getKey()); } if (partitionId == 0) { c.sideOutput(singlePartitionTag, KV.of(++partitionId, currResults)); } else { c.sideOutput(multiPartitionsTag, KV.of(++partitionId, currResults)); } } @Override public void populateDisplayData(DisplayData.Builder builder) { super.populateDisplayData(builder); } } /** * Writes partitions to BigQuery tables. */ static class WriteTables extends DoFn>>, String> { private final boolean singlePartition; private final BigQueryServices bqServices; private final String jobIdToken; private final String tempFilePrefix; private final ValueProvider jsonTableRef; private final ValueProvider jsonSchema; private final WriteDisposition writeDisposition; private final CreateDisposition createDisposition; public WriteTables( boolean singlePartition, BigQueryServices bqServices, String jobIdToken, String tempFilePrefix, ValueProvider jsonTableRef, ValueProvider jsonSchema, WriteDisposition writeDisposition, CreateDisposition createDisposition) { this.singlePartition = singlePartition; this.bqServices = bqServices; this.jobIdToken = jobIdToken; this.tempFilePrefix = tempFilePrefix; this.jsonTableRef = jsonTableRef; this.jsonSchema = jsonSchema; this.writeDisposition = writeDisposition; this.createDisposition = createDisposition; } @Override public void processElement(ProcessContext c) throws Exception { List partition = Lists.newArrayList(c.element().getValue()).get(0); String jobIdPrefix = String.format(jobIdToken + "_%05d", c.element().getKey()); TableReference ref = fromJsonString(jsonTableRef.get(), TableReference.class); if (!singlePartition) { ref.setTableId(jobIdPrefix); } load( bqServices.getJobService(c.getPipelineOptions().as(BigQueryOptions.class)), jobIdPrefix, ref, fromJsonString( jsonSchema == null ? 
null : jsonSchema.get(), TableSchema.class), partition, writeDisposition, createDisposition); c.output(toJsonString(ref)); removeTemporaryFiles(c.getPipelineOptions(), tempFilePrefix, partition); } private void load( JobService jobService, String jobIdPrefix, TableReference ref, @Nullable TableSchema schema, List gcsUris, WriteDisposition writeDisposition, CreateDisposition createDisposition) throws InterruptedException, IOException { JobConfigurationLoad loadConfig = new JobConfigurationLoad() .setDestinationTable(ref) .setSchema(schema) .setSourceUris(gcsUris) .setWriteDisposition(writeDisposition.name()) .setCreateDisposition(createDisposition.name()) .setSourceFormat("NEWLINE_DELIMITED_JSON"); String projectId = ref.getProjectId(); for (int i = 0; i < Bound.MAX_RETRY_JOBS; ++i) { String jobId = jobIdPrefix + "-" + i; LOG.info("Starting BigQuery load job {}: try {}/{}", jobId, i, Bound.MAX_RETRY_JOBS); JobReference jobRef = new JobReference() .setProjectId(projectId) .setJobId(jobId); jobService.startLoadJob(jobRef, loadConfig); Status jobStatus = parseStatus(jobService.pollJob(jobRef, Bound.LOAD_JOB_POLL_MAX_RETRIES)); switch (jobStatus) { case SUCCEEDED: return; case UNKNOWN: throw new RuntimeException("Failed to poll the load job status of job " + jobId); case FAILED: LOG.info("BigQuery load job failed: {}", jobId); continue; default: throw new IllegalStateException(String.format("Unexpected job status: %s of job %s", jobStatus, jobId)); } } throw new RuntimeException(String.format("Failed to create the load job %s, reached max " + "retries: %d", jobIdPrefix, Bound.MAX_RETRY_JOBS)); } static void removeTemporaryFiles( PipelineOptions options, String tempFilePrefix, Collection files) throws IOException { IOChannelFactory factory = IOChannelUtils.getFactory(tempFilePrefix); if (factory instanceof GcsIOChannelFactory) { GcsUtil gcsUtil = new GcsUtil.GcsUtilFactory().create(options); gcsUtil.remove(files); } else if (factory instanceof FileIOChannelFactory) { for (String filename : files) { LOG.debug("Removing file {}", filename); boolean exists = Files.deleteIfExists(Paths.get(filename)); if (!exists) { LOG.debug("{} does not exist.", filename); } } } else { throw new IOException("Unrecognized file system."); } } @Override public void populateDisplayData(DisplayData.Builder builder) { super.populateDisplayData(builder); builder .addIfNotNull(DisplayData.item("jobIdToken", jobIdToken) .withLabel("Job ID Token")) .addIfNotNull(DisplayData.item("tempFilePrefix", tempFilePrefix) .withLabel("Temporary File Prefix")) .addIfNotNull(DisplayData.item("jsonTableRef", jsonTableRef) .withLabel("Table Reference")) .addIfNotNull(DisplayData.item("jsonSchema", jsonSchema) .withLabel("Table Schema")); } } /** * Copies temporary tables to destination table. 
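   *
   * <p>In the multi-partition case, this step issues a single BigQuery copy job that folds all
   * of the temporary tables into the destination using the caller's {@link WriteDisposition}
   * and {@link CreateDisposition}, retrying up to {@code Bound.MAX_RETRY_JOBS} times, and then
   * deletes the temporary tables.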
*/ static class WriteRename extends DoFn { private final BigQueryServices bqServices; private final String jobIdToken; private final ValueProvider jsonTableRef; private final WriteDisposition writeDisposition; private final CreateDisposition createDisposition; private final PCollectionView> tempTablesView; public WriteRename( BigQueryServices bqServices, String jobIdToken, ValueProvider jsonTableRef, WriteDisposition writeDisposition, CreateDisposition createDisposition, PCollectionView> tempTablesView) { this.bqServices = bqServices; this.jobIdToken = jobIdToken; this.jsonTableRef = jsonTableRef; this.writeDisposition = writeDisposition; this.createDisposition = createDisposition; this.tempTablesView = tempTablesView; } @Override public void processElement(ProcessContext c) throws Exception { List tempTablesJson = Lists.newArrayList(c.sideInput(tempTablesView)); // Do not copy if no temp tables are provided if (tempTablesJson.size() == 0) { return; } List tempTables = Lists.newArrayList(); for (String table : tempTablesJson) { tempTables.add(fromJsonString(table, TableReference.class)); } copy( bqServices.getJobService(c.getPipelineOptions().as(BigQueryOptions.class)), jobIdToken, fromJsonString(jsonTableRef.get(), TableReference.class), tempTables, writeDisposition, createDisposition); DatasetService tableService = bqServices.getDatasetService(c.getPipelineOptions().as(BigQueryOptions.class)); removeTemporaryTables(tableService, tempTables); } private void copy( JobService jobService, String jobIdPrefix, TableReference ref, List tempTables, WriteDisposition writeDisposition, CreateDisposition createDisposition) throws InterruptedException, IOException { JobConfigurationTableCopy copyConfig = new JobConfigurationTableCopy() .setSourceTables(tempTables) .setDestinationTable(ref) .setWriteDisposition(writeDisposition.name()) .setCreateDisposition(createDisposition.name()); String projectId = ref.getProjectId(); for (int i = 0; i < Bound.MAX_RETRY_JOBS; ++i) { String jobId = jobIdPrefix + "-" + i; LOG.info("Starting BigQuery copy job {}: try {}/{}", jobId, i, Bound.MAX_RETRY_JOBS); JobReference jobRef = new JobReference() .setProjectId(projectId) .setJobId(jobId); jobService.startCopyJob(jobRef, copyConfig); Status jobStatus = parseStatus(jobService.pollJob(jobRef, Bound.LOAD_JOB_POLL_MAX_RETRIES)); switch (jobStatus) { case SUCCEEDED: return; case UNKNOWN: throw new RuntimeException("Failed to poll the copy job status of job " + jobId); case FAILED: LOG.info("BigQuery copy job failed: {}", jobId); continue; default: throw new IllegalStateException(String.format("Unexpected job status: %s of job %s", jobStatus, jobId)); } } throw new RuntimeException(String.format("Failed to create the copy job %s, reached max " + "retries: %d", jobIdPrefix, Bound.MAX_RETRY_JOBS)); } static void removeTemporaryTables(DatasetService tableService, List tempTables) throws Exception { for (TableReference tableRef : tempTables) { try { LOG.debug("Deleting table {}", toJsonString(tableRef)); tableService.deleteTable( tableRef.getProjectId(), tableRef.getDatasetId(), tableRef.getTableId()); } catch (Exception e) { LOG.warn("Failed to delete the table {}", toJsonString(tableRef), e); } } } @Override public void populateDisplayData(DisplayData.Builder builder) { super.populateDisplayData(builder); builder .addIfNotNull(DisplayData.item("jobIdToken", jobIdToken) .withLabel("Job ID Token")) .addIfNotNull(DisplayData.item("jsonTableRef", jsonTableRef) .withLabel("Table Reference")) 
.add(DisplayData.item("writeDisposition", writeDisposition.toString()) .withLabel("Write Disposition")) .add(DisplayData.item("createDisposition", createDisposition.toString()) .withLabel("Create Disposition")); } } /** Disallow construction of utility class. */ private Write() {} } private static void verifyDatasetPresence(BigQueryOptions options, TableReference table) { String resourceNotFoundMsg = String.format(RESOURCE_NOT_FOUND_ERROR, "dataset", BigQueryIO.toTableSpec(table)); try { Bigquery client = Transport.newBigQueryClient(options).build(); BigQueryTableRowIterator.executeWithBackOff( client.datasets().get(table.getProjectId(), table.getDatasetId()), resourceNotFoundMsg); } catch (Exception e) { ApiErrorExtractor errorExtractor = new ApiErrorExtractor(); if ((e instanceof IOException) && errorExtractor.itemNotFound((IOException) e)) { throw new IllegalArgumentException(resourceNotFoundMsg, e); } else { throw new RuntimeException( String.format(UNABLE_TO_CONFIRM_PRESENCE_OF_RESOURCE_ERROR, "dataset", BigQueryIO.toTableSpec(table)), e); } } } private static void verifyTablePresence(BigQueryOptions options, TableReference table) { String resourceNotFoundMsg = String.format(RESOURCE_NOT_FOUND_ERROR, "table", BigQueryIO.toTableSpec(table)); try { Bigquery client = Transport.newBigQueryClient(options).build(); BigQueryTableRowIterator.executeWithBackOff( client.tables().get(table.getProjectId(), table.getDatasetId(), table.getTableId()), resourceNotFoundMsg); } catch (Exception e) { ApiErrorExtractor errorExtractor = new ApiErrorExtractor(); if ((e instanceof IOException) && errorExtractor.itemNotFound((IOException) e)) { throw new IllegalArgumentException(resourceNotFoundMsg, e); } else { throw new RuntimeException( String.format(UNABLE_TO_CONFIRM_PRESENCE_OF_RESOURCE_ERROR, "table", BigQueryIO.toTableSpec(table)), e); } } } ///////////////////////////////////////////////////////////////////////////// /** * Implementation of DoFn to perform streaming BigQuery write. */ @SystemDoFnInternal private static class StreamingWriteFn extends DoFn, TableRowInfo>, Void> { /** TableSchema in JSON. Use String to make the class Serializable. */ private final ValueProvider jsonTableSchema; /** JsonTableRows to accumulate BigQuery rows in order to batch writes. */ private transient Map> tableRows; /** The list of unique ids for each BigQuery table row. */ private transient Map> uniqueIdsForTableRows; /** The list of tables created so far, so we don't try the creation each time. */ private static Set createdTables = Collections.newSetFromMap(new ConcurrentHashMap()); /** Tracks bytes written, exposed as "ByteCount" Counter. */ private Aggregator byteCountAggregator = createAggregator("ByteCount", new Sum.SumLongFn()); /** Constructor. */ StreamingWriteFn(ValueProvider schema) { this.jsonTableSchema = NestedValueProvider.of(schema, new TableSchemaToJsonSchema()); } /** Prepares a target BigQuery table. */ @Override public void startBundle(Context context) { tableRows = new HashMap<>(); uniqueIdsForTableRows = new HashMap<>(); } /** Accumulates the input into JsonTableRows and uniqueIdsForTableRows. 
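   * Rows are grouped per destination table spec, and the matching unique ids are kept in a
   * parallel list so that {@code finishBundle} can hand them to the streaming insert as
   * best-effort de-duplication ids.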
*/ @Override public void processElement(ProcessContext context) { String tableSpec = context.element().getKey().getKey(); List rows = getOrCreateMapListValue(tableRows, tableSpec); List uniqueIds = getOrCreateMapListValue(uniqueIdsForTableRows, tableSpec); rows.add(context.element().getValue().tableRow); uniqueIds.add(context.element().getValue().uniqueId); } /** Writes the accumulated rows into BigQuery with streaming API. */ @Override public void finishBundle(Context context) throws Exception { BigQueryOptions options = context.getPipelineOptions().as(BigQueryOptions.class); Bigquery client = Transport.newBigQueryClient(options).build(); for (String tableSpec : tableRows.keySet()) { TableReference tableReference = getOrCreateTable(options, tableSpec); flushRows(client, tableReference, tableRows.get(tableSpec), uniqueIdsForTableRows.get(tableSpec)); } tableRows.clear(); uniqueIdsForTableRows.clear(); } @Override public void populateDisplayData(DisplayData.Builder builder) { super.populateDisplayData(builder); builder.addIfNotNull(DisplayData.item("schema", jsonTableSchema) .withLabel("Table Schema")); } public TableReference getOrCreateTable(BigQueryOptions options, String tableSpec) throws IOException { TableReference tableReference = parseTableSpec(tableSpec); if (!createdTables.contains(tableSpec)) { synchronized (createdTables) { // Another thread may have succeeded in creating the table in the meanwhile, so // check again. This check isn't needed for correctness, but we add it to prevent // every thread from attempting a create and overwhelming our BigQuery quota. if (!createdTables.contains(tableSpec)) { TableSchema tableSchema = JSON_FACTORY.fromString( jsonTableSchema.get(), TableSchema.class); Bigquery client = Transport.newBigQueryClient(options).build(); BigQueryTableInserter inserter = new BigQueryTableInserter(client); inserter.getOrCreateTable(tableReference, Write.WriteDisposition.WRITE_APPEND, Write.CreateDisposition.CREATE_IF_NEEDED, tableSchema); createdTables.add(tableSpec); } } } return tableReference; } /** Writes the accumulated rows into BigQuery with streaming API. */ private void flushRows(Bigquery client, TableReference tableReference, List tableRows, List uniqueIds) { if (!tableRows.isEmpty()) { try { BigQueryTableInserter inserter = new BigQueryTableInserter(client); inserter.insertAll(tableReference, tableRows, uniqueIds, byteCountAggregator); } catch (IOException e) { throw new RuntimeException(e); } } } } private static class ShardedKey { private final K key; private final int shardNumber; public static ShardedKey of(K key, int shardNumber) { return new ShardedKey(key, shardNumber); } private ShardedKey(K key, int shardNumber) { this.key = key; this.shardNumber = shardNumber; } public K getKey() { return key; } public int getShardNumber() { return shardNumber; } } /** * A {@link Coder} for {@link ShardedKey}, using a wrapped key {@link Coder}. 
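   *
   * <p>On the wire this is simply the wrapped key encoded in a nested context followed by the
   * shard number as a {@link VarIntCoder} value, so the coder is deterministic exactly when the
   * wrapped key coder is.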
*/ private static class ShardedKeyCoder extends StandardCoder> { public static ShardedKeyCoder of(Coder keyCoder) { return new ShardedKeyCoder<>(keyCoder); } @JsonCreator public static ShardedKeyCoder of( @JsonProperty(PropertyNames.COMPONENT_ENCODINGS) List> components) { checkArgument(components.size() == 1, "Expecting 1 component, got %s", components.size()); return of(components.get(0)); } protected ShardedKeyCoder(Coder keyCoder) { this.keyCoder = keyCoder; this.shardNumberCoder = VarIntCoder.of(); } @Override public List> getCoderArguments() { return Arrays.asList(keyCoder); } @Override public void encode(ShardedKey key, OutputStream outStream, Context context) throws IOException { keyCoder.encode(key.getKey(), outStream, context.nested()); shardNumberCoder.encode(key.getShardNumber(), outStream, context); } @Override public ShardedKey decode(InputStream inStream, Context context) throws IOException { return new ShardedKey( keyCoder.decode(inStream, context.nested()), shardNumberCoder.decode(inStream, context)); } @Override public void verifyDeterministic() throws NonDeterministicException { keyCoder.verifyDeterministic(); } Coder keyCoder; VarIntCoder shardNumberCoder; } private static class TableRowInfoCoder extends AtomicCoder { private static final TableRowInfoCoder INSTANCE = new TableRowInfoCoder(); @JsonCreator public static TableRowInfoCoder of() { return INSTANCE; } @Override public void encode(TableRowInfo value, OutputStream outStream, Context context) throws IOException { if (value == null) { throw new CoderException("cannot encode a null value"); } tableRowCoder.encode(value.tableRow, outStream, context.nested()); idCoder.encode(value.uniqueId, outStream, context.nested()); } @Override public TableRowInfo decode(InputStream inStream, Context context) throws IOException { return new TableRowInfo( tableRowCoder.decode(inStream, context.nested()), idCoder.decode(inStream, context.nested())); } @Override public void verifyDeterministic() throws NonDeterministicException { throw new NonDeterministicException(this, "TableRows are not deterministic."); } TableRowJsonCoder tableRowCoder = TableRowJsonCoder.of(); StringUtf8Coder idCoder = StringUtf8Coder.of(); } private static class TableRowInfo { TableRowInfo(TableRow tableRow, String uniqueId) { this.tableRow = tableRow; this.uniqueId = uniqueId; } final TableRow tableRow; final String uniqueId; } ///////////////////////////////////////////////////////////////////////////// /** * Fn that tags each table row with a unique id and destination table. * To avoid calling UUID.randomUUID() for each element, which can be costly, * a randomUUID is generated only once per bucket of data. The actual unique * id is created by concatenating this randomUUID with a sequential number. */ @VisibleForTesting static class TagWithUniqueIdsAndTable extends DoFn, TableRowInfo>> implements DoFn.RequiresWindowAccess { /** TableSpec to write to. */ private final ValueProvider tableSpec; /** User function mapping windows to {@link TableReference} in JSON. 
*/ private final SerializableFunction tableRefFunction; private transient String randomUUID; private transient long sequenceNo = 0L; TagWithUniqueIdsAndTable(BigQueryOptions options, ValueProvider table, SerializableFunction tableRefFunction) { checkArgument(table == null ^ tableRefFunction == null, "Exactly one of table or tableRefFunction should be set"); if (table != null) { if (table.isAccessible() && Strings.isNullOrEmpty(table.get().getProjectId())) { TableReference tableRef = table.get() .setProjectId(options.as(BigQueryOptions.class).getProject()); table = NestedValueProvider.of( StaticValueProvider.of(toJsonString(tableRef)), new JsonTableRefToTableRef()); } this.tableSpec = NestedValueProvider.of(table, new TableRefToTableSpec()); } else { tableSpec = null; } this.tableRefFunction = tableRefFunction; } @Override public void startBundle(Context context) { randomUUID = UUID.randomUUID().toString(); } /** Tag the input with a unique id. */ @Override public void processElement(ProcessContext context) throws IOException { String uniqueId = randomUUID + sequenceNo++; ThreadLocalRandom randomGenerator = ThreadLocalRandom.current(); String tableSpec = tableSpecFromWindow( context.getPipelineOptions().as(BigQueryOptions.class), context.window()); // We output on keys 0-50 to ensure that there's enough batching for // BigQuery. context.output(KV.of(ShardedKey.of(tableSpec, randomGenerator.nextInt(0, 50)), new TableRowInfo(context.element(), uniqueId))); } @Override public void populateDisplayData(DisplayData.Builder builder) { super.populateDisplayData(builder); builder.addIfNotNull(DisplayData.item("table", tableSpec)); if (tableRefFunction != null) { builder.add(DisplayData.item("tableFn", tableRefFunction.getClass()) .withLabel("Table Reference Function")); } } @VisibleForTesting ValueProvider getTableSpec() { return tableSpec; } private String tableSpecFromWindow(BigQueryOptions options, BoundedWindow window) { if (tableSpec != null) { return tableSpec.get(); } else { TableReference table = tableRefFunction.apply(window); if (table.getProjectId() == null) { table.setProjectId(options.getProject()); } return toTableSpec(table); } } } ///////////////////////////////////////////////////////////////////////////// /** * PTransform that performs streaming BigQuery write. To increase consistency, * it leverages BigQuery best effort de-dup mechanism. */ private static class StreamWithDeDup extends PTransform, PDone> { private final transient ValueProvider tableReference; private final SerializableFunction tableRefFunction; private final transient ValueProvider tableSchema; /** Constructor. */ StreamWithDeDup(ValueProvider tableReference, SerializableFunction tableRefFunction, ValueProvider tableSchema) { this.tableReference = tableReference; this.tableRefFunction = tableRefFunction; this.tableSchema = tableSchema; } @Override protected Coder getDefaultOutputCoder() { return VoidCoder.of(); } @Override public PDone apply(PCollection input) { // A naive implementation would be to simply stream data directly to BigQuery. // However, this could occasionally lead to duplicated data, e.g., when // a VM that runs this code is restarted and the code is re-run. // The above risk is mitigated in this implementation by relying on // BigQuery built-in best effort de-dup mechanism. // To use this mechanism, each input TableRow is tagged with a generated // unique id, which is then passed to BigQuery and used to ignore duplicates. 
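      //
      // Concretely (matching the code below): rows are keyed by a ShardedKey of the table spec
      // plus a random shard in [0, 50) so that streaming inserts batch well, the keyed rows are
      // run through Reshuffle to checkpoint the generated ids, and StreamingWriteFn then flushes
      // each table's batch with those ids attached.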
PCollection, TableRowInfo>> tagged = input.apply(ParDo.of( new TagWithUniqueIdsAndTable(input.getPipeline().getOptions().as(BigQueryOptions.class), tableReference, tableRefFunction))); // To prevent having the same TableRow processed more than once with regenerated // different unique ids, this implementation relies on "checkpointing", which is // achieved as a side effect of having StreamingWriteFn immediately follow a GBK, // performed by Reshuffle. tagged .setCoder(KvCoder.of(ShardedKeyCoder.of(StringUtf8Coder.of()), TableRowInfoCoder.of())) .apply(Reshuffle., TableRowInfo>of()) .apply(ParDo.of(new StreamingWriteFn(tableSchema))); // Note that the implementation to return PDone here breaks the // implicit assumption about the job execution order. If a user // implements a PTransform that takes PDone returned here as its // input, the transform may not necessarily be executed after // the BigQueryIO.Write. return PDone.in(input.getPipeline()); } } /** * Status of a BigQuery job or request. */ enum Status { SUCCEEDED, FAILED, UNKNOWN, } private static Status parseStatus(@Nullable Job job) { if (job == null) { return Status.UNKNOWN; } JobStatus status = job.getStatus(); if (status.getErrorResult() != null) { return Status.FAILED; } else if (status.getErrors() != null && !status.getErrors().isEmpty()) { return Status.FAILED; } else { return Status.SUCCEEDED; } } @VisibleForTesting static String toJsonString(Object item) { if (item == null) { return null; } try { return JSON_FACTORY.toString(item); } catch (IOException e) { throw new RuntimeException( String.format("Cannot serialize %s to a JSON string.", item.getClass().getSimpleName()), e); } } @VisibleForTesting static T fromJsonString(String json, Class clazz) { if (json == null) { return null; } try { return JSON_FACTORY.fromString(json, clazz); } catch (IOException e) { throw new RuntimeException( String.format("Cannot deserialize %s from a JSON string: %s.", clazz, json), e); } } /** * Returns a randomUUID string. * *
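   * <p>Used above to build load job id tokens of the form
   * {@code "beam_job_" + randomUUIDString()}.
   *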

{@code '-'} is removed because BigQuery doesn't allow it in dataset id. */ private static String randomUUIDString() { return UUID.randomUUID().toString().replaceAll("-", ""); } ///////////////////////////////////////////////////////////////////////////// /** Disallow construction of utility class. */ private BigQueryIO() {} private static List getOrCreateMapListValue(Map> map, K key) { List value = map.get(key); if (value == null) { value = new ArrayList<>(); map.put(key, value); } return value; } }