/*
* Copyright (C) 2015 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package com.google.cloud.dataflow.sdk.io;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Preconditions.checkState;
import com.google.api.client.json.JsonFactory;
import com.google.api.services.bigquery.Bigquery;
import com.google.api.services.bigquery.model.Job;
import com.google.api.services.bigquery.model.JobConfigurationExtract;
import com.google.api.services.bigquery.model.JobConfigurationLoad;
import com.google.api.services.bigquery.model.JobConfigurationQuery;
import com.google.api.services.bigquery.model.JobConfigurationTableCopy;
import com.google.api.services.bigquery.model.JobReference;
import com.google.api.services.bigquery.model.JobStatistics;
import com.google.api.services.bigquery.model.JobStatus;
import com.google.api.services.bigquery.model.QueryRequest;
import com.google.api.services.bigquery.model.TableReference;
import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.coders.AtomicCoder;
import com.google.cloud.dataflow.sdk.coders.Coder;
import com.google.cloud.dataflow.sdk.coders.Coder.Context;
import com.google.cloud.dataflow.sdk.coders.CoderException;
import com.google.cloud.dataflow.sdk.coders.KvCoder;
import com.google.cloud.dataflow.sdk.coders.StandardCoder;
import com.google.cloud.dataflow.sdk.coders.StringUtf8Coder;
import com.google.cloud.dataflow.sdk.coders.TableRowJsonCoder;
import com.google.cloud.dataflow.sdk.coders.VarIntCoder;
import com.google.cloud.dataflow.sdk.coders.VoidCoder;
import com.google.cloud.dataflow.sdk.options.BigQueryOptions;
import com.google.cloud.dataflow.sdk.options.GcpOptions;
import com.google.cloud.dataflow.sdk.options.PipelineOptions;
import com.google.cloud.dataflow.sdk.options.ValueProvider;
import com.google.cloud.dataflow.sdk.options.ValueProvider.NestedValueProvider;
import com.google.cloud.dataflow.sdk.options.ValueProvider.StaticValueProvider;
import com.google.cloud.dataflow.sdk.runners.PipelineRunner;
import com.google.cloud.dataflow.sdk.transforms.Aggregator;
import com.google.cloud.dataflow.sdk.transforms.Create;
import com.google.cloud.dataflow.sdk.transforms.DoFn;
import com.google.cloud.dataflow.sdk.transforms.GroupByKey;
import com.google.cloud.dataflow.sdk.transforms.PTransform;
import com.google.cloud.dataflow.sdk.transforms.ParDo;
import com.google.cloud.dataflow.sdk.transforms.SerializableFunction;
import com.google.cloud.dataflow.sdk.transforms.Sum;
import com.google.cloud.dataflow.sdk.transforms.View;
import com.google.cloud.dataflow.sdk.transforms.display.DisplayData;
import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
import com.google.cloud.dataflow.sdk.transforms.windowing.DefaultTrigger;
import com.google.cloud.dataflow.sdk.transforms.windowing.GlobalWindows;
import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
import com.google.cloud.dataflow.sdk.util.AvroUtils;
import com.google.cloud.dataflow.sdk.util.BigQueryServices;
import com.google.cloud.dataflow.sdk.util.BigQueryServices.DatasetService;
import com.google.cloud.dataflow.sdk.util.BigQueryServices.JobService;
import com.google.cloud.dataflow.sdk.util.BigQueryServicesImpl;
import com.google.cloud.dataflow.sdk.util.BigQueryTableInserter;
import com.google.cloud.dataflow.sdk.util.BigQueryTableRowIterator;
import com.google.cloud.dataflow.sdk.util.FileIOChannelFactory;
import com.google.cloud.dataflow.sdk.util.GcsIOChannelFactory;
import com.google.cloud.dataflow.sdk.util.GcsUtil;
import com.google.cloud.dataflow.sdk.util.GcsUtil.GcsUtilFactory;
import com.google.cloud.dataflow.sdk.util.IOChannelFactory;
import com.google.cloud.dataflow.sdk.util.IOChannelUtils;
import com.google.cloud.dataflow.sdk.util.MimeTypes;
import com.google.cloud.dataflow.sdk.util.PropertyNames;
import com.google.cloud.dataflow.sdk.util.Reshuffle;
import com.google.cloud.dataflow.sdk.util.SystemDoFnInternal;
import com.google.cloud.dataflow.sdk.util.Transport;
import com.google.cloud.dataflow.sdk.util.gcsfs.GcsPath;
import com.google.cloud.dataflow.sdk.values.KV;
import com.google.cloud.dataflow.sdk.values.PCollection;
import com.google.cloud.dataflow.sdk.values.PCollectionTuple;
import com.google.cloud.dataflow.sdk.values.PCollectionView;
import com.google.cloud.dataflow.sdk.values.PDone;
import com.google.cloud.dataflow.sdk.values.PInput;
import com.google.cloud.dataflow.sdk.values.TupleTag;
import com.google.cloud.dataflow.sdk.values.TupleTagList;
import com.google.cloud.hadoop.util.ApiErrorExtractor;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Function;
import com.google.common.base.MoreObjects;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import com.google.common.io.CountingOutputStream;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import org.apache.avro.generic.GenericRecord;
import org.joda.time.Duration;
import org.joda.time.Instant;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.OutputStream;
import java.io.Serializable;
import java.nio.channels.Channels;
import java.nio.channels.WritableByteChannel;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.atomic.AtomicReference;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.annotation.Nullable;
/**
* {@link PTransform}s for reading and writing
* BigQuery tables.
*
* Table References
*
* A fully-qualified BigQuery table name consists of three components:
*
 * - {@code projectId}: the Cloud project id (defaults to
 *   {@link GcpOptions#getProject()}).
 * - {@code datasetId}: the BigQuery dataset id, unique within a project.
 * - {@code tableId}: a table id, unique within a dataset.
 *
*
* BigQuery table references are stored as a {@link TableReference}, which comes
* from the
* BigQuery Java Client API.
* Tables can be referred to as Strings, with or without the {@code projectId}.
* A helper function is provided ({@link BigQueryIO#parseTableSpec(String)})
* that parses the following string forms into a {@link TableReference}:
*
*
 * - [{@code project_id}]:[{@code dataset_id}].[{@code table_id}]
 * - [{@code dataset_id}].[{@code table_id}]
*
*
* Reading
*
* To read from a BigQuery table, apply a {@link BigQueryIO.Read} transformation.
* This produces a {@link PCollection} of {@link TableRow TableRows} as output:
*
 * {@code
 * PCollection<TableRow> weatherData = pipeline.apply(
* BigQueryIO.Read.named("Read")
* .from("clouddataflow-readonly:samples.weather_stations"));
* }
*
* See {@link TableRow} for more information on the {@link TableRow} object.
*
*
 * Users may provide a query to read from rather than reading all of a BigQuery table. If
* specified, the result obtained by executing the specified query will be used as the data of the
* input transform.
*
*
 * {@code
 * PCollection<TableRow> meanTemperatureData = pipeline.apply(
* BigQueryIO.Read.named("Read")
* .fromQuery("SELECT year, mean_temp FROM [samples.weather_stations]"));
* }
*
* When creating a BigQuery input transform, users should provide either a query or a table.
* Pipeline construction will fail with a validation error if neither or both are specified.
*
*
 * Writing
*
* To write to a BigQuery table, apply a {@link BigQueryIO.Write} transformation.
* This consumes a {@link PCollection} of {@link TableRow TableRows} as input.
*
 * {@code
 * PCollection<TableRow> quotes = ...
 *
 * List<TableFieldSchema> fields = new ArrayList<>();
* fields.add(new TableFieldSchema().setName("source").setType("STRING"));
* fields.add(new TableFieldSchema().setName("quote").setType("STRING"));
* TableSchema schema = new TableSchema().setFields(fields);
*
* quotes.apply(BigQueryIO.Write
* .named("Write")
* .to("my-project:output.output_table")
* .withSchema(schema)
* .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE));
* }
*
* See {@link BigQueryIO.Write} for details on how to specify if a write should
* append to an existing table, replace the table, or verify that the table is
* empty. Note that the dataset being written to must already exist. Write
* dispositions are not supported in streaming mode.
*
*
 * Sharding BigQuery output tables
*
* A common use case is to dynamically generate BigQuery table names based on
* the current window. To support this,
* {@link BigQueryIO.Write#to(SerializableFunction)}
* accepts a function mapping the current window to a tablespec. For example,
* here's code that outputs daily tables to BigQuery:
*
 * {@code
 * PCollection<TableRow> quotes = ...
* quotes.apply(Window.into(CalendarWindows.days(1)))
* .apply(BigQueryIO.Write
* .named("Write")
* .withSchema(schema)
 * .to(new SerializableFunction<BoundedWindow, String>() {
* public String apply(BoundedWindow window) {
* // The cast below is safe because CalendarWindows.days(1) produces IntervalWindows.
* String dayString = DateTimeFormat.forPattern("yyyy_MM_dd")
* .withZone(DateTimeZone.UTC)
* .print(((IntervalWindow) window).start());
* return "my-project:output.output_table_" + dayString;
* }
* }));
* }
*
* Per-window tables are not yet supported in batch mode.
*
*
 * Permissions
*
* Permission requirements depend on the {@link PipelineRunner} that is used to execute the
* Dataflow job. Please refer to the documentation of corresponding {@link PipelineRunner}s for
* more details.
*
*
 * Please see BigQuery Access Control
* for security and permission related information specific to BigQuery.
*/
public class BigQueryIO {
private static final Logger LOG = LoggerFactory.getLogger(BigQueryIO.class);
/**
* Singleton instance of the JSON factory used to read and write JSON
* formatted rows.
*/
private static final JsonFactory JSON_FACTORY = Transport.getJsonFactory();
/**
* Project IDs must contain 6-63 lowercase letters, digits, or dashes.
* IDs must start with a letter and may not end with a dash.
* This regex isn't exact - this allows for patterns that would be rejected by
* the service, but this is sufficient for basic parsing of table references.
*/
private static final String PROJECT_ID_REGEXP = "[a-z][-a-z0-9:.]{4,61}[a-z0-9]";
/**
* Regular expression that matches Dataset IDs.
*/
private static final String DATASET_REGEXP = "[-\\w.]{1,1024}";
/**
* Regular expression that matches Table IDs.
*/
private static final String TABLE_REGEXP = "[-\\w$@]{1,1024}";
/**
* Matches table specifications in the form {@code "[project_id]:[dataset_id].[table_id]"} or
* {@code "[dataset_id].[table_id]"}.
*/
private static final String DATASET_TABLE_REGEXP =
String.format("((?<PROJECT>%s):)?(?<DATASET>%s)\\.(?<TABLE>%s)", PROJECT_ID_REGEXP,
DATASET_REGEXP, TABLE_REGEXP);
private static final Pattern TABLE_SPEC = Pattern.compile(DATASET_TABLE_REGEXP);
@Deprecated // unused.
public static final String SET_PROJECT_FROM_OPTIONS_WARNING =
"No project specified for BigQuery table \"%1$s.%2$s\". Assuming it is in \"%3$s\". If the"
+ " table is in a different project please specify it as a part of the BigQuery table"
+ " definition.";
private static final String RESOURCE_NOT_FOUND_ERROR =
"BigQuery %1$s not found for table \"%2$s\" . Please create the %1$s before pipeline"
+ " execution. If the %1$s is created by an earlier stage of the pipeline, this"
+ " validation can be disabled using #withoutValidation.";
private static final String UNABLE_TO_CONFIRM_PRESENCE_OF_RESOURCE_ERROR =
"Unable to confirm BigQuery %1$s presence for table \"%2$s\". If the %1$s is created by"
+ " an earlier stage of the pipeline, this validation can be disabled using"
+ " #withoutValidation.";
/**
* Parse a table specification in the form
* {@code "[project_id]:[dataset_id].[table_id]"} or {@code "[dataset_id].[table_id]"}.
*
* If the project id is omitted, the default project id is used.
*/
public static TableReference parseTableSpec(String tableSpec) {
Matcher match = TABLE_SPEC.matcher(tableSpec);
if (!match.matches()) {
throw new IllegalArgumentException(
"Table reference is not in [project_id]:[dataset_id].[table_id] "
+ "format: " + tableSpec);
}
TableReference ref = new TableReference();
ref.setProjectId(match.group("PROJECT"));
return ref.setDatasetId(match.group("DATASET")).setTableId(match.group("TABLE"));
}
/**
* Returns a canonical string representation of the {@link TableReference}.
*/
public static String toTableSpec(TableReference ref) {
StringBuilder sb = new StringBuilder();
if (ref.getProjectId() != null) {
sb.append(ref.getProjectId());
sb.append(":");
}
sb.append(ref.getDatasetId()).append('.').append(ref.getTableId());
return sb.toString();
}
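// Illustrative sketch (not part of the original source): round-tripping a table spec
// through the two helpers above. The spec string is made up.
//
//   TableReference ref = BigQueryIO.parseTableSpec("my-project:my_dataset.my_table");
//   // ref.getProjectId() -> "my-project", ref.getDatasetId() -> "my_dataset",
//   // ref.getTableId()   -> "my_table"
//   String spec = BigQueryIO.toTableSpec(ref);  // "my-project:my_dataset.my_table"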
@VisibleForTesting
static class JsonSchemaToTableSchema
implements SerializableFunction<String, TableSchema> {
@Override
public TableSchema apply(String from) {
return fromJsonString(from, TableSchema.class);
}
}
private static class TableSchemaToJsonSchema
implements SerializableFunction<TableSchema, String> {
@Override
public String apply(TableSchema from) {
return toJsonString(from);
}
}
private static class JsonTableRefToTableRef
implements SerializableFunction<String, TableReference> {
@Override
public TableReference apply(String from) {
return fromJsonString(from, TableReference.class);
}
}
private static class TableRefToTableSpec
implements SerializableFunction<TableReference, String> {
@Override
public String apply(TableReference from) {
return toTableSpec(from);
}
}
private static class TableRefToJson
implements SerializableFunction<TableReference, String> {
@Override
public String apply(TableReference from) {
return toJsonString(from);
}
}
private static class TableRefToProjectId
implements SerializableFunction<TableReference, String> {
@Override
public String apply(TableReference from) {
return from.getProjectId();
}
}
@VisibleForTesting
static class TableSpecToTableRef
implements SerializableFunction<String, TableReference> {
@Override
public TableReference apply(String from) {
return parseTableSpec(from);
}
}
@Nullable
private static ValueProvider<String> displayTable(
@Nullable ValueProvider<TableReference> table) {
if (table == null) {
return null;
}
return NestedValueProvider.of(table, new TableRefToTableSpec());
}
/**
* A {@link PTransform} that reads from a BigQuery table and returns a
* {@link PCollection} of {@link TableRow TableRows} containing each of the rows of the table.
*
* Each {@link TableRow} contains values indexed by column name. Here is a
* sample processing function that processes a "line" column from rows:
*
 * {@code
 * static class ExtractWordsFn extends DoFn<TableRow, String> {
* public void processElement(ProcessContext c) {
* // Get the "line" field of the TableRow object, split it into words, and emit them.
* TableRow row = c.element();
* String[] words = row.get("line").toString().split("[^a-zA-Z']+");
* for (String word : words) {
* if (!word.isEmpty()) {
* c.output(word);
* }
* }
* }
* }}
*/
public static class Read {
/**
* Returns a {@link Read.Bound} with the given name. The BigQuery table or query to be read
* from has not yet been configured.
*/
public static Bound named(String name) {
return new Bound().named(name);
}
/**
* Reads a BigQuery table specified as {@code "[project_id]:[dataset_id].[table_id]"} or
* {@code "[dataset_id].[table_id]"} for tables within the current project.
*/
public static Bound from(String tableSpec) {
return new Bound().from(StaticValueProvider.of(tableSpec));
}
/**
* Same as {@code from(String)}, but with a {@link ValueProvider}.
*/
public static Bound from(ValueProvider<String> tableSpec) {
return new Bound().from(tableSpec);
}
/**
* Reads results received after executing the given query.
*/
public static Bound fromQuery(String query) {
return new Bound().fromQuery(StaticValueProvider.of(query));
}
/**
 * Same as {@code fromQuery(String)}, but with a {@link ValueProvider}.
 */
public static Bound fromQuery(ValueProvider<String> query) {
return new Bound().fromQuery(query);
}
/**
* Reads a BigQuery table specified as a {@link TableReference} object.
*/
public static Bound from(TableReference table) {
return new Bound().from(table);
}
/**
* Disables BigQuery table validation, which is enabled by default.
*/
public static Bound withoutValidation() {
return new Bound().withoutValidation();
}
/**
* A {@link PTransform} that reads from a BigQuery table and returns a bounded
* {@link PCollection} of {@link TableRow TableRows}.
*/
public static class Bound extends PTransform<PInput, PCollection<TableRow>> {
@Nullable final ValueProvider<String> jsonTableRef;
@Nullable final ValueProvider<String> query;
final boolean validate;
@Nullable final Boolean flattenResults;
@Nullable final Boolean useLegacySql;
@Nullable BigQueryServices bigQueryServices;
private static final String QUERY_VALIDATION_FAILURE_ERROR =
"Validation of query \"%1$s\" failed. If the query depends on an earlier stage of the"
+ " pipeline, This validation can be disabled using #withoutValidation.";
private Bound() {
this(
null /* name */,
null /* query */,
null /* jsonTableRef */,
true /* validate */,
null /* flattenResults */,
null /* useLegacySql */,
null /* bigQueryServices */);
}
private Bound(
String name, @Nullable ValueProvider<String> query,
@Nullable ValueProvider<String> jsonTableRef, boolean validate,
@Nullable Boolean flattenResults, @Nullable Boolean useLegacySql,
@Nullable BigQueryServices bigQueryServices) {
super(name);
this.jsonTableRef = jsonTableRef;
this.query = query;
this.validate = validate;
this.flattenResults = flattenResults;
this.useLegacySql = useLegacySql;
this.bigQueryServices = bigQueryServices;
}
/**
 * Returns a copy of this transform, but with the specified transform name.
*
* Does not modify this object.
*/
public Bound named(String name) {
return new Bound(
name, query, jsonTableRef, validate, flattenResults, useLegacySql,
bigQueryServices);
}
/**
* Returns a copy of this transform that reads from the specified table. Refer to
* {@link #parseTableSpec(String)} for the specification format.
*
*
 * Does not modify this object.
*/
public Bound from(String tableSpec) {
return from(StaticValueProvider.of(tableSpec));
}
/**
* Returns a copy of this transform that reads from the specified table. Refer to
* {@link #parseTableSpec(String)} for the specification format.
*
*
 * Does not modify this object.
*/
public Bound from(ValueProvider<String> tableSpec) {
return new Bound(
name, query,
NestedValueProvider.of(
NestedValueProvider.of(
tableSpec, new TableSpecToTableRef()),
new TableRefToJson()),
validate, flattenResults, useLegacySql, bigQueryServices);
}
/**
* Returns a copy of this transform that reads from the specified table.
*
* Does not modify this object.
*/
public Bound from(TableReference table) {
return from(StaticValueProvider.of(toTableSpec(table)));
}
/**
* Returns a copy of this transform that reads the results of the specified query.
*
*
 * Does not modify this object.
*
*
 * By default, the query results will be flattened -- see
* "flattenResults" in the
* Jobs documentation for more information. To disable flattening, use
* {@link BigQueryIO.Read.Bound#withoutResultFlattening}.
*
*
 * By default, the query will use BigQuery's legacy SQL dialect. To use the BigQuery
* Standard SQL dialect, use {@link BigQueryIO.Read.Bound#usingStandardSql}.
*/
public Bound fromQuery(String query) {
return fromQuery(StaticValueProvider.of(query));
}
/**
* Like {@link #fromQuery(String)}, but from a {@link ValueProvider}.
*/
public Bound fromQuery(ValueProvider<String> query) {
return new Bound(name, query, jsonTableRef, validate,
MoreObjects.firstNonNull(flattenResults, Boolean.TRUE),
MoreObjects.firstNonNull(useLegacySql, Boolean.TRUE),
bigQueryServices);
}
/**
* Disable table validation.
*/
public Bound withoutValidation() {
return new Bound(
name, query, jsonTableRef, false /* validate */, flattenResults, useLegacySql,
bigQueryServices);
}
/**
 * Disable flattening of query results.
*
* Only valid when a query is used ({@link #fromQuery}). Setting this option when reading
* from a table will cause an error during validation.
*/
public Bound withoutResultFlattening() {
return new Bound(
name, query, jsonTableRef, validate, false /* flattenResults */, useLegacySql,
bigQueryServices);
}
/**
* Enables BigQuery's Standard SQL dialect when reading from a query.
*
*
 * Only valid when a query is used ({@link #fromQuery}). Setting this option when reading
* from a table will cause an error during validation.
*/
public Bound usingStandardSql() {
return new Bound(
name, query, jsonTableRef, validate, flattenResults, false /* useLegacySql */,
bigQueryServices);
}
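// A hedged usage sketch (not part of the original source) showing how the query-related
// options above chain together; the query text and table are illustrative.
//
//   PCollection<TableRow> rows = pipeline.apply(
//       BigQueryIO.Read.named("ReadQuery")
//           .fromQuery("SELECT year, mean_temp FROM `samples.weather_stations`")
//           .usingStandardSql()
//           .withoutValidation());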
@VisibleForTesting
Bound withTestServices(BigQueryServices testServices) {
return new Bound(
name, query, jsonTableRef, validate, flattenResults, useLegacySql, testServices);
}
@Override
public void validate(PInput input) {
if (!validate) {
// Note that a table or query check can fail if the table or dataset are created by
// earlier stages of the pipeline or if a query depends on earlier stages of a pipeline.
// For these cases the withoutValidation method can be used to disable the check.
return;
}
BigQueryOptions bqOptions = input.getPipeline().getOptions().as(BigQueryOptions.class);
String tempLocation = bqOptions.getTempLocation();
checkArgument(
!Strings.isNullOrEmpty(tempLocation),
"BigQueryIO.Read needs a GCS temp location to store temp files.");
if (bigQueryServices == null) {
try {
GcsPath.fromUri(tempLocation);
} catch (IllegalArgumentException e) {
throw new IllegalArgumentException(
String.format(
"BigQuery temp location expected a valid 'gs://' path, but was given '%s'",
tempLocation),
e);
}
}
ValueProvider<TableReference> table = getTableWithDefaultProject(bqOptions);
checkState(
table == null || query == null,
"Invalid BigQueryIO.Read: table reference and query may not both be set");
checkState(
table != null || query != null,
"Invalid BigQueryIO.Read: one of table reference and query must be set");
if (table != null) {
checkState(
flattenResults == null,
"Invalid BigQueryIO.Read: Specifies a table with a result flattening"
+ " preference, which only applies to queries");
checkState(
useLegacySql == null,
"Invalid BigQueryIO.Read: Specifies a table with a SQL dialect"
+ " preference, which only applies to queries");
checkState(table.isAccessible(), "Cannot call validate if table is dynamically set.");
// Check for source table presence for early failure notification.
verifyDatasetPresence(bqOptions, table.get());
verifyTablePresence(bqOptions, table.get());
} else /* query != null */ {
checkState(query.isAccessible(), "Cannot call validate if query is dynamically set.");
checkState(flattenResults != null, "flattenResults should not be null if query is set");
checkState(useLegacySql != null, "useLegacySql should not be null if query is set");
dryRunQuery(bqOptions, query.get(), useLegacySql);
}
}
private static void dryRunQuery(
BigQueryOptions options, String query, boolean useLegacySql) {
Bigquery client = Transport.newBigQueryClient(options).build();
QueryRequest request = new QueryRequest();
request.setQuery(query);
request.setDryRun(true);
request.setUseLegacySql(useLegacySql);
String queryValidationErrorMsg = String.format(QUERY_VALIDATION_FAILURE_ERROR, query);
try {
BigQueryTableRowIterator.executeWithBackOff(
client.jobs().query(options.getProject(), request),
queryValidationErrorMsg);
} catch (Exception e) {
throw new IllegalArgumentException(queryValidationErrorMsg, e);
}
}
@Override
public PCollection<TableRow> apply(PInput input) {
String uuid = randomUUIDString();
final String jobIdToken = "beam_job_" + uuid;
BigQueryOptions bqOptions = input.getPipeline().getOptions().as(BigQueryOptions.class);
BoundedSource<TableRow> source;
final BigQueryServices bqServices = getBigQueryServices();
final String extractDestinationDir;
String tempLocation = bqOptions.getTempLocation();
try {
IOChannelFactory factory = IOChannelUtils.getFactory(tempLocation);
extractDestinationDir = factory.resolve(tempLocation, uuid);
} catch (IOException e) {
throw new RuntimeException(
String.format("Failed to resolve extract destination directory in %s", tempLocation), e);
}
final String executingProject = bqOptions.getProject();
if (query != null && (!query.isAccessible() || !Strings.isNullOrEmpty(query.get()))) {
String queryTempDatasetId = "temp_dataset_" + uuid;
String queryTempTableId = "temp_table_" + uuid;
TableReference queryTempTableRef = new TableReference()
.setProjectId(executingProject)
.setDatasetId(queryTempDatasetId)
.setTableId(queryTempTableId);
String jsonTableRef = toJsonString(queryTempTableRef);
source = BigQueryQuerySource.create(
jobIdToken, query, NestedValueProvider.of(
StaticValueProvider.of(jsonTableRef), new JsonTableRefToTableRef()),
flattenResults, useLegacySql, extractDestinationDir, bqServices);
} else {
ValueProvider<TableReference> inputTable = getTableWithDefaultProject(bqOptions);
source = BigQueryTableSource.create(
jobIdToken, inputTable, extractDestinationDir, bqServices,
StaticValueProvider.of(executingProject));
}
PassThroughThenCleanup.CleanupOperation cleanupOperation =
new PassThroughThenCleanup.CleanupOperation() {
@Override
void cleanup(PipelineOptions options) throws Exception {
BigQueryOptions bqOptions = options.as(BigQueryOptions.class);
JobReference jobRef = new JobReference()
.setProjectId(executingProject)
.setJobId(getExtractJobId(jobIdToken));
Job extractJob = bqServices.getJobService(bqOptions)
.getJob(jobRef);
Collection<String> extractFiles = null;
if (extractJob != null) {
extractFiles = getExtractFilePaths(extractDestinationDir, extractJob);
} else {
IOChannelFactory factory = IOChannelUtils.getFactory(extractDestinationDir);
Collection<String> dirMatch = factory.match(extractDestinationDir);
if (!dirMatch.isEmpty()) {
extractFiles = factory.match(factory.resolve(extractDestinationDir, "*"));
}
}
if (extractFiles != null && !extractFiles.isEmpty()) {
new GcsUtilFactory().create(options).remove(extractFiles);
}
}};
return input.getPipeline()
.apply(com.google.cloud.dataflow.sdk.io.Read.from(source))
.setCoder(getDefaultOutputCoder())
.apply(new PassThroughThenCleanup(cleanupOperation));
}
@Override
protected Coder<TableRow> getDefaultOutputCoder() {
return TableRowJsonCoder.of();
}
@Override
public void populateDisplayData(DisplayData.Builder builder) {
super.populateDisplayData(builder);
builder
.addIfNotNull(DisplayData.item("table", displayTable(getTableProvider()))
.withLabel("Table"))
.addIfNotNull(DisplayData.item("query", query)
.withLabel("Query"))
.addIfNotNull(DisplayData.item("flattenResults", flattenResults)
.withLabel("Flatten Query Results"))
.addIfNotNull(DisplayData.item("useLegacySql", useLegacySql)
.withLabel("Use Legacy SQL Dialect"))
.addIfNotDefault(DisplayData.item("validation", validate)
.withLabel("Validation Enabled"),
true);
}
/**
* Returns the table to read, or {@code null} if reading from a query instead.
*
* If the table's project is not specified, use the executing project.
*/
@Nullable private ValueProvider<TableReference> getTableWithDefaultProject(
BigQueryOptions bqOptions) {
ValueProvider<TableReference> table = getTableProvider();
if (table == null) {
return table;
}
if (!table.isAccessible()) {
LOG.info("Using a dynamic value for table input. This must contain a project"
+ " in the table reference: {}", table);
return table;
}
if (Strings.isNullOrEmpty(table.get().getProjectId())) {
// If user does not specify a project we assume the table to be located in
// the default project.
TableReference tableRef = table.get();
tableRef.setProjectId(bqOptions.getProject());
return NestedValueProvider.of(StaticValueProvider.of(
toJsonString(tableRef)), new JsonTableRefToTableRef());
}
return table;
}
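// Illustrative sketch of the defaulting above (values made up): with --project=my-project,
// a table given without a project is filled in before the export job is created.
//
//   TableReference ref = BigQueryIO.parseTableSpec("samples.weather_stations");
//   // ref.getProjectId() == null, so the read resolves it to
//   // "my-project:samples.weather_stations" using BigQueryOptions.getProject().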
/**
* Returns the table to read, or {@code null} if reading from a query instead.
*/
@Nullable
public ValueProvider<TableReference> getTableProvider() {
return jsonTableRef == null
? null : NestedValueProvider.of(jsonTableRef, new JsonTableRefToTableRef());
}
/**
* Returns the table to read, or {@code null} if reading from a query instead.
*/
@Nullable
public TableReference getTable() {
ValueProvider<TableReference> provider = getTableProvider();
return provider == null ? null : provider.get();
}
/**
* Returns the query to be read, or {@code null} if reading from a table instead.
*/
@Nullable
public String getQuery() {
return query == null ? null : query.get();
}
/**
* Returns the query to be read, or {@code null} if reading from a table instead.
*/
@Nullable
public ValueProvider<String> getQueryProvider() {
return query;
}
/**
* Returns true if table validation is enabled.
*/
public boolean getValidate() {
return validate;
}
/**
* Returns true/false if result flattening is enabled/disabled, or null if not applicable.
*/
public Boolean getFlattenResults() {
return flattenResults;
}
/**
* Returns true (false) if the query will (will not) use BigQuery's legacy SQL mode, or null
* if not applicable.
*/
@Nullable
public Boolean getUseLegacySql() {
return useLegacySql;
}
private BigQueryServices getBigQueryServices() {
if (bigQueryServices == null) {
bigQueryServices = new BigQueryServicesImpl();
}
return bigQueryServices;
}
}
/** Disallow construction of utility class. */
private Read() {}
}
/**
* A {@link PTransform} that invokes {@link CleanupOperation} after the input {@link PCollection}
* has been processed.
*/
@VisibleForTesting
static class PassThroughThenCleanup<T> extends PTransform<PCollection<T>, PCollection<T>> {
private CleanupOperation cleanupOperation;
PassThroughThenCleanup(CleanupOperation cleanupOperation) {
this.cleanupOperation = cleanupOperation;
}
@Override
public PCollection<T> apply(PCollection<T> input) {
TupleTag<T> mainOutput = new TupleTag<>();
TupleTag<Void> cleanupSignal = new TupleTag<>();
PCollectionTuple outputs = input.apply(ParDo.of(new IdentityFn<T>())
.withOutputTags(mainOutput, TupleTagList.of(cleanupSignal)));
PCollectionView<Void> cleanupSignalView = outputs.get(cleanupSignal)
.setCoder(VoidCoder.of())
.apply(View.asSingleton().withDefaultValue(null));
input.getPipeline()
.apply("Create(CleanupOperation)", Create.of(cleanupOperation))
.apply("Cleanup", ParDo.of(
new DoFn<CleanupOperation, Void>() {
@Override
public void processElement(ProcessContext c)
throws Exception {
c.element().cleanup(c.getPipelineOptions());
}
}).withSideInputs(cleanupSignalView));
return outputs.get(mainOutput);
}
private static class IdentityFn<T> extends DoFn<T, T> {
@Override
public void processElement(ProcessContext c) {
c.output(c.element());
}
}
abstract static class CleanupOperation implements Serializable {
abstract void cleanup(PipelineOptions options) throws Exception;
}
}
/**
* A {@link BigQuerySourceBase} for reading BigQuery tables.
*/
@VisibleForTesting
static class BigQueryTableSource extends BigQuerySourceBase {
static BigQueryTableSource create(
String jobIdToken,
ValueProvider<TableReference> table,
String extractDestinationDir,
BigQueryServices bqServices,
ValueProvider<String> executingProject) {
return new BigQueryTableSource(
jobIdToken, table, extractDestinationDir, bqServices, executingProject);
}
private final ValueProvider<String> jsonTable;
private final AtomicReference<Long> tableSizeBytes;
private BigQueryTableSource(
String jobIdToken,
ValueProvider<TableReference> table,
String extractDestinationDir,
BigQueryServices bqServices,
ValueProvider<String> executingProject) {
super(jobIdToken, extractDestinationDir, bqServices, executingProject);
this.jsonTable = NestedValueProvider.of(checkNotNull(table, "table"), new TableRefToJson());
this.tableSizeBytes = new AtomicReference<>();
}
@Override
protected TableReference getTableToExtract(BigQueryOptions bqOptions) throws IOException {
checkState(jsonTable.isAccessible());
return JSON_FACTORY.fromString(jsonTable.get(), TableReference.class);
}
@Override
public BoundedReader<TableRow> createReader(PipelineOptions options) throws IOException {
BigQueryOptions bqOptions = options.as(BigQueryOptions.class);
checkState(jsonTable.isAccessible());
TableReference tableRef = JSON_FACTORY.fromString(jsonTable.get(), TableReference.class);
return new BigQueryReader(this, bqServices.getReaderFromTable(bqOptions, tableRef));
}
@Override
public synchronized long getEstimatedSizeBytes(PipelineOptions options) throws Exception {
if (tableSizeBytes.get() == null) {
TableReference table = JSON_FACTORY.fromString(jsonTable.get(), TableReference.class);
Long numBytes = bqServices.getDatasetService(options.as(BigQueryOptions.class))
.getTable(table.getProjectId(), table.getDatasetId(), table.getTableId())
.getNumBytes();
tableSizeBytes.compareAndSet(null, numBytes);
}
return tableSizeBytes.get();
}
@Override
protected void cleanupTempResource(BigQueryOptions bqOptions) throws Exception {
// Do nothing.
}
@Override
public void populateDisplayData(DisplayData.Builder builder) {
super.populateDisplayData(builder);
builder.add(DisplayData.item("table", jsonTable));
}
}
/**
* A {@link BigQuerySourceBase} for querying BigQuery tables.
*/
@VisibleForTesting
static class BigQueryQuerySource extends BigQuerySourceBase {
static BigQueryQuerySource create(
String jobIdToken,
ValueProvider<String> query,
ValueProvider<TableReference> queryTempTableRef,
Boolean flattenResults,
Boolean useLegacySql,
String extractDestinationDir,
BigQueryServices bqServices) {
return new BigQueryQuerySource(
jobIdToken,
query,
queryTempTableRef,
flattenResults,
useLegacySql,
extractDestinationDir,
bqServices);
}
private final ValueProvider<String> query;
private final ValueProvider<String> jsonQueryTempTable;
private final Boolean flattenResults;
private final Boolean useLegacySql;
private transient AtomicReference<JobStatistics> dryRunJobStats;
private BigQueryQuerySource(
String jobIdToken,
ValueProvider<String> query,
ValueProvider<TableReference> queryTempTableRef,
Boolean flattenResults,
Boolean useLegacySql,
String extractDestinationDir,
BigQueryServices bqServices) {
super(jobIdToken, extractDestinationDir, bqServices,
NestedValueProvider.of(
checkNotNull(queryTempTableRef, "queryTempTableRef"), new TableRefToProjectId()));
this.query = checkNotNull(query, "query");
this.jsonQueryTempTable = NestedValueProvider.of(
queryTempTableRef, new TableRefToJson());
this.flattenResults = checkNotNull(flattenResults, "flattenResults");
this.useLegacySql = checkNotNull(useLegacySql, "useLegacySql");
this.dryRunJobStats = new AtomicReference<>();
}
@Override
public long getEstimatedSizeBytes(PipelineOptions options) throws Exception {
BigQueryOptions bqOptions = options.as(BigQueryOptions.class);
return dryRunQueryIfNeeded(bqOptions).getTotalBytesProcessed();
}
@Override
public BoundedReader<TableRow> createReader(PipelineOptions options) throws IOException {
BigQueryOptions bqOptions = options.as(BigQueryOptions.class);
return new BigQueryReader(this, bqServices.getReaderFromQuery(
bqOptions, query.get(), executingProject.get(), flattenResults, useLegacySql));
}
@Override
protected TableReference getTableToExtract(BigQueryOptions bqOptions)
throws IOException, InterruptedException {
// 1. Find the location of the query.
String location = null;
List<TableReference> referencedTables =
dryRunQueryIfNeeded(bqOptions).getQuery().getReferencedTables();
DatasetService tableService = bqServices.getDatasetService(bqOptions);
if (referencedTables != null && !referencedTables.isEmpty()) {
TableReference queryTable = referencedTables.get(0);
location = tableService.getTable(
queryTable.getProjectId(),
queryTable.getDatasetId(),
queryTable.getTableId()).getLocation();
}
// 2. Create the temporary dataset in the query location.
TableReference tableToExtract =
JSON_FACTORY.fromString(jsonQueryTempTable.get(), TableReference.class);
tableService.createDataset(
tableToExtract.getProjectId(),
tableToExtract.getDatasetId(),
location,
"Dataset for BigQuery query job temporary table");
// 3. Execute the query.
String queryJobId = jobIdToken + "-query";
executeQuery(
executingProject.get(),
queryJobId,
tableToExtract,
bqServices.getJobService(bqOptions));
return tableToExtract;
}
@Override
protected void cleanupTempResource(BigQueryOptions bqOptions) throws Exception {
checkState(jsonQueryTempTable.isAccessible());
TableReference tableToRemove =
JSON_FACTORY.fromString(jsonQueryTempTable.get(), TableReference.class);
DatasetService tableService = bqServices.getDatasetService(bqOptions);
tableService.deleteTable(
tableToRemove.getProjectId(),
tableToRemove.getDatasetId(),
tableToRemove.getTableId());
tableService.deleteDataset(tableToRemove.getProjectId(), tableToRemove.getDatasetId());
}
@Override
public void populateDisplayData(DisplayData.Builder builder) {
super.populateDisplayData(builder);
builder.add(DisplayData.item("query", query));
}
private synchronized JobStatistics dryRunQueryIfNeeded(BigQueryOptions bqOptions)
throws InterruptedException, IOException {
if (dryRunJobStats.get() == null) {
JobStatistics jobStats = bqServices.getJobService(bqOptions).dryRunQuery(
executingProject.get(), createBasicQueryConfig());
dryRunJobStats.compareAndSet(null, jobStats);
}
return dryRunJobStats.get();
}
private void executeQuery(
String executingProject,
String jobId,
TableReference destinationTable,
JobService jobService) throws IOException, InterruptedException {
JobReference jobRef = new JobReference()
.setProjectId(executingProject)
.setJobId(jobId);
JobConfigurationQuery queryConfig = createBasicQueryConfig()
.setAllowLargeResults(true)
.setCreateDisposition("CREATE_IF_NEEDED")
.setDestinationTable(destinationTable)
.setPriority("BATCH")
.setWriteDisposition("WRITE_EMPTY");
jobService.startQueryJob(jobRef, queryConfig);
Job job = jobService.pollJob(jobRef, JOB_POLL_MAX_RETRIES);
if (parseStatus(job) != Status.SUCCEEDED) {
throw new IOException("Query job failed: " + jobId);
}
}
private JobConfigurationQuery createBasicQueryConfig() {
return new JobConfigurationQuery()
.setQuery(query.get())
.setFlattenResults(flattenResults)
.setUseLegacySql(useLegacySql);
}
private void readObject(ObjectInputStream in) throws ClassNotFoundException, IOException {
in.defaultReadObject();
dryRunJobStats = new AtomicReference<>();
}
}
/**
* An abstract {@link BoundedSource} to read a table from BigQuery.
*
* This source uses a BigQuery export job to take a snapshot of the table on GCS, and then
* reads in parallel from each produced file. It is implemented by {@link BigQueryTableSource},
* and {@link BigQueryQuerySource}, depending on the configuration of the read.
* Specifically,
*
* - {@link BigQueryTableSource} is for reading BigQuery tables
* - {@link BigQueryQuerySource} is for querying BigQuery tables
*
* ...
*/
private abstract static class BigQuerySourceBase extends BoundedSource<TableRow> {
// The maximum number of retries to verify temp files.
private static final int MAX_FILES_VERIFY_RETRIES = 9;
// The maximum number of retries to poll a BigQuery job.
protected static final int JOB_POLL_MAX_RETRIES = Integer.MAX_VALUE;
// The initial backoff for verifying temp files.
private static final Duration INITIAL_FILES_VERIFY_BACKOFF = Duration.standardSeconds(1);
protected final String jobIdToken;
protected final String extractDestinationDir;
protected final BigQueryServices bqServices;
protected final ValueProvider<String> executingProject;
private BigQuerySourceBase(
String jobIdToken,
String extractDestinationDir,
BigQueryServices bqServices,
ValueProvider<String> executingProject) {
this.jobIdToken = checkNotNull(jobIdToken, "jobIdToken");
this.extractDestinationDir = checkNotNull(extractDestinationDir, "extractDestinationDir");
this.bqServices = checkNotNull(bqServices, "bqServices");
this.executingProject = checkNotNull(executingProject, "executingProject");
}
@Override
public List<BoundedSource<TableRow>> splitIntoBundles(
long desiredBundleSizeBytes, PipelineOptions options) throws Exception {
BigQueryOptions bqOptions = options.as(BigQueryOptions.class);
TableReference tableToExtract = getTableToExtract(bqOptions);
JobService jobService = bqServices.getJobService(bqOptions);
String extractJobId = getExtractJobId(jobIdToken);
List<String> tempFiles = executeExtract(extractJobId, tableToExtract, jobService);
TableSchema tableSchema = bqServices.getDatasetService(bqOptions).getTable(
tableToExtract.getProjectId(),
tableToExtract.getDatasetId(),
tableToExtract.getTableId()).getSchema();
cleanupTempResource(bqOptions);
return createSources(tempFiles, tableSchema);
}
protected abstract TableReference getTableToExtract(BigQueryOptions bqOptions) throws Exception;
protected abstract void cleanupTempResource(BigQueryOptions bqOptions) throws Exception;
@Override
public boolean producesSortedKeys(PipelineOptions options) throws Exception {
return false;
}
@Override
public void validate() {
// Do nothing, validation is done in BigQuery.Read.
}
@Override
public Coder<TableRow> getDefaultOutputCoder() {
return TableRowJsonCoder.of();
}
private List<String> executeExtract(
String jobId, TableReference table, JobService jobService)
throws InterruptedException, IOException {
JobReference jobRef = new JobReference()
.setProjectId(executingProject.get())
.setJobId(jobId);
String destinationUri = getExtractDestinationUri(extractDestinationDir);
JobConfigurationExtract extract = new JobConfigurationExtract()
.setSourceTable(table)
.setDestinationFormat("AVRO")
.setDestinationUris(ImmutableList.of(destinationUri));
LOG.info("Starting BigQuery extract job: {}", jobId);
jobService.startExtractJob(jobRef, extract);
Job extractJob =
jobService.pollJob(jobRef, JOB_POLL_MAX_RETRIES);
if (parseStatus(extractJob) != Status.SUCCEEDED) {
throw new IOException(String.format(
"Extract job %s failed, status: %s",
extractJob.getJobReference().getJobId(), extractJob.getStatus()));
}
List<String> tempFiles = getExtractFilePaths(extractDestinationDir, extractJob);
return ImmutableList.copyOf(tempFiles);
}
private List<BoundedSource<TableRow>> createSources(
List<String> files, TableSchema tableSchema) throws IOException, InterruptedException {
final String jsonSchema = JSON_FACTORY.toString(tableSchema);
SerializableFunction<GenericRecord, TableRow> function =
new SerializableFunction<GenericRecord, TableRow>() {
@Override
public TableRow apply(GenericRecord input) {
try {
return AvroUtils.convertGenericRecordToTableRow(
input, JSON_FACTORY.fromString(jsonSchema, TableSchema.class));
} catch (IOException e) {
throw new RuntimeException("Failed to convert GenericRecord to TableRow", e);
}
}};
List<BoundedSource<TableRow>> avroSources = Lists.newArrayList();
for (String fileName : files) {
avroSources.add(new TransformingSource<>(
AvroSource.from(fileName), function, getDefaultOutputCoder()));
}
return ImmutableList.copyOf(avroSources);
}
protected static class BigQueryReader extends BoundedSource.BoundedReader<TableRow> {
private final BigQuerySourceBase source;
private final BigQueryServices.BigQueryJsonReader reader;
private BigQueryReader(
BigQuerySourceBase source, BigQueryServices.BigQueryJsonReader reader) {
this.source = source;
this.reader = reader;
}
@Override
public BoundedSource<TableRow> getCurrentSource() {
return source;
}
@Override
public boolean start() throws IOException {
return reader.start();
}
@Override
public boolean advance() throws IOException {
return reader.advance();
}
@Override
public TableRow getCurrent() throws NoSuchElementException {
return reader.getCurrent();
}
@Override
public void close() throws IOException {
reader.close();
}
}
}
/**
 * A {@link BoundedSource} that reads from a {@code BoundedSource<T>}
 * and transforms elements to type {@code V}.
 */
@VisibleForTesting
static class TransformingSource<T, V> extends BoundedSource<V> {
private final BoundedSource<T> boundedSource;
private final SerializableFunction<T, V> function;
private final Coder<V> outputCoder;
TransformingSource(
BoundedSource<T> boundedSource,
SerializableFunction<T, V> function,
Coder<V> outputCoder) {
this.boundedSource = checkNotNull(boundedSource, "boundedSource");
this.function = checkNotNull(function, "function");
this.outputCoder = checkNotNull(outputCoder, "outputCoder");
}
@Override
public List<? extends BoundedSource<V>> splitIntoBundles(
long desiredBundleSizeBytes, PipelineOptions options) throws Exception {
return Lists.transform(
boundedSource.splitIntoBundles(desiredBundleSizeBytes, options),
new Function<BoundedSource<T>, BoundedSource<V>>() {
@Override
public BoundedSource<V> apply(BoundedSource<T> input) {
return new TransformingSource<>(input, function, outputCoder);
}
});
}
@Override
public long getEstimatedSizeBytes(PipelineOptions options) throws Exception {
return boundedSource.getEstimatedSizeBytes(options);
}
@Override
public boolean producesSortedKeys(PipelineOptions options) throws Exception {
return boundedSource.producesSortedKeys(options);
}
@Override
public BoundedReader<V> createReader(PipelineOptions options) throws IOException {
return new TransformingReader(boundedSource.createReader(options));
}
@Override
public void validate() {
boundedSource.validate();
}
@Override
public Coder<V> getDefaultOutputCoder() {
return outputCoder;
}
private class TransformingReader extends BoundedReader<V> {
private final BoundedReader<T> boundedReader;
private TransformingReader(BoundedReader<T> boundedReader) {
this.boundedReader = checkNotNull(boundedReader, "boundedReader");
}
@Override
public synchronized BoundedSource<V> getCurrentSource() {
return new TransformingSource<>(boundedReader.getCurrentSource(), function, outputCoder);
}
@Override
public boolean start() throws IOException {
return boundedReader.start();
}
@Override
public boolean advance() throws IOException {
return boundedReader.advance();
}
@Override
public V getCurrent() throws NoSuchElementException {
T current = boundedReader.getCurrent();
return function.apply(current);
}
@Override
public void close() throws IOException {
boundedReader.close();
}
@Override
public synchronized BoundedSource<V> splitAtFraction(double fraction) {
BoundedSource<T> split = boundedReader.splitAtFraction(fraction);
return split == null ? null : new TransformingSource<>(split, function, outputCoder);
}
@Override
public Double getFractionConsumed() {
return boundedReader.getFractionConsumed();
}
@Override
public Instant getCurrentTimestamp() throws NoSuchElementException {
return boundedReader.getCurrentTimestamp();
}
}
}
private static String getExtractJobId(String jobIdToken) {
return jobIdToken + "-extract";
}
private static String getExtractDestinationUri(String extractDestinationDir) {
return String.format("%s/%s", extractDestinationDir, "*.avro");
}
private static List<String> getExtractFilePaths(String extractDestinationDir, Job extractJob)
throws IOException {
JobStatistics jobStats = extractJob.getStatistics();
List<Long> counts = jobStats.getExtract().getDestinationUriFileCounts();
if (counts.size() != 1) {
String errorMessage = (counts.size() == 0
? "No destination uri file count received."
: String.format("More than one destination uri file count received. First two are %s, %s",
counts.get(0), counts.get(1)));
throw new RuntimeException(errorMessage);
}
long filesCount = counts.get(0);
ImmutableList.Builder<String> paths = ImmutableList.builder();
IOChannelFactory factory = IOChannelUtils.getFactory(extractDestinationDir);
for (long i = 0; i < filesCount; ++i) {
String filePath =
factory.resolve(extractDestinationDir, String.format("%012d%s", i, ".avro"));
paths.add(filePath);
}
return paths.build();
}
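// Worked example (illustrative paths) of how the helpers above fit together: for an
// extract destination directory of gs://my-bucket/tmp/<uuid>, the extract job writes to
//   gs://my-bucket/tmp/<uuid>/*.avro
// and, if the job reports a destinationUriFileCount of 3, getExtractFilePaths returns
//   gs://my-bucket/tmp/<uuid>/000000000000.avro
//   gs://my-bucket/tmp/<uuid>/000000000001.avro
//   gs://my-bucket/tmp/<uuid>/000000000002.avro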
/////////////////////////////////////////////////////////////////////////////
/**
* A {@link PTransform} that writes a {@link PCollection} containing {@link TableRow TableRows}
* to a BigQuery table.
*
 * In BigQuery, each table has an enclosing dataset. The dataset being written must already
* exist.
*
*
 * By default, tables will be created if they do not exist, which corresponds to a
* {@link CreateDisposition#CREATE_IF_NEEDED} disposition that matches the default of BigQuery's
* Jobs API. A schema must be provided (via {@link BigQueryIO.Write#withSchema(TableSchema)}),
* or else the transform may fail at runtime with an {@link IllegalArgumentException}.
*
*
 * By default, writes require an empty table, which corresponds to
* a {@link WriteDisposition#WRITE_EMPTY} disposition that matches the
* default of BigQuery's Jobs API.
*
*
 * Here is a sample transform that produces TableRow values containing
* "word" and "count" columns:
*
 * {@code
 * static class FormatCountsFn extends DoFn<KV<String, Long>, TableRow> {
* public void processElement(ProcessContext c) {
* TableRow row = new TableRow()
* .set("word", c.element().getKey())
* .set("count", c.element().getValue().intValue());
* c.output(row);
* }
* }}
*/
public static class Write {
/**
* An enumeration type for the BigQuery create disposition strings.
*
* @see
* configuration.query.createDisposition
 * in the BigQuery Jobs API
*/
public enum CreateDisposition {
/**
 * Specifies that tables should not be created.
*
* If the output table does not exist, the write fails.
*/
CREATE_NEVER,
/**
* Specifies that tables should be created if needed. This is the default
* behavior.
*
*
 * Requires that a table schema is provided via {@link BigQueryIO.Write#withSchema}.
* This precondition is checked before starting a job. The schema is
* not required to match an existing table's schema.
*
*
 * When this transformation is executed, if the output table does not
* exist, the table is created from the provided schema. Note that even if
* the table exists, it may be recreated if necessary when paired with a
* {@link WriteDisposition#WRITE_TRUNCATE}.
*/
CREATE_IF_NEEDED
}
/**
* An enumeration type for the BigQuery write disposition strings.
*
* @see
* configuration.query.writeDisposition
 * in the BigQuery Jobs API
*/
public enum WriteDisposition {
/**
* Specifies that write should replace a table.
*
*
 * The replacement may occur in multiple steps - for instance by first
* removing the existing table, then creating a replacement, then filling
* it in. This is not an atomic operation, and external programs may
* see the table in any of these intermediate steps.
*/
WRITE_TRUNCATE,
/**
* Specifies that rows may be appended to an existing table.
*/
WRITE_APPEND,
/**
* Specifies that the output table must be empty. This is the default
* behavior.
*
*
 * If the output table is not empty, the write fails at runtime.
*
*
 * This check may occur long before data is written, and does not
* guarantee exclusive access to the table. If two programs are run
* concurrently, each specifying the same output table and
* a {@link WriteDisposition} of {@link WriteDisposition#WRITE_EMPTY}, it is possible
* for both to succeed.
*/
WRITE_EMPTY
}
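// A hedged usage sketch (not part of the original source) combining the two dispositions
// above; the table name and schema are illustrative.
//
//   quotes.apply(BigQueryIO.Write
//       .named("AppendQuotes")
//       .to("my-project:output.output_table")
//       .withSchema(schema)
//       .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
//       .withWriteDisposition(WriteDisposition.WRITE_APPEND));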
/**
* Creates a write transformation with the given transform name. The BigQuery table to be
* written has not yet been configured.
*/
public static Bound named(String name) {
return new Bound().named(name);
}
/**
* Creates a write transformation for the given table specification.
*
*
 * Refer to {@link #parseTableSpec(String)} for the specification format.
*/
public static Bound to(String tableSpec) {
return new Bound().to(tableSpec);
}
/** Creates a write transformation for the given table. */
public static Bound to(ValueProvider<String> tableSpec) {
return new Bound().to(tableSpec);
}
/** Creates a write transformation for the given table. */
public static Bound to(TableReference table) {
return new Bound().to(table);
}
/**
* Creates a write transformation from a function that maps windows to table specifications.
* Each time a new window is encountered, this function will be called and the resulting table
* will be created. Records within that window will be written to the associated table.
*
* See {@link #parseTableSpec(String)} for the format that {@code tableSpecFunction} should
* return.
*
*
 * {@code tableSpecFunction} should be deterministic. When given the same window, it should
* always return the same table specification.
*/
public static Bound to(SerializableFunction<BoundedWindow, String> tableSpecFunction) {
return new Bound().to(tableSpecFunction);
}
/**
* Creates a write transformation from a function that maps windows to {@link TableReference}
* objects.
*
* {@code tableRefFunction} should be deterministic. When given the same window, it should
* always return the same table reference.
*/
public static Bound toTableReference(
SerializableFunction<BoundedWindow, TableReference> tableRefFunction) {
return new Bound().toTableReference(tableRefFunction);
}
/**
* Creates a write transformation with the specified schema to use in table creation.
*
* The schema is required only if writing to a table that does not already
* exist, and {@link CreateDisposition} is set to
* {@link CreateDisposition#CREATE_IF_NEEDED}.
*/
public static Bound withSchema(TableSchema schema) {
return new Bound().withSchema(schema);
}
/**
* Like {@link #withSchema(TableSchema)}, but with a {@link ValueProvider}.
*/
public static Bound withSchema(ValueProvider<TableSchema> schema) {
return new Bound().withSchema(schema);
}
/** Creates a write transformation with the specified options for creating the table. */
public static Bound withCreateDisposition(CreateDisposition disposition) {
return new Bound().withCreateDisposition(disposition);
}
/** Creates a write transformation with the specified options for writing to the table. */
public static Bound withWriteDisposition(WriteDisposition disposition) {
return new Bound().withWriteDisposition(disposition);
}
/**
* Creates a write transformation with BigQuery table validation disabled.
*/
public static Bound withoutValidation() {
return new Bound().withoutValidation();
}
/**
* A {@link PTransform} that can write either a bounded or unbounded
* {@link PCollection} of {@link TableRow TableRows} to a BigQuery table.
*/
public static class Bound extends PTransform<PCollection<TableRow>, PDone> {
// Maximum number of files in a single partition.
static final int MAX_NUM_FILES = 10000;
// Maximum number of bytes in a single partition -- 11 TiB just under BQ's 12 TiB limit.
static final long MAX_SIZE_BYTES = 11 * (1L << 40);
// The maximum number of retry jobs.
static final int MAX_RETRY_JOBS = 3;
// The maximum number of retries to poll the status of a job.
// It is set to {@code Integer.MAX_VALUE} to block until the BigQuery job finishes.
static final int LOAD_JOB_POLL_MAX_RETRIES = Integer.MAX_VALUE;
@Nullable final ValueProvider<String> jsonTableRef;
@Nullable final SerializableFunction<BoundedWindow, TableReference> tableRefFunction;
// Table schema. The schema is required only if the table does not exist.
@Nullable final ValueProvider<String> jsonSchema;
// Options for creating the table. Valid values are CREATE_IF_NEEDED and
// CREATE_NEVER.
final CreateDisposition createDisposition;
// Options for writing to the table. Valid values are WRITE_TRUNCATE,
// WRITE_APPEND and WRITE_EMPTY.
final WriteDisposition writeDisposition;
// An option to indicate if table validation is desired. Default is true.
final boolean validate;
@Nullable private BigQueryServices bigQueryServices;
private static class TranslateTableSpecFunction implements
SerializableFunction<BoundedWindow, TableReference> {
private SerializableFunction<BoundedWindow, String> tableSpecFunction;
TranslateTableSpecFunction(SerializableFunction<BoundedWindow, String> tableSpecFunction) {
this.tableSpecFunction = tableSpecFunction;
}
@Override
public TableReference apply(BoundedWindow value) {
return parseTableSpec(tableSpecFunction.apply(value));
}
}
/**
* @deprecated Should be private. Instead, use one of the factory methods in
* {@link BigQueryIO.Write}, such as {@link BigQueryIO.Write#to(String)}, to create an
* instance of this class.
*/
@Deprecated
public Bound() {
this(
null /* name */,
null /* jsonTableRef */,
null /* tableRefFunction */,
null /* jsonSchema */,
CreateDisposition.CREATE_IF_NEEDED,
WriteDisposition.WRITE_EMPTY,
true /* validate */,
null /* bigQueryServices */);
}
private Bound(String name, @Nullable ValueProvider<String> jsonTableRef,
@Nullable SerializableFunction<BoundedWindow, TableReference> tableRefFunction,
@Nullable ValueProvider<String> jsonSchema,
CreateDisposition createDisposition, WriteDisposition writeDisposition, boolean validate,
@Nullable BigQueryServices bigQueryServices) {
super(name);
this.jsonTableRef = jsonTableRef;
this.tableRefFunction = tableRefFunction;
this.jsonSchema = jsonSchema;
this.createDisposition = checkNotNull(createDisposition, "createDisposition");
this.writeDisposition = checkNotNull(writeDisposition, "writeDisposition");
this.validate = validate;
this.bigQueryServices = bigQueryServices;
}
/**
* Returns a copy of this write transformation, but with the specified transform name.
*
* Does not modify this object.
*/
public Bound named(String name) {
return new Bound(name, jsonTableRef, tableRefFunction, jsonSchema, createDisposition,
writeDisposition, validate, bigQueryServices);
}
/**
* Returns a copy of this write transformation, but writing to the specified table. Refer to
* {@link #parseTableSpec(String)} for the specification format.
*
* Does not modify this object.
*/
public Bound to(String tableSpec) {
return toTableRef(NestedValueProvider.of(
StaticValueProvider.of(tableSpec), new TableSpecToTableRef()));
}
/**
* Returns a copy of this write transformation, but writing to the specified table.
*
* Does not modify this object.
*/
public Bound to(TableReference table) {
return to(StaticValueProvider.of(toTableSpec(table)));
}
/**
* Returns a copy of this write transformation, but using the specified function to determine
* which table to write to for each window.
*
* Does not modify this object.
*
* {@code tableSpecFunction} should be deterministic. When given the same window, it
* should always return the same table specification.
*/
public Bound to(
SerializableFunction<BoundedWindow, String> tableSpecFunction) {
return toTableReference(new TranslateTableSpecFunction(tableSpecFunction));
}
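// Illustrative only (not in the original source): a per-window table function
// passed to the overload above, chained onto a Bound write. The project,
// dataset, and table prefix are hypothetical; DateTimeFormat is org.joda.time.
//
//   .to(new SerializableFunction<BoundedWindow, String>() {
//     @Override
//     public String apply(BoundedWindow window) {
//       String day = DateTimeFormat.forPattern("yyyyMMdd")
//           .withZoneUTC().print(window.maxTimestamp());
//       return "my-project:my_dataset.events_" + day;
//     }
//   })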
/**
* Returns a copy of this write transformation, but writing to the specified table. Refer to
* {@link #parseTableSpec(String)} for the specification format.
*
* Does not modify this object.
*/
public Bound to(ValueProvider<String> tableSpec) {
return toTableRef(NestedValueProvider.of(tableSpec, new TableSpecToTableRef()));
}
/**
* Returns a copy of this write transformation, but writing to the specified table.
*
* Does not modify this object.
*/
private Bound toTableRef(ValueProvider<TableReference> table) {
return new Bound(name,
NestedValueProvider.of(table, new TableRefToJson()),
tableRefFunction, jsonSchema, createDisposition,
writeDisposition, validate, bigQueryServices);
}
/**
* Returns a copy of this write transformation, but using the specified function to determine
* which table to write to for each window.
*
* Does not modify this object.
*
* {@code tableRefFunction} should be deterministic. When given the same window, it should
* always return the same table reference.
*/
public Bound toTableReference(
SerializableFunction<BoundedWindow, TableReference> tableRefFunction) {
return new Bound(name, jsonTableRef, tableRefFunction, jsonSchema, createDisposition,
writeDisposition, validate, bigQueryServices);
}
/**
* Returns a copy of this write transformation, but using the specified schema for rows
* to be written.
*
* Does not modify this object.
*/
public Bound withSchema(TableSchema schema) {
return new Bound(name, jsonTableRef, tableRefFunction,
StaticValueProvider.of(toJsonString(schema)),
createDisposition, writeDisposition, validate, bigQueryServices);
}
/**
* Like {@link #withSchema(TableSchema)}, but with a {@link ValueProvider}.
*/
public Bound withSchema(ValueProvider<TableSchema> schema) {
return new Bound(name, jsonTableRef, tableRefFunction,
NestedValueProvider.of(schema, new TableSchemaToJsonSchema()),
createDisposition, writeDisposition, validate, bigQueryServices);
}
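// Illustrative only (not in the original source): building a TableSchema for
// the withSchema overloads above. Field names and types are hypothetical;
// TableFieldSchema comes from the BigQuery API model classes.
//
//   TableSchema schema = new TableSchema().setFields(Arrays.asList(
//       new TableFieldSchema().setName("word").setType("STRING"),
//       new TableFieldSchema().setName("count").setType("INTEGER")));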
/**
* Returns a copy of this write transformation, but using the specified create disposition.
*
* Does not modify this object.
*/
public Bound withCreateDisposition(CreateDisposition createDisposition) {
return new Bound(name, jsonTableRef, tableRefFunction, jsonSchema, createDisposition,
writeDisposition, validate, bigQueryServices);
}
/**
* Returns a copy of this write transformation, but using the specified write disposition.
*
* Does not modify this object.
*/
public Bound withWriteDisposition(WriteDisposition writeDisposition) {
return new Bound(name, jsonTableRef, tableRefFunction, jsonSchema, createDisposition,
writeDisposition, validate, bigQueryServices);
}
/**
* Returns a copy of this write transformation, but without BigQuery table validation.
*
* Does not modify this object.
*/
public Bound withoutValidation() {
return new Bound(name, jsonTableRef, tableRefFunction, jsonSchema, createDisposition,
writeDisposition, false, bigQueryServices);
}
@VisibleForTesting
Bound withTestServices(BigQueryServices testServices) {
return new Bound(name, jsonTableRef, tableRefFunction, jsonSchema, createDisposition,
writeDisposition, validate, testServices);
}
private static void verifyTableEmpty(
BigQueryOptions options,
TableReference table) {
try {
Bigquery client = Transport.newBigQueryClient(options).build();
BigQueryTableInserter inserter = new BigQueryTableInserter(client);
if (!inserter.isEmpty(table)) {
throw new IllegalArgumentException(
"BigQuery table is not empty: " + BigQueryIO.toTableSpec(table));
}
} catch (IOException e) {
ApiErrorExtractor errorExtractor = new ApiErrorExtractor();
if (errorExtractor.itemNotFound(e)) {
// Nothing to do. If the table does not exist, it is considered empty.
} else {
throw new RuntimeException(
"unable to confirm BigQuery table emptiness for table "
+ BigQueryIO.toTableSpec(table), e);
}
}
}
@Override
public void validate(PCollection<TableRow> input) {
BigQueryOptions options = input.getPipeline().getOptions().as(BigQueryOptions.class);
// Exactly one of the table reference and the table reference function must be configured.
checkState(
jsonTableRef != null || tableRefFunction != null,
"must set the table reference of a BigQueryIO.Write transform");
checkState(
jsonTableRef == null || tableRefFunction == null,
"Cannot set both a table reference and a table function for a BigQueryIO.Write"
+ " transform");
// Require a schema if creating one or more tables.
checkArgument(
createDisposition != CreateDisposition.CREATE_IF_NEEDED || jsonSchema != null,
"CreateDisposition is CREATE_IF_NEEDED, however no schema was provided.");
// The user specified a table.
if (jsonTableRef != null && validate) {
TableReference table = getTableWithDefaultProject(options).get();
// Check for destination table presence and emptiness for early failure notification.
// Note that a presence check can fail when the table or dataset is created by an earlier
// stage of the pipeline. For these cases the #withoutValidation method can be used to
// disable the check.
verifyDatasetPresence(options, table);
if (getCreateDisposition() == BigQueryIO.Write.CreateDisposition.CREATE_NEVER) {
verifyTablePresence(options, table);
}
if (getWriteDisposition() == BigQueryIO.Write.WriteDisposition.WRITE_EMPTY) {
verifyTableEmpty(options, table);
}
}
if (options.isStreaming() || tableRefFunction != null) {
// We will use BigQuery's streaming write API -- validate supported dispositions.
checkArgument(
createDisposition != CreateDisposition.CREATE_NEVER,
"CreateDisposition.CREATE_NEVER is not supported for an unbounded PCollection or when"
+ " using a tablespec function.");
checkArgument(
writeDisposition != WriteDisposition.WRITE_TRUNCATE,
"WriteDisposition.WRITE_TRUNCATE is not supported for an unbounded PCollection or"
+ " when using a tablespec function.");
} else {
// We will use a BigQuery load job -- validate the temp location.
String tempLocation = options.getTempLocation();
checkArgument(
!Strings.isNullOrEmpty(tempLocation),
"BigQueryIO.Write needs a GCS temp location to store temp files.");
if (bigQueryServices == null) {
try {
GcsPath.fromUri(tempLocation);
} catch (IllegalArgumentException e) {
throw new IllegalArgumentException(
String.format(
"BigQuery temp location expected a valid 'gs://' path, but was given '%s'",
tempLocation),
e);
}
}
}
}
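// Note (not in the original source): for the batch load path validated above,
// the GCS temp location is typically supplied through pipeline options, for
// example --tempLocation=gs://my-bucket/tmp on the command line or
// options.setTempLocation("gs://my-bucket/tmp") in code; the bucket name here
// is hypothetical.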
@Override
public PDone apply(PCollection<TableRow> input) {
Pipeline p = input.getPipeline();
BigQueryOptions options = p.getOptions().as(BigQueryOptions.class);
BigQueryServices bqServices = getBigQueryServices();
// In a streaming job, or when a tablespec function is defined, we use StreamWithDeDup
// and BigQuery's streaming import API.
if (options.isStreaming() || tableRefFunction != null) {
return input.apply(new StreamWithDeDup(getTable(), tableRefFunction,
NestedValueProvider.of(jsonSchema, new JsonSchemaToTableSchema())));
}
ValueProvider<TableReference> table = getTableWithDefaultProject(options);
String jobIdToken = "beam_job_" + randomUUIDString();
String tempLocation = options.getTempLocation();
String tempFilePrefix;
try {
IOChannelFactory factory = IOChannelUtils.getFactory(tempLocation);
tempFilePrefix = factory.resolve(
factory.resolve(tempLocation, "BigQueryWriteTemp"),
jobIdToken);
} catch (IOException e) {
throw new RuntimeException(
String.format("Failed to resolve BigQuery temp location in %s", tempLocation),
e);
}
PCollection<String> singleton = p.apply("Create", Create.of(tempFilePrefix));
PCollection<TableRow> inputInGlobalWindow =
input.apply(
Window.<TableRow>into(new GlobalWindows())
.triggering(DefaultTrigger.of())
.discardingFiredPanes());
PCollection<KV<String, Long>> results = inputInGlobalWindow
.apply("WriteBundles",
ParDo.of(new WriteBundles(tempFilePrefix)));
TupleTag<KV<Long, List<String>>> multiPartitionsTag =
new TupleTag<KV<Long, List<String>>>("multiPartitionsTag") {};
TupleTag<KV<Long, List<String>>> singlePartitionTag =
new TupleTag<KV<Long, List<String>>>("singlePartitionTag") {};
PCollectionView<Iterable<KV<String, Long>>> resultsView = results
.apply("ResultsView", View.<KV<String, Long>>asIterable());
PCollectionTuple partitions = singleton.apply(ParDo
.of(new WritePartition(
resultsView,
multiPartitionsTag,
singlePartitionTag))
.withSideInputs(resultsView)
.withOutputTags(multiPartitionsTag, TupleTagList.of(singlePartitionTag)));
// Write multiple partitions to separate temporary tables
PCollection<String> tempTables = partitions.get(multiPartitionsTag)
.apply("MultiPartitionsGroupByKey", GroupByKey.<Long, List<String>>create())
.apply("MultiPartitionsWriteTables", ParDo.of(new WriteTables(
false,
bqServices,
jobIdToken,
tempFilePrefix,
NestedValueProvider.of(table, new TableRefToJson()),
jsonSchema,
WriteDisposition.WRITE_EMPTY,
CreateDisposition.CREATE_IF_NEEDED)));
PCollectionView<Iterable<String>> tempTablesView = tempTables
.apply("TempTablesView", View.<String>asIterable());
singleton.apply(ParDo
.of(new WriteRename(
bqServices,
jobIdToken,
NestedValueProvider.of(table, new TableRefToJson()),
writeDisposition,
createDisposition,
tempTablesView))
.withSideInputs(tempTablesView));
// Write single partition to final table
partitions.get(singlePartitionTag)
.apply("SinglePartitionGroupByKey", GroupByKey.>create())
.apply("SinglePartitionWriteTables", ParDo.of(new WriteTables(
true,
bqServices,
jobIdToken,
tempFilePrefix,
NestedValueProvider.of(table, new TableRefToJson()),
jsonSchema,
writeDisposition,
createDisposition)));
return PDone.in(input.getPipeline());
}
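// Summary of the batch write path assembled above (descriptive comment only):
// 1. WriteBundles writes each bundle of input rows to a temp file under
//    tempFilePrefix.
// 2. WritePartition groups the resulting files into partitions bounded by
//    MAX_NUM_FILES and MAX_SIZE_BYTES.
// 3. WriteTables runs one BigQuery load job per partition, either directly into
//    the destination table (single partition) or into temporary tables.
// 4. WriteRename copies the temporary tables into the destination table and
//    then deletes them.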
private class WriteBundles extends DoFn<TableRow, KV<String, Long>> {
private TableRowWriter writer = null;
private final String tempFilePrefix;
WriteBundles(String tempFilePrefix) {
this.tempFilePrefix = tempFilePrefix;
}
@Override
public void processElement(ProcessContext c) throws Exception {
if (writer == null) {
writer = new TableRowWriter(tempFilePrefix);
writer.open(UUID.randomUUID().toString());
LOG.debug("Done opening writer {}", writer);
}
try {
writer.write(c.element());
} catch (Exception e) {
// Discard the write result and close the writer.
try {
writer.close();
// The writer does not need to be reset, as this DoFn cannot be reused.
} catch (Exception closeException) {
// Do not mask the exception that caused the write to fail.
e.addSuppressed(closeException);
}
throw e;
}
}
@Override
public void finishBundle(Context c) throws Exception {
if (writer != null) {
c.output(writer.close());
writer = null;
}
}
@Override
public void populateDisplayData(DisplayData.Builder builder) {
super.populateDisplayData(builder);
builder
.addIfNotNull(DisplayData.item("tempFilePrefix", tempFilePrefix)
.withLabel("Temporary File Prefix"));
}
}
@Override
protected Coder<Void> getDefaultOutputCoder() {
return VoidCoder.of();
}
@Override
public void populateDisplayData(DisplayData.Builder builder) {
super.populateDisplayData(builder);
builder
.addIfNotNull(DisplayData.item("table", jsonTableRef)
.withLabel("Table Reference"))
.addIfNotNull(DisplayData.item("schema", jsonSchema)
.withLabel("Table Schema"));
if (tableRefFunction != null) {
builder.add(DisplayData.item("tableFn", tableRefFunction.getClass())
.withLabel("Table Reference Function"));
}
builder
.add(DisplayData.item("createDisposition", createDisposition.toString())
.withLabel("Table CreateDisposition"))
.add(DisplayData.item("writeDisposition", writeDisposition.toString())
.withLabel("Table WriteDisposition"))
.addIfNotDefault(DisplayData.item("validation", validate)
.withLabel("Validation Enabled"), true);
}
/** Returns the create disposition. */
public CreateDisposition getCreateDisposition() {
return createDisposition;
}
/** Returns the write disposition. */
public WriteDisposition getWriteDisposition() {
return writeDisposition;
}
/** Returns the table schema. */
public TableSchema getSchema() {
return fromJsonString(
jsonSchema == null ? null : jsonSchema.get(), TableSchema.class);
}
/**
* Returns the table to write, or {@code null} if writing with {@code tableRefFunction}.
*
* If the table's project is not specified, use the executing project.
*/
@Nullable private ValueProvider<TableReference> getTableWithDefaultProject(
BigQueryOptions bqOptions) {
ValueProvider<TableReference> table = getTable();
if (table == null) {
return table;
}
if (!table.isAccessible()) {
LOG.info("Using a dynamic value for table input. This must contain a project"
+ " in the table reference: {}", table);
return table;
}
if (Strings.isNullOrEmpty(table.get().getProjectId())) {
// If user does not specify a project we assume the table to be located in
// the default project.
TableReference tableRef = table.get();
tableRef.setProjectId(bqOptions.getProject());
return NestedValueProvider.of(StaticValueProvider.of(
toJsonString(tableRef)), new JsonTableRefToTableRef());
}
return table;
}
/** Returns the table reference, or {@code null}. */
@Nullable
public ValueProvider<TableReference> getTable() {
return jsonTableRef == null
? null : NestedValueProvider.of(jsonTableRef, new JsonTableRefToTableRef());
}
/** Returns {@code true} if table validation is enabled. */
public boolean getValidate() {
return validate;
}
private BigQueryServices getBigQueryServices() {
if (bigQueryServices == null) {
bigQueryServices = new BigQueryServicesImpl();
}
return bigQueryServices;
}
}
static class TableRowWriter {
private static final Coder<TableRow> CODER = TableRowJsonCoder.of();
private static final byte[] NEWLINE = "\n".getBytes(StandardCharsets.UTF_8);
private final String tempFilePrefix;
private String id;
private String fileName;
private WritableByteChannel channel;
protected String mimeType = MimeTypes.TEXT;
private CountingOutputStream out;
TableRowWriter(String basename) {
this.tempFilePrefix = basename;
}
public final void open(String uId) throws Exception {
id = uId;
fileName = tempFilePrefix + id;
LOG.debug("Opening {}.", fileName);
channel = IOChannelUtils.create(fileName, mimeType);
try {
out = new CountingOutputStream(Channels.newOutputStream(channel));
LOG.debug("Writing header to {}.", fileName);
} catch (Exception e) {
try {
LOG.error("Writing header to {} failed, closing channel.", fileName);
channel.close();
} catch (IOException closeException) {
LOG.error("Closing channel for {} failed", fileName);
}
throw e;
}
LOG.debug("Starting write of bundle {} to {}.", this.id, fileName);
}
public void write(TableRow value) throws Exception {
CODER.encode(value, out, Context.OUTER);
out.write(NEWLINE);
}
public final KV<String, Long> close() throws IOException {
channel.close();
return KV.of(fileName, out.getCount());
}
}
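// Descriptive note (not in the original source): TableRowWriter emits one
// JSON-encoded TableRow per line, so each temp file is NEWLINE_DELIMITED_JSON,
// matching the source format used by the load jobs below. Example contents
// (values hypothetical):
//   {"word":"hello","count":3}
//   {"word":"world","count":7}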
/**
* Partitions temporary files based on number of files and file sizes.
*/
static class WritePartition extends DoFn<String, KV<Long, List<String>>> {
private final PCollectionView<Iterable<KV<String, Long>>> resultsView;
private TupleTag<KV<Long, List<String>>> multiPartitionsTag;
private TupleTag<KV<Long, List<String>>> singlePartitionTag;
public WritePartition(
PCollectionView<Iterable<KV<String, Long>>> resultsView,
TupleTag<KV<Long, List<String>>> multiPartitionsTag,
TupleTag<KV<Long, List<String>>> singlePartitionTag) {
this.resultsView = resultsView;
this.multiPartitionsTag = multiPartitionsTag;
this.singlePartitionTag = singlePartitionTag;
}
@Override
public void processElement(ProcessContext c) throws Exception {
List<KV<String, Long>> results = Lists.newArrayList(c.sideInput(resultsView));
if (results.isEmpty()) {
TableRowWriter writer = new TableRowWriter(c.element());
writer.open(UUID.randomUUID().toString());
results.add(writer.close());
}
long partitionId = 0;
int currNumFiles = 0;
long currSizeBytes = 0;
List<String> currResults = Lists.newArrayList();
for (int i = 0; i < results.size(); ++i) {
KV<String, Long> fileResult = results.get(i);
if (currNumFiles + 1 > Bound.MAX_NUM_FILES
|| currSizeBytes + fileResult.getValue() > Bound.MAX_SIZE_BYTES) {
c.sideOutput(multiPartitionsTag, KV.of(++partitionId, currResults));
currResults = Lists.newArrayList();
currNumFiles = 0;
currSizeBytes = 0;
}
++currNumFiles;
currSizeBytes += fileResult.getValue();
currResults.add(fileResult.getKey());
}
if (partitionId == 0) {
c.sideOutput(singlePartitionTag, KV.of(++partitionId, currResults));
} else {
c.sideOutput(multiPartitionsTag, KV.of(++partitionId, currResults));
}
}
@Override
public void populateDisplayData(DisplayData.Builder builder) {
super.populateDisplayData(builder);
}
}
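// Illustrative example of the partitioning above (numbers hypothetical): with
// MAX_NUM_FILES = 10000, 25000 small temp files are emitted as three elements
// on multiPartitionsTag holding 10000, 10000 and 5000 file names, each loaded
// into its own temporary table; a result set that fits within the limits is
// emitted on singlePartitionTag and loaded straight into the destination table.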
/**
* Writes partitions to BigQuery tables.
*/
static class WriteTables extends DoFn<KV<Long, Iterable<List<String>>>, String> {
private final boolean singlePartition;
private final BigQueryServices bqServices;
private final String jobIdToken;
private final String tempFilePrefix;
private final ValueProvider<String> jsonTableRef;
private final ValueProvider<String> jsonSchema;
private final WriteDisposition writeDisposition;
private final CreateDisposition createDisposition;
public WriteTables(
boolean singlePartition,
BigQueryServices bqServices,
String jobIdToken,
String tempFilePrefix,
ValueProvider<String> jsonTableRef,
ValueProvider<String> jsonSchema,
WriteDisposition writeDisposition,
CreateDisposition createDisposition) {
this.singlePartition = singlePartition;
this.bqServices = bqServices;
this.jobIdToken = jobIdToken;
this.tempFilePrefix = tempFilePrefix;
this.jsonTableRef = jsonTableRef;
this.jsonSchema = jsonSchema;
this.writeDisposition = writeDisposition;
this.createDisposition = createDisposition;
}
@Override
public void processElement(ProcessContext c) throws Exception {
List<String> partition = Lists.newArrayList(c.element().getValue()).get(0);
String jobIdPrefix = String.format(jobIdToken + "_%05d", c.element().getKey());
TableReference ref = fromJsonString(jsonTableRef.get(), TableReference.class);
if (!singlePartition) {
ref.setTableId(jobIdPrefix);
}
load(
bqServices.getJobService(c.getPipelineOptions().as(BigQueryOptions.class)),
jobIdPrefix,
ref,
fromJsonString(
jsonSchema == null ? null : jsonSchema.get(), TableSchema.class),
partition,
writeDisposition,
createDisposition);
c.output(toJsonString(ref));
removeTemporaryFiles(c.getPipelineOptions(), tempFilePrefix, partition);
}
private void load(
JobService jobService,
String jobIdPrefix,
TableReference ref,
@Nullable TableSchema schema,
List<String> gcsUris,
WriteDisposition writeDisposition,
CreateDisposition createDisposition) throws InterruptedException, IOException {
JobConfigurationLoad loadConfig = new JobConfigurationLoad()
.setDestinationTable(ref)
.setSchema(schema)
.setSourceUris(gcsUris)
.setWriteDisposition(writeDisposition.name())
.setCreateDisposition(createDisposition.name())
.setSourceFormat("NEWLINE_DELIMITED_JSON");
String projectId = ref.getProjectId();
for (int i = 0; i < Bound.MAX_RETRY_JOBS; ++i) {
String jobId = jobIdPrefix + "-" + i;
LOG.info("Starting BigQuery load job {}: try {}/{}", jobId, i, Bound.MAX_RETRY_JOBS);
JobReference jobRef = new JobReference()
.setProjectId(projectId)
.setJobId(jobId);
jobService.startLoadJob(jobRef, loadConfig);
Status jobStatus =
parseStatus(jobService.pollJob(jobRef, Bound.LOAD_JOB_POLL_MAX_RETRIES));
switch (jobStatus) {
case SUCCEEDED:
return;
case UNKNOWN:
throw new RuntimeException("Failed to poll the load job status of job " + jobId);
case FAILED:
LOG.info("BigQuery load job failed: {}", jobId);
continue;
default:
throw new IllegalStateException(String.format("Unexpected job status: %s of job %s",
jobStatus, jobId));
}
}
throw new RuntimeException(String.format("Failed to create the load job %s, reached max "
+ "retries: %d", jobIdPrefix, Bound.MAX_RETRY_JOBS));
}
static void removeTemporaryFiles(
PipelineOptions options,
String tempFilePrefix,
Collection<String> files)
throws IOException {
IOChannelFactory factory = IOChannelUtils.getFactory(tempFilePrefix);
if (factory instanceof GcsIOChannelFactory) {
GcsUtil gcsUtil = new GcsUtil.GcsUtilFactory().create(options);
gcsUtil.remove(files);
} else if (factory instanceof FileIOChannelFactory) {
for (String filename : files) {
LOG.debug("Removing file {}", filename);
boolean exists = Files.deleteIfExists(Paths.get(filename));
if (!exists) {
LOG.debug("{} does not exist.", filename);
}
}
} else {
throw new IOException("Unrecognized file system.");
}
}
@Override
public void populateDisplayData(DisplayData.Builder builder) {
super.populateDisplayData(builder);
builder
.addIfNotNull(DisplayData.item("jobIdToken", jobIdToken)
.withLabel("Job ID Token"))
.addIfNotNull(DisplayData.item("tempFilePrefix", tempFilePrefix)
.withLabel("Temporary File Prefix"))
.addIfNotNull(DisplayData.item("jsonTableRef", jsonTableRef)
.withLabel("Table Reference"))
.addIfNotNull(DisplayData.item("jsonSchema", jsonSchema)
.withLabel("Table Schema"));
}
}
/**
* Copies temporary tables to destination table.
*/
static class WriteRename extends DoFn<String, Void> {
private final BigQueryServices bqServices;
private final String jobIdToken;
private final ValueProvider<String> jsonTableRef;
private final WriteDisposition writeDisposition;
private final CreateDisposition createDisposition;
private final PCollectionView<Iterable<String>> tempTablesView;
public WriteRename(
BigQueryServices bqServices,
String jobIdToken,
ValueProvider<String> jsonTableRef,
WriteDisposition writeDisposition,
CreateDisposition createDisposition,
PCollectionView<Iterable<String>> tempTablesView) {
this.bqServices = bqServices;
this.jobIdToken = jobIdToken;
this.jsonTableRef = jsonTableRef;
this.writeDisposition = writeDisposition;
this.createDisposition = createDisposition;
this.tempTablesView = tempTablesView;
}
@Override
public void processElement(ProcessContext c) throws Exception {
List<String> tempTablesJson = Lists.newArrayList(c.sideInput(tempTablesView));
// Do not copy if no temp tables are provided
if (tempTablesJson.size() == 0) {
return;
}
List<TableReference> tempTables = Lists.newArrayList();
for (String table : tempTablesJson) {
tempTables.add(fromJsonString(table, TableReference.class));
}
copy(
bqServices.getJobService(c.getPipelineOptions().as(BigQueryOptions.class)),
jobIdToken,
fromJsonString(jsonTableRef.get(), TableReference.class),
tempTables,
writeDisposition,
createDisposition);
DatasetService tableService =
bqServices.getDatasetService(c.getPipelineOptions().as(BigQueryOptions.class));
removeTemporaryTables(tableService, tempTables);
}
private void copy(
JobService jobService,
String jobIdPrefix,
TableReference ref,
List<TableReference> tempTables,
WriteDisposition writeDisposition,
CreateDisposition createDisposition) throws InterruptedException, IOException {
JobConfigurationTableCopy copyConfig = new JobConfigurationTableCopy()
.setSourceTables(tempTables)
.setDestinationTable(ref)
.setWriteDisposition(writeDisposition.name())
.setCreateDisposition(createDisposition.name());
String projectId = ref.getProjectId();
for (int i = 0; i < Bound.MAX_RETRY_JOBS; ++i) {
String jobId = jobIdPrefix + "-" + i;
LOG.info("Starting BigQuery copy job {}: try {}/{}", jobId, i, Bound.MAX_RETRY_JOBS);
JobReference jobRef = new JobReference()
.setProjectId(projectId)
.setJobId(jobId);
jobService.startCopyJob(jobRef, copyConfig);
Status jobStatus =
parseStatus(jobService.pollJob(jobRef, Bound.LOAD_JOB_POLL_MAX_RETRIES));
switch (jobStatus) {
case SUCCEEDED:
return;
case UNKNOWN:
throw new RuntimeException("Failed to poll the copy job status of job " + jobId);
case FAILED:
LOG.info("BigQuery copy job failed: {}", jobId);
continue;
default:
throw new IllegalStateException(String.format("Unexpected job status: %s of job %s",
jobStatus, jobId));
}
}
throw new RuntimeException(String.format("Failed to create the copy job %s, reached max "
+ "retries: %d", jobIdPrefix, Bound.MAX_RETRY_JOBS));
}
static void removeTemporaryTables(DatasetService tableService,
List<TableReference> tempTables) throws Exception {
for (TableReference tableRef : tempTables) {
try {
LOG.debug("Deleting table {}", toJsonString(tableRef));
tableService.deleteTable(
tableRef.getProjectId(),
tableRef.getDatasetId(),
tableRef.getTableId());
} catch (Exception e) {
LOG.warn("Failed to delete the table {}", toJsonString(tableRef), e);
}
}
}
@Override
public void populateDisplayData(DisplayData.Builder builder) {
super.populateDisplayData(builder);
builder
.addIfNotNull(DisplayData.item("jobIdToken", jobIdToken)
.withLabel("Job ID Token"))
.addIfNotNull(DisplayData.item("jsonTableRef", jsonTableRef)
.withLabel("Table Reference"))
.add(DisplayData.item("writeDisposition", writeDisposition.toString())
.withLabel("Write Disposition"))
.add(DisplayData.item("createDisposition", createDisposition.toString())
.withLabel("Create Disposition"));
}
}
/** Disallow construction of utility class. */
private Write() {}
}
private static void verifyDatasetPresence(BigQueryOptions options, TableReference table) {
String resourceNotFoundMsg =
String.format(RESOURCE_NOT_FOUND_ERROR, "dataset", BigQueryIO.toTableSpec(table));
try {
Bigquery client = Transport.newBigQueryClient(options).build();
BigQueryTableRowIterator.executeWithBackOff(
client.datasets().get(table.getProjectId(), table.getDatasetId()),
resourceNotFoundMsg);
} catch (Exception e) {
ApiErrorExtractor errorExtractor = new ApiErrorExtractor();
if ((e instanceof IOException) && errorExtractor.itemNotFound((IOException) e)) {
throw new IllegalArgumentException(resourceNotFoundMsg, e);
} else {
throw new RuntimeException(
String.format(UNABLE_TO_CONFIRM_PRESENCE_OF_RESOURCE_ERROR, "dataset",
BigQueryIO.toTableSpec(table)),
e);
}
}
}
private static void verifyTablePresence(BigQueryOptions options, TableReference table) {
String resourceNotFoundMsg =
String.format(RESOURCE_NOT_FOUND_ERROR, "table", BigQueryIO.toTableSpec(table));
try {
Bigquery client = Transport.newBigQueryClient(options).build();
BigQueryTableRowIterator.executeWithBackOff(
client.tables().get(table.getProjectId(), table.getDatasetId(), table.getTableId()),
resourceNotFoundMsg);
} catch (Exception e) {
ApiErrorExtractor errorExtractor = new ApiErrorExtractor();
if ((e instanceof IOException) && errorExtractor.itemNotFound((IOException) e)) {
throw new IllegalArgumentException(resourceNotFoundMsg, e);
} else {
throw new RuntimeException(
String.format(UNABLE_TO_CONFIRM_PRESENCE_OF_RESOURCE_ERROR, "table",
BigQueryIO.toTableSpec(table)),
e);
}
}
}
/////////////////////////////////////////////////////////////////////////////
/**
* Implementation of DoFn to perform streaming BigQuery write.
*/
@SystemDoFnInternal
private static class StreamingWriteFn
extends DoFn<KV<ShardedKey<String>, TableRowInfo>, Void> {
/** TableSchema in JSON. Use String to make the class Serializable. */
private final ValueProvider<String> jsonTableSchema;
/** JsonTableRows to accumulate BigQuery rows in order to batch writes. */
private transient Map<String, List<TableRow>> tableRows;
/** The list of unique ids for each BigQuery table row. */
private transient Map<String, List<String>> uniqueIdsForTableRows;
/** The list of tables created so far, so we don't try the creation each time. */
private static Set<String> createdTables =
Collections.newSetFromMap(new ConcurrentHashMap<String, Boolean>());
/** Tracks bytes written, exposed as "ByteCount" Counter. */
private Aggregator<Long, Long> byteCountAggregator =
createAggregator("ByteCount", new Sum.SumLongFn());
/** Constructor. */
StreamingWriteFn(ValueProvider<TableSchema> schema) {
this.jsonTableSchema =
NestedValueProvider.of(schema, new TableSchemaToJsonSchema());
}
/** Prepares a target BigQuery table. */
@Override
public void startBundle(Context context) {
tableRows = new HashMap<>();
uniqueIdsForTableRows = new HashMap<>();
}
/** Accumulates the input into JsonTableRows and uniqueIdsForTableRows. */
@Override
public void processElement(ProcessContext context) {
String tableSpec = context.element().getKey().getKey();
List<TableRow> rows = getOrCreateMapListValue(tableRows, tableSpec);
List<String> uniqueIds = getOrCreateMapListValue(uniqueIdsForTableRows, tableSpec);
rows.add(context.element().getValue().tableRow);
uniqueIds.add(context.element().getValue().uniqueId);
}
/** Writes the accumulated rows into BigQuery with streaming API. */
@Override
public void finishBundle(Context context) throws Exception {
BigQueryOptions options = context.getPipelineOptions().as(BigQueryOptions.class);
Bigquery client = Transport.newBigQueryClient(options).build();
for (String tableSpec : tableRows.keySet()) {
TableReference tableReference = getOrCreateTable(options, tableSpec);
flushRows(client, tableReference, tableRows.get(tableSpec),
uniqueIdsForTableRows.get(tableSpec));
}
tableRows.clear();
uniqueIdsForTableRows.clear();
}
@Override
public void populateDisplayData(DisplayData.Builder builder) {
super.populateDisplayData(builder);
builder.addIfNotNull(DisplayData.item("schema", jsonTableSchema)
.withLabel("Table Schema"));
}
public TableReference getOrCreateTable(BigQueryOptions options, String tableSpec)
throws IOException {
TableReference tableReference = parseTableSpec(tableSpec);
if (!createdTables.contains(tableSpec)) {
synchronized (createdTables) {
// Another thread may have succeeded in creating the table in the meanwhile, so
// check again. This check isn't needed for correctness, but we add it to prevent
// every thread from attempting a create and overwhelming our BigQuery quota.
if (!createdTables.contains(tableSpec)) {
TableSchema tableSchema = JSON_FACTORY.fromString(
jsonTableSchema.get(), TableSchema.class);
Bigquery client = Transport.newBigQueryClient(options).build();
BigQueryTableInserter inserter = new BigQueryTableInserter(client);
inserter.getOrCreateTable(tableReference, Write.WriteDisposition.WRITE_APPEND,
Write.CreateDisposition.CREATE_IF_NEEDED, tableSchema);
createdTables.add(tableSpec);
}
}
}
return tableReference;
}
/** Writes the accumulated rows into BigQuery with streaming API. */
private void flushRows(Bigquery client, TableReference tableReference,
List<TableRow> tableRows, List<String> uniqueIds) {
if (!tableRows.isEmpty()) {
try {
BigQueryTableInserter inserter = new BigQueryTableInserter(client);
inserter.insertAll(tableReference, tableRows, uniqueIds, byteCountAggregator);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
}
}
private static class ShardedKey<K> {
private final K key;
private final int shardNumber;
public static <K> ShardedKey<K> of(K key, int shardNumber) {
return new ShardedKey<>(key, shardNumber);
}
private ShardedKey(K key, int shardNumber) {
this.key = key;
this.shardNumber = shardNumber;
}
public K getKey() {
return key;
}
public int getShardNumber() {
return shardNumber;
}
}
/**
* A {@link Coder} for {@link ShardedKey}, using a wrapped key {@link Coder}.
*/
private static class ShardedKeyCoder<KeyT>
extends StandardCoder<ShardedKey<KeyT>> {
public static <KeyT> ShardedKeyCoder<KeyT> of(Coder<KeyT> keyCoder) {
return new ShardedKeyCoder<>(keyCoder);
}
@JsonCreator
public static ShardedKeyCoder<?> of(
@JsonProperty(PropertyNames.COMPONENT_ENCODINGS)
List<Coder<?>> components) {
checkArgument(components.size() == 1, "Expecting 1 component, got %s", components.size());
return of(components.get(0));
}
protected ShardedKeyCoder(Coder<KeyT> keyCoder) {
this.keyCoder = keyCoder;
this.shardNumberCoder = VarIntCoder.of();
}
@Override
public List<? extends Coder<?>> getCoderArguments() {
return Arrays.asList(keyCoder);
}
@Override
public void encode(ShardedKey<KeyT> key, OutputStream outStream, Context context)
throws IOException {
keyCoder.encode(key.getKey(), outStream, context.nested());
shardNumberCoder.encode(key.getShardNumber(), outStream, context);
}
@Override
public ShardedKey<KeyT> decode(InputStream inStream, Context context)
throws IOException {
return new ShardedKey<>(
keyCoder.decode(inStream, context.nested()),
shardNumberCoder.decode(inStream, context));
}
@Override
public void verifyDeterministic() throws NonDeterministicException {
keyCoder.verifyDeterministic();
}
Coder<KeyT> keyCoder;
VarIntCoder shardNumberCoder;
}
private static class TableRowInfoCoder extends AtomicCoder<TableRowInfo> {
private static final TableRowInfoCoder INSTANCE = new TableRowInfoCoder();
@JsonCreator
public static TableRowInfoCoder of() {
return INSTANCE;
}
@Override
public void encode(TableRowInfo value, OutputStream outStream, Context context)
throws IOException {
if (value == null) {
throw new CoderException("cannot encode a null value");
}
tableRowCoder.encode(value.tableRow, outStream, context.nested());
idCoder.encode(value.uniqueId, outStream, context.nested());
}
@Override
public TableRowInfo decode(InputStream inStream, Context context)
throws IOException {
return new TableRowInfo(
tableRowCoder.decode(inStream, context.nested()),
idCoder.decode(inStream, context.nested()));
}
@Override
public void verifyDeterministic() throws NonDeterministicException {
throw new NonDeterministicException(this, "TableRows are not deterministic.");
}
TableRowJsonCoder tableRowCoder = TableRowJsonCoder.of();
StringUtf8Coder idCoder = StringUtf8Coder.of();
}
private static class TableRowInfo {
TableRowInfo(TableRow tableRow, String uniqueId) {
this.tableRow = tableRow;
this.uniqueId = uniqueId;
}
final TableRow tableRow;
final String uniqueId;
}
/////////////////////////////////////////////////////////////////////////////
/**
* Fn that tags each table row with a unique id and destination table.
* To avoid calling UUID.randomUUID() for each element, which can be costly,
* a randomUUID is generated only once per bucket of data. The actual unique
* id is created by concatenating this randomUUID with a sequential number.
*/
@VisibleForTesting
static class TagWithUniqueIdsAndTable
extends DoFn<TableRow, KV<ShardedKey<String>, TableRowInfo>>
implements DoFn.RequiresWindowAccess {
/** TableSpec to write to. */
private final ValueProvider<String> tableSpec;
/** User function mapping windows to {@link TableReference} in JSON. */
private final SerializableFunction<BoundedWindow, TableReference> tableRefFunction;
private transient String randomUUID;
private transient long sequenceNo = 0L;
TagWithUniqueIdsAndTable(BigQueryOptions options,
ValueProvider<TableReference> table,
SerializableFunction<BoundedWindow, TableReference> tableRefFunction) {
checkArgument(table == null ^ tableRefFunction == null,
"Exactly one of table or tableRefFunction should be set");
if (table != null) {
if (table.isAccessible() && Strings.isNullOrEmpty(table.get().getProjectId())) {
TableReference tableRef = table.get()
.setProjectId(options.as(BigQueryOptions.class).getProject());
table = NestedValueProvider.of(
StaticValueProvider.of(toJsonString(tableRef)),
new JsonTableRefToTableRef());
}
this.tableSpec = NestedValueProvider.of(table, new TableRefToTableSpec());
} else {
tableSpec = null;
}
this.tableRefFunction = tableRefFunction;
}
@Override
public void startBundle(Context context) {
randomUUID = UUID.randomUUID().toString();
}
/** Tag the input with a unique id. */
@Override
public void processElement(ProcessContext context) throws IOException {
String uniqueId = randomUUID + sequenceNo++;
ThreadLocalRandom randomGenerator = ThreadLocalRandom.current();
String tableSpec = tableSpecFromWindow(
context.getPipelineOptions().as(BigQueryOptions.class), context.window());
// We output on keys 0-50 to ensure that there's enough batching for
// BigQuery.
context.output(KV.of(ShardedKey.of(tableSpec, randomGenerator.nextInt(0, 50)),
new TableRowInfo(context.element(), uniqueId)));
}
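// Descriptive note (not in the original source): for a bundle whose randomUUID
// is, say, "a1b2c3", successive elements receive unique ids "a1b2c30",
// "a1b2c31", ..., and each is keyed by its destination table spec plus a random
// shard in [0, 50) so StreamingWriteFn can batch rows per table without hot keys.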
@Override
public void populateDisplayData(DisplayData.Builder builder) {
super.populateDisplayData(builder);
builder.addIfNotNull(DisplayData.item("table", tableSpec));
if (tableRefFunction != null) {
builder.add(DisplayData.item("tableFn", tableRefFunction.getClass())
.withLabel("Table Reference Function"));
}
}
@VisibleForTesting
ValueProvider<String> getTableSpec() {
return tableSpec;
}
private String tableSpecFromWindow(BigQueryOptions options, BoundedWindow window) {
if (tableSpec != null) {
return tableSpec.get();
} else {
TableReference table = tableRefFunction.apply(window);
if (table.getProjectId() == null) {
table.setProjectId(options.getProject());
}
return toTableSpec(table);
}
}
}
/////////////////////////////////////////////////////////////////////////////
/**
* PTransform that performs a streaming BigQuery write. To increase consistency,
* it leverages BigQuery's best-effort de-duplication mechanism.
*/
private static class StreamWithDeDup extends PTransform<PCollection<TableRow>, PDone> {
private final transient ValueProvider<TableReference> tableReference;
private final SerializableFunction<BoundedWindow, TableReference> tableRefFunction;
private final transient ValueProvider<TableSchema> tableSchema;
/** Constructor. */
StreamWithDeDup(ValueProvider<TableReference> tableReference,
SerializableFunction<BoundedWindow, TableReference> tableRefFunction,
ValueProvider<TableSchema> tableSchema) {
this.tableReference = tableReference;
this.tableRefFunction = tableRefFunction;
this.tableSchema = tableSchema;
}
@Override
protected Coder<Void> getDefaultOutputCoder() {
return VoidCoder.of();
}
@Override
public PDone apply(PCollection<TableRow> input) {
// A naive implementation would be to simply stream data directly to BigQuery.
// However, this could occasionally lead to duplicated data, e.g., when
// a VM that runs this code is restarted and the code is re-run.
// The above risk is mitigated in this implementation by relying on
// BigQuery's built-in best-effort de-duplication mechanism.
// To use this mechanism, each input TableRow is tagged with a generated
// unique id, which is then passed to BigQuery and used to ignore duplicates.
PCollection<KV<ShardedKey<String>, TableRowInfo>> tagged = input.apply(ParDo.of(
new TagWithUniqueIdsAndTable(input.getPipeline().getOptions().as(BigQueryOptions.class),
tableReference, tableRefFunction)));
// To prevent having the same TableRow processed more than once with regenerated
// different unique ids, this implementation relies on "checkpointing", which is
// achieved as a side effect of having StreamingWriteFn immediately follow a GBK,
// performed by Reshuffle.
tagged
.setCoder(KvCoder.of(ShardedKeyCoder.of(StringUtf8Coder.of()), TableRowInfoCoder.of()))
.apply(Reshuffle.<ShardedKey<String>, TableRowInfo>of())
.apply(ParDo.of(new StreamingWriteFn(tableSchema)));
// Note that the implementation to return PDone here breaks the
// implicit assumption about the job execution order. If a user
// implements a PTransform that takes PDone returned here as its
// input, the transform may not necessarily be executed after
// the BigQueryIO.Write.
return PDone.in(input.getPipeline());
}
}
/**
* Status of a BigQuery job or request.
*/
enum Status {
SUCCEEDED,
FAILED,
UNKNOWN,
}
private static Status parseStatus(@Nullable Job job) {
if (job == null) {
return Status.UNKNOWN;
}
JobStatus status = job.getStatus();
if (status.getErrorResult() != null) {
return Status.FAILED;
} else if (status.getErrors() != null && !status.getErrors().isEmpty()) {
return Status.FAILED;
} else {
return Status.SUCCEEDED;
}
}
@VisibleForTesting
static String toJsonString(Object item) {
if (item == null) {
return null;
}
try {
return JSON_FACTORY.toString(item);
} catch (IOException e) {
throw new RuntimeException(
String.format("Cannot serialize %s to a JSON string.", item.getClass().getSimpleName()),
e);
}
}
@VisibleForTesting
static <T> T fromJsonString(String json, Class<T> clazz) {
if (json == null) {
return null;
}
try {
return JSON_FACTORY.fromString(json, clazz);
} catch (IOException e) {
throw new RuntimeException(
String.format("Cannot deserialize %s from a JSON string: %s.", clazz, json),
e);
}
}
/**
* Returns a randomUUID string.
*
* {@code '-'} is removed because BigQuery doesn't allow it in dataset id.
*/
private static String randomUUIDString() {
return UUID.randomUUID().toString().replaceAll("-", "");
}
/////////////////////////////////////////////////////////////////////////////
/** Disallow construction of utility class. */
private BigQueryIO() {}
private static <K, V> List<V> getOrCreateMapListValue(Map<K, List<V>> map, K key) {
List<V> value = map.get(key);
if (value == null) {
value = new ArrayList<>();
map.put(key, value);
}
return value;
}
}