
/*
 * Copyright 2020 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.google.cloud.bigquery.storage.v1;

import com.google.api.core.ApiFuture;
import com.google.api.gax.batching.FlowControlSettings;
import com.google.api.gax.core.CredentialsProvider;
import com.google.api.gax.core.ExecutorProvider;
import com.google.api.gax.retrying.RetrySettings;
import com.google.api.gax.rpc.TransportChannelProvider;
import com.google.cloud.bigquery.storage.v1.AppendRowsRequest.MissingValueInterpretation;
import com.google.cloud.bigquery.storage.v1.Exceptions.AppendSerializationError;
import com.google.cloud.bigquery.storage.v1.Exceptions.RowIndexToErrorException;
import com.google.common.base.Preconditions;
import com.google.protobuf.Descriptors.Descriptor;
import com.google.protobuf.Descriptors.DescriptorValidationException;
import com.google.protobuf.DynamicMessage;
import com.google.rpc.Code;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.annotation.Nullable;

/**
 * A StreamWriter that can write data to BigQuery tables. The SchemaAwareStreamWriter is built on
 * top of a StreamWriter; it converts all data to Protobuf messages using the provided converter,
 * then calls StreamWriter's append() method to write to BigQuery tables. It maintains all
 * StreamWriter functions and also provides an additional feature: schema update support, where if
 * the BigQuery table schema is updated, users will be able to ingest data on the new schema after
 * some time (on the order of minutes).
 *
 * <p>NOTE: The schema update ability will be disabled when you pass in a table schema explicitly
 * through the writer. It is recommended that users either use JsonStreamWriter (which fully
 * manages the table schema) or StreamWriter (which accepts raw proto format; the user handles
 * schema update events themselves). If you use this class, be very cautious about possible
 * mismatches between the writer's schema and the input data; any mismatch between the two will
 * cause data corruption.
 */
public class SchemaAwareStreamWriter<T> implements AutoCloseable {
  private static final Logger LOG = Logger.getLogger(SchemaAwareStreamWriter.class.getName());

  private final BigQueryWriteClient client;
  private final String streamName;
  private final StreamWriter.Builder streamWriterBuilder;
  private final boolean ignoreUnknownFields;
  private final ToProtoConverter<T> toProtoConverter;
  private StreamWriter streamWriter;
  private Descriptor descriptor;
  private TableSchema tableSchema;
  private ProtoSchema protoSchema;
  private String compressorName;

  // In some situations we want to skip the stream writer refresh for an updated schema, e.g. when
  // the user provides the table schema, we should always use that schema.
  private final boolean skipRefreshStreamWriter;

  /**
   * Constructs the SchemaAwareStreamWriter
   *
   * @param builder The Builder object for the SchemaAwareStreamWriter
   */
  private SchemaAwareStreamWriter(Builder<T> builder)
      throws DescriptorValidationException, IllegalArgumentException, IOException {
    this.descriptor =
        BQTableSchemaToProtoDescriptor.convertBQTableSchemaToProtoDescriptor(builder.tableSchema);
    if (builder.client == null) {
      streamWriterBuilder = StreamWriter.newBuilder(builder.streamName);
    } else {
      streamWriterBuilder = StreamWriter.newBuilder(builder.streamName, builder.client);
    }
    this.protoSchema = ProtoSchemaConverter.convert(this.descriptor);
    this.client = builder.client;
    streamWriterBuilder.setWriterSchema(protoSchema);
    setStreamWriterSettings(
        builder.channelProvider,
        builder.credentialsProvider,
        builder.executorProvider,
        builder.endpoint,
        builder.flowControlSettings,
        builder.traceIdBase,
        builder.traceId,
        builder.compressorName,
        builder.retrySettings);
    streamWriterBuilder.setEnableConnectionPool(builder.enableConnectionPool);
    streamWriterBuilder.setLocation(builder.location);
    streamWriterBuilder.setDefaultMissingValueInterpretation(
        builder.defaultMissingValueInterpretation);
    this.streamWriter = streamWriterBuilder.build();
    this.streamName = builder.streamName;
    this.tableSchema = builder.tableSchema;
    this.toProtoConverter = builder.toProtoConverter;
    this.ignoreUnknownFields = builder.ignoreUnknownFields;
    this.skipRefreshStreamWriter = builder.skipRefreshStreamWriter;
  }
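  // A minimal end-to-end usage sketch, assuming a hypothetical MyRow type, a myRowConverter
  // (a ToProtoConverter<MyRow>, see the sketch after buildMessage() below), rows (an
  // Iterable<MyRow>), and an illustrative table path; error handling is elided:
  //
  //   try (BigQueryWriteClient client = BigQueryWriteClient.create()) {
  //     SchemaAwareStreamWriter<MyRow> writer =
  //         SchemaAwareStreamWriter.newBuilder(
  //                 "projects/my-project/datasets/my_dataset/tables/my_table",
  //                 client,
  //                 myRowConverter)
  //             .build();
  //     ApiFuture<AppendRowsResponse> response = writer.append(rows);
  //     response.get(); // block until the append is acknowledged
  //     writer.close();
  //   }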
  /**
   * Writes a collection that contains objects to the BigQuery table by first converting the data
   * to Protobuf messages, then using StreamWriter's append() to write the data at the current end
   * of the stream. If there is a schema update, the current StreamWriter is closed. A new
   * StreamWriter is created with the updated TableSchema.
   *
   * @param items The collection that contains objects to be written
   * @return {@code ApiFuture<AppendRowsResponse>} returns an AppendRowsResponse message wrapped in
   *     an ApiFuture
   */
  public ApiFuture<AppendRowsResponse> append(Iterable<T> items)
      throws IOException, DescriptorValidationException {
    return append(items, -1);
  }

  private void refreshWriter(TableSchema updatedSchema)
      throws DescriptorValidationException, IOException {
    Preconditions.checkNotNull(updatedSchema, "updatedSchema is null.");
    LOG.info("Refresh internal writer due to schema update, stream: " + this.streamName);
    // Close the current StreamWriter.
    this.streamWriter.close();
    // Update the SchemaAwareStreamWriter's TableSchema and Descriptor.
    this.tableSchema = updatedSchema;
    this.descriptor =
        BQTableSchemaToProtoDescriptor.convertBQTableSchemaToProtoDescriptor(updatedSchema);
    this.protoSchema = ProtoSchemaConverter.convert(this.descriptor);
    // Create a new underlying StreamWriter with the updated TableSchema and Descriptor.
    this.streamWriter = streamWriterBuilder.setWriterSchema(this.protoSchema).build();
  }

  private List<DynamicMessage> buildMessage(Iterable<T> items)
      throws InterruptedException, DescriptorValidationException, IOException {
    try {
      return this.toProtoConverter.convertToProtoMessage(
          this.descriptor, this.tableSchema, items, ignoreUnknownFields);
    } catch (RowIndexToErrorException ex) {
      // We only retry on data-unknown errors.
      if (!ex.hasDataUnknownError) {
        throw ex;
      }
      // Return the error directly when stream writer refresh is disabled.
      if (this.skipRefreshStreamWriter) {
        throw ex;
      }
      LOG.warning(
          "Saw unknown field error during proto message conversion, with error messages: "
              + ex.rowIndexToErrorMessage
              + "; trying to refresh the writer with an updated schema, stream: "
              + streamName);
      GetWriteStreamRequest writeStreamRequest =
          GetWriteStreamRequest.newBuilder()
              .setName(this.streamName)
              .setView(WriteStreamView.FULL)
              .build();
      WriteStream writeStream = client.getWriteStream(writeStreamRequest);
      refreshWriter(writeStream.getTableSchema());
      return this.toProtoConverter.convertToProtoMessage(
          this.descriptor, this.tableSchema, items, ignoreUnknownFields);
    }
  }
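  // A sketch of a custom ToProtoConverter, shaped after the call site in buildMessage() above;
  // MyRow and toDynamicMessage(...) are hypothetical placeholders:
  //
  //   ToProtoConverter<MyRow> myRowConverter =
  //       new ToProtoConverter<MyRow>() {
  //         @Override
  //         public List<DynamicMessage> convertToProtoMessage(
  //             Descriptor descriptor,
  //             TableSchema tableSchema,
  //             Iterable<MyRow> items,
  //             boolean ignoreUnknownFields) {
  //           List<DynamicMessage> messages = new ArrayList<>();
  //           for (MyRow row : items) {
  //             // Build a DynamicMessage matching `descriptor` from the fields of `row`.
  //             messages.add(toDynamicMessage(descriptor, row));
  //           }
  //           return messages;
  //         }
  //       };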
  /**
   * Writes a collection that contains objects to the BigQuery table by first converting the data
   * to Protobuf messages, then using StreamWriter's append() to write the data at the specified
   * offset. If there is a schema update, the current StreamWriter is closed. A new StreamWriter is
   * created with the updated TableSchema.
   *
   * @param items The collection that contains objects to be written
   * @param offset Offset for deduplication
   * @return {@code ApiFuture<AppendRowsResponse>} returns an AppendRowsResponse message wrapped in
   *     an ApiFuture
   */
  public ApiFuture<AppendRowsResponse> append(Iterable<T> items, long offset)
      throws IOException, DescriptorValidationException {
    // Handle schema updates in a thread-safe way by locking down the operation.
    synchronized (this) {
      // Create a new stream writer internally if an updated schema is reported from the backend.
      if (!this.skipRefreshStreamWriter && this.streamWriter.getUpdatedSchema() != null) {
        refreshWriter(this.streamWriter.getUpdatedSchema());
      }
      ProtoRows.Builder rowsBuilder = ProtoRows.newBuilder();
      // Any error in convertToProtoMessage will throw an
      // IllegalArgumentException/IllegalStateException/NullPointerException.
      // IllegalArgumentExceptions will be collected into a map of row indexes to error messages.
      // After the conversion is finished, an AppendSerializationError exception that contains all
      // the conversion errors will be thrown.
      Map<Integer, String> rowIndexToErrorMessage = new HashMap<>();
      try {
        List<DynamicMessage> protoMessages = buildMessage(items);
        for (DynamicMessage dynamicMessage : protoMessages) {
          rowsBuilder.addSerializedRows(dynamicMessage.toByteString());
        }
      } catch (RowIndexToErrorException exception) {
        rowIndexToErrorMessage = exception.rowIndexToErrorMessage;
      } catch (InterruptedException ex) {
        throw new RuntimeException(ex);
      }
      if (!rowIndexToErrorMessage.isEmpty()) {
        throw new AppendSerializationError(
            Code.INVALID_ARGUMENT.getNumber(),
            "Append serialization failed for writer: " + streamName,
            streamName,
            rowIndexToErrorMessage);
      }
      return this.streamWriter.append(rowsBuilder.build(), offset);
    }
  }

  /** @return The name of the write stream associated with this writer. */
  public String getStreamName() {
    return this.streamName;
  }

  /** @return A unique Id for this writer. */
  public String getWriterId() {
    return streamWriter.getWriterId();
  }

  /**
   * Gets the current descriptor
   *
   * @return Descriptor
   */
  public Descriptor getDescriptor() {
    return this.descriptor;
  }

  /**
   * Gets the location of the destination
   *
   * @return Location
   */
  public String getLocation() {
    return this.streamWriter.getLocation();
  }

  /**
   * Returns the wait time of a request on the client side before it is sent to the server. A
   * request can wait in the client because it reached the client-side inflight request limit
   * (adjustable when constructing the writer). The value is the wait time for the last sent
   * request. A constantly high wait value indicates a need for more throughput; you can create a
   * new stream to increase the throughput in the exclusive stream case, or create a new writer in
   * the default stream case.
   */
  public long getInflightWaitSeconds() {
    return streamWriter.getInflightWaitSeconds();
  }

  /**
   * Sets the missing value interpretation map for the SchemaAwareStreamWriter. The input
   * missingValueInterpretationMap is used for all append requests unless otherwise changed.
   *
   * @param missingValueInterpretationMap the missing value interpretation map used by the
   *     SchemaAwareStreamWriter.
   */
  public void setMissingValueInterpretationMap(
      Map<String, AppendRowsRequest.MissingValueInterpretation> missingValueInterpretationMap) {
    streamWriter.setMissingValueInterpretationMap(missingValueInterpretationMap);
  }

  /** @return the missing value interpretation map used for the writer. */
  public Map<String, AppendRowsRequest.MissingValueInterpretation>
      getMissingValueInterpretationMap() {
    return streamWriter.getMissingValueInterpretationMap();
  }
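  // Row-level conversion failures from the append() methods above surface as
  // AppendSerializationError; a handling sketch (assuming the exception exposes its row-index map
  // via getRowIndexToErrorMessage(), and `rows` is the appended collection):
  //
  //   try {
  //     writer.append(rows);
  //   } catch (AppendSerializationError e) {
  //     for (Map.Entry<Integer, String> bad : e.getRowIndexToErrorMessage().entrySet()) {
  //       // bad.getKey() is the index of the failed row in `rows`; bad.getValue() explains why
  //       // it failed to serialize. Drop or repair those rows, then retry the rest.
  //     }
  //   }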
  /** Sets all StreamWriter settings. */
  private void setStreamWriterSettings(
      @Nullable TransportChannelProvider channelProvider,
      @Nullable CredentialsProvider credentialsProvider,
      @Nullable ExecutorProvider executorProvider,
      @Nullable String endpoint,
      @Nullable FlowControlSettings flowControlSettings,
      @Nullable String traceIdBase,
      @Nullable String traceId,
      @Nullable String compressorName,
      @Nullable RetrySettings retrySettings) {
    if (channelProvider != null) {
      streamWriterBuilder.setChannelProvider(channelProvider);
    }
    if (credentialsProvider != null) {
      streamWriterBuilder.setCredentialsProvider(credentialsProvider);
    }
    if (executorProvider != null) {
      streamWriterBuilder.setExecutorProvider(executorProvider);
    }
    if (endpoint != null) {
      streamWriterBuilder.setEndpoint(endpoint);
    }
    if (traceIdBase != null) {
      if (traceId != null) {
        streamWriterBuilder.setTraceId(traceIdBase + "_" + traceId);
      } else {
        streamWriterBuilder.setTraceId(traceIdBase + ":null");
      }
    } else {
      if (traceId != null) {
        streamWriterBuilder.setTraceId("SchemaAwareStreamWriter_" + traceId);
      } else {
        streamWriterBuilder.setTraceId("SchemaAwareStreamWriter:null");
      }
    }
    if (flowControlSettings != null) {
      if (flowControlSettings.getMaxOutstandingRequestBytes() != null) {
        streamWriterBuilder.setMaxInflightBytes(
            flowControlSettings.getMaxOutstandingRequestBytes());
      }
      if (flowControlSettings.getMaxOutstandingElementCount() != null) {
        streamWriterBuilder.setMaxInflightRequests(
            flowControlSettings.getMaxOutstandingElementCount());
      }
      if (flowControlSettings.getLimitExceededBehavior() != null) {
        streamWriterBuilder.setLimitExceededBehavior(
            flowControlSettings.getLimitExceededBehavior());
      }
    }
    if (compressorName != null) {
      streamWriterBuilder.setCompressorName(compressorName);
    }
    if (retrySettings != null) {
      streamWriterBuilder.setRetrySettings(retrySettings);
    }
  }
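  // For reference, the trace ID composed above resolves as follows (read directly off the
  // branches in setStreamWriterSettings; the values are illustrative):
  //
  //   traceIdBase "myApp", traceId "job42" -> "myApp_job42"
  //   traceIdBase "myApp", traceId null    -> "myApp:null"
  //   traceIdBase null,    traceId "job42" -> "SchemaAwareStreamWriter_job42"
  //   traceIdBase null,    traceId null    -> "SchemaAwareStreamWriter:null"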

  /**
   * newBuilder that constructs a SchemaAwareStreamWriter builder with the BigQuery client being
   * initialized by StreamWriter by default.
   *
   * <p>The table schema passed in will be updated automatically when there is a schema update
   * event. When used for writer creation, it should be the latest schema. So if you are trying to
   * reuse a stream, use {@code Builder newBuilder(String streamOrTableName, BigQueryWriteClient
   * client)} instead, so the created writer is based on a fresh schema.
   *
   * @param streamOrTableName name of the stream that must follow
   *     "projects/[^/]+/datasets/[^/]+/tables/[^/]+/streams/[^/]+" or table name
   *     "projects/[^/]+/datasets/[^/]+/tables/[^/]+"
   * @param tableSchema The schema of the table when the stream was created, which is passed back
   *     through {@code WriteStream}
   * @param toProtoConverter converter used to convert items to proto messages
   * @return Builder
   */
  public static <T> Builder<T> newBuilder(
      String streamOrTableName, TableSchema tableSchema, ToProtoConverter<T> toProtoConverter) {
    Preconditions.checkNotNull(streamOrTableName, "StreamOrTableName is null.");
    Preconditions.checkNotNull(tableSchema, "TableSchema is null.");
    Preconditions.checkNotNull(toProtoConverter, "ToProtoConverter is null.");
    return new Builder<>(streamOrTableName, tableSchema, null, toProtoConverter);
  }
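  // A sketch of the schema-pinned variant above (MyRow, knownTableSchema, myRowConverter, and the
  // table path are illustrative). Note that passing a TableSchema explicitly disables the
  // automatic schema refresh described in the class docs:
  //
  //   SchemaAwareStreamWriter<MyRow> pinnedWriter =
  //       SchemaAwareStreamWriter.newBuilder(
  //               "projects/my-project/datasets/my_dataset/tables/my_table",
  //               knownTableSchema,
  //               myRowConverter)
  //           .build();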

  /**
   * newBuilder that constructs a SchemaAwareStreamWriter builder.
   *
   * <p>The table schema passed in will be updated automatically when there is a schema update
   * event. When used for writer creation, it should be the latest schema. So if you are trying to
   * reuse a stream, use {@code Builder newBuilder(String streamOrTableName, BigQueryWriteClient
   * client)} instead, so the created writer is based on a fresh schema.
   *
   * @param streamOrTableName name of the stream that must follow
   *     "projects/[^/]+/datasets/[^/]+/tables/[^/]+/streams/[^/]+"
   * @param tableSchema The schema of the table when the stream was created, which is passed back
   *     through {@code WriteStream}
   * @param client BigQueryWriteClient
   * @param toProtoConverter converter used to convert items to proto messages
   * @return Builder
   */
  public static <T> Builder<T> newBuilder(
      String streamOrTableName,
      TableSchema tableSchema,
      BigQueryWriteClient client,
      ToProtoConverter<T> toProtoConverter) {
    Preconditions.checkNotNull(streamOrTableName, "StreamOrTableName is null.");
    Preconditions.checkNotNull(tableSchema, "TableSchema is null.");
    Preconditions.checkNotNull(client, "BigQuery client is null.");
    Preconditions.checkNotNull(toProtoConverter, "ToProtoConverter is null.");
    return new Builder<>(streamOrTableName, tableSchema, client, toProtoConverter);
  }

  /**
   * newBuilder that constructs a SchemaAwareStreamWriter builder with the TableSchema being
   * initialized by StreamWriter by default.
   *
   * @param streamOrTableName name of the stream that must follow
   *     "projects/[^/]+/datasets/[^/]+/tables/[^/]+/streams/[^/]+"
   * @param client BigQueryWriteClient
   * @param toProtoConverter converter used to convert items to proto messages
   * @return Builder
   */
  public static <T> Builder<T> newBuilder(
      String streamOrTableName, BigQueryWriteClient client, ToProtoConverter<T> toProtoConverter) {
    Preconditions.checkNotNull(streamOrTableName, "StreamOrTableName is null.");
    Preconditions.checkNotNull(client, "BigQuery client is null.");
    Preconditions.checkNotNull(toProtoConverter, "ToProtoConverter is null.");
    return new Builder<>(streamOrTableName, null, client, toProtoConverter);
  }

  /** Closes the underlying StreamWriter. */
  @Override
  public void close() {
    this.streamWriter.close();
  }

  /**
   * @return whether the writer can no longer be used for writing, either because the
   *     SchemaAwareStreamWriter was explicitly closed or because the underlying connection broke
   *     when the connection pool is not used. The client should recreate the
   *     SchemaAwareStreamWriter in this case.
   */
  public boolean isClosed() {
    return this.streamWriter.isClosed();
  }

  /** @return whether the user explicitly closed the writer. */
  public boolean isUserClosed() {
    return this.streamWriter.isUserClosed();
  }
  public static final class Builder<T> {
    private final String streamName;
    private final BigQueryWriteClient client;
    private final TableSchema tableSchema;
    private final boolean skipRefreshStreamWriter;
    private final ToProtoConverter<T> toProtoConverter;

    private TransportChannelProvider channelProvider;
    private CredentialsProvider credentialsProvider;
    private ExecutorProvider executorProvider;
    private FlowControlSettings flowControlSettings;
    private String endpoint;
    private String traceIdBase;
    private String traceId;
    private boolean ignoreUnknownFields = false;
    // Indicates whether multiplexing mode is enabled.
    private boolean enableConnectionPool = false;
    private String location;
    private String compressorName;
    private RetrySettings retrySettings;

    private AppendRowsRequest.MissingValueInterpretation defaultMissingValueInterpretation =
        MissingValueInterpretation.MISSING_VALUE_INTERPRETATION_UNSPECIFIED;

    private static final String streamPatternString =
        "(projects/[^/]+/datasets/[^/]+/tables/[^/]+)/streams/[^/]+";
    private static final String tablePatternString = "(projects/[^/]+/datasets/[^/]+/tables/[^/]+)";
    private static final String defaultStreamPatternString = tablePatternString + "/_default";

    private static final Pattern streamPattern = Pattern.compile(streamPatternString);
    private static final Pattern tablePattern = Pattern.compile(tablePatternString);
    private static final Pattern defaultStreamPattern = Pattern.compile(defaultStreamPatternString);

    /**
     * Constructor for SchemaAwareStreamWriter's Builder
     *
     * @param streamOrTableName name of the stream that must follow
     *     "projects/[^/]+/datasets/[^/]+/tables/[^/]+/streams/[^/]+" or
     *     "projects/[^/]+/datasets/[^/]+/tables/[^/]+"
     * @param tableSchema schema used to convert items to proto messages.
     * @param client BigQueryWriteClient
     * @param toProtoConverter converter used to convert items to proto messages
     */
    private Builder(
        String streamOrTableName,
        TableSchema tableSchema,
        BigQueryWriteClient client,
        ToProtoConverter<T> toProtoConverter) {
      Matcher streamMatcher = streamPattern.matcher(streamOrTableName);
      if (!streamMatcher.matches()) {
        Matcher tableMatcher = tablePattern.matcher(streamOrTableName);
        Matcher defaultStreamMatcher = defaultStreamPattern.matcher(streamOrTableName);
        if (!tableMatcher.matches() && !defaultStreamMatcher.matches()) {
          throw new IllegalArgumentException("Invalid name: " + streamOrTableName);
        } else if (defaultStreamMatcher.matches()) {
          // Already a default stream name; use it as-is.
          this.streamName = streamOrTableName;
        } else {
          // A bare table name; route the writes to its default stream.
          this.streamName = streamOrTableName + "/_default";
        }
      } else {
        this.streamName = streamOrTableName;
      }
      this.client = client;
      if (tableSchema == null) {
        GetWriteStreamRequest writeStreamRequest =
            GetWriteStreamRequest.newBuilder()
                .setName(this.getStreamName())
                .setView(WriteStreamView.FULL)
                .build();
        WriteStream writeStream = this.client.getWriteStream(writeStreamRequest);
        this.tableSchema = writeStream.getTableSchema();
        this.location = writeStream.getLocation();
        this.skipRefreshStreamWriter = false;
      } else {
        this.tableSchema = tableSchema;
        this.skipRefreshStreamWriter = true;
      }
      this.toProtoConverter = toProtoConverter;
    }
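    // Name resolution in the constructor above, by example (paths are illustrative):
    //
    //   "projects/p/datasets/d/tables/t"            -> ".../tables/t/_default"
    //   "projects/p/datasets/d/tables/t/_default"   -> unchanged (already a default stream)
    //   "projects/p/datasets/d/tables/t/streams/s"  -> unchanged (explicit stream)
    //   "my-table"                                  -> IllegalArgumentException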
    /**
     * Setter for the underlying StreamWriter's TransportChannelProvider.
     *
     * @param channelProvider
     * @return Builder
     */
    public Builder<T> setChannelProvider(TransportChannelProvider channelProvider) {
      this.channelProvider =
          Preconditions.checkNotNull(channelProvider, "ChannelProvider is null.");
      return this;
    }

    /**
     * Setter for the underlying StreamWriter's CredentialsProvider.
     *
     * @param credentialsProvider
     * @return Builder
     */
    public Builder<T> setCredentialsProvider(CredentialsProvider credentialsProvider) {
      this.credentialsProvider =
          Preconditions.checkNotNull(credentialsProvider, "CredentialsProvider is null.");
      return this;
    }

    /**
     * Setter for the underlying StreamWriter's ExecutorProvider.
     *
     * @param executorProvider
     * @return Builder
     */
    public Builder<T> setExecutorProvider(ExecutorProvider executorProvider) {
      this.executorProvider =
          Preconditions.checkNotNull(executorProvider, "ExecutorProvider is null.");
      return this;
    }

    /**
     * Setter for the underlying StreamWriter's FlowControlSettings.
     *
     * @param flowControlSettings
     * @return Builder
     */
    public Builder<T> setFlowControlSettings(FlowControlSettings flowControlSettings) {
      this.flowControlSettings =
          Preconditions.checkNotNull(flowControlSettings, "FlowControlSettings is null.");
      return this;
    }

    /**
     * Stream name on the builder.
     *
     * @return the stream name
     */
    public String getStreamName() {
      return streamName;
    }

    /**
     * Setter for the underlying StreamWriter's Endpoint.
     *
     * @param endpoint
     * @return Builder
     */
    public Builder<T> setEndpoint(String endpoint) {
      this.endpoint = Preconditions.checkNotNull(endpoint, "Endpoint is null.");
      return this;
    }

    /**
     * Setter for a traceId to help identify traffic origin.
     *
     * @param traceId
     * @return Builder
     */
    public Builder<T> setTraceId(String traceId) {
      this.traceId = Preconditions.checkNotNull(traceId, "TraceId is null.");
      return this;
    }

    /**
     * Setter for a traceIdBase to help identify traffic origin.
     *
     * @param traceIdBase
     * @return Builder
     */
    public Builder<T> setTraceIdBase(String traceIdBase) {
      this.traceIdBase = Preconditions.checkNotNull(traceIdBase, "TraceIdBase is null.");
      return this;
    }

    /**
     * Setter for ignoreUnknownFields. If true, unknown fields to BigQuery will be ignored instead
     * of causing an error.
     *
     * @param ignoreUnknownFields
     * @return Builder
     */
    public Builder<T> setIgnoreUnknownFields(boolean ignoreUnknownFields) {
      this.ignoreUnknownFields = ignoreUnknownFields;
      return this;
    }

    /**
     * Enable multiplexing for this writer. In multiplexing mode, tables will share the same
     * connection if possible until the connection is overwhelmed. This feature is still under
     * development; please contact the Write API team before using it.
     *
     * @param enableConnectionPool
     * @return Builder
     */
    public Builder<T> setEnableConnectionPool(boolean enableConnectionPool) {
      this.enableConnectionPool = enableConnectionPool;
      return this;
    }

    /**
     * Location of the table this stream writer is targeting. Connection pools are shared by
     * location.
     *
     * @param location
     * @return Builder
     */
    public Builder<T> setLocation(String location) {
      if (this.location != null && !this.location.equals(location)) {
        throw new IllegalArgumentException(
            "Specified location " + location + " does not match the system value " + this.location);
      }
      this.location = location;
      return this;
    }

    /**
     * Sets the compression to use for the calls. The compressor must be of type gzip.
     *
     * @param compressorName
     * @return Builder
     */
    public Builder<T> setCompressorName(String compressorName) {
      this.compressorName = compressorName;
      return this;
    }

    /**
     * Sets the default missing value interpretation value if the column is not present in the
     * missing_value_interpretations map.
     */
    public Builder<T> setDefaultMissingValueInterpretation(
        AppendRowsRequest.MissingValueInterpretation defaultMissingValueInterpretation) {
      this.defaultMissingValueInterpretation = defaultMissingValueInterpretation;
      return this;
    }

    /**
     * Sets the RetrySettings to use for in-stream error retry.
     *
     * @param retrySettings
     * @return Builder
     */
    public Builder<T> setRetrySettings(RetrySettings retrySettings) {
      this.retrySettings = retrySettings;
      return this;
    }

    /**
     * Builds SchemaAwareStreamWriter
     *
     * @return SchemaAwareStreamWriter
     */
    public SchemaAwareStreamWriter<T> build()
        throws DescriptorValidationException, IllegalArgumentException, IOException,
            InterruptedException {
      return new SchemaAwareStreamWriter<>(this);
    }
  }
}
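// A hedged flow-control configuration sketch, using the mapping in setStreamWriterSettings
// (MaxOutstandingRequestBytes -> max inflight bytes, MaxOutstandingElementCount -> max inflight
// requests); tableName, client, MyRow, and myRowConverter are assumed to exist:
//
//   FlowControlSettings flow =
//       FlowControlSettings.newBuilder()
//           .setMaxOutstandingRequestBytes(10L * 1024L * 1024L) // ~10 MiB in flight
//           .setMaxOutstandingElementCount(1000L) // at most 1000 pending appends
//           .build();
//   SchemaAwareStreamWriter<MyRow> writer =
//       SchemaAwareStreamWriter.newBuilder(tableName, client, myRowConverter)
//           .setFlowControlSettings(flow)
//           .build();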