/*
* Copyright © 2021 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package io.cdap.plugin.gcp.bigquery.sink;
import com.google.cloud.bigquery.BigQuery;
import com.google.cloud.bigquery.BigQueryException;
import com.google.cloud.bigquery.Dataset;
import com.google.cloud.bigquery.DatasetId;
import com.google.cloud.bigquery.DatasetInfo;
import com.google.cloud.bigquery.EncryptionConfiguration;
import com.google.cloud.bigquery.Field;
import com.google.cloud.bigquery.FieldList;
import com.google.cloud.bigquery.LegacySQLTypeName;
import com.google.cloud.bigquery.Table;
import com.google.cloud.bigquery.TableId;
import com.google.cloud.hadoop.io.bigquery.BigQueryFileFormat;
import com.google.cloud.kms.v1.CryptoKeyName;
import com.google.cloud.storage.Bucket;
import com.google.cloud.storage.Storage;
import com.google.cloud.storage.StorageException;
import com.google.common.reflect.TypeToken;
import com.google.gson.Gson;
import io.cdap.cdap.api.data.schema.Schema;
import io.cdap.cdap.etl.api.FailureCollector;
import io.cdap.cdap.etl.api.batch.BatchSinkContext;
import io.cdap.cdap.etl.api.validation.ValidationFailure;
import io.cdap.plugin.common.Asset;
import io.cdap.plugin.common.LineageRecorder;
import io.cdap.plugin.gcp.bigquery.sink.lib.BigQueryOutputConfiguration;
import io.cdap.plugin.gcp.bigquery.sink.lib.BigQueryTableFieldSchema;
import io.cdap.plugin.gcp.bigquery.sink.lib.BigQueryTableSchema;
import io.cdap.plugin.gcp.bigquery.util.BigQueryConstants;
import io.cdap.plugin.gcp.bigquery.util.BigQueryTypeSize.Numeric;
import io.cdap.plugin.gcp.bigquery.util.BigQueryUtil;
import io.cdap.plugin.gcp.common.GCPUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import java.io.IOException;
import java.lang.reflect.Type;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.function.Supplier;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import javax.annotation.Nullable;
/**
* Utility class for the BigQuery DelegatingMultiSink.
*
* The logic in this class has been extracted from the {@link AbstractBigQuerySink} in order to make this functionality
* available to other classes in this package.
*/
public final class BigQuerySinkUtils {
public static final String GS_PATH_FORMAT = "gs://%s/%s";
private static final String TEMPORARY_BUCKET_FORMAT = GS_PATH_FORMAT + "/input/%s-%s";
private static final String DATETIME = "DATETIME";
private static final String RECORD = "RECORD";
private static final String JSON = "JSON";
private static final Gson GSON = new Gson();
private static final Type LIST_OF_FIELD_TYPE = new TypeToken<List<BigQueryTableFieldSchema>>() {
}.getType();
// Fields used to build update/upsert queries
private static final String CRITERIA_TEMPLATE = "T.`%s` = S.`%s`";
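// SOURCE_DATA_QUERY de-duplicates the staged source data: rows are numbered per key (optionally using the
// user-supplied ORDER BY fields) and only the first row for each key is kept.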
private static final String SOURCE_DATA_QUERY = "(SELECT * FROM (SELECT row_number() OVER (PARTITION BY %s%s) " +
"as rowid, * FROM %s) where rowid = 1)";
private static final String UPDATE_QUERY = "UPDATE %s T SET %s FROM %s S WHERE %s";
private static final String UPSERT_QUERY = "MERGE %s T USING %s S ON %s WHEN MATCHED THEN UPDATE SET %s " +
"WHEN NOT MATCHED THEN INSERT (%s) VALUES(%s)";
private static final String INSERT_ONLY_UPSERT_QUERY = "MERGE %s T USING %s S ON %s WHEN NOT MATCHED THEN " +
"INSERT (%s) VALUES(%s)";
private static final List<String> COMPARISON_OPERATORS =
Arrays.asList("=", "<", ">", "<=", ">=", "!=", "<>",
"LIKE", "NOT LIKE", "BETWEEN", "NOT BETWEEN", "IN", "NOT IN", "IS NULL", "IS NOT NULL",
"IS TRUE", "IS NOT TRUE", "IS FALSE", "IS NOT FALSE");
public static final String BYTES_PROCESSED_METRIC = "bytes.processed";
/**
* Creates the given dataset and bucket if they do not already exist. If the dataset already exists but the
* bucket does not, the bucket is created in the same location as the dataset. If the bucket already exists
* but the dataset does not, an attempt is made to create the dataset in the same location as the bucket. This may
* fail if the bucket is in a location that BigQuery does not yet support.
*
* @param bigQuery the bigquery client for the project
* @param storage the storage client for the project
* @param datasetId the Id of the dataset
* @param bucketName the name of the bucket
* @param location the location of the resources; only applied if neither the bucket nor the dataset exists
* @param cmekKeyName the name of the cmek key
* @throws IOException if there was an error creating or fetching any GCP resource
*/
public static void createResources(BigQuery bigQuery, Storage storage,
DatasetId datasetId, String bucketName, @Nullable String location,
@Nullable CryptoKeyName cmekKeyName) throws IOException {
// Get the required Dataset and bucket instances using the supplied clients
Dataset dataset = bigQuery.getDataset(datasetId);
Bucket bucket = storage.get(bucketName);
createResources(bigQuery, dataset, datasetId, storage, bucket, bucketName, location, cmekKeyName);
}
/**
* Creates the given dataset and bucket if the supplied ones are null.
*
* If the dataset already exists but the
* bucket does not, the bucket is created in the same location as the dataset. If the bucket already exists
* but the dataset does not, an attempt is made to create the dataset in the same location as the bucket. This may
* fail if the bucket is in a location that BigQuery does not yet support.
*
* @param bigQuery the bigquery client for the project
* @param dataset the bigquery dataset instance (may be null)
* @param datasetId the Id of the dataset
* @param storage the storage client for the project
* @param bucket the storage bucket instance (may be null)
* @param bucketName the name of the bucket
* @param location the location of the resources; only applied if neither the bucket nor the dataset exists
* @param cmekKey the name of the cmek key
* @throws IOException if there was an error creating either GCP resource
*/
public static void createResources(BigQuery bigQuery, @Nullable Dataset dataset, DatasetId datasetId,
Storage storage, @Nullable Bucket bucket, String bucketName,
@Nullable String location, @Nullable CryptoKeyName cmekKey) throws IOException {
if (dataset == null && bucket == null) {
createBucket(storage, bucketName, location, cmekKey,
() -> String.format("Unable to create Cloud Storage bucket '%s'", bucketName));
createDataset(bigQuery, datasetId, location, cmekKey,
() -> String.format("Unable to create BigQuery dataset '%s.%s'", datasetId.getProject(),
datasetId.getDataset()));
} else if (bucket == null) {
createBucket(
storage, bucketName, dataset.getLocation(), cmekKey,
() -> String.format(
"Unable to create Cloud Storage bucket '%s' in the same location ('%s') as BigQuery dataset '%s'. "
+ "Please use a bucket that is in the same location as the dataset.",
bucketName, dataset.getLocation(), datasetId.getProject() + "." + datasetId.getDataset()));
} else if (dataset == null) {
createDataset(
bigQuery, datasetId, bucket.getLocation(), cmekKey,
() -> String.format(
"Unable to create BigQuery dataset '%s' in the same location ('%s') as Cloud Storage bucket '%s'. "
+ "Please use a bucket that is in a supported location.",
datasetId, bucket.getLocation(), bucketName));
}
}
/**
* Creates a Dataset in the specified location using the supplied BigQuery client.
*
* @param bigQuery the bigQuery client.
* @param dataset the Id of the dataset to create.
* @param location Location for this dataset.
* @param cmekKeyName CMEK key to use for this dataset.
* @param errorMessage Supplier for the error message to output if the dataset could not be created.
* @throws IOException if the dataset could not be created.
*/
private static void createDataset(BigQuery bigQuery, DatasetId dataset, @Nullable String location,
@Nullable CryptoKeyName cmekKeyName,
Supplier<String> errorMessage) throws IOException {
DatasetInfo.Builder builder = DatasetInfo.newBuilder(dataset);
if (location != null) {
builder.setLocation(location);
}
if (cmekKeyName != null) {
builder.setDefaultEncryptionConfiguration(
EncryptionConfiguration.newBuilder().setKmsKeyName(cmekKeyName.toString()).build());
}
try {
bigQuery.create(builder.build());
} catch (BigQueryException e) {
if (e.getCode() != 409) {
// A conflict means the dataset already exists (https://cloud.google.com/bigquery/troubleshooting-errors)
// This most likely means multiple stages in the same pipeline are trying to create the same dataset.
// Ignore this and move on, since all that matters is that the dataset exists.
throw new IOException(errorMessage.get(), e);
}
}
}
/**
* Creates a Dataset in the specified location using the supplied BigQuery client if it does not exist.
*
* @param bigQuery the bigQuery client.
* @param datasetId the Id of the dataset to create.
* @param location Location for this dataset.
* @param cmekKeyName CMEK key to use for this dataset.
* @param errorMessage Supplier for the error message to output if the dataset could not be created.
* @throws IOException if the dataset could not be created.
*/
public static void createDatasetIfNotExists(BigQuery bigQuery, DatasetId datasetId, @Nullable String location,
@Nullable CryptoKeyName cmekKeyName,
Supplier<String> errorMessage) throws IOException {
// Check if dataset exists
Dataset ds = bigQuery.getDataset(datasetId);
// Create dataset if needed
if (ds == null) {
createDataset(bigQuery, datasetId, location, cmekKeyName, errorMessage);
}
}
/**
* Creates the specified GCS bucket using the supplied GCS client.
*
* @param storage GCS Client.
* @param bucket Bucket Name.
* @param location Location for this bucket.
* @param cmekKeyName CMEK key to use for this bucket.
* @param errorMessage Supplier for the error message to output if the bucket could not be created.
* @throws IOException if the bucket could not be created.
*/
private static void createBucket(Storage storage, String bucket, @Nullable String location,
@Nullable CryptoKeyName cmekKeyName,
Supplier<String> errorMessage) throws IOException {
try {
GCPUtils.createBucket(storage, bucket, location, cmekKeyName);
} catch (StorageException e) {
if (e.getCode() != 409) {
// A conflict means the bucket already exists
// This most likely means multiple stages in the same pipeline are trying to create the same bucket.
// Ignore this and move on, since all that matters is that the bucket exists.
throw new IOException(errorMessage.get(), e);
}
}
}
/**
* Updates {@link Configuration} with bucket details.
* Uses provided bucket, otherwise uses provided runId as a bucket name.
*
* @return bucket name
*/
public static String configureBucket(Configuration baseConfiguration, @Nullable String bucket, String runId) {
boolean deleteBucket = false;
// If the bucket is null, assign the run ID as the bucket name and mark the bucket for deletion.
if (bucket == null) {
bucket = runId;
deleteBucket = true;
}
return configureBucket(baseConfiguration, bucket, runId, deleteBucket);
}
/**
* Updates {@link Configuration} with bucket details.
* Uses provided bucket, otherwise uses provided runId as a bucket name.
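* For example (hypothetical values), with bucket {@code my-bucket} and runId {@code run-1234},
* {@code fs.default.name} is set to {@code gs://my-bucket/run-1234}.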
*
* @return bucket name
*/
public static String configureBucket(Configuration baseConfiguration,
String bucket,
String runId,
boolean deleteBucket) {
if (deleteBucket) {
// By default, this option is false, meaning the job cannot delete the bucket.
// So enable it only when the bucket name is not provided.
baseConfiguration.setBoolean("fs.gs.bucket.delete.enable", true);
}
baseConfiguration.set("fs.default.name", String.format(GS_PATH_FORMAT, bucket, runId));
baseConfiguration.setBoolean("fs.gs.impl.disable.cache", true);
baseConfiguration.setBoolean("fs.gs.metadata.cache.enable", false);
return bucket;
}
/**
* Sets string fields to JSON type if they are present in the provided list of fields.
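* For example (hypothetical fields), with a top-level STRING field {@code settings} and a RECORD field
* {@code data} containing a STRING field {@code payload}, passing {@code "settings,data.payload"} switches
* both fields to the JSON type; nested fields are addressed by their dot-separated path.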
* @param fields list of BigQuery table fields.
* @param jsonStringFields Comma separated list of fields that should be set to JSON type.
*
*/
public static void setJsonStringFields(List<BigQueryTableFieldSchema> fields,
String jsonStringFields) {
Set<String> jsonFields = new HashSet<>(Arrays.asList(jsonStringFields.split(",")));
setJsonStringFields(fields, jsonFields, new ArrayList<>());
}
private static void setJsonStringFields(List<BigQueryTableFieldSchema> fields, Set<String> jsonFields,
List<String> path) {
for (BigQueryTableFieldSchema field : fields) {
String fieldName = field.getName();
String fieldType = field.getType();
String separator = path.isEmpty() ? "" : ".";
String fieldPath = String.join(".", path) + separator + fieldName;
if (jsonFields.contains(fieldPath) && fieldType.equals(LegacySQLTypeName.STRING.name())) {
field.setType(LegacySQLTypeName.valueOf(JSON).name());
} else if (field.getType().equals(LegacySQLTypeName.RECORD.name())) {
path.add(fieldName);
setJsonStringFields(field.getFields(), jsonFields, path);
path.remove(path.size() - 1);
}
}
}
/**
* Configures output for Sink
*
* @param configuration Hadoop configuration instance
* @param datasetId id of the dataset to use
* @param tableName name of the table to use
* @param gcsPath GCS path to use for output
* @param fields list of BigQuery table fields
* @throws IOException if the output cannot be configured
*/
public static void configureOutput(Configuration configuration,
DatasetId datasetId,
String tableName,
String gcsPath,
List<BigQueryTableFieldSchema> fields) throws IOException {
// Set up table schema
BigQueryTableSchema outputTableSchema = new BigQueryTableSchema();
if (!fields.isEmpty()) {
String jsonStringFields = configuration.get(BigQueryConstants.CONFIG_JSON_STRING_FIELDS, null);
if (jsonStringFields != null) {
setJsonStringFields(fields, jsonStringFields);
}
outputTableSchema.setFields(fields);
}
BigQueryFileFormat fileFormat = getFileFormat(fields);
BigQueryOutputConfiguration.configure(
configuration,
String.format("%s:%s.%s", datasetId.getProject(), datasetId.getDataset(), tableName),
outputTableSchema,
gcsPath,
fileFormat,
getOutputFormat(fileFormat));
}
/**
* Configures output for MultiSink
*
* @param configuration Hadoop configuration instance
* @param datasetId id of the dataset to use
* @param tableName name of the table to use
* @param gcsPath GCS path to use for output
* @param fields list of BigQuery table fields
* @throws IOException if the output cannot be configured
*/
public static void configureMultiSinkOutput(Configuration configuration,
DatasetId datasetId,
String tableName,
String gcsPath,
List<BigQueryTableFieldSchema> fields) throws IOException {
configureOutput(configuration,
datasetId,
tableName,
gcsPath,
fields);
// Set the operation to INSERT. Currently the BQ MultiSink only supports the insert operation.
configuration.set(BigQueryConstants.CONFIG_OPERATION, Operation.INSERT.name());
}
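/**
* Builds the temporary GCS path used to stage data for the BigQuery load job. The resulting path has the form
* {@code gs://<bucket>/<pathPrefix>/input/<tableName>-<pathPrefix>}, for example (hypothetical values)
* {@code gs://my-bucket/run-1234/input/events-run-1234}.
*/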
public static String getTemporaryGcsPath(String bucket, String pathPrefix, String tableName) {
return String.format(TEMPORARY_BUCKET_FORMAT, bucket, pathPrefix, tableName, pathPrefix);
}
public static List<BigQueryTableFieldSchema> getBigQueryTableFieldsFromSchema(Schema tableSchema) {
List<Schema.Field> inputFields = Objects.requireNonNull(tableSchema.getFields(), "Schema must have fields");
return inputFields.stream()
.map(BigQuerySinkUtils::generateTableFieldSchema)
.collect(Collectors.toList());
}
/**
* Relaxes the Destination Table Schema based on the matching field names from the source table
*
* @param bigquery BigQuery client
* @param sourceTable source table, which contains the updated field definition
* @param destinationTable destination table, whose field definitions may be relaxed depending on the source table.
*/
public static void relaxTableSchema(BigQuery bigquery,
Table sourceTable,
Table destinationTable) {
List<Field> sourceFields = sourceTable.getDefinition().getSchema().getFields();
List<Field> destinationFields = destinationTable.getDefinition().getSchema().getFields();
relaxTableSchema(bigquery, destinationTable, sourceFields, destinationFields);
}
/**
* Relaxes the Destination Table Schema based on the matching field names from the source table
*
* @param bigquery BigQuery client
* @param destinationTable destination table, whose field definitions may be relaxed depending on the source fields.
* @param sourceFields fields in the source table that need to be used to relax the destination table
* @param destinationFields fields in the destination table that may be relaxed depending on the source fields
*/
public static void relaxTableSchema(BigQuery bigquery,
Table destinationTable,
List<Field> sourceFields,
List<Field> destinationFields) {
List<Field> resultFieldsList = getRelaxedTableFields(sourceFields, destinationFields);
// Update table definition, relaxing field definitions.
com.google.cloud.bigquery.Schema newSchema = com.google.cloud.bigquery.Schema.of(resultFieldsList);
bigquery.update(
destinationTable.toBuilder().setDefinition(
destinationTable.getDefinition().toBuilder().setSchema(newSchema).build()
).build()
);
}
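/**
* Computes the relaxed field list for the destination table: destination-only fields are kept as-is, source
* fields are added, and any field that is REQUIRED in the source but NULLABLE in the destination is relaxed
* to NULLABLE so the existing destination schema is not tightened. For example (hypothetical fields), if the
* destination declares {@code name} as NULLABLE and the source declares it as REQUIRED, the merged result keeps
* {@code name} NULLABLE.
*/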
public static List<Field> getRelaxedTableFields(List<Field> sourceFields,
List<Field> destinationFields) {
// Collect all fields from the destination table into a Map for quick lookups by name
Map<String, Field> destinationFieldMap = destinationFields.stream()
.collect(Collectors.toMap(Field::getName, x -> x));
// Collect all fields from the source table into a Map for quick lookups by name
Map<String, Field> sourceFieldMap = sourceFields.stream()
.collect(Collectors.toMap(Field::getName, x -> x));
// Collects all fields in the destination table that are not present in the source table, in order to retain them
// as-is in the destination schema
List<Field> resultingFields = destinationFields.stream()
.filter(field -> !sourceFieldMap.containsKey(field.getName()))
.collect(Collectors.toList());
// Add fields from the source table into the destination table.
// Fields that are NULLABLE in the destination table must remain NULLABLE even when the source defines them
// as REQUIRED, so any source field that already exists in the destination table is relaxed accordingly.
sourceFieldMap.values().stream()
.map(sourceField -> {
String fieldName = sourceField.getName();
if (destinationFieldMap.containsKey(fieldName)) {
Field destinationField = destinationFieldMap.get(fieldName);
// If the field is nullable in the destination table but not in the source table, update the field's
// mode to NULLABLE to ensure the underlying schema doesn't change.
if (destinationField.getMode() == Field.Mode.NULLABLE && sourceField.getMode() == Field.Mode.REQUIRED) {
sourceField = sourceField.toBuilder().setMode(Field.Mode.NULLABLE).build();
}
}
return sourceField;
})
.forEach(resultingFields::add);
return resultingFields;
}
private static BigQueryTableFieldSchema generateTableFieldSchema(Schema.Field field) {
BigQueryTableFieldSchema fieldSchema = new BigQueryTableFieldSchema();
fieldSchema.setName(field.getName());
fieldSchema.setMode(getMode(field.getSchema()).name());
LegacySQLTypeName type = getTableDataType(field.getSchema());
fieldSchema.setType(type.name());
if (type == LegacySQLTypeName.RECORD) {
List<Schema.Field> schemaFields;
Schema fieldCdapSchema = BigQueryUtil.getNonNullableSchema(field.getSchema());
// If it's an array of records, we need to get the component schema of the array (which will be the
// record, and may itself be nullable) and then get the fields of that record.
if (Schema.Type.ARRAY == fieldCdapSchema.getType()) {
schemaFields = Objects.requireNonNull(
BigQueryUtil.getNonNullableSchema(fieldCdapSchema.getComponentSchema()).getFields());
} else {
schemaFields = fieldCdapSchema.getFields();
}
fieldSchema.setFields(Objects.requireNonNull(schemaFields).stream()
.map(BigQuerySinkUtils::generateTableFieldSchema)
.collect(Collectors.toList()));
}
return fieldSchema;
}
public static com.google.cloud.bigquery.Schema convertCdapSchemaToBigQuerySchema(Schema schema) {
List<Schema.Field> inputFields = Objects.requireNonNull(schema.getFields(), "Schema must have fields");
List<Field> fields = inputFields.stream()
.map(BigQuerySinkUtils::convertCdapFieldToBigQueryField)
.collect(Collectors.toList());
return com.google.cloud.bigquery.Schema.of(fields);
}
private static Field convertCdapFieldToBigQueryField(Schema.Field field) {
String name = field.getName();
LegacySQLTypeName type = getTableDataType(field.getSchema());
Field.Mode mode = getMode(field.getSchema());
Field.Builder fieldBuilder;
// For record fields, we need to get all subfields and re-create the builder.
if (type == LegacySQLTypeName.RECORD) {
List<Schema.Field> schemaFields;
Schema fieldCdapSchema = BigQueryUtil.getNonNullableSchema(field.getSchema());
// If it's an array of records, we need to get the component schema of the array (which will be the
// record, and may itself be nullable) and then get the fields of that record.
if (Schema.Type.ARRAY == fieldCdapSchema.getType()) {
schemaFields = Objects.requireNonNull(
BigQueryUtil.getNonNullableSchema(fieldCdapSchema.getComponentSchema()).getFields());
} else {
schemaFields = fieldCdapSchema.getFields();
}
FieldList subFields = FieldList.of(Objects.requireNonNull(schemaFields).stream()
.map(BigQuerySinkUtils::convertCdapFieldToBigQueryField)
.collect(Collectors.toList()));
fieldBuilder = Field.newBuilder(name, type, subFields);
} else {
fieldBuilder = Field.newBuilder(name, type);
}
fieldBuilder.setMode(mode);
// Set precision and scale for numeric fields
if (type == LegacySQLTypeName.NUMERIC || type == LegacySQLTypeName.BIGNUMERIC) {
Schema decimalFieldSchema = BigQueryUtil.getNonNullableSchema(field.getSchema());
fieldBuilder.setPrecision((long) decimalFieldSchema.getPrecision());
fieldBuilder.setScale((long) decimalFieldSchema.getScale());
}
return fieldBuilder.build();
}
private static Field.Mode getMode(Schema schema) {
boolean isNullable = schema.isNullable();
Schema.Type nonNullableType = isNullable ? schema.getNonNullable().getType() : schema.getType();
if (isNullable && nonNullableType != Schema.Type.ARRAY) {
return Field.Mode.NULLABLE;
} else if (nonNullableType == Schema.Type.ARRAY) {
return Field.Mode.REPEATED;
}
return Field.Mode.REQUIRED;
}
/**
* This function returns the LegacySQLTypeName that maps to the given CDAP Schema.
* If the CDAP Schema is an array, it returns the LegacySQLTypeName of the array's component type.
*/
private static LegacySQLTypeName getTableDataType(Schema schema) {
schema = BigQueryUtil.getNonNullableSchema(schema);
Schema.LogicalType logicalType = schema.getLogicalType();
if (logicalType != null) {
switch (logicalType) {
case DATE:
return LegacySQLTypeName.DATE;
case TIME_MILLIS:
case TIME_MICROS:
return LegacySQLTypeName.TIME;
case TIMESTAMP_MILLIS:
case TIMESTAMP_MICROS:
return LegacySQLTypeName.TIMESTAMP;
case DECIMAL:
// Following the restrictions given by:
// https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#numeric_types
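// For example, a decimal(10, 2) value fits within NUMERIC (precision up to 38, scale up to 9,
// integer digits up to 29), while a decimal(50, 10) value falls back to BIGNUMERIC.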
if ((schema.getScale() <= Numeric.SCALE) && (schema.getPrecision() <= Numeric.PRECISION) &&
((schema.getPrecision() - schema.getScale()) <= (Numeric.PRECISION - Numeric.SCALE))) {
return LegacySQLTypeName.NUMERIC;
}
return LegacySQLTypeName.BIGNUMERIC;
case DATETIME:
return LegacySQLTypeName.DATETIME;
default:
throw new IllegalStateException("Unsupported type " + logicalType.getToken());
}
}
Schema.Type type = schema.getType();
switch (type) {
case INT:
case LONG:
return LegacySQLTypeName.INTEGER;
case STRING:
return LegacySQLTypeName.STRING;
case FLOAT:
case DOUBLE:
return LegacySQLTypeName.FLOAT;
case BOOLEAN:
return LegacySQLTypeName.BOOLEAN;
case BYTES:
return LegacySQLTypeName.BYTES;
case ARRAY:
return getTableDataType(schema.getComponentSchema());
case RECORD:
return LegacySQLTypeName.RECORD;
default:
throw new IllegalStateException("Unsupported type " + type);
}
}
private static BigQueryFileFormat getFileFormat(List<BigQueryTableFieldSchema> fields) {
for (BigQueryTableFieldSchema field : fields) {
if (DATETIME.equals(field.getType())) {
return BigQueryFileFormat.NEWLINE_DELIMITED_JSON;
}
// If the field name contains non-English characters, use the JSON format.
// We do this because the Avro load job in BigQuery does not currently support non-English characters in
// field names.
String fieldName = field.getName();
final String englishCharactersRegex = "[\\w]+";
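// Note: without Pattern.UNICODE_CHARACTER_CLASS, \w matches only ASCII letters, digits, and '_'.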
if (!Pattern.matches(englishCharactersRegex, fieldName)) {
return BigQueryFileFormat.NEWLINE_DELIMITED_JSON;
}
// If the field is a record we have to check its subfields.
if (RECORD.equals(field.getType())) {
if (getFileFormat(field.getFields()) == BigQueryFileFormat.NEWLINE_DELIMITED_JSON) {
return BigQueryFileFormat.NEWLINE_DELIMITED_JSON;
}
}
}
return BigQueryFileFormat.AVRO;
}
private static Class<? extends FileOutputFormat> getOutputFormat(BigQueryFileFormat fileFormat) {
if (fileFormat == BigQueryFileFormat.NEWLINE_DELIMITED_JSON) {
return TextOutputFormat.class;
}
return AvroOutputFormat.class;
}
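/**
* Builds the UPDATE or MERGE statement used to apply the de-duplicated source (staging) table to the
* destination table. Source rows are first de-duplicated on the key columns, keeping one row per key,
* optionally chosen by the supplied ORDER BY fields.
*
* Illustrative example with hypothetical names: for an UPSERT on key {@code id} with fields {@code id, name},
* the generated statement has the shape
* {@code MERGE `proj.ds.dst` T USING (de-duplicated `proj.ds.src`) S ON T.`id` = S.`id`
* WHEN MATCHED THEN UPDATE SET T.`name` = S.`name` WHEN NOT MATCHED THEN INSERT (`id`, `name`) VALUES(`id`, `name`)}.
*/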
public static String generateUpdateUpsertQuery(Operation operation,
TableId sourceTableId,
TableId destinationTableId,
List<String> tableFieldsList,
List<String> tableKeyList,
List<String> orderedByList,
String partitionFilter) {
String source = String.format("`%s.%s.%s`",
sourceTableId.getProject(),
sourceTableId.getDataset(),
sourceTableId.getTable());
String destination = String.format("`%s.%s.%s`",
destinationTableId.getProject(),
destinationTableId.getDataset(),
destinationTableId.getTable());
String criteria = tableKeyList.stream().map(s -> String.format(CRITERIA_TEMPLATE, s, s))
.collect(Collectors.joining(" AND "));
criteria = partitionFilter != null ? String.format("(%s) AND %s",
formatPartitionFilter(partitionFilter), criteria) : criteria;
String fieldsForUpdate = tableFieldsList.stream().filter(s -> !tableKeyList.contains(s))
.map(s -> String.format(CRITERIA_TEMPLATE, s, s)).collect(Collectors.joining(", "));
orderedByList = orderedByList.stream().map(s -> s.trim()).map(s -> {
StringBuilder sb = new StringBuilder("`").append(s.split(" ")[0]).append("` ");
if (s.split(" ").length > 1) {
sb.append(s.split(" ", 2)[1]);
}
return sb.toString();
}).collect(Collectors.toList());
String orderedBy = orderedByList.isEmpty() ? "" : " ORDER BY " + String.join(", ", orderedByList);
String sourceTable = String.format(SOURCE_DATA_QUERY, "`" + String.join("`, `", tableKeyList) + "`",
orderedBy, source);
switch (operation) {
case UPDATE:
return String.format(UPDATE_QUERY, destination, fieldsForUpdate, sourceTable, criteria);
case UPSERT:
String insertFields = "`" + String.join("`, `", tableFieldsList) + "`";
// If all fields are keys, there is nothing to update; only insert new rows.
if (fieldsForUpdate.isEmpty()) {
return String.format(INSERT_ONLY_UPSERT_QUERY, destination, sourceTable, criteria,
insertFields, insertFields);
}
return String.format(UPSERT_QUERY, destination, sourceTable, criteria, fieldsForUpdate,
insertFields, insertFields);
default:
return "";
}
}
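/**
* Prefixes the column reference on the left-hand side of each comparison operator in the partition filter with
* the destination table alias {@code T}. For example (hypothetical filter), {@code _PARTITIONDATE > '2021-01-01'}
* becomes {@code T._PARTITIONDATE > '2021-01-01'}.
*/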
private static String formatPartitionFilter(String partitionFilter) {
String[] queryWords = partitionFilter.split(" ");
int index = 0;
for (String word : queryWords) {
if (COMPARISON_OPERATORS.contains(word.toUpperCase())) {
queryWords[index - 1] = "T." + queryWords[index - 1];
}
index++;
}
return String.join(" ", queryWords);
}
/**
* Generates Big Query field instances based on given CDAP table schema after schema validation.
*
* @param bigQuery big query object
* @param tableName table name
* @param tableSchema table schema
* @param allowSchemaRelaxation if schema relaxation policy is allowed
* @param datasetProject project name of dataset
* @param dataset dataset name
* @param isTruncateTableSet whether the table should be truncated before writing
* @param collector failure collector
* @return list of Big Query fields
*/
public static List<BigQueryTableFieldSchema> getBigQueryTableFields(BigQuery bigQuery, String tableName,
@Nullable Schema tableSchema,
boolean allowSchemaRelaxation,
String datasetProject,
String dataset, boolean isTruncateTableSet,
FailureCollector collector) {
if (tableSchema == null) {
return Collections.emptyList();
}
TableId tableId = TableId.of(datasetProject, dataset, tableName);
try {
Table table = bigQuery.getTable(tableId);
// If the table is null, it does not exist, so there is no need to perform validation.
if (table != null) {
com.google.cloud.bigquery.Schema bqSchema = table.getDefinition().getSchema();
validateSchema(tableName, bqSchema, tableSchema, allowSchemaRelaxation, isTruncateTableSet, dataset,
collector);
}
} catch (BigQueryException e) {
collector.addFailure("Unable to get details about the BigQuery table: " + e.getMessage(), null)
.withConfigProperty("table");
throw collector.getOrThrowException();
}
return BigQuerySinkUtils.getBigQueryTableFieldsFromSchema(tableSchema);
}
/**
* Validates the output schema against the BigQuery table schema. It throws {@link IllegalArgumentException}
* if the output schema has more fields than the BigQuery table, or if output schema field types do not match
* the BigQuery column types, unless schema relaxation is allowed.
*
* @param tableName big query table
* @param bqSchema BigQuery table schema
* @param tableSchema Configured table schema
* @param allowSchemaRelaxation allows schema relaxation policy
* @param isTruncateTableSet whether the table should be truncated before writing
* @param dataset dataset name
* @param collector failure collector
*/
public static void validateSchema(
String tableName,
com.google.cloud.bigquery.Schema bqSchema,
@Nullable Schema tableSchema,
boolean allowSchemaRelaxation,
boolean isTruncateTableSet,
String dataset,
FailureCollector collector) {
if (bqSchema == null || bqSchema.getFields().isEmpty() || tableSchema == null) {
// Table is created without schema, so no further validation is required.
return;
}
FieldList bqFields = bqSchema.getFields();
List<Schema.Field> outputSchemaFields = Objects.requireNonNull(tableSchema.getFields());
List<String> missingBQFields = BigQueryUtil.getSchemaMinusBqFields(outputSchemaFields, bqFields);
if (allowSchemaRelaxation && !isTruncateTableSet) {
// Required fields can be added only if truncate table option is set.
List<String> nonNullableFields = missingBQFields.stream()
.map(tableSchema::getField)
.filter(Objects::nonNull)
.filter(field -> !field.getSchema().isNullable())
.map(Schema.Field::getName)
.collect(Collectors.toList());
for (String nonNullableField : nonNullableFields) {
collector.addFailure(
String.format("Required field '%s' does not exist in BigQuery table '%s.%s'.",
nonNullableField, dataset, tableName),
"Change the field to be nullable.")
.withInputSchemaField(nonNullableField).withOutputSchemaField(nonNullableField);
}
}
if (!allowSchemaRelaxation) {
// The schema should not have fields that are not present in the BigQuery table.
for (String missingField : missingBQFields) {
collector.addFailure(
String.format("Field '%s' does not exist in BigQuery table '%s.%s'.",
missingField, dataset, tableName),
String.format("Remove '%s' from the input, or add a column to the BigQuery table.", missingField))
.withInputSchemaField(missingField).withOutputSchemaField(missingField);
}
// validate the missing columns in output schema are nullable fields in BigQuery
List<String> remainingBQFields = BigQueryUtil.getBqFieldsMinusSchema(bqFields, outputSchemaFields);
for (String field : remainingBQFields) {
Field.Mode mode = bqFields.get(field).getMode();
// Mode is optional. If the mode is unspecified, the column defaults to NULLABLE.
if (mode != null && mode != Field.Mode.NULLABLE) {
collector.addFailure(String.format("Required Column '%s' is not present in the schema.", field),
String.format("Add '%s' to the schema.", field));
}
}
}
// Column type changes should be disallowed if either allowSchemaRelaxation or truncate table is not set.
if (!allowSchemaRelaxation || !isTruncateTableSet) {
// Match output schema field type with BigQuery column type
for (Schema.Field field : tableSchema.getFields()) {
String fieldName = field.getName();
// skip checking schema if field is missing in BigQuery
if (!missingBQFields.contains(fieldName)) {
ValidationFailure failure = BigQueryUtil.validateFieldSchemaMatches(
bqFields.get(field.getName()), field, dataset, tableName,
AbstractBigQuerySinkConfig.SUPPORTED_TYPES, collector);
if (failure != null) {
failure.withInputSchemaField(fieldName).withOutputSchemaField(fieldName);
}
BigQueryUtil.validateFieldModeMatches(bqFields.get(fieldName), field,
allowSchemaRelaxation,
collector);
}
}
}
collector.getOrThrowException();
}
/**
* Validates the output schema against the BigQuery table schema for the Insert operation. It throws
* {@link IllegalArgumentException}
* if the output schema has more fields than the BigQuery table, or if output schema field types do not match
* the BigQuery column types, unless schema relaxation is allowed.
*
* @param table table name
* @param tableSchema configured table schema
* @param allowSchemaRelaxation allows schema relaxation policy
* @param isTruncateTableSet whether the table should be truncated before writing
* @param dataset dataset name
* @param collector failure collector
*/
public static void validateInsertSchema(Table table, @Nullable Schema tableSchema,
boolean allowSchemaRelaxation,
boolean isTruncateTableSet,
String dataset, FailureCollector collector) {
com.google.cloud.bigquery.Schema bqSchema = table.getDefinition().getSchema();
if (bqSchema == null || bqSchema.getFields().isEmpty()) {
// Table is created without schema, so no further validation is required.
return;
}
if (isTruncateTableSet || tableSchema == null) {
// No schema validation is required if truncate table is set.
// BQ will overwrite the schema for normal tables when the write disposition is WRITE_TRUNCATE.
// Note: if writing to a single partition is supported in the future, schema validation will be necessary.
return;
}
FieldList bqFields = bqSchema.getFields();
List<Schema.Field> outputSchemaFields = Objects.requireNonNull(tableSchema.getFields());
List<String> remainingBQFields = BigQueryUtil.getBqFieldsMinusSchema(bqFields, outputSchemaFields);
for (String field : remainingBQFields) {
if (bqFields.get(field).getMode() != Field.Mode.NULLABLE) {
collector.addFailure(String.format("Required Column '%s' is not present in the schema.", field),
String.format("Add '%s' to the schema.", field));
}
}
String tableName = table.getTableId().getTable();
List<String> missingBQFields = BigQueryUtil.getSchemaMinusBqFields(outputSchemaFields, bqFields);
// Match output schema field type with BigQuery column type
for (Schema.Field field : tableSchema.getFields()) {
String fieldName = field.getName();
// skip checking schema if field is missing in BigQuery
if (!missingBQFields.contains(fieldName)) {
ValidationFailure failure = BigQueryUtil.validateFieldSchemaMatches(
bqFields.get(field.getName()), field, dataset, tableName,
AbstractBigQuerySinkConfig.SUPPORTED_TYPES, collector);
if (failure != null) {
failure.withInputSchemaField(fieldName).withOutputSchemaField(fieldName);
}
BigQueryUtil.validateFieldModeMatches(bqFields.get(fieldName), field,
allowSchemaRelaxation,
collector);
}
}
collector.getOrThrowException();
}
/**
* Returns the set of fields that appear more than once in the given list.
*
* @param schemaFields List of schema fields
* @return set of duplicated field names
*/
public static Set<String> getDuplicatedFields(List<String> schemaFields) {
final Set<String> duplicatedFields = new HashSet<>();
final Set<String> set = new HashSet<>();
for (String field : schemaFields) {
if (!set.add(field)) {
duplicatedFields.add(field);
}
}
return duplicatedFields;
}
/**
* Returns whether the given logical type is supported by BigQuery.
*
* @param logicalType logical type of schema field
* @return true if the logical type is supported, false otherwise
*/
public static boolean isSupportedLogicalType(Schema.LogicalType logicalType) {
if (logicalType != null) {
return logicalType == Schema.LogicalType.DATE || logicalType == Schema.LogicalType.TIMESTAMP_MICROS ||
logicalType == Schema.LogicalType.TIMESTAMP_MILLIS || logicalType == Schema.LogicalType.DECIMAL;
}
return false;
}
/**
* Returns a map of field names to the number of times each appears in the given list. Only the first
* space-separated token of each value is considered.
*
* @param values list of field strings
* @return Map containing the field name as key and its occurrence count as value
*/
public static Map<String, Integer> calculateDuplicates(List<String> values) {
return values.stream()
.map(v -> v.split(" ")[0])
.collect(Collectors.toMap(p -> p, p -> 1, (x, y) -> x + y));
}
/**
* Record the lineage for the plugin
*
* @param context Batch sink context
* @param asset asset object of table
* @param tableSchema schema of table
* @param fieldNames field list
* @param operationNameSuffix suffix for operation name, if applicable
*/
public static void recordLineage(BatchSinkContext context,
Asset asset,
Schema tableSchema,
List<String> fieldNames, @Nullable String operationNameSuffix) {
LineageRecorder lineageRecorder = new LineageRecorder(context, asset);
lineageRecorder.createExternalDataset(tableSchema);
if (!fieldNames.isEmpty()) {
String operationName = operationNameSuffix == null ? "Write" : "Write_To_" + operationNameSuffix;
lineageRecorder.recordWrite(operationName, "Wrote to BigQuery table.", fieldNames);
}
}
/**
* Get the list of fields that are of type JSON from the BigQuery schema.
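* For example (hypothetical fields), a schema with a top-level JSON column {@code settings} and a RECORD
* {@code data} containing a JSON column {@code payload} yields {@code "settings,data.payload"}
* (in schema field order).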
* @param bqSchema BigQuery schema.
* @return comma separated list of fields that are of type JSON.
*/
public static String getJsonStringFieldsFromBQSchema(com.google.cloud.bigquery.Schema bqSchema) {
ArrayList<String> fields = new ArrayList<>();
getJsonStringFieldsFromBQSchema(bqSchema.getFields(), fields, new ArrayList<>());
return String.join(",", fields);
}
private static void getJsonStringFieldsFromBQSchema(FieldList fieldList,
ArrayList<String> fields, ArrayList<String> path) {
for (Field field : fieldList) {
path.add(field.getName());
if (field.getType() == LegacySQLTypeName.RECORD) {
getJsonStringFieldsFromBQSchema(field.getSubFields(), fields, path);
} else if (field.getType().equals(LegacySQLTypeName.valueOf(JSON))) {
fields.add(String.join(".", path));
}
path.remove(path.size() - 1);
}
}
}