/*
 * Copyright © 2021 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package io.cdap.plugin.gcp.bigquery.sink;

import com.google.cloud.bigquery.BigQuery;
import com.google.cloud.bigquery.BigQueryException;
import com.google.cloud.bigquery.Dataset;
import com.google.cloud.bigquery.DatasetId;
import com.google.cloud.bigquery.DatasetInfo;
import com.google.cloud.bigquery.EncryptionConfiguration;
import com.google.cloud.bigquery.Field;
import com.google.cloud.bigquery.FieldList;
import com.google.cloud.bigquery.LegacySQLTypeName;
import com.google.cloud.bigquery.Table;
import com.google.cloud.bigquery.TableId;
import com.google.cloud.hadoop.io.bigquery.BigQueryFileFormat;
import com.google.cloud.kms.v1.CryptoKeyName;
import com.google.cloud.storage.Bucket;
import com.google.cloud.storage.Storage;
import com.google.cloud.storage.StorageException;
import com.google.common.reflect.TypeToken;
import com.google.gson.Gson;
import io.cdap.cdap.api.data.schema.Schema;
import io.cdap.cdap.etl.api.FailureCollector;
import io.cdap.cdap.etl.api.batch.BatchSinkContext;
import io.cdap.cdap.etl.api.validation.ValidationFailure;
import io.cdap.plugin.common.Asset;
import io.cdap.plugin.common.LineageRecorder;
import io.cdap.plugin.gcp.bigquery.sink.lib.BigQueryOutputConfiguration;
import io.cdap.plugin.gcp.bigquery.sink.lib.BigQueryTableFieldSchema;
import io.cdap.plugin.gcp.bigquery.sink.lib.BigQueryTableSchema;
import io.cdap.plugin.gcp.bigquery.util.BigQueryConstants;
import io.cdap.plugin.gcp.bigquery.util.BigQueryTypeSize.Numeric;
import io.cdap.plugin.gcp.bigquery.util.BigQueryUtil;
import io.cdap.plugin.gcp.common.GCPUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.io.IOException;
import java.lang.reflect.Type;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.function.Supplier;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import javax.annotation.Nullable;

/**
 * Utility class for the BigQuery DelegatingMultiSink.
 * <p>
 * The logic in this class has been extracted from the {@link AbstractBigQuerySink} in order to make this
 * functionality available to other classes in this package.
 */
public final class BigQuerySinkUtils {

  public static final String GS_PATH_FORMAT = "gs://%s/%s";
  private static final String TEMPORARY_BUCKET_FORMAT = GS_PATH_FORMAT + "/input/%s-%s";
  private static final String DATETIME = "DATETIME";
  private static final String RECORD = "RECORD";
  private static final String JSON = "JSON";
  private static final Gson GSON = new Gson();
  private static final Type LIST_OF_FIELD_TYPE = new TypeToken<List<BigQueryTableFieldSchema>>() { }.getType();

  // Fields used to build update/upsert queries
  private static final String CRITERIA_TEMPLATE = "T.`%s` = S.`%s`";
  private static final String SOURCE_DATA_QUERY = "(SELECT * FROM (SELECT row_number() OVER (PARTITION BY %s%s) " +
    "as rowid, * FROM %s) where rowid = 1)";
  private static final String UPDATE_QUERY = "UPDATE %s T SET %s FROM %s S WHERE %s";
  private static final String UPSERT_QUERY = "MERGE %s T USING %s S ON %s WHEN MATCHED THEN UPDATE SET %s " +
    "WHEN NOT MATCHED THEN INSERT (%s) VALUES(%s)";
  private static final String INSERT_ONLY_UPSERT_QUERY = "MERGE %s T USING %s S ON %s WHEN NOT MATCHED THEN " +
    "INSERT (%s) VALUES(%s)";
  private static final List<String> COMPARISON_OPERATORS = Arrays.asList(
    "=", "<", ">", "<=", ">=", "!=", "<>",
    "LIKE", "NOT LIKE", "BETWEEN", "NOT BETWEEN", "IN", "NOT IN",
    "IS NULL", "IS NOT NULL", "IS TRUE", "IS NOT TRUE", "IS FALSE", "IS NOT FALSE");
  public static final String BYTES_PROCESSED_METRIC = "bytes.processed";

  /**
   * Creates the given dataset and bucket if they do not already exist. If the dataset already exists but the
   * bucket does not, the bucket will be created in the same location as the dataset. If the bucket already exists
   * but the dataset does not, the dataset will attempt to be created in the same location. This may fail if the
   * bucket is in a location that BigQuery does not yet support.
   *
   * @param bigQuery the bigquery client for the project
   * @param storage the storage client for the project
   * @param datasetId the Id of the dataset
   * @param bucketName the name of the bucket
   * @param location the location of the resources, this is only applied if both the bucket and dataset do not exist
   * @param cmekKeyName the name of the cmek key
   * @throws IOException if there was an error creating or fetching any GCP resource
   */
  public static void createResources(BigQuery bigQuery, Storage storage,
                                     DatasetId datasetId, String bucketName, @Nullable String location,
                                     @Nullable CryptoKeyName cmekKeyName) throws IOException {
    // Get the required Dataset and bucket instances using the supplied clients
    Dataset dataset = bigQuery.getDataset(datasetId);
    Bucket bucket = storage.get(bucketName);
    createResources(bigQuery, dataset, datasetId, storage, bucket, bucketName, location, cmekKeyName);
  }
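  // Illustrative usage (identifiers hypothetical): ensure the staging resources exist before a pipeline run.
  //   createResources(bigQuery, storage,
  //                   DatasetId.of("my-project", "my_dataset"),
  //                   "my-staging-bucket", "US", null);
  // If both resources already exist, the call is a no-op apart from the two lookup requests.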

  /**
   * Creates the given dataset and bucket if the supplied ones are null.
   * <p>
   * If the dataset already exists but the bucket does not, the bucket will be created in the same location as the
   * dataset. If the bucket already exists but the dataset does not, the dataset will attempt to be created in the
   * same location. This may fail if the bucket is in a location that BigQuery does not yet support.
   *
   * @param bigQuery the bigquery client for the project
   * @param dataset the bigquery dataset instance (may be null)
   * @param datasetId the Id of the dataset
   * @param storage the storage client for the project
   * @param bucket the storage bucket instance (may be null)
   * @param bucketName the name of the bucket
   * @param location the location of the resources, this is only applied if both the bucket and dataset do not exist
   * @param cmekKey the name of the cmek key
   * @throws IOException if there was an error creating any GCP resource
   */
  public static void createResources(BigQuery bigQuery, @Nullable Dataset dataset, DatasetId datasetId,
                                     Storage storage, @Nullable Bucket bucket, String bucketName,
                                     @Nullable String location,
                                     @Nullable CryptoKeyName cmekKey) throws IOException {
    if (dataset == null && bucket == null) {
      createBucket(storage, bucketName, location, cmekKey,
                   () -> String.format("Unable to create Cloud Storage bucket '%s'", bucketName));
      createDataset(bigQuery, datasetId, location, cmekKey,
                    () -> String.format("Unable to create BigQuery dataset '%s.%s'",
                                        datasetId.getProject(), datasetId.getDataset()));
    } else if (bucket == null) {
      createBucket(
        storage, bucketName, dataset.getLocation(), cmekKey,
        () -> String.format(
          "Unable to create Cloud Storage bucket '%s' in the same location ('%s') as BigQuery dataset '%s'. " +
            "Please use a bucket that is in the same location as the dataset.",
          bucketName, dataset.getLocation(), datasetId.getProject() + "." + datasetId.getDataset()));
    } else if (dataset == null) {
      createDataset(
        bigQuery, datasetId, bucket.getLocation(), cmekKey,
        () -> String.format(
          "Unable to create BigQuery dataset '%s' in the same location ('%s') as Cloud Storage bucket '%s'. " +
            "Please use a bucket that is in a supported location.",
          datasetId, bucket.getLocation(), bucketName));
    }
  }

  /**
   * Creates a Dataset in the specified location using the supplied BigQuery client.
   *
   * @param bigQuery the bigQuery client.
   * @param dataset the Id of the dataset to create.
   * @param location Location for this dataset.
   * @param cmekKeyName CMEK key to use for this dataset.
   * @param errorMessage Supplier for the error message to output if the dataset could not be created.
   * @throws IOException if the dataset could not be created.
   */
  private static void createDataset(BigQuery bigQuery, DatasetId dataset, @Nullable String location,
                                    @Nullable CryptoKeyName cmekKeyName,
                                    Supplier<String> errorMessage) throws IOException {
    DatasetInfo.Builder builder = DatasetInfo.newBuilder(dataset);
    if (location != null) {
      builder.setLocation(location);
    }
    if (cmekKeyName != null) {
      builder.setDefaultEncryptionConfiguration(
        EncryptionConfiguration.newBuilder().setKmsKeyName(cmekKeyName.toString()).build());
    }
    try {
      bigQuery.create(builder.build());
    } catch (BigQueryException e) {
      if (e.getCode() != 409) {
        throw new IOException(errorMessage.get(), e);
      }
      // A conflict (409) means the dataset already exists (https://cloud.google.com/bigquery/troubleshooting-errors).
      // This most likely means multiple stages in the same pipeline are trying to create the same dataset.
      // Ignore this and move on, since all that matters is that the dataset exists.
    }
  }
  /**
   * Creates a Dataset in the specified location using the supplied BigQuery client if it does not exist.
   *
   * @param bigQuery the bigQuery client.
   * @param datasetId the Id of the dataset to create.
   * @param location Location for this dataset.
   * @param cmekKeyName CMEK key to use for this dataset.
   * @param errorMessage Supplier for the error message to output if the dataset could not be created.
   * @throws IOException if the dataset could not be created.
   */
  public static void createDatasetIfNotExists(BigQuery bigQuery, DatasetId datasetId, @Nullable String location,
                                              @Nullable CryptoKeyName cmekKeyName,
                                              Supplier<String> errorMessage) throws IOException {
    // Check if dataset exists
    Dataset ds = bigQuery.getDataset(datasetId);
    // Create dataset if needed
    if (ds == null) {
      createDataset(bigQuery, datasetId, location, cmekKeyName, errorMessage);
    }
  }

  /**
   * Creates the specified GCS bucket using the supplied GCS client.
   *
   * @param storage GCS Client.
   * @param bucket Bucket Name.
   * @param location Location for this bucket.
   * @param cmekKeyName CMEK key to use for this bucket.
   * @param errorMessage Supplier for the error message to output if the bucket could not be created.
   * @throws IOException if the bucket could not be created.
   */
  private static void createBucket(Storage storage, String bucket, @Nullable String location,
                                   @Nullable CryptoKeyName cmekKeyName,
                                   Supplier<String> errorMessage) throws IOException {
    try {
      GCPUtils.createBucket(storage, bucket, location, cmekKeyName);
    } catch (StorageException e) {
      if (e.getCode() != 409) {
        throw new IOException(errorMessage.get(), e);
      }
      // A conflict (409) means the bucket already exists.
      // This most likely means multiple stages in the same pipeline are trying to create the same bucket.
      // Ignore this and move on, since all that matters is that the bucket exists.
    }
  }

  /**
   * Updates {@link Configuration} with bucket details.
   * Uses the provided bucket; otherwise uses the provided runId as the bucket name.
   *
   * @return bucket name
   */
  public static String configureBucket(Configuration baseConfiguration, @Nullable String bucket, String runId) {
    boolean deleteBucket = false;
    // If the bucket is null, assign the run ID as the bucket name and mark the bucket for deletion.
    if (bucket == null) {
      bucket = runId;
      deleteBucket = true;
    }
    return configureBucket(baseConfiguration, bucket, runId, deleteBucket);
  }

  /**
   * Updates {@link Configuration} with bucket details.
   * Uses the provided bucket; otherwise uses the provided runId as the bucket name.
   *
   * @return bucket name
   */
  public static String configureBucket(Configuration baseConfiguration, String bucket, String runId,
                                       boolean deleteBucket) {
    if (deleteBucket) {
      // By default, this option is false, meaning the job cannot delete the bucket.
      // So enable it only when the bucket name is not provided.
      baseConfiguration.setBoolean("fs.gs.bucket.delete.enable", true);
    }
    baseConfiguration.set("fs.default.name", String.format(GS_PATH_FORMAT, bucket, runId));
    baseConfiguration.setBoolean("fs.gs.impl.disable.cache", true);
    baseConfiguration.setBoolean("fs.gs.metadata.cache.enable", false);
    return bucket;
  }
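  // For illustration (values hypothetical): configureBucket(conf, null, "run-1234") falls back to the run ID,
  // so the job writes under "gs://run-1234/run-1234" and "fs.gs.bucket.delete.enable" is set so the
  // auto-created bucket can be cleaned up afterwards. configureBucket(conf, "my-bucket", "run-1234") writes
  // under "gs://my-bucket/run-1234" and leaves the bucket in place.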
  /**
   * Sets string fields to JSON type if they are present in the provided list of fields.
   *
   * @param fields list of BigQuery table fields.
   * @param jsonStringFields Comma separated list of fields that should be set to JSON type.
   */
  public static void setJsonStringFields(List<BigQueryTableFieldSchema> fields, String jsonStringFields) {
    Set<String> jsonFields = new HashSet<>(Arrays.asList(jsonStringFields.split(",")));
    setJsonStringFields(fields, jsonFields, new ArrayList<>());
  }

  private static void setJsonStringFields(List<BigQueryTableFieldSchema> fields, Set<String> jsonFields,
                                          List<String> path) {
    for (BigQueryTableFieldSchema field : fields) {
      String fieldName = field.getName();
      String fieldType = field.getType();
      String separator = path.isEmpty() ? "" : ".";
      String fieldPath = String.join(".", path) + separator + fieldName;
      if (jsonFields.contains(fieldPath) && fieldType.equals(LegacySQLTypeName.STRING.name())) {
        field.setType(LegacySQLTypeName.valueOf(JSON).name());
      } else if (field.getType().equals(LegacySQLTypeName.RECORD.name())) {
        path.add(fieldName);
        setJsonStringFields(field.getFields(), jsonFields, path);
        path.remove(path.size() - 1);
      }
    }
  }

  /**
   * Configures output for Sink
   *
   * @param configuration Hadoop configuration instance
   * @param datasetId id of the dataset to use
   * @param tableName name of the table to use
   * @param gcsPath GCS path to use for output
   * @param fields list of BigQuery table fields
   * @throws IOException if the output cannot be configured
   */
  public static void configureOutput(Configuration configuration,
                                     DatasetId datasetId,
                                     String tableName,
                                     String gcsPath,
                                     List<BigQueryTableFieldSchema> fields) throws IOException {
    // Set up table schema
    BigQueryTableSchema outputTableSchema = new BigQueryTableSchema();
    if (!fields.isEmpty()) {
      String jsonStringFields = configuration.get(BigQueryConstants.CONFIG_JSON_STRING_FIELDS, null);
      if (jsonStringFields != null) {
        setJsonStringFields(fields, jsonStringFields);
      }
      outputTableSchema.setFields(fields);
    }
    BigQueryFileFormat fileFormat = getFileFormat(fields);
    BigQueryOutputConfiguration.configure(
      configuration,
      String.format("%s:%s.%s", datasetId.getProject(), datasetId.getDataset(), tableName),
      outputTableSchema,
      gcsPath,
      fileFormat,
      getOutputFormat(fileFormat));
  }

  /**
   * Configures output for MultiSink
   *
   * @param configuration Hadoop configuration instance
   * @param datasetId name of the dataset to use
   * @param tableName name of the table to use
   * @param gcsPath GCS path to use for output
   * @param fields list of BigQuery table fields
   * @throws IOException if the output cannot be configured
   */
  public static void configureMultiSinkOutput(Configuration configuration,
                                              DatasetId datasetId,
                                              String tableName,
                                              String gcsPath,
                                              List<BigQueryTableFieldSchema> fields) throws IOException {
    configureOutput(configuration, datasetId, tableName, gcsPath, fields);
    // Set operation as Insertion. Currently the BQ MultiSink can only support the insertion operation.
    configuration.set(BigQueryConstants.CONFIG_OPERATION, Operation.INSERT.name());
  }

  public static String getTemporaryGcsPath(String bucket, String pathPrefix, String tableName) {
    return String.format(TEMPORARY_BUCKET_FORMAT, bucket, pathPrefix, tableName, pathPrefix);
  }
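  // Worked example (names hypothetical): with TEMPORARY_BUCKET_FORMAT = "gs://%s/%s/input/%s-%s",
  //   getTemporaryGcsPath("staging-bucket", "run-1234", "orders")
  // produces "gs://staging-bucket/run-1234/input/orders-run-1234".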
  public static List<BigQueryTableFieldSchema> getBigQueryTableFieldsFromSchema(Schema tableSchema) {
    List<Schema.Field> inputFields = Objects.requireNonNull(tableSchema.getFields(), "Schema must have fields");
    return inputFields.stream()
      .map(BigQuerySinkUtils::generateTableFieldSchema)
      .collect(Collectors.toList());
  }

  /**
   * Relaxes the Destination Table Schema based on the matching field names from the source table.
   *
   * @param bigquery BigQuery client
   * @param sourceTable source table, which contains the updated field definitions
   * @param destinationTable destination table, whose field definitions may be relaxed depending on the source table.
   */
  public static void relaxTableSchema(BigQuery bigquery, Table sourceTable, Table destinationTable) {
    List<Field> sourceFields = sourceTable.getDefinition().getSchema().getFields();
    List<Field> destinationFields = destinationTable.getDefinition().getSchema().getFields();
    relaxTableSchema(bigquery, destinationTable, sourceFields, destinationFields);
  }

  /**
   * Relaxes the Destination Table Schema based on the matching field names from the source table.
   *
   * @param bigquery BigQuery client
   * @param destinationTable destination table, whose field definitions may be relaxed depending on the source fields.
   * @param sourceFields fields in the source table that need to be used to relax the destination table
   * @param destinationFields fields in the destination table that may be relaxed depending on the source fields
   */
  public static void relaxTableSchema(BigQuery bigquery, Table destinationTable,
                                      List<Field> sourceFields, List<Field> destinationFields) {
    List<Field> resultFieldsList = getRelaxedTableFields(sourceFields, destinationFields);
    // Update table definition, relaxing field definitions.
    com.google.cloud.bigquery.Schema newSchema = com.google.cloud.bigquery.Schema.of(resultFieldsList);
    bigquery.update(
      destinationTable.toBuilder().setDefinition(
        destinationTable.getDefinition().toBuilder().setSchema(newSchema).build()
      ).build()
    );
  }

  public static List<Field> getRelaxedTableFields(List<Field> sourceFields, List<Field> destinationFields) {
    // Collect all fields from the destination table into a Map for quick lookups by name
    Map<String, Field> destinationFieldMap = destinationFields.stream()
      .collect(Collectors.toMap(Field::getName, x -> x));
    // Collect all fields from the source table into a Map for quick lookups by name
    Map<String, Field> sourceFieldMap = sourceFields.stream()
      .collect(Collectors.toMap(Field::getName, x -> x));

    // Collect all fields in the destination table that are not present in the source table, in order to retain
    // them as-is in the destination schema
    List<Field> resultingFields = destinationFields.stream()
      .filter(field -> !sourceFieldMap.containsKey(field.getName()))
      .collect(Collectors.toList());

    // Add fields from the source table into the destination table.
    // We need to ensure that fields in the destination table that are Nullable remain as such, even when the
    // source defines the field as required, so we relax any fields in the source table that are already
    // defined in the destination table.
    sourceFieldMap.values().stream()
      .map(sourceField -> {
        String fieldName = sourceField.getName();
        if (destinationFieldMap.containsKey(fieldName)) {
          Field destinationField = destinationFieldMap.get(fieldName);
          // If the field is nullable in the destination table but not in the source table, we need to set the
          // field's mode to NULLABLE in order to ensure the underlying schema doesn't change.
          if (destinationField.getMode() == Field.Mode.NULLABLE && sourceField.getMode() == Field.Mode.REQUIRED) {
            sourceField = sourceField.toBuilder().setMode(Field.Mode.NULLABLE).build();
          }
        }
        return sourceField;
      })
      .forEach(resultingFields::add);
    return resultingFields;
  }
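  // Relaxation example (fields hypothetical): if the destination has {id REQUIRED, name NULLABLE} and the
  // source has {name REQUIRED, tag REQUIRED}, getRelaxedTableFields() keeps id as-is, downgrades name to
  // NULLABLE (the destination already allows nulls), and appends tag as REQUIRED.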
  private static BigQueryTableFieldSchema generateTableFieldSchema(Schema.Field field) {
    BigQueryTableFieldSchema fieldSchema = new BigQueryTableFieldSchema();
    fieldSchema.setName(field.getName());
    fieldSchema.setMode(getMode(field.getSchema()).name());
    LegacySQLTypeName type = getTableDataType(field.getSchema());
    fieldSchema.setType(type.name());
    if (type == LegacySQLTypeName.RECORD) {
      List<Schema.Field> schemaFields;
      Schema fieldCdapSchema = BigQueryUtil.getNonNullableSchema(field.getSchema());
      // If it's an Array of records, we need to get the component schema of the array, which will be the
      // Record (and can itself be nullable), and then get the fields of that record.
      if (Schema.Type.ARRAY == fieldCdapSchema.getType()) {
        schemaFields = Objects.requireNonNull(
          BigQueryUtil.getNonNullableSchema(fieldCdapSchema.getComponentSchema()).getFields());
      } else {
        schemaFields = fieldCdapSchema.getFields();
      }
      fieldSchema.setFields(Objects.requireNonNull(schemaFields).stream()
        .map(BigQuerySinkUtils::generateTableFieldSchema)
        .collect(Collectors.toList()));
    }
    return fieldSchema;
  }

  public static com.google.cloud.bigquery.Schema convertCdapSchemaToBigQuerySchema(Schema schema) {
    List<Schema.Field> inputFields = Objects.requireNonNull(schema.getFields(), "Schema must have fields");
    List<Field> fields = inputFields.stream()
      .map(BigQuerySinkUtils::convertCdapFieldToBigQueryField)
      .collect(Collectors.toList());
    return com.google.cloud.bigquery.Schema.of(fields);
  }

  private static Field convertCdapFieldToBigQueryField(Schema.Field field) {
    String name = field.getName();
    LegacySQLTypeName type = getTableDataType(field.getSchema());
    Field.Mode mode = getMode(field.getSchema());
    Field.Builder fieldBuilder;
    // For record fields, we need to get all subfields and re-create the builder.
    if (type == LegacySQLTypeName.RECORD) {
      List<Schema.Field> schemaFields;
      Schema fieldCdapSchema = BigQueryUtil.getNonNullableSchema(field.getSchema());
      // If it's an Array of records, we need to get the component schema of the array, which will be the
      // Record (and can itself be nullable), and then get the fields of that record.
      if (Schema.Type.ARRAY == fieldCdapSchema.getType()) {
        schemaFields = Objects.requireNonNull(
          BigQueryUtil.getNonNullableSchema(fieldCdapSchema.getComponentSchema()).getFields());
      } else {
        schemaFields = fieldCdapSchema.getFields();
      }
      FieldList subFields = FieldList.of(Objects.requireNonNull(schemaFields).stream()
        .map(BigQuerySinkUtils::convertCdapFieldToBigQueryField)
        .collect(Collectors.toList()));
      fieldBuilder = Field.newBuilder(name, type, subFields);
    } else {
      fieldBuilder = Field.newBuilder(name, type);
    }
    fieldBuilder.setMode(mode);

    // Set precision and scale for numeric fields
    if (type == LegacySQLTypeName.NUMERIC || type == LegacySQLTypeName.BIGNUMERIC) {
      Schema decimalFieldSchema = BigQueryUtil.getNonNullableSchema(field.getSchema());
      fieldBuilder.setPrecision((long) decimalFieldSchema.getPrecision());
      fieldBuilder.setScale((long) decimalFieldSchema.getScale());
    }
    return fieldBuilder.build();
  }

  private static Field.Mode getMode(Schema schema) {
    boolean isNullable = schema.isNullable();
    Schema.Type nonNullableType = isNullable ? schema.getNonNullable().getType() : schema.getType();
    if (isNullable && nonNullableType != Schema.Type.ARRAY) {
      return Field.Mode.NULLABLE;
    } else if (nonNullableType == Schema.Type.ARRAY) {
      return Field.Mode.REPEATED;
    }
    return Field.Mode.REQUIRED;
  }
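  // Mode mapping example (schemas hypothetical): a nullable string becomes NULLABLE, an array of ints becomes
  // REPEATED (even when the array schema itself is nullable), and a non-nullable long becomes REQUIRED.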
  /**
   * Returns the LegacySQLTypeName that maps to the given CDAP Schema.
   * If the CDAP Schema is an Array, it returns the LegacySQLTypeName of the components.
   */
  private static LegacySQLTypeName getTableDataType(Schema schema) {
    schema = BigQueryUtil.getNonNullableSchema(schema);
    Schema.LogicalType logicalType = schema.getLogicalType();
    if (logicalType != null) {
      switch (logicalType) {
        case DATE:
          return LegacySQLTypeName.DATE;
        case TIME_MILLIS:
        case TIME_MICROS:
          return LegacySQLTypeName.TIME;
        case TIMESTAMP_MILLIS:
        case TIMESTAMP_MICROS:
          return LegacySQLTypeName.TIMESTAMP;
        case DECIMAL:
          // Following the restrictions given by:
          // https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#numeric_types
          if ((schema.getScale() <= Numeric.SCALE) &&
            (schema.getPrecision() <= Numeric.PRECISION) &&
            ((schema.getPrecision() - schema.getScale()) <= (Numeric.PRECISION - Numeric.SCALE))) {
            return LegacySQLTypeName.NUMERIC;
          }
          return LegacySQLTypeName.BIGNUMERIC;
        case DATETIME:
          return LegacySQLTypeName.DATETIME;
        default:
          throw new IllegalStateException("Unsupported type " + logicalType.getToken());
      }
    }

    Schema.Type type = schema.getType();
    switch (type) {
      case INT:
      case LONG:
        return LegacySQLTypeName.INTEGER;
      case STRING:
        return LegacySQLTypeName.STRING;
      case FLOAT:
      case DOUBLE:
        return LegacySQLTypeName.FLOAT;
      case BOOLEAN:
        return LegacySQLTypeName.BOOLEAN;
      case BYTES:
        return LegacySQLTypeName.BYTES;
      case ARRAY:
        return getTableDataType(schema.getComponentSchema());
      case RECORD:
        return LegacySQLTypeName.RECORD;
      default:
        throw new IllegalStateException("Unsupported type " + type);
    }
  }

  private static BigQueryFileFormat getFileFormat(List<BigQueryTableFieldSchema> fields) {
    for (BigQueryTableFieldSchema field : fields) {
      if (DATETIME.equals(field.getType())) {
        return BigQueryFileFormat.NEWLINE_DELIMITED_JSON;
      }
      // If the field name contains non-English characters, we use the JSON format,
      // since the Avro load job in BQ does not currently support non-English characters in field names.
      String fieldName = field.getName();
      final String englishCharactersRegex = "[\\w]+";
      if (!Pattern.matches(englishCharactersRegex, fieldName)) {
        return BigQueryFileFormat.NEWLINE_DELIMITED_JSON;
      }
      // If the field is a record, we have to check its subfields.
      if (RECORD.equals(field.getType())) {
        if (getFileFormat(field.getFields()) == BigQueryFileFormat.NEWLINE_DELIMITED_JSON) {
          return BigQueryFileFormat.NEWLINE_DELIMITED_JSON;
        }
      }
    }
    return BigQueryFileFormat.AVRO;
  }
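  // Format selection example (field names hypothetical): a schema containing a DATETIME field, or a field
  // named "café" (outside the \w character class), is staged as NEWLINE_DELIMITED_JSON; a schema with plain
  // "id"/"name" fields is staged as AVRO.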
  private static Class<? extends FileOutputFormat> getOutputFormat(BigQueryFileFormat fileFormat) {
    if (fileFormat == BigQueryFileFormat.NEWLINE_DELIMITED_JSON) {
      return TextOutputFormat.class;
    }
    return AvroOutputFormat.class;
  }

  public static String generateUpdateUpsertQuery(Operation operation,
                                                 TableId sourceTableId,
                                                 TableId destinationTableId,
                                                 List<String> tableFieldsList,
                                                 List<String> tableKeyList,
                                                 List<String> orderedByList,
                                                 String partitionFilter) {
    String source = String.format("`%s.%s.%s`", sourceTableId.getProject(),
                                  sourceTableId.getDataset(), sourceTableId.getTable());
    String destination = String.format("`%s.%s.%s`", destinationTableId.getProject(),
                                       destinationTableId.getDataset(), destinationTableId.getTable());
    String criteria = tableKeyList.stream().map(s -> String.format(CRITERIA_TEMPLATE, s, s))
      .collect(Collectors.joining(" AND "));
    criteria = partitionFilter != null ?
      String.format("(%s) AND %s", formatPartitionFilter(partitionFilter), criteria) : criteria;
    String fieldsForUpdate = tableFieldsList.stream().filter(s -> !tableKeyList.contains(s))
      .map(s -> String.format(CRITERIA_TEMPLATE, s, s)).collect(Collectors.joining(", "));
    orderedByList = orderedByList.stream().map(s -> s.trim()).map(s -> {
      StringBuilder sb = new StringBuilder("`").append(s.split(" ")[0]).append("` ");
      if (s.split(" ").length > 1) {
        sb.append(s.split(" ", 2)[1]);
      }
      return sb.toString();
    }).collect(Collectors.toList());
    String orderedBy = orderedByList.isEmpty() ? "" : " ORDER BY " + String.join(", ", orderedByList);
    String sourceTable = String.format(SOURCE_DATA_QUERY, "`" + String.join("`, `", tableKeyList) + "`",
                                       orderedBy, source);
    switch (operation) {
      case UPDATE:
        return String.format(UPDATE_QUERY, destination, fieldsForUpdate, sourceTable, criteria);
      case UPSERT:
        String insertFields = "`" + String.join("`, `", tableFieldsList) + "`";
        // if all fields are keys, we update no field and only insert new rows
        if (fieldsForUpdate.isEmpty()) {
          return String.format(INSERT_ONLY_UPSERT_QUERY, destination, sourceTable, criteria,
                               insertFields, insertFields);
        }
        return String.format(UPSERT_QUERY, destination, sourceTable, criteria, fieldsForUpdate,
                             insertFields, insertFields);
      default:
        return "";
    }
  }

  private static String formatPartitionFilter(String partitionFilter) {
    String[] queryWords = partitionFilter.split(" ");
    int index = 0;
    for (String word : queryWords) {
      if (COMPARISON_OPERATORS.contains(word.toUpperCase())) {
        // Qualify the operand immediately before the operator with the destination table alias.
        queryWords[index - 1] = "T." + queryWords[index - 1];
      }
      index++;
    }
    return String.join(" ", queryWords);
  }
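  // Illustrative output of generateUpdateUpsertQuery (identifiers hypothetical): for an UPSERT with key
  // ["id"], fields ["id", "name"], no ordering and no partition filter, the generated statement is roughly:
  //   MERGE `proj.ds.dst` T USING (SELECT * FROM (SELECT row_number() OVER (PARTITION BY `id`) as rowid, *
  //     FROM `proj.ds.src`) where rowid = 1) S
  //   ON T.`id` = S.`id`
  //   WHEN MATCHED THEN UPDATE SET T.`name` = S.`name`
  //   WHEN NOT MATCHED THEN INSERT (`id`, `name`) VALUES(`id`, `name`)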
  /**
   * Generates Big Query field instances based on the given CDAP table schema after schema validation.
   *
   * @param bigQuery big query object
   * @param tableName table name
   * @param tableSchema table schema
   * @param allowSchemaRelaxation if schema relaxation policy is allowed
   * @param datasetProject project name of dataset
   * @param dataset dataset name
   * @param isTruncateTableSet whether to truncate the table before writing when inserting
   * @param collector failure collector
   * @return list of Big Query fields
   */
  public static List<BigQueryTableFieldSchema> getBigQueryTableFields(BigQuery bigQuery, String tableName,
                                                                      @Nullable Schema tableSchema,
                                                                      boolean allowSchemaRelaxation,
                                                                      String datasetProject, String dataset,
                                                                      boolean isTruncateTableSet,
                                                                      FailureCollector collector) {
    if (tableSchema == null) {
      return Collections.emptyList();
    }

    TableId tableId = TableId.of(datasetProject, dataset, tableName);
    try {
      Table table = bigQuery.getTable(tableId);
      // if the table is null, it does not exist, so there is no need to perform validation
      if (table != null) {
        com.google.cloud.bigquery.Schema bqSchema = table.getDefinition().getSchema();
        validateSchema(tableName, bqSchema, tableSchema, allowSchemaRelaxation, isTruncateTableSet, dataset,
                       collector);
      }
    } catch (BigQueryException e) {
      collector.addFailure("Unable to get details about the BigQuery table: " + e.getMessage(), null)
        .withConfigProperty("table");
      throw collector.getOrThrowException();
    }
    return BigQuerySinkUtils.getBigQueryTableFieldsFromSchema(tableSchema);
  }

  /**
   * Validates the output schema against the Big Query table schema. It throws an {@link IllegalArgumentException}
   * if the output schema has more fields than the Big Query table, or if the output schema field types do not
   * match the Big Query column types, unless the schema relaxation policy is allowed.
   *
   * @param tableName big query table
   * @param bqSchema BigQuery table schema
   * @param tableSchema Configured table schema
   * @param allowSchemaRelaxation allows schema relaxation policy
   * @param isTruncateTableSet whether to truncate the table before writing when inserting
   * @param dataset dataset name
   * @param collector failure collector
   */
  public static void validateSchema(
    String tableName,
    com.google.cloud.bigquery.Schema bqSchema,
    @Nullable Schema tableSchema,
    boolean allowSchemaRelaxation,
    boolean isTruncateTableSet,
    String dataset,
    FailureCollector collector) {
    if (bqSchema == null || bqSchema.getFields().isEmpty() || tableSchema == null) {
      // Table is created without schema, so no further validation is required.
      return;
    }

    FieldList bqFields = bqSchema.getFields();
    List<Schema.Field> outputSchemaFields = Objects.requireNonNull(tableSchema.getFields());

    List<String> missingBQFields = BigQueryUtil.getSchemaMinusBqFields(outputSchemaFields, bqFields);

    if (allowSchemaRelaxation && !isTruncateTableSet) {
      // Required fields can be added only if the truncate table option is set.
      List<String> nonNullableFields = missingBQFields.stream()
        .map(tableSchema::getField)
        .filter(Objects::nonNull)
        .filter(field -> !field.getSchema().isNullable())
        .map(Schema.Field::getName)
        .collect(Collectors.toList());

      for (String nonNullableField : nonNullableFields) {
        collector.addFailure(
          String.format("Required field '%s' does not exist in BigQuery table '%s.%s'.",
                        nonNullableField, dataset, tableName),
          "Change the field to be nullable.")
          .withInputSchemaField(nonNullableField).withOutputSchemaField(nonNullableField);
      }
    }

    if (!allowSchemaRelaxation) {
      // The schema should not have fields that are not present in the BigQuery table.
      for (String missingField : missingBQFields) {
        collector.addFailure(
          String.format("Field '%s' does not exist in BigQuery table '%s.%s'.",
                        missingField, dataset, tableName),
          String.format("Remove '%s' from the input, or add a column to the BigQuery table.", missingField))
          .withInputSchemaField(missingField).withOutputSchemaField(missingField);
      }

      // validate that the columns missing in the output schema are nullable fields in BigQuery
      List<String> remainingBQFields = BigQueryUtil.getBqFieldsMinusSchema(bqFields, outputSchemaFields);
      for (String field : remainingBQFields) {
        Field.Mode mode = bqFields.get(field).getMode();
        // Mode is optional. If the mode is unspecified, the column defaults to NULLABLE.
        if (mode != null && mode != Field.Mode.NULLABLE) {
          collector.addFailure(String.format("Required Column '%s' is not present in the schema.", field),
                               String.format("Add '%s' to the schema.", field));
        }
      }
    }

    // Column type changes should be disallowed if either allowSchemaRelaxation or truncate table is not set.
    if (!allowSchemaRelaxation || !isTruncateTableSet) {
      // Match output schema field type with BigQuery column type
      for (Schema.Field field : tableSchema.getFields()) {
        String fieldName = field.getName();
        // skip checking the schema if the field is missing in BigQuery
        if (!missingBQFields.contains(fieldName)) {
          ValidationFailure failure = BigQueryUtil.validateFieldSchemaMatches(
            bqFields.get(field.getName()), field, dataset, tableName,
            AbstractBigQuerySinkConfig.SUPPORTED_TYPES, collector);
          if (failure != null) {
            failure.withInputSchemaField(fieldName).withOutputSchemaField(fieldName);
          }
          BigQueryUtil.validateFieldModeMatches(bqFields.get(fieldName), field, allowSchemaRelaxation, collector);
        }
      }
    }
    collector.getOrThrowException();
  }
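  // Validation example for validateSchema (names hypothetical): with allowSchemaRelaxation = false, an output
  // field "extra" with no matching BigQuery column fails with "Field 'extra' does not exist in BigQuery
  // table ...", and a REQUIRED BigQuery column absent from the output schema fails with
  // "Required Column '...' is not present in the schema.".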
  /**
   * Validates the output schema against the Big Query table schema for the Insert operation. It throws an
   * {@link IllegalArgumentException} if the output schema has more fields than the Big Query table, or if the
   * output schema field types do not match the Big Query column types, unless the schema relaxation policy
   * is allowed.
   *
   * @param table table name
   * @param tableSchema configured table schema
   * @param allowSchemaRelaxation allows schema relaxation policy
   * @param isTruncateTableSet whether to truncate the table before writing when inserting
   * @param dataset dataset name
   * @param collector failure collector
   */
  public static void validateInsertSchema(Table table, @Nullable Schema tableSchema,
                                          boolean allowSchemaRelaxation,
                                          boolean isTruncateTableSet,
                                          String dataset, FailureCollector collector) {
    com.google.cloud.bigquery.Schema bqSchema = table.getDefinition().getSchema();
    if (bqSchema == null || bqSchema.getFields().isEmpty()) {
      // Table is created without schema, so no further validation is required.
      return;
    }

    if (isTruncateTableSet || tableSchema == null) {
      // No schema validation is required if the truncate table option is set:
      // BQ will overwrite the schema for normal tables when the write disposition is WRITE_TRUNCATE.
      // Note: if writing to a single partition is supported in the future, schema validation will be necessary.
      return;
    }

    FieldList bqFields = bqSchema.getFields();
    List<Schema.Field> outputSchemaFields = Objects.requireNonNull(tableSchema.getFields());

    List<String> remainingBQFields = BigQueryUtil.getBqFieldsMinusSchema(bqFields, outputSchemaFields);
    for (String field : remainingBQFields) {
      if (bqFields.get(field).getMode() != Field.Mode.NULLABLE) {
        collector.addFailure(String.format("Required Column '%s' is not present in the schema.", field),
                             String.format("Add '%s' to the schema.", field));
      }
    }

    String tableName = table.getTableId().getTable();
    List<String> missingBQFields = BigQueryUtil.getSchemaMinusBqFields(outputSchemaFields, bqFields);
    // Match output schema field type with BigQuery column type
    for (Schema.Field field : tableSchema.getFields()) {
      String fieldName = field.getName();
      // skip checking the schema if the field is missing in BigQuery
      if (!missingBQFields.contains(fieldName)) {
        ValidationFailure failure = BigQueryUtil.validateFieldSchemaMatches(
          bqFields.get(field.getName()), field, dataset, tableName,
          AbstractBigQuerySinkConfig.SUPPORTED_TYPES, collector);
        if (failure != null) {
          failure.withInputSchemaField(fieldName).withOutputSchemaField(fieldName);
        }
        BigQueryUtil.validateFieldModeMatches(bqFields.get(fieldName), field, allowSchemaRelaxation, collector);
      }
    }
    collector.getOrThrowException();
  }
  /**
   * Returns the set of duplicated fields. Duplicates are detected by exact (case-sensitive) string comparison.
   *
   * @param schemaFields List of schema fields
   * @return set of duplicate fields
   */
  public static Set<String> getDuplicatedFields(List<String> schemaFields) {
    final Set<String> duplicatedFields = new HashSet<>();
    final Set<String> set = new HashSet<>();
    for (String field : schemaFields) {
      if (!set.add(field)) {
        duplicatedFields.add(field);
      }
    }
    return duplicatedFields;
  }

  /**
   * Returns whether the given logical type is supported in BigQuery.
   *
   * @param logicalType logical type of schema field
   * @return boolean value
   */
  public static boolean isSupportedLogicalType(Schema.LogicalType logicalType) {
    if (logicalType != null) {
      return logicalType == Schema.LogicalType.DATE || logicalType == Schema.LogicalType.TIMESTAMP_MICROS ||
        logicalType == Schema.LogicalType.TIMESTAMP_MILLIS || logicalType == Schema.LogicalType.DECIMAL;
    }
    return false;
  }

  /**
   * Returns a Map counting how many times each field appears in the given field list.
   * Only the first whitespace-separated token of each entry is considered.
   *
   * @param values list of field definitions
   * @return Map containing the field as key and its number of occurrences as value
   */
  public static Map<String, Integer> calculateDuplicates(List<String> values) {
    return values.stream()
      .map(v -> v.split(" ")[0])
      .collect(Collectors.toMap(p -> p, p -> 1, (x, y) -> x + y));
  }

  /**
   * Records the lineage for the plugin.
   *
   * @param context Batch sink context
   * @param asset asset object of table
   * @param tableSchema schema of table
   * @param fieldNames field list
   * @param operationNameSuffix suffix for operation name, if applicable
   */
  public static void recordLineage(BatchSinkContext context,
                                   Asset asset,
                                   Schema tableSchema,
                                   List<String> fieldNames,
                                   @Nullable String operationNameSuffix) {
    LineageRecorder lineageRecorder = new LineageRecorder(context, asset);
    lineageRecorder.createExternalDataset(tableSchema);
    if (!fieldNames.isEmpty()) {
      String operationName = operationNameSuffix == null ? "Write" : "Write_To_" + operationNameSuffix;
      lineageRecorder.recordWrite(operationName, "Wrote to BigQuery table.", fieldNames);
    }
  }
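  // Worked example (inputs hypothetical): calculateDuplicates(Arrays.asList("id", "name desc", "id"))
  // counts only the first token of each entry and returns {"id"=2, "name"=1}.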
  /**
   * Gets the list of fields that are of type JSON from the BigQuery schema.
   *
   * @param bqSchema BigQuery schema.
   * @return comma separated list of fields that are of type JSON.
   */
  public static String getJsonStringFieldsFromBQSchema(com.google.cloud.bigquery.Schema bqSchema) {
    ArrayList<String> fields = new ArrayList<>();
    getJsonStringFieldsFromBQSchema(bqSchema.getFields(), fields, new ArrayList<>());
    return String.join(",", fields);
  }

  private static void getJsonStringFieldsFromBQSchema(FieldList fieldList, ArrayList<String> fields,
                                                      ArrayList<String> path) {
    for (Field field : fieldList) {
      path.add(field.getName());
      if (field.getType() == LegacySQLTypeName.RECORD) {
        getJsonStringFieldsFromBQSchema(field.getSubFields(), fields, path);
      } else if (field.getType().equals(LegacySQLTypeName.valueOf(JSON))) {
        fields.add(String.join(".", path));
      }
      path.remove(path.size() - 1);
    }
  }
}




