/*
 * Copyright © 2022 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package io.cdap.plugin.gcp.dataplex.sink.config;

import com.google.api.gax.rpc.ApiException;
import com.google.cloud.bigquery.JobInfo;
import com.google.cloud.bigquery.RangePartitioning;
import com.google.cloud.bigquery.StandardTableDefinition;
import com.google.cloud.bigquery.Table;
import com.google.cloud.bigquery.TimePartitioning;
import com.google.cloud.dataplex.v1.Asset;
import com.google.cloud.dataplex.v1.AssetName;
import com.google.cloud.dataplex.v1.DataplexServiceClient;
import com.google.cloud.dataplex.v1.LakeName;
import com.google.cloud.dataplex.v1.Zone;
import com.google.cloud.dataplex.v1.ZoneName;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import io.cdap.cdap.api.annotation.Description;
import io.cdap.cdap.api.annotation.Macro;
import io.cdap.cdap.api.annotation.Name;
import io.cdap.cdap.api.data.schema.Schema;
import io.cdap.cdap.api.plugin.InvalidPluginConfigException;
import io.cdap.cdap.api.plugin.InvalidPluginProperty;
import io.cdap.cdap.etl.api.FailureCollector;
import io.cdap.cdap.etl.api.PipelineConfigurer;
import io.cdap.cdap.etl.api.validation.FormatContext;
import io.cdap.cdap.etl.api.validation.ValidatingOutputFormat;
import io.cdap.plugin.common.IdUtils;
import io.cdap.plugin.format.FileFormat;
import io.cdap.plugin.gcp.bigquery.sink.BigQuerySinkUtils;
import io.cdap.plugin.gcp.bigquery.sink.Operation;
import io.cdap.plugin.gcp.bigquery.sink.PartitionType;
import io.cdap.plugin.gcp.bigquery.util.BigQueryUtil;
import io.cdap.plugin.gcp.common.GCPConnectorConfig;
import io.cdap.plugin.gcp.dataplex.common.config.DataplexBaseConfig;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import javax.annotation.Nullable;

/**
 * Dataplex plugin UI configuration parameters and validation wrapper
 */
public class DataplexBatchSinkConfig extends DataplexBaseConfig {
  private static final Set<Schema.Type> SUPPORTED_CLUSTERING_TYPES =
    ImmutableSet.of(Schema.Type.INT, Schema.Type.LONG, Schema.Type.STRING, Schema.Type.BOOLEAN, Schema.Type.BYTES);
  private static final Set<FileFormat> SUPPORTED_FORMATS_FOR_CURATED_ZONE =
    ImmutableSet.of(FileFormat.AVRO, FileFormat.ORC, FileFormat.PARQUET);
  private static final int MAX_NUMBER_OF_COLUMNS = 4;
  private static final String NAME_SUFFIX = "suffix";
  private static final String NAME_TABLE = "table";
  private static final String NAME_ASSET = "asset";
  private static final String NAME_ASSET_TYPE = "assetType";
  private static final Logger LOG = LoggerFactory.getLogger(DataplexBatchSinkConfig.class);
  private static final String WHERE = "WHERE";
  protected static final String NAME_FORMAT = "format";
  private static final String NAME_TABLE_KEY = "tableKey";
  private static final String NAME_DEDUPE_BY = "dedupeBy";
  private static final String NAME_OPERATION = "operation";
  private static final String NAME_PARTITION_FILTER = "partitionFilter";
  private static final String NAME_PARTITIONING_TYPE = "partitioningType";
  private static final String NAME_TRUNCATE_TABLE = "truncateTable";
  private static final String NAME_UPDATE_DATAPLEX_METADATA = "updateDataplexMetadata";
  private static final String NAME_UPDATE_SCHEMA = "allowSchemaRelaxation";
  private static final String NAME_PARTITION_BY_FIELD = "partitionField";
  private static final String NAME_REQUIRE_PARTITION_FIELD = "requirePartitionField";
  private static final String NAME_CLUSTERING_ORDER = "clusteringOrder";
  private static final String NAME_RANGE_START = "rangeStart";
  private static final String NAME_RANGE_END = "rangeEnd";
  private static final String NAME_RANGE_INTERVAL = "rangeInterval";
  private static final String NAME_SCHEMA = "schema";
  private static final String CONTENT_TYPE_APPLICATION_JSON = "application/json";
  private static final String CONTENT_TYPE_APPLICATION_AVRO = "application/x-avro";
  private static final String CONTENT_TYPE_APPLICATION_PARQUET = "application/x-parquet";
  private static final String CONTENT_TYPE_APPLICATION_ORC = "application/x-orc";
  private static final String CONTENT_TYPE_TEXT_CSV = "text/csv";
  private static final String FORMAT_AVRO = "avro";
  private static final String FORMAT_CSV = "csv";
  private static final String FORMAT_JSON = "json";
  private static final String FORMAT_ORC = "orc";
  private static final String FORMAT_PARQUET = "parquet";
  private static final Pattern FIELD_PATTERN = Pattern.compile("[a-zA-Z0-9_]+");

  private static final Map<String, String> contentTypeMap = ImmutableMap.of(
    FORMAT_AVRO, CONTENT_TYPE_APPLICATION_AVRO,
    FORMAT_CSV, CONTENT_TYPE_TEXT_CSV,
    FORMAT_JSON, CONTENT_TYPE_APPLICATION_JSON,
    FORMAT_PARQUET, CONTENT_TYPE_APPLICATION_PARQUET,
    FORMAT_ORC, CONTENT_TYPE_APPLICATION_ORC
  );


  @Name(NAME_ASSET)
  @Macro
  @Description("ID of the Dataplex asset. It represents a cloud resource that is being managed within a lake as a " +
    "member of a zone.")
  protected String asset;

  @Name(NAME_ASSET_TYPE)
  @Nullable
  @Description("Type of asset selected to ingest the data in Dataplex.")
  protected String assetType;

  @Name(NAME_FORMAT)
  @Nullable
  @Macro
  @Description("The format to write the records in. The format for a raw zone must be one of ‘json’, ‘avro’," +
    " ‘csv’,‘orc’, or ‘parquet’.  The format for a curated zone must be one of ‘avro’, ‘orc’, or ‘parquet’.")
  protected String format;

  @Name(NAME_TABLE)
  @Nullable
  @Macro
  @Description("The table to write to. It can be BigQuery table if Asset is of Type 'BigQuery Dataset' or" +
    " a directory if Asset is of type 'Storage Bucket'")
  protected String table;

  @Name(NAME_TABLE_KEY)
  @Nullable
  @Macro
  @Description("List of fields that determine relation between tables during Update and Upsert operations.")
  protected String tableKey;

  @Name(NAME_DEDUPE_BY)
  @Nullable
  @Macro
  @Description("Column names and sort order used to choose which input record to update/upsert when there are " +
    "multiple input records with the same key. For example, if this is set to ‘updated_time desc’, then if there " +
    "are multiple input records with the same key, the one with the largest value for ‘updated_time’ will be " +
    "applied.")
  protected String dedupeBy;

  @Name(NAME_OPERATION)
  @Nullable
  @Macro
  @Description("Type of write operation to perform. This can be set to Insert, Update, or Upsert.")
  protected String operation;

  @Name(NAME_PARTITION_FILTER)
  @Nullable
  @Macro
  @Description("Partition filter that can be used for partition elimination during Update or Upsert operations." +
    " Only Use with Update or Upsert operations for tables where Require Partition Filter is enabled. For example," +
    " if the table is partitioned and the Partition Filter  is ‘_PARTITIONTIME > “2020-01-01” " +
    "and _PARTITIONTIME < “2020-03-01”‘, the update operation will be performed only in the" +
    " partitions meeting the criteria.")
  protected String partitionFilter;

  @Name(NAME_PARTITIONING_TYPE)
  @Nullable
  @Macro
  @Description("Specifies the partitioning type. Can either be Integer, Time, or None. Defaults to Time. " +
    "This value is ignored if the table already exists.")
  protected String partitioningType;

  @Name(NAME_RANGE_START)
  @Nullable
  @Macro
  @Description("Start value for range partitioning. The start value is inclusive. Ignored when table already exists")
  protected Long rangeStart;

  @Name(NAME_RANGE_END)
  @Nullable
  @Macro
  @Description("End value for range partitioning. The end value is exclusive. Ignored when table already exists")
  protected Long rangeEnd;

  @Name(NAME_RANGE_INTERVAL)
  @Nullable
  @Macro
  @Description("Interval value for range partitioning. The interval value must be a positive integer. Ignored when " +
    "table already exists")
  protected Long rangeInterval;

  @Name(NAME_TRUNCATE_TABLE)
  @Nullable
  @Macro
  @Description("Whether or not to truncate the table before writing to it. Only use with the Insert " +
    "operation.")
  protected Boolean truncateTable;

  @Name(NAME_UPDATE_DATAPLEX_METADATA)
  @Nullable
  @Macro
  @Description("Whether to update Dataplex metadata for the newly created entities." + 
    "If enabled, the pipeline will automatically copy the output schema to the destination " + 
    "Dataplex entities, and the automated Dataplex Discovery won't run for them.")
  protected Boolean updateDataplexMetadata;

  @Name(NAME_UPDATE_SCHEMA)
  @Nullable
  @Macro
  @Description("Whether the BigQuery table schema should be modified when it does not match the schema expected " +
    "by the pipeline.")
  protected Boolean allowSchemaRelaxation;

  @Name(NAME_PARTITION_BY_FIELD)
  @Nullable
  @Macro
  @Description("Partitioning column for the BigQuery table. Leave blank if the BigQuery table " +
    "is an ingestion-time partitioned table.")
  protected String partitionByField;

  @Name(NAME_REQUIRE_PARTITION_FIELD)
  @Nullable
  @Macro
  @Description("Whether to create a table that requires a partition filter. This value is ignored if the table " +
    "already exists.")
  protected Boolean requirePartitionField;

  @Name(NAME_CLUSTERING_ORDER)
  @Nullable
  @Macro
  @Description("List of fields that determines the sort order of the data. Fields must be of type INT, LONG, " +
    "STRING, DATE, TIMESTAMP, BOOLEAN, or DECIMAL. Tables cannot be clustered on more than 4 fields. This " +
    "value is only used when the BigQuery table is automatically created and ignored if the table already exists.")
  protected String clusteringOrder;

  @Nullable
  @Macro
  @Description("The time format for the output directory that will be appended to the path. " +
    "For example, the format 'yyyy-MM-dd-HH-mm' will result in a directory of the form '2015-01-01-20-42'.")
  private String suffix;

  @Name(NAME_SCHEMA)
  @Nullable
  @Macro
  @Description("The schema of the data to write. If provided, must be compatible with the table schema.")
  private String schema;

  public String getAsset() {
    return asset;
  }

  public String getAssetType() {
    return assetType;
  }

  @Nullable
  public FileFormat getFormat() {
    return FileFormat.from(format, FileFormat::canWrite);
  }

  @Nullable
  public String getFormatStr() {
    return format;
  }

  @Nullable
  public String getTable() {
    return table;
  }

  @Nullable
  public String getTableKey() {
    return Strings.isNullOrEmpty(tableKey) ? null : tableKey;
  }

  @Nullable
  public String getDedupeBy() {
    return Strings.isNullOrEmpty(dedupeBy) ? null : dedupeBy;
  }

  @Nullable
  public Operation getOperation() {
    return Strings.isNullOrEmpty(operation) ? Operation.INSERT : Operation.valueOf(operation.toUpperCase());
  }

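  /**
   * Returns the partition filter with any leading WHERE keyword stripped, or null when no filter is set.
   * For example, an illustrative configured value of "WHERE _PARTITIONTIME > '2020-01-01'" is returned as
   * "_PARTITIONTIME > '2020-01-01'".
   */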
  @Nullable
  public String getPartitionFilter() {
    if (Strings.isNullOrEmpty(partitionFilter)) {
      return null;
    }
    partitionFilter = partitionFilter.trim();
    // remove the WHERE keyword from the filter if the user adds it at the beginning of the expression
    if (partitionFilter.toUpperCase().startsWith(WHERE)) {
      partitionFilter = partitionFilter.substring(WHERE.length());
    }
    return partitionFilter;
  }

  @Nullable
  public PartitionType getPartitioningType() {
    return Strings.isNullOrEmpty(partitioningType) ? PartitionType.TIME
      : PartitionType.valueOf(partitioningType.toUpperCase());
  }

  @Nullable
  public Long getRangeStart() {
    return rangeStart;
  }

  @Nullable
  public Long getRangeEnd() {
    return rangeEnd;
  }

  @Nullable
  public Long getRangeInterval() {
    return rangeInterval;
  }

  @Nullable
  public Boolean isTruncateTable() {
    return truncateTable != null && truncateTable;
  }

  public JobInfo.WriteDisposition getWriteDisposition() {
    return isTruncateTable() ? JobInfo.WriteDisposition.WRITE_TRUNCATE
      : JobInfo.WriteDisposition.WRITE_APPEND;
  }

  @Nullable
  public Boolean isUpdateDataplexMetadata() {
    return updateDataplexMetadata != null && updateDataplexMetadata;
  }

  @Nullable
  public Boolean isUpdateTableSchema() {
    return allowSchemaRelaxation != null && allowSchemaRelaxation;
  }

  @Nullable
  public String getPartitionByField() {
    return Strings.isNullOrEmpty(partitionByField) ? null : partitionByField;
  }

  @Nullable
  public Boolean isRequirePartitionField() {
    return requirePartitionField != null && requirePartitionField;
  }

  @Nullable
  public String getClusteringOrder() {
    return Strings.isNullOrEmpty(clusteringOrder) ? null : clusteringOrder;
  }

  @Nullable
  public String getSuffix() {
    return suffix;
  }

  /**
   * Checks that the table name meets BigQuery naming standards and that truncation is only used with the
   * Insert operation.
   *
   * @param collector failure collector used to report validation errors
   */
  public void validateBigQueryDataset(FailureCollector collector) {
    if (!containsMacro(NAME_TABLE)) {
      if (table == null) {
        collector.addFailure(String.format("Required property '%s' has no value.", NAME_TABLE), null)
          .withConfigProperty(NAME_TABLE);

        collector.getOrThrowException();
      }
      BigQueryUtil.validateTable(table, NAME_TABLE, collector);
    }

    if (getWriteDisposition().equals(JobInfo.WriteDisposition.WRITE_TRUNCATE)
      && !getOperation().equals(Operation.INSERT)) {

      collector.addFailure("Truncate must only be used with operation 'Insert'.",
          "Set Truncate to false, or change the Operation to 'Insert'.")
        .withConfigProperty(NAME_TRUNCATE_TABLE).withConfigProperty(NAME_OPERATION);
    }
  }

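  /**
   * Parses the configured output schema, or returns null when no schema is set.
   * The schema uses the standard CDAP JSON representation; a minimal illustrative example (field names are
   * hypothetical) is:
   * <pre>{@code
   * {"type":"record","name":"output","fields":[
   *   {"name":"id","type":"long"},
   *   {"name":"name","type":["string","null"]}
   * ]}
   * }</pre>
   */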
  @Nullable
  public Schema getSchema(FailureCollector collector) {
    if (Strings.isNullOrEmpty(schema)) {
      return null;
    }
    try {
      return Schema.parseJson(schema);
    } catch (IOException e) {
      collector.addFailure("Invalid schema: " + e.getMessage(), null).
        withConfigProperty(NAME_SCHEMA);
    }
    // if there was an error that was added, it will throw an exception, otherwise,
    // this statement will not be executed
    throw collector.getOrThrowException();
  }

  /**
   * Validates the location, lake, zone and asset configurations.
   *
   * @param collector             failure collector used to report validation errors
   * @param dataplexServiceClient client used to look up the lake, zone and asset in Dataplex
   */
  public void validateAssetConfiguration(FailureCollector collector, DataplexServiceClient dataplexServiceClient) {
    if (!Strings.isNullOrEmpty(referenceName)) {
      IdUtils.validateReferenceName(referenceName, collector);
    }
    String projectID = tryGetProject();
    if (!Strings.isNullOrEmpty(location) && !containsMacro(NAME_LOCATION)) {
      if (!Strings.isNullOrEmpty(lake) && !containsMacro(NAME_LAKE)) {
        try {
          dataplexServiceClient.getLake(LakeName.newBuilder().setProject(projectID).setLocation(location)
            .setLake(lake).build());
        } catch (ApiException e) {
          if (e.getMessage().contains("Location")) {
            configureDataplexException(location, NAME_LOCATION, e, collector);
          } else {
            configureDataplexException(lake, NAME_LAKE, e, collector);
          }
          return;
        }

        if (!Strings.isNullOrEmpty(zone) && !containsMacro(NAME_ZONE)) {
          Zone zoneBean;
          try {
            zoneBean =
              dataplexServiceClient.getZone(ZoneName.newBuilder().setProject(projectID).setLocation(location)
                .setLake(lake).setZone(zone).build());
          } catch (ApiException e) {
            configureDataplexException(zone, NAME_ZONE, e, collector);
            return;
          }
          if (!Strings.isNullOrEmpty(asset) && !containsMacro(NAME_ASSET)) {
            try {
              Asset assetBean = dataplexServiceClient.getAsset(AssetName.newBuilder().setProject(projectID)
                .setLocation(location)
                .setLake(lake).setZone(zone).setAsset(asset).build());
              // assetType is nullable, so guard against NPE before comparing it with the actual asset type.
              if (assetType != null &&
                !assetType.equalsIgnoreCase(assetBean.getResourceSpec().getType().toString())) {
                collector.addFailure("Asset type doesn't match the actual asset's type.", null).
                  withConfigProperty(NAME_ASSET_TYPE);
              }
              if (zoneBean != null && assetBean != null && assetBean.getResourceSpec().getType().
                equals(Asset.ResourceSpec.Type.STORAGE_BUCKET) && zoneBean.getType()
                .equals(Zone.Type.CURATED) && !containsMacro(NAME_FORMAT) &&
                !Strings.isNullOrEmpty(format)) {
                FileFormat fileFormat = getFormat();
                // For curated zone only avro, orc and parquet are supported.
                if (!SUPPORTED_FORMATS_FOR_CURATED_ZONE.contains(fileFormat)) {
                  collector.addFailure(String.format("Format '%s' is not supported for curated zone",
                        fileFormat.toString().toLowerCase()),
                      null).
                    withConfigProperty(NAME_FORMAT);
                }
              }
            } catch (ApiException e) {
              configureDataplexException(asset, NAME_ASSET, e, collector);
              return;
            }
          }
        }
      }
    }
    collector.getOrThrowException();
  }

  /**
   * Validates the BigQuery dataset, table and columns against the selected properties.
   *
   * @param inputSchema           InputSchema
   * @param outputSchema          OutputSchema
   * @param collector             FailureCollector
   * @param dataplexServiceClient DataplexServiceClient
   */
  public void validateBigQueryDataset(@Nullable Schema inputSchema, @Nullable Schema outputSchema,
                                      FailureCollector collector,
                                      DataplexServiceClient dataplexServiceClient) {
    if (containsMacro(NAME_LOCATION) || containsMacro(NAME_LAKE) || containsMacro(NAME_ZONE) ||
      containsMacro(NAME_ASSET)) {
      return;
    }
    validateBigQueryDataset(collector);
    if (!containsMacro(NAME_SCHEMA)) {
      Schema schema = outputSchema == null ? inputSchema : outputSchema;
      try {
        Asset assetBean =
          dataplexServiceClient.getAsset(AssetName.newBuilder().setProject(tryGetProject()).setLocation(location)
            .setLake(lake).setZone(zone).setAsset(asset).build());
        String[] assetValues = assetBean.getResourceSpec().getName().split("/");
        String dataset = assetValues[assetValues.length - 1];
        String datasetProject = assetValues[assetValues.length - 3];
        validatePartitionProperties(schema, collector, dataset, datasetProject);
        validateClusteringOrder(schema, collector);
        validateOperationProperties(schema, collector);
        validateConfiguredSchema(schema, collector, dataset);
      } catch (Exception e) {
        // Validation against the existing asset/table is best-effort; log and continue on failure.
        LOG.debug("Unable to validate the configuration against the existing BigQuery table.", e);
      }

      if (outputSchema == null) {
        return;
      }

      List<String> schemaFields = Objects.requireNonNull(schema.getFields()).stream().
        map(Schema.Field::getName).map(String::toLowerCase).collect(Collectors.toList());

      final Set<String> duplicatedFields = BigQuerySinkUtils.getDuplicatedFields(schemaFields);

      for (Schema.Field field : outputSchema.getFields()) {
        String name = field.getName();
        // BigQuery column names only allow alphanumeric characters and _
        // https://cloud.google.com/bigquery/docs/schemas#column_names
        if (!FIELD_PATTERN.matcher(name).matches()) {
          collector
            .addFailure(String.format("Output field '%s' must only contain alphanumeric characters and '_'.",
              name), null).withOutputSchemaField(name);
        }

        // check if the required fields are present in the input schema.
        if (!field.getSchema().isNullable() && inputSchema != null &&
          inputSchema.getField(field.getName()) == null) {
          collector.addFailure(
              String.format("Required output field '%s' must be present in input schema.", field.getName()),
              "Change the field to be nullable.")
            .withOutputSchemaField(name);
        }

        // check if field is duplicated -> case insensitive
        if (duplicatedFields.contains(name.toLowerCase())) {
          collector.addFailure(
              String.format("Output field '%s' is duplicated.", name),
              "BigQuery is case insensitive and does not allow two fields with the same name.")
            .withOutputSchemaField(name);
        }
      }
    }
  }


  private void validateConfiguredSchema(Schema schema, FailureCollector collector, String dataset) {
    if (!this.shouldConnect()) {
      return;
    }
    String tableName = this.getTable();
    Table table = BigQueryUtil.getBigQueryTable(this.tryGetProject(), dataset, tableName,
      connection.getServiceAccount(), connection.isServiceAccountFilePath(),
      collector);

    if (table != null && !this.containsMacro(NAME_UPDATE_SCHEMA)) {
      // if table already exists, validate schema against underlying bigquery table
      com.google.cloud.bigquery.Schema bqSchema = table.getDefinition().getSchema();
      if (this.getOperation().equals(Operation.INSERT)) {
        BigQuerySinkUtils.validateInsertSchema(table, schema, isUpdateTableSchema(), isTruncateTable(), dataset,
          collector);
      } else if (this.getOperation().equals(Operation.UPSERT)) {
        BigQuerySinkUtils.validateSchema(tableName, bqSchema, schema, isUpdateTableSchema(), isTruncateTable(),
          dataset, collector);
      }
    }
  }


  private void validatePartitionProperties(@Nullable Schema schema, FailureCollector collector, String dataset,
                                           String datasetProject) {
    if (tryGetProject() == null) {
      return;
    }
    String project = datasetProject;
    String tableName = getTable();
    String serviceAccount = getServiceAccount();

    if (project == null || dataset == null || tableName == null || serviceAccount == null) {
      return;
    }

    Table table = BigQueryUtil.getBigQueryTable(project, dataset, tableName, serviceAccount,
      isServiceAccountFilePath(), collector);
    if (table != null) {
      StandardTableDefinition tableDefinition = table.getDefinition();
      TimePartitioning timePartitioning = tableDefinition.getTimePartitioning();
      RangePartitioning rangePartitioning = tableDefinition.getRangePartitioning();
      // Warn once if the existing table has no partitioning at all; otherwise validate the existing
      // partitioning against the configured partitioning type.
      if (timePartitioning == null && rangePartitioning == null) {
        LOG.warn(String.format(
          "The plugin is configured to auto-create a partitioned table, but table '%s' already " +
            "exists without partitioning. Please verify the partitioning configuration.",
          table.getTableId().getTable()));
      } else if (timePartitioning != null) {
        validateTimePartitionTableWithInputConfiguration(table, timePartitioning, collector);
      } else {
        validateRangePartitionTableWithInputConfiguration(table, rangePartitioning, collector);
      }
      validateColumnForPartition(partitionByField, schema, collector);
    }

  }

  private void validateTimePartitionTableWithInputConfiguration(Table table, TimePartitioning timePartitioning,
                                                                FailureCollector collector) {
    PartitionType partitioningType = getPartitioningType();
    if (partitioningType == PartitionType.TIME && timePartitioning.getField() != null &&
      !timePartitioning.getField()
        .equals(partitionByField)) {
      collector.addFailure(String.format("Destination table '%s' is partitioned by column '%s'.",
            table.getTableId().getTable(),
            timePartitioning.getField()),
          String.format("Set the partition field to '%s'.", timePartitioning.getField()))
        .withConfigProperty(NAME_PARTITION_BY_FIELD);
    } else if (partitioningType != PartitionType.TIME) {
      LOG.warn(String.format("The plugin is configured to %s, but table '%s' already " +
          "exists with Time partitioning. Please verify the partitioning configuration.",
        partitioningType == PartitionType.INTEGER ? "auto-create an Integer-partitioned table"
          : "auto-create a table without partitioning",
        table.getTableId().getTable()));
    }
  }

  private void validateRangePartitionTableWithInputConfiguration(Table table, RangePartitioning rangePartitioning,
                                                                 FailureCollector collector) {
    PartitionType partitioningType = getPartitioningType();
    if (partitioningType != PartitionType.INTEGER) {
      LOG.warn(String.format("The plugin is configured to %s, but table '%s' already " +
          "exists with Integer partitioning. Please verify the partitioning configuration.",
        partitioningType == PartitionType.TIME ? "auto-create a Time-partitioned table"
          : "auto-create a table without partitioning",
        table.getTableId().getTable()));
    } else if (rangePartitioning.getField() != null && !rangePartitioning.getField().equals(partitionByField)) {
      collector.addFailure(String.format("Destination table '%s' is partitioned by column '%s'.",
            table.getTableId().getTable(),
            rangePartitioning.getField()),
          String.format("Set the partition field to '%s'.", rangePartitioning.getField()))
        .withConfigProperty(NAME_PARTITION_BY_FIELD);
    }
  }

  private void validateColumnForPartition(@Nullable String columnName, @Nullable Schema schema,
                                          FailureCollector collector) {
    if (containsMacro(NAME_PARTITION_BY_FIELD) || containsMacro(NAME_PARTITIONING_TYPE) || schema == null) {
      return;
    }
    PartitionType partitioningType = getPartitioningType();
    if (Strings.isNullOrEmpty(columnName)) {
      if (partitioningType == PartitionType.INTEGER) {
        collector.addFailure("Partition column not provided.",
            "Set the column for integer partitioning.")
          .withConfigProperty(NAME_PARTITION_BY_FIELD);
      }
      return;
    }
    Schema.Field field = schema.getField(columnName);
    if (field == null) {
      collector.addFailure(String.format("Partition column '%s' must be present in the schema.", columnName),
          "Change the Partition column to be one of the schema fields.")
        .withConfigProperty(NAME_PARTITION_BY_FIELD);
      return;
    }
    Schema fieldSchema = field.getSchema();
    fieldSchema = fieldSchema.isNullable() ? fieldSchema.getNonNullable() : fieldSchema;
    if (partitioningType == PartitionType.TIME) {
      validateTimePartitioningColumn(columnName, collector, fieldSchema);
    } else if (partitioningType == PartitionType.INTEGER) {
      validateIntegerPartitioningColumn(columnName, collector, fieldSchema);
      validateIntegerPartitioningRange(getRangeStart(), getRangeEnd(), getRangeInterval(), collector);
    }
  }

  private void validateTimePartitioningColumn(String columnName, FailureCollector collector, Schema fieldSchema) {
    Schema.LogicalType logicalType = fieldSchema.getLogicalType();
    if (logicalType != Schema.LogicalType.DATE && logicalType != Schema.LogicalType.TIMESTAMP_MICROS
      && logicalType != Schema.LogicalType.TIMESTAMP_MILLIS) {
      collector.addFailure(
          String.format("Partition column '%s' is of invalid type '%s'.", columnName, fieldSchema.getDisplayName()),
          "Partition column must be a date or timestamp.")
        .withConfigProperty(NAME_PARTITION_BY_FIELD)
        .withOutputSchemaField(columnName).withInputSchemaField(columnName);
    }
  }

  private void validateIntegerPartitioningRange(Long rangeStart, Long rangeEnd, Long rangeInterval,
                                                FailureCollector collector) {
    if (!containsMacro(NAME_RANGE_START) && rangeStart == null) {
      collector.addFailure("Range Start is not defined.",
          "For Integer Partitioning, Range Start must be defined.")
        .withConfigProperty(NAME_RANGE_START);
    }
    if (!containsMacro(NAME_RANGE_END) && rangeEnd == null) {
      collector.addFailure("Range End is not defined.",
          "For Integer Partitioning, Range End must be defined.")
        .withConfigProperty(NAME_RANGE_END);
    }

    if (!containsMacro(NAME_RANGE_INTERVAL)) {
      if (rangeInterval == null) {
        collector.addFailure(
            "Range Interval is not defined.",
            "For Integer Partitioning, Range Interval must be defined.")
          .withConfigProperty(NAME_RANGE_INTERVAL);
      } else if (rangeInterval <= 0) {
        collector.addFailure(
            "Range Interval is not a positive number.",
            "Range interval must be a valid positive integer.")
          .withConfigProperty(NAME_RANGE_INTERVAL);
      }
    }
  }

  private void validateIntegerPartitioningColumn(String columnName, FailureCollector collector, Schema fieldSchema) {
    if (fieldSchema.getType() != Schema.Type.INT && fieldSchema.getType() != Schema.Type.LONG) {
      collector.addFailure(
          String.format("Partition column '%s' is of invalid type '%s'.", columnName, fieldSchema.getDisplayName()),
          "Partition column must be a int  or long.").withConfigProperty(NAME_PARTITION_BY_FIELD)
        .withOutputSchemaField(columnName).withInputSchemaField(columnName);
    }
  }

  private void validateClusteringOrder(@Nullable Schema schema, FailureCollector collector) {
    if (Strings.isNullOrEmpty(clusteringOrder) || schema == null) {
      return;
    }

    if (!containsMacro(NAME_PARTITION_BY_FIELD) && !containsMacro(NAME_CLUSTERING_ORDER) &&
      !Strings.isNullOrEmpty(clusteringOrder) && (Strings.isNullOrEmpty(partitionByField))) {
      collector.addFailure("Clustering order cannot be validated.",
        "Partition field must have a value.");
      return;
    }

    List<String> columnsNames = Arrays.stream(clusteringOrder.split(",")).map(String::trim)
      .collect(Collectors.toList());
    if (columnsNames.size() > MAX_NUMBER_OF_COLUMNS) {
      collector.addFailure(String.format("Found '%d' number of clustering fields.", columnsNames.size()),
          String.format("Expected at most '%d' clustering fields.", MAX_NUMBER_OF_COLUMNS))
        .withConfigProperty(NAME_CLUSTERING_ORDER);
      return;
    }

    for (String column : columnsNames) {
      Schema.Field field = schema.getField(column);
      if (field == null) {
        collector.addFailure(String.format("Clustering field '%s' does not exist in the schema.", column),
            "Ensure all clustering fields exist in the schema.")
          .withConfigElement(NAME_CLUSTERING_ORDER, column);
        continue;
      }
      Schema nonNullSchema = BigQueryUtil.getNonNullableSchema(field.getSchema());

      Schema.Type type = nonNullSchema.getType();
      Schema.LogicalType logicalType = nonNullSchema.getLogicalType();

      if (!SUPPORTED_CLUSTERING_TYPES.contains(type) && !BigQuerySinkUtils.isSupportedLogicalType(logicalType)) {
        collector.addFailure(
            String.format("Field '%s' is of unsupported type '%s'.", column, nonNullSchema.getDisplayName()),
            "Supported types are : string, bytes, int, long, boolean, date, timestamp and decimal.")
          .withConfigElement(NAME_CLUSTERING_ORDER, column).withInputSchemaField(column)
          .withOutputSchemaField(column);
      }
    }
  }

  private void validateOperationProperties(@Nullable Schema schema, FailureCollector collector) {
    if (containsMacro(NAME_OPERATION) || containsMacro(NAME_TABLE_KEY) || containsMacro(NAME_DEDUPE_BY)) {
      return;
    }
    Operation assetOperation = getOperation();
    if (Arrays.stream(Operation.values()).noneMatch(assetOperation::equals)) {
      collector.addFailure(
          String.format("Operation has incorrect value '%s'.", assetOperation),
          "Set the operation to 'Insert', 'Update', or 'Upsert'.")
        .withConfigElement(NAME_OPERATION, assetOperation.name().toLowerCase());
      return;
    }
    if (Operation.INSERT.equals(assetOperation)) {
      return;
    }
    boolean updateOrUpsertOperation =
      Operation.UPDATE.equals(assetOperation) || Operation.UPSERT.equals(assetOperation);

    if ((updateOrUpsertOperation) && getTableKey() == null) {
      collector.addFailure(
          "Table key must be set if the operation is 'Update' or 'Upsert'.", null)
        .withConfigProperty(NAME_TABLE_KEY).withConfigProperty(NAME_OPERATION);
      return;
    }

    if (schema == null) {
      return;
    }
    List<String> fields = Objects.requireNonNull(schema.getFields()).stream().map(Schema.Field::getName)
      .collect(Collectors.toList());
    List<String> keyFields = Arrays.stream(Objects.requireNonNull(getTableKey()).split(","))
      .map(String::trim).collect(Collectors.toList());

    for (String keyField : keyFields) {
      if (!fields.contains(keyField)) {
        collector.addFailure(
            String.format("Table key field '%s' does not exist in the schema.", keyField),
            "Change the Table key field to be one of the schema fields.")
          .withConfigElement(NAME_TABLE_KEY, keyField);
      }
    }

    Map<String, Integer> keyMap = BigQuerySinkUtils.calculateDuplicates(keyFields);
    keyMap.keySet().stream()
      .filter(key -> keyMap.get(key) != 1)
      .forEach(key -> collector.addFailure(
          String.format("Table key field '%s' is duplicated.", key),
          String.format("Remove duplicates of Table key field '%s'.", key))
        .withConfigElement(NAME_TABLE_KEY, key)
      );

    if ((updateOrUpsertOperation) && getDedupeBy() != null) {
      List<String> dedupeByList = Arrays.stream(Objects.requireNonNull(getDedupeBy()).split(","))
        .collect(Collectors.toList());

      // Validate the dedupe-by fields against the fields in the schema.
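      // Each entry has the form '<field> [asc|desc]'; only the field name (the token before the space)
      // is checked against the schema.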
      dedupeByList.stream()
        .filter(v -> !fields.contains(v.split(" ")[0]))
        .forEach(v -> collector.addFailure(
            String.format("Dedupe by field '%s' does not exist in the schema.", v.split(" ")[0]),
            "Change the Dedupe by field to be one of the schema fields.")
          .withConfigElement(NAME_DEDUPE_BY, v));

      Map<String, Integer> orderedByFieldMap = BigQuerySinkUtils.calculateDuplicates(dedupeByList);
      Map<String, String> orderedByFieldValueMap = dedupeByList.stream()
        .collect(Collectors.toMap(p -> p.split(" ")[0], p -> p, (x, y) -> y));

      orderedByFieldMap.keySet().stream()
        .filter(key -> orderedByFieldMap.get(key) != 1)
        .forEach(key -> collector.addFailure(
            String.format("Dedupe by field '%s' is duplicated.", key),
            String.format("Remove duplicates of Dedupe by field '%s'.", key))
          .withConfigElement(NAME_DEDUPE_BY, orderedByFieldValueMap.get(key))
        );
    }
  }


  /**
   * Returns true if a connection to Dataplex can be attempted, i.e. none of the asset, table, schema,
   * service account, or project properties contains a macro.
   */
  public boolean shouldConnect() {
    return !containsMacro(NAME_ASSET) && !containsMacro(NAME_TABLE) &&
      !containsMacro(GCPConnectorConfig.NAME_SERVICE_ACCOUNT_TYPE) &&
      !(containsMacro(GCPConnectorConfig.NAME_SERVICE_ACCOUNT_FILE_PATH) ||
        containsMacro(GCPConnectorConfig.NAME_SERVICE_ACCOUNT_JSON)) &&
      !containsMacro(GCPConnectorConfig.NAME_PROJECT) && !containsMacro(NAME_SCHEMA);
  }

  protected ValidatingOutputFormat getValidatingOutputFormat(PipelineConfigurer pipelineConfigurer) {
    return pipelineConfigurer.usePlugin("validatingOutputFormat",
      format.toLowerCase(), format.toLowerCase(), this.getRawProperties());
  }

  /**
   * Validates that the configured format can be ingested into Dataplex.
   *
   * @param pipelineConfigurer pipeline configurer used to look up the validating output format plugin
   * @param collector          failure collector used to report validation errors
   */
  public void validateFormatForStorageBucket(PipelineConfigurer pipelineConfigurer, FailureCollector collector) {
    if (!this.containsMacro(NAME_FORMAT) && Strings.isNullOrEmpty(format)) {
      collector.addFailure(String.format("Required field '%s' has no value.", NAME_FORMAT), null)
        .withConfigProperty(NAME_FORMAT);
      collector.getOrThrowException();
    }

    if (!this.containsMacro(NAME_FORMAT)) {
      // Validates output format for the selected format type
      String fileFormat = null;
      try {
        fileFormat = getFormat().toString().toLowerCase();
      } catch (IllegalArgumentException e) {
        collector.addFailure(e.getMessage(), null).withConfigProperty(NAME_FORMAT)
          .withStacktrace(e.getStackTrace());
      }
      ValidatingOutputFormat validatingOutputFormat = this.getValidatingOutputFormat(pipelineConfigurer);
      FormatContext context = new FormatContext(collector, pipelineConfigurer.getStageConfigurer().getInputSchema());
      this.validateOutputFormatProvider(context, fileFormat, validatingOutputFormat);
    } else {
      // Verify that every file format has a corresponding validating output format plugin registered.
      for (FileFormat f : FileFormat.values()) {
        try {
          pipelineConfigurer.usePlugin("validatingOutputFormat", f.name().toLowerCase(),
            f.name().toLowerCase(), this.getRawProperties());
        } catch (InvalidPluginConfigException e) {
          LOG.warn(
            "Failed to register format '{}', which means it cannot be used when the pipeline is run. " +
              "Missing properties: {}, invalid properties: {}",
            f.name(), e.getMissingProperties(), e.getInvalidProperties().stream().map(
              InvalidPluginProperty::getName).collect(Collectors.toList()));
        }
      }
    }
  }

  /**
   * Validates the schema using the validating output format plugin.
   *
   * @param context                FormatContext
   * @param format                 name of the format whose output format plugin performs the validation
   * @param validatingOutputFormat ValidatingOutputFormat
   */
  public void validateOutputFormatProvider(FormatContext context, String format,
                                           @Nullable ValidatingOutputFormat validatingOutputFormat) {
    FailureCollector collector = context.getFailureCollector();
    if (validatingOutputFormat == null) {
      collector.addFailure(String.format("Could not find the '%s' output format plugin.", format), null)
        .withPluginNotFound(format, format, "validatingOutputFormat");
    } else {
      validatingOutputFormat.validate(context);
    }
  }

  /**
   * Validates the Cloud Storage asset properties before ingestion into Dataplex.
   *
   * @param collector FailureCollector
   */
  public void validateStorageBucket(FailureCollector collector) {
    if (containsMacro(NAME_LOCATION) || containsMacro(NAME_LAKE) || containsMacro(NAME_ZONE) ||
      containsMacro(NAME_ASSET)) {
      return;
    }
    if (!containsMacro(NAME_TABLE)) {
      if (table == null) {
        collector.addFailure(String.format("Required property '%s' has no value.", NAME_TABLE), null)
          .withConfigProperty(NAME_TABLE);
        collector.getOrThrowException();
      }
    }
    if (!Strings.isNullOrEmpty(suffix) && !containsMacro(NAME_SUFFIX)) {
      try {
        new SimpleDateFormat(suffix);
      } catch (IllegalArgumentException e) {
        collector.addFailure("Invalid suffix.", "Ensure provided suffix is valid.")
          .withConfigProperty(NAME_SUFFIX).withStacktrace(e.getStackTrace());
      }
    }
    try {
      getSchema(collector);
    } catch (IllegalArgumentException e) {
      collector.addFailure(e.getMessage(), null).withConfigProperty(NAME_SCHEMA)
        .withStacktrace(e.getStackTrace());
    }
  }

  /**
   * Returns the content type set on the written objects for the given format, or null for an unknown format.
   * Valid content types for each format are:
   *
   *  avro -> application/avro, application/octet-stream
   *  json -> application/json, text/plain, application/octet-stream
   *  csv -> application/csv, text/csv, text/plain, application/octet-stream
   *  orc -> application/octet-stream
   *  parquet -> application/octet-stream
   */
  @Nullable
  public String getContentType(String format) {
    return contentTypeMap.get(format.toLowerCase());
  }


  private DataplexBatchSinkConfig(@Nullable String referenceName, String asset, @Nullable String assetType,
                                  @Nullable String location, @Nullable String lake, @Nullable String zone,
                                  @Nullable String format, @Nullable GCPConnectorConfig connection,
                                  @Nullable String table, @Nullable String tableKey, @Nullable String dedupeBy,
                                  @Nullable String operation, @Nullable String partitionFilter,
                                  @Nullable String partitioningType, @Nullable Long rangeStart,
                                  @Nullable Long rangeEnd, @Nullable Long rangeInterval,
                                  @Nullable Boolean truncateTable, @Nullable Boolean updateDataplexMetadata, 
                                  @Nullable Boolean allowSchemaRelaxation, @Nullable String partitionByField,
                                  @Nullable Boolean requirePartitionField, @Nullable String clusteringOrder, 
                                  @Nullable String suffix, @Nullable String schema) {
    this.referenceName = referenceName;
    this.connection = connection;
    this.location = location;
    this.lake = lake;
    this.zone = zone;
    this.asset = asset;
    this.assetType = assetType;
    this.format = format;
    this.table = table;
    this.tableKey = tableKey;
    this.dedupeBy = dedupeBy;
    this.operation = operation;
    this.partitionFilter = partitionFilter;
    this.partitioningType = partitioningType;
    this.rangeStart = rangeStart;
    this.rangeEnd = rangeEnd;
    this.rangeInterval = rangeInterval;
    this.truncateTable = truncateTable;
    this.updateDataplexMetadata = updateDataplexMetadata;
    this.allowSchemaRelaxation = allowSchemaRelaxation;
    this.partitionByField = partitionByField;
    this.requirePartitionField = requirePartitionField;
    this.clusteringOrder = clusteringOrder;
    this.suffix = suffix;
    this.schema = schema;
  }

  public static DataplexBatchSinkConfig.Builder builder() {
    return new DataplexBatchSinkConfig.Builder();
  }

  /**
   * Dataplex Batch Sink configuration builder.
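   *
   * <p>A minimal usage sketch; the property values below are illustrative, not defaults:
   * <pre>{@code
   * DataplexBatchSinkConfig config = DataplexBatchSinkConfig.builder()
   *   .setReferenceName("dataplexSink")
   *   .setLocation("us-central1")
   *   .setLake("my-lake")
   *   .setZone("my-zone")
   *   .setAsset("my-asset")
   *   .setFormat("avro")
   *   .setTable("my_table")
   *   .build();
   * }</pre>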
   */

  public static class Builder {
    private String asset;
    private String assetType;
    private String format;
    private String table;
    private String tableKey;
    private String dedupeBy;
    private String operation;
    private String partitionFilter;
    private String partitioningType;
    private Long rangeStart;
    private Long rangeEnd;
    private Long rangeInterval;
    private Boolean truncateTable;
    private Boolean updateDataplexMetadata;
    private Boolean allowSchemaRelaxation;
    private String partitionByField;
    private Boolean requirePartitionField;
    private String clusteringOrder;
    private String suffix;
    private String schema;
    private String location;
    private String lake;
    private String zone;
    private GCPConnectorConfig connection;
    private String referenceName;

    public Builder setAsset(String asset) {
      this.asset = asset;
      return this;
    }

    public Builder setAssetType(String assetType) {
      this.assetType = assetType;
      return this;
    }

    public Builder setFormat(String format) {
      this.format = format;
      return this;
    }

    public Builder setTable(String table) {
      this.table = table;
      return this;
    }

    public Builder setTableKey(String tableKey) {
      this.tableKey = tableKey;
      return this;
    }

    public Builder setDedupeBy(String dedupeBy) {
      this.dedupeBy = dedupeBy;
      return this;
    }

    public Builder setOperation(String operation) {
      this.operation = operation;
      return this;
    }

    public Builder setPartitionFilter(String partitionFilter) {
      this.partitionFilter = partitionFilter;
      return this;
    }

    public Builder setPartitioningType(String partitioningType) {
      this.partitioningType = partitioningType;
      return this;
    }

    public Builder setRangeStart(Long rangeStart) {
      this.rangeStart = rangeStart;
      return this;
    }

    public Builder setRangeEnd(Long rangeEnd) {
      this.rangeEnd = rangeEnd;
      return this;
    }

    public Builder setRangeInterval(Long rangeInterval) {
      this.rangeInterval = rangeInterval;
      return this;
    }

    public Builder setTruncateTable(Boolean truncateTable) {
      this.truncateTable = truncateTable;
      return this;
    }

    public Builder setUpdateDataplexMetadata(Boolean updateDataplexMetadata) {
      this.updateDataplexMetadata = updateDataplexMetadata;
      return this;
    }

    public Builder setAllowSchemaRelaxation(Boolean allowSchemaRelaxation) {
      this.allowSchemaRelaxation = allowSchemaRelaxation;
      return this;
    }

    public Builder setPartitionByField(String partitionByField) {
      this.partitionByField = partitionByField;
      return this;
    }

    public Builder setRequirePartitionField(Boolean requirePartitionField) {
      this.requirePartitionField = requirePartitionField;
      return this;
    }

    public Builder setClusteringOrder(String clusteringOrder) {
      this.clusteringOrder = clusteringOrder;
      return this;
    }

    public Builder setSuffix(String suffix) {
      this.suffix = suffix;
      return this;
    }

    public Builder setSchema(String schema) {
      this.schema = schema;
      return this;
    }

    public Builder setLocation(String location) {
      this.location = location;
      return this;
    }

    public Builder setLake(String lake) {
      this.lake = lake;
      return this;
    }

    public Builder setZone(String zone) {
      this.zone = zone;
      return this;
    }

    public Builder setConnection(GCPConnectorConfig connection) {
      this.connection = connection;
      return this;
    }

    public Builder setReferenceName(String referenceName) {
      this.referenceName = referenceName;
      return this;
    }

    public DataplexBatchSinkConfig build() {
      return new DataplexBatchSinkConfig(referenceName, asset, assetType, location, lake, zone, format, connection,
        table, tableKey, dedupeBy, operation, partitionFilter,
        partitioningType, rangeStart, rangeEnd, rangeInterval, truncateTable,
        updateDataplexMetadata, allowSchemaRelaxation, partitionByField,
        requirePartitionField, clusteringOrder, suffix, schema);
    }

  }

}
