/*
 * Copyright © 2018 Cask Data, Inc.
 *
 *  Licensed under the Apache License, Version 2.0 (the "License"); you may not
 *  use this file except in compliance with the License. You may obtain a copy of
 *  the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 *  WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 *  License for the specific language governing permissions and limitations under
 *  the License.
 */

package io.cdap.plugin.gcp.bigquery.util;

import com.google.api.client.googleapis.media.MediaHttpUploader;
import com.google.cloud.bigquery.BigQuery;
import com.google.cloud.bigquery.BigQueryException;
import com.google.cloud.bigquery.Dataset;
import com.google.cloud.bigquery.Field;
import com.google.cloud.bigquery.FieldList;
import com.google.cloud.bigquery.LegacySQLTypeName;
import com.google.cloud.bigquery.StandardSQLTypeName;
import com.google.cloud.bigquery.StandardTableDefinition;
import com.google.cloud.bigquery.Table;
import com.google.cloud.bigquery.TableId;
import com.google.cloud.bigquery.TimePartitioning;
import com.google.cloud.hadoop.io.bigquery.BigQueryConfiguration;
import com.google.cloud.kms.v1.CryptoKeyName;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import io.cdap.cdap.api.data.schema.Schema;
import io.cdap.cdap.etl.api.FailureCollector;
import io.cdap.cdap.etl.api.validation.InvalidConfigPropertyException;
import io.cdap.cdap.etl.api.validation.InvalidStageException;
import io.cdap.cdap.etl.api.validation.ValidationFailure;
import io.cdap.plugin.gcp.bigquery.sink.AbstractBigQuerySinkConfig;
import io.cdap.plugin.gcp.bigquery.sink.BigQuerySink;
import io.cdap.plugin.gcp.bigquery.source.BigQuerySource;
import io.cdap.plugin.gcp.bigquery.source.BigQuerySourceConfig;
import io.cdap.plugin.gcp.bigquery.util.BigQueryTypeSize.BigNumeric;
import io.cdap.plugin.gcp.bigquery.util.BigQueryTypeSize.Numeric;
import io.cdap.plugin.gcp.common.GCPConfig;
import io.cdap.plugin.gcp.common.GCPUtils;
import io.cdap.plugin.gcp.gcs.GCSPath;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.security.UserGroupInformation;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
import java.util.zip.CRC32;
import java.util.zip.Checksum;
import javax.annotation.Nullable;

/**
 * Common utility class for BigQuery plugins such as {@link BigQuerySource} and {@link BigQuerySink}.
 */
public final class BigQueryUtil {

  private static final Logger LOG = LoggerFactory.getLogger(BigQueryUtil.class);

  private static final String DEFAULT_PARTITION_COLUMN_NAME = "_PARTITIONTIME";
  private static final String BIGQUERY_BUCKET_PREFIX_PROPERTY_NAME = "gcp.bigquery.bucket.prefix";

  public static final String BUCKET_PATTERN = "[a-z0-9._-]+";
  public static final String DATASET_PATTERN = "[A-Za-z0-9_]+";
  // Read more about table naming in BigQuery here: https://cloud.google.com/bigquery/docs/tables#table_naming
  public static final String TABLE_PATTERN = "[\\p{L}\\p{M}\\p{N}\\p{Pc}\\p{Pd}\\p{Zs}]+";

  // Tags for BQ Jobs
  public static final String BQ_JOB_TYPE_SOURCE_TAG = "bq_source_plugin";
  public static final String BQ_JOB_TYPE_EXECUTE_TAG = "bq_execute_plugin";
  public static final String BQ_JOB_TYPE_SINK_TAG = "bq_sink_plugin";
  public static final String BQ_JOB_TYPE_PUSHDOWN_TAG = "bq_pushdown";

  // arrays of arrays and arrays of maps are not supported by BigQuery
  public static final Set<Schema.Type> UNSUPPORTED_ARRAY_TYPES = ImmutableSet.of(Schema.Type.ARRAY, Schema.Type.MAP);

  // bigquery types to cdap schema types mapping
  public static final Map<LegacySQLTypeName, String> BQ_TYPE_MAP = ImmutableMap.<LegacySQLTypeName, String>builder()
    .put(LegacySQLTypeName.INTEGER, "long")
    .put(LegacySQLTypeName.FLOAT, "double")
    .put(LegacySQLTypeName.BOOLEAN, "boolean")
    .put(LegacySQLTypeName.BYTES, "bytes")
    .put(LegacySQLTypeName.RECORD, "record")
    .put(LegacySQLTypeName.STRING, "string or datetime")
    .put(LegacySQLTypeName.DATETIME, "datetime or string")
    .put(LegacySQLTypeName.DATE, "date")
    .put(LegacySQLTypeName.TIME, "time")
    .put(LegacySQLTypeName.TIMESTAMP, "timestamp")
    .put(LegacySQLTypeName.NUMERIC, "decimal")
    .build();

  public static final String BQ_JOB_SOURCE_TAG = "job_source";
  public static final String BQ_JOB_TYPE_TAG = "type";
  public static final Set<String> BQ_JOB_LABEL_SYSTEM_KEYS = ImmutableSet.of(BQ_JOB_SOURCE_TAG, BQ_JOB_TYPE_TAG);

  private static final Map<Schema.Type, Set<LegacySQLTypeName>> TYPE_MAP =
    ImmutableMap.<Schema.Type, Set<LegacySQLTypeName>>builder()
    .put(Schema.Type.INT, ImmutableSet.of(LegacySQLTypeName.INTEGER))
    .put(Schema.Type.LONG, ImmutableSet.of(LegacySQLTypeName.INTEGER))
    .put(Schema.Type.STRING, ImmutableSet.of(LegacySQLTypeName.STRING, LegacySQLTypeName.DATETIME,
            LegacySQLTypeName.valueOf("JSON")))
    .put(Schema.Type.FLOAT, ImmutableSet.of(LegacySQLTypeName.FLOAT))
    .put(Schema.Type.DOUBLE, ImmutableSet.of(LegacySQLTypeName.FLOAT))
    .put(Schema.Type.BOOLEAN, ImmutableSet.of(LegacySQLTypeName.BOOLEAN))
    .put(Schema.Type.BYTES, ImmutableSet.of(LegacySQLTypeName.BYTES))
    .put(Schema.Type.RECORD, ImmutableSet.of(LegacySQLTypeName.RECORD))
    .build();

  private static final Map<Schema.LogicalType, Set<LegacySQLTypeName>> LOGICAL_TYPE_MAP =
    ImmutableMap.<Schema.LogicalType, Set<LegacySQLTypeName>>builder()
      .put(Schema.LogicalType.DATE, ImmutableSet.of(LegacySQLTypeName.DATE))
      .put(Schema.LogicalType.DATETIME, ImmutableSet.of(LegacySQLTypeName.DATETIME, LegacySQLTypeName.STRING))
      .put(Schema.LogicalType.TIME_MILLIS, ImmutableSet.of(LegacySQLTypeName.TIME))
      .put(Schema.LogicalType.TIME_MICROS, ImmutableSet.of(LegacySQLTypeName.TIME))
      .put(Schema.LogicalType.TIMESTAMP_MILLIS, ImmutableSet.of(LegacySQLTypeName.TIMESTAMP))
      .put(Schema.LogicalType.TIMESTAMP_MICROS, ImmutableSet.of(LegacySQLTypeName.TIMESTAMP))
      .put(Schema.LogicalType.DECIMAL, ImmutableSet.of(LegacySQLTypeName.NUMERIC, LegacySQLTypeName.BIGNUMERIC))
      .build();

  /**
   * Gets non nullable type from provided schema.
   *
   * @param schema schema to be used
   * @return non-nullable {@link Schema}
   */
  public static Schema getNonNullableSchema(Schema schema) {
    return schema.isNullable() ? schema.getNonNullable() : schema;
  }

  /**
   * Get BigQuery {@link Configuration}.
   *
   * @param serviceAccountInfo service account file path or JSON content
   * @param projectId          BigQuery project ID
   * @param cmekKeyName        the name of the CMEK key
   * @param serviceAccountType type of the service account
   * @return Hadoop {@link Configuration} populated for BigQuery access
   * @throws IOException if not able to get credentials
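   *
   * <p>Illustrative usage (a sketch; the project ID is a placeholder and the service account type
   * literal below is an assumption about the expected value, not something defined in this class):
   * <pre>{@code
   * Configuration conf = BigQueryUtil.getBigQueryConfig(null, "my-project", null, "filePath");
   * }</pre>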
   */
  public static Configuration getBigQueryConfig(@Nullable String serviceAccountInfo, String projectId,
                                                @Nullable CryptoKeyName cmekKeyName, String serviceAccountType)
    throws IOException {
    Job job = Job.getInstance();

    // some input formats require the credentials to be present in the job. We don't know for
    // sure which ones (HCatalog is one of them), so we simply always add them. This has no other
    // effect, because this method is only used at configure time and will be ignored later on.
    if (UserGroupInformation.isSecurityEnabled()) {
      Credentials credentials = UserGroupInformation.getCurrentUser().getCredentials();
      job.getCredentials().addAll(credentials);
    }

    Configuration configuration = job.getConfiguration();
    configuration.clear();
    Map<String, String> authProperties =
      GCPUtils.generateBigQueryAuthProperties(serviceAccountInfo, serviceAccountType);
    authProperties.forEach(configuration::set);
    configuration.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem");
    configuration.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS");
    configuration.set("fs.gs.project.id", projectId);
    configuration.set("fs.gs.working.dir", GCSPath.ROOT_DIR);
    configuration.set(BigQueryConfiguration.PROJECT_ID.getKey(), projectId);
    if (cmekKeyName != null) {
      configuration.set(BigQueryConfiguration.OUTPUT_TABLE_KMS_KEY_NAME.getKey(),
          cmekKeyName.toString());
    }
    return configuration;
  }

  /**
   * Converts BigQuery Table Schema into a CDAP Schema object.
   *
   * @param bqSchema  BigQuery Schema to be converted.
   * @param collector Failure collector to collect failure messages for the client.
   * @return CDAP schema object
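   *
   * <p>Typical usage (a sketch; {@code table} is assumed to be a previously fetched {@link Table}):
   * <pre>{@code
   * com.google.cloud.bigquery.Schema bqSchema = table.getDefinition().getSchema();
   * Schema cdapSchema = BigQueryUtil.getTableSchema(bqSchema, collector);
   * }</pre>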
   */
  public static Schema getTableSchema(com.google.cloud.bigquery.Schema bqSchema, @Nullable FailureCollector collector) {
    FieldList fields = bqSchema.getFields();
    List<Schema.Field> schemafields = new ArrayList<>();

    for (Field field : fields) {
      Schema.Field schemaField = getSchemaField(field, collector);
      // if schema field is null, that means that there was a validation error. We will still continue in order to
      // collect more errors
      if (schemaField == null) {
        continue;
      }
      schemafields.add(schemaField);
    }
    if (schemafields.isEmpty() && collector != null && !collector.getValidationFailures().isEmpty()) {
      // throw if there was validation failure(s) added to the collector
      collector.getOrThrowException();
    }
    if (schemafields.isEmpty()) {
      return null;
    }
    return Schema.recordOf("output", schemafields);
  }

  /**
   * Converts BigQuery schema field into a corresponding CDAP Schema.Field.
   *
   * @param field     BigQuery field to be converted.
   * @param collector Failure collector to collect failure messages for the client.
   * @return A CDAP schema field
   */
  @Nullable
  public static Schema.Field getSchemaField(Field field, @Nullable FailureCollector collector) {
    return getSchemaField(field, collector, null);
  }

  /**
   * Converts BigQuery schema field into a corresponding CDAP Schema.Field.
   *
   * @param field        BigQuery field to be converted.
   * @param collector    Failure collector to collect failure messages for the client.
   * @param recordPrefix String to prepend to recordNames to make them unique
   * @return A CDAP schema field
   */
  @Nullable
  private static Schema.Field getSchemaField(Field field, @Nullable FailureCollector collector,
                                            @Nullable String recordPrefix) {
    Schema schema = convertFieldType(field, collector, recordPrefix);
    if (schema == null) {
      return null;
    }

    Field.Mode mode = field.getMode() == null ? Field.Mode.NULLABLE : field.getMode();
    switch (mode) {
      case NULLABLE:
        return Schema.Field.of(field.getName(), Schema.nullableOf(schema));
      case REQUIRED:
        return Schema.Field.of(field.getName(), schema);
      case REPEATED:
        return Schema.Field.of(field.getName(), Schema.arrayOf(schema));
      default:
        // this should not happen, unless a newer BigQuery version introduces a new mode that is not supported by
        // this plugin.
        String error = String.format("Field '%s' has unsupported mode '%s'.", field.getName(), mode);
        if (collector != null) {
          collector.addFailure(error, null);
        } else {
          throw new RuntimeException(error);
        }
    }
    return null;
  }

  /**
   * Converts BigQuery field type into a CDAP field type.
   *
   * @param field     BigQuery field to be converted.
   * @param collector Failure collector to collect failure messages for the client.
   * @return A CDAP field schema
   */
  @Nullable
  public static Schema convertFieldType(Field field, @Nullable FailureCollector collector) {
    return convertFieldType(field, collector, null);
  }

  /**
   * Converts BigQuery field type into a CDAP field type.
   *
   * @param field        BigQuery field to be converted.
   * @param collector    Failure collector to collect failure messages for the client.
   * @param recordPrefix String to add before a record name to ensure unique names.
   * @return A CDAP field schema
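   *
   * <p>For example, a BigQuery {@code TIMESTAMP} column maps to
   * {@code Schema.of(Schema.LogicalType.TIMESTAMP_MICROS)}, and a {@code NUMERIC} column maps to
   * {@code Schema.decimalOf(Numeric.PRECISION, Numeric.SCALE)}.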
   */
  @Nullable
  public static Schema convertFieldType(Field field, @Nullable FailureCollector collector,
                                        @Nullable String recordPrefix) {
    LegacySQLTypeName type = field.getType();
    // Treat JSON as string
    StandardSQLTypeName standardType = LegacySQLTypeName.valueOf("JSON").equals(type) ?
      StandardSQLTypeName.STRING : type.getStandardType();
    switch (standardType) {
      case FLOAT64:
        // float is a float64, so corresponding type becomes double
        return Schema.of(Schema.Type.DOUBLE);
      case BOOL:
        return Schema.of(Schema.Type.BOOLEAN);
      case INT64:
        // int is a int64, so corresponding type becomes long
        return Schema.of(Schema.Type.LONG);
      case STRING:
        return Schema.of(Schema.Type.STRING);
      case DATETIME:
        return Schema.of(Schema.LogicalType.DATETIME);
      case BYTES:
        return Schema.of(Schema.Type.BYTES);
      case TIME:
        return Schema.of(Schema.LogicalType.TIME_MICROS);
      case DATE:
        return Schema.of(Schema.LogicalType.DATE);
      case TIMESTAMP:
        return Schema.of(Schema.LogicalType.TIMESTAMP_MICROS);
      case NUMERIC:
        // bigquery has Numeric.PRECISION digits of precision and Numeric.SCALE digits of scale for NUMERIC.
        // https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#decimal_types
        return Schema.decimalOf(Numeric.PRECISION, Numeric.SCALE);
      case BIGNUMERIC:
        // bigquery has BigNumeric.PRECISION digits of precision and BigNumeric.SCALE digits of scale for BIGNUMERIC.
        // https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#decimal_types
        return Schema.decimalOf(BigNumeric.PRECISION, BigNumeric.SCALE);
      case STRUCT:
        FieldList fields = field.getSubFields();
        List<Schema.Field> schemaFields = new ArrayList<>();

        // Record names have to be unique as Avro doesn't allow redefining them.
        // We can make them unique by prepending the previous record names to their name.
        String recordTypeName = "";
        if (recordPrefix != null) {
          recordTypeName = recordPrefix + '.';
        }
        recordTypeName = recordTypeName + field.getName();

        for (Field f : fields) {
          Schema.Field schemaField = getSchemaField(f, collector, recordTypeName);
          // if schema field is null, that means that there was a validation error. We will still continue in order to
          // collect more errors
          if (schemaField == null) {
            continue;
          }
          schemaFields.add(schemaField);
        }
        // do not return schema for the struct field if none of the nested fields are of supported types
        if (!schemaFields.isEmpty()) {
          Schema namingSchema = Schema.recordOf(schemaFields);
          recordTypeName = recordTypeName + namingSchema.getRecordName();
          return Schema.recordOf(recordTypeName, schemaFields);
        } else {
          return null;
        }
      default:
        String error =
          String.format("BigQuery column '%s' is of unsupported type '%s'.", field.getName(), standardType.name());
        String action = String.format("Supported column types are: %s.",
                                      BigQueryUtil.BQ_TYPE_MAP.keySet().stream().map(t -> t.getStandardType().name())
                                        .collect(Collectors.joining(", ")));
        if (collector != null) {
          collector.addFailure(error, action);
        } else {
          throw new RuntimeException(error + action);
        }
        return null;
    }
  }

  /**
   * Validates if provided field schema matches with BigQuery table column type.
   *
   * @param bqField        bigquery table field
   * @param field          schema field
   * @param dataset        dataset name
   * @param table          table name
   * @param supportedTypes types supported
   * @param collector      failure collector
   * @return returns validation failure
   */
  @Nullable
  public static ValidationFailure validateFieldSchemaMatches(Field bqField, Schema.Field field, String dataset,
                                                             String table, Set<Schema.Type> supportedTypes,
                                                             FailureCollector collector) {
    // validate type of fields against BigQuery column type
    String name = field.getName();
    Schema fieldSchema = getNonNullableSchema(field.getSchema());
    Schema.Type type = fieldSchema.getType();
    Schema.LogicalType logicalType = fieldSchema.getLogicalType();

    // validate logical types
    if (logicalType != null) {
      if (LOGICAL_TYPE_MAP.get(logicalType) == null) {
        return collector.addFailure(
          String.format("Field '%s' is of unsupported type '%s'.", field.getName(), fieldSchema.getDisplayName()),
          String.format("Supported types are: %s, date, time, timestamp and decimal.",
                        supportedTypes.stream().map(t -> t.name().toLowerCase()).collect(Collectors.joining(", "))));
      }

      if (!LOGICAL_TYPE_MAP.get(logicalType).contains(bqField.getType())) {
        return collector.addFailure(
          String.format("Field '%s' of type '%s' has incompatible type with column '%s' in BigQuery table '%s.%s'.",
                        name, fieldSchema.getDisplayName(), bqField.getName(), dataset, table),
          String.format("Modify the input so that it is of type '%s'.", BQ_TYPE_MAP.get(bqField.getType())));
      }

      // BigQuery schema precision must be at most BigNumeric.PRECISION and scale at most BigNumeric.SCALE
      if (logicalType == Schema.LogicalType.DECIMAL) {
        if (fieldSchema.getPrecision() > BigNumeric.PRECISION || fieldSchema.getScale() > BigNumeric.SCALE) {
          return collector.addFailure(
              String.format("Decimal Field '%s' has invalid precision '%s' and scale '%s'. ",
                  name, fieldSchema.getPrecision(), fieldSchema.getScale()),
              String.format("Precision must be at most '%s' and scale must be at most '%s'.",
                  BigNumeric.PRECISION, BigNumeric.SCALE)
          );
        }
      }

      // Return once logical types are validated. This is because logical types are represented as primitive types
      // internally.
      return null;
    }

    // Complex types like maps and unions are not supported in BigQuery plugins.
    if (!supportedTypes.contains(type)) {
      return collector.addFailure(
        String.format("Field '%s' is of unsupported type '%s'.", name, type.name().toLowerCase()),
        String.format("Supported types are: %s, date, time, timestamp and decimal.",
                      supportedTypes.stream().map(t -> t.name().toLowerCase()).collect(Collectors.joining(", "))));
    }

    if (type == Schema.Type.ARRAY) {
      ValidationFailure failure = validateArraySchema(field.getSchema(), field.getName(), collector);
      if (failure != null) {
        return failure;
      }
      if (bqField.getMode() == Field.Mode.REPEATED) {
        fieldSchema = fieldSchema.getComponentSchema();
        type = fieldSchema.getType();
        logicalType = fieldSchema.getLogicalType();
      }
    }

    String[] incompatibleFieldErrorMessage = {
      String.format("Field '%s' of type '%s' is incompatible with column '%s' of type '%s' in BigQuery table '%s.%s'.",
          field.getName(), fieldSchema.getDisplayName(), bqField.getName(),
          BQ_TYPE_MAP.get(bqField.getType()), dataset, table) ,
      String.format("It must be of type '%s'.", BQ_TYPE_MAP.get(bqField.getType()))
    };
    if (logicalType != null) {
      if (LOGICAL_TYPE_MAP.get(logicalType) != null && !LOGICAL_TYPE_MAP.get(logicalType).contains(bqField.getType())) {
        return collector.addFailure(incompatibleFieldErrorMessage[0], incompatibleFieldErrorMessage[1]);
      }

      // Return once logical types are validated. This is because logical types are represented as primitive types
      // internally.
      return null;
    }

    if (TYPE_MAP.get(type) != null && !TYPE_MAP.get(type).contains(bqField.getType())) {
      return collector.addFailure(incompatibleFieldErrorMessage[0], incompatibleFieldErrorMessage[1]);
    }
    return null;
  }

  /**
   * Checks the mode of an output schema field against the corresponding BigQuery table field.
   *
   * @param bigQueryField         BigQuery table field
   * @param field                 output schema field
   * @param allowSchemaRelaxation whether schema relaxation is allowed
   * @param collector             failure collector
   */
  public static void validateFieldModeMatches(Field bigQueryField, Schema.Field field, boolean allowSchemaRelaxation,
                                              FailureCollector collector) {
    Field.Mode mode = bigQueryField.getMode();
    boolean isBqFieldNullable = mode == null || mode.equals(Field.Mode.NULLABLE);

    Schema fieldSchema = field.getSchema();
    if (!allowSchemaRelaxation && fieldSchema.isNullable() && !isBqFieldNullable) {
      // Nullable output schema field is incompatible with required BQ table field

      // In case of arrays, BigQuery handles null arrays by converting them to empty arrays at insert time
      if (!getNonNullableSchema(fieldSchema).getType().equals(Schema.Type.ARRAY)) {
        collector.addFailure(String.format("Field '%s' cannot be nullable.", bigQueryField.getName()),
                "Change the field to be required.")
                .withOutputSchemaField(field.getName());
      }
    }
  }

  /**
   * Get difference of schema fields and BigQuery table fields. The operation is equivalent to
   * (Names of schema fields - Names of BigQuery table fields).
   *
   * @param schemaFields schema fields
   * @param bqFields     bigquery table fields
   * @return list of remaining field names
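   *
   * <p>For example, if the schema has fields {@code [a, b, c]} and the BigQuery table has
   * columns {@code [b, c, d]}, the result is {@code [a]}.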
   */
  public static List<String> getSchemaMinusBqFields(List<Schema.Field> schemaFields, FieldList bqFields) {
    List<String> diff = new ArrayList<>();

    for (Schema.Field field : schemaFields) {
      diff.add(field.getName());
    }

    for (Field field : bqFields) {
      diff.remove(field.getName());
    }
    return diff;
  }

  /**
   * Get difference of BigQuery table fields and schema fields. The operation is equivalent to
   * (Names of BigQuery table fields - Names of schema fields).
   *
   * @param bqFields     bigquery table fields
   * @param schemaFields schema fields
   * @return list of remaining field names
   */
  public static List<String> getBqFieldsMinusSchema(FieldList bqFields, List<Schema.Field> schemaFields) {
    List<String> diff = new ArrayList<>();

    for (Field field : bqFields) {
      diff.add(field.getName());
    }

    for (Schema.Field field : schemaFields) {
      diff.remove(field.getName());
    }
    return diff;
  }

  /**
   * Converts Hadoop configuration to map of properties.
   *
   * @param config Hadoop configuration
   * @return properties map
   */
  public static Map<String, String> configToMap(Configuration config) {
    return StreamSupport.stream(config.spliterator(), false)
      .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
  }

  /**
   * Validates schema of type array. BigQuery does not allow nullable type within array.
   *
   * @param arraySchema schema of array field
   * @param name        name of the array field
   * @param collector   failure collector
   * @return returns validation failure if invalid array schema, otherwise returns null
   */
  @Nullable
  public static ValidationFailure validateArraySchema(Schema arraySchema, String name, FailureCollector collector) {
    Schema nonNullableSchema = getNonNullableSchema(arraySchema);
    Schema componentSchema = nonNullableSchema.getComponentSchema();

    if (componentSchema.isNullable()) {
      return collector.addFailure(String.format("Field '%s' contains null values in its array.", name),
                                  "Change the array component type to be non-nullable.");
    }

    if (UNSUPPORTED_ARRAY_TYPES.contains(componentSchema.getType())) {
      return collector.addFailure(String.format("Field '%s' is an array of unsupported type '%s'.",
                                                name, componentSchema.getDisplayName()),
                                  "Change the array component type to be a valid type.");
    }

    return null;
  }

  /**
   * Get BigQuery table.
   *
   * @param datasetProject           project that the dataset belongs to
   * @param datasetId                BigQuery dataset ID
   * @param tableName                BigQuery table name
   * @param serviceAccount           service account file path or JSON content
   * @param isServiceAccountFilePath indicator for whether service account is file or json
   * @return BigQuery table
   */
  @Nullable
  public static Table getBigQueryTable(String datasetProject, String datasetId, String tableName,
                                       @Nullable String serviceAccount, boolean isServiceAccountFilePath) {
    TableId tableId = TableId.of(datasetProject, datasetId, tableName);

    com.google.auth.Credentials credentials = null;
    if (serviceAccount != null) {
      try {
        credentials = GCPUtils.loadServiceAccountCredentials(serviceAccount, isServiceAccountFilePath);
      } catch (IOException e) {
        throw new InvalidConfigPropertyException(
          String.format("Unable to load credentials from %s", isServiceAccountFilePath ? serviceAccount : " JSON."),
          "serviceFilePath");
      }
    }
    BigQuery bigQuery = GCPUtils.getBigQuery(datasetProject, credentials);

    Table table;
    try {
      table = bigQuery.getTable(tableId);
    } catch (BigQueryException e) {
      throw new InvalidStageException("Unable to get details about the BigQuery table: " + e.getMessage(), e);
    }

    return table;
  }

  /**
   * Get BigQuery table.
   *
   * @param projectId          BigQuery project ID
   * @param datasetId          BigQuery dataset ID
   * @param tableName          BigQuery table name
   * @param serviceAccountPath service account file path
   * @param collector          failure collector
   * @return BigQuery table
   */
  @Nullable
  public static Table getBigQueryTable(String projectId, String datasetId, String tableName,
                                       @Nullable String serviceAccountPath, FailureCollector collector) {
    return getBigQueryTable(projectId, datasetId, tableName, serviceAccountPath, true, collector);
  }

  /**
   * Get BigQuery table.
   *
   * @param projectId                BigQuery project ID
   * @param dataset                  BigQuery dataset name
   * @param tableName                BigQuery table name
   * @param serviceAccount           service account file path or JSON content
   * @param isServiceAccountFilePath indicator for whether service account is file or json
   * @param collector                failure collector
   * @return BigQuery table
   */
  public static Table getBigQueryTable(String projectId, String dataset, String tableName,
                                       @Nullable String serviceAccount, @Nullable Boolean isServiceAccountFilePath,
                                       FailureCollector collector) {
    TableId tableId = TableId.of(projectId, dataset, tableName);
    com.google.auth.Credentials credentials = null;
    if (serviceAccount != null) {
      try {
        credentials = GCPUtils.loadServiceAccountCredentials(serviceAccount, isServiceAccountFilePath);
      } catch (IOException e) {
        collector.addFailure(String.format("Unable to load credentials from %s.",
                                           isServiceAccountFilePath ? serviceAccount : "provided JSON key"),
                             "Ensure the service account file is available on the local filesystem.")
          .withConfigProperty(GCPConfig.NAME_SERVICE_ACCOUNT_FILE_PATH);
        throw collector.getOrThrowException();
      }
    }
    BigQuery bigQuery = GCPUtils.getBigQuery(projectId, credentials);

    Table table = null;
    try {
      table = bigQuery.getTable(tableId);
    } catch (BigQueryException e) {
      collector.addFailure("Unable to get details about the BigQuery table: " + e.getMessage(), null)
        .withConfigProperty(BigQuerySourceConfig.NAME_TABLE);
      throw collector.getOrThrowException();
    }

    return table;
  }

  /**
   * Validates allowed characters for bucket name.
   *
   * @param bucket             bucket name
   * @param bucketPropertyName bucket name property
   * @param collector          failure collector
   */
  public static void validateBucket(String bucket, String bucketPropertyName, FailureCollector collector) {
    // Allowed character validation for bucket name as per https://cloud.google.com/storage/docs/naming
    String errorMessage = "Bucket name can only contain lowercase letters, numbers, '.', '_', and '-'.";
    match(bucket, bucketPropertyName, BUCKET_PATTERN, collector, errorMessage);
  }

  /**
   * Validates allowed characters for dataset name.
   *
   * @param dataset             dataset name
   * @param datasetPropertyName dataset name property
   * @param collector           failure collector
   */
  public static void validateDataset(String dataset, String datasetPropertyName, FailureCollector collector) {
    // Allowed character validation for dataset name as per https://cloud.google.com/bigquery/docs/datasets
    String errorMessage = "Dataset name can only contain letters (lower or uppercase), numbers and '_'.";
    match(dataset, datasetPropertyName, DATASET_PATTERN, collector, errorMessage);
  }

  /**
   * Validates allowed characters for table name.
   *
   * @param table             table name
   * @param tablePropertyName table name property
   * @param collector         failure collector
   */
  public static void validateTable(String table, String tablePropertyName, FailureCollector collector) {
    // Allowed character validation for table name as per https://cloud.google.com/bigquery/docs/tables
    String errorMessage = "Table name can only contain letters (lower or uppercase), numbers, '_' and '-'.";
    match(table, tablePropertyName, TABLE_PATTERN, collector, errorMessage);
  }

  /**
   * Validates allowed GCS Upload Request Chunk Size.
   *
   * @param chunkSize             provided chunk size
   * @param chunkSizePropertyName GCS chunk size name property
   * @param collector             failure collector
   */
  public static void validateGCSChunkSize(String chunkSize, String chunkSizePropertyName, FailureCollector collector) {
    if (!Strings.isNullOrEmpty(chunkSize)) {
      try {
        if (Integer.parseInt(chunkSize) % MediaHttpUploader.MINIMUM_CHUNK_SIZE != 0) {
          collector.addFailure(
            String.format("Value must be a multiple of %s.", MediaHttpUploader.MINIMUM_CHUNK_SIZE), null)
            .withConfigProperty(chunkSizePropertyName);
        }
      } catch (NumberFormatException e) {
        collector.addFailure(e.getMessage(), "Input value must be a valid number.")
          .withConfigProperty(chunkSizePropertyName);
      }
    }
  }

  /**
   * Matches text with provided pattern. If the text does not match the pattern, the method adds a new failure to
   * failure collector.
   *
   * @param text         text to be matched
   * @param propertyName property name
   * @param pattern      pattern
   * @param collector    failure collector
   * @param errorMessage error message
   */
  private static void match(String text, String propertyName, String pattern,
                            FailureCollector collector, String errorMessage) {
    if (!Strings.isNullOrEmpty(text)) {
      Pattern p = Pattern.compile(pattern);
      if (!p.matcher(text).matches()) {
        collector.addFailure(errorMessage, null).withConfigProperty(propertyName);
      }
    }
  }

  /**
   * Deletes temporary directory.
   *
   * @param configuration Hadoop Configuration.
   * @param dir           directory to delete
   */
  public static void deleteTemporaryDirectory(Configuration configuration, String dir) throws IOException {
    Path path = new Path(dir);
    FileSystem fs = path.getFileSystem(configuration);
    if (fs.exists(path)) {
      fs.delete(path, true);
      LOG.debug("Deleted temporary directory '{}'", path);
    }
  }

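  /**
   * Builds a WHERE-clause fragment restricting a query to a time-partition range; returns an empty
   * string if the table is not time-partitioned. Illustrative example: for a table partitioned on a
   * non-TIMESTAMP column {@code created_at}, a from-date of {@code 2024-01-01} and a to-date of
   * {@code 2024-02-01}, the generated condition is
   * {@code TIMESTAMP(`created_at`) >= TIMESTAMP("2024-01-01") and TIMESTAMP(`created_at`) < TIMESTAMP("2024-02-01")}.
   *
   * @param tableDefinition   definition of the (possibly) time-partitioned table
   * @param partitionFromDate inclusive lower bound of the partition range, or null for no lower bound
   * @param partitionToDate   exclusive upper bound of the partition range, or null for no upper bound
   * @return the partition filter condition, or an empty string if the table is not time-partitioned
   */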
  public static String generateTimePartitionCondition(StandardTableDefinition tableDefinition,
                                                      String partitionFromDate,
                                                      String partitionToDate) {

    TimePartitioning timePartitioning = tableDefinition.getTimePartitioning();

    if (timePartitioning == null) {
      return StringUtils.EMPTY;
    }

    StringBuilder timePartitionCondition = new StringBuilder();
    String columnName = timePartitioning.getField() != null ?
      timePartitioning.getField() : DEFAULT_PARTITION_COLUMN_NAME;

    LegacySQLTypeName columnType = null;
    if (!DEFAULT_PARTITION_COLUMN_NAME.equals(columnName)) {
      columnType = tableDefinition.getSchema().getFields().get(columnName).getType();
    }

    String columnNameTS = columnName;
    if (!LegacySQLTypeName.TIMESTAMP.equals(columnType)) {
      columnNameTS = "TIMESTAMP(`" + columnNameTS + "`)";
    }
    if (partitionFromDate != null) {
      timePartitionCondition.append(columnNameTS).append(" >= ").append("TIMESTAMP(\"")
        .append(partitionFromDate).append("\")");
    }
    if (partitionFromDate != null && partitionToDate != null) {
      timePartitionCondition.append(" and ");
    }
    if (partitionToDate != null) {
      timePartitionCondition.append(columnNameTS).append(" < ").append("TIMESTAMP(\"")
        .append(partitionToDate).append("\")");
    }
    return timePartitionCondition.toString();
  }

  /**
   * Get fully-qualified name (FQN) for a BQ table (FQN format:
   * bigquery:{projectId}.{datasetId}.{tableId}).
   *
   * @param datasetProject Name of the BQ project
   * @param datasetName Name of the BQ dataset
   * @param tableName Name of the BQ table
   * @return String fqn
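   *
   * <p>For example, {@code getFQN("my-project", "my_dataset", "my_table")} yields
   * {@code "bigquery:my-project.my_dataset.my_table"}, assuming
   * {@link GCPUtils#formatAsFQNComponent(String)} leaves these simple names unquoted.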
   */
  public static String getFQN(String datasetProject, String datasetName, String tableName) {

    String formattedProject = GCPUtils.formatAsFQNComponent(datasetProject);
    String formattedDataset = GCPUtils.formatAsFQNComponent(datasetName);
    String formattedTable = GCPUtils.formatAsFQNComponent(tableName);

    String fqn = String.format("%s:%s.%s.%s", BigQueryConstants.BQ_FQN_PREFIX, formattedProject,
        formattedDataset, formattedTable);
    LOG.trace("Formatted Fully-Qualified Name (FQN): {}", fqn);
    return fqn;
  }

  /**
   * Get the bucket prefix from the runtime arguments.
   *
   * @param arguments runtime arguments
   * @return the bucket prefix to use for this pipeline, or null if none is configured
   */
  @Nullable
  public static String getBucketPrefix(Map<String, String> arguments) {
    // If the bucket prefix property is set, use it.
    if (arguments.containsKey(BIGQUERY_BUCKET_PREFIX_PROPERTY_NAME)) {
      String bucketPrefix = arguments.get(BIGQUERY_BUCKET_PREFIX_PROPERTY_NAME);
      validateBucketPrefix(bucketPrefix);
      LOG.debug("Using bucket prefix for temporary buckets: {}", bucketPrefix);
      return bucketPrefix;
    }
    return null;
  }

  /**
   * Ensures configured bucket prefix is valid per the GCS naming convention.
   *
   * @param bucketPrefix bucket prefix to validate
   */
  private static void validateBucketPrefix(String bucketPrefix) {
    if (!bucketPrefix.matches("^[a-z0-9-_.]+$")) {
      throw new IllegalArgumentException("The configured bucket prefix '" + bucketPrefix + "' is not a valid bucket " +
                                           "name. Bucket names can only contain lowercase letters, numeric " +
                                           "characters, dashes (-), underscores (_), and dots (.).");
    }

    if (!bucketPrefix.contains(".") && bucketPrefix.length() > 50) {
      throw new IllegalArgumentException("The configured bucket prefix '" + bucketPrefix + "' should be 50 " +
                                           "characters or shorter.");
    }
  }

  /**
   * Generates the CRC32 checksum for a location.
   * We use this to keep the location-derived bucket suffix short (at most 8 hex characters).
   *
   * @param location location to checksum
   * @return checksum value as a hex string of at most 8 characters
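   *
   * <p>For example, {@code crc32location("US")} and {@code crc32location("us")} produce the same
   * digest, since the location is lower-cased before the checksum is computed.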
   */
  @VisibleForTesting
  public static String crc32location(String location) {
    byte[] bytes = location.toLowerCase().getBytes();
    Checksum checksum = new CRC32();
    checksum.update(bytes, 0, bytes.length);
    return Long.toHexString(checksum.getValue());
  }

  /**
   * Builds a bucket name by joining the bucket prefix and the CRC32 hash of the location with a hyphen (-).
   *
   * @param bucketPrefix bucket prefix
   * @param location     location to use
   * @return the derived bucket name
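   *
   * <p>For example, with prefix {@code "cdap-bq-tmp"} and location {@code "US"}, the result is
   * {@code "cdap-bq-tmp-" + crc32location("US")}, i.e. the prefix, a hyphen, and the location's
   * CRC32 hex digest.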
   */
  public static String getBucketNameForLocation(String bucketPrefix, String location) {
    return String.format("%s-%s", bucketPrefix, crc32location(location));
  }

  /**
   * Get labels for a BigQuery Job.
   * @param jobType the job type to set in the labels.
   * @return map containing labels for a BigQuery job launched by this plugin.
   */
  public static Map<String, String> getJobLabels(String jobType) {
    Map<String, String> labels = new HashMap<>();
    labels.put(BQ_JOB_SOURCE_TAG, "cdap");
    labels.put(BQ_JOB_TYPE_TAG, jobType);
    return labels;
  }

  /**
   * Get labels for a BigQuery Job.
   * @param jobType the job type to set in the labels.
   * @param userDefinedTags user defined tags to be added to the labels.
   * @return map containing labels for a BigQuery job launched by this plugin.
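   *
   * <p>For example, {@code getJobLabels("bq_sink_plugin", "team:data,env")} returns
   * {@code {job_source=cdap, type=bq_sink_plugin, team=data, env=}} (a tag without a value is
   * stored with an empty value).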
   */
  public static Map<String, String> getJobLabels(String jobType, String userDefinedTags) {
    Map<String, String> labels = getJobLabels(jobType);
    if (Strings.isNullOrEmpty(userDefinedTags)) {
        return labels;
    }
    for (String tag : userDefinedTags.trim().split(",")) {
      String[] keyValue = tag.split(":");
      if (keyValue.length == 1) {
        labels.put(keyValue[0], "");
      }
      if (keyValue.length == 2) {
        labels.put(keyValue[0], keyValue[1]);
      }
    }
    return labels;
  }

  /**
   * Identifies a staging bucket name from the pipeline context and plugin configuration.
   * @param arguments runtime arguments
   * @param configLocation location from plugin configuration
   * @param dataset BigQuery dataset
   * @param bucket bucket from plugin configuration
   * @return Bucket name to use for this sink.
   */
  @Nullable
  public static String getStagingBucketName(Map<String, String> arguments, @Nullable String configLocation,
                                            @Nullable Dataset dataset, @Nullable String bucket) {
    // Get Bucket Prefix from configuration
    String bucketPrefix = BigQueryUtil.getBucketPrefix(arguments);

    // If temp bucket name is not defined in configuration, and a common bucket name prefix has been specified,
    // we must set this prefix along with the destination location in order to create/reuse the bucket.
    // Otherwise, if temp bucket name is defined, or a prefix is not set, the configureBucket method will prepare
    // for a new bucket creation.
    if (Strings.isNullOrEmpty(bucket) && bucketPrefix != null) {
      // If the destination dataset exists, use the dataset location. Otherwise, use location from configuration.
      String datasetLocation = dataset != null ? dataset.getLocation() : configLocation;
      // Get the bucket name for the specified location.
      bucket = BigQueryUtil.getBucketNameForLocation(bucketPrefix, datasetLocation);
    }
    return bucket;
  }

  /**
   * Validates job label key-value pairs. Keys and values can contain only lowercase letters,
   * numeric characters, underscores, and dashes, as described in the BigQuery documentation on
   * labeling jobs.
   *
   * @param labelKeyValue       comma-separated list of labels in 'key:value' format
   * @param failureCollector    failure collector
   * @param stageConfigProperty name of the stage configuration property being validated
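   *
   * <p>For example, {@code "team:data,env:prod"} passes validation, while {@code "Team:data"} is
   * rejected because label keys may not contain uppercase letters.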
   */
  public static void validateJobLabelKeyValue(String labelKeyValue, FailureCollector failureCollector,
                                              String stageConfigProperty) {
    Set<String> reservedKeys = BQ_JOB_LABEL_SYSTEM_KEYS;
    int maxLabels = 64 - reservedKeys.size();
    int maxKeyLength = 63;
    int maxValueLength = 63;

    String validLabelKeyRegex = "^[\\p{L}][a-z0-9-_\\p{L}]+$";
    String validLabelValueRegex = "^[a-z0-9-_\\p{L}]+$";
    String capitalLetterRegex = ".*[A-Z].*";

    if (com.google.api.client.util.Strings.isNullOrEmpty(labelKeyValue)) {
      return;
    }

    String[] keyValuePairs = labelKeyValue.split(",");
    Set<String> uniqueKeys = new HashSet<>();

    for (String keyValuePair : keyValuePairs) {

      // Adding a label without a value is valid behavior
      // Read more here: https://cloud.google.com/bigquery/docs/adding-labels#adding_a_label_without_a_value
      String[] keyValue = keyValuePair.trim().split(":");
      boolean isKeyPresent = keyValue.length == 1 || keyValue.length == 2;
      boolean isValuePresent = keyValue.length == 2;


      if (!isKeyPresent) {
        failureCollector.addFailure(String.format("Invalid job label key value pair '%s'.", keyValuePair),
                        "Job label key value pair should be in the format 'key:value'.")
                .withConfigProperty(stageConfigProperty);
        continue;
      }

      // Check if key is reserved
      if (reservedKeys.contains(keyValue[0])) {
        failureCollector.addFailure(String.format("Invalid job label key '%s'.", keyValue[0]),
                "A system label already exists with same name.").withConfigProperty(stageConfigProperty);
        continue;
      }

      String key = keyValue[0];
      String value = isValuePresent ? keyValue[1] : "";
      boolean isKeyValid = true;
      boolean isValueValid = true;

      // Key cannot be empty
      if (com.google.api.client.util.Strings.isNullOrEmpty(key)) {
        failureCollector.addFailure(String.format("Invalid job label key '%s'.", key),
                "Job label key cannot be empty.").withConfigProperty(stageConfigProperty);
        isKeyValid = false;
      }

      // Key cannot be longer than 63 characters
      if (key.length() > maxKeyLength) {
        failureCollector.addFailure(String.format("Invalid job label key '%s'.", key),
                "Job label key cannot be longer than 63 characters.").withConfigProperty(stageConfigProperty);
        isKeyValid = false;
      }

      // Value cannot be longer than 63 characters
      if (value.length() > maxValueLength) {
        failureCollector.addFailure(String.format("Invalid job label value '%s'.", value),
                "Job label value cannot be longer than 63 characters.").withConfigProperty(stageConfigProperty);
        isValueValid = false;
      }

      if (isKeyValid && (!key.matches(validLabelKeyRegex) || key.matches(capitalLetterRegex))) {
        failureCollector.addFailure(String.format("Invalid job label key '%s'.", key),
                        "Job label key can only contain lowercase letters, numeric characters, " +
                                "underscores, and dashes. Check docs for more details.")
                .withConfigProperty(stageConfigProperty);
        isKeyValid = false;
      }

      if (isValuePresent && isValueValid &&
              (!value.matches(validLabelValueRegex) || value.matches(capitalLetterRegex))) {
        failureCollector.addFailure(String.format("Invalid job label value '%s'.", value),
                "Job label value can only contain lowercase letters, numeric characters, " +
                        "underscores, and dashes.").withConfigProperty(stageConfigProperty);
      }

      if (isKeyValid && !uniqueKeys.add(key)) {
        failureCollector.addFailure(String.format("Duplicate job label key '%s'.", key),
                "Job label key should be unique.").withConfigProperty(stageConfigProperty);
      }
    }
    // Check if number of labels is greater than 64 - reserved keys
    if (uniqueKeys.size() > maxLabels) {
      failureCollector.addFailure("Number of job labels exceeds the limit.",
                      String.format("Number of job labels cannot be greater than %d.", maxLabels))
              .withConfigProperty(stageConfigProperty);
    }
  }
}