All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hudi.hive.HiveSyncConfig Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.hive;

import org.apache.hudi.common.config.ConfigClassProperty;
import org.apache.hudi.common.config.ConfigGroups;
import org.apache.hudi.common.config.ConfigProperty;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.util.ValidationUtils;
import org.apache.hudi.sync.common.HoodieSyncConfig;

import com.beust.jcommander.Parameter;
import com.beust.jcommander.ParametersDelegate;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.conf.HiveConf;

import javax.annotation.concurrent.Immutable;

import java.util.Properties;

/**
 * Configs needed to sync data into the Hive Metastore.
 */
@Immutable
@ConfigClassProperty(name = "Hive Sync Configs",
    groupName = ConfigGroups.Names.META_SYNC,
    description = "Configurations used by the Hudi to sync metadata to Hive Metastore.")
public class HiveSyncConfig extends HoodieSyncConfig {

  /*
   * Config constants below are retained here for backward-compatibility (BWC) purposes.
   * Each one is a plain alias of the identically named constant in HiveSyncConfigHolder,
   * so code that still references HiveSyncConfig.<NAME> keeps resolving to the same object.
   */
  public static final ConfigProperty HIVE_SYNC_ENABLED = HiveSyncConfigHolder.HIVE_SYNC_ENABLED;
  public static final ConfigProperty HIVE_USER = HiveSyncConfigHolder.HIVE_USER;
  public static final ConfigProperty HIVE_PASS = HiveSyncConfigHolder.HIVE_PASS;
  public static final ConfigProperty HIVE_URL = HiveSyncConfigHolder.HIVE_URL;
  public static final ConfigProperty HIVE_USE_PRE_APACHE_INPUT_FORMAT = HiveSyncConfigHolder.HIVE_USE_PRE_APACHE_INPUT_FORMAT;
  /**
   * @deprecated Use {@link #HIVE_SYNC_MODE} instead of this config from 0.9.0
   */
  @Deprecated
  public static final ConfigProperty HIVE_USE_JDBC = HiveSyncConfigHolder.HIVE_USE_JDBC;
  public static final ConfigProperty METASTORE_URIS = HiveSyncConfigHolder.METASTORE_URIS;
  public static final ConfigProperty HIVE_AUTO_CREATE_DATABASE = HiveSyncConfigHolder.HIVE_AUTO_CREATE_DATABASE;
  public static final ConfigProperty HIVE_IGNORE_EXCEPTIONS = HiveSyncConfigHolder.HIVE_IGNORE_EXCEPTIONS;
  public static final ConfigProperty HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE = HiveSyncConfigHolder.HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE;
  public static final ConfigProperty HIVE_SUPPORT_TIMESTAMP_TYPE = HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE;
  public static final ConfigProperty HIVE_TABLE_PROPERTIES = HiveSyncConfigHolder.HIVE_TABLE_PROPERTIES;
  public static final ConfigProperty HIVE_TABLE_SERDE_PROPERTIES = HiveSyncConfigHolder.HIVE_TABLE_SERDE_PROPERTIES;
  public static final ConfigProperty HIVE_SYNC_AS_DATA_SOURCE_TABLE = HiveSyncConfigHolder.HIVE_SYNC_AS_DATA_SOURCE_TABLE;
  public static final ConfigProperty HIVE_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD = HiveSyncConfigHolder.HIVE_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD;
  public static final ConfigProperty HIVE_CREATE_MANAGED_TABLE = HiveSyncConfigHolder.HIVE_CREATE_MANAGED_TABLE;
  public static final ConfigProperty HIVE_SYNC_OMIT_METADATA_FIELDS = HiveSyncConfigHolder.HIVE_SYNC_OMIT_METADATA_FIELDS;
  public static final ConfigProperty HIVE_BATCH_SYNC_PARTITION_NUM = HiveSyncConfigHolder.HIVE_BATCH_SYNC_PARTITION_NUM;
  public static final ConfigProperty HIVE_SYNC_MODE = HiveSyncConfigHolder.HIVE_SYNC_MODE;
  public static final ConfigProperty HIVE_SYNC_BUCKET_SYNC = HiveSyncConfigHolder.HIVE_SYNC_BUCKET_SYNC;
  public static final ConfigProperty HIVE_SYNC_BUCKET_SYNC_SPEC = HiveSyncConfigHolder.HIVE_SYNC_BUCKET_SYNC_SPEC;
  public static final ConfigProperty HIVE_SYNC_COMMENT = HiveSyncConfigHolder.HIVE_SYNC_COMMENT;
  public static final ConfigProperty HIVE_SYNC_TABLE_STRATEGY = HiveSyncConfigHolder.HIVE_SYNC_TABLE_STRATEGY;

  /**
   * Advanced config keyed {@code hoodie.datasource.hive_sync.filter_pushdown_enabled}
   * (default {@code false}) that toggles pushing down partitions by filter.
   */
  public static final ConfigProperty HIVE_SYNC_FILTER_PUSHDOWN_ENABLED = ConfigProperty
      .key("hoodie.datasource.hive_sync.filter_pushdown_enabled")
      .defaultValue(false)
      .markAdvanced()
      .withDocumentation("Whether to enable push down partitions by filter");

  /**
   * Advanced config keyed {@code hoodie.datasource.hive_sync.filter_pushdown_max_size}
   * (default {@code 1000}) that caps the size of pushed-down partition filters.
   */
  public static final ConfigProperty HIVE_SYNC_FILTER_PUSHDOWN_MAX_SIZE = ConfigProperty
      .key("hoodie.datasource.hive_sync.filter_pushdown_max_size")
      .defaultValue(1000)
      .markAdvanced()
      // Each fragment that continues a sentence or starts a new one carries its own separating
      // space so the concatenated documentation does not run sentences together.
      .withDocumentation("Max size limit to push down partition filters, if the estimate push down "
          + "filters exceed this size, will directly try to fetch all partitions between the min/max. "
          + "In case of glue metastore, this value should be reduced because it has a filter length limit.");

  /**
   * Advanced config keyed {@code hoodie.datasource.hive_sync.recreate_table_on_error}
   * (default {@code false}, since 0.14.0) that lets the sync drop and re-create the Hive table.
   */
  public static final ConfigProperty RECREATE_HIVE_TABLE_ON_ERROR = ConfigProperty
      .key("hoodie.datasource.hive_sync.recreate_table_on_error")
      .defaultValue(false)
      .sinceVersion("0.14.0")
      .markAdvanced()
      // Documentation text fixed: "is not supported" (was "if not supported") and a separating
      // space between the two sentences.
      .withDocumentation("Hive sync may fail if the Hive table exists with partitions differing from the Hoodie table or if schema evolution is not supported by Hive. "
          + "Enabling this configuration will drop and create the table to match the Hoodie config");

  /**
   * Builds the Hive {@code CLUSTERED BY} clause describing a bucketed table.
   *
   * @param bucketCols bucket column list, inserted verbatim between the parentheses
   * @param bucketNum  number of buckets
   * @return a clause of the form {@code CLUSTERED BY (cols) INTO n BUCKETS}
   */
  public static String getBucketSpec(String bucketCols, int bucketNum) {
    // The column list has to be closed with ')' before INTO; without it the clause is not valid HiveQL.
    return "CLUSTERED BY (" + bucketCols + ") INTO " + bucketNum + " BUCKETS";
  }

  /**
   * Creates a Hive sync config from the given properties and validates it immediately.
   *
   * @param props properties handed to the parent sync config
   */
  public HiveSyncConfig(Properties props) {
    super(props);
    this.validateParameters();
  }

  /**
   * Creates a Hive sync config from the given properties and Hadoop configuration.
   * A fresh {@link HiveConf} with {@code hadoopConf} added as a resource replaces the
   * Hadoop configuration held by this config, after which the parameters are validated.
   *
   * @param props      properties handed to the parent sync config
   * @param hadoopConf Hadoop configuration added to the new {@link HiveConf} as a resource
   */
  public HiveSyncConfig(Properties props, Configuration hadoopConf) {
    super(props, hadoopConf);
    // HiveConf needs to load Hadoop conf to allow instantiation via AWSGlueClientFactory
    final HiveConf hiveConfWithHadoopResources = new HiveConf();
    hiveConfWithHadoopResources.addResource(hadoopConf);
    this.setHadoopConf(hiveConfWithHadoopResources);
    this.validateParameters();
  }

  /**
   * Returns the Hadoop configuration held by this config, cast to {@link HiveConf}.
   *
   * @return the held configuration as a {@link HiveConf}
   */
  public HiveConf getHiveConf() {
    final HiveConf hiveConf = (HiveConf) getHadoopConf();
    return hiveConf;
  }

  /**
   * Tells whether bucket sync is switched on.
   *
   * @return the value of {@link #HIVE_SYNC_BUCKET_SYNC}, or its default when unset
   */
  public boolean useBucketSync() {
    final boolean bucketSyncEnabled = getBooleanOrDefault(HIVE_SYNC_BUCKET_SYNC);
    return bucketSyncEnabled;
  }

  /**
   * Command-line parameters for Hive sync, declared with JCommander annotations.
   * Options shared with the generic sync config are delegated to {@link HoodieSyncConfigParams};
   * {@link #toProps()} turns every option that was set into its config property.
   */
  public static class HiveSyncConfigParams {

    @ParametersDelegate()
    public final HoodieSyncConfigParams hoodieSyncConfigParams = new HoodieSyncConfigParams();

    @Parameter(names = {"--user"}, description = "Hive username")
    public String hiveUser;
    @Parameter(names = {"--pass"}, description = "Hive password")
    public String hivePass;
    @Parameter(names = {"--jdbc-url"}, description = "Hive jdbc connect url")
    public String jdbcUrl;
    @Parameter(names = {"--use-pre-apache-input-format"},
        description = "Use InputFormat under com.uber.hoodie package "
            + "instead of org.apache.hudi package. Use this when you are in the process of migrating from "
            + "com.uber.hoodie to org.apache.hudi. Stop using this after you migrated the table definition to "
            + "org.apache.hudi input format.")
    public Boolean usePreApacheInputFormat;
    // Description fixed: it previously repeated the --jdbc-url text although this is a boolean switch.
    @Deprecated
    @Parameter(names = {"--use-jdbc"}, description = "Use JDBC to connect to Hive for sync. Deprecated, use --sync-mode instead")
    public Boolean useJdbc;
    @Parameter(names = {"--metastore-uris"}, description = "Hive metastore uris")
    public String metastoreUris;
    @Parameter(names = {"--sync-mode"}, description = "Mode to choose for Hive ops. Valid values are hms,glue,jdbc and hiveql")
    public String syncMode;
    @Parameter(names = {"--auto-create-database"}, description = "Auto create hive database")
    public Boolean autoCreateDatabase;
    @Parameter(names = {"--ignore-exceptions"}, description = "Ignore hive exceptions")
    public Boolean ignoreExceptions;
    @Parameter(names = {"--skip-ro-suffix"}, description = "Skip the `_ro` suffix for Read optimized table, when registering")
    public Boolean skipROSuffix;
    @Parameter(names = {"--table-properties"}, description = "Table properties to hive table")
    public String tableProperties;
    @Parameter(names = {"--serde-properties"}, description = "Serde properties to hive table")
    public String serdeProperties;
    // Separating space added so the two sentences do not run together in the help output.
    @Parameter(names = {"--support-timestamp"}, description = "'INT64' with original type TIMESTAMP_MICROS is converted to hive 'timestamp' type. "
        + "Disabled by default for backward compatibility.")
    public Boolean supportTimestamp;
    @Parameter(names = {"--managed-table"}, description = "Create a managed table")
    public Boolean createManagedTable;
    @Parameter(names = {"--omit-metafields"}, description = "Omit metafields in schema")
    public Boolean omitMetaFields;
    @Parameter(names = {"--batch-sync-num"}, description = "The number of partitions one batch when synchronous partitions to hive")
    public Integer batchSyncNum;
    @Parameter(names = {"--spark-datasource"}, description = "Whether sync this table as spark data source table.")
    public Boolean syncAsSparkDataSourceTable;
    @Parameter(names = {"--spark-schema-length-threshold"}, description = "The maximum length allowed in a single cell when storing additional schema information in Hive's metastore.")
    public Integer sparkSchemaLengthThreshold;
    @Parameter(names = {"--bucket-sync"}, description = "use bucket sync")
    public Boolean bucketSync;
    @Parameter(names = {"--bucket-spec"}, description = "bucket spec stored in metastore")
    public String bucketSpec;
    @Parameter(names = {"--sync-comment"}, description = "synchronize table comments to hive")
    public Boolean syncComment;

    @Parameter(names = {"--sync-strategy"}, description = "Hive table synchronization strategy. Available option: RO, RT, ALL")
    public String syncStrategy;

    /**
     * Reports whether help was requested, as answered by the delegated sync params.
     *
     * @return the result of {@code hoodieSyncConfigParams.isHelp()}
     */
    public boolean isHelp() {
      return hoodieSyncConfigParams.isHelp();
    }

    /**
     * Converts the parsed options into properties.
     * Starts from the properties of the delegated sync params and then sets each
     * Hive-specific key through {@code setPropertyIfNonNull}.
     *
     * @return the properties of the delegated params plus the Hive-specific options
     */
    public TypedProperties toProps() {
      final TypedProperties props = hoodieSyncConfigParams.toProps();
      props.setPropertyIfNonNull(HIVE_USER.key(), hiveUser);
      props.setPropertyIfNonNull(HIVE_PASS.key(), hivePass);
      props.setPropertyIfNonNull(HIVE_URL.key(), jdbcUrl);
      props.setPropertyIfNonNull(HIVE_USE_PRE_APACHE_INPUT_FORMAT.key(), usePreApacheInputFormat);
      props.setPropertyIfNonNull(HIVE_USE_JDBC.key(), useJdbc);
      props.setPropertyIfNonNull(HIVE_SYNC_MODE.key(), syncMode);
      props.setPropertyIfNonNull(METASTORE_URIS.key(), metastoreUris);
      props.setPropertyIfNonNull(HIVE_AUTO_CREATE_DATABASE.key(), autoCreateDatabase);
      props.setPropertyIfNonNull(HIVE_IGNORE_EXCEPTIONS.key(), ignoreExceptions);
      props.setPropertyIfNonNull(HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE.key(), skipROSuffix);
      props.setPropertyIfNonNull(HIVE_SUPPORT_TIMESTAMP_TYPE.key(), supportTimestamp);
      props.setPropertyIfNonNull(HIVE_TABLE_PROPERTIES.key(), tableProperties);
      props.setPropertyIfNonNull(HIVE_TABLE_SERDE_PROPERTIES.key(), serdeProperties);
      props.setPropertyIfNonNull(HIVE_SYNC_AS_DATA_SOURCE_TABLE.key(), syncAsSparkDataSourceTable);
      props.setPropertyIfNonNull(HIVE_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD.key(), sparkSchemaLengthThreshold);
      props.setPropertyIfNonNull(HIVE_CREATE_MANAGED_TABLE.key(), createManagedTable);
      props.setPropertyIfNonNull(HIVE_SYNC_OMIT_METADATA_FIELDS.key(), omitMetaFields);
      props.setPropertyIfNonNull(HIVE_BATCH_SYNC_PARTITION_NUM.key(), batchSyncNum);
      props.setPropertyIfNonNull(HIVE_SYNC_BUCKET_SYNC.key(), bucketSync);
      props.setPropertyIfNonNull(HIVE_SYNC_BUCKET_SYNC_SPEC.key(), bucketSpec);
      props.setPropertyIfNonNull(HIVE_SYNC_COMMENT.key(), syncComment);
      props.setPropertyIfNonNull(HIVE_SYNC_TABLE_STRATEGY.key(), syncStrategy);
      return props;
    }
  }

  /**
   * Checks that the configured batch sync partition number is strictly positive;
   * the check is delegated to {@code ValidationUtils.checkArgument}.
   */
  public void validateParameters() {
    final boolean batchSyncNumIsPositive = getIntOrDefault(HIVE_BATCH_SYNC_PARTITION_NUM) > 0;
    ValidationUtils.checkArgument(
        batchSyncNumIsPositive,
        "batch-sync-num for sync hive table must be greater than 0, pls check your parameter");
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy