All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hudi.cli.BootstrapExecutorUtils Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.cli;

import org.apache.hudi.DataSourceWriteOptions;
import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.common.config.HoodieConfig;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.model.HoodieTimelineTimeZone;
import org.apache.hudi.common.table.HoodieTableConfig;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.ReflectionUtils;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.common.util.ValidationUtils;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.config.HoodieCompactionConfig;
import org.apache.hudi.config.HoodieIndexConfig;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.hadoop.fs.HadoopFSUtils;
import org.apache.hudi.hive.HiveSyncConfig;
import org.apache.hudi.hive.HiveSyncTool;
import org.apache.hudi.index.HoodieIndex;
import org.apache.hudi.keygen.NonpartitionedKeyGenerator;
import org.apache.hudi.keygen.SimpleKeyGenerator;
import org.apache.hudi.keygen.TimestampBasedAvroKeyGenerator;
import org.apache.hudi.keygen.TimestampBasedKeyGenerator;
import org.apache.hudi.keygen.constant.KeyGeneratorType;
import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory;
import org.apache.hudi.util.SparkKeyGenUtils;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.api.java.JavaSparkContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.Serializable;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;

import static org.apache.hudi.common.table.HoodieTableConfig.ARCHIVELOG_FOLDER;
import static org.apache.hudi.common.table.HoodieTableConfig.PARTITION_METAFILE_USE_BASE_FORMAT;
import static org.apache.hudi.common.table.HoodieTableConfig.POPULATE_META_FIELDS;
import static org.apache.hudi.common.table.HoodieTableConfig.TIMELINE_TIMEZONE;
import static org.apache.hudi.common.util.ConfigUtils.filterProperties;
import static org.apache.hudi.config.HoodieIndexConfig.BUCKET_INDEX_HASH_FIELD;
import static org.apache.hudi.config.HoodieIndexConfig.BUCKET_INDEX_NUM_BUCKETS;
import static org.apache.hudi.config.HoodieWriteConfig.PRECOMBINE_FIELD_NAME;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_BUCKET_SYNC;
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_BUCKET_SYNC_SPEC;
import static org.apache.hudi.keygen.constant.KeyGeneratorOptions.HIVE_STYLE_PARTITIONING_ENABLE;
import static org.apache.hudi.keygen.constant.KeyGeneratorOptions.RECORDKEY_FIELD_NAME;
import static org.apache.hudi.keygen.constant.KeyGeneratorOptions.URL_ENCODE_PARTITIONING;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_PATH;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_TABLE_NAME;

/**
 * Performs bootstrap from a non-hudi source.
 * import static org.apache.hudi.common.util.ConfigUtils.filterProperties;
 */
public class BootstrapExecutorUtils implements Serializable {

  private static final Logger LOG = LoggerFactory.getLogger(BootstrapExecutorUtils.class);

  /**
   * Config.
   */
  private final Config cfg;

  /**
   * Spark context.
   */
  private final transient JavaSparkContext jssc;

  /**
   * Bag of properties with source, hoodie client, key generator etc.
   */
  private final TypedProperties props;

  /**
   * Hadoop Configuration.
   */
  private final Configuration configuration;

  /**
   * Bootstrap Configuration.
   */
  private final HoodieWriteConfig bootstrapConfig;

  /**
   * FileSystem instance.
   */
  private final transient FileSystem fs;

  private final String bootstrapBasePath;

  public static final String CHECKPOINT_KEY = HoodieWriteConfig.STREAMER_CHECKPOINT_KEY;

  /**
   * Bootstrap Executor.
   *
   * @param cfg        Hudi Streamer Config
   * @param jssc       Java Spark Context
   * @param fs         File System
   * @param properties Bootstrap Writer Properties
   * @throws IOException
   */
  public BootstrapExecutorUtils(Config cfg, JavaSparkContext jssc, FileSystem fs, Configuration conf,
                                TypedProperties properties) throws IOException {
    this.cfg = cfg;
    this.jssc = jssc;
    this.fs = fs;
    this.configuration = conf;
    this.props = properties;

    ValidationUtils.checkArgument(properties.containsKey(HoodieTableConfig.BOOTSTRAP_BASE_PATH
            .key()),
        HoodieTableConfig.BOOTSTRAP_BASE_PATH.key() + " must be specified.");
    this.bootstrapBasePath = properties.getString(HoodieTableConfig.BOOTSTRAP_BASE_PATH.key());

    // Add more defaults if full bootstrap requested
    this.props.putIfAbsent(DataSourceWriteOptions.PAYLOAD_CLASS_NAME().key(),
        DataSourceWriteOptions.PAYLOAD_CLASS_NAME().defaultValue());
    /*
     * Schema provider that supplies the command for reading the input and writing out the target table.
     */
    SchemaProvider schemaProvider = createSchemaProvider(cfg.schemaProviderClass, props, jssc);
    String keyGenClass = genKeyGenClassAndPartitionColumns().getLeft();
    HoodieWriteConfig.Builder builder =
        HoodieWriteConfig.newBuilder().withPath(cfg.basePath)
            .withCompactionConfig(HoodieCompactionConfig.newBuilder().withInlineCompaction(false).build())
            .forTable(cfg.tableName)
            .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
            .withAutoCommit(true)
            .withKeyGenerator(keyGenClass)
            .withProps(props);

    if (null != schemaProvider && null != schemaProvider.getTargetSchema()) {
      builder = builder.withSchema(schemaProvider.getTargetSchema().toString());
    }
    this.bootstrapConfig = builder.build();
    LOG.info("Created bootstrap executor with configs : " + bootstrapConfig.getProps());
  }

  public static SchemaProvider createSchemaProvider(String schemaProviderClass, TypedProperties cfg,
                                                    JavaSparkContext jssc) throws IOException {
    try {
      return StringUtils.isNullOrEmpty(schemaProviderClass) ? null
          : (SchemaProvider) ReflectionUtils.loadClass(schemaProviderClass, cfg, jssc);
    } catch (Throwable e) {
      throw new IOException("Could not load schema provider class " + schemaProviderClass, e);
    }
  }

  /**
   * Executes Bootstrap.
   */
  public void execute() throws IOException {
    initializeTable();

    try (SparkRDDWriteClient bootstrapClient = new SparkRDDWriteClient(new HoodieSparkEngineContext(jssc), bootstrapConfig)) {
      HashMap checkpointCommitMetadata = new HashMap<>();
      checkpointCommitMetadata.put(CHECKPOINT_KEY, Config.checkpoint);
      bootstrapClient.bootstrap(Option.of(checkpointCommitMetadata));
    } catch (Exception e) {
      Path basePath = new Path(cfg.basePath);
      if (fs.exists(basePath)) {
        LOG.warn("deleted target base path " + cfg.basePath);
        fs.delete(basePath, true);
      }
      throw new HoodieException("Failed to bootstrap table", e);
    }
    syncHive();
  }

  /**
   * Sync to Hive.
   */
  private void syncHive() {
    if (cfg.enableHiveSync) {
      TypedProperties metaProps = new TypedProperties();
      metaProps.putAll(props);
      metaProps.put(META_SYNC_DATABASE_NAME.key(), cfg.database);
      metaProps.put(META_SYNC_TABLE_NAME.key(), cfg.tableName);
      metaProps.put(META_SYNC_BASE_PATH.key(), cfg.basePath);
      metaProps.put(META_SYNC_BASE_FILE_FORMAT.key(), cfg.baseFileFormat);
      if (props.getBoolean(HIVE_SYNC_BUCKET_SYNC.key(), HIVE_SYNC_BUCKET_SYNC.defaultValue())) {
        metaProps.put(HIVE_SYNC_BUCKET_SYNC_SPEC.key(), HiveSyncConfig.getBucketSpec(props.getString(BUCKET_INDEX_HASH_FIELD.key()),
            props.getInteger(BUCKET_INDEX_NUM_BUCKETS.key())));
      }

      try (HiveSyncTool hiveSyncTool = new HiveSyncTool(metaProps, configuration)) {
        hiveSyncTool.syncHoodieTable();
      }
    }
  }

  private void initializeTable() throws IOException {
    Path basePath = new Path(cfg.basePath);
    if (fs.exists(basePath)) {
      if (cfg.bootstrapOverwrite) {
        LOG.warn("Target base path already exists, overwrite it");
        fs.delete(basePath, true);
      } else {
        throw new HoodieException("target base path already exists at " + cfg.basePath
            + ". Cannot bootstrap data on top of an existing table");
      }
    }
    Pair keyGenClassAndParCols = genKeyGenClassAndPartitionColumns();
    Map timestampKeyGeneratorConfigs =
        extractConfigsRelatedToTimestampBasedKeyGenerator(keyGenClassAndParCols.getLeft(), props);

    HoodieTableMetaClient.PropertyBuilder builder = HoodieTableMetaClient.withPropertyBuilder()
        .fromProperties(props)
        .setTableType(cfg.tableType)
        .setDatabaseName(cfg.database)
        .setTableName(cfg.tableName)
        .setRecordKeyFields(props.getString(RECORDKEY_FIELD_NAME.key()))
        .setPreCombineField(props.getString(PRECOMBINE_FIELD_NAME.key(), null))
        .setPopulateMetaFields(props.getBoolean(
            POPULATE_META_FIELDS.key(), POPULATE_META_FIELDS.defaultValue()))
        .setArchiveLogFolder(props.getString(
            ARCHIVELOG_FOLDER.key(), ARCHIVELOG_FOLDER.defaultValue()))
        .setPayloadClassName(cfg.payloadClass)
        .setBaseFileFormat(cfg.baseFileFormat)
        .setBootstrapIndexClass(cfg.bootstrapIndexClass)
        .setBootstrapBasePath(bootstrapBasePath)
        .setCDCEnabled(props.getBoolean(HoodieTableConfig.CDC_ENABLED.key(),
            HoodieTableConfig.CDC_ENABLED.defaultValue()))
        .setCDCSupplementalLoggingMode(props.getString(HoodieTableConfig.CDC_SUPPLEMENTAL_LOGGING_MODE.key(),
            HoodieTableConfig.CDC_SUPPLEMENTAL_LOGGING_MODE.defaultValue()))
        .setHiveStylePartitioningEnable(props.getBoolean(
            HIVE_STYLE_PARTITIONING_ENABLE.key(),
            Boolean.parseBoolean(HIVE_STYLE_PARTITIONING_ENABLE.defaultValue())
        ))
        .setUrlEncodePartitioning(props.getBoolean(
            URL_ENCODE_PARTITIONING.key(),
            Boolean.parseBoolean(URL_ENCODE_PARTITIONING.defaultValue())))
        .setCommitTimezone(HoodieTimelineTimeZone.valueOf(props.getString(
            TIMELINE_TIMEZONE.key(),
            String.valueOf(TIMELINE_TIMEZONE.defaultValue()))))
        .setPartitionMetafileUseBaseFormat(props.getBoolean(
            PARTITION_METAFILE_USE_BASE_FORMAT.key(),
            PARTITION_METAFILE_USE_BASE_FORMAT.defaultValue()))
        .set(timestampKeyGeneratorConfigs)
        .setKeyGeneratorClassProp(keyGenClassAndParCols.getLeft())
        .setPartitionFields(keyGenClassAndParCols.getRight());

    builder.initTable(HadoopFSUtils.getStorageConfWithCopy(jssc.hadoopConfiguration()), cfg.basePath);
  }

  private Pair genKeyGenClassAndPartitionColumns() {
    String keyGenClass;
    if (StringUtils.nonEmpty(props.getString(HoodieWriteConfig.KEYGENERATOR_CLASS_NAME.key(), null))) {
      keyGenClass = props.getString(HoodieWriteConfig.KEYGENERATOR_CLASS_NAME.key());
    } else if (StringUtils.nonEmpty(props.getString(HoodieWriteConfig.KEYGENERATOR_TYPE.key(), null))) {
      keyGenClass = HoodieSparkKeyGeneratorFactory.getKeyGeneratorClassName(props);
    } else {
      keyGenClass = KeyGeneratorType.getKeyGeneratorClassName(new HoodieConfig(props));
    }
    props.put(HoodieWriteConfig.KEYGENERATOR_CLASS_NAME.key(), keyGenClass);
    String partitionColumns = SparkKeyGenUtils.getPartitionColumns(props);

    if (StringUtils.isNullOrEmpty(partitionColumns)) {
      partitionColumns = null;
      if (keyGenClass.equals(SimpleKeyGenerator.class.getName())) {
        keyGenClass = NonpartitionedKeyGenerator.class.getName();
      }
    }
    return Pair.of(keyGenClass, partitionColumns);
  }

  private Map extractConfigsRelatedToTimestampBasedKeyGenerator(String keyGenerator, TypedProperties params) {
    if (TimestampBasedKeyGenerator.class.getCanonicalName().equals(keyGenerator)
        || TimestampBasedAvroKeyGenerator.class.getCanonicalName().equals(keyGenerator)) {
      return filterProperties(params, HoodieTableConfig.PERSISTED_CONFIG_LIST);
    }
    return Collections.emptyMap();
  }

  public static class Config {
    private String database;

    private String tableName;
    private String tableType;

    private String basePath;

    private String baseFileFormat;
    private String bootstrapIndexClass;
    private String schemaProviderClass;
    private String payloadClass;
    private Boolean enableHiveSync;

    private Boolean bootstrapOverwrite;

    public static String checkpoint = null;

    public void setDatabase(String database) {
      this.database = database;
    }

    public void setTableName(String tableName) {
      this.tableName = tableName;
    }

    public void setTableType(String tableType) {
      this.tableType = tableType;
    }

    public void setBasePath(String basePath) {
      this.basePath = basePath;
    }

    public void setBaseFileFormat(String baseFileFormat) {
      this.baseFileFormat = baseFileFormat;
    }

    public void setBootstrapIndexClass(String bootstrapIndexClass) {
      this.bootstrapIndexClass = bootstrapIndexClass;
    }

    public void setSchemaProviderClass(String schemaProviderClass) {
      this.schemaProviderClass = schemaProviderClass;
    }

    public void setPayloadClass(String payloadClass) {
      this.payloadClass = payloadClass;
    }

    public void setEnableHiveSync(Boolean enableHiveSync) {
      this.enableHiveSync = enableHiveSync;
    }

    public void setBootstrapOverwrite(Boolean bootstrapOverwrite) {
      this.bootstrapOverwrite = bootstrapOverwrite;
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy