
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.utilities.deltastreamer;
import org.apache.hudi.DataSourceWriteOptions;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.common.util.ValidationUtils;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.utilities.IdentitySplitter;
import org.apache.hudi.utilities.UtilHelpers;
import org.apache.hudi.utilities.schema.SchemaRegistryProvider;
import org.apache.hudi.utilities.sources.JsonDFSSource;
import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaSparkContext;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
* Wrapper over HoodieDeltaStreamer.java class.
* Helps with ingesting incremental data into hoodie datasets for multiple tables.
* Currently supports only COPY_ON_WRITE storage type.
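*
* <p>For reference, a minimal sketch of the common properties file this class reads (the property
* keys come from {@link Constants}; the database, table and path values below are hypothetical):
* <pre>
* hoodie.deltastreamer.ingestion.tablesToBeIngested=db1.table1,db2.table2
* hoodie.deltastreamer.ingestion.db1.table1.configFile=/configs/db1_table1_config.properties
* </pre>
* Tables without an explicit configFile entry fall back to a default file under the config folder
* (for the hypothetical db1.table1 above: {@code /configs/db1_table1_config.properties}).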
*/
public class HoodieMultiTableDeltaStreamer {
private static Logger logger = LogManager.getLogger(HoodieMultiTableDeltaStreamer.class);
private List<TableExecutionContext> tableExecutionContexts;
private transient JavaSparkContext jssc;
private Set<String> successTables;
private Set<String> failedTables;
public HoodieMultiTableDeltaStreamer(Config config, JavaSparkContext jssc) throws IOException {
this.tableExecutionContexts = new ArrayList<>();
this.successTables = new HashSet<>();
this.failedTables = new HashSet<>();
this.jssc = jssc;
String commonPropsFile = config.propsFilePath;
String configFolder = config.configFolder;
ValidationUtils.checkArgument(!config.filterDupes || config.operation != HoodieDeltaStreamer.Operation.UPSERT,
"'--filter-dupes' needs to be disabled when '--op' is 'UPSERT' to ensure updates are not missed.");
FileSystem fs = FSUtils.getFs(commonPropsFile, jssc.hadoopConfiguration());
configFolder = configFolder.charAt(configFolder.length() - 1) == '/' ? configFolder.substring(0, configFolder.length() - 1) : configFolder;
checkIfPropsFileAndConfigFolderExist(commonPropsFile, configFolder, fs);
TypedProperties properties = UtilHelpers.readConfig(fs, new Path(commonPropsFile), new ArrayList<>()).getConfig();
//get the tables to be ingested and their corresponding config files from this properties instance
populateTableExecutionContextList(properties, configFolder, fs, config);
}
private void checkIfPropsFileAndConfigFolderExist(String commonPropsFile, String configFolder, FileSystem fs) throws IOException {
if (!fs.exists(new Path(commonPropsFile))) {
throw new IllegalArgumentException("Please provide valid common config file path!");
}
if (!fs.exists(new Path(configFolder))) {
fs.mkdirs(new Path(configFolder));
}
}
private void checkIfTableConfigFileExists(String configFolder, FileSystem fs, String configFilePath) throws IOException {
if (!fs.exists(new Path(configFilePath)) || !fs.isFile(new Path(configFilePath))) {
throw new IllegalArgumentException("Please provide valid table config file path!");
}
Path path = new Path(configFilePath);
Path filePathInConfigFolder = new Path(configFolder, path.getName());
if (!fs.exists(filePathInConfigFolder)) {
FileUtil.copy(fs, path, fs, filePathInConfigFolder, false, fs.getConf());
}
}
//the common properties passed in hold the table-to-config-file mapping for every table to be ingested
private void populateTableExecutionContextList(TypedProperties properties, String configFolder, FileSystem fs, Config config) throws IOException {
List<String> tablesToBeIngested = getTablesToBeIngested(properties);
logger.info("tables to be ingested via MultiTableDeltaStreamer : " + tablesToBeIngested);
TableExecutionContext executionContext;
for (String table : tablesToBeIngested) {
String[] tableWithDatabase = table.split("\\.");
String database = tableWithDatabase.length > 1 ? tableWithDatabase[0] : "default";
String currentTable = tableWithDatabase.length > 1 ? tableWithDatabase[1] : table;
String configProp = Constants.INGESTION_PREFIX + database + Constants.DELIMITER + currentTable + Constants.INGESTION_CONFIG_SUFFIX;
String configFilePath = properties.getString(configProp, Helpers.getDefaultConfigFilePath(configFolder, database, currentTable));
checkIfTableConfigFileExists(configFolder, fs, configFilePath);
TypedProperties tableProperties = UtilHelpers.readConfig(fs, new Path(configFilePath), new ArrayList<>()).getConfig();
properties.forEach((k, v) -> {
tableProperties.setProperty(k.toString(), v.toString());
});
final HoodieDeltaStreamer.Config cfg = new HoodieDeltaStreamer.Config();
//copy all the values from config to cfg
String targetBasePath = resetTarget(config, database, currentTable);
Helpers.deepCopyConfigs(config, cfg);
String overriddenTargetBasePath = tableProperties.getString(Constants.TARGET_BASE_PATH_PROP, "");
cfg.targetBasePath = StringUtils.isNullOrEmpty(overriddenTargetBasePath) ? targetBasePath : overriddenTargetBasePath;
if (cfg.enableHiveSync && StringUtils.isNullOrEmpty(tableProperties.getString(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY(), ""))) {
throw new HoodieException("Hive sync table field not provided!");
}
populateSchemaProviderProps(cfg, tableProperties);
executionContext = new TableExecutionContext();
executionContext.setProperties(tableProperties);
executionContext.setConfig(cfg);
executionContext.setDatabase(database);
executionContext.setTableName(currentTable);
this.tableExecutionContexts.add(executionContext);
}
}
private List<String> getTablesToBeIngested(TypedProperties properties) {
String combinedTablesString = properties.getString(Constants.TABLES_TO_BE_INGESTED_PROP);
if (combinedTablesString == null) {
return new ArrayList<>();
}
String[] tablesArray = combinedTablesString.split(Constants.COMMA_SEPARATOR);
return Arrays.asList(tablesArray);
}
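// Example for populateSchemaProviderProps below (hypothetical values): with
// hoodie.deltastreamer.schemaprovider.registry.baseUrl=http://localhost:8081/subjects/,
// hoodie.deltastreamer.source.kafka.topic=impressions and
// hoodie.deltastreamer.schemaprovider.registry.urlSuffix=/versions/latest,
// both the source and target registry URLs resolve to
// http://localhost:8081/subjects/impressions/versions/latest.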
private void populateSchemaProviderProps(HoodieDeltaStreamer.Config cfg, TypedProperties typedProperties) {
// Null-safe comparison: schemaProviderClassName defaults to null in Config and may be unset.
if (SchemaRegistryProvider.class.getName().equals(cfg.schemaProviderClassName)) {
String schemaRegistryBaseUrl = typedProperties.getString(Constants.SCHEMA_REGISTRY_BASE_URL_PROP);
String schemaRegistrySuffix = typedProperties.getString(Constants.SCHEMA_REGISTRY_URL_SUFFIX_PROP);
typedProperties.setProperty(Constants.SOURCE_SCHEMA_REGISTRY_URL_PROP, schemaRegistryBaseUrl + typedProperties.getString(Constants.KAFKA_TOPIC_PROP) + schemaRegistrySuffix);
typedProperties.setProperty(Constants.TARGET_SCHEMA_REGISTRY_URL_PROP, schemaRegistryBaseUrl + typedProperties.getString(Constants.KAFKA_TOPIC_PROP) + schemaRegistrySuffix);
}
}
public static class Helpers {
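// Example for getDefaultConfigFilePath (hypothetical values): configFolder=/configs,
// database=db1, currentTable=table1 resolves to /configs/db1_table1_config.properties.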
static String getDefaultConfigFilePath(String configFolder, String database, String currentTable) {
return configFolder + Constants.FILE_DELIMITER + database + Constants.UNDERSCORE + currentTable + Constants.DEFAULT_CONFIG_FILE_NAME_SUFFIX;
}
static String getTableWithDatabase(TableExecutionContext context) {
return context.getDatabase() + Constants.DELIMITER + context.getTableName();
}
static void deepCopyConfigs(Config globalConfig, HoodieDeltaStreamer.Config tableConfig) {
tableConfig.enableHiveSync = globalConfig.enableHiveSync;
tableConfig.schemaProviderClassName = globalConfig.schemaProviderClassName;
tableConfig.sourceOrderingField = globalConfig.sourceOrderingField;
tableConfig.sourceClassName = globalConfig.sourceClassName;
tableConfig.tableType = globalConfig.tableType;
tableConfig.targetTableName = globalConfig.targetTableName;
tableConfig.operation = globalConfig.operation;
tableConfig.sourceLimit = globalConfig.sourceLimit;
tableConfig.checkpoint = globalConfig.checkpoint;
tableConfig.continuousMode = globalConfig.continuousMode;
tableConfig.filterDupes = globalConfig.filterDupes;
tableConfig.payloadClassName = globalConfig.payloadClassName;
tableConfig.forceDisableCompaction = globalConfig.forceDisableCompaction;
tableConfig.maxPendingCompactions = globalConfig.maxPendingCompactions;
tableConfig.minSyncIntervalSeconds = globalConfig.minSyncIntervalSeconds;
tableConfig.transformerClassNames = globalConfig.transformerClassNames;
tableConfig.commitOnErrors = globalConfig.commitOnErrors;
tableConfig.compactSchedulingMinShare = globalConfig.compactSchedulingMinShare;
tableConfig.compactSchedulingWeight = globalConfig.compactSchedulingWeight;
tableConfig.deltaSyncSchedulingMinShare = globalConfig.deltaSyncSchedulingMinShare;
tableConfig.deltaSyncSchedulingWeight = globalConfig.deltaSyncSchedulingWeight;
tableConfig.sparkMaster = globalConfig.sparkMaster;
}
}
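// A sketch of how this class is typically launched via spark-submit (jar name and all paths
// below are hypothetical; the flags are the ones defined in Config):
//
// spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieMultiTableDeltaStreamer \
//   hudi-utilities-bundle.jar \
//   --props /configs/common.properties \
//   --config-folder /configs \
//   --base-path-prefix /data/hudi \
//   --target-table dummy_table \
//   --table-type COPY_ON_WRITE \
//   --source-ordering-field ts \
//   --op UPSERT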
public static void main(String[] args) throws IOException {
final Config config = new Config();
JCommander cmd = new JCommander(config, null, args);
if (config.help || args.length == 0) {
cmd.usage();
System.exit(1);
}
JavaSparkContext jssc = UtilHelpers.buildSparkContext("multi-table-delta-streamer", Constants.LOCAL_SPARK_MASTER);
try {
new HoodieMultiTableDeltaStreamer(config, jssc).sync();
} finally {
jssc.stop();
}
}
public static class Config implements Serializable {
@Parameter(names = {"--base-path-prefix"},
description = "base path prefix for multi table support via HoodieMultiTableDeltaStreamer class")
public String basePathPrefix;
@Parameter(names = {"--target-table"}, description = "name of the target table", required = true)
public String targetTableName;
@Parameter(names = {"--table-type"}, description = "Type of table. COPY_ON_WRITE (or) MERGE_ON_READ", required = true)
public String tableType;
@Parameter(names = {"--config-folder"}, description = "Path to folder which contains all the properties file", required = true)
public String configFolder;
@Parameter(names = {"--props"}, description = "path to properties file on localfs or dfs, with configurations for "
+ "hoodie client, schema provider, key generator and data source. For hoodie client props, sane defaults are "
+ "used, but recommend use to provide basic things like metrics endpoints, hive configs etc. For sources, refer"
+ "to individual classes, for supported properties.")
public String propsFilePath =
"file://" + System.getProperty("user.dir") + "/src/test/resources/delta-streamer-config/dfs-source.properties";
@Parameter(names = {"--hoodie-conf"}, description = "Any configuration that can be set in the properties file "
+ "(using the CLI parameter \"--props\") can also be passed command line using this parameter. This can be repeated",
splitter = IdentitySplitter.class)
public List<String> configs = new ArrayList<>();
@Parameter(names = {"--source-class"},
description = "Subclass of org.apache.hudi.utilities.sources to read data. "
+ "Built-in options: org.apache.hudi.utilities.sources.{JsonDFSSource (default), AvroDFSSource, "
+ "JsonKafkaSource, AvroKafkaSource, HiveIncrPullSource}")
public String sourceClassName = JsonDFSSource.class.getName();
@Parameter(names = {"--source-ordering-field"}, description = "Field within source record to decide how"
+ " to break ties between records with same key in input data. Default: 'ts' holding unix timestamp of record")
public String sourceOrderingField = "ts";
@Parameter(names = {"--payload-class"}, description = "subclass of HoodieRecordPayload, that works off "
+ "a GenericRecord. Implement your own, if you want to do something other than overwriting existing value")
public String payloadClassName = OverwriteWithLatestAvroPayload.class.getName();
@Parameter(names = {"--schemaprovider-class"}, description = "subclass of org.apache.hudi.utilities.schema"
+ ".SchemaProvider to attach schemas to input & target table data, built in options: "
+ "org.apache.hudi.utilities.schema.FilebasedSchemaProvider."
+ "Source (See org.apache.hudi.utilities.sources.Source) implementation can implement their own SchemaProvider."
+ " For Sources that return Dataset, the schema is obtained implicitly. "
+ "However, this CLI option allows overriding the schemaprovider returned by Source.")
public String schemaProviderClassName = null;
@Parameter(names = {"--transformer-class"},
description = "A subclass or a list of subclasses of org.apache.hudi.utilities.transform.Transformer"
+ ". Allows transforming raw source Dataset to a target Dataset (conforming to target schema) before "
+ "writing. Default : Not set. E:g - org.apache.hudi.utilities.transform.SqlQueryBasedTransformer (which "
+ "allows a SQL query templated to be passed as a transformation function). "
+ "Pass a comma-separated list of subclass names to chain the transformations.")
public List<String> transformerClassNames = null;
@Parameter(names = {"--source-limit"}, description = "Maximum amount of data to read from source. "
+ "Default: No limit, e.g: DFS-Source => max bytes to read, Kafka-Source => max events to read")
public long sourceLimit = Long.MAX_VALUE;
@Parameter(names = {"--op"}, description = "Takes one of these values : UPSERT (default), INSERT (use when input "
+ "is purely new data/inserts to gain speed)", converter = HoodieDeltaStreamer.OperationConverter.class)
public HoodieDeltaStreamer.Operation operation = HoodieDeltaStreamer.Operation.UPSERT;
@Parameter(names = {"--filter-dupes"},
description = "Should duplicate records from source be dropped/filtered out before insert/bulk-insert")
public Boolean filterDupes = false;
@Parameter(names = {"--enable-hive-sync"}, description = "Enable syncing to hive")
public Boolean enableHiveSync = false;
@Parameter(names = {"--max-pending-compactions"},
description = "Maximum number of outstanding inflight/requested compactions. Delta Sync will not happen unless"
+ "outstanding compactions is less than this number")
public Integer maxPendingCompactions = 5;
@Parameter(names = {"--continuous"}, description = "Delta Streamer runs in continuous mode running"
+ " source-fetch -> Transform -> Hudi Write in loop")
public Boolean continuousMode = false;
@Parameter(names = {"--min-sync-interval-seconds"},
description = "the min sync interval of each sync in continuous mode")
public Integer minSyncIntervalSeconds = 0;
@Parameter(names = {"--spark-master"}, description = "spark master to use.")
public String sparkMaster = "local[2]";
@Parameter(names = {"--commit-on-errors"}, description = "Commit even when some records failed to be written")
public Boolean commitOnErrors = false;
@Parameter(names = {"--delta-sync-scheduling-weight"},
description = "Scheduling weight for delta sync as defined in "
+ "https://spark.apache.org/docs/latest/job-scheduling.html")
public Integer deltaSyncSchedulingWeight = 1;
@Parameter(names = {"--compact-scheduling-weight"}, description = "Scheduling weight for compaction as defined in "
+ "https://spark.apache.org/docs/latest/job-scheduling.html")
public Integer compactSchedulingWeight = 1;
@Parameter(names = {"--delta-sync-scheduling-minshare"}, description = "Minshare for delta sync as defined in "
+ "https://spark.apache.org/docs/latest/job-scheduling.html")
public Integer deltaSyncSchedulingMinShare = 0;
@Parameter(names = {"--compact-scheduling-minshare"}, description = "Minshare for compaction as defined in "
+ "https://spark.apache.org/docs/latest/job-scheduling.html")
public Integer compactSchedulingMinShare = 0;
/**
* Compaction is enabled for MoR table by default. This flag disables it
*/
@Parameter(names = {"--disable-compaction"},
description = "Compaction is enabled for MoR table by default. This flag disables it ")
public Boolean forceDisableCompaction = false;
/**
* Resume Delta Streamer from this checkpoint.
*/
@Parameter(names = {"--checkpoint"}, description = "Resume Delta Streamer from this checkpoint.")
public String checkpoint = null;
@Parameter(names = {"--help", "-h"}, help = true)
public Boolean help = false;
}
/**
* Resets target table name and target path using base-path-prefix.
*
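* <p>Example (hypothetical values): with base-path-prefix {@code /data/hudi}, database {@code db1}
* and table {@code table1}, this returns {@code /data/hudi/db1/table1} and sets
* {@code targetTableName} to {@code db1.table1}.
*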
* @param configuration global {@link Config} whose targetTableName is updated to database.tableName
* @param database database to which the table belongs
* @param tableName name of the table to be ingested
* @return target base path derived as base-path-prefix/database/tableName
*/
private static String resetTarget(Config configuration, String database, String tableName) {
String basePathPrefix = configuration.basePathPrefix;
basePathPrefix = basePathPrefix.charAt(basePathPrefix.length() - 1) == '/' ? basePathPrefix.substring(0, basePathPrefix.length() - 1) : basePathPrefix;
String targetBasePath = basePathPrefix + Constants.FILE_DELIMITER + database + Constants.FILE_DELIMITER + tableName;
configuration.targetTableName = database + Constants.DELIMITER + tableName;
return targetBasePath;
}
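// Programmatic use, a sketch (config and jssc built as in main() above):
//   HoodieMultiTableDeltaStreamer streamer = new HoodieMultiTableDeltaStreamer(config, jssc);
//   streamer.sync();
//   streamer.getFailedTables().forEach(table -> logger.warn("Ingestion failed for " + table));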
/**
* Creates actual HoodieDeltaStreamer objects for every table/topic and does incremental sync.
*/
public void sync() {
for (TableExecutionContext context : tableExecutionContexts) {
try {
new HoodieDeltaStreamer(context.getConfig(), jssc, context.getProperties()).sync();
successTables.add(Helpers.getTableWithDatabase(context));
} catch (Exception e) {
logger.error("error while running MultiTableDeltaStreamer for table: " + context.getTableName(), e);
failedTables.add(Helpers.getTableWithDatabase(context));
}
}
logger.info("Ingestion was successful for topics: " + successTables);
if (!failedTables.isEmpty()) {
logger.info("Ingestion failed for topics: " + failedTables);
}
}
public static class Constants {
public static final String KAFKA_TOPIC_PROP = "hoodie.deltastreamer.source.kafka.topic";
private static final String SOURCE_SCHEMA_REGISTRY_URL_PROP = "hoodie.deltastreamer.schemaprovider.registry.url";
private static final String TARGET_SCHEMA_REGISTRY_URL_PROP = "hoodie.deltastreamer.schemaprovider.registry.targetUrl";
public static final String HIVE_SYNC_TABLE_PROP = "hoodie.datasource.hive_sync.table";
private static final String SCHEMA_REGISTRY_BASE_URL_PROP = "hoodie.deltastreamer.schemaprovider.registry.baseUrl";
private static final String SCHEMA_REGISTRY_URL_SUFFIX_PROP = "hoodie.deltastreamer.schemaprovider.registry.urlSuffix";
private static final String TABLES_TO_BE_INGESTED_PROP = "hoodie.deltastreamer.ingestion.tablesToBeIngested";
private static final String INGESTION_PREFIX = "hoodie.deltastreamer.ingestion.";
private static final String INGESTION_CONFIG_SUFFIX = ".configFile";
private static final String DEFAULT_CONFIG_FILE_NAME_SUFFIX = "_config.properties";
private static final String TARGET_BASE_PATH_PROP = "hoodie.deltastreamer.ingestion.targetBasePath";
private static final String LOCAL_SPARK_MASTER = "local[2]";
private static final String FILE_DELIMITER = "/";
private static final String DELIMITER = ".";
private static final String UNDERSCORE = "_";
private static final String COMMA_SEPARATOR = ",";
}
public Set<String> getSuccessTables() {
return successTables;
}
public Set<String> getFailedTables() {
return failedTables;
}
public List<TableExecutionContext> getTableExecutionContexts() {
return this.tableExecutionContexts;
}
}