/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.hive;
import com.beust.jcommander.JCommander;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.api.Partition;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieFileFormat;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.InvalidTableException;
import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils;
import org.apache.hudi.hive.util.ConfigUtils;
import org.apache.hudi.hive.util.HiveSchemaUtil;
import org.apache.hudi.hive.util.Parquet2SparkSchemaUtils;
import org.apache.hudi.sync.common.AbstractSyncHoodieClient.PartitionEvent;
import org.apache.hudi.sync.common.AbstractSyncHoodieClient.PartitionEvent.PartitionEventType;
import org.apache.hudi.sync.common.AbstractSyncTool;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.Type;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import static org.apache.parquet.schema.OriginalType.UTF8;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;
/**
 * Tool to sync a hoodie HDFS table with a hive metastore table. Use it either as an API (construct a
 * {@code HiveSyncTool} and call {@code syncHoodieTable()}) or from the command line:
 * {@code java -cp hoodie-hive-sync.jar HiveSyncTool [args]}.
 *
 * <p>This utility gets the schema from the latest commit and syncs the hive table schema. It also syncs the
 * partitions incrementally (all the partitions modified since the last synced commit).
 */
public class HiveSyncTool extends AbstractSyncTool {
private static final Logger LOG = LogManager.getLogger(HiveSyncTool.class);

// Suffix appended to the table name for the snapshot (real-time) view of a MERGE_ON_READ table.
public static final String SUFFIX_SNAPSHOT_TABLE = "_rt";
// Suffix appended to the table name for the read-optimized view of a MERGE_ON_READ table.
public static final String SUFFIX_READ_OPTIMIZED_TABLE = "_ro";

// Sync configuration supplied by the caller.
protected final HiveSyncConfig cfg;
// Stays null when client creation failed and cfg.ignoreExceptions is set.
protected HoodieHiveClient hoodieHiveClient = null;
// Name of the table synced as the snapshot view; assigned in the constructor.
protected String snapshotTableName = null;
// Name of the read-optimized table, if any. Typed as Option<String> so that get() yields a String.
protected Option<String> roTableName = null;
/**
 * Creates a sync tool for the table described by {@code cfg}.
 *
 * <p>If the Hive client cannot be created and {@code cfg.ignoreExceptions} is set, the failure is
 * only logged and {@code hoodieHiveClient} stays {@code null}; the table names are then left unset.
 *
 * @param cfg           sync configuration; its {@code partitionFields} is replaced by an empty list
 *                      when the {@code NonPartitionedExtractor} is configured
 * @param configuration Hive configuration; its properties are also handed to the superclass
 * @param fs            file system handed to the Hive client and to the superclass
 * @throws HoodieHiveSyncException if client creation fails and exceptions are not ignored
 * @throws InvalidTableException   if the table type is not one of the handled types
 */
public HiveSyncTool(HiveSyncConfig cfg, HiveConf configuration, FileSystem fs) {
  super(configuration.getAllProperties(), fs);

  try {
    this.hoodieHiveClient = new HoodieHiveClient(cfg, configuration, fs);
  } catch (RuntimeException e) {
    if (cfg.ignoreExceptions) {
      LOG.error("Got runtime exception when hive syncing, but continuing as ignoreExceptions config is set ", e);
    } else {
      throw new HoodieHiveSyncException("Got runtime exception when hive syncing", e);
    }
  }

  this.cfg = cfg;
  // Set partitionFields to empty, when the NonPartitionedExtractor is used
  if (NonPartitionedExtractor.class.getName().equals(cfg.partitionValueExtractorClass)) {
    LOG.warn("Set partitionFields to empty, since the NonPartitionedExtractor is used");
    cfg.partitionFields = new ArrayList<>();
  }

  if (hoodieHiveClient != null) {
    // NOTE(review): the case labels were missing from this block; they are inferred from the
    // branch bodies (a single plain table vs. "_rt"/"_ro" suffixed tables) — confirm.
    switch (hoodieHiveClient.getTableType()) {
      case COPY_ON_WRITE:
        this.snapshotTableName = cfg.tableName;
        this.roTableName = Option.empty();
        break;
      case MERGE_ON_READ:
        this.snapshotTableName = cfg.tableName + SUFFIX_SNAPSHOT_TABLE;
        // skipROSuffix registers the read-optimized table under the bare table name.
        this.roTableName = cfg.skipROSuffix
            ? Option.of(cfg.tableName)
            : Option.of(cfg.tableName + SUFFIX_READ_OPTIMIZED_TABLE);
        break;
      default:
        LOG.error("Unknown table type " + hoodieHiveClient.getTableType());
        throw new InvalidTableException(hoodieHiveClient.getBasePath());
    }
  }
}
public void syncHoodieTable() {
try {
if (hoodieHiveClient != null) {
} catch (RuntimeException re) {
throw new HoodieException("Got runtime exception when hive syncing " + cfg.tableName, re);
} finally {
if (hoodieHiveClient != null) {
protected void doSync() {
switch (hoodieHiveClient.getTableType()) {
syncHoodieTable(snapshotTableName, false, false);
// sync a RO table for MOR
syncHoodieTable(roTableName.get(), false, true);
// sync a RT table for MOR
syncHoodieTable(snapshotTableName, true, false);
LOG.error("Unknown table type " + hoodieHiveClient.getTableType());
throw new InvalidTableException(hoodieHiveClient.getBasePath());
protected void syncHoodieTable(String tableName, boolean useRealtimeInputFormat,
boolean readAsOptimized) {"Trying to sync hoodie table " + tableName + " with base path " + hoodieHiveClient.getBasePath()
+ " of type " + hoodieHiveClient.getTableType());
// check if the database exists else create it
if (cfg.autoCreateDatabase) {
try {
if (!hoodieHiveClient.doesDataBaseExist(cfg.databaseName)) {
} catch (Exception e) {
// this is harmless since table creation will fail anyways, creation of DB is needed for in-memory testing
LOG.warn("Unable to create database", e);
} else {
if (!hoodieHiveClient.doesDataBaseExist(cfg.databaseName)) {
throw new HoodieHiveSyncException("hive database does not exist " + cfg.databaseName);
// Check if the necessary table exists
boolean tableExists = hoodieHiveClient.doesTableExist(tableName);
// check if isDropPartition
boolean isDropPartition = hoodieHiveClient.isDropPartition();
// Get the parquet schema for this table looking at the latest commit
MessageType schema = hoodieHiveClient.getDataSchema();
// Currently HoodieBootstrapRelation does support reading bootstrap MOR rt table,
// so we disable the syncAsSparkDataSourceTable here to avoid read such kind table
// by the data source way (which will use the HoodieBootstrapRelation).
// TODO after we support bootstrap MOR rt table in HoodieBootstrapRelation[HUDI-2071], we can remove this logical.
if (hoodieHiveClient.isBootstrap()
&& hoodieHiveClient.getTableType() == HoodieTableType.MERGE_ON_READ
&& !readAsOptimized) {
cfg.syncAsSparkDataSourceTable = false;
// Sync schema if needed
boolean schemaChanged = syncSchema(tableName, tableExists, useRealtimeInputFormat, readAsOptimized, schema);"Schema sync complete. Syncing partitions for " + tableName);
// Get the last time we successfully synced partitions
Option lastCommitTimeSynced = Option.empty();
if (tableExists) {
lastCommitTimeSynced = hoodieHiveClient.getLastCommitTimeSynced(tableName);
}"Last commit time synced was found to be " + lastCommitTimeSynced.orElse("null"));
List writtenPartitionsSince = hoodieHiveClient.getPartitionsWrittenToSince(lastCommitTimeSynced);"Storage partitions scan complete. Found " + writtenPartitionsSince.size());
// Sync the partitions if needed
boolean partitionsChanged = syncPartitions(tableName, writtenPartitionsSince, isDropPartition);
boolean meetSyncConditions = schemaChanged || partitionsChanged;
if (!cfg.isConditionalSync || meetSyncConditions) {
}"Sync complete for " + tableName);
* Get the latest schema from the last commit and check if its in sync with the hive table schema. If not, evolves the
* table schema.
* @param tableExists - does table exist
* @param schema - extracted schema
private boolean syncSchema(String tableName, boolean tableExists, boolean useRealTimeInputFormat,
boolean readAsOptimized, MessageType schema) {
// Append spark table properties & serde properties
Map tableProperties = ConfigUtils.toMap(cfg.tableProperties);
Map serdeProperties = ConfigUtils.toMap(cfg.serdeProperties);
if (cfg.syncAsSparkDataSourceTable) {
Map sparkTableProperties = getSparkTableProperties(cfg.sparkSchemaLengthThreshold, schema);
Map sparkSerdeProperties = getSparkSerdeProperties(readAsOptimized);
boolean schemaChanged = false;
// Check and sync schema
if (!tableExists) {"Hive table " + tableName + " is not found. Creating it");
HoodieFileFormat baseFileFormat = HoodieFileFormat.valueOf(cfg.baseFileFormat.toUpperCase());
String inputFormatClassName = HoodieInputFormatUtils.getInputFormatClassName(baseFileFormat, useRealTimeInputFormat);
if (baseFileFormat.equals(HoodieFileFormat.PARQUET) && cfg.usePreApacheInputFormat) {
// Parquet input format had an InputFormat class visible under the old naming scheme.
inputFormatClassName = useRealTimeInputFormat
? com.uber.hoodie.hadoop.realtime.HoodieRealtimeInputFormat.class.getName()
: com.uber.hoodie.hadoop.HoodieInputFormat.class.getName();
String outputFormatClassName = HoodieInputFormatUtils.getOutputFormatClassName(baseFileFormat);
String serDeFormatClassName = HoodieInputFormatUtils.getSerDeClassName(baseFileFormat);
// Custom serde will not work with ALTER TABLE REPLACE COLUMNS
// /ql/exec/
hoodieHiveClient.createTable(tableName, schema, inputFormatClassName,
outputFormatClassName, serDeFormatClassName, serdeProperties, tableProperties);
schemaChanged = true;
} else {
// Check if the table schema has evolved
Map tableSchema = hoodieHiveClient.getTableSchema(tableName);
SchemaDifference schemaDiff = HiveSchemaUtil.getSchemaDifference(schema, tableSchema, cfg.partitionFields, cfg.supportTimestamp);
if (!schemaDiff.isEmpty()) {"Schema difference found for " + tableName);
hoodieHiveClient.updateTableDefinition(tableName, schema);
// Sync the table properties if the schema has changed
if (cfg.tableProperties != null || cfg.syncAsSparkDataSourceTable) {
hoodieHiveClient.updateTableProperties(tableName, tableProperties);"Sync table properties for " + tableName + ", table properties is: " + tableProperties);
schemaChanged = true;
} else {"No Schema difference for " + tableName);
return schemaChanged;
* Get Spark Sql related table properties. This is used for spark datasource table.
* @param schema The schema to write to the table.
* @return A new parameters added the spark's table properties.
private Map getSparkTableProperties(int schemaLengthThreshold, MessageType schema) {
// Convert the schema and partition info used by spark sql to hive table properties.
// The following code refers to the spark code in
GroupType originGroupType = schema.asGroupType();
List partitionNames = cfg.partitionFields;
List partitionCols = new ArrayList<>();
List dataCols = new ArrayList<>();
Map column2Field = new HashMap<>();
for (Type field : originGroupType.getFields()) {
column2Field.put(field.getName(), field);
// Get partition columns and data columns.
for (String partitionName : partitionNames) {
// Default the unknown partition fields to be String.
// Keep the same logical with HiveSchemaUtil#getPartitionKeyType.
new PrimitiveType(Type.Repetition.REQUIRED, BINARY, partitionName, UTF8)));
for (Type field : originGroupType.getFields()) {
if (!partitionNames.contains(field.getName())) {
List reOrderedFields = new ArrayList<>();
GroupType reOrderedType = new GroupType(originGroupType.getRepetition(), originGroupType.getName(), reOrderedFields);
Map sparkProperties = new HashMap<>();
sparkProperties.put("spark.sql.sources.provider", "hudi");
// Split the schema string to multi-parts according the schemaLengthThreshold size.
String schemaString = Parquet2SparkSchemaUtils.convertToSparkSchemaJson(reOrderedType);
int numSchemaPart = (schemaString.length() + schemaLengthThreshold - 1) / schemaLengthThreshold;
sparkProperties.put("spark.sql.sources.schema.numParts", String.valueOf(numSchemaPart));
// Add each part of schema string to sparkProperties
for (int i = 0; i < numSchemaPart; i++) {
int start = i * schemaLengthThreshold;
int end = Math.min(start + schemaLengthThreshold, schemaString.length());
sparkProperties.put("spark.sql.sources.schema.part." + i, schemaString.substring(start, end));
// Add partition columns
if (!partitionNames.isEmpty()) {
sparkProperties.put("spark.sql.sources.schema.numPartCols", String.valueOf(partitionNames.size()));
for (int i = 0; i < partitionNames.size(); i++) {
sparkProperties.put("spark.sql.sources.schema.partCol." + i, partitionNames.get(i));
return sparkProperties;
private Map getSparkSerdeProperties(boolean readAsOptimized) {
Map sparkSerdeProperties = new HashMap<>();
sparkSerdeProperties.put("path", cfg.basePath);
sparkSerdeProperties.put(ConfigUtils.IS_QUERY_AS_RO_TABLE, String.valueOf(readAsOptimized));
return sparkSerdeProperties;
* Syncs the list of storage partitions passed in (checks if the partition is in hive, if not adds it or if the
* partition path does not match, it updates the partition path).
private boolean syncPartitions(String tableName, List writtenPartitionsSince, boolean isDropPartition) {
boolean partitionsChanged;
try {
List hivePartitions = hoodieHiveClient.scanTablePartitions(tableName);
List partitionEvents =
hoodieHiveClient.getPartitionEvents(hivePartitions, writtenPartitionsSince, isDropPartition);
List newPartitions = filterPartitions(partitionEvents, PartitionEventType.ADD);
if (!newPartitions.isEmpty()) {"New Partitions " + newPartitions);
hoodieHiveClient.addPartitionsToTable(tableName, newPartitions);
List updatePartitions = filterPartitions(partitionEvents, PartitionEventType.UPDATE);
if (!updatePartitions.isEmpty()) {"Changed Partitions " + updatePartitions);
hoodieHiveClient.updatePartitionsToTable(tableName, updatePartitions);
List dropPartitions = filterPartitions(partitionEvents, PartitionEventType.DROP);
if (!dropPartitions.isEmpty()) {"Drop Partitions " + dropPartitions);
hoodieHiveClient.dropPartitionsToTable(tableName, dropPartitions);
partitionsChanged = !updatePartitions.isEmpty() || !newPartitions.isEmpty() || !dropPartitions.isEmpty();
} catch (Exception e) {
throw new HoodieHiveSyncException("Failed to sync partitions for table " + tableName, e);
return partitionsChanged;
private List filterPartitions(List events, PartitionEventType eventType) {
return -> s.eventType == eventType).map(s -> s.storagePartition)
public static void main(String[] args) {
// parse the params
final HiveSyncConfig cfg = new HiveSyncConfig();
JCommander cmd = new JCommander(cfg, null, args);
if ( || args.length == 0) {
FileSystem fs = FSUtils.getFs(cfg.basePath, new Configuration());
HiveConf hiveConf = new HiveConf();
new HiveSyncTool(cfg, hiveConf, fs).syncHoodieTable();