org.apache.hudi.hive.HiveSyncTool
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.hive;

import com.beust.jcommander.JCommander;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.api.Partition;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieFileFormat;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.InvalidTableException;
import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils;
import org.apache.hudi.hive.util.ConfigUtils;
import org.apache.hudi.hive.util.HiveSchemaUtil;
import org.apache.hudi.hive.util.Parquet2SparkSchemaUtils;
import org.apache.hudi.sync.common.AbstractSyncHoodieClient.PartitionEvent;
import org.apache.hudi.sync.common.AbstractSyncHoodieClient.PartitionEvent.PartitionEventType;
import org.apache.hudi.sync.common.AbstractSyncTool;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.Type;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import static org.apache.parquet.schema.OriginalType.UTF8;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;

/**
 * Tool to sync a hoodie HDFS table with a hive metastore table. Either use it as an API
 * HiveSyncTool.syncHoodieTable(HiveSyncConfig) or as a command line java -cp hoodie-hive-sync.jar HiveSyncTool [args]
 *
 * <p>This utility will get the schema from the latest commit and will sync the hive table schema. It will also sync
 * the partitions incrementally (all the partitions modified since the last commit).
 */
@SuppressWarnings("WeakerAccess")
public class HiveSyncTool extends AbstractSyncTool {

  private static final Logger LOG = LogManager.getLogger(HiveSyncTool.class);
  public static final String SUFFIX_SNAPSHOT_TABLE = "_rt";
  public static final String SUFFIX_READ_OPTIMIZED_TABLE = "_ro";

  protected final HiveSyncConfig cfg;
  protected HoodieHiveClient hoodieHiveClient = null;
  protected String snapshotTableName = null;
  protected Option<String> roTableName = null;

  public HiveSyncTool(HiveSyncConfig cfg, HiveConf configuration, FileSystem fs) {
    super(configuration.getAllProperties(), fs);

    try {
      this.hoodieHiveClient = new HoodieHiveClient(cfg, configuration, fs);
    } catch (RuntimeException e) {
      if (cfg.ignoreExceptions) {
        LOG.error("Got runtime exception when hive syncing, but continuing as ignoreExceptions config is set ", e);
      } else {
        throw new HoodieHiveSyncException("Got runtime exception when hive syncing", e);
      }
    }

    this.cfg = cfg;
    // Set partitionFields to empty, when the NonPartitionedExtractor is used
    if (NonPartitionedExtractor.class.getName().equals(cfg.partitionValueExtractorClass)) {
      LOG.warn("Set partitionFields to empty, since the NonPartitionedExtractor is used");
      cfg.partitionFields = new ArrayList<>();
    }

    if (hoodieHiveClient != null) {
      switch (hoodieHiveClient.getTableType()) {
        case COPY_ON_WRITE:
          this.snapshotTableName = cfg.tableName;
          this.roTableName = Option.empty();
          break;
        case MERGE_ON_READ:
          this.snapshotTableName = cfg.tableName + SUFFIX_SNAPSHOT_TABLE;
          this.roTableName = cfg.skipROSuffix ? Option.of(cfg.tableName)
              : Option.of(cfg.tableName + SUFFIX_READ_OPTIMIZED_TABLE);
          break;
        default:
          LOG.error("Unknown table type " + hoodieHiveClient.getTableType());
          throw new InvalidTableException(hoodieHiveClient.getBasePath());
      }
    }
  }

  @Override
  public void syncHoodieTable() {
    try {
      if (hoodieHiveClient != null) {
        doSync();
      }
    } catch (RuntimeException re) {
      throw new HoodieException("Got runtime exception when hive syncing " + cfg.tableName, re);
    } finally {
      if (hoodieHiveClient != null) {
        hoodieHiveClient.close();
      }
    }
  }

  protected void doSync() {
    switch (hoodieHiveClient.getTableType()) {
      case COPY_ON_WRITE:
        syncHoodieTable(snapshotTableName, false, false);
        break;
      case MERGE_ON_READ:
        // sync a RO table for MOR
        syncHoodieTable(roTableName.get(), false, true);
        // sync a RT table for MOR
        syncHoodieTable(snapshotTableName, true, false);
        break;
      default:
        LOG.error("Unknown table type " + hoodieHiveClient.getTableType());
        throw new InvalidTableException(hoodieHiveClient.getBasePath());
    }
  }

  protected void syncHoodieTable(String tableName, boolean useRealtimeInputFormat, boolean readAsOptimized) {
    LOG.info("Trying to sync hoodie table " + tableName + " with base path " + hoodieHiveClient.getBasePath()
        + " of type " + hoodieHiveClient.getTableType());

    // check if the database exists else create it
    if (cfg.autoCreateDatabase) {
      try {
        if (!hoodieHiveClient.doesDataBaseExist(cfg.databaseName)) {
          hoodieHiveClient.createDatabase(cfg.databaseName);
        }
      } catch (Exception e) {
        // this is harmless since table creation will fail anyways, creation of DB is needed for in-memory testing
        LOG.warn("Unable to create database", e);
      }
    } else {
      if (!hoodieHiveClient.doesDataBaseExist(cfg.databaseName)) {
        throw new HoodieHiveSyncException("hive database does not exist " + cfg.databaseName);
      }
    }

    // Check if the necessary table exists
    boolean tableExists = hoodieHiveClient.doesTableExist(tableName);

    // check if isDropPartition
    boolean isDropPartition = hoodieHiveClient.isDropPartition();

    // Get the parquet schema for this table looking at the latest commit
    MessageType schema = hoodieHiveClient.getDataSchema();

    // Currently HoodieBootstrapRelation does not support reading a bootstrap MOR rt table,
    // so we disable syncAsSparkDataSourceTable here to avoid reading such a table
    // by the data source way (which would use the HoodieBootstrapRelation).
    // TODO after we support bootstrap MOR rt table in HoodieBootstrapRelation [HUDI-2071], we can remove this logic.
    if (hoodieHiveClient.isBootstrap()
        && hoodieHiveClient.getTableType() == HoodieTableType.MERGE_ON_READ
        && !readAsOptimized) {
      cfg.syncAsSparkDataSourceTable = false;
    }

    // Sync schema if needed
    boolean schemaChanged = syncSchema(tableName, tableExists, useRealtimeInputFormat, readAsOptimized, schema);

    LOG.info("Schema sync complete. Syncing partitions for " + tableName);
    // Get the last time we successfully synced partitions
    Option<String> lastCommitTimeSynced = Option.empty();
    if (tableExists) {
      lastCommitTimeSynced = hoodieHiveClient.getLastCommitTimeSynced(tableName);
    }
    LOG.info("Last commit time synced was found to be " + lastCommitTimeSynced.orElse("null"));
    List<String> writtenPartitionsSince = hoodieHiveClient.getPartitionsWrittenToSince(lastCommitTimeSynced);
    LOG.info("Storage partitions scan complete. Found " + writtenPartitionsSince.size());

    // Sync the partitions if needed
    boolean partitionsChanged = syncPartitions(tableName, writtenPartitionsSince, isDropPartition);
    boolean meetSyncConditions = schemaChanged || partitionsChanged;
    if (!cfg.isConditionalSync || meetSyncConditions) {
      hoodieHiveClient.updateLastCommitTimeSynced(tableName);
    }
    LOG.info("Sync complete for " + tableName);
  }

  /**
   * Get the latest schema from the last commit and check if it is in sync with the hive table schema. If not, evolve
   * the table schema.
   *
   * @param tableExists - does table exist
   * @param schema - extracted schema
   */
  private boolean syncSchema(String tableName, boolean tableExists, boolean useRealTimeInputFormat,
      boolean readAsOptimized, MessageType schema) {
    // Append spark table properties & serde properties
    Map<String, String> tableProperties = ConfigUtils.toMap(cfg.tableProperties);
    Map<String, String> serdeProperties = ConfigUtils.toMap(cfg.serdeProperties);
    if (cfg.syncAsSparkDataSourceTable) {
      Map<String, String> sparkTableProperties = getSparkTableProperties(cfg.sparkSchemaLengthThreshold, schema);
      Map<String, String> sparkSerdeProperties = getSparkSerdeProperties(readAsOptimized);
      tableProperties.putAll(sparkTableProperties);
      serdeProperties.putAll(sparkSerdeProperties);
    }
    boolean schemaChanged = false;
    // Check and sync schema
    if (!tableExists) {
      LOG.info("Hive table " + tableName + " is not found. Creating it");
      HoodieFileFormat baseFileFormat = HoodieFileFormat.valueOf(cfg.baseFileFormat.toUpperCase());
      String inputFormatClassName = HoodieInputFormatUtils.getInputFormatClassName(baseFileFormat, useRealTimeInputFormat);

      if (baseFileFormat.equals(HoodieFileFormat.PARQUET) && cfg.usePreApacheInputFormat) {
        // Parquet input format had an InputFormat class visible under the old naming scheme.
        inputFormatClassName = useRealTimeInputFormat
            ? com.uber.hoodie.hadoop.realtime.HoodieRealtimeInputFormat.class.getName()
            : com.uber.hoodie.hadoop.HoodieInputFormat.class.getName();
      }

      String outputFormatClassName = HoodieInputFormatUtils.getOutputFormatClassName(baseFileFormat);
      String serDeFormatClassName = HoodieInputFormatUtils.getSerDeClassName(baseFileFormat);

      // Custom serde will not work with ALTER TABLE REPLACE COLUMNS
      // https://github.com/apache/hive/blob/release-1.1.0/ql/src/java/org/apache/hadoop/hive
      // /ql/exec/DDLTask.java#L3488
      hoodieHiveClient.createTable(tableName, schema, inputFormatClassName,
          outputFormatClassName, serDeFormatClassName, serdeProperties, tableProperties);
      schemaChanged = true;
    } else {
      // Check if the table schema has evolved
      Map<String, String> tableSchema = hoodieHiveClient.getTableSchema(tableName);
      SchemaDifference schemaDiff = HiveSchemaUtil.getSchemaDifference(schema, tableSchema, cfg.partitionFields,
          cfg.supportTimestamp);
      if (!schemaDiff.isEmpty()) {
        LOG.info("Schema difference found for " + tableName);
        hoodieHiveClient.updateTableDefinition(tableName, schema);
        // Sync the table properties if the schema has changed
        if (cfg.tableProperties != null || cfg.syncAsSparkDataSourceTable) {
          hoodieHiveClient.updateTableProperties(tableName, tableProperties);
          LOG.info("Sync table properties for " + tableName + ", table properties is: " + tableProperties);
        }
        schemaChanged = true;
      } else {
        LOG.info("No Schema difference for " + tableName);
      }
    }
    return schemaChanged;
  }

  /**
   * Get the Spark SQL related table properties. This is used for the spark datasource table.
   *
   * @param schema The schema to write to the table.
   * @return The spark table properties to add.
   */
  private Map<String, String> getSparkTableProperties(int schemaLengthThreshold, MessageType schema) {
    // Convert the schema and partition info used by spark sql to hive table properties.
    // The following code refers to the spark code in
    // https://github.com/apache/spark/blob/master/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
    GroupType originGroupType = schema.asGroupType();
    List<String> partitionNames = cfg.partitionFields;
    List<Type> partitionCols = new ArrayList<>();
    List<Type> dataCols = new ArrayList<>();
    Map<String, Type> column2Field = new HashMap<>();

    for (Type field : originGroupType.getFields()) {
      column2Field.put(field.getName(), field);
    }
    // Get partition columns and data columns.
    for (String partitionName : partitionNames) {
      // Default the unknown partition fields to String.
      // Keep the same logic as HiveSchemaUtil#getPartitionKeyType.
      partitionCols.add(column2Field.getOrDefault(partitionName,
          new PrimitiveType(Type.Repetition.REQUIRED, BINARY, partitionName, UTF8)));
    }

    for (Type field : originGroupType.getFields()) {
      if (!partitionNames.contains(field.getName())) {
        dataCols.add(field);
      }
    }

    List<Type> reOrderedFields = new ArrayList<>();
    reOrderedFields.addAll(dataCols);
    reOrderedFields.addAll(partitionCols);
    GroupType reOrderedType = new GroupType(originGroupType.getRepetition(), originGroupType.getName(), reOrderedFields);

    Map<String, String> sparkProperties = new HashMap<>();
    sparkProperties.put("spark.sql.sources.provider", "hudi");
    // Split the schema string into multiple parts according to the schemaLengthThreshold size.
    String schemaString = Parquet2SparkSchemaUtils.convertToSparkSchemaJson(reOrderedType);
    int numSchemaPart = (schemaString.length() + schemaLengthThreshold - 1) / schemaLengthThreshold;
    sparkProperties.put("spark.sql.sources.schema.numParts", String.valueOf(numSchemaPart));
    // Add each part of the schema string to sparkProperties
    for (int i = 0; i < numSchemaPart; i++) {
      int start = i * schemaLengthThreshold;
      int end = Math.min(start + schemaLengthThreshold, schemaString.length());
      sparkProperties.put("spark.sql.sources.schema.part." + i, schemaString.substring(start, end));
    }
    // Add partition columns
    if (!partitionNames.isEmpty()) {
      sparkProperties.put("spark.sql.sources.schema.numPartCols", String.valueOf(partitionNames.size()));
      for (int i = 0; i < partitionNames.size(); i++) {
        sparkProperties.put("spark.sql.sources.schema.partCol." + i, partitionNames.get(i));
      }
    }
    return sparkProperties;
  }

  private Map<String, String> getSparkSerdeProperties(boolean readAsOptimized) {
    Map<String, String> sparkSerdeProperties = new HashMap<>();
    sparkSerdeProperties.put("path", cfg.basePath);
    sparkSerdeProperties.put(ConfigUtils.IS_QUERY_AS_RO_TABLE, String.valueOf(readAsOptimized));
    return sparkSerdeProperties;
  }

  /**
   * Syncs the list of storage partitions passed in (checks if the partition is in hive, if not adds it or if the
   * partition path does not match, it updates the partition path).
   */
  private boolean syncPartitions(String tableName, List<String> writtenPartitionsSince, boolean isDropPartition) {
    boolean partitionsChanged;
    try {
      List<Partition> hivePartitions = hoodieHiveClient.scanTablePartitions(tableName);
      List<PartitionEvent> partitionEvents =
          hoodieHiveClient.getPartitionEvents(hivePartitions, writtenPartitionsSince, isDropPartition);

      List<String> newPartitions = filterPartitions(partitionEvents, PartitionEventType.ADD);
      if (!newPartitions.isEmpty()) {
        LOG.info("New Partitions " + newPartitions);
        hoodieHiveClient.addPartitionsToTable(tableName, newPartitions);
      }

      List<String> updatePartitions = filterPartitions(partitionEvents, PartitionEventType.UPDATE);
      if (!updatePartitions.isEmpty()) {
        LOG.info("Changed Partitions " + updatePartitions);
        hoodieHiveClient.updatePartitionsToTable(tableName, updatePartitions);
      }

      List<String> dropPartitions = filterPartitions(partitionEvents, PartitionEventType.DROP);
      if (!dropPartitions.isEmpty()) {
        LOG.info("Drop Partitions " + dropPartitions);
        hoodieHiveClient.dropPartitionsToTable(tableName, dropPartitions);
      }

      partitionsChanged = !updatePartitions.isEmpty() || !newPartitions.isEmpty() || !dropPartitions.isEmpty();
    } catch (Exception e) {
      throw new HoodieHiveSyncException("Failed to sync partitions for table " + tableName, e);
    }
    return partitionsChanged;
  }

  private List<String> filterPartitions(List<PartitionEvent> events, PartitionEventType eventType) {
    return events.stream().filter(s -> s.eventType == eventType).map(s -> s.storagePartition)
        .collect(Collectors.toList());
  }

  public static void main(String[] args) {
    // parse the params
    final HiveSyncConfig cfg = new HiveSyncConfig();
    JCommander cmd = new JCommander(cfg, null, args);
    if (cfg.help || args.length == 0) {
      cmd.usage();
      System.exit(1);
    }
    FileSystem fs = FSUtils.getFs(cfg.basePath, new Configuration());
    HiveConf hiveConf = new HiveConf();
    hiveConf.addResource(fs.getConf());
    new HiveSyncTool(cfg, hiveConf, fs).syncHoodieTable();
  }
}
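
The class Javadoc above describes two entry points: the API call HiveSyncTool.syncHoodieTable(HiveSyncConfig) and the command line (java -cp hoodie-hive-sync.jar HiveSyncTool [args]). Below is a minimal sketch of the programmatic path that mirrors the wiring in main() above. The config values are placeholders, and the jdbcUrl field is an assumption based on HiveSyncConfig's standard options; it does not appear in this listing.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.hive.HiveSyncConfig;
import org.apache.hudi.hive.HiveSyncTool;

import java.util.Collections;

public class HiveSyncExample {

  public static void main(String[] args) {
    // Minimal sketch: all values below are placeholders, not defaults.
    HiveSyncConfig cfg = new HiveSyncConfig();
    cfg.basePath = "hdfs:///tmp/hudi/my_table";             // Hudi table base path (placeholder)
    cfg.databaseName = "default";                           // Hive database to sync into (placeholder)
    cfg.tableName = "my_table";                             // Hive table name (placeholder)
    cfg.partitionFields = Collections.singletonList("ds");  // partition column(s) (placeholder)
    cfg.jdbcUrl = "jdbc:hive2://localhost:10000";           // assumed HiveSyncConfig field for the HiveServer2 endpoint

    // Same wiring as HiveSyncTool.main() above: resolve the FileSystem from the base path
    // and let the HiveConf pick up the Hadoop configuration.
    FileSystem fs = FSUtils.getFs(cfg.basePath, new Configuration());
    HiveConf hiveConf = new HiveConf();
    hiveConf.addResource(fs.getConf());

    new HiveSyncTool(cfg, hiveConf, fs).syncHoodieTable();
  }
}

The command-line form parses the same settings through JCommander, as main() shows; if no arguments are given or the help flag is set, it prints usage and exits with status 1.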