org.apache.iceberg.mr.hive.HiveIcebergMetaHook (iceberg-mr)
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.mr.hive;
import java.util.List;
import java.util.Locale;
import java.util.Properties;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.metastore.HiveMetaHook;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.iceberg.BaseMetastoreTableOperations;
import org.apache.iceberg.CatalogUtil;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.PartitionSpecParser;
import org.apache.iceberg.Schema;
import org.apache.iceberg.SchemaParser;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableMetadata;
import org.apache.iceberg.TableMetadataParser;
import org.apache.iceberg.TableProperties;
import org.apache.iceberg.catalog.TableIdentifier;
import org.apache.iceberg.exceptions.NoSuchTableException;
import org.apache.iceberg.hive.HiveSchemaUtil;
import org.apache.iceberg.hive.HiveTableOperations;
import org.apache.iceberg.io.FileIO;
import org.apache.iceberg.mr.Catalogs;
import org.apache.iceberg.mr.InputFormatConfig;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
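/**
* {@link HiveMetaHook} implementation that keeps an Iceberg table in sync with the Hive
* Metastore during CREATE TABLE and DROP TABLE operations: it derives the Iceberg schema and
* partition spec from the Hive table definition, creates the Iceberg table when needed, and
* purges Iceberg data and metadata files when a table is dropped with external.table.purge=TRUE.
*/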
public class HiveIcebergMetaHook implements HiveMetaHook {
private static final Logger LOG = LoggerFactory.getLogger(HiveIcebergMetaHook.class);
private static final Set<String> PARAMETERS_TO_REMOVE =
ImmutableSet.of(InputFormatConfig.TABLE_SCHEMA, Catalogs.LOCATION, Catalogs.NAME);
private static final Set<String> PROPERTIES_TO_REMOVE =
ImmutableSet
// We don't want to push down the metadata location props to Iceberg from HMS,
// since the snapshot pointer in HMS would always be one step ahead
.of(
BaseMetastoreTableOperations.METADATA_LOCATION_PROP,
BaseMetastoreTableOperations.PREVIOUS_METADATA_LOCATION_PROP,
// Initially we'd like to cache the partition spec in HMS, but not push it down to Iceberg
// later during ALTER TABLE commands, since by then the HMS info can be stale and Iceberg
// does not store its partition spec in the props
InputFormatConfig.PARTITION_SPEC);
private final Configuration conf;
private Table icebergTable = null;
private Properties catalogProperties;
private boolean deleteIcebergTable;
private FileIO deleteIo;
private TableMetadata deleteMetadata;
public HiveIcebergMetaHook(Configuration conf) {
this.conf = conf;
}
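// Marks the table as an Iceberg table before it is stored in HMS and, unless an existing
// Iceberg table was found for a non-HiveCatalog table, derives the Iceberg schema and partition
// spec (Hive partition columns become regular columns) for the later commitCreateTable call.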
@Override
public void preCreateTable(org.apache.hadoop.hive.metastore.api.Table hmsTable) {
this.catalogProperties = getCatalogProperties(hmsTable);
// Set the table type even for non-HiveCatalog based tables
hmsTable
.getParameters()
.put(
BaseMetastoreTableOperations.TABLE_TYPE_PROP,
BaseMetastoreTableOperations.ICEBERG_TABLE_TYPE_VALUE.toUpperCase(Locale.ROOT));
if (!Catalogs.hiveCatalog(conf, catalogProperties)) {
// For non-HiveCatalog tables too, we should set the input and output format
// so that the table can be read by other engines like Impala
hmsTable.getSd().setInputFormat(HiveIcebergInputFormat.class.getCanonicalName());
hmsTable.getSd().setOutputFormat(HiveIcebergOutputFormat.class.getCanonicalName());
// If not using HiveCatalog check for existing table
try {
this.icebergTable = Catalogs.loadTable(conf, catalogProperties);
Preconditions.checkArgument(
catalogProperties.getProperty(InputFormatConfig.TABLE_SCHEMA) == null,
"Iceberg table already created - can not use provided schema");
Preconditions.checkArgument(
catalogProperties.getProperty(InputFormatConfig.PARTITION_SPEC) == null,
"Iceberg table already created - can not use provided partition specification");
LOG.info("Iceberg table already exists {}", icebergTable);
return;
} catch (NoSuchTableException nte) {
// If the table does not exist we will create it below
}
}
// If the table does not exist, collect data for table creation
// - InputFormatConfig.TABLE_SCHEMA and InputFormatConfig.PARTITION_SPEC take precedence, so the
// user can override the Iceberg schema and specification generated by the code
Schema schema = schema(catalogProperties, hmsTable);
PartitionSpec spec = spec(schema, hmsTable);
// If there are partition keys specified, remove them from the HMS table and add them to the
// column list
if (hmsTable.isSetPartitionKeys()) {
hmsTable.getSd().getCols().addAll(hmsTable.getPartitionKeys());
hmsTable.setPartitionKeysIsSet(false);
}
catalogProperties.put(InputFormatConfig.TABLE_SCHEMA, SchemaParser.toJson(schema));
catalogProperties.put(InputFormatConfig.PARTITION_SPEC, PartitionSpecParser.toJson(spec));
// Allow purging table data if the table is created now and not set otherwise
hmsTable.getParameters().putIfAbsent(InputFormatConfig.EXTERNAL_TABLE_PURGE, "TRUE");
// If the table is not managed by Hive catalog then the location should be set
if (!Catalogs.hiveCatalog(conf, catalogProperties)) {
Preconditions.checkArgument(
hmsTable.getSd() != null && hmsTable.getSd().getLocation() != null,
"Table location not set");
}
// Remove creation related properties
PARAMETERS_TO_REMOVE.forEach(hmsTable.getParameters()::remove);
}
@Override
public void rollbackCreateTable(org.apache.hadoop.hive.metastore.api.Table hmsTable) {
// do nothing
}
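// Creates the Iceberg table with the properties collected in preCreateTable, unless
// preCreateTable already found an existing table; HiveCatalog tables are additionally flagged
// with TableProperties.ENGINE_HIVE_ENABLED.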
@Override
public void commitCreateTable(org.apache.hadoop.hive.metastore.api.Table hmsTable) {
if (icebergTable == null) {
if (Catalogs.hiveCatalog(conf, catalogProperties)) {
catalogProperties.put(TableProperties.ENGINE_HIVE_ENABLED, true);
}
Catalogs.createTable(conf, catalogProperties);
}
}
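// Determines whether the table data should be purged (external.table.purge=TRUE) and, for
// HiveCatalog tables, captures the current table metadata and FileIO so that commitDropTable
// can still delete the data and metadata files after HMS has removed the table.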
@Override
public void preDropTable(org.apache.hadoop.hive.metastore.api.Table hmsTable) {
this.catalogProperties = getCatalogProperties(hmsTable);
this.deleteIcebergTable =
hmsTable.getParameters() != null
&& "TRUE"
.equalsIgnoreCase(
hmsTable.getParameters().get(InputFormatConfig.EXTERNAL_TABLE_PURGE));
if (deleteIcebergTable && Catalogs.hiveCatalog(conf, catalogProperties)) {
// Store the metadata and the io for deleting the actual table data
try {
String metadataLocation =
hmsTable.getParameters().get(BaseMetastoreTableOperations.METADATA_LOCATION_PROP);
this.deleteIo = Catalogs.loadTable(conf, catalogProperties).io();
this.deleteMetadata = TableMetadataParser.read(deleteIo, metadataLocation);
} catch (Exception e) {
LOG.error(
"preDropTable: Error during loading Iceberg table or parsing its metadata for HMS table: {}.{}. "
+ "In some cases, this might lead to undeleted metadata files under the table directory: {}. "
+ "Please double check and, if needed, manually delete any dangling files/folders, if any. "
+ "In spite of this error, the HMS table drop operation should proceed as normal.",
hmsTable.getDbName(),
hmsTable.getTableName(),
hmsTable.getSd().getLocation(),
e);
}
}
}
@Override
public void rollbackDropTable(org.apache.hadoop.hive.metastore.api.Table hmsTable) {
// do nothing
}
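// Purges the table data after the HMS drop when purging was requested: non-HiveCatalog tables
// are dropped through Catalogs.dropTable, while HiveCatalog tables have their data and metadata
// files removed using the FileIO and metadata captured in preDropTable.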
@Override
public void commitDropTable(
org.apache.hadoop.hive.metastore.api.Table hmsTable, boolean deleteData) {
if (deleteData && deleteIcebergTable) {
try {
if (!Catalogs.hiveCatalog(conf, catalogProperties)) {
LOG.info(
"Dropping with purge all the data for table {}.{}",
hmsTable.getDbName(),
hmsTable.getTableName());
Catalogs.dropTable(conf, catalogProperties);
} else {
// do nothing if metadata folder has been deleted already (Hive 4 behaviour for
// purge=TRUE)
if (deleteMetadata != null && deleteIo.newInputFile(deleteMetadata.location()).exists()) {
CatalogUtil.dropTableData(deleteIo, deleteMetadata);
}
}
} catch (Exception e) {
// We want the Hive DROP TABLE command to complete successfully despite catalog-related
// exceptions here, e.g. we still want to drop the Hive table even if the underlying Hadoop
// table has already been deleted
LOG.warn(
"Exception during commitDropTable operation for table {}.{}.",
hmsTable.getDbName(),
hmsTable.getTableName(),
e);
}
}
}
/**
* Calculates the properties we would like to send to the catalog.
*
* <ul>
* <li>The base of the properties is the properties stored at the Hive Metastore for the given
* table
* <li>We add the {@link Catalogs#LOCATION} as the table location
* <li>We add the {@link Catalogs#NAME} as TableIdentifier defined by the database name and
* table name
* <li>We remove some parameters that we don't want to push down to the Iceberg table props
* </ul>
*
* @param hmsTable Table for which we are calculating the properties
* @return The properties we can provide for Iceberg functions, like {@link Catalogs}
*/
private static Properties getCatalogProperties(
org.apache.hadoop.hive.metastore.api.Table hmsTable) {
Properties properties = new Properties();
hmsTable
.getParameters()
.forEach(
(key, value) -> {
// translate key names between HMS and Iceberg where needed
String icebergKey = HiveTableOperations.translateToIcebergProp(key);
properties.put(icebergKey, value);
});
if (properties.get(Catalogs.LOCATION) == null
&& hmsTable.getSd() != null
&& hmsTable.getSd().getLocation() != null) {
properties.put(Catalogs.LOCATION, hmsTable.getSd().getLocation());
}
if (properties.get(Catalogs.NAME) == null) {
properties.put(
Catalogs.NAME,
TableIdentifier.of(hmsTable.getDbName(), hmsTable.getTableName()).toString());
}
// Remove HMS table parameters we don't want to propagate to Iceberg
PROPERTIES_TO_REMOVE.forEach(properties::remove);
return properties;
}
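// Resolves the Iceberg schema: an explicitly provided InputFormatConfig.TABLE_SCHEMA property
// wins, otherwise the schema is converted from the HMS columns plus any Hive partition columns.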
private Schema schema(
Properties properties, org.apache.hadoop.hive.metastore.api.Table hmsTable) {
boolean autoConversion = conf.getBoolean(InputFormatConfig.SCHEMA_AUTO_CONVERSION, false);
if (properties.getProperty(InputFormatConfig.TABLE_SCHEMA) != null) {
return SchemaParser.fromJson(properties.getProperty(InputFormatConfig.TABLE_SCHEMA));
} else if (hmsTable.isSetPartitionKeys() && !hmsTable.getPartitionKeys().isEmpty()) {
// Add partitioning columns to the original column list before creating the Iceberg Schema
List<FieldSchema> cols = Lists.newArrayList(hmsTable.getSd().getCols());
cols.addAll(hmsTable.getPartitionKeys());
return HiveSchemaUtil.convert(cols, autoConversion);
} else {
return HiveSchemaUtil.convert(hmsTable.getSd().getCols(), autoConversion);
}
}
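// Resolves the partition spec: an explicit InputFormatConfig.PARTITION_SPEC property and Hive
// partition keys are mutually exclusive; Hive partition columns map to identity partitions, and
// a table with neither is unpartitioned.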
private static PartitionSpec spec(
Schema schema, org.apache.hadoop.hive.metastore.api.Table hmsTable) {
if (hmsTable.getParameters().get(InputFormatConfig.PARTITION_SPEC) != null) {
Preconditions.checkArgument(
!hmsTable.isSetPartitionKeys() || hmsTable.getPartitionKeys().isEmpty(),
"Provide only one of the following: Hive partition specification, or the "
+ InputFormatConfig.PARTITION_SPEC
+ " property");
return PartitionSpecParser.fromJson(
schema, hmsTable.getParameters().get(InputFormatConfig.PARTITION_SPEC));
} else if (hmsTable.isSetPartitionKeys() && !hmsTable.getPartitionKeys().isEmpty()) {
// If the table is partitioned then generate the identity partition definitions for the
// Iceberg table
return HiveSchemaUtil.spec(schema, hmsTable.getPartitionKeys());
} else {
return PartitionSpec.unpartitioned();
}
}
}
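
For illustration, here is a minimal, hypothetical sketch of the callback sequence HMS drives through this hook during CREATE TABLE. The wrapper class MetaHookLifecycleSketch, the db/tbl names and the local warehouse path are invented for the example, and actually running it would additionally require a reachable catalog/metastore configured in the Configuration; the intent is only to show the preCreateTable/commitCreateTable ordering visible in the code above.

import java.util.Collections;
import java.util.HashMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.iceberg.mr.hive.HiveIcebergMetaHook;

public class MetaHookLifecycleSketch {
  public static void main(String[] args) {
    // Hypothetical configuration; real deployments pick up catalog settings from hive-site.xml.
    Configuration conf = new Configuration();

    // Hypothetical HMS table definition; Hive normally builds this from the CREATE TABLE statement.
    StorageDescriptor sd = new StorageDescriptor();
    sd.setCols(Collections.singletonList(new FieldSchema("id", "bigint", null)));
    sd.setLocation("file:///tmp/warehouse/db/tbl"); // a location is required when not using HiveCatalog
    Table hmsTable = new Table();
    hmsTable.setDbName("db");
    hmsTable.setTableName("tbl");
    hmsTable.setSd(sd);
    hmsTable.setParameters(new HashMap<>());

    HiveIcebergMetaHook hook = new HiveIcebergMetaHook(conf);
    hook.preCreateTable(hmsTable);    // derives the Iceberg schema/spec and caches them in the table parameters
    // ... the metastore would persist hmsTable at this point ...
    hook.commitCreateTable(hmsTable); // creates the Iceberg table via Catalogs.createTable if it does not exist yet
  }
}

The drop path is symmetric: HMS calls preDropTable before removing the table from the metastore and commitDropTable(hmsTable, deleteData) afterwards, as reflected in the methods above.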