/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.mr.hive;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.metastore.HiveMetaHook;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.iceberg.BaseMetastoreTableOperations;
import org.apache.iceberg.CatalogUtil;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.PartitionSpecParser;
import org.apache.iceberg.Schema;
import org.apache.iceberg.SchemaParser;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableMetadata;
import org.apache.iceberg.TableMetadataParser;
import org.apache.iceberg.TableProperties;
import org.apache.iceberg.catalog.TableIdentifier;
import org.apache.iceberg.exceptions.NoSuchTableException;
import org.apache.iceberg.hive.HiveSchemaUtil;
import org.apache.iceberg.io.FileIO;
import org.apache.iceberg.mr.Catalogs;
import org.apache.iceberg.mr.InputFormatConfig;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class HiveIcebergMetaHook implements HiveMetaHook {
private static final Logger LOG = LoggerFactory.getLogger(HiveIcebergMetaHook.class);
private static final Set<String> PARAMETERS_TO_REMOVE = ImmutableSet
.of(InputFormatConfig.TABLE_SCHEMA, Catalogs.LOCATION, Catalogs.NAME);
private static final Set<String> PROPERTIES_TO_REMOVE = ImmutableSet
// We don't want to push down the metadata location props to Iceberg from HMS,
// since the snapshot pointer in HMS would always be one step ahead
.of(BaseMetastoreTableOperations.METADATA_LOCATION_PROP,
BaseMetastoreTableOperations.PREVIOUS_METADATA_LOCATION_PROP,
// We cache the partition spec in HMS at create time, but we do not push it back to Iceberg during ALTER TABLE
// commands, since by then the HMS copy may be stale and Iceberg does not store its partition spec in the props anyway
InputFormatConfig.PARTITION_SPEC);
private final Configuration conf;
private Table icebergTable = null;
private Properties catalogProperties;
private boolean deleteIcebergTable;
private FileIO deleteIo;
private TableMetadata deleteMetadata;
public HiveIcebergMetaHook(Configuration conf) {
this.conf = conf;
}
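/**
 * Prepares the HMS table for an Iceberg backed CREATE TABLE.
 * The table is always marked as an Iceberg table. For non-HiveCatalog tables the Hive input/output formats are
 * set and an already existing Iceberg table is simply adopted. Otherwise the Iceberg schema and partition spec
 * are derived (Hive partition keys are folded into the column list), {@code external.table.purge} defaults to
 * TRUE, and the creation-only parameters are removed before the table is stored in HMS.
 * @param hmsTable the HMS table being created
 */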
@Override
public void preCreateTable(org.apache.hadoop.hive.metastore.api.Table hmsTable) {
this.catalogProperties = getCatalogProperties(hmsTable);
// Set the table type even for non-HiveCatalog based tables
hmsTable.getParameters().put(BaseMetastoreTableOperations.TABLE_TYPE_PROP,
BaseMetastoreTableOperations.ICEBERG_TABLE_TYPE_VALUE.toUpperCase());
if (!Catalogs.hiveCatalog(conf)) {
// For non-HiveCatalog tables too, we should set the input and output format
// so that the table can be read by other engines like Impala
hmsTable.getSd().setInputFormat(HiveIcebergInputFormat.class.getCanonicalName());
hmsTable.getSd().setOutputFormat(HiveIcebergOutputFormat.class.getCanonicalName());
// If not using HiveCatalog check for existing table
try {
this.icebergTable = Catalogs.loadTable(conf, catalogProperties);
Preconditions.checkArgument(catalogProperties.getProperty(InputFormatConfig.TABLE_SCHEMA) == null,
"Iceberg table already created - can not use provided schema");
Preconditions.checkArgument(catalogProperties.getProperty(InputFormatConfig.PARTITION_SPEC) == null,
"Iceberg table already created - can not use provided partition specification");
LOG.info("Iceberg table already exists {}", icebergTable);
return;
} catch (NoSuchTableException nte) {
// If the table does not exist we will create it below
}
}
// If the table does not exist, collect the data needed for table creation
// - InputFormatConfig.TABLE_SCHEMA and InputFormatConfig.PARTITION_SPEC take precedence so the user can override
// the Iceberg schema and specification generated by the code
Schema schema = schema(catalogProperties, hmsTable);
PartitionSpec spec = spec(schema, catalogProperties, hmsTable);
// If there are partition keys specified remove them from the HMS table and add them to the column list
if (hmsTable.isSetPartitionKeys()) {
hmsTable.getSd().getCols().addAll(hmsTable.getPartitionKeys());
hmsTable.setPartitionKeysIsSet(false);
}
catalogProperties.put(InputFormatConfig.TABLE_SCHEMA, SchemaParser.toJson(schema));
catalogProperties.put(InputFormatConfig.PARTITION_SPEC, PartitionSpecParser.toJson(spec));
// Since the table is being created now, allow purging table data by default unless the purge flag was set explicitly
if (hmsTable.getParameters().get(InputFormatConfig.EXTERNAL_TABLE_PURGE) == null) {
hmsTable.getParameters().put(InputFormatConfig.EXTERNAL_TABLE_PURGE, "TRUE");
}
// If the table is not managed by Hive catalog then the location should be set
if (!Catalogs.hiveCatalog(conf)) {
Preconditions.checkArgument(hmsTable.getSd() != null && hmsTable.getSd().getLocation() != null,
"Table location not set");
}
// Remove creation related properties
PARAMETERS_TO_REMOVE.forEach(hmsTable.getParameters()::remove);
}
@Override
public void rollbackCreateTable(org.apache.hadoop.hive.metastore.api.Table hmsTable) {
// do nothing
}
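/**
 * Creates the Iceberg table through {@link Catalogs} if it was not found during preCreateTable.
 * For HiveCatalog managed tables {@link TableProperties#ENGINE_HIVE_ENABLED} is set so the table is created
 * with Hive engine support enabled.
 */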
@Override
public void commitCreateTable(org.apache.hadoop.hive.metastore.api.Table hmsTable) {
if (icebergTable == null) {
if (Catalogs.hiveCatalog(conf)) {
catalogProperties.put(TableProperties.ENGINE_HIVE_ENABLED, true);
}
Catalogs.createTable(conf, catalogProperties);
}
}
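/**
 * Prepares a possible purge before the table is removed from HMS.
 * If {@code external.table.purge} is TRUE and the table is managed by the Hive catalog, the current
 * {@link FileIO} and {@link TableMetadata} are cached so the data and metadata files can still be deleted in
 * commitDropTable after HMS has dropped the table.
 */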
@Override
public void preDropTable(org.apache.hadoop.hive.metastore.api.Table hmsTable) {
this.catalogProperties = getCatalogProperties(hmsTable);
this.deleteIcebergTable = hmsTable.getParameters() != null &&
"TRUE".equalsIgnoreCase(hmsTable.getParameters().get(InputFormatConfig.EXTERNAL_TABLE_PURGE));
if (deleteIcebergTable && Catalogs.hiveCatalog(conf)) {
// Store the metadata and the io for deleting the actual table data later
String metadataLocation = hmsTable.getParameters().get(BaseMetastoreTableOperations.METADATA_LOCATION_PROP);
this.deleteIo = Catalogs.loadTable(conf, catalogProperties).io();
this.deleteMetadata = TableMetadataParser.read(deleteIo, metadataLocation);
}
}
@Override
public void rollbackDropTable(org.apache.hadoop.hive.metastore.api.Table hmsTable) {
// do nothing
}
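/**
 * Deletes the underlying Iceberg data and metadata when the drop requested data removal and purging is enabled.
 * Non-HiveCatalog tables are dropped through {@link Catalogs#dropTable}, while HiveCatalog tables are cleaned up
 * from the state cached in preDropTable via {@link CatalogUtil#dropTableData}.
 */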
@Override
public void commitDropTable(org.apache.hadoop.hive.metastore.api.Table hmsTable, boolean deleteData) {
if (deleteData && deleteIcebergTable) {
if (!Catalogs.hiveCatalog(conf)) {
LOG.info("Dropping with purge all the data for table {}.{}", hmsTable.getDbName(), hmsTable.getTableName());
Catalogs.dropTable(conf, catalogProperties);
} else {
CatalogUtil.dropTableData(deleteIo, deleteMetadata);
}
}
}
/**
 * Calculates the properties we would like to send to the catalog.
 * <ul>
 * <li>The base of the properties is the properties stored at the Hive Metastore for the given table</li>
 * <li>We add the {@link Catalogs#LOCATION} as the table location</li>
 * <li>We add the {@link Catalogs#NAME} as TableIdentifier defined by the database name and table name</li>
 * <li>We remove some parameters that we don't want to push down to the Iceberg table props</li>
 * </ul>
 * @param hmsTable Table for which we are calculating the properties
 * @return The properties we can provide for Iceberg functions, like {@link Catalogs}
 */
private static Properties getCatalogProperties(org.apache.hadoop.hive.metastore.api.Table hmsTable) {
Properties properties = new Properties();
properties.putAll(hmsTable.getParameters());
if (properties.get(Catalogs.LOCATION) == null &&
hmsTable.getSd() != null && hmsTable.getSd().getLocation() != null) {
properties.put(Catalogs.LOCATION, hmsTable.getSd().getLocation());
}
if (properties.get(Catalogs.NAME) == null) {
properties.put(Catalogs.NAME, TableIdentifier.of(hmsTable.getDbName(), hmsTable.getTableName()).toString());
}
// Remove HMS table parameters we don't want to propagate to Iceberg
PROPERTIES_TO_REMOVE.forEach(properties::remove);
return properties;
}
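/**
 * Determines the Iceberg schema for the new table: either the JSON schema provided in
 * {@link InputFormatConfig#TABLE_SCHEMA}, or a schema converted from the HMS columns (including any Hive
 * partition keys), optionally with automatic type conversion.
 */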
private Schema schema(Properties properties, org.apache.hadoop.hive.metastore.api.Table hmsTable) {
boolean autoConversion = conf.getBoolean(InputFormatConfig.SCHEMA_AUTO_CONVERSION, false);
if (properties.getProperty(InputFormatConfig.TABLE_SCHEMA) != null) {
return SchemaParser.fromJson(properties.getProperty(InputFormatConfig.TABLE_SCHEMA));
} else if (hmsTable.isSetPartitionKeys() && !hmsTable.getPartitionKeys().isEmpty()) {
// Add partitioning columns to the original column list before creating the Iceberg Schema
List<FieldSchema> cols = Lists.newArrayList(hmsTable.getSd().getCols());
cols.addAll(hmsTable.getPartitionKeys());
return HiveSchemaUtil.convert(cols, autoConversion);
} else {
return HiveSchemaUtil.convert(hmsTable.getSd().getCols(), autoConversion);
}
}
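/**
 * Determines the Iceberg partition spec for the new table: either the JSON spec provided in
 * {@link InputFormatConfig#PARTITION_SPEC}, an identity spec generated from the Hive partition keys, or an
 * unpartitioned spec. Providing both a Hive partition specification and the property is rejected.
 */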
private static PartitionSpec spec(Schema schema, Properties properties,
org.apache.hadoop.hive.metastore.api.Table hmsTable) {
if (hmsTable.getParameters().get(InputFormatConfig.PARTITION_SPEC) != null) {
Preconditions.checkArgument(!hmsTable.isSetPartitionKeys() || hmsTable.getPartitionKeys().isEmpty(),
"Provide only one of the following: Hive partition specification, or the " +
InputFormatConfig.PARTITION_SPEC + " property");
return PartitionSpecParser.fromJson(schema, hmsTable.getParameters().get(InputFormatConfig.PARTITION_SPEC));
} else if (hmsTable.isSetPartitionKeys() && !hmsTable.getPartitionKeys().isEmpty()) {
// If the table is partitioned then generate the identity partition definitions for the Iceberg table
return HiveSchemaUtil.spec(schema, hmsTable.getPartitionKeys());
} else {
return PartitionSpec.unpartitioned();
}
}
}
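/**
 * Illustrative usage only; this class is not part of the original source. A minimal sketch of driving the meta
 * hook by hand for a CREATE TABLE, assuming a caller that already obtained an HMS table object and a Hadoop
 * {@link Configuration} with an Iceberg catalog configured. In practice Hive invokes the hook itself through the
 * Iceberg storage handler; the column names and location below are made up for the example.
 */
class HiveIcebergMetaHookUsageSketch {
  private HiveIcebergMetaHookUsageSketch() {
  }

  static void createIcebergBackedTable(Configuration conf, org.apache.hadoop.hive.metastore.api.Table hmsTable) {
    // Hypothetical column layout and location for the example table
    hmsTable.getSd().setCols(Lists.newArrayList(
        new FieldSchema("id", "bigint", null),
        new FieldSchema("data", "string", null)));
    hmsTable.getSd().setLocation("hdfs://namenode:8020/warehouse/db/tbl");

    HiveIcebergMetaHook hook = new HiveIcebergMetaHook(conf);
    // Derives the Iceberg schema/spec and adjusts the HMS metadata before the table is stored
    hook.preCreateTable(hmsTable);
    // Creates the Iceberg table through Catalogs once HMS has accepted the definition
    hook.commitCreateTable(hmsTable);
  }
}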