/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.mr.hive;
import java.io.Serializable;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.metastore.HiveMetaHook;
import org.apache.hadoop.hive.ql.metadata.HiveStorageHandler;
import org.apache.hadoop.hive.ql.metadata.HiveStoragePredicateHandler;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.security.authorization.HiveAuthorizationProvider;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.Deserializer;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputFormat;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.PartitionSpecParser;
import org.apache.iceberg.Schema;
import org.apache.iceberg.SchemaParser;
import org.apache.iceberg.Table;
import org.apache.iceberg.encryption.EncryptionManager;
import org.apache.iceberg.io.FileIO;
import org.apache.iceberg.io.LocationProvider;
import org.apache.iceberg.mr.Catalogs;
import org.apache.iceberg.mr.InputFormatConfig;
import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.iceberg.util.PropertyUtil;
import org.apache.iceberg.util.SerializationUtil;
import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT;
import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT_DEFAULT;
import static org.apache.iceberg.TableProperties.WRITE_TARGET_FILE_SIZE_BYTES;
import static org.apache.iceberg.TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT;
public class HiveIcebergStorageHandler implements HiveStoragePredicateHandler, HiveStorageHandler {
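// Marker key stored in the output job properties to flag write jobs; configureJobConf checks it to
// install the Iceberg output committer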
private static final String WRITE_KEY = "HiveIcebergStorageHandler_write";
private Configuration conf;
@Override
public Class<? extends InputFormat> getInputFormatClass() {
return HiveIcebergInputFormat.class;
}
@Override
public Class<? extends OutputFormat> getOutputFormatClass() {
return HiveIcebergOutputFormat.class;
}
@Override
public Class<? extends AbstractSerDe> getSerDeClass() {
return HiveIcebergSerDe.class;
}
@Override
public HiveMetaHook getMetaHook() {
return new HiveIcebergMetaHook(conf);
}
@Override
public HiveAuthorizationProvider getAuthorizationProvider() {
return null;
}
@Override
public void configureInputJobProperties(TableDesc tableDesc, Map<String, String> map) {
overlayTableProperties(conf, tableDesc, map);
}
@Override
public void configureOutputJobProperties(TableDesc tableDesc, Map<String, String> map) {
overlayTableProperties(conf, tableDesc, map);
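// flag this job as a write so configureJobConf installs the Iceberg output committer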
map.put(WRITE_KEY, "true");
}
@Override
public void configureTableJobProperties(TableDesc tableDesc, Map<String, String> map) {
}
// Override annotation commented out, since this interface method has been introduced only in Hive 3
// @Override
public void configureInputJobCredentials(TableDesc tableDesc, Map<String, String> secrets) {
}
@Override
public void configureJobConf(TableDesc tableDesc, JobConf jobConf) {
if (tableDesc != null && tableDesc.getJobProperties() != null &&
tableDesc.getJobProperties().get(WRITE_KEY) != null) {
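// For write jobs, commit through the Iceberg output committer instead of the default
// file-based committer, and copy the table schema into the job configuration for the tasks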
jobConf.set("mapred.output.committer.class", HiveIcebergOutputCommitter.class.getName());
HiveIcebergConfigUtil.copySchemaToConf(
() -> Catalogs.loadTable(conf, tableDesc.getProperties()).schema(),
jobConf,
tableDesc.getProperties()
);
}
}
@Override
public Configuration getConf() {
return conf;
}
@Override
public void setConf(Configuration conf) {
this.conf = conf;
}
@Override
public String toString() {
return this.getClass().getName();
}
/**
* @param jobConf Job configuration for InputFormat to access
* @param deserializer Deserializer
* @param exprNodeDesc Filter expression extracted by Hive
* @return Entire filter to take advantage of Hive's pruning as well as Iceberg's pruning.
*/
@Override
public DecomposedPredicate decomposePredicate(JobConf jobConf, Deserializer deserializer, ExprNodeDesc exprNodeDesc) {
DecomposedPredicate predicate = new DecomposedPredicate();
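// Hand the entire expression to Iceberg as the pushed predicate (used for scan pruning) while also
// keeping it as the residual predicate, so Hive still evaluates the full filter on the returned rows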
predicate.residualPredicate = (ExprNodeGenericFuncDesc) exprNodeDesc;
predicate.pushedPredicate = (ExprNodeGenericFuncDesc) exprNodeDesc;
return predicate;
}
/**
* Returns the Table FileIO serialized to the configuration.
* @param config The configuration used to get the data from
* @return The Table FileIO object
*/
public static FileIO io(Configuration config) {
return SerializationUtil.deserializeFromBase64(config.get(InputFormatConfig.FILE_IO));
}
/**
* Returns the Table LocationProvider serialized to the configuration.
* @param config The configuration used to get the data from
* @return The Table LocationProvider object
*/
public static LocationProvider location(Configuration config) {
return SerializationUtil.deserializeFromBase64(config.get(InputFormatConfig.LOCATION_PROVIDER));
}
/**
* Returns the Table EncryptionManager serialized to the configuration.
* @param config The configuration used to get the data from
* @return The Table EncryptionManager object
*/
public static EncryptionManager encryption(Configuration config) {
return SerializationUtil.deserializeFromBase64(config.get(InputFormatConfig.ENCRYPTION_MANAGER));
}
/**
* Returns the Table Schema serialized to the configuration.
* @param config The configuration used to get the data from
* @return The Table Schema object
*/
public static Schema schema(Configuration config) {
return SchemaParser.fromJson(config.get(InputFormatConfig.TABLE_SCHEMA));
}
/**
* Returns the Table PartitionSpec serialized to the configuration.
* @param config The configuration used to get the data from
* @return The Table PartitionSpec object
*/
public static PartitionSpec spec(Configuration config) {
return PartitionSpecParser.fromJson(schema(config), config.get(InputFormatConfig.PARTITION_SPEC));
}
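// Illustrative sketch (not a definitive call site): task-side code such as a record reader or the
// output committer can rebuild the serialized table metadata from the job configuration using the
// helpers above, assuming overlayTableProperties(...) has already populated it:
//
//   Schema schema = HiveIcebergStorageHandler.schema(conf);
//   PartitionSpec spec = HiveIcebergStorageHandler.spec(conf);
//   FileIO io = HiveIcebergStorageHandler.io(conf);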
/**
* Stores the serializable table data in the configuration.
* Currently the following is handled:
* <ul>
*   <li>Table - in case the table is serializable</li>
*   <li>Location</li>
*   <li>Schema</li>
*   <li>Partition specification</li>
*   <li>FileIO for handling table files</li>
*   <li>Location provider used for file generation</li>
*   <li>Encryption manager for encryption handling</li>
* </ul>
* @param configuration The configuration storing the catalog information
* @param tableDesc The table which we want to store to the configuration
* @param map The map of the configuration properties which we append with the serialized data
*/
@VisibleForTesting
static void overlayTableProperties(Configuration configuration, TableDesc tableDesc, Map<String, String> map) {
Properties props = tableDesc.getProperties();
Table table = Catalogs.loadTable(configuration, props);
String schemaJson = SchemaParser.toJson(table.schema());
Maps.fromProperties(props).entrySet().stream()
.filter(entry -> !map.containsKey(entry.getKey())) // map overrides tableDesc properties
.forEach(entry -> map.put(entry.getKey(), entry.getValue()));
map.put(InputFormatConfig.TABLE_IDENTIFIER, props.getProperty(Catalogs.NAME));
map.put(InputFormatConfig.TABLE_LOCATION, table.location());
map.put(InputFormatConfig.TABLE_SCHEMA, schemaJson);
map.put(InputFormatConfig.PARTITION_SPEC, PartitionSpecParser.toJson(table.spec()));
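// propagate write defaults (output file format and target file size) from the table properties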
String formatString = PropertyUtil.propertyAsString(table.properties(), DEFAULT_FILE_FORMAT,
DEFAULT_FILE_FORMAT_DEFAULT);
map.put(InputFormatConfig.WRITE_FILE_FORMAT, formatString.toUpperCase(Locale.ENGLISH));
map.put(InputFormatConfig.WRITE_TARGET_FILE_SIZE,
table.properties().getOrDefault(WRITE_TARGET_FILE_SIZE_BYTES,
String.valueOf(WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT)));
if (table instanceof Serializable) {
map.put(InputFormatConfig.SERIALIZED_TABLE, SerializationUtil.serializeToBase64(table));
}
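// serialize FileIO, location provider and encryption manager so tasks can read and write table files
// without a live catalog connection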
map.put(InputFormatConfig.FILE_IO, SerializationUtil.serializeToBase64(table.io()));
map.put(InputFormatConfig.LOCATION_PROVIDER, SerializationUtil.serializeToBase64(table.locationProvider()));
map.put(InputFormatConfig.ENCRYPTION_MANAGER, SerializationUtil.serializeToBase64(table.encryption()));
// We need to remove this otherwise the job.xml will be invalid as column comments are separated with '\0' and
// the serialization utils fail to serialize this character
map.remove("columns.comments");
// save schema into table props as well to avoid repeatedly hitting the HMS during serde initializations
// this is an exception to the interface documentation, but it's a safe operation to add this property
props.put(InputFormatConfig.TABLE_SCHEMA, schemaJson);
map.put(HiveIcebergInputFormat.SPLIT_LOCATION, props.getProperty(Catalogs.LOCATION));
}
}