org.apache.iceberg.mr.hive.HiveIcebergStorageHandler Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of iceberg-mr Show documentation
Show all versions of iceberg-mr Show documentation
Apache Iceberg modified by Tencent blueking
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.mr.hive;
import java.io.Serializable;
import java.util.Collection;
import java.util.Map;
import java.util.Properties;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.HiveMetaHook;
import org.apache.hadoop.hive.ql.metadata.HiveStorageHandler;
import org.apache.hadoop.hive.ql.metadata.HiveStoragePredicateHandler;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.security.authorization.HiveAuthorizationProvider;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.Deserializer;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputFormat;
import org.apache.iceberg.Schema;
import org.apache.iceberg.SchemaParser;
import org.apache.iceberg.Table;
import org.apache.iceberg.mr.Catalogs;
import org.apache.iceberg.mr.InputFormatConfig;
import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.base.Splitter;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.iceberg.util.SerializationUtil;
public class HiveIcebergStorageHandler implements HiveStoragePredicateHandler, HiveStorageHandler {
private static final Splitter TABLE_NAME_SPLITTER = Splitter.on("..");
private static final String TABLE_NAME_SEPARATOR = "..";
static final String WRITE_KEY = "HiveIcebergStorageHandler_write";
private Configuration conf;
@Override
public Class extends InputFormat> getInputFormatClass() {
return HiveIcebergInputFormat.class;
}
@Override
public Class extends OutputFormat> getOutputFormatClass() {
return HiveIcebergOutputFormat.class;
}
@Override
public Class extends AbstractSerDe> getSerDeClass() {
return HiveIcebergSerDe.class;
}
@Override
public HiveMetaHook getMetaHook() {
return new HiveIcebergMetaHook(conf);
}
@Override
public HiveAuthorizationProvider getAuthorizationProvider() {
return null;
}
@Override
public void configureInputJobProperties(TableDesc tableDesc, Map map) {
overlayTableProperties(conf, tableDesc, map);
}
@Override
public void configureOutputJobProperties(TableDesc tableDesc, Map map) {
overlayTableProperties(conf, tableDesc, map);
// For Tez, setting the committer here is enough to make sure it'll be part of the jobConf
map.put("mapred.output.committer.class", HiveIcebergOutputCommitter.class.getName());
// For MR, the jobConf is set only in configureJobConf, so we're setting the write key here to detect it over there
map.put(WRITE_KEY, "true");
// Putting the key into the table props as well, so that projection pushdown can be determined on a
// table-level and skipped only for output tables in HiveIcebergSerde. Properties from the map will be present in
// the serde config for all tables in the query, not just the output tables, so we can't rely on that in the serde.
tableDesc.getProperties().put(WRITE_KEY, "true");
}
@Override
public void configureTableJobProperties(TableDesc tableDesc, Map map) {
}
// Override annotation commented out, since this interface method has been introduced only in Hive 3
// @Override
public void configureInputJobCredentials(TableDesc tableDesc, Map secrets) {
}
@Override
public void configureJobConf(TableDesc tableDesc, JobConf jobConf) {
if (tableDesc != null && tableDesc.getProperties() != null &&
tableDesc.getProperties().get(WRITE_KEY) != null) {
String tableName = tableDesc.getTableName();
Preconditions.checkArgument(!tableName.contains(TABLE_NAME_SEPARATOR),
"Can not handle table " + tableName + ". Its name contains '" + TABLE_NAME_SEPARATOR + "'");
String tables = jobConf.get(InputFormatConfig.OUTPUT_TABLES);
tables = tables == null ? tableName : tables + TABLE_NAME_SEPARATOR + tableName;
jobConf.set("mapred.output.committer.class", HiveIcebergOutputCommitter.class.getName());
jobConf.set(InputFormatConfig.OUTPUT_TABLES, tables);
String catalogName = tableDesc.getProperties().getProperty(InputFormatConfig.CATALOG_NAME);
if (catalogName != null) {
jobConf.set(InputFormatConfig.TABLE_CATALOG_PREFIX + tableName, catalogName);
}
}
if (HiveConf.getBoolVar(jobConf, HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED)) {
jobConf.setEnum(InputFormatConfig.IN_MEMORY_DATA_MODEL, InputFormatConfig.InMemoryDataModel.HIVE);
conf.setBoolean(InputFormatConfig.SKIP_RESIDUAL_FILTERING, true);
}
}
@Override
public Configuration getConf() {
return conf;
}
@Override
public void setConf(Configuration conf) {
this.conf = conf;
}
@Override
public String toString() {
return this.getClass().getName();
}
/**
* @param jobConf Job configuration for InputFormat to access
* @param deserializer Deserializer
* @param exprNodeDesc Filter expression extracted by Hive
* @return Entire filter to take advantage of Hive's pruning as well as Iceberg's pruning.
*/
@Override
public DecomposedPredicate decomposePredicate(JobConf jobConf, Deserializer deserializer, ExprNodeDesc exprNodeDesc) {
DecomposedPredicate predicate = new DecomposedPredicate();
predicate.residualPredicate = (ExprNodeGenericFuncDesc) exprNodeDesc;
predicate.pushedPredicate = (ExprNodeGenericFuncDesc) exprNodeDesc;
return predicate;
}
/**
* Returns the Table serialized to the configuration based on the table name.
* @param config The configuration used to get the data from
* @param name The name of the table we need as returned by TableDesc.getTableName()
* @return The Table
*/
public static Table table(Configuration config, String name) {
return SerializationUtil.deserializeFromBase64(config.get(InputFormatConfig.SERIALIZED_TABLE_PREFIX + name));
}
/**
* Returns the names of the output tables stored in the configuration.
* @param config The configuration used to get the data from
* @return The collection of the table names as returned by TableDesc.getTableName()
*/
public static Collection outputTables(Configuration config) {
return TABLE_NAME_SPLITTER.splitToList(config.get(InputFormatConfig.OUTPUT_TABLES));
}
/**
* Returns the catalog name serialized to the configuration.
* @param config The configuration used to get the data from
* @param name The name of the table we neeed as returned by TableDesc.getTableName()
* @return catalog name
*/
public static String catalogName(Configuration config, String name) {
return config.get(InputFormatConfig.TABLE_CATALOG_PREFIX + name);
}
/**
* Returns the Table Schema serialized to the configuration.
* @param config The configuration used to get the data from
* @return The Table Schema object
*/
public static Schema schema(Configuration config) {
return SchemaParser.fromJson(config.get(InputFormatConfig.TABLE_SCHEMA));
}
/**
* Stores the serializable table data in the configuration.
* Currently the following is handled:
*
* - - Table - in case the table is serializable
* - - Location
* - - Schema
* - - Partition specification
* - - FileIO for handling table files
* - - Location provider used for file generation
* - - Encryption manager for encryption handling
*
* @param configuration The configuration storing the catalog information
* @param tableDesc The table which we want to store to the configuration
* @param map The map of the configuration properties which we append with the serialized data
*/
@VisibleForTesting
static void overlayTableProperties(Configuration configuration, TableDesc tableDesc, Map map) {
Properties props = tableDesc.getProperties();
Table table = Catalogs.loadTable(configuration, props);
String schemaJson = SchemaParser.toJson(table.schema());
Maps.fromProperties(props).entrySet().stream()
.filter(entry -> !map.containsKey(entry.getKey())) // map overrides tableDesc properties
.forEach(entry -> map.put(entry.getKey(), entry.getValue()));
map.put(InputFormatConfig.TABLE_IDENTIFIER, props.getProperty(Catalogs.NAME));
map.put(InputFormatConfig.TABLE_LOCATION, table.location());
map.put(InputFormatConfig.TABLE_SCHEMA, schemaJson);
if (table instanceof Serializable) {
map.put(InputFormatConfig.SERIALIZED_TABLE_PREFIX + tableDesc.getTableName(),
SerializationUtil.serializeToBase64(table));
}
// We need to remove this otherwise the job.xml will be invalid as column comments are separated with '\0' and
// the serialization utils fail to serialize this character
map.remove("columns.comments");
// save schema into table props as well to avoid repeatedly hitting the HMS during serde initializations
// this is an exception to the interface documentation, but it's a safe operation to add this property
props.put(InputFormatConfig.TABLE_SCHEMA, schemaJson);
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy