org.apache.iceberg.mr.hive.HiveIcebergStorageHandler Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.iceberg.mr.hive;

import java.io.Serializable;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.metastore.HiveMetaHook;
import org.apache.hadoop.hive.ql.metadata.HiveStorageHandler;
import org.apache.hadoop.hive.ql.metadata.HiveStoragePredicateHandler;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.security.authorization.HiveAuthorizationProvider;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.Deserializer;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputFormat;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.PartitionSpecParser;
import org.apache.iceberg.Schema;
import org.apache.iceberg.SchemaParser;
import org.apache.iceberg.Table;
import org.apache.iceberg.encryption.EncryptionManager;
import org.apache.iceberg.io.FileIO;
import org.apache.iceberg.io.LocationProvider;
import org.apache.iceberg.mr.Catalogs;
import org.apache.iceberg.mr.InputFormatConfig;
import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.iceberg.util.PropertyUtil;
import org.apache.iceberg.util.SerializationUtil;

import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT;
import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT_DEFAULT;
import static org.apache.iceberg.TableProperties.WRITE_TARGET_FILE_SIZE_BYTES;
import static org.apache.iceberg.TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT;

public class HiveIcebergStorageHandler implements HiveStoragePredicateHandler, HiveStorageHandler {

  private static final String WRITE_KEY = "HiveIcebergStorageHandler_write";

  private Configuration conf;

  @Override
  public Class<? extends InputFormat> getInputFormatClass() {
    return HiveIcebergInputFormat.class;
  }

  @Override
  public Class<? extends OutputFormat> getOutputFormatClass() {
    return HiveIcebergOutputFormat.class;
  }

  @Override
  public Class<? extends AbstractSerDe> getSerDeClass() {
    return HiveIcebergSerDe.class;
  }

  @Override
  public HiveMetaHook getMetaHook() {
    return new HiveIcebergMetaHook(conf);
  }

  @Override
  public HiveAuthorizationProvider getAuthorizationProvider() {
    return null;
  }

  @Override
  public void configureInputJobProperties(TableDesc tableDesc, Map<String, String> map) {
    overlayTableProperties(conf, tableDesc, map);
  }

  @Override
  public void configureOutputJobProperties(TableDesc tableDesc, Map<String, String> map) {
    overlayTableProperties(conf, tableDesc, map);
    map.put(WRITE_KEY, "true");
  }

  @Override
  public void configureTableJobProperties(TableDesc tableDesc, Map<String, String> map) {

  }

  // Override annotation commented out, since this interface method has been introduced only in Hive 3
  // @Override
  public void configureInputJobCredentials(TableDesc tableDesc, Map<String, String> secrets) {

  }

  @Override
  public void configureJobConf(TableDesc tableDesc, JobConf jobConf) {
    if (tableDesc != null && tableDesc.getJobProperties() != null &&
        tableDesc.getJobProperties().get(WRITE_KEY) != null) {
      jobConf.set("mapred.output.committer.class", HiveIcebergOutputCommitter.class.getName());
      HiveIcebergConfigUtil.copySchemaToConf(
          () -> Catalogs.loadTable(conf, tableDesc.getProperties()).schema(),
          jobConf,
          tableDesc.getProperties()
      );
    }
  }
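
  // Rough effect for a write job (illustrative sketch, not part of the class): once the job
  // properties contain the WRITE_KEY marker set in configureOutputJobProperties(), this method
  // installs the Iceberg committer and copies the table schema into the JobConf, e.g.
  //
  //   storageHandler.configureJobConf(tableDesc, jobConf);
  //   String committer = jobConf.get("mapred.output.committer.class");
  //   // committer -> "org.apache.iceberg.mr.hive.HiveIcebergOutputCommitter"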

  @Override
  public Configuration getConf() {
    return conf;
  }

  @Override
  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  @Override
  public String toString() {
    return this.getClass().getName();
  }

  /**
   * @param jobConf Job configuration for InputFormat to access
   * @param deserializer Deserializer
   * @param exprNodeDesc Filter expression extracted by Hive
   * @return Entire filter to take advantage of Hive's pruning as well as Iceberg's pruning.
   */
  @Override
  public DecomposedPredicate decomposePredicate(JobConf jobConf, Deserializer deserializer, ExprNodeDesc exprNodeDesc) {
    DecomposedPredicate predicate = new DecomposedPredicate();
    predicate.residualPredicate = (ExprNodeGenericFuncDesc) exprNodeDesc;
    predicate.pushedPredicate = (ExprNodeGenericFuncDesc) exprNodeDesc;
    return predicate;
  }
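
  // Illustrative note (hypothetical caller-side snippet): both halves of the decomposition carry
  // the full filter, so Iceberg can prune files during planning while Hive still re-evaluates the
  // same expression on the rows that are returned.
  //
  //   DecomposedPredicate decomposed = storageHandler.decomposePredicate(jobConf, serDe, filterExpr);
  //   // decomposed.pushedPredicate and decomposed.residualPredicate both equal filterExpr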

  /**
   * Returns the Table FileIO serialized to the configuration.
   * @param config The configuration used to get the data from
   * @return The Table FileIO object
   */
  public static FileIO io(Configuration config) {
    return SerializationUtil.deserializeFromBase64(config.get(InputFormatConfig.FILE_IO));
  }

  /**
   * Returns the Table LocationProvider serialized to the configuration.
   * @param config The configuration used to get the data from
   * @return The Table LocationProvider object
   */
  public static LocationProvider location(Configuration config) {
    return SerializationUtil.deserializeFromBase64(config.get(InputFormatConfig.LOCATION_PROVIDER));
  }

  /**
   * Returns the Table EncryptionManager serialized to the configuration.
   * @param config The configuration used to get the data from
   * @return The Table EncryptionManager object
   */
  public static EncryptionManager encryption(Configuration config) {
    return SerializationUtil.deserializeFromBase64(config.get(InputFormatConfig.ENCRYPTION_MANAGER));
  }

  /**
   * Returns the Table Schema serialized to the configuration.
   * @param config The configuration used to get the data from
   * @return The Table Schema object
   */
  public static Schema schema(Configuration config) {
    return SchemaParser.fromJson(config.get(InputFormatConfig.TABLE_SCHEMA));
  }

  /**
   * Returns the Table PartitionSpec serialized to the configuration.
   * @param config The configuration used to get the data from
   * @return The Table PartitionSpec object
   */
  public static PartitionSpec spec(Configuration config) {
    return PartitionSpecParser.fromJson(schema(config), config.get(InputFormatConfig.PARTITION_SPEC));
  }
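
  // A minimal usage sketch, assuming a task-side Configuration that already holds the properties
  // written by overlayTableProperties() below (variable names are illustrative only):
  //
  //   Configuration taskConf = ...;  // e.g. the JobConf handed to a mapper
  //   Schema schema = HiveIcebergStorageHandler.schema(taskConf);
  //   PartitionSpec spec = HiveIcebergStorageHandler.spec(taskConf);
  //   FileIO io = HiveIcebergStorageHandler.io(taskConf);
  //   LocationProvider locations = HiveIcebergStorageHandler.location(taskConf);
  //   EncryptionManager encryption = HiveIcebergStorageHandler.encryption(taskConf);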

  /**
   * Stores the serializable table data in the configuration.
   * Currently the following is handled:
   * <ul>
   *   <li>- Table - in case the table is serializable</li>
   *   <li>- Location</li>
   *   <li>- Schema</li>
   *   <li>- Partition specification</li>
   *   <li>- FileIO for handling table files</li>
   *   <li>- Location provider used for file generation</li>
   *   <li>- Encryption manager for encryption handling</li>
   * </ul>
   * @param configuration The configuration storing the catalog information
   * @param tableDesc The table which we want to store to the configuration
   * @param map The map of the configuration properties which we append with the serialized data
   */
  @VisibleForTesting
  static void overlayTableProperties(Configuration configuration, TableDesc tableDesc, Map<String, String> map) {
    Properties props = tableDesc.getProperties();
    Table table = Catalogs.loadTable(configuration, props);
    String schemaJson = SchemaParser.toJson(table.schema());

    Maps.fromProperties(props).entrySet().stream()
        .filter(entry -> !map.containsKey(entry.getKey())) // map overrides tableDesc properties
        .forEach(entry -> map.put(entry.getKey(), entry.getValue()));

    map.put(InputFormatConfig.TABLE_IDENTIFIER, props.getProperty(Catalogs.NAME));
    map.put(InputFormatConfig.TABLE_LOCATION, table.location());
    map.put(InputFormatConfig.TABLE_SCHEMA, schemaJson);
    map.put(InputFormatConfig.PARTITION_SPEC, PartitionSpecParser.toJson(table.spec()));

    String formatString = PropertyUtil.propertyAsString(table.properties(), DEFAULT_FILE_FORMAT,
        DEFAULT_FILE_FORMAT_DEFAULT);
    map.put(InputFormatConfig.WRITE_FILE_FORMAT, formatString.toUpperCase(Locale.ENGLISH));
    map.put(InputFormatConfig.WRITE_TARGET_FILE_SIZE,
        table.properties().getOrDefault(WRITE_TARGET_FILE_SIZE_BYTES,
            String.valueOf(WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT)));

    if (table instanceof Serializable) {
      map.put(InputFormatConfig.SERIALIZED_TABLE, SerializationUtil.serializeToBase64(table));
    }

    map.put(InputFormatConfig.FILE_IO, SerializationUtil.serializeToBase64(table.io()));
    map.put(InputFormatConfig.LOCATION_PROVIDER, SerializationUtil.serializeToBase64(table.locationProvider()));
    map.put(InputFormatConfig.ENCRYPTION_MANAGER, SerializationUtil.serializeToBase64(table.encryption()));

    // We need to remove this otherwise the job.xml will be invalid as column comments are separated with '\0' and
    // the serialization utils fail to serialize this character
    map.remove("columns.comments");

    // save schema into table props as well to avoid repeatedly hitting the HMS during serde initializations
    // this is an exception to the interface documentation, but it's a safe operation to add this property
    props.put(InputFormatConfig.TABLE_SCHEMA, schemaJson);

    map.put(HiveIcebergInputFormat.SPLIT_LOCATION, props.getProperty(Catalogs.LOCATION));
  }
}
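
// Illustrative round trip (hypothetical driver-side snippet, not part of this file): the map
// populated by overlayTableProperties() holds the same properties that the static accessors
// above deserialize on the task side.
//
//   Map<String, String> jobProperties = new HashMap<>();
//   storageHandler.configureInputJobProperties(tableDesc, jobProperties);  // delegates to overlayTableProperties
//   jobProperties.forEach(jobConf::set);                                   // Hive copies these into the JobConf
//   Schema schema = HiveIcebergStorageHandler.schema(jobConf);             // same as table.schema()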



