/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.mr;
import java.util.Map;
import java.util.Optional;
import java.util.Properties;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.CatalogProperties;
import org.apache.iceberg.CatalogUtil;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.PartitionSpecParser;
import org.apache.iceberg.Schema;
import org.apache.iceberg.SchemaParser;
import org.apache.iceberg.Table;
import org.apache.iceberg.catalog.Catalog;
import org.apache.iceberg.catalog.TableIdentifier;
import org.apache.iceberg.hadoop.HadoopTables;
import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.iceberg.relocated.com.google.common.collect.Streams;
/**
 * Class for catalog resolution and accessing the common functions for {@link Catalog} API.
 *
 * <p>If the catalog name is provided, get the catalog type from the
 * iceberg.catalog.<code>catalogName</code>.type config.
 *
 * <p>In case the catalog name is {@link #ICEBERG_HADOOP_TABLE_NAME location_based_table}, type is
 * ignored and tables will be loaded using {@link HadoopTables}.
 *
 * <p>In case the value of catalog type is null, the
 * iceberg.catalog.<code>catalogName</code>.catalog-impl config is used to determine the catalog
 * implementation class.
 *
 * <p>If catalog name is null, get the catalog type from the
 * {@link CatalogUtil#ICEBERG_CATALOG_TYPE catalog type} config:
 *
 * <ul>
 *   <li>hive: HiveCatalog
 *   <li>location: HadoopTables
 *   <li>hadoop: HadoopCatalog
 * </ul>
 *
*/
public final class Catalogs {
public static final String ICEBERG_DEFAULT_CATALOG_NAME = "default_iceberg";
public static final String ICEBERG_HADOOP_TABLE_NAME = "location_based_table";
public static final String NAME = "name";
public static final String LOCATION = "location";
private static final String NO_CATALOG_TYPE = "no catalog";
  private static final Set<String> PROPERTIES_TO_REMOVE =
ImmutableSet.of(
InputFormatConfig.TABLE_SCHEMA,
InputFormatConfig.PARTITION_SPEC,
LOCATION,
NAME,
InputFormatConfig.CATALOG_NAME);
private Catalogs() {}
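
  // Illustrative resolution examples, written against the config keys described in the class
  // Javadoc above (the catalog name "my_catalog" below is hypothetical):
  //
  //   iceberg.catalog.my_catalog.type = hive             -> HiveCatalog
  //   iceberg.catalog.my_catalog.type = hadoop           -> HadoopCatalog
  //   iceberg.catalog.my_catalog.catalog-impl = <class>  -> custom Catalog implementation
  //   catalog name "location_based_table"                -> HadoopTables (type is ignored)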
/**
* Load an Iceberg table using the catalog and table identifier (or table path) specified by the
* configuration.
*
* @param conf a Hadoop conf
* @return an Iceberg table
*/
public static Table loadTable(Configuration conf) {
return loadTable(
conf,
conf.get(InputFormatConfig.TABLE_IDENTIFIER),
conf.get(InputFormatConfig.TABLE_LOCATION),
conf.get(InputFormatConfig.CATALOG_NAME));
}
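
  // Usage sketch (hypothetical values; assumes the keys below are already set on the
  // Configuration by the caller):
  //
  //   Configuration conf = new Configuration();
  //   conf.set(InputFormatConfig.TABLE_IDENTIFIER, "db.tbl");
  //   conf.set(InputFormatConfig.CATALOG_NAME, "my_catalog");
  //   Table table = Catalogs.loadTable(conf);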
/**
* Load an Iceberg table using the catalog specified by the configuration.
*
   * <p>The table identifier ({@link Catalogs#NAME}) and the catalog name ({@link
   * InputFormatConfig#CATALOG_NAME}), or table path ({@link Catalogs#LOCATION}) should be
   * specified by the controlling properties.
   *
   * <p>Used by HiveIcebergSerDe and HiveIcebergStorageHandler.
   *
   * @param conf a Hadoop conf
* @param props the controlling properties
* @return an Iceberg table
*/
public static Table loadTable(Configuration conf, Properties props) {
return loadTable(
conf,
props.getProperty(NAME),
props.getProperty(LOCATION),
props.getProperty(InputFormatConfig.CATALOG_NAME));
}
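
  // Usage sketch with controlling properties (hypothetical values). When no catalog resolves,
  // set Catalogs.LOCATION to a HadoopTables path instead of Catalogs.NAME:
  //
  //   Properties props = new Properties();
  //   props.setProperty(Catalogs.NAME, "db.tbl");
  //   props.setProperty(InputFormatConfig.CATALOG_NAME, "my_catalog");
  //   Table table = Catalogs.loadTable(conf, props);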
private static Table loadTable(
Configuration conf, String tableIdentifier, String tableLocation, String catalogName) {
    Optional<Catalog> catalog = loadCatalog(conf, catalogName);
if (catalog.isPresent()) {
Preconditions.checkArgument(tableIdentifier != null, "Table identifier not set");
return catalog.get().loadTable(TableIdentifier.parse(tableIdentifier));
}
Preconditions.checkArgument(tableLocation != null, "Table location not set");
return new HadoopTables(conf).load(tableLocation);
}
/**
* Creates an Iceberg table using the catalog specified by the configuration.
*
   * <p>The properties should contain the following values:
   *
   * <ul>
   *   <li>Table identifier ({@link Catalogs#NAME}) or table path ({@link Catalogs#LOCATION}) is
   *       required
   *   <li>Table schema ({@link InputFormatConfig#TABLE_SCHEMA}) is required
   *   <li>Partition specification ({@link InputFormatConfig#PARTITION_SPEC}) is optional. Table
   *       will be unpartitioned if not provided
   * </ul>
   *
   * <p>Other properties will be handed over to the Table creation. The controlling properties
   * above will not be propagated.
*
* @param conf a Hadoop conf
* @param props the controlling properties
* @return the created Iceberg table
*/
public static Table createTable(Configuration conf, Properties props) {
String schemaString = props.getProperty(InputFormatConfig.TABLE_SCHEMA);
Preconditions.checkNotNull(schemaString, "Table schema not set");
Schema schema = SchemaParser.fromJson(schemaString);
String specString = props.getProperty(InputFormatConfig.PARTITION_SPEC);
PartitionSpec spec = PartitionSpec.unpartitioned();
if (specString != null) {
spec = PartitionSpecParser.fromJson(schema, specString);
}
String location = props.getProperty(LOCATION);
String catalogName = props.getProperty(InputFormatConfig.CATALOG_NAME);
// Create a table property map without the controlling properties
    Map<String, String> map = Maps.newHashMapWithExpectedSize(props.size());
for (Object key : props.keySet()) {
if (!PROPERTIES_TO_REMOVE.contains(key)) {
map.put(key.toString(), props.get(key).toString());
}
}
    Optional<Catalog> catalog = loadCatalog(conf, catalogName);
if (catalog.isPresent()) {
String name = props.getProperty(NAME);
Preconditions.checkNotNull(name, "Table identifier not set");
return catalog.get().createTable(TableIdentifier.parse(name), schema, spec, location, map);
}
Preconditions.checkNotNull(location, "Table location not set");
return new HadoopTables(conf).create(schema, spec, map, location);
}
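
  // Usage sketch (hypothetical identifiers; the schema and spec JSON are produced by the same
  // Iceberg parsers used above):
  //
  //   Properties props = new Properties();
  //   props.setProperty(Catalogs.NAME, "db.tbl");
  //   props.setProperty(InputFormatConfig.TABLE_SCHEMA, SchemaParser.toJson(schema));
  //   props.setProperty(InputFormatConfig.PARTITION_SPEC, PartitionSpecParser.toJson(spec));
  //   Table created = Catalogs.createTable(conf, props);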
/**
* Drops an Iceberg table using the catalog specified by the configuration.
*
   * <p>The table identifier ({@link Catalogs#NAME}) or table path ({@link Catalogs#LOCATION})
   * should be specified by the controlling properties.
   *
   * @param conf a Hadoop conf
   * @param props the controlling properties
   * @return true if the table was dropped
*/
public static boolean dropTable(Configuration conf, Properties props) {
String location = props.getProperty(LOCATION);
String catalogName = props.getProperty(InputFormatConfig.CATALOG_NAME);
    Optional<Catalog> catalog = loadCatalog(conf, catalogName);
if (catalog.isPresent()) {
String name = props.getProperty(NAME);
Preconditions.checkNotNull(name, "Table identifier not set");
return catalog.get().dropTable(TableIdentifier.parse(name));
}
Preconditions.checkNotNull(location, "Table location not set");
return new HadoopTables(conf).dropTable(location);
}
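
  // Usage sketch (hypothetical identifier); resolution mirrors loadTable:
  //
  //   Properties props = new Properties();
  //   props.setProperty(Catalogs.NAME, "db.tbl");
  //   boolean dropped = Catalogs.dropTable(conf, props);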
/**
   * Returns true if HiveCatalog is used.
*
* @param conf a Hadoop conf
* @param props the controlling properties
* @return true if the Catalog is HiveCatalog
*/
public static boolean hiveCatalog(Configuration conf, Properties props) {
String catalogName = props.getProperty(InputFormatConfig.CATALOG_NAME);
String catalogType = getCatalogType(conf, catalogName);
if (catalogType != null) {
return CatalogUtil.ICEBERG_CATALOG_TYPE_HIVE.equalsIgnoreCase(catalogType);
}
catalogType = getCatalogType(conf, ICEBERG_DEFAULT_CATALOG_NAME);
if (catalogType != null) {
return CatalogUtil.ICEBERG_CATALOG_TYPE_HIVE.equalsIgnoreCase(catalogType);
}
return getCatalogProperties(conf, catalogName).get(CatalogProperties.CATALOG_IMPL) == null;
}
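
  // Decision sketch (hypothetical catalog name): with iceberg.catalog.my_catalog.type = hive
  // this returns true; with no type set for either the named or the default catalog, it returns
  // true only when no custom catalog-impl is configured, since the catalog build falls back to
  // the Hive type when no implementation class is named.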
@VisibleForTesting
  static Optional<Catalog> loadCatalog(Configuration conf, String catalogName) {
String catalogType = getCatalogType(conf, catalogName);
if (NO_CATALOG_TYPE.equalsIgnoreCase(catalogType)) {
return Optional.empty();
} else {
String name = catalogName == null ? ICEBERG_DEFAULT_CATALOG_NAME : catalogName;
return Optional.of(
CatalogUtil.buildIcebergCatalog(name, getCatalogProperties(conf, name), conf));
}
}
/**
   * Collect all the catalog-specific configuration from the global Hive configuration.
*
* @param conf a Hadoop configuration
* @param catalogName name of the catalog
* @return complete map of catalog properties
*/
  private static Map<String, String> getCatalogProperties(Configuration conf, String catalogName) {
String keyPrefix = InputFormatConfig.CATALOG_CONFIG_PREFIX + catalogName;
return Streams.stream(conf.iterator())
.filter(e -> e.getKey().startsWith(keyPrefix))
.collect(
Collectors.toMap(
e -> e.getKey().substring(keyPrefix.length() + 1), Map.Entry::getValue));
}
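
  // Example of the prefix stripping (hypothetical catalog name and value): the global key
  //   iceberg.catalog.my_catalog.uri = thrift://metastore:9083
  // is returned as the catalog property
  //   uri = thrift://metastore:9083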
/**
* Return the catalog type based on the catalog name.
*
   * <p>See {@link Catalogs} documentation for the catalog type resolution strategy.
   *
   * @param conf global Hive configuration
* @param catalogName name of the catalog
* @return type of the catalog, can be null
*/
private static String getCatalogType(Configuration conf, String catalogName) {
if (catalogName != null) {
String catalogType =
conf.get(
InputFormatConfig.catalogPropertyConfigKey(
catalogName, CatalogUtil.ICEBERG_CATALOG_TYPE));
if (catalogName.equals(ICEBERG_HADOOP_TABLE_NAME)) {
return NO_CATALOG_TYPE;
} else {
return catalogType;
}
} else {
String catalogType = conf.get(CatalogUtil.ICEBERG_CATALOG_TYPE);
if (catalogType != null && catalogType.equals(LOCATION)) {
return NO_CATALOG_TYPE;
} else {
return catalogType;
}
}
}
}