/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.mr;
import java.util.Map;
import java.util.Optional;
import java.util.Properties;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.CatalogProperties;
import org.apache.iceberg.CatalogUtil;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.PartitionSpecParser;
import org.apache.iceberg.Schema;
import org.apache.iceberg.SchemaParser;
import org.apache.iceberg.Table;
import org.apache.iceberg.catalog.Catalog;
import org.apache.iceberg.catalog.TableIdentifier;
import org.apache.iceberg.hadoop.HadoopTables;
import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.iceberg.relocated.com.google.common.collect.Streams;
/**
 * Class for catalog resolution and accessing the common functions for {@link Catalog} API.
 *
 * <p>If the catalog name is provided, get the catalog type from the
 * iceberg.catalog.<code>catalogName</code>.type config.
 *
 * <p>In case the catalog name is {@link #ICEBERG_HADOOP_TABLE_NAME location_based_table}, type is
 * ignored and tables will be loaded using {@link HadoopTables}.
 *
 * <p>In case the value of catalog type is null, the
 * iceberg.catalog.<code>catalogName</code>.catalog-impl config is used to determine the catalog
 * implementation class.
 *
 * <p>If catalog name is null, get the catalog type from the
 * {@link CatalogUtil#ICEBERG_CATALOG_TYPE catalog type} config:
 *
 * <ul>
 *   <li>hive: HiveCatalog
 *   <li>location: HadoopTables
 *   <li>hadoop: HadoopCatalog
 * </ul>
 *
*/
public final class Catalogs {
public static final String ICEBERG_DEFAULT_CATALOG_NAME = "default_iceberg";
public static final String ICEBERG_HADOOP_TABLE_NAME = "location_based_table";
public static final String NAME = "name";
public static final String LOCATION = "location";
private static final String NO_CATALOG_TYPE = "no catalog";
  private static final Set<String> PROPERTIES_TO_REMOVE =
ImmutableSet.of(
InputFormatConfig.TABLE_SCHEMA,
InputFormatConfig.PARTITION_SPEC,
LOCATION,
NAME,
InputFormatConfig.CATALOG_NAME);
private Catalogs() {}
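
  // Illustrative resolution examples, written against the config keys described in the class
  // Javadoc above (the catalog name "my_catalog" below is hypothetical):
  //
  //   iceberg.catalog.my_catalog.type = hive             -> HiveCatalog
  //   iceberg.catalog.my_catalog.type = hadoop           -> HadoopCatalog
  //   iceberg.catalog.my_catalog.catalog-impl = <class>  -> custom Catalog implementation
  //   catalog name "location_based_table"                -> HadoopTables (type is ignored)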
/**
* Load an Iceberg table using the catalog and table identifier (or table path) specified by the
* configuration.
*
* @param conf a Hadoop conf
* @return an Iceberg table
*/
public static Table loadTable(Configuration conf) {
return loadTable(
conf,
conf.get(InputFormatConfig.TABLE_IDENTIFIER),
conf.get(InputFormatConfig.TABLE_LOCATION),
conf.get(InputFormatConfig.CATALOG_NAME));
}
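
  // Usage sketch (hypothetical values; assumes the keys below are already set on the
  // Configuration by the caller):
  //
  //   Configuration conf = new Configuration();
  //   conf.set(InputFormatConfig.TABLE_IDENTIFIER, "db.tbl");
  //   conf.set(InputFormatConfig.CATALOG_NAME, "my_catalog");
  //   Table table = Catalogs.loadTable(conf);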
/**
* Load an Iceberg table using the catalog specified by the configuration.
*
   * <p>The table identifier ({@link Catalogs#NAME}) and the catalog name ({@link
   * InputFormatConfig#CATALOG_NAME}), or table path ({@link Catalogs#LOCATION}) should be
   * specified by the controlling properties.
   *
   * <p>Used by HiveIcebergSerDe and HiveIcebergStorageHandler.
   *
   * @param conf a Hadoop conf
* @param props the controlling properties
* @return an Iceberg table
*/
public static Table loadTable(Configuration conf, Properties props) {
return loadTable(
conf,
props.getProperty(NAME),
props.getProperty(LOCATION),
props.getProperty(InputFormatConfig.CATALOG_NAME));
}
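
  // Usage sketch with controlling properties (hypothetical values). When no catalog resolves,
  // set Catalogs.LOCATION to a HadoopTables path instead of Catalogs.NAME:
  //
  //   Properties props = new Properties();
  //   props.setProperty(Catalogs.NAME, "db.tbl");
  //   props.setProperty(InputFormatConfig.CATALOG_NAME, "my_catalog");
  //   Table table = Catalogs.loadTable(conf, props);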
private static Table loadTable(
Configuration conf, String tableIdentifier, String tableLocation, String catalogName) {
    Optional<Catalog> catalog = loadCatalog(conf, catalogName);
if (catalog.isPresent()) {
Preconditions.checkArgument(tableIdentifier != null, "Table identifier not set");
return catalog.get().loadTable(TableIdentifier.parse(tableIdentifier));
}
Preconditions.checkArgument(tableLocation != null, "Table location not set");
return new HadoopTables(conf).load(tableLocation);
}
/**
* Creates an Iceberg table using the catalog specified by the configuration.
*
   * <p>The properties should contain the following values:
   *
   * <ul>
   *   <li>Table identifier ({@link Catalogs#NAME}) or table path ({@link Catalogs#LOCATION}) is
   *       required
   *   <li>Table schema ({@link InputFormatConfig#TABLE_SCHEMA}) is required
   *   <li>Partition specification ({@link InputFormatConfig#PARTITION_SPEC}) is optional. Table
   *       will be unpartitioned if not provided
   * </ul>
   *
   * <p>Other properties will be handed over to the Table creation. The controlling properties
   * above will not be propagated.
*
* @param conf a Hadoop conf
* @param props the controlling properties
* @return the created Iceberg table
*/
public static Table createTable(Configuration conf, Properties props) {
String schemaString = props.getProperty(InputFormatConfig.TABLE_SCHEMA);
Preconditions.checkNotNull(schemaString, "Table schema not set");
Schema schema = SchemaParser.fromJson(schemaString);
String specString = props.getProperty(InputFormatConfig.PARTITION_SPEC);
PartitionSpec spec = PartitionSpec.unpartitioned();
if (specString != null) {
spec = PartitionSpecParser.fromJson(schema, specString);
}
String location = props.getProperty(LOCATION);
String catalogName = props.getProperty(InputFormatConfig.CATALOG_NAME);
// Create a table property map without the controlling properties
    Map<String, String> map = Maps.newHashMapWithExpectedSize(props.size());
for (Object key : props.keySet()) {
if (!PROPERTIES_TO_REMOVE.contains(key)) {
map.put(key.toString(), props.get(key).toString());
}
}
    Optional<Catalog> catalog = loadCatalog(conf, catalogName);
if (catalog.isPresent()) {
String name = props.getProperty(NAME);
Preconditions.checkNotNull(name, "Table identifier not set");
return catalog.get().createTable(TableIdentifier.parse(name), schema, spec, location, map);
}
Preconditions.checkNotNull(location, "Table location not set");
return new HadoopTables(conf).create(schema, spec, map, location);
}
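
  // Usage sketch (hypothetical identifiers; the schema and spec JSON are produced by the same
  // Iceberg parsers used above):
  //
  //   Properties props = new Properties();
  //   props.setProperty(Catalogs.NAME, "db.tbl");
  //   props.setProperty(InputFormatConfig.TABLE_SCHEMA, SchemaParser.toJson(schema));
  //   props.setProperty(InputFormatConfig.PARTITION_SPEC, PartitionSpecParser.toJson(spec));
  //   Table created = Catalogs.createTable(conf, props);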
/**
* Drops an Iceberg table using the catalog specified by the configuration.
*
   * <p>The table identifier ({@link Catalogs#NAME}) or table path ({@link Catalogs#LOCATION})
   * should be specified by the controlling properties.
   *
   * @param conf a Hadoop conf
   * @param props the controlling properties
   * @return true if the table was dropped
*/
public static boolean dropTable(Configuration conf, Properties props) {
String location = props.getProperty(LOCATION);
String catalogName = props.getProperty(InputFormatConfig.CATALOG_NAME);
    Optional<Catalog> catalog = loadCatalog(conf, catalogName);
if (catalog.isPresent()) {
String name = props.getProperty(NAME);
Preconditions.checkNotNull(name, "Table identifier not set");
return catalog.get().dropTable(TableIdentifier.parse(name));
}
Preconditions.checkNotNull(location, "Table location not set");
return new HadoopTables(conf).dropTable(location);
}
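
  // Usage sketch (hypothetical identifier); resolution mirrors loadTable:
  //
  //   Properties props = new Properties();
  //   props.setProperty(Catalogs.NAME, "db.tbl");
  //   boolean dropped = Catalogs.dropTable(conf, props);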
/**
   * Returns true if HiveCatalog is used.
*
* @param conf a Hadoop conf
* @param props the controlling properties
* @return true if the Catalog is HiveCatalog
*/
public static boolean hiveCatalog(Configuration conf, Properties props) {
String catalogName = props.getProperty(InputFormatConfig.CATALOG_NAME);
String catalogType = getCatalogType(conf, catalogName);
if (catalogType != null) {
return CatalogUtil.ICEBERG_CATALOG_TYPE_HIVE.equalsIgnoreCase(catalogType);
}
catalogType = getCatalogType(conf, ICEBERG_DEFAULT_CATALOG_NAME);
if (catalogType != null) {
return CatalogUtil.ICEBERG_CATALOG_TYPE_HIVE.equalsIgnoreCase(catalogType);
}
return getCatalogProperties(conf, catalogName).get(CatalogProperties.CATALOG_IMPL) == null;
}
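
  // Decision sketch (hypothetical catalog name): with iceberg.catalog.my_catalog.type = hive
  // this returns true; with no type set for either the named or the default catalog, it returns
  // true only when no custom catalog-impl is configured, since the catalog build falls back to
  // the Hive type when no implementation class is named.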
@VisibleForTesting
  static Optional<Catalog> loadCatalog(Configuration conf, String catalogName) {
String catalogType = getCatalogType(conf, catalogName);
if (NO_CATALOG_TYPE.equalsIgnoreCase(catalogType)) {
return Optional.empty();
} else {
String name = catalogName == null ? ICEBERG_DEFAULT_CATALOG_NAME : catalogName;
return Optional.of(
CatalogUtil.buildIcebergCatalog(name, getCatalogProperties(conf, name), conf));
}
}
/**
   * Collect all the catalog-specific configuration from the global Hive configuration.
*
* @param conf a Hadoop configuration
* @param catalogName name of the catalog
* @return complete map of catalog properties
*/
  private static Map<String, String> getCatalogProperties(Configuration conf, String catalogName) {
String keyPrefix = InputFormatConfig.CATALOG_CONFIG_PREFIX + catalogName;
return Streams.stream(conf.iterator())
.filter(e -> e.getKey().startsWith(keyPrefix))
.collect(
Collectors.toMap(
e -> e.getKey().substring(keyPrefix.length() + 1), Map.Entry::getValue));
}
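
  // Example of the prefix stripping (hypothetical catalog name and value): the global key
  //   iceberg.catalog.my_catalog.uri = thrift://metastore:9083
  // is returned as the catalog property
  //   uri = thrift://metastore:9083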
/**
* Return the catalog type based on the catalog name.
*
   * <p>See {@link Catalogs} documentation for the catalog type resolution strategy.
   *
   * @param conf global Hive configuration
* @param catalogName name of the catalog
* @return type of the catalog, can be null
*/
private static String getCatalogType(Configuration conf, String catalogName) {
if (catalogName != null) {
String catalogType =
conf.get(
InputFormatConfig.catalogPropertyConfigKey(
catalogName, CatalogUtil.ICEBERG_CATALOG_TYPE));
if (catalogName.equals(ICEBERG_HADOOP_TABLE_NAME)) {
return NO_CATALOG_TYPE;
} else {
return catalogType;
}
} else {
String catalogType = conf.get(CatalogUtil.ICEBERG_CATALOG_TYPE);
if (catalogType != null && catalogType.equals(LOCATION)) {
return NO_CATALOG_TYPE;
} else {
return catalogType;
}
}
}
}