/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.iceberg.spark;

import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.CachingCatalog;
import org.apache.iceberg.CatalogUtil;
import org.apache.iceberg.Schema;
import org.apache.iceberg.Table;
import org.apache.iceberg.Transaction;
import org.apache.iceberg.catalog.Catalog;
import org.apache.iceberg.catalog.Namespace;
import org.apache.iceberg.catalog.SupportsNamespaces;
import org.apache.iceberg.catalog.TableIdentifier;
import org.apache.iceberg.exceptions.AlreadyExistsException;
import org.apache.iceberg.hadoop.HadoopCatalog;
import org.apache.iceberg.hadoop.HadoopTables;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.base.Splitter;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.iceberg.relocated.com.google.common.collect.Sets;
import org.apache.iceberg.spark.source.SparkTable;
import org.apache.iceberg.spark.source.StagedSparkTable;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.catalyst.analysis.NamespaceAlreadyExistsException;
import org.apache.spark.sql.catalyst.analysis.NoSuchNamespaceException;
import org.apache.spark.sql.catalyst.analysis.NoSuchTableException;
import org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException;
import org.apache.spark.sql.connector.catalog.Identifier;
import org.apache.spark.sql.connector.catalog.NamespaceChange;
import org.apache.spark.sql.connector.catalog.StagedTable;
import org.apache.spark.sql.connector.catalog.TableCatalog;
import org.apache.spark.sql.connector.catalog.TableChange;
import org.apache.spark.sql.connector.catalog.TableChange.ColumnChange;
import org.apache.spark.sql.connector.catalog.TableChange.RemoveProperty;
import org.apache.spark.sql.connector.catalog.TableChange.SetProperty;
import org.apache.spark.sql.connector.expressions.Transform;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.sql.util.CaseInsensitiveStringMap;

/**
 * A Spark TableCatalog implementation that wraps an Iceberg {@link Catalog}.
 * <p>
 * This supports the following catalog configuration options:
 * <ul>
 *   <li><code>type</code> - catalog type, "hive" or "hadoop"</li>
 *   <li><code>uri</code> - the Hive Metastore URI (Hive catalog only)</li>
 *   <li><code>warehouse</code> - the warehouse path (Hadoop catalog only)</li>
 *   <li><code>default-namespace</code> - a namespace to use as the default</li>
 * </ul>
 * <p>
 * To use a custom catalog that is not a Hive or Hadoop catalog, extend this class and override
 * {@link #buildIcebergCatalog(String, CaseInsensitiveStringMap)}.
 */
public class SparkCatalog extends BaseCatalog {
  private static final Set<String> DEFAULT_NS_KEYS = ImmutableSet.of(TableCatalog.PROP_OWNER);

  private String catalogName = null;
  private Catalog icebergCatalog = null;
  private boolean cacheEnabled = true;
  private SupportsNamespaces asNamespaceCatalog = null;
  private String[] defaultNamespace = null;
  private HadoopTables tables;

  /**
   * Build an Iceberg {@link Catalog} to be used by this Spark catalog adapter.
   *
   * @param name Spark's catalog name
   * @param options Spark's catalog options
   * @return an Iceberg catalog
   */
  protected Catalog buildIcebergCatalog(String name, CaseInsensitiveStringMap options) {
    Configuration conf = SparkSession.active().sessionState().newHadoopConf();
    Map<String, String> optionsMap = new TreeMap<>(String.CASE_INSENSITIVE_ORDER);
    optionsMap.putAll(options);
    return CatalogUtil.buildIcebergCatalog(name, optionsMap, conf);
  }

  /**
   * Build an Iceberg {@link TableIdentifier} for the given Spark identifier.
   *
   * @param identifier Spark's identifier
   * @return an Iceberg identifier
   */
  protected TableIdentifier buildIdentifier(Identifier identifier) {
    return Spark3Util.identifierToTableIdentifier(identifier);
  }

  @Override
  public SparkTable loadTable(Identifier ident) throws NoSuchTableException {
    try {
      Table icebergTable = load(ident);
      return new SparkTable(icebergTable, !cacheEnabled);
    } catch (org.apache.iceberg.exceptions.NoSuchTableException e) {
      throw new NoSuchTableException(ident);
    }
  }

  @Override
  public SparkTable createTable(Identifier ident, StructType schema, Transform[] transforms,
                                Map<String, String> properties) throws TableAlreadyExistsException {
    Schema icebergSchema = SparkSchemaUtil.convert(schema);
    try {
      Catalog.TableBuilder builder = newBuilder(ident, icebergSchema);
      Table icebergTable = builder
          .withPartitionSpec(Spark3Util.toPartitionSpec(icebergSchema, transforms))
          .withLocation(properties.get("location"))
          .withProperties(Spark3Util.rebuildCreateProperties(properties))
          .create();
      return new SparkTable(icebergTable, !cacheEnabled);
    } catch (AlreadyExistsException e) {
      throw new TableAlreadyExistsException(ident);
    }
  }
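
  // Note (added for illustration): stageCreate, stageReplace, and stageCreateOrReplace below
  // implement Spark's StagingTableCatalog contract. Spark calls them for atomic CTAS and RTAS
  // statements; the returned StagedSparkTable commits the wrapped Iceberg transaction only after
  // the write succeeds, and discards it if the write fails.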
  @Override
  public StagedTable stageCreate(Identifier ident, StructType schema, Transform[] transforms,
                                 Map<String, String> properties) throws TableAlreadyExistsException {
    Schema icebergSchema = SparkSchemaUtil.convert(schema);
    try {
      Catalog.TableBuilder builder = newBuilder(ident, icebergSchema);
      Transaction transaction = builder
          .withPartitionSpec(Spark3Util.toPartitionSpec(icebergSchema, transforms))
          .withLocation(properties.get("location"))
          .withProperties(Spark3Util.rebuildCreateProperties(properties))
          .createTransaction();
      return new StagedSparkTable(transaction);
    } catch (AlreadyExistsException e) {
      throw new TableAlreadyExistsException(ident);
    }
  }

  @Override
  public StagedTable stageReplace(Identifier ident, StructType schema, Transform[] transforms,
                                  Map<String, String> properties) throws NoSuchTableException {
    Schema icebergSchema = SparkSchemaUtil.convert(schema);
    try {
      Catalog.TableBuilder builder = newBuilder(ident, icebergSchema);
      Transaction transaction = builder
          .withPartitionSpec(Spark3Util.toPartitionSpec(icebergSchema, transforms))
          .withLocation(properties.get("location"))
          .withProperties(Spark3Util.rebuildCreateProperties(properties))
          .replaceTransaction();
      return new StagedSparkTable(transaction);
    } catch (org.apache.iceberg.exceptions.NoSuchTableException e) {
      throw new NoSuchTableException(ident);
    }
  }

  @Override
  public StagedTable stageCreateOrReplace(Identifier ident, StructType schema, Transform[] transforms,
                                          Map<String, String> properties) {
    Schema icebergSchema = SparkSchemaUtil.convert(schema);
    Catalog.TableBuilder builder = newBuilder(ident, icebergSchema);
    Transaction transaction = builder
        .withPartitionSpec(Spark3Util.toPartitionSpec(icebergSchema, transforms))
        .withLocation(properties.get("location"))
        .withProperties(Spark3Util.rebuildCreateProperties(properties))
        .createOrReplaceTransaction();
    return new StagedSparkTable(transaction);
  }

  @Override
  public SparkTable alterTable(Identifier ident, TableChange... changes) throws NoSuchTableException {
    SetProperty setLocation = null;
    SetProperty setSnapshotId = null;
    SetProperty pickSnapshotId = null;
    List<TableChange> propertyChanges = Lists.newArrayList();
    List<TableChange> schemaChanges = Lists.newArrayList();
    for (TableChange change : changes) {
      if (change instanceof SetProperty) {
        SetProperty set = (SetProperty) change;
        if (TableCatalog.PROP_LOCATION.equalsIgnoreCase(set.property())) {
          setLocation = set;
        } else if ("current-snapshot-id".equalsIgnoreCase(set.property())) {
          setSnapshotId = set;
        } else if ("cherry-pick-snapshot-id".equalsIgnoreCase(set.property())) {
          pickSnapshotId = set;
        } else {
          propertyChanges.add(set);
        }
      } else if (change instanceof RemoveProperty) {
        propertyChanges.add(change);
      } else if (change instanceof ColumnChange) {
        schemaChanges.add(change);
      } else {
        throw new UnsupportedOperationException("Cannot apply unknown table change: " + change);
      }
    }

    try {
      Table table = load(ident);
      commitChanges(table, setLocation, setSnapshotId, pickSnapshotId, propertyChanges, schemaChanges);
    } catch (org.apache.iceberg.exceptions.NoSuchTableException e) {
      throw new NoSuchTableException(ident);
    }

    return null;
  }
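
  // Note (added for illustration): path-based identifiers (PathIdentifier) bypass the wrapped
  // catalog and are handled by HadoopTables; see also load(Identifier) and
  // newBuilder(Identifier, Schema) at the end of this class.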
  @Override
  public boolean dropTable(Identifier ident) {
    try {
      return isPathIdentifier(ident) ?
          tables.dropTable(((PathIdentifier) ident).location()) :
          icebergCatalog.dropTable(buildIdentifier(ident));
    } catch (org.apache.iceberg.exceptions.NoSuchTableException e) {
      return false;
    }
  }

  @Override
  public void renameTable(Identifier from, Identifier to) throws NoSuchTableException, TableAlreadyExistsException {
    try {
      checkNotPathIdentifier(from, "renameTable");
      checkNotPathIdentifier(to, "renameTable");
      icebergCatalog.renameTable(buildIdentifier(from), buildIdentifier(to));
    } catch (org.apache.iceberg.exceptions.NoSuchTableException e) {
      throw new NoSuchTableException(from);
    } catch (AlreadyExistsException e) {
      throw new TableAlreadyExistsException(to);
    }
  }

  @Override
  public void invalidateTable(Identifier ident) {
    try {
      load(ident).refresh();
    } catch (org.apache.iceberg.exceptions.NoSuchTableException ignored) {
      // ignore if the table doesn't exist, it is not cached
    }
  }

  @Override
  public Identifier[] listTables(String[] namespace) {
    return icebergCatalog.listTables(Namespace.of(namespace)).stream()
        .map(ident -> Identifier.of(ident.namespace().levels(), ident.name()))
        .toArray(Identifier[]::new);
  }

  @Override
  public String[] defaultNamespace() {
    if (defaultNamespace != null) {
      return defaultNamespace;
    }
    return new String[0];
  }

  @Override
  public String[][] listNamespaces() {
    if (asNamespaceCatalog != null) {
      return asNamespaceCatalog.listNamespaces().stream()
          .map(Namespace::levels)
          .toArray(String[][]::new);
    }
    return new String[0][];
  }

  @Override
  public String[][] listNamespaces(String[] namespace) throws NoSuchNamespaceException {
    if (asNamespaceCatalog != null) {
      try {
        return asNamespaceCatalog.listNamespaces(Namespace.of(namespace)).stream()
            .map(Namespace::levels)
            .toArray(String[][]::new);
      } catch (org.apache.iceberg.exceptions.NoSuchNamespaceException e) {
        throw new NoSuchNamespaceException(namespace);
      }
    }
    throw new NoSuchNamespaceException(namespace);
  }

  @Override
  public Map<String, String> loadNamespaceMetadata(String[] namespace) throws NoSuchNamespaceException {
    if (asNamespaceCatalog != null) {
      try {
        return asNamespaceCatalog.loadNamespaceMetadata(Namespace.of(namespace));
      } catch (org.apache.iceberg.exceptions.NoSuchNamespaceException e) {
        throw new NoSuchNamespaceException(namespace);
      }
    }
    throw new NoSuchNamespaceException(namespace);
  }

  @Override
  public void createNamespace(String[] namespace, Map<String, String> metadata) throws NamespaceAlreadyExistsException {
    if (asNamespaceCatalog != null) {
      try {
        if (asNamespaceCatalog instanceof HadoopCatalog && DEFAULT_NS_KEYS.equals(metadata.keySet())) {
          // Hadoop catalog will reject metadata properties, but Spark automatically adds "owner".
          // If only the automatic properties are present, replace metadata with an empty map.
          asNamespaceCatalog.createNamespace(Namespace.of(namespace), ImmutableMap.of());
        } else {
          asNamespaceCatalog.createNamespace(Namespace.of(namespace), metadata);
        }
      } catch (AlreadyExistsException e) {
        throw new NamespaceAlreadyExistsException(namespace);
      }
    } else {
      throw new UnsupportedOperationException("Namespaces are not supported by catalog: " + catalogName);
    }
  }

  @Override
  public void alterNamespace(String[] namespace, NamespaceChange... changes) throws NoSuchNamespaceException {
    if (asNamespaceCatalog != null) {
      Map<String, String> updates = Maps.newHashMap();
      Set<String> removals = Sets.newHashSet();
      for (NamespaceChange change : changes) {
        if (change instanceof NamespaceChange.SetProperty) {
          NamespaceChange.SetProperty set = (NamespaceChange.SetProperty) change;
          updates.put(set.property(), set.value());
        } else if (change instanceof NamespaceChange.RemoveProperty) {
          removals.add(((NamespaceChange.RemoveProperty) change).property());
        } else {
          throw new UnsupportedOperationException("Cannot apply unknown namespace change: " + change);
        }
      }

      try {
        if (!updates.isEmpty()) {
          asNamespaceCatalog.setProperties(Namespace.of(namespace), updates);
        }
        if (!removals.isEmpty()) {
          asNamespaceCatalog.removeProperties(Namespace.of(namespace), removals);
        }
      } catch (org.apache.iceberg.exceptions.NoSuchNamespaceException e) {
        throw new NoSuchNamespaceException(namespace);
      }
    } else {
      throw new NoSuchNamespaceException(namespace);
    }
  }

  @Override
  public boolean dropNamespace(String[] namespace) throws NoSuchNamespaceException {
    if (asNamespaceCatalog != null) {
      try {
        return asNamespaceCatalog.dropNamespace(Namespace.of(namespace));
      } catch (org.apache.iceberg.exceptions.NoSuchNamespaceException e) {
        throw new NoSuchNamespaceException(namespace);
      }
    }
    return false;
  }

  @Override
  public final void initialize(String name, CaseInsensitiveStringMap options) {
    this.cacheEnabled = Boolean.parseBoolean(options.getOrDefault("cache-enabled", "true"));
    Catalog catalog = buildIcebergCatalog(name, options);

    this.catalogName = name;
    this.tables = new HadoopTables(SparkSession.active().sessionState().newHadoopConf());
    this.icebergCatalog = cacheEnabled ? CachingCatalog.wrap(catalog) : catalog;
    if (catalog instanceof SupportsNamespaces) {
      this.asNamespaceCatalog = (SupportsNamespaces) catalog;
      if (options.containsKey("default-namespace")) {
        this.defaultNamespace = Splitter.on('.')
            .splitToList(options.get("default-namespace"))
            .toArray(new String[0]);
      }
    }
  }

  @Override
  public String name() {
    return catalogName;
  }
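
  // Applies snapshot updates (set-current / cherry-pick) eagerly, then batches location, property,
  // and schema changes into a single Iceberg transaction so they commit atomically.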
  private static void commitChanges(Table table, SetProperty setLocation, SetProperty setSnapshotId,
                                    SetProperty pickSnapshotId, List<TableChange> propertyChanges,
                                    List<TableChange> schemaChanges) {
    // don't allow setting the snapshot and picking a commit at the same time because order is
    // ambiguous and choosing one order leads to different results
    Preconditions.checkArgument(setSnapshotId == null || pickSnapshotId == null,
        "Cannot set the current snapshot ID and cherry-pick snapshot changes");

    if (setSnapshotId != null) {
      long newSnapshotId = Long.parseLong(setSnapshotId.value());
      table.manageSnapshots().setCurrentSnapshot(newSnapshotId).commit();
    }

    // if updating the table snapshot, perform that update first in case it fails
    if (pickSnapshotId != null) {
      long newSnapshotId = Long.parseLong(pickSnapshotId.value());
      table.manageSnapshots().cherrypick(newSnapshotId).commit();
    }

    Transaction transaction = table.newTransaction();

    if (setLocation != null) {
      transaction.updateLocation()
          .setLocation(setLocation.value())
          .commit();
    }

    if (!propertyChanges.isEmpty()) {
      Spark3Util.applyPropertyChanges(transaction.updateProperties(), propertyChanges).commit();
    }

    if (!schemaChanges.isEmpty()) {
      Spark3Util.applySchemaChanges(transaction.updateSchema(), schemaChanges).commit();
    }

    transaction.commitTransaction();
  }

  private static boolean isPathIdentifier(Identifier ident) {
    return ident instanceof PathIdentifier;
  }

  private static void checkNotPathIdentifier(Identifier identifier, String method) {
    if (identifier instanceof PathIdentifier) {
      throw new IllegalArgumentException(String.format(
          "Cannot pass path based identifier to %s method. %s is a path.", method, identifier));
    }
  }

  private Table load(Identifier ident) {
    return isPathIdentifier(ident) ?
        tables.load(((PathIdentifier) ident).location()) :
        icebergCatalog.loadTable(buildIdentifier(ident));
  }

  private Catalog.TableBuilder newBuilder(Identifier ident, Schema schema) {
    return isPathIdentifier(ident) ?
        tables.buildTable(((PathIdentifier) ident).location(), schema) :
        icebergCatalog.buildTable(buildIdentifier(ident), schema);
  }
}
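
// Usage sketch (added for illustration, not part of the original file): registering this catalog
// with Spark via the configuration options documented in the class Javadoc. Spark instantiates
// SparkCatalog reflectively and passes the "spark.sql.catalog.demo.*" options to initialize().
// The catalog name "demo" and the warehouse path are hypothetical placeholders.
class SparkCatalogUsageExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .master("local[*]")
        .config("spark.sql.catalog.demo", "org.apache.iceberg.spark.SparkCatalog")
        .config("spark.sql.catalog.demo.type", "hadoop")                    // "hive" or "hadoop"
        .config("spark.sql.catalog.demo.warehouse", "file:/tmp/warehouse")  // Hadoop catalog only
        .config("spark.sql.catalog.demo.default-namespace", "db")
        .config("spark.sql.catalog.demo.cache-enabled", "true")             // toggles CachingCatalog.wrap
        .getOrCreate();

    spark.sql("CREATE NAMESPACE IF NOT EXISTS demo.db");
    spark.sql("CREATE TABLE IF NOT EXISTS demo.db.events (id BIGINT, data STRING) USING iceberg");
    spark.stop();
  }
}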
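
// A minimal sketch (added for illustration) of the extension point described in the class Javadoc:
// plugging in a catalog that is neither Hive nor Hadoop by overriding buildIcebergCatalog.
// Assumption: "com.example.MyCatalog" is a hypothetical org.apache.iceberg.catalog.Catalog
// implementation; CatalogUtil.loadCatalog instantiates the named class reflectively and
// initializes it with the catalog name and options.
class CustomIcebergSparkCatalog extends SparkCatalog {
  @Override
  protected Catalog buildIcebergCatalog(String name, CaseInsensitiveStringMap options) {
    // the options map carries every "spark.sql.catalog.<name>.*" entry from the Spark config
    Configuration conf = SparkSession.active().sessionState().newHadoopConf();
    return CatalogUtil.loadCatalog("com.example.MyCatalog", name, options.asCaseSensitiveMap(), conf);
  }
}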