org.apache.iceberg.spark.source.IcebergSource Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of iceberg-spark3 Show documentation
A table format for huge analytic datasets
There is a newer version: 0.13.2
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.iceberg.spark.source;

import java.util.Arrays;
import java.util.Map;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.spark.PathIdentifier;
import org.apache.iceberg.spark.Spark3Util;
import org.apache.iceberg.spark.SparkReadOptions;
import org.apache.iceberg.spark.SparkSessionCatalog;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.catalyst.analysis.NoSuchTableException;
import org.apache.spark.sql.connector.catalog.CatalogManager;
import org.apache.spark.sql.connector.catalog.CatalogPlugin;
import org.apache.spark.sql.connector.catalog.Identifier;
import org.apache.spark.sql.connector.catalog.SupportsCatalogOptions;
import org.apache.spark.sql.connector.catalog.Table;
import org.apache.spark.sql.connector.catalog.TableCatalog;
import org.apache.spark.sql.connector.expressions.Transform;
import org.apache.spark.sql.sources.DataSourceRegister;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.sql.util.CaseInsensitiveStringMap;

/**
 * The IcebergSource loads/writes tables with format "iceberg". It can load paths and tables.
 *
 * How paths/tables are loaded when using spark.read().format("iceberg").path(table)
 *
 *  table = "file:/path/to/table" -> loads a HadoopTable at given path
 *  table = "tablename" -> loads currentCatalog.currentNamespace.tablename
 *  table = "catalog.tablename" -> load "tablename" from the specified catalog.
 *  table = "namespace.tablename" -> load "namespace.tablename" from current catalog
 *  table = "catalog.namespace.tablename" -> "namespace.tablename" from the specified catalog.
 *  table = "namespace1.namespace2.tablename" -> load "namespace1.namespace2.tablename" from current catalog
 *
 * The above list is in order of priority. For example: a matching catalog will take priority over any namespace
 * resolution.
 */
public class IcebergSource implements DataSourceRegister, SupportsCatalogOptions {
  private static final String DEFAULT_CATALOG_NAME = "default_iceberg";
  private static final String DEFAULT_CATALOG = "spark.sql.catalog." + DEFAULT_CATALOG_NAME;
  private static final String AT_TIMESTAMP = "at_timestamp_";
  private static final String SNAPSHOT_ID = "snapshot_id_";

  @Override
  public String shortName() {
    return "iceberg";
  }

  @Override
  public StructType inferSchema(CaseInsensitiveStringMap options) {
    return null;
  }

  @Override
  public Transform[] inferPartitioning(CaseInsensitiveStringMap options) {
    return getTable(null, null, options).partitioning();
  }

  @Override
  public boolean supportsExternalMetadata() {
    return true;
  }

  @Override
  public Table getTable(StructType schema, Transform[] partitioning, Map options) {
    Spark3Util.CatalogAndIdentifier catalogIdentifier = catalogAndIdentifier(new CaseInsensitiveStringMap(options));
    CatalogPlugin catalog = catalogIdentifier.catalog();
    Identifier ident = catalogIdentifier.identifier();

    try {
      if (catalog instanceof TableCatalog) {
        return ((TableCatalog) catalog).loadTable(ident);
      }
    } catch (NoSuchTableException e) {
      // throwing an iceberg NoSuchTableException because the Spark one is typed and cant be thrown from this interface
      throw new org.apache.iceberg.exceptions.NoSuchTableException(e, "Cannot find table for %s.", ident);
    }

    // throwing an iceberg NoSuchTableException because the Spark one is typed and cant be thrown from this interface
    throw new org.apache.iceberg.exceptions.NoSuchTableException("Cannot find table for %s.", ident);
  }

  private Spark3Util.CatalogAndIdentifier catalogAndIdentifier(CaseInsensitiveStringMap options) {
    Preconditions.checkArgument(options.containsKey("path"), "Cannot open table: path is not set");
    SparkSession spark = SparkSession.active();
    setupDefaultSparkCatalog(spark);
    String path = options.get("path");

    Long snapshotId = propertyAsLong(options, SparkReadOptions.SNAPSHOT_ID);
    Long asOfTimestamp = propertyAsLong(options, SparkReadOptions.AS_OF_TIMESTAMP);
    Preconditions.checkArgument(asOfTimestamp == null || snapshotId == null,
        "Cannot specify both snapshot-id (%s) and as-of-timestamp (%s)", snapshotId, asOfTimestamp);

    String selector = null;

    if (snapshotId != null) {
      selector = SNAPSHOT_ID + snapshotId;
    }

    if (asOfTimestamp != null) {
      selector = AT_TIMESTAMP + asOfTimestamp;
    }

    CatalogManager catalogManager = spark.sessionState().catalogManager();

    if (path.contains("/")) {
      // contains a path. Return iceberg default catalog and a PathIdentifier
      String newPath = (selector == null) ? path : path + "#" + selector;
      return new Spark3Util.CatalogAndIdentifier(catalogManager.catalog(DEFAULT_CATALOG_NAME),
          new PathIdentifier(newPath));
    }

    final Spark3Util.CatalogAndIdentifier catalogAndIdentifier = Spark3Util.catalogAndIdentifier(
        "path or identifier", spark, path);

    Identifier ident = identifierWithSelector(catalogAndIdentifier.identifier(), selector);
    if (catalogAndIdentifier.catalog().name().equals("spark_catalog") &&
        !(catalogAndIdentifier.catalog() instanceof SparkSessionCatalog)) {
      // catalog is a session catalog but does not support Iceberg. Use Iceberg instead.
      return new Spark3Util.CatalogAndIdentifier(catalogManager.catalog(DEFAULT_CATALOG_NAME), ident);
    } else {
      return new Spark3Util.CatalogAndIdentifier(catalogAndIdentifier.catalog(), ident);
    }
  }

  private Identifier identifierWithSelector(Identifier ident, String selector) {
    if (selector == null) {
      return ident;
    } else {
      String[] namespace = ident.namespace();
      String[] ns = Arrays.copyOf(namespace, namespace.length + 1);
      ns[namespace.length] = ident.name();
      return Identifier.of(ns, selector);
    }
  }

  @Override
  public Identifier extractIdentifier(CaseInsensitiveStringMap options) {
    return catalogAndIdentifier(options).identifier();
  }

  @Override
  public String extractCatalog(CaseInsensitiveStringMap options) {
    return catalogAndIdentifier(options).catalog().name();
  }

  private static Long propertyAsLong(CaseInsensitiveStringMap options, String property) {
    String value = options.get(property);
    if (value != null) {
      return Long.parseLong(value);
    }

    return null;
  }

  private static void setupDefaultSparkCatalog(SparkSession spark) {
    if (spark.conf().contains(DEFAULT_CATALOG)) {
      return;
    }
    ImmutableMap config = ImmutableMap.of(
        "type", "hive",
        "default-namespace", "default",
        "cache-enabled", "false" // the source should not use a cache
    );
    String catalogName = "org.apache.iceberg.spark.SparkCatalog";
    spark.conf().set(DEFAULT_CATALOG, catalogName);
    config.forEach((key, value) -> spark.conf().set(DEFAULT_CATALOG + "." + key, value));
  }
}