org.apache.iceberg.spark.procedures.BaseProcedure

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.iceberg.spark.procedures;

import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.function.Function;
import org.apache.iceberg.exceptions.ValidationException;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.util.concurrent.MoreExecutors;
import org.apache.iceberg.relocated.com.google.common.util.concurrent.ThreadFactoryBuilder;
import org.apache.iceberg.spark.Spark3Util;
import org.apache.iceberg.spark.Spark3Util.CatalogAndIdentifier;
import org.apache.iceberg.spark.actions.SparkActions;
import org.apache.iceberg.spark.procedures.SparkProcedures.ProcedureBuilder;
import org.apache.iceberg.spark.source.SparkTable;
import org.apache.spark.sql.AnalysisException;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.analysis.NoSuchTableException;
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
import org.apache.spark.sql.connector.catalog.CatalogPlugin;
import org.apache.spark.sql.connector.catalog.Identifier;
import org.apache.spark.sql.connector.catalog.Table;
import org.apache.spark.sql.connector.catalog.TableCatalog;
import org.apache.spark.sql.connector.iceberg.catalog.Procedure;
import org.apache.spark.sql.execution.CacheManager;
import org.apache.spark.sql.execution.datasources.SparkExpressionConverter;
import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DataTypes;
import scala.Option;

abstract class BaseProcedure implements Procedure {
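  // Reusable Spark SQL data types for declaring procedure parameters and output columns.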
  protected static final DataType STRING_MAP =
      DataTypes.createMapType(DataTypes.StringType, DataTypes.StringType);
  protected static final DataType STRING_ARRAY = DataTypes.createArrayType(DataTypes.StringType);

  private final SparkSession spark;
  private final TableCatalog tableCatalog;

  private SparkActions actions;
  private ExecutorService executorService = null;

  protected BaseProcedure(TableCatalog tableCatalog) {
    this.spark = SparkSession.active();
    this.tableCatalog = tableCatalog;
  }

  protected SparkSession spark() {
    return this.spark;
  }

  protected SparkActions actions() {
    if (actions == null) {
      this.actions = SparkActions.get(spark);
    }
    return actions;
  }

  protected TableCatalog tableCatalog() {
    return this.tableCatalog;
  }

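  /**
   * Loads the Iceberg table behind {@code ident}, applies {@code func} to it, and refreshes any
   * cached Spark plans that reference the table, since the function is expected to modify it.
   * Always shuts down this procedure's executor service, if one was created.
   */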
  protected <T> T modifyIcebergTable(Identifier ident, Function<org.apache.iceberg.Table, T> func) {
    try {
      return execute(ident, true, func);
    } finally {
      closeService();
    }
  }

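  /**
   * Loads the Iceberg table behind {@code ident} and applies the read-only {@code func} to it,
   * without refreshing the Spark cache. Always shuts down this procedure's executor service, if
   * one was created.
   */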
  protected <T> T withIcebergTable(Identifier ident, Function<org.apache.iceberg.Table, T> func) {
    try {
      return execute(ident, false, func);
    } finally {
      closeService();
    }
  }

  private <T> T execute(
      Identifier ident, boolean refreshSparkCache, Function<org.apache.iceberg.Table, T> func) {
    SparkTable sparkTable = loadSparkTable(ident);
    org.apache.iceberg.Table icebergTable = sparkTable.table();

    T result = func.apply(icebergTable);

    if (refreshSparkCache) {
      refreshSparkCache(ident, sparkTable);
    }

    return result;
  }

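  /**
   * Resolves an identifier string within this procedure's table catalog, rejecting identifiers
   * that resolve to a different catalog.
   */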
  protected Identifier toIdentifier(String identifierAsString, String argName) {
    CatalogAndIdentifier catalogAndIdentifier =
        toCatalogAndIdentifier(identifierAsString, argName, tableCatalog);

    Preconditions.checkArgument(
        catalogAndIdentifier.catalog().equals(tableCatalog),
        "Cannot run procedure in catalog '%s': '%s' is a table in catalog '%s'",
        tableCatalog.name(),
        identifierAsString,
        catalogAndIdentifier.catalog().name());

    return catalogAndIdentifier.identifier();
  }

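  /** Parses a non-empty identifier string into its catalog and identifier parts. */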
  protected CatalogAndIdentifier toCatalogAndIdentifier(
      String identifierAsString, String argName, CatalogPlugin catalog) {
    Preconditions.checkArgument(
        identifierAsString != null && !identifierAsString.isEmpty(),
        "Cannot handle an empty identifier for argument %s",
        argName);

    return Spark3Util.catalogAndIdentifier(
        "identifier for arg " + argName, spark, identifierAsString, catalog);
  }

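  /** Loads {@code ident} from the table catalog and verifies it is an Iceberg {@link SparkTable}. */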
  protected SparkTable loadSparkTable(Identifier ident) {
    try {
      Table table = tableCatalog.loadTable(ident);
      ValidationException.check(
          table instanceof SparkTable, "%s is not %s", ident, SparkTable.class.getName());
      return (SparkTable) table;
    } catch (NoSuchTableException e) {
      String errMsg =
          String.format("Couldn't load table '%s' in catalog '%s'", ident, tableCatalog.name());
      throw new RuntimeException(errMsg, e);
    }
  }

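  /** Reads the given table into a {@code Dataset<Row>}, passing {@code options} to the reader. */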
  protected Dataset<Row> loadRows(Identifier tableIdent, Map<String, String> options) {
    String tableName = Spark3Util.quotedFullIdentifier(tableCatalog().name(), tableIdent);
    return spark().read().options(options).table(tableName);
  }

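  /** Recaches any Spark plans that reference the given table so cached queries see its new state. */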
  protected void refreshSparkCache(Identifier ident, Table table) {
    CacheManager cacheManager = spark.sharedState().cacheManager();
    DataSourceV2Relation relation =
        DataSourceV2Relation.create(table, Option.apply(tableCatalog), Option.apply(ident));
    cacheManager.recacheByPlan(spark, relation);
  }

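  /**
   * Resolves the SQL {@code where} clause against the given table and converts it to an Iceberg
   * {@link Expression}, failing if the predicates cannot be analyzed.
   */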
  protected Expression filterExpression(Identifier ident, String where) {
    try {
      String name = Spark3Util.quotedFullIdentifier(tableCatalog.name(), ident);
      org.apache.spark.sql.catalyst.expressions.Expression expression =
          SparkExpressionConverter.collectResolvedSparkExpression(spark, name, where);
      return SparkExpressionConverter.convertToIcebergExpression(expression);
    } catch (AnalysisException e) {
      throw new IllegalArgumentException("Cannot parse predicates in where option: " + where, e);
    }
  }

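  /** Wraps the given values in a {@link GenericInternalRow} for returning procedure output. */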
  protected InternalRow newInternalRow(Object... values) {
    return new GenericInternalRow(values);
  }

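  /** Base builder for procedures, capturing the {@link TableCatalog} the procedure runs against. */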
  protected abstract static class Builder<T extends BaseProcedure> implements ProcedureBuilder {
    private TableCatalog tableCatalog;

    @Override
    public Builder<T> withTableCatalog(TableCatalog newTableCatalog) {
      this.tableCatalog = newTableCatalog;
      return this;
    }

    @Override
    public T build() {
      return doBuild();
    }

    protected abstract T doBuild();

    TableCatalog tableCatalog() {
      return tableCatalog;
    }
  }

  /**
   * Closes this procedure's executor service if a new one was created with {@link
   * BaseProcedure#executorService(int, String)}. Does not block for any remaining tasks.
   */
  protected void closeService() {
    if (executorService != null) {
      executorService.shutdown();
    }
  }

  /**
   * Starts a new executor service that this procedure can use in its work. The pool is shut down
   * automatically when {@link #withIcebergTable(Identifier, Function)} or {@link
   * #modifyIcebergTable(Identifier, Function)} is called. If neither of those methods is used, the
   * service can be shut down with {@link #closeService()} or left to be closed when this class is
   * finalized.
   *
   * @param threadPoolSize number of threads in the service
   * @param nameFormat name prefix for threads created in this service
   * @return the new executor service owned by this procedure
   */
  protected ExecutorService executorService(int threadPoolSize, String nameFormat) {
    Preconditions.checkArgument(
        executorService == null, "Cannot create a new executor service, one already exists.");
    Preconditions.checkArgument(
        nameFormat != null, "Cannot create a service with null nameFormat arg");
    this.executorService =
        MoreExecutors.getExitingExecutorService(
            (ThreadPoolExecutor)
                Executors.newFixedThreadPool(
                    threadPoolSize,
                    new ThreadFactoryBuilder()
                        .setDaemon(true)
                        .setNameFormat(nameFormat + "-%d")
                        .build()));

    return executorService;
  }
}
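
For context, here is a minimal sketch of how a concrete procedure can be built on top of this class. The procedure name (CurrentSnapshotProcedure) and its behavior are invented for illustration and are not part of Iceberg; the parameter declaration, Builder wiring, and use of toIdentifier, withIcebergTable, and newInternalRow mirror the helpers defined above.

package org.apache.iceberg.spark.procedures;

import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.connector.catalog.Identifier;
import org.apache.spark.sql.connector.catalog.TableCatalog;
import org.apache.spark.sql.connector.iceberg.catalog.ProcedureParameter;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

// Hypothetical read-only procedure that returns the current snapshot ID of a table.
class CurrentSnapshotProcedure extends BaseProcedure {

  private static final ProcedureParameter[] PARAMETERS =
      new ProcedureParameter[] {ProcedureParameter.required("table", DataTypes.StringType)};

  private static final StructType OUTPUT_TYPE =
      new StructType(
          new StructField[] {
            new StructField("snapshot_id", DataTypes.LongType, true, Metadata.empty())
          });

  private CurrentSnapshotProcedure(TableCatalog tableCatalog) {
    super(tableCatalog);
  }

  public static SparkProcedures.ProcedureBuilder builder() {
    return new Builder<CurrentSnapshotProcedure>() {
      @Override
      protected CurrentSnapshotProcedure doBuild() {
        // tableCatalog() returns the catalog injected via withTableCatalog(...)
        return new CurrentSnapshotProcedure(tableCatalog());
      }
    };
  }

  @Override
  public ProcedureParameter[] parameters() {
    return PARAMETERS;
  }

  @Override
  public StructType outputType() {
    return OUTPUT_TYPE;
  }

  @Override
  public InternalRow[] call(InternalRow args) {
    // toIdentifier validates the string and pins it to this procedure's catalog
    Identifier ident = toIdentifier(args.getString(0), PARAMETERS[0].name());

    // withIcebergTable loads the table, applies the function, and closes any executor
    // service this procedure created; no cache refresh happens since nothing is modified
    return withIcebergTable(
        ident,
        table -> {
          Long snapshotId =
              table.currentSnapshot() != null ? table.currentSnapshot().snapshotId() : null;
          return new InternalRow[] {newInternalRow(snapshotId)};
        });
  }

  @Override
  public String description() {
    return "CurrentSnapshotProcedure";
  }
}

Iceberg's shipped procedures follow this same shape, using modifyIcebergTable instead of withIcebergTable when the call mutates the table so that cached Spark plans are refreshed.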