package io.delta.flink.internal.table;

import java.util.List;
import java.util.Map;
import java.util.Objects;
import javax.annotation.ParametersAreNonnullByDefault;

import com.google.common.cache.CacheBuilder;
import com.google.common.cache.CacheLoader;
import com.google.common.cache.LoadingCache;
import io.delta.flink.internal.table.DeltaCatalogTableHelper.DeltaMetastoreTable;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.table.api.Schema;
import org.apache.flink.table.catalog.Catalog;
import org.apache.flink.table.catalog.CatalogBaseTable;
import org.apache.flink.table.catalog.CatalogTable;
import org.apache.flink.table.catalog.ObjectPath;
import org.apache.flink.table.catalog.ResolvedCatalogTable;
import org.apache.flink.table.catalog.exceptions.CatalogException;
import org.apache.flink.table.catalog.exceptions.DatabaseNotExistException;
import org.apache.flink.table.catalog.exceptions.TableAlreadyExistException;
import org.apache.flink.table.catalog.exceptions.TableNotExistException;
import org.apache.flink.table.types.DataType;
import org.apache.flink.util.StringUtils;
import org.apache.hadoop.conf.Configuration;
import static org.apache.flink.util.Preconditions.checkArgument;
import static org.apache.flink.util.Preconditions.checkNotNull;

import io.delta.standalone.DeltaLog;
import io.delta.standalone.Operation;
import io.delta.standalone.Snapshot;
import io.delta.standalone.actions.Metadata;
import io.delta.standalone.types.StructType;

/**
 * Delta Catalog implementation. This class executes calls to _delta_log for catalog operations
 * such as createTable, getTable etc. This class also prepares, persists and uses data stored in
 * the metastore using the decorated catalog implementation.
 *
 * <p>Catalog operations that are not in scope of a Delta table or do not require _delta_log
 * operations will be handled by the {@link CatalogProxy} and {@link BaseCatalog} classes.
 */
public class DeltaCatalog {

    private static final String DEFAULT_TABLE_CACHE_SIZE = "100";

    private final String catalogName;

    /**
     * A Flink {@link Catalog} implementation to which all metastore related actions will be
     * redirected. The {@link DeltaCatalog} will not call {@link Catalog#open()} on this
     * instance. If it is required to call this method, it should be done before passing this
     * reference to {@link DeltaCatalog}.
     */
    private final Catalog decoratedCatalog;

    private final LoadingCache<DeltaLogCacheKey, DeltaLog> deltaLogCache;

    /**
     * Creates an instance of {@link DeltaCatalog} for the given decorated catalog and catalog
     * name.
     *
     * @param catalogName      catalog name.
     * @param decoratedCatalog A Flink {@link Catalog} implementation to which all metastore
     *                         related actions will be redirected. The {@link DeltaCatalog} will
     *                         not call {@link Catalog#open()} on this instance. If it is
     *                         required to call this method, it should be done before passing
     *                         this reference to {@link DeltaCatalog}.
     * @param hadoopConf       The {@link Configuration} object that will be used for
     *                         {@link DeltaLog} initialization.
     */
    DeltaCatalog(String catalogName, Catalog decoratedCatalog, Configuration hadoopConf) {
        this.catalogName = catalogName;
        this.decoratedCatalog = decoratedCatalog;

        checkArgument(
            !StringUtils.isNullOrWhitespaceOnly(catalogName),
            "Catalog name cannot be null or empty."
        );
        checkArgument(decoratedCatalog != null,
            "The decoratedCatalog cannot be null."
        );
        checkArgument(hadoopConf != null,
            "The Hadoop Configuration object - 'hadoopConfiguration' cannot be null."
        );

        // Get the max cache size from the cluster's Hadoop configuration.
        long cacheSize = Long.parseLong(
            hadoopConf.get("deltaCatalogTableCacheSize", DEFAULT_TABLE_CACHE_SIZE));

        this.deltaLogCache = CacheBuilder.newBuilder()
            // Note that each DeltaLog, while in memory, contains a reference to a current
            // Snapshot, though that current Snapshot may not be the latest Snapshot available
            // for that Delta table. Recomputing these Snapshots from scratch is expensive,
            // hence this cache. It is preferred, instead, to keep the most-recently-computed
            // Snapshot per DeltaLog instance in this cache, so that generating the latest
            // Snapshot means we (internally) only have to apply the incremental changes.
            // The retained size of a DeltaLog instance containing a Snapshot for a Delta table
            // with 1100 records and 10 versions is only 700 KB.
            // When the cache reaches its maximum size, the least recently used entry will be
            // evicted (LRU eviction policy).
            .maximumSize(cacheSize)
            .build(new CacheLoader<DeltaLogCacheKey, DeltaLog>() {
                @Override
                @ParametersAreNonnullByDefault
                public DeltaLog load(DeltaLogCacheKey key) {
                    return DeltaLog.forTable(hadoopConf, key.deltaTablePath);
                }
            });
    }
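
    // For illustration only: a DeltaCatalog instance backs a catalog registered from Flink SQL
    // before any table DDL is issued. The statement below is an assumed usage sketch based on
    // the connector's catalog factory options ('delta-catalog' identifier, 'catalog-type'
    // value); it is not defined by this class:
    //
    //   CREATE CATALOG myDeltaCatalog WITH (
    //       'type' = 'delta-catalog',
    //       'catalog-type' = 'in-memory'
    //   );
    //   USE CATALOG myDeltaCatalog;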

    /**
     * Creates a new table in the metastore and in _delta_log if it does not already exist under
     * the given table path. The information stored in the metastore will contain only the
     * catalog path (database.tableName) and the connector type. DDL options and the table
     * schema will be stored in _delta_log.
     *
     * <p>If _delta_log already exists under the DDL's table-path option, this method will throw
     * an exception if the DDL schema does not match the _delta_log schema, if the DDL options
     * override existing _delta_log table properties, or if the partition specification defined
     * in `PARTITIONED BY` does not match the _delta_log partition specification.
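     *
     * <p>For illustration, a CREATE TABLE statement handled by this method could look as
     * follows; the column list and table path are assumed placeholders, and the
     * {@code 'table-path'} key refers to the {@code DeltaTableConnectorOptions#TABLE_PATH}
     * option:
     * <pre>{@code
     * CREATE TABLE sourceTable (col1 INT, col2 VARCHAR)
     *     PARTITIONED BY (col1)
     *     WITH (
     *         'connector' = 'delta',
     *         'table-path' = '/tmp/delta-table'
     *     );
     * }</pre>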
     *
     * <p>The framework will make sure to call this method with a fully validated
     * ResolvedCatalogTable or ResolvedCatalogView.
     *
     * @param catalogTable   the {@link DeltaCatalogBaseTable} describing the new table that
     *                       should be added to the catalog.
     * @param ignoreIfExists specifies behavior when a table or view already exists at the given
     *                       path: if set to false, it throws a TableAlreadyExistException, if
     *                       set to true, do nothing.
     * @throws TableAlreadyExistException if the table already exists and ignoreIfExists is false
     * @throws DatabaseNotExistException  if the database in tablePath doesn't exist
     * @throws CatalogException           in case of any runtime exception
     */
    public void createTable(DeltaCatalogBaseTable catalogTable, boolean ignoreIfExists)
            throws TableAlreadyExistException, DatabaseNotExistException, CatalogException {
        checkNotNull(catalogTable);
        ObjectPath tableCatalogPath = catalogTable.getTableCatalogPath();

        // First we need to check if the table exists in the metastore and if so, throw an
        // exception.
        if (this.decoratedCatalog.tableExists(tableCatalogPath) && !ignoreIfExists) {
            throw new TableAlreadyExistException(this.catalogName, tableCatalogPath);
        }

        if (!decoratedCatalog.databaseExists(catalogTable.getDatabaseName())) {
            throw new DatabaseNotExistException(
                this.catalogName,
                catalogTable.getDatabaseName()
            );
        }

        // These are taken from the DDL OPTIONS.
        Map<String, String> ddlOptions = catalogTable.getOptions();
        String deltaTablePath = ddlOptions.get(DeltaTableConnectorOptions.TABLE_PATH.key());
        if (StringUtils.isNullOrWhitespaceOnly(deltaTablePath)) {
            throw new CatalogException("Path to Delta table cannot be null or empty.");
        }

        // DDL options validation.
        DeltaCatalogTableHelper.validateDdlOptions(ddlOptions);

        // At this point what we should have in ddlOptions are only Delta table properties, the
        // connector type, the table path and arbitrary user-defined table properties. We don't
        // want to store the connector type or table path in _delta_log, so we will filter those.
        Map<String, String> filteredDdlOptions =
            DeltaCatalogTableHelper.filterMetastoreDdlOptions(ddlOptions);

        CatalogBaseTable table = catalogTable.getCatalogTable();

        // Get partition columns from the DDL.
        List<String> ddlPartitionColumns = ((CatalogTable) table).getPartitionKeys();

        // Get the Delta schema from the Flink DDL.
        StructType ddlDeltaSchema =
            DeltaCatalogTableHelper.resolveDeltaSchemaFromDdl((ResolvedCatalogTable) table);

        DeltaLog deltaLog = getDeltaLogFromCache(catalogTable, deltaTablePath);
        if (deltaLog.tableExists()) {
            // The table was not present in the metastore, however it is present on the
            // filesystem. We have to verify that the schema, partition spec and properties
            // stored in _delta_log match the DDL.
            Metadata deltaMetadata = deltaLog.update().getMetadata();

            // Validate that the DDL schema and partition spec match _delta_log's.
            DeltaCatalogTableHelper.validateDdlSchemaAndPartitionSpecMatchesDelta(
                deltaTablePath,
                tableCatalogPath,
                ddlPartitionColumns,
                ddlDeltaSchema,
                deltaMetadata
            );

            // Add new properties to Delta's metadata.
            // Throw if the DDL Delta table properties override previously defined properties
            // from _delta_log.
            Map<String, String> deltaLogProperties =
                DeltaCatalogTableHelper.prepareDeltaTableProperties(
                    filteredDdlOptions,
                    tableCatalogPath,
                    deltaMetadata,
                    false // allowOverride = false
                );

            // deltaLogProperties will have the same properties as the original metadata plus
            // the new ones defined in the DDL. In that case we want to update the _delta_log
            // metadata.
            if (deltaLogProperties.size() != deltaMetadata.getConfiguration().size()) {
                Metadata updatedMetadata = deltaMetadata.copyBuilder()
                    .configuration(deltaLogProperties)
                    .build();

                // Add properties to _delta_log.
                DeltaCatalogTableHelper
                    .commitToDeltaLog(
                        deltaLog,
                        updatedMetadata,
                        Operation.Name.SET_TABLE_PROPERTIES
                    );
            }

            // Add the table to the metastore.
            DeltaMetastoreTable metastoreTable =
                DeltaCatalogTableHelper.prepareMetastoreTable(table, deltaTablePath);
            this.decoratedCatalog.createTable(tableCatalogPath, metastoreTable, ignoreIfExists);
        } else {
            // The table does not exist on the filesystem, we have to create a new _delta_log.
            Metadata metadata = Metadata.builder()
                .schema(ddlDeltaSchema)
                .partitionColumns(ddlPartitionColumns)
                .configuration(filteredDdlOptions)
                .name(tableCatalogPath.getObjectName())
                .build();

            // Create _delta_log.
            DeltaCatalogTableHelper.commitToDeltaLog(
                deltaLog,
                metadata,
                Operation.Name.CREATE_TABLE
            );

            DeltaMetastoreTable metastoreTable =
                DeltaCatalogTableHelper.prepareMetastoreTable(table, deltaTablePath);

            // Add the table to the metastore.
            this.decoratedCatalog.createTable(tableCatalogPath, metastoreTable, ignoreIfExists);
        }
    }

    /**
     * Deletes the metastore entry and clears the DeltaCatalog cache for the given Delta table.
     *
     * <p>By design, we will remove only metastore information during drop table. No filesystem
     * information (for example the _delta_log folder) will be removed. However, we have to
     * clear DeltaCatalog's cache for this table.
     */
    public void dropTable(DeltaCatalogBaseTable catalogTable, boolean ignoreIfExists)
            throws TableNotExistException {
        CatalogBaseTable metastoreTable = catalogTable.getCatalogTable();
        String tablePath =
            metastoreTable.getOptions().get(DeltaTableConnectorOptions.TABLE_PATH.key());
        ObjectPath tableCatalogPath = catalogTable.getTableCatalogPath();
        this.deltaLogCache.invalidate(new DeltaLogCacheKey(tableCatalogPath, tablePath));
        this.decoratedCatalog.dropTable(tableCatalogPath, ignoreIfExists);
    }

    /**
     * Returns a {@link CatalogBaseTable} identified by the given
     * {@link DeltaCatalogBaseTable#getCatalogTable()}.
     * This method assumes that the provided {@link DeltaCatalogBaseTable#getCatalogTable()}
     * table already exists in the metastore, hence no extra metastore checks will be executed.
     *
     * @throws TableNotExistException if the target does not exist
     */
    public CatalogBaseTable getTable(DeltaCatalogBaseTable catalogTable)
            throws TableNotExistException {
        CatalogBaseTable metastoreTable = catalogTable.getCatalogTable();
        String tablePath =
            metastoreTable.getOptions().get(DeltaTableConnectorOptions.TABLE_PATH.key());

        DeltaLog deltaLog = getDeltaLogFromCache(catalogTable, tablePath);
        Snapshot snapshot = deltaLog.update();

        if (!deltaLog.tableExists()) {
            // TableNotExistException does not accept a custom message, but we would like to
            // meet the API contract from Flink's Catalog::getTable interface and throw
            // TableNotExistException, with the information that what was missing was
            // _delta_log.
            throw new TableNotExistException(
                this.catalogName,
                catalogTable.getTableCatalogPath(),
                new CatalogException(
                    String.format(
                        "Table %s exists in metastore but _delta_log was not found under path %s",
                        catalogTable.getTableCatalogPath().getFullName(),
                        tablePath
                    )
                )
            );
        }

        Metadata deltaMetadata = snapshot.getMetadata();
        StructType deltaSchema = deltaMetadata.getSchema();
        if (deltaSchema == null) {
            // This should not happen, but if it did for some reason it means there is something
            // wrong with _delta_log.
            throw new CatalogException(String.format(
                "Delta schema is null for table %s and table path %s. Please contact your "
                    + "administrator.",
                catalogTable.getCatalogTable(),
                tablePath
            ));
        }

        Pair<String[], DataType[]> flinkTypesFromDelta =
            DeltaCatalogTableHelper.resolveFlinkTypesFromDelta(deltaSchema);

        return CatalogTable.of(
            // The table schema is not stored in the metastore; we take it from _delta_log.
            Schema.newBuilder()
                .fromFields(flinkTypesFromDelta.getKey(), flinkTypesFromDelta.getValue())
                .build(),
            metastoreTable.getComment(),
            deltaMetadata.getPartitionColumns(),
            metastoreTable.getOptions()
        );
    }

    /**
     * Checks if the _delta_log folder exists for the table described by the {@link
     * DeltaCatalogBaseTable#getCatalogTable()} metastore entry. This method assumes that the
     * table exists in the metastore, thus it does not execute any checks there.
     *
     * @return true if _delta_log exists for the given {@link DeltaCatalogBaseTable}, false if
     *         not.
     */
    public boolean tableExists(DeltaCatalogBaseTable catalogTable) {
        CatalogBaseTable metastoreTable = catalogTable.getCatalogTable();
        String deltaTablePath =
            metastoreTable.getOptions().get(DeltaTableConnectorOptions.TABLE_PATH.key());
        return getDeltaLogFromCache(catalogTable, deltaTablePath).tableExists();
    }

    /**
     * Executes an ALTER operation on a Delta table. Currently, only changing the table name and
     * changing/setting table properties are supported using an ALTER statement.
     *
     * <p>Changing the table name: {@code ALTER TABLE sourceTable RENAME TO newSourceTable}
     *
     * <p>Setting a table property: {@code ALTER TABLE sourceTable SET ('userCustomProp'='myVal')}
     *
     * @param newCatalogTable catalog table with the new name and properties defined by the
     *                        ALTER statement.
     */
    public void alterTable(DeltaCatalogBaseTable newCatalogTable) {
        // Flink's default SQL dialect supports ALTER statements ONLY for changing the table
        // name (Catalog::renameTable(...)) and for changing/setting table properties.
        // Schema/partition changes are not supported for the Flink default SQL dialect.
        Map<String, String> alterTableDdlOptions = newCatalogTable.getOptions();
        String deltaTablePath =
            alterTableDdlOptions.get(DeltaTableConnectorOptions.TABLE_PATH.key());

        // DDL options validation.
        DeltaCatalogTableHelper.validateDdlOptions(alterTableDdlOptions);

        // At this point what we should have in ddlOptions are only Delta table properties, the
        // connector type, the table path and user-defined options. We don't want to store the
        // connector type or table path in _delta_log, so we will filter those.
        Map<String, String> filteredDdlOptions =
            DeltaCatalogTableHelper.filterMetastoreDdlOptions(alterTableDdlOptions);

        DeltaLog deltaLog = getDeltaLogFromCache(newCatalogTable, deltaTablePath);
        Metadata originalMetaData = deltaLog.update().getMetadata();

        // Add new properties to the metadata.
        // Throw if the DDL Delta table properties override previously defined properties from
        // _delta_log.
        Map<String, String> deltaLogProperties =
            DeltaCatalogTableHelper.prepareDeltaTableProperties(
                filteredDdlOptions,
                newCatalogTable.getTableCatalogPath(),
                originalMetaData,
                true // allowOverride = true
            );

        Metadata updatedMetadata = originalMetaData.copyBuilder()
            .configuration(deltaLogProperties)
            .build();

        // Add properties to _delta_log.
        DeltaCatalogTableHelper
            .commitToDeltaLog(deltaLog, updatedMetadata, Operation.Name.SET_TABLE_PROPERTIES);
    }

    private DeltaLog getDeltaLogFromCache(DeltaCatalogBaseTable catalogTable, String tablePath) {
        return deltaLogCache.getUnchecked(
            new DeltaLogCacheKey(
                catalogTable.getTableCatalogPath(),
                tablePath
            ));
    }

    @VisibleForTesting
    LoadingCache<DeltaLogCacheKey, DeltaLog> getDeltaLogCache() {
        return deltaLogCache;
    }

    /**
     * This class represents a key for the DeltaLog instance cache.
     */
    static class DeltaLogCacheKey {

        private final ObjectPath objectPath;

        private final String deltaTablePath;

        DeltaLogCacheKey(ObjectPath objectPath, String deltaTablePath) {
            this.objectPath = objectPath;
            this.deltaTablePath = deltaTablePath;
        }

        @Override
        public boolean equals(Object o) {
            if (this == o) {
                return true;
            }
            if (o == null || getClass() != o.getClass()) {
                return false;
            }
            DeltaLogCacheKey that = (DeltaLogCacheKey) o;
            return objectPath.equals(that.objectPath)
                && deltaTablePath.equals(that.deltaTablePath);
        }

        @Override
        public int hashCode() {
            return Objects.hash(objectPath, deltaTablePath);
        }
    }
}




