/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.trino.plugin.deltalake.transactionlog;
import com.google.common.cache.Cache;
import com.google.common.cache.Weigher;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.common.primitives.Ints;
import com.google.common.util.concurrent.UncheckedExecutionException;
import com.google.inject.Inject;
import io.airlift.jmx.CacheStatsMBean;
import io.trino.cache.EvictableCacheBuilder;
import io.trino.filesystem.Location;
import io.trino.filesystem.TrinoFileSystem;
import io.trino.filesystem.TrinoFileSystemFactory;
import io.trino.filesystem.TrinoInputFile;
import io.trino.parquet.ParquetReaderOptions;
import io.trino.plugin.deltalake.DeltaLakeColumnHandle;
import io.trino.plugin.deltalake.DeltaLakeColumnMetadata;
import io.trino.plugin.deltalake.DeltaLakeConfig;
import io.trino.plugin.deltalake.transactionlog.TableSnapshot.MetadataAndProtocolEntry;
import io.trino.plugin.deltalake.transactionlog.checkpoint.CheckpointEntryIterator;
import io.trino.plugin.deltalake.transactionlog.checkpoint.CheckpointSchemaManager;
import io.trino.plugin.deltalake.transactionlog.checkpoint.LastCheckpoint;
import io.trino.plugin.deltalake.transactionlog.checkpoint.TransactionLogTail;
import io.trino.plugin.hive.FileFormatDataSourceStats;
import io.trino.plugin.hive.parquet.ParquetReaderConfig;
import io.trino.spi.TrinoException;
import io.trino.spi.connector.ConnectorSession;
import io.trino.spi.connector.SchemaTableName;
import io.trino.spi.predicate.TupleDomain;
import io.trino.spi.type.ArrayType;
import io.trino.spi.type.BooleanType;
import io.trino.spi.type.MapType;
import io.trino.spi.type.Type;
import io.trino.spi.type.TypeManager;
import io.trino.spi.type.VarbinaryType;
import org.weakref.jmx.Managed;
import org.weakref.jmx.Nested;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.time.Instant;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.function.BiFunction;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.stream.Stream;
import static com.google.common.base.Predicates.alwaysTrue;
import static com.google.common.base.Throwables.throwIfUnchecked;
import static com.google.common.collect.ImmutableList.toImmutableList;
import static com.google.common.collect.ImmutableMap.toImmutableMap;
import static com.google.common.collect.ImmutableSet.toImmutableSet;
import static io.airlift.slice.SizeOf.estimatedSizeOf;
import static io.airlift.slice.SizeOf.instanceSize;
import static io.trino.cache.CacheUtils.invalidateAllIf;
import static io.trino.plugin.deltalake.DeltaLakeErrorCode.DELTA_LAKE_INVALID_SCHEMA;
import static io.trino.plugin.deltalake.DeltaLakeSessionProperties.isCheckpointFilteringEnabled;
import static io.trino.plugin.deltalake.DeltaLakeSplitManager.partitionMatchesPredicate;
import static io.trino.plugin.deltalake.transactionlog.TransactionLogParser.readLastCheckpoint;
import static io.trino.plugin.deltalake.transactionlog.TransactionLogUtil.getTransactionLogJsonEntryPath;
import static io.trino.plugin.deltalake.transactionlog.checkpoint.CheckpointEntryIterator.EntryType.ADD;
import static io.trino.plugin.deltalake.transactionlog.checkpoint.CheckpointEntryIterator.EntryType.COMMIT;
import static io.trino.plugin.deltalake.transactionlog.checkpoint.CheckpointEntryIterator.EntryType.METADATA;
import static io.trino.plugin.deltalake.transactionlog.checkpoint.CheckpointEntryIterator.EntryType.PROTOCOL;
import static io.trino.plugin.deltalake.transactionlog.checkpoint.CheckpointEntryIterator.EntryType.REMOVE;
import static io.trino.plugin.deltalake.transactionlog.checkpoint.TransactionLogTail.getEntriesFromJson;
import static java.lang.String.format;
import static java.util.Objects.requireNonNull;
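/**
 * Entry point for reading a Delta Lake table's transaction log. Loads {@link TableSnapshot}s,
 * resolves the latest {@link MetadataEntry}, and reconstructs the set of active data files by
 * combining the last checkpoint with the JSON commit tail. Snapshots and active-file lists are
 * cached per table location and invalidated explicitly or by TTL.
 *
 * <p>Illustrative call sequence (a sketch only; the session, protocol entry, and projected
 * columns come from the surrounding connector metadata layer):
 * <pre>{@code
 * TableSnapshot snapshot = transactionLogAccess.loadSnapshot(session, schemaTableName, tableLocation);
 * MetadataEntry metadata = transactionLogAccess.getMetadataEntry(session, snapshot);
 * try (Stream<AddFileEntry> files = transactionLogAccess.getActiveFiles(
 *         session, snapshot, metadata, protocolEntry, TupleDomain.all(), projectedColumns)) {
 *     // plan splits from the active files
 * }
 * }</pre>
 */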
public class TransactionLogAccess
{
private final TypeManager typeManager;
private final CheckpointSchemaManager checkpointSchemaManager;
private final FileFormatDataSourceStats fileFormatDataSourceStats;
private final TrinoFileSystemFactory fileSystemFactory;
private final ParquetReaderOptions parquetReaderOptions;
private final boolean checkpointRowStatisticsWritingEnabled;
private final int domainCompactionThreshold;
private final Cache<TableLocation, TableSnapshot> tableSnapshots;
private final Cache<TableVersion, DeltaLakeDataFileCacheEntry> activeDataFileCache;
@Inject
public TransactionLogAccess(
TypeManager typeManager,
CheckpointSchemaManager checkpointSchemaManager,
DeltaLakeConfig deltaLakeConfig,
FileFormatDataSourceStats fileFormatDataSourceStats,
TrinoFileSystemFactory fileSystemFactory,
ParquetReaderConfig parquetReaderConfig)
{
this.typeManager = requireNonNull(typeManager, "typeManager is null");
this.checkpointSchemaManager = requireNonNull(checkpointSchemaManager, "checkpointSchemaManager is null");
this.fileFormatDataSourceStats = requireNonNull(fileFormatDataSourceStats, "fileFormatDataSourceStats is null");
this.fileSystemFactory = requireNonNull(fileSystemFactory, "fileSystemFactory is null");
this.parquetReaderOptions = parquetReaderConfig.toParquetReaderOptions().withBloomFilter(false);
this.checkpointRowStatisticsWritingEnabled = deltaLakeConfig.isCheckpointRowStatisticsWritingEnabled();
this.domainCompactionThreshold = deltaLakeConfig.getDomainCompactionThreshold();
tableSnapshots = EvictableCacheBuilder.newBuilder()
.expireAfterWrite(deltaLakeConfig.getMetadataCacheTtl().toMillis(), TimeUnit.MILLISECONDS)
.maximumSize(deltaLakeConfig.getMetadataCacheMaxSize())
.shareNothingWhenDisabled()
.recordStats()
.build();
activeDataFileCache = EvictableCacheBuilder.newBuilder()
.weigher((Weigher<TableVersion, DeltaLakeDataFileCacheEntry>) (key, value) -> Ints.saturatedCast(key.getRetainedSizeInBytes() + value.getRetainedSizeInBytes()))
.maximumWeight(deltaLakeConfig.getDataFileCacheSize().toBytes())
.expireAfterWrite(deltaLakeConfig.getDataFileCacheTtl().toMillis(), TimeUnit.MILLISECONDS)
.shareNothingWhenDisabled()
.recordStats()
.build();
}
@Managed
@Nested
public CacheStatsMBean getDataFileMetadataCacheStats()
{
return new CacheStatsMBean(activeDataFileCache);
}
@Managed
@Nested
public CacheStatsMBean getMetadataCacheStats()
{
return new CacheStatsMBean(tableSnapshots);
}
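/**
 * Returns the current snapshot of the table. A cached snapshot is reused when present; otherwise the
 * {@code _last_checkpoint} file is read and a fresh snapshot is loaded and cached. A cached snapshot is
 * replaced when newer transaction log entries are found.
 */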
public TableSnapshot loadSnapshot(ConnectorSession session, SchemaTableName table, String tableLocation)
throws IOException
{
TableLocation cacheKey = new TableLocation(table, tableLocation);
TableSnapshot cachedSnapshot = tableSnapshots.getIfPresent(cacheKey);
TableSnapshot snapshot;
TrinoFileSystem fileSystem = fileSystemFactory.create(session);
if (cachedSnapshot == null) {
try {
Optional<LastCheckpoint> lastCheckpoint = readLastCheckpoint(fileSystem, tableLocation);
snapshot = tableSnapshots.get(cacheKey, () ->
TableSnapshot.load(
table,
lastCheckpoint,
fileSystem,
tableLocation,
parquetReaderOptions,
checkpointRowStatisticsWritingEnabled,
domainCompactionThreshold));
}
catch (UncheckedExecutionException | ExecutionException e) {
throwIfUnchecked(e.getCause());
throw new RuntimeException(e);
}
}
else {
Optional<TableSnapshot> updatedSnapshot = cachedSnapshot.getUpdatedSnapshot(fileSystem, Optional.empty());
if (updatedSnapshot.isPresent()) {
snapshot = updatedSnapshot.get();
tableSnapshots.asMap().replace(cacheKey, cachedSnapshot, snapshot);
}
else {
snapshot = cachedSnapshot;
}
}
return snapshot;
}
public void flushCache()
{
tableSnapshots.invalidateAll();
activeDataFileCache.invalidateAll();
}
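/**
 * Drops cached snapshots and active-file lists for the given table, matching by table name and, when
 * the location is known, by location as well.
 */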
public void invalidateCache(SchemaTableName schemaTableName, Optional<String> tableLocation)
{
requireNonNull(schemaTableName, "schemaTableName is null");
// Invalidate by location in case one table (location) unregistered and re-register under different name
tableLocation.ifPresent(location -> {
invalidateAllIf(tableSnapshots, cacheKey -> cacheKey.location().equals(location));
invalidateAllIf(activeDataFileCache, cacheKey -> cacheKey.tableLocation().location().equals(location));
});
invalidateAllIf(tableSnapshots, cacheKey -> cacheKey.tableName().equals(schemaTableName));
invalidateAllIf(activeDataFileCache, cacheKey -> cacheKey.tableLocation().tableName().equals(schemaTableName));
}
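/**
 * Returns the latest metadata entry for the snapshot, reading it from the transaction log on first
 * access and caching it on the {@link TableSnapshot}.
 */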
public MetadataEntry getMetadataEntry(ConnectorSession session, TableSnapshot tableSnapshot)
{
if (tableSnapshot.getCachedMetadata().isEmpty()) {
try (Stream<MetadataEntry> metadataEntries = getEntries(
session,
tableSnapshot,
METADATA,
entryStream -> entryStream.map(DeltaLakeTransactionLogEntry::getMetaData).filter(Objects::nonNull),
fileSystemFactory.create(session),
fileFormatDataSourceStats)) {
// Get last entry in the stream
tableSnapshot.setCachedMetadata(metadataEntries.reduce((first, second) -> second));
}
}
return tableSnapshot.getCachedMetadata()
.orElseThrow(() -> new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Metadata not found in transaction log for " + tableSnapshot.getTable()));
}
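/**
 * Streams the {@link AddFileEntry} records that are live in this snapshot, pruned by the partition
 * constraint. Only statistics for base columns of the projection are requested, since statistics for
 * nested columns are not supported here.
 */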
public Stream<AddFileEntry> getActiveFiles(
ConnectorSession session,
TableSnapshot tableSnapshot,
MetadataEntry metadataEntry,
ProtocolEntry protocolEntry,
TupleDomain<DeltaLakeColumnHandle> partitionConstraint,
Set<DeltaLakeColumnHandle> projectedColumns)
{
Set<String> baseColumnNames = projectedColumns.stream()
.filter(DeltaLakeColumnHandle::isBaseColumn) // Only base column stats are supported
.map(DeltaLakeColumnHandle::getColumnName)
.collect(toImmutableSet());
return getActiveFiles(session, tableSnapshot, metadataEntry, protocolEntry, partitionConstraint, baseColumnNames::contains);
}
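/**
 * Variant that accepts an explicit min/max statistics column filter. When checkpoint filtering is
 * disabled, the full active-file list is served from {@code activeDataFileCache}; if an older cached
 * version of the same table is available, it is brought up to date by replaying only the newer JSON
 * commits instead of re-reading the whole log.
 */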
public Stream<AddFileEntry> getActiveFiles(
ConnectorSession session,
TableSnapshot tableSnapshot,
MetadataEntry metadataEntry,
ProtocolEntry protocolEntry,
TupleDomain<DeltaLakeColumnHandle> partitionConstraint,
Predicate<String> addStatsMinMaxColumnFilter)
{
try {
if (isCheckpointFilteringEnabled(session)) {
return loadActiveFiles(session, tableSnapshot, metadataEntry, protocolEntry, partitionConstraint, addStatsMinMaxColumnFilter);
}
TableVersion tableVersion = new TableVersion(new TableLocation(tableSnapshot.getTable(), tableSnapshot.getTableLocation()), tableSnapshot.getVersion());
DeltaLakeDataFileCacheEntry cacheEntry = activeDataFileCache.get(tableVersion, () -> {
DeltaLakeDataFileCacheEntry oldCached = activeDataFileCache.asMap().keySet().stream()
.filter(key -> key.tableLocation().equals(tableVersion.tableLocation()) &&
key.version() < tableVersion.version())
.flatMap(key -> Optional.ofNullable(activeDataFileCache.getIfPresent(key))
.map(value -> Map.entry(key, value))
.stream())
.max(Comparator.comparing(entry -> entry.getKey().version()))
.map(Map.Entry::getValue)
.orElse(null);
if (oldCached != null) {
try {
List<DeltaLakeTransactionLogEntry> newEntries = getJsonEntries(
oldCached.getVersion(),
tableSnapshot.getVersion(),
tableSnapshot,
fileSystemFactory.create(session));
return oldCached.withUpdatesApplied(newEntries, tableSnapshot.getVersion());
}
catch (MissingTransactionLogException e) {
// The cached state cannot be used to calculate current state, as some
// intermediate transaction files are expired.
}
}
List<AddFileEntry> activeFiles;
try (Stream<AddFileEntry> addFileEntryStream = loadActiveFiles(session, tableSnapshot, metadataEntry, protocolEntry, TupleDomain.all(), alwaysTrue())) {
activeFiles = addFileEntryStream.collect(toImmutableList());
}
return new DeltaLakeDataFileCacheEntry(tableSnapshot.getVersion(), activeFiles);
});
return cacheEntry.getActiveFiles().stream();
}
catch (ExecutionException | UncheckedExecutionException e) {
throw new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Failed accessing transaction log for table: " + tableSnapshot.getTable(), e);
}
}
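/**
 * Reads ADD entries from the checkpoint (pushing down the partition constraint and the statistics
 * column filter) and reconciles them with the JSON transaction tail to produce the current set of
 * active files.
 */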
private Stream<AddFileEntry> loadActiveFiles(
ConnectorSession session,
TableSnapshot tableSnapshot,
MetadataEntry metadataEntry,
ProtocolEntry protocolEntry,
TupleDomain<DeltaLakeColumnHandle> partitionConstraint,
Predicate<String> addStatsMinMaxColumnFilter)
{
List<Transaction> transactions = tableSnapshot.getTransactions();
try (Stream<DeltaLakeTransactionLogEntry> checkpointEntries = tableSnapshot.getCheckpointTransactionLogEntries(
session,
ImmutableSet.of(ADD),
checkpointSchemaManager,
typeManager,
fileSystemFactory.create(session),
fileFormatDataSourceStats,
Optional.of(new MetadataAndProtocolEntry(metadataEntry, protocolEntry)),
partitionConstraint,
Optional.of(addStatsMinMaxColumnFilter))) {
return activeAddEntries(checkpointEntries, transactions)
.filter(partitionConstraint.isAll()
? addAction -> true
: addAction -> partitionMatchesPredicate(addAction.getCanonicalPartitionValues(), partitionConstraint.getDomains().orElseThrow()));
}
catch (IOException e) {
throw new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Error reading transaction log for " + tableSnapshot.getTable(), e);
}
}
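/**
 * Returns the columns that can carry file-level min/max statistics: partition columns and columns of
 * map, array, boolean, or varbinary type are excluded.
 */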
public static List<DeltaLakeColumnMetadata> columnsWithStats(MetadataEntry metadataEntry, ProtocolEntry protocolEntry, TypeManager typeManager)
{
return columnsWithStats(DeltaLakeSchemaSupport.extractSchema(metadataEntry, protocolEntry, typeManager), metadataEntry.getOriginalPartitionColumns());
}
public static ImmutableList<DeltaLakeColumnMetadata> columnsWithStats(List<DeltaLakeColumnMetadata> schema, List<String> partitionColumns)
{
return schema.stream()
.filter(column -> !partitionColumns.contains(column.getName()))
.filter(column -> {
Type type = column.getType();
return !(type instanceof MapType || type instanceof ArrayType || type.equals(BooleanType.BOOLEAN) || type.equals(VarbinaryType.VARBINARY));
})
.collect(toImmutableList());
}
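/**
 * Merges checkpoint ADD entries with the JSON transaction tail: files removed or re-added in later
 * commits override the checkpoint, and within a commit the removes are applied before the adds so
 * that deletion-vector rewrites keep the surviving ADD entry.
 */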
private Stream<AddFileEntry> activeAddEntries(Stream<DeltaLakeTransactionLogEntry> checkpointEntries, List<Transaction> transactions)
{
Map<String, AddFileEntry> activeJsonEntries = new LinkedHashMap<>();
HashSet<String> removedFiles = new HashSet<>();
// The json entries containing the last few entries in the log need to be applied on top of the parquet snapshot:
// - Any files which have been removed need to be excluded
// - Any files with newer add actions need to be updated with the most recent metadata
transactions.forEach(transaction -> {
Map<String, AddFileEntry> addFilesInTransaction = new LinkedHashMap<>();
Set<String> removedFilesInTransaction = new HashSet<>();
transaction.transactionEntries().forEach(deltaLakeTransactionLogEntry -> {
if (deltaLakeTransactionLogEntry.getAdd() != null) {
addFilesInTransaction.put(deltaLakeTransactionLogEntry.getAdd().getPath(), deltaLakeTransactionLogEntry.getAdd());
}
else if (deltaLakeTransactionLogEntry.getRemove() != null) {
removedFilesInTransaction.add(deltaLakeTransactionLogEntry.getRemove().path());
}
});
// Process 'remove' entries first because deletion vectors register both 'add' and 'remove' entries and the 'add' entry should be kept
removedFiles.addAll(removedFilesInTransaction);
removedFilesInTransaction.forEach(activeJsonEntries::remove);
activeJsonEntries.putAll(addFilesInTransaction);
});
Stream<AddFileEntry> filteredCheckpointEntries = checkpointEntries
.map(DeltaLakeTransactionLogEntry::getAdd)
.filter(Objects::nonNull)
.filter(addEntry -> !removedFiles.contains(addEntry.getPath()) && !activeJsonEntries.containsKey(addEntry.getPath()));
return Stream.concat(filteredCheckpointEntries, activeJsonEntries.values().stream());
}
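/**
 * Streams the REMOVE entries recorded in the transaction log for this snapshot.
 */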
public Stream<RemoveFileEntry> getRemoveEntries(ConnectorSession session, TableSnapshot tableSnapshot)
{
return getEntries(
session,
tableSnapshot,
REMOVE,
entryStream -> entryStream.map(DeltaLakeTransactionLogEntry::getRemove).filter(Objects::nonNull),
fileSystemFactory.create(session),
fileFormatDataSourceStats);
}
public Map<Class<?>, Object> getTransactionLogEntries(
ConnectorSession session,
TableSnapshot tableSnapshot,
Set<CheckpointEntryIterator.EntryType> entryTypes,
Function, Stream