/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.trino.plugin.deltalake.transactionlog.checkpoint;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.AbstractIterator;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.math.LongMath;
import io.airlift.log.Logger;
import io.trino.filesystem.TrinoInputFile;
import io.trino.parquet.Column;
import io.trino.parquet.Field;
import io.trino.parquet.ParquetReaderOptions;
import io.trino.plugin.deltalake.DeltaHiveTypeTranslator;
import io.trino.plugin.deltalake.DeltaLakeColumnHandle;
import io.trino.plugin.deltalake.DeltaLakeColumnMetadata;
import io.trino.plugin.deltalake.transactionlog.AddFileEntry;
import io.trino.plugin.deltalake.transactionlog.CommitInfoEntry;
import io.trino.plugin.deltalake.transactionlog.DeletionVectorEntry;
import io.trino.plugin.deltalake.transactionlog.DeltaLakeTransactionLogEntry;
import io.trino.plugin.deltalake.transactionlog.MetadataEntry;
import io.trino.plugin.deltalake.transactionlog.ProtocolEntry;
import io.trino.plugin.deltalake.transactionlog.RemoveFileEntry;
import io.trino.plugin.deltalake.transactionlog.TransactionEntry;
import io.trino.plugin.deltalake.transactionlog.statistics.DeltaLakeParquetFileStatistics;
import io.trino.plugin.hive.FileFormatDataSourceStats;
import io.trino.plugin.hive.HiveColumnHandle;
import io.trino.plugin.hive.HiveColumnHandle.ColumnType;
import io.trino.plugin.hive.HiveColumnProjectionInfo;
import io.trino.plugin.hive.HiveType;
import io.trino.plugin.hive.ReaderPageSource;
import io.trino.plugin.hive.parquet.ParquetPageSource;
import io.trino.plugin.hive.parquet.ParquetPageSourceFactory;
import io.trino.spi.Page;
import io.trino.spi.TrinoException;
import io.trino.spi.block.Block;
import io.trino.spi.block.LongArrayBlock;
import io.trino.spi.block.SqlRow;
import io.trino.spi.block.ValueBlock;
import io.trino.spi.connector.ConnectorSession;
import io.trino.spi.predicate.Domain;
import io.trino.spi.predicate.TupleDomain;
import io.trino.spi.type.ArrayType;
import io.trino.spi.type.MapType;
import io.trino.spi.type.RowType;
import io.trino.spi.type.TimestampWithTimeZoneType;
import io.trino.spi.type.Type;
import io.trino.spi.type.TypeManager;
import io.trino.spi.type.TypeSignature;
import jakarta.annotation.Nullable;
import org.joda.time.DateTimeZone;
import java.util.ArrayDeque;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.OptionalInt;
import java.util.OptionalLong;
import java.util.Queue;
import java.util.Set;
import java.util.function.Predicate;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkState;
import static com.google.common.base.Verify.verify;
import static com.google.common.collect.ImmutableList.toImmutableList;
import static com.google.common.collect.MoreCollectors.onlyElement;
import static com.google.common.collect.MoreCollectors.toOptional;
import static io.trino.plugin.deltalake.DeltaLakeColumnType.REGULAR;
import static io.trino.plugin.deltalake.DeltaLakeErrorCode.DELTA_LAKE_INVALID_SCHEMA;
import static io.trino.plugin.deltalake.DeltaLakeSplitManager.partitionMatchesPredicate;
import static io.trino.plugin.deltalake.transactionlog.DeltaLakeSchemaSupport.extractSchema;
import static io.trino.plugin.deltalake.transactionlog.DeltaLakeSchemaSupport.isDeletionVectorEnabled;
import static io.trino.plugin.deltalake.transactionlog.TransactionLogAccess.columnsWithStats;
import static io.trino.plugin.deltalake.transactionlog.TransactionLogParser.START_OF_MODERN_ERA_EPOCH_DAY;
import static io.trino.plugin.deltalake.transactionlog.TransactionLogUtil.canonicalizePartitionValues;
import static io.trino.plugin.deltalake.transactionlog.checkpoint.CheckpointEntryIterator.EntryType.ADD;
import static io.trino.plugin.deltalake.transactionlog.checkpoint.CheckpointEntryIterator.EntryType.COMMIT;
import static io.trino.plugin.deltalake.transactionlog.checkpoint.CheckpointEntryIterator.EntryType.METADATA;
import static io.trino.plugin.deltalake.transactionlog.checkpoint.CheckpointEntryIterator.EntryType.PROTOCOL;
import static io.trino.plugin.deltalake.transactionlog.checkpoint.CheckpointEntryIterator.EntryType.REMOVE;
import static io.trino.plugin.deltalake.transactionlog.checkpoint.CheckpointEntryIterator.EntryType.TRANSACTION;
import static io.trino.spi.type.BigintType.BIGINT;
import static io.trino.spi.type.DateTimeEncoding.packDateTimeWithZone;
import static io.trino.spi.type.TimeZoneKey.UTC_KEY;
import static io.trino.spi.type.TimestampType.TIMESTAMP_MILLIS;
import static io.trino.spi.type.Timestamps.MICROSECONDS_PER_MILLISECOND;
import static io.trino.spi.type.Timestamps.MILLISECONDS_PER_DAY;
import static io.trino.spi.type.TypeUtils.readNativeValue;
import static io.trino.spi.type.VarcharType.VARCHAR;
import static java.lang.Math.floorDiv;
import static java.lang.String.format;
import static java.math.RoundingMode.UNNECESSARY;
import static java.util.Objects.requireNonNull;
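/**
* Iterator over the entries of a single Delta Lake checkpoint Parquet file. Each requested {@link EntryType}
* is read from its corresponding top-level column ("txn", "add", "remove", "metadata", "protocol", "commitinfo")
* and materialized into a {@link DeltaLakeTransactionLogEntry}. A not-null predicate on a required sub-field of
* each requested entry type, plus partition predicates for ADD entries, is pushed down to the Parquet reader so
* that rows of other entry types can be skipped before they are materialized.
*/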
public class CheckpointEntryIterator
extends AbstractIterator<DeltaLakeTransactionLogEntry>
{
public enum EntryType
{
TRANSACTION("txn"),
ADD("add"),
REMOVE("remove"),
METADATA("metadata"),
PROTOCOL("protocol"),
COMMIT("commitinfo");
private final String columnName;
EntryType(String columnName)
{
this.columnName = columnName;
}
public String getColumnName()
{
return columnName;
}
}
private static final Logger log = Logger.get(CheckpointEntryIterator.class);
private final String checkpointPath;
private final ConnectorSession session;
private final ParquetPageSource pageSource;
private final MapType stringMap;
private final ArrayType stringList;
private final Queue<DeltaLakeTransactionLogEntry> nextEntries;
private final List<CheckpointFieldExtractor> extractors;
private final boolean checkpointRowStatisticsWritingEnabled;
private final TupleDomain<DeltaLakeColumnHandle> partitionConstraint;
private final Optional<RowType> txnType;
private final Optional<RowType> addType;
private final Optional<RowType> addPartitionValuesType;
private final Optional<RowType> addDeletionVectorType;
private final Optional<RowType> addParsedStatsFieldType;
private final Optional<RowType> removeType;
private final Optional<RowType> metadataType;
private final Optional<RowType> protocolType;
private final Optional<RowType> commitType;
private MetadataEntry metadataEntry;
private ProtocolEntry protocolEntry;
private boolean deletionVectorsEnabled;
private List<DeltaLakeColumnMetadata> schema;
private List<DeltaLakeColumnMetadata> columnsWithMinMaxStats;
private Page page;
private int pagePosition;
public CheckpointEntryIterator(
TrinoInputFile checkpoint,
ConnectorSession session,
long fileSize,
CheckpointSchemaManager checkpointSchemaManager,
TypeManager typeManager,
Set<EntryType> fields,
Optional<MetadataEntry> metadataEntry,
Optional<ProtocolEntry> protocolEntry,
FileFormatDataSourceStats stats,
ParquetReaderOptions parquetReaderOptions,
boolean checkpointRowStatisticsWritingEnabled,
int domainCompactionThreshold,
TupleDomain<DeltaLakeColumnHandle> partitionConstraint,
Optional<Predicate<String>> addStatsMinMaxColumnFilter)
{
this.checkpointPath = checkpoint.location().toString();
this.session = requireNonNull(session, "session is null");
this.stringList = (ArrayType) typeManager.getType(TypeSignature.arrayType(VARCHAR.getTypeSignature()));
this.stringMap = (MapType) typeManager.getType(TypeSignature.mapType(VARCHAR.getTypeSignature(), VARCHAR.getTypeSignature()));
this.checkpointRowStatisticsWritingEnabled = checkpointRowStatisticsWritingEnabled;
this.partitionConstraint = requireNonNull(partitionConstraint, "partitionConstraint is null");
requireNonNull(addStatsMinMaxColumnFilter, "addStatsMinMaxColumnFilter is null");
checkArgument(!fields.isEmpty(), "fields is empty");
// ADD requires knowing the metadata in order to figure out the Parquet schema
if (fields.contains(ADD)) {
checkArgument(metadataEntry.isPresent(), "Metadata entry must be provided when reading ADD entries from Checkpoint files");
this.metadataEntry = metadataEntry.get();
checkArgument(protocolEntry.isPresent(), "Protocol entry must be provided when reading ADD entries from Checkpoint files");
this.protocolEntry = protocolEntry.get();
deletionVectorsEnabled = isDeletionVectorEnabled(this.metadataEntry, this.protocolEntry);
checkArgument(addStatsMinMaxColumnFilter.isPresent(), "addStatsMinMaxColumnFilter must be provided when reading ADD entries from Checkpoint files");
this.schema = extractSchema(this.metadataEntry, this.protocolEntry, typeManager);
this.columnsWithMinMaxStats = columnsWithStats(schema, this.metadataEntry.getOriginalPartitionColumns());
Predicate<String> columnStatsFilterFunction = addStatsMinMaxColumnFilter.orElseThrow();
this.columnsWithMinMaxStats = columnsWithMinMaxStats.stream()
.filter(column -> columnStatsFilterFunction.test(column.getName()))
.collect(toImmutableList());
}
ImmutableList.Builder<HiveColumnHandle> columnsBuilder = ImmutableList.builderWithExpectedSize(fields.size());
ImmutableList.Builder<TupleDomain<HiveColumnHandle>> disjunctDomainsBuilder = ImmutableList.builderWithExpectedSize(fields.size());
for (EntryType field : fields) {
HiveColumnHandle column = buildColumnHandle(field, checkpointSchemaManager, this.metadataEntry, this.protocolEntry, addStatsMinMaxColumnFilter).toHiveColumnHandle();
columnsBuilder.add(column);
disjunctDomainsBuilder.add(buildTupleDomainColumnHandle(field, column));
if (field == ADD) {
Type addEntryPartitionValuesType = checkpointSchemaManager.getAddEntryPartitionValuesType();
columnsBuilder.add(new DeltaLakeColumnHandle("add", addEntryPartitionValuesType, OptionalInt.empty(), "add", addEntryPartitionValuesType, REGULAR, Optional.empty()).toHiveColumnHandle());
}
}
ReaderPageSource pageSource = ParquetPageSourceFactory.createPageSource(
checkpoint,
0,
fileSize,
columnsBuilder.build(),
disjunctDomainsBuilder.build(), // OR-ed condition
true,
DateTimeZone.UTC,
stats,
parquetReaderOptions,
Optional.empty(),
domainCompactionThreshold,
OptionalLong.empty());
verify(pageSource.getReaderColumns().isEmpty(), "All columns expected to be base columns");
this.pageSource = (ParquetPageSource) pageSource.get();
this.nextEntries = new ArrayDeque<>();
this.extractors = fields.stream()
.map(this::createCheckpointFieldExtractor)
.collect(toImmutableList());
txnType = getParquetType(fields, TRANSACTION);
addType = getAddParquetTypeContainingField(fields, "path");
addPartitionValuesType = getAddParquetTypeContainingField(fields, "partitionValues");
addDeletionVectorType = addType.flatMap(type -> getOptionalFieldType(type, "deletionVector"));
addParsedStatsFieldType = addType.flatMap(type -> getOptionalFieldType(type, "stats_parsed"));
removeType = getParquetType(fields, REMOVE);
metadataType = getParquetType(fields, METADATA);
protocolType = getParquetType(fields, PROTOCOL);
commitType = getParquetType(fields, COMMIT);
}
private static Optional<RowType> getOptionalFieldType(RowType type, String fieldName)
{
return type.getFields().stream()
.filter(field -> field.getName().orElseThrow().equals(fieldName))
.collect(toOptional())
.map(RowType.Field::getType)
.map(RowType.class::cast);
}
private Optional<RowType> getAddParquetTypeContainingField(Set<EntryType> fields, String fieldName)
{
return fields.contains(ADD) ?
this.pageSource.getColumnFields().stream()
.filter(column -> column.name().equals(ADD.getColumnName()) &&
column.field().getType() instanceof RowType rowType &&
rowType.getFields().stream().map(RowType.Field::getName).filter(Optional::isPresent).flatMap(Optional::stream).anyMatch(fieldName::equals))
// The field even if it was requested might not exist in Parquet file
.collect(toOptional())
.map(Column::field)
.map(Field::getType)
.map(RowType.class::cast)
: Optional.empty();
}
private Optional<RowType> getParquetType(Set<EntryType> fields, EntryType field)
{
return fields.contains(field) ? getParquetType(field.getColumnName()).map(RowType.class::cast) : Optional.empty();
}
private Optional<Type> getParquetType(String columnName)
{
return pageSource.getColumnFields().stream()
.filter(column -> column.name().equals(columnName))
// The field even if it was requested may not exist in Parquet file
.collect(toOptional())
.map(Column::field)
.map(Field::getType);
}
private CheckpointFieldExtractor createCheckpointFieldExtractor(EntryType entryType)
{
return switch (entryType) {
case TRANSACTION -> (session, pagePosition, blocks) -> buildTxnEntry(session, pagePosition, blocks[0]);
case ADD -> new AddFileEntryExtractor();
case REMOVE -> (session, pagePosition, blocks) -> buildRemoveEntry(session, pagePosition, blocks[0]);
case METADATA -> (session, pagePosition, blocks) -> buildMetadataEntry(session, pagePosition, blocks[0]);
case PROTOCOL -> (session, pagePosition, blocks) -> buildProtocolEntry(session, pagePosition, blocks[0]);
case COMMIT -> (session, pagePosition, blocks) -> buildCommitInfoEntry(session, pagePosition, blocks[0]);
};
}
private DeltaLakeColumnHandle buildColumnHandle(
EntryType entryType,
CheckpointSchemaManager schemaManager,
MetadataEntry metadataEntry,
ProtocolEntry protocolEntry,
Optional<Predicate<String>> addStatsMinMaxColumnFilter)
{
Type type = switch (entryType) {
case TRANSACTION -> schemaManager.getTxnEntryType();
case ADD -> schemaManager.getAddEntryType(metadataEntry, protocolEntry, addStatsMinMaxColumnFilter.orElseThrow(), true, true, false);
case REMOVE -> schemaManager.getRemoveEntryType();
case METADATA -> schemaManager.getMetadataEntryType();
case PROTOCOL -> schemaManager.getProtocolEntryType(true, true);
case COMMIT -> schemaManager.getCommitInfoEntryType();
};
return new DeltaLakeColumnHandle(entryType.getColumnName(), type, OptionalInt.empty(), entryType.getColumnName(), type, REGULAR, Optional.empty());
}
/**
* Constructs a TupleDomain which filters on a specific required primitive sub-column of the EntryType being
* non-null, effectively pushing the predicate down to the Parquet reader.
*
* The particular field we select for each action is a required field per the Delta Lake protocol specification; see
* https://github.com/delta-io/delta/blob/master/PROTOCOL.md#Actions This is also enforced when we read entries.
*/
private TupleDomain<HiveColumnHandle> buildTupleDomainColumnHandle(EntryType entryType, HiveColumnHandle column)
{
String field;
Type type;
switch (entryType) {
case COMMIT, TRANSACTION -> {
field = "version";
type = BIGINT;
}
case ADD, REMOVE -> {
field = "path";
type = VARCHAR;
}
case METADATA -> {
field = "id";
type = VARCHAR;
}
case PROTOCOL -> {
field = "minReaderVersion";
type = BIGINT;
}
default -> throw new IllegalArgumentException("Unsupported Delta Lake checkpoint entry type: " + entryType);
}
HiveColumnHandle handle = new HiveColumnHandle(
column.getBaseColumnName(),
column.getBaseHiveColumnIndex(),
column.getBaseHiveType(),
column.getBaseType(),
Optional.of(new HiveColumnProjectionInfo(
ImmutableList.of(0), // hiveColumnIndex; we provide fake value because we always find columns by name
ImmutableList.of(field),
HiveType.toHiveType(type),
type)),
ColumnType.REGULAR,
column.getComment());
ImmutableMap.Builder<HiveColumnHandle, Domain> domains = ImmutableMap.<HiveColumnHandle, Domain>builder()
.put(handle, Domain.notNull(handle.getType()));
if (entryType == ADD) {
partitionConstraint.getDomains().orElseThrow().forEach((key, value) -> domains.put(toPartitionValuesParsedField(column, key), value));
}
return TupleDomain.withColumnDomains(domains.buildOrThrow());
}
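// Builds a synthetic projection handle targeting add.partitionvalues_parsed.<partition column>, so that
// partition predicates can be pushed down to the Parquet reader alongside the not-null predicate on the base column.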
private static HiveColumnHandle toPartitionValuesParsedField(HiveColumnHandle addColumn, DeltaLakeColumnHandle partitionColumn)
{
return new HiveColumnHandle(
addColumn.getBaseColumnName(),
addColumn.getBaseHiveColumnIndex(),
addColumn.getBaseHiveType(),
addColumn.getBaseType(),
Optional.of(new HiveColumnProjectionInfo(
ImmutableList.of(0, 0), // hiveColumnIndex; we provide fake value because we always find columns by name
ImmutableList.of("partitionvalues_parsed", partitionColumn.getColumnName()),
DeltaHiveTypeTranslator.toHiveType(partitionColumn.getType()),
partitionColumn.getType())),
HiveColumnHandle.ColumnType.REGULAR,
addColumn.getComment());
}
private DeltaLakeTransactionLogEntry buildCommitInfoEntry(ConnectorSession session, int pagePosition, Block block)
{
log.debug("Building commitInfo entry from %s pagePosition %d", block, pagePosition);
if (block.isNull(pagePosition)) {
return null;
}
RowType type = commitType.orElseThrow();
int commitInfoFields = 12;
int jobFields = 5;
int notebookFields = 1;
SqlRow commitInfoRow = block.getObject(pagePosition, SqlRow.class);
CheckpointFieldReader commitInfo = new CheckpointFieldReader(session, commitInfoRow, type);
log.debug("Block %s has %s fields", block, commitInfoRow.getFieldCount());
if (commitInfoRow.getFieldCount() != commitInfoFields) {
throw new TrinoException(DELTA_LAKE_INVALID_SCHEMA,
format("Expected block %s to have %d children, but found %s", block, commitInfoFields, commitInfoRow.getFieldCount()));
}
SqlRow jobRow = commitInfo.getRow("job");
if (jobRow.getFieldCount() != jobFields) {
throw new TrinoException(DELTA_LAKE_INVALID_SCHEMA,
format("Expected block %s to have %d children, but found %s", jobRow, jobFields, jobRow.getFieldCount()));
}
RowType.Field jobField = type.getFields().stream().filter(field -> field.getName().orElseThrow().equals("job")).collect(onlyElement());
CheckpointFieldReader job = new CheckpointFieldReader(session, jobRow, (RowType) jobField.getType());
SqlRow notebookRow = commitInfo.getRow("notebook");
if (notebookRow.getFieldCount() != notebookFields) {
throw new TrinoException(DELTA_LAKE_INVALID_SCHEMA,
format("Expected block %s to have %d children, but found %s", notebookRow, notebookFields, notebookRow.getFieldCount()));
}
RowType.Field notebookField = type.getFields().stream().filter(field -> field.getName().orElseThrow().equals("notebook")).collect(onlyElement());
CheckpointFieldReader notebook = new CheckpointFieldReader(session, notebookRow, (RowType) notebookField.getType());
CommitInfoEntry result = new CommitInfoEntry(
commitInfo.getLong("version"),
commitInfo.getLong("timestamp"),
commitInfo.getString("userId"),
commitInfo.getString("userName"),
commitInfo.getString("operation"),
commitInfo.getMap(stringMap, "operationParameters"),
new CommitInfoEntry.Job(
job.getString("jobId"),
job.getString("jobName"),
job.getString("runId"),
job.getString("jobOwnerId"),
job.getString("triggerType")),
new CommitInfoEntry.Notebook(
notebook.getString("notebookId")),
commitInfo.getString("clusterId"),
commitInfo.getInt("readVersion"),
commitInfo.getString("isolationLevel"),
Optional.of(commitInfo.getBoolean("isBlindAppend")));
log.debug("Result: %s", result);
return DeltaLakeTransactionLogEntry.commitInfoEntry(result);
}
private DeltaLakeTransactionLogEntry buildProtocolEntry(ConnectorSession session, int pagePosition, Block block)
{
log.debug("Building protocol entry from %s pagePosition %d", block, pagePosition);
if (block.isNull(pagePosition)) {
return null;
}
RowType type = protocolType.orElseThrow();
int minProtocolFields = 2;
int maxProtocolFields = 4;
SqlRow protocolEntryRow = block.getObject(pagePosition, SqlRow.class);
int fieldCount = protocolEntryRow.getFieldCount();
log.debug("Block %s has %s fields", block, fieldCount);
if (fieldCount < minProtocolFields || fieldCount > maxProtocolFields) {
throw new TrinoException(DELTA_LAKE_INVALID_SCHEMA,
format("Expected block %s to have between %d and %d children, but found %s", block, minProtocolFields, maxProtocolFields, fieldCount));
}
CheckpointFieldReader protocol = new CheckpointFieldReader(session, protocolEntryRow, type);
ProtocolEntry result = new ProtocolEntry(
protocol.getInt("minReaderVersion"),
protocol.getInt("minWriterVersion"),
protocol.getOptionalSet(stringList, "readerFeatures"),
protocol.getOptionalSet(stringList, "writerFeatures"));
log.debug("Result: %s", result);
return DeltaLakeTransactionLogEntry.protocolEntry(result);
}
private DeltaLakeTransactionLogEntry buildMetadataEntry(ConnectorSession session, int pagePosition, Block block)
{
log.debug("Building metadata entry from %s pagePosition %d", block, pagePosition);
if (block.isNull(pagePosition)) {
return null;
}
RowType type = metadataType.orElseThrow();
int metadataFields = 8;
int formatFields = 2;
SqlRow metadataEntryRow = block.getObject(pagePosition, SqlRow.class);
CheckpointFieldReader metadata = new CheckpointFieldReader(session, metadataEntryRow, type);
log.debug("Block %s has %s fields", block, metadataEntryRow.getFieldCount());
if (metadataEntryRow.getFieldCount() != metadataFields) {
throw new TrinoException(DELTA_LAKE_INVALID_SCHEMA,
format("Expected block %s to have %d children, but found %s", block, metadataFields, metadataEntryRow.getFieldCount()));
}
SqlRow formatRow = metadata.getRow("format");
if (formatRow.getFieldCount() != formatFields) {
throw new TrinoException(DELTA_LAKE_INVALID_SCHEMA,
format("Expected block %s to have %d children, but found %s", formatRow, formatFields, formatRow.getFieldCount()));
}
RowType.Field formatField = type.getFields().stream().filter(field -> field.getName().orElseThrow().equals("format")).collect(onlyElement());
CheckpointFieldReader format = new CheckpointFieldReader(session, formatRow, (RowType) formatField.getType());
MetadataEntry result = new MetadataEntry(
metadata.getString("id"),
metadata.getString("name"),
metadata.getString("description"),
new MetadataEntry.Format(
format.getString("provider"),
format.getMap(stringMap, "options")),
metadata.getString("schemaString"),
metadata.getList(stringList, "partitionColumns"),
metadata.getMap(stringMap, "configuration"),
metadata.getLong("createdTime"));
log.debug("Result: %s", result);
return DeltaLakeTransactionLogEntry.metadataEntry(result);
}
private DeltaLakeTransactionLogEntry buildRemoveEntry(ConnectorSession session, int pagePosition, Block block)
{
log.debug("Building remove entry from %s pagePosition %d", block, pagePosition);
if (block.isNull(pagePosition)) {
return null;
}
RowType type = removeType.orElseThrow();
int removeFields = 3;
SqlRow removeEntryRow = block.getObject(pagePosition, SqlRow.class);
log.debug("Block %s has %s fields", block, removeEntryRow.getFieldCount());
if (removeEntryRow.getFieldCount() != removeFields) {
throw new TrinoException(DELTA_LAKE_INVALID_SCHEMA,
format("Expected block %s to have %d children, but found %s", block, removeFields, removeEntryRow.getFieldCount()));
}
CheckpointFieldReader remove = new CheckpointFieldReader(session, removeEntryRow, type);
RemoveFileEntry result = new RemoveFileEntry(
remove.getString("path"),
remove.getLong("deletionTimestamp"),
remove.getBoolean("dataChange"));
log.debug("Result: %s", result);
return DeltaLakeTransactionLogEntry.removeFileEntry(result);
}
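// Reads ADD entries from two channels: the full "add" struct and a narrow projection of it containing only
// the partition values. The partition values are decoded first, so rows excluded by the partition constraint
// are discarded without materializing the full "add" block.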
private class AddFileEntryExtractor
implements CheckpointFieldExtractor
{
@Nullable
@Override
public DeltaLakeTransactionLogEntry getEntry(ConnectorSession session, int pagePosition, Block... blocks)
{
checkState(blocks.length == getRequiredChannels(), "Unexpected amount of blocks: %s", blocks.length);
Block addBlock = blocks[0];
Block addPartitionValuesBlock = blocks[1];
log.debug("Building add entry from %s pagePosition %d", addBlock, pagePosition);
if (addBlock.isNull(pagePosition)) {
return null;
}
checkState(!addPartitionValuesBlock.isNull(pagePosition), "Inconsistent blocks provided while building the add file entry");
SqlRow addPartitionValuesRow = addPartitionValuesBlock.getObject(pagePosition, SqlRow.class);
CheckpointFieldReader addPartitionValuesReader = new CheckpointFieldReader(session, addPartitionValuesRow, addPartitionValuesType.orElseThrow());
Map<String, String> partitionValues = addPartitionValuesReader.getMap(stringMap, "partitionValues");
Map<String, Optional<String>> canonicalPartitionValues = canonicalizePartitionValues(partitionValues);
if (!partitionConstraint.isAll() && !partitionMatchesPredicate(canonicalPartitionValues, partitionConstraint.getDomains().orElseThrow())) {
return null;
}
// Materialize from Parquet the information needed to build the AddEntry instance
addBlock = addBlock.getLoadedBlock();
SqlRow addEntryRow = addBlock.getObject(pagePosition, SqlRow.class);
log.debug("Block %s has %s fields", addBlock, addEntryRow.getFieldCount());
CheckpointFieldReader addReader = new CheckpointFieldReader(session, addEntryRow, addType.orElseThrow());
String path = addReader.getString("path");
long size = addReader.getLong("size");
long modificationTime = addReader.getLong("modificationTime");
boolean dataChange = addReader.getBoolean("dataChange");
Optional<DeletionVectorEntry> deletionVector = Optional.empty();
if (deletionVectorsEnabled) {
deletionVector = Optional.ofNullable(addReader.getRow("deletionVector"))
.map(row -> parseDeletionVectorFromParquet(session, row, addDeletionVectorType.orElseThrow()));
}
Optional<DeltaLakeParquetFileStatistics> parsedStats = Optional.ofNullable(addReader.getRow("stats_parsed"))
.map(row -> parseStatisticsFromParquet(session, row, addParsedStatsFieldType.orElseThrow()));
Optional<String> stats = Optional.empty();
if (parsedStats.isEmpty()) {
stats = Optional.ofNullable(addReader.getString("stats"));
}
Map<String, String> tags = addReader.getMap(stringMap, "tags");
AddFileEntry result = new AddFileEntry(
path,
partitionValues,
canonicalPartitionValues,
size,
modificationTime,
dataChange,
stats,
parsedStats,
tags,
deletionVector);
log.debug("Result: %s", result);
return DeltaLakeTransactionLogEntry.addFileEntry(result);
}
@Override
public int getRequiredChannels()
{
return 2;
}
}
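// Decodes the deletionVector struct of an add action; its five fields (storageType, pathOrInlineDv, offset,
// sizeInBytes, cardinality) describe where the deletion vector is stored and how many rows it marks as deleted.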
private DeletionVectorEntry parseDeletionVectorFromParquet(ConnectorSession session, SqlRow row, RowType type)
{
checkArgument(row.getFieldCount() == 5, "Deletion vector entry must have 5 fields");
CheckpointFieldReader deletionVector = new CheckpointFieldReader(session, row, type);
String storageType = deletionVector.getString("storageType");
String pathOrInlineDv = deletionVector.getString("pathOrInlineDv");
OptionalInt offset = deletionVector.getOptionalInt("offset");
int sizeInBytes = deletionVector.getInt("sizeInBytes");
long cardinality = deletionVector.getLong("cardinality");
return new DeletionVectorEntry(storageType, pathOrInlineDv, offset, sizeInBytes, cardinality);
}
private DeltaLakeParquetFileStatistics parseStatisticsFromParquet(ConnectorSession session, SqlRow statsRow, RowType type)
{
CheckpointFieldReader stats = new CheckpointFieldReader(session, statsRow, type);
long numRecords = stats.getLong("numRecords");
Optional