/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.facebook.presto.iceberg.optimizer;
import com.facebook.presto.common.function.OperatorType;
import com.facebook.presto.common.predicate.TupleDomain;
import com.facebook.presto.common.type.BigintType;
import com.facebook.presto.common.type.BooleanType;
import com.facebook.presto.common.type.TypeManager;
import com.facebook.presto.iceberg.ColumnIdentity;
import com.facebook.presto.iceberg.IcebergAbstractMetadata;
import com.facebook.presto.iceberg.IcebergColumnHandle;
import com.facebook.presto.iceberg.IcebergMetadataColumn;
import com.facebook.presto.iceberg.IcebergTableHandle;
import com.facebook.presto.iceberg.IcebergTableLayoutHandle;
import com.facebook.presto.iceberg.IcebergTableName;
import com.facebook.presto.iceberg.IcebergTableType;
import com.facebook.presto.iceberg.IcebergTransactionManager;
import com.facebook.presto.iceberg.IcebergUtil;
import com.facebook.presto.spi.ColumnHandle;
import com.facebook.presto.spi.ConnectorPlanOptimizer;
import com.facebook.presto.spi.ConnectorPlanRewriter;
import com.facebook.presto.spi.ConnectorSession;
import com.facebook.presto.spi.PrestoException;
import com.facebook.presto.spi.TableHandle;
import com.facebook.presto.spi.VariableAllocator;
import com.facebook.presto.spi.function.FunctionHandle;
import com.facebook.presto.spi.function.StandardFunctionResolution;
import com.facebook.presto.spi.plan.Assignments;
import com.facebook.presto.spi.plan.ConnectorJoinNode;
import com.facebook.presto.spi.plan.EquiJoinClause;
import com.facebook.presto.spi.plan.FilterNode;
import com.facebook.presto.spi.plan.JoinType;
import com.facebook.presto.spi.plan.PlanNode;
import com.facebook.presto.spi.plan.PlanNodeIdAllocator;
import com.facebook.presto.spi.plan.ProjectNode;
import com.facebook.presto.spi.plan.TableScanNode;
import com.facebook.presto.spi.relation.CallExpression;
import com.facebook.presto.spi.relation.RowExpression;
import com.facebook.presto.spi.relation.SpecialFormExpression;
import com.facebook.presto.spi.relation.VariableReferenceExpression;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.ImmutableSet.Builder;
import com.google.common.collect.Sets;
import org.apache.iceberg.DeleteFile;
import org.apache.iceberg.PartitionField;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.SchemaParser;
import org.apache.iceberg.Table;
import org.apache.iceberg.io.CloseableIterator;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.Types;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import static com.facebook.presto.hive.BaseHiveColumnHandle.ColumnType.PARTITION_KEY;
import static com.facebook.presto.hive.BaseHiveColumnHandle.ColumnType.REGULAR;
import static com.facebook.presto.iceberg.FileContent.EQUALITY_DELETES;
import static com.facebook.presto.iceberg.FileContent.fromIcebergFileContent;
import static com.facebook.presto.iceberg.IcebergColumnHandle.DATA_SEQUENCE_NUMBER_COLUMN_HANDLE;
import static com.facebook.presto.iceberg.IcebergErrorCode.ICEBERG_FILESYSTEM_ERROR;
import static com.facebook.presto.iceberg.IcebergMetadataColumn.DATA_SEQUENCE_NUMBER;
import static com.facebook.presto.iceberg.IcebergSessionProperties.isDeleteToJoinPushdownEnabled;
import static com.facebook.presto.iceberg.IcebergUtil.getDeleteFiles;
import static com.facebook.presto.iceberg.IcebergUtil.getIcebergTable;
import static com.facebook.presto.iceberg.TypeConverter.toPrestoType;
import static com.facebook.presto.spi.ConnectorPlanRewriter.rewriteWith;
import static java.util.Objects.requireNonNull;
/**
* This optimizer implements equality deletes as a join, rather than having each split read the delete files and apply them.
* This approach significantly enhances performance for equality deletes, as most delete files will apply to most splits,
* and opening the delete file in each split incurs considerable overhead. Usually, the delete files are relatively small
* and can be broadcast easily. Each delete file may have a different schema, though typically there will be only a few delete
* schemas, often just one (the primary key).
*
* For example, consider the following query:
* SELECT * FROM table;
* With 2 delete schemas: (pk), (orderid), the query will be transformed into:
*
* SELECT "$data_sequence_number", * FROM table
* LEFT JOIN "table$equality_deletes1" d1 ON left.pk = d1.pk AND left."$data_sequence_number" < d1."$data_sequence_number" -- Find deletes by schema 1
* LEFT JOIN "table$equality_deletes2" d2 ON left.orderid = d1.orderid AND left."$data_sequence_number" < d2."$data_sequence_number" -- Find deletes by schema 2
* WHERE COALESCE(d1."$data_sequence_number", d2."data_sequence_number") IS NULL -- None of the delete files had a delete for this row
*
* Note that table$equality_deletes1 and table$equality_deletes2 are different tables, each containing only the delete files with the schema for this join.
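*
* The rewrite only runs when equality-delete-to-join pushdown is enabled for the session (checked via isDeleteToJoinPushdownEnabled).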
*/
public class IcebergEqualityDeleteAsJoin
implements ConnectorPlanOptimizer
{
private final StandardFunctionResolution functionResolution;
private final IcebergTransactionManager transactionManager;
private final TypeManager typeManager;
IcebergEqualityDeleteAsJoin(StandardFunctionResolution functionResolution,
IcebergTransactionManager transactionManager,
TypeManager typeManager)
{
this.functionResolution = requireNonNull(functionResolution, "functionResolution is null");
this.transactionManager = requireNonNull(transactionManager, "transactionManager is null");
this.typeManager = requireNonNull(typeManager, "typeManager is null");
}
@Override
public PlanNode optimize(PlanNode maxSubplan, ConnectorSession session, VariableAllocator variableAllocator, PlanNodeIdAllocator idAllocator)
{
if (!isDeleteToJoinPushdownEnabled(session)) {
return maxSubplan;
}
return rewriteWith(new DeleteAsJoinRewriter(functionResolution,
transactionManager, idAllocator, session, typeManager, variableAllocator), maxSubplan);
}
private static class DeleteAsJoinRewriter
extends ConnectorPlanRewriter<Void>
{
private final ConnectorSession session;
private final StandardFunctionResolution functionResolution;
private final PlanNodeIdAllocator idAllocator;
private final IcebergTransactionManager transactionManager;
private final TypeManager typeManager;
private final VariableAllocator variableAllocator;
public DeleteAsJoinRewriter(
StandardFunctionResolution functionResolution,
IcebergTransactionManager transactionManager,
PlanNodeIdAllocator idAllocator,
ConnectorSession session,
TypeManager typeManager,
VariableAllocator variableAllocator)
{
this.functionResolution = requireNonNull(functionResolution, "functionResolution is null");
this.transactionManager = requireNonNull(transactionManager, "transactionManager is null");
this.idAllocator = requireNonNull(idAllocator, "idAllocator is null");
this.session = requireNonNull(session, "session is null");
this.typeManager = requireNonNull(typeManager, "typeManager is null");
this.variableAllocator = requireNonNull(variableAllocator, "variableAllocator is null");
}
@Override
public PlanNode visitTableScan(TableScanNode node, RewriteContext<Void> context)
{
TableHandle table = node.getTable();
IcebergTableHandle icebergTableHandle = (IcebergTableHandle) table.getConnectorHandle();
Optional<IcebergTableLayoutHandle> icebergTableLayoutHandle = table.getLayout().map(IcebergTableLayoutHandle.class::cast);
IcebergTableName tableName = icebergTableHandle.getIcebergTableName();
if (!tableName.getSnapshotId().isPresent() || tableName.getTableType() != IcebergTableType.DATA) {
// Node is already optimized or not ready for planning
return node;
}
IcebergAbstractMetadata metadata = (IcebergAbstractMetadata) transactionManager.get(table.getTransaction());
Table icebergTable = getIcebergTable(metadata, session, icebergTableHandle.getSchemaTableName());
TupleDomain<IcebergColumnHandle> predicate = icebergTableLayoutHandle
.map(IcebergTableLayoutHandle::getValidPredicate)
.map(IcebergUtil::getNonMetadataColumnConstraints)
.orElse(TupleDomain.all());
// Collect info about each unique delete schema to join by
ImmutableMap<Set<Integer>, DeleteSetInfo> deleteSchemas = collectDeleteInformation(icebergTable, predicate, tableName.getSnapshotId().get());
if (deleteSchemas.isEmpty()) {
// no equality deletes
return node;
}
// Add all the fields required by the join that were not added by the user's query
ImmutableMap<VariableReferenceExpression, IcebergColumnHandle> unselectedAssignments = createAssignmentsForUnselectedFields(node, deleteSchemas, icebergTable);
TableScanNode updatedTableScan = createNewRoot(node, icebergTableHandle, tableName, unselectedAssignments, table);
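// Map each Iceberg field id back to the scan's output variable so the join clauses can reference the data-side columns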
Map<Integer, VariableReferenceExpression> reverseAssignmentsMap = updatedTableScan
.getAssignments()
.entrySet()
.stream()
.collect(Collectors.toMap(assignment -> ((IcebergColumnHandle) (assignment.getValue())).getId(), Map.Entry::getKey));
List<RowExpression> deleteVersionColumns = new ArrayList<>();
PlanNode parentNode = updatedTableScan;
// For each unique delete schema add a join that applies those equality deletes
for (Map.Entry<Set<Integer>, DeleteSetInfo> entry : deleteSchemas.entrySet()) {
DeleteSetInfo deleteGroupInfo = entry.getValue();
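// Resolve this delete set's equality field ids to fields in the table schema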
List<Types.NestedField> deleteFields = deleteGroupInfo
.equalityFieldIds
.stream()
.map(fieldId -> icebergTable.schema().findField(fieldId))
.filter(Objects::nonNull)
.collect(Collectors.toList());
VariableReferenceExpression joinSequenceNumber = toVariableReference(DATA_SEQUENCE_NUMBER_COLUMN_HANDLE);
deleteVersionColumns.add(joinSequenceNumber);
ImmutableMap<VariableReferenceExpression, IcebergColumnHandle> deleteColumnAssignments = ImmutableMap.<VariableReferenceExpression, IcebergColumnHandle>builder()
.putAll(deleteGroupInfo.allFields(icebergTable.schema()).stream().collect(Collectors.toMap(this::toVariableReference, this::toIcebergColumnHandle)))
.put(joinSequenceNumber, DATA_SEQUENCE_NUMBER_COLUMN_HANDLE)
.build();
// ON source.delete_column = deletes.delete_column, ...
Set<EquiJoinClause> clauses = deleteColumnAssignments
.entrySet()
.stream()
.filter(assignment -> !IcebergMetadataColumn.isMetadataColumnId(((IcebergColumnHandle) (assignment.getValue())).getId()))
.map(assignment -> {
VariableReferenceExpression left = reverseAssignmentsMap.get(((IcebergColumnHandle) (assignment.getValue())).getId());
VariableReferenceExpression right = assignment.getKey();
return new EquiJoinClause(left, right);
}).collect(Collectors.toSet());
FunctionHandle lessThan = functionResolution.comparisonFunction(OperatorType.LESS_THAN, BigintType.BIGINT, BigintType.BIGINT);
// AND source.$data_sequence_number < deletes.$data_sequence_number
RowExpression versionFilter = new CallExpression(lessThan.getName(),
lessThan,
BooleanType.BOOLEAN,
Collections.unmodifiableList(Arrays.asList(reverseAssignmentsMap.get(DATA_SEQUENCE_NUMBER.getId()), joinSequenceNumber)));
TableScanNode deleteTableScan = createDeletesTableScan(deleteColumnAssignments,
icebergTableHandle,
tableName,
deleteFields,
table,
deleteGroupInfo);
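// LEFT JOIN the rows read so far against this delete set; rows with no matching delete keep a null sequence number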
parentNode = new ConnectorJoinNode(idAllocator.getNextId(),
Arrays.asList(parentNode, deleteTableScan),
Optional.empty(),
JoinType.LEFT,
clauses,
Sets.newHashSet(versionFilter),
Optional.empty(), // Allow stats to determine join distribution
Stream.concat(parentNode.getOutputVariables().stream(), deleteTableScan.getOutputVariables().stream()).collect(Collectors.toList()));
}
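// WHERE COALESCE(d1."$data_sequence_number", d2."$data_sequence_number", ...) IS NULL: keep only rows that no delete file matched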
FilterNode filter = new FilterNode(Optional.empty(), idAllocator.getNextId(), Optional.empty(), parentNode,
new SpecialFormExpression(SpecialFormExpression.Form.IS_NULL, BooleanType.BOOLEAN,
new SpecialFormExpression(SpecialFormExpression.Form.COALESCE, BigintType.BIGINT, deleteVersionColumns)));
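// Project away the synthetic $data_sequence_number columns so the output schema matches the original scan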
Assignments.Builder assignmentsBuilder = Assignments.builder();
filter.getOutputVariables().stream()
.filter(variableReferenceExpression -> !variableReferenceExpression.getName().startsWith(DATA_SEQUENCE_NUMBER_COLUMN_HANDLE.getName()))
.forEach(variableReferenceExpression -> assignmentsBuilder.put(variableReferenceExpression, variableReferenceExpression));
return new ProjectNode(Optional.empty(), idAllocator.getNextId(), filter, assignmentsBuilder.build(), ProjectNode.Locality.LOCAL);
}
private static ImmutableMap<Set<Integer>, DeleteSetInfo> collectDeleteInformation(Table icebergTable,
TupleDomain<IcebergColumnHandle> predicate,
long snapshotId)
{
// Delete schemas can repeat, so dedup them in a mutable map; it is copied to an immutable map at the end of the function.
HashMap<Set<Integer>, DeleteSetInfo> deleteInformations = new HashMap<>();
try (CloseableIterator<DeleteFile> files =
getDeleteFiles(icebergTable, snapshotId, predicate, Optional.empty(), Optional.empty()).iterator()) {
files.forEachRemaining(delete -> {
if (fromIcebergFileContent(delete.content()) == EQUALITY_DELETES) {
ImmutableMap.Builder<Integer, PartitionFieldInfo> partitionFieldsBuilder = new ImmutableMap.Builder<>();
ImmutableSet.Builder<Integer> identityPartitionFieldSourceIdsBuilder = new Builder<>();
PartitionSpec partitionSpec = icebergTable.specs().get(delete.specId());
Types.StructType partitionType = partitionSpec.partitionType();
// PartitionField ids are unique across the entire table in v2. We can assume we are in v2 since v1 doesn't have equality deletes
partitionSpec.fields().forEach(field -> {
if (field.transform().isIdentity()) {
identityPartitionFieldSourceIdsBuilder.add(field.sourceId());
}
partitionFieldsBuilder.put(field.fieldId(), new PartitionFieldInfo(partitionType.field(field.fieldId()), field));
});
ImmutableMap<Integer, PartitionFieldInfo> partitionFields = partitionFieldsBuilder.build();
ImmutableSet<Integer> identityPartitionFieldSourceIds = identityPartitionFieldSourceIdsBuilder.build();
HashSet<Integer> result = new HashSet<>();
result.addAll(partitionFields.keySet());
// Filter out identity partition columns from delete file's `equalityFieldIds` to support `delete-schema-merging` within the same partition spec.
List<Integer> equalityFieldIdsExcludeIdentityPartitionField = delete.equalityFieldIds().stream()
.filter(fieldId -> !identityPartitionFieldSourceIds.contains(fieldId))
.collect(Collectors.toList());
result.addAll(equalityFieldIdsExcludeIdentityPartitionField);
deleteInformations.put(ImmutableSet.copyOf(result), new DeleteSetInfo(partitionFields, equalityFieldIdsExcludeIdentityPartitionField));
}
});
}
catch (IOException e) {
throw new PrestoException(ICEBERG_FILESYSTEM_ERROR, "Failed to read equality delete information", e);
}
return ImmutableMap.copyOf(deleteInformations);
}
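/**
* Creates a scan over the synthetic "table$equality_deletes" table, restricted via the handle's partition
* fields and equality field ids to the delete files that share this delete set's schema.
*/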
private TableScanNode createDeletesTableScan(ImmutableMap<VariableReferenceExpression, IcebergColumnHandle> deleteColumnAssignments,
IcebergTableHandle icebergTableHandle,
IcebergTableName tableName,
List<Types.NestedField> deleteFields,
TableHandle table,
DeleteSetInfo deleteInfo)
{
List<VariableReferenceExpression> outputs = deleteColumnAssignments.keySet().asList();
IcebergTableHandle deletesTableHandle = new IcebergTableHandle(icebergTableHandle.getSchemaName(),
new IcebergTableName(tableName.getTableName(),
IcebergTableType.EQUALITY_DELETES, // Read equality deletes instead of data
tableName.getSnapshotId(),
Optional.empty()),
icebergTableHandle.isSnapshotSpecified(),
icebergTableHandle.getOutputPath(),
icebergTableHandle.getStorageProperties(),
Optional.of(SchemaParser.toJson(new Schema(deleteFields))),
Optional.of(deleteInfo.partitionFields.keySet()), // Enforce reading only delete files that match this schema
Optional.ofNullable(deleteInfo.equalityFieldIds.isEmpty() ? null : deleteInfo.equalityFieldIds));
return new TableScanNode(Optional.empty(),
idAllocator.getNextId(),
new TableHandle(table.getConnectorId(), deletesTableHandle, table.getTransaction(), table.getLayout(), table.getDynamicFilter()),
outputs,
deleteColumnAssignments,
TupleDomain.all(),
TupleDomain.all());
}
/**
* - Updates the table handle to DATA_WITHOUT_EQUALITY_DELETES so that the page source for this node no longer applies equality deletes.
* - Adds extra assignments and outputs that are needed by the join
*/
private TableScanNode createNewRoot(TableScanNode node, IcebergTableHandle icebergTableHandle, IcebergTableName tableName, ImmutableMap<VariableReferenceExpression, IcebergColumnHandle> unselectedAssignments, TableHandle table)
{
IcebergTableHandle updatedHandle = new IcebergTableHandle(icebergTableHandle.getSchemaName(),
new IcebergTableName(tableName.getTableName(),
IcebergTableType.DATA_WITHOUT_EQUALITY_DELETES, // Don't apply equality deletes in the split
tableName.getSnapshotId(),
tableName.getChangelogEndSnapshot()),
icebergTableHandle.isSnapshotSpecified(),
icebergTableHandle.getOutputPath(),
icebergTableHandle.getStorageProperties(),
icebergTableHandle.getTableSchemaJson(),
icebergTableHandle.getPartitionSpecId(),
icebergTableHandle.getEqualityFieldIds());
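// Expose $data_sequence_number from the data scan; each delete join compares it against the delete file's sequence number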
VariableReferenceExpression dataSequenceNumberVariableReference = toVariableReference(DATA_SEQUENCE_NUMBER_COLUMN_HANDLE);
ImmutableMap.Builder<VariableReferenceExpression, ColumnHandle> assignmentsBuilder = ImmutableMap.<VariableReferenceExpression, ColumnHandle>builder()
.put(dataSequenceNumberVariableReference, DATA_SEQUENCE_NUMBER_COLUMN_HANDLE)
.putAll(unselectedAssignments)
.putAll(node.getAssignments());
ImmutableList.Builder<VariableReferenceExpression> outputsBuilder = ImmutableList.builder();
outputsBuilder.addAll(node.getOutputVariables());
if (!node.getAssignments().containsKey(dataSequenceNumberVariableReference)) {
outputsBuilder.add(dataSequenceNumberVariableReference);
}
outputsBuilder.addAll(unselectedAssignments.keySet());
return new TableScanNode(node.getSourceLocation(),
node.getId(),
Optional.of(node),
new TableHandle(table.getConnectorId(), updatedHandle, table.getTransaction(), table.getLayout(), table.getDynamicFilter()),
outputsBuilder.build(),
assignmentsBuilder.build(),
node.getTableConstraints(),
node.getCurrentConstraint(),
node.getEnforcedConstraint());
}
/**
* Calculates the fields required by the delete joins that the user's SELECT statement didn't include, and creates assignments for them to add to the table scan
*/
private ImmutableMap<VariableReferenceExpression, IcebergColumnHandle> createAssignmentsForUnselectedFields(TableScanNode node,
ImmutableMap<Set<Integer>, DeleteSetInfo> deleteInfos,
Table icebergTable)
{
Set<Integer> selectedFields = node.getAssignments().values().stream().map(f -> ((IcebergColumnHandle) f).getId()).collect(Collectors.toSet());
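// Union the field ids needed by any delete schema, then drop the ones the query already selects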
Set<Integer> unselectedFields = Sets.difference(deleteInfos.keySet().stream().reduce(Sets::union).orElseGet(Collections::emptySet), selectedFields);
ImmutableMap.Builder<VariableReferenceExpression, IcebergColumnHandle> unselectedAssignmentsBuilder = ImmutableMap.builder();
Map<Integer, PartitionFieldInfo> partitionFields = deleteInfos.values().stream()
.flatMap(info -> info.getPartitionFields().entrySet().stream())
.collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, (existing, replacement) -> existing));
unselectedFields
.forEach(fieldId -> {
if (partitionFields.containsKey(fieldId)) {
PartitionFieldInfo partitionFieldInfo = partitionFields.get(fieldId);
PartitionField partitionField = partitionFieldInfo.getPartitionField();
Types.NestedField sourceField = icebergTable.schema().findField(partitionField.sourceId());
if (!partitionField.transform().isIdentity()) {
Type partitionFieldType = partitionField.transform().getResultType(sourceField.type());
VariableReferenceExpression variableReference = variableAllocator.newVariable(partitionField.name(), toPrestoType(partitionFieldType, typeManager));
IcebergColumnHandle columnHandle = new IcebergColumnHandle(
ColumnIdentity.createColumnIdentity(partitionField.name(), partitionField.fieldId(), partitionFieldType),
toPrestoType(partitionFieldType, typeManager),
Optional.empty(),
PARTITION_KEY);
unselectedAssignmentsBuilder.put(variableReference, columnHandle);
}
else if (!selectedFields.contains(sourceField.fieldId())) {
unselectedAssignmentsBuilder.put(
variableAllocator.newVariable(sourceField.name(), toPrestoType(sourceField.type(), typeManager)),
IcebergColumnHandle.create(sourceField, typeManager, REGULAR));
}
}
else {
Types.NestedField schemaField = icebergTable.schema().findField(fieldId);
unselectedAssignmentsBuilder.put(
variableAllocator.newVariable(schemaField.name(), toPrestoType(schemaField.type(), typeManager)),
IcebergColumnHandle.create(schemaField, typeManager, REGULAR));
}
});
return unselectedAssignmentsBuilder.build();
}
private VariableReferenceExpression toVariableReference(IcebergColumnHandle columnHandle)
{
return variableAllocator.newVariable(columnHandle.getName(), columnHandle.getType());
}
private IcebergColumnHandle toIcebergColumnHandle(Types.NestedField field)
{
ColumnIdentity columnIdentity = new ColumnIdentity(field.fieldId(), field.name(), ColumnIdentity.TypeCategory.PRIMITIVE, Collections.emptyList());
return new IcebergColumnHandle(columnIdentity, toPrestoType(field.type(), typeManager), Optional.empty(), REGULAR);
}
private VariableReferenceExpression toVariableReference(Types.NestedField field)
{
return variableAllocator.newVariable(field.name(), toPrestoType(field.type(), typeManager));
}
private static class PartitionFieldInfo
{
private final Types.NestedField nestedField;
private final PartitionField partitionField;
private PartitionFieldInfo(Types.NestedField nestedField, PartitionField partitionField)
{
this.nestedField = nestedField;
this.partitionField = partitionField;
}
public PartitionField getPartitionField()
{
return partitionField;
}
}
private static class DeleteSetInfo
{
private final ImmutableMap<Integer, PartitionFieldInfo> partitionFields;
private final Set<Integer> equalityFieldIds;
private DeleteSetInfo(ImmutableMap<Integer, PartitionFieldInfo> partitionFields,
List<Integer> equalityFieldIds)
{
this.partitionFields = requireNonNull(partitionFields, "partitionFields is null");
this.equalityFieldIds = ImmutableSet.copyOf(requireNonNull(equalityFieldIds, "equalityFieldIds is null"));
}
public ImmutableMap<Integer, PartitionFieldInfo> getPartitionFields()
{
return partitionFields;
}
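/**
* All fields a scan of this delete set must produce: the equality fields plus the partition fields
* (for identity transforms, the source column itself).
*/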
public List<Types.NestedField> allFields(Schema schema)
{
return Stream.concat(equalityFieldIds
.stream()
.map(schema::findField),
partitionFields
.values()
.stream()
.map(partitionFieldInfo -> {
if (partitionFieldInfo.partitionField.transform().isIdentity()) {
return schema.findField(partitionFieldInfo.partitionField.sourceId());
}
return partitionFieldInfo.nestedField;
}))
.collect(Collectors.toList());
}
}
}
}