org.apache.iceberg.PositionDeletesTable
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.iceberg;

import com.github.benmanes.caffeine.cache.Caffeine;
import com.github.benmanes.caffeine.cache.LoadingCache;
import java.io.IOException;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Function;
import java.util.stream.Collectors;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.expressions.ManifestEvaluator;
import org.apache.iceberg.expressions.Projections;
import org.apache.iceberg.expressions.ResidualEvaluator;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.io.CloseableIterator;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet;
import org.apache.iceberg.relocated.com.google.common.collect.Sets;
import org.apache.iceberg.types.TypeUtil;
import org.apache.iceberg.types.Types;
import org.apache.iceberg.util.ParallelIterable;
import org.apache.iceberg.util.TableScanUtil;

/**
 * A {@link Table} implementation whose {@link Scan} provides {@link PositionDeletesScanTask}, for
 * reading position delete files.
 */
public class PositionDeletesTable extends BaseMetadataTable {

  public static final String PARTITION = "partition";
  public static final String SPEC_ID = "spec_id";
  public static final String DELETE_FILE_PATH = "delete_file_path";
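  // Commentary (added; not part of the original source): together with the inherited delete
  // file columns (file_path, pos, and the optional row struct), these names appear as columns
  // of the metadata table, e.g. when an engine queries db.tbl.position_deletes.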

  private final Schema schema;
  private final int defaultSpecId;
  private final Map<Integer, PartitionSpec> specs;

  PositionDeletesTable(Table table) {
    this(table, table.name() + ".position_deletes");
  }

  PositionDeletesTable(Table table, String name) {
    super(table, name);
    this.schema = calculateSchema();
    this.defaultSpecId = table.spec().specId();
    this.specs = transformSpecs(schema(), table.specs());
  }

  @Override
  MetadataTableType metadataTableType() {
    return MetadataTableType.POSITION_DELETES;
  }

  @Override
  public TableScan newScan() {
    throw new UnsupportedOperationException(
        "Cannot create TableScan from table of type POSITION_DELETES");
  }

  @Override
  public BatchScan newBatchScan() {
    return new PositionDeletesBatchScan(table(), schema());
  }

  @Override
  public Schema schema() {
    return schema;
  }

  @Override
  public PartitionSpec spec() {
    return specs.get(defaultSpecId);
  }

  @Override
  public Map<Integer, PartitionSpec> specs() {
    return specs;
  }

  @Override
  public Map<String, String> properties() {
    // The write properties are needed by PositionDeletesRewriteAction;
    // they should respect the ones of the base table.
    return Collections.unmodifiableMap(
        table().properties().entrySet().stream()
            .filter(entry -> entry.getKey().startsWith("write."))
            .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)));
  }
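  // Illustration (added; property values are hypothetical): for base table properties
  // {"write.parquet.compression-codec" -> "zstd", "commit.retry.num-retries" -> "4"},
  // properties() returns only {"write.parquet.compression-codec" -> "zstd"}, because
  // only keys starting with "write." are kept.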

  private Schema calculateSchema() {
    Types.StructType partitionType = Partitioning.partitionType(table());
    List<Types.NestedField> columns =
        ImmutableList.of(
            MetadataColumns.DELETE_FILE_PATH,
            MetadataColumns.DELETE_FILE_POS,
            Types.NestedField.optional(
                MetadataColumns.DELETE_FILE_ROW_FIELD_ID,
                MetadataColumns.DELETE_FILE_ROW_FIELD_NAME,
                table().schema().asStruct(),
                MetadataColumns.DELETE_FILE_ROW_DOC),
            Types.NestedField.required(
                MetadataColumns.PARTITION_COLUMN_ID,
                PARTITION,
                partitionType,
                "Partition that position delete row belongs to"),
            Types.NestedField.required(
                MetadataColumns.SPEC_ID_COLUMN_ID,
                SPEC_ID,
                Types.IntegerType.get(),
                MetadataColumns.SPEC_ID_COLUMN_DOC),
            Types.NestedField.required(
                MetadataColumns.FILE_PATH_COLUMN_ID,
                DELETE_FILE_PATH,
                Types.StringType.get(),
                MetadataColumns.FILE_PATH_COLUMN_DOC));

    // Calculate used ids (for de-conflict)
    Set<Integer> currentlyUsedIds =
        Collections.unmodifiableSet(TypeUtil.indexById(Types.StructType.of(columns)).keySet());
    Set<Integer> allUsedIds =
        table().schemas().values().stream()
            .map(currSchema -> TypeUtil.indexById(currSchema.asStruct()).keySet())
            .reduce(currentlyUsedIds, Sets::union);

    // Calculate ids to reassign
    Set<Integer> idsToReassign =
        partitionType.fields().stream().map(Types.NestedField::fieldId).collect(Collectors.toSet());

    // Reassign selected ids to de-conflict with used ids.
    AtomicInteger nextId = new AtomicInteger();
    Schema result =
        new Schema(
            columns,
            ImmutableSet.of(),
            oldId -> {
              if (!idsToReassign.contains(oldId)) {
                return oldId;
              }
              int candidate = nextId.incrementAndGet();
              while (allUsedIds.contains(candidate)) {
                candidate = nextId.incrementAndGet();
              }
              return candidate;
            });

    if (!partitionType.fields().isEmpty()) {
      return result;
    } else {
      // avoid returning an empty struct, which is not always supported.
      // instead, drop the partition field
      return TypeUtil.selectNot(result, Sets.newHashSet(MetadataColumns.PARTITION_COLUMN_ID));
    }
  }
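  // Worked example (added commentary; ids are assumed for illustration): if the base table
  // is partitioned by bucket(16, id) and the partition field has id 1000, but some
  // historical schema already uses 1000 for a column, the reassignment lambda above counts
  // nextId up from 1, skipping every id in allUsedIds, and assigns the first free id to
  // that partition struct field. Ids outside idsToReassign are returned unchanged.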

  public static class PositionDeletesBatchScan
      extends SnapshotScan<BatchScan, ScanTask, ScanTaskGroup<ScanTask>> implements BatchScan {

    private Expression baseTableFilter = Expressions.alwaysTrue();

    protected PositionDeletesBatchScan(Table table, Schema schema) {
      super(table, schema, TableScanContext.empty());
    }

    protected PositionDeletesBatchScan(
        Table table, Schema schema, TableScanContext context, Expression baseTableFilter) {
      super(table, schema, context);
      this.baseTableFilter = baseTableFilter;
    }

    @Override
    protected PositionDeletesBatchScan newRefinedScan(
        Table newTable, Schema newSchema, TableScanContext newContext) {
      return new PositionDeletesBatchScan(newTable, newSchema, newContext, baseTableFilter);
    }

    @Override
    public CloseableIterable<ScanTaskGroup<ScanTask>> planTasks() {
      return TableScanUtil.planTaskGroups(
          planFiles(), targetSplitSize(), splitLookback(), splitOpenFileCost());
    }
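    // Commentary (added; not part of the original source): planTasks() delegates to the
    // generic split planning in TableScanUtil, so position delete files are grouped into
    // task groups using the same target split size, lookback, and open-file cost settings
    // that drive data file scan planning.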

    @Override
    protected List<String> scanColumns() {
      return context().returnColumnStats() ? DELETE_SCAN_WITH_STATS_COLUMNS : DELETE_SCAN_COLUMNS;
    }

    /**
     * Sets a filter that applies to the base table of this position deletes table, to use for this
     * scan.
     *
     * <p>Only the partition expressions part of the filter will be applied to the position deletes
     * table, as the schema of the base table does not otherwise match the schema of the position
     * deletes table.
     *
     * <ul>
     *   <li>Only the partition expressions of the filter that can be projected on the base table
     *       partition specs, via {@link
     *       org.apache.iceberg.expressions.Projections.ProjectionEvaluator#project(Expression)}
     *       will be evaluated. Note, not all partition expressions can be projected.
     *   <li>Because it cannot apply beyond the partition expression, this filter will not
     *       contribute to the residuals of tasks returned by this scan. (See {@link
     *       PositionDeletesScanTask#residual()})
     * </ul>
     *
     * @param expr expression filter that applies to the base table of this position deletes table
     * @return this for method chaining
     */
    public BatchScan baseTableFilter(Expression expr) {
      return new PositionDeletesBatchScan(
          table(), schema(), context(), Expressions.and(baseTableFilter, expr));
    }

    @Override
    protected CloseableIterable<ScanTask> doPlanFiles() {
      String schemaString = SchemaParser.toJson(tableSchema());

      // prepare transformed partition specs and caches
      Map<Integer, PartitionSpec> transformedSpecs = transformSpecs(tableSchema(), table().specs());

      LoadingCache<Integer, String> specStringCache =
          partitionCacheOf(transformedSpecs, PartitionSpecParser::toJson);
      LoadingCache<Integer, ManifestEvaluator> deletesTableEvalCache =
          partitionCacheOf(
              transformedSpecs,
              spec -> ManifestEvaluator.forRowFilter(filter(), spec, isCaseSensitive()));
      LoadingCache<Integer, ManifestEvaluator> baseTableEvalCache =
          partitionCacheOf(
              table().specs(), // evaluate base table filters on base table specs
              spec -> ManifestEvaluator.forRowFilter(baseTableFilter, spec, isCaseSensitive()));
      LoadingCache<Integer, ResidualEvaluator> residualCache =
          partitionCacheOf(
              transformedSpecs,
              spec ->
                  ResidualEvaluator.of(
                      spec,
                      // there are no applicable filters in the base table's filter
                      // that we can use to evaluate on the position deletes table
                      shouldIgnoreResiduals() ? Expressions.alwaysTrue() : filter(),
                      isCaseSensitive()));

      // iterate through delete manifests
      List<ManifestFile> manifests = snapshot().deleteManifests(table().io());

      CloseableIterable<ManifestFile> matchingManifests =
          CloseableIterable.filter(
              scanMetrics().skippedDeleteManifests(),
              CloseableIterable.withNoopClose(manifests),
              manifest ->
                  baseTableEvalCache.get(manifest.partitionSpecId()).eval(manifest)
                      && deletesTableEvalCache.get(manifest.partitionSpecId()).eval(manifest));

      matchingManifests =
          CloseableIterable.count(scanMetrics().scannedDeleteManifests(), matchingManifests);

      Iterable<CloseableIterable<ScanTask>> tasks =
          CloseableIterable.transform(
              matchingManifests,
              manifest ->
                  posDeletesScanTasks(
                      manifest,
                      table().specs().get(manifest.partitionSpecId()),
                      schemaString,
                      transformedSpecs,
                      residualCache,
                      specStringCache));

      if (planExecutor() != null) {
        return new ParallelIterable<>(tasks, planExecutor());
      } else {
        return CloseableIterable.concat(tasks);
      }
    }

    private CloseableIterable<ScanTask> posDeletesScanTasks(
        ManifestFile manifest,
        PartitionSpec spec,
        String schemaString,
        Map<Integer, PartitionSpec> transformedSpecs,
        LoadingCache<Integer, ResidualEvaluator> residualCache,
        LoadingCache<Integer, String> specStringCache) {
      return new CloseableIterable<ScanTask>() {
        private CloseableIterable<ScanTask> iterable;

        @Override
        public void close() throws IOException {
          if (iterable != null) {
            iterable.close();
          }
        }

        @Override
        public CloseableIterator<ScanTask> iterator() {
          Expression partitionFilter =
              Projections.inclusive(spec, isCaseSensitive()).project(baseTableFilter);

          // Filter partitions
          CloseableIterable<ManifestEntry<DeleteFile>> deleteFileEntries =
              ManifestFiles.readDeleteManifest(manifest, table().io(), transformedSpecs)
                  .caseSensitive(isCaseSensitive())
                  .select(scanColumns())
                  .filterRows(filter())
                  .filterPartitions(partitionFilter)
                  .scanMetrics(scanMetrics())
                  .liveEntries();

          // Filter delete file type
          CloseableIterable<ManifestEntry<DeleteFile>> positionDeleteEntries =
              CloseableIterable.filter(
                  deleteFileEntries,
                  entry -> entry.file().content().equals(FileContent.POSITION_DELETES));

          this.iterable =
              CloseableIterable.transform(
                  positionDeleteEntries,
                  entry -> {
                    int specId = entry.file().specId();
                    return new BasePositionDeletesScanTask(
                        entry.file().copy(context().returnColumnStats()),
                        schemaString,
                        specStringCache.get(specId),
                        residualCache.get(specId));
                  });

          return iterable.iterator();
        }
      };
    }
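    // Commentary (added; not in the original source): the helper below memoizes one
    // derived object per partition spec id, so spec JSON strings, manifest evaluators,
    // and residual evaluators are each built once per scan, no matter how many
    // manifests or delete file entries share a spec.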
    private <T> LoadingCache<Integer, T> partitionCacheOf(
        Map<Integer, PartitionSpec> specs, Function<PartitionSpec, T> constructor) {
      return Caffeine.newBuilder()
          .build(
              specId -> {
                PartitionSpec spec = specs.get(specId);
                return constructor.apply(spec);
              });
    }
  }
}
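For context, a minimal usage sketch (added; not part of this file). It assumes an existing base `table` handle with position deletes; `MetadataTableUtils` and `MetadataTableType` are the standard entry points in the same package, and "category" is a hypothetical partition column of the base table:

static void scanPositionDeletes(Table table) throws IOException {
  Table posDeletes =
      MetadataTableUtils.createMetadataTableInstance(table, MetadataTableType.POSITION_DELETES);
  BatchScan scan =
      ((PositionDeletesTable.PositionDeletesBatchScan) posDeletes.newBatchScan())
          .baseTableFilter(Expressions.equal("category", "books"));
  try (CloseableIterable<ScanTask> tasks = scan.planFiles()) {
    for (ScanTask task : tasks) {
      // each task reads one position delete file; residual() carries the remaining row filter
      PositionDeletesScanTask deleteTask = (PositionDeletesScanTask) task;
    }
  }
}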



