All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.iceberg.ManifestGroup Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.iceberg;

import com.github.benmanes.caffeine.cache.Caffeine;
import com.github.benmanes.caffeine.cache.LoadingCache;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.function.BiFunction;
import java.util.function.Predicate;
import org.apache.iceberg.expressions.Evaluator;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.expressions.ManifestEvaluator;
import org.apache.iceberg.expressions.Projections;
import org.apache.iceberg.expressions.ResidualEvaluator;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.io.CloseableIterator;
import org.apache.iceberg.io.FileIO;
import org.apache.iceberg.metrics.ScanMetrics;
import org.apache.iceberg.metrics.ScanMetricsUtil;
import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.relocated.com.google.common.collect.Sets;
import org.apache.iceberg.types.Types;
import org.apache.iceberg.util.ContentFileUtil;
import org.apache.iceberg.util.ParallelIterable;

class ManifestGroup {
  private static final Types.StructType EMPTY_STRUCT = Types.StructType.of();

  private final FileIO io;
  private final Set dataManifests;
  private final DeleteFileIndex.Builder deleteIndexBuilder;
  private Predicate> manifestEntryPredicate;
  private Map specsById;
  private Expression dataFilter;
  private Expression fileFilter;
  private Expression partitionFilter;
  private boolean ignoreDeleted;
  private boolean ignoreExisting;
  private boolean ignoreResiduals;
  private List columns;
  private boolean caseSensitive;
  private Set columnsToKeepStats;
  private ExecutorService executorService;
  private ScanMetrics scanMetrics;

  ManifestGroup(FileIO io, Iterable manifests) {
    this(
        io,
        Iterables.filter(manifests, manifest -> manifest.content() == ManifestContent.DATA),
        Iterables.filter(manifests, manifest -> manifest.content() == ManifestContent.DELETES));
  }

  ManifestGroup(
      FileIO io, Iterable dataManifests, Iterable deleteManifests) {
    this.io = io;
    this.dataManifests = Sets.newHashSet(dataManifests);
    this.deleteIndexBuilder = DeleteFileIndex.builderFor(io, deleteManifests);
    this.dataFilter = Expressions.alwaysTrue();
    this.fileFilter = Expressions.alwaysTrue();
    this.partitionFilter = Expressions.alwaysTrue();
    this.ignoreDeleted = false;
    this.ignoreExisting = false;
    this.ignoreResiduals = false;
    this.columns = ManifestReader.ALL_COLUMNS;
    this.caseSensitive = true;
    this.manifestEntryPredicate = e -> true;
    this.scanMetrics = ScanMetrics.noop();
  }

  ManifestGroup specsById(Map newSpecsById) {
    this.specsById = newSpecsById;
    deleteIndexBuilder.specsById(newSpecsById);
    return this;
  }

  ManifestGroup filterData(Expression newDataFilter) {
    this.dataFilter = Expressions.and(dataFilter, newDataFilter);
    deleteIndexBuilder.filterData(newDataFilter);
    return this;
  }

  ManifestGroup filterFiles(Expression newFileFilter) {
    this.fileFilter = Expressions.and(fileFilter, newFileFilter);
    return this;
  }

  ManifestGroup filterPartitions(Expression newPartitionFilter) {
    this.partitionFilter = Expressions.and(partitionFilter, newPartitionFilter);
    deleteIndexBuilder.filterPartitions(newPartitionFilter);
    return this;
  }

  ManifestGroup filterManifestEntries(
      Predicate> newManifestEntryPredicate) {
    this.manifestEntryPredicate = manifestEntryPredicate.and(newManifestEntryPredicate);
    return this;
  }

  ManifestGroup scanMetrics(ScanMetrics metrics) {
    this.scanMetrics = metrics;
    return this;
  }

  ManifestGroup ignoreDeleted() {
    this.ignoreDeleted = true;
    return this;
  }

  ManifestGroup ignoreExisting() {
    this.ignoreExisting = true;
    return this;
  }

  ManifestGroup ignoreResiduals() {
    this.ignoreResiduals = true;
    return this;
  }

  ManifestGroup select(List newColumns) {
    this.columns = Lists.newArrayList(newColumns);
    return this;
  }

  ManifestGroup caseSensitive(boolean newCaseSensitive) {
    this.caseSensitive = newCaseSensitive;
    deleteIndexBuilder.caseSensitive(newCaseSensitive);
    return this;
  }

  ManifestGroup columnsToKeepStats(Set newColumnsToKeepStats) {
    this.columnsToKeepStats =
        newColumnsToKeepStats == null ? null : Sets.newHashSet(newColumnsToKeepStats);
    return this;
  }

  ManifestGroup planWith(ExecutorService newExecutorService) {
    this.executorService = newExecutorService;
    deleteIndexBuilder.planWith(newExecutorService);
    return this;
  }

  /**
   * Returns an iterable of scan tasks. It is safe to add entries of this iterable to a collection
   * as {@link DataFile} in each {@link FileScanTask} is defensively copied.
   *
   * @return a {@link CloseableIterable} of {@link FileScanTask}
   */
  public CloseableIterable planFiles() {
    return plan(ManifestGroup::createFileScanTasks);
  }

  public  CloseableIterable plan(CreateTasksFunction createTasksFunc) {
    LoadingCache residualCache =
        Caffeine.newBuilder()
            .build(
                specId -> {
                  PartitionSpec spec = specsById.get(specId);
                  Expression filter = ignoreResiduals ? Expressions.alwaysTrue() : dataFilter;
                  return ResidualEvaluator.of(spec, filter, caseSensitive);
                });

    DeleteFileIndex deleteFiles = deleteIndexBuilder.scanMetrics(scanMetrics).build();

    boolean dropStats = ManifestReader.dropStats(columns);
    if (deleteFiles.hasEqualityDeletes()) {
      select(ManifestReader.withStatsColumns(columns));
    }

    LoadingCache taskContextCache =
        Caffeine.newBuilder()
            .build(
                specId -> {
                  PartitionSpec spec = specsById.get(specId);
                  ResidualEvaluator residuals = residualCache.get(specId);
                  return new TaskContext(
                      spec, deleteFiles, residuals, dropStats, columnsToKeepStats, scanMetrics);
                });

    Iterable> tasks =
        entries(
            (manifest, entries) -> {
              int specId = manifest.partitionSpecId();
              TaskContext taskContext = taskContextCache.get(specId);
              return createTasksFunc.apply(entries, taskContext);
            });

    if (executorService != null) {
      return new ParallelIterable<>(tasks, executorService);
    } else {
      return CloseableIterable.concat(tasks);
    }
  }

  /**
   * Returns an iterable for manifest entries in the set of manifests.
   *
   * 

Entries are not copied and it is the caller's responsibility to make defensive copies if * adding these entries to a collection. * * @return a CloseableIterable of manifest entries. */ public CloseableIterable> entries() { return CloseableIterable.concat(entries((manifest, entries) -> entries)); } /** * Returns an iterable for groups of data files in the set of manifests. * *

Files are not copied, it is the caller's responsibility to make defensive copies if adding * these files to a collection. * * @return an iterable of file groups */ public Iterable> fileGroups() { return entries( (manifest, entries) -> CloseableIterable.transform(entries, ManifestEntry::file)); } private Iterable> entries( BiFunction>, CloseableIterable> entryFn) { LoadingCache evalCache = specsById == null ? null : Caffeine.newBuilder() .build( specId -> { PartitionSpec spec = specsById.get(specId); return ManifestEvaluator.forPartitionFilter( Expressions.and( partitionFilter, Projections.inclusive(spec, caseSensitive).project(dataFilter)), spec, caseSensitive); }); Evaluator evaluator; if (fileFilter != null && fileFilter != Expressions.alwaysTrue()) { evaluator = new Evaluator(DataFile.getType(EMPTY_STRUCT), fileFilter, caseSensitive); } else { evaluator = null; } CloseableIterable closeableDataManifests = CloseableIterable.withNoopClose(dataManifests); CloseableIterable matchingManifests = evalCache == null ? closeableDataManifests : CloseableIterable.filter( scanMetrics.skippedDataManifests(), closeableDataManifests, manifest -> evalCache.get(manifest.partitionSpecId()).eval(manifest)); if (ignoreDeleted) { // only scan manifests that have entries other than deletes // remove any manifests that don't have any existing or added files. if either the added or // existing files count is missing, the manifest must be scanned. matchingManifests = CloseableIterable.filter( scanMetrics.skippedDataManifests(), matchingManifests, manifest -> manifest.hasAddedFiles() || manifest.hasExistingFiles()); } if (ignoreExisting) { // only scan manifests that have entries other than existing // remove any manifests that don't have any deleted or added files. if either the added or // deleted files count is missing, the manifest must be scanned. matchingManifests = CloseableIterable.filter( scanMetrics.skippedDataManifests(), matchingManifests, manifest -> manifest.hasAddedFiles() || manifest.hasDeletedFiles()); } matchingManifests = CloseableIterable.count(scanMetrics.scannedDataManifests(), matchingManifests); return Iterables.transform( matchingManifests, manifest -> new CloseableIterable() { private CloseableIterable iterable; @Override public CloseableIterator iterator() { ManifestReader reader = ManifestFiles.read(manifest, io, specsById) .filterRows(dataFilter) .filterPartitions(partitionFilter) .caseSensitive(caseSensitive) .select(columns) .scanMetrics(scanMetrics); CloseableIterable> entries; if (ignoreDeleted) { entries = reader.liveEntries(); } else { entries = reader.entries(); } if (ignoreExisting) { entries = CloseableIterable.filter( scanMetrics.skippedDataFiles(), entries, entry -> entry.status() != ManifestEntry.Status.EXISTING); } if (evaluator != null) { entries = CloseableIterable.filter( scanMetrics.skippedDataFiles(), entries, entry -> evaluator.eval((GenericDataFile) entry.file())); } entries = CloseableIterable.filter( scanMetrics.skippedDataFiles(), entries, manifestEntryPredicate); iterable = entryFn.apply(manifest, entries); return iterable.iterator(); } @Override public void close() throws IOException { if (iterable != null) { iterable.close(); } } }); } private static CloseableIterable createFileScanTasks( CloseableIterable> entries, TaskContext ctx) { return CloseableIterable.transform( entries, entry -> { DataFile dataFile = ContentFileUtil.copy(entry.file(), ctx.shouldKeepStats(), ctx.columnsToKeepStats()); DeleteFile[] deleteFiles = ctx.deletes().forEntry(entry); ScanMetricsUtil.fileTask(ctx.scanMetrics(), dataFile, deleteFiles); return new BaseFileScanTask( dataFile, deleteFiles, ctx.schemaAsString(), ctx.specAsString(), ctx.residuals()); }); } @FunctionalInterface interface CreateTasksFunction { CloseableIterable apply( CloseableIterable> entries, TaskContext context); } static class TaskContext { private final String schemaAsString; private final String specAsString; private final DeleteFileIndex deletes; private final ResidualEvaluator residuals; private final boolean dropStats; private final Set columnsToKeepStats; private final ScanMetrics scanMetrics; TaskContext( PartitionSpec spec, DeleteFileIndex deletes, ResidualEvaluator residuals, boolean dropStats, Set columnsToKeepStats, ScanMetrics scanMetrics) { this.schemaAsString = SchemaParser.toJson(spec.schema()); this.specAsString = PartitionSpecParser.toJson(spec); this.deletes = deletes; this.residuals = residuals; this.dropStats = dropStats; this.columnsToKeepStats = columnsToKeepStats; this.scanMetrics = scanMetrics; } String schemaAsString() { return schemaAsString; } String specAsString() { return specAsString; } DeleteFileIndex deletes() { return deletes; } ResidualEvaluator residuals() { return residuals; } boolean shouldKeepStats() { return !dropStats; } Set columnsToKeepStats() { return columnsToKeepStats; } public ScanMetrics scanMetrics() { return scanMetrics; } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy