// org.apache.iceberg.BaseContentScanTask Maven / Gradle / Ivy
//
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of iceberg-core Show documentation
// A table format for huge analytic datasets
// The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.iceberg;

import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.ResidualEvaluator;
import org.apache.iceberg.relocated.com.google.common.base.MoreObjects;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
import org.apache.iceberg.util.ArrayUtil;

abstract class BaseContentScanTask, F extends ContentFile>
    implements ContentScanTask, SplittableScanTask {

  private final F file;
  private final String schemaString;
  private final String specString;
  private final ResidualEvaluator residuals;

  private transient volatile Schema schema = null;
  private transient volatile PartitionSpec spec = null;

  BaseContentScanTask(F file, String schemaString, String specString, ResidualEvaluator residuals) {
    this.file = file;
    this.schemaString = schemaString;
    this.specString = specString;
    this.residuals = residuals;
  }

  protected abstract ThisT self();

  protected abstract ThisT newSplitTask(ThisT parentTask, long offset, long length);

  @Override
  public F file() {
    return file;
  }

  protected Schema schema() {
    if (schema == null) {
      synchronized (this) {
        if (schema == null) {
          this.schema = SchemaParser.fromJson(schemaString);
        }
      }
    }

    return schema;
  }

  @Override
  public PartitionSpec spec() {
    if (spec == null) {
      synchronized (this) {
        if (spec == null) {
          this.spec = PartitionSpecParser.fromJson(schema(), specString);
        }
      }
    }
    return spec;
  }

  @Override
  public long start() {
    return 0;
  }

  @Override
  public long length() {
    return file.fileSizeInBytes();
  }

  @Override
  public Expression residual() {
    return residuals.residualFor(file.partition());
  }

  @Override
  public long estimatedRowsCount() {
    return estimateRowsCount(length(), file);
  }

  @Override
  public Iterable split(long targetSplitSize) {
    if (file.format().isSplittable()) {
      long[] splitOffsets = splitOffsets(file);
      if (splitOffsets != null && ArrayUtil.isStrictlyAscending(splitOffsets)) {
        return () ->
            new OffsetsAwareSplitScanTaskIterator<>(
                self(), length(), splitOffsets, this::newSplitTask);
      } else {
        return () ->
            new FixedSizeSplitScanTaskIterator<>(
                self(), length(), targetSplitSize, this::newSplitTask);
      }
    }

    return ImmutableList.of(self());
  }

  @Override
  public String toString() {
    return MoreObjects.toStringHelper(this)
        .add("file", file().path())
        .add("partition_data", file().partition())
        .add("residual", residual())
        .toString();
  }

  static long estimateRowsCount(long length, ContentFile file) {
    long[] splitOffsets = splitOffsets(file);
    long splitOffset = splitOffsets != null ? splitOffsets[0] : 0L;
    double scannedFileFraction = ((double) length) / (file.fileSizeInBytes() - splitOffset);
    return (long) (scannedFileFraction * file.recordCount());
  }

  private static long[] splitOffsets(ContentFile file) {
    if (file instanceof BaseFile) {
      return ((BaseFile) file).splitOffsetArray();
    } else {
      return ArrayUtil.toLongArray(file.splitOffsets());
    }
  }
}