/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.spark.source;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import org.apache.iceberg.ChangelogScanTask;
import org.apache.iceberg.IncrementalChangelogScan;
import org.apache.iceberg.ScanTaskGroup;
import org.apache.iceberg.Schema;
import org.apache.iceberg.Table;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.spark.Spark3Util;
import org.apache.iceberg.spark.SparkReadConf;
import org.apache.iceberg.spark.SparkSchemaUtil;
import org.apache.iceberg.spark.SparkUtil;
import org.apache.iceberg.types.Types;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.connector.read.Batch;
import org.apache.spark.sql.connector.read.Scan;
import org.apache.spark.sql.connector.read.Statistics;
import org.apache.spark.sql.connector.read.SupportsReportStatistics;
import org.apache.spark.sql.types.StructType;
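
/**
 * A Spark {@link Scan} backed by an Iceberg {@link IncrementalChangelogScan}, exposing row-level
 * changes between two table snapshots. Task groups are planned lazily on first access and reused
 * by both {@link #estimateStatistics()} and {@link #toBatch()}.
 *
 * <p>A minimal usage sketch, assuming a table {@code demo.db.tbl} (the {@code changes} metadata
 * table and the snapshot-range read options below are assumptions about the surrounding Spark
 * integration, not part of this class):
 *
 * <pre>{@code
 * Dataset<Row> changes =
 *     spark.read()
 *         .format("iceberg")
 *         .option("start-snapshot-id", "1234567890")
 *         .option("end-snapshot-id", "9876543210")
 *         .load("demo.db.tbl.changes");
 * }</pre>
 */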
class SparkChangelogScan implements Scan, SupportsReportStatistics {
private static final Types.StructType EMPTY_GROUPING_KEY_TYPE = Types.StructType.of();
private final JavaSparkContext sparkContext;
private final Table table;
private final IncrementalChangelogScan scan;
private final SparkReadConf readConf;
private final Schema expectedSchema;
private final List<Expression> filters;
private final Long startSnapshotId;
private final Long endSnapshotId;
private final boolean readTimestampWithoutZone;
// lazy variables
private List<ScanTaskGroup<ChangelogScanTask>> taskGroups = null;
private StructType expectedSparkType = null;
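
/**
 * Validates that metadata column references in the expected schema resolve against the table
 * schema, then captures the snapshot range from the read configuration. When {@code emptyScan}
 * is set, task planning is skipped and the scan produces no task groups.
 */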
SparkChangelogScan(
SparkSession spark,
Table table,
IncrementalChangelogScan scan,
SparkReadConf readConf,
Schema expectedSchema,
List<Expression> filters,
boolean emptyScan) {
SparkSchemaUtil.validateMetadataColumnReferences(table.schema(), expectedSchema);
this.sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext());
this.table = table;
this.scan = scan;
this.readConf = readConf;
this.expectedSchema = expectedSchema;
this.filters = filters != null ? filters : Collections.emptyList();
this.startSnapshotId = readConf.startSnapshotId();
this.endSnapshotId = readConf.endSnapshotId();
this.readTimestampWithoutZone = readConf.handleTimestampWithoutZone();
if (emptyScan) {
this.taskGroups = Collections.emptyList();
}
}
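
/**
 * Estimates statistics by summing the estimated row counts of all planned task groups and
 * converting that count into a byte size for the projected Spark read schema.
 */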
@Override
public Statistics estimateStatistics() {
long rowsCount = taskGroups().stream().mapToLong(ScanTaskGroup::estimatedRowsCount).sum();
long sizeInBytes = SparkSchemaUtil.estimateSize(readSchema(), rowsCount);
return new Stats(sizeInBytes, rowsCount);
}
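
/**
 * Lazily converts the expected Iceberg schema into a Spark {@link StructType}, failing fast if
 * the schema contains a timestamp without time zone and such reads are not enabled in the read
 * configuration.
 */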
@Override
public StructType readSchema() {
if (expectedSparkType == null) {
Preconditions.checkArgument(
readTimestampWithoutZone || !SparkUtil.hasTimestampWithoutZone(expectedSchema),
SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR);
this.expectedSparkType = SparkSchemaUtil.convert(expectedSchema);
}
return expectedSparkType;
}
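
/**
 * Builds the batch read from the planned task groups. Changelog scans report no storage
 * partitioning, so an empty grouping key type is passed; the scan's hash code is forwarded so
 * the batch can be correlated with the scan that produced it.
 */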
@Override
public Batch toBatch() {
return new SparkBatch(
sparkContext,
table,
readConf,
EMPTY_GROUPING_KEY_TYPE,
taskGroups(),
expectedSchema,
hashCode());
}
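
/**
 * Plans task groups on first access and caches the result. The {@link CloseableIterable}
 * returned by {@code scan.planTasks()} is closed after materialization; a failure to close it
 * surfaces as an {@link UncheckedIOException}.
 */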
private List<ScanTaskGroup<ChangelogScanTask>> taskGroups() {
if (taskGroups == null) {
try (CloseableIterable<ScanTaskGroup<ChangelogScanTask>> groups = scan.planTasks()) {
this.taskGroups = Lists.newArrayList(groups);
} catch (IOException e) {
throw new UncheckedIOException("Failed to close changelog scan: " + scan, e);
}
}
return taskGroups;
}

@Override
public String description() {
return String.format(
"%s [fromSnapshotId=%d, toSnapshotId=%d, filters=%s]",
table, startSnapshotId, endSnapshotId, Spark3Util.describe(filters));
}

@Override
public String toString() {
return String.format(
"IcebergChangelogScan(table=%s, type=%s, fromSnapshotId=%d, toSnapshotId=%d, filters=%s)",
table,
expectedSchema.asStruct(),
startSnapshotId,
endSnapshotId,
Spark3Util.describe(filters));
}

@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
SparkChangelogScan that = (SparkChangelogScan) o;
return table.name().equals(that.table.name())
&& readSchema().equals(that.readSchema()) // compare Spark schemas to ignore field IDs
&& filters.toString().equals(that.filters.toString())
&& Objects.equals(startSnapshotId, that.startSnapshotId)
&& Objects.equals(endSnapshotId, that.endSnapshotId);
}

@Override
public int hashCode() {
return Objects.hash(
table.name(), readSchema(), filters.toString(), startSnapshotId, endSnapshotId);
}
}
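
/*
 * The Stats object returned by estimateStatistics() is a package-private helper defined
 * elsewhere in this package. A minimal sketch of such a Statistics implementation, assuming
 * Spark's OptionalLong-based interface (names and layout here are illustrative, not the actual
 * Iceberg implementation):
 *
 *   class Stats implements Statistics {
 *     private final OptionalLong sizeInBytes;
 *     private final OptionalLong numRows;
 *
 *     Stats(long sizeInBytes, long numRows) {
 *       this.sizeInBytes = OptionalLong.of(sizeInBytes);
 *       this.numRows = OptionalLong.of(numRows);
 *     }
 *
 *     @Override
 *     public OptionalLong sizeInBytes() {
 *       return sizeInBytes;
 *     }
 *
 *     @Override
 *     public OptionalLong numRows() {
 *       return numRows;
 *     }
 *   }
 */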