/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.paimon.spark

import org.apache.paimon.predicate.{PartitionPredicateVisitor, Predicate, PredicateBuilder}
import org.apache.paimon.table.Table
import org.apache.paimon.table.source.ReadBuilder

import org.apache.spark.internal.Logging
import org.apache.spark.sql.connector.read.{Scan, ScanBuilder, SupportsPushDownFilters, SupportsPushDownRequiredColumns}
import org.apache.spark.sql.sources.Filter
import org.apache.spark.sql.types.StructType

import scala.collection.mutable

abstract class PaimonBaseScanBuilder(table: Table)
  extends ScanBuilder
  with SupportsPushDownFilters
  with SupportsPushDownRequiredColumns
  with Logging {
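  // Push-down state collected by Spark before build() is called:
  // `predicates` is the AND-ed Paimon predicate produced by pushFilters,
  // `pushed` holds the Spark filters accepted for push-down, and
  // `projectedIndexes` holds the column indexes requested by pruneColumns.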
  protected var predicates: Option[Predicate] = None
  protected var pushed: Option[Array[Filter]] = None
  protected var projectedIndexes: Option[Array[Int]] = None
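
  /** Creates a ReadBuilder that carries the pushed-down projection and filter, if any. */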
  protected def getReadBuilder(): ReadBuilder = {
    val readBuilder = table.newReadBuilder()
    projectedIndexes.foreach(readBuilder.withProjection)
    predicates.foreach(readBuilder.withFilter)
    readBuilder
  }
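
  /** Builds the Scan handed back to Spark, backed by the configured ReadBuilder. */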
  override def build(): Scan = {
    new PaimonScan(table, getReadBuilder())
  }
  /**
   * Pushes down filters, and returns filters that need to be evaluated after scanning. Rows
   * should be returned from the data source if and only if all of the filters match. That is,
   * filters must be interpreted as ANDed together.
   */
  override def pushFilters(filters: Array[Filter]): Array[Filter] = {
    val pushable = mutable.ArrayBuffer.empty[Filter]
    val postScan = mutable.ArrayBuffer.empty[Filter]
    val predicates = mutable.ArrayBuffer.empty[Predicate]

    val converter = new SparkFilterConverter(table.rowType)
    val visitor = new PartitionPredicateVisitor(table.partitionKeys())
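    // For each Spark filter: if it converts to a Paimon predicate it is pushed down; filters
    // that reference non-partition columns must still be re-evaluated by Spark after the scan,
    // and filters that cannot be converted are left to Spark entirely.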
    filters.foreach {
      filter =>
        try {
          val predicate = converter.convert(filter)
          pushable.append(filter)
          predicates.append(predicate)
          if (!predicate.visit(visitor)) postScan.append(filter)
        } catch {
          case e: UnsupportedOperationException =>
            logWarning(e.getMessage)
            postScan.append(filter)
        }
    }

    if (predicates.nonEmpty) {
      this.predicates = Some(PredicateBuilder.and(predicates: _*))
    }
    this.pushed = Some(pushable.toArray)
    postScan.toArray
  }
  /**
   * Returns the filters that are pushed to the data source via {@link #pushFilters(Filter[])}.
   *
   * There are 3 kinds of filters:
   *   1. pushable filters which don't need to be evaluated again after scanning;
   *   2. pushable filters which still need to be evaluated after scanning, e.g. a Parquet row
   *      group filter;
   *   3. non-pushable filters.
   *
   * Both case 1 and 2 should be considered as pushed filters and should be returned by this
   * method.
   *
   * It's possible that there are no filters in the query and {@link #pushFilters(Filter[])} is
   * never called; an empty array should be returned for this case.
   */
  override def pushedFilters(): Array[Filter] = {
    pushed.getOrElse(Array.empty)
  }
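
  /**
   * Records the positions of the required columns in the table's row type so that
   * getReadBuilder can apply the projection.
   */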
  override def pruneColumns(requiredSchema: StructType): Unit = {
    val pruneFields = requiredSchema.fieldNames
    val fieldNames = table.rowType.getFieldNames
    val projected = pruneFields.map(field => fieldNames.indexOf(field))
    this.projectedIndexes = Some(projected)
  }
}