/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql.sources.v2.reader;

import java.util.List;

import org.apache.spark.annotation.InterfaceStability;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.sources.v2.DataSourceOptions;
import org.apache.spark.sql.sources.v2.ReadSupport;
import org.apache.spark.sql.sources.v2.ReadSupportWithSchema;
import org.apache.spark.sql.types.StructType;

/**
* A data source reader that is returned by
* {@link ReadSupport#createReader(DataSourceOptions)} or
* {@link ReadSupportWithSchema#createReader(StructType, DataSourceOptions)}.
* It can mix in various query optimization interfaces to speed up the data scan. The actual scan
* logic is delegated to {@link DataReaderFactory}s that are returned by
* {@link #createDataReaderFactories()}.
*
* There are mainly 3 kinds of query optimizations:
* 1. Operator push-down. E.g., filter push-down, required columns push-down (aka column
* pruning), etc. Names of these interfaces start with `SupportsPushDown`. A sketch of one
* such mix-in follows this list.
* 2. Information reporting. E.g., statistics reporting, ordering reporting, etc.
* Names of these interfaces start with `SupportsReporting`.
* 3. Special scans. E.g., columnar scan, unsafe row scan, etc.
* Names of these interfaces start with `SupportsScan`. Note that a reader should implement
* at most one of the special scans. If more than one is implemented, only one of them is
* respected, according to the following priority list, from high to low:
* {@link SupportsScanColumnarBatch}, {@link SupportsScanUnsafeRow}.
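*
* For illustration, here is a minimal sketch of a reader that mixes in
* {@link SupportsPushDownRequiredColumns} for column pruning. The class and column names
* ({@code PruningReader}, {@code i}, {@code j}) are hypothetical, not part of this API:
*
* <pre>{@code
* class PruningReader implements DataSourceReader, SupportsPushDownRequiredColumns {
*   // Starts as the full schema of the underlying storage.
*   private StructType requiredSchema = new StructType().add("i", "int").add("j", "int");
*
*   public void pruneColumns(StructType requiredSchema) {
*     // Spark passes in only the columns the query actually needs.
*     this.requiredSchema = requiredSchema;
*   }
*
*   public StructType readSchema() {
*     // Report the pruned schema back to Spark.
*     return requiredSchema;
*   }
*
*   public List<DataReaderFactory<Row>> createDataReaderFactories() {
*     return java.util.Collections.emptyList(); // actual scan logic omitted in this sketch
*   }
* }
* }</pre>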
*
* If an exception is thrown when applying any of these query optimizations, the action will fail
* and no Spark job will be submitted.
*
* Spark first applies all operator push-down optimizations that this data source supports. Then
* Spark collects the information this data source reports for further optimizations. Finally,
* Spark issues the scan request and does the actual data reading.
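*
* A data source typically hands this reader to Spark via {@link ReadSupport}. A minimal sketch,
* where {@code MySource} is a hypothetical name and {@code PruningReader} is the sketch above:
*
* <pre>{@code
* class MySource implements DataSourceV2, ReadSupport {
*   public DataSourceReader createReader(DataSourceOptions options) {
*     // Spark interrogates the returned reader for its schema, optimizations, and scan tasks.
*     return new PruningReader();
*   }
* }
* }</pre>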
*/
@InterfaceStability.Evolving
public interface DataSourceReader {

/**
* Returns the actual schema of this data source reader, which may be different from the physical
* schema of the underlying storage, as column pruning or other optimizations may happen.
*
* If this method fails (by throwing an exception), the action will fail and no Spark job will be
* submitted.
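*
* For illustration (column names {@code i} and {@code j} are hypothetical), a reader over a
* two-column source whose scan has been pruned to column {@code i} would return:
*
* <pre>{@code
* public StructType readSchema() {
*   return new StructType().add("i", "int"); // "j" was pruned and will not be read
* }
* }</pre>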
*/
StructType readSchema();

/**
* Returns a list of reader factories. Each factory is responsible for creating a data reader to
* output data for one RDD partition. That means the number of factories returned here is the same
* as the number of RDD partitions this scan outputs.
*
* Note that this may not be a full scan if the data source reader mixes in other optimization
* interfaces like column pruning, filter push-down, etc. These optimizations are applied before
* Spark issues the scan request.
*
* If this method fails (by throwing an exception), the action will fail and no Spark job will be
* submitted.
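*
* For illustration, a sketch of a reader that returns two factories, yielding a scan with two
* RDD partitions. The names {@code RangeReader} and {@code RangeFactory} are hypothetical, and
* the factory doubles as its own {@link DataReader} for brevity:
*
* <pre>{@code
* class RangeReader implements DataSourceReader {
*   public StructType readSchema() {
*     return new StructType().add("i", "int");
*   }
*
*   public List<DataReaderFactory<Row>> createDataReaderFactories() {
*     // Two factories, therefore two RDD partitions: [0, 5) and [5, 10).
*     return java.util.Arrays.asList(new RangeFactory(0, 5), new RangeFactory(5, 10));
*   }
* }
*
* class RangeFactory implements DataReaderFactory<Row>, DataReader<Row> {
*   private int current;
*   private final int end;
*
*   RangeFactory(int start, int end) {
*     this.current = start - 1; // next() is called before the first get()
*     this.end = end;
*   }
*
*   public DataReader<Row> createDataReader() {
*     return new RangeFactory(current + 1, end);
*   }
*
*   public boolean next() {
*     current += 1;
*     return current < end;
*   }
*
*   public Row get() {
*     return org.apache.spark.sql.RowFactory.create(current);
*   }
*
*   public void close() { }
* }
* }</pre>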
*/
List<DataReaderFactory<Row>> createDataReaderFactories();
}