
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.sources

import scala.collection.mutable
import scala.util.Try

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{PathFilter, FileStatus, FileSystem, Path}
import org.apache.hadoop.mapred.{JobConf, FileInputFormat}
import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext}

import org.apache.spark.{Logging, SparkContext}
import org.apache.spark.annotation.{DeveloperApi, Experimental}
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateMutableProjection
import org.apache.spark.sql.execution.{FileRelation, RDDConversions}
import org.apache.spark.sql.execution.datasources.{PartitioningUtils, PartitionSpec, Partition}
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.sql._
import org.apache.spark.util.SerializableConfiguration

/**
 * ::DeveloperApi::
 * Data sources should implement this trait so that they can register an alias for their data
 * source. This allows users to specify the alias as the format type instead of the fully
 * qualified class name.
 *
 * A new instance of this class will be instantiated each time a DDL call is made.
 *
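 * For example, once a data source registers the alias "myformat" (an assumed name, purely
 * illustrative), users can refer to it by that alias instead of the full class name:
 *
 * {{{
 *   sqlContext.read.format("myformat").load("/path/to/data")
 * }}}
 *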
 * @since 1.5.0
 */
@DeveloperApi
trait DataSourceRegister {

  /**
   * The string that represents the format that this data source provider uses. This is
   * overridden by children to provide a nice alias for the data source. For example:
   *
   * {{{
   *   override def shortName(): String = "parquet"
   * }}}
   *
   * @since 1.5.0
   */
  def shortName(): String
}

/**
 * ::DeveloperApi::
 * Implemented by objects that produce relations for a specific kind of data source.  When
 * Spark SQL is given a DDL operation with a USING clause specified (to specify the implemented
 * RelationProvider), this interface is used to pass in the parameters specified by a user.
 *
 * Users may specify the fully qualified class name of a given data source.  When that class is
 * not found Spark SQL will append the class name `DefaultSource` to the path, allowing for
 * less verbose invocation.  For example, 'org.apache.spark.sql.json' would resolve to the
 * data source 'org.apache.spark.sql.json.DefaultSource'.
 *
 * A new instance of this class will be instantiated each time a DDL call is made.
 *
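 * A minimal sketch of an implementation might look like the following (`MyRelation` is a
 * hypothetical [[BaseRelation]] subclass, not part of Spark):
 *
 * {{{
 *   class DefaultSource extends RelationProvider {
 *     override def createRelation(
 *         sqlContext: SQLContext,
 *         parameters: Map[String, String]): BaseRelation = {
 *       val path = parameters.getOrElse("path", sys.error("'path' must be specified"))
 *       new MyRelation(path)(sqlContext)
 *     }
 *   }
 * }}}
 *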
 * @since 1.3.0
 */
@DeveloperApi
trait RelationProvider {
  /**
   * Returns a new base relation with the given parameters.
   * Note: the parameters' keywords are case insensitive and this insensitivity is enforced
   * by the Map that is passed to the function.
   */
  def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation
}

/**
 * ::DeveloperApi::
 * Implemented by objects that produce relations for a specific kind of data source
 * with a given schema.  When Spark SQL is given a DDL operation with a USING clause specified (
 * to specify the implemented SchemaRelationProvider) and a user defined schema, this interface
 * is used to pass in the parameters specified by a user.
 *
 * Users may specify the fully qualified class name of a given data source.  When that class is
 * not found Spark SQL will append the class name `DefaultSource` to the path, allowing for
 * less verbose invocation.  For example, 'org.apache.spark.sql.json' would resolve to the
 * data source 'org.apache.spark.sql.json.DefaultSource'.
 *
 * A new instance of this class will be instantiated each time a DDL call is made.
 *
 * The difference between a [[RelationProvider]] and a [[SchemaRelationProvider]] is that
 * users need to provide a schema when using a [[SchemaRelationProvider]].
 * A relation provider can inherit both [[RelationProvider]] and [[SchemaRelationProvider]]
 * if it can support both schema inference and user-specified schemas.
 *
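 * A rough sketch of a provider supporting both cases (`MyRelation` is a hypothetical relation
 * class that infers its own schema when none is given):
 *
 * {{{
 *   class DefaultSource extends RelationProvider with SchemaRelationProvider {
 *     override def createRelation(
 *         sqlContext: SQLContext,
 *         parameters: Map[String, String]): BaseRelation =
 *       createRelation(sqlContext, parameters, null)
 *
 *     override def createRelation(
 *         sqlContext: SQLContext,
 *         parameters: Map[String, String],
 *         schema: StructType): BaseRelation =
 *       new MyRelation(parameters("path"), Option(schema))(sqlContext)
 *   }
 * }}}
 *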
 * @since 1.3.0
 */
@DeveloperApi
trait SchemaRelationProvider {
  /**
   * Returns a new base relation with the given parameters and user defined schema.
   * Note: the parameters' keywords are case insensitive and this insensitivity is enforced
   * by the Map that is passed to the function.
   */
  def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String],
      schema: StructType): BaseRelation
}

/**
 * ::Experimental::
 * Implemented by objects that produce relations for a specific kind of data source
 * with a given schema and partitioned columns.  When Spark SQL is given a DDL operation with a
 * USING clause specified (to specify the implemented [[HadoopFsRelationProvider]]), a user defined
 * schema, and an optional list of partition columns, this interface is used to pass in the
 * parameters specified by a user.
 *
 * Users may specify the fully qualified class name of a given data source.  When that class is
 * not found Spark SQL will append the class name `DefaultSource` to the path, allowing for
 * less verbose invocation.  For example, 'org.apache.spark.sql.json' would resolve to the
 * data source 'org.apache.spark.sql.json.DefaultSource'.
 *
 * A new instance of this class will be instantiated each time a DDL call is made.
 *
 * The difference between a [[RelationProvider]] and a [[HadoopFsRelationProvider]] is
 * that users need to provide a schema and a (possibly empty) list of partition columns when
 * using a [[HadoopFsRelationProvider]]. A relation provider can inherit both [[RelationProvider]]
 * and [[HadoopFsRelationProvider]] if it can support schema inference, user-specified
 * schemas, and accessing partitioned relations.
 *
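 * An illustrative sketch of such a provider (`SimpleTextRelation` is a hypothetical subclass,
 * sketched under [[HadoopFsRelation]] below; partition columns are ignored for brevity):
 *
 * {{{
 *   class DefaultSource extends HadoopFsRelationProvider {
 *     override def createRelation(
 *         sqlContext: SQLContext,
 *         paths: Array[String],
 *         dataSchema: Option[StructType],
 *         partitionColumns: Option[StructType],
 *         parameters: Map[String, String]): HadoopFsRelation =
 *       new SimpleTextRelation(paths, dataSchema)(sqlContext)
 *   }
 * }}}
 *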
 * @since 1.4.0
 */
@Experimental
trait HadoopFsRelationProvider {
  /**
   * Returns a new base relation with the given parameters, a user defined schema, and a list of
   * partition columns. Note: the parameters' keywords are case insensitive and this insensitivity
   * is enforced by the Map that is passed to the function.
   *
   * @param dataSchema Schema of data columns (i.e., columns that are not partition columns).
   */
  def createRelation(
      sqlContext: SQLContext,
      paths: Array[String],
      dataSchema: Option[StructType],
      partitionColumns: Option[StructType],
      parameters: Map[String, String]): HadoopFsRelation
}

/**
 * @since 1.3.0
 */
@DeveloperApi
trait CreatableRelationProvider {
  /**
   * Creates a relation with the given parameters based on the contents of the given
   * DataFrame. The mode specifies the expected behavior of createRelation when
   * data already exists.
   * Right now, there are three modes: Append, Overwrite, and ErrorIfExists.
   * Append mode means that when saving a DataFrame to a data source, if data already exists,
   * the contents of the DataFrame are expected to be appended to the existing data.
   * Overwrite mode means that when saving a DataFrame to a data source, if data already exists,
   * the existing data is expected to be overwritten by the contents of the DataFrame.
   * ErrorIfExists mode means that when saving a DataFrame to a data source,
   * if data already exists, an exception is expected to be thrown.
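   *
   * An illustrative sketch of how a provider might honor these modes (`relationExists`,
   * `writeData`, and `MyRelation` are hypothetical helpers, not part of Spark):
   *
   * {{{
   *   override def createRelation(
   *       sqlContext: SQLContext,
   *       mode: SaveMode,
   *       parameters: Map[String, String],
   *       data: DataFrame): BaseRelation = {
   *     val path = parameters("path")
   *     val exists = relationExists(path)
   *     mode match {
   *       case SaveMode.ErrorIfExists if exists => sys.error(s"Data already exists at $path")
   *       case SaveMode.Overwrite               => writeData(data, path, overwrite = true)
   *       case _                                => writeData(data, path, overwrite = false)
   *     }
   *     new MyRelation(path)(sqlContext)
   *   }
   * }}}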
   *
   * @since 1.3.0
   */
  def createRelation(
      sqlContext: SQLContext,
      mode: SaveMode,
      parameters: Map[String, String],
      data: DataFrame): BaseRelation
}

/**
 * ::DeveloperApi::
 * Represents a collection of tuples with a known schema. Classes that extend BaseRelation must
 * be able to produce the schema of their data in the form of a [[StructType]]. Concrete
 * implementations should inherit from one of the descendant `Scan` traits, which define various
 * abstract methods for execution.
 *
 * BaseRelations must also define an equality function that only returns true when the two
 * instances will return the same data. This equality function is used when determining when
 * it is safe to substitute cached results for a given relation.
 *
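 * A minimal sketch of a concrete relation (assuming the usual `org.apache.spark.sql.types._`
 * imports; the in-memory `data` sequence is purely illustrative):
 *
 * {{{
 *   case class PeopleRelation(data: Seq[(String, Int)])(@transient val sqlContext: SQLContext)
 *     extends BaseRelation with TableScan {
 *
 *     override def schema: StructType = StructType(
 *       StructField("name", StringType) :: StructField("age", IntegerType) :: Nil)
 *
 *     override def buildScan(): RDD[Row] =
 *       sqlContext.sparkContext.parallelize(data).map { case (name, age) => Row(name, age) }
 *   }
 * }}}
 *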
 * @since 1.3.0
 */
@DeveloperApi
abstract class BaseRelation {
  def sqlContext: SQLContext
  def schema: StructType

  /**
   * Returns an estimated size of this relation in bytes. This information is used by the planner
   * to decide when it is safe to broadcast a relation and can be overridden by sources that
   * know the size ahead of time. By default, the system will assume that tables are too
   * large to broadcast. This method will be called multiple times during query planning
   * and thus should not perform expensive operations for each invocation.
   *
   * Note that it is always better to overestimate size than underestimate, because underestimation
   * could lead to execution plans that are suboptimal (i.e. broadcasting a very large table).
   *
   * @since 1.3.0
   */
  def sizeInBytes: Long = sqlContext.conf.defaultSizeInBytes

  /**
   * Whether this relation needs to convert the objects in [[Row]] to the internal representation,
   * for example:
   *  java.lang.String -> UTF8String
   *  java.math.BigDecimal -> Decimal
   *
   * If `needConversion` is `false`, buildScan() should return an [[RDD]] of [[InternalRow]].
   *
   * Note: The internal representation is not stable across releases and thus data sources outside
   * of Spark SQL should leave this as true.
   *
   * @since 1.4.0
   */
  def needConversion: Boolean = true

  /**
   * Returns the list of [[Filter]]s that this data source may not be able to handle.
   * These returned [[Filter]]s will be evaluated by Spark SQL after data is output by a scan.
   * By default, this function will return all filters, as it is always safe to
   * double evaluate a [[Filter]]. However, specific implementations can override this function to
   * avoid double filtering when they are capable of processing a filter internally.
   *
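   * For example, a source that fully evaluates equality predicates on its own might declare:
   *
   * {{{
   *   override def unhandledFilters(filters: Array[Filter]): Array[Filter] =
   *     filters.filterNot(_.isInstanceOf[EqualTo])
   * }}}
   *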
   * @since 1.6.0
   */
  def unhandledFilters(filters: Array[Filter]): Array[Filter] = filters
}

/**
 * ::DeveloperApi::
 * A BaseRelation that can produce all of its tuples as an RDD of Row objects.
 *
 * @since 1.3.0
 */
@DeveloperApi
trait TableScan {
  def buildScan(): RDD[Row]
}

/**
 * ::DeveloperApi::
 * A BaseRelation that can eliminate unneeded columns before producing an RDD
 * containing all of its tuples as Row objects.
 *
 * @since 1.3.0
 */
@DeveloperApi
trait PrunedScan {
  def buildScan(requiredColumns: Array[String]): RDD[Row]
}

/**
 * ::DeveloperApi::
 * A BaseRelation that can eliminate unneeded columns and filter using selected
 * predicates before producing an RDD containing all matching tuples as Row objects.
 *
 * The actual filter should be the conjunction of all `filters`,
 * i.e. they should be combined using logical AND.
 *
 * The pushed down filters are currently purely an optimization as they will all be evaluated
 * again.  This means it is safe to use them with methods that produce false positives such
 * as filtering partitions based on a bloom filter.
 *
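 * An illustrative way to turn pushed-down filters into a row-level predicate inside
 * `buildScan` (`compileFilter` is a hypothetical helper, not part of this API):
 *
 * {{{
 *   private def compileFilter(filter: Filter): Row => Boolean = filter match {
 *     case EqualTo(attribute, value)      => row => row.getAs[Any](attribute) == value
 *     case GreaterThan(attribute, v: Int) => row => row.getAs[Int](attribute) > v
 *     case _                              => _ => true  // unhandled filters are re-evaluated by Spark
 *   }
 * }}}
 *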
 * @since 1.3.0
 */
@DeveloperApi
trait PrunedFilteredScan {
  def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row]
}

/**
 * ::DeveloperApi::
 * A BaseRelation that can be used to insert data into it through the insert method.
 * If overwrite in insert method is true, the old data in the relation should be overwritten with
 * the new data. If overwrite in insert method is false, the new data should be appended.
 *
 * InsertableRelation has the following three assumptions.
 * 1. It assumes that the fields of the data (rows in the DataFrame) provided to the insert method
 * exactly match the ordinal positions of the fields in the schema of the BaseRelation.
 * 2. It assumes that the schema of this relation will not be changed.
 * Even if the insert method updates the schema (e.g. a relation of JSON or Parquet data may have a
 * schema update after an insert operation), the new schema will not be used.
 * 3. It assumes that fields of the data provided in the insert method are nullable.
 * If a data source needs to check the actual nullability of a field, it needs to do it in the
 * insert method.
 *
 * @since 1.3.0
 */
@DeveloperApi
trait InsertableRelation {
  def insert(data: DataFrame, overwrite: Boolean): Unit
}

/**
 * ::Experimental::
 * An interface for experimenting with a more direct connection to the query planner.  Compared to
 * [[PrunedFilteredScan]], this operator receives the raw expressions from the
 * [[org.apache.spark.sql.catalyst.plans.logical.LogicalPlan]].  Unlike the other APIs this
 * interface is NOT designed to be binary compatible across releases and thus should only be used
 * for experimentation.
 *
 * @since 1.3.0
 */
@Experimental
trait CatalystScan {
  def buildScan(requiredColumns: Seq[Attribute], filters: Seq[Expression]): RDD[Row]
}

/**
 * ::Experimental::
 * A factory that produces [[OutputWriter]]s.  A new [[OutputWriterFactory]] is created on driver
 * side for each write job issued when writing to a [[HadoopFsRelation]], and then gets serialized
 * to executor side to create actual [[OutputWriter]]s on the fly.
 *
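 * An illustrative sketch of a factory producing plain-text writers (error handling and output
 * committing are omitted; a real implementation would typically go through an `OutputFormat`
 * and its committer):
 *
 * {{{
 *   class SimpleTextOutputWriterFactory extends OutputWriterFactory {
 *     override def newInstance(
 *         path: String,
 *         dataSchema: StructType,
 *         context: TaskAttemptContext): OutputWriter = new OutputWriter {
 *       private val stream =
 *         new Path(path).getFileSystem(context.getConfiguration).create(new Path(path))
 *
 *       override def write(row: Row): Unit = stream.writeBytes(row.toSeq.mkString(",") + "\n")
 *
 *       override def close(): Unit = stream.close()
 *     }
 *   }
 * }}}
 *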
 * @since 1.4.0
 */
@Experimental
abstract class OutputWriterFactory extends Serializable {
  /**
   * When writing to a [[HadoopFsRelation]], this method gets called by each task on executor side
   * to instantiate new [[OutputWriter]]s.
   *
   * @param path Path of the file to which this [[OutputWriter]] is supposed to write.  Note that
   *        this may not point to the final output file.  For example, `FileOutputFormat` writes to
   *        temporary directories and then merges written files back to the final destination.  In
   *        this case, `path` points to a temporary output file under the temporary directory.
   * @param dataSchema Schema of the rows to be written. Partition columns are not included in the
   *        schema if the relation being written is partitioned.
   * @param context The Hadoop MapReduce task context.
   *
   * @since 1.4.0
   */
  def newInstance(path: String, dataSchema: StructType, context: TaskAttemptContext): OutputWriter
}

/**
 * ::Experimental::
 * [[OutputWriter]] is used together with [[HadoopFsRelation]] for persisting rows to the
 * underlying file system.  Subclasses of [[OutputWriter]] must provide a zero-argument constructor.
 * An [[OutputWriter]] instance is created and initialized when a new output file is opened on
 * executor side.  This instance is used to persist rows to this single output file.
 *
 * @since 1.4.0
 */
@Experimental
abstract class OutputWriter {
  /**
   * Persists a single row.  Invoked on the executor side.  When writing to dynamically partitioned
   * tables, dynamic partition columns are not included in rows to be written.
   *
   * @since 1.4.0
   */
  def write(row: Row): Unit

  /**
   * Closes the [[OutputWriter]]. Invoked on the executor side after all rows are persisted, before
   * the task output is committed.
   *
   * @since 1.4.0
   */
  def close(): Unit

  private var converter: InternalRow => Row = _

  protected[sql] def initConverter(dataSchema: StructType) = {
    converter =
      CatalystTypeConverters.createToScalaConverter(dataSchema).asInstanceOf[InternalRow => Row]
  }

  protected[sql] def writeInternal(row: InternalRow): Unit = {
    write(converter(row))
  }
}

/**
 * ::Experimental::
 * A [[BaseRelation]] that provides much of the common code required for relations that store their
 * data to an HDFS compatible filesystem.
 *
 * For the read path, similar to [[PrunedFilteredScan]], it can eliminate unneeded columns and
 * filter using selected predicates before producing an RDD containing all matching tuples as
 * [[Row]] objects. In addition, when reading from Hive style partitioned tables stored in file
 * systems, it's able to discover partitioning information from the paths of input directories, and
 * perform partition pruning before starting to read the data. Subclasses of [[HadoopFsRelation]]
 * must override at least one of the four `buildScan` methods to implement the read path.
 *
 * For the write path, it provides the ability to write to both non-partitioned and partitioned
 * tables.  Directory layout of the partitioned tables is compatible with Hive.
 *
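 * A skeletal subclass might look like the following (all names are illustrative; a real
 * implementation would parse and write its own file format):
 *
 * {{{
 *   class SimpleTextRelation(
 *       override val paths: Array[String],
 *       maybeDataSchema: Option[StructType])(
 *       @transient val sqlContext: SQLContext)
 *     extends HadoopFsRelation {
 *
 *     override def dataSchema: StructType =
 *       maybeDataSchema.getOrElse(StructType(StructField("value", StringType) :: Nil))
 *
 *     override def buildScan(inputFiles: Array[FileStatus]): RDD[Row] =
 *       sqlContext.sparkContext
 *         .textFile(inputFiles.map(_.getPath.toString).mkString(","))
 *         .map(Row(_))
 *
 *     override def prepareJobForWrite(job: Job): OutputWriterFactory =
 *       new SimpleTextOutputWriterFactory  // the illustrative factory sketched above
 *   }
 * }}}
 *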
 * @constructor This constructor is for internal uses only. The [[PartitionSpec]] argument is for
 *              implementing metastore table conversion.
 *
 * @param maybePartitionSpec A [[HadoopFsRelation]] can be created with an optional
 *        [[PartitionSpec]], so that partition discovery can be skipped.
 *
 * @since 1.4.0
 */
@Experimental
abstract class HadoopFsRelation private[sql](
    maybePartitionSpec: Option[PartitionSpec],
    parameters: Map[String, String])
  extends BaseRelation with FileRelation with Logging {

  override def toString: String = getClass.getSimpleName

  def this() = this(None, Map.empty[String, String])

  def this(parameters: Map[String, String]) = this(None, parameters)

  private[sql] def this(maybePartitionSpec: Option[PartitionSpec]) =
    this(maybePartitionSpec, Map.empty[String, String])

  private val hadoopConf = new Configuration(sqlContext.sparkContext.hadoopConfiguration)

  private var _partitionSpec: PartitionSpec = _

  private class FileStatusCache {
    var leafFiles = mutable.LinkedHashMap.empty[Path, FileStatus]

    var leafDirToChildrenFiles = mutable.Map.empty[Path, Array[FileStatus]]

    private def listLeafFiles(paths: Array[String]): mutable.LinkedHashSet[FileStatus] = {
      if (paths.length >= sqlContext.conf.parallelPartitionDiscoveryThreshold) {
        HadoopFsRelation.listLeafFilesInParallel(paths, hadoopConf, sqlContext.sparkContext)
      } else {
        val statuses = paths.flatMap { path =>
          val hdfsPath = new Path(path)
          val fs = hdfsPath.getFileSystem(hadoopConf)
          val qualified = hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory)
          logInfo(s"Listing $qualified on driver")
          // Dummy jobconf to get to the pathFilter defined in configuration
          val jobConf = new JobConf(hadoopConf, this.getClass())
          val pathFilter = FileInputFormat.getInputPathFilter(jobConf)
          if (pathFilter != null) {
            Try(fs.listStatus(qualified, pathFilter)).getOrElse(Array.empty)
          } else {
            Try(fs.listStatus(qualified)).getOrElse(Array.empty)
          }
        }.filterNot { status =>
          val name = status.getPath.getName
          name.toLowerCase == "_temporary" || name.startsWith(".")
        }

        val (dirs, files) = statuses.partition(_.isDir)

        // It uses [[LinkedHashSet]] since the order of files can affect the results. (SPARK-11500)
        if (dirs.isEmpty) {
          mutable.LinkedHashSet(files: _*)
        } else {
          mutable.LinkedHashSet(files: _*) ++ listLeafFiles(dirs.map(_.getPath.toString))
        }
      }
    }

    def refresh(): Unit = {
      val files = listLeafFiles(paths)

      leafFiles.clear()
      leafDirToChildrenFiles.clear()

      leafFiles ++= files.map(f => f.getPath -> f)
      leafDirToChildrenFiles ++= files.toArray.groupBy(_.getPath.getParent)
    }
  }

  private lazy val fileStatusCache = {
    val cache = new FileStatusCache
    cache.refresh()
    cache
  }

  protected def cachedLeafStatuses(): mutable.LinkedHashSet[FileStatus] = {
    mutable.LinkedHashSet(fileStatusCache.leafFiles.values.toArray: _*)
  }

  final private[sql] def partitionSpec: PartitionSpec = {
    if (_partitionSpec == null) {
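      // Resolution happens in three steps:
      //   1. If a non-empty spec was passed in (e.g. for metastore table conversion), reuse it
      //      with its partition columns marked nullable.
      //   2. Otherwise, if the user defined partition columns, discover the partition values and
      //      cast them to the user-specified types.
      //   3. Otherwise, run full partition discovery, or use an empty spec if discovery is
      //      disabled.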
      _partitionSpec = maybePartitionSpec
        .flatMap {
          case spec if spec.partitions.nonEmpty =>
            Some(spec.copy(partitionColumns = spec.partitionColumns.asNullable))
          case _ =>
            None
        }
        .orElse {
          // We only know the partition columns and their data types. We need to discover
          // partition values.
          userDefinedPartitionColumns.map { partitionSchema =>
            val spec = discoverPartitions()
            val partitionColumnTypes = spec.partitionColumns.map(_.dataType)
            val castedPartitions = spec.partitions.map { case p @ Partition(values, path) =>
              val literals = partitionColumnTypes.zipWithIndex.map { case (dt, i) =>
                Literal.create(values.get(i, dt), dt)
              }
              val castedValues = partitionSchema.zip(literals).map { case (field, literal) =>
                Cast(literal, field.dataType).eval()
              }
              p.copy(values = InternalRow.fromSeq(castedValues))
            }
            PartitionSpec(partitionSchema, castedPartitions)
          }
        }
        .getOrElse {
          if (sqlContext.conf.partitionDiscoveryEnabled()) {
            discoverPartitions()
          } else {
            PartitionSpec(StructType(Nil), Array.empty[Partition])
          }
        }
    }
    _partitionSpec
  }

  /**
   * Paths of this relation.  For partitioned relations, it should be root directories
   * of all partition directories.
   *
   * @since 1.4.0
   */
  def paths: Array[String]

  /**
   * Contains the set of paths that are considered the base directories of the input datasets.
   * The partition discovery logic will stop when it reaches any base path.
   * By default, the paths of the dataset provided by users are used as the base paths.
   * For example, if a user uses `sqlContext.read.parquet("/path/something=true/")`, the base path
   * will be `/path/something=true/`, and the returned DataFrame will not contain a column of
   * `something`. If users want to override the base path, they can set `basePath` in the options
   * passed to the data source.
   * For the above example, if the user-provided base path is `/path/`, the returned
   * DataFrame will have the column of `something`.
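   *
   * For example (illustrative calls only):
   *
   * {{{
   *   // no `something` column is produced, because the base path defaults to the input path
   *   sqlContext.read.parquet("/path/something=true/")
   *
   *   // keep `something` as a partition column by overriding the base path
   *   sqlContext.read.option("basePath", "/path/").parquet("/path/something=true/")
   * }}}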
   */
  private def basePaths: Set[Path] = {
    val userDefinedBasePath = parameters.get("basePath").map(basePath => Set(new Path(basePath)))
    userDefinedBasePath.getOrElse {
      // If the user does not provide basePath, we will just use paths.
      val pathSet = paths.toSet
      pathSet.map(p => new Path(p))
    }.map { hdfsPath =>
      // Make the path qualified (consistent with listLeafFiles and listLeafFilesInParallel).
      val fs = hdfsPath.getFileSystem(hadoopConf)
      hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory)
    }
  }

  override def inputFiles: Array[String] = cachedLeafStatuses().map(_.getPath.toString).toArray

  override def sizeInBytes: Long = cachedLeafStatuses().map(_.getLen).sum

  /**
   * Partition columns.  Can be either defined by [[userDefinedPartitionColumns]] or automatically
   * discovered.  Note that they should always be nullable.
   *
   * @since 1.4.0
   */
  final def partitionColumns: StructType =
    userDefinedPartitionColumns.getOrElse(partitionSpec.partitionColumns)

  /**
   * Optional user defined partition columns.
   *
   * @since 1.4.0
   */
  def userDefinedPartitionColumns: Option[StructType] = None

  private[sql] def refresh(): Unit = {
    fileStatusCache.refresh()
    if (sqlContext.conf.partitionDiscoveryEnabled()) {
      _partitionSpec = discoverPartitions()
    }
  }

  private def discoverPartitions(): PartitionSpec = {
    // We use leaf dirs containing data files to discover the schema.
    val leafDirs = fileStatusCache.leafDirToChildrenFiles.keys.toSeq
    userDefinedPartitionColumns match {
      case Some(userProvidedSchema) if userProvidedSchema.nonEmpty =>
        val spec = PartitioningUtils.parsePartitions(
          leafDirs,
          PartitioningUtils.DEFAULT_PARTITION_NAME,
          typeInference = false,
          basePaths = basePaths)

        // Without auto inference, all values in the `row` should be null or of StringType,
        // so we need to cast them to the data types that the user specified.
        def castPartitionValuesToUserSchema(row: InternalRow) = {
          InternalRow((0 until row.numFields).map { i =>
            Cast(
              Literal.create(row.getUTF8String(i), StringType),
              userProvidedSchema.fields(i).dataType).eval()
          }: _*)
        }

        PartitionSpec(userProvidedSchema, spec.partitions.map { part =>
          part.copy(values = castPartitionValuesToUserSchema(part.values))
        })

      case _ =>
        // user did not provide a partitioning schema
        PartitioningUtils.parsePartitions(
          leafDirs,
          PartitioningUtils.DEFAULT_PARTITION_NAME,
          typeInference = sqlContext.conf.partitionColumnTypeInferenceEnabled(),
          basePaths = basePaths)
    }
  }

  /**
   * Schema of this relation.  It consists of columns appearing in [[dataSchema]] and all partition
   * columns not appearing in [[dataSchema]].
   *
   * @since 1.4.0
   */
  override lazy val schema: StructType = {
    val dataSchemaColumnNames = dataSchema.map(_.name.toLowerCase).toSet
    StructType(dataSchema ++ partitionColumns.filterNot { column =>
      dataSchemaColumnNames.contains(column.name.toLowerCase)
    })
  }

  final private[sql] def buildInternalScan(
      requiredColumns: Array[String],
      filters: Array[Filter],
      inputPaths: Array[String],
      broadcastedConf: Broadcast[SerializableConfiguration]): RDD[InternalRow] = {
    val inputStatuses = inputPaths.flatMap { input =>
      val path = new Path(input)

      // First assumes `input` is a directory path, and tries to get all files contained in it.
      fileStatusCache.leafDirToChildrenFiles.getOrElse(
        path,
        // Otherwise, `input` might be a file path
        fileStatusCache.leafFiles.get(path).toArray
      ).filter { status =>
        val name = status.getPath.getName
        !name.startsWith("_") && !name.startsWith(".")
      }
    }

    buildInternalScan(requiredColumns, filters, inputStatuses, broadcastedConf)
  }

  /**
   * Specifies the schema of the actual data files.  For partitioned relations, if one or more partition
   * columns are contained in the data files, they should also appear in `dataSchema`.
   *
   * @since 1.4.0
   */
  def dataSchema: StructType

  /**
   * For a non-partitioned relation, this method builds an `RDD[Row]` containing all rows within
   * this relation. For partitioned relations, this method is called for each selected partition,
   * and builds an `RDD[Row]` containing all rows within that single partition.
   *
   * @param inputFiles For a non-partitioned relation, it contains paths of all data files in the
   *        relation. For a partitioned relation, it contains paths of all data files in a single
   *        selected partition.
   *
   * @since 1.4.0
   */
  def buildScan(inputFiles: Array[FileStatus]): RDD[Row] = {
    throw new UnsupportedOperationException(
      "At least one buildScan() method should be overridden to read the relation.")
  }

  /**
   * For a non-partitioned relation, this method builds an `RDD[Row]` containing all rows within
   * this relation. For partitioned relations, this method is called for each selected partition,
   * and builds an `RDD[Row]` containing all rows within that single partition.
   *
   * @param requiredColumns Required columns.
   * @param inputFiles For a non-partitioned relation, it contains paths of all data files in the
   *        relation. For a partitioned relation, it contains paths of all data files in a single
   *        selected partition.
   *
   * @since 1.4.0
   */
  // TODO: Try to eliminate the extra Catalyst-to-Scala conversion when `needConversion` is true
  //
  // PR #7626 separated `Row` and `InternalRow` completely.  One of the consequences is that we can
  // no longer treat an `InternalRow` containing Catalyst values as a `Row`.  Thus we have to
  // introduce another row value conversion for data sources whose `needConversion` is true.
  def buildScan(requiredColumns: Array[String], inputFiles: Array[FileStatus]): RDD[Row] = {
    // Local copies to avoid serializing the whole relation into the closures below.
    val dataSchema = this.dataSchema
    val needConversion = this.needConversion

    val requiredOutput = requiredColumns.map { col =>
      val field = dataSchema(col)
      BoundReference(dataSchema.fieldIndex(col), field.dataType, field.nullable)
    }.toSeq

    val rdd: RDD[Row] = buildScan(inputFiles)
    val converted: RDD[InternalRow] =
      if (needConversion) {
        RDDConversions.rowToRowRdd(rdd, dataSchema.fields.map(_.dataType))
      } else {
        rdd.asInstanceOf[RDD[InternalRow]]
      }

    converted.mapPartitions { rows =>
      val buildProjection =
        GenerateMutableProjection.generate(requiredOutput, dataSchema.toAttributes)

      val projectedRows = {
        val mutableProjection = buildProjection()
        rows.map(r => mutableProjection(r))
      }

      if (needConversion) {
        val requiredSchema = StructType(requiredColumns.map(dataSchema(_)))
        val toScala = CatalystTypeConverters.createToScalaConverter(requiredSchema)
        projectedRows.map(toScala(_).asInstanceOf[Row])
      } else {
        projectedRows
      }
    }.asInstanceOf[RDD[Row]]
  }

  /**
   * For a non-partitioned relation, this method builds an `RDD[Row]` containing all rows within
   * this relation. For partitioned relations, this method is called for each selected partition,
   * and builds an `RDD[Row]` containing all rows within that single partition.
   *
   * @param requiredColumns Required columns.
   * @param filters Candidate filters to be pushed down. The actual filter should be the conjunction
   *        of all `filters`.  The pushed down filters are currently purely an optimization as they
   *        will all be evaluated again. This means it is safe to use them with methods that produce
   *        false positives such as filtering partitions based on a bloom filter.
   * @param inputFiles For a non-partitioned relation, it contains paths of all data files in the
   *        relation. For a partitioned relation, it contains paths of all data files in a single
   *        selected partition.
   *
   * @since 1.4.0
   */
  def buildScan(
      requiredColumns: Array[String],
      filters: Array[Filter],
      inputFiles: Array[FileStatus]): RDD[Row] = {
    buildScan(requiredColumns, inputFiles)
  }

  /**
   * For a non-partitioned relation, this method builds an `RDD[Row]` containing all rows within
   * this relation. For partitioned relations, this method is called for each selected partition,
   * and builds an `RDD[Row]` containing all rows within that single partition.
   *
   * Note: This interface is subject to change in the future.
   *
   * @param requiredColumns Required columns.
   * @param filters Candidate filters to be pushed down. The actual filter should be the conjunction
   *        of all `filters`.  The pushed down filters are currently purely an optimization as they
   *        will all be evaluated again. This means it is safe to use them with methods that produce
   *        false positives such as filtering partitions based on a bloom filter.
   * @param inputFiles For a non-partitioned relation, it contains paths of all data files in the
   *        relation. For a partitioned relation, it contains paths of all data files in a single
   *        selected partition.
   * @param broadcastedConf A shared broadcast Hadoop Configuration, which can be used to reduce the
   *                        overhead of broadcasting the Configuration for every Hadoop RDD.
   *
   * @since 1.4.0
   */
  private[sql] def buildScan(
      requiredColumns: Array[String],
      filters: Array[Filter],
      inputFiles: Array[FileStatus],
      broadcastedConf: Broadcast[SerializableConfiguration]): RDD[Row] = {
    buildScan(requiredColumns, filters, inputFiles)
  }

  /**
   * For a non-partitioned relation, this method builds an `RDD[InternalRow]` containing all rows
   * within this relation. For partitioned relations, this method is called for each selected
   * partition, and builds an `RDD[InternalRow]` containing all rows within that single partition.
   *
   * Note:
   *
   * 1. Rows contained in the returned `RDD[InternalRow]` are assumed to be `UnsafeRow`s.
   * 2. This interface is subject to change in the future.
   *
   * @param requiredColumns Required columns.
   * @param filters Candidate filters to be pushed down. The actual filter should be the conjunction
   *        of all `filters`.  The pushed down filters are currently purely an optimization as they
   *        will all be evaluated again. This means it is safe to use them with methods that produce
   *        false positives such as filtering partitions based on a bloom filter.
   * @param inputFiles For a non-partitioned relation, it contains paths of all data files in the
   *        relation. For a partitioned relation, it contains paths of all data files in a single
   *        selected partition.
   * @param broadcastedConf A shared broadcast Hadoop Configuration, which can be used to reduce the
   *        overhead of broadcasting the Configuration for every Hadoop RDD.
   */
  private[sql] def buildInternalScan(
      requiredColumns: Array[String],
      filters: Array[Filter],
      inputFiles: Array[FileStatus],
      broadcastedConf: Broadcast[SerializableConfiguration]): RDD[InternalRow] = {
    val requiredSchema = StructType(requiredColumns.map(dataSchema.apply))
    val internalRows = {
      val externalRows = buildScan(requiredColumns, filters, inputFiles, broadcastedConf)
      execution.RDDConversions.rowToRowRdd(externalRows, requiredSchema.map(_.dataType))
    }

    internalRows.mapPartitions { iterator =>
      val unsafeProjection = UnsafeProjection.create(requiredSchema)
      iterator.map(unsafeProjection)
    }
  }

  /**
   * Prepares a write job and returns an [[OutputWriterFactory]].  Client-side job preparation can
   * be put here.  For example, a user-defined output committer can be configured here
   * by setting the output committer class via the `spark.sql.sources.outputCommitterClass`
   * configuration key.
   *
   * Note that the only side effect expected here is mutating `job` via its setters.  In particular,
   * since Spark SQL caches [[BaseRelation]] instances for performance, mutating a relation's
   * internal state may cause unexpected behavior.
   *
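   * An illustrative sketch (`SimpleTextOutputWriterFactory` is the hypothetical factory from the
   * [[OutputWriterFactory]] example; the committer class shown here is just a placeholder):
   *
   * {{{
   *   override def prepareJobForWrite(job: Job): OutputWriterFactory = {
   *     job.getConfiguration.set(
   *       "spark.sql.sources.outputCommitterClass",
   *       classOf[org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter].getName)
   *     new SimpleTextOutputWriterFactory
   *   }
   * }}}
   *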
   * @since 1.4.0
   */
  def prepareJobForWrite(job: Job): OutputWriterFactory
}

private[sql] object HadoopFsRelation extends Logging {
  // We don't filter out files/directories whose names start with "_", except "_temporary", as
  // specific data sources may take advantage of them (e.g. Parquet _metadata and
  // _common_metadata files). "_temporary" directories are explicitly ignored since failed
  // tasks/jobs may leave partial/corrupted data files there.  Files and directories whose names
  // start with "." are also ignored.
  def listLeafFiles(fs: FileSystem, status: FileStatus): Array[FileStatus] = {
    logInfo(s"Listing ${status.getPath}")
    val name = status.getPath.getName.toLowerCase
    if (name == "_temporary" || name.startsWith(".")) {
      Array.empty
    } else {
      // Dummy jobconf to get to the pathFilter defined in configuration
      val jobConf = new JobConf(fs.getConf, this.getClass())
      val pathFilter = FileInputFormat.getInputPathFilter(jobConf)
      if (pathFilter != null) {
        val (dirs, files) = fs.listStatus(status.getPath, pathFilter).partition(_.isDir)
        files ++ dirs.flatMap(dir => listLeafFiles(fs, dir))
      } else {
        val (dirs, files) = fs.listStatus(status.getPath).partition(_.isDir)
        files ++ dirs.flatMap(dir => listLeafFiles(fs, dir))
      }
    }
  }

  // `FileStatus` is Writable but not serializable.  What makes it worse, somehow it doesn't play
  // well with `SerializableWritable`.  So there seems to be no way to serialize a `FileStatus`.
  // Here we use `FakeFileStatus` to extract the key components of a `FileStatus`, serialize them
  // on the executor side, and reconstruct the `FileStatus` on the driver side.
  case class FakeFileStatus(
      path: String,
      length: Long,
      isDir: Boolean,
      blockReplication: Short,
      blockSize: Long,
      modificationTime: Long,
      accessTime: Long)

  def listLeafFilesInParallel(
      paths: Array[String],
      hadoopConf: Configuration,
      sparkContext: SparkContext): mutable.LinkedHashSet[FileStatus] = {
    logInfo(s"Listing leaf files and directories in parallel under: ${paths.mkString(", ")}")

    val serializableConfiguration = new SerializableConfiguration(hadoopConf)
    val fakeStatuses = sparkContext.parallelize(paths).flatMap { path =>
      val hdfsPath = new Path(path)
      val fs = hdfsPath.getFileSystem(serializableConfiguration.value)
      val qualified = hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory)
      Try(listLeafFiles(fs, fs.getFileStatus(qualified))).getOrElse(Array.empty)
    }.map { status =>
      FakeFileStatus(
        status.getPath.toString,
        status.getLen,
        status.isDir,
        status.getReplication,
        status.getBlockSize,
        status.getModificationTime,
        status.getAccessTime)
    }.collect()

    val hadoopFakeStatuses = fakeStatuses.map { f =>
      new FileStatus(
        f.length, f.isDir, f.blockReplication, f.blockSize, f.modificationTime, new Path(f.path))
    }
    mutable.LinkedHashSet(hadoopFakeStatuses: _*)
  }
}



