/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql.execution.datasources

import scala.collection.mutable

import org.apache.hadoop.fs.{FileStatus, Path}

import org.apache.spark.internal.Logging
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.{expressions, InternalRow}
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.types.{StringType, StructType}
/**
* An abstract class that represents [[FileCatalog]]s that are aware of partitioned tables.
* It provides the necessary methods to parse partition data based on a set of files.
*
 * @param parameters a set of options to control partition discovery
 * @param partitionSchema an optional partition schema that will be used to provide types for
 *                        the discovered partitions
*/
abstract class PartitioningAwareFileCatalog(
sparkSession: SparkSession,
parameters: Map[String, String],
partitionSchema: Option[StructType])
extends FileCatalog with Logging {
protected val hadoopConf = sparkSession.sessionState.newHadoopConfWithOptions(parameters)
  /** All leaf files discovered under the input paths, keyed by file path. */
  protected def leafFiles: mutable.LinkedHashMap[Path, FileStatus]

  /** Leaf directories that directly contain files, mapped to those child files. */
  protected def leafDirToChildrenFiles: Map[Path, Array[FileStatus]]
override def listFiles(filters: Seq[Expression]): Seq[Partition] = {
val selectedPartitions = if (partitionSpec().partitionColumns.isEmpty) {
Partition(InternalRow.empty, allFiles().filter(f => isDataPath(f.getPath))) :: Nil
} else {
prunePartitions(filters, partitionSpec()).map {
case PartitionDirectory(values, path) =>
val files: Seq[FileStatus] = leafDirToChildrenFiles.get(path) match {
case Some(existingDir) =>
// Directory has children files in it, return them
existingDir.filter(f => isDataPath(f.getPath))
case None =>
// Directory does not exist, or has no children files
Nil
}
Partition(values, files)
}
}
logTrace("Selected files after partition pruning:\n\t" + selectedPartitions.mkString("\n\t"))
selectedPartitions
}
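
  // Illustrative sketch (paths and filter column assumed for illustration): given a layout
  //   /table/year=2015/part-0.parquet
  //   /table/year=2016/part-0.parquet
  // a pushed-down filter equivalent to `year = 2016` prunes the listing to a single
  // Partition whose `values` row holds [2016] and whose files are the data files found
  // under the `year=2016` directory via leafDirToChildrenFiles.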
override def allFiles(): Seq[FileStatus] = {
if (partitionSpec().partitionColumns.isEmpty) {
// For each of the input paths, get the list of files inside them
paths.flatMap { path =>
// Make the path qualified (consistent with listLeafFiles and listLeafFilesInParallel).
val fs = path.getFileSystem(hadoopConf)
val qualifiedPathPre = fs.makeQualified(path)
val qualifiedPath: Path = if (qualifiedPathPre.isRoot && !qualifiedPathPre.isAbsolute) {
// SPARK-17613: Always append `Path.SEPARATOR` to the end of parent directories,
// because the `leafFile.getParent` would have returned an absolute path with the
// separator at the end.
new Path(qualifiedPathPre, Path.SEPARATOR)
} else {
qualifiedPathPre
}
// There are three cases possible with each path
// 1. The path is a directory and has children files in it. Then it must be present in
// leafDirToChildrenFiles as those children files will have been found as leaf files.
// Find its children files from leafDirToChildrenFiles and include them.
// 2. The path is a file, then it will be present in leafFiles. Include this path.
// 3. The path is a directory, but has no children files. Do not include this path.
leafDirToChildrenFiles.get(qualifiedPath)
.orElse { leafFiles.get(qualifiedPath).map(Array(_)) }
.getOrElse(Array.empty)
}
} else {
leafFiles.values.toSeq
}
}
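
  // Illustrative sketch (paths assumed for illustration): for input paths
  // Seq("/data/dir", "/data/file.json"), "/data/dir" resolves through
  // leafDirToChildrenFiles (case 1), "/data/file.json" through leafFiles (case 2),
  // and an empty directory contributes no files at all (case 3).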
protected def inferPartitioning(): PartitionSpec = {
// We use leaf dirs containing data files to discover the schema.
val leafDirs = leafDirToChildrenFiles.filter { case (_, files) =>
      // SPARK-15895: Metadata files (e.g. Parquet summary files) and temporary files should
      // not be counted as data files, so they should not participate in partition discovery.
files.exists(f => isDataPath(f.getPath))
}.keys.toSeq
partitionSchema match {
case Some(userProvidedSchema) if userProvidedSchema.nonEmpty =>
val spec = PartitioningUtils.parsePartitions(
leafDirs,
PartitioningUtils.DEFAULT_PARTITION_NAME,
typeInference = false,
basePaths = basePaths)
      // Without auto inference, all values in the `row` should be null or of StringType,
      // so we need to cast them to the data types the user specified.
def castPartitionValuesToUserSchema(row: InternalRow) = {
InternalRow((0 until row.numFields).map { i =>
Cast(
Literal.create(row.getUTF8String(i), StringType),
userProvidedSchema.fields(i).dataType).eval()
}: _*)
}
PartitionSpec(userProvidedSchema, spec.partitions.map { part =>
part.copy(values = castPartitionValuesToUserSchema(part.values))
})
case _ =>
PartitioningUtils.parsePartitions(
leafDirs,
PartitioningUtils.DEFAULT_PARTITION_NAME,
typeInference = sparkSession.sessionState.conf.partitionColumnTypeInferenceEnabled(),
basePaths = basePaths)
}
}
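
  // Illustrative sketch (directory layout assumed for illustration): for leaf directories
  //   /table/a=1/b=hello  and  /table/a=2/b=world
  // type inference yields partition columns (a: IntegerType, b: StringType). If a
  // user-provided partition schema instead declares `a` as StringType, parsing runs with
  // typeInference = false and the raw string values are cast to the user's types by
  // castPartitionValuesToUserSchema above.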
private def prunePartitions(
predicates: Seq[Expression],
partitionSpec: PartitionSpec): Seq[PartitionDirectory] = {
val PartitionSpec(partitionColumns, partitions) = partitionSpec
val partitionColumnNames = partitionColumns.map(_.name).toSet
val partitionPruningPredicates = predicates.filter {
_.references.map(_.name).toSet.subsetOf(partitionColumnNames)
}
if (partitionPruningPredicates.nonEmpty) {
val predicate = partitionPruningPredicates.reduce(expressions.And)
val boundPredicate = InterpretedPredicate.create(predicate.transform {
case a: AttributeReference =>
val index = partitionColumns.indexWhere(a.name == _.name)
BoundReference(index, partitionColumns(index).dataType, nullable = true)
})
val selected = partitions.filter {
case PartitionDirectory(values, _) => boundPredicate(values)
}
logInfo {
val total = partitions.length
val selectedSize = selected.length
val percentPruned = (1 - selectedSize.toDouble / total.toDouble) * 100
s"Selected $selectedSize partitions out of $total, pruned $percentPruned% partitions."
}
selected
} else {
partitions
}
}
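
  // Illustrative sketch (partition columns assumed for illustration): with partition
  // columns (year: Int, month: Int) and the predicate `year = 2016 AND month > 6`, the
  // attribute references are rewritten to BoundReference(0, IntegerType, nullable = true)
  // and BoundReference(1, IntegerType, nullable = true), and the interpreted predicate is
  // then evaluated against each partition's `values` row.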
/**
* Contains a set of paths that are considered as the base dirs of the input datasets.
* The partitioning discovery logic will make sure it will stop when it reaches any
* base path.
*
* By default, the paths of the dataset provided by users will be base paths.
 * Below are three typical examples:
* Case 1) `spark.read.parquet("/path/something=true/")`: the base path will be
* `/path/something=true/`, and the returned DataFrame will not contain a column of `something`.
* Case 2) `spark.read.parquet("/path/something=true/a.parquet")`: the base path will be
* still `/path/something=true/`, and the returned DataFrame will also not contain a column of
* `something`.
* Case 3) `spark.read.parquet("/path/")`: the base path will be `/path/`, and the returned
* DataFrame will have the column of `something`.
*
 * Users can also override the base path by setting `basePath` in the options passed to the
 * data source.
* For example, `spark.read.option("basePath", "/path/").parquet("/path/something=true/")`,
* and the returned DataFrame will have the column of `something`.
*/
private def basePaths: Set[Path] = {
parameters.get("basePath").map(new Path(_)) match {
case Some(userDefinedBasePath) =>
val fs = userDefinedBasePath.getFileSystem(hadoopConf)
if (!fs.isDirectory(userDefinedBasePath)) {
throw new IllegalArgumentException("Option 'basePath' must be a directory")
}
Set(fs.makeQualified(userDefinedBasePath))
case None =>
        paths.map { path =>
          // Make the path qualified (consistent with listLeafFiles and listLeafFilesInParallel).
          val qualifiedPath = path.getFileSystem(hadoopConf).makeQualified(path)
          if (leafFiles.contains(qualifiedPath)) qualifiedPath.getParent else qualifiedPath
        }.toSet
}
}
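
  // A path counts as data unless it is hidden (name starts with ".") or looks like a
  // metadata file (name starts with "_" and contains no "="). For example, "_SUCCESS" and
  // "_metadata" are skipped, while "part-00000" and partition directories such as "_a=1"
  // are kept.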
private def isDataPath(path: Path): Boolean = {
val name = path.getName
!((name.startsWith("_") && !name.contains("=")) || name.startsWith("."))
}
}
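
// Illustrative sketch (hypothetical minimal subclass, for illustration only): a concrete
// catalog supplies the two abstract members by listing the input paths, e.g.
//
//   class SimpleCatalog(
//       sparkSession: SparkSession,
//       override val paths: Seq[Path],
//       parameters: Map[String, String],
//       schema: Option[StructType])
//     extends PartitioningAwareFileCatalog(sparkSession, parameters, schema) {
//
//     // Hypothetical caches, filled by scanning `paths` and refreshed on refresh().
//     @volatile private var cachedLeafFiles: mutable.LinkedHashMap[Path, FileStatus] = _
//     @volatile private var cachedLeafDirToChildrenFiles: Map[Path, Array[FileStatus]] = _
//
//     override protected def leafFiles = cachedLeafFiles
//     override protected def leafDirToChildrenFiles = cachedLeafDirToChildrenFiles
//   }
//
// In Spark itself, ListingFileCatalog plays this role.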