org.apache.spark.sql.execution.datasources.parquet.GeoParquetUtils.scala
Artifact: sedona-sql-3.4_2.12. A cluster computing system for processing large-scale spatial data: SQL API for Spark 3.4.
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql.execution.datasources.parquet

import org.apache.hadoop.fs.{FileStatus, Path}
import org.apache.parquet.hadoop.ParquetFileWriter
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.StructType

import scala.language.existentials
object GeoParquetUtils {

  /**
   * Infers the schema of the given GeoParquet files, honoring Spark's Parquet
   * schema-merging options.
   */
  def inferSchema(
      sparkSession: SparkSession,
      parameters: Map[String, String],
      files: Seq[FileStatus]): Option[StructType] = {
    val parquetOptions = new ParquetOptions(parameters, sparkSession.sessionState.conf)
    val shouldMergeSchemas = parquetOptions.mergeSchema
    val mergeRespectSummaries = sparkSession.sessionState.conf.isParquetSchemaRespectSummaries
    val filesByType = splitFiles(files)
    val filesToTouch =
      if (shouldMergeSchemas) {
        // When summaries are respected, they are assumed to describe the whole dataset,
        // so no individual part-files need to be read; otherwise every data file is merged.
        val needMerged: Seq[FileStatus] =
          if (mergeRespectSummaries) {
            Seq.empty
          } else {
            filesByType.data
          }
        needMerged ++ filesByType.metadata ++ filesByType.commonMetadata
      } else {
        // Tries any "_common_metadata" first. Parquet files written by old versions of
        // Parquet don't have this.
        filesByType.commonMetadata.headOption
          // Falls back to "_metadata".
          .orElse(filesByType.metadata.headOption)
          // If no summary file is found, the dataset is either corrupted, or different
          // part-files contain conflicting user-defined metadata (two or more values are
          // associated with the same key in different files). In either case, we fall back
          // to the first part-file and assume all schemas are consistent.
          .orElse(filesByType.data.headOption)
          .toSeq
      }
    GeoParquetFileFormat.mergeSchemasInParallel(parameters, filesToTouch, sparkSession)
  }
  /** Groups leaf files into data part-files and Parquet summary files. */
  case class FileTypes(
      data: Seq[FileStatus],
      metadata: Seq[FileStatus],
      commonMetadata: Seq[FileStatus])

  private def splitFiles(allFiles: Seq[FileStatus]): FileTypes = {
    val leaves = allFiles.toArray.sortBy(_.getPath.toString)
    FileTypes(
      data = leaves.filterNot(f => isSummaryFile(f.getPath)),
      metadata =
        leaves.filter(_.getPath.getName == ParquetFileWriter.PARQUET_METADATA_FILE),
      commonMetadata =
        leaves.filter(_.getPath.getName == ParquetFileWriter.PARQUET_COMMON_METADATA_FILE))
  }

  private def isSummaryFile(file: Path): Boolean = {
    file.getName == ParquetFileWriter.PARQUET_COMMON_METADATA_FILE ||
      file.getName == ParquetFileWriter.PARQUET_METADATA_FILE
  }
}
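
A minimal usage sketch (not part of the file above): it lists the leaf files of a hypothetical GeoParquet directory with Hadoop's FileSystem API and hands them to GeoParquetUtils.inferSchema. The directory path, app name, and the "mergeSchema" option value are placeholder assumptions, not part of the source.

import org.apache.hadoop.fs.{FileStatus, Path}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.datasources.parquet.GeoParquetUtils

object InferGeoParquetSchemaExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("geoparquet-schema-inference") // hypothetical app name
      .master("local[*]")
      .getOrCreate()

    // Hypothetical dataset location; point this at a real GeoParquet directory.
    val dir = new Path("/tmp/geoparquet-dataset")
    val fs = dir.getFileSystem(spark.sparkContext.hadoopConfiguration)

    // Leaf files: part-files plus any _metadata/_common_metadata summary files.
    val files: Seq[FileStatus] = fs.listStatus(dir).toSeq

    // "mergeSchema" mirrors Spark's standard Parquet option of the same name.
    val inferred = GeoParquetUtils.inferSchema(spark, Map("mergeSchema" -> "true"), files)

    inferred.foreach(schema => println(schema.treeString))
    spark.stop()
  }
}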