scala.com.crealytics.spark.excel.v2.ExcelTable.scala Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of spark-excel_2.12 Show documentation
A Spark plugin for reading and writing Excel files
The newest version!
/*
 * Copyright 2022 Martin Mauch (@nightscape)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.crealytics.spark.excel.v2

import org.apache.hadoop.fs.FileStatus
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.connector.write.Write
import org.apache.spark.sql.connector.write.LogicalWriteInfo
import org.apache.spark.sql.connector.write.WriteBuilder
import org.apache.spark.sql.execution.datasources.FileFormat
import org.apache.spark.sql.execution.datasources.v2.FileTable
import org.apache.spark.sql.types.DataType
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.util.CaseInsensitiveStringMap
import org.apache.spark.sql.connector.catalog.TableCapability
import org.apache.spark.sql.connector.catalog.TableCapability._
import scala.jdk.CollectionConverters._

/** Spark DataSource V2 file table for Excel workbooks.
  *
  * Provides the scan builder, write builder and schema inference that
  * [[FileTable]] requires. There is no V1 file-format fallback.
  *
  * @param name
  *   table name
  * @param sparkSession
  *   session used for configuration lookup and handed to [[FileTable]]
  * @param map
  *   user-supplied options; also the source of the [[ExcelOptions]] used for inference
  * @param paths
  *   input/output locations of this table
  * @param userSpecifiedSchema
  *   optional schema supplied by the caller
  */
case class ExcelTable(
  name: String,
  sparkSession: SparkSession,
  map: CaseInsensitiveStringMap,
  paths: Seq[String],
  userSpecifiedSchema: Option[StructType]
) extends FileTable(sparkSession, map, paths, userSpecifiedSchema) {

  /** Name of this format. */
  override def formatName: String = "Excel"

  /** Every data type is accepted. */
  override def supportsDataType(dataType: DataType): Boolean = true

  /** There is no V1 implementation to fall back to, so this always throws.
    *
    * @throws UnsupportedOperationException
    *   on every call
    */
  override def fallbackFileFormat: Class[_ <: FileFormat] =
    throw new UnsupportedOperationException("Excel does not support V1 File Format")

  /** Capabilities advertised by this table, as a Java set. */
  override def capabilities: java.util.Set[TableCapability] = {
    val supported: Set[TableCapability] =
      Set(ACCEPT_ANY_SCHEMA, BATCH_READ, BATCH_WRITE, OVERWRITE_BY_FILTER, TRUNCATE)
    supported.asJava
  }

  /** Creates the scan builder for reading this table with the given options. */
  override def newScanBuilder(params: CaseInsensitiveStringMap): ExcelScanBuilder = {
    ExcelScanBuilder(sparkSession, fileIndex, schema, dataSchema, params)
  }

  /** Creates a write builder whose `build()` yields an [[ExcelWriteBuilder]] for this table. */
  override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder =
    new WriteBuilder {
      override def build(): Write =
        ExcelWriteBuilder(paths, formatName, supportsDataType, info)
    }

  /** Infers the table schema from the given files.
    *
    * @param files
    *   candidate input files
    * @return
    *   always a `Some`: the inferred schema, or an empty schema when `files` is empty
    */
  override def inferSchema(files: Seq[FileStatus]): Option[StructType] = {
    val rawOptions = map.asScala.toMap
    val timeZoneId = sparkSession.sessionState.conf.sessionLocalTimeZone
    // Options are built before the emptiness check, exactly as before.
    val excelOptions = new ExcelOptions(rawOptions, timeZoneId)

    val inferred =
      if (files.isEmpty) StructType(Seq.empty)
      else infer(sparkSession, files, excelOptions)
    Some(inferred)
  }

  /** Performs the actual schema inference over a sample of the input files.
    *
    * @param sparkSession
    *   session providing the Hadoop configuration
    * @param inputPaths
    *   all candidate input files
    * @param options
    *   Excel options, including the sampling ratio
    * @return
    *   the inferred schema, or an empty schema when the parsed data has no rows
    */
  private def infer(sparkSession: SparkSession, inputPaths: Seq[FileStatus], options: ExcelOptions): StructType = {
    val helper = ExcelHelper(options)
    val hadoopConf = sparkSession.sessionState.newHadoopConf()

    // The sampling ratio applies to the number of files (not rows, as in CSV);
    // at least one file is always sampled.
    val sampleSize = math.max(1, (inputPaths.size * options.samplingRatio).intValue)
    val sampledUris = inputPaths.take(sampleSize).map(_.getPath.toUri)

    val (sheetData, colNames) = helper.parseSheetData(hadoopConf, sampledUris)
    try {
      if (sheetData.rowIterator.nonEmpty) {
        // Rows are available: derive the schema from them.
        ExcelInferSchema(options).infer(sheetData.rowIterator, colNames)
      } else {
        // No rows to look at, so there is nothing to infer from.
        StructType(Seq.empty)
      }
    } finally {
      // Close the parsed sheet data whether or not inference succeeded.
      sheetData.close()
    }
  }
}