com.crealytics.spark.v2.excel.ExcelTable.scala Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of spark-excel-2.12.17-3.1.1_2.12 Show documentation
A Spark plugin for reading and writing Excel files
The newest version!
/*
 * Copyright 2022 Martin Mauch (@nightscape)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.crealytics.spark.v2.excel

import org.apache.hadoop.fs.FileStatus
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.connector.write.LogicalWriteInfo
import org.apache.spark.sql.connector.write.WriteBuilder
import org.apache.spark.sql.execution.datasources.FileFormat
import org.apache.spark.sql.execution.datasources.v2.FileTable
import org.apache.spark.sql.types.DataType
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.util.CaseInsensitiveStringMap
import org.apache.spark.sql.connector.catalog.TableCapability
import org.apache.spark.sql.connector.catalog.TableCapability._
import scala.jdk.CollectionConverters._

/** File-based DataSource V2 table for reading and writing Excel files.
  *
  * @param name
  *   name of this table
  * @param sparkSession
  *   session that supplies the session-local time zone and the Hadoop configuration
  * @param map
  *   options supplied for this table
  * @param paths
  *   paths handed to `FileTable` and to the write builder
  * @param userSpecifiedSchema
  *   schema given by the user, if any; passed through to `FileTable`
  */
case class ExcelTable(
  name: String,
  sparkSession: SparkSession,
  map: CaseInsensitiveStringMap,
  paths: Seq[String],
  userSpecifiedSchema: Option[StructType]
) extends FileTable(sparkSession, map, paths, userSpecifiedSchema) {

  /** Creates the scan builder from the file index, the table schemas and the given scan options. */
  override def newScanBuilder(params: CaseInsensitiveStringMap): ExcelScanBuilder =
    ExcelScanBuilder(sparkSession, fileIndex, schema, dataSchema, params)

  /** Infers the schema from the given files.
    *
    * @param files
    *   files to inspect
    * @return
    *   always a `Some`: the inferred schema, or an empty schema when `files` is empty
    */
  override def inferSchema(files: Seq[FileStatus]): Option[StructType] = {
    val options =
      new ExcelOptions(map.asScala.toMap, sparkSession.sessionState.conf.sessionLocalTimeZone)

    if (files.nonEmpty) Some(infer(sparkSession, files, options)) else Some(StructType(Seq.empty))
  }

  /** Creates the write builder for this table's paths and format. */
  override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder =
    new ExcelWriteBuilder(paths, formatName, supportsDataType, info)

  /** Every data type is accepted. */
  override def supportsDataType(dataType: DataType): Boolean = true

  /** Name of this format. */
  override def formatName: String = "Excel"

  /** There is no V1 fallback for this format.
    *
    * @throws java.lang.UnsupportedOperationException
    *   always
    */
  override def fallbackFileFormat: Class[_ <: FileFormat] =
    throw new UnsupportedOperationException("Excel does not support V1 File Format")

  /** Capabilities advertised by this table: any-schema acceptance, batch read, batch write, overwrite by filter and
    * truncate.
    */
  override def capabilities: java.util.Set[TableCapability] =
    Set(ACCEPT_ANY_SCHEMA, BATCH_READ, BATCH_WRITE, OVERWRITE_BY_FILTER, TRUNCATE).asJava

  /** Performs the actual schema inference.
    *
    * Field names always come from the first row of the first sampled file. With `options.header` that row (plus
    * `options.ignoreAfterHeader` rows after it) is excluded from type inference; without it, the row is kept as data.
    * Rows of the remaining sampled files are appended for type inference only.
    *
    * @param sparkSession
    *   session used to obtain the Hadoop configuration
    * @param inputPaths
    *   candidate files; expected to be non-empty (the caller checks this)
    * @param options
    *   Excel options controlling header handling, sampling and excerpt size
    * @return
    *   the inferred schema, or an empty schema when the first sampled file has no rows
    */
  private def infer(sparkSession: SparkSession, inputPaths: Seq[FileStatus], options: ExcelOptions): StructType = {
    val excelHelper = ExcelHelper(options)
    val conf = sparkSession.sqlContext.sparkContext.hadoopConfiguration

    /* Sampling ratio works on file level (not row level as in CSV); at least one file is always sampled */
    val sampleSize = math.max(1, (inputPaths.size * options.samplingRatio).intValue)
    val sampledUris = inputPaths.take(sampleSize).map(_.getPath.toUri)

    val firstFileRows = excelHelper.getRows(conf, sampledUris.head)

    if (firstFileRows.isEmpty) { /* If the first file is empty, not checking further */
      StructType(Seq.empty)
    } else {
      /* Prepare field names from the first row of the first file */
      val firstRow = firstFileRows.next()
      val colNames = excelHelper.getColumnNames(firstRow)

      /* `++` takes its argument by name and evaluates it lazily. The remaining rows must therefore be referenced
         through an immutable binding: referencing a variable that is reassigned to the concatenation result would
         make the iterator chain onto itself once the first row has been consumed. */
      val leadingRows =
        if (options.header) firstFileRows.drop(options.ignoreAfterHeader) /* header row already consumed */
        else Iterator(firstRow) ++ firstFileRows /* first row was only peeked; hand it back as data */

      /* Other files are also utilized (lazily) for field types; field names are reused from the first file, so
         their header row and the rows ignored after it are skipped */
      val rowsToSkipPerFile = if (options.header) options.ignoreAfterHeader + 1 else 0
      val allRows = sampledUris.tail.foldLeft(leadingRows) { (collected, uri) =>
        collected ++ excelHelper.getRows(conf, uri).drop(rowsToSkipPerFile)
      }

      /* Limit the number of rows used for schema inference */
      val rowsForInference = options.excerptSize match {
        case Some(limit) => allRows.take(limit)
        case None => allRows
      }

      /* Ready to infer schema */
      ExcelInferSchema(options).infer(rowsForInference, colNames)
    }
  }
}