All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.crealytics.spark.excel.v2.ExcelHeaderChecker.scala Maven / Gradle / Ivy

There is a newer version: 3.5.1_0.20.4
Show newest version
/*
 * Copyright 2022 Martin Mauch (@nightscape)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.crealytics.spark.excel.v2

import org.apache.spark.internal.Logging
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.StructType

/** Verifies that the column names found in an Excel header match the field names of the schema, honouring the
  * session's case-sensitivity setting.
  *
  * @param schema
  *   provided (or inferred) schema to which the Excel header must conform.
  * @param options
  *   parsed Excel options; only `enforceSchema` is read by this class.
  * @param source
  *   name of the Excel source being checked. It is appended to every error message.
  */
class ExcelHeaderChecker(schema: StructType, options: ExcelOptions, source: String) extends Logging {

  /** When `false`, column names and schema field names are compared ignoring case. Read once from the active
    * `SQLConf` at construction time.
    */
  private val caseSensitive = SQLConf.get.caseSensitiveAnalysis

  /** When `true`, a header that does not conform to the schema is only logged as a warning; when `false`, the
    * mismatch is raised as an exception.
    */
  private val enforceSchema = options.enforceSchema

  /** Checks the header column names against the schema field names, position by position.
    *
    * A `null` header is silently accepted (nothing is checked). Otherwise the first problem found (a differing
    * column count, or the first column whose name differs from the schema field at the same index) is reported:
    * as a warning when `enforceSchema` is `true`, as an exception otherwise.
    *
    * @param columnNames
    *   names of Excel columns that must be checked against the schema; may be `null`.
    * @throws IllegalArgumentException
    *   if the header does not conform to the schema and `enforceSchema` is `false`.
    */
  def checkHeaderColumnNames(columnNames: Vector[String]): Unit =
    Option(columnNames).flatMap(findMismatch).foreach { msg =>
      if (enforceSchema) logWarning(msg)
      else throw new IllegalArgumentException(msg)
    }

  /** Returns the message describing the first header/schema discrepancy, or `None` when the header conforms. */
  private def findMismatch(columnNames: Vector[String]): Option[String] = {
    val fieldNames = schema.map(_.name).toIndexedSeq
    val (headerLen, schemaSize) = (columnNames.size, fieldNames.length)

    if (headerLen != schemaSize) {
      Some(s"""|Number of column in Excel header is not equal to number of fields in the schema:
               | Header length: $headerLen, schema size: $schemaSize
               |$source""".stripMargin)
    } else {
      // Lengths are equal here, so zip pairs every schema field with the header cell at the same index.
      fieldNames.zip(columnNames).collectFirst {
        case (expected, found) if !sameName(expected, found) =>
          s"""|Excel header does not conform to the schema.
              | Header: ${columnNames.mkString(", ")}
              | Schema: ${fieldNames.mkString(", ")}
              |Expected: $expected but found: $found
              |$source""".stripMargin
      }
    }
  }

  /** Compares two names according to `caseSensitive`.
    *
    * Case folding uses `Locale.ROOT` so the result does not depend on the JVM default locale (with a Turkish
    * default locale, "ID".toLowerCase yields a dotless i and would no longer equal "id").
    */
  private def sameName(nameInSchema: String, nameInHeader: String): Boolean =
    if (caseSensitive) nameInHeader == nameInSchema
    else nameInHeader.toLowerCase(java.util.Locale.ROOT) == nameInSchema.toLowerCase(java.util.Locale.ROOT)

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy