// com.crealytics.spark.v2.excel.ExcelHeaderChecker.scala Maven / Gradle / Ivy
/** Copyright 2016 - 2021 Martin Mauch (@nightscape)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.crealytics.spark.v2.excel
import org.apache.spark.internal.Logging
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.StructType
/** Checks that the column names in an Excel header and the field names in the
  * schema are the same, taking case sensitivity into account.
  *
  * @param schema  provided (or inferred) schema to which the Excel header must conform.
  * @param options parsed Excel options; only `enforceSchema` is read here.
  * @param source  name of the Excel source that is currently checked. It is appended
  *                to every error message.
  */
class ExcelHeaderChecker(schema: StructType, options: ExcelOptions, source: String) extends Logging {
  import java.util.Locale

  /** When `false`, column names and schema field names are compared ignoring
    * case. The value is read from the active `SQLConf` once, at construction.
    */
  private val caseSensitive = SQLConf.get.caseSensitiveAnalysis

  /** When `true`, a header that does not conform to the schema is only logged
    * as a warning; when `false`, a non-conforming header causes an
    * `IllegalArgumentException` to be thrown.
    */
  private val enforceSchema = options.enforceSchema

  /** Brings a name into the form used for comparison.
    *
    * `Locale.ROOT` keeps case folding independent of the JVM default locale
    * (e.g. under a Turkish locale "ID".toLowerCase would not equal "id").
    * A `null` name is passed through so that it is reported as an ordinary
    * mismatch instead of raising a `NullPointerException`.
    */
  private def normalize(name: String): String =
    if (caseSensitive || name == null) name else name.toLowerCase(Locale.ROOT)

  /** Checks that the column names in an Excel header and the field names in the
    * schema are the same, taking case sensitivity into account.
    *
    * Nothing is checked when `columnNames` is `null`. Otherwise the header must
    * have as many columns as the schema has fields, and each header name must
    * equal the schema field name at the same position. Only the first
    * mismatching position is reported.
    *
    * @param columnNames names of Excel columns that must be checked against
    *                    the schema.
    * @throws IllegalArgumentException if the header does not conform to the
    *                                  schema and `enforceSchema` is `false`.
    */
  def checkHeaderColumnNames(columnNames: Vector[String]): Unit = {
    if (columnNames != null) {
      val fieldNames = schema.map(_.name).toIndexedSeq
      val (headerLen, schemaSize) = (columnNames.size, fieldNames.length)

      val errorMessage: Option[String] =
        if (headerLen != schemaSize) {
          Some(s"""|Number of column in Excel header is not equal to number of fields in the schema:
                   | Header length: $headerLen, schema size: $schemaSize
                   |$source""".stripMargin)
        } else {
          // Locate the first position whose normalized names differ, if any.
          (0 until headerLen)
            .find(i => normalize(columnNames(i)) != normalize(fieldNames(i)))
            .map { i =>
              s"""|Excel header does not conform to the schema.
                  | Header: ${columnNames.mkString(", ")}
                  | Schema: ${fieldNames.mkString(", ")}
                  |Expected: ${fieldNames(i)} but found: ${columnNames(i)}
                  |$source""".stripMargin
            }
        }

      errorMessage.foreach { msg =>
        if (enforceSchema) logWarning(msg)
        else throw new IllegalArgumentException(msg)
      }
    }
  }
}