com.crealytics.spark.excel.DataLocator.scala Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of spark-excel-2.12.17-3.1.1_2.12 Show documentation
Show all versions of spark-excel-2.12.17-3.1.1_2.12 Show documentation
A Spark plugin for reading and writing Excel files
The newest version!
/*
* Copyright 2022 Martin Mauch (@nightscape)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.crealytics.spark.excel
import com.crealytics.spark.excel.Utils.MapIncluding
import org.apache.poi.ss.SpreadsheetVersion
import org.apache.poi.ss.usermodel.{Cell, Sheet, Workbook}
import org.apache.poi.ss.util.{AreaReference, CellReference}
import org.apache.poi.xssf.usermodel.{XSSFTable, XSSFWorkbook}
import spoiwo.model.HasIndex._
import spoiwo.model.{
Cell => WriteCell,
CellDataFormat,
CellRange,
CellStyle,
Row => WriteRow,
Sheet => WriteSheet,
Table,
TableColumn
}
import scala.jdk.CollectionConverters._
import scala.util.Try
trait DataLocator {
def dateFormat: Option[String]
def timestampFormat: Option[String]
val dateFrmt = dateFormat.getOrElse(ExcelFileSaver.DEFAULT_DATE_FORMAT)
val timestampFrmt = timestampFormat.getOrElse(ExcelFileSaver.DEFAULT_TIMESTAMP_FORMAT)
def readFrom(workbook: Workbook): Iterator[Seq[Cell]]
def toSheet(header: Option[Seq[String]], data: Iterator[Seq[Any]], existingWorkbook: Workbook): WriteSheet
}
object DataLocator {
private def parseRangeAddress(address: String): AreaReference =
Try {
val cellRef = new CellReference(address)
new AreaReference(
cellRef,
new CellReference(
cellRef.getSheetName,
SpreadsheetVersion.EXCEL2007.getLastRowIndex,
SpreadsheetVersion.EXCEL2007.getLastColumnIndex,
false,
false
),
SpreadsheetVersion.EXCEL2007
)
}.getOrElse(new AreaReference(address, SpreadsheetVersion.EXCEL2007))
val TableAddress = """(.*)\[(.*)\]""".r
val WithDataAddress = MapIncluding(Seq("dataAddress"), optionally = Seq("dateFormat", "timestampFormat"))
val WithoutDataAddress = MapIncluding(Seq(), optionally = Seq("dateFormat", "timestampFormat"))
def apply(parameters: Map[String, String]): DataLocator =
parameters match {
case WithDataAddress(Seq(TableAddress(_, _)), _) if parameters.contains("maxRowsInMemory") =>
throw new IllegalArgumentException(
s"Reading from a table cannot be combined with maxRowsInMemory, parameters are: $parameters"
)
case WithDataAddress(Seq(TableAddress(tableName, "#All")), Seq(dateFormat, timestampFormat)) =>
new TableDataLocator(tableName, dateFormat, timestampFormat)
case WithDataAddress(Seq(dataAddress), Seq(dateFormat, timestampFormat)) =>
new CellRangeAddressDataLocator(
parseRangeAddress(Option(dataAddress).getOrElse("A1")),
dateFormat,
timestampFormat
)
case WithoutDataAddress(Seq(), Seq(dateFormat, timestampFormat)) =>
new CellRangeAddressDataLocator(parseRangeAddress("A1"), dateFormat, timestampFormat)
}
}
trait AreaDataLocator extends DataLocator {
def columnIndices(workbook: Workbook): Range
def rowIndices(workbook: Workbook): Range
def sheetName(workbook: Workbook): Option[String]
def findSheet(workBook: Workbook, sheetName: Option[String]): Sheet =
sheetName
.map(sn =>
Try(Option(workBook.getSheetAt(sn.toInt))).toOption.flatten
.orElse(Option(workBook.getSheet(sn)))
.getOrElse(throw new IllegalArgumentException(s"Unknown sheet $sn"))
)
.getOrElse(workBook.getSheetAt(0))
def readFromSheet(workbook: Workbook, name: Option[String]): Iterator[Vector[Cell]] = {
val sheet = findSheet(workbook, name)
val rowInd = rowIndices(workbook)
val colInd = columnIndices(workbook)
sheet.iterator.asScala
.filter(r => rowInd.contains(r.getRowNum))
.map(_.cellIterator().asScala.filter(c => colInd.contains(c.getColumnIndex)).toVector)
}
override def toSheet(
header: Option[Seq[String]],
data: Iterator[Seq[Any]],
existingWorkbook: Workbook
): WriteSheet = {
val colInd = columnIndices(existingWorkbook)
val dataRows: List[WriteRow] = (header.iterator ++ data)
.zip(rowIndices(existingWorkbook).iterator)
.map { case (row, rowIdx) =>
WriteRow(
row.zip(colInd).map { case (c, colIdx) =>
toCell(c, dateFrmt, timestampFrmt).withIndex(colIdx)
},
index = rowIdx
)
}
.toList
sheetName(existingWorkbook).foldLeft(WriteSheet(rows = dataRows))(_ withSheetName _)
}
def dateCell(time: Long, format: String): WriteCell = {
WriteCell(new java.util.Date(time), style = CellStyle(dataFormat = CellDataFormat(format)))
}
def toCell(a: Any, dateFormat: String, timestampFormat: String): WriteCell =
a match {
case t: java.sql.Timestamp => dateCell(t.getTime, timestampFormat)
case d: java.sql.Date => dateCell(d.getTime, dateFormat)
case s: String => WriteCell(s)
case f: Float => WriteCell(f.toDouble)
case d: Double => WriteCell(d)
case b: Boolean => WriteCell(b)
case b: Byte => WriteCell(b.toInt)
case s: Short => WriteCell(s.toInt)
case i: Int => WriteCell(i)
case l: Long => WriteCell(l)
case b: BigDecimal => WriteCell(b)
case b: java.math.BigDecimal => WriteCell(BigDecimal(b))
case null => WriteCell.Empty
}
}
class CellRangeAddressDataLocator(
val dataAddress: AreaReference,
val dateFormat: Option[String] = None,
val timestampFormat: Option[String] = None
) extends AreaDataLocator {
private val sheetName = Option(dataAddress.getFirstCell.getSheetName)
def columnIndices(workbook: Workbook): Range =
(dataAddress.getFirstCell.getCol.toInt to dataAddress.getLastCell.getCol.toInt)
def rowIndices(workbook: Workbook): Range =
(dataAddress.getFirstCell.getRow to dataAddress.getLastCell.getRow)
override def readFrom(workbook: Workbook): Iterator[Seq[Cell]] = readFromSheet(workbook, sheetName)
override def sheetName(workbook: Workbook): Option[String] = sheetName
}
class TableDataLocator(
tableName: String,
val dateFormat: Option[String] = None,
val timestampFormat: Option[String] = None
) extends AreaDataLocator {
override def readFrom(workbook: Workbook): Iterator[Seq[Cell]] = {
val xwb = workbook.asInstanceOf[XSSFWorkbook]
readFromSheet(workbook, Some(xwb.getTable(tableName).getSheetName))
}
override def toSheet(
header: Option[Seq[String]],
data: Iterator[Seq[Any]],
existingWorkbook: Workbook
): WriteSheet = {
val sheet = super.toSheet(header, data, existingWorkbook)
val maxRow = sheet.rows.maxIndex
val minRow = sheet.rows.flatMap(_.index).sorted.headOption.getOrElse(0)
val maxCol = sheet.rows.map(_.cells.maxIndex).sorted.lastOption.getOrElse(0)
val minCol = sheet.rows.flatMap(_.cells.flatMap(_.index)).sorted.headOption.getOrElse(0)
val table =
Table(cellRange = CellRange(rowRange = (minRow, maxRow), columnRange = (minCol, maxCol)), name = tableName)
val tableWithPotentialHeader =
header.foldLeft(table)((tbl, hdr) =>
tbl.withColumns(hdr.zipWithIndex.map { case (h, i) => TableColumn(h, i.toLong) }.toList)
)
sheet.withTables(tableWithPotentialHeader)
}
def columnIndices(workbook: Workbook): Range =
findTable(workbook).map(t => t.getStartColIndex to t.getEndColIndex).getOrElse(0 until Int.MaxValue)
override def rowIndices(workbook: Workbook): Range =
findTable(workbook).map(t => t.getStartRowIndex to t.getEndRowIndex).getOrElse(0 until Int.MaxValue)
override def sheetName(workbook: Workbook): Option[String] =
findTable(workbook).map(_.getSheetName).orElse(Some(tableName))
private def findTable(workbook: Workbook): Option[XSSFTable] = {
Option(workbook.asInstanceOf[XSSFWorkbook].getTable(tableName))
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy