All Downloads are FREE. Search and download functionalities are using the official Maven repository.

fi.pelam.csv.table.TableReader.scala Maven / Gradle / Ivy

/*
 * This file is part of pelam-scala-csv
 *
 * Copyright © Peter Lamberg 2015 ([email protected])
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package fi.pelam.csv.table

import java.io.BufferedReader

import fi.pelam.csv.cell._
import fi.pelam.csv.stream.CsvReader
import fi.pelam.csv.util.{Pipeline, SortedBiMap}

// @formatter:off IntelliJ 14.1 (Scala plugin) formatter messes up Scaladoc
/**
 * This class is part of the the higher level API for reading,
 * writing and processing CSV data.
 *
 * [[fi.pelam.csv.stream.CsvReader The simpler stream based API]] is enough for
 * many scenarios, but if several different sets of data will be pulled from
 * the same CSV file and the structure of the CSV file is not rigid, this API may
 * be a better fit.
 *
 * The result of calling `read` method on this class will be an instance of [[Table]] class.
 * The table is an immutable data structure for holding and processing data
 * in a spreadsheet program like format.
 *
 * [[TableWriter]] is the counterpart of this class for writing [[Table]] out to disk for example.
 *
 * == Code Example ==
 *
 * This example parses a small bit of CSV data in which column types are defined
 * on the first row.
 *
 * {{{
    import fi.pelam.csv.table._
    import fi.pelam.csv.cell._
    import TableReaderConfig._

    // Create a TableReader that parses a small bit of CSV data in which the
    // column types are defined on the first row.
    val reader = new TableReader[String, String, SimpleMetadata](

      // An implicit from the object TableReaderConfig converts the string
      // to a function providing streams.
      openStream =
        "product,price,number\n" +
        "apple,0.99,3\n" +
        "orange,1.25,2\n" +
        "banana,0.80,4\n",

      // The first row is the header, the rest are data.
      rowTyper = makeRowTyper({
        case (CellKey(0, _), _) => "header"
        case _ => "data"
      }),

      // First row defines column types.
      colTyper = makeColTyper({
        case (CellKey(0, _), colType) => colType
      }),

      // Convert cells on the "data" rows in the "number" column to integer cells.
      // Convert cells on the "data" rows in the "price" column to decimal cells.
      cellUpgrader = makeCellUpgrader({
        case CellType("data", "number") => IntegerCell.defaultParser
        case CellType("data", "price") => DoubleCell.defaultParser
      }))

    // Get values from cells in column with type "product" on rows with type "data."
    table.getSingleCol("data", "product").map(_.value).toList
    // Will give List("apple", "orange", "banana")

    // Get values from cells in column with type "price" on rows with type "data."
    table.getSingleCol("data", "price").map(_.value).toList)
    // Will give List(0.99, 1.25, 0.8)
 * }}}
 *
 * == CSV format detection heuristics ==
 * One simple detection heuristic is implemented in [[DetectingTableReader]]
 *
 * Since deducing whether correct parameters like character set were used in reading a CSV
 * file without any extra knowledge is impossible, this class supports implementing a custom
 * format detection algorithm by client code.
 *
 * The table reading is split to stages to allow implementing format detection heuristics that
 * lock some variables during the earlier stages and then proceeding to later stages. Unfortunately
 * there is currently no example or implementation of this idea.
 *
 * Locking some variables and then proceeding results in more efficient algorithm
 * than exhaustive search of the full set of combinations (character set, locale, separator etc).
 *
 * The actual detection heuristic is handled outside this class. The idea is that the
 * detection heuristic class uses this repeatedly with varying parameters until some criterion
 * is met. The criterion for ending detection could be that zero errors is detected.
 * If no combination of parameters gives zero errors, then the heuristic could just pick
 * the solution which gave errors in the latest stage and then the fewest errors.
 *
 * == Stages ==
 * @note This is about the internal structure of `TableReader` processing.
 *
 * The table reading is split into four stages.
 *
 * The table reading process may fail and terminate at each phase. Then an incomplete Table object
 * will be returned together with the errors detected so far.
 *
 * The table reading is split to stages to allow implementing format detection heuristics
 * in a structured manner.
 *
 *  - `csvReadingStage` Parse CSV byte data to cells. Depends on `charset` and `separator` provided
 * via the `metadata` parameter.
 *
 *  - `rowTypeDetectionStage` Detect row types (hard coded or based on cell contents). The `rowTyper` parameter
 * is used in this stage.
 *
 *  - `colTypeDetectionStage` Detect column types (hard coded or based on row types and cell contents). The `colTyper` parameter
 * is used in this stage.
 *
 *  - `cellUpgradeStage` Upgrade cells based on cell types, which are combined from row and column types. The `cellUpgrader`
 * parameter is used in this stage.
 *
 * @param openStream A function that returns input stream from which the data should be read.
 *                        The function is called once for every `read` call.
 *
 * @param tableMetadata A custom metadata object which will be passed to the resulting `Table` object.
 *
 * @param rowTyper Partial function used in `rowTypeDetectionStage`
 *
 * @param colTyper Partial function used in `colTypeDetectionStage`
 *
 * @param cellUpgrader Partial function used in `cellUpgradeStage`
 *
 * @tparam RT The client specific row type.
 *
 * @tparam CT The client specific column type.
 *
 * @tparam M The type of the `metadata` parameter. Must be a sub type of [[TableMetadata]].
 *           This specifies the character set and separator to use when reading the CSV data from the input stream.
 */
// @formatter:on IntelliJ 14.1 (Scala plugin) formatter messes up Scaladoc
// TODO: pelam-scala-csv TableReader API could support reparsing table, but without going through serialization to CSV
class TableReader[RT, CT, M <: TableMetadata](
  val openStream: TableReader.StreamOpener,
  val tableMetadata: M = SimpleMetadata(),
  val rowTyper: TableReader.RowTyper[RT] = PartialFunction.empty,
  val colTyper: TableReader.ColTyper[RT, CT] = PartialFunction.empty,
  val cellUpgrader: TableReader.CellUpgrader[RT, CT] = PartialFunction.empty
  ) {

  /**
   * The [[Table]] type returned by Read
   */
  type ResultTable = Table[RT, CT, M]

  /**
   * Internal type for state passed through the [[.pipeline]].
   */
  private type State = TableReadingState[RT, CT]

  /**
   * The main method in this class. Can be called several times.
   * The input stream is opened and closed once per each call.
   *
   * If there are no errors [[TableReadingErrors.noErrors]] is `true`.
   *
   * @return a pair with a [[.TableReader]] and [[TableReadingErrors]].
   */
  def read(): (ResultTable, TableReadingErrors) = {

    val initial = TableReadingState[RT, CT]()

    val result = pipeline.run(initial)

    (Table(result.cells, result.rowTypes, result.colTypes, tableMetadata), result.errors)
  }

  /**
   * This method extends the basic `read` method with exception based error handling,
   * which may be useful in smaller applications that don't expect or handle
   * errors in input.
   *
   * A `RuntimeException` will be thrown when error is encountered.
   */
  def readOrThrow(): ResultTable = {
    val (table, errors) = read()
    if (errors.noErrors) {
      table
    } else {
      sys.error(errors.toString)
    }
  }

  override def toString() = s"TablerReader(tableMetadata = $tableMetadata)"

  // Chain stages together into one pipeline and setup error handling
  // using the Pipeline abstraction.
  private[csv] def pipeline = {
    for (_ <- Pipeline.Stage(csvReadingStage);
         _ <- Pipeline.Stage(rowTypeDetectionStage);
         _ <- Pipeline.Stage(colTypeDetectionStage);
         x <- Pipeline.Stage(cellUpgradeStage)) yield x
  }

  private[csv] def csvReadingStage(input: State): State = {
    val inputStream = openStream()

    try {

      val inputReader: java.io.Reader = new BufferedReader(new java.io.InputStreamReader(inputStream, tableMetadata.charset), 1024)

      val csvReader = new CsvReader(inputReader, separator = tableMetadata.separator)

      val (lefts, rights) = csvReader.partition(_.isLeft)

      val errors = lefts.map(either => TableReadingError(either.left.get))
      val cells = rights.map(either => either.right.get)

      // Package StringCells and possible errors to output state
      input.copy(errors = input.errors.addError(errors), cells = cells.toIndexedSeq)

    } finally {
      inputStream.close()
    }
  }

  private[csv] def rowTypeDetectionStage(initialInput: State): State = {

    // Accumulate state with row types in the map
    initialInput.cells.foldLeft(initialInput) { (input, cell) =>

      if (rowTyper.isDefinedAt(cell) && !input.rowTypes.contains(cell.rowKey)) {
        rowTyper(cell) match {
          case Left(error) => input.addError(error.relatedCellAdded(cell))
          case Right(rowType) => input.defineRowType(cell.rowKey, rowType)
        }
      } else {
        input
      }

    }
  }

  private[csv] def colTypeDetectionStage(initialInput: State): State = {

    // Accumulate state with column types in the map
    initialInput.cells.foldLeft(initialInput) { (input, cell) =>

      if (colTyper.isDefinedAt(cell, initialInput.rowTypes) && !input.colTypes.contains(cell.colKey)) {
        colTyper(cell, initialInput.rowTypes) match {
          case Left(error) => input.addError(error.relatedCellAdded(cell))
          case Right(colType) => input.defineColType(cell.colKey, colType)
        }
      } else {
        input
      }

    }
  }

  private[csv] def cellUpgradeStage(input: State): State = {
    val upgrader = cellUpgrader.lift

    // Map each cell to same cell, upgraded cell or possible error
    val upgrades = for (cell <- input.cells) yield {

      val upgrade = for (rowType <- input.rowTypes.get(cell.rowKey);
                         colType <- input.colTypes.get(cell.colKey);
                         cellType = CellType[RT, CT](rowType, colType);
                         upgradeResult <- upgrader(cell, cellType)) yield upgradeResult

      upgrade match {
        case Some(Left(tableReadingError)) => Left(tableReadingError.relatedCellAdded(cell))
        case Some(Right(upgradedCell)) => Right(upgradedCell)
        case None => Right(cell)
      }
    }

    val (lefts, rights) = upgrades.partition(_.isLeft)

    val errors = lefts.map(either => either.left.get)
    val upgradedCells = rights.map(either => either.right.get)

    // Package in upgraded cells and possible errors
    input.copy(errors = input.errors.addError(errors), cells = upgradedCells.toIndexedSeq)
  }

}

/**
 * Contains type definitions for various types used in constructing a `TableReader`
 * instance.
 */
object TableReader {

  type StreamOpener = () => java.io.InputStream

  type RowTyperResult[RT] = Either[TableReadingError, RT]

  type RowTyper[RT] = PartialFunction[(Cell), RowTyperResult[RT]]

  type ColTyperResult[CT] = Either[TableReadingError, CT]

  type RowTypes[RT] = SortedBiMap[RowKey, RT]

  type ColTyper[RT, CT] = PartialFunction[(Cell, RowTypes[RT]), ColTyperResult[CT]]

  type ColTypes[CT] = SortedBiMap[ColKey, CT]

  type CellUpgraderResult = Either[TableReadingError, Cell]

  /**
   * A partial function that can inspect cells and their assigned cell types and
   * optionally return a modified (upgraded) cell or [[TableReadingError]]error.
   *
   * Idea is to allow specifying more specialized types like [[fi.pelam.csv.cell.IntegerCell the IntegerCell]]
   * for some cell types.
   *
   * There are two major benefits of more specific cell types. One is that contents of cells get validated
   * better. The other is that getting data from the resulting [[Table]] object later in the client
   * code will be simpler as working with strings can be avoided.
   *
   * @tparam RT client specific row type
   * @tparam CT client specific column type
   */
  type CellUpgrader[RT, CT] = PartialFunction[(Cell, CellType[RT, CT]), CellUpgraderResult]
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy