
fi.pelam.csv.table.DetectingTableReader.scala Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of pelam-scala-csv_2.12 Show documentation
Show all versions of pelam-scala-csv_2.12 Show documentation
Scala library for reading and writing CSV data with an optional high level API. Supports structured access to tabular data and a form of CSV format detection.
The newest version!
/*
* This file is part of pelam-scala-csv
*
* Copyright © Peter Lamberg 2015 ([email protected])
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package fi.pelam.csv.table
import java.nio.charset.{Charset, StandardCharsets}
import java.util.Locale
import fi.pelam.csv.CsvConstants
// @formatter:off IntelliJ 14.1 (Scala plugin) formatter messes up Scaladoc
/**
* This CSV format detection heuristic tries to read the input CSV
* with varying parameters until a `Table` is produced with no errors or
* all combinations are exhausted. In the latter case
* the `Table` with least errors is returned.
*
* For simpler usage you can skip `initialMetadata` in the constructor
* by using [[fi.pelam.csv.table.DetectingTableReader.apply the apply method]]
* defined in the companion object.
*
* The interface of this class is similar to the one in [[TableReader]], but this
* class creates multiple `TableReader` instances under the hood.
*
* == Example on detection of CSV format ==
*
* This example detects and parses a weird CSV format in which the separator
* is the one used at least in the Finnish locale, but numeric data is
* formatted in English style. The column types are defined
* on the first row and the row type is defined by the first column.
*
* {{{
import fi.pelam.csv.table._
import fi.pelam.csv.cell._
import TableReaderConfig._ // Provides implicits for things like converting string to a stream provider
val validColTypes = Set("header", "model", "price")
// Setup a DetectingTableReader which will try combinations of CSV formatting types
// to understand the data.
val reader = DetectingTableReader[String, String](
tableReaderMaker = { (metadata) => new TableReader(
// An implicit from the object TableReaderConfig converts the string
// to a function providing streams.
openStream = // Converted by implicit in TableReaderConfig
"header;model;price\n" +
"data;300D;1,234.0\n" +
"data;SLS AMG;234,567.89",
// Make correct metadata end up in the final Table
tableMetadata = metadata,
// First column specifies row types
rowTyper = makeRowTyper({
case (CellKey(_, 0), rowType) => rowType
}),
// Column type is specified by the first row.
// Type names are checked and error is generated for unknown
// column types by errorOnUndefinedCol.
// This strictness is what enables the correct detection of CSV format.
colTyper = errorOnUndefinedCol(makeColTyper({
case (CellKey(0, _), colType) if validColTypes.contains(colType) => colType
})),
cellUpgrader = makeCellUpgrader({
case CellType("data", "price") => DoubleCell.parserForLocale(metadata.dataLocale)
}))
}
)
val table = reader.readOrThrow()
// Get values from cells in column with type "model" on rows with type "data".
table.getSingleCol("data", "model").map(_.value).toList
// Will give List("300D", "SLS AMG")
// Get values from cells in column with type "price" on rows with type "data".
table.getSingleCol("data", "price").map(_.value).toList
// Will give List(1234, 234567.89)
}}}
*
* @param initialMetadata base metadata instance. Copies with different format parameters will be created from this
* using [[LocaleTableMetadata.withFormatParameters]]. The idea is that the client
* can have a custom metadata subclass and use this parameter to set initial values for
* custom fields in the metadata.
*
* @param tableReaderMaker user provided method that constructs a [[TableReader]] using
* locales, separator and charset specified by [[LocaleTableMetadata]] parameter.
*
* @param locales List of locales to try for `cellTypeLocale` and `dataLocale`.
* The default is [[CsvConstants.commonLocales]]. Provide a minimal list here to optimize reading.
*
* @param charsets List of charsets to try. Default is [[CsvConstants.commonCharsets]].
* Provide a minimal list here to optimize reading.
*
* @param separators List of separators to try. Default is [[CsvConstants.commonSeparators]].
* Provide a minimal list here to optimize reading.
*
* @tparam RT The client specific row type.
*
* @tparam CT The client specific column type.
*
* @tparam M The type of the `metadata` parameter. Must be a sub type of [[LocaleTableMetadata]].
* This is used to manage the character set, separator, `cellTypeLocale` and `dataLocale`
* combinations when attempting to read the CSV data from the input stream.
*/
// TODO: Simplest use of this class should just need an input string
// TODO: Add example to docs with simplest possible use
// TODO: Add some tie breaking "most rows times columns heuristic"
// @formatter:on IntelliJ 14.1 (Scala plugin) formatter messes up Scaladoc
final class DetectingTableReader[RT, CT, M <: LocaleTableMetadata[M]](
  val initialMetadata: M, // TODO: Should this be 2nd instead of 1st parameter, TODO: should have default value
  val tableReaderMaker: (M) => TableReader[RT, CT, M], // TODO: Implicit to provide sensible default for this for quick use
  val locales: Seq[Locale] = CsvConstants.commonLocales,
  val charsets: Seq[Charset] = CsvConstants.commonCharsets,
  val separators: Seq[Char] = CsvConstants.commonSeparators) {

  /**
   * The concrete `Table` type produced by this reader.
   */
  type ResultTable = Table[RT, CT, M]

  /**
   * The main method in this class. Can be called again to reread the table.
   * The input stream may be opened and closed many times per call.
   *
   * One [[TableReader]] is built with `tableReaderMaker` for every combination of
   * `separators`, `charsets` and `locales` (the locales are used both for
   * `cellTypeLocale` and for `dataLocale`). The readers are then handed one by one
   * to a `TableReaderEvaluator`, and the result it ends up with is returned.
   *
   * If there are no combinations to try (any of the lists is empty), the evaluator
   * has no result and a reader built from the unmodified `initialMetadata` is used instead.
   *
   * If there are no errors [[TableReadingErrors.noErrors]] is `true`.
   *
   * @return a pair with the resulting [[Table]] and [[TableReadingErrors]].
   */
  def read(): (ResultTable, TableReadingErrors) = {

    // TODO: Is there a functional way to combine TableReaderEvaluator and the generation of CSV format combinations.
    val readers = for {
      // Iterate the caller supplied candidates. Hard-coding the candidates here
      // would make the `separators` and `charsets` constructor parameters dead.
      separator <- separators
      charset <- charsets
      cellTypeLocale <- locales
      dataLocale <- locales
    } yield {
      // http://stackoverflow.com/questions/8801818/polymorphic-updates-in-an-immutable-class-hierarchy
      // This is the cleanest way I can think of now...
      val metadata = initialMetadata.withFormatParameters(separator, charset, cellTypeLocale, dataLocale)

      tableReaderMaker(metadata)
    }

    val initialEvaluator = TableReaderEvaluator[RT, CT, M]()

    val finalEvaluator = readers.foldLeft(initialEvaluator)(_.evaluateReader(_))

    // `result` is an Option; it is empty at least when there was nothing to evaluate.
    finalEvaluator.result.getOrElse(defaultRead())
  }

  /**
   * This method extends the basic `read` method with exception based error handling,
   * which may be useful in smaller applications that don't expect or handle
   * errors in input.
   *
   * A `RuntimeException` (via `sys.error`) carrying `errors.toString` as its message
   * is thrown when the read result contains errors.
   *
   * @return the table, when [[TableReadingErrors.noErrors]] is `true`.
   */
  def readOrThrow(): ResultTable = {
    val (table, errors) = read()
    if (errors.noErrors) {
      table
    } else {
      sys.error(errors.toString)
    }
  }

  // If there are no parameter combinations to evaluate, this is used as the return value.
  // The reader is built from `initialMetadata` as is, without trying any format combinations.
  private def defaultRead(): (ResultTable, TableReadingErrors) = {
    tableReaderMaker(initialMetadata).read()
  }

}
object DetectingTableReader {

  /**
   * Custom constructor for using the concrete class [[LocaleMetadata]]
   * instead of a client defined [[LocaleTableMetadata]] subtype.
   *
   * The created reader gets a fresh `LocaleMetadata()` as its `initialMetadata`;
   * all other arguments are passed through unchanged.
   *
   * @param tableReaderMaker user provided method that constructs a [[TableReader]] using
   *                         locales, separator and charset specified by [[LocaleTableMetadata]] parameter.
   *
   * @param locales List of locales to try. Default is [[CsvConstants.commonLocales]].
   *
   * @param charsets List of charsets to try. Default is [[CsvConstants.commonCharsets]].
   *
   * @param separators List of separators to try. Default is [[CsvConstants.commonSeparators]].
   *
   * @tparam RT The client specific row type.
   *
   * @tparam CT The client specific column type.
   *
   * @return a new [[DetectingTableReader]] whose metadata type is [[LocaleMetadata]].
   */
  def apply[RT, CT](
    tableReaderMaker: (LocaleMetadata) => TableReader[RT, CT, LocaleMetadata],
    locales: Seq[Locale] = CsvConstants.commonLocales,
    charsets: Seq[Charset] = CsvConstants.commonCharsets,
    separators: Seq[Char] = CsvConstants.commonSeparators): DetectingTableReader[RT, CT, LocaleMetadata] = {

    // Result type and type arguments are spelled out so that the public API
    // does not depend on what the compiler happens to infer.
    new DetectingTableReader[RT, CT, LocaleMetadata](
      initialMetadata = LocaleMetadata(),
      tableReaderMaker = tableReaderMaker,
      locales = locales,
      charsets = charsets,
      separators = separators)
  }

}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy