// scalastyle:off
/*
* Copyright 2015 Ayasdi Inc
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// scalastyle:on

package com.databricks.spark.csv.readers

import java.io.StringReader

import com.univocity.parsers.csv._

/**
 * Read and parse CSV-like input
 * @param fieldSep the delimiter used to separate fields in a line
 * @param lineSep the delimiter used to separate lines
 * @param quote character used to quote fields
 * @param escape character used to escape the quote character
 * @param commentMarker character used to mark lines to be skipped as comments
 * @param ignoreLeadingSpace ignore white space before a field
 * @param ignoreTrailingSpace ignore white space after a field
 * @param headers headers for the columns
 * @param inputBufSize size of buffer to use for parsing input, tune for performance
 * @param maxCols maximum number of columns allowed, for safety against bad inputs
 * @param maxCharsPerCol maximum number of characters allowed per column, for safety against bad inputs
 */
private[readers] abstract class CsvReader(
    fieldSep: Char = ',',
    lineSep: String = "\n",
    quote: Char = '"',
    escape: Char = '\\',
    commentMarker: Char = '#',
    ignoreLeadingSpace: Boolean = true,
    ignoreTrailingSpace: Boolean = true,
    headers: Seq[String],
    inputBufSize: Int = 128,
    maxCols: Int = 20480,
    maxCharsPerCol: Int = 100000) {

  protected lazy val parser: CsvParser = {
    val settings = new CsvParserSettings()
    val format = settings.getFormat
    format.setDelimiter(fieldSep)
    format.setLineSeparator(lineSep)
    format.setQuote(quote)
    format.setQuoteEscape(escape)
    format.setComment(commentMarker)
    settings.setIgnoreLeadingWhitespaces(ignoreLeadingSpace)
    settings.setIgnoreTrailingWhitespaces(ignoreTrailingSpace)
    settings.setReadInputOnSeparateThread(false)
    settings.setInputBufferSize(inputBufSize)
    settings.setMaxColumns(maxCols)
    settings.setNullValue("")
    settings.setMaxCharsPerColumn(maxCharsPerCol)
    if (headers != null) settings.setHeaders(headers: _*)
    new CsvParser(settings)
  }
}

/**
 * Parser for parsing a line at a time. Not efficient for bulk data.
 * @param fieldSep the delimiter used to separate fields in a line
 * @param lineSep the delimiter used to separate lines
 * @param quote character used to quote fields
 * @param escape character used to escape the quote character
 * @param commentMarker character used to mark lines to be skipped as comments
 * @param ignoreLeadingSpace ignore white space before a field
 * @param ignoreTrailingSpace ignore white space after a field
 * @param inputBufSize size of buffer to use for parsing input, tune for performance
 * @param maxCols maximum number of columns allowed, for safety against bad inputs
 * @param maxCharsPerCol maximum number of characters allowed per column, for safety against bad inputs
 */
private[csv] class LineCsvReader(
    fieldSep: Char = ',',
    lineSep: String = "\n",
    quote: Char = '"',
    escape: Char = '\\',
    commentMarker: Char = '#',
    ignoreLeadingSpace: Boolean = true,
    ignoreTrailingSpace: Boolean = true,
    inputBufSize: Int = 128,
    maxCols: Int = 20480,
    maxCharsPerCol: Int = 100000)
  extends CsvReader(
    fieldSep,
    lineSep,
    quote,
    escape,
    commentMarker,
    ignoreLeadingSpace,
    ignoreTrailingSpace,
    null,
    inputBufSize,
    maxCols,
    maxCharsPerCol) {

  /**
   * Parse a single line.
   * @param line a String with no newline at the end
   * @return array of strings where each string is a field in the CSV record
   */
  def parseLine(line: String): Array[String] = {
    parser.beginParsing(new StringReader(line))
    val parsed = parser.parseNext()
    parser.stopParsing()
    parsed
  }
}
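
/**
 * A minimal usage sketch for [[LineCsvReader]]. Illustrative only: the object name
 * and the sample line below are assumptions, not part of the original library.
 */
private[csv] object LineCsvReaderExample {
  def main(args: Array[String]): Unit = {
    val reader = new LineCsvReader()
    // Quoted fields may contain the field separator.
    val fields = reader.parseLine("a,\"b,c\",d")
    println(fields.mkString(" | ")) // prints: a | b,c | d
  }
}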

/**
 * Parser for parsing lines in bulk. Use this when efficiency is desired.
 * @param iter iterator over lines in the file
 * @param split index of the input split being read, kept for debugging
 * @param fieldSep the delimiter used to separate fields in a line
 * @param lineSep the delimiter used to separate lines
 * @param quote character used to quote fields
 * @param escape character used to escape the quote character
 * @param commentMarker character used to mark lines to be skipped as comments
 * @param ignoreLeadingSpace ignore white space before a field
 * @param ignoreTrailingSpace ignore white space after a field
 * @param headers headers for the columns
 * @param inputBufSize size of buffer to use for parsing input, tune for performance
 * @param maxCols maximum number of columns allowed, for safety against bad inputs
 * @param maxCharsPerCol maximum number of characters allowed per column, for safety against bad inputs
 */
private[csv] class BulkCsvReader(
    iter: Iterator[String],
    split: Int, // for debugging
    fieldSep: Char = ',',
    lineSep: String = "\n",
    quote: Char = '"',
    escape: Char = '\\',
    commentMarker: Char = '#',
    ignoreLeadingSpace: Boolean = true,
    ignoreTrailingSpace: Boolean = true,
    headers: Seq[String],
    inputBufSize: Int = 128,
    maxCols: Int = 20480,
    maxCharsPerCol: Int = 100000)
  extends CsvReader(
    fieldSep,
    lineSep,
    quote,
    escape,
    commentMarker,
    ignoreLeadingSpace,
    ignoreTrailingSpace,
    headers,
    inputBufSize,
    maxCols,
    maxCharsPerCol)
  with Iterator[Array[String]] {

  private val reader = new StringIteratorReader(iter)
  parser.beginParsing(reader)
  private var nextRecord = parser.parseNext()

  /**
   * Get the next parsed line.
   * @return array of strings where each string is a field in the CSV record
   */
  override def next(): Array[String] = {
    val curRecord = nextRecord
    if (curRecord != null) {
      nextRecord = parser.parseNext()
    } else {
      throw new NoSuchElementException("next record is null")
    }
    curRecord
  }

  override def hasNext: Boolean = nextRecord != null
}
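
/**
 * A minimal usage sketch for [[BulkCsvReader]]. Illustrative only: the sample lines,
 * split index, and header names are assumptions, not part of the original library.
 */
private[csv] object BulkCsvReaderExample {
  def main(args: Array[String]): Unit = {
    // Lines as they would arrive from SparkContext.textFile(), i.e. without trailing newlines.
    val lines = Seq("1,foo", "2,bar", "3,baz")
    val rows = new BulkCsvReader(lines.iterator, split = 0, headers = Seq("id", "name"))
    // BulkCsvReader is itself an Iterator[Array[String]], so it can be consumed directly.
    rows.foreach(row => println(row.mkString(" | ")))
    // prints:
    // 1 | foo
    // 2 | bar
    // 3 | baz
  }
}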

/**
 * A Reader that "reads" from a sequence of lines. Spark's textFile method removes the newline
 * at the end of each line; the Univocity parser requires a Reader that provides access to the
 * data to be parsed and needs those newlines to be present.
 * @param iter iterator over RDD[String]
 */
private class StringIteratorReader(val iter: Iterator[String]) extends java.io.Reader {

  private var next: Long = 0 // global index of the next character to be read
  private var length: Long = 0 // length of input so far
  private var start: Long = 0 // global index at which the current string begins
  private var str: String = null // current string from iter

  /**
   * Fetch the next string from iter, if done with the current one.
   * Pretend there is a newline at the end of every string we get from iter.
   */
  private def refill(): Unit = {
    if (length == next) {
      if (iter.hasNext) {
        str = iter.next()
        start = length
        length += (str.length + 1) // allowance for newline removed by SparkContext.textFile()
      } else {
        str = null
      }
    }
  }

  /**
   * Read the next character; if at the end of the current string, pretend there is a newline.
   */
  override def read(): Int = {
    refill()
    if (next >= length) {
      -1
    } else {
      val cur = next - start
      next += 1
      if (cur == str.length) '\n' else str.charAt(cur.toInt)
    }
  }

  /**
   * Read from str into cbuf.
   */
  override def read(cbuf: Array[Char], off: Int, len: Int): Int = {
    refill()
    var n = 0
    if ((off < 0) || (off > cbuf.length) || (len < 0) ||
        ((off + len) > cbuf.length) || ((off + len) < 0)) {
      throw new IndexOutOfBoundsException()
    } else if (len == 0) {
      n = 0
    } else {
      if (next >= length) { // end of input
        n = -1
      } else {
        n = Math.min(length - next, len).toInt // lesser of amount of input available or buf size
        if (n == length - next) {
          // consuming the rest of the current string: copy its characters, then the virtual newline
          str.getChars((next - start).toInt, (next - start + n - 1).toInt, cbuf, off)
          cbuf(off + n - 1) = '\n'
        } else {
          str.getChars((next - start).toInt, (next - start + n).toInt, cbuf, off)
        }
        next += n
        if (n < len) {
          val m = read(cbuf, off + n, len - n) // have more space, fetch more input from iter
          if (m != -1) n += m
        }
      }
    }
    n
  }

  override def skip(ns: Long): Long = {
    throw new IllegalArgumentException("Skip not implemented")
  }

  override def ready: Boolean = {
    refill()
    true
  }

  override def markSupported: Boolean = false

  override def mark(readAheadLimit: Int): Unit = {
    throw new IllegalArgumentException("Mark not implemented")
  }

  override def reset(): Unit = {
    throw new IllegalArgumentException("Mark and hence reset not implemented")
  }

  override def close(): Unit = { }
}
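
/**
 * A minimal sketch of the newline restoration performed by [[StringIteratorReader]].
 * Illustrative only: the object name and sample data are assumptions, not part of
 * the original library.
 */
private object StringIteratorReaderExample {
  def main(args: Array[String]): Unit = {
    val reader = new StringIteratorReader(Iterator("a,b", "c,d"))
    val buf = new Array[Char](16)
    val n = reader.read(buf, 0, buf.length)
    // prints "a,b\nc,d\n": a newline is restored after every input string
    println(new String(buf, 0, n).replace("\n", "\\n"))
  }
}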