All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.tixxit.delimited.parser.DelimitedParserImpl.scala Maven / Gradle / Ivy

package net.tixxit.delimited
package parser

import scala.annotation.tailrec
import scala.collection.mutable.Builder

import java.nio.charset.{ Charset, StandardCharsets }
import java.io.File
import java.io.{ InputStream, FileInputStream }
import java.io.{ Reader, InputStreamReader }

import ParserState._
import Instr._

/**
 * Immutable, incremental parser for delimited text.
 *
 * Every call to `parseChunk` returns a new parser that carries the updated
 * state, together with the rows (or row-level errors) completed by that call.
 *
 * @param strategy       either a concrete `DelimitedFormat`, or a
 *                       `GuessDelimitedFormat` that is applied to buffered input
 *                       to obtain one
 * @param parserState    current parse state, including the buffered input
 * @param fail           failure recorded for the row currently being skipped, if
 *                       any; it is turned into a `DelimitedError` once the parser
 *                       resumes at the next row
 * @param row            number of the row currently being parsed (the companion's
 *                       3-argument `apply` starts it at 1)
 * @param bufferSize     number of buffered characters required before a format is
 *                       guessed, unless the input has already ended
 * @param maxCharsPerRow maximum allowed row length in characters; 0 disables the
 *                       limit
 */
final case class DelimitedParserImpl(
  strategy: DelimitedFormatStrategy,
  parserState: ParserState,
  fail: Option[Fail],
  row: Long,
  bufferSize: Int,
  maxCharsPerRow: Int
) extends DelimitedParser {
  require(maxCharsPerRow >= 0, "max row characters parameter must be non-negative")

  /** `Some(fmt)` when the strategy is already a concrete `DelimitedFormat`, otherwise `None`. */
  def format: Option[DelimitedFormat] = strategy match {
    case (fmt: DelimitedFormat) => Some(fmt)
    case _ => None
  }

  /**
   * Returns the buffered input between the input's mark and its limit, paired
   * with a fresh parser built from the same strategy, buffer size and row limit.
   */
  def reset: (String, DelimitedParserImpl) = {
    val in = parserState.input
    (in.substring(in.mark, in.limit), DelimitedParserImpl(strategy, bufferSize, maxCharsPerRow))
  }

  /**
   * Feeds the next chunk of input to the parser.
   *
   * @param chunk `Some(text)` to append more input, `None` to mark the input as finished
   * @return the parser to use for the next chunk, and the rows / row errors
   *         completed while processing this one, in input order
   */
  def parseChunk(chunk: Option[String]): (DelimitedParserImpl, Vector[Either[DelimitedError, Row]]) = {
    val initState = chunk match {
      case Some(str) => parserState.mapInput(_.append(str))
      case None => parserState.mapInput(_.finished)
    }

    // Drives the state machine with a concrete format until it needs more
    // input or reports that it is done.
    def parseWith(format: DelimitedFormat): (DelimitedParserImpl, Vector[Either[DelimitedError, Row]]) = {
      val maxRowDelimLength: Int = format.rowDelim match {
        case RowDelim(value, None) => value.length
        case RowDelim(value, Some(alt)) => scala.math.max(value.length, alt.length)
      }

      val rowTooLong = s"row exceeded maximum length of $maxCharsPerRow"

      @tailrec
      def loop(s0: ParserState, fail: Option[Fail], row: Long, acc: Vector[Either[DelimitedError, Row]]): (DelimitedParserImpl, Vector[Either[DelimitedError, Row]]) = {
        val (s1, instr) = DelimitedParserImpl.parse(format)(s0)

        instr match {
          case EmitRow(cells) =>
            // The span between row starts includes the row delimiter, so the
            // longest possible delimiter is discounted before comparing.
            if (maxCharsPerRow > 0 && (s1.rowStart - s0.rowStart - maxRowDelimLength) > maxCharsPerRow) {
              val context = DelimitedParserImpl.removeRowDelim(format,
                s1.input.substring(s0.rowStart, s1.rowStart))
              val error = DelimitedError(rowTooLong,
                s0.rowStart, s0.rowStart, context, row, 1)
              loop(s1, fail, row + 1, acc :+ Left(error))
            } else {
              loop(s1, fail, row + 1, acc :+ Right(cells))
            }

          case f @ Fail(_, _) =>
            // Remember the failure; it is reported once the rest of the row
            // has been skipped (see Resume).
            loop(s1, Some(f), row, acc)

          case Resume =>
            fail match {
              case Some(Fail(msg, pos)) =>
                val context = DelimitedParserImpl.removeRowDelim(format, s1.input.substring(s0.rowStart, s1.rowStart))
                val error = DelimitedError(msg, s0.rowStart, pos, context, row, pos - s0.rowStart + 1)
                loop(s1, None, row + 1, acc :+ Left(error))

              case None =>
                loop(s1, None, row, acc)
            }

          case NeedInput =>
            // An unfinished row that is already over the limit is abandoned
            // now instead of being buffered until its delimiter shows up.
            if (maxCharsPerRow > 0 && fail.isEmpty &&
                (s1.input.limit - s1.rowStart - maxRowDelimLength) > maxCharsPerRow) {
              val f = Some(Fail(rowTooLong, s1.rowStart))
              DelimitedParserImpl(format, s1.skipRow, f, row, bufferSize, maxCharsPerRow) -> acc
            } else {
              DelimitedParserImpl(format, s1, fail, row, bufferSize, maxCharsPerRow) -> acc
            }

          case Done =>
            DelimitedParserImpl(format, s1, None, row, bufferSize, maxCharsPerRow) -> acc
        }
      }

      loop(initState, fail, row, Vector.empty)
    }

    strategy match {
      case (guess: GuessDelimitedFormat) =>
        if (initState.input.isLast || initState.input.data.length >= bufferSize) {
          // We want enough data here.
          parseWith(guess(initState.input.data))
        } else {
          // Not enough data to guess a format yet: keep buffering, emit nothing.
          (DelimitedParserImpl(strategy, initState, fail, row, bufferSize, maxCharsPerRow), Vector.empty)
        }
      case (fmt: DelimitedFormat) =>
        parseWith(fmt)
    }
  }
}

object DelimitedParserImpl {

  /**
   * Returns a new DelimitedParserImpl whose state is initially empty.
   *
   * @note If `maxCharsPerRow` is 0, then there is no limit on row size.
   *
   * @param format the format strategy to use for parsing
   * @param bufferSize the minimum size of the buffer to use for format inference
   * @param maxCharsPerRow a hard limit on the allowable size of a row
   */
  def apply(
    format: DelimitedFormatStrategy,
    bufferSize: Int,
    maxCharsPerRow: Int
  ): DelimitedParserImpl = {
    DelimitedParserImpl(format, ParserState.ParseRow(0L, 0L, Input.init("")), None, 1L, bufferSize, maxCharsPerRow)
  }

  /**
   * Parses `row` as a single, complete row of delimited text.
   *
   * @param format the format to parse with
   * @param row    the text of the row; it is treated as the final input
   * @return `Right` with the parsed row when the first emitted row ends exactly
   *         at the end of `row`; otherwise a `Left` describing why the text is
   *         not exactly one row (a second row starts, a parse failure, or no
   *         row at all)
   */
  def parseRow(format: DelimitedFormat, row: String): Either[DelimitedError, Row] = {
    val s0 = ParserState.ParseRow(0L, 0L, Input.last(row))
    DelimitedParserImpl.parse(format)(s0) match {
      case (s, Instr.EmitRow(parsed)) if s.rowStart == s.input.data.length =>
        Right(parsed)
      case (s, Instr.EmitRow(_)) =>
        Left(DelimitedError("unexpected start of new row",
                            0, s.rowStart, row, 1, s.rowStart + 1))
      case (_, Instr.Fail(message, pos)) =>
        Left(DelimitedError(message, 0, pos, row, 1, pos + 1))
      case (_, Instr.NeedInput | Instr.Resume | Instr.Done) =>
        Left(DelimitedError("empty row", 0, 0, row, 1, 1))
    }
  }

  /**
   * Runs one step of the parser state machine.
   *
   * @param format the concrete format (separator, quote, row delimiters, ...)
   * @param state  the state to continue from
   * @return the next state, and an instruction telling the driver what
   *         happened: a row was emitted, a failure occurred, parsing resumed
   *         after a skipped row, more input is needed, or input is done
   */
  def parse(format: DelimitedFormat)(state: ParserState): (ParserState, Instr) = {
    import format._

    val input: Input = state.input
    val buf: InputBuffer = new InputBuffer(state)

    // Flag convention, as used by the callers below: a positive result is the
    // number of characters matched at the current position, 0 means no match,
    // and a negative result means more input is needed to decide.
    def isQuote(): Int = buf.isFlag(quote)
    def isQuoteEscape(): Int = buf.isFlag(quoteEscape)
    def isSeparator(): Int = buf.isFlag(separator)

    val primaryRowDelim: String = rowDelim.value
    val secondaryRowDelim: String = rowDelim.alternate.orNull
    def isRowDelim(): Int = buf.eitherFlag(primaryRowDelim, secondaryRowDelim)
    def isEndOfCell(): Int = {
      val i = isSeparator()
      if (i == 0) isRowDelim() else i
    }

    // Checks for a quote escape immediately followed by a quote; the buffer
    // position is restored before returning.
    def isEscapedQuote(): Int = {
      val e = isQuoteEscape()
      if (e > 0) {
        buf.advance(e)
        val q = isQuote()
        buf.retreat(e)
        if (q > 0) q + e
        else q
      } else {
        e
      }
    }

    // Reads up to (not including) the next separator / row delimiter, or to
    // the end of the file, and appends the text as one cell.
    def unquotedCell(bldr: Builder[String, Row]): ParseResult = {
      val start = buf.getPos()
      @tailrec
      def loop(): ParseResult = {
        val flag = isEndOfCell()
        if (flag > 0 || buf.endOfFile()) {
          val value = input.substring(start, buf.getPos())
          bldr += value
          Success
        } else if (flag == 0) {
          buf.advance(1)
          loop()
        } else {
          NeedInput
        }
      }

      loop()
    }

    // Reads the body of a quoted cell; the opening quote has already been
    // consumed by the caller. The closing quote is consumed here.
    def quotedCell(bldr: Builder[String, Row]): ParseResult = {
      val start = buf.getPos()
      @tailrec
      def loop(): ParseResult = {
        if (buf.endOfInput()) {
          if (buf.endOfFile()) {
            Fail("Unmatched quoted string at end of file", buf.getPos())
          } else {
            NeedInput
          }
        } else {
          val d = if (allowRowDelimInQuotes) 0 else isRowDelim()
          val e = isEscapedQuote()
          val q = isQuote()

          if (d < 0 || e < 0 || q < 0) {
            NeedInput
          } else if (d > 0) {
            Fail("Unmatched quoted string at row delimiter", buf.getPos())
          } else if (e > 0) {
            // The escaped-quote check comes before the plain-quote check so an
            // escaped quote is not taken for the closing quote.
            buf.advance(e)
            loop()
          } else if (q > 0) {
            val unescaped = unescape(input.substring(start, buf.getPos()))
            buf.advance(q)
            bldr += unescaped
            Success
          } else {
            buf.advance(1)
            loop()
          }
        }
      }

      loop()
    }

    def cell(bldr: Builder[String, Row]): ParseResult = {
      val q = isQuote()
      if (q == 0) {
        unquotedCell(bldr)
      } else if (q > 0) {
        buf.advance(q)
        quotedCell(bldr)
      } else {
        NeedInput
      }
    }

    // Advances past the rest of the current row. Returns true once the next
    // row start (or the end of the input) has been reached, false when more
    // input is needed to find it.
    @tailrec
    def skipToNextRow(): Boolean = {
      val d = isRowDelim()
      if (d > 0 || buf.endOfFile()) {
        buf.advance(d)
        true
      } else if (d == 0) {
        buf.advance(1)
        skipToNextRow()
      } else {
        if (input.isLast)
          buf.advance((input.limit - buf.getPos()).toInt)
        input.isLast
      }
    }

    // Parses the remainder of a row after at least one cell has been read:
    // either a separator followed by another cell, or the end of the row.
    @tailrec
    def row(rowStart: Long, cells: Builder[String, Row]): (ParserState, Instr) = {
      val start = buf.getPos()
      def needInput() = (ContinueRow(rowStart, start, cells.result(), input), NeedInput)

      val s = isSeparator()
      if (s == 0) {
        val r = isRowDelim()
        if (r > 0 || buf.endOfFile()) {
          buf.advance(r)
          val emitted = cells.result()
          (ParseRow(buf.getPos(), buf.getPos(), input.marked(buf.getPos()), emitted.size), EmitRow(emitted))
        } else if (r == 0) {
          (SkipRow(rowStart, buf.getPos(), input), Fail("Expected separator, row delimiter, or end of file", buf.getPos()))
        } else {
          needInput()
        }
      } else if (s > 0) {
        buf.advance(s)
        cell(cells) match {
          case Success =>
            row(rowStart, cells)
          case f @ Fail(_, _) =>
            (SkipRow(rowStart, buf.getPos(), input), f)
          case NeedInput =>
            needInput()
        }
      } else {
        needInput()
      }
    }

    state match {
      case ContinueRow(rowStart, _, partial, _, _) =>
        // Re-seed the builder with the cells completed before input ran out.
        row(rowStart, state.newRowBuilder ++= partial.iterator)

      case instr @ ParseRow(rowStart, _, _, sizeHint) =>
        if (buf.endOfFile()) {
          (instr, Done)
        } else {
          val cells = state.newRowBuilder
          cell(cells) match {
            case Success =>
              row(rowStart, cells)
            case f @ Fail(_, _) =>
              (SkipRow(rowStart, buf.getPos(), input, sizeHint), f)
            case NeedInput =>
              (instr, NeedInput)
          }
        }

      case SkipRow(rowStart, _, _, sizeHint) =>
        if (skipToNextRow()) {
          (ParseRow(buf.getPos(), buf.getPos(), input.marked(buf.getPos()), sizeHint), Resume)
        } else {
          (SkipRow(rowStart, buf.getPos(), input, sizeHint), NeedInput)
        }
    }
  }

  /**
   * Strips one trailing row delimiter from `context`, if present. The primary
   * delimiter is tried first, then the alternate; if neither matches, `context`
   * is returned unchanged.
   */
  private def removeRowDelim(format: DelimitedFormat, context: String): String = {
    def dropTail(tail: String): Option[String] =
      if (context.endsWith(tail)) Some(context.dropRight(tail.length))
      else None

    dropTail(format.rowDelim.value).
      orElse(format.rowDelim.alternate.flatMap(dropTail)).
      getOrElse(context)
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy