All Downloads are FREE. Search and download functionalities are using the official Maven repository.

scalaz.stream.text.scala Maven / Gradle / Ivy

The newest version!
package scalaz.stream

import java.nio.charset.Charset
import java.util.regex.Pattern
import scalaz.std.string._
import scodec.bits.ByteVector

import process1._

/**
 * Module for text related processes.
 *
 * Provides UTF-8 decoding and encoding between `ByteVector` and `String`, and
 * line-based repartitioning of `String` inputs.
 */
object text {
  private val utf8Charset = Charset.forName("UTF-8")

  /**
   * Matches one line ending, either `\r\n` or `\n`. Compiled once and shared
   * by every call of `lines`; `Pattern` instances are immutable.
   */
  private val lineEnding = Pattern.compile("\r\n|\n")

  /**
   * Converts UTF-8 encoded `ByteVector` inputs to `String`.
   *
   * Every input is split in front of a trailing multi-byte sequence that is
   * not complete yet, so that only bytes ending on a character boundary are
   * decoded. The two halves are handed to `repartition2`, which is expected to
   * carry the incomplete tail over to the next input (see `process1`).
   */
  val utf8Decode: Process1[ByteVector,String] = {
    /**
     * Returns the number of continuation bytes if `b` is an ASCII byte or a
     * leading byte of a multi-byte sequence, and -1 otherwise.
     */
    def continuationBytes(b: Byte): Int = {
      if      ((b & 0x80) == 0x00) 0 // ASCII byte
      else if ((b & 0xE0) == 0xC0) 1 // leading byte of a 2 byte seq
      else if ((b & 0xF0) == 0xE0) 2 // leading byte of a 3 byte seq
      else if ((b & 0xF8) == 0xF0) 3 // leading byte of a 4 byte seq
      else                        -1 // continuation byte or garbage
    }

    /**
     * Returns the length of an incomplete multi-byte sequence at the end of
     * `bs`. If `bs` ends with an ASCII byte or a complete multi-byte sequence,
     * 0 is returned. 0 is also returned when none of the inspected bytes is an
     * ASCII or leading byte (which includes the empty vector).
     */
    def lastIncompleteBytes(bs: ByteVector): Int = {
      // A leading byte announces at most 3 continuation bytes, so an incomplete
      // sequence can only occupy the last 3 bytes. They are visited from the
      // end; the index `i` is the number of bytes following the visited byte.
      val lastThree = bs.toIndexedSeq.reverseIterator.take(3)
      lastThree.map(continuationBytes).zipWithIndex.collectFirst {
        // The first ASCII/leading byte seen from the end starts the last
        // sequence: it is complete only if exactly `c` bytes follow it.
        case (c, i) if c >= 0 => if (c == i) 0 else i + 1
      }.getOrElse(0)
    }

    /**
     * Splits `bs` into the bytes that end on a character boundary (first
     * element) and a trailing incomplete multi-byte sequence (second element).
     * An element is `None` when the corresponding part would be empty, except
     * that an empty `bs` is returned as `(Some(bs), None)`.
     */
    def splitAtLastIncompleteChar(bs: ByteVector): (Option[ByteVector], Option[ByteVector]) = {
      val incomplete = lastIncompleteBytes(bs)

      if (incomplete == 0)
        (Some(bs), None) // nothing to hold back; also covers the empty vector
      else if (incomplete == bs.length)
        (None, Some(bs)) // the whole input is one unfinished sequence
      else {
        val (complete, rest) = bs.splitAt(bs.length - incomplete)
        (Some(complete), Some(rest))
      }
    }

    repartition2(splitAtLastIncompleteChar)
      .map(bs => new String(bs.toArray, utf8Charset))
  }

  /** Converts `String` inputs to UTF-8 encoded `ByteVector`. */
  val utf8Encode: Process1[String,ByteVector] =
    lift(s => ByteVector.view(s.getBytes(utf8Charset)))

  /**
   * Exception that is thrown by [[lines]] if it encounters an input containing
   * a line whose length exceeds its `maxLineLength` argument.
   *
   * @param maxLength the maximum line length that was exceeded
   * @param input     the input `String` that caused `lines` to fail
   */
  case class LengthExceeded(maxLength: Int, input: String) extends Exception {
    /** Describes the failure, quoting at most the first 10 characters of `input`. */
    override def getMessage: String = {
      val n = 10
      val shortened = if (input.length <= n) input else input.take(n) + " ..."

      s"Input '$shortened' exceeded maximum length: $maxLength"
    }
  }

  /**
   * Repartitions `String` inputs by line endings (`\r\n` or `\n`). If
   * `maxLineLength` is greater than zero and a line exceeding `maxLineLength`
   * is found, the `Process` fails with a `LengthExceeded` exception. If
   * `maxLineLength` is zero or negative, no length check is performed. The
   * default `maxLineLength` is `1024 * 1024` (1048576).
   *
   * @example {{{
   * scala> Process("Hel", "lo\nWo", "rld!\n").pipe(text.lines()).toList
   * res0: List[String] = List(Hello, World!)
   * }}}
   *
   * @param maxLineLength maximum allowed length (`String.length`) of a line;
   *                      values less than or equal to zero disable the check
   */
  def lines(maxLineLength: Int = 1024 * 1024): Process1[String, String] = {
    // The limit -1 keeps trailing empty strings, so an input that ends with a
    // line ending produces a final empty chunk instead of silently losing it.
    def splitLines(s: String): IndexedSeq[String] = lineEnding.split(s, -1)

    val repartitionProcess =
      if (maxLineLength > 0)
        repartition { (s: String) =>
          val chunks = splitLines(s)
          // Every chunk is checked, including the last, possibly unfinished one.
          if (chunks.forall(_.length <= maxLineLength)) chunks
          else throw LengthExceeded(maxLineLength, s)
        }
      else repartition(splitLines)
    // Presumably discards the final element when it is the empty string that
    // follows a terminating line ending (see `dropLastIf` in `process1`).
    repartitionProcess.dropLastIf(_.isEmpty)
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy