All Downloads are FREE. Search and download functionalities are using the official Maven repository.

epic.preprocess.StreamSentenceSegmenter.scala Maven / Gradle / Ivy

The newest version!
package epic.preprocess

import java.io.{Reader, InputStream}
import breeze.util.Iterators
import java.nio.channels.Channels

/**
 * Splits the text of an [[java.io.InputStream]] into sentences incrementally, so the
 * whole stream does not have to be held in memory as a single string.
 *
 * The stream is decoded as UTF-8 and read in chunks. Every chunk is handed to
 * `baseSegmenter`. Because a chunk boundary can fall in the middle of a sentence, the
 * last sentence found in a chunk is held back and prepended to the next chunk instead
 * of being emitted right away.
 *
 * @param baseSegmenter segmenter applied to each piece of text (carry-over + chunk)
 * @author dlwh
 **/
class StreamSentenceSegmenter(val baseSegmenter: SentenceSegmenter) {

  /**
   * Lazily segments `stream` into sentences.
   *
   * The returned iterator pulls from `stream` on demand; this method never closes the
   * stream.
   *
   * @param stream source of UTF-8 encoded text
   * @return the sentences in the order they are found; the held-back tail is emitted
   *         last, and only if it is non-empty
   */
  def sentences(stream: InputStream): Iterator[String] = {
    // addendum holds the trailing sentence of the text segmented so far. It is not
    // emitted yet because the next chunk may continue it.
    var addendum = ""
    val pieces = chunkInput(stream).flatMap { (s: String) =>
      val sentences = baseSegmenter(addendum + s).toIndexedSeq
      // The segmenter may find no sentence at all in a piece; calling `last` would
      // then throw NoSuchElementException. In that case nothing is carried over.
      addendum = sentences.lastOption.getOrElse("")
      // Everything except the held-back tail is complete and can be emitted.
      sentences.dropRight(1)
    }

    // Iterator.++ takes its argument by name, so `addendum` is read only after
    // `pieces` has been exhausted, i.e. once it holds the final tail.
    pieces ++ Iterator(addendum).filter(_.nonEmpty)
  }

  /**
   * Decodes `stream` as UTF-8 and exposes it as an iterator of string chunks.
   *
   * Each chunk is whatever a single `read` call delivered, at most 1024 * 1024 chars.
   * A single buffer is reused across reads; every chunk is copied into a fresh String
   * before the next read overwrites the buffer.
   *
   * @param stream source of UTF-8 encoded text
   * @return the decoded chunks in stream order
   */
  private def chunkInput(stream: InputStream): Iterator[String] = {
    val cin = Channels.newChannel(stream)
    val reader = Channels.newReader(cin, "UTF-8")
    val buffer = new Array[Char](1024 * 1024)
    // Set once end-of-stream has been seen, so the reader is not touched again.
    var done = false
    // NOTE(review): Iterators.fromProducer presumably re-evaluates this block for each
    // element and stops at the first None -- confirm against breeze.
    Iterators.fromProducer {
      if (done) {
        None
      } else {
        val numRead = reader.read(buffer)
        if (numRead == -1) {
          done = true
          None
        } else {
          // Copy only the chars that were actually read, without an intermediate array.
          Some(new String(buffer, 0, numRead))
        }
      }
    }
  }

}

/** Command-line entry point for [[StreamSentenceSegmenter]]. */
object StreamSentenceSegmenter {

  /**
   * Loads a sentence-segmenter model from the file `en-sent-segmenter.model.ser.gz`
   * (path relative to the working directory), segments standard input with it, and
   * prints every sentence on its own line prefixed with `>>> `.
   *
   * @param args command-line arguments; not used
   */
  def main(args: Array[String]): Unit = {
    val seg = MLSentenceSegmenter.loadModel(new java.io.File("en-sent-segmenter.model.ser.gz"))
    val ss = new StreamSentenceSegmenter(seg)
    for (s <- ss.sentences(System.in)) {
      println(">>> " + s)
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy