All Downloads are FREE. Search and download functionalities are using the official Maven repository.

axle.nlp.language.English.scala Maven / Gradle / Ivy

The newest version!
package axle.nlp.language

import org.tartarus.snowball.ext.englishStemmer

import axle.nlp.Language

/*
import axle.actor.Defaults._
import akka.actor._
import akka.pattern.ask

object StemmerProtocol {

  case class Stem(word: String)
}

class StemmerActor(stemmer: SnowballStemmer) extends Actor {

  import StemmerProtocol._

  def receive = {
    case Stem(word) => {
      stemmer.setCurrent(word)
      stemmer.stem
      sender ! stemmer.getCurrent
    }
  }

}

*/

object English extends Language {

  // From Lucene's list of stopwords:

  def stopWords = Set(
    "a", "an", "and", "are", "as", "at", "be", "but", "by",
    "for", "if", "in", "into", "is", "it",
    "no", "not", "of", "on", "or", "such",
    "that", "the", "their", "then", "there", "these",
    "they", "this", "to", "was", "will", "with"
  )

  def tokenize(s: String): IndexedSeq[String] = s
    .replaceAll("""([\?!()\";\|\[\]\.\,'])""", " ")
    .trim
    .split("\\s+")
    .toIndexedSeq

  val stemmer = new englishStemmer()

  def stem(word: String): String = {
    stemmer.setCurrent(word)
    stemmer.stem
    stemmer.getCurrent
  }

  // import StemmerProtocol._
  //
  // lazy val englishStemmerActor = system.actorOf(Props(new StemmerActor(new englishStemmer())))
  //
  // def stem(word: String): String =
  //   Await.result((englishStemmerActor ? Stem(word)).mapTo[String], 1.second)

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy