All Downloads are FREE. Search and download functionalities are using the official Maven repository.

breeze.text.transform.RemoveRareWords.scala Maven / Gradle / Ivy

The newest version!
package breeze.text.transform

import breeze.data._
import breeze.linalg.Counter

/**
 * Filter that removes rare word that occur in fewer than threshold documents
 * Syntax: new RemoveRareWords(10) apply (data)
 *
 * @author dlwh
 */
class RemoveRareWords(threshold: Int = 10) {
  def apply[T, Obs <: Observation[Seq[T]]](data: Seq[Obs]) = {
    val c = Counter[T, Int]()
    for {
      d <- data
      w <- d.features.toSet[T]
    } {
      c(w) += 1
    }

    for (d <- data)
    yield for (seq <- d)
    yield for (w <- seq if c(w) >= threshold) yield w
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy