All Downloads are FREE. Search and download functionalities are using the official Maven repository.

axle.nlp.Corpus.scala Maven / Gradle / Ivy

The newest version!
package axle.nlp

import scala.collection.GenSeq

import axle.enrichGenSeq
import spire.implicits.LongAlgebra

class Corpus(documents: GenSeq[String], language: Language) {

  lazy val wordCountMap: Map[String, Long] =
    documents.flatMap(doc => language.tokenize(doc.toLowerCase)).tally[Long]

  def wordCount(word: String): Option[Long] = wordCountMap.get(word)

  def topWordCounts(cutoff: Long): List[(String, Long)] =
    wordCountMap
      .filter { _._2 > cutoff }
      .toList
      .sortBy { _._2 }
      .reverse

  def topWords(cutoff: Long): List[String] =
    topWordCounts(cutoff) map { _._1 }

  override def toString: String = {

    val wordCutoff = 20L

    s"""
Corpus of ${documents.length} documents.
There are ${topWords(wordCutoff).length} unique words used more than $wordCutoff time(s).
Top 10 words: ${topWords(wordCutoff).take(10).mkString(", ")}
Top 10 bigrams: ${topBigrams(10).mkString(", ")}
"""
  }

  lazy val bigramCounts = documents.flatMap({ d =>
    bigrams(language.tokenize(d.toLowerCase))
  }).tally[Long]

  def sortedBigramCounts: List[((String, String), Long)] =
    bigramCounts
      .filter { _._2 > 1 }
      .toList
      .sortBy { _._2 }
      .reverse

  def topBigrams(maxBigrams: Int): List[(String, String)] =
    sortedBigramCounts take (maxBigrams) map { _._1 }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy