
// epic.lexicon.SimpleLexicon.scala (Maven / Gradle / Ivy)
package epic.lexicon
import java.io.ObjectStreamException
import breeze.linalg._
import breeze.util.Index
import scala.collection.immutable.BitSet
import scala.collection.mutable
/**
 * A lexicon that maps every word to the set of tag indices it may take, using count
 * thresholds to decide when a rare word should additionally be allowed all open tags.
 *
 * @param labelIndex index used to turn each tag into its integer id
 * @param wordTagCounts (tag -> word -> count)
 * @param openTagThreshold a tag counts as open when it occurs with more than this many
 *                         different word types
 * @param closedWordThreshold a word whose total count is below this value may take any
 *                            open tag in addition to the tags it was observed with
 */
@SerialVersionUID(1L)
class SimpleLexicon[L, W](val labelIndex: Index[L],
                          wordTagCounts: Counter2[L, W, Double],
                          openTagThreshold: Int = 50,
                          closedWordThreshold: Int = 10) extends Lexicon[L, W] with Serializable {

  // Marginal counts: per word (summed over tags) and per tag (summed over words).
  private val wordCounts: Counter[W, Double] = sum(wordTagCounts, Axis._0)
  private val labelCounts: Counter[L, Double] = sum(wordTagCounts, Axis._1)

  // word -> indices of the tags it is recorded with; rare words are widened further down.
  private val byWord: mutable.Map[W, Set[Int]] = {
    val observedTags = wordTagCounts.keySet
      .groupBy { case (_, word) => word }
      .mapValues(pairs => pairs.map { case (tag, _) => labelIndex(tag) }.toSet)
    mutable.Map.empty[W, Set[Int]] ++ observedTags
  }

  // Indices of the open tags. When no tag passes the threshold, every index in
  // labelIndex is treated as open.
  private val openTags: Set[Int] = {
    val frequentTags = labelCounts.keysIterator
      .filter(tag => wordTagCounts(tag, ::).size > openTagThreshold)
      .map(tag => labelIndex(tag))
      .toSet
    if (frequentTags.nonEmpty) frequentTags
    else BitSet.empty ++ (0 until labelIndex.size)
  }

  // Rare words keep their observed tags and additionally receive every open tag.
  for {
    (word, count) <- wordCounts.iterator
    if count < closedWordThreshold
  } {
    byWord(word) = byWord.get(word).fold(openTags)(_ ++ openTags)
  }

  /** Tag indices permitted for `w`; a word with no entry falls back to the open tags. */
  def allowedTags(w: W): Set[Int] = byWord.getOrElse(w, openTags)

  /**
   * Builds the per-position tag constraints for a sentence.
   *
   * @param w the words of the sentence
   * @return an anchoring whose allowed tags at each position are looked up once, up front
   */
  def anchor(w: IndexedSeq[W]): Anchoring = new Anchoring {
    // Allowed tag set for each position, computed eagerly when the anchoring is created.
    private val tagsAt: Array[Set[Int]] = w.map(word => byWord.getOrElse(word, openTags)).toArray

    def length: Int = w.length

    def allowedTags(pos: Int): Set[Int] = tagsAt(pos)
  }

  /** Serialization hook: writes a proxy holding only the constructor arguments. */
  @throws(classOf[ObjectStreamException])
  private def writeReplace(): Object =
    new SimpleLexicon.SerializedForm(labelIndex, wordTagCounts, openTagThreshold, closedWordThreshold)

  /**
   * A lexicon over the same counts whose closed-word threshold is raised to 1000000, so
   * every word with a total count below that is also allowed all open tags.
   */
  override def morePermissive: Lexicon[L, W] =
    new SimpleLexicon(labelIndex, wordTagCounts, openTagThreshold, closedWordThreshold = 1000000)
}
object SimpleLexicon {

  /**
   * Serialization proxy for [[SimpleLexicon]]. It carries only the constructor arguments;
   * the lexicon itself is rebuilt from them when the proxy is deserialized.
   *
   * @param labelIndex index used to turn each tag into its integer id
   * @param wordTagCounts (tag -> word -> count)
   * @param openTagThreshold open-tag threshold passed back to the rebuilt lexicon
   * @param closedWordThreshold closed-word threshold passed back to the rebuilt lexicon
   */
  @SerialVersionUID(1L)
  private class SerializedForm[L, W](labelIndex: Index[L],
                                     wordTagCounts: Counter2[L, W, Double],
                                     openTagThreshold: Int,
                                     closedWordThreshold: Int) extends Serializable {

    /**
     * Deserialization hook: replaces this proxy with a freshly constructed lexicon.
     * Any failure is printed to stderr and then rethrown unchanged.
     */
    @throws(classOf[ObjectStreamException])
    private def readResolve(): Object = {
      try {
        // NOTE(review): presumably forces this Breeze class to be loaded before the
        // lexicon is rebuilt -- confirm why this is needed.
        Class.forName("breeze.linalg.Counter$Impl")
        new SimpleLexicon(labelIndex, wordTagCounts, openTagThreshold, closedWordThreshold)
      } catch {
        // Deliberately Throwable rather than NonFatal: the handler only logs and
        // rethrows, and linkage errors from the class lookup should be reported too.
        case ex: Throwable =>
          ex.printStackTrace()
          throw ex
      }
    }
  }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy