// edu.arizona.sista.struct.Lexicon.scala (Maven / Gradle / Ivy)
package edu.arizona.sista.struct
import java.io.{BufferedReader, BufferedWriter, FileReader, FileWriter, Reader, Writer}

import edu.arizona.sista.struct.Lexicon.logger
import edu.arizona.sista.utils.Files
import org.slf4j.LoggerFactory
import scala.Serializable
/**
 * Generic lexicon: maps objects of type T to Ints, both ways.
 *
 * Every distinct object receives the next free id (0, 1, 2, ...) the first time it is
 * added. `lexicon` maps object -> id and `index` maps id -> object.
 *
 * User: mihais
 * Date: 3/1/13
 *
 * @tparam T type of the stored objects; saveTo only supports Int, Double and String
 */
class Lexicon[T] extends Serializable {
  /** Object -> id */
  private var lexicon = new collection.mutable.HashMap[T, Int]

  /** Id -> object; position i holds the object whose id is i */
  private var index = new collection.mutable.ArrayBuffer[T]()

  /** Counts the number of objects that were saved by not storing strings repeatedly */
  private var savedMemory = 0

  /**
   * Adds a string to the lexicon without adding it twice if it already exists
   * @param s The object to store
   * @return The id of the object: its existing id if already present, a fresh one otherwise
   */
  def add(s:T):Int = lexicon.get(s) match {
    case Some(existing) =>
      savedMemory += 1
      existing
    case None =>
      val fresh = index.size
      lexicon += s -> fresh
      index += s
      fresh
  }

  /**
   * Checks whether an id is present in this lexicon
   * @param i Candidate id
   * @return true iff 0 <= i < number of stored objects
   */
  def exists(i:Int):Boolean = i >= 0 && i < index.size

  /**
   * Fetches the string with the given index from the lexicon
   * This deliberately does NOT check for bounds for fast access.
   * Use exists() if you want to make sure that your index exists.
   * @param i Index of the string in the lexicon
   * @return The string corresponding to the index
   */
  def get(i:Int):T = index(i)

  /**
   * Fetches the id of an object
   * @param w The object to look up
   * @return Some(id) if the object is stored, None otherwise
   */
  def get(w:T):Option[Int] = lexicon.get(w)

  /** Number of distinct objects stored */
  def size: Int = lexicon.size

  /** One "object -> id" line per entry, in the map's (unspecified) iteration order */
  override def toString:String = {
    val os = new StringBuilder
    // iterate over the pairs so the bare id is printed rather than its Option wrapper
    for((w, i) <- lexicon) {
      os.append(s"$w -> $i\n")
    }
    os.toString()
  }

  /** All objects stored in this lexicon */
  def keySet: collection.Set[T] = lexicon.keySet

  /** Logs how many objects are stored and how many repeated add() calls were deduplicated */
  def stats(): Unit = {
    val total = savedMemory + lexicon.size
    // an untouched lexicon would otherwise report NaN%
    val percent = if (total > 0) 100.0 * savedMemory / total else 0.0
    logger.info("Stored objects: " + index.size)
    logger.info(" Saved objects: " + savedMemory + " (" + percent + "%)")
  }

  /**
   * Saves this lexicon to a file, in the format read by Lexicon.loadFrom
   * @param fileName Path of the file to (over)write
   */
  def saveTo(fileName:String): Unit = {
    val w = new BufferedWriter(new FileWriter(fileName))
    // close the file even if serialization fails half way
    try {
      saveTo(w)
    } finally {
      w.close()
    }
  }

  /**
   * Writes this lexicon to a writer: the number of entries, a one-letter type tag
   * (I, D or S), then one "id object" line per entry in id order.
   * The writer is flushed but NOT closed; closing it is up to the caller.
   * @param w Destination writer
   * @throws RuntimeException if the lexicon is not empty and does not store Int, Double or String
   */
  def saveTo(w:Writer): Unit = {
    val p = Files.toPrintWriter(w)
    p.println(index.size)

    // TODO: kinda hacky, but don't know how to recover from type erasure in loadFrom()...
    val typeTag = index.headOption match {
      case None => "S" // this does not matter
      case Some(_: Int) => "I"
      case Some(_: Double) => "D"
      case Some(_: String) => "S"
      case Some(_) => throw new RuntimeException("ERROR: unknown type in lexicon!")
    }
    p.println(typeTag)

    for((item, i) <- index.zipWithIndex) {
      p.println(s"$i $item")
    }

    // Files.toPrintWriter may wrap w in a buffering writer (not visible here), so push everything through
    p.flush()
  }
}
/** Factory methods for Lexicon: copy construction and loading from disk */
object Lexicon {
  val logger = LoggerFactory.getLogger(classOf[Lexicon[String]])

  /**
   * Copy constructor for Lexicon
   * @param other The lexicon to copy
   * @return A new lexicon holding the same objects under the same ids
   */
  def apply[T](other:Lexicon[T]):Lexicon[T] = {
    val lex = new Lexicon[T]
    lex.lexicon ++= other.lexicon
    lex.index ++= other.index
    lex
  }

  /**
   * Loads a lexicon saved by Lexicon.saveTo
   * @param fileName Path of the file to read
   * @return The lexicon stored in the file
   */
  def loadFrom[F](fileName:String):Lexicon[F] = {
    val is = new BufferedReader(new FileReader(fileName))
    // close the file even if parsing fails
    try {
      loadFrom[F](is)
    } finally {
      is.close()
    }
  }

  /**
   * Loads a lexicon from a reader, in the format written by Lexicon.saveTo:
   * the number of entries, a one-letter type tag (S, I or D), then one "id object"
   * line per entry. The reader is NOT closed.
   *
   * F is erased at runtime, so the objects are decoded according to the tag stored
   * in the stream; the caller must ask for the matching F.
   *
   * @param r Source reader
   * @return The lexicon stored in the stream
   * @throws RuntimeException if the stream is empty or truncated, or if the type tag
   *                          is unknown and there is at least one entry
   */
  def loadFrom[F](r:Reader):Lexicon[F] = {
    val reader = Files.toBufferedReader(r)
    val lex = new Lexicon[F]

    val header = reader.readLine()
    if(header == null) throw new RuntimeException("ERROR: empty lexicon file!")
    val size = header.trim.toInt
    assert(size >= 0, s"negative lexicon size: $size")

    // type of features in lexicon; a missing line only matters if there are entries to decode
    val ftype = Option(reader.readLine()).map(_.trim).getOrElse("")

    // Converts the textual form of one entry into F, according to the stored type tag
    def decode(f:String):F = ftype match {
      case "S" => f.asInstanceOf[F]
      case "I" => f.trim.toInt.asInstanceOf[F]
      case "D" => f.trim.toDouble.asInstanceOf[F]
      case _ => throw new RuntimeException("ERROR: unknown type in lexicon!")
    }

    for(i <- 0 until size) {
      val raw = reader.readLine()
      if(raw == null)
        throw new RuntimeException(s"ERROR: lexicon file ends after $i of $size entries!")

      // Only leading whitespace is dropped: everything after the first space belongs to
      // the object, so empty strings and trailing whitespace survive a save/load round trip
      val line = raw.dropWhile(_.isWhitespace)
      val space = line.indexOf(' ')
      assert(space > 0, s"malformed lexicon line: $raw")
      val idx = line.substring(0, space).toInt
      assert(idx == i, s"expected lexicon index $i but found $idx")

      lex.add(decode(line.substring(space + 1)))
    }

    lex
  }
}