All Downloads are FREE. Search and download functionality uses the official Maven repository.

epic.features.MorphFeaturizer.scala Maven / Gradle / Ivy

The newest version!
package epic.features

import breeze.linalg.Counter
import epic.framework.Feature
import com.typesafe.scalalogging.slf4j.LazyLogging
import breeze.io.FileStreams
import java.io.File
import java.io.BufferedReader
import scala.io.Source
import epic.parser.morph.MorphFeat
import scala.collection.mutable.HashMap
import scala.collection.mutable.ArrayBuffer
import java.io.InputStreamReader

/**
 * Word featurizer backed by a precomputed table of morphological features.
 *
 * For a given sentence, the table is consulted for that exact word sequence;
 * of the morphological features found for each word, only those whose label
 * is "lem" are turned into indicator features.
 *
 * Instances are built through the companion object's `apply`.
 */
class MorphFeaturizer private (morphLookupTable: MorphFeaturizer.MorphLookupTable) extends WordFeaturizer[String] with Serializable {

  /**
   * Anchors this featurizer to the sentence `w`.
   *
   * If `w` is not a key of the lookup table, a message is printed to standard
   * output and every word gets an empty feature array.
   */
  def anchor(w: IndexedSeq[String]): WordFeatureAnchoring[String] = new WordFeatureAnchoring[String] {
    // Per-word morphological features, or one empty array per word when the sentence is unknown.
    val morphFeats = morphLookupTable.get(w) match {
      case Some(known) =>
        known
      case None =>
        println("Sentence wasn't found in lookup table: " + w)
        w.indices.map(_ => Array[MorphFeat]())
    }

    // Per-word indicator features, restricted to morphological features labelled "lem".
    val feats = w.indices.map { idx =>
      morphFeats(idx).collect { case mf if mf.label == "lem" => IndicatorFeature(mf): Feature }
    }

    /** Features for the word at `pos`; any position outside the sentence yields `BeginSentFeature`. */
    def featuresForWord(pos: Int): Array[Feature] = {
      val inSentence = 0 <= pos && pos < w.length
      if (inSentence) feats(pos) else Array[Feature](BeginSentFeature)
    }

    def words: IndexedSeq[String] = w
  }
}

object MorphFeaturizer {

  /**
   * Maps a sentence (its sequence of words) to one array of morphological
   * features per word, in word order.
   */
  type MorphLookupTable = HashMap[IndexedSeq[String],IndexedSeq[Array[MorphFeat]]]

  /**
   * Reads a file of tagged sentences into a lookup table.
   *
   * The file is decoded as UTF-8. Blank lines separate sentences. Every other
   * line is split on whitespace: field 0 is the word and field 2 is handed to
   * `MorphFeat.readMorphFeatsFromBit`; field 1 is not used here. A line that
   * does not have exactly three fields triggers a warning on standard output;
   * if field 2 is missing altogether the word is kept with no features, so the
   * rest of the sentence stays usable.
   *
   * @param pathToTaggedSentences path of the file to read
   * @return a table with one entry per distinct sentence in the file
   */
  def makeLookupTable(pathToTaggedSentences: String): MorphLookupTable = {
    val lookupTable = new MorphLookupTable
    val in = breeze.io.FileStreams.input(new File(pathToTaggedSentences))
    val br = new BufferedReader(new InputStreamReader(in, "UTF-8"))
    try {
      val thisSent = new ArrayBuffer[String]
      val thisSentFeats = new ArrayBuffer[Array[MorphFeat]]

      // Stores the sentence accumulated so far under an immutable snapshot of
      // its words (map keys must not be mutable), then resets the buffers.
      def storeCurrentSentence(): Unit = {
        lookupTable.put(thisSent.toIndexedSeq, thisSentFeats.toIndexedSeq)
        thisSent.clear()
        thisSentFeats.clear()
      }

      // readLine() returns null at end of stream; ready() only says whether a
      // read would block, so it is not a reliable end-of-input test.
      var line = br.readLine()
      while (line != null) {
        if (line.trim.isEmpty) {
          storeCurrentSentence()
        } else {
          val splitLine = line.split("\\s+")
          if (splitLine.length != 3) {
            println("WARNING: Bad line, expected three whitespace-separated fields but found " +
              splitLine.length + ": " + line)
          }
          thisSent += splitLine(0)
          // Guard against short lines instead of failing with an index error.
          thisSentFeats += (
            if (splitLine.length > 2) MorphFeat.readMorphFeatsFromBit(splitLine(2)).toArray
            else Array[MorphFeat]()
          )
        }
        line = br.readLine()
      }
      // The last sentence may not be followed by a blank line.
      if (thisSent.nonEmpty) {
        storeCurrentSentence()
      }
    } finally {
      br.close()
    }
    println("Loaded " + lookupTable.size + " entries from " + pathToTaggedSentences)
    lookupTable
  }

  /**
   * Builds a featurizer from the union of the lookup tables read from each path.
   *
   * When the same sentence occurs in several files, the entry from the later
   * path wins. An empty list of paths yields a featurizer with an empty table.
   *
   * @param pathsToTaggedSentences files to load, in order
   */
  def apply(pathsToTaggedSentences: Seq[String]): MorphFeaturizer = {
    val lookupTable = new MorphLookupTable
    pathsToTaggedSentences.foreach(path => lookupTable ++= makeLookupTable(path))
    new MorphFeaturizer(lookupTable)
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy