epic.features.NGramSpanFeaturizer.scala Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of epic_2.10 Show documentation
epic
The newest version!
package epic.features

import epic.framework.Feature
import breeze.linalg.Counter
import breeze.util.Index
import epic.trees.TreeInstance
import scala.collection.mutable.ArrayBuffer
import com.typesafe.scalalogging.slf4j.LazyLogging

/**
 *
 *
 * @author dlwh
 */
@SerialVersionUID(1L)
class NGramSpanFeaturizer(wordCounts: Counter[String,Double],
                          bigramCounts: Counter[(String,String),Double],
                          allSents: Seq[Seq[String]],
                          ngramCountThreshold: Int,
                          maxOrder: Int,
                          useNot: Boolean) extends SurfaceFeaturizer[String] with Serializable {
  private val higherOrderCounts = (3 to maxOrder).map(n => NGramSpanFeaturizer.countNgrams(allSents, n));
  
  private val wordIndex = Index(wordCounts.keysIterator)
  private val bigramIndex = Index(bigramCounts.keysIterator)
  private val higherOrderIndices = (3 to maxOrder).map(n => Index(higherOrderCounts(n-3).keysIterator));
  println(wordIndex.size + " unigrams, " + bigramIndex.size + " bigrams, " + (higherOrderIndices.map(_.size)) + " higher-order n-grams");
  
  def anchor(words: IndexedSeq[String]): SurfaceFeatureAnchoring[String] = {
    new SurfaceFeatureAnchoring[String] {
      def featuresForSpan(begin: Int, end: Int): Array[Feature] = {
//        println("Span: " + words.slice(begin, end));
        val unigramFeats = for (i <- begin until end) yield {
//          println(words(i) + ": " + wordCounts(words(i)));
          NGramUnigramFeature(if (wordCounts(words(i)) < ngramCountThreshold) -1 else wordIndex(words(i)));
        }
        val bigramFeats = for (i <- begin until end - 1) yield {
          val pair = (words(i), words(i+1));
//          println(pair + ": " + bigramCounts(pair));
          NGramBigramFeature(if (bigramCounts(pair) < ngramCountThreshold) -1 else bigramIndex(pair));
        }
        val notFeats = if (useNot) {
          val notFeats = new ArrayBuffer[Feature];
          var inNotSpan = false;
          for (i <- begin until end) {
            if (NGramSpanFeaturizer.NotWords.contains(words(i))) {
              inNotSpan = true;
            } else if (NGramSpanFeaturizer.NotEndingPunc.contains(words(i))) {
              inNotSpan = false;
            } else if (inNotSpan) {
//              println(words.slice(begin, end) + " (not span): " + words(i));
              notFeats += NotFeature(if (wordCounts(words(i)) < ngramCountThreshold) -1 else wordIndex(words(i)));
            }
          }
          notFeats.toArray;
        } else {
          Array[Feature]();
        }
        if (maxOrder >= 3) {
          val ngramFeats = (3 to maxOrder).flatMap(n => {
            for (i <- begin until end - n + 1) yield {
              val slice = words.slice(i, i+n);
//              println(slice + ": " + higherOrderCounts(n-3)(slice));
              NGramFeature(n, if (higherOrderCounts(n-3)(slice) < ngramCountThreshold) -1 else higherOrderIndices(n-3)(slice));
            }
          });
          (unigramFeats ++ bigramFeats ++ notFeats ++ ngramFeats).toArray;
        } else {
          (unigramFeats ++ bigramFeats ++ notFeats).toArray;
        }
        
      }
    }
  }
}

object NGramSpanFeaturizer {
  
  val NotWords = Set("not", "n't")
  val NotEndingPunc = Set(".", ",", ";", "--", ":")

  def countBigrams[L, W](data: TraversableOnce[TreeInstance[L, W]]) = {
    val bigrams = Counter[(W,W), Double]()
    for( ti <- data) {
      val TreeInstance(_, tree, words) = ti
      for (i <- 0 until words.size - 1) {
        bigrams((words(i), words(i+1))) += 1.0;
      }
    }
    bigrams
  }

  def countNgrams(allSents: Seq[Seq[String]], n: Int) = {
    val ngrams = Counter[Seq[String], Double]()
    for(sent <- allSents) {
      for (i <- 0 until sent.size - n) {
        ngrams(sent.slice(i, i+n)) += 1.0;
      }
    }
    ngrams
  }
}

case class NotFeature(idx: Int) extends Feature

case class NGramUnigramFeature(idx: Int) extends Feature

case class NGramBigramFeature(idx: Int) extends Feature

case class NGramFeature(n: Int, idx: Int) extends Feature