All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.etsy.conjecture.text.FeatureHelper.scala Maven / Gradle / Ivy

There is a newer version: 0.2.3
Show newest version
package com.etsy.conjecture.text

import com.etsy.conjecture.data.{ AbstractInstance, BinaryLabeledInstance, LabeledInstance, StringKeyedVector }

import com.twitter.algebird.Operators._

import cascading.tuple.Fields
import cascading.pipe.Pipe

import scala.collection.JavaConverters._

import spray.json._
import DefaultJsonProtocol._


object FeatureHelper {

    import com.twitter.scalding.Dsl._

    def keepFeaturesWithCountGreaterThan(pipe: Pipe, instance_field: Fields, n: Int): Pipe = {
        val counts = pipe
            .flatMapTo(instance_field -> ('term, '__count)) {
                v: AnyRef =>
                    val vector = v match {
                        case skv: StringKeyedVector => skv
                        case ins: AbstractInstance[_] => ins.getVector
                        case lin: LabeledInstance[_] => lin.getVector
                        case _ => throw new IllegalArgumentException("keepFeaturesWithCountGreaterThan does not expect class: " + v.getClass.getName)
                    }
                    vector.keySet.asScala.map { k => k -> 1 }
            }
            .groupBy('term) { _.sum[Long]('__count) }
            .filter('__count) { c: Long => c > n }
            .mapTo('term -> 'set) { t: String => Set(t) }
            .groupAll { _.sum[Set[String]]('set) }

        pipe
            .crossWithTiny(counts)
            .map(instance_field.append('set) -> instance_field) { x: (AnyRef, Set[String]) =>
                val skv = x._1 match {
                    case s: StringKeyedVector => s
                    case i: AbstractInstance[_] => i.getVector
                    case l: LabeledInstance[_] => l.getVector
                    case _ => throw new IllegalArgumentException("keepFeaturesWithCountGreaterThan does not expect class: " + x._1.getClass.getName)
                }
                val it = skv.iterator
                while (it.hasNext) {
                    val e = it.next
                    if (!x._2.contains(e.getKey)) {
                        it.remove
                    }
                }
                x._1
            }
    }

    def nGramsUpTo(string: String, n: Int = 2, prefix: String = ""): List[String] = {
        val toks = Text(string.toLowerCase).standardTextFilter.toString.split(" ").toList
        val toks_pad = "" +: toks :+ ""
        val grams = (1 to n).map { m => toks_pad.sliding(m).toList.map { p => p.mkString("::") } }.foldLeft(List[String]()) { _ ++ _ }
        grams.filter { g => g != "" }.map { g => prefix + g }
    }

    def stringListToSKV(list: List[String], weight: Double = 1.0): StringKeyedVector = {
        val skv = new StringKeyedVector();
        list.foreach { f => skv.setCoordinate(f, weight) }
        skv
    }

    def getEmailBody(body: String): Option[String] = {
        val p = parseEmailBodyToTextAndType(body)
        if (p._1 != null)
            Some(p._1)
        else
            None
    }

    def parseEmailBodyToTextAndType(body: String): (String, String) = {
        try {
            val email = JsonParser(body).convertTo[List[Map[String, String]]]
            val textParts = email.filter(part => part("type") == "text/plain")
            if (textParts.length > 0)
                (textParts.map(part => part("body")).mkString(" "), "text/plain")
            else {
                val htmlParts = email.filter(part => part("type") == "text/html")
                if (htmlParts.length > 0)
                    (htmlParts.map(part => part("body")).mkString(" "), "text/html")
                else
                    (null, "filter") // Filter this email
            }
        } catch {
            case _ : Exception => (null, "filter")
        }
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy