All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.spotify.featran.FeatureSpec.scala Maven / Gradle / Ivy

The newest version!
/*
 * Copyright 2017 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.spotify.featran

import com.spotify.featran.transformers.{Settings, Transformer}

import scala.collection.mutable
import scala.reflect.ClassTag

/** Companion object for [[FeatureSpec]]. */
object FeatureSpec extends FeatureSpecCompat {
  private[featran] type ARRAY = Array[Option[Any]]

  /**
   * Create a new [[FeatureSpec]] for input record type `T`.
   * @tparam T input record type to extract features from
   */
  def of[T]: FeatureSpec[T] = new FeatureSpec[T](Array.empty, Crossings.empty)

  /** Combine multiple [[FeatureSpec]]s into a single spec. */
  def combine[T](specs: FeatureSpec[T]*): FeatureSpec[T] = {
    require(specs.nonEmpty, "Empty specs")
    new FeatureSpec(specs.map(_.features).reduce(_ ++ _), specs.map(_.crossings).reduce(_ ++ _))
  }
}

/**
 * Encapsulate specification for feature extraction and transformation.
 * @tparam T input record type to extract features from
 */
class FeatureSpec[T] private[featran] (
  private[featran] val features: Array[Feature[T, _, _, _]],
  private[featran] val crossings: Crossings
) {
  private def featureSet: FeatureSet[T] = new FeatureSet[T](features, crossings)

  /**
   * Add a required field specification.
   * @param f function to extract feature `A` from record `T`
   * @param t [[com.spotify.featran.transformers.Transformer Transformer]] for extracted feature `A`
   * @tparam A extracted feature type
   */
  def required[A](f: T => A)(t: Transformer[A, _, _]): FeatureSpec[T] =
    optional(t => Some(f(t)))(t)

  /**
   * Add an optional field specification.
   * @param f function to extract feature `Option[A]` from record `T`
   * @param default default for missing values
   * @param t [[com.spotify.featran.transformers.Transformer Transformer]] for extracted feature `A`
   * @tparam A extracted feature type
   */
  def optional[A](f: T => Option[A], default: Option[A] = None)(
    t: Transformer[A, _, _]
  ): FeatureSpec[T] =
    new FeatureSpec[T](this.features :+ new Feature(f, default, t), this.crossings)

  /**
   * Cross feature values of two underlying transformers.
   * @param k names of transformers to be crossed
   * @param f function to cross feature value pairs
   */
  def cross(k: (String, String))(f: (Double, Double) => Double): FeatureSpec[T] = {
    val names: Set[String] = features.iterator.map(_.transformer.name).toSet
    val d = Set(k._1, k._2).diff(names)
    require(d.isEmpty, s"Feature ${d.mkString(", ")} not found")
    new FeatureSpec[T](this.features, this.crossings + (k -> f))
  }

  /**
   * Compose with another spec by applying a prepare function to input records first.
   *
   * Useful for reusing an existing spec for a different input record type.
   * @param spec spec to compose with
   * @param f function to prepare input records for the other spec
   * @tparam S input record type of the other spec
   */
  def compose[S](spec: FeatureSpec[S])(f: T => S): FeatureSpec[T] = {
    val composedFeatures = spec.features.map { feature =>
      val t = feature.transformer.asInstanceOf[Transformer[Any, _, _]]
      new Feature(f.andThen(feature.f), feature.default, t)
    }
    new FeatureSpec[T](this.features ++ composedFeatures, this.crossings ++ spec.crossings)
  }

  /**
   * Extract features from an input collection.
   *
   * This is done in two steps, a `reduce` step over the collection to aggregate feature summary,
   * and a `map` step to transform values using the summary.
   * @param input input collection
   * @tparam M input collection type, e.g. `Array`, `List`
   */
  def extract[M[_]: CollectionType](input: M[T]): FeatureExtractor[M, T] = {
    import CollectionType.ops._

    val fs = input.pure(featureSet)
    new FeatureExtractor[M, T](fs, input, None)
  }

  /**
   * Creates a new FeatureSpec with only the features that respect the given predicate.
   *
   * @param predicate Function determining whether or not to include the feature
   */
  def filter(predicate: Feature[T, _, _, _] => Boolean): FeatureSpec[T] = {
    val filteredFeatures = features.filter(predicate)
    val featuresByName = {
      val b = Map.newBuilder[String, Feature[T, _, _, _]]
      b ++= filteredFeatures.iterator.map(f => f.transformer.name -> f)
      b.result()
    }
    val filteredCrossings = crossings.filter[T](featuresByName.contains)

    new FeatureSpec[T](filteredFeatures, filteredCrossings)
  }

  /**
   * Extract features from an input collection using a partial settings from a previous session.
   *
   * This bypasses the `reduce` step in [[extract]] and uses feature summary from settings exported
   * in a previous session.
   * @param input input collection
   * @param settings JSON settings from a previous session
   * @tparam M input collection type, e.g. `Array`, `List`
   */
  def extractWithSubsetSettings[M[_]: CollectionType](
    input: M[T],
    settings: M[String]
  ): FeatureExtractor[M, T] = {
    import json._
    import CollectionType.ops._

    val featureSet = settings.map { s =>
      val settingsJson = decode[Seq[Settings]](s).right.get
      val predicate: Feature[T, _, _, _] => Boolean =
        f => settingsJson.exists(x => x.name == f.transformer.name)

      filter(predicate).featureSet
    }

    new FeatureExtractor[M, T](featureSet, input, Some(settings))
  }

  /**
   * Extract features from an input collection using settings from a previous session.
   *
   * This bypasses the `reduce` step in [[extract]] and uses feature summary from settings exported
   * in a previous session.
   * @param input input collection
   * @param settings JSON settings from a previous session
   * @tparam M input collection type, e.g. `Array`, `List`
   */
  def extractWithSettings[M[_]: CollectionType](
    input: M[T],
    settings: M[String]
  ): FeatureExtractor[M, T] = {
    import CollectionType.ops._

    val fs = input.pure(featureSet)
    new FeatureExtractor[M, T](fs, input, Some(settings))
  }

  /**
   * Extract features from individual records using partial settings. Since the
   * settings are parsed only once, this is more efficient and is recommended when the input is
   * from an unbounded source, e.g. a stream of events or a backend service.
   *
   * This bypasses the `reduce` step in [[extract]] and uses feature summary from settings exported
   * in a previous session.
   * @param settings JSON settings from a previous session
   */
  def extractWithSubsetSettings[F: FeatureBuilder: ClassTag](
    settings: String
  ): RecordExtractor[T, F] = {
    import json._

    val s = decode[Seq[Settings]](settings).right.get
    val predicate: Feature[T, _, _, _] => Boolean = f => s.exists(x => x.name == f.transformer.name)

    new RecordExtractor[T, F](filter(predicate).featureSet, settings)
  }

  /**
   * Extract features from individual records using settings from a previous session. Since the
   * settings are parsed only once, this is more efficient and is recommended when the input is
   * from an unbounded source, e.g. a stream of events or a backend service.
   *
   * This bypasses the `reduce` step in [[extract]] and uses feature summary from settings exported
   * in a previous session.
   * @param settings JSON settings from a previous session
   */
  def extractWithSettings[F: FeatureBuilder: ClassTag](settings: String): RecordExtractor[T, F] =
    new RecordExtractor[T, F](new FeatureSet[T](features, crossings), settings)
}

class Feature[T, A, B, C] private[featran] (
  val f: T => Option[A],
  val default: Option[A],
  val transformer: Transformer[A, B, C]
) extends Serializable {
  def get(t: T): Option[A] = f(t).orElse(default)

  // Option[A] => Option[B]
  def unsafePrepare(a: Option[Any]): Option[B] =
    a.asInstanceOf[Option[A]].map(transformer.aggregator.prepare)

  // (Option[B], Option[B]) => Option[B]
  def unsafeSum(x: Option[Any], y: Option[Any]): Option[Any] =
    (x.asInstanceOf[Option[B]], y.asInstanceOf[Option[B]]) match {
      case (Some(a), Some(b)) =>
        Some(transformer.aggregator.semigroup.plus(a, b))
      case (Some(a), None) => Some(a)
      case (None, Some(b)) => Some(b)
      case _               => None
    }

  // Option[B] => Option[C]
  def unsafePresent(b: Option[Any]): Option[C] =
    b.asInstanceOf[Option[B]].map(transformer.aggregator.present)

  // Option[C] => Int
  def unsafeFeatureDimension(c: Option[Any]): Int =
    transformer.optFeatureDimension(c.asInstanceOf[Option[C]])

  // Option[C] => Array[String]
  def unsafeFeatureNames(c: Option[Any]): Seq[String] =
    transformer.optFeatureNames(c.asInstanceOf[Option[C]])

  // (Option[A], Option[C], FeatureBuilder[F])
  def unsafeBuildFeatures(a: Option[Any], c: Option[Any], fb: FeatureBuilder[_]): Unit =
    transformer.optBuildFeatures(a.asInstanceOf[Option[A]], c.asInstanceOf[Option[C]], fb)

  // Option[C]
  def unsafeSettings(c: Option[Any]): Settings =
    transformer.settings(c.asInstanceOf[Option[C]])
}

private class FeatureSet[T](
  private[featran] val features: Array[Feature[T, _, _, _]],
  private[featran] val crossings: Crossings
) extends Serializable {
  {
    val (_, dups) = features.foldLeft((Set.empty[String], Set.empty[String])) { case ((u, d), f) =>
      val n = f.transformer.name
      if (u.contains(n)) {
        (u, d + n)
      } else {
        (u + n, d)
      }
    }
    require(dups.isEmpty, "duplicate transformer names: " + dups.mkString(", "))
  }

  import FeatureSpec.ARRAY

  protected val n: Int = features.length

  // T => Array[Option[A]]
  def unsafeGet(t: T): ARRAY = features.map(_.get(t))

  // Array[Option[A]] => Array[Option[B]]
  def unsafePrepare(a: ARRAY): ARRAY = {
    require(n == a.length)
    var i = 0
    val r = Array.fill[Option[Any]](n)(null)
    while (i < n) {
      r(i) = features(i).unsafePrepare(a(i))
      i += 1
    }
    r
  }

  // (Array[Option[B]], Array[Option[B]]) => Array[Option[B]]
  def unsafeSum(lhs: ARRAY, rhs: ARRAY): ARRAY = {
    require(n == lhs.length)
    require(n == rhs.length)
    val r = Array.fill[Option[Any]](n)(null)
    var i = 0
    while (i < n) {
      r(i) = features(i).unsafeSum(lhs(i), rhs(i))
      i += 1
    }
    r
  }

  // Array[Option[B]] => Array[Option[C]]
  def unsafePresent(b: ARRAY): ARRAY = {
    require(n == b.length)
    var i = 0
    val r = Array.fill[Option[Any]](n)(null)
    while (i < n) {
      r(i) = features(i).unsafePresent(b(i))
      i += 1
    }
    r
  }

  // Array[Option[C]] => Int
  def featureDimension(c: ARRAY): Int = {
    require(n == c.length)
    var sum = 0
    var i = 0
    val m = mutable.Map.empty[String, Int]
    while (i < n) {
      val f = features(i)
      val size = f.unsafeFeatureDimension(c(i))
      sum += size
      val name = f.transformer.name
      if (crossings.keys.contains(name)) {
        m(name) = size
      }
      i += 1
    }
    crossings.map.keys.foreach { case (n1, n2) =>
      sum += m(n1) * m(n2)
    }
    sum
  }

  // Array[Option[C]] => Array[String]
  def featureNames(c: ARRAY): Seq[String] = {
    require(n == c.length)
    val b = Seq.newBuilder[String]
    var i = 0
    val m = mutable.Map.empty[String, Seq[String]]
    while (i < n) {
      val f = features(i)
      val names = f.unsafeFeatureNames(c(i))
      b ++= names
      val name = f.transformer.name
      if (crossings.keys.contains(name)) {
        m(name) = names
      }
      i += 1
    }
    crossings.map.keys.foreach { case (n1, n2) =>
      for {
        x <- m(n1)
        y <- m(n2)
      } {
        b += Crossings.name(x, y)
      }
    }
    b.result()
  }

  // (Array[Option[A]], Array[Option[C]], FeatureBuilder[F])
  def featureValues[F](a: ARRAY, c: ARRAY, fb: FeatureBuilder[F]): Unit = {
    require(n == c.length)
    fb.init(featureDimension(c))
    var i = 0
    while (i < n) {
      val f = features(i)
      fb.prepare(f.transformer)
      f.unsafeBuildFeatures(a(i), c(i), fb)
      i += 1
    }
  }

  // Option[C]
  def featureSettings(c: ARRAY): Seq[Settings] = {
    require(n == c.length)
    val b = Seq.newBuilder[Settings]
    var i = 0
    while (i < n) {
      b += features(i).unsafeSettings(c(i))
      i += 1
    }
    b.result()
  }

  def decodeAggregators(s: Seq[Settings]): ARRAY = {
    val m: Map[String, Settings] = s.iterator.map(x => (x.name, x)).toMap
    features.map { feature =>
      val name = feature.transformer.name
      require(m.contains(name), s"Missing settings for $name")
      m(feature.transformer.name).aggregators.map(feature.transformer.decodeAggregator)
    }
  }
}

private class MultiFeatureSet[T](
  features: Array[Feature[T, _, _, _]],
  crossings: Crossings,
  private val mapping: Map[String, Int]
) extends FeatureSet[T](features, crossings)
    with Serializable {
  import FeatureSpec.ARRAY

  private[this] val dims = mapping.values.toSet.size

  def multiFeatureBuilders[F: FeatureBuilder]: Array[FeatureBuilder[F]] =
    // each underlying FeatureSpec should get a unique copy of FeatureBuilder
    Array.fill(dims) {
      CrossingFeatureBuilder(FeatureBuilder[F].newBuilder, crossings)
    }

  // Array[Option[C]] => Array[String]
  def multiFeatureNames(c: ARRAY): Seq[Seq[String]] = {
    require(n == c.length)
    val bs = (0 until dims).map(_ => Seq.newBuilder[String])
    var i = 0
    val maps = Array.fill(dims)(mutable.Map.empty[String, Seq[String]])
    while (i < n) {
      val f = features(i)
      val names = f.unsafeFeatureNames(c(i))
      val tName = f.transformer.name
      val idx = mapping(tName)
      names.foreach(bs(idx) += _)
      if (crossings.keys.contains(tName)) {
        maps(idx)(tName) = names
      }
      i += 1
    }
    var idx = 0
    while (idx < dims) {
      val m = maps(idx).withDefaultValue(Nil)
      crossings.map.keys.foreach { case (n1, n2) =>
        for {
          x <- m(n1)
          y <- m(n2)
        } {
          bs(idx) += Crossings.name(x, y)
        }
      }
      idx += 1
    }
    bs.map(_.result())
  }

  // Array[Option[C]] => Array[Int]
  def multiFeatureDimension(c: ARRAY): Array[Int] = {
    require(n == c.length)
    val sums = Array.fill[Int](dims)(0)
    var i = 0
    val maps = Array.fill(dims)(mutable.Map.empty[String, Int])
    while (i < n) {
      val f = features(i)
      val size = f.unsafeFeatureDimension(c(i))
      val tName = f.transformer.name
      val idx = mapping(tName)
      sums(idx) += size
      if (crossings.keys.contains(tName)) {
        maps(idx)(tName) = size
      }
      i += 1
    }
    var idx = 0
    while (idx < dims) {
      val m = maps(idx).withDefaultValue(0)
      crossings.map.keys.foreach { case (n1, n2) =>
        sums(idx) += m(n1) * m(n2)
      }
      idx += 1
    }
    sums
  }

  // (Array[Option[A]], Array[Option[C]], FeatureBuilder[F])
  def multiFeatureValues[F](a: ARRAY, c: ARRAY, fbs: Array[FeatureBuilder[F]]): Unit = {
    var i = 0
    val counts = multiFeatureDimension(c)
    while (i < fbs.length) {
      fbs(i).init(counts(i))
      i += 1
    }
    i = 0
    while (i < n) {
      val f = features(i)
      val fb = fbs(mapping(f.transformer.name))
      fb.prepare(f.transformer)
      f.unsafeBuildFeatures(a(i), c(i), fb)
      i += 1
    }
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy