All downloads are free. Search and download functionality uses the official Maven repository.

com.spotify.featran.FlatExtractor.scala Maven / Gradle / Ivy

The newest version!
/*
 * Copyright 2018 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.spotify.featran

import com.spotify.featran.transformers._
import simulacrum.typeclass

import scala.reflect.ClassTag
import scala.annotation.implicitNotFound

/**
 * TypeClass that is used to read data from flat files.  The requirement is that each
 * feature comes from the same type and can be looked up by name.
 *
 * Every method takes the feature name and returns a function from the intermediate
 * record `T` to an optional value of the requested shape.
 * NOTE(review): presumably `None` means the named field is absent from the record;
 * this is decided by each instance, so confirm against the concrete implementations.
 *
 * @tparam T The intermediate storage format for each feature.
 */
@implicitNotFound("Could not find an instance of FlatReader for ${T}")
@typeclass trait FlatReader[T] extends Serializable {
  /** Reader for the field `name` as a single `Double`. */
  def readDouble(name: String): T => Option[Double]

  /** Reader for the field `name` as an `MDLRecord[String]`. */
  def readMdlRecord(name: String): T => Option[MDLRecord[String]]

  /** Reader for the field `name` as a list of `WeightedLabel`s. */
  def readWeightedLabel(name: String): T => Option[List[WeightedLabel]]

  /** Reader for the field `name` as a sequence of `Double`s. */
  def readDoubles(name: String): T => Option[Seq[Double]]

  /** Reader for the field `name` as an array of `Double`s. */
  def readDoubleArray(name: String): T => Option[Array[Double]]

  /** Reader for the field `name` as a single `String`. */
  def readString(name: String): T => Option[String]

  /** Reader for the field `name` as a sequence of `String`s. */
  def readStrings(name: String): T => Option[Seq[String]]
}

/**
 * Companion of the [[FlatReader]] type class.
 *
 * Provides a summoner (`apply`) plus implicit conversions that wrap a value of `T` together
 * with its [[FlatReader]] instance in an `Ops`/`AllOps` holder.  Everything between the
 * markers below is generated by Simulacrum and must not be edited by hand.
 */
object FlatReader {
  /* ======================================================================== */
  /* THE FOLLOWING CODE IS MANAGED BY SIMULACRUM; PLEASE DO NOT EDIT!!!!      */
  /* ======================================================================== */

  /** Summon an instance of [[FlatReader]] for `T`. */
  @inline def apply[T](implicit instance: FlatReader[T]): FlatReader[T] = instance

  object ops {
    implicit def toAllFlatReaderOps[T](target: T)(implicit tc: FlatReader[T]): AllOps[T] {
      type TypeClassType = FlatReader[T]
    } = new AllOps[T] {
      type TypeClassType = FlatReader[T]
      val self: T = target
      val typeClassInstance: TypeClassType = tc
    }
  }
  trait Ops[T] extends Serializable {
    type TypeClassType <: FlatReader[T]
    def self: T
    val typeClassInstance: TypeClassType
  }
  trait AllOps[T] extends Ops[T]
  trait ToFlatReaderOps extends Serializable {
    implicit def toFlatReaderOps[T](target: T)(implicit tc: FlatReader[T]): Ops[T] {
      type TypeClassType = FlatReader[T]
    } = new Ops[T] {
      type TypeClassType = FlatReader[T]
      val self: T = target
      val typeClassInstance: TypeClassType = tc
    }
  }
  object nonInheritedOps extends ToFlatReaderOps

  /* ======================================================================== */
  /* END OF SIMULACRUM-MANAGED CODE                                           */
  /* ======================================================================== */

}

/**
 * Entry points for working with features held in a flat intermediate representation,
 * such as Examples or JSON.  Keeping features in such a format makes it easier to
 * interface with other systems.
 */
object FlatExtractor {

  /**
   * Creates a [[FlatExtractor]] that reads records of the flat format by feature name and
   * uses the supplied settings to produce the final output.
   *
   * @param settings Setting information
   * @tparam M Collection Type
   * @tparam T The intermediate format where the data is stored
   * @return Class for converting to Features
   */
  @inline def apply[M[_]: CollectionType, T: ClassTag: FlatReader](
    settings: M[String]
  ): FlatExtractor[M, T] =
    new FlatExtractor[M, T](settings)

  /**
   * Re-targets a [[FeatureSpec]] written against the Scala object `X` so that it reads its
   * inputs from the intermediate format `T` instead.  Each feature keeps its transformer and
   * default value; only the extraction function is replaced by the transformer's flat reader.
   *
   * @param spec Current FeatureSpec
   * @tparam T The intermediate format where the data is stored
   * @tparam X The Input Scala Object
   * @return FeatureSpec for the intermediate format
   */
  def flatSpec[T: ClassTag: FlatReader, X: ClassTag](spec: FeatureSpec[X]): FeatureSpec[T] = {
    val flatFeatures = spec.features.map { source =>
      // The element types of each feature are unknown here, so the transformer is viewed
      // with an `Any` input and the rebuilt feature is cast back to the wildcard form.
      val untyped = source.transformer.asInstanceOf[Transformer[Any, _, _]]
      new Feature(source.transformer.flatRead[T], source.default, untyped)
        .asInstanceOf[Feature[T, _, _, _]]
    }
    new FeatureSpec[T](flatFeatures, spec.crossings)
  }

  /**
   * Re-targets a [[MultiFeatureSpec]] written against the Scala object `X` so that it reads
   * its inputs from the intermediate format `T` instead.  The spec's mapping and crossings are
   * carried over unchanged; each feature's extraction function is replaced by the
   * transformer's flat reader.
   *
   * @param spec Current MultiFeatureSpec
   * @tparam T The intermediate format where the data is stored
   * @tparam X The Input Scala Object
   * @return MultiFeatureSpec for the intermediate format
   */
  def multiFlatSpec[T: ClassTag: FlatReader, X: ClassTag](
    spec: MultiFeatureSpec[X]
  ): MultiFeatureSpec[T] = {
    val flatFeatures = spec.features.map { source =>
      // Same erasure of the feature's element types as in `flatSpec`.
      val untyped = source.transformer.asInstanceOf[Transformer[Any, _, _]]
      new Feature(source.transformer.flatRead[T], source.default, untyped)
        .asInstanceOf[Feature[T, _, _, _]]
    }
    new MultiFeatureSpec[T](spec.mapping, flatFeatures, spec.crossings)
  }
}

/**
 * Extracts features from records stored in the flat intermediate format `T`, driven by
 * JSON-encoded settings instead of a `FeatureSpec`.
 *
 * @param settings collection of JSON strings, each of which must decode to a `Seq[Settings]`
 * @tparam M Collection Type
 * @tparam T The intermediate format where the data is stored
 */
private[featran] class FlatExtractor[M[_]: CollectionType, T: ClassTag: FlatReader](
  settings: M[String]
) extends Serializable {
  import json._
  import CollectionType.ops._
  import scala.reflect.runtime.universe

  // For every settings string: one (flat reader, encoded aggregator, transformer) triple per
  // decoded `Settings` entry.  The transformer is rebuilt by reflectively loading the module
  // named by `setting.cls` and calling its `fromSettings`.
  @transient private[this] val converters = settings.map { str =>
    val runtimeMirror = universe.runtimeMirror(getClass.getClassLoader)
    // Fail explicitly instead of `assert` + `.right.get`: `assert` can be elided at compile
    // time, after which `.right.get` would throw an uninformative NoSuchElementException.
    // The exception type and message prefix match what `assert` produced before; the decode
    // failure is appended so the cause is visible.
    val parsedSettings = decode[Seq[Settings]](str) match {
      case Right(parsed) => parsed
      case Left(failure) =>
        throw new AssertionError(
          s"assertion failed: Unable to parse the settings files. Cause: $failure"
        )
    }
    parsedSettings.map { setting =>
      val transformer = runtimeMirror
        .reflectModule(runtimeMirror.staticModule(setting.cls))
        .instance
        .asInstanceOf[SettingsBuilder]
        .fromSettings(setting)

      (transformer.flatRead[T], setting.aggregators, transformer)
    }
  }

  // Total feature dimension per settings string: the sum of every transformer's dimension
  // given its decoded aggregator.
  @transient private[this] val dimSize: M[Int] = converters.map { items =>
    items.map { case (_, aggr, tr) =>
      val ta = aggr.map(tr.decodeAggregator)
      tr.unsafeFeatureDimension(ta)
    }.sum
  }

  /**
   * Builds the feature values for each record, discarding rejection information.
   *
   * @param records records in the intermediate format
   * @tparam F output feature type, built through its `FeatureBuilder`
   * @return the first component of every [[featureResults]] element
   */
  def featureValues[F: FeatureBuilder: ClassTag](records: M[T]): M[F] =
    featureResults(records).map(_._1)

  /**
   * Builds the feature values for each record together with the rejections reported by the
   * builder.
   *
   * @param records records in the intermediate format
   * @tparam F output feature type, built through its `FeatureBuilder`
   * @return for every record crossed with the converters and dimension, the builder's
   *         `result` paired with its `rejections`
   */
  def featureResults[F: FeatureBuilder: ClassTag](
    records: M[T]
  ): M[(F, Map[String, FeatureRejection])] = {
    // A single builder instance is created here and re-initialised for every record.
    val fb = FeatureBuilder[F].newBuilder
    records.cross(converters).cross(dimSize).map { case ((record, convs), size) =>
      fb.init(size)
      convs.foreach { case (fn, aggr, conv) =>
        val a = aggr.map(conv.decodeAggregator)
        conv.unsafeBuildFeatures(fn(record), a, fb)
      }
      (fb.result, fb.rejections)
    }
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy