All Downloads are FREE. Search and download functionalities are using the official Maven repository.

io.prediction.data.view.PBatchView.scala Maven / Gradle / Ivy

The newest version!
/** Copyright 2015 TappingStone, Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

package io.prediction.data.view

import io.prediction.data.storage.hbase.HBPEvents
import io.prediction.data.storage.Event
import io.prediction.data.storage.EventValidation
import io.prediction.data.storage.DataMap
import io.prediction.data.storage.Storage

import org.joda.time.DateTime

import org.json4s.JValue

import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd.RDD


/** A single property value paired with the time at which it was set.
  *
  * @param d the JSON value of the property
  * @param t the time the value was set (populated from `eventTime.getMillis`
  *          by `EventOp.apply`)
  */
private[prediction] case class PropTime(d: JValue, t: Long) extends Serializable

/** Accumulated property assignments for one entity.
  *
  * @param fields property name -> latest known value together with its set time
  * @param t      last set time. Note: fields could be empty with valid set time
  */
private[prediction] case class SetProp (
  fields: Map[String, PropTime],
  t: Long) extends Serializable {

  /** Merges two sets of assignments.
    *
    * For a key present on both sides the value with the strictly greater time
    * from `this` is kept; otherwise (including ties) the value from `that`
    * wins. Keys present on only one side are carried over unchanged. The
    * resulting set time is the larger of the two set times.
    */
  def ++ (that: SetProp): SetProp = {
    // Start from our own fields and fold the other side's entries in,
    // overwriting unless our entry is strictly newer.
    val merged = that.fields.foldLeft(fields) { case (acc, (key, theirs)) =>
      acc.get(key) match {
        case Some(ours) if ours.t > theirs.t => acc
        case _ => acc + (key -> theirs)
      }
    }

    SetProp(fields = merged, t = math.max(this.t, that.t))
  }
}

/** Accumulated property removals for one entity.
  *
  * @param fields property name -> time at which the property was unset
  */
private[prediction] case class UnsetProp (fields: Map[String, Long]) extends Serializable {

  /** Merges two sets of removals, keeping the latest unset time per key.
    * Keys present on only one side are carried over unchanged.
    */
  def ++ (that: UnsetProp): UnsetProp = {
    val merged = that.fields.foldLeft(fields) { case (acc, (key, theirTime)) =>
      val latest = acc.get(key)
        .map(ourTime => math.max(ourTime, theirTime))
        .getOrElse(theirTime)
      acc.updated(key, latest)
    }

    UnsetProp(fields = merged)
  }
}

/** Marks that the entity was deleted at time `t`.
  *
  * @param t time of the delete
  */
private[prediction] case class DeleteEntity (t: Long) extends Serializable {

  /** Keeps the later of the two deletes; on equal times `that` is returned. */
  def ++ (that: DeleteEntity): DeleteEntity =
    if (that.t >= this.t) that else this
}

/** Combined effect of the set / unset / delete operations seen for one entity.
  *
  * Instances are merged with `++` and finally resolved into the entity's
  * current properties with `toDataMap()`.
  *
  * @param setProp      accumulated property assignments, if any
  * @param unsetProp    accumulated property removals, if any
  * @param deleteEntity latest delete of the entity, if any
  */
private[prediction] case class EventOp (
  val setProp: Option[SetProp] = None,
  val unsetProp: Option[UnsetProp] = None,
  val deleteEntity: Option[DeleteEntity] = None
) extends Serializable {

  /** Merges two operations component-wise. A component present on only one
    * side is kept as is; one present on both sides is merged with its own `++`.
    */
  def ++ (that: EventOp): EventOp = {
    EventOp(
      setProp = (setProp ++ that.setProp).reduceOption(_ ++ _),
      unsetProp = (unsetProp ++ that.unsetProp).reduceOption(_ ++ _),
      deleteEntity = (deleteEntity ++ that.deleteEntity).reduceOption(_ ++ _)
    )
  }

  /** Resolves the accumulated operations into the entity's properties.
    *
    * @return `None` when nothing was ever set, or when the entity was deleted
    *         at or after its last set time; otherwise the set fields minus
    *         every field that was unset or deleted at or after the time it
    *         was set.
    */
  def toDataMap(): Option[DataMap] = {
    setProp.flatMap { set =>

      // A field counts as unset when the unset happened at or after the time
      // the field was set. Keys that were unset but never set are skipped:
      // there is nothing to remove, and looking them up with set.fields(k)
      // would throw NoSuchElementException.
      val unsetKeys: Set[String] = unsetProp.map( unset =>
        unset.fields.filter { case (k, unsetTime) =>
          set.fields.get(k).exists(prop => unsetTime >= prop.t)
        }.keySet
      ).getOrElse(Set())

      val combinedFields: Option[Map[String, PropTime]] = deleteEntity match {
        // deleted at or after the last set: the entity no longer exists
        case Some(delete) if delete.t >= set.t =>
          None
        // deleted earlier: only fields set after the delete survive
        case Some(delete) =>
          val deleteKeys: Set[String] = set.fields
            .filter { case (_, prop) => delete.t >= prop.t }
            .keySet
          Some(set.fields -- unsetKeys -- deleteKeys)
        case None =>
          Some(set.fields -- unsetKeys)
      }

      // Note: mapValues() doesn't return concrete Map and causes
      // NotSerializableException issue (see
      // https://issues.scala-lang.org/browse/SI-7005), so build a strict Map
      // with map() instead.
      combinedFields.map { f =>
        DataMap(f.map { case (k, prop) => (k, prop.d) })
      }
    }
  }

}

private[prediction] object EventOp {

  /** Translates a single event into the operation it represents.
    *
    * The event's `eventTime` in milliseconds is used as the operation time:
    *  - `$set`    yields a `SetProp` holding every property with that time
    *  - `$unset`  yields an `UnsetProp` mapping every property key to that time
    *  - `$delete` yields a `DeleteEntity` at that time
    *  - any other event name yields an empty `EventOp`
    */
  def apply(e: Event): EventOp = {
    val millis = e.eventTime.getMillis

    e.event match {
      case "$set" =>
        // map() builds a strict Map, so no lazy mapValues() view is retained
        val timedProps = e.properties.fields.map { case (name, value) =>
          name -> PropTime(value, millis)
        }
        EventOp(setProp = Some(SetProp(fields = timedProps, t = millis)))

      case "$unset" =>
        val unsetTimes = e.properties.fields.map { case (name, _) =>
          name -> millis
        }
        EventOp(unsetProp = Some(UnsetProp(fields = unsetTimes)))

      case "$delete" =>
        EventOp(deleteEntity = Some(DeleteEntity(millis)))

      case _ =>
        EventOp()
    }
  }
}

/** Spark-based batch view over the events of one app.
  *
  * @param appId     app whose events are read
  * @param startTime passed through to the events store as `startTime`
  * @param untilTime passed through to the events store as `untilTime`
  * @param sc        Spark context used to read the events
  */
@deprecated("Use PEvents or PEventStore instead.", "0.9.2")
class PBatchView(
  val appId: Int,
  val startTime: Option[DateTime],
  val untilTime: Option[DateTime],
  val sc: SparkContext) {

  // NOTE: parallel Events DB interface
  @transient lazy val eventsDb = Storage.getPEvents()

  /** Events of the app in the configured time window, for any entity. */
  @transient lazy val _events: RDD[Event] =
    eventsDb.getByAppIdAndTimeAndEntity(
      appId = appId,
      startTime = startTime,
      untilTime = untilTime,
      entityType = None,
      entityId = None)(sc)

  // TODO: change to use EventSeq?
  @transient lazy val events: RDD[Event] = _events

  /** Aggregates the special events of one entity type into the current
    * properties of each entity.
    *
    * NOTE(review): `startTimeOpt` and `untilTimeOpt` are accepted but are not
    * used by this method; only the time window given to the constructor
    * restricts the events.
    *
    * @param entityType   entity type whose events are aggregated
    * @param startTimeOpt currently unused
    * @param untilTimeOpt currently unused
    * @return entity id paired with its resolved properties; entities whose
    *         operations resolve to no `DataMap` are left out
    */
  def aggregateProperties(
    entityType: String,
    startTimeOpt: Option[DateTime] = None,
    untilTimeOpt: Option[DateTime] = None
  ): RDD[(String, DataMap)] = {

    // one EventOp per special event, keyed by the entity it applies to
    val opsByEntity: RDD[(String, EventOp)] = _events
      .filter { ev =>
        ev.entityType == entityType && EventValidation.isSpecialEvents(ev.event)
      }
      .map { ev => (ev.entityId, EventOp(ev)) }

    val mergedOps: RDD[(String, EventOp)] =
      opsByEntity.aggregateByKey[EventOp](EventOp())(
        // within same partition
        seqOp = (acc, op) => acc ++ op,
        // across partition
        combOp = (left, right) => left ++ right
      )

    // keep only the entities that still resolve to a DataMap
    mergedOps
      .mapValues(_.toDataMap)
      .collect { case (id, Some(props)) => (id, props) }
  }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy