All downloads are free. Search and download functionality uses the official Maven repository.

io.prediction.data.view.DataView.scala Maven / Gradle / Ivy

The newest version!
/** Copyright 2015 TappingStone, Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

package io.prediction.data.view

import io.prediction.annotation.Experimental
import io.prediction.data.storage.Event

import grizzled.slf4j.Logger
import io.prediction.data.store.PEventStore

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.SQLContext
import org.joda.time.DateTime

import scala.reflect.ClassTag
import scala.reflect.runtime.universe._
import scala.util.hashing.MurmurHash3

/**
 * :: Experimental ::
 *
 * Helpers for turning the events of an app into a Spark DataFrame, with the
 * result cached as a Parquet file under `$PIO_FS_BASEDIR/view`.
 */
@Experimental
object DataView {
  /**
    * :: Experimental ::
    *
    * Create a DataFrame from events of a specified app.
    *
    * The converted events are cached as a Parquet file under `$PIO_FS_BASEDIR/view`.
    * The cache file name is built from `name`, `appName` and a hash of the time range,
    * `version`, the serialVersionUID of `E` and (when given) `channelName`. If a file
    * with that name can be read it is returned directly; otherwise the events are read
    * through PEventStore, converted, written to that file and then loaded from it.
    *
    * Note: when `untilTime` is None the current time is used as the upper bound and
    * becomes part of the cache key, so such calls will generally not reuse a copy
    * cached by an earlier call.
    *
    * @param appName return events of this app
    * @param channelName use events of this channel (default channel if it's None)
    * @param startTime return events with eventTime >= startTime
    * @param untilTime return events with eventTime < untilTime
    * @param conversionFunction a function that turns raw Events into events of interest.
    *                           If conversionFunction returns None, such events are dropped.
    * @param name identify the DataFrame created
    * @param version used to track changes to the conversionFunction, e.g. version = "20150413"
    *                and update whenever the function is changed.
    * @param sqlContext SQL context
    * @tparam E the output type of the conversion function. The type needs to extend Product
    *           (e.g. case class)
    * @return a DataFrame of events
    * @throws java.util.NoSuchElementException if the PIO_FS_BASEDIR environment variable
    *                                          is not set
    * @throws java.lang.RuntimeException rethrown when reading the cached file fails; if the
    *                                    message says the file is not a Parquet file, an error
    *                                    asking to delete it is logged first
    */
  @Experimental
  def create[E <: Product: TypeTag: ClassTag](
    appName: String,
    channelName: Option[String] = None,
    startTime: Option[DateTime] = None,
    untilTime: Option[DateTime] = None,
    conversionFunction: Event => Option[E],
    name: String = "",
    version: String = "")(sqlContext: SQLContext): DataFrame = {

    @transient lazy val logger = Logger[this.type]

    val sc = sqlContext.sparkContext

    // Resolve the time range once so the cache key and the query agree on the upper bound.
    val beginTime = startTime.getOrElse(new DateTime(0L))
    val endTime = untilTime.getOrElse(DateTime.now()) // fix the current time

    // detect changes to the case class
    val uid = java.io.ObjectStreamClass.lookup(implicitly[ClassTag[E]].runtimeClass)
        .getSerialVersionUID

    // The channel selects which events are read, so it has to be part of the cache key;
    // otherwise two channels of the same app would share (and overwrite-read) one file.
    // Nothing is appended for the default channel, which keeps its cache file names
    // identical to the ones produced before the channel was taken into account.
    val channelKey = channelName.fold("")(c => s"-channel:$c")
    val hash = MurmurHash3.stringHash(s"$beginTime-$endTime-$version-$uid$channelKey")
    val baseDir = s"${sys.env("PIO_FS_BASEDIR")}/view"
    val fileName = s"$baseDir/$name-$appName-$hash.parquet"

    // If a cached copy is found, use it. If not, grab the events from storage.
    try {
      sqlContext.parquetFile(fileName)
    } catch {
      case _: java.io.FileNotFoundException =>
        logger.info("Cached copy not found, reading from DB.")
        val result: RDD[E] = PEventStore.find(
            appName = appName,
            channelName = channelName,
            startTime = startTime,
            untilTime = Some(endTime))(sc)
          .flatMap(event => conversionFunction(event))
        import sqlContext.implicits._ // needed for RDD.toDF()
        val resultDF = result.toDF()

        // Persist the converted events, then serve the DataFrame from the cached file.
        resultDF.saveAsParquetFile(fileName)
        sqlContext.parquetFile(fileName)
      case e: java.lang.RuntimeException =>
        // Give a hint for the one failure we can recognise, then propagate unchanged.
        if (e.toString.contains("is not a Parquet file")) {
          logger.error(s"$fileName does not contain a valid Parquet file. " +
            "Please delete it and try again.")
        }
        throw e
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy