All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.cloudera.oryx.example.speed.ExampleScalaSpeedModelManager.scala Maven / Gradle / Ivy

The newest version!
/*
 * Copyright (c) 2014, Cloudera, Inc. All Rights Reserved.
 *
 * Cloudera, Inc. licenses this file to you under the Apache License,
 * Version 2.0 (the "License"). You may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for
 * the specific language governing permissions and limitations under the
 * License.
 */

package com.cloudera.oryx.example.speed

import scala.collection.mutable
import scala.collection.JavaConverters._
import com.fasterxml.jackson.databind.ObjectMapper
import org.apache.hadoop.conf.Configuration
import org.apache.spark.rdd.RDD
import com.cloudera.oryx.api.speed.AbstractScalaSpeedModelManager
import com.cloudera.oryx.example.batch.ExampleScalaBatchLayerUpdate
import com.typesafe.config.Config

/**
 * Speed-layer model manager that counts, per word, the number of distinct other words
 * that occur with it, and emits those counts.
 *
 * Listens for "MODEL" messages from the Batch Layer, which give the current correct count at
 * its last run. Updates these counts approximately in response to the same data stream
 * that the Batch Layer sees, but assumes all words seen are new and distinct, which is only
 * approximately true. Emits updates of the form "word,count".
 *
 * All access to the in-memory counts is guarded by synchronizing on the map itself.
 *
 * @param config configuration passed through to [[AbstractScalaSpeedModelManager]]
 */
class ExampleScalaSpeedModelManager(val config: Config)
  extends AbstractScalaSpeedModelManager[String,String,String](config) {

  /** Current approximate count per word; guarded by `distinctOtherWords.synchronized`. */
  private val distinctOtherWords = mutable.Map[String,Int]()

  /**
   * Replaces the in-memory counts with the model carried by a "MODEL" message.
   * Messages with any other key are ignored.
   *
   * @param key message key; only "MODEL" is acted upon
   * @param message for "MODEL", a JSON object mapping each word to its count
   * @param hadoopConf unused here
   */
  override def consumeKeyMessage(key: String, message: String, hadoopConf: Configuration): Unit = {
    key match {
      case "MODEL" =>
        // The value type parameter is erased at runtime, so Jackson binds JSON numbers to
        // Number instances and JSON strings to String. Read values as AnyRef and convert
        // explicitly so both {"word":3} and {"word":"3"} are accepted.
        val parsed =
          new ObjectMapper().readValue(message, classOf[java.util.Map[String,AnyRef]]).asScala
        // Convert everything before touching shared state, so a malformed value cannot
        // leave the model cleared or half-populated.
        val newCounts = parsed.map { case (word, count) => word -> toCount(count) }
        distinctOtherWords.synchronized {
          distinctOtherWords.clear()
          distinctOtherWords ++= newCounts
        }
      case _ => // ignore
    }
  }

  /**
   * Adds the counts computed from `newData` to the current counts and reports the new totals.
   *
   * @param newData new key/message pairs, passed to
   *                `ExampleScalaBatchLayerUpdate.countDistinctOtherWords`
   * @return one "word,count" string per word present in `newData`
   */
  override def buildUpdates(newData: RDD[(String,String)]): Seq[String] = {
    ExampleScalaBatchLayerUpdate.countDistinctOtherWords(newData).map { case (word, count) =>
      distinctOtherWords.synchronized {
        // A word absent from the last batch model (or seen before any model has arrived)
        // starts from zero rather than failing with NoSuchElementException.
        val newCount = count + distinctOtherWords.getOrElse(word, 0)
        distinctOtherWords(word) = newCount
        s"$word,$newCount"
      }
    }.toSeq
  }

  /** Converts a parsed JSON value (number or numeric string) to an Int count. */
  private def toCount(value: AnyRef): Int = value match {
    case n: Number => n.intValue
    case other => String.valueOf(other).toInt
  }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy