com.cloudera.oryx.example.batch.ExampleScalaBatchLayerUpdate.scala Maven / Gradle / Ivy
/*
* Copyright (c) 2014, Cloudera, Inc. All Rights Reserved.
*
* Cloudera, Inc. licenses this file to you under the Apache License,
* Version 2.0 (the "License"). You may not use this file except in
* compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
* CONDITIONS OF ANY KIND, either express or implied. See the License for
* the specific language governing permissions and limitations under the
* License.
*/
package com.cloudera.oryx.example.batch
import scala.collection.JavaConverters._
import scala.collection.Map
import com.fasterxml.jackson.databind.ObjectMapper
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import com.cloudera.oryx.api.TopicProducer
import com.cloudera.oryx.api.batch.ScalaBatchLayerUpdate
/**
* Input keys are ignored. Values are treated as lines of space-separated text. The job
* counts, for each word, the number of distinct other words that co-occur in some line
* of text in the input. These are written as a "MODEL" update, where the word-count mapping
* is written as a JSON string.
*/
class ExampleScalaBatchLayerUpdate extends ScalaBatchLayerUpdate[String,String,String] {

  /**
   * Counts distinct co-occurring words over all available data, serializes the
   * word-to-count mapping as a JSON string and sends it to the update topic under
   * the key "MODEL".
   *
   * @param sparkContext Spark context; not used by this implementation
   * @param timestamp timestamp of the current interval; not used by this implementation
   * @param newData data for the current interval; only the values are read
   * @param pastData previously seen data; treated as absent when `null`
   * @param modelDirString model output location; not used by this implementation
   * @param modelUpdateTopic topic that receives the "MODEL" update
   */
  override def configureUpdate(sparkContext: SparkContext,
                               timestamp: Long,
                               newData: RDD[(String,String)],
                               pastData: RDD[(String,String)],
                               modelDirString: String,
                               modelUpdateTopic: TopicProducer[String,String]): Unit = {
    // NOTE(review): the Oryx batch layer API documents pastData as possibly null when
    // there is no historical data yet; union with a null RDD would fail, so fall back
    // to newData alone in that case.
    val allData = Option(pastData).fold(newData)(past => newData.union(past))
    val counts = ExampleScalaBatchLayerUpdate.countDistinctOtherWords(allData)
    // Jackson serializes a java.util.Map as a JSON object, hence the asJava conversion.
    val modelString = new ObjectMapper().writeValueAsString(counts.asJava)
    modelUpdateTopic.send("MODEL", modelString)
  }
}
object ExampleScalaBatchLayerUpdate {

  /**
   * For each word, counts the number of distinct other words that appear together
   * with it on at least one line.
   *
   * Keys of `data` are ignored; each value is treated as one line of space-separated
   * words. A word that never shares a line with a different word does not appear in
   * the result.
   *
   * @param data (key, line) pairs; only the lines are read
   * @return mapping from word to its number of distinct co-occurring words, collected
   *         to the driver
   */
  def countDistinctOtherWords(data: RDD[(String,String)]): Map[String, Int] = {
    data.values.flatMap { line =>
      // split(" ") emits empty strings for leading or consecutive spaces; drop them so
      // "" is never counted as a word co-occurring with every real word on the line.
      val tokens = line.split(" ").filter(_.nonEmpty).distinct
      // All ordered pairs of different words on this line.
      for (a <- tokens; b <- tokens if a != b) yield (a, b)
    }.distinct()                // a pair seen on several lines counts only once
      .mapValues(_ => 1)
      .reduceByKey(_ + _)       // number of distinct partners per word
      .collectAsMap()
  }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy