com.intel.analytics.zoo.examples.anomalydetection.AnomalyDetection.scala Maven / Gradle / Ivy
/*
* Copyright 2018 Analytics Zoo Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.intel.analytics.zoo.examples.anomalydetection
import com.intel.analytics.bigdl.numeric.NumericFloat
import com.intel.analytics.bigdl.optim._
import com.intel.analytics.bigdl.utils.Shape
import com.intel.analytics.zoo.common.NNContext
import com.intel.analytics.zoo.models.anomalydetection._
import com.intel.analytics.zoo.pipeline.api.keras.objectives.MeanSquaredError
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.sql.functions._
import scopt.OptionParser
import org.joda.time.format.DateTimeFormat
case class Taxi(ts: String, value: Float)
case class LocalParams(val inputDir: String = "./data/NAB/nyc_taxi/",
val unrollLength: Int = 50,
val batchSize: Int = 1024,
val nEpochs: Int = 20
)
object AnomalyDetection {
def main(args: Array[String]): Unit = {
val defaultParams = LocalParams()
val parser = new OptionParser[LocalParams]("NCF Example") {
opt[String]("inputDir")
.text(s"inputDir")
.action((x, c) => c.copy(inputDir = x))
opt[Int]('b', "batchSize")
.text(s"batchSize")
.action((x, c) => c.copy(batchSize = x.toInt))
opt[Int]('e', "nEpochs")
.text("epoch numbers")
.action((x, c) => c.copy(nEpochs = x))
}
parser.parse(args, defaultParams).map {
params =>
run(params)
} getOrElse {
System.exit(1)
}
}
def run(param: LocalParams): Unit = {
Logger.getLogger("org").setLevel(Level.ERROR)
val conf = new SparkConf()
conf.setAppName("AnomalyDetectionExample")
val sc = NNContext.initNNContext(conf)
val sqlContext = SQLContext.getOrCreate(sc)
val featureDF = loadData(sqlContext, param.inputDir)
val featureShape = Shape(param.unrollLength, 3)
val unrolled: RDD[FeatureLabelIndex[Float]] =
assemblyFeature(featureDF, true, param.unrollLength)
val (trainRdd, testRdd) = Utils.trainTestSplit(unrolled, testSize = 1000)
val model: AnomalyDetector[Float] = AnomalyDetector[Float](featureShape)
model.compile(optimizer = new RMSprop(learningRate = 0.001, decayRate = 0.9),
loss = MeanSquaredError[Float](),
metrics = List( new MAE[Float]()))
model.fit(trainRdd, batchSize = param.batchSize, nbEpoch = param.nEpochs,
validationData = testRdd)
val predictions = model.predict(testRdd)
val yPredict: RDD[Float] = predictions.map(x => x.toTensor.toArray()(0))
val yTruth: RDD[Float] = testRdd.map(x => x.label.toArray()(0))
val anomalies = AnomalyDetector.detectAnomalies(yTruth, yPredict, 50)
anomalies.take(6).foreach(a => println("anomaly: " + a))
println("detection finished...")
sc.stop()
}
def loadData(sqlContext: SQLContext, dataPath: String): DataFrame = {
@transient lazy val formatter = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss")
import sqlContext.implicits._
val df = sqlContext.sparkContext.textFile(dataPath + "/nyc_taxi.csv")
.mapPartitionsWithIndex((idx, iter) => if (idx == 0) iter.drop(1) else iter)
.map(x => {
val line = x.split(",")
Taxi(line(0), line(1).toFloat)
}).toDF()
val hourUDF = udf((time: String) => (formatter.parseDateTime(time).hourOfDay().get()))
val awakeUDF = udf((hour: Int) => if (hour >= 6 && hour <= 23) 1 else 0)
val featureDF = df.withColumn("hour", hourUDF(col("ts")))
.withColumn("awake", awakeUDF(col("hour")))
.select("value", "hour", "awake")
featureDF
}
def assemblyFeature(featureDF: DataFrame,
ifScale: Boolean = true,
unrollLength: Int): RDD[FeatureLabelIndex[Float]] = {
val scaledDF = if (ifScale) {
Utils.standardScale(featureDF, Seq("value", "hour", "awake"))
} else {
featureDF
}
val featureLen = scaledDF.columns.length
val dataRdd: RDD[Array[Float]] = scaledDF.rdd
.map(row => (0 to featureLen - 1).toArray.map(x => row.getAs[Float](x)))
AnomalyDetector.unroll(dataRdd, unrollLength)
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy