
com.datastax.data.prepare.spark.dataset.OutlierDetection.scala Maven / Gradle / Ivy
package com.datastax.data.prepare.spark.dataset
import com.datastax.insight.core.driver.SparkContextBuilder
import com.datastax.data.prepare.spark.dataset.params.{DistanceContainer, OutlierObject}
import com.datastax.data.prepare.spark.math.{ArrayCorrelation, DistanceMetricMethods, VectorCorrelation}
import com.datastax.data.prepare.util.{Consts, SharedMethods}
import org.apache.spark.sql._
import org.apache.spark.sql.types.{StructField, StructType}
import scala.collection.mutable.{ArrayBuffer}
object OutlierDetection {
// Name of the temporary row-id column that buildDataset adds, joins on and then drops.
final private val increasingId = "increasingID"
// Name of the result column that buildDataset appends to the input Dataset.
final private val outlier = "Outlier"
// Only the length of this prefix is used: distanceOutlier drops that many leading characters
// from OutlierObject.label and parses the rest as an array position.
// NOTE(review): presumably labels look like "vector-<position>" — confirm against VectorCorrelation.
final private val labelPreffix = "vector-"
/**
  * Distance-based outlier detection over a Dataset.
  *
  * Rows are converted to [[OutlierObject]]s by `VectorCorrelation.row2OutlierObject`, the
  * array overload of `distanceOutlier` marks the outliers in place, and `buildDataset`
  * (mode 1) attaches the resulting status column to `data`.
  *
  * @param data input rows
  * @param cols columns handed to `row2OutlierObject`
  * @param k    number of distance entries kept per object
  * @param n    number of objects to mark as outliers
  * @param t    distance metric selector, see `getDistance`
  * @return `data` joined with the "Outlier" column
  * @throws IllegalArgumentException if the conversion yields no objects
  */
def distanceOutlier(data: Dataset[Row], cols: Array[String], k: Int, n: Int, t: String): Dataset[Row] = {
  val points: Array[OutlierObject] =
    VectorCorrelation.row2OutlierObject(data, cols).toArray[OutlierObject]
  require(points.nonEmpty)

  // Mutates `points` in place; the statuses are read back by buildDataset.
  distanceOutlier(points, k, n, t)
  buildDataset(data, points, 1)
}
/**
  * Marks distance-based outliers in place.
  *
  * For every object, the `k` entries that `ArrayCorrelation.seekPreNumberOfK` selects from its
  * distances to all other objects are stored as its container, together with the mean of those
  * distances. The `n` objects that `seekPreNumberOfK` then selects by pre-k mean get their
  * outlier status set to `true`.
  *
  * NOTE(review): presumably the comparators passed to `seekPreNumberOfK` yield the k nearest
  * neighbours and the n largest means respectively — confirm against ArrayCorrelation.
  *
  * @param outlierObjects objects to analyse; mutated in place
  * @param k              number of distance entries kept per object
  * @param n              number of objects to mark as outliers
  * @param t              distance metric selector, see `getDistance`
  */
def distanceOutlier(outlierObjects: Array[OutlierObject], k: Int, n: Int, t: String): Unit = {
  outlierObjects.indices.foreach { idx =>
    val current = outlierObjects(idx)
    current.setContainer(ArrayCorrelation.seekPreNumberOfK(distanceArray(outlierObjects, idx, t), k)(_.getDistance > _.getDistance))
    current.setPreKMean(preKMean(current.getContainer))
  }

  ArrayCorrelation
    .seekPreNumberOfK(outlierObjects, n)(_.getPreKMean < _.getPreKMean)
    .foreach { candidate =>
      // The array position is recovered from the label by skipping the prefix characters.
      val position = candidate.label.substring(labelPreffix.length).toInt
      outlierObjects(position).setOutlierStatus(true)
    }
}
/**
  * Density-based outlier detection over a Dataset.
  *
  * Rows are converted to [[OutlierObject]]s, the array overload of `densitiesOutlier` evaluates
  * each object against `distance` and `proportion`, and `buildDataset` (mode 1) attaches the
  * resulting status column to `data`.
  *
  * @param data       input rows
  * @param cols       columns handed to `row2OutlierObject`
  * @param distance   distance threshold
  * @param proportion proportion threshold passed to `OutlierObject.checkProportion`
  * @param t          distance metric selector, see `getDistance`
  * @return `data` joined with the "Outlier" column
  * @throws IllegalArgumentException if the conversion yields no objects
  */
def densitiesOutlier(data: Dataset[Row], cols: Array[String], distance: Double, proportion: Double, t: String): Dataset[Row] = {
  val points: Array[OutlierObject] =
    VectorCorrelation.row2OutlierObject(data, cols).toArray[OutlierObject]
  require(points.nonEmpty)

  // The denominator for the proportions is the row count of the input Dataset.
  densitiesOutlier(points, distance, proportion, t, data.count())
  buildDataset(data, points, 1)
}
/**
  * Computes, for every object, the count returned by `pointsNumberWithinDistance` divided by
  * `total`, stores it as the object's proportion and lets the object compare it with
  * `proportion` via `checkProportion`.
  *
  * @param outlierObjects objects to analyse; mutated in place
  * @param distance       distance threshold
  * @param proportion     threshold handed to `OutlierObject.checkProportion`
  * @param t              distance metric selector, see `getDistance`
  * @param total          denominator used for the proportion
  */
def densitiesOutlier(outlierObjects: Array[OutlierObject], distance: Double, proportion: Double, t: String, total: Long): Unit = {
  outlierObjects.indices.foreach { idx =>
    val matching = pointsNumberWithinDistance(outlierObjects, idx, distance, t)
    val point = outlierObjects(idx)
    point.setProportion(matching.toDouble / total.toDouble)
    point.checkProportion(proportion)
  }
}
/**
  * Local Outlier Factor scoring over a Dataset.
  *
  * Rows are converted to [[OutlierObject]]s, the array overload of `LOFOutlier` computes the
  * scores in place, and `buildDataset` (mode 2) attaches each object's maximum LOF to `data`.
  *
  * @param data  input rows
  * @param cols  columns handed to `row2OutlierObject`
  * @param lower smallest neighbourhood size
  * @param upper largest neighbourhood size
  * @param t     distance metric selector, see `getDistance`
  * @return `data` joined with the "Outlier" column holding the maximum LOF
  * @throws IllegalArgumentException if the conversion yields no objects
  */
def LOFOutlier(data: Dataset[Row], cols: Array[String], lower: Int, upper: Int, t: String): Dataset[Row] = {
  val points: Array[OutlierObject] =
    VectorCorrelation.row2OutlierObject(data, cols).toArray[OutlierObject]
  require(points.nonEmpty)

  // Mutates `points` in place; the maximum LOF values are read back by buildDataset.
  LOFOutlier(points, lower, upper, t)
  buildDataset(data, points, 2)
}
/**
  * Computes Local Outlier Factor values in place for every neighbourhood size from `lower`
  * to `upper` (inclusive) and records the largest LOF seen on each object.
  *
  * Per-object results are stored in slots: slot `s` (0 until `upper - lower`) is filled from
  * container entry `s + lower - 1`, and slot `upper - lower` from the last container entry.
  * NOTE(review): this assumes `seekPreNumberOfK` returns the `upper` nearest neighbours sorted
  * by ascending distance — confirm against ArrayCorrelation.
  *
  * The work is done in three passes, each of which needs the previous one finished for ALL
  * objects because it reads values stored on neighbouring objects:
  *   1. neighbourhood, k-distance and neighbourhood cardinality per slot;
  *   2. local reachability density (lrd) per slot;
  *   3. LOF per slot and the running maximum LOF.
  *
  * @param outlierObjects objects to analyse; mutated in place
  * @param lower          smallest neighbourhood size, must be at least 1
  * @param upper          largest neighbourhood size, must be at least `lower`
  * @param t              distance metric selector, see `getDistance`
  * @throws IllegalArgumentException if `lower < 1` or `upper < lower`
  */
def LOFOutlier(outlierObjects: Array[OutlierObject], lower: Int, upper: Int, t: String): Unit = {
  // Fail fast with a readable message instead of an index error deep inside the loops.
  require(lower >= 1 && upper >= lower, s"LOF requires 1 <= lower <= upper, got lower=$lower, upper=$upper")
  val size = upper - lower

  // Pass 1: neighbourhoods, k-distances and cardinalities.
  for (i <- outlierObjects.indices) {
    val point = outlierObjects(i)
    point.setKSize(lower, upper)
    point.setContainer(ArrayCorrelation.seekPreNumberOfK(distanceArray(outlierObjects, i, t), upper)(_.getDistance > _.getDistance))
    val neighbours = point.getContainer

    // The last slot always covers the whole container.
    point.setCardN(neighbours.length, size)
    point.setKDistance(neighbours.last.getDistance, size)

    for (slot <- 0 until size) {
      val kth = slot + lower - 1
      point.setKDistance(neighbours(kth).getDistance, slot)
      // Extend the neighbourhood over entries tied with the k-th distance.
      var end = kth
      while (end != neighbours.length - 1 && neighbours(end).getDistance == neighbours(end + 1).getDistance) {
        end += 1
      }
      point.setCardN(end + 1, slot)
    }
  }

  // Pass 2: lrd = cardN / sum of reachability distances to the neighbours.
  for (point <- outlierObjects; slot <- 0 to size) {
    val neighbours = point.getContainer
    val cardN = point.getCardN(slot)
    var sumDistances = 0.0
    var j = 0
    while (j < cardN) {
      val neighbour = neighbours(j)
      // distanceArray already stored the distance from this point to the neighbour in the
      // container entry, so reuse it instead of recomputing the vector distance for every slot.
      val distanceToNeighbour: Double = neighbour.getDistance
      sumDistances += Math.max(neighbour.getOutlierObject.getKDistance(slot), distanceToNeighbour)
      j += 1
    }
    point.setLRD(cardN / sumDistances, slot) // 1/(reach-dist[k](p, o)/cardN(k)) o belongs to p.container
  }

  // Pass 3: LOF = mean ratio of the neighbours' lrd to the point's own lrd.
  for (point <- outlierObjects; slot <- 0 to size) {
    val neighbours = point.getContainer
    val cardN = point.getCardN(slot)
    val ownLRD = point.getLRD(slot)
    var sumLRDs = 0.0
    var j = 0
    while (j < cardN) {
      sumLRDs += neighbours(j).getOutlierObject.getLRD(slot) / ownLRD
      j += 1
    }
    val lof = sumLRDs / cardN
    point.setLOF(lof, slot)
    if (point.getMaxLOF < lof) {
      point.setMaxLOF(lof)
    }
  }
}
/**
  * Counts the objects whose distance to the object at `position` is greater than or equal to
  * `distance`. The object at `position` is itself part of the scan.
  *
  * NOTE(review): despite the name, this counts objects at or beyond `distance`, not within it —
  * confirm this is what `OutlierObject.checkProportion` expects.
  *
  * @param outlierObjects all objects
  * @param position       index of the reference object
  * @param distance       distance threshold
  * @param t              distance metric selector, see `getDistance`
  * @return number of objects with distance `>= distance` from the reference object
  */
def pointsNumberWithinDistance(outlierObjects: Array[OutlierObject], position: Int, distance: Double, t: String): Long = {
  outlierObjects.count { other =>
    getDistance(outlierObjects(position).vector, other.vector, t) >= distance
  }.toLong
}
/**
  * Builds one [[DistanceContainer]] per object other than the one at `position`, pairing that
  * object with its distance from the object at `position`.
  *
  * @param outlierObjects all objects
  * @param position       index of the reference object, which is excluded from the result
  * @param t              distance metric selector, see `getDistance`
  * @return the containers in array order
  */
def distanceArray(outlierObjects: Array[OutlierObject], position: Int, t: String): Array[DistanceContainer] = {
  outlierObjects.indices
    .filter(_ != position)
    .map { i =>
      new DistanceContainer(outlierObjects(i), getDistance(outlierObjects(position).vector, outlierObjects(i).vector, t))
    }
    .toArray
}
/**
  * Computes the distance between two vectors with the metric selected by `t`.
  *
  * @param xs first vector
  * @param ys second vector
  * @param t  one of `Consts.SQUARED`, `Consts.COSINE` or `Consts.INVERTED_COSINE`;
  *           any other value selects the Euclidean distance
  * @return the distance reported by the matching `DistanceMetricMethods` function
  */
private def getDistance(xs: Array[Double], ys: Array[Double], t: String): Double = {
  if (t == Consts.SQUARED) DistanceMetricMethods.squaredDistance(xs, ys)
  else if (t == Consts.COSINE) DistanceMetricMethods.cosineDistance(xs, ys)
  else if (t == Consts.INVERTED_COSINE) DistanceMetricMethods.invertedCosineDistance(xs, ys)
  else DistanceMetricMethods.euclidianDistance(xs, ys) // default: Euclidean distance
}
/**
  * Attaches the per-object result to the input Dataset as an "Outlier" column.
  *
  * A two-column frame (object index, result) is built from `outlierObjects` and joined onto
  * `data` through a temporary id column that is dropped again afterwards.
  *
  * NOTE(review): the temporary id comes from `monotonically_increasing_id()`, whose values are
  * unique and increasing but not necessarily consecutive. Confirm that `OutlierObject.index`
  * is produced the same way; otherwise rows without a matching id are lost in the join.
  *
  * @param data           the Dataset the objects were derived from
  * @param outlierObjects analysed objects
  * @param t              1 to attach the outlier status; any other value attaches the maximum LOF
  * @return `data` with the additional "Outlier" column
  */
def buildDataset(data: Dataset[Row], outlierObjects: Array[OutlierObject], t: Int): Dataset[Row] = {
  val spark = SparkContextBuilder.getSession

  val results: Dataset[Row] = t match {
    case 1 =>
      val statuses = outlierObjects.indices.map(i => (outlierObjects(i).index, outlierObjects(i).getOutlierStatus))
      val rdd = spark.sparkContext.makeRDD(statuses)
      spark.createDataFrame(rdd).toDF(increasingId, outlier)
    case _ =>
      val scores = outlierObjects.indices.map(i => (outlierObjects(i).index, outlierObjects(i).getMaxLOF))
      val rdd = spark.sparkContext.makeRDD(scores)
      spark.createDataFrame(rdd).toDF(increasingId, outlier)
  }

  data
    .withColumn(increasingId, functions.monotonically_increasing_id())
    .join(results, increasingId)
    .drop(increasingId)
}
/**
  * Arithmetic mean of the distances held by the given containers.
  *
  * @param as containers whose distances are averaged
  * @return the sum of the distances divided by `as.length`; NaN for an empty array (0.0 / 0)
  */
def preKMean(as: Array[DistanceContainer]): Double = {
  // Left-to-right accumulation starting from 0.0.
  val total = as.foldLeft(0.0)((acc, container) => acc + container.getDistance)
  total / as.length
}
/**
  * Selects, from the requested column names, those that exist in the schema and have a type
  * accepted by `checkFieldType`.
  *
  * Names are trimmed first; blank names and names missing from the schema are skipped.
  *
  * @param structType schema to look the columns up in
  * @param cols       requested column names
  * @return the accepted field names, in the order they were requested
  */
def getNeedStructFields(structType: StructType, cols: Array[String]): Array[String] = {
  cols
    .map(_.trim)
    .filter(name => name.nonEmpty && structType.exists(_.name == name))
    // Resolve through fieldIndex so duplicate column names pick the same field as before.
    .map(name => structType.fields(structType.fieldIndex(name)))
    .filter(checkFieldType)
    .map(_.name)
}
/**
  * Tells whether a field can be used as outlier-detection input.
  *
  * @param structField field to inspect
  * @return true if `SharedMethods` reports the field as numeric or as a vector type
  */
def checkFieldType(structField: StructField): Boolean = {
  if (SharedMethods.isNumericType(structField)) true
  else SharedMethods.isVectorType(structField)
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy