All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.datastax.data.prepare.spark.dataset.OutlierDetection.scala Maven / Gradle / Ivy

package com.datastax.data.prepare.spark.dataset

import com.datastax.insight.core.driver.SparkContextBuilder
import com.datastax.data.prepare.spark.dataset.params.{DistanceContainer, OutlierObject}
import com.datastax.data.prepare.spark.math.{ArrayCorrelation, DistanceMetricMethods, VectorCorrelation}
import com.datastax.data.prepare.util.{Consts, SharedMethods}
import org.apache.spark.sql._
import org.apache.spark.sql.types.{StructField, StructType}

import scala.collection.mutable.{ArrayBuffer}

/**
  * Outlier-detection helpers built around arrays of [[OutlierObject]].
  *
  * Each strategy comes as a pair of overloads:
  *  - a `Dataset[Row]` entry point that converts the rows with
  *    `VectorCorrelation.row2OutlierObject`, runs the array-based algorithm and joins the
  *    result back onto the input as an `Outlier` column (see [[buildDataset]]);
  *  - an array-based overload that mutates the given [[OutlierObject]]s in place.
  *
  * The distance metric is selected by the string `t` (see `getDistance`); any value that is
  * not one of the `Consts` metric names falls back to the Euclidean distance.
  */
object OutlierDetection {
  // Temporary join-key column added to the data set and dropped again in buildDataset.
  final private val increasingId = "increasingID"
  // Name of the result column appended by buildDataset.
  final private val outlier = "Outlier"
  // Prefix stripped from OutlierObject.label to obtain the object's position in the array.
  final private val labelPrefix = "vector-"

  /**
    * Distance-based detection on a data set.
    *
    * @param data input rows
    * @param cols columns handed to `VectorCorrelation.row2OutlierObject`
    * @param k    number of neighbours requested per object
    * @param n    number of objects to flag as outliers
    * @param t    distance metric name
    * @return `data` joined with a boolean `Outlier` column
    * @throws IllegalArgumentException if no outlier objects could be built
    */
  def distanceOutlier(data: Dataset[Row], cols: Array[String], k: Int, n: Int, t: String): Dataset[Row] = {
    val outlierObjects = VectorCorrelation.row2OutlierObject(data, cols).toArray[OutlierObject]
    require(outlierObjects.nonEmpty, "no outlier objects could be built from the given data and columns")
    distanceOutlier(outlierObjects, k, n, t)
    buildDataset(data, outlierObjects, 1)
  }

  /**
    * Distance-based detection on an array; mutates `outlierObjects`.
    *
    * For every object the neighbour container chosen by `ArrayCorrelation.seekPreNumberOfK`
    * is stored together with the mean of its distances. Then `n` objects are selected by
    * `getPreKMean` and their outlier status is set to `true`.
    *
    * NOTE(review): the comparators suggest "k nearest neighbours" and "n largest means";
    * the exact selection semantics live in `seekPreNumberOfK` — confirm there.
    */
  def distanceOutlier(outlierObjects: Array[OutlierObject], k: Int, n: Int, t: String): Unit = {
    outlierObjects.indices.foreach { i =>
      val current = outlierObjects(i)
      current.setContainer(
        ArrayCorrelation.seekPreNumberOfK(distanceArray(outlierObjects, i, t), k)(_.getDistance > _.getDistance))
      current.setPreKMean(preKMean(current.getContainer))
    }
    val topN = ArrayCorrelation.seekPreNumberOfK(outlierObjects, n)(_.getPreKMean < _.getPreKMean)
    topN.foreach { o =>
      // The label carries the array position after the "vector-" prefix.
      outlierObjects(o.label.substring(labelPrefix.length).toInt).setOutlierStatus(true)
    }
  }

  /**
    * Density-based detection on a data set.
    *
    * @param data       input rows
    * @param cols       columns handed to `VectorCorrelation.row2OutlierObject`
    * @param distance   distance threshold
    * @param proportion proportion threshold passed to `OutlierObject.checkProportion`
    * @param t          distance metric name
    * @return `data` joined with a boolean `Outlier` column
    * @throws IllegalArgumentException if no outlier objects could be built
    */
  def densitiesOutlier(data: Dataset[Row], cols: Array[String], distance: Double, proportion: Double, t: String): Dataset[Row] = {
    val outlierObjects = VectorCorrelation.row2OutlierObject(data, cols).toArray[OutlierObject]
    require(outlierObjects.nonEmpty, "no outlier objects could be built from the given data and columns")
    // The denominator is the row count of the data set, not the length of the object array.
    val total = data.count()
    densitiesOutlier(outlierObjects, distance, proportion, t, total)
    buildDataset(data, outlierObjects, 1)
  }

  /**
    * Density-based detection on an array; mutates `outlierObjects`.
    *
    * For every object, the share of objects lying at a distance of at least `distance`
    * (relative to `total`) is stored via `setProportion`; the outlier decision itself is
    * delegated to `OutlierObject.checkProportion(proportion)`.
    */
  def densitiesOutlier(outlierObjects: Array[OutlierObject], distance: Double, proportion: Double, t: String, total: Long): Unit = {
    for (position <- outlierObjects.indices) {
      val current = outlierObjects(position)
      val num = pointsNumberWithinDistance(outlierObjects, position, distance, t)
      current.setProportion(num.toDouble / total.toDouble)
      current.checkProportion(proportion)
    }
  }

  /**
    * LOF (local outlier factor) detection on a data set.
    *
    * @param data  input rows
    * @param cols  columns handed to `VectorCorrelation.row2OutlierObject`
    * @param lower smallest neighbourhood size evaluated
    * @param upper largest neighbourhood size evaluated
    * @param t     distance metric name
    * @return `data` joined with an `Outlier` column holding each object's maximum LOF
    * @throws IllegalArgumentException if no outlier objects could be built, or if the
    *                                  neighbourhood range is invalid
    */
  def LOFOutlier(data: Dataset[Row], cols: Array[String], lower: Int, upper: Int, t: String): Dataset[Row] = {
    val outlierObjects = VectorCorrelation.row2OutlierObject(data, cols).toArray[OutlierObject]
    require(outlierObjects.nonEmpty, "no outlier objects could be built from the given data and columns")
    LOFOutlier(outlierObjects, lower, upper, t)
    buildDataset(data, outlierObjects, 2)
  }

  /**
    * LOF detection on an array; mutates `outlierObjects`.
    *
    * Slot `k` (0 to `upper - lower`) of the per-object k-distance / cardinality / LRD / LOF
    * tables corresponds to the neighbourhood size `lower + k`. Three passes are needed
    * because each one reads values the previous pass wrote on the *neighbours*:
    *  1. neighbour container, k-distance and neighbourhood cardinality per slot;
    *  2. local reachability density (LRD) per slot;
    *  3. LOF per slot and the maximum LOF over all slots.
    *
    * NOTE(review): pass 1 indexes the container up to position `upper - 2` and reads its
    * last element, so it presumably needs at least `upper` objects and a container sorted
    * by ascending distance — confirm against `ArrayCorrelation.seekPreNumberOfK`.
    *
    * @throws IllegalArgumentException unless `1 <= lower <= upper`
    */
  def LOFOutlier(outlierObjects: Array[OutlierObject], lower: Int, upper: Int, t: String): Unit = {
    // Fail fast: lower < 1 would index the container at -1, and upper < lower would give a
    // negative slot count, both only after objects have already been partially mutated.
    require(lower >= 1 && upper >= lower, s"invalid neighbourhood range [$lower, $upper]: need 1 <= lower <= upper")
    val size = upper - lower

    // Pass 1: neighbours, k-distances and neighbourhood cardinalities.
    for (i <- outlierObjects.indices) {
      val current = outlierObjects(i)
      current.setKSize(lower, upper)
      current.setContainer(
        ArrayCorrelation.seekPreNumberOfK(distanceArray(outlierObjects, i, t), upper)(_.getDistance > _.getDistance))
      val container = current.getContainer
      // The last slot covers the whole container.
      current.setCardN(container.length, size)
      current.setKDistance(container.last.getDistance, size)
      for (k <- 0 until size) {
        current.setKDistance(container(k + lower - 1).getDistance, k)
        // Extend the neighbourhood over entries tied with the k-distance.
        var z = k + lower - 1
        while (z != container.length - 1 && container(z).getDistance == container(z + 1).getDistance) {
          z += 1
        }
        current.setCardN(z + 1, k)
      }
    }

    // Pass 2: LRD = cardN(k) / sum of reach-dist[k](p, o) over the neighbours o of p,
    // where reach-dist[k](p, o) = max(k-distance(o), d(p, o)).
    for (current <- outlierObjects) {
      val container = current.getContainer
      var k = 0
      while (k <= size) {
        val cardN = current.getCardN(k)
        var sumDistances = 0.0
        var j = 0
        while (j < cardN) {
          val neighbour = container(j).getOutlierObject
          sumDistances += Math.max(neighbour.getKDistance(k), getDistance(current.vector, neighbour.vector, t))
          j += 1
        }
        current.setLRD(cardN / sumDistances, k)
        k += 1
      }
    }

    // Pass 3: LOF = mean of LRD(neighbour) / LRD(self); track the maximum over all slots.
    for (current <- outlierObjects) {
      val container = current.getContainer
      var k = 0
      while (k <= size) {
        val cardN = current.getCardN(k)
        val ownLRD = current.getLRD(k)
        var sumLRDs = 0.0
        var j = 0
        while (j < cardN) {
          sumLRDs += container(j).getOutlierObject.getLRD(k) / ownLRD
          j += 1
        }
        val lof = sumLRDs / cardN
        current.setLOF(lof, k)
        if (current.getMaxLOF < lof) {
          current.setMaxLOF(lof)
        }
        k += 1
      }
    }
  }

  /**
    * Counts the objects whose distance from the object at `position` is greater than or
    * equal to `distance`.
    *
    * Despite the method name, this is the number of objects lying *at least* `distance`
    * away, which is what [[densitiesOutlier]] turns into a proportion. The object at
    * `position` itself is part of the scan.
    */
  def pointsNumberWithinDistance(outlierObjects: Array[OutlierObject], position: Int, distance: Double, t: String): Long = {
    outlierObjects.count { other =>
      getDistance(outlierObjects(position).vector, other.vector, t) >= distance
    }.toLong
  }

  /**
    * Builds one [[DistanceContainer]] per object other than the one at `position`, pairing
    * that object with its distance from the object at `position`. Array order is kept.
    */
  def distanceArray(outlierObjects: Array[OutlierObject], position: Int, t: String): Array[DistanceContainer] = {
    outlierObjects.indices
      .withFilter(i => position != i)
      .map(i => new DistanceContainer(outlierObjects(i), getDistance(outlierObjects(position).vector, outlierObjects(i).vector, t)))
      .toArray
  }

  /** Dispatches to the distance metric named by `t`. */
  private def getDistance(xs: Array[Double], ys: Array[Double], t: String): Double = t match {
    case Consts.SQUARED => DistanceMetricMethods.squaredDistance(xs, ys)
    case Consts.COSINE => DistanceMetricMethods.cosineDistance(xs, ys)
    case Consts.INVERTED_COSINE => DistanceMetricMethods.invertedCosineDistance(xs, ys)
    case _ => DistanceMetricMethods.euclidianDistance(xs, ys)  // defaults to the Euclidean distance
  }

  /**
    * Joins the per-object result back onto `data` as the `Outlier` column.
    *
    * @param t result selector: `1` appends the boolean outlier status, any other value
    *          appends the maximum LOF
    *
    * NOTE(review): the join key is `monotonically_increasing_id()` on `data`, matched
    * against `OutlierObject.index`. This only lines up if `index` was produced from the
    * same ids — confirm against `VectorCorrelation.row2OutlierObject`.
    */
  def buildDataset(data: Dataset[Row], outlierObjects: Array[OutlierObject], t: Int): Dataset[Row] = {
    val spark = SparkContextBuilder.getSession
    // Only the score column differs between the two modes; the join below is shared.
    val scores: Dataset[Row] =
      if (t == 1) {
        val rdd = spark.sparkContext.makeRDD(for (i <- outlierObjects.indices) yield (outlierObjects(i).index, outlierObjects(i).getOutlierStatus))
        spark.createDataFrame(rdd).toDF(increasingId, outlier)
      } else {
        val rdd = spark.sparkContext.makeRDD(for (i <- outlierObjects.indices) yield (outlierObjects(i).index, outlierObjects(i).getMaxLOF))
        spark.createDataFrame(rdd).toDF(increasingId, outlier)
      }
    data.withColumn(increasingId, functions.monotonically_increasing_id()).join(scores, increasingId).drop(increasingId)
  }

  /**
    * Arithmetic mean of the distances held by `as`.
    *
    * An empty array yields `NaN` (0.0 / 0).
    */
  def preKMean(as: Array[DistanceContainer]): Double = {
    val sum = as.foldLeft(0.0)((acc, container) => acc + container.getDistance)
    sum / as.length
  }

  /**
    * Returns, in the order given, the trimmed names from `cols` that exist in `structType`
    * and whose field passes [[checkFieldType]]. Blank and unknown names are skipped.
    */
  def getNeedStructFields(structType: StructType, cols: Array[String]): Array[String] = {
    cols
      .map(_.trim)
      .filter(name => name.nonEmpty && structType.exists(_.name == name))
      .map(name => structType.fields(structType.fieldIndex(name)))
      .filter(checkFieldType)
      .map(_.name)
  }

  /** True when the field is numeric or vector typed, as judged by `SharedMethods`. */
  def checkFieldType(structField: StructField): Boolean = SharedMethods.isNumericType(structField) || SharedMethods.isVectorType(structField)

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy