All downloads are free. Search and download functionality uses the official Maven repository.

com.github.chen0040.lof.LOF Maven / Gradle / Ivy

There is a newer version: 1.0.4
Show newest version
package com.github.chen0040.lof;


import com.github.chen0040.data.frame.DataFrame;
import com.github.chen0040.data.frame.DataRow;
import com.github.chen0040.data.utils.TupleTwo;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.Setter;

import java.util.*;
import java.util.concurrent.*;
import java.util.function.BiFunction;
import java.util.logging.Level;
import java.util.logging.Logger;


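/**
 * Local Outlier Factor (LOF) anomaly detector: fits on a {@link DataFrame},
 * scores rows by their maximum LOF over a range of neighbourhood sizes, and
 * labels rows whose normalized score exceeds a threshold as outliers.
 *
 * Created by xschen on 17/8/15.
 * Link: (URL missing from this copy of the source)
 */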
@Getter
@Setter
public class LOF {

    // Cut-off on the normalized score: a row is reported as an anomaly when its
    // score is strictly greater than this value.
    public double threshold = 0.5;

    // min number for minPts;
    public int minPtsLB = 3;

    // max number for minPts;
    public int minPtsUB = 10;

    // When true, scoring work is handed to thread pools instead of running inline.
    public boolean parallel = true;

    // When true, fitAndTransform re-derives the threshold from the fitted data.
    // Note: the no-arg constructor re-assigns this to true.
    public boolean automaticThresholding = false;

    // Fraction of the row count used to pick the rank whose score becomes the threshold.
    public double automaticThresholdingRatio = 0.05;


    // Named after the class itself; String.valueOf(LOF.class) would have produced
    // a "class com...." prefixed logger name.
    private static final Logger logger = Logger.getLogger(LOF.class.getName());

    // Distance function handed to DistanceMeasureService for every neighbour query.
    private BiFunction<DataRow, DataRow, Double> distanceMeasure;

    // Smallest finite raw score seen during the last fit; used to normalize scores.
    @Setter(AccessLevel.NONE)
    private double minScore;
    // Largest finite raw score seen during the last fit; used to normalize scores.
    @Setter(AccessLevel.NONE)
    private double maxScore;

    // Copy of the data frame passed to the last fit; all scoring runs against it.
    private DataFrame model;


    /**
     * Re-derives {@link #threshold} from the scores of the rows in {@code batch}.
     *
     * Every row is scored with {@link #evaluate(DataRow, DataFrame)}, the rows are
     * ranked by descending score, and the threshold becomes the score found at
     * zero-based rank {@code autoThresholdingCaps(m)} (clamped to the last row).
     * An empty batch leaves the threshold untouched.
     *
     * @param batch rows whose scores define the threshold
     */
    protected void adjustThreshold(DataFrame batch){
        int m = batch.rowCount();
        if(m == 0){
            // Nothing to rank; the original code would index position -1 here.
            return;
        }

        List<Integer> orders = new ArrayList<>(m);
        List<Double> probs = new ArrayList<>(m);

        for(int i=0; i < m; ++i){
            DataRow tuple = batch.row(i);
            probs.add(evaluate(tuple, model));
            orders.add(i);
        }

        // sort row indices descendingly by score
        Collections.sort(orders, (h1, h2) -> Double.compare(probs.get(h2), probs.get(h1)));

        // autoThresholdingCaps never returns less than 1, so clamp to the last valid rank
        int selectedIndex = Math.min(autoThresholdingCaps(m), m - 1);
        threshold = probs.get(orders.get(selectedIndex));
    }

    /**
     * Creates a detector with the default configuration: threshold 0.5,
     * minPts search range [3, 10], parallel scoring on, and automatic
     * thresholding on with a ratio of 0.05.
     */
    public LOF(){
        this.threshold = 0.5;
        setSearchRange(3, 10);
        this.parallel = true;
        this.automaticThresholding = true;
        this.automaticThresholdingRatio = 0.05;
    }

    /**
     * Converts the automatic-thresholding ratio into a row rank.
     *
     * @param m number of rows being ranked
     * @return {@code automaticThresholdingRatio * m} truncated to an int, but never less than 1
     */
    protected int autoThresholdingCaps(int m){
        int scaled = (int) (automaticThresholdingRatio * m);
        return scaled < 1 ? 1 : scaled;
    }


    /**
     * Copies the configuration and fitted state of {@code that} into this instance.
     *
     * The configuration fields (threshold, minPts range, parallel flag and the
     * automatic-thresholding settings) are copied as well, so that a copy makes
     * the same anomaly decisions as its source; previously they were left at the
     * defaults of the receiving instance.
     *
     * @param that the detector to copy from; its model is deep-copied via {@code makeCopy()}
     */
    public void copy(LOF that){
        // configuration
        threshold = that.threshold;
        minPtsLB = that.minPtsLB;
        minPtsUB = that.minPtsUB;
        parallel = that.parallel;
        automaticThresholding = that.automaticThresholding;
        automaticThresholdingRatio = that.automaticThresholdingRatio;

        // fitted state
        minScore = that.minScore;
        maxScore = that.maxScore;
        distanceMeasure = that.distanceMeasure;
        model = that.model == null ? null : that.model.makeCopy();
    }

    /**
     * Builds a new detector and populates it from this one via {@link #copy(LOF)}.
     *
     * @return the newly created detector
     */
    public LOF makeCopy(){
        LOF duplicate = new LOF();
        duplicate.copy(this);
        return duplicate;
    }

    /**
     * Snapshots the current minPts search range.
     *
     * @return a new bounds object holding {@code minPtsLB} and {@code minPtsUB}
     */
    public MinPtsBounds searchRange() {
        MinPtsBounds bounds = new MinPtsBounds(minPtsLB, minPtsUB);
        return bounds;
    }

    /**
     * Sets the inclusive range of neighbourhood sizes (minPts) that scoring iterates over.
     *
     * @param minPtsLB smallest neighbourhood size; must be at least 1
     * @param minPtsUB largest neighbourhood size; must not be smaller than {@code minPtsLB}
     * @throws IllegalArgumentException if the range is empty or starts below 1
     */
    public void setSearchRange(int minPtsLB, int minPtsUB) {
        // An inverted range would otherwise surface later as an unrelated
        // IllegalArgumentException from the thread-pool sizing in parallel scoring.
        if(minPtsLB < 1 || minPtsUB < minPtsLB){
            throw new IllegalArgumentException(
                    "invalid minPts search range: [" + minPtsLB + ", " + minPtsUB + "]");
        }
        this.minPtsLB = minPtsLB;
        this.minPtsUB = minPtsUB;
    }

    /**
     * @return the distance function currently used for neighbour queries, possibly {@code null}
     */
    public BiFunction<DataRow, DataRow, Double> getDistanceMeasure() {
        return distanceMeasure;
    }

    /**
     * Replaces the distance function passed to {@code DistanceMeasureService}.
     *
     * @param distanceMeasure function of two rows yielding their distance
     */
    public void setDistanceMeasure(BiFunction<DataRow, DataRow, Double> distanceMeasure) {
        this.distanceMeasure = distanceMeasure;
    }

    /**
     * Classifies a single row against the fitted model.
     *
     * @param tuple the row to score
     * @return {@code true} when the normalized score is strictly greater than {@code threshold}
     */
    public boolean isAnomaly(DataRow tuple) {
        return evaluate(tuple, model) > threshold;
    }

    /**
     * Unit of work that computes the raw (un-normalized) score of one row by
     * delegating to {@code score_lof_sync} on the enclosing detector.
     */
    private class ScoreTask implements Callable<Double>{
        private final DataFrame batch;
        private final DataRow tuple;

        public ScoreTask(DataFrame batch, DataRow tuple){
            this.batch = batch;
            this.tuple = tuple;
        }

        /** @return the raw score of {@code tuple} within {@code batch} */
        @Override
        public Double call() {
            return score_lof_sync(batch, tuple);
        }
    }



    /**
     * Fits the detector on {@code batch} and labels every row of the fitted copy.
     *
     * Steps: (1) store a copy of {@code batch} as the model; (2) compute the raw
     * score of every model row, recording the smallest and largest finite score;
     * (3) if automatic thresholding is on, re-derive the threshold; (4) write
     * "OUTLIER" or "NORMAL" into the categorical target cell "cluster" of each
     * model row.
     *
     * If the scoring pass is interrupted the interrupt status of the calling
     * thread is restored and the failure is logged; the method still returns.
     *
     * @param batch the training data; it is copied, not modified
     * @return the internal model copy with the "cluster" label set on each row
     */
    public DataFrame fitAndTransform(DataFrame batch) {
        this.model = batch.makeCopy();

        int m = model.rowCount();

        minScore = Double.MAX_VALUE;
        maxScore = Double.NEGATIVE_INFINITY;

        if(parallel) {
            ExecutorService executor = Executors.newFixedThreadPool(10);
            try {
                List<Callable<Double>> tasks = new ArrayList<>(m);
                for (int i = 0; i < m; ++i) {
                    tasks.add(new ScoreTask(model, model.row(i)));
                }

                // invokeAll returns only after every task has completed
                for (Future<Double> result : executor.invokeAll(tasks)) {
                    trackScoreRange(result.get());
                }
            } catch (InterruptedException e) {
                // keep the interruption visible to the caller instead of swallowing it
                Thread.currentThread().interrupt();
                logger.log(Level.SEVERE, "fitAndTransform interrupted while scoring", e);
            } catch (ExecutionException e) {
                logger.log(Level.SEVERE, "fitAndTransform failed while scoring", e);
            } finally {
                // release the pool threads even when scoring fails
                executor.shutdown();
            }
        }else{
            for(int i=0; i < m; ++i){
                trackScoreRange(score_lof_sync(model, model.row(i)));
            }
        }

        if(automaticThresholding){
            adjustThreshold(model);
        }

        for(int i=0; i < m; ++i){
            DataRow tuple = model.row(i);
            double score_lof = evaluate(tuple, batch);
            tuple.setCategoricalTargetCell("cluster", score_lof > threshold ? "OUTLIER" : "NORMAL");
        }

        return this.model;
    }

    /** Folds one raw score into minScore/maxScore, ignoring NaN and infinite values. */
    private void trackScoreRange(double score){
        if(Double.isNaN(score) || Double.isInfinite(score)){
            return;
        }
        minScore = Math.min(score, minScore);
        maxScore = Math.max(score, maxScore);
    }

    /**
     * Unit of work that computes the local outlier factor of one row for a
     * single neighbourhood size by delegating to {@code local_outlier_factor}
     * on the enclosing detector.
     */
    private class LOFTask implements Callable<Double>{
        private final DataFrame batch;
        private final DataRow tuple;
        private final int minPts;

        public LOFTask(DataFrame batch, DataRow tuple, int minPts){
            this.batch = batch;
            this.tuple = tuple;
            this.minPts = minPts;
        }

        /** @return the local outlier factor of {@code tuple} for {@code minPts} neighbours */
        @Override
        public Double call() {
            return local_outlier_factor(batch, tuple, minPts);
        }
    }

    /**
     * Computes the raw score of {@code tuple} on the calling thread: the largest
     * local outlier factor over every neighbourhood size in
     * {@code [minPtsLB, minPtsUB]}, skipping NaN results.
     *
     * @return the maximum factor, or negative infinity when no size yields a non-NaN value
     */
    private double score_lof_sync(DataFrame batch, DataRow tuple){
        double best = Double.NEGATIVE_INFINITY;

        // k is the number of nearest neighbours defining the local neighbourhood
        int k = minPtsLB;
        while(k <= minPtsUB){
            double candidate = local_outlier_factor(batch, tuple, k);
            if(!Double.isNaN(candidate)){
                best = Math.max(best, candidate);
            }
            ++k;
        }

        return best;
    }

    /**
     * Computes the raw score of {@code tuple}, evaluating each neighbourhood size
     * in {@code [minPtsLB, minPtsUB]} as a separate task on a short-lived thread
     * pool. Falls back to {@code score_lof_sync} when {@code parallel} is off.
     *
     * NOTE(review): unlike score_lof_sync this path starts the maximum at 0 and
     * also skips infinite factors, so the two paths can disagree — confirm which
     * behaviour is intended before unifying them.
     *
     * @return the largest finite factor found, or 0 when none was found or scoring failed
     */
    private double score_lof_async(DataFrame batch, DataRow tuple){
        if(!parallel){
            return score_lof_sync(batch, tuple);
        }

        double maxLOF = 0;

        ExecutorService executor = Executors.newFixedThreadPool(Math.min(8, minPtsUB - minPtsLB + 1));
        try {
            List<Callable<Double>> tasks = new ArrayList<>();
            for(int minPts = minPtsLB; minPts <= minPtsUB; ++minPts) { // the number of nearest neighbors used in defining the local neighborhood of the object.
                tasks.add(new LOFTask(batch, tuple, minPts));
            }

            // invokeAll returns only after every task has completed
            for(Future<Double> result : executor.invokeAll(tasks)){
                double lof = result.get();
                if(Double.isNaN(lof) || Double.isInfinite(lof)) continue;
                maxLOF = Math.max(maxLOF, lof);
            }
        } catch (InterruptedException e) {
            // keep the interruption visible to the caller instead of swallowing it
            Thread.currentThread().interrupt();
            logger.log(Level.SEVERE, "score_lof_async failed", e);
        } catch (ExecutionException e) {
            logger.log(Level.SEVERE, "score_lof_async failed", e);
        } finally {
            // release the pool threads even when scoring fails
            executor.shutdown();
        }

        return maxLOF;
    }

    /**
     * Returns the score of {@code tuple} normalized to [0, 1] using the
     * minimum and maximum raw scores recorded by the last fit.
     *
     * @param tuple   the row to score
     * @param context not read by this method; scoring always runs against the fitted model
     * @return 0 for scores at or below the fitted minimum, 1 for scores at or above
     *         the fitted maximum, and a linear interpolation in between
     */
    public double evaluate(DataRow tuple, DataFrame context){
        double score = score_lof_async(model, tuple);

        //logger.info(String.format("score: %f minScore: %f, maxScore: %f", score, minScore, maxScore));

        score -= minScore;
        if(score < 0) score = 0;

        double range = maxScore - minScore;
        if(range > 0){
            score /= range;
        }else{
            // No spread in the fitted scores: dividing would give NaN for 0/0.
            // Anything above the fitted minimum saturates at 1, everything else is 0.
            score = score > 0 ? 1 : 0;
        }

        if(score > 1) score = 1;

        return score;
    }

    /**
     * Single-threaded counterpart of {@code evaluate}: scores {@code tuple}
     * against {@code batch} with {@code score_lof_sync} and normalizes the result
     * to [0, 1] using the minimum and maximum raw scores of the last fit.
     */
    private double evaluate_sync(DataRow tuple, DataFrame batch){
        double score = score_lof_sync(batch, tuple);

        score -= minScore;
        if(score < 0) score = 0;

        double range = maxScore - minScore;
        if(range > 0){
            score /= range;
        }else{
            // No spread in the fitted scores: dividing would give NaN for 0/0.
            // Anything above the fitted minimum saturates at 1, everything else is 0.
            score = score > 0 ? 1 : 0;
        }

        if(score > 1) score = 1;

        return score;
    }

    /**
     * Looks up the k-th nearest neighbour of {@code o} in {@code batch} and
     * returns the second component of the resulting tuple (used by
     * {@code reach_dist} as the k-distance of {@code o}).
     *
     * @param batch rows to search
     * @param o     the query row
     * @param k     which neighbour to take
     * @return the value paired with the k-th nearest neighbour
     */
    public double k_distance(DataFrame batch, DataRow o, int k){
        TupleTwo<DataRow, Double> kth = DistanceMeasureService.getKthNearestNeighbor(batch, o, k, distanceMeasure);
        return kth._2();
    }

    /**
     * Reachability distance of {@code p} with respect to {@code o}: the larger of
     * the distance between the two rows and the k-distance of {@code o}.
     */
    private double reach_dist(DataFrame batch, DataRow p, DataRow o, int k){
        double direct = DistanceMeasureService.getDistance(batch, p, o, distanceMeasure);
        return Math.max(k_distance(batch, o, k), direct);
    }

    /**
     * Local reachability density of {@code p}: queries the k nearest neighbours
     * of {@code p} and delegates to the overload that takes them explicitly.
     */
    private double local_reachability_density(DataFrame batch, DataRow p, int k){
        List<TupleTwo<DataRow, Double>> knn_p = DistanceMeasureService.getKNearestNeighbors(batch, p, k, distanceMeasure);
        return local_reachability_density(batch, p, k, knn_p);
    }

    private double local_reachability_density(DataFrame batch, DataRow p, int k, List> knn_p){
        double sum_reach_dist = 0;
        for(TupleTwo o : knn_p){
            sum_reach_dist += reach_dist(batch, p, o._1(), k);
        }
        double density = 1 / (sum_reach_dist / knn_p.size());
        return density;
    }

    // the higher this value, the more likely the point is an outlier
    /**
     * Local outlier factor of {@code p} for neighbourhood size {@code k}: the
     * mean local reachability density of the neighbours of {@code p} divided by
     * the local reachability density of {@code p} itself.
     *
     * The higher this value, the more likely the point is an outlier.
     *
     * @param batch rows that form the neighbourhood search space
     * @param p     the row being scored
     * @param k     number of nearest neighbours to use
     * @return the factor; may be NaN or infinite in degenerate cases, which callers filter
     */
    public double local_outlier_factor(DataFrame batch, DataRow p, int k){

        List<TupleTwo<DataRow, Double>> knn_p = DistanceMeasureService.getKNearestNeighbors(batch, p, k, distanceMeasure);
        double lrd_p = local_reachability_density(batch, p, k, knn_p);
        double sum_lrd = 0;
        for(TupleTwo<DataRow, Double> o : knn_p){
            sum_lrd += local_reachability_density(batch, o._1(), k);
        }

        // inf / inf would be NaN, so this case is answered directly.
        // NOTE(review): returns 1/k rather than 1 here — confirm this is intended.
        if(Double.isInfinite(sum_lrd) && Double.isInfinite(lrd_p)){
            return 1.0 / knn_p.size();
        }

        return (sum_lrd / lrd_p) / knn_p.size();
    }

    private static class MinPtsBounds{
        private int lowerBound;
        private int upperBound;

        public void setLowerBound(int lowerBound) {
            this.lowerBound = lowerBound;
        }

        public void setUpperBound(int upperBound) {
            this.upperBound = upperBound;
        }

        public MinPtsBounds(int lowerBounds, int upperBounds){
            lowerBound = lowerBounds;
            upperBound = upperBounds;
        }

        public int getLowerBound(){
            return lowerBound;
        }

        public int getUpperBound(){
            return upperBound;
        }
    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy