
com.github.chen0040.lof.CBLOF Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of java-local-outlier-factor Show documentation
Show all versions of java-local-outlier-factor Show documentation
Java implementation of a number of Local Outlier Factor algorithms
package com.github.chen0040.lof;
import com.github.chen0040.data.frame.DataFrame;
import com.github.chen0040.data.frame.DataRow;
import com.github.chen0040.data.utils.CountRepository;
import com.github.chen0040.data.utils.discretizers.KMeansDiscretizer;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.Setter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
/**
* Created by xschen on 18/8/15.
*/
@Getter
@Setter
public class CBLOF {
private ArrayList clusters;
@Setter(AccessLevel.NONE)
private KMeansDiscretizer discretizer;
private int split;
public double threshold;
public boolean parallel;
public boolean automaticThresholding;
public double anomalyRatioInAutomaticThresholding;
public double similarityThreshold;
public double alpha;
public double beta;
public void copy(CBLOF that){
split = that.split;
discretizer = that.discretizer == null ? null : that.discretizer.makeCopy();
clusters = null;
if(that.clusters != null){
clusters = new ArrayList<>();
for(int i=0; i < that.clusters.size(); ++i){
clusters.add((Cluster)that.clusters.get(i).clone());
}
}
}
public CBLOF makeCopy(){
CBLOF clone = new CBLOF();
clone.copy(this);
return clone;
}
public CBLOF(){
super();
KMeansDiscretizer d = new KMeansDiscretizer();
d.setMaxLevelCount(10);
discretizer = d;
threshold = 0.5;
alpha = 0.8;
beta = 0.1;
similarityThreshold = 0.8;
parallel =true;
automaticThresholding = false;
anomalyRatioInAutomaticThresholding = 0.05;
}
public boolean isAnomaly(DataRow tuple) {
double CBLOF = transform(tuple);
return CBLOF > threshold;
}
protected void adjustThreshold(DataFrame batch){
int m = batch.rowCount();
List orders = new ArrayList();
List probs = new ArrayList();
for(int i=0; i < m; ++i){
DataRow tuple = batch.row(i);
double prob = transform(tuple);
probs.add(prob);
orders.add(i);
}
final List probs2 = probs;
// sort descendingly by probability values
Collections.sort(orders, new Comparator() {
public int compare(Integer h1, Integer h2) {
double prob1 = probs2.get(h1);
double prob2 = probs2.get(h2);
return Double.compare(prob2, prob1);
}
});
int selected_position = (int)(anomalyRatioInAutomaticThresholding * orders.size());
int last_index = orders.get(orders.size() - 1);
int selected_index = -1;
if(selected_position >= orders.size()){
selected_position = orders.size() - 1;
selected_index = last_index; //setAttribute(THRESHOLD, probs.get(orders.get(orders.size() - 1)));
}
else {
selected_index = orders.get(selected_position);
}
while(probs.get(selected_index) == probs.get(last_index) && selected_position > 0){
selected_position--;
selected_index = orders.get(selected_position);
}
threshold = probs.get(selected_index);
}
public DataFrame fitAndTransform(DataFrame frame) {
DataFrame dataFrame = discretizer.fitAndTransform(frame);
runSqueezer(dataFrame, similarityThreshold);
// sort descendingly based on cluster size
Collections.sort(clusters, (o1, o2) -> Integer.compare(o2.size(), o1.size()));
for(int i=0; i < clusters.size(); ++i){
Cluster cluster = clusters.get(i);
cluster.setIndex(i);
//System.out.println("cluster[" +i +"].size: " + cluster.size());
}
int m = dataFrame.rowCount();
split = 0; // clusters with index < split will be the large clusters; otherwise small clusters
int accumulated_count = 0;
for(split = 0; split < clusters.size() - 1; ++split){
int current_cluster_size = clusters.get(split).size();
accumulated_count += current_cluster_size;
if(accumulated_count >= m * alpha && split != 0) break;
int next_cluster_size = clusters.get(split+1).size();
double ratio = (double)current_cluster_size / next_cluster_size;
if(ratio < beta && split != 0) break;
}
//System.out.println("split: "+split);
for(int i=0; i < m; ++i){
DataRow row = dataFrame.row(i);
Cluster c = clusters.get(Integer.parseInt(row.getCategoricalTargetCell("cluster")));
if(c.getIndex() > split){ // c belongs to small clusters
double minDistance = Double.MAX_VALUE;
for(int j=0; j <= split; ++j){
double distance = clusters.get(j).distance(row);
if(minDistance > distance){
minDistance = distance;
}
}
row.setTargetCell("CBLOF", c.size() * minDistance);
}else{
row.setTargetCell("CBLOF", c.size() * c.distance(row));
}
}
if(automaticThresholding){
adjustThreshold(dataFrame);
}
return dataFrame;
}
private void runSqueezer(DataFrame batch, double s){
clusters = new ArrayList<>();
int m = batch.rowCount();
for(int i=0; i < m; ++i){
DataRow row = batch.row(i);
if(i==0){
clusters.add(new Cluster(row));
}
else{
double maxSim = Double.MIN_VALUE;
Cluster closestCluster = null;
for(Cluster c : clusters){
double sim = c.similarity(row);
if(sim > maxSim){
maxSim = sim;
closestCluster = c;
}
}
if(maxSim < s){
clusters.add(new Cluster(row));
}else{
closestCluster.add(row);
}
}
}
}
// the higher the CBLOF, the more likely the tuple is an outlier
public double transform(DataRow row) {
row = discretizer.transform(row);
double CBLOF;
double maxSim = Double.MIN_VALUE;
Cluster closestCluster = null;
for(Cluster c : clusters){
double sim = c.similarity(row);
if(sim > maxSim){
maxSim = sim;
closestCluster = c;
}
}
assert closestCluster != null;
if(closestCluster.getIndex() > split){ // c belongs to small clusters
double minDistance = Double.MAX_VALUE;
for(int j=0; j <= split; ++j){
double distance = clusters.get(j).distance(row);
if(minDistance > distance){
minDistance = distance;
}
}
CBLOF = closestCluster.size() * minDistance;
}else{
CBLOF = closestCluster.size() * closestCluster.distance(row);
}
return CBLOF;
}
private class Cluster implements Cloneable {
private CountRepository counts;
private int totalCount;
private int index;
public void copy(Cluster rhs){
counts = rhs.counts.makeCopy();
totalCount = rhs.totalCount;
index = rhs.index;
}
@Override
public Object clone(){
Cluster clone = new Cluster();
clone.copy(this);
return clone;
}
public Cluster(){
counts = new CountRepository();
totalCount = 0;
}
public Cluster(DataRow tuple){
counts = new CountRepository();
totalCount = 0;
add(tuple);
}
public int getIndex(){
return this.index;
}
public void setIndex(int index){
this.index = index;
}
public void add(DataRow row){
List columnNames = row.getCategoricalColumnNames();
for(String columnName : columnNames){
String value = row.getCategoricalCell(columnName);
String eventName = columnName+"="+value;
counts.addSupportCount(eventName);
counts.addSupportCount(columnName);
}
row.setCategoricalTargetCell("cluster", "" + index);
totalCount++;
}
public int size(){
return totalCount;
}
public double similarity(DataRow row){
double similarity = 0;
List columnNames = row.getCategoricalColumnNames();
for(String columnName : columnNames){
String value = row.getCategoricalCell(columnName);
String eventName = columnName+"="+value;
double count_Ai = counts.getSupportCount(eventName);
double count_Total = counts.getSupportCount(columnName);
if(count_Total > 0) {
similarity += count_Ai / count_Total;
}
}
return similarity / columnNames.size();
}
public double distance(DataRow tuple){
double sim = similarity(tuple);
return 1 - sim;
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy