weka.classifiers.misc.IsolationForest Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of isolationForest Show documentation
Show all versions of isolationForest Show documentation
Class for building and using a classifier built on the Isolation Forest anomaly detection algorithm. For more information see Fei Tony Liu, Kai Ming Ting and Zhi-Hua Zhou. 2008. Proceedings of the 2008 Eighth IEEE International Conference on Data Mining, pages 413-422.
The newest version!
/*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* IsolationForest.java
* Copyright (C) 2012-16 University of Waikato, Hamilton, New Zealand
*
*/
package weka.classifiers.misc;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;
import weka.classifiers.RandomizableClassifier;
import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
/**
*
* Implements the isolation forest method for anomaly detection.
*
* Note that this classifier is designed for anomaly detection, it is not designed for solving two-class or multi-class classification problems!
*
* The data is expected to have a class attribute with one or two values, which is ignored at training time. The distributionForInstance() method returns (1 - anomaly score) as the first element in the distribution, the second element (in the case of two classes) is the anomaly score.
*
* To evaluate performance of this method for a dataset where anomalies are known, simply code the anomalies using the class attribute: normal cases should correspond to the first value of the class attribute, anomalies to the second one.
*
* For more information, see:
*
* Fei Tony Liu, Kai Ming Ting, Zhi-Hua Zhou: Isolation Forest. In: ICDM, 413-422, 2008.
*
*
*
*
* BibTeX:
*
* @inproceedings{Liu2008,
* author = {Fei Tony Liu and Kai Ming Ting and Zhi-Hua Zhou},
* booktitle = {ICDM},
* pages = {413-422},
* publisher = {IEEE Computer Society},
* title = {Isolation Forest},
* year = {2008}
* }
*
*
*
*
*
* Valid options are:
*
*
* -I <number of trees>
* The number of trees in the forest (default 100).
*
* -N <the size of the subsample for each tree>
* The subsample size for each tree (default 256).
*
* -S <num>
* Random number seed.
* (default 1)
*
* -output-debug-info
* If set, classifier is run in debug mode and
* may output additional info to the console
*
* -do-not-check-capabilities
* If set, classifier capabilities are not checked before classifier is built
* (use with caution).
*
* -num-decimal-places
* The number of decimal places for the output of numbers in the model (default 2).
*
*
*
* @author Eibe Frank (eibe@cs.waikato.ac.nz)
* @version $Revision: 12345 $
*/
public class IsolationForest extends RandomizableClassifier implements
TechnicalInformationHandler, Serializable {
/** Version identifier used for serialization. */
private static final long serialVersionUID = 5586674623147772788L;
/**
 * The set of trees making up the forest. Initially null; toString()
 * treats a null value as "no model built yet".
 */
protected Tree[] m_trees = null;
/** The number of trees in the forest (defaults to 100). */
protected int m_numTrees = 100;
/** The size of the subsample used for each tree (defaults to 256). */
protected int m_subsampleSize = 256;
/**
 * Returns a string describing this classifier: what it is meant for, how the
 * class attribute is used, how to read the returned distribution, and the
 * reference produced by {@link #getTechnicalInformation()}.
 *
 * @return a description of the classifier
 */
public String globalInfo() {
  return "Implements the isolation forest method for anomaly detection.\n\n"
    + "Note that this classifier is designed for anomaly detection, it is not designed for solving "
    + "two-class or multi-class classification problems!\n\n"
    // Fixed: the original text read "expected to have have a class attribute".
    + "The data is expected to have a class attribute with one or two values, "
    + "which is ignored at training time. The distributionForInstance() "
    + "method returns (1 - anomaly score) as the first element in the distribution, "
    + "the second element (in the case of two classes) is the anomaly score.\n\nTo evaluate performance "
    + "of this method for a dataset where anomalies are known, simply "
    + "code the anomalies using the class attribute: normal cases should "
    + "correspond to the first value of the class attribute, anomalies to "
    + "the second one." + "\n\nFor more information, see:\n\n"
    + getTechnicalInformation().toString();
}
/**
 * Builds the bibliographic record (an INPROCEEDINGS entry) for the paper
 * this classifier is based on: author, title, book title, year, pages and
 * publisher.
 *
 * @return the technical information about this class
 */
@Override
public TechnicalInformation getTechnicalInformation() {
  final TechnicalInformation reference =
    new TechnicalInformation(Type.INPROCEEDINGS);

  reference.setValue(Field.AUTHOR,
    "Fei Tony Liu and Kai Ming Ting and Zhi-Hua Zhou");
  reference.setValue(Field.TITLE, "Isolation Forest");
  reference.setValue(Field.BOOKTITLE, "ICDM");
  reference.setValue(Field.YEAR, "2008");
  reference.setValue(Field.PAGES, "413-422");
  reference.setValue(Field.PUBLISHER, "IEEE Computer Society");

  return reference;
}
/**
 * Returns the capabilities of this classifier. Starting from the inherited
 * capabilities with everything disabled, it enables numeric and date
 * attributes, a unary or binary class, and missing class values, and sets
 * the minimum number of instances to 0.
 *
 * @return the capabilities of this classifier
 */
@Override
public Capabilities getCapabilities() {
  final Capabilities caps = super.getCapabilities();

  // Start from a clean slate, then switch on only what is supported.
  caps.disableAll();

  // Supported attribute types.
  caps.enable(Capability.NUMERIC_ATTRIBUTES);
  caps.enable(Capability.DATE_ATTRIBUTES);

  // Supported class attribute types.
  caps.enable(Capability.UNARY_CLASS);
  caps.enable(Capability.BINARY_CLASS);
  caps.enable(Capability.MISSING_CLASS_VALUES);

  // No lower bound on the number of instances.
  caps.setMinimumNumberInstances(0);

  return caps;
}
/**
 * Returns a brief description of the classifier: a placeholder message when
 * no trees are present, otherwise a one-line summary containing the number
 * of trees and the subsample size.
 *
 * @return the textual description
 */
@Override
public String toString() {
  if (m_trees != null) {
    StringBuilder summary =
      new StringBuilder("Isolation forest for anomaly detection (");
    summary.append(m_numTrees).append(", ").append(m_subsampleSize).append(")");
    return summary.toString();
  }
  return "No model built yet.";
}
/**
 * Provides the tip text for the numTrees property.
 *
 * @return tip text for this property suitable for displaying in the
 *         explorer/experimenter gui
 */
public String numTreesTipText() {
  final String tip = "The number of trees to use in the forest.";
  return tip;
}
/**
 * Gets the number of trees configured for the forest.
 *
 * @return the current value of numTrees
 */
public int getNumTrees() {
  return this.m_numTrees;
}
/**
 * Sets the number of trees for the forest. The value is stored as given,
 * without validation.
 *
 * @param k the new value of numTrees
 */
public void setNumTrees(int k) {
  this.m_numTrees = k;
}
/**
 * Provides the tip text for the subsampleSize property.
 *
 * @return tip text for this property suitable for displaying in the
 *         explorer/experimenter gui
 */
public String subsampleSizeTipText() {
  final String tip = "The size of the subsample used to build each tree.";
  return tip;
}
/**
 * Gets the subsample size configured for each tree.
 *
 * @return the current value of subsampleSize
 */
public int getSubsampleSize() {
  return this.m_subsampleSize;
}
/**
 * Sets the subsample size for each tree. The value is stored as given,
 * without validation.
 *
 * @param n the new value of subsampleSize
 */
public void setSubsampleSize(int n) {
  this.m_subsampleSize = n;
}
/**
* Lists the command-line options for this classifier.
*
* @return an enumeration over all possible options
*/
@Override
public Enumeration