All Downloads are FREE. Search and download functionalities are using the official Maven repository.

weka.classifiers.misc.IsolationForest Maven / Gradle / Ivy

Go to download

Class for building and using a classifier built on the Isolation Forest anomaly detection algorithm. For more information see Fei Tony Liu, Kai Ming Ting and Zhi-Hua Zhou. 2008. Proceedings of the 2008 Eighth IEEE International Conference on Data Mining, pages 413-422.

The newest version!
/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    IsolationForest.java
 *    Copyright (C) 2012-16 University of Waikato, Hamilton, New Zealand
 *
 */
package weka.classifiers.misc;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;

import weka.classifiers.RandomizableClassifier;
import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;

/**
 * 
 * Implements the isolation forest method for anomaly detection.
*
* Note that this classifier is designed for anomaly detection, it is not designed for solving two-class or multi-class classification problems!
*
* The data is expected to have a class attribute with one or two values, which is ignored at training time. The distributionForInstance() method returns (1 - anomaly score) as the first element in the distribution, the second element (in the case of two classes) is the anomaly score.
*
* To evaluate performance of this method for a dataset where anomalies are known, simply code the anomalies using the class attribute: normal cases should correspond to the first value of the class attribute, anomalies to the second one.
*
* For more information, see:
*
* Fei Tony Liu, Kai Ming Ting, Zhi-Hua Zhou: Isolation Forest. In: ICDM, 413-422, 2008.
*
* BibTeX:
 * &#64;inproceedings{Liu2008,
 *    author = {Fei Tony Liu and Kai Ming Ting and Zhi-Hua Zhou},
 *    booktitle = {ICDM},
 *    pages = {413-422},
 *    publisher = {IEEE Computer Society},
 *    title = {Isolation Forest},
 *    year = {2008}
 * }
 * 
*

* * * * Valid options are:

* *

 -I <number of trees>
 *  The number of trees in the forest (default 100).
* *
 -N <the size of the subsample for each tree>
 *  The subsample size for each tree (default 256).
* *
 -S <num>
 *  Random number seed.
 *  (default 1)
* *
 -output-debug-info
 *  If set, classifier is run in debug mode and
 *  may output additional info to the console
* *
 -do-not-check-capabilities
 *  If set, classifier capabilities are not checked before classifier is built
 *  (use with caution).
* *
 -num-decimal-places
 *  The number of decimal places for the output of numbers in the model (default 2).
* * * * @author Eibe Frank ([email protected]) * @version $Revision: 12345 $ */ public class IsolationForest extends RandomizableClassifier implements TechnicalInformationHandler, Serializable { // For serialization private static final long serialVersionUID = 5586674623147772788L; // The set of trees protected Tree[] m_trees = null; // The number of trees protected int m_numTrees = 100; // The subsample size protected int m_subsampleSize = 256; /** * Returns a string describing this filter */ public String globalInfo() { return "Implements the isolation forest method for anomaly detection.\n\n" + "Note that this classifier is designed for anomaly detection, it is not designed for solving " + "two-class or multi-class classification problems!\n\n" + "The data is expected to have have a class attribute with one or two values, " + "which is ignored at training time. The distributionForInstance() " + "method returns (1 - anomaly score) as the first element in the distribution, " + "the second element (in the case of two classes) is the anomaly score.\n\nTo evaluate performance " + "of this method for a dataset where anomalies are known, simply " + "code the anomalies using the class attribute: normal cases should " + "correspond to the first value of the class attribute, anomalies to " + "the second one." + "\n\nFor more information, see:\n\n" + getTechnicalInformation().toString(); } /** * Returns an instance of a TechnicalInformation object, containing detailed * information about the technical background of this class, e.g., paper * reference or book this class is based on. 
* * @return the technical information about this class */ @Override public TechnicalInformation getTechnicalInformation() { TechnicalInformation result; result = new TechnicalInformation(Type.INPROCEEDINGS); result.setValue(Field.AUTHOR, "Fei Tony Liu and Kai Ming Ting and Zhi-Hua Zhou"); result.setValue(Field.TITLE, "Isolation Forest"); result.setValue(Field.BOOKTITLE, "ICDM"); result.setValue(Field.YEAR, "2008"); result.setValue(Field.PAGES, "413-422"); result.setValue(Field.PUBLISHER, "IEEE Computer Society"); return result; } /** * Returns the Capabilities of this filter. */ @Override public Capabilities getCapabilities() { Capabilities result = super.getCapabilities(); result.disableAll(); // attributes result.enable(Capability.NUMERIC_ATTRIBUTES); result.enable(Capability.DATE_ATTRIBUTES); // class result.enable(Capability.UNARY_CLASS); result.enable(Capability.BINARY_CLASS); result.enable(Capability.MISSING_CLASS_VALUES); // instances result.setMinimumNumberInstances(0); return result; } /** * Returns brief description of the classifier. */ @Override public String toString() { if (m_trees == null) { return "No model built yet."; } else { return "Isolation forest for anomaly detection (" + m_numTrees + ", " + m_subsampleSize + ")"; } } /** * Returns the tip text for this property * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String numTreesTipText() { return "The number of trees to use in the forest."; } /** * Get the value of numTrees. * * @return Value of numTrees. */ public int getNumTrees() { return m_numTrees; } /** * Set the value of numTrees. * * @param k value to assign to numTrees. 
*/
  public void setNumTrees(int k) {
    this.m_numTrees = k;
  }

  /**
   * Supplies the tool-tip text shown for the subsampleSize property.
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String subsampleSizeTipText() {
    final String tip = "The size of the subsample used to build each tree.";
    return tip;
  }

  /**
   * Reads the current subsampleSize setting.
   *
   * @return Value of subsampleSize.
   */
  public int getSubsampleSize() {
    return this.m_subsampleSize;
  }

  /**
   * Stores a new subsampleSize setting. The value is assigned as given,
   * without any range check.
   *
   * @param n value to assign to subsampleSize.
   */
  public void setSubsampleSize(int n) {
    this.m_subsampleSize = n;
  }

  /**
   * Lists the command-line options for this classifier.
   *
   * @return an enumeration over all possible options
   */
  @Override
  public Enumeration




© 2015 - 2024 Weber Informatics LLC | Privacy Policy