weka.classifiers.mi.MITI
A collection of multi-instance learning classifiers. Includes the Citation KNN method, several variants of the diverse density method, support vector machines for multi-instance learning, simple wrappers for applying standard propositional learners to multi-instance data, decision tree and rule learners, and some other methods.
/*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* MITI.java
* Copyright (C) 2011 University of Waikato, Hamilton, New Zealand
*
*/
package weka.classifiers.mi;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Random;
import java.util.Vector;
import weka.classifiers.RandomizableClassifier;
import weka.classifiers.mi.miti.AlgorithmConfiguration;
import weka.classifiers.mi.miti.Bag;
import weka.classifiers.mi.miti.NextSplitHeuristic;
import weka.classifiers.mi.miti.TreeNode;
import weka.core.AdditionalMeasureProducer;
import weka.core.Attribute;
import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.MultiInstanceCapabilitiesHandler;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.SelectedTag;
import weka.core.Tag;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
/**
* MITI (Multi Instance Tree Inducer): multi-instance
* classification based on a decision tree learned using Blockeel et al.'s
* algorithm. For more information, see
*
* Hendrik Blockeel, David Page, Ashwin Srinivasan: Multi-instance Tree
* Learning. In: Proceedings of the International Conference on Machine
* Learning, 57-64, 2005.
*
* Luke Bjerring, Eibe Frank: Beyond Trees: Adopting MITI to Learn Rules and
* Ensemble Classifiers for Multi-instance Data. In: Proceedings of the
* Australasian Joint Conference on Artificial Intelligence, 2011.
*
*
*
* BibTeX:
*
*
* @inproceedings{Blockeel2005,
* author = {Hendrik Blockeel and David Page and Ashwin Srinivasan},
* booktitle = {Proceedings of the International Conference on Machine Learning},
* pages = {57-64},
* publisher = {ACM},
* title = {Multi-instance Tree Learning},
* year = {2005}
* }
*
* @inproceedings{Bjerring2011,
* author = {Luke Bjerring and Eibe Frank},
* booktitle = {Proceedings of the Australasian Joint Conference on Artificial Intelligence},
* publisher = {Springer},
* title = {Beyond Trees: Adopting MITI to Learn Rules and Ensemble Classifiers for Multi-instance Data},
* year = {2011}
* }
*
*
*
*
* Valid options are:
*
*
*
* -M [1|2|3]
* The method used to determine best split:
* 1. Gini; 2. MaxBEPP; 3. SSBEPP
*
*
*
* -K [kBEPPConstant]
* The constant used in the tozero() heuristic
*
*
*
* -L
* Scales the value of K to the size of the bags
*
*
*
* -U
* Use unbiased estimate rather than BEPP, i.e. UEPP.
*
*
*
* -B
* Uses the instances present for the bag counts at each node when splitting,
* weighted according to 1 - Ba ^ n, where n is the number of instances
* present which belong to the bag, and Ba is another parameter (default 0.5)
*
*
*
* -Ba [multiplier]
* Multiplier for count influence of a bag based on the number of its instances
*
*
*
* -A [number of attributes]
* The number of randomly selected attributes to split
* -1: All attributes
* -2: square root of the total number of attributes
*
*
*
* -An [number of splits]
* The number of top scoring attribute splits to randomly pick from
* -1: All splits (completely random selection)
* -2: square root of the number of splits
*
*
*
* -S <num>
* Random number seed.
* (default 1)
*
*
*
* -D
* If set, classifier is run in debug mode and
* may output additional info to the console
*
*
*
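* Example command line (a sketch, not from the original source; the dataset
* file name is hypothetical, and -t is WEKA's standard training-file option):
*
* java weka.classifiers.mi.MITI -M 2 -K 5 -S 1 -t musk1.arff
*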
*
* @author Luke Bjerring
* @author Eibe Frank
*/
public class MITI extends RandomizableClassifier implements OptionHandler,
AdditionalMeasureProducer, TechnicalInformationHandler,
MultiInstanceCapabilitiesHandler {
/** for serialization */
static final long serialVersionUID = -217735168397644244L;
// Reference to the actual tree
protected MultiInstanceDecisionTree tree;
// Used to select the split selection measure.
public static final int SPLITMETHOD_GINI = 1;
public static final int SPLITMETHOD_MAXBEPP = 2;
public static final int SPLITMETHOD_SSBEPP = 3;
public static final Tag[] TAGS_SPLITMETHOD = {
new Tag(SPLITMETHOD_GINI, "Gini: E * (1 - E)"),
new Tag(SPLITMETHOD_MAXBEPP, "MaxBEPP: E"),
new Tag(SPLITMETHOD_SSBEPP, "Sum Squared BEPP: E * E") };
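// Example (not part of the original source): the split measure can be set
// programmatically, assuming the usual WEKA-style setter
// setSplitMethod(SelectedTag), which is not shown in this excerpt:
//
// miti.setSplitMethod(new SelectedTag(SPLITMETHOD_MAXBEPP, TAGS_SPLITMETHOD));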
// The chosen splitting method.
protected int m_SplitMethod = SPLITMETHOD_MAXBEPP;
// Whether to scale K based on the number of instances
protected boolean m_scaleK = false;
// Whether to use bag-based statistics for subset scoring
protected boolean m_useBagCount = false;
// Whether to use BEPP or EPP
protected boolean m_unbiasedEstimate = false;
// The constant used in BEPP
protected int m_kBEPPConstant = 5;
// The number of random attributes to consider for splitting
protected int m_AttributesToSplit = -1;
// The number of top-N attributes to choose from randomly
protected int m_AttributeSplitChoices = 1;
// Determines the influence of the number of instances in a bag that are
// present in a subset when applying bag-based statistics.
protected double m_bagInstanceMultiplier = 0.5;
/**
* Returns a string describing classifier
*
* @return a description suitable for displaying in the explorer/experimenter
* gui
*/
public String globalInfo() {
return "MITI (Multi Instance Tree Inducer): multi-instance classification "
+ " based a decision tree learned using Blockeel et al.'s algorithm. For more "
+ "information, see\n\n" + getTechnicalInformation().toString();
}
/**
* Returns an instance of a TechnicalInformation object, containing detailed
* information about the technical background of this class, e.g., paper
* reference or book this class is based on.
*
* @return the technical information about this class
*/
@Override
public TechnicalInformation getTechnicalInformation() {
TechnicalInformation result;
TechnicalInformation additional;
result = new TechnicalInformation(Type.INPROCEEDINGS);
result.setValue(Field.AUTHOR,
"Hendrik Blockeel and David Page and Ashwin Srinivasan");
result.setValue(Field.TITLE, "Multi-instance Tree Learning");
result.setValue(Field.BOOKTITLE,
"Proceedings of the International Conference on Machine Learning");
result.setValue(Field.YEAR, "2005");
result.setValue(Field.PAGES, "57-64");
result.setValue(Field.PUBLISHER, "ACM");
additional = result.add(Type.INPROCEEDINGS);
additional.setValue(Field.AUTHOR, "Luke Bjerring and Eibe Frank");
additional
.setValue(
Field.TITLE,
"Beyond Trees: Adopting MITI to Learn Rules and Ensemble Classifiers for Multi-instance Data");
additional
.setValue(Field.BOOKTITLE,
"Proceedings of the Australasian Joint Conference on Artificial Intelligence");
additional.setValue(Field.YEAR, "2011");
additional.setValue(Field.PUBLISHER, "Springer");
return result;
}
/**
* Returns the capabilities of this classifier.
*/
@Override
public Capabilities getCapabilities() {
Capabilities result = super.getCapabilities();
// attributes
result.enable(Capability.NOMINAL_ATTRIBUTES);
result.enable(Capability.RELATIONAL_ATTRIBUTES);
result.disable(Capability.MISSING_VALUES);
// class
result.disableAllClasses();
result.disableAllClassDependencies();
result.enable(Capability.BINARY_CLASS);
// Only multi instance data
result.enable(Capability.ONLY_MULTIINSTANCE);
return result;
}
/**
* Returns the capabilities of this multi-instance classifier for the
* relational data.
*
* @return the capabilities of this object
* @see Capabilities
*/
@Override
public Capabilities getMultiInstanceCapabilities() {
Capabilities result = super.getCapabilities();
// class
result.disableAllClasses();
result.enable(Capability.NO_CLASS);
return result;
}
/**
* Learns the classifier from the training data.
*/
@Override
public void buildClassifier(Instances trainingData) throws Exception {
// can classifier handle the data?
getCapabilities().testWithFail(trainingData);
tree = new MultiInstanceDecisionTree(trainingData);
}
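/*
* Usage sketch (not part of the original source). Trains MITI on a
* multi-instance ARFF file; the file name "musk1.arff" is hypothetical, and
* any dataset in WEKA's multi-instance format (bag ID, relational attribute,
* class) should work:
*
* Instances data = weka.core.converters.ConverterUtils.DataSource.read("musk1.arff");
* data.setClassIndex(data.numAttributes() - 1);
* MITI miti = new MITI();
* miti.setOptions(weka.core.Utils.splitOptions("-M 2 -K 5"));
* miti.buildClassifier(data);
*/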
/**
* Returns an enumeration of the additional measure names.
*
* @return an enumeration of the measure names
*/
@Override
public Enumeration<String> enumerateMeasures() {
Vector<String> newVector = new Vector<String>(3);
newVector.addElement("measureNumRules");
newVector.addElement("measureNumPositiveRules");
newVector.addElement("measureNumConditionsInPositiveRules");
return newVector.elements();
}
/**
* Returns the value of the named measure.
*
* @param additionalMeasureName the name of the measure to query for its value
* @return the value of the named measure
* @throws IllegalArgumentException if the named measure is not supported
*/
@Override
public double getMeasure(String additionalMeasureName) {
if (additionalMeasureName.equalsIgnoreCase("measureNumRules")) {
return tree.getNumLeaves();
}
if (additionalMeasureName.equalsIgnoreCase("measureNumPositiveRules")) {
return tree.numPosRulesAndNumPosConditions()[0];
}
if (additionalMeasureName
.equalsIgnoreCase("measureNumConditionsInPositiveRules")) {
return tree.numPosRulesAndNumPosConditions()[1];
} else {
throw new IllegalArgumentException(additionalMeasureName
+ " not supported (MITI)");
}
}
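/*
* Example (not part of the original source): after training, the measures
* declared in enumerateMeasures() can be queried by name, e.g.:
*
* double numRules = miti.getMeasure("measureNumRules");
*/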
/**
* Returns the "class distribution" for the given bag.
*/
@Override
public double[] distributionForInstance(Instance newBag) throws Exception {
double[] distribution = new double[2];
Instances contents = newBag.relationalValue(1);
boolean positive = false;
for (Instance i : contents) {
if (tree.isPositive(i)) {
positive = true;
break;
}
}
distribution[1] = positive ? 1 : 0;
distribution[0] = 1 - distribution[1];
return distribution;
}
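/*
* Prediction sketch (not part of the original source). Because a bag is
* labelled positive iff any of its instances reaches a positive leaf, the
* returned distribution is always a hard 0/1 vote:
*
* double[] dist = miti.distributionForInstance(data.instance(0));
* System.out.println("P(neg) = " + dist[0] + ", P(pos) = " + dist[1]);
*/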
/**
* Class for learning and representing the tree.
*/
protected class MultiInstanceDecisionTree implements Serializable {
/** ID added to avoid warning */
private static final long serialVersionUID = 4037700809781784985L;
// The root of the tree.
private TreeNode root;
// A hash map that tells us to which bag a particular instance belongs
private final HashMap<Instance, Bag> m_instanceBags;
// The number of leaves in the tree
private int numLeaves = 0;
// Returns the number of leaves in the tree
public int getNumLeaves() {
return numLeaves;
}
/**
* Constructs the tree from the given set of instances.
*/
protected MultiInstanceDecisionTree(Instances instances) {
m_instanceBags = new HashMap<Instance, Bag>();
ArrayList<Instance> all = new ArrayList<Instance>();
double totalInstances = 0;
double totalBags = 0;
for (Instance i : instances) {
Bag bag = new Bag(i);
for (Instance bagged : bag.instances()) {
m_instanceBags.put(bagged, bag);
all.add(bagged);
}
totalBags++;
totalInstances += bag.instances().numInstances();
}
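// b_multiplier is the average number of instances per bag; when -L
// (m_scaleK) is set, it is used to scale the BEPP constant K to bag size.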
double b_multiplier = totalInstances / totalBags;
if (m_scaleK) {
for (Bag bag : m_instanceBags.values()) {
bag.setBagWeightMultiplier(b_multiplier);
}
}
makeTree(m_instanceBags, all, false);
}
/**
* Constructs tree based on given arguments.
*/
public MultiInstanceDecisionTree(HashMap<Instance, Bag> instanceBags,
ArrayList<Instance> all, boolean stopOnFirstPositiveLeaf) {
m_instanceBags = instanceBags;
makeTree(instanceBags, all, stopOnFirstPositiveLeaf);
}
/**
* Method that actually makes the tree.
*/
private void makeTree(HashMap<Instance, Bag> instanceBags,
ArrayList<Instance> all, boolean stopOnFirstPositiveLeaf) {
Random r = new Random(getSeed());
AlgorithmConfiguration settings = getSettings();
ArrayList<TreeNode> toSplit = new ArrayList<TreeNode>();
root = new TreeNode(null, all);
toSplit.add(root);
numLeaves = 0;
while (toSplit.size() > 0) {
// The next two lines are here solely to reproduce the results from the
// paper (i.e., so that the same random number sequence is consumed:
// nextIndex is always 1 here, so r.nextInt(1) always returns 0).
int nextIndex = Math.min(1, toSplit.size());
nextIndex = r.nextInt(nextIndex);
TreeNode next = toSplit.remove(nextIndex);
if (next == null) {
continue;
}
if (next.isPurePositive(instanceBags)) {
next.makeLeafNode(true);
ArrayList<Instance> deactivated = new ArrayList<Instance>();
next.deactivateRelatedInstances(instanceBags, deactivated);
if (m_Debug && deactivated.size() > 0) {
Bag.printDeactivatedInstances(deactivated);
}
// Need to re-calculate scores if positive leaf has been
// created
for (TreeNode n : toSplit) {
n.removeDeactivatedInstances(instanceBags);
n.calculateNodeScore(instanceBags, m_unbiasedEstimate,
m_kBEPPConstant, m_useBagCount, m_bagInstanceMultiplier);
}
if (stopOnFirstPositiveLeaf && deactivated.size() > 0) {
return;
}
} else if (next.isPureNegative(instanceBags)) {
next.makeLeafNode(false);
} else {
next.splitInstances(instanceBags, settings, r, m_Debug);
if (!next.isLeafNode()) {
if (next.split.isNominal) {
TreeNode[] nominals = next.nominals();
for (TreeNode nominal : nominals) {
nominal.calculateNodeScore(instanceBags, m_unbiasedEstimate,
m_kBEPPConstant, m_useBagCount, m_bagInstanceMultiplier);
toSplit.add(nominal);
}
} else {
next.left().calculateNodeScore(instanceBags, m_unbiasedEstimate,
m_kBEPPConstant, m_useBagCount, m_bagInstanceMultiplier);
toSplit.add(next.left());
next.right().calculateNodeScore(instanceBags, m_unbiasedEstimate,
m_kBEPPConstant, m_useBagCount, m_bagInstanceMultiplier);
toSplit.add(next.right());
}
} else {
// Need to re-calculate scores if positive leaf has been
// created
if (next.isPositiveLeaf()) {
for (TreeNode n : toSplit) {
n.removeDeactivatedInstances(instanceBags);
n.calculateNodeScore(instanceBags, m_unbiasedEstimate,
m_kBEPPConstant, m_useBagCount, m_bagInstanceMultiplier);
}
if (stopOnFirstPositiveLeaf) {
return;
}
}
}
}
// Increment number of leaves if necessary
if (next.isLeafNode()) {
numLeaves++;
}
// Re-evaluate the best next node, because we've most likely
// added new nodes or disabled bags
Comparator<TreeNode> sh = Collections
.reverseOrder(new NextSplitHeuristic());
Collections.sort(toSplit, sh);
}
if (m_Debug) {
System.out.println(root.render(1, instanceBags));
}
}
/**
* Is instance positive given tree?
*/
protected boolean isPositive(Instance i) {
TreeNode leaf = traverseTree(i);
return leaf != null && leaf.isPositiveLeaf();
}
/**
* Traverse to a leaf for the given instance.
*/
private TreeNode traverseTree(Instance i) {
TreeNode next = root;
while (next != null && !next.isLeafNode()) {
Attribute a = next.split.attribute;
if (a.isNominal()) {
next = next.nominals()[(int) i.value(a)];
} else {
if (i.value(a) < next.split.splitPoint) {
next = next.left();
} else {
next = next.right();
}
}
}
return next;
}
/**
* Render the tree as a string.
*/
public String render() {
return root.render(0, m_instanceBags);
}
/**
* Trim negative branches for MIRI's partial trees.
*/
public boolean trimNegativeBranches() {
return root.trimNegativeBranches();
}
/**
* Determines the number of positive rules and the number of conditions used
* in the positive rules, for the given subtree.
*/
public int[] numPosRulesAndNumPosConditions() {
return numPosRulesAndNumPosConditions(root);
}
/**
* Determines the number of positive rules and the number of conditions used
* in the positive rules, for the given subtree.
*/
private int[] numPosRulesAndNumPosConditions(TreeNode next) {
int[] numPosRulesAndNumPosConditions = new int[2];
if ((next != null) && next.isLeafNode()) {
// Do we have a positive leaf node? Then there's one positive rule.
if (next.isPositiveLeaf()) {
numPosRulesAndNumPosConditions[0] = 1;
}
} else if (next != null) {
// We must be at an internal node
Attribute a = next.split.attribute;
int[] fromBelow = null;
if (a.isNominal()) {
for (TreeNode child : next.nominals()) {
fromBelow = numPosRulesAndNumPosConditions(child);
// Need to keep track of the number of positive rules
numPosRulesAndNumPosConditions[0] += fromBelow[0];
// One test is added for each positive rule
numPosRulesAndNumPosConditions[1] += fromBelow[1] + fromBelow[0];
}
} else {
fromBelow = numPosRulesAndNumPosConditions(next.left());
// Need to keep track of the number of positive rules
numPosRulesAndNumPosConditions[0] += fromBelow[0];
// One test is added for each positive rule
numPosRulesAndNumPosConditions[1] += fromBelow[1] + fromBelow[0];
fromBelow = numPosRulesAndNumPosConditions(next.right());
// Need to keep track of the number of positive rules
numPosRulesAndNumPosConditions[0] += fromBelow[0];
// One test is added for each positive rule
numPosRulesAndNumPosConditions[1] += fromBelow[1] + fromBelow[0];
}
}
return numPosRulesAndNumPosConditions;
}
}
/**
* Gets the user-specified settings as a configuration object.
*/
protected AlgorithmConfiguration getSettings() {
return new AlgorithmConfiguration(m_SplitMethod, m_unbiasedEstimate,
m_kBEPPConstant, m_useBagCount, m_bagInstanceMultiplier,
m_AttributesToSplit, m_AttributeSplitChoices);
}
/**
* Lists the options for this classifier. Valid options are:
*
*
*
* -M [1|2|3]
* The method used to determine best split:
* 1. Gini; 2. MaxBEPP; 3. SSBEPP
*
*
*
* -K [kBEPPConstant]
* The constant used in the tozero() heuristic
*
*
*
* -L
* Scales the value of K to the size of the bags
*
*
*
* -U
* Use unbiased estimate rather than BEPP, i.e. UEPP.
*
*
*
* -B
* Uses the instances present for the bag counts at each node when splitting,
* weighted according to 1 - Ba ^ n, where n is the number of instances
* present which belong to the bag, and Ba is another parameter (default 0.5)
*
*
*
* -Ba [multiplier]
* Multiplier for count influence of a bag based on the number of its instances
*
*
*
* -A [number of attributes]
* The number of randomly selected attributes to split
* -1: All attributes
* -2: square root of the total number of attributes
*
*
*
* -An [number of splits]
* The number of top scoring attribute splits to randomly pick from
* -1: All splits (completely random selection)
* -2: square root of the number of splits
*
*
*
* -S <num>
* Random number seed.
* (default 1)
*
*
*
* -D
* If set, classifier is run in debug mode and
* may output additional info to the console
*
*
*
*/
@Override
public Enumeration<Option> listOptions() {