weka.classifiers.mi.MITI
A collection of multi-instance learning classifiers. Includes the Citation KNN method, several variants of the diverse density method, support vector machines for multi-instance learning, simple wrappers for applying standard propositional learners to multi-instance data, decision tree and rule learners, and some other methods.
/*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* MITI.java
* Copyright (C) 2011 University of Waikato, Hamilton, New Zealand
*
*/
package weka.classifiers.mi;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Random;
import java.util.Vector;
import weka.classifiers.RandomizableClassifier;
import weka.classifiers.mi.miti.AlgorithmConfiguration;
import weka.classifiers.mi.miti.Bag;
import weka.classifiers.mi.miti.NextSplitHeuristic;
import weka.classifiers.mi.miti.TreeNode;
import weka.core.AdditionalMeasureProducer;
import weka.core.Attribute;
import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.MultiInstanceCapabilitiesHandler;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.SelectedTag;
import weka.core.Tag;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
/**
* MITI (Multi Instance Tree Inducer): multi-instance
* classification based on a decision tree learned using Blockeel et al.'s
* algorithm. For more information, see
*
* Hendrik Blockeel, David Page, Ashwin Srinivasan: Multi-instance Tree
* Learning. In: Proceedings of the International Conference on Machine
* Learning, 57-64, 2005.
*
* Luke Bjerring, Eibe Frank: Beyond Trees: Adopting MITI to Learn Rules and
* Ensemble Classifiers for Multi-instance Data. In: Proceedings of the
* Australasian Joint Conference on Artificial Intelligence, 2011.
*
*
*
* BibTeX:
*
*
* @inproceedings{Blockeel2005,
* author = {Hendrik Blockeel and David Page and Ashwin Srinivasan},
* booktitle = {Proceedings of the International Conference on Machine Learning},
* pages = {57-64},
* publisher = {ACM},
* title = {Multi-instance Tree Learning},
* year = {2005}
* }
*
* @inproceedings{Bjerring2011,
* author = {Luke Bjerring and Eibe Frank},
* booktitle = {Proceedings of the Australasian Joint Conference on Artificial Intelligence},
* publisher = {Springer},
* title = {Beyond Trees: Adopting MITI to Learn Rules and Ensemble Classifiers for Multi-instance Data},
* year = {2011}
* }
*
*
*
*
* Valid options are:
*
*
*
* -M [1|2|3]
* The method used to determine best split:
* 1. Gini; 2. MaxBEPP; 3. SSBEPP
*
*
*
* -K [kBEPPConstant]
* The constant used in the tozero() heuristic
*
*
*
* -L
* Scales the value of K to the size of the bags
*
*
*
* -U
* Use unbiased estimate rather than BEPP, i.e. UEPP.
*
*
*
* -B
* Uses the instances present for the bag counts at each node when splitting,
* weighted according to 1 - Ba ^ n, where n is the number of instances
* present which belong to the bag, and Ba is another parameter (default 0.5)
*
*
*
* -Ba [multiplier]
* Multiplier for count influence of a bag based on the number of its instances
*
*
*
* -A [number of attributes]
* The number of randomly selected attributes to split
* -1: All attributes
* -2: square root of the total number of attributes
*
*
*
* -An [number of splits]
* The number of top scoring attribute splits to randomly pick from
* -1: All splits (completely random selection)
* -2: square root of the number of splits
*
*
*
* -S <num>
* Random number seed.
* (default 1)
*
*
*
* -D
* If set, classifier is run in debug mode and
* may output additional info to the console
*
*
*
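* Example command line (a sketch, not from the original source; the dataset
* file name is hypothetical, and -t is WEKA's standard training-file option):
*
* java weka.classifiers.mi.MITI -M 2 -K 5 -S 1 -t musk1.arff
*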
*
* @author Luke Bjerring
* @author Eibe Frank
*/
public class MITI extends RandomizableClassifier implements OptionHandler,
AdditionalMeasureProducer, TechnicalInformationHandler,
MultiInstanceCapabilitiesHandler {
/** for serialization */
static final long serialVersionUID = -217735168397644244L;
// Reference to the actual tree
protected MultiInstanceDecisionTree tree;
// Used to select the split selection measure.
public static final int SPLITMETHOD_GINI = 1;
public static final int SPLITMETHOD_MAXBEPP = 2;
public static final int SPLITMETHOD_SSBEPP = 3;
public static final Tag[] TAGS_SPLITMETHOD = {
new Tag(SPLITMETHOD_GINI, "Gini: E * (1 - E)"),
new Tag(SPLITMETHOD_MAXBEPP, "MaxBEPP: E"),
new Tag(SPLITMETHOD_SSBEPP, "Sum Squared BEPP: E * E") };
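// Example (not part of the original source): the split measure can be set
// programmatically, assuming the usual WEKA-style setter
// setSplitMethod(SelectedTag), which is not shown in this excerpt:
//
// miti.setSplitMethod(new SelectedTag(SPLITMETHOD_MAXBEPP, TAGS_SPLITMETHOD));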
// The chosen splitting method.
protected int m_SplitMethod = SPLITMETHOD_MAXBEPP;
// Whether to scale K based on the number of instances
protected boolean m_scaleK = false;
// Whether to use bag-based statistics for subset scoring
protected boolean m_useBagCount = false;
// Whether to use BEPP or EPP
protected boolean m_unbiasedEstimate = false;
// The constant used in BEPP
protected int m_kBEPPConstant = 5;
// The number of random attributes to consider for splitting
protected int m_AttributesToSplit = -1;
// The number of top-N attributes to choose from randomly
protected int m_AttributeSplitChoices = 1;
// Determines the influence of the number of instances in a bag that are
// present in a subset when applying bag-based statistics.
protected double m_bagInstanceMultiplier = 0.5;
/**
* Returns a string describing classifier
*
* @return a description suitable for displaying in the explorer/experimenter
* gui
*/
public String globalInfo() {
return "MITI (Multi Instance Tree Inducer): multi-instance classification "
+ " based a decision tree learned using Blockeel et al.'s algorithm. For more "
+ "information, see\n\n" + getTechnicalInformation().toString();
}
/**
* Returns an instance of a TechnicalInformation object, containing detailed
* information about the technical background of this class, e.g., paper
* reference or book this class is based on.
*
* @return the technical information about this class
*/
@Override
public TechnicalInformation getTechnicalInformation() {
TechnicalInformation result;
TechnicalInformation additional;
result = new TechnicalInformation(Type.INPROCEEDINGS);
result.setValue(Field.AUTHOR,
"Hendrik Blockeel and David Page and Ashwin Srinivasan");
result.setValue(Field.TITLE, "Multi-instance Tree Learning");
result.setValue(Field.BOOKTITLE,
"Proceedings of the International Conference on Machine Learning");
result.setValue(Field.YEAR, "2005");
result.setValue(Field.PAGES, "57-64");
result.setValue(Field.PUBLISHER, "ACM");
additional = result.add(Type.INPROCEEDINGS);
additional.setValue(Field.AUTHOR, "Luke Bjerring and Eibe Frank");
additional
.setValue(
Field.TITLE,
"Beyond Trees: Adopting MITI to Learn Rules and Ensemble Classifiers for Multi-instance Data");
additional
.setValue(Field.BOOKTITLE,
"Proceedings of the Australasian Joint Conference on Artificial Intelligence");
additional.setValue(Field.YEAR, "2011");
additional.setValue(Field.PUBLISHER, "Springer");
return result;
}
/**
* Returns the capabilities of this classifier.
*/
@Override
public Capabilities getCapabilities() {
Capabilities result = super.getCapabilities();
// attributes
result.enable(Capability.NOMINAL_ATTRIBUTES);
result.enable(Capability.RELATIONAL_ATTRIBUTES);
result.disable(Capability.MISSING_VALUES);
// class
result.disableAllClasses();
result.disableAllClassDependencies();
result.enable(Capability.BINARY_CLASS);
// Only multi instance data
result.enable(Capability.ONLY_MULTIINSTANCE);
return result;
}
/**
* Returns the capabilities of this multi-instance classifier for the
* relational data.
*
* @return the capabilities of this object
* @see Capabilities
*/
@Override
public Capabilities getMultiInstanceCapabilities() {
Capabilities result = super.getCapabilities();
// class
result.disableAllClasses();
result.enable(Capability.NO_CLASS);
return result;
}
/**
* Learns the classifier from the training data.
*/
@Override
public void buildClassifier(Instances trainingData) throws Exception {
// can classifier handle the data?
getCapabilities().testWithFail(trainingData);
tree = new MultiInstanceDecisionTree(trainingData);
}
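/*
* Usage sketch (not part of the original source). Trains MITI on a
* multi-instance ARFF file; the file name "musk1.arff" is hypothetical, and
* any dataset in WEKA's multi-instance format (bag ID, relational attribute,
* class) should work:
*
* Instances data = weka.core.converters.ConverterUtils.DataSource.read("musk1.arff");
* data.setClassIndex(data.numAttributes() - 1);
* MITI miti = new MITI();
* miti.setOptions(weka.core.Utils.splitOptions("-M 2 -K 5"));
* miti.buildClassifier(data);
*/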
/**
* Returns an enumeration of the additional measure names.
*
* @return an enumeration of the measure names
*/
@Override
public Enumeration<String> enumerateMeasures() {
Vector<String> newVector = new Vector<String>(3);
newVector.addElement("measureNumRules");
newVector.addElement("measureNumPositiveRules");
newVector.addElement("measureNumConditionsInPositiveRules");
return newVector.elements();
}
/**
* Returns the value of the named measure.
*
* @param additionalMeasureName the name of the measure to query for its value
* @return the value of the named measure
* @throws IllegalArgumentException if the named measure is not supported
*/
@Override
public double getMeasure(String additionalMeasureName) {
if (additionalMeasureName.equalsIgnoreCase("measureNumRules")) {
return tree.getNumLeaves();
}
if (additionalMeasureName.equalsIgnoreCase("measureNumPositiveRules")) {
return tree.numPosRulesAndNumPosConditions()[0];
}
if (additionalMeasureName
.equalsIgnoreCase("measureNumConditionsInPositiveRules")) {
return tree.numPosRulesAndNumPosConditions()[1];
} else {
throw new IllegalArgumentException(additionalMeasureName
+ " not supported (MITI)");
}
}
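/*
* Example (not part of the original source): after training, the measures
* declared in enumerateMeasures() can be queried by name, e.g.:
*
* double numRules = miti.getMeasure("measureNumRules");
*/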
/**
* Returns the "class distribution" for the given bag.
*/
@Override
public double[] distributionForInstance(Instance newBag) throws Exception {
double[] distribution = new double[2];
Instances contents = newBag.relationalValue(1);
boolean positive = false;
for (Instance i : contents) {
if (tree.isPositive(i)) {
positive = true;
break;
}
}
distribution[1] = positive ? 1 : 0;
distribution[0] = 1 - distribution[1];
return distribution;
}
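/*
* Prediction sketch (not part of the original source). Because a bag is
* labelled positive iff any of its instances reaches a positive leaf, the
* returned distribution is always a hard 0/1 vote:
*
* double[] dist = miti.distributionForInstance(data.instance(0));
* System.out.println("P(neg) = " + dist[0] + ", P(pos) = " + dist[1]);
*/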
/**
* Class for learning and representing the tree.
*/
protected class MultiInstanceDecisionTree implements Serializable {
/** ID added to avoid warning */
private static final long serialVersionUID = 4037700809781784985L;
// The root of the tree.
private TreeNode root;
// A hash map that tells us to which bag a particular instance belongs
private final HashMap<Instance, Bag> m_instanceBags;
// The number of leaves in the tree
private int numLeaves = 0;
// Returns the number of leaves in the tree
public int getNumLeaves() {
return numLeaves;
}
/**
* Constructs the tree from the given set of instances.
*/
protected MultiInstanceDecisionTree(Instances instances) {
m_instanceBags = new HashMap<Instance, Bag>();
ArrayList<Instance> all = new ArrayList<Instance>();
double totalInstances = 0;
double totalBags = 0;
for (Instance i : instances) {
Bag bag = new Bag(i);
for (Instance bagged : bag.instances()) {
m_instanceBags.put(bagged, bag);
all.add(bagged);
}
totalBags++;
totalInstances += bag.instances().numInstances();
}
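// b_multiplier is the average number of instances per bag; when -L
// (m_scaleK) is set, it is used to scale the BEPP constant K to bag size.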
double b_multiplier = totalInstances / totalBags;
if (m_scaleK) {
for (Bag bag : m_instanceBags.values()) {
bag.setBagWeightMultiplier(b_multiplier);
}
}
makeTree(m_instanceBags, all, false);
}
/**
* Constructs tree based on given arguments.
*/
public MultiInstanceDecisionTree(HashMap<Instance, Bag> instanceBags,
ArrayList<Instance> all, boolean stopOnFirstPositiveLeaf) {
m_instanceBags = instanceBags;
makeTree(instanceBags, all, stopOnFirstPositiveLeaf);
}
/**
* Method that actually makes the tree.
*/
private void makeTree(HashMap<Instance, Bag> instanceBags,
ArrayList<Instance> all, boolean stopOnFirstPositiveLeaf) {
Random r = new Random(getSeed());
AlgorithmConfiguration settings = getSettings();
ArrayList<TreeNode> toSplit = new ArrayList<TreeNode>();
root = new TreeNode(null, all);
toSplit.add(root);
numLeaves = 0;
while (toSplit.size() > 0) {
// The next two lines are here solely to reproduce the results from the
// paper (i.e., so that the same random number sequence is consumed:
// nextIndex is always 1 here, so r.nextInt(1) always returns 0).
int nextIndex = Math.min(1, toSplit.size());
nextIndex = r.nextInt(nextIndex);
TreeNode next = toSplit.remove(nextIndex);
if (next == null) {
continue;
}
if (next.isPurePositive(instanceBags)) {
next.makeLeafNode(true);
ArrayList<Instance> deactivated = new ArrayList<Instance>();
next.deactivateRelatedInstances(instanceBags, deactivated);
if (m_Debug && deactivated.size() > 0) {
Bag.printDeactivatedInstances(deactivated);
}
// Need to re-calculate scores if positive leaf has been
// created
for (TreeNode n : toSplit) {
n.removeDeactivatedInstances(instanceBags);
n.calculateNodeScore(instanceBags, m_unbiasedEstimate,
m_kBEPPConstant, m_useBagCount, m_bagInstanceMultiplier);
}
if (stopOnFirstPositiveLeaf && deactivated.size() > 0) {
return;
}
} else if (next.isPureNegative(instanceBags)) {
next.makeLeafNode(false);
} else {
next.splitInstances(instanceBags, settings, r, m_Debug);
if (!next.isLeafNode()) {
if (next.split.isNominal) {
TreeNode[] nominals = next.nominals();
for (TreeNode nominal : nominals) {
nominal.calculateNodeScore(instanceBags, m_unbiasedEstimate,
m_kBEPPConstant, m_useBagCount, m_bagInstanceMultiplier);
toSplit.add(nominal);
}
} else {
next.left().calculateNodeScore(instanceBags, m_unbiasedEstimate,
m_kBEPPConstant, m_useBagCount, m_bagInstanceMultiplier);
toSplit.add(next.left());
next.right().calculateNodeScore(instanceBags, m_unbiasedEstimate,
m_kBEPPConstant, m_useBagCount, m_bagInstanceMultiplier);
toSplit.add(next.right());
}
} else {
// Need to re-calculate scores if positive leaf has been
// created
if (next.isPositiveLeaf()) {
for (TreeNode n : toSplit) {
n.removeDeactivatedInstances(instanceBags);
n.calculateNodeScore(instanceBags, m_unbiasedEstimate,
m_kBEPPConstant, m_useBagCount, m_bagInstanceMultiplier);
}
if (stopOnFirstPositiveLeaf) {
return;
}
}
}
}
// Increment number of leaves if necessary
if (next.isLeafNode()) {
numLeaves++;
}
// Re-evaluate the best next node, because we've most likely
// added new nodes or disabled bags
Comparator<TreeNode> sh = Collections
.reverseOrder(new NextSplitHeuristic());
Collections.sort(toSplit, sh);
}
if (m_Debug) {
System.out.println(root.render(1, instanceBags));
}
}
/**
* Is instance positive given tree?
*/
protected boolean isPositive(Instance i) {
TreeNode leaf = traverseTree(i);
return leaf != null && leaf.isPositiveLeaf();
}
/**
* Traverse to a leaf for the given instance.
*/
private TreeNode traverseTree(Instance i) {
TreeNode next = root;
while (next != null && !next.isLeafNode()) {
Attribute a = next.split.attribute;
if (a.isNominal()) {
next = next.nominals()[(int) i.value(a)];
} else {
if (i.value(a) < next.split.splitPoint) {
next = next.left();
} else {
next = next.right();
}
}
}
return next;
}
/**
* Render the tree as a string.
*/
public String render() {
return root.render(0, m_instanceBags);
}
/**
* Trim negative branches for MIRI's partial trees.
*/
public boolean trimNegativeBranches() {
return root.trimNegativeBranches();
}
/**
* Determines the number of positive rules and the number of conditions used
* in the positive rules, for the given subtree.
*/
public int[] numPosRulesAndNumPosConditions() {
return numPosRulesAndNumPosConditions(root);
}
/**
* Determines the number of positive rules and the number of conditions used
* in the positive rules, for the given subtree.
*/
private int[] numPosRulesAndNumPosConditions(TreeNode next) {
int[] numPosRulesAndNumPosConditions = new int[2];
if ((next != null) && next.isLeafNode()) {
// Do we have a positive leaf node? Then there's one positive rule.
if (next.isPositiveLeaf()) {
numPosRulesAndNumPosConditions[0] = 1;
}
} else if (next != null) {
// We must be at an internal node
Attribute a = next.split.attribute;
int[] fromBelow = null;
if (a.isNominal()) {
for (TreeNode child : next.nominals()) {
fromBelow = numPosRulesAndNumPosConditions(child);
// Need to keep track of the number of positive rules
numPosRulesAndNumPosConditions[0] += fromBelow[0];
// One test is added for each positive rule
numPosRulesAndNumPosConditions[1] += fromBelow[1] + fromBelow[0];
}
} else {
fromBelow = numPosRulesAndNumPosConditions(next.left());
// Need to keep track of the number of positive rules
numPosRulesAndNumPosConditions[0] += fromBelow[0];
// One test is added for each positive rule
numPosRulesAndNumPosConditions[1] += fromBelow[1] + fromBelow[0];
fromBelow = numPosRulesAndNumPosConditions(next.right());
// Need to keep track of the number of positive rules
numPosRulesAndNumPosConditions[0] += fromBelow[0];
// One test is added for each positive rule
numPosRulesAndNumPosConditions[1] += fromBelow[1] + fromBelow[0];
}
}
return numPosRulesAndNumPosConditions;
}
}
/**
* Gets the user-specified settings as a configuration object.
*/
protected AlgorithmConfiguration getSettings() {
return new AlgorithmConfiguration(m_SplitMethod, m_unbiasedEstimate,
m_kBEPPConstant, m_useBagCount, m_bagInstanceMultiplier,
m_AttributesToSplit, m_AttributeSplitChoices);
}
/**
* Lists the options for this classifier. Valid options are:
*
*
*
* -M [1|2|3]
* The method used to determine best split:
* 1. Gini; 2. MaxBEPP; 3. SSBEPP
*
*
*
* -K [kBEPPConstant]
* The constant used in the tozero() heuristic
*
*
*
* -L
* Scales the value of K to the size of the bags
*
*
*
* -U
* Use unbiased estimate rather than BEPP, i.e. UEPP.
*
*
*
* -B
* Uses the instances present for the bag counts at each node when splitting,
* weighted according to 1 - Ba ^ n, where n is the number of instances
* present which belong to the bag, and Ba is another parameter (default 0.5)
*
*
*
* -Ba [multiplier]
* Multiplier for count influence of a bag based on the number of its instances
*
*
*
* -A [number of attributes]
* The number of randomly selected attributes to split
* -1: All attributes
* -2: square root of the total number of attributes
*
*
*
* -An [number of splits]
* The number of top scoring attribute splits to randomly pick from
* -1: All splits (completely random selection)
* -2: square root of the number of splits
*
*
*
* -S <num>
* Random number seed.
* (default 1)
*
*
*
* -D
* If set, classifier is run in debug mode and
* may output additional info to the console
*
*
*
*/
@Override
public Enumeration<Option> listOptions() {