/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 * MISMO.java
 * Copyright (C) 2005 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.classifiers.mi;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;

import weka.classifiers.AbstractClassifier;
import weka.classifiers.functions.Logistic;
import weka.classifiers.functions.supportVector.Kernel;
import weka.classifiers.functions.supportVector.SMOset;
import weka.classifiers.mi.supportVector.MIPolyKernel;
import weka.core.Attribute;
import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.MultiInstanceCapabilitiesHandler;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.RevisionHandler;
import weka.core.RevisionUtils;
import weka.core.SelectedTag;
import weka.core.SerializedObject;
import weka.core.Tag;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.core.WeightedInstancesHandler;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.MultiInstanceToPropositional;
import weka.filters.unsupervised.attribute.NominalToBinary;
import weka.filters.unsupervised.attribute.Normalize;
import weka.filters.unsupervised.attribute.PropositionalToMultiInstance;
import weka.filters.unsupervised.attribute.ReplaceMissingValues;
import weka.filters.unsupervised.attribute.Standardize;

/**
 * Implements John Platt's sequential minimal optimization algorithm for
 * training a support vector classifier.
 * 
 * This implementation globally replaces all missing values and transforms
 * nominal attributes into binary ones. It also normalizes all attributes by
 * default. (In that case the coefficients in the output are based on the
 * normalized data, not the original data --- this is important for
 * interpreting the classifier.)
 * 
 * Multi-class problems are solved using pairwise classification.
 * 
 * To obtain proper probability estimates, use the option that fits logistic
 * regression models to the outputs of the support vector machine. In the
 * multi-class case the predicted probabilities are coupled using Hastie and
 * Tibshirani's pairwise coupling method.
 * 
 * Note: for improved speed, normalization should be turned off when operating
 * on SparseInstances.
 * 
 * For more information on the SMO algorithm, see
 * 
 * J. Platt: Fast Training of Support Vector Machines using Sequential Minimal
 * Optimization. In B. Schoelkopf and C. Burges and A. Smola, editors, Advances
 * in Kernel Methods - Support Vector Learning, 1998.
 * 
 * S.S. Keerthi, S.K. Shevade, C. Bhattacharyya, K.R.K. Murthy (2001).
 * Improvements to Platt's SMO Algorithm for SVM Classifier Design. Neural
 * Computation. 13(3):637-649.
 * 
 * BibTeX:
 * 
 * @incollection{Platt1998,
 *    author = {J. Platt},
 *    booktitle = {Advances in Kernel Methods - Support Vector Learning},
 *    editor = {B. Schoelkopf and C. Burges and A. Smola},
 *    publisher = {MIT Press},
 *    title = {Fast Training of Support Vector Machines using Sequential Minimal Optimization},
 *    year = {1998}
 * }
 * 
 * @article{Keerthi2001,
 *    author = {S.S. Keerthi and S.K. Shevade and C. Bhattacharyya and K.R.K. Murthy},
 *    journal = {Neural Computation},
 *    number = {3},
 *    pages = {637-649},
 *    title = {Improvements to Platt's SMO Algorithm for SVM Classifier Design},
 *    volume = {13},
 *    year = {2001}
 * }
 * 
 * Valid options are:
 * 
 * -no-checks
 *  Turns off all checks - use with caution!
 *  Turning them off assumes that data is purely numeric, doesn't
 *  contain any missing values, and has a nominal class. Turning them
 *  off also means that no header information will be stored if the
 *  machine is linear. Finally, it also assumes that no instance has
 *  a weight equal to 0.
 *  (default: checks on)
 * 
 * -C <double>
 *  The complexity constant C. (default 1)
 * 
 * -N
 *  Whether to 0=normalize/1=standardize/2=neither.
 *  (default 0=normalize)
 * 
 * -I
 *  Use MIminimax feature space.
 * 
 * -L <double>
 *  The tolerance parameter. (default 1.0e-3)
 * 
 * -P <double>
 *  The epsilon for round-off error. (default 1.0e-12)
 * 
 * -M
 *  Fit logistic models to SVM outputs.
 * 
 * -V <double>
 *  The number of folds for the internal cross-validation.
 *  (default -1, use training data)
 * 
 * -W <double>
 *  The random number seed. (default 1)
 * 
 * -K <classname and parameters>
 *  The Kernel to use.
 *  (default: weka.classifiers.functions.supportVector.PolyKernel)
 * 
 * Options specific to kernel weka.classifiers.mi.supportVector.MIPolyKernel:
 * 
 * -no-checks
 *  Turns off all checks - use with caution!
 *  (default: checks on)
 * 
 * -C <num>
 *  The size of the cache (a prime number), 0 for full cache and
 *  -1 to turn it off.
 *  (default: 250007)
 * 
 * -E <num>
 *  The Exponent to use.
 *  (default: 1.0)
 * 
 * -L
 *  Use lower-order terms.
 *  (default: no)
 * 
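 * As an illustrative example (the dataset file name below is hypothetical and
 * not shipped with this package), MISMO could be trained and evaluated from
 * the command line on a multi-instance ARFF file like so:
 * 
 * java weka.classifiers.mi.MISMO -t musk1.arff -C 1.0 -N 0 \
 *    -K "weka.classifiers.mi.supportVector.MIPolyKernel -E 1.0"
 * 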
 *
 * @author Eibe Frank ([email protected])
 * @author Shane Legg ([email protected]) (sparse vector code)
 * @author Stuart Inglis ([email protected]) (sparse vector code)
 * @author Lin Dong ([email protected]) (code for adapting to MI data)
 * @version $Revision: 12560 $
 */
public class MISMO extends AbstractClassifier implements
  WeightedInstancesHandler, MultiInstanceCapabilitiesHandler,
  TechnicalInformationHandler {

  /** for serialization */
  static final long serialVersionUID = -5834036950143719712L;

  /**
   * Returns a string describing the classifier.
   * 
   * @return a description suitable for displaying in the explorer/experimenter
   *         gui
   */
  public String globalInfo() {
    return "Implements John Platt's sequential minimal optimization "
      + "algorithm for training a support vector classifier.\n\n"
      + "This implementation globally replaces all missing values and "
      + "transforms nominal attributes into binary ones. It also "
      + "normalizes all attributes by default. (In that case the coefficients "
      + "in the output are based on the normalized data, not the "
      + "original data --- this is important for interpreting the classifier.)\n\n"
      + "Multi-class problems are solved using pairwise classification.\n\n"
      + "To obtain proper probability estimates, use the option that fits "
      + "logistic regression models to the outputs of the support vector "
      + "machine. In the multi-class case the predicted probabilities "
      + "are coupled using Hastie and Tibshirani's pairwise coupling "
      + "method.\n\n"
      + "Note: for improved speed normalization should be turned off when "
      + "operating on SparseInstances.\n\n"
      + "For more information on the SMO algorithm, see\n\n"
      + getTechnicalInformation().toString();
  }

  /**
   * Returns an instance of a TechnicalInformation object, containing detailed
   * information about the technical background of this class, e.g., paper
   * reference or book this class is based on.
   * 
   * @return the technical information about this class
   */
  @Override
  public TechnicalInformation getTechnicalInformation() {
    TechnicalInformation result;
    TechnicalInformation additional;

    result = new TechnicalInformation(Type.INCOLLECTION);
    result.setValue(Field.AUTHOR, "J. Platt");
    result.setValue(Field.YEAR, "1998");
    result.setValue(Field.TITLE,
      "Fast Training of Support Vector Machines using Sequential Minimal Optimization");
    result.setValue(Field.BOOKTITLE,
      "Advances in Kernel Methods - Support Vector Learning");
    result.setValue(Field.EDITOR, "B. Schoelkopf and C. Burges and A. Smola");
    result.setValue(Field.PUBLISHER, "MIT Press");

    additional = result.add(Type.ARTICLE);
    additional.setValue(Field.AUTHOR,
      "S.S. Keerthi and S.K. Shevade and C. Bhattacharyya and K.R.K. Murthy");
    additional.setValue(Field.YEAR, "2001");
    additional.setValue(Field.TITLE,
      "Improvements to Platt's SMO Algorithm for SVM Classifier Design");
    additional.setValue(Field.JOURNAL, "Neural Computation");
    additional.setValue(Field.VOLUME, "13");
    additional.setValue(Field.NUMBER, "3");
    additional.setValue(Field.PAGES, "637-649");

    return result;
  }

  /**
   * Class for building a binary support vector machine.
   */
  protected class BinaryMISMO implements Serializable, RevisionHandler {

    /** for serialization */
    static final long serialVersionUID = -7107082483475433531L;

    /** The Lagrange multipliers. */
    protected double[] m_alpha;

    /** The thresholds. */
    protected double m_b, m_bLow, m_bUp;

    /** The indices for m_bLow and m_bUp */
    protected int m_iLow, m_iUp;

    /** The training data. */
    protected Instances m_data;

    /** Weight vector for linear machine. */
    protected double[] m_weights;

    /**
     * Variables to hold weight vector in sparse form. (To reduce storage
     * requirements.)
     */
    protected double[] m_sparseWeights;
    protected int[] m_sparseIndices;

    /** Kernel to use **/
    protected Kernel m_kernel;

    /** The transformed class values. */
    protected double[] m_class;

    /** The current set of errors for all non-bound examples. */
    protected double[] m_errors;

    /* The five different sets used by the algorithm. */
    /** {i: 0 < m_alpha[i] < C} */
    protected SMOset m_I0;
    /** {i: m_class[i] = 1, m_alpha[i] = 0} */
    protected SMOset m_I1;
    /** {i: m_class[i] = -1, m_alpha[i] = C} */
    protected SMOset m_I2;
    /** {i: m_class[i] = 1, m_alpha[i] = C} */
    protected SMOset m_I3;
    /** {i: m_class[i] = -1, m_alpha[i] = 0} */
    protected SMOset m_I4;

    /** The set of support vectors {i: 0 < m_alpha[i]} */
    protected SMOset m_supportVectors;

    /** Stores logistic regression model for probability estimate */
    protected Logistic m_logistic = null;

    /** Stores the weight of the training instances */
    protected double m_sumOfWeights = 0;

    /**
     * Fits logistic regression model to SVM outputs, analogous to John
     * Platt's method.
     * 
     * @param insts the set of training instances
     * @param cl1 the first class' index
     * @param cl2 the second class' index
     * @param numFolds the number of folds for cross-validation
     * @param random the random number generator for cross-validation
     * @throws Exception if the sigmoid can't be fit successfully
     */
    protected void fitLogistic(Instances insts, int cl1, int cl2,
      int numFolds, Random random) throws Exception {

      // Create header of instances object
      ArrayList<Attribute> atts = new ArrayList<Attribute>(2);
      atts.add(new Attribute("pred"));
      ArrayList<String> attVals = new ArrayList<String>(2);
      attVals.add(insts.classAttribute().value(cl1));
      attVals.add(insts.classAttribute().value(cl2));
      atts.add(new Attribute("class", attVals));
      Instances data = new Instances("data", atts, insts.numInstances());
      data.setClassIndex(1);

      // Collect data for fitting the logistic model
      if (numFolds <= 0) {

        // Use training data
        for (int j = 0; j < insts.numInstances(); j++) {
          Instance inst = insts.instance(j);
          double[] vals = new double[2];
          vals[0] = SVMOutput(-1, inst);
          if (inst.classValue() == cl2) {
            vals[1] = 1;
          }
          data.add(new DenseInstance(inst.weight(), vals));
        }
      } else {

        // Check whether number of folds too large
        if (numFolds > insts.numInstances()) {
          numFolds = insts.numInstances();
        }

        // Make copy of instances because we will shuffle them around
        insts = new Instances(insts);

        // Perform stratified cross-validation to collect
        // unbiased predictions
        insts.randomize(random);
        insts.stratify(numFolds);
        for (int i = 0; i < numFolds; i++) {
          Instances train = insts.trainCV(numFolds, i, random);
          SerializedObject so = new SerializedObject(this);
          BinaryMISMO smo = (BinaryMISMO) so.getObject();
          smo.buildClassifier(train, cl1, cl2, false, -1, -1);
          Instances test = insts.testCV(numFolds, i);
          for (int j = 0; j < test.numInstances(); j++) {
            double[] vals = new double[2];
            vals[0] = smo.SVMOutput(-1, test.instance(j));
            if (test.instance(j).classValue() == cl2) {
              vals[1] = 1;
            }
            data.add(new DenseInstance(test.instance(j).weight(), vals));
          }
        }
      }

      // Build logistic regression model
      m_logistic = new Logistic();
      m_logistic.buildClassifier(data);
    }

    /**
     * sets the kernel to use
     * 
     * @param value the kernel to use
     */
    public void setKernel(Kernel value) {
      m_kernel = value;
    }

    /**
     * Returns the kernel to use
     * 
     * @return the current kernel
     */
    public Kernel getKernel() {
      return m_kernel;
    }

    /**
     * Method for building the binary classifier.
     * 
     * @param insts the set of training instances
     * @param cl1 the first class' index
     * @param cl2 the second class' index
     * @param fitLogistic true if logistic model is to be fit
     * @param numFolds number of folds for internal cross-validation
     * @param randomSeed seed value for random number generator for
     *          cross-validation
     * @throws Exception if the classifier can't be built successfully
     */
    protected void buildClassifier(Instances insts, int cl1, int cl2,
      boolean fitLogistic, int numFolds, int randomSeed) throws Exception {

      // Initialize some variables
      m_bUp = -1;
      m_bLow = 1;
      m_b = 0;
      m_alpha = null;
      m_data = null;
      m_weights = null;
      m_errors = null;
      m_logistic = null;
      m_I0 = null;
      m_I1 = null;
      m_I2 = null;
      m_I3 = null;
      m_I4 = null;
      m_sparseWeights = null;
      m_sparseIndices = null;

      // Store the sum of weights
      m_sumOfWeights = insts.sumOfWeights();

      // Set class values
      m_class = new double[insts.numInstances()];
      m_iUp = -1;
      m_iLow = -1;
      for (int i = 0; i < m_class.length; i++) {
        if ((int) insts.instance(i).classValue() == cl1) {
          m_class[i] = -1;
          m_iLow = i;
        } else if ((int) insts.instance(i).classValue() == cl2) {
          m_class[i] = 1;
          m_iUp = i;
        } else {
          throw new Exception("This should never happen!");
        }
      }

      // Check whether one or both classes are missing
      if ((m_iUp == -1) || (m_iLow == -1)) {
        if (m_iUp != -1) {
          m_b = -1;
        } else if (m_iLow != -1) {
          m_b = 1;
        } else {
          m_class = null;
          return;
        }
        m_supportVectors = new SMOset(0);
        m_alpha = new double[0];
        m_class = new double[0];

        // Fit sigmoid if requested
        if (fitLogistic) {
          fitLogistic(insts, cl1, cl2, numFolds, new Random(randomSeed));
        }
        return;
      }

      // Set the reference to the data
      m_data = insts;
      m_weights = null;

      // Initialize alpha array to zero
      m_alpha = new double[m_data.numInstances()];

      // Initialize sets
      m_supportVectors = new SMOset(m_data.numInstances());
      m_I0 = new SMOset(m_data.numInstances());
      m_I1 = new SMOset(m_data.numInstances());
      m_I2 = new SMOset(m_data.numInstances());
      m_I3 = new SMOset(m_data.numInstances());
      m_I4 = new SMOset(m_data.numInstances());

      // Clean out some instance variables
      m_sparseWeights = null;
      m_sparseIndices = null;

      // Initialize error cache
      m_errors = new double[m_data.numInstances()];
      m_errors[m_iLow] = 1;
      m_errors[m_iUp] = -1;

      // Initialize kernel
      m_kernel.buildKernel(m_data);

      // Build up I1 and I4
      for (int i = 0; i < m_class.length; i++) {
        if (m_class[i] == 1) {
          m_I1.insert(i);
        } else {
          m_I4.insert(i);
        }
      }

      // Loop to find all the support vectors
      int numChanged = 0;
      boolean examineAll = true;
      while ((numChanged > 0) || examineAll) {
        numChanged = 0;
        if (examineAll) {
          for (int i = 0; i < m_alpha.length; i++) {
            if (examineExample(i)) {
              numChanged++;
            }
          }
        } else {

          // This code implements Modification 1 from Keerthi et al.'s paper
          for (int i = 0; i < m_alpha.length; i++) {
            if ((m_alpha[i] > 0)
              && (m_alpha[i] < m_C * m_data.instance(i).weight())) {
              if (examineExample(i)) {
                numChanged++;
              }

              // Is optimality on unbound vectors obtained?
              if (m_bUp > m_bLow - 2 * m_tol) {
                numChanged = 0;
                break;
              }
            }
          }

          // This is the code for Modification 2 from Keerthi et al.'s paper
          /*
           * boolean innerLoopSuccess = true; numChanged = 0; while ((m_bUp <
           * m_bLow - 2 * m_tol) && (innerLoopSuccess == true)) {
           * innerLoopSuccess = takeStep(m_iUp, m_iLow, m_errors[m_iLow]); }
           */
        }

        if (examineAll) {
          examineAll = false;
        } else if (numChanged == 0) {
          examineAll = true;
        }
      }

      // Set threshold
      m_b = (m_bLow + m_bUp) / 2.0;

      // Save memory
      m_kernel.clean();

      m_errors = null;
      m_I0 = m_I1 = m_I2 = m_I3 = m_I4 = null;

      // Fit sigmoid if requested
      if (fitLogistic) {
        fitLogistic(insts, cl1, cl2, numFolds, new Random(randomSeed));
      }
    }

    /**
     * Computes SVM output for given instance.
     * 
     * @param index the instance for which output is to be computed
     * @param inst the instance
     * @return the output of the SVM for the given instance
     * @throws Exception if something goes wrong
     */
    protected double SVMOutput(int index, Instance inst) throws Exception {

      double result = 0;

      for (int i = m_supportVectors.getNext(-1); i != -1; i = m_supportVectors
        .getNext(i)) {
        result += m_class[i] * m_alpha[i] * m_kernel.eval(index, i, inst);
      }
      result -= m_b;

      return result;
    }

    /**
     * Prints out the classifier.
     * 
     * @return a description of the classifier as a string
     */
    @Override
    public String toString() {

      StringBuffer text = new StringBuffer();
      int printed = 0;

      if ((m_alpha == null) && (m_sparseWeights == null)) {
        return "BinaryMISMO: No model built yet.\n";
      }
      try {
        text.append("BinaryMISMO\n\n");

        for (int i = 0; i < m_alpha.length; i++) {
          if (m_supportVectors.contains(i)) {
            double val = m_alpha[i];
            if (m_class[i] == 1) {
              if (printed > 0) {
                text.append(" + ");
              }
            } else {
              text.append(" - ");
            }
            text.append(Utils.doubleToString(val, 12, 4) + " * <");
            for (int j = 0; j < m_data.numAttributes(); j++) {
              if (j != m_data.classIndex()) {
                text.append(m_data.instance(i).toString(j));
              }
              if (j != m_data.numAttributes() - 1) {
                text.append(" ");
              }
            }
            text.append("> * X]\n");
            printed++;
          }
        }

        if (m_b > 0) {
          text.append(" - " + Utils.doubleToString(m_b, 12, 4));
        } else {
          text.append(" + " + Utils.doubleToString(-m_b, 12, 4));
        }

        text.append("\n\nNumber of support vectors: "
          + m_supportVectors.numElements());
        int numEval = 0;
        int numCacheHits = -1;
        if (m_kernel != null) {
          numEval = m_kernel.numEvals();
          numCacheHits = m_kernel.numCacheHits();
        }
        text.append("\n\nNumber of kernel evaluations: " + numEval);
        if (numCacheHits >= 0 && numEval > 0) {
          double hitRatio = 1 - numEval * 1.0 / (numCacheHits + numEval);
          text.append(" ("
            + Utils.doubleToString(hitRatio * 100, 7, 3).trim()
            + "% cached)");
        }
      } catch (Exception e) {
        e.printStackTrace();
        return "Can't print BinaryMISMO classifier.";
      }

      return text.toString();
    }

    /**
     * Examines instance.
     * 
     * @param i2 index of instance to examine
     * @return true if examination was successful
     * @throws Exception if something goes wrong
     */
    protected boolean examineExample(int i2) throws Exception {

      double y2, F2;
      int i1 = -1;

      y2 = m_class[i2];
      if (m_I0.contains(i2)) {
        F2 = m_errors[i2];
      } else {
        F2 = SVMOutput(i2, m_data.instance(i2)) + m_b - y2;
        m_errors[i2] = F2;

        // Update thresholds
        if ((m_I1.contains(i2) || m_I2.contains(i2)) && (F2 < m_bUp)) {
          m_bUp = F2;
          m_iUp = i2;
        } else if ((m_I3.contains(i2) || m_I4.contains(i2)) && (F2 > m_bLow)) {
          m_bLow = F2;
          m_iLow = i2;
        }
      }

      // Check optimality using current bLow and bUp and, if
      // violated, find an index i1 to do joint optimization
      // with i2...
      boolean optimal = true;
      if (m_I0.contains(i2) || m_I1.contains(i2) || m_I2.contains(i2)) {
        if (m_bLow - F2 > 2 * m_tol) {
          optimal = false;
          i1 = m_iLow;
        }
      }
      if (m_I0.contains(i2) || m_I3.contains(i2) || m_I4.contains(i2)) {
        if (F2 - m_bUp > 2 * m_tol) {
          optimal = false;
          i1 = m_iUp;
        }
      }
      if (optimal) {
        return false;
      }

      // For i2 unbound choose the better i1...
      if (m_I0.contains(i2)) {
        if (m_bLow - F2 > F2 - m_bUp) {
          i1 = m_iLow;
        } else {
          i1 = m_iUp;
        }
      }
      if (i1 == -1) {
        throw new Exception("This should never happen!");
      }
      return takeStep(i1, i2, F2);
    }

    /**
     * Method solving for the Lagrange multipliers for two instances.
     * 
     * @param i1 index of the first instance
     * @param i2 index of the second instance
     * @param F2 the error for the second instance
     * @return true if multipliers could be found
     * @throws Exception if something goes wrong
     */
    protected boolean takeStep(int i1, int i2, double F2) throws Exception {

      double alph1, alph2, y1, y2, F1, s, L, H, k11, k12, k22, eta, a1, a2,
        f1, f2, v1, v2, Lobj, Hobj;
      double C1 = m_C * m_data.instance(i1).weight();
      double C2 = m_C * m_data.instance(i2).weight();

      // Don't do anything if the two instances are the same
      if (i1 == i2) {
        return false;
      }

      // Initialize variables
      alph1 = m_alpha[i1];
      alph2 = m_alpha[i2];
      y1 = m_class[i1];
      y2 = m_class[i2];
      F1 = m_errors[i1];
      s = y1 * y2;

      // Find the constraints on a2
      if (y1 != y2) {
        L = Math.max(0, alph2 - alph1);
        H = Math.min(C2, C1 + alph2 - alph1);
      } else {
        L = Math.max(0, alph1 + alph2 - C1);
        H = Math.min(C2, alph1 + alph2);
      }
      if (L >= H) {
        return false;
      }

      // Compute second derivative of objective function
      k11 = m_kernel.eval(i1, i1, m_data.instance(i1));
      k12 = m_kernel.eval(i1, i2, m_data.instance(i1));
      k22 = m_kernel.eval(i2, i2, m_data.instance(i2));
      eta = 2 * k12 - k11 - k22;

      // Check if second derivative is negative
      if (eta < 0) {

        // Compute unconstrained maximum
        a2 = alph2 - y2 * (F1 - F2) / eta;

        // Compute constrained maximum
        if (a2 < L) {
          a2 = L;
        } else if (a2 > H) {
          a2 = H;
        }
      } else {

        // Look at endpoints of diagonal
        f1 = SVMOutput(i1, m_data.instance(i1));
        f2 = SVMOutput(i2, m_data.instance(i2));
        v1 = f1 + m_b - y1 * alph1 * k11 - y2 * alph2 * k12;
        v2 = f2 + m_b - y1 * alph1 * k12 - y2 * alph2 * k22;
        double gamma = alph1 + s * alph2;
        Lobj = (gamma - s * L) + L - 0.5 * k11 * (gamma - s * L)
          * (gamma - s * L) - 0.5 * k22 * L * L - s * k12 * (gamma - s * L)
          * L - y1 * (gamma - s * L) * v1 - y2 * L * v2;
        Hobj = (gamma - s * H) + H - 0.5 * k11 * (gamma - s * H)
          * (gamma - s * H) - 0.5 * k22 * H * H - s * k12 * (gamma - s * H)
          * H - y1 * (gamma - s * H) * v1 - y2 * H * v2;
        if (Lobj > Hobj + m_eps) {
          a2 = L;
        } else if (Lobj < Hobj - m_eps) {
          a2 = H;
        } else {
          a2 = alph2;
        }
      }
      if (Math.abs(a2 - alph2) < m_eps * (a2 + alph2 + m_eps)) {
        return false;
      }

      // To prevent precision problems
      if (a2 > C2 - m_Del * C2) {
        a2 = C2;
      } else if (a2 <= m_Del * C2) {
        a2 = 0;
      }

      // Recompute a1
      a1 = alph1 + s * (alph2 - a2);

      // To prevent precision problems
      if (a1 > C1 - m_Del * C1) {
        a1 = C1;
      } else if (a1 <= m_Del * C1) {
        a1 = 0;
      }

      // Update sets
      if (a1 > 0) {
        m_supportVectors.insert(i1);
      } else {
        m_supportVectors.delete(i1);
      }
      if ((a1 > 0) && (a1 < C1)) {
        m_I0.insert(i1);
      } else {
        m_I0.delete(i1);
      }
      if ((y1 == 1) && (a1 == 0)) {
        m_I1.insert(i1);
      } else {
        m_I1.delete(i1);
      }
      if ((y1 == -1) && (a1 == C1)) {
        m_I2.insert(i1);
      } else {
        m_I2.delete(i1);
      }
      if ((y1 == 1) && (a1 == C1)) {
        m_I3.insert(i1);
      } else {
        m_I3.delete(i1);
      }
      if ((y1 == -1) && (a1 == 0)) {
        m_I4.insert(i1);
      } else {
        m_I4.delete(i1);
      }
      if (a2 > 0) {
        m_supportVectors.insert(i2);
      } else {
        m_supportVectors.delete(i2);
      }
      if ((a2 > 0) && (a2 < C2)) {
        m_I0.insert(i2);
      } else {
        m_I0.delete(i2);
      }
      if ((y2 == 1) && (a2 == 0)) {
        m_I1.insert(i2);
      } else {
        m_I1.delete(i2);
      }
      if ((y2 == -1) && (a2 == C2)) {
        m_I2.insert(i2);
      } else {
        m_I2.delete(i2);
      }
      if ((y2 == 1) && (a2 == C2)) {
        m_I3.insert(i2);
      } else {
        m_I3.delete(i2);
      }
      if ((y2 == -1) && (a2 == 0)) {
        m_I4.insert(i2);
      } else {
        m_I4.delete(i2);
      }

      // Update error cache using new Lagrange multipliers
      for (int j = m_I0.getNext(-1); j != -1; j = m_I0.getNext(j)) {
        if ((j != i1) && (j != i2)) {
          m_errors[j] += y1 * (a1 - alph1)
            * m_kernel.eval(i1, j, m_data.instance(i1)) + y2 * (a2 - alph2)
            * m_kernel.eval(i2, j, m_data.instance(i2));
        }
      }

      // Update error cache for i1 and i2
      m_errors[i1] += y1 * (a1 - alph1) * k11 + y2 * (a2 - alph2) * k12;
      m_errors[i2] += y1 * (a1 - alph1) * k12 + y2 * (a2 - alph2) * k22;

      // Update array with Lagrange multipliers
      m_alpha[i1] = a1;
      m_alpha[i2] = a2;

      // Update thresholds
      m_bLow = -Double.MAX_VALUE;
      m_bUp = Double.MAX_VALUE;
      m_iLow = -1;
      m_iUp = -1;
      for (int j = m_I0.getNext(-1); j != -1; j = m_I0.getNext(j)) {
        if (m_errors[j] < m_bUp) {
          m_bUp = m_errors[j];
          m_iUp = j;
        }
        if (m_errors[j] > m_bLow) {
          m_bLow = m_errors[j];
          m_iLow = j;
        }
      }
      if (!m_I0.contains(i1)) {
        if (m_I3.contains(i1) || m_I4.contains(i1)) {
          if (m_errors[i1] > m_bLow) {
            m_bLow = m_errors[i1];
            m_iLow = i1;
          }
        } else {
          if (m_errors[i1] < m_bUp) {
            m_bUp = m_errors[i1];
            m_iUp = i1;
          }
        }
      }
      if (!m_I0.contains(i2)) {
        if (m_I3.contains(i2) || m_I4.contains(i2)) {
          if (m_errors[i2] > m_bLow) {
            m_bLow = m_errors[i2];
            m_iLow = i2;
          }
        } else {
          if (m_errors[i2] < m_bUp) {
            m_bUp = m_errors[i2];
            m_iUp = i2;
          }
        }
      }
      if ((m_iLow == -1) || (m_iUp == -1)) {
        throw new Exception("This should never happen!");
      }

      // Made some progress.
      return true;
    }

    /**
     * Quick and dirty check whether the quadratic programming problem is
     * solved.
     * 
     * @throws Exception if something goes wrong
     */
    protected void checkClassifier() throws Exception {

      double sum = 0;
      for (int i = 0; i < m_alpha.length; i++) {
        if (m_alpha[i] > 0) {
          sum += m_class[i] * m_alpha[i];
        }
      }
      System.err.println("Sum of y(i) * alpha(i): " + sum);

      for (int i = 0; i < m_alpha.length; i++) {
        double output = SVMOutput(i, m_data.instance(i));
        if (Utils.eq(m_alpha[i], 0)) {
          if (Utils.sm(m_class[i] * output, 1)) {
            System.err.println("KKT condition 1 violated: " + m_class[i]
              * output);
          }
        }
        if (Utils.gr(m_alpha[i], 0)
          && Utils.sm(m_alpha[i], m_C * m_data.instance(i).weight())) {
          if (!Utils.eq(m_class[i] * output, 1)) {
            System.err.println("KKT condition 2 violated: " + m_class[i]
              * output);
          }
        }
        if (Utils.eq(m_alpha[i], m_C * m_data.instance(i).weight())) {
          if (Utils.gr(m_class[i] * output, 1)) {
            System.err.println("KKT condition 3 violated: " + m_class[i]
              * output);
          }
        }
      }
    }

    /**
     * Returns the revision string.
     * 
     * @return the revision
     */
    @Override
    public String getRevision() {
      return RevisionUtils.extract("$Revision: 12560 $");
    }
  }

  /** Normalize training data */
  public static final int FILTER_NORMALIZE = 0;

  /** Standardize training data */
  public static final int FILTER_STANDARDIZE = 1;

  /** No normalization/standardization */
  public static final int FILTER_NONE = 2;

  /** The filter to apply to the training data */
  public static final Tag[] TAGS_FILTER = {
    new Tag(FILTER_NORMALIZE, "Normalize training data"),
    new Tag(FILTER_STANDARDIZE, "Standardize training data"),
    new Tag(FILTER_NONE, "No normalization/standardization"), };

  /** The binary classifier(s) */
  protected BinaryMISMO[][] m_classifiers = null;

  /** The complexity parameter. */
  protected double m_C = 1.0;

  /** Epsilon for rounding. */
  protected double m_eps = 1.0e-12;

  /** Tolerance for accuracy of result. */
  protected double m_tol = 1.0e-3;

  /** Whether to normalize/standardize/neither */
  protected int m_filterType = FILTER_NORMALIZE;

  /** Use MIMinimax feature space? */
  protected boolean m_minimax = false;

  /** The filter used to make attributes numeric. */
  protected NominalToBinary m_NominalToBinary;

  /** The filter used to standardize/normalize all values. */
  protected Filter m_Filter = null;

  /** The filter used to get rid of missing values. */
  protected ReplaceMissingValues m_Missing;

  /** The class index from the training data */
  protected int m_classIndex = -1;

  /** The class attribute */
  protected Attribute m_classAttribute;

  /** Kernel to use **/
  protected Kernel m_kernel = new MIPolyKernel();

  /**
   * Turn off all checks and conversions? Turning them off assumes that data is
   * purely numeric, doesn't contain any missing values, and has a nominal
   * class. Turning them off also means that no header information will be
   * stored if the machine is linear. Finally, it also assumes that no instance
   * has a weight equal to 0.
   */
  protected boolean m_checksTurnedOff;

  /** Precision constant for updating sets */
  protected static double m_Del = 1000 * Double.MIN_VALUE;

  /** Whether logistic models are to be fit */
  protected boolean m_fitLogisticModels = false;

  /** The number of folds for the internal cross-validation */
  protected int m_numFolds = -1;

  /** The random number seed */
  protected int m_randomSeed = 1;

  /**
   * Turns off checks for missing values, etc. Use with caution.
   */
  public void turnChecksOff() {
    m_checksTurnedOff = true;
  }

  /**
   * Turns on checks for missing values, etc.
   */
  public void turnChecksOn() {
    m_checksTurnedOff = false;
  }

  /**
   * Returns default capabilities of the classifier.
   * 
   * @return the capabilities of this classifier
   */
  @Override
  public Capabilities getCapabilities() {
    Capabilities result = getKernel().getCapabilities();
    result.setOwner(this);

    // attributes
    result.enable(Capability.NOMINAL_ATTRIBUTES);
    result.enable(Capability.RELATIONAL_ATTRIBUTES);

    // class
    result.disableAllClasses();
    result.disableAllClassDependencies();
    result.disable(Capability.NO_CLASS);
    result.enable(Capability.NOMINAL_CLASS);
    result.enable(Capability.MISSING_CLASS_VALUES);

    // other
    result.enable(Capability.ONLY_MULTIINSTANCE);

    return result;
  }

  /**
   * Returns the capabilities of this multi-instance classifier for the
   * relational data.
   * 
   * @return the capabilities of this object
   * @see Capabilities
   */
  @Override
  public Capabilities getMultiInstanceCapabilities() {
    Capabilities result = ((MultiInstanceCapabilitiesHandler) getKernel())
      .getMultiInstanceCapabilities();
    result.setOwner(this);

    // attribute
    result.enableAllAttributeDependencies();
    // with NominalToBinary we can also handle nominal attributes, but only
    // if the kernel can handle numeric attributes
    if (result.handles(Capability.NUMERIC_ATTRIBUTES)) {
      result.enable(Capability.NOMINAL_ATTRIBUTES);
    }
    result.enable(Capability.MISSING_VALUES);

    return result;
  }

  /**
   * Method for building the classifier. Implements a one-against-one wrapper
   * for multi-class problems.
   * 
   * @param insts the set of training instances
   * @throws Exception if the classifier can't be built successfully
   */
  @Override
  public void buildClassifier(Instances insts) throws Exception {
    if (!m_checksTurnedOff) {
      // can classifier handle the data?
      getCapabilities().testWithFail(insts);

      // remove instances with missing class
      insts = new Instances(insts);
      insts.deleteWithMissingClass();

      /*
       * Removes all the instances with weight equal to 0. MUST be done since
       * condition (8) of Keerthi's paper is made with the assertion Ci > 0
       * (See equation (3a)).
       */
      Instances data = new Instances(insts, insts.numInstances());
      for (int i = 0; i < insts.numInstances(); i++) {
        if (insts.instance(i).weight() > 0) {
          data.add(insts.instance(i));
        }
      }
      if (data.numInstances() == 0) {
        throw new Exception("No training instances left after removing "
          + "instances with either a zero weight or a missing class!");
      }
      insts = data;
    }

    // filter data
    if (!m_checksTurnedOff) {
      m_Missing = new ReplaceMissingValues();
    } else {
      m_Missing = null;
    }

    if (getCapabilities().handles(Capability.NUMERIC_ATTRIBUTES)) {
      boolean onlyNumeric = true;
      if (!m_checksTurnedOff) {
        for (int i = 0; i < insts.numAttributes(); i++) {
          if (i != insts.classIndex()) {
            if (!insts.attribute(i).isNumeric()) {
              onlyNumeric = false;
              break;
            }
          }
        }
      }

      if (!onlyNumeric) {
        m_NominalToBinary = new NominalToBinary();
        // exclude the bag attribute
        m_NominalToBinary.setAttributeIndices("2-last");
      } else {
        m_NominalToBinary = null;
      }
    } else {
      m_NominalToBinary = null;
    }

    if (m_filterType == FILTER_STANDARDIZE) {
      m_Filter = new Standardize();
    } else if (m_filterType == FILTER_NORMALIZE) {
      m_Filter = new Normalize();
    } else {
      m_Filter = null;
    }

    Instances transformedInsts;
    Filter convertToProp = new MultiInstanceToPropositional();
    Filter convertToMI = new PropositionalToMultiInstance();

    // transform the data into single-instance format
    if (m_minimax) {
      /*
       * using SimpleMI class minimax transform method. this method transforms
       * the multi-instance dataset into minimax feature space
       * (single-instance)
       */
      SimpleMI transMinimax = new SimpleMI();
      transMinimax.setTransformMethod(new SelectedTag(
        SimpleMI.TRANSFORMMETHOD_MINIMAX, SimpleMI.TAGS_TRANSFORMMETHOD));
      transformedInsts = transMinimax.transform(insts);
    } else {
      convertToProp.setInputFormat(insts);
      transformedInsts = Filter.useFilter(insts, convertToProp);
    }

    if (m_Missing != null) {
      m_Missing.setInputFormat(transformedInsts);
      transformedInsts = Filter.useFilter(transformedInsts, m_Missing);
    }

    if (m_NominalToBinary != null) {
      m_NominalToBinary.setInputFormat(transformedInsts);
      transformedInsts = Filter.useFilter(transformedInsts, m_NominalToBinary);
    }

    if (m_Filter != null) {
      m_Filter.setInputFormat(transformedInsts);
      transformedInsts = Filter.useFilter(transformedInsts, m_Filter);
    }

    // convert the single-instance format to multi-instance format
    convertToMI.setInputFormat(transformedInsts);
    insts = Filter.useFilter(transformedInsts, convertToMI);

    m_classIndex = insts.classIndex();
    m_classAttribute = insts.classAttribute();

    // Generate subsets representing each class
    Instances[] subsets = new Instances[insts.numClasses()];
    for (int i = 0; i < insts.numClasses(); i++) {
      subsets[i] = new Instances(insts, insts.numInstances());
    }
    for (int j = 0; j < insts.numInstances(); j++) {
      Instance inst = insts.instance(j);
      subsets[(int) inst.classValue()].add(inst);
    }
    for (int i = 0; i < insts.numClasses(); i++) {
      subsets[i].compactify();
    }

    // Build the binary classifiers
    Random rand = new Random(m_randomSeed);
    m_classifiers = new BinaryMISMO[insts.numClasses()][insts.numClasses()];
    for (int i = 0; i < insts.numClasses(); i++) {
      for (int j = i + 1; j < insts.numClasses(); j++) {
        m_classifiers[i][j] = new BinaryMISMO();
        m_classifiers[i][j].setKernel(Kernel.makeCopy(getKernel()));
        Instances data = new Instances(insts, insts.numInstances());
        for (int k = 0; k < subsets[i].numInstances(); k++) {
          data.add(subsets[i].instance(k));
        }
        for (int k = 0; k < subsets[j].numInstances(); k++) {
          data.add(subsets[j].instance(k));
        }
        data.compactify();
        data.randomize(rand);
        m_classifiers[i][j].buildClassifier(data, i, j, m_fitLogisticModels,
          m_numFolds, m_randomSeed);
      }
    }
  }

  /**
   * Estimates class probabilities for given instance.
   * 
   * @param inst the instance to compute the distribution for
   * @return the class probabilities
   * @throws Exception if computation fails
   */
  @Override
  public double[] distributionForInstance(Instance inst) throws Exception {

    // convert instance into instances
    Instances insts = new Instances(inst.dataset(), 0);
    insts.add(inst);

    // transform the data into single-instance format
    Filter convertToProp = new MultiInstanceToPropositional();
    Filter convertToMI = new PropositionalToMultiInstance();

    if (m_minimax) { // using minimax feature space
      SimpleMI transMinimax = new SimpleMI();
      transMinimax.setTransformMethod(new SelectedTag(
        SimpleMI.TRANSFORMMETHOD_MINIMAX, SimpleMI.TAGS_TRANSFORMMETHOD));
      insts = transMinimax.transform(insts);
    } else {
      convertToProp.setInputFormat(insts);
      insts = Filter.useFilter(insts, convertToProp);
    }

    // Filter instances
    if (m_Missing != null) {
      insts = Filter.useFilter(insts, m_Missing);
    }

    if (m_NominalToBinary != null) {
      insts = Filter.useFilter(insts, m_NominalToBinary);
    }

    if (m_Filter != null) {
      insts = Filter.useFilter(insts, m_Filter);
    }

    // convert the single-instance format to multi-instance format
    convertToMI.setInputFormat(insts);
    insts = Filter.useFilter(insts, convertToMI);

    inst = insts.instance(0);

    if (!m_fitLogisticModels) {
      double[] result = new double[inst.numClasses()];
      for (int i = 0; i < inst.numClasses(); i++) {
        for (int j = i + 1; j < inst.numClasses(); j++) {
          if ((m_classifiers[i][j].m_alpha != null)
            || (m_classifiers[i][j].m_sparseWeights != null)) {
            double output = m_classifiers[i][j].SVMOutput(-1, inst);
            if (output > 0) {
              result[j] += 1;
            } else {
              result[i] += 1;
            }
          }
        }
      }
      Utils.normalize(result);
      return result;
    } else {

      // We only need to do pairwise coupling if there are more
      // than two classes.
      if (inst.numClasses() == 2) {
        double[] newInst = new double[2];
        newInst[0] = m_classifiers[0][1].SVMOutput(-1, inst);
        newInst[1] = Utils.missingValue();
        return m_classifiers[0][1].m_logistic
          .distributionForInstance(new DenseInstance(1, newInst));
      }
      double[][] r = new double[inst.numClasses()][inst.numClasses()];
      double[][] n = new double[inst.numClasses()][inst.numClasses()];
      for (int i = 0; i < inst.numClasses(); i++) {
        for (int j = i + 1; j < inst.numClasses(); j++) {
          if ((m_classifiers[i][j].m_alpha != null)
            || (m_classifiers[i][j].m_sparseWeights != null)) {
            double[] newInst = new double[2];
            newInst[0] = m_classifiers[i][j].SVMOutput(-1, inst);
            newInst[1] = Utils.missingValue();
            r[i][j] = m_classifiers[i][j].m_logistic
              .distributionForInstance(new DenseInstance(1, newInst))[0];
            n[i][j] = m_classifiers[i][j].m_sumOfWeights;
          }
        }
      }

      return pairwiseCoupling(n, r);
    }
  }

  /**
   * Implements pairwise coupling.
   * 
   * @param n the sum of weights used to train each model
   * @param r the probability estimate from each model
   * @return the coupled estimates
   */
  public double[] pairwiseCoupling(double[][] n, double[][] r) {

    // Initialize p and u array
    double[] p = new double[r.length];
    for (int i = 0; i < p.length; i++) {
      p[i] = 1.0 / p.length;
    }
    double[][] u = new double[r.length][r.length];
    for (int i = 0; i < r.length; i++) {
      for (int j = i + 1; j < r.length; j++) {
        u[i][j] = 0.5;
      }
    }

    // firstSum doesn't change
    double[] firstSum = new double[p.length];
    for (int i = 0; i < p.length; i++) {
      for (int j = i + 1; j < p.length; j++) {
        firstSum[i] += n[i][j] * r[i][j];
        firstSum[j] += n[i][j] * (1 - r[i][j]);
      }
    }

    // Iterate until convergence
    boolean changed;
    do {
      changed = false;
      double[] secondSum = new double[p.length];
      for (int i = 0; i < p.length; i++) {
        for (int j = i + 1; j < p.length; j++) {
          secondSum[i] += n[i][j] * u[i][j];
          secondSum[j] += n[i][j] * (1 - u[i][j]);
        }
      }
      for (int i = 0; i < p.length; i++) {
        if ((firstSum[i] == 0) || (secondSum[i] == 0)) {
          if (p[i] > 0) {
            changed = true;
          }
          p[i] = 0;
        } else {
          double factor = firstSum[i] / secondSum[i];
          double pOld = p[i];
          p[i] *= factor;
          if (Math.abs(pOld - p[i]) > 1.0e-3) {
            changed = true;
          }
        }
      }
      Utils.normalize(p);
      for (int i = 0; i < r.length; i++) {
        for (int j = i + 1; j < r.length; j++) {
          u[i][j] = p[i] / (p[i] + p[j]);
        }
      }
    } while (changed);

    return p;
  }

  /**
   * Returns the weights in sparse format.
   * 
   * @return the weights in sparse format
   */
  public double[][][] sparseWeights() {

    int numValues = m_classAttribute.numValues();
    double[][][] sparseWeights = new double[numValues][numValues][];

    for (int i = 0; i < numValues; i++) {
      for (int j = i + 1; j < numValues; j++) {
        sparseWeights[i][j] = m_classifiers[i][j].m_sparseWeights;
      }
    }

    return sparseWeights;
  }

  /**
   * Returns the indices in sparse format.
   * 
   * @return the indices in sparse format
   */
  public int[][][] sparseIndices() {

    int numValues = m_classAttribute.numValues();
    int[][][] sparseIndices = new int[numValues][numValues][];

    for (int i = 0; i < numValues; i++) {
      for (int j = i + 1; j < numValues; j++) {
        sparseIndices[i][j] = m_classifiers[i][j].m_sparseIndices;
      }
    }

    return sparseIndices;
  }

  /**
   * Returns the bias of each binary SMO.
   * 
   * @return the bias of each binary SMO
   */
  public double[][] bias() {

    int numValues = m_classAttribute.numValues();
    double[][] bias = new double[numValues][numValues];

    for (int i = 0; i < numValues; i++) {
      for (int j = i + 1; j < numValues; j++) {
        bias[i][j] = m_classifiers[i][j].m_b;
      }
    }

    return bias;
  }

  /**
   * Returns the number of values of the class attribute.
   * 
   * @return the number of values of the class attribute
   */
  public int numClassAttributeValues() {
    return m_classAttribute.numValues();
  }

  /**
   * Returns the names of the class attributes.
   * 
   * @return the names of the class attributes
   */
  public String[] classAttributeNames() {
    int numValues = m_classAttribute.numValues();

    String[] classAttributeNames = new String[numValues];

    for (int i = 0; i < numValues; i++) {
      classAttributeNames[i] = m_classAttribute.value(i);
    }

    return classAttributeNames;
  }

  /**
   * Returns the attribute names.
   * 
   * @return the attribute names
   */
  public String[][][] attributeNames() {

    int numValues = m_classAttribute.numValues();
    String[][][] attributeNames = new String[numValues][numValues][];

    for (int i = 0; i < numValues; i++) {
      for (int j = i + 1; j < numValues; j++) {
        int numAttributes = m_classifiers[i][j].m_data.numAttributes();
        String[] attrNames = new String[numAttributes];
        for (int k = 0; k < numAttributes; k++) {
          attrNames[k] = m_classifiers[i][j].m_data.attribute(k).name();
        }
        attributeNames[i][j] = attrNames;
      }
    }

    return attributeNames;
  }

  /**
   * Returns an enumeration describing the available options.
   * 
   * @return an enumeration of all the available options.
   */
  @Override
  public Enumeration
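
A minimal sketch of using this classifier from Java code (the ARFF path and the example class name are hypothetical; MISMO expects multi-instance data, i.e. a bag-id attribute, a relational attribute holding each bag's instances, and a nominal class):

import weka.classifiers.mi.MISMO;
import weka.core.Instances;
import weka.core.Utils;
import weka.core.converters.ConverterUtils.DataSource;

public class MISMOExample {
  public static void main(String[] args) throws Exception {
    // Load a multi-instance dataset (hypothetical path).
    Instances data = DataSource.read("/path/to/multi-instance.arff");
    data.setClassIndex(data.numAttributes() - 1);

    // Configure the classifier: complexity constant 1.0, and -M fits
    // logistic models to the SVM outputs for proper probability estimates.
    MISMO classifier = new MISMO();
    classifier.setOptions(Utils.splitOptions("-C 1.0 -M"));

    // Train on all bags and print the per-pair binary machines.
    classifier.buildClassifier(data);
    System.out.println(classifier);

    // Class distribution for the first bag.
    double[] dist = classifier.distributionForInstance(data.instance(0));
    for (int i = 0; i < dist.length; i++) {
      System.out.println(data.classAttribute().value(i) + ": " + dist[i]);
    }
  }
}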