Please wait. This can take some minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price only 1 $
You can buy this project and download/modify it how often you want.
weka.classifiers.rules.JRip Maven / Gradle / Ivy
/*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see .
*/
/*
* JRip.java
* Copyright (C) 2001-2012 University of Waikato, Hamilton, New Zealand
*/
package weka.classifiers.rules;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;
import weka.classifiers.AbstractClassifier;
import weka.core.AdditionalMeasureProducer;
import weka.core.Attribute;
import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.Copyable;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.RevisionHandler;
import weka.core.RevisionUtils;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.core.WeightedInstancesHandler;
import weka.filters.Filter;
import weka.filters.supervised.attribute.ClassOrder;
/**
* This class implements a propositional rule learner,
* Repeated Incremental Pruning to Produce Error Reduction (RIPPER), which was
* proposed by William W. Cohen as an optimized version of IREP.
*
* The algorithm is briefly described as follows:
*
* Initialize RS = {}, and for each class from the less prevalent one to the
* more frequent one, DO:
*
* 1. Building stage:
* Repeat 1.1 and 1.2 until the descrition length (DL) of the ruleset and
* examples is 64 bits greater than the smallest DL met so far, or there are no
* positive examples, or the error rate >= 50%.
*
* 1.1. Grow phase:
* Grow one rule by greedily adding antecedents (or conditions) to the rule
* until the rule is perfect (i.e. 100% accurate). The procedure tries every
* possible value of each attribute and selects the condition with highest
* information gain: p(log(p/t)-log(P/T)).
*
* 1.2. Prune phase:
* Incrementally prune each rule and allow the pruning of any final sequences of
* the antecedents;The pruning metric is (p-n)/(p+n) -- but it's actually
* 2p/(p+n) -1, so in this implementation we simply use p/(p+n) (actually
* (p+1)/(p+n+2), thus if p+n is 0, it's 0.5).
*
* 2. Optimization stage:
* after generating the initial ruleset {Ri}, generate and prune two variants of
* each rule Ri from randomized data using procedure 1.1 and 1.2. But one
* variant is generated from an empty rule while the other is generated by
* greedily adding antecedents to the original rule. Moreover, the pruning
* metric used here is (TP+TN)/(P+N).Then the smallest possible DL for each
* variant and the original rule is computed. The variant with the minimal DL is
* selected as the final representative of Ri in the ruleset.After all the rules
* in {Ri} have been examined and if there are still residual positives, more
* rules are generated based on the residual positives using Building Stage
* again.
* 3. Delete the rules from the ruleset that would increase the DL of the whole
* ruleset if it were in it. and add resultant ruleset to RS.
* ENDDO
*
* Note that there seem to be 2 bugs in the original ripper program that would
* affect the ruleset size and accuracy slightly. This implementation avoids
* these bugs and thus is a little bit different from Cohen's original
* implementation. Even after fixing the bugs, since the order of classes with
* the same frequency is not defined in ripper, there still seems to be some
* trivial difference between this implementation and the original ripper,
* especially for audiology data in UCI repository, where there are lots of
* classes of few instances.
*
* Details please see:
*
* William W. Cohen: Fast Effective Rule Induction. In: Twelfth International
* Conference on Machine Learning, 115-123, 1995.
*
* PS. We have compared this implementation with the original ripper
* implementation in aspects of accuracy, ruleset size and running time on both
* artificial data "ab+bcd+defg" and UCI datasets. In all these aspects it seems
* to be quite comparable to the original ripper implementation. However, we
* didn't consider memory consumption optimization in this implementation.
*
*
*
*
* BibTeX:
*
*
* @inproceedings{Cohen1995,
* author = {William W. Cohen},
* booktitle = {Twelfth International Conference on Machine Learning},
* pages = {115-123},
* publisher = {Morgan Kaufmann},
* title = {Fast Effective Rule Induction},
* year = {1995}
* }
*
*
*
*
* Valid options are:
*
*
*
* -F <number of folds>
* Set number of folds for REP
* One fold is used as pruning set.
* (default 3)
*
*
*
* -N <min. weights>
* Set the minimal weights of instances
* within a split.
* (default 2.0)
*
*
*
* -O <number of runs>
* Set the number of runs of
* optimizations. (Default: 2)
*
*
*
* -D
* Set whether turn on the
* debug mode (Default: false)
*
*
*
* -S <seed>
* The seed of randomization
* (Default: 1)
*
*
*
* -E
* Whether NOT check the error rate>=0.5
* in stopping criteria (default: check)
*
*
*
* -P
* Whether NOT use pruning
* (default: use pruning)
*
*
*
*
* @author Xin Xu ([email protected] )
* @author Eibe Frank ([email protected] )
* @version $Revision: 13408 $
*/
public class JRip extends AbstractClassifier implements
AdditionalMeasureProducer, WeightedInstancesHandler,
TechnicalInformationHandler {
/** for serialization */
static final long serialVersionUID = -6589312996832147161L;
/** The limit of description length surplus in ruleset generation */
private static double MAX_DL_SURPLUS = 64.0;
/** The class attribute of the data */
private Attribute m_Class;
/** The ruleset */
private ArrayList m_Ruleset;
/** The predicted class distribution */
private ArrayList m_Distributions;
/** Runs of optimizations */
private int m_Optimizations = 2;
/** Random object used in this class */
private Random m_Random = null;
/** # of all the possible conditions in a rule */
private double m_Total = 0;
/** The seed to perform randomization */
private long m_Seed = 1;
/** The number of folds to split data into Grow and Prune for IREP */
private int m_Folds = 3;
/** The minimal number of instance weights within a split */
private double m_MinNo = 2.0;
/** Whether in a debug mode */
private boolean m_Debug = false;
/** Whether check the error rate >= 0.5 in stopping criteria */
private boolean m_CheckErr = true;
/** Whether use pruning, i.e. the data is clean or not */
private boolean m_UsePruning = true;
/** The filter used to randomize the class order */
private Filter m_Filter = null;
/** The RuleStats for the ruleset of each class value */
private ArrayList m_RulesetStats;
/**
* Returns a string describing classifier
*
* @return a description suitable for displaying in the explorer/experimenter
* gui
*/
public String globalInfo() {
return "This class implements a propositional rule learner, Repeated Incremental "
+ "Pruning to Produce Error Reduction (RIPPER), which was proposed by William "
+ "W. Cohen as an optimized version of IREP. \n\n"
+ "The algorithm is briefly described as follows: \n\n"
+ "Initialize RS = {}, and for each class from the less prevalent one to "
+ "the more frequent one, DO: \n\n"
+ "1. Building stage:\nRepeat 1.1 and 1.2 until the descrition length (DL) "
+ "of the ruleset and examples is 64 bits greater than the smallest DL "
+ "met so far, or there are no positive examples, or the error rate >= 50%. "
+ "\n\n"
+ "1.1. Grow phase:\n"
+ "Grow one rule by greedily adding antecedents (or conditions) to "
+ "the rule until the rule is perfect (i.e. 100% accurate). The "
+ "procedure tries every possible value of each attribute and selects "
+ "the condition with highest information gain: p(log(p/t)-log(P/T))."
+ "\n\n"
+ "1.2. Prune phase:\n"
+ "Incrementally prune each rule and allow the pruning of any "
+ "final sequences of the antecedents;"
+ "The pruning metric is (p-n)/(p+n) -- but it's actually "
+ "2p/(p+n) -1, so in this implementation we simply use p/(p+n) "
+ "(actually (p+1)/(p+n+2), thus if p+n is 0, it's 0.5).\n\n"
+ "2. Optimization stage:\n after generating the initial ruleset {Ri}, "
+ "generate and prune two variants of each rule Ri from randomized data "
+ "using procedure 1.1 and 1.2. But one variant is generated from an "
+ "empty rule while the other is generated by greedily adding antecedents "
+ "to the original rule. Moreover, the pruning metric used here is "
+ "(TP+TN)/(P+N)."
+ "Then the smallest possible DL for each variant and the original rule "
+ "is computed. The variant with the minimal DL is selected as the final "
+ "representative of Ri in the ruleset."
+ "After all the rules in {Ri} have been examined and if there are still "
+ "residual positives, more rules are generated based on the residual "
+ "positives using Building Stage again. \n"
+ "3. Delete the rules from the ruleset that would increase the DL of the "
+ "whole ruleset if it were in it. and add resultant ruleset to RS. \n"
+ "ENDDO\n\n"
+ "Note that there seem to be 2 bugs in the original ripper program that would "
+ "affect the ruleset size and accuracy slightly. This implementation avoids "
+ "these bugs and thus is a little bit different from Cohen's original "
+ "implementation. Even after fixing the bugs, since the order of classes with "
+ "the same frequency is not defined in ripper, there still seems to be "
+ "some trivial difference between this implementation and the original ripper, "
+ "especially for audiology data in UCI repository, where there are lots of "
+ "classes of few instances.\n\n"
+ "Details please see:\n\n"
+ getTechnicalInformation().toString()
+ "\n\n"
+ "PS. We have compared this implementation with the original ripper "
+ "implementation in aspects of accuracy, ruleset size and running time "
+ "on both artificial data \"ab+bcd+defg\" and UCI datasets. In all these "
+ "aspects it seems to be quite comparable to the original ripper "
+ "implementation. However, we didn't consider memory consumption "
+ "optimization in this implementation.\n\n";
}
/**
* Returns an instance of a TechnicalInformation object, containing detailed
* information about the technical background of this class, e.g., paper
* reference or book this class is based on.
*
* @return the technical information about this class
*/
@Override
public TechnicalInformation getTechnicalInformation() {
TechnicalInformation result;
result = new TechnicalInformation(Type.INPROCEEDINGS);
result.setValue(Field.AUTHOR, "William W. Cohen");
result.setValue(Field.TITLE, "Fast Effective Rule Induction");
result.setValue(Field.BOOKTITLE,
"Twelfth International Conference on Machine Learning");
result.setValue(Field.YEAR, "1995");
result.setValue(Field.PAGES, "115-123");
result.setValue(Field.PUBLISHER, "Morgan Kaufmann");
return result;
}
/**
* Returns an enumeration describing the available options Valid options are:
*
*
* -F number
* The number of folds for reduced error pruning. One fold is used as the
* pruning set. (Default: 3)
*
*
* -N number
* The minimal weights of instances within a split. (Default: 2)
*
*
* -O number
* Set the number of runs of optimizations. (Default: 2)
*
*
* -D
* Whether turn on the debug mode
*
* -S number
* The seed of randomization used in Ripper.(Default: 1)
*
*
* -E
* Whether NOT check the error rate >= 0.5 in stopping criteria. (default:
* check)
*
*
* -P
* Whether NOT use pruning. (default: use pruning)
*
*
* @return an enumeration of all the available options
*/
@Override
public Enumeration listOptions() {
Vector newVector = new Vector (7);
newVector.add(new Option("\tSet number of folds for REP\n"
+ "\tOne fold is used as pruning set.\n" + "\t(default 3)", "F", 1,
"-F "));
newVector
.add(new Option("\tSet the minimal weights of instances\n"
+ "\twithin a split.\n" + "\t(default 2.0)", "N", 1,
"-N "));
newVector.add(new Option("\tSet the number of runs of\n"
+ "\toptimizations. (Default: 2)", "O", 1, "-O "));
newVector.add(new Option("\tSet whether turn on the\n"
+ "\tdebug mode (Default: false)", "D", 0, "-D"));
newVector.add(new Option(
"\tThe seed of randomization\n" + "\t(Default: 1)", "S", 1, "-S "));
newVector.add(new Option("\tWhether NOT check the error rate>=0.5\n"
+ "\tin stopping criteria " + "\t(default: check)", "E", 0, "-E"));
newVector.add(new Option("\tWhether NOT use pruning\n"
+ "\t(default: use pruning)", "P", 0, "-P"));
newVector.addAll(Collections.list(super.listOptions()));
return newVector.elements();
}
/**
* Parses a given list of options.
*
*
* Valid options are:
*
*
*
* -F <number of folds>
* Set number of folds for REP
* One fold is used as pruning set.
* (default 3)
*
*
*
* -N <min. weights>
* Set the minimal weights of instances
* within a split.
* (default 2.0)
*
*
*
* -O <number of runs>
* Set the number of runs of
* optimizations. (Default: 2)
*
*
*
* -D
* Set whether turn on the
* debug mode (Default: false)
*
*
*
* -S <seed>
* The seed of randomization
* (Default: 1)
*
*
*
* -E
* Whether NOT check the error rate>=0.5
* in stopping criteria (default: check)
*
*
*
* -P
* Whether NOT use pruning
* (default: use pruning)
*
*
*
*
* @param options the list of options as an array of strings
* @throws Exception if an option is not supported
*/
@Override
public void setOptions(String[] options) throws Exception {
String numFoldsString = Utils.getOption('F', options);
if (numFoldsString.length() != 0) {
m_Folds = Integer.parseInt(numFoldsString);
} else {
m_Folds = 3;
}
String minNoString = Utils.getOption('N', options);
if (minNoString.length() != 0) {
m_MinNo = Double.parseDouble(minNoString);
} else {
m_MinNo = 2.0;
}
String seedString = Utils.getOption('S', options);
if (seedString.length() != 0) {
m_Seed = Long.parseLong(seedString);
} else {
m_Seed = 1;
}
String runString = Utils.getOption('O', options);
if (runString.length() != 0) {
m_Optimizations = Integer.parseInt(runString);
} else {
m_Optimizations = 2;
}
m_Debug = Utils.getFlag('D', options);
m_CheckErr = !Utils.getFlag('E', options);
m_UsePruning = !Utils.getFlag('P', options);
super.setOptions(options);
Utils.checkForRemainingOptions(options);
}
/**
* Gets the current settings of the Classifier.
*
* @return an array of strings suitable for passing to setOptions
*/
@Override
public String[] getOptions() {
Vector options = new Vector();
options.add("-F");
options.add("" + m_Folds);
options.add("-N");
options.add("" + m_MinNo);
options.add("-O");
options.add("" + m_Optimizations);
options.add("-S");
options.add("" + m_Seed);
if (m_Debug) {
options.add("-D");
}
if (!m_CheckErr) {
options.add("-E");
}
if (!m_UsePruning) {
options.add("-P");
}
Collections.addAll(options, super.getOptions());
return options.toArray(new String[0]);
}
/**
* Returns an enumeration of the additional measure names
*
* @return an enumeration of the measure names
*/
@Override
public Enumeration enumerateMeasures() {
Vector newVector = new Vector(1);
newVector.add("measureNumRules");
return newVector.elements();
}
/**
* Returns the value of the named measure
*
* @param additionalMeasureName the name of the measure to query for its value
* @return the value of the named measure
* @throws IllegalArgumentException if the named measure is not supported
*/
@Override
public double getMeasure(String additionalMeasureName) {
if (additionalMeasureName.compareToIgnoreCase("measureNumRules") == 0) {
return m_Ruleset.size();
} else {
throw new IllegalArgumentException(additionalMeasureName
+ " not supported (RIPPER)");
}
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for displaying in the
* explorer/experimenter gui
*/
public String foldsTipText() {
return "Determines the amount of data used for pruning. One fold is used for "
+ "pruning, the rest for growing the rules.";
}
/**
* Sets the number of folds to use
*
* @param fold the number of folds
*/
public void setFolds(int fold) {
m_Folds = fold;
}
/**
* Gets the number of folds
*
* @return the number of folds
*/
public int getFolds() {
return m_Folds;
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for displaying in the
* explorer/experimenter gui
*/
public String minNoTipText() {
return "The minimum total weight of the instances in a rule.";
}
/**
* Sets the minimum total weight of the instances in a rule
*
* @param m the minimum total weight of the instances in a rule
*/
public void setMinNo(double m) {
m_MinNo = m;
}
/**
* Gets the minimum total weight of the instances in a rule
*
* @return the minimum total weight of the instances in a rule
*/
public double getMinNo() {
return m_MinNo;
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for displaying in the
* explorer/experimenter gui
*/
public String seedTipText() {
return "The seed used for randomizing the data.";
}
/**
* Sets the seed value to use in randomizing the data
*
* @param s the new seed value
*/
public void setSeed(long s) {
m_Seed = s;
}
/**
* Gets the current seed value to use in randomizing the data
*
* @return the seed value
*/
public long getSeed() {
return m_Seed;
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for displaying in the
* explorer/experimenter gui
*/
public String optimizationsTipText() {
return "The number of optimization runs.";
}
/**
* Sets the number of optimization runs
*
* @param run the number of optimization runs
*/
public void setOptimizations(int run) {
m_Optimizations = run;
}
/**
* Gets the the number of optimization runs
*
* @return the number of optimization runs
*/
public int getOptimizations() {
return m_Optimizations;
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for displaying in the
* explorer/experimenter gui
*/
@Override
public String debugTipText() {
return "Whether debug information is output to the console.";
}
/**
* Sets whether debug information is output to the console
*
* @param d whether debug information is output to the console
*/
@Override
public void setDebug(boolean d) {
m_Debug = d;
}
/**
* Gets whether debug information is output to the console
*
* @return whether debug information is output to the console
*/
@Override
public boolean getDebug() {
return m_Debug;
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for displaying in the
* explorer/experimenter gui
*/
public String checkErrorRateTipText() {
return "Whether check for error rate >= 1/2 is included"
+ " in stopping criterion.";
}
/**
* Sets whether to check for error rate is in stopping criterion
*
* @param d whether to check for error rate is in stopping criterion
*/
public void setCheckErrorRate(boolean d) {
m_CheckErr = d;
}
/**
* Gets whether to check for error rate is in stopping criterion
*
* @return true if checking for error rate is in stopping criterion
*/
public boolean getCheckErrorRate() {
return m_CheckErr;
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for displaying in the
* explorer/experimenter gui
*/
public String usePruningTipText() {
return "Whether pruning is performed.";
}
/**
* Sets whether pruning is performed
*
* @param d Whether pruning is performed
*/
public void setUsePruning(boolean d) {
m_UsePruning = d;
}
/**
* Gets whether pruning is performed
*
* @return true if pruning is performed
*/
public boolean getUsePruning() {
return m_UsePruning;
}
/**
* Get the ruleset generated by Ripper
*
* @return the ruleset
*/
public ArrayList getRuleset() {
return m_Ruleset;
}
/**
* Get the statistics of the ruleset in the given position
*
* @param pos the position of the stats, assuming correct
* @return the statistics of the ruleset in the given position
*/
public RuleStats getRuleStats(int pos) {
return m_RulesetStats.get(pos);
}
/**
* The single antecedent in the rule, which is composed of an attribute and
* the corresponding value. There are two inherited classes, namely
* NumericAntd and NominalAntd in which the attributes are numeric and nominal
* respectively.
*/
public abstract class Antd implements WeightedInstancesHandler, Copyable,
Serializable, RevisionHandler {
/** for serialization */
private static final long serialVersionUID = -8929754772994154334L;
/** The attribute of the antecedent */
protected Attribute att;
/**
* The attribute value of the antecedent. For numeric attribute, value is
* either 0(1st bag) or 1(2nd bag)
*/
protected double value;
/**
* The maximum infoGain achieved by this antecedent test in the growing data
*/
protected double maxInfoGain;
/** The accurate rate of this antecedent test on the growing data */
protected double accuRate;
/** The coverage of this antecedent in the growing data */
protected double cover;
/** The accurate data for this antecedent in the growing data */
protected double accu;
/**
* Constructor
*/
public Antd(Attribute a) {
att = a;
value = Double.NaN;
maxInfoGain = 0;
accuRate = Double.NaN;
cover = Double.NaN;
accu = Double.NaN;
}
/* The abstract members for inheritance */
public abstract Instances[] splitData(Instances data, double defAcRt,
double cla);
public abstract boolean covers(Instance inst);
@Override
public abstract String toString();
/**
* Implements Copyable
*
* @return a copy of this object
*/
@Override
public abstract Object copy();
/* Get functions of this antecedent */
public Attribute getAttr() {
return att;
}
public double getAttrValue() {
return value;
}
public double getMaxInfoGain() {
return maxInfoGain;
}
public double getAccuRate() {
return accuRate;
}
public double getAccu() {
return accu;
}
public double getCover() {
return cover;
}
/**
* Returns the revision string.
*
* @return the revision
*/
@Override
public String getRevision() {
return RevisionUtils.extract("$Revision: 13408 $");
}
}
/**
* The antecedent with numeric attribute
*/
public class NumericAntd extends Antd {
/** for serialization */
static final long serialVersionUID = 5699457269983735442L;
/** The split point for this numeric antecedent */
private double splitPoint;
/**
* Constructor
*/
public NumericAntd(Attribute a) {
super(a);
splitPoint = Double.NaN;
}
/**
* Get split point of this numeric antecedent
*
* @return the split point of this numeric antecedent
*/
public double getSplitPoint() {
return splitPoint;
}
/**
* Implements Copyable
*
* @return a copy of this object
*/
@Override
public Object copy() {
NumericAntd na = new NumericAntd(getAttr());
na.value = this.value;
na.splitPoint = this.splitPoint;
return na;
}
/**
* Implements the splitData function. This procedure is to split the data
* into two bags according to the information gain of the numeric attribute
* value The maximum infoGain is also calculated.
*
* @param insts the data to be split
* @param defAcRt the default accuracy rate for data
* @param cl the class label to be predicted
* @return the array of data after split
*/
@Override
public Instances[] splitData(Instances insts, double defAcRt, double cl) {
Instances data = insts;
int total = data.numInstances();// Total number of instances without
// missing value for att
int split = 1; // Current split position
int prev = 0; // Previous split position
int finalSplit = split; // Final split position
maxInfoGain = 0;
value = 0;
double fstCover = 0, sndCover = 0, fstAccu = 0, sndAccu = 0;
data.sort(att);
// Find the las instance without missing value
for (int x = 0; x < data.numInstances(); x++) {
Instance inst = data.instance(x);
if (inst.isMissing(att)) {
total = x;
break;
}
sndCover += inst.weight();
if (Utils.eq(inst.classValue(), cl)) {
sndAccu += inst.weight();
}
}
if (total == 0) {
return null; // Data all missing for the attribute
}
splitPoint = data.instance(total - 1).value(att);
for (; split <= total; split++) {
if ((split == total) || (data.instance(split).value(att) > // Can't
// split
// within
data.instance(prev).value(att))) { // same value
for (int y = prev; y < split; y++) {
Instance inst = data.instance(y);
fstCover += inst.weight();
if (Utils.eq(data.instance(y).classValue(), cl)) {
fstAccu += inst.weight(); // First bag positive# ++
}
}
double fstAccuRate = (fstAccu + 1.0) / (fstCover + 1.0), sndAccuRate = (sndAccu + 1.0)
/ (sndCover + 1.0);
/* Which bag has higher information gain? */
boolean isFirst;
double fstInfoGain, sndInfoGain;
double accRate, infoGain, coverage, accurate;
fstInfoGain =
// Utils.eq(defAcRt, 1.0) ?
// fstAccu/(double)numConds :
fstAccu * (Utils.log2(fstAccuRate) - Utils.log2(defAcRt));
sndInfoGain =
// Utils.eq(defAcRt, 1.0) ?
// sndAccu/(double)numConds :
sndAccu * (Utils.log2(sndAccuRate) - Utils.log2(defAcRt));
if (fstInfoGain > sndInfoGain) {
isFirst = true;
infoGain = fstInfoGain;
accRate = fstAccuRate;
accurate = fstAccu;
coverage = fstCover;
} else {
isFirst = false;
infoGain = sndInfoGain;
accRate = sndAccuRate;
accurate = sndAccu;
coverage = sndCover;
}
/* Check whether so far the max infoGain */
if (infoGain > maxInfoGain) {
splitPoint = data.instance(prev).value(att);
value = (isFirst) ? 0 : 1;
accuRate = accRate;
accu = accurate;
cover = coverage;
maxInfoGain = infoGain;
finalSplit = (isFirst) ? split : prev;
}
for (int y = prev; y < split; y++) {
Instance inst = data.instance(y);
sndCover -= inst.weight();
if (Utils.eq(data.instance(y).classValue(), cl)) {
sndAccu -= inst.weight(); // Second bag positive# --
}
}
prev = split;
}
}
/* Split the data */
Instances[] splitData = new Instances[2];
splitData[0] = new Instances(data, 0, finalSplit);
splitData[1] = new Instances(data, finalSplit, total - finalSplit);
return splitData;
}
/**
* Whether the instance is covered by this antecedent
*
* @param inst the instance in question
* @return the boolean value indicating whether the instance is covered by
* this antecedent
*/
@Override
public boolean covers(Instance inst) {
boolean isCover = true;
if (!inst.isMissing(att)) {
if ((int) value == 0) { // First bag
if (inst.value(att) > splitPoint) {
isCover = false;
}
} else if (inst.value(att) < splitPoint) {
isCover = false;
}
} else {
isCover = false;
}
return isCover;
}
/**
* Prints this antecedent
*
* @return a textual description of this antecedent
*/
@Override
public String toString() {
String symbol = ((int) value == 0) ? " <= " : " >= ";
return (att.name() + symbol + Utils.doubleToString(splitPoint, 6));
}
/**
* Returns the revision string.
*
* @return the revision
*/
@Override
public String getRevision() {
return RevisionUtils.extract("$Revision: 13408 $");
}
}
/**
* The antecedent with nominal attribute
*/
public class NominalAntd extends Antd {
/** for serialization */
static final long serialVersionUID = -9102297038837585135L;
/*
* The parameters of infoGain calculated for each attribute value in the
* growing data
*/
private final double[] accurate;
private final double[] coverage;
/**
* Constructor
*/
public NominalAntd(Attribute a) {
super(a);
int bag = att.numValues();
accurate = new double[bag];
coverage = new double[bag];
}
/**
* Implements Copyable
*
* @return a copy of this object
*/
@Override
public Object copy() {
Antd antec = new NominalAntd(getAttr());
antec.value = this.value;
return antec;
}
/**
* Implements the splitData function. This procedure is to split the data
* into bags according to the nominal attribute value The infoGain for each
* bag is also calculated.
*
* @param data the data to be split
* @param defAcRt the default accuracy rate for data
* @param cl the class label to be predicted
* @return the array of data after split
*/
@Override
public Instances[] splitData(Instances data, double defAcRt, double cl) {
int bag = att.numValues();
Instances[] splitData = new Instances[bag];
for (int x = 0; x < bag; x++) {
splitData[x] = new Instances(data, data.numInstances());
accurate[x] = 0;
coverage[x] = 0;
}
for (int x = 0; x < data.numInstances(); x++) {
Instance inst = data.instance(x);
if (!inst.isMissing(att)) {
int v = (int) inst.value(att);
splitData[v].add(inst);
coverage[v] += inst.weight();
if ((int) inst.classValue() == (int) cl) {
accurate[v] += inst.weight();
}
}
}
for (int x = 0; x < bag; x++) {
double t = coverage[x] + 1.0;
double p = accurate[x] + 1.0;
double infoGain =
// Utils.eq(defAcRt, 1.0) ?
// accurate[x]/(double)numConds :
accurate[x] * (Utils.log2(p / t) - Utils.log2(defAcRt));
if (infoGain > maxInfoGain) {
maxInfoGain = infoGain;
cover = coverage[x];
accu = accurate[x];
accuRate = p / t;
value = x;
}
}
return splitData;
}
/**
* Whether the instance is covered by this antecedent
*
* @param inst the instance in question
* @return the boolean value indicating whether the instance is covered by
* this antecedent
*/
@Override
public boolean covers(Instance inst) {
boolean isCover = false;
if (!inst.isMissing(att)) {
if ((int) inst.value(att) == (int) value) {
isCover = true;
}
}
return isCover;
}
/**
* Prints this antecedent
*
* @return a textual description of this antecedent
*/
@Override
public String toString() {
return (att.name() + " = " + att.value((int) value));
}
/**
* Returns the revision string.
*
* @return the revision
*/
@Override
public String getRevision() {
return RevisionUtils.extract("$Revision: 13408 $");
}
}
/**
* This class implements a single rule that predicts specified class.
*
* A rule consists of antecedents "AND"ed together and the consequent (class
* value) for the classification. In this class, the Information Gain
* (p*[log(p/t) - log(P/T)]) is used to select an antecedent and Reduced Error
* Prunning (REP) with the metric of accuracy rate p/(p+n) or (TP+TN)/(P+N) is
* used to prune the rule.
*/
public class RipperRule extends Rule {
/** for serialization */
static final long serialVersionUID = -2410020717305262952L;
/** The internal representation of the class label to be predicted */
private double m_Consequent = -1;
/** The vector of antecedents of this rule */
protected ArrayList m_Antds = null;
/** Constructor */
public RipperRule() {
m_Antds = new ArrayList();
}
/**
* Removes redundant tests in the rule.
*
* @param data an instance object that contains the appropriate header information for the attributes.
*/
public void cleanUp(Instances data) {
double[] mins = new double[data.numAttributes()];
double[] maxs = new double[data.numAttributes()];
for (int i = 0; i < data.numAttributes(); i++) {
mins[i] = Double.MAX_VALUE;
maxs[i] = -Double.MAX_VALUE;
}
for (int i = m_Antds.size() - 1; i >= 0; i--) {
Attribute att = m_Antds.get(i).getAttr();
if (att.isNumeric()) {
double splitPoint = ((NumericAntd)m_Antds.get(i)).getSplitPoint();
if (m_Antds.get(i).getAttrValue() == 0) {
if (splitPoint < mins[att.index()]) {
mins[att.index()] = splitPoint;
} else {
m_Antds.remove(i);
}
} else {
if (splitPoint > maxs[att.index()]) {
maxs[att.index()] = splitPoint;
} else {
m_Antds.remove(i);
}
}
}
}
}
/**
* Sets the internal representation of the class label to be predicted
*
* @param cl the internal representation of the class label to be predicted
*/
public void setConsequent(double cl) {
m_Consequent = cl;
}
/**
* Gets the internal representation of the class label to be predicted
*
* @return the internal representation of the class label to be predicted
*/
@Override
public double getConsequent() {
return m_Consequent;
}
/**
* Get a shallow copy of this rule
*
* @return the copy
*/
@Override
public Object copy() {
RipperRule copy = new RipperRule();
copy.setConsequent(getConsequent());
copy.m_Antds = new ArrayList(this.m_Antds.size());
for (Antd a : this.m_Antds) {
copy.m_Antds.add((Antd) a.copy());
}
return copy;
}
/**
* Whether the instance covered by this rule
*
* @param datum the instance in question
* @return the boolean value indicating whether the instance is covered by
* this rule
*/
@Override
public boolean covers(Instance datum) {
boolean isCover = true;
for (int i = 0; i < m_Antds.size(); i++) {
Antd antd = m_Antds.get(i);
if (!antd.covers(datum)) {
isCover = false;
break;
}
}
return isCover;
}
/**
* Whether this rule has antecedents, i.e. whether it is a default rule
*
* @return the boolean value indicating whether the rule has antecedents
*/
@Override
public boolean hasAntds() {
if (m_Antds == null) {
return false;
} else {
return (m_Antds.size() > 0);
}
}
/**
* Return the antecedents
*
* @return the vector of antecedents
*/
public ArrayList getAntds() {
return m_Antds;
}
/**
* the number of antecedents of the rule
*
* @return the size of this rule
*/
@Override
public double size() {
return m_Antds.size();
}
/**
* Private function to compute default number of accurate instances in the
* specified data for the consequent of the rule
*
* @param data the data in question
* @return the default accuracy number
*/
private double computeDefAccu(Instances data) {
double defAccu = 0;
for (int i = 0; i < data.numInstances(); i++) {
Instance inst = data.instance(i);
if ((int) inst.classValue() == (int) m_Consequent) {
defAccu += inst.weight();
}
}
return defAccu;
}
/**
* Build one rule using the growing data
*
* @param data the growing data used to build the rule
* @throws Exception if the consequent is not set yet
*/
@Override
public void grow(Instances data) throws Exception {
if (m_Consequent == -1) {
throw new Exception(" Consequent not set yet.");
}
Instances growData = data;
double sumOfWeights = growData.sumOfWeights();
if (!Utils.gr(sumOfWeights, 0.0)) {
return;
}
/* Compute the default accurate rate of the growing data */
double defAccu = computeDefAccu(growData);
double defAcRt = (defAccu + 1.0) / (sumOfWeights + 1.0);
/* Keep the record of which attributes have already been used */
boolean[] used = new boolean[growData.numAttributes()];
for (int k = 0; k < used.length; k++) {
used[k] = false;
}
int numUnused = used.length;
// If there are already antecedents existing
for (int j = 0; j < m_Antds.size(); j++) {
Antd antdj = m_Antds.get(j);
if (!antdj.getAttr().isNumeric()) {
used[antdj.getAttr().index()] = true;
numUnused--;
}
}
double maxInfoGain;
while (Utils.gr(growData.numInstances(), 0.0) && (numUnused > 0)
&& Utils.sm(defAcRt, 1.0)) {
// We require that infoGain be positive
/*
* if(numAntds == originalSize) maxInfoGain = 0.0; // At least one
* condition allowed else maxInfoGain = Utils.eq(defAcRt, 1.0) ?
* defAccu/(double)numAntds : 0.0;
*/
maxInfoGain = 0.0;
/* Build a list of antecedents */
Antd oneAntd = null;
Instances coverData = null;
Enumeration enumAttr = growData.enumerateAttributes();
/* Build one condition based on all attributes not used yet */
while (enumAttr.hasMoreElements()) {
Attribute att = (enumAttr.nextElement());
if (m_Debug) {
System.err.println("\nOne condition: size = "
+ growData.sumOfWeights());
}
Antd antd = null;
if (att.isNumeric()) {
antd = new NumericAntd(att);
} else {
antd = new NominalAntd(att);
}
if (!used[att.index()]) {
/*
* Compute the best information gain for each attribute, it's stored
* in the antecedent formed by this attribute. This procedure
* returns the data covered by the antecedent
*/
Instances coveredData = computeInfoGain(growData, defAcRt, antd);
if (coveredData != null) {
double infoGain = antd.getMaxInfoGain();
if (m_Debug) {
System.err.println("Test of \'" + antd.toString()
+ "\': infoGain = " + infoGain + " | Accuracy = "
+ antd.getAccuRate() + "=" + antd.getAccu() + "/"
+ antd.getCover() + " def. accuracy: " + defAcRt);
}
if (infoGain > maxInfoGain) {
oneAntd = antd;
coverData = coveredData;
maxInfoGain = infoGain;
}
}
}
}
if (oneAntd == null) {
break; // Cannot find antds
}
if (Utils.sm(oneAntd.getAccu(), m_MinNo)) {
break;// Too low coverage
}
// Numeric attributes can be used more than once
if (!oneAntd.getAttr().isNumeric()) {
used[oneAntd.getAttr().index()] = true;
numUnused--;
}
m_Antds.add(oneAntd);
growData = coverData;// Grow data size is shrinking
defAcRt = oneAntd.getAccuRate();
}
}
/**
* Compute the best information gain for the specified antecedent
*
* @param instances the data based on which the infoGain is computed
* @param defAcRt the default accuracy rate of data
* @param antd the specific antecedent
* @return the data covered by the antecedent
*/
private Instances computeInfoGain(Instances instances, double defAcRt,
Antd antd) {
Instances data = instances;
/*
* Split the data into bags. The information gain of each bag is also
* calculated in this procedure
*/
Instances[] splitData = antd.splitData(data, defAcRt, m_Consequent);
/* Get the bag of data to be used for next antecedents */
if (splitData != null) {
return splitData[(int) antd.getAttrValue()];
} else {
return null;
}
}
/**
* Prune all the possible final sequences of the rule using the pruning
* data. The measure used to prune the rule is based on flag given.
*
* @param pruneData the pruning data used to prune the rule
* @param useWhole flag to indicate whether use the error rate of the whole
* pruning data instead of the data covered
*/
public void prune(Instances pruneData, boolean useWhole) {
Instances data = pruneData;
double total = data.sumOfWeights();
if (!Utils.gr(total, 0.0)) {
return;
}
/* The default accurate # and rate on pruning data */
double defAccu = computeDefAccu(data);
if (m_Debug) {
System.err.println("Pruning with " + defAccu + " positive data out of "
+ total + " instances");
}
int size = m_Antds.size();
if (size == 0) {
return; // Default rule before pruning
}
double[] worthRt = new double[size];
double[] coverage = new double[size];
double[] worthValue = new double[size];
for (int w = 0; w < size; w++) {
worthRt[w] = coverage[w] = worthValue[w] = 0.0;
}
/* Calculate accuracy parameters for all the antecedents in this rule */
double tn = 0.0; // True negative if useWhole
for (int x = 0; x < size; x++) {
Antd antd = m_Antds.get(x);
Instances newData = data;
data = new Instances(newData, 0); // Make data empty
for (int y = 0; y < newData.numInstances(); y++) {
Instance ins = newData.instance(y);
if (antd.covers(ins)) { // Covered by this antecedent
coverage[x] += ins.weight();
data.add(ins); // Add to data for further pruning
if ((int) ins.classValue() == (int) m_Consequent) {
worthValue[x] += ins.weight();
}
} else if (useWhole) { // Not covered
if ((int) ins.classValue() != (int) m_Consequent) {
tn += ins.weight();
}
}
}
if (useWhole) {
worthValue[x] += tn;
worthRt[x] = worthValue[x] / total;
} else {
worthRt[x] = (worthValue[x] + 1.0) / (coverage[x] + 2.0);
}
}
double maxValue = (defAccu + 1.0) / (total + 2.0);
int maxIndex = -1;
for (int i = 0; i < worthValue.length; i++) {
if (m_Debug) {
double denom = useWhole ? total : coverage[i];
System.err.println(i + "(useAccuray? " + !useWhole + "): "
+ worthRt[i] + "=" + worthValue[i] + "/" + denom);
}
if (worthRt[i] > maxValue) { // Prefer to the
maxValue = worthRt[i]; // shorter rule
maxIndex = i;
}
}
/* Prune the antecedents according to the accuracy parameters */
for (int z = size - 1; z > maxIndex; z--) {
m_Antds.remove(z);
}
}
/**
* Prints this rule
*
* @param classAttr the class attribute in the data
* @return a textual description of this rule
*/
public String toString(Attribute classAttr) {
StringBuffer text = new StringBuffer();
if (m_Antds.size() > 0) {
for (int j = 0; j < (m_Antds.size() - 1); j++) {
text.append("(" + (m_Antds.get(j)).toString() + ") and ");
}
text.append("(" + (m_Antds.get(m_Antds.size() - 1)).toString() + ")");
}
text.append(" => " + classAttr.name() + "="
+ classAttr.value((int) m_Consequent));
return text.toString();
}
/**
* Returns the revision string.
*
* @return the revision
*/
@Override
public String getRevision() {
return RevisionUtils.extract("$Revision: 13408 $");
}
}
/**
* Returns default capabilities of the classifier.
*
* @return the capabilities of this classifier
*/
@Override
public Capabilities getCapabilities() {
Capabilities result = super.getCapabilities();
result.disableAll();
// attributes
result.enable(Capability.NOMINAL_ATTRIBUTES);
result.enable(Capability.NUMERIC_ATTRIBUTES);
result.enable(Capability.DATE_ATTRIBUTES);
result.enable(Capability.MISSING_VALUES);
// class
result.enable(Capability.NOMINAL_CLASS);
result.enable(Capability.MISSING_CLASS_VALUES);
// instances
result.setMinimumNumberInstances(m_Folds);
return result;
}
/**
* Builds Ripper in the order of class frequencies. For each class it's built
* in two stages: building and optimization
*
* @param instances the training data
* @throws Exception if classifier can't be built successfully
*/
@Override
public void buildClassifier(Instances instances) throws Exception {
// can classifier handle the data?
getCapabilities().testWithFail(instances);
// remove instances with missing class
instances = new Instances(instances);
instances.deleteWithMissingClass();
m_Random = instances.getRandomNumberGenerator(m_Seed);
m_Total = RuleStats.numAllConditions(instances);
if (m_Debug) {
System.err.println("Number of all possible conditions = " + m_Total);
}
Instances data = null;
m_Filter = new ClassOrder();
((ClassOrder) m_Filter).setSeed(m_Random.nextInt());
((ClassOrder) m_Filter).setClassOrder(ClassOrder.FREQ_ASCEND);
m_Filter.setInputFormat(instances);
data = Filter.useFilter(instances, m_Filter);
if (data == null) {
throw new Exception(" Unable to randomize the class orders.");
}
m_Class = data.classAttribute();
m_Ruleset = new ArrayList();
m_RulesetStats = new ArrayList();
m_Distributions = new ArrayList();
// Sort by classes frequency
double[] orderedClasses = ((ClassOrder) m_Filter).getClassCounts();
if (m_Debug) {
System.err.println("Sorted classes:");
for (int x = 0; x < m_Class.numValues(); x++) {
System.err.println(x + ": " + m_Class.value(x) + " has "
+ orderedClasses[x] + " instances.");
}
}
// Iterate from less prevalent class to more frequent one
oneClass: for (int y = 0; y < data.numClasses() - 1; y++) { // For each
// class
double classIndex = y;
if (m_Debug) {
int ci = (int) classIndex;
System.err.println("\n\nClass " + m_Class.value(ci) + "(" + ci + "): "
+ orderedClasses[y] + "instances\n"
+ "=====================================\n");
}
if (Utils.eq(orderedClasses[y], 0.0)) {
continue oneClass;
}
// The expected FP/err is the proportion of the class
double all = 0;
for (int i = y; i < orderedClasses.length; i++) {
all += orderedClasses[i];
}
double expFPRate = orderedClasses[y] / all;
double classYWeights = 0, totalWeights = 0;
for (int j = 0; j < data.numInstances(); j++) {
Instance datum = data.instance(j);
totalWeights += datum.weight();
if ((int) datum.classValue() == y) {
classYWeights += datum.weight();
}
}
// DL of default rule, no theory DL, only data DL
double defDL;
if (classYWeights > 0) {
defDL = RuleStats.dataDL(expFPRate, 0.0, totalWeights, 0.0,
classYWeights);
} else {
continue oneClass; // Subsumed by previous rules
}
if (Double.isNaN(defDL) || Double.isInfinite(defDL)) {
throw new Exception("Should never happen: " + "defDL NaN or infinite!");
}
if (m_Debug) {
System.err.println("The default DL = " + defDL);
}
data = rulesetForOneClass(expFPRate, data, classIndex, defDL);
}
// Remove redundant numeric tests from the rules
for (Rule rule : m_Ruleset) {
((RipperRule)rule).cleanUp(data);
}
// Set the default rule
RipperRule defRule = new RipperRule();
defRule.setConsequent(data.numClasses() - 1);
m_Ruleset.add(defRule);
RuleStats defRuleStat = new RuleStats();
defRuleStat.setData(data);
defRuleStat.setNumAllConds(m_Total);
defRuleStat.addAndUpdate(defRule);
m_RulesetStats.add(defRuleStat);
for (int z = 0; z < m_RulesetStats.size(); z++) {
RuleStats oneClass = m_RulesetStats.get(z);
for (int xyz = 0; xyz < oneClass.getRulesetSize(); xyz++) {
double[] classDist = oneClass.getDistributions(xyz);
Utils.normalize(classDist);
if (classDist != null) {
m_Distributions.add(((ClassOrder) m_Filter)
.distributionsByOriginalIndex(classDist));
}
}
}
// free up memory
for (int i = 0; i < m_RulesetStats.size(); i++) {
(m_RulesetStats.get(i)).cleanUp();
}
}
/**
* Classify the test instance with the rule learner and provide the class
* distributions
*
* @param datum the instance to be classified
* @return the distribution
*/
@Override
public double[] distributionForInstance(Instance datum) {
try {
for (int i = 0; i < m_Ruleset.size(); i++) {
Rule rule = m_Ruleset.get(i);
if (rule.covers(datum)) {
return m_Distributions.get(i);
}
}
} catch (Exception e) {
System.err.println(e.getMessage());
e.printStackTrace();
}
System.err.println("Should never happen!");
return new double[datum.classAttribute().numValues()];
}
/**
* Build a ruleset for the given class according to the given data
*
* @param expFPRate the expected FP/(FP+FN) used in DL calculation
* @param data the given data
* @param classIndex the given class index
* @param defDL the default DL in the data
* @throws Exception if the ruleset can be built properly
*/
protected Instances rulesetForOneClass(double expFPRate, Instances data,
double classIndex, double defDL) throws Exception {
Instances newData = data, growData, pruneData;
boolean stop = false;
ArrayList ruleset = new ArrayList();
double dl = defDL, minDL = defDL;
RuleStats rstats = null;
double[] rst;
// Check whether data have positive examples
boolean defHasPositive = true; // No longer used
boolean hasPositive = defHasPositive;
/********************** Building stage ***********************/
if (m_Debug) {
System.err.println("\n*** Building stage ***");
}
while ((!stop) && hasPositive) { // Generate new rules until
// stopping criteria met
RipperRule oneRule;
if (m_UsePruning) {
/* Split data into Grow and Prune */
// We should have stratified the data, but ripper seems
// to have a bug that makes it not to do so. In order
// to simulate it more precisely, we do the same thing.
// newData.randomize(m_Random);
newData = RuleStats.stratify(newData, m_Folds, m_Random);
Instances[] part = RuleStats.partition(newData, m_Folds);
growData = part[0];
pruneData = part[1];
// growData=newData.trainCV(m_Folds, m_Folds-1);
// pruneData=newData.testCV(m_Folds, m_Folds-1);
oneRule = new RipperRule();
oneRule.setConsequent(classIndex); // Must set first
if (m_Debug) {
System.err.println("\nGrowing a rule ...");
}
oneRule.grow(growData); // Build the rule
if (m_Debug) {
System.err.println("One rule found before pruning:"
+ oneRule.toString(m_Class));
}
if (m_Debug) {
System.err.println("\nPruning the rule ...");
}
oneRule.prune(pruneData, false); // Prune the rule
if (m_Debug) {
System.err.println("One rule found after pruning:"
+ oneRule.toString(m_Class));
}
} else {
oneRule = new RipperRule();
oneRule.setConsequent(classIndex); // Must set first
if (m_Debug) {
System.err.println("\nNo pruning: growing a rule ...");
}
oneRule.grow(newData); // Build the rule
if (m_Debug) {
System.err.println("No pruning: one rule found:\n"
+ oneRule.toString(m_Class));
}
}
// Compute the DL of this ruleset
if (rstats == null) { // First rule
rstats = new RuleStats();
rstats.setNumAllConds(m_Total);
rstats.setData(newData);
}
rstats.addAndUpdate(oneRule);
int last = rstats.getRuleset().size() - 1; // Index of last rule
dl += rstats.relativeDL(last, expFPRate, m_CheckErr);
if (Double.isNaN(dl) || Double.isInfinite(dl)) {
throw new Exception("Should never happen: dl in "
+ "building stage NaN or infinite!");
}
if (m_Debug) {
System.err.println("Before optimization(" + last + "): the dl = " + dl
+ " | best: " + minDL);
}
if (dl < minDL) {
minDL = dl; // The best dl so far
}
rst = rstats.getSimpleStats(last);
if (m_Debug) {
System.err.println("The rule covers: " + rst[0] + " | pos = " + rst[2]
+ " | neg = " + rst[4] + "\nThe rule doesn't cover: " + rst[1]
+ " | pos = " + rst[5]);
}
stop = checkStop(rst, minDL, dl);
if (!stop) {
ruleset.add(oneRule); // Accepted
newData = rstats.getFiltered(last)[1];// Data not covered
hasPositive = Utils.gr(rst[5], 0.0); // Positives remaining?
if (m_Debug) {
System.err.println("One rule added: has positive? " + hasPositive);
}
} else {
if (m_Debug) {
System.err.println("Quit rule");
}
rstats.removeLast(); // Remove last to be re-used
}
}// while !stop
/******************** Optimization stage *******************/
RuleStats finalRulesetStat = null;
if (m_UsePruning) {
for (int z = 0; z < m_Optimizations; z++) {
if (m_Debug) {
System.err.println("\n*** Optimization: run #" + z + " ***");
}
newData = data;
finalRulesetStat = new RuleStats();
finalRulesetStat.setData(newData);
finalRulesetStat.setNumAllConds(m_Total);
int position = 0;
stop = false;
boolean isResidual = false;
hasPositive = defHasPositive;
dl = minDL = defDL;
oneRule: while (!stop && hasPositive) {
isResidual = (position >= ruleset.size()); // Cover residual positive
// examples
// Re-do shuffling and stratification
// newData.randomize(m_Random);
newData = RuleStats.stratify(newData, m_Folds, m_Random);
Instances[] part = RuleStats.partition(newData, m_Folds);
growData = part[0];
pruneData = part[1];
// growData=newData.trainCV(m_Folds, m_Folds-1);
// pruneData=newData.testCV(m_Folds, m_Folds-1);
RipperRule finalRule;
if (m_Debug) {
System.err.println("\nRule #" + position + "| isResidual?"
+ isResidual + "| data size: " + newData.sumOfWeights());
}
if (isResidual) {
RipperRule newRule = new RipperRule();
newRule.setConsequent(classIndex);
if (m_Debug) {
System.err.println("\nGrowing and pruning" + " a new rule ...");
}
newRule.grow(growData);
newRule.prune(pruneData, false);
finalRule = newRule;
if (m_Debug) {
System.err.println("\nNew rule found: "
+ newRule.toString(m_Class));
}
} else {
RipperRule oldRule = (RipperRule) ruleset.get(position);
boolean covers = false;
// Test coverage of the next old rule
for (int i = 0; i < newData.numInstances(); i++) {
if (oldRule.covers(newData.instance(i))) {
covers = true;
break;
}
}
if (!covers) {// Null coverage, no variants can be generated
finalRulesetStat.addAndUpdate(oldRule);
position++;
continue oneRule;
}
// 2 variants
if (m_Debug) {
System.err.println("\nGrowing and pruning" + " Replace ...");
}
RipperRule replace = new RipperRule();
replace.setConsequent(classIndex);
replace.grow(growData);
// Remove the pruning data covered by the following
// rules, then simply compute the error rate of the
// current rule to prune it. According to Ripper,
// it's equivalent to computing the error of the
// whole ruleset -- is it true?
pruneData = RuleStats.rmCoveredBySuccessives(pruneData, ruleset,
position);
replace.prune(pruneData, true);
if (m_Debug) {
System.err.println("\nGrowing and pruning" + " Revision ...");
}
RipperRule revision = (RipperRule) oldRule.copy();
// For revision, first rm the data covered by the old rule
Instances newGrowData = new Instances(growData, 0);
for (int b = 0; b < growData.numInstances(); b++) {
Instance inst = growData.instance(b);
if (revision.covers(inst)) {
newGrowData.add(inst);
}
}
revision.grow(newGrowData);
revision.prune(pruneData, true);
double[][] prevRuleStats = new double[position][6];
for (int c = 0; c < position; c++) {
prevRuleStats[c] = finalRulesetStat.getSimpleStats(c);
}
// Now compare the relative DL of variants
ArrayList tempRules = new ArrayList(ruleset.size());
for (Rule r : ruleset) {
tempRules.add((Rule) r.copy());
}
tempRules.set(position, replace);
RuleStats repStat = new RuleStats(data, tempRules);
repStat.setNumAllConds(m_Total);
repStat.countData(position, newData, prevRuleStats);
// repStat.countData();
rst = repStat.getSimpleStats(position);
if (m_Debug) {
System.err.println("Replace rule covers: " + rst[0] + " | pos = "
+ rst[2] + " | neg = " + rst[4] + "\nThe rule doesn't cover: "
+ rst[1] + " | pos = " + rst[5]);
}
double repDL = repStat.relativeDL(position, expFPRate, m_CheckErr);
if (m_Debug) {
System.err.println("\nReplace: " + replace.toString(m_Class)
+ " |dl = " + repDL);
}
if (Double.isNaN(repDL) || Double.isInfinite(repDL)) {
throw new Exception("Should never happen: repDL"
+ "in optmz. stage NaN or " + "infinite!");
}
tempRules.set(position, revision);
RuleStats revStat = new RuleStats(data, tempRules);
revStat.setNumAllConds(m_Total);
revStat.countData(position, newData, prevRuleStats);
// revStat.countData();
double revDL = revStat.relativeDL(position, expFPRate, m_CheckErr);
if (m_Debug) {
System.err.println("Revision: " + revision.toString(m_Class)
+ " |dl = " + revDL);
}
if (Double.isNaN(revDL) || Double.isInfinite(revDL)) {
throw new Exception("Should never happen: revDL"
+ "in optmz. stage NaN or " + "infinite!");
}
rstats = new RuleStats(data, ruleset);
rstats.setNumAllConds(m_Total);
rstats.countData(position, newData, prevRuleStats);
// rstats.countData();
double oldDL = rstats.relativeDL(position, expFPRate, m_CheckErr);
if (Double.isNaN(oldDL) || Double.isInfinite(oldDL)) {
throw new Exception("Should never happen: oldDL"
+ "in optmz. stage NaN or " + "infinite!");
}
if (m_Debug) {
System.err.println("Old rule: " + oldRule.toString(m_Class)
+ " |dl = " + oldDL);
}
if (m_Debug) {
System.err.println("\nrepDL: " + repDL + "\nrevDL: " + revDL
+ "\noldDL: " + oldDL);
}
if ((oldDL <= revDL) && (oldDL <= repDL)) {
finalRule = oldRule; // Old the best
} else if (revDL <= repDL) {
finalRule = revision; // Revision the best
} else {
finalRule = replace; // Replace the best
}
}
finalRulesetStat.addAndUpdate(finalRule);
rst = finalRulesetStat.getSimpleStats(position);
if (isResidual) {
dl += finalRulesetStat.relativeDL(position, expFPRate, m_CheckErr);
if (m_Debug) {
System.err.println("After optimization: the dl" + "=" + dl
+ " | best: " + minDL);
}
if (dl < minDL) {
minDL = dl; // The best dl so far
}
stop = checkStop(rst, minDL, dl);
if (!stop) {
ruleset.add(finalRule); // Accepted
} else {
finalRulesetStat.removeLast(); // Remove last to be re-used
position--;
}
} else {
ruleset.set(position, finalRule); // Accepted
}
if (m_Debug) {
System.err.println("The rule covers: " + rst[0] + " | pos = "
+ rst[2] + " | neg = " + rst[4] + "\nThe rule doesn't cover: "
+ rst[1] + " | pos = " + rst[5]);
System.err.println("\nRuleset so far: ");
for (int x = 0; x < ruleset.size(); x++) {
System.err.println(x + ": "
+ ((RipperRule) ruleset.get(x)).toString(m_Class));
}
System.err.println();
}
// Data not covered
if (finalRulesetStat.getRulesetSize() > 0) {
newData = finalRulesetStat.getFiltered(position)[1];
}
hasPositive = Utils.gr(rst[5], 0.0); // Positives remaining?
position++;
} // while !stop && hasPositive
if (ruleset.size() > (position + 1)) { // Hasn't gone through yet
for (int k = position + 1; k < ruleset.size(); k++) {
finalRulesetStat.addAndUpdate(ruleset.get(k));
}
}
if (m_Debug) {
System.err.println("\nDeleting rules to decrease"
+ " DL of the whole ruleset ...");
}
finalRulesetStat.reduceDL(expFPRate, m_CheckErr);
if (m_Debug) {
int del = ruleset.size() - finalRulesetStat.getRulesetSize();
System.err.println(del + " rules are deleted"
+ " after DL reduction procedure");
}
ruleset = finalRulesetStat.getRuleset();
rstats = finalRulesetStat;
} // For each run of optimization
} // if pruning is used
// Concatenate the ruleset for this class to the whole ruleset
if (m_Debug) {
System.err.println("\nFinal ruleset: ");
for (int x = 0; x < ruleset.size(); x++) {
System.err.println(x + ": "
+ ((RipperRule) ruleset.get(x)).toString(m_Class));
}
System.err.println();
}
m_Ruleset.addAll(ruleset);
m_RulesetStats.add(rstats);
if (ruleset.size() > 0) {
return rstats.getFiltered(ruleset.size() - 1)[1]; // Data not
} else {
return data;
}
}
/**
* Check whether the stopping criterion meets
*
* @param rst the statistic of the ruleset
* @param minDL the min description length so far
* @param dl the current description length of the ruleset
* @return true if stop criterion meets, false otherwise
*/
private boolean checkStop(double[] rst, double minDL, double dl) {
if (dl > minDL + MAX_DL_SURPLUS) {
if (m_Debug) {
System.err.println("DL too large: " + dl + " | " + minDL);
}
return true;
} else if (!Utils.gr(rst[2], 0.0)) {// Covered positives
if (m_Debug) {
System.err.println("Too few positives.");
}
return true;
} else if ((rst[4] / rst[0]) >= 0.5) {// Err rate
if (m_CheckErr) {
if (m_Debug) {
System.err.println("Error too large: " + rst[4] + "/" + rst[0]);
}
return true;
} else {
return false;
}
} else {// Not stops
if (m_Debug) {
System.err.println("Continue.");
}
return false;
}
}
/**
* Prints the all the rules of the rule learner.
*
* @return a textual description of the classifier
*/
@Override
public String toString() {
if (m_Ruleset == null) {
return "JRIP: No model built yet.";
}
StringBuffer sb = new StringBuffer("JRIP rules:\n" + "===========\n\n");
for (int j = 0; j < m_RulesetStats.size(); j++) {
RuleStats rs = m_RulesetStats.get(j);
ArrayList rules = rs.getRuleset();
for (int k = 0; k < rules.size(); k++) {
double[] simStats = rs.getSimpleStats(k);
sb.append(((RipperRule) rules.get(k)).toString(m_Class) + " ("
+ simStats[0] + "/" + simStats[4] + ")\n");
}
}
if (m_Debug) {
System.err.println("Inside m_Ruleset");
for (int i = 0; i < m_Ruleset.size(); i++) {
System.err.println(((RipperRule) m_Ruleset.get(i)).toString(m_Class));
}
}
sb.append("\nNumber of Rules : " + m_Ruleset.size() + "\n");
return sb.toString();
}
/**
* Returns the revision string.
*
* @return the revision
*/
@Override
public String getRevision() {
return RevisionUtils.extract("$Revision: 13408 $");
}
/**
* Main method.
*
* @param args the options for the classifier
*/
public static void main(String[] args) {
runClassifier(new JRip(), args);
}
}