weka.classifiers.pmml.consumer.TreeModel Maven / Gradle / Ivy
/*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see .
*/
/*
* TreeModel.java
* Copyright (C) 2009-2012 University of Waikato, Hamilton, New Zealand
*
*/
package weka.classifiers.pmml.consumer;
import java.io.Serializable;
import java.util.ArrayList;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import weka.core.Attribute;
import weka.core.Drawable;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.RevisionUtils;
import weka.core.Utils;
import weka.core.pmml.Array;
import weka.core.pmml.MiningSchema;
/**
* Class implementing import of PMML TreeModel. Can be used as a Weka classifier
* for prediction (buildClassifier() raises and Exception).
*
* @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
* @version $Revision: 10153 $;
*/
public class TreeModel extends PMMLClassifier implements Drawable {
/**
* For serialization
*/
private static final long serialVersionUID = -2065158088298753129L;
/**
* Inner class representing the ScoreDistribution element
*/
static class ScoreDistribution implements Serializable {
/**
* For serialization
*/
private static final long serialVersionUID = -123506262094299933L;
/** The class label for this distribution element */
private final String m_classLabel;
/** The index of the class label */
private int m_classLabelIndex = -1;
/** The count for this label */
private final double m_recordCount;
/** The optional confidence value */
private double m_confidence = Utils.missingValue();
/**
* Construct a ScoreDistribution entry
*
* @param scoreE the node containing the distribution
* @param miningSchema the mining schema
* @param baseCount the number of records at the node that owns this
* distribution entry
* @throws Exception if something goes wrong
*/
protected ScoreDistribution(Element scoreE, MiningSchema miningSchema,
double baseCount) throws Exception {
// get the label
m_classLabel = scoreE.getAttribute("value");
Attribute classAtt = miningSchema.getFieldsAsInstances().classAttribute();
if (classAtt == null || classAtt.indexOfValue(m_classLabel) < 0) {
throw new Exception(
"[ScoreDistribution] class attribute not set or class value "
+ m_classLabel + " not found!");
}
m_classLabelIndex = classAtt.indexOfValue(m_classLabel);
// get the frequency
String recordC = scoreE.getAttribute("recordCount");
m_recordCount = Double.parseDouble(recordC);
// get the optional confidence
String confidence = scoreE.getAttribute("confidence");
if (confidence != null && confidence.length() > 0) {
m_confidence = Double.parseDouble(confidence);
} else if (!Utils.isMissingValue(baseCount) && baseCount > 0) {
m_confidence = m_recordCount / baseCount;
}
}
/**
* Backfit confidence value (does nothing if the confidence value is already
* set).
*
* @param baseCount the total number of records (supplied either explicitly
* from the node that owns this distribution entry or most likely
* computed from summing the recordCounts of all the distribution
* entries in the distribution that owns this entry).
*/
void deriveConfidenceValue(double baseCount) {
if (Utils.isMissingValue(m_confidence)
&& !Utils.isMissingValue(baseCount) && baseCount > 0) {
m_confidence = m_recordCount / baseCount;
}
}
String getClassLabel() {
return m_classLabel;
}
int getClassLabelIndex() {
return m_classLabelIndex;
}
double getRecordCount() {
return m_recordCount;
}
double getConfidence() {
return m_confidence;
}
@Override
public String toString() {
return m_classLabel + ": " + m_recordCount + " ("
+ Utils.doubleToString(m_confidence, 2) + ") ";
}
}
/**
* Base class for Predicates
*/
static abstract class Predicate implements Serializable {
/**
* For serialization
*/
private static final long serialVersionUID = 1035344165452733887L;
enum Eval {
TRUE, FALSE, UNKNOWN;
}
/**
* Evaluate this predicate.
*
* @param input the input vector of attribute and derived field values.
*
* @return the evaluation status of this predicate.
*/
abstract Eval evaluate(double[] input);
protected String toString(int level, boolean cr) {
return toString(level);
}
protected String toString(int level) {
StringBuffer text = new StringBuffer();
for (int j = 0; j < level; j++) {
text.append("| ");
}
return text.append(toString()).toString();
}
static Eval booleanToEval(boolean missing, boolean result) {
if (missing) {
return Eval.UNKNOWN;
} else if (result) {
return Eval.TRUE;
} else {
return Eval.FALSE;
}
}
/**
* Factory method to return the appropriate predicate for a given node in
* the tree.
*
* @param nodeE the XML node encapsulating the tree node.
* @param miningSchema the mining schema in use
* @return a Predicate
* @throws Exception of something goes wrong.
*/
static Predicate getPredicate(Element nodeE, MiningSchema miningSchema)
throws Exception {
Predicate result = null;
NodeList children = nodeE.getChildNodes();
for (int i = 0; i < children.getLength(); i++) {
Node child = children.item(i);
if (child.getNodeType() == Node.ELEMENT_NODE) {
String tagName = ((Element) child).getTagName();
if (tagName.equals("True")) {
result = new True();
break;
} else if (tagName.equals("False")) {
result = new False();
break;
} else if (tagName.equals("SimplePredicate")) {
result = new SimplePredicate((Element) child, miningSchema);
break;
} else if (tagName.equals("CompoundPredicate")) {
result = new CompoundPredicate((Element) child, miningSchema);
break;
} else if (tagName.equals("SimpleSetPredicate")) {
result = new SimpleSetPredicate((Element) child, miningSchema);
break;
}
}
}
if (result == null) {
throw new Exception(
"[Predicate] unknown or missing predicate type in node");
}
return result;
}
}
/**
* Simple True Predicate
*/
static class True extends Predicate {
/**
* For serialization
*/
private static final long serialVersionUID = 1817942234610531627L;
@Override
public Predicate.Eval evaluate(double[] input) {
return Predicate.Eval.TRUE;
}
@Override
public String toString() {
return "True: ";
}
}
/**
* Simple False Predicate
*/
static class False extends Predicate {
/**
* For serialization
*/
private static final long serialVersionUID = -3647261386442860365L;
@Override
public Predicate.Eval evaluate(double[] input) {
return Predicate.Eval.FALSE;
}
@Override
public String toString() {
return "False: ";
}
}
/**
* Class representing the SimplePredicate
*/
static class SimplePredicate extends Predicate {
/**
* For serialization
*/
private static final long serialVersionUID = -6156684285069327400L;
enum Operator {
EQUAL("equal") {
@Override
Predicate.Eval evaluate(double[] input, double value, int fieldIndex) {
return Predicate.booleanToEval(
Utils.isMissingValue(input[fieldIndex]),
weka.core.Utils.eq(input[fieldIndex], value));
}
@Override
String shortName() {
return "==";
}
},
NOTEQUAL("notEqual") {
@Override
Predicate.Eval evaluate(double[] input, double value, int fieldIndex) {
return Predicate.booleanToEval(
Utils.isMissingValue(input[fieldIndex]),
(input[fieldIndex] != value));
}
@Override
String shortName() {
return "!=";
}
},
LESSTHAN("lessThan") {
@Override
Predicate.Eval evaluate(double[] input, double value, int fieldIndex) {
return Predicate.booleanToEval(
Utils.isMissingValue(input[fieldIndex]),
(input[fieldIndex] < value));
}
@Override
String shortName() {
return "<";
}
},
LESSOREQUAL("lessOrEqual") {
@Override
Predicate.Eval evaluate(double[] input, double value, int fieldIndex) {
return Predicate.booleanToEval(
Utils.isMissingValue(input[fieldIndex]),
(input[fieldIndex] <= value));
}
@Override
String shortName() {
return "<=";
}
},
GREATERTHAN("greaterThan") {
@Override
Predicate.Eval evaluate(double[] input, double value, int fieldIndex) {
return Predicate.booleanToEval(
Utils.isMissingValue(input[fieldIndex]),
(input[fieldIndex] > value));
}
@Override
String shortName() {
return ">";
}
},
GREATEROREQUAL("greaterOrEqual") {
@Override
Predicate.Eval evaluate(double[] input, double value, int fieldIndex) {
return Predicate.booleanToEval(
Utils.isMissingValue(input[fieldIndex]),
(input[fieldIndex] >= value));
}
@Override
String shortName() {
return ">=";
}
},
ISMISSING("isMissing") {
@Override
Predicate.Eval evaluate(double[] input, double value, int fieldIndex) {
return Predicate.booleanToEval(false,
Utils.isMissingValue(input[fieldIndex]));
}
@Override
String shortName() {
return toString();
}
},
ISNOTMISSING("isNotMissing") {
@Override
Predicate.Eval evaluate(double[] input, double value, int fieldIndex) {
return Predicate.booleanToEval(false,
!Utils.isMissingValue(input[fieldIndex]));
}
@Override
String shortName() {
return toString();
}
};
abstract Predicate.Eval evaluate(double[] input, double value,
int fieldIndex);
abstract String shortName();
private final String m_stringVal;
Operator(String name) {
m_stringVal = name;
}
@Override
public String toString() {
return m_stringVal;
}
}
/** the field that we are comparing against */
int m_fieldIndex = -1;
/** the name of the field */
String m_fieldName;
/** true if the field is nominal */
boolean m_isNominal;
/** the value as a string (if nominal) */
String m_nominalValue;
/**
* the value to compare against (if nominal it holds the index of the value)
*/
double m_value;
/** the operator to use */
Operator m_operator;
public SimplePredicate(Element simpleP, MiningSchema miningSchema) throws Exception {
Instances totalStructure = miningSchema.getFieldsAsInstances();
// get the field name and set up the index
String fieldS = simpleP.getAttribute("field");
Attribute att = totalStructure.attribute(fieldS);
if (att == null) {
throw new Exception("[SimplePredicate] unable to find field " + fieldS
+ " in the incoming instance structure!");
}
// find the index
int index = -1;
for (int i = 0; i < totalStructure.numAttributes(); i++) {
if (totalStructure.attribute(i).name().equals(fieldS)) {
index = i;
m_fieldName = totalStructure.attribute(i).name();
break;
}
}
m_fieldIndex = index;
if (att.isNominal()) {
m_isNominal = true;
}
// get the operator
String oppS = simpleP.getAttribute("operator");
for (Operator o : Operator.values()) {
if (o.toString().equals(oppS)) {
m_operator = o;
break;
}
}
if (m_operator != Operator.ISMISSING
&& m_operator != Operator.ISNOTMISSING) {
String valueS = simpleP.getAttribute("value");
if (att.isNumeric()) {
m_value = Double.parseDouble(valueS);
} else {
m_nominalValue = valueS;
m_value = att.indexOfValue(valueS);
if (m_value < 0) {
throw new Exception("[SimplePredicate] can't find value " + valueS
+ " in nominal " + "attribute " + att.name());
}
}
}
}
@Override
public Predicate.Eval evaluate(double[] input) {
return m_operator.evaluate(input, m_value, m_fieldIndex);
}
@Override
public String toString() {
StringBuffer temp = new StringBuffer();
temp.append(m_fieldName + " " + m_operator.shortName());
if (m_operator != Operator.ISMISSING
&& m_operator != Operator.ISNOTMISSING) {
temp.append(" " + ((m_isNominal) ? m_nominalValue : "" + m_value));
}
return temp.toString();
}
}
/**
* Class representing the CompoundPredicate
*/
static class CompoundPredicate extends Predicate {
/**
* For serialization
*/
private static final long serialVersionUID = -3332091529764559077L;
enum BooleanOperator {
OR("or") {
@Override
Predicate.Eval evaluate(ArrayList constituents,
double[] input) {
Predicate.Eval currentStatus = Predicate.Eval.FALSE;
for (Predicate p : constituents) {
Predicate.Eval temp = p.evaluate(input);
if (temp == Predicate.Eval.TRUE) {
currentStatus = temp;
break;
} else if (temp == Predicate.Eval.UNKNOWN) {
currentStatus = temp;
}
}
return currentStatus;
}
},
AND("and") {
@Override
Predicate.Eval evaluate(ArrayList constituents,
double[] input) {
Predicate.Eval currentStatus = Predicate.Eval.TRUE;
for (Predicate p : constituents) {
Predicate.Eval temp = p.evaluate(input);
if (temp == Predicate.Eval.FALSE) {
currentStatus = temp;
break;
} else if (temp == Predicate.Eval.UNKNOWN) {
currentStatus = temp;
}
}
return currentStatus;
}
},
XOR("xor") {
@Override
Predicate.Eval evaluate(ArrayList constituents,
double[] input) {
Predicate.Eval currentStatus = constituents.get(0).evaluate(input);
if (currentStatus != Predicate.Eval.UNKNOWN) {
for (int i = 1; i < constituents.size(); i++) {
Predicate.Eval temp = constituents.get(i).evaluate(input);
if (temp == Predicate.Eval.UNKNOWN) {
currentStatus = temp;
break;
} else {
if (currentStatus != temp) {
currentStatus = Predicate.Eval.TRUE;
} else {
currentStatus = Predicate.Eval.FALSE;
}
}
}
}
return currentStatus;
}
},
SURROGATE("surrogate") {
@Override
Predicate.Eval evaluate(ArrayList constituents,
double[] input) {
Predicate.Eval currentStatus = constituents.get(0).evaluate(input);
int i = 1;
while (currentStatus == Predicate.Eval.UNKNOWN) {
currentStatus = constituents.get(i).evaluate(input);
}
// return false if all our surrogates evaluate to unknown.
if (currentStatus == Predicate.Eval.UNKNOWN) {
currentStatus = Predicate.Eval.FALSE;
}
return currentStatus;
}
};
abstract Predicate.Eval evaluate(ArrayList constituents,
double[] input);
private final String m_stringVal;
BooleanOperator(String name) {
m_stringVal = name;
}
@Override
public String toString() {
return m_stringVal;
}
}
/** the constituent Predicates */
ArrayList m_components = new ArrayList();
/** the boolean operator */
BooleanOperator m_booleanOperator;
public CompoundPredicate(Element compoundP, MiningSchema miningSchema) throws Exception {
// Instances totalStructure = miningSchema.getFieldsAsInstances();
String booleanOpp = compoundP.getAttribute("booleanOperator");
for (BooleanOperator b : BooleanOperator.values()) {
if (b.toString().equals(booleanOpp)) {
m_booleanOperator = b;
}
}
// now get all the encapsulated operators
NodeList children = compoundP.getChildNodes();
for (int i = 0; i < children.getLength(); i++) {
Node child = children.item(i);
if (child.getNodeType() == Node.ELEMENT_NODE) {
String tagName = ((Element) child).getTagName();
if (tagName.equals("True")) {
m_components.add(new True());
} else if (tagName.equals("False")) {
m_components.add(new False());
} else if (tagName.equals("SimplePredicate")) {
m_components
.add(new SimplePredicate((Element) child, miningSchema));
} else if (tagName.equals("CompoundPredicate")) {
m_components.add(new CompoundPredicate((Element) child,
miningSchema));
} else {
m_components.add(new SimpleSetPredicate((Element) child,
miningSchema));
}
}
}
}
@Override
public Predicate.Eval evaluate(double[] input) {
return m_booleanOperator.evaluate(m_components, input);
}
@Override
public String toString() {
return toString(0, false);
}
@Override
public String toString(int level, boolean cr) {
StringBuffer text = new StringBuffer();
for (int j = 0; j < level; j++) {
text.append("| ");
}
text.append("Compound [" + m_booleanOperator.toString() + "]");
if (cr) {
text.append("\\n");
} else {
text.append("\n");
}
for (int i = 0; i < m_components.size(); i++) {
text.append(m_components.get(i).toString(level, cr).replace(":", ""));
if (i != m_components.size() - 1) {
if (cr) {
text.append("\\n");
} else {
text.append("\n");
}
}
}
return text.toString();
}
}
/**
* Class representing the SimpleSetPredicate
*/
static class SimpleSetPredicate extends Predicate {
/**
* For serialization
*/
private static final long serialVersionUID = -2711995401345708486L;
enum BooleanOperator {
IS_IN("isIn") {
@Override
Predicate.Eval evaluate(double[] input, int fieldIndex, Array set,
Attribute nominalLookup) {
if (set.getType() == Array.ArrayType.STRING) {
String value = "";
if (!Utils.isMissingValue(input[fieldIndex])) {
value = nominalLookup.value((int) input[fieldIndex]);
}
return Predicate.booleanToEval(
Utils.isMissingValue(input[fieldIndex]), set.contains(value));
} else if (set.getType() == Array.ArrayType.NUM
|| set.getType() == Array.ArrayType.REAL) {
return Predicate.booleanToEval(
Utils.isMissingValue(input[fieldIndex]),
set.contains(input[fieldIndex]));
}
return Predicate.booleanToEval(
Utils.isMissingValue(input[fieldIndex]),
set.contains((int) input[fieldIndex]));
}
},
IS_NOT_IN("isNotIn") {
@Override
Predicate.Eval evaluate(double[] input, int fieldIndex, Array set,
Attribute nominalLookup) {
Predicate.Eval result = IS_IN.evaluate(input, fieldIndex, set,
nominalLookup);
if (result == Predicate.Eval.FALSE) {
result = Predicate.Eval.TRUE;
} else if (result == Predicate.Eval.TRUE) {
result = Predicate.Eval.FALSE;
}
return result;
}
};
abstract Predicate.Eval evaluate(double[] input, int fieldIndex,
Array set, Attribute nominalLookup);
private final String m_stringVal;
BooleanOperator(String name) {
m_stringVal = name;
}
@Override
public String toString() {
return m_stringVal;
}
}
/** the field to reference */
int m_fieldIndex = -1;
/** the name of the field */
String m_fieldName;
/** is the referenced field nominal? */
boolean m_isNominal = false;
/** the attribute to lookup nominal values from */
Attribute m_nominalLookup;
/** the boolean operator */
BooleanOperator m_operator = BooleanOperator.IS_IN;
/** the array holding the set of values */
Array m_set;
public SimpleSetPredicate(Element setP, MiningSchema miningSchema) throws Exception {
Instances totalStructure = miningSchema.getFieldsAsInstances();
// get the field name and set up the index
String fieldS = setP.getAttribute("field");
Attribute att = totalStructure.attribute(fieldS);
if (att == null) {
throw new Exception("[SimplePredicate] unable to find field " + fieldS
+ " in the incoming instance structure!");
}
// find the index
int index = -1;
for (int i = 0; i < totalStructure.numAttributes(); i++) {
if (totalStructure.attribute(i).name().equals(fieldS)) {
index = i;
m_fieldName = totalStructure.attribute(i).name();
break;
}
}
m_fieldIndex = index;
if (att.isNominal()) {
m_isNominal = true;
m_nominalLookup = att;
}
// need to scan the children looking for an array type
NodeList children = setP.getChildNodes();
for (int i = 0; i < children.getLength(); i++) {
Node child = children.item(i);
if (child.getNodeType() == Node.ELEMENT_NODE) {
if (Array.isArray((Element) child)) {
// found the array
m_set = Array.create((Element) child);
break;
}
}
}
if (m_set == null) {
throw new Exception("[SimpleSetPredictate] couldn't find an "
+ "array containing the set values!");
}
// check array type against field type
if (m_set.getType() == Array.ArrayType.STRING && !m_isNominal) {
throw new Exception("[SimpleSetPredicate] referenced field "
+ totalStructure.attribute(m_fieldIndex).name()
+ " is numeric but array type is string!");
} else if (m_set.getType() != Array.ArrayType.STRING && m_isNominal) {
throw new Exception("[SimpleSetPredicate] referenced field "
+ totalStructure.attribute(m_fieldIndex).name()
+ " is nominal but array type is numeric!");
}
}
@Override
public Predicate.Eval evaluate(double[] input) {
return m_operator.evaluate(input, m_fieldIndex, m_set, m_nominalLookup);
}
@Override
public String toString() {
StringBuffer temp = new StringBuffer();
temp.append(m_fieldName + " " + m_operator.toString() + " ");
temp.append(m_set.toString());
return temp.toString();
}
}
/**
* Class for handling a Node in the tree
*/
class TreeNode implements Serializable {
// TODO: perhaps implement a class called Statistics that contains
// Partitions?
/**
* For serialization
*/
private static final long serialVersionUID = 3011062274167063699L;
/** ID for this node */
private String m_ID = "" + this.hashCode();
/** The score as a string */
private String m_scoreString;
/** The index of this predicted value (if class is nominal) */
private int m_scoreIndex = -1;
/** The score as a number (if target is numeric) */
private double m_scoreNumeric = Utils.missingValue();
/** The record count at this node (if defined) */
private double m_recordCount = Utils.missingValue();
/** The ID of the default child (if applicable) */
private String m_defaultChildID;
/** Holds the node of the default child (if defined) */
private TreeNode m_defaultChild;
/** The distribution for labels (classification) */
private final ArrayList m_scoreDistributions = new ArrayList();
/** The predicate for this node */
private final Predicate m_predicate;
/** The children of this node */
private final ArrayList m_childNodes = new ArrayList();
protected TreeNode(Element nodeE, MiningSchema miningSchema) throws Exception {
Attribute classAtt = miningSchema.getFieldsAsInstances().classAttribute();
// get the ID
String id = nodeE.getAttribute("id");
if (id != null && id.length() > 0) {
m_ID = id;
}
// get the score for this node
String scoreS = nodeE.getAttribute("score");
if (scoreS != null && scoreS.length() > 0) {
m_scoreString = scoreS;
// try to parse as a number in case we
// are part of a regression tree
if (classAtt.isNumeric()) {
try {
m_scoreNumeric = Double.parseDouble(scoreS);
} catch (NumberFormatException ex) {
throw new Exception(
"[TreeNode] class is numeric but unable to parse score "
+ m_scoreString + " as a number!");
}
} else {
// store the index of this class value
m_scoreIndex = classAtt.indexOfValue(m_scoreString);
if (m_scoreIndex < 0) {
throw new Exception(
"[TreeNode] can't find match for predicted value "
+ m_scoreString + " in class attribute!");
}
}
}
// get the record count if defined
String recordC = nodeE.getAttribute("recordCount");
if (recordC != null && recordC.length() > 0) {
m_recordCount = Double.parseDouble(recordC);
}
// get the default child (if applicable)
String defaultC = nodeE.getAttribute("defaultChild");
if (defaultC != null && defaultC.length() > 0) {
m_defaultChildID = defaultC;
}
// TODO: Embedded model (once we support model composition)
// Now get the ScoreDistributions (if any and mining function
// is classification) at this level
if (m_functionType == MiningFunction.CLASSIFICATION) {
getScoreDistributions(nodeE, miningSchema);
}
// Now get the Predicate
m_predicate = Predicate.getPredicate(nodeE, miningSchema);
// Now get the child Node(s)
getChildNodes(nodeE, miningSchema);
// If we have a default child specified, find it now
if (m_defaultChildID != null) {
for (TreeNode t : m_childNodes) {
if (t.getID().equals(m_defaultChildID)) {
m_defaultChild = t;
break;
}
}
}
}
private void getChildNodes(Element nodeE, MiningSchema miningSchema)
throws Exception {
NodeList children = nodeE.getChildNodes();
for (int i = 0; i < children.getLength(); i++) {
Node child = children.item(i);
if (child.getNodeType() == Node.ELEMENT_NODE) {
String tagName = ((Element) child).getTagName();
if (tagName.equals("Node")) {
TreeNode tempN = new TreeNode((Element) child, miningSchema);
m_childNodes.add(tempN);
}
}
}
}
private void getScoreDistributions(Element nodeE, MiningSchema miningSchema)
throws Exception {
NodeList scoreChildren = nodeE.getChildNodes();
for (int i = 0; i < scoreChildren.getLength(); i++) {
Node child = scoreChildren.item(i);
if (child.getNodeType() == Node.ELEMENT_NODE) {
String tagName = ((Element) child).getTagName();
if (tagName.equals("ScoreDistribution")) {
ScoreDistribution newDist = new ScoreDistribution((Element) child,
miningSchema, m_recordCount);
m_scoreDistributions.add(newDist);
}
}
}
// backfit the confidence values
if (Utils.isMissingValue(m_recordCount)) {
double baseCount = 0;
for (ScoreDistribution s : m_scoreDistributions) {
baseCount += s.getRecordCount();
}
for (ScoreDistribution s : m_scoreDistributions) {
s.deriveConfidenceValue(baseCount);
}
}
}
/**
* Get the score value as a string.
*
* @return the score value as a String.
*/
protected String getScore() {
return m_scoreString;
}
/**
* Get the score value as a number (regression trees only).
*
* @return the score as a number
*/
protected double getScoreNumeric() {
return m_scoreNumeric;
}
/**
* Get the ID of this node.
*
* @return the ID of this node.
*/
protected String getID() {
return m_ID;
}
/**
* Get the Predicate at this node.
*
* @return the predicate at this node.
*/
protected Predicate getPredicate() {
return m_predicate;
}
/**
* Get the record count at this node.
*
* @return the record count at this node.
*/
protected double getRecordCount() {
return m_recordCount;
}
protected void dumpGraph(StringBuffer text) throws Exception {
text.append("N" + m_ID + " ");
if (m_scoreString != null) {
text.append("[label=\"score=" + m_scoreString);
}
if (m_scoreDistributions.size() > 0 && m_childNodes.size() == 0) {
text.append("\\n");
for (ScoreDistribution s : m_scoreDistributions) {
text.append(s + "\\n");
}
}
text.append("\"");
if (m_childNodes.size() == 0) {
text.append(" shape=box style=filled");
}
text.append("]\n");
for (TreeNode c : m_childNodes) {
text.append("N" + m_ID + "->" + "N" + c.getID());
text.append(" [label=\"" + c.getPredicate().toString(0, true));
text.append("\"]\n");
c.dumpGraph(text);
}
}
@Override
public String toString() {
StringBuffer text = new StringBuffer();
// print out the root
dumpTree(0, text);
return text.toString();
}
protected void dumpTree(int level, StringBuffer text) {
if (m_childNodes.size() > 0) {
for (int i = 0; i < m_childNodes.size(); i++) {
text.append("\n");
/*
* for (int j = 0; j < level; j++) { text.append("| "); }
*/
// output the predicate for this child node
TreeNode child = m_childNodes.get(i);
text.append(child.getPredicate().toString(level, false));
// process recursively
child.dumpTree(level + 1, text);
}
} else {
// leaf
text.append(": ");
if (!Utils.isMissingValue(m_scoreNumeric)) {
text.append(m_scoreNumeric);
} else {
text.append(m_scoreString + " ");
if (m_scoreDistributions.size() > 0) {
text.append("[");
for (ScoreDistribution s : m_scoreDistributions) {
text.append(s);
}
text.append("]");
} else {
text.append(m_scoreString);
}
}
}
}
/**
* Score an incoming instance. Invokes a missing value handling strategy.
*
* @param instance a vector of incoming attribute and derived field values.
* @param classAtt the class attribute
* @return a predicted probability distribution.
* @throws Exception if something goes wrong.
*/
protected double[] score(double[] instance, Attribute classAtt)
throws Exception {
double[] preds = null;
if (classAtt.isNumeric()) {
preds = new double[1];
} else {
preds = new double[classAtt.numValues()];
}
// leaf?
if (m_childNodes.size() == 0) {
doLeaf(classAtt, preds);
} else {
// process the children
switch (TreeModel.this.m_missingValueStrategy) {
case NONE:
preds = missingValueStrategyNone(instance, classAtt);
break;
case LASTPREDICTION:
preds = missingValueStrategyLastPrediction(instance, classAtt);
break;
case DEFAULTCHILD:
preds = missingValueStrategyDefaultChild(instance, classAtt);
break;
default:
throw new Exception("[TreeModel] not implemented!");
}
}
return preds;
}
/**
* Compute the predictions for a leaf.
*
* @param classAtt the class attribute
* @param preds an array to hold the predicted probabilities.
* @throws Exception if something goes wrong.
*/
protected void doLeaf(Attribute classAtt, double[] preds) throws Exception {
if (classAtt.isNumeric()) {
preds[0] = m_scoreNumeric;
} else {
if (m_scoreDistributions.size() == 0) {
preds[m_scoreIndex] = 1.0;
} else {
// collect confidences from the score distributions
for (ScoreDistribution s : m_scoreDistributions) {
preds[s.getClassLabelIndex()] = s.getConfidence();
}
}
}
}
/**
* Evaluate on the basis of the no true child strategy.
*
* @param classAtt the class attribute.
* @param preds an array to hold the predicted probabilities.
* @throws Exception if something goes wrong.
*/
protected void doNoTrueChild(Attribute classAtt, double[] preds)
throws Exception {
if (TreeModel.this.m_noTrueChildStrategy == NoTrueChildStrategy.RETURNNULLPREDICTION) {
for (int i = 0; i < classAtt.numValues(); i++) {
preds[i] = Utils.missingValue();
}
} else {
// return the predictions at this node
doLeaf(classAtt, preds);
}
}
/**
* Compute predictions and optionally invoke the weighted confidence missing
* value handling strategy.
*
* @param instance the incoming vector of attribute and derived field
* values.
* @param classAtt the class attribute.
* @return the predicted probability distribution.
* @throws Exception if something goes wrong.
*/
protected double[] missingValueStrategyWeightedConfidence(
double[] instance, Attribute classAtt) throws Exception {
if (classAtt.isNumeric()) {
throw new Exception(
"[TreeNode] missing value strategy weighted confidence, "
+ "but class is numeric!");
}
double[] preds = null;
TreeNode trueNode = null;
boolean strategyInvoked = false;
int nodeCount = 0;
// look at the evaluation of the child predicates
for (TreeNode c : m_childNodes) {
if (c.getPredicate().evaluate(instance) == Predicate.Eval.TRUE) {
// note the first child to evaluate to true
if (trueNode == null) {
trueNode = c;
}
nodeCount++;
} else if (c.getPredicate().evaluate(instance) == Predicate.Eval.UNKNOWN) {
strategyInvoked = true;
nodeCount++;
}
}
if (strategyInvoked) {
// we expect to combine nodeCount distributions
double[][] dists = new double[nodeCount][];
double[] weights = new double[nodeCount];
// collect the distributions and weights
int count = 0;
for (TreeNode c : m_childNodes) {
if (c.getPredicate().evaluate(instance) == Predicate.Eval.TRUE
|| c.getPredicate().evaluate(instance) == Predicate.Eval.UNKNOWN) {
weights[count] = c.getRecordCount();
if (Utils.isMissingValue(weights[count])) {
throw new Exception(
"[TreeNode] weighted confidence missing value "
+ "strategy invoked, but no record count defined for node "
+ c.getID());
}
dists[count++] = c.score(instance, classAtt);
}
}
// do the combination
preds = new double[classAtt.numValues()];
for (int i = 0; i < classAtt.numValues(); i++) {
for (int j = 0; j < nodeCount; j++) {
preds[i] += ((weights[j] / m_recordCount) * dists[j][i]);
}
}
} else {
if (trueNode != null) {
preds = trueNode.score(instance, classAtt);
} else {
doNoTrueChild(classAtt, preds);
}
}
return preds;
}
protected double[] freqCountsForAggNodesStrategy(double[] instance,
Attribute classAtt) throws Exception {
double[] counts = new double[classAtt.numValues()];
if (m_childNodes.size() > 0) {
// collect the counts
for (TreeNode c : m_childNodes) {
if (c.getPredicate().evaluate(instance) == Predicate.Eval.TRUE
|| c.getPredicate().evaluate(instance) == Predicate.Eval.UNKNOWN) {
double[] temp = c.freqCountsForAggNodesStrategy(instance, classAtt);
for (int i = 0; i < classAtt.numValues(); i++) {
counts[i] += temp[i];
}
}
}
} else {
// process the score distributions
if (m_scoreDistributions.size() == 0) {
throw new Exception(
"[TreeModel] missing value strategy aggregate nodes:"
+ " no score distributions at leaf " + m_ID);
}
for (ScoreDistribution s : m_scoreDistributions) {
counts[s.getClassLabelIndex()] = s.getRecordCount();
}
}
return counts;
}
/**
* Compute predictions and optionally invoke the aggregate nodes missing
* value handling strategy.
*
* @param instance the incoming vector of attribute and derived field
* values.
* @param classAtt the class attribute.
* @return the predicted probability distribution.
* @throws Exception if something goes wrong.
*/
protected double[] missingValueStrategyAggregateNodes(double[] instance,
Attribute classAtt) throws Exception {
if (classAtt.isNumeric()) {
throw new Exception(
"[TreeNode] missing value strategy aggregate nodes, "
+ "but class is numeric!");
}
double[] preds = null;
TreeNode trueNode = null;
boolean strategyInvoked = false;
// look at the evaluation of the child predicates
for (TreeNode c : m_childNodes) {
if (c.getPredicate().evaluate(instance) == Predicate.Eval.TRUE) {
// note the first child to evaluate to true
if (trueNode == null) {
trueNode = c;
}
} else if (c.getPredicate().evaluate(instance) == Predicate.Eval.UNKNOWN) {
strategyInvoked = true;
}
}
if (strategyInvoked) {
double[] aggregatedCounts = freqCountsForAggNodesStrategy(instance,
classAtt);
// normalize
Utils.normalize(aggregatedCounts);
preds = aggregatedCounts;
} else {
if (trueNode != null) {
preds = trueNode.score(instance, classAtt);
} else {
doNoTrueChild(classAtt, preds);
}
}
return preds;
}
/**
* Compute predictions and optionally invoke the default child missing value
* handling strategy.
*
* @param instance the incoming vector of attribute and derived field
* values.
* @param classAtt the class attribute.
* @return the predicted probability distribution.
* @throws Exception if something goes wrong.
*/
protected double[] missingValueStrategyDefaultChild(double[] instance,
Attribute classAtt) throws Exception {
double[] preds = null;
boolean strategyInvoked = false;
// look for a child whose predicate evaluates to TRUE
for (TreeNode c : m_childNodes) {
if (c.getPredicate().evaluate(instance) == Predicate.Eval.TRUE) {
preds = c.score(instance, classAtt);
break;
} else if (c.getPredicate().evaluate(instance) == Predicate.Eval.UNKNOWN) {
strategyInvoked = true;
}
}
// no true child found
if (preds == null) {
if (!strategyInvoked) {
doNoTrueChild(classAtt, preds);
} else {
// do the strategy
// NOTE: we don't actually implement the missing value penalty since
// we always return a full probability distribution.
if (m_defaultChild != null) {
preds = m_defaultChild.score(instance, classAtt);
} else {
throw new Exception(
"[TreeNode] missing value strategy is defaultChild, but "
+ "no default child has been specified in node " + m_ID);
}
}
}
return preds;
}
/**
* Compute predictions and optionally invoke the last prediction missing
* value handling strategy.
*
* @param instance the incoming vector of attribute and derived field
* values.
* @param classAtt the class attribute.
* @return the predicted probability distribution.
* @throws Exception if something goes wrong.
*/
protected double[] missingValueStrategyLastPrediction(double[] instance,
Attribute classAtt) throws Exception {
double[] preds = null;
boolean strategyInvoked = false;
// look for a child whose predicate evaluates to TRUE
for (TreeNode c : m_childNodes) {
if (c.getPredicate().evaluate(instance) == Predicate.Eval.TRUE) {
preds = c.score(instance, classAtt);
break;
} else if (c.getPredicate().evaluate(instance) == Predicate.Eval.UNKNOWN) {
strategyInvoked = true;
}
}
// no true child found
if (preds == null) {
preds = new double[classAtt.numValues()];
if (!strategyInvoked) {
// no true child
doNoTrueChild(classAtt, preds);
} else {
// do the strategy
doLeaf(classAtt, preds);
}
}
return preds;
}
/**
* Compute predictions and optionally invoke the null prediction missing
* value handling strategy.
*
* @param instance the incoming vector of attribute and derived field
* values.
* @param classAtt the class attribute.
* @return the predicted probability distribution.
* @throws Exception if something goes wrong.
*/
protected double[] missingValueStrategyNullPrediction(double[] instance,
Attribute classAtt) throws Exception {
double[] preds = null;
boolean strategyInvoked = false;
// look for a child whose predicate evaluates to TRUE
for (TreeNode c : m_childNodes) {
if (c.getPredicate().evaluate(instance) == Predicate.Eval.TRUE) {
preds = c.score(instance, classAtt);
break;
} else if (c.getPredicate().evaluate(instance) == Predicate.Eval.UNKNOWN) {
strategyInvoked = true;
}
}
// no true child found
if (preds == null) {
preds = new double[classAtt.numValues()];
if (!strategyInvoked) {
doNoTrueChild(classAtt, preds);
} else {
// do the strategy
for (int i = 0; i < classAtt.numValues(); i++) {
preds[i] = Utils.missingValue();
}
}
}
return preds;
}
/**
* Compute predictions and optionally invoke the "none" missing value
* handling strategy (invokes no true child).
*
* @param instance the incoming vector of attribute and derived field
* values.
* @param classAtt the class attribute.
* @return the predicted probability distribution.
* @throws Exception if something goes wrong.
*/
protected double[] missingValueStrategyNone(double[] instance,
Attribute classAtt) throws Exception {
double[] preds = null;
// look for a child whose predicate evaluates to TRUE
for (TreeNode c : m_childNodes) {
if (c.getPredicate().evaluate(instance) == Predicate.Eval.TRUE) {
preds = c.score(instance, classAtt);
break;
}
}
if (preds == null) {
preds = new double[classAtt.numValues()];
// no true child strategy
doNoTrueChild(classAtt, preds);
}
return preds;
}
}
/**
* Enumerated type for the mining function
*/
enum MiningFunction {
CLASSIFICATION, REGRESSION;
}
enum MissingValueStrategy {
LASTPREDICTION("lastPrediction"), NULLPREDICTION("nullPrediction"), DEFAULTCHILD(
"defaultChild"), WEIGHTEDCONFIDENCE("weightedConfidence"), AGGREGATENODES(
"aggregateNodes"), NONE("none");
private final String m_stringVal;
MissingValueStrategy(String name) {
m_stringVal = name;
}
@Override
public String toString() {
return m_stringVal;
}
}
enum NoTrueChildStrategy {
RETURNNULLPREDICTION("returnNullPrediction"), RETURNLASTPREDICTION(
"returnLastPrediction");
private final String m_stringVal;
NoTrueChildStrategy(String name) {
m_stringVal = name;
}
@Override
public String toString() {
return m_stringVal;
}
}
enum SplitCharacteristic {
BINARYSPLIT("binarySplit"), MULTISPLIT("multiSplit");
private final String m_stringVal;
SplitCharacteristic(String name) {
m_stringVal = name;
}
@Override
public String toString() {
return m_stringVal;
}
}
/** The mining function */
protected MiningFunction m_functionType = MiningFunction.CLASSIFICATION;
/** The missing value strategy */
protected MissingValueStrategy m_missingValueStrategy = MissingValueStrategy.NONE;
/**
* The missing value penalty (if defined). We don't actually make use of this
* since we always return full probability distributions.
*/
protected double m_missingValuePenalty = Utils.missingValue();
/** The no true child strategy to use */
protected NoTrueChildStrategy m_noTrueChildStrategy = NoTrueChildStrategy.RETURNNULLPREDICTION;
/** The splitting type */
protected SplitCharacteristic m_splitCharacteristic = SplitCharacteristic.MULTISPLIT;
/** The root of the tree */
protected TreeNode m_root;
public TreeModel(Element model, Instances dataDictionary,
MiningSchema miningSchema) throws Exception {
super(dataDictionary, miningSchema);
if (!getPMMLVersion().equals("3.2")) {
// TODO: might have to throw an exception and only support 3.2
}
String fn = model.getAttribute("functionName");
if (fn.equals("regression")) {
m_functionType = MiningFunction.REGRESSION;
}
// get the missing value strategy (if any)
String missingVS = model.getAttribute("missingValueStrategy");
if (missingVS != null && missingVS.length() > 0) {
for (MissingValueStrategy m : MissingValueStrategy.values()) {
if (m.toString().equals(missingVS)) {
m_missingValueStrategy = m;
break;
}
}
}
// get the missing value penalty (if any)
String missingP = model.getAttribute("missingValuePenalty");
if (missingP != null && missingP.length() > 0) {
// try to parse as a number
try {
m_missingValuePenalty = Double.parseDouble(missingP);
} catch (NumberFormatException ex) {
System.err.println("[TreeModel] WARNING: "
+ "couldn't parse supplied missingValuePenalty as a number");
}
}
String splitC = model.getAttribute("splitCharacteristic");
if (splitC != null && splitC.length() > 0) {
for (SplitCharacteristic s : SplitCharacteristic.values()) {
if (s.toString().equals(splitC)) {
m_splitCharacteristic = s;
break;
}
}
}
// find the root node of the tree
NodeList children = model.getChildNodes();
for (int i = 0; i < children.getLength(); i++) {
Node child = children.item(i);
if (child.getNodeType() == Node.ELEMENT_NODE) {
String tagName = ((Element) child).getTagName();
if (tagName.equals("Node")) {
m_root = new TreeNode((Element) child, miningSchema);
break;
}
}
}
}
/**
* Classifies the given test instance. The instance has to belong to a dataset
* when it's being classified.
*
* @param inst the instance to be classified
* @return the predicted most likely class for the instance or
* Utils.missingValue() if no prediction is made
* @exception Exception if an error occurred during the prediction
*/
@Override
public double[] distributionForInstance(Instance inst) throws Exception {
if (!m_initialized) {
mapToMiningSchema(inst.dataset());
}
double[] preds = null;
if (m_miningSchema.getFieldsAsInstances().classAttribute().isNumeric()) {
preds = new double[1];
} else {
preds = new double[m_miningSchema.getFieldsAsInstances().classAttribute()
.numValues()];
}
double[] incoming = m_fieldsMap.instanceToSchema(inst, m_miningSchema);
preds = m_root.score(incoming, m_miningSchema.getFieldsAsInstances()
.classAttribute());
return preds;
}
@Override
public String toString() {
StringBuffer temp = new StringBuffer();
temp.append("PMML version " + getPMMLVersion());
if (!getCreatorApplication().equals("?")) {
temp.append("\nApplication: " + getCreatorApplication());
}
temp.append("\nPMML Model: TreeModel");
temp.append("\n\n");
temp.append(m_miningSchema);
temp.append("Split-type: " + m_splitCharacteristic + "\n");
temp.append("No true child strategy: " + m_noTrueChildStrategy + "\n");
temp.append("Missing value strategy: " + m_missingValueStrategy + "\n");
temp.append(m_root.toString());
return temp.toString();
}
@Override
public String graph() throws Exception {
StringBuffer text = new StringBuffer();
text.append("digraph PMMTree {\n");
m_root.dumpGraph(text);
text.append("}\n");
return text.toString();
}
@Override
public String getRevision() {
return RevisionUtils.extract("$Revision: 10153 $");
}
@Override
public int graphType() {
return Drawable.TREE;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy