org.jpmml.rexp.RangerConverter Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of pmml-rexp Show documentation
Show all versions of pmml-rexp Show documentation
JPMML R to PMML converter
The newest version!
/*
* Copyright (c) 2016 Villu Ruusmann
*
* This file is part of JPMML-R
*
* JPMML-R is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* JPMML-R is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with JPMML-R. If not, see <http://www.gnu.org/licenses/>.
*/
package org.jpmml.rexp;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import org.dmg.pmml.DataField;
import org.dmg.pmml.DataType;
import org.dmg.pmml.MiningFunction;
import org.dmg.pmml.OpType;
import org.dmg.pmml.Predicate;
import org.dmg.pmml.ScoreDistribution;
import org.dmg.pmml.ScoreProbability;
import org.dmg.pmml.SimplePredicate;
import org.dmg.pmml.True;
import org.dmg.pmml.mining.MiningModel;
import org.dmg.pmml.mining.Segmentation;
import org.dmg.pmml.tree.BranchNode;
import org.dmg.pmml.tree.ClassifierNode;
import org.dmg.pmml.tree.LeafNode;
import org.dmg.pmml.tree.Node;
import org.dmg.pmml.tree.TreeModel;
import org.jpmml.converter.CategoricalFeature;
import org.jpmml.converter.CategoricalLabel;
import org.jpmml.converter.CategoryManager;
import org.jpmml.converter.ContinuousFeature;
import org.jpmml.converter.Feature;
import org.jpmml.converter.FeatureImportanceMap;
import org.jpmml.converter.ModelUtil;
import org.jpmml.converter.Schema;
import org.jpmml.converter.ValueUtil;
import org.jpmml.converter.mining.MiningModelUtil;
public class RangerConverter extends TreeModelConverter implements HasFeatureImportances {
/**
 * Index-shift flag, assigned in {@code encodeSchema}.
 * It is {@code true} when the forest's {@code is.ordered} vector has exactly one more element
 * than {@code independent.variable.names}. In that case {@code is.ordered} is read at position {@code i + 1}
 * for the i-th independent variable, and split variable IDs are decremented by one before
 * being used as feature indices.
 * NOTE(review): presumably the extra slot belongs to the dependent variable - confirm against the ranger object layout.
 */
boolean hasDependentVar = false;
/**
 * Creates a converter around the given R object.
 *
 * @param ranger The R object to convert; handed to the superclass constructor as is.
 */
public RangerConverter(RGenericVector ranger) {
	super(ranger);
}
/**
 * Registers the target field and the input fields of the model with the encoder.
 *
 * <p>
 * The target field is named {@code _target}. It is a continuous double field for the
 * {@code Regression} tree type, and a categorical field over the forest's {@code levels}
 * for the {@code Classification} and {@code Probability estimation} tree types.
 * </p>
 *
 * <p>
 * Each entry of the forest's {@code independent.variable.names} becomes a feature:
 * a categorical string field if {@code variable.levels} has an element of that name,
 * a continuous double field otherwise.
 * As a side effect, {@code hasDependentVar} is set by comparing the sizes of
 * {@code is.ordered} and {@code independent.variable.names}.
 * </p>
 *
 * @param encoder The encoder that receives the label and the features.
 *
 * @throws IllegalArgumentException If the {@code forest} element is missing,
 * if the tree type is not one of the three supported values,
 * or if an independent variable is not flagged in {@code is.ordered}.
 */
@Override
public void encodeSchema(RExpEncoder encoder){
	RGenericVector ranger = getObject();

	RGenericVector forest = ranger.getGenericElement("forest", false);
	if(forest == null){
		throw new IllegalArgumentException("Missing 'forest' element. Please re-train the model object with 'write.forest' argument set to TRUE");
	}

	RStringVector treeType = ranger.getStringElement("treetype");
	RGenericVector variableLevels = DecorationUtil.getGenericElement(ranger, "variable.levels");

	{
		String name = "_target";
		String treeTypeName = treeType.asScalar();

		DataField dataField;

		switch(treeTypeName){
			case "Regression":
				{
					dataField = encoder.createDataField(name, OpType.CONTINUOUS, DataType.DOUBLE);
				}
				break;
			case "Classification":
			case "Probability estimation":
				{
					RStringVector levels = forest.getStringElement("levels");

					dataField = encoder.createDataField(name, OpType.CATEGORICAL, null, levels.getValues());
				}
				break;
			default:
				throw new IllegalArgumentException("Unsupported tree type '" + treeTypeName + "'");
		}

		encoder.setLabel(dataField);
	}

	RBooleanVector isOrdered = forest.getBooleanElement("is.ordered");
	RStringVector independentVariableNames = forest.getStringElement("independent.variable.names");

	// One extra "is.ordered" entry means that its positions are shifted by one relative to the independent variables
	this.hasDependentVar = (isOrdered.size() == (independentVariableNames.size() + 1));

	for(int i = 0; i < independentVariableNames.size(); i++){
		String independentVariableName = independentVariableNames.getValue(i);

		if(!isOrdered.getValue(this.hasDependentVar ? (i + 1) : i)){
			throw new IllegalArgumentException("Independent variable '" + independentVariableName + "' is not flagged as ordered in the 'is.ordered' element");
		}

		String name = independentVariableName;

		DataField dataField;

		if(variableLevels.hasElement(independentVariableName)){
			RStringVector levels = variableLevels.getStringElement(independentVariableName);

			dataField = encoder.createDataField(name, OpType.CATEGORICAL, DataType.STRING, levels.getValues());
		} else

		{
			dataField = encoder.createDataField(name, OpType.CONTINUOUS, DataType.DOUBLE);
		}

		encoder.addFeature(dataField);
	}
}
/**
 * Encodes the forest as a mining model, dispatching on the {@code treetype} element.
 *
 * @param schema The schema holding the label and the features.
 *
 * @return The mining model produced by the regression, classification or
 * probability forest encoder, depending on the tree type.
 *
 * @throws IllegalArgumentException If the tree type is not {@code Regression},
 * {@code Classification} or {@code Probability estimation}.
 */
@Override
public MiningModel encodeModel(Schema schema){
	RGenericVector ranger = getObject();

	RStringVector treeType = ranger.getStringElement("treetype");
	RGenericVector forest = ranger.getGenericElement("forest");

	String treeTypeName = treeType.asScalar();

	switch(treeTypeName){
		case "Regression":
			return encodeRegression(forest, schema);
		case "Classification":
			return encodeClassification(forest, schema);
		case "Probability estimation":
			return encodeProbabilityForest(forest, schema);
		default:
			throw new IllegalArgumentException("Unsupported tree type '" + treeTypeName + "'");
	}
}
@Override
public FeatureImportanceMap getFeatureImportances(Schema schema){
RGenericVector ranger = getObject();
RStringVector importanceMode = ranger.getStringElement("importance.mode", false);
RDoubleVector variableImportance = ranger.getDoubleElement("variable.importance", false);
if(variableImportance == null){
return null;
}
List extends Feature> features = schema.getFeatures();
FeatureImportanceMap result = new FeatureImportanceMap(importanceMode != null ? importanceMode.asScalar() : null);
for(int i = 0; i < features.size(); i++){
result.put(features.get(i), variableImportance.getValue(i));
}
return result;
}
private MiningModel encodeRegression(RGenericVector forest, Schema schema){
ScoreEncoder scoreEncoder = new ScoreEncoder(){
@Override
public Node encode(Node node, Number splitValue, RNumberVector> terminalClassCount){
node.setScore(splitValue);
return node;
}
};
List treeModels = encodeForest(forest, MiningFunction.REGRESSION, scoreEncoder, schema);
MiningModel miningModel = new MiningModel(MiningFunction.REGRESSION, ModelUtil.createMiningSchema(schema.getLabel()))
.setSegmentation(MiningModelUtil.createSegmentation(Segmentation.MultipleModelMethod.AVERAGE, Segmentation.MissingPredictionTreatment.RETURN_MISSING, treeModels));
return miningModel;
}
private MiningModel encodeClassification(RGenericVector forest, Schema schema){
RStringVector levels = forest.getStringElement("levels");
ScoreEncoder scoreEncoder = new ScoreEncoder(){
@Override
public Node encode(Node node, Number splitValue, RNumberVector> terminalClassCount){
int index = ValueUtil.asInt(splitValue);
if(terminalClassCount != null){
throw new IllegalArgumentException();
}
node.setScore(levels.getValue(index - 1));
return node;
}
};
List treeModels = encodeForest(forest, MiningFunction.CLASSIFICATION, scoreEncoder, schema);
MiningModel miningModel = new MiningModel(MiningFunction.CLASSIFICATION, ModelUtil.createMiningSchema(schema.getLabel()))
.setSegmentation(MiningModelUtil.createSegmentation(Segmentation.MultipleModelMethod.MAJORITY_VOTE, Segmentation.MissingPredictionTreatment.RETURN_MISSING, treeModels));
return miningModel;
}
private MiningModel encodeProbabilityForest(RGenericVector forest, Schema schema){
RStringVector levels = forest.getStringElement("levels");
CategoricalLabel categoricalLabel = (CategoricalLabel)schema.getLabel();
ScoreEncoder scoreEncoder = new ScoreEncoder(){
@Override
public Node encode(Node node, Number splitValue, RNumberVector> terminalClassCount){
if(splitValue.doubleValue() != 0d || (terminalClassCount == null)){
throw new IllegalArgumentException();
}
RVectorUtil.checkSize(levels, terminalClassCount);
node = new ClassifierNode(node);
List scoreDistributions = node.getScoreDistributions();
Number maxProbability = null;
for(int i = 0; i < terminalClassCount.size(); i++){
String value = levels.getValue(i);
Number probability = terminalClassCount.getValue(i);
if(maxProbability == null || ((Comparable)maxProbability).compareTo(probability) < 0){
node.setScore(value);
maxProbability = probability;
}
ScoreDistribution scoreDistribution = new ScoreProbability(value, null, probability);
scoreDistributions.add(scoreDistribution);
}
return node;
}
};
List treeModels = encodeForest(forest, MiningFunction.CLASSIFICATION, scoreEncoder, schema);
MiningModel miningModel = new MiningModel(MiningFunction.CLASSIFICATION, ModelUtil.createMiningSchema(categoricalLabel))
.setSegmentation(MiningModelUtil.createSegmentation(Segmentation.MultipleModelMethod.AVERAGE, Segmentation.MissingPredictionTreatment.RETURN_MISSING, treeModels))
.setOutput(ModelUtil.createProbabilityOutput(DataType.DOUBLE, categoricalLabel));
return miningModel;
}
/**
 * Encodes every tree of the forest as a separate tree model.
 *
 * <p>
 * The i-th tree is built from the i-th elements of {@code child.nodeIDs}, {@code split.varIDs},
 * {@code split.values} and, if present, {@code terminal.class.counts}.
 * All trees share one anonymous version of the schema.
 * </p>
 *
 * @param forest The {@code forest} element of the model object.
 * @param miningFunction The mining function of the tree models.
 * @param scoreEncoder The callback that finalizes leaf nodes.
 * @param schema The schema holding the label and the features.
 *
 * @return The tree models, in forest order; their count is the {@code num.trees} value.
 */
private List<TreeModel> encodeForest(RGenericVector forest, MiningFunction miningFunction, ScoreEncoder scoreEncoder, Schema schema){
	RNumberVector<?> numTrees = forest.getNumericElement("num.trees");
	RGenericVector childNodeIDs = forest.getGenericElement("child.nodeIDs");
	RGenericVector splitVarIDs = forest.getGenericElement("split.varIDs");
	RGenericVector splitValues = forest.getGenericElement("split.values");
	RGenericVector terminalClassCounts = forest.getGenericElement("terminal.class.counts", false);

	Schema segmentSchema = schema.toAnonymousSchema();

	// Loop-invariant; resolved once instead of on every iteration
	int treeCount = ValueUtil.asInt(numTrees.asScalar());

	List<TreeModel> treeModels = new ArrayList<>();

	for(int i = 0; i < treeCount; i++){
		TreeModel treeModel = encodeTreeModel(miningFunction, scoreEncoder, childNodeIDs.getGenericValue(i), splitVarIDs.getNumericValue(i), splitValues.getNumericValue(i), (terminalClassCounts != null ? terminalClassCounts.getGenericValue(i) : null), segmentSchema);

		treeModels.add(treeModel);
	}

	return treeModels;
}
private TreeModel encodeTreeModel(MiningFunction miningFunction, ScoreEncoder scoreEncoder, RGenericVector childNodeIDs, RNumberVector> splitVarIDs, RNumberVector> splitValues, RGenericVector terminalClassCounts, Schema schema){
RNumberVector> leftChildIDs = childNodeIDs.getNumericValue(0);
RNumberVector> rightChildIDs = childNodeIDs.getNumericValue(1);
Node root = encodeNode(True.INSTANCE, 0, scoreEncoder, leftChildIDs, rightChildIDs, splitVarIDs, splitValues, terminalClassCounts, new CategoryManager(), schema);
TreeModel treeModel = new TreeModel(miningFunction, ModelUtil.createMiningSchema(schema.getLabel()), root)
.setSplitCharacteristic(TreeModel.SplitCharacteristic.BINARY_SPLIT);
return treeModel;
}
private Node encodeNode(Predicate predicate, int index, ScoreEncoder scoreEncoder, RNumberVector> leftChildIDs, RNumberVector> rightChildIDs, RNumberVector> splitVarIDs, RNumberVector> splitValues, RGenericVector terminalClassCounts, CategoryManager categoryManager, Schema schema){
int leftIndex = ValueUtil.asInt(leftChildIDs.getValue(index));
int rightIndex = ValueUtil.asInt(rightChildIDs.getValue(index));
Number splitValue = splitValues.getValue(index);
RNumberVector> terminalClassCount = (terminalClassCounts != null ? terminalClassCounts.getNumericValue(index) : null);
if(leftIndex == 0 && rightIndex == 0){
Node result = new LeafNode(null, predicate);
return scoreEncoder.encode(result, splitValue, terminalClassCount);
}
CategoryManager leftCategoryManager = categoryManager;
CategoryManager rightCategoryManager = categoryManager;
Predicate leftPredicate;
Predicate rightPredicate;
int splitVarIndex = ValueUtil.asInt(splitVarIDs.getValue(index));
Feature feature = schema.getFeature(this.hasDependentVar ? (splitVarIndex - 1) : splitVarIndex);
if(feature instanceof CategoricalFeature){
CategoricalFeature categoricalFeature = (CategoricalFeature)feature;
int splitLevelIndex = ValueUtil.asInt(Math.floor(splitValue.doubleValue()));
String name = categoricalFeature.getName();
List> values = categoricalFeature.getValues();
java.util.function.Predicate