// org.pmml4s.model.MiningModel.scala
// Artifact: pmml4s_2.12 - A PMML scoring library in Scala (the newest version).
// Repository-browser navigation text that is not part of the source file:
// "Maven / Gradle / Ivy", "Go to download", "Show more of this group", "Show more artifacts with this name",
// "Show all versions of pmml4s_2.12", "Show documentation".
/*
* Copyright (c) 2017-2024 AutoDeployAI
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.pmml4s.model
import org.pmml4s.common.MiningFunction.MiningFunction
import org.pmml4s.common.Predication.Predication
import org.pmml4s.common._
import org.pmml4s.data.{DataVal, Series}
import org.pmml4s.metadata._
import org.pmml4s.model.MissingPredictionTreatment.MissingPredictionTreatment
import org.pmml4s.model.MultipleModelMethod.MultipleModelMethod
import org.pmml4s.transformations.LocalTransformations
import org.pmml4s.util.{MathUtils, Utils}
import scala.collection._
/**
 * The element MiningModel allows precise specification of the usage of multiple models within one PMML file.
 * The two main approaches are Model Composition, and Segmentation.
 *
 * Model Composition includes model sequencing and model selection but is only applicable to Tree and Regression models.
 * Segmentation allows representation of different models for different data segments and also can be used for model
 * ensembles and model sequences. Scoring a case using a model ensemble consists of scoring it using each model
 * separately, then combining the results into a single scoring result using one of the pre-defined combination methods.
 * Scoring a case using a sequence, or chain, of models allows the output of one model to be passed in as input to
 * subsequent models.
 *
 * ModelComposition uses "embedded model elements" that are defeatured copies of "standalone model elements" --
 * specifically, Regression for RegressionModel, DecisionTree for TreeModel. Besides being limited to Regression and
 * Tree models, these embedded model elements lack key features like a MiningSchema (essential to manage scope across
 * multiple model elements). Therefore, in PMML 4.2, the Model Composition approach has been deprecated since the
 * Segmentation approach allows for a wider range of models to be used more reliably. For more on deprecation, see
 * Conformance.
 *
 * Segmentation is accomplished by using any PMML model element inside of a Segment element, which also contains a
 * PREDICATE and an optional weight. MiningModel then contains Segmentation element with a number of Segment elements
 * as well as the attribute multipleModelMethod specifying how all the models applicable to a record should be combined.
 * It is also possible to use a combination of model composition and segmentation approaches, using simple regression or
 * decision trees for data preprocessing before segmentation.
 *
 * Only the Segmentation approach is implemented by this class: `predict` and `defaultOutputFields` end in `???`
 * (NotImplementedError) when no segmentation is defined.
 *
 * Construction fails with an IllegalArgumentException (via `require`) when the multiple model method of the
 * segmentation is not supported for the mining function of this model.
 *
 * @param parent                the parent model
 * @param attributes            the model attributes
 * @param miningSchema          the mining schema
 * @param embeddedModels        the embedded models of the Model Composition approach; they are not scored here
 * @param segmentation          the segmentation (segments plus the multiple model method), if any
 * @param output                the optional Output element
 * @param targets               the optional Targets element
 * @param localTransformations  the optional LocalTransformations element
 * @param modelStats            the optional ModelStats element
 * @param modelExplanation      the optional ModelExplanation element
 * @param modelVerification     the optional ModelVerification element
 * @param extensions            the extension elements
 */
class MiningModel(
  var parent: Model,
  override val attributes: ModelAttributes,
  override val miningSchema: MiningSchema,
  val embeddedModels: Array[EmbeddedModel],
  val segmentation: Option[Segmentation],
  override val output: Option[Output] = None,
  override val targets: Option[Targets] = None,
  override val localTransformations: Option[LocalTransformations] = None,
  override val modelStats: Option[ModelStats] = None,
  override val modelExplanation: Option[ModelExplanation] = None,
  override val modelVerification: Option[ModelVerification] = None,
  override val extensions: immutable.Seq[Extension] = immutable.Seq.empty)
  extends Model with HasWrappedModelAttributes {

  import MultipleModelMethod._

  require(segmentation.isEmpty || MultipleModelMethod.support(segmentation.get.multipleModelMethod, functionName),
    s"The multiple model method '${segmentation.get.multipleModelMethod}' is not supported by the ${functionName} model")

  segmentation.foreach(_.setParent(this))

  /** Ids of the segments that are referenced by an output field of this model. */
  private val segmentOutputs = candidateOutputFields.filter(_.segmentId.isDefined).map(_.segmentId.get).toSet

  /**
   * Whether the segment weights take part in the combination. The vendor-prefixed `x-weightedMedian` and
   * `x-weightedSum` are included as well: `predict` routes them to the very same weighted formulas as
   * `weightedMedian` and `weightedSum`, so they must not run with all weights fixed to 1.0.
   */
  private val isWeighted = isSegmentation && (Set(`weightedMajorityVote`, `weightedAverage`, `weightedMedian`,
    `weightedSum`, `x-weightedMedian`, `x-weightedSum`) contains segmentation.get.multipleModelMethod)

  /**
   * Tells whether the given (aggregating) method combines the predicted values of the child models: always for
   * regression, and for classification or clustering only when voting. Otherwise class probabilities are combined.
   */
  private def combinesPredictedValues(method: MultipleModelMethod.Value): Boolean =
    isRegression || ((isClassification || isClustering) && (method == majorityVote || method == weightedMajorityVote))

  // enable the predicted value if it's not
  if (isSegmentation &&
    (!Set(`selectFirst`, `selectAll`, `modelChain`).contains(segmentation.get.multipleModelMethod)) &&
    combinesPredictedValues(segmentation.get.multipleModelMethod)) {
    segmentation.get.segments.foreach(x => {
      val idx = x.model.outputIndex(ResultFeature.predictedValue)
      if (idx == -1) {
        x.model.setSupplementOutput(true)
      }
    })
  }

  /** Model element type. */
  override def modelElement: ModelElement = ModelElement.MiningModel

  /**
   * Predicts values for a given data series.
   *
   * The input is first passed through `prepare`; `nullSeries` is returned when it reports an invalid record.
   * Then, depending on the multiple model method of the segmentation:
   *  - `selectFirst`: the result of the first segment whose predicate fires, `nullSeries` if there is none;
   *  - `selectAll`: the merged results of all segments, the null series of its model for a segment not fired;
   *  - `modelChain`: the segments are run in order, the outputs of each one being merged into the input of the
   *    following ones; the result of the last fired segment is returned, post-processed by `result` only when
   *    this model defines its own output;
   *  - any other method: the fired segments are scored and combined according to the method and the missing
   *    prediction treatment; `nullSeries` is returned when no segment fires.
   *
   * @param values the input series
   * @return the output series
   */
  override def predict(values: Series): Series = {
    val (series, returnInvalid) = prepare(values)
    if (returnInvalid) {
      return nullSeries
    }

    val outputs = createOutputs()

    // Segmentation
    if (isSegmentation) {
      val seg = segmentation.get
      val segments = seg.segments
      import MissingPredictionTreatment._

      seg.multipleModelMethod match {
        case `selectFirst` =>
          segments.find(x => Predication.fire(x.eval(series))).map(_.predict(series)).getOrElse(nullSeries)

        case `selectAll` =>
          val all = segments.map(x => if (Predication.fire(x.eval(series))) x.predict(series) else
            x.model.nullSeries)
          Series.merge(all)

        case `modelChain` =>
          var last: Series = nullSeries
          var lastOutputFields: Array[OutputField] = Array.empty
          var in = series
          var i = 0
          while (i < segments.length) {
            val segment = segments(i)
            val out = if (Predication.fire(segment.eval(in))) {
              last = segment.predict(in)
              lastOutputFields = segment.model.outputFields
              last
            } else {
              segment.model.nullSeries
            }

            // Keep the result of this segment if some output field of the ensemble refers to it
            segment.id.foreach(x => if (segmentOutputs.contains(x)) outputs.putSegment(x, out))

            // The outputs of this segment are visible to all subsequent segments
            in = Series.merge(in, out)
            i += 1
          }

          if (output.isDefined) {
            // Handle the results further when outputs defined in the ensemble model
            val probabilities = mutable.Map.empty[DataVal, Double]
            last.toSeq.zip(lastOutputFields).foreach { case (value, field) =>
              field.feature match {
                case ResultFeature.predictedValue => outputs.predictedValue = value
                case ResultFeature.probability    => field.value.foreach(category => {
                  probabilities += (category -> value.toDouble)
                })
                case _                            =>
              }
            }
            outputs.probabilities = probabilities.toMap

            // Derive the predicted value from probabilities if it's not set
            if (outputs.probabilities.nonEmpty && outputs.predictedValue == null) {
              outputs.evalPredictedValueByProbabilities()
            }

            result(last, outputs)
          } else last

        case method =>
          val selections = seg.filter(series)
          if (selections.isEmpty) return nullSeries

          if (combinesPredictedValues(method)) {
            // The child model could have different output schema
            val outSeries = selections.map(x => (x.predict(series), x))
            var predictions = outSeries.map(x => x._1.get(x._2.model.predictedValueIndex))

            // Compute model weights, a variable weight can refer to an output of its own segment
            var weights = if (isWeighted) outSeries.map(x => x._2.weight(Series.merge(series, x._1))) else Array.fill(selections.length)(1.0)

            // Check missing results
            seg.missingPredictionTreatment match {
              case `returnMissing` => if (predictions.exists(x => Utils.isMissing(x))) return nullSeries
              case `skipSegment`   =>
                if (predictions.exists(x => Utils.isMissing(x))) {
                  val missingWeight = weights.zip(predictions).filter(x => Utils.isMissing(x._2)).map(x => x._1).sum
                  val totalWeight = weights.sum
                  if (missingWeight / totalWeight > seg.missingThreshold) return nullSeries

                  // Drop the segments with missing results, weights and predictions stay aligned
                  weights = predictions.zip(weights).filter(x => Utils.nonMissing(x._1)).map(_._2)
                  predictions = predictions.filter(x => Utils.nonMissing(x))
                }
              case `continue`      =>
            }

            if (isRegression) {
              val realPredictions = predictions.map(_.toDouble)
              val predictedValue = method match {
                case `average`                             =>
                  realPredictions.sum / realPredictions.length.toDouble
                case `weightedAverage`                     =>
                  MathUtils.product(realPredictions, weights) / weights.sum
                case `median`                              =>
                  MathUtils.median(realPredictions)
                case `weightedMedian` | `x-weightedMedian` =>
                  MathUtils.weightedMedian(realPredictions, weights)
                case `sum`                                 =>
                  realPredictions.sum
                case `weightedSum` | `x-weightedSum`       =>
                  MathUtils.product(realPredictions, weights)
              }
              outputs.setPredictedValue(predictedValue)
            } else {
              // Convert predictions to the data type of target, because its type could be different from its target
              // in some cases, the probabilities could all become 0 if both data types not match.
              val dataTypeWanted = if (classes.nonEmpty) Utils.inferDataType(classes(0)) else UnresolvedDataType
              // NOTE(review): the (weighted) votes are divided by the number of predictions, not by the sum of
              // weights, so weighted proportions only sum to 1 when the weights average 1 -- confirm the intent.
              val probabilities = Utils.reduceByKey(predictions.zip(weights)).map(x =>
                (Utils.toDataVal(x._1, dataTypeWanted), x._2 / predictions.length)).withDefaultValue(0.0)
              outputs.setProbabilities(classes.map(x => (x, probabilities(x))).toMap).evalPredictedValueByProbabilities()
            }
          } else if (isClassification) {
            // Suppose all child model has the same output schema
            val outSeries = selections.map(_.predict(series))
            val probabilityIndexes = classes.map(x => selections.head.model.outputIndex(ResultFeature.probability, Some(x)))

            // Get the probabilities of each class, one row per selected segment
            var probabilities = outSeries.map(x => probabilityIndexes.map(y => if (y != -1) x.get(y).toDouble else Double.NaN))

            // Compute model weights
            var weights = if (isWeighted) outSeries.zip(selections).map(x => x._2.weight(Series.merge(series, x._1))) else Array.fill(selections.length)(1.0)

            // Check missing results
            seg.missingPredictionTreatment match {
              case `returnMissing` => if (probabilities.exists(x => Utils.anyMissing(x))) return nullSeries
              case `skipSegment`   =>
                if (probabilities.exists(x => Utils.anyMissing(x))) {
                  val missingWeight = weights.zip(probabilities).filter(x => Utils.anyMissing(x._2)).map(x => x._1).sum
                  val totalWeight = weights.sum
                  if (missingWeight / totalWeight > seg.missingThreshold) return nullSeries

                  // Drop the segments with missing results, weights and probabilities stay aligned
                  weights = probabilities.zip(weights).filter(x => !Utils.anyMissing(x._1)).map(_._2)
                  probabilities = probabilities.filter(x => !Utils.anyMissing(x))
                }
              case `continue`      =>
            }

            var evalPredictedValue = true
            // One row per class, one column per remaining segment
            val matrix = probabilities.transpose
            outputs.setProbabilities(method match {
              case `average`                             =>
                // Average over the segments that are still present: rows may have been dropped by skipSegment,
                // dividing by the number of selected segments would deflate the probabilities.
                classes.zip(matrix.map(_.sum / probabilities.length)).toMap
              case `weightedAverage`                     =>
                val sum = weights.sum
                classes.zip(matrix.map(x => MathUtils.product(x, weights) / sum)).toMap
              case `max`                                 =>
                // The winner is the class owning the highest probability, its probabilities are averaged over
                // the segments that predict this class themselves.
                evalPredictedValue = false
                outputs.predictedValue = classes.zip(matrix.map(_.max)).maxBy(_._2)._1
                val contributions = probabilities.filter(x => classes.zip(x).maxBy(_._2)._1 == outputs.predictedValue)
                classes.zip(contributions.transpose.map(_.sum / contributions.length)).toMap
              case `median`                              =>
                classes.zip(matrix.map(x => MathUtils.median(x))).toMap
              case `weightedMedian` | `x-weightedMedian` =>
                classes.zip(matrix.map(x => MathUtils.weightedMedian(x, weights))).toMap
            })

            if (evalPredictedValue) {
              outputs.evalPredictedValueByProbabilities()
            }
          }

          result(series, outputs)
      }
    } else {
      // Model Composition
      ???
    }
  }

  /**
   * Returns the field of a given name, None if a field with the given name does not exist.
   *
   * For a model chain, the output fields of the segments are searched first, in the order of the segments, then
   * the lookup falls back to the one inherited from the parent class.
   */
  override def getField(name: String): Option[Field] = {
    val chained = if (isSegmentation && segmentation.get.multipleModelMethod == MultipleModelMethod.modelChain) {
      // The iterator stops at the first segment that defines the field
      segmentation.get.segments.iterator
        .map(_.model.output.flatMap(_.getField(name)))
        .collectFirst { case Some(field) => field }
    } else None

    if (chained.isDefined) chained else super.getField(name)
  }

  /** Creates an object of MiningOutputs that is for writing into an output series. */
  override def createOutputs(): MiningOutputs = new MiningOutputs

  /**
   * Returns all candidates output fields of this model when there is no output specified explicitly.
   *
   *  - `modelChain` whose last segment has the predicate True: the candidate output fields of the last model,
   *    after the supplement-output flag of this model has been pushed down to it;
   *  - `selectAll`: the candidate output fields of all segments;
   *  - otherwise, including a model chain without segments: the inherited default output fields.
   */
  override def defaultOutputFields: Array[OutputField] = {
    if (isSegmentation) {
      val seg = segmentation.get
      seg.multipleModelMethod match {
        // lastOption keeps an empty segment array from throwing in the guard
        case `modelChain` if seg.segments.lastOption.exists(_.predicate == True) =>
          val lastModel = seg.segments.last.model
          lastModel.setSupplementOutput(supplementOutput)
          lastModel.candidateOutputFields
        case `selectAll`                                                          =>
          seg.segments.flatMap(_.model.candidateOutputFields)
        case _                                                                    =>
          super.defaultOutputFields
      }
    }
    else {
      // TODO ModelComposition
      ???
    }
  }

  /** Tells whether this model uses the Segmentation approach, i.e. a segmentation is defined. */
  def isSegmentation: Boolean = segmentation.isDefined
}
/**
 * Enumerates the ways in which the results of all models applicable to a record can be combined.
 */
object MultipleModelMethod extends Enumeration {
  type MultipleModelMethod = Value

  /**
   * Applicability of the combination methods:
   *
   * Any model type
   *  - selectFirst: simply use the first model for which the predicate in the Segment evaluates to true.
   *  - selectAll: all models for which the predicate in the Segment evaluates to true are evaluated.
   *  - modelChain: during scoring, Segments whose Predicates evaluate to TRUE are executed in the order they
   *    appear in the PMML.
   *
   * Clustering models
   *  Only majorityVote, weightedMajorityVote, modelChain, selectFirst, or selectAll can be used. With majorityVote
   *  the cluster ID selected by the largest number of models wins. With weightedMajorityVote the weights specified
   *  in the Segment elements are used, and the cluster ID with the highest total weight wins. Cluster affinity for
   *  the resulting model is not defined for either voting method. Note that combining clustering model predictions
   *  can produce not very meaningful results because there is no target variable, and the same cluster IDs on
   *  different segments can be assigned to very different clusters.
   *
   * Regression models
   *  Only average, weightedAverage, median, weightedMedian, sum, weightedSum, modelChain, selectFirst, or selectAll
   *  are applicable. The first four methods are applied to the predicted values of all models for which the
   *  predicate evaluates to true.
   *
   * Classification models
   *  All the combination methods, except for sum and weightedSum, can be used. For the first six combination
   *  methods the models in all segments must have the same target variable. average, weightedAverage, median,
   *  weightedMedian, and max are applied to the predicted probabilities of target categories in each of the models
   *  used for the case, then the winning category is the one with the highest combined probability; majorityVote
   *  and weightedMajorityVote use the predicted categories of all applicable models and select the winner based on
   *  the models' "votes". The predicted probabilities of the final prediction depend on the combination method:
   *
   *  - majorityVote and weightedMajorityVote: the proportions of the votes or weighted votes for each target
   *    category;
   *  - average, weightedAverage: the average or weighted average of the probabilities of the models used in the
   *    prediction;
   *  - max: consider the model(s) that have contributed the chosen probability for the winning category, and return
   *    their average probabilities;
   *  - median, weightedMedian: if the number of models with predicates that resolve to true is odd, consider the
   *    model(s) that have the chosen probability for the winning category; see the PMML specification of
   *    MultipleModels for the complete rule, including the case of an even number of models.
   */
  val majorityVote, weightedMajorityVote = Value
  val average, weightedAverage, median, weightedMedian, max, sum, weightedSum = Value
  val selectFirst, selectAll, modelChain = Value

  // Vendor-prefixed names, they cannot be derived from a plain identifier
  val `x-weightedMedian` = Value("x-weightedMedian")
  val `x-weightedSum` = Value("x-weightedSum")

  /**
   * Tells whether a multiple model method can be used by a model of the given mining function.
   *
   * @param mmm the multiple model method
   * @param mf  the mining function of the ensemble model
   * @return true for selectFirst, selectAll and modelChain whatever the mining function is; otherwise true only
   *         if the method belongs to the ones accepted for clustering, regression or classification
   */
  def support(mmm: MultipleModelMethod, mf: MiningFunction): Boolean = {
    val alwaysApplicable = mmm == selectFirst || mmm == selectAll || mmm == modelChain

    alwaysApplicable || (mf match {
      case MiningFunction.clustering     =>
        mmm == majorityVote || mmm == weightedMajorityVote
      case MiningFunction.regression     => mmm match {
        case `average` | `weightedAverage` | `median` | `weightedMedian` | `x-weightedMedian` => true
        case `sum` | `weightedSum` | `x-weightedSum`                                          => true
        case _                                                                                => false
      }
      case MiningFunction.classification => mmm match {
        case `majorityVote` | `weightedMajorityVote`                                          => true
        case `average` | `weightedAverage` | `median` | `weightedMedian` | `x-weightedMedian` => true
        case `max`                                                                            => true
        case _                                                                                => false
      }
      case _                             => false
    })
  }
}
/**
 * Options applied when at least one model, for which the predicate in the Segment evaluates to true, has a missing
 * result. The closely related attribute missingThreshold has the default value 1.
 *
 *  - returnMissing: if at least one model has a missing result, the whole MiningModel's result should be missing.
 *  - skipSegment: a segment whose model has a missing result is ignored, and the results are computed based on the
 *    other segments. However, if the fraction of the models with missing results (weighted if the model combination
 *    method is weighted) exceeds the missingThreshold, the returned result must be missing. This option should not
 *    be used with the modelChain combination method.
 *  - continue: if a model has a missing result, the processing should continue normally. This can work well for
 *    voting or modelChain situations, as well as for selectFirst and selectAll (written as returnFirst and
 *    returnAll in the original note). In case of majorityVote or weightedMajorityVote the missing result can be
 *    returned if it gets the most (possibly weighted) votes, or if the fraction of the models with missing result
 *    exceeds the missingThreshold. Otherwise a valid result is computed normally. Other model combination methods
 *    will return a missing value as the result.
 */
object MissingPredictionTreatment extends Enumeration {
  type MissingPredictionTreatment = Value

  // Declaration order fixes the ids: 0, 1, 2
  val returnMissing = Value("returnMissing")
  val skipSegment = Value("skipSegment")
  val continue = Value("continue")
}
/**
 * Holds the segments of a MiningModel together with the way their results are combined.
 *
 * @param multipleModelMethod        how the results of all applicable segments are combined
 * @param segments                   the segments, in the order they are evaluated
 * @param missingPredictionTreatment what to do when an applicable segment has a missing result
 * @param missingThreshold           the threshold paired with the missing prediction treatment, 1 by default
 */
class Segmentation(val multipleModelMethod: MultipleModelMethod,
                   val segments: Array[Segment],
                   val missingPredictionTreatment: MissingPredictionTreatment = MissingPredictionTreatment.continue,
                   val missingThreshold: Double = 1) extends PmmlElement {

  /** True when the predicate of every segment is constantly true, then `filter` has nothing to evaluate. */
  private val allTrue: Boolean = segments.forall(_.isTrue)

  /**
   * Returns a new Segmentation with the given segment appended, this object is left untouched.
   *
   * The missing prediction treatment and the missing threshold are carried over to the new object, so that
   * appending a segment does not silently reset them to their defaults.
   *
   * @param segment the segment to append
   * @return a new Segmentation holding all segments of this one followed by `segment`
   */
  def +=(segment: Segment): Segmentation = {
    new Segmentation(multipleModelMethod, segments :+ segment, missingPredictionTreatment, missingThreshold)
  }

  /**
   * Sets the given model as the parent of the models of all segments.
   *
   * @param parent the parent model
   * @return this object
   */
  def setParent(parent: Model): Segmentation = {
    segments.foreach(x => x.model.setParent(parent))
    this
  }

  /**
   * Selects the segments that are applicable to the given series.
   *
   * @param series the input series
   * @return all segments when every predicate is constantly true, otherwise those whose predicate fires
   */
  def filter(series: Series): Array[Segment] = if (allTrue) segments else {
    segments.filter(x => Predication.fire(x.eval(series)))
  }
}
/**
 * Wraps the field that the variable weight of a Segment refers to.
 *
 * @param field the field whose value in the scored series multiplies the static weight of the segment
 */
class VariableWeight(
  val field: Field
) extends PmmlElement
/**
 * A segment of a MiningModel: a model guarded by a predicate, with an optional id and weight.
 *
 * @param predicate      the predicate deciding whether this segment applies to a record
 * @param model          the model scored when this segment applies
 * @param variableWeight the optional field-based factor of the weight
 * @param id             the optional id of this segment
 * @param weight         the static weight of this segment, 1.0 by default
 */
class Segment(val predicate: Predicate,
              val model: Model,
              val variableWeight: Option[VariableWeight] = None,
              val id: Option[String] = None,
              val weight: Double = 1.0)
  extends Predictable with Predicate with PmmlElement {

  /** Scores the given data series by the wrapped model. */
  override def predict(values: Series): Series = {
    model.predict(values)
  }

  /** Evaluates the wrapped predicate against the given series. */
  override def eval(series: Series): Predication = {
    predicate.eval(series)
  }

  /**
   * Computes the effective weight of this segment for the given series.
   *
   * @param series the series the variable weight field is read from
   * @return the value of the variable weight field times the static weight, or the static weight alone when no
   *         variable weight is defined
   */
  def weight(series: Series): Double = variableWeight match {
    case Some(variable) => variable.field.getDouble(series) * weight
    case None           => weight
  }

  /** Delegates to the wrapped predicate. */
  override def isTrue: Boolean = {
    predicate.isTrue
  }
}
/**
 * The outputs object created by MiningModel.createOutputs(). It combines the ClsOutputs, RegOutputs, CluOutputs and
 * SegmentOutputs traits.
 */
class MiningOutputs extends ClsOutputs with RegOutputs with CluOutputs with SegmentOutputs {
/** Model element type, always ModelElement.MiningModel. */
override def modelElement: ModelElement = ModelElement.MiningModel
/**
 * Clears this object by calling clear() of each of the four mixed-in traits.
 *
 * @return this object
 */
override def clear(): this.type = {
// Every parent trait is addressed by a qualified super call, in the order the traits are mixed in
super[ClsOutputs].clear()
super[RegOutputs].clear()
super[CluOutputs].clear()
super[SegmentOutputs].clear()
this
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy (web page footer, not part of the source file)