// weka.core.pmml.MiningSchema Maven / Gradle / Ivy
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* MiningSchema.java
* Copyright (C) 2008 University of Waikato, Hamilton, New Zealand
*
*/
package weka.core.pmml;
import java.lang.String;
import java.io.Serializable;
import java.util.ArrayList;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instances;
/**
 * This class encapsulates the mining schema from
 * a PMML xml file. Specifically, it contains the
 * fields used in the PMML model as an Instances
 * object (just the header). It also contains meta
 * information such as value ranges and how to handle
 * missing values, outliers etc.
 *
 * We also store various other PMML elements here, such as
 * the TransformationDictionary, DerivedFields and Targets
 * (if defined). They are not part of the mining schema per se, but
 * relate to inputs used by the model and it is convenient to
 * store them here.
 *
 * @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
 * @version $Revision: 5562 $
 */
public class MiningSchema implements Serializable {

  /** For serialization */
  private static final long serialVersionUID = 7144380586726330455L;

  /** The structure of all the fields (both mining schema and derived) as Instances */
  protected Instances m_fieldInstancesStructure;

  /** Just the mining schema fields as Instances */
  protected Instances m_miningSchemaInstancesStructure;

  /** Meta information about the mining schema fields */
  protected ArrayList<MiningFieldMetaInfo> m_miningMeta =
    new ArrayList<MiningFieldMetaInfo>();

  /**
   * Meta information about derived fields (those defined in
   * the TransformationDictionary followed by those defined in
   * LocalTransformations)
   */
  protected ArrayList<DerivedFieldMetaInfo> m_derivedMeta =
    new ArrayList<DerivedFieldMetaInfo>();

  /** The transformation dictionary (if defined) */
  protected TransformationDictionary m_transformationDictionary = null;

  /** target meta info (may be null if not defined) */
  protected TargetMetaInfo m_targetMetaInfo = null;

  /**
   * Collect the derived fields declared in the model's LocalTransformations
   * element (if there is one) and append their meta information to
   * m_derivedMeta. Only the first LocalTransformations element found is
   * processed.
   *
   * This reads m_miningSchemaInstancesStructure and m_transformationDictionary,
   * so both must have been set before it is called.
   *
   * @param model the Element encapsulating the pmml model
   * @throws Exception if a derived field can't be constructed
   */
  private void getLocalTransformations(Element model) throws Exception {
    NodeList temp = model.getElementsByTagName("LocalTransformations");

    if (temp.getLength() > 0) {
      // should be just one LocalTransformations element
      Element localT = (Element) temp.item(0);

      // Set up some field defs to pass in: the attributes of the mining
      // schema, in mining schema order
      ArrayList<Attribute> fieldDefs = new ArrayList<Attribute>();
      for (int i = 0; i < m_miningSchemaInstancesStructure.numAttributes(); i++) {
        fieldDefs.add(m_miningSchemaInstancesStructure.attribute(i));
      }

      NodeList localDerivedL = localT.getElementsByTagName("DerivedField");
      for (int i = 0; i < localDerivedL.getLength(); i++) {
        Node localDerived = localDerivedL.item(i);
        if (localDerived.getNodeType() == Node.ELEMENT_NODE) {
          DerivedFieldMetaInfo d =
            new DerivedFieldMetaInfo((Element) localDerived, fieldDefs,
                                     m_transformationDictionary);
          m_derivedMeta.add(d);
        }
      }
    }
  }

  /**
   * Constructor for MiningSchema.
   *
   * Only mining fields whose usage type is ACTIVE or PREDICTED are added to
   * the schema; a PREDICTED field becomes the class attribute. Derived fields
   * from the transformation dictionary (if supplied) and from any
   * LocalTransformations element are appended after the mining schema fields.
   *
   * @param model the Element encapsulating the pmml model
   * @param dataDictionary the data dictionary as an Instances object
   * @param transDict the transformation dictionary (may be null if none
   * is defined)
   * @throws Exception if something goes wrong during construction of the
   * mining schema (e.g. a mining field is not present in the data dictionary,
   * or there is more than one Targets element)
   */
  public MiningSchema(Element model,
                      Instances dataDictionary,
                      TransformationDictionary transDict) throws Exception {

    // NOTE(review): FastVector is left as a raw type throughout this class;
    // presumably it is not generic in the Weka version this file targets -
    // confirm before parameterizing it.
    FastVector attInfo = new FastVector();
    NodeList fieldList = model.getElementsByTagName("MiningField");
    int classIndex = -1;
    int addedCount = 0;

    for (int i = 0; i < fieldList.getLength(); i++) {
      Node miningField = fieldList.item(i);
      if (miningField.getNodeType() == Node.ELEMENT_NODE) {
        Element miningFieldEl = (Element) miningField;
        MiningFieldMetaInfo mfi = new MiningFieldMetaInfo(miningFieldEl);

        if (mfi.getUsageType() == MiningFieldMetaInfo.Usage.ACTIVE ||
            mfi.getUsageType() == MiningFieldMetaInfo.Usage.PREDICTED) {

          // find this attribute in the dataDictionary
          Attribute miningAtt = dataDictionary.attribute(mfi.getName());
          if (miningAtt != null) {
            // the index is the position within the mining schema, which may
            // differ from the position in the data dictionary
            mfi.setIndex(addedCount);
            attInfo.addElement(miningAtt);
            addedCount++;

            if (mfi.getUsageType() == MiningFieldMetaInfo.Usage.PREDICTED) {
              classIndex = addedCount - 1;
            }

            // add to the array list
            m_miningMeta.add(mfi);
          } else {
            throw new Exception("Can't find mining field: " + mfi.getName()
                                + " in the data dictionary.");
          }
        }
      }
    }

    m_miningSchemaInstancesStructure = new Instances("miningSchema", attInfo, 0);

    // set these instances on the MiningFieldMetaInfos so that the
    // toString() method can operate correctly
    for (MiningFieldMetaInfo m : m_miningMeta) {
      m.setMiningSchemaInstances(m_miningSchemaInstancesStructure);
    }

    m_transformationDictionary = transDict;

    // Handle transformation dictionary and any local transformations
    if (m_transformationDictionary != null) {
      // first update the field defs for any derived fields in the transformation dictionary
      // now that we have a fixed ordering for the mining schema attributes (i.e. could
      // be different from the order of attributes in the data dictionary that was
      // used when the transformation dictionary was initially constructed)
      m_transformationDictionary.setFieldDefsForDerivedFields(m_miningSchemaInstancesStructure);

      ArrayList<DerivedFieldMetaInfo> transDerived =
        m_transformationDictionary.getDerivedFields();
      m_derivedMeta.addAll(transDerived);
    }

    // Get any local transformations
    getLocalTransformations(model);

    // Full structure: mining schema fields first, then all derived fields
    FastVector newStructure = new FastVector();
    for (MiningFieldMetaInfo m : m_miningMeta) {
      newStructure.addElement(m.getFieldAsAttribute());
    }
    for (DerivedFieldMetaInfo d : m_derivedMeta) {
      newStructure.addElement(d.getFieldAsAttribute());
    }
    m_fieldInstancesStructure = new Instances("FieldStructure", newStructure, 0);

    // The mining schema fields occupy the leading positions of both
    // structures, so the same class index applies to each
    if (classIndex != -1) {
      m_fieldInstancesStructure.setClassIndex(classIndex);
      m_miningSchemaInstancesStructure.setClassIndex(classIndex);
    }

    // do Targets (if any)
    NodeList targetsList = model.getElementsByTagName("Targets");
    if (targetsList.getLength() > 0) {
      if (targetsList.getLength() > 1) {
        throw new Exception("[MiningSchema] Can only handle a single Target");
      } else {
        Node te = targetsList.item(0);
        if (te.getNodeType() == Node.ELEMENT_NODE) {
          m_targetMetaInfo = new TargetMetaInfo((Element) te);

          // fill in any necessary categorical values in the mining schema
          // class attribute
          if (m_fieldInstancesStructure.classIndex() >= 0 &&
              m_fieldInstancesStructure.classAttribute().isString()) {
            ArrayList<String> targetVals = m_targetMetaInfo.getValues();
            if (targetVals.size() > 0) {
              Attribute classAtt = m_fieldInstancesStructure.classAttribute();
              for (int i = 0; i < targetVals.size(); i++) {
                classAtt.addStringValue(targetVals.get(i));
              }
            }
          }
        }
      }
    }
  }

  /**
   * Apply the missing value treatments (if any) to an incoming instance.
   * The array is modified in place.
   *
   * @param values an array of doubles in order of the fields in the mining schema
   * that represents the incoming instance (note: use PMMLUtils.instanceToSchema()
   * to generate this).
   * @throws Exception if something goes wrong during missing value handling
   */
  public void applyMissingValuesTreatment(double[] values) throws Exception {
    for (int i = 0; i < m_miningMeta.size(); i++) {
      MiningFieldMetaInfo mfi = m_miningMeta.get(i);
      values[i] = mfi.applyMissingValueTreatment(values[i]);
    }
  }

  /**
   * Apply the outlier treatment methods (if any) to an incoming instance.
   * The array is modified in place.
   *
   * @param values an array of doubles in order of the fields in the mining schema
   * that represents the incoming instance (note: use PMMLUtils.instanceToSchema()
   * to generate this).
   * @throws Exception if something goes wrong during outlier treatment handling
   */
  public void applyOutlierTreatment(double[] values) throws Exception {
    for (int i = 0; i < m_miningMeta.size(); i++) {
      MiningFieldMetaInfo mfi = m_miningMeta.get(i);
      values[i] = mfi.applyOutlierTreatment(values[i]);
    }
  }

  /**
   * Apply both missing and outlier treatments to an incoming instance.
   * For each field the missing value treatment is applied first and the
   * outlier treatment is then applied to its result. The array is modified
   * in place.
   *
   * @param values an array of doubles in order of the fields in the mining schema
   * that represents the incoming instance (note: use MappingInfo.instanceToSchema()
   * to generate this).
   * @throws Exception if something goes wrong during this process
   */
  public void applyMissingAndOutlierTreatments(double[] values) throws Exception {
    for (int i = 0; i < m_miningMeta.size(); i++) {
      MiningFieldMetaInfo mfi = m_miningMeta.get(i);
      values[i] = mfi.applyMissingValueTreatment(values[i]);
      values[i] = mfi.applyOutlierTreatment(values[i]);
    }
  }

  /**
   * Get all the fields (both mining schema and derived) as Instances.
   * Attributes are in order of those in the mining schema, followed by
   * derived attributes from the TransformationDictionary followed by
   * derived attributes from LocalTransformations.
   *
   * @return all the fields as an Instances object
   */
  public Instances getFieldsAsInstances() {
    return m_fieldInstancesStructure;
  }

  /**
   * Get the mining schema fields as an Instances object.
   *
   * @return the mining schema fields as an Instances object.
   */
  public Instances getMiningSchemaAsInstances() {
    return m_miningSchemaInstancesStructure;
  }

  /**
   * Get the transformation dictionary.
   *
   * @return the transformation dictionary or null if none is
   * defined.
   */
  public TransformationDictionary getTransformationDictionary() {
    return m_transformationDictionary;
  }

  /**
   * Returns true if there is Target meta data.
   *
   * @return true if there is Target meta data
   */
  public boolean hasTargetMetaData() {
    return (m_targetMetaInfo != null);
  }

  /**
   * Get the Target meta data.
   *
   * @return the Target meta data (null if none is defined)
   */
  public TargetMetaInfo getTargetMetaData() {
    return m_targetMetaInfo;
  }

  /**
   * Method to convert any string attributes in the mining schema
   * Instances to nominal attributes. This may be necessary if there are
   * no Value elements defined for categorical fields in the data dictionary.
   * In this case, elements in the actual model definition will probably reveal
   * the valid values for categorical fields.
   *
   * The conversion is applied to the structure returned by
   * getFieldsAsInstances(), which is replaced by a new Instances object;
   * the class index (if set) is carried over. Nothing happens if there are
   * no string attributes.
   */
  public void convertStringAttsToNominal() {
    Instances miningSchemaI = getFieldsAsInstances();
    if (miningSchemaI.checkForStringAttributes()) {
      FastVector attInfo = new FastVector();
      for (int i = 0; i < miningSchemaI.numAttributes(); i++) {
        Attribute tempA = miningSchemaI.attribute(i);
        if (tempA.isString()) {
          // rebuild as a nominal attribute using the string values seen so far
          FastVector valueVector = new FastVector();
          for (int j = 0; j < tempA.numValues(); j++) {
            valueVector.addElement(tempA.value(j));
          }
          Attribute newAtt = new Attribute(tempA.name(), valueVector);
          attInfo.addElement(newAtt);
        } else {
          attInfo.addElement(tempA);
        }
      }

      Instances newI = new Instances("miningSchema", attInfo, 0);
      if (m_fieldInstancesStructure.classIndex() >= 0) {
        newI.setClassIndex(m_fieldInstancesStructure.classIndex());
      }
      m_fieldInstancesStructure = newI;
    }
  }

  /**
   * Convert a numeric attribute in the mining schema to nominal.
   *
   * The conversion is applied to the structure returned by
   * getFieldsAsInstances(), which is replaced by a new Instances object;
   * the class index (if set) is carried over.
   *
   * @param index the index of the attribute to convert
   * @param newVals an ArrayList of the values of the nominal attribute
   * @throws IllegalArgumentException if the attribute at index is
   * already nominal
   */
  public void convertNumericAttToNominal(int index,
                                         ArrayList<String> newVals) {
    Instances miningSchemaI = getFieldsAsInstances();
    if (miningSchemaI.attribute(index).isNominal()) {
      throw new IllegalArgumentException("[MiningSchema] convertNumericAttToNominal: attribute is "
                                         + "already nominal!");
    }

    FastVector newValues = new FastVector();
    for (int i = 0; i < newVals.size(); i++) {
      newValues.addElement(newVals.get(i));
    }

    FastVector attInfo = new FastVector();
    for (int i = 0; i < miningSchemaI.numAttributes(); i++) {
      Attribute tempA = miningSchemaI.attribute(i);
      if (i == index) {
        Attribute newAtt = new Attribute(tempA.name(), newValues);
        attInfo.addElement(newAtt);
      } else {
        attInfo.addElement(tempA);
      }
    }

    Instances newI = new Instances("miningSchema", attInfo, 0);
    if (m_fieldInstancesStructure.classIndex() >= 0) {
      newI.setClassIndex(m_fieldInstancesStructure.classIndex());
    }
    m_fieldInstancesStructure = newI;
  }

  /**
   * Get the meta information for the derived fields (those from the
   * TransformationDictionary followed by those from LocalTransformations).
   * The internal list itself is returned, not a copy.
   *
   * @return the derived fields' meta information
   */
  public ArrayList<DerivedFieldMetaInfo> getDerivedFields() {
    return m_derivedMeta;
  }

  /**
   * Get the meta information for the mining schema fields, in mining
   * schema order. The internal list itself is returned, not a copy.
   *
   * @return the mining fields' meta information
   */
  public ArrayList<MiningFieldMetaInfo> getMiningFields() {
    return m_miningMeta;
  }

  /**
   * Get a textual description of the mining schema.
   *
   * @return a textual description of the mining schema
   */
  @Override
  public String toString() {
    StringBuilder temp = new StringBuilder();

    if (m_transformationDictionary != null) {
      temp.append(m_transformationDictionary);
    }

    temp.append("Mining schema:\n\n");
    for (MiningFieldMetaInfo m : m_miningMeta) {
      temp.append(m).append("\n");
    }

    if (m_derivedMeta.size() > 0) {
      temp.append("\nDerived fields:\n\n");
      for (DerivedFieldMetaInfo d : m_derivedMeta) {
        temp.append(d).append("\n");
      }
    }
    temp.append("\n");
    return temp.toString();
  }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy