All downloads are free. The search and download functionality uses the official Maven repository.

weka.core.pmml.MiningSchema Maven / Gradle / Ivy

Go to download

The Waikato Environment for Knowledge Analysis (WEKA), a machine learning workbench. This is the stable version. Apart from bugfixes, this version does not receive any other updates.

There is a newer version: 3.8.6
Show newest version
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    MiningSchema.java
 *    Copyright (C) 2008 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.core.pmml;

import java.lang.String;
import java.io.Serializable;
import java.util.ArrayList;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instances;

/**
 * This class encapsulates the mining schema from
 * a PMML xml file. Specifically, it contains the
 * fields used in the PMML model as an Instances
 * object (just the header). It also contains meta
 * information such as value ranges and how to handle
 * missing values, outliers etc.
 *
 * We also store various other PMML elements here, such as
 * the TransformationDictionary, DerivedFields and Targets 
 * (if defined). They are not part of the mining schema per se, but
 * relate to inputs used by the model and it is convenient to
 * store them here.
 *
 * @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
 * @version $Revision: 5562 $
 */
public class MiningSchema implements Serializable {

  /** For serialization */
  private static final long serialVersionUID = 7144380586726330455L;

  /** The structure of all the fields (both mining schema and derived) as Instances */
  protected Instances m_fieldInstancesStructure;
  
  /** Just the mining schema fields as Instances */
  protected Instances m_miningSchemaInstancesStructure;

  /** Meta information about the mining schema fields */
  protected ArrayList<MiningFieldMetaInfo> m_miningMeta = 
    new ArrayList<MiningFieldMetaInfo>();
  
  /** 
   * Meta information about derived fields (those defined in
   * the TransformationDictionary followed by those defined in
   * LocalTransformations)
   */
  protected ArrayList<DerivedFieldMetaInfo> m_derivedMeta = 
    new ArrayList<DerivedFieldMetaInfo>();
  
  /** The transformation dictionary (if defined) */
  protected TransformationDictionary m_transformationDictionary = null;

  /** target meta info (may be null if not defined) */
  protected TargetMetaInfo m_targetMetaInfo = null;
  
  /**
   * Reads the DerivedField elements of the first LocalTransformations
   * element found under the model (if any) and appends their meta
   * information to the list of derived fields. Must be called after the
   * mining schema Instances structure has been built, because the mining
   * schema attributes are passed to each derived field as its field
   * definitions.
   *
   * @param model the Element encapsulating the pmml model
   * @throws Exception if a derived field can't be constructed
   */
  private void getLocalTransformations(Element model) throws Exception {
    NodeList temp = model.getElementsByTagName("LocalTransformations");
    
    if (temp.getLength() > 0) {
      // should be just one LocalTransformations element
      Element localT = (Element)temp.item(0);
      
      // Set up some field defs to pass in
      ArrayList<Attribute> fieldDefs = new ArrayList<Attribute>();
      for (int i = 0; i < m_miningSchemaInstancesStructure.numAttributes(); i++) {
        fieldDefs.add(m_miningSchemaInstancesStructure.attribute(i));
      }
      
      NodeList localDerivedL = localT.getElementsByTagName("DerivedField");
      for (int i = 0; i < localDerivedL.getLength(); i++) {
        Node localDerived = localDerivedL.item(i);
        if (localDerived.getNodeType() == Node.ELEMENT_NODE) {
          DerivedFieldMetaInfo d = 
            new DerivedFieldMetaInfo((Element)localDerived, fieldDefs, m_transformationDictionary);
          m_derivedMeta.add(d);
        }
      }
    }
  }

  /**
   * Constructor for MiningSchema. Only MiningField elements whose usage
   * type is ACTIVE or PREDICTED are included in the schema; if more than
   * one field is PREDICTED, the last one encountered becomes the class.
   *
   * @param model the Element encapsulating the pmml model
   * @param dataDictionary the data dictionary as an Instances object
   * @param transDict the transformation dictionary, or null if the
   * PMML file does not define one
   * @throws Exception if something goes wrong during construction of the
   * mining schema (e.g. a mining field is not present in the data
   * dictionary, or more than one Targets element is found)
   */
  public MiningSchema(Element model, 
                      Instances dataDictionary,
                      TransformationDictionary transDict) throws Exception {

    FastVector attInfo = new FastVector();
    NodeList fieldList = model.getElementsByTagName("MiningField");
    int classIndex = -1;
    int addedCount = 0;
    for (int i = 0; i < fieldList.getLength(); i++) {
      Node miningField = fieldList.item(i);
      if (miningField.getNodeType() == Node.ELEMENT_NODE) {
        Element miningFieldEl = (Element)miningField;

        MiningFieldMetaInfo mfi = new MiningFieldMetaInfo(miningFieldEl);

        if (mfi.getUsageType() == MiningFieldMetaInfo.Usage.ACTIVE ||
            mfi.getUsageType() == MiningFieldMetaInfo.Usage.PREDICTED) {

          // find this attribute in the dataDictionary
          Attribute miningAtt = dataDictionary.attribute(mfi.getName());
          if (miningAtt == null) {
            throw new Exception("Can't find mining field: " + mfi.getName() 
                                + " in the data dictionary.");
          }

          // the field's index is its position among the accepted fields,
          // not its position in the data dictionary
          mfi.setIndex(addedCount);
          attInfo.addElement(miningAtt);
          addedCount++;

          if (mfi.getUsageType() == MiningFieldMetaInfo.Usage.PREDICTED) {
            classIndex = addedCount - 1;
          }

          // add to the array list
          m_miningMeta.add(mfi);
        }
      }
    }

    m_miningSchemaInstancesStructure = new Instances("miningSchema", attInfo, 0);
    
    // set these instances on the MiningFieldMetaInfos so that the
    // toString() method can operate correctly
    for (MiningFieldMetaInfo m : m_miningMeta) {
      m.setMiningSchemaInstances(m_miningSchemaInstancesStructure);
    }
    
    m_transformationDictionary = transDict;
    
    // Handle transformation dictionary and any local transformations
    if (m_transformationDictionary != null) {
      // first update the field defs for any derived fields in the transformation dictionary
      // now that we have a fixed ordering for the mining schema attributes (i.e. could
      // be different from the order of attributes in the data dictionary that was
      // used when the transformation dictionary was initially constructed)
      m_transformationDictionary.setFieldDefsForDerivedFields(m_miningSchemaInstancesStructure);
      
      ArrayList<DerivedFieldMetaInfo> transDerived = 
        m_transformationDictionary.getDerivedFields();
      m_derivedMeta.addAll(transDerived);
    }
    
    // Get any local transformations
    getLocalTransformations(model);
    
    // Full structure: mining schema fields first, then all derived fields.
    // Because the mining fields come first, classIndex is valid for both
    // structures.
    FastVector newStructure = new FastVector();
    for (MiningFieldMetaInfo m : m_miningMeta) {
      newStructure.addElement(m.getFieldAsAttribute());
    }
    
    for (DerivedFieldMetaInfo d : m_derivedMeta) {
      newStructure.addElement(d.getFieldAsAttribute());
    }
    m_fieldInstancesStructure = new Instances("FieldStructure", newStructure, 0);
    
    if (classIndex != -1) {
      m_fieldInstancesStructure.setClassIndex(classIndex);
      m_miningSchemaInstancesStructure.setClassIndex(classIndex);
    }

    // do Targets (if any)
    NodeList targetsList = model.getElementsByTagName("Targets");
    if (targetsList.getLength() > 1) {
      throw new Exception("[MiningSchema] Can only handle a single Target");
    }
    if (targetsList.getLength() == 1) {
      Node te = targetsList.item(0);
      if (te.getNodeType() == Node.ELEMENT_NODE) {
        m_targetMetaInfo = new TargetMetaInfo((Element)te);

        // fill in any necessary categorical values in the mining schema 
        // class attribute
        if (m_fieldInstancesStructure.classIndex() >= 0 && 
            m_fieldInstancesStructure.classAttribute().isString()) {
          ArrayList<String> targetVals = m_targetMetaInfo.getValues();
          Attribute classAtt = m_fieldInstancesStructure.classAttribute();
          for (String targetVal : targetVals) {
            classAtt.addStringValue(targetVal);
          }
        }
      }
    }
  }

  /**
   * Apply the missing value treatments (if any) to an incoming instance.
   * The array is modified in place.
   *
   * @param values an array of doubles in order of the fields in the mining schema
   * that represents the incoming instance (note: use PMMLUtils.instanceToSchema()
   * to generate this).
   * @throws Exception if something goes wrong during missing value handling
   */
  public void applyMissingValuesTreatment(double[] values) throws Exception {
    for (int i = 0; i < m_miningMeta.size(); i++) {
      MiningFieldMetaInfo mfi = m_miningMeta.get(i);
      values[i] = mfi.applyMissingValueTreatment(values[i]);
    }
  }

  /**
   * Apply the outlier treatment methods (if any) to an incoming instance.
   * The array is modified in place.
   *
   * @param values an array of doubles in order of the fields in the mining schema
   * that represents the incoming instance (note: use PMMLUtils.instanceToSchema()
   * to generate this).
   * @throws Exception if something goes wrong during outlier treatment handling
   */
  public void applyOutlierTreatment(double[] values) throws Exception {
    for (int i = 0; i < m_miningMeta.size(); i++) {
      MiningFieldMetaInfo mfi = m_miningMeta.get(i);
      values[i] = mfi.applyOutlierTreatment(values[i]);
    }
  }

  /**
   * Apply both missing and outlier treatments to an incoming instance.
   * For each field the missing value treatment is applied first and the
   * outlier treatment is then applied to its result. The array is
   * modified in place.
   *
   * @param values an array of doubles in order of the fields in the mining schema
   * that represents the incoming instance (note: use MappingInfo.instanceToSchema()
   * to generate this).
   * @throws Exception if something goes wrong during this process
   */
  public void applyMissingAndOutlierTreatments(double[] values) throws Exception {
    for (int i = 0; i < m_miningMeta.size(); i++) {
      MiningFieldMetaInfo mfi = m_miningMeta.get(i);
      values[i] = mfi.applyMissingValueTreatment(values[i]);
      values[i] = mfi.applyOutlierTreatment(values[i]);
    }
  }

  /**
   * Get all the fields (both mining schema and derived) as Instances.
   * Attributes are in order of those in the mining schema, followed by
   * derived attributes from the TransformationDictionary followed by
   * derived attributes from LocalTransformations.
   *
   * @return all the fields as an Instances object
   */
  public Instances getFieldsAsInstances() {
    return m_fieldInstancesStructure;
  }
  
  /**
   * Get the mining schema fields as an Instances object.
   * 
   * @return the mining schema fields as an Instances object.
   */
  public Instances getMiningSchemaAsInstances() {
    return m_miningSchemaInstancesStructure;
  }
  
  /**
   * Get the transformation dictionary.
   * 
   * @return the transformation dictionary or null if none is
   * defined.
   */
  public TransformationDictionary getTransformationDictionary() {
    return m_transformationDictionary;
  }
  
  /**
   * Returns true if there is Target meta data.
   *
   * @return true if there is Target meta data
   */
  public boolean hasTargetMetaData() {
    return (m_targetMetaInfo != null);
  }

  /**
   * Get the Target meta data.
   *
   * @return the Target meta data, or null if none is defined
   */
  public TargetMetaInfo getTargetMetaData() {
    return m_targetMetaInfo;
  }

  /**
   * Replaces the all-fields structure with a new header built from the
   * supplied attributes, carrying over the class index (if one is set)
   * from the structure being replaced. The mining-schema-only structure
   * is not touched.
   *
   * @param attInfo the attributes of the new structure
   */
  private void replaceFieldStructure(FastVector attInfo) {
    Instances newI = new Instances("miningSchema", attInfo, 0);
    if (m_fieldInstancesStructure.classIndex() >= 0) {
      newI.setClassIndex(m_fieldInstancesStructure.classIndex());
    }
    m_fieldInstancesStructure = newI;
  }

  /**
   * Method to convert any string attributes in the mining schema
   * Instances to nominal attributes. This may be necessary if there are
   * no Value elements defined for categorical fields in the data dictionary.
   * In this case, elements in the actual model definition will probably reveal
   * the valid values for categorical fields.
   *
   * Operates on the structure returned by getFieldsAsInstances(); does
   * nothing if that structure contains no string attributes.
   */
  public void convertStringAttsToNominal() {
    Instances miningSchemaI = getFieldsAsInstances();
    if (!miningSchemaI.checkForStringAttributes()) {
      return;
    }

    FastVector attInfo = new FastVector();
    for (int i = 0; i < miningSchemaI.numAttributes(); i++) {
      Attribute tempA = miningSchemaI.attribute(i);
      if (tempA.isString()) {
        // the string values seen so far become the nominal labels
        FastVector valueVector = new FastVector();
        for (int j = 0; j < tempA.numValues(); j++) {
          valueVector.addElement(tempA.value(j));
        }
        attInfo.addElement(new Attribute(tempA.name(), valueVector));
      } else {
        attInfo.addElement(tempA);
      }
    }
    replaceFieldStructure(attInfo);
  }

  /**
   * Convert a numeric attribute in the mining schema to nominal.
   * Operates on the structure returned by getFieldsAsInstances().
   * 
   * @param index the index of the attribute to convert
   * @param newVals an ArrayList of the values of the nominal attribute
   * @throws IllegalArgumentException if the attribute at the given index
   * is already nominal
   */
  public void convertNumericAttToNominal(int index, 
                                         ArrayList<String> newVals) {
    Instances miningSchemaI = getFieldsAsInstances();
    if (miningSchemaI.attribute(index).isNominal()) {
      throw new IllegalArgumentException("[MiningSchema] convertNumericAttToNominal: attribute is "
                                         + "already nominal!");
    }

    FastVector newValues = new FastVector();
    for (String newVal : newVals) {
      newValues.addElement(newVal);
    }

    FastVector attInfo = new FastVector();
    for (int i = 0; i < miningSchemaI.numAttributes(); i++) {
      Attribute tempA = miningSchemaI.attribute(i);
      if (i == index) {
        attInfo.addElement(new Attribute(tempA.name(), newValues));
      } else {
        attInfo.addElement(tempA);
      }
    }

    replaceFieldStructure(attInfo);
  }
  
  /**
   * Get the meta information for the derived fields (those from the
   * TransformationDictionary followed by those from LocalTransformations).
   * The internal list itself is returned, not a copy.
   *
   * @return the meta information for the derived fields
   */
  public ArrayList<DerivedFieldMetaInfo> getDerivedFields() {
    return m_derivedMeta;
  }
  
  /**
   * Get the meta information for the mining schema fields. The internal
   * list itself is returned, not a copy.
   *
   * @return the meta information for the mining schema fields
   */
  public ArrayList<MiningFieldMetaInfo> getMiningFields() {
    return m_miningMeta;
  }

  /**
   * Get a textual description of the mining schema.
   *
   * @return a textual description of the mining schema
   */
  @Override
  public String toString() {
    StringBuilder temp = new StringBuilder();
    
    if (m_transformationDictionary != null) {
      temp.append(m_transformationDictionary);
    }
    
    temp.append("Mining schema:\n\n");
    for (MiningFieldMetaInfo m : m_miningMeta) {
      temp.append(m).append("\n");
    }
    
    if (m_derivedMeta.size() > 0) {
      temp.append("\nDerived fields:\n\n");
      for (DerivedFieldMetaInfo d : m_derivedMeta) {
        temp.append(d).append("\n");
      }
    }
    temp.append("\n");
    return temp.toString();
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy