All downloads are FREE. The search and download functionality uses the official Maven repository.

weka.core.pmml.Discretize Maven / Gradle / Ivy

Go to download

The Waikato Environment for Knowledge Analysis (WEKA), a machine learning workbench. This version represents the developer version, the "bleeding edge" of development, you could say. New functionality gets added to this version.

There is a newer version: 3.9.6
Show newest version
/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    Discretize.java
 *    Copyright (C) 2008-2012 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.core.pmml;

import java.io.Serializable;
import java.util.ArrayList;

import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

import weka.core.Attribute;
import weka.core.Utils;

/**
 * Class encapsulating a Discretize Expression.
 * <p>
 * A Discretize expression reads a single numeric input field and maps it to
 * the value of the first {@code DiscretizeBin} that has an interval containing
 * the input. Optional {@code mapMissingTo} and {@code defaultValue} attributes
 * supply the result for missing inputs and for inputs that fall in no bin.
 * 
 * @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
 * @version $Revision 1.0 $
 */
public class Discretize extends Expression {

  /** ID added to avoid warning */
  private static final long serialVersionUID = -5809107997906180082L;

  /**
   * Inner class to encapsulate DiscretizeBin elements. Each bin holds a label
   * (the {@code binValue} attribute) and the intervals that map to that label.
   */
  protected class DiscretizeBin implements Serializable {

    /**
     * For serialization
     */
    private static final long serialVersionUID = 5810063243316808400L;

    /** The intervals for this DiscretizeBin */
    private final ArrayList<FieldMetaInfo.Interval> m_intervals =
      new ArrayList<FieldMetaInfo.Interval>();

    /** The bin value for this DiscretizeBin */
    private final String m_binValue;

    /**
     * If the optype is continuous or ordinal, we will attempt to parse the bin
     * value as a number and store it here. Stays at the missing value marker
     * if no parse was attempted or the parse failed.
     */
    private double m_numericBinValue = Utils.missingValue();

    /**
     * Constructs a bin from a DiscretizeBin element.
     * 
     * @param bin the DiscretizeBin element; its "Interval" descendants and its
     *          "binValue" attribute are read
     * @param opType the optype of the enclosing Discretize expression
     * @throws Exception if there is a problem constructing an interval
     */
    protected DiscretizeBin(Element bin, FieldMetaInfo.Optype opType)
      throws Exception {
      NodeList intervalNodes = bin.getElementsByTagName("Interval");
      for (int i = 0; i < intervalNodes.getLength(); i++) {
        Node intervalNode = intervalNodes.item(i);
        if (intervalNode.getNodeType() == Node.ELEMENT_NODE) {
          m_intervals.add(new FieldMetaInfo.Interval((Element) intervalNode));
        }
      }

      m_binValue = bin.getAttribute("binValue");

      if (opType == FieldMetaInfo.Optype.CONTINUOUS
        || opType == FieldMetaInfo.Optype.ORDINAL) {
        try {
          m_numericBinValue = Double.parseDouble(m_binValue);
        } catch (NumberFormatException ignored) {
          // Deliberately ignored: a non-numeric label leaves the numeric value
          // at the missing value marker, which setUpField() uses to detect
          // that the output has to be treated as categorical.
        }
      }
    }

    /**
     * Get the bin value for this DiscretizeBin
     * 
     * @return the bin value
     */
    protected String getBinValue() {
      return m_binValue;
    }

    /**
     * Get the value of this bin as a number (parsed from the string value).
     * 
     * @return the value of this bin as a number, or the missing value marker
     *         ({@code Utils.missingValue()}) if the string value of the bin
     *         was not, or could not be, interpreted as a number.
     */
    protected double getBinValueNumeric() {
      return m_numericBinValue;
    }

    /**
     * Returns true if there is an interval that contains the incoming value.
     * 
     * @param value the value to check against
     * @return true if there is an interval that contains the supplied value
     */
    protected boolean containsValue(double value) {
      for (FieldMetaInfo.Interval interval : m_intervals) {
        if (interval.containsValue(value)) {
          return true;
        }
      }

      return false;
    }

    /**
     * Returns a one-line description of this bin: the quoted bin value
     * followed by a comma separated list of its intervals.
     * 
     * @return a textual description of this bin
     */
    @Override
    public String toString() {
      StringBuilder buff = new StringBuilder();

      buff.append("\"").append(m_binValue).append("\" if value in: ");
      boolean first = true;
      for (FieldMetaInfo.Interval interval : m_intervals) {
        if (!first) {
          buff.append(", ");
        } else {
          first = false;
        }
        buff.append(interval.toString());
      }

      return buff.toString();
    }
  }

  /** The name of the field to be discretized */
  protected String m_fieldName;

  /** The index of the field */
  protected int m_fieldIndex;

  /** True if a replacement for missing values has been specified */
  protected boolean m_mapMissingDefined = false;

  /** The value of the missing value replacement (if defined) */
  protected String m_mapMissingTo;

  /** True if a default value has been specified */
  protected boolean m_defaultValueDefined = false;

  /** The default value (if defined) */
  protected String m_defaultValue;

  /** The bins for this discretization */
  protected ArrayList<DiscretizeBin> m_bins = new ArrayList<DiscretizeBin>();

  /** The output structure of this discretization */
  protected Attribute m_outputDef;

  /**
   * Constructs a Discretize Expression
   * 
   * @param discretize the Element containing the discretize expression
   * @param opType the optype of this Discretize Expression
   * @param fieldDefs the structure of the incoming fields (may be null, in
   *          which case the field is resolved later via
   *          {@link #setFieldDefs(ArrayList)})
   * @throws Exception if there is a problem parsing this element or resolving
   *           the referenced field
   */
  public Discretize(Element discretize, FieldMetaInfo.Optype opType,
    ArrayList<Attribute> fieldDefs) throws Exception {
    super(opType, fieldDefs);

    // No restriction is placed on the optype here: with a continuous (or
    // ordinal) optype the bin values are parsed as numbers where possible.

    m_fieldName = discretize.getAttribute("field");

    m_mapMissingTo = discretize.getAttribute("mapMissingTo");
    if (m_mapMissingTo != null && m_mapMissingTo.length() > 0) {
      m_mapMissingDefined = true;
    }

    m_defaultValue = discretize.getAttribute("defaultValue");
    if (m_defaultValue != null && m_defaultValue.length() > 0) {
      m_defaultValueDefined = true;
    }

    // get the DiscretizeBin Elements
    NodeList binNodes = discretize.getElementsByTagName("DiscretizeBin");
    for (int i = 0; i < binNodes.getLength(); i++) {
      Node binNode = binNodes.item(i);
      if (binNode.getNodeType() == Node.ELEMENT_NODE) {
        m_bins.add(new DiscretizeBin((Element) binNode, m_opType));
      }
    }

    if (fieldDefs != null) {
      setUpField();
    }
  }

  /**
   * Set the field definitions for this Expression to use
   * 
   * @param fieldDefs the field definitions to use
   * @throws Exception if there is a problem setting the field definitions
   */
  @Override
  public void setFieldDefs(ArrayList<Attribute> fieldDefs) throws Exception {
    super.setFieldDefs(fieldDefs);
    setUpField();
  }

  /**
   * Resolves the index of the input field in the current field definitions
   * and builds the output attribute definition.
   * <p>
   * The output is nominal when the optype is neither continuous nor ordinal,
   * or when at least one bin value could not be parsed as a number; otherwise
   * it is a numeric attribute.
   * 
   * @throws Exception if the field can't be found in the field definitions or
   *           is not numeric
   */
  private void setUpField() throws Exception {
    m_fieldIndex = -1;

    if (m_fieldDefs != null) {
      m_fieldIndex = getFieldDefIndex(m_fieldName);
      if (m_fieldIndex < 0) {
        throw new Exception("[Discretize] Can't find field " + m_fieldName
          + " in the supplied field definitions.");
      }

      Attribute field = m_fieldDefs.get(m_fieldIndex);
      if (!field.isNumeric()) {
        throw new Exception("[Discretize] reference field " + m_fieldName
          + " must be continuous.");
      }
    }

    // set up the output structure
    boolean categorical = false;
    if (m_opType == FieldMetaInfo.Optype.CONTINUOUS
      || m_opType == FieldMetaInfo.Optype.ORDINAL) {
      // check to see if all bin values could be parsed as numbers
      for (DiscretizeBin d : m_bins) {
        if (Utils.isMissingValue(d.getBinValueNumeric())) {
          categorical = true;
          break;
        }
      }
    } else {
      categorical = true;
    }

    if (!categorical) {
      m_outputDef = new Attribute(m_fieldName
        + "_discretized(optype=continuous)");
      return;
    }

    // Collect the labels in a temporary attribute first (bins, then default
    // value, then map-missing value), then copy them into the final nominal
    // attribute.
    Attribute tempAtt = new Attribute("temp", (ArrayList<String>) null);
    for (DiscretizeBin d : m_bins) {
      tempAtt.addStringValue(d.getBinValue());
    }

    // add the default value (just in case it is some other value than one
    // of the bins)
    if (m_defaultValueDefined) {
      tempAtt.addStringValue(m_defaultValue);
    }

    // add the map missing to value (just in case it is some other value than
    // one of the bins)
    if (m_mapMissingDefined) {
      tempAtt.addStringValue(m_mapMissingTo);
    }

    // now make this into a nominal attribute
    ArrayList<String> values = new ArrayList<String>();
    for (int i = 0; i < tempAtt.numValues(); i++) {
      values.add(tempAtt.value(i));
    }

    m_outputDef = new Attribute(m_fieldName + "_discretized", values);
  }

  /**
   * Return the structure of the result of applying this Expression as an
   * Attribute.
   * 
   * @return the structure of the result of applying this Expression as an
   *         Attribute.
   */
  @Override
  protected Attribute getOutputDef() {
    if (m_outputDef == null) {
      // return a "default" output def. This will get replaced
      // by a final one when the final field defs are set
      // for all expressions after all derived fields are collected
      if (m_opType == FieldMetaInfo.Optype.CATEGORICAL
        || m_opType == FieldMetaInfo.Optype.ORDINAL) {
        return new Attribute(m_fieldName + "_discretized",
          new ArrayList<String>());
      }
      return new Attribute(m_fieldName + "_discretized(optype=continuous)");
    }
    return m_outputDef;
  }

  /**
   * Get the result of evaluating the expression. If the output definition is
   * numeric, a real number is returned; if it is nominal, the index of the
   * nominal value is returned as a double. A missing value is returned when
   * the input is missing and no mapMissingTo is defined, or when no bin
   * contains the input and no default value is defined.
   * 
   * @param incoming the incoming parameter values
   * @return the result of evaluating the expression
   * @throws Exception if there is a problem computing the result
   */
  @Override
  public double getResult(double[] incoming) throws Exception {

    // default of a missing value for the result if none of the following
    // logic applies
    double result = Utils.missingValue();

    double value = incoming[m_fieldIndex];

    if (Utils.isMissingValue(value)) {
      if (m_mapMissingDefined) {
        if (m_outputDef.isNominal()) {
          result = m_outputDef.indexOfValue(m_mapMissingTo);
        } else {
          try {
            result = Double.parseDouble(m_mapMissingTo);
          } catch (NumberFormatException ex) {
            throw new Exception(
              "[Discretize] Optype is continuous but value of mapMissingTo "
                + "can not be parsed as a number!", ex);
          }
        }
      }
    } else {
      // look for a bin that has an interval that contains this value;
      // the first matching bin wins
      boolean found = false;
      for (DiscretizeBin b : m_bins) {
        if (b.containsValue(value)) {
          found = true;
          if (m_outputDef.isNominal()) {
            result = m_outputDef.indexOfValue(b.getBinValue());
          } else {
            result = b.getBinValueNumeric();
          }
          break;
        }
      }

      if (!found && m_defaultValueDefined) {
        if (m_outputDef.isNominal()) {
          result = m_outputDef.indexOfValue(m_defaultValue);
        } else {
          try {
            result = Double.parseDouble(m_defaultValue);
          } catch (NumberFormatException ex) {
            throw new Exception(
              "[Discretize] Optype is continuous but value of "
                + "default value can not be parsed as a number!", ex);
          }
        }
      }
    }

    return result;
  }

  /**
   * Gets the result of evaluating the expression as the actual String value.
   * The result of {@link #getResult(double[])} is used as an index into the
   * output definition. Note that this method does not itself verify that the
   * output definition is nominal.
   * 
   * @param incoming the incoming parameter values
   * @return the String value of the result, or "**Missing Value**" if the
   *         result is missing
   * @throws Exception if there is a problem computing the result
   */
  @Override
  public String getResultCategorical(double[] incoming) throws Exception {
    double index = getResult(incoming);
    if (Utils.isMissingValue(index)) {
      return "**Missing Value**";
    }

    return m_outputDef.value((int) index);
  }

  /*
   * (non-Javadoc)
   * 
   * @see weka.core.pmml.Expression#toString(java.lang.String)
   */
  @Override
  public String toString(String pad) {
    StringBuilder buff = new StringBuilder();

    buff.append(pad).append("Discretize (").append(m_fieldName).append("):");
    for (DiscretizeBin d : m_bins) {
      buff.append("\n").append(pad).append(d.toString());
    }

    // m_outputDef is still null if no field definitions have been set yet
    if (m_outputDef != null && m_outputDef.isNumeric()) {
      buff.append("\n").append(pad)
        .append("(bin values interpreted as numbers)");
    }

    if (m_mapMissingDefined) {
      buff.append("\n").append(pad).append("map missing values to: ")
        .append(m_mapMissingTo);
    }

    if (m_defaultValueDefined) {
      buff.append("\n").append(pad).append("default value: ")
        .append(m_defaultValue);
    }

    return buff.toString();
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy