All Downloads are FREE. Search and download functionalities are using the official Maven repository.

weka.core.NormalizableDistance Maven / Gradle / Ivy

Go to download

The Waikato Environment for Knowledge Analysis (WEKA), a machine learning workbench. This is the stable version. Apart from bugfixes, this version does not receive any other updates.

There is a newer version: 3.8.6
Show newest version
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    NormalizableDistance.java
 *    Copyright (C) 2007 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.core;

import java.io.Serializable;
import java.util.Enumeration;
import java.util.Vector;

import weka.core.neighboursearch.PerformanceStats;

/**
 * Represents the abstract ancestor for normalizable distance functions, like
 * Euclidean or Manhattan distance.
 * 
 * @author Fracpete (fracpete at waikato dot ac dot nz)
 * @author Gabi Schmidberger ([email protected]) -- original code from
 *         weka.core.EuclideanDistance
 * @author Ashraf M. Kibriya ([email protected]) -- original code from
 *         weka.core.EuclideanDistance
 * @version $Revision: 1.2 $
 */
public abstract class NormalizableDistance implements DistanceFunction,
  OptionHandler, Serializable, RevisionHandler {

  /** Column index of the minimum value inside one attribute's row of a ranges array. */
  public static final int R_MIN = 0;

  /** Column index of the maximum value inside one attribute's row of a ranges array. */

  public static final int R_MAX = 1;

  /** Column index of the width (maximum minus minimum) inside one attribute's row of a ranges array. */
  public static final int R_WIDTH = 2;

  /** The instances used internally; stays null until a dataset is supplied. */
  protected Instances m_Data = null;

  /** True if normalization is turned off (default false, i.e. values are normalized). */
  protected boolean m_DontNormalize = false;

  /**
   * The per-attribute ranges: one row per attribute, indexed by R_MIN, R_MAX
   * and R_WIDTH.
   */
  protected double[][] m_Ranges;

  /** The attribute selection used for calculating the distance (default "first-last"). */
  protected Range m_AttributeIndices = new Range("first-last");

  /** One flag per attribute: true if the attribute takes part in the distance calculation. */
  protected boolean[] m_ActiveIndices;

  /** Whether initialize() has been run since the last call to invalidate(). */
  protected boolean m_Validated;

  /**
   * Creates a distance function without a dataset. The object starts in the
   * invalidated state; instances still have to be supplied via
   * setInstances(Instances).
   */
  public NormalizableDistance() {
    super();
    invalidate();
  }

  /**
   * Creates a distance function for the given dataset by handing it to
   * setInstances(Instances).
   * 
   * @param data the instances the distance function should work on
   */
  public NormalizableDistance(Instances data) {
    super();
    setInstances(data);
  }

  /**
   * Returns a string describing this object. Concrete distance functions
   * have to supply the text.
   * 
   * @return a description of the distance function suitable for displaying in
   *         the explorer/experimenter gui
   */
  public abstract String globalInfo();

  /**
   * Returns an enumeration describing the available options: -D (turn off
   * normalization), -R (attribute range) and -V (invert the range).
   * 
   * @return an enumeration of all the available options.
   */
  @Override
  public Enumeration listOptions() {
    // typed vector instead of a raw one; the declared return type is left
    // untouched so that the method signature stays as it was
    Vector<Option> result = new Vector<Option>();

    result.addElement(new Option("\tTurns off the normalization of attribute \n"
      + "\tvalues in distance calculation.", "D", 0, "-D"));

    // the synopsis names the expected argument so that generated help output
    // shows what has to follow -R
    result.addElement(new Option(
      "\tSpecifies list of columns to used in the calculation of the \n"
        + "\tdistance. 'first' and 'last' are valid indices.\n"
        + "\t(default: first-last)", "R", 1, "-R <col1,col2-col4,...>"));

    result.addElement(new Option("\tInvert matching sense of column indices.",
      "V", 0, "-V"));

    return result.elements();
  }

  /**
   * Gets the current settings as a command line style option array. "-D" is
   * included when normalization is turned off, "-R" followed by the attribute
   * range is always included, and "-V" is included when the selection is
   * inverted.
   * 
   * @return an array of strings suitable for passing to setOptions()
   */
  @Override
  public String[] getOptions() {
    // a typed vector is required here: toArray(T[]) on a raw Vector yields
    // Object[], which cannot be returned as String[]
    Vector<String> result = new Vector<String>();

    if (getDontNormalize()) {
      result.add("-D");
    }

    result.add("-R");
    result.add(getAttributeIndices());

    if (getInvertSelection()) {
      result.add("-V");
    }

    return result.toArray(new String[result.size()]);
  }

  /**
   * Parses a given list of options: the -D flag, the -R attribute range
   * (falling back to "first-last" when absent or empty) and the -V flag.
   * 
   * @param options the list of options as an array of strings
   * @throws Exception if an option is not supported
   */
  @Override
  public void setOptions(String[] options) throws Exception {
    // the options are consumed in the order D, R, V
    final boolean dontNormalize = Utils.getFlag('D', options);
    setDontNormalize(dontNormalize);

    final String rangeSpec = Utils.getOption('R', options);
    setAttributeIndices(rangeSpec.length() != 0 ? rangeSpec : "first-last");

    final boolean invert = Utils.getFlag('V', options);
    setInvertSelection(invert);
  }

  /**
   * Returns the tip text for the dontNormalize property.
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String dontNormalizeTipText() {
    final String tip =
      "Whether if the normalization of attributes should be turned off "
        + "for distance calculation (Default: false i.e. attribute values "
        + "are normalized). ";
    return tip;
  }

  /**
   * Sets whether normalization of the attribute values is turned off in the
   * distance calculation, and invalidates the current initialization.
   * 
   * @param dontNormalize if true the values are not normalized
   */
  public void setDontNormalize(boolean dontNormalize) {
    this.m_DontNormalize = dontNormalize;
    this.invalidate();
  }

  /**
   * Gets whether normalization of the attribute values is turned off in the
   * distance calculation (default false, i.e. attribute values are
   * normalized).
   * 
   * @return false if values get normalized
   */
  public boolean getDontNormalize() {
    return this.m_DontNormalize;
  }

  /**
   * Returns the tip text for the attributeIndices property.
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String attributeIndicesTipText() {
    final String tip = "Specify range of attributes to act on. "
      + "This is a comma separated list of attribute indices, with "
      + "\"first\" and \"last\" valid values. Specify an inclusive "
      + "range with \"-\". E.g: \"first-3,5,6-10,last\".";
    return tip;
  }

  /**
   * Sets the range of attributes to use in the calculation of the distance
   * and invalidates the current initialization. The indices start from 1,
   * 'first' and 'last' are valid as well. E.g.: first-3,5,6-last
   * 
   * @param value the new attribute index range
   */
  @Override
  public void setAttributeIndices(String value) {
    this.m_AttributeIndices.setRanges(value);
    this.invalidate();
  }

  /**
   * Gets the range of attributes used in the calculation of the distance, as
   * reported by the underlying Range object.
   * 
   * @return the attribute index range
   */
  @Override
  public String getAttributeIndices() {
    final String ranges = this.m_AttributeIndices.getRanges();
    return ranges;
  }

  /**
   * Returns the tip text for the invertSelection property.
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String invertSelectionTipText() {
    final String tip = "Set attribute selection mode. If false, only selected "
      + "attributes in the range will be used in the distance calculation; if "
      + "true, only non-selected attributes will be used for the calculation.";
    return tip;
  }

  /**
   * Sets whether the matching sense of attribute indices is inverted or not,
   * and invalidates the current initialization.
   * 
   * @param value if true the matching sense is inverted
   */
  @Override
  public void setInvertSelection(boolean value) {
    this.m_AttributeIndices.setInvert(value);
    this.invalidate();
  }

  /**
   * Gets whether the matching sense of attribute indices is inverted or not,
   * as reported by the underlying Range object.
   * 
   * @return true if the matching sense is inverted
   */
  @Override
  public boolean getInvertSelection() {
    final boolean inverted = this.m_AttributeIndices.getInvert();
    return inverted;
  }

  /**
   * Marks all initializations as stale, so that the next call to validate()
   * runs initialize() again.
   */
  protected void invalidate() {
    this.m_Validated = false;
  }

  /**
   * Performs the initializations if they have not been done since the last
   * invalidation; otherwise does nothing.
   */
  protected void validate() {
    if (m_Validated) {
      return;
    }

    initialize();
    m_Validated = true;
  }

  /**
   * Sets up the flags of the attributes being used first, then the attribute
   * ranges.
   */
  protected void initialize() {
    this.initializeAttributeIndices();
    this.initializeRanges();
  }

  /**
   * Initializes the attribute indices: sets the upper limit of the attribute
   * range to the last attribute of the dataset and records, for every
   * attribute, whether it lies in the selected range.
   * <p>
   * If no dataset has been set yet, the flags are cleared (set to null)
   * instead of failing, in line with initializeRanges(), which clears the
   * ranges in that situation.
   */
  protected void initializeAttributeIndices() {
    if (m_Data == null) {
      // nothing to derive the flags from; avoid a NullPointerException so
      // that validate() works on an object without instances
      m_ActiveIndices = null;
      return;
    }

    int numAtt = m_Data.numAttributes();
    m_AttributeIndices.setUpper(numAtt - 1);
    m_ActiveIndices = new boolean[numAtt];
    for (int i = 0; i < m_ActiveIndices.length; i++) {
      m_ActiveIndices[i] = m_AttributeIndices.isInRange(i);
    }
  }

  /**
   * Sets the instances to work on and invalidates the current
   * initialization.
   * 
   * @param insts the instances to use
   */
  @Override
  public void setInstances(Instances insts) {
    this.m_Data = insts;
    this.invalidate();
  }

  /**
   * Returns the instances currently set (may be null if none were supplied).
   * 
   * @return the current instances
   */
  @Override
  public Instances getInstances() {
    return this.m_Data;
  }

  /**
   * Leaves the distances untouched. Derived classes may override this method
   * to post-process them.
   * 
   * @param distances the distances to post-process
   */
  @Override
  public void postProcessDistances(double[] distances) {
    // intentionally empty
  }

  /**
   * Updates the distance function for a newly added instance: makes sure the
   * initializations are done and extends the stored ranges with the values
   * of the instance.
   * 
   * @param ins the instance to add
   */
  @Override
  public void update(Instance ins) {
    validate();

    final double[][] refreshed = updateRanges(ins, m_Ranges);
    m_Ranges = refreshed;
  }

  /**
   * Calculates the distance between two instances, without collecting
   * performance statistics.
   * 
   * @param first the first instance
   * @param second the second instance
   * @return the distance between the two given instances
   */
  @Override
  public double distance(Instance first, Instance second) {
    final PerformanceStats noStats = null;
    return distance(first, second, noStats);
  }

  /**
   * Calculates the distance between two instances, using positive infinity
   * as cut-off value.
   * 
   * @param first the first instance
   * @param second the second instance
   * @param stats the performance stats object
   * @return the distance between the two given instances
   */
  @Override
  public double distance(Instance first, Instance second, PerformanceStats stats) {
    final double noCutOff = Double.POSITIVE_INFINITY;
    return distance(first, second, noCutOff, stats);
  }

  /**
   * Calculates the distance between two instances, without collecting
   * performance statistics. Offers speed up (if the distance function class
   * in use supports it) in nearest neighbour search by taking into account
   * the cutOff or maximum distance. Depending on the distance function class,
   * post processing of the distances by postProcessDistances(double []) may
   * be required if this function is used.
   * 
   * @param first the first instance
   * @param second the second instance
   * @param cutOffValue If the distance being calculated becomes larger than
   *          cutOffValue then the rest of the calculation is discarded.
   * @return the distance between the two given instances or
   *         Double.POSITIVE_INFINITY if the distance being calculated becomes
   *         larger than cutOffValue.
   */
  @Override
  public double distance(Instance first, Instance second, double cutOffValue) {
    final PerformanceStats noStats = null;
    return distance(first, second, cutOffValue, noStats);
  }

  /**
   * Calculates the distance between two instances. Offers speed up (if the
   * distance function class in use supports it) in nearest neighbour search by
   * taking into account the cutOff or maximum distance. Depending on the
   * distance function class, post processing of the distances by
   * postProcessDistances(double []) may be required if this function is used.
   * <p>
   * The stored values of both instances are walked in parallel by attribute
   * index. The class attribute and attributes that are not active are
   * skipped. An attribute that is stored in only one of the instances is
   * compared against the value 0.
   * 
   * @param first the first instance
   * @param second the second instance
   * @param cutOffValue If the distance being calculated becomes larger than
   *          cutOffValue then the rest of the calculation is discarded.
   * @param stats the performance stats object; may be null, in which case no
   *          statistics are recorded
   * @return the distance between the two given instances or
   *         Double.POSITIVE_INFINITY if the distance being calculated becomes
   *         larger than cutOffValue.
   */
  @Override
  public double distance(Instance first, Instance second, double cutOffValue,
    PerformanceStats stats) {
    double distance = 0;
    int firstI, secondI;
    int firstNumValues = first.numValues();
    int secondNumValues = second.numValues();
    // NOTE(review): m_Data is dereferenced here, before validate() runs, so
    // a dataset must have been set before this method is called
    int numAttributes = m_Data.numAttributes();
    int classIndex = m_Data.classIndex();

    validate();

    // p1 and p2 are positions in the stored values of first and second
    for (int p1 = 0, p2 = 0; p1 < firstNumValues || p2 < secondNumValues;) {
      // once an instance has no stored values left, numAttributes is used as
      // its attribute index so that the other instance is always "behind"
      if (p1 >= firstNumValues) {
        firstI = numAttributes;
      } else {
        firstI = first.index(p1);
      }

      if (p2 >= secondNumValues) {
        secondI = numAttributes;
      } else {
        secondI = second.index(p2);
      }

      // skip the class attribute and inactive attributes of the first
      // instance
      if (firstI == classIndex) {
        p1++;
        continue;
      }
      if ((firstI < numAttributes) && !m_ActiveIndices[firstI]) {
        p1++;
        continue;
      }

      // same for the second instance
      if (secondI == classIndex) {
        p2++;
        continue;
      }
      if ((secondI < numAttributes) && !m_ActiveIndices[secondI]) {
        p2++;
        continue;
      }

      double diff;

      if (firstI == secondI) {
        // both instances store a value for this attribute
        diff = difference(firstI, first.valueSparse(p1), second.valueSparse(p2));
        p1++;
        p2++;
      } else if (firstI > secondI) {
        // only the second instance stores a value; compare against 0
        diff = difference(secondI, 0, second.valueSparse(p2));
        p2++;
      } else {
        // only the first instance stores a value; compare against 0
        diff = difference(firstI, first.valueSparse(p1), 0);
        p1++;
      }
      if (stats != null) {
        stats.incrCoordCount();
      }

      distance = updateDistance(distance, diff);
      // give up as soon as the accumulated distance exceeds the cut-off
      if (distance > cutOffValue) {
        return Double.POSITIVE_INFINITY;
      }
    }

    return distance;
  }

  /**
   * Updates the current distance calculated so far with the new difference
   * between two attributes. The difference between the attributes was
   * calculated with the difference(int,double,double) method. How the
   * difference is folded into the distance is left to the concrete subclass.
   * 
   * @param currDist the current distance calculated so far
   * @param diff the difference between two new attributes
   * @return the updated distance
   * @see #difference(int, double, double)
   */
  protected abstract double updateDistance(double currDist, double diff);

  /**
   * Normalizes a given value of a numeric attribute using the stored ranges:
   * the value's offset from the attribute's minimum divided by the
   * attribute's width. Yields 0 if the minimum is NaN or if minimum and
   * maximum are equal.
   * 
   * @param x the value to be normalized
   * @param i the attribute's index
   * @return the normalized value
   */
  protected double norm(double x, int i) {
    final double[] range = m_Ranges[i];
    final double min = range[R_MIN];

    if (Double.isNaN(min) || range[R_MAX] == min) {
      return 0;
    }

    return (x - min) / range[R_WIDTH];
  }

  /**
   * Computes the difference between two given attribute values. The
   * computation depends on the type of the attribute:
   * <ul>
   * <li>nominal: 1 if either value is missing or the values differ, 0
   * otherwise</li>
   * <li>numeric: the (optionally normalized) signed difference val1 - val2;
   * if values are missing, a substitute difference is returned (see the
   * comments in the code)</li>
   * <li>any other type: 0</li>
   * </ul>
   * 
   * @param index the attribute index
   * @param val1 the first value
   * @param val2 the second value
   * @return the difference
   */
  protected double difference(int index, double val1, double val2) {
    switch (m_Data.attribute(index).type()) {
    case Attribute.NOMINAL:
      // nominal values are compared via their integer codes
      if (Instance.isMissingValue(val1) || Instance.isMissingValue(val2)
        || ((int) val1 != (int) val2)) {
        return 1;
      } else {
        return 0;
      }

    case Attribute.NUMERIC:
      if (Instance.isMissingValue(val1) || Instance.isMissingValue(val2)) {
        if (Instance.isMissingValue(val1) && Instance.isMissingValue(val2)) {
          // both missing: 1 when normalizing, otherwise the full span
          // (max - min) of the attribute
          if (!m_DontNormalize) {
            return 1;
          } else {
            return (m_Ranges[index][R_MAX] - m_Ranges[index][R_MIN]);
          }
        } else {
          // exactly one value is missing; diff first holds the
          // (optionally normalized) value that is present
          double diff;
          if (Instance.isMissingValue(val2)) {
            diff = (!m_DontNormalize) ? norm(val1, index) : val1;
          } else {
            diff = (!m_DontNormalize) ? norm(val2, index) : val2;
          }
          if (!m_DontNormalize && diff < 0.5) {
            // normalized: a present value below 0.5 yields 1 - value,
            // otherwise the value itself is returned below
            diff = 1.0 - diff;
          } else if (m_DontNormalize) {
            // not normalized: return the larger of the distances from the
            // present value to the attribute's maximum and minimum
            if ((m_Ranges[index][R_MAX] - diff) > (diff - m_Ranges[index][R_MIN])) {
              return m_Ranges[index][R_MAX] - diff;
            } else {
              return diff - m_Ranges[index][R_MIN];
            }
          }
          return diff;
        }
      } else {
        // no missing values: plain signed difference, normalized if enabled
        return (!m_DontNormalize) ? (norm(val1, index) - norm(val2, index))
          : (val1 - val2);
      }

    default:
      // attributes of other types do not contribute
      return 0;
    }
  }

  /**
   * Initializes the ranges using all instances of the dataset and stores them
   * in m_Ranges. Without a dataset m_Ranges is set to null; with a dataset
   * that has no instances the ranges are set up via initializeRangesEmpty.
   * 
   * @return the ranges
   */
  public double[][] initializeRanges() {
    if (m_Data == null) {
      m_Ranges = null;
      return m_Ranges;
    }

    final int numAtt = m_Data.numAttributes();
    final double[][] ranges = new double[numAtt][3];

    if (m_Data.numInstances() <= 0) {
      initializeRangesEmpty(numAtt, ranges);
    } else {
      // the first instance seeds the ranges, the remaining ones extend them
      updateRangesFirst(m_Data.instance(0), numAtt, ranges);
      int i = 1;
      while (i < m_Data.numInstances()) {
        updateRanges(m_Data.instance(i), numAtt, ranges);
        i++;
      }
    }

    m_Ranges = ranges;
    return m_Ranges;
  }

  /**
   * Seeds the ranges from a first instance. For every attribute with a value,
   * minimum and maximum are set to that value and the width to zero. For
   * every attribute whose value is missing, the minimum is set to positive
   * infinity, the maximum to negative infinity and the width to positive
   * infinity.
   * 
   * @param instance the new instance
   * @param numAtt number of attributes in the model
   * @param ranges low, high and width values for all attributes
   */
  public void updateRangesFirst(Instance instance, int numAtt, double[][] ranges) {
    for (int j = 0; j < numAtt; j++) {
      final double[] row = ranges[j];
      if (instance.isMissing(j)) {
        row[R_MIN] = Double.POSITIVE_INFINITY;
        row[R_MAX] = Double.NEGATIVE_INFINITY;
        row[R_WIDTH] = Double.POSITIVE_INFINITY;
      } else {
        final double value = instance.value(j);
        row[R_MIN] = value;
        row[R_MAX] = value;
        row[R_WIDTH] = 0.0;
      }
    }
  }

  /**
   * Extends the minimum, maximum and width values of all attributes so that
   * they cover the values of a new instance. Missing values are ignored.
   * The ranges must have been set up beforehand, e.g. by updateRangesFirst.
   * 
   * @param instance the new instance
   * @param numAtt number of attributes in the model
   * @param ranges low, high and width values for all attributes
   */
  public void updateRanges(Instance instance, int numAtt, double[][] ranges) {
    for (int j = 0; j < numAtt; j++) {
      final double value = instance.value(j);
      if (instance.isMissing(j)) {
        continue;
      }

      final double[] row = ranges[j];
      boolean changed = false;

      if (value < row[R_MIN]) {
        row[R_MIN] = value;
        changed = true;
        // a value below the minimum can only exceed the maximum as well if
        // it is the first non-missing value seen for this attribute
        if (value > row[R_MAX]) {
          row[R_MAX] = value;
        }
      } else if (value > row[R_MAX]) {
        row[R_MAX] = value;
        changed = true;
      }

      if (changed) {
        row[R_WIDTH] = row[R_MAX] - row[R_MIN];
      }
    }
  }

  /**
   * Sets up the ranges for the case that no values are known: for every
   * attribute the minimum becomes positive infinity, the maximum negative
   * infinity and the width positive infinity.
   * 
   * @param numAtt number of attributes in the model
   * @param ranges low, high and width values for all attributes
   */
  public void initializeRangesEmpty(int numAtt, double[][] ranges) {
    int j = 0;
    while (j < numAtt) {
      final double[] row = ranges[j];
      row[R_MIN] = Double.POSITIVE_INFINITY;
      row[R_MAX] = Double.NEGATIVE_INFINITY;
      row[R_WIDTH] = Double.POSITIVE_INFINITY;
      j++;
    }
  }

  /**
   * Updates the ranges given a new instance, so that minimum, maximum and
   * width of every attribute cover the instance's values. Missing values are
   * ignored. The given array is modified in place and returned.
   * <p>
   * Minimum and maximum are checked independently of each other. This matters
   * for an attribute whose range is still empty (minimum positive infinity,
   * maximum negative infinity): its first value has to become both the new
   * minimum and the new maximum, otherwise the maximum would stay at negative
   * infinity and the width would become negative infinity.
   * 
   * @param instance the new instance
   * @param ranges low, high and width values for all attributes
   * @return the updated ranges
   */
  public double[][] updateRanges(Instance instance, double[][] ranges) {
    // the ranges must have been set up before (updateRangesFirst or
    // initializeRangesEmpty)
    for (int j = 0; j < ranges.length; j++) {
      double value = instance.value(j);
      if (instance.isMissing(j)) {
        continue;
      }

      boolean changed = false;
      if (value < ranges[j][R_MIN]) {
        ranges[j][R_MIN] = value;
        changed = true;
      }
      if (value > ranges[j][R_MAX]) {
        ranges[j][R_MAX] = value;
        changed = true;
      }
      if (changed) {
        ranges[j][R_WIDTH] = ranges[j][R_MAX] - ranges[j][R_MIN];
      }
    }

    return ranges;
  }

  /**
   * Initializes the ranges of a subset of the instances of this dataset.
   * Therefore m_Ranges is not set. If the dataset has no instances or the
   * list of indexes is empty, ranges set up by initializeRangesEmpty are
   * returned.
   * 
   * @param instList list of indexes of the subset
   * @return the ranges
   * @throws Exception if no instances have been supplied to this distance
   *           function
   */
  public double[][] initializeRanges(int[] instList) throws Exception {
    if (m_Data == null) {
      throw new Exception("No instances supplied.");
    }

    int numAtt = m_Data.numAttributes();
    double[][] ranges = new double[numAtt][3];

    // an empty index list has no first instance to seed the ranges from;
    // treat it like an empty dataset instead of failing on instList[0]
    if (m_Data.numInstances() <= 0 || instList.length == 0) {
      initializeRangesEmpty(numAtt, ranges);
      return ranges;
    }

    // initialize ranges using the first listed instance
    updateRangesFirst(m_Data.instance(instList[0]), numAtt, ranges);
    // update ranges, starting from the second listed instance
    for (int i = 1; i < instList.length; i++) {
      updateRanges(m_Data.instance(instList[i]), numAtt, ranges);
    }

    return ranges;
  }

  /**
   * Initializes the ranges of a subset of the instances of this dataset,
   * given as the slice startIdx..endIdx (both inclusive) of an index array.
   * Therefore m_Ranges is not set. The caller of this method should ensure
   * that the supplied start and end indices are valid (start <= end,
   * end<instList.length etc) and correct.
   * 
   * @param instList list of indexes of the instances
   * @param startIdx start index of the subset of instances in the indices array
   * @param endIdx end index of the subset of instances in the indices array
   * @return the ranges
   * @throws Exception if no instances have been supplied to this distance
   *           function
   */
  public double[][] initializeRanges(int[] instList, int startIdx, int endIdx)
    throws Exception {
    if (m_Data == null) {
      throw new Exception("No instances supplied.");
    }

    final int numAtt = m_Data.numAttributes();
    final double[][] ranges = new double[numAtt][3];

    if (m_Data.numInstances() <= 0) {
      initializeRangesEmpty(numAtt, ranges);
      return ranges;
    }

    // the instance at startIdx seeds the ranges, the rest of the slice
    // extends them
    updateRangesFirst(m_Data.instance(instList[startIdx]), numAtt, ranges);
    int pos = startIdx + 1;
    while (pos <= endIdx) {
      updateRanges(m_Data.instance(instList[pos]), numAtt, ranges);
      pos++;
    }

    return ranges;
  }

  /**
   * Updates the stored ranges for a new instance, after making sure the
   * initializations are done.
   * 
   * @param instance the new instance
   */
  public void updateRanges(Instance instance) {
    validate();

    final double[][] refreshed = updateRanges(instance, m_Ranges);
    m_Ranges = refreshed;
  }

  /**
   * Tests if an instance is within the given ranges, i.e. whether every
   * non-missing value lies between the minimum and the maximum (both
   * inclusive) of its attribute. Missing values are ignored.
   * 
   * @param instance the instance
   * @param ranges the ranges the instance is tested to be in
   * @return true if instance is within the ranges
   */
  public boolean inRanges(Instance instance, double[][] ranges) {
    for (int j = 0; j < ranges.length; j++) {
      if (instance.isMissing(j)) {
        continue;
      }

      final double value = instance.value(j);
      final boolean within =
        value <= ranges[j][R_MAX] && value >= ranges[j][R_MIN];
      if (!within) {
        // the first attribute out of range decides the result
        return false;
      }
    }

    return true;
  }

  /**
   * Frees any references to training instances by replacing the dataset with
   * a new Instances object created from it with capacity 0 (presumably a
   * header-only copy -- see the Instances(Instances, int) constructor). Does
   * nothing if no dataset has been set.
   */
  @Override
  public void clean() {
    // without this guard an object that never received instances would fail
    // with a NullPointerException
    if (m_Data != null) {
      m_Data = new Instances(m_Data, 0);
    }
  }

  /**
   * Checks if ranges are set, i.e. whether m_Ranges is not null.
   * 
   * @return true if ranges are set
   */
  public boolean rangesSet() {
    final boolean present = this.m_Ranges != null;
    return present;
  }

  /**
   * Returns the ranges, after making sure the initializations are done.
   * 
   * @return the ranges
   * @throws Exception if no ranges are set yet
   */
  public double[][] getRanges() throws Exception {
    validate();

    final double[][] current = m_Ranges;
    if (current == null) {
      throw new Exception("Ranges not yet set.");
    }

    return current;
  }

  /**
   * Returns an empty string as textual representation.
   * 
   * @return an empty string
   */
  @Override
  public String toString() {
    final String empty = "";
    return empty;
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy