/*
 * Source listing of moa.classifiers.lazy.neighboursearch.NormalizableDistance
 * from the "moa" artifact (available via Maven / Gradle / Ivy).
 *
 * Massive On-line Analysis (MOA) is an environment for massive data mining.
 * MOA provides a framework for data stream mining and includes tools for
 * evaluation and a collection of machine learning algorithms. It is related
 * to the WEKA project, is also written in Java, and scales to more demanding
 * problems.
 *
 * Note: a newer version of this artifact exists (2024.07.0).
 */
/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    NormalizableDistance.java
 *    Copyright (C) 2007-2012 University of Waikato, Hamilton, New Zealand
 *
 */

package moa.classifiers.lazy.neighboursearch;

import com.yahoo.labs.samoa.instances.Attribute;
import com.yahoo.labs.samoa.instances.Instance;
import com.yahoo.labs.samoa.instances.Instances;

/**
 * Represents the abstract ancestor for normalizable distance functions, like
 * Euclidean or Manhattan distance.
 *
 * @author Fracpete (fracpete at waikato dot ac dot nz)
 * @author Gabi Schmidberger (gabi at cs dot waikato dot ac dot nz) -- original code from weka.core.EuclideanDistance
 * @author Ashraf M. Kibriya (amk14 at cs dot waikato dot ac dot nz) -- original code from weka.core.EuclideanDistance
 * @version $Revision: 8034 $
 */
public abstract class NormalizableDistance
  implements DistanceFunction {

  /** Index in ranges for MIN. */
  public static final int R_MIN = 0;

  /** Index in ranges for MAX. */
  public static final int R_MAX = 1;

  /** Index in ranges for WIDTH. */
  public static final int R_WIDTH = 2;

  /** The instances used internally. */
  protected Instances m_Data = null;

  /** True if normalization is turned off (default false). */
  protected boolean m_DontNormalize = false;

  /**
   * The range of the attributes: one row per attribute, each row indexed by
   * {@link #R_MIN}, {@link #R_MAX} and {@link #R_WIDTH}.
   */
  protected double[][] m_Ranges;

  /** The boolean flags, whether an attribute will be used or not. */
  protected boolean[] m_ActiveIndices;

  /** Whether all the necessary preparations have been done. */
  protected boolean m_Validated;

  /**
   * Invalidates the distance function; the instances still have to be set
   * via {@link #setInstances(Instances)}.
   */
  public NormalizableDistance() {
    invalidate();
  }

  /**
   * Stores the given instances and invalidates the distance function, so
   * that the ranges get initialized on the next call that needs them.
   *
   * @param data 	the instances the distance function should work on
   */
  public NormalizableDistance(Instances data) {
    setInstances(data);
  }

  /**
   * Returns a string describing this object.
   *
   * @return 		a description of the evaluator suitable for
   * 			displaying in the explorer/experimenter gui
   */
  public abstract String globalInfo();

  /**
   * Returns the tip text for this property.
   *
   * @return 		tip text for this property suitable for
   *         		displaying in the explorer/experimenter gui
   */
  public String dontNormalizeTipText() {
    return "Whether if the normalization of attributes should be turned off " +
           "for distance calculation (Default: false i.e. attribute values " +
           "are normalized). ";
  }

  /**
   * Sets whether the attribute values are to be normalized in the distance
   * calculation. Invalidates the distance function.
   *
   * @param dontNormalize	if true the values are not normalized
   */
  public void setDontNormalize(boolean dontNormalize) {
    m_DontNormalize = dontNormalize;
    invalidate();
  }

  /**
   * Gets whether the attribute values are to be normalized in the distance
   * calculation (default false, i.e. attribute values are normalized).
   *
   * @return		false if values get normalized
   */
  public boolean getDontNormalize() {
    return m_DontNormalize;
  }

  /**
   * Returns the tip text for this property.
   *
   * @return 		tip text for this property suitable for
   * 			displaying in the explorer/experimenter gui
   */
  public String attributeIndicesTipText() {
    return
        "Specify range of attributes to act on. "
      + "This is a comma separated list of attribute indices, with "
      + "\"first\" and \"last\" valid values. Specify an inclusive "
      + "range with \"-\". E.g: \"first-3,5,6-10,last\".";
  }

  /**
   * Sets the range of attributes to use in the calculation of the distance.
   * The indices start from 1, 'first' and 'last' are valid as well.
   * E.g.: first-3,5,6-last
   * <p>
   * NOTE: attribute range selection is disabled in this class; the given
   * value is ignored and all attributes are treated as active (see
   * {@link #initializeAttributeIndices()}). The call only invalidates the
   * distance function.
   *
   * @param value	the new attribute index range (currently ignored)
   */
  public void setAttributeIndices(String value) {
    invalidate();
  }

  /**
   * Gets the range of attributes used in the calculation of the distance.
   * <p>
   * NOTE: attribute range selection is disabled in this class, so no range
   * string is stored and this method always returns {@code null}.
   *
   * @return		always {@code null}
   */
  public String getAttributeIndices() {
    return null;
  }

  /**
   * Returns the tip text for this property.
   *
   * @return 		tip text for this property suitable for
   * 			displaying in the explorer/experimenter gui
   */
  public String invertSelectionTipText() {
    return
        "Set attribute selection mode. If false, only selected "
      + "attributes in the range will be used in the distance calculation; if "
      + "true, only non-selected attributes will be used for the calculation.";
  }

  /**
   * Sets whether the matching sense of attribute indices is inverted or not.
   * <p>
   * NOTE: attribute range selection is disabled in this class; the given
   * value is ignored and the call only invalidates the distance function.
   *
   * @param value	if true the matching sense is inverted (currently ignored)
   */
  public void setInvertSelection(boolean value) {
    invalidate();
  }

  /**
   * Gets whether the matching sense of attribute indices is inverted or not.
   * <p>
   * NOTE: attribute range selection is disabled in this class, so this
   * method always returns {@code false}.
   *
   * @return		always {@code false}
   */
  public boolean getInvertSelection() {
    return false;
  }

  /**
   * Invalidates all initializations, so that the next call to
   * {@link #validate()} re-initializes the active attributes and the ranges.
   */
  protected void invalidate() {
    m_Validated = false;
  }

  /**
   * Performs the initializations if necessary.
   */
  protected void validate() {
    if (!m_Validated) {
      initialize();
      m_Validated = true;
    }
  }

  /**
   * Initializes the ranges and the attributes being used.
   */
  protected void initialize() {
    initializeAttributeIndices();
    initializeRanges();
  }

  /**
   * Initializes the attribute flags: every attribute of the current dataset
   * is marked as active. If no dataset has been set yet, the flags become an
   * empty array, mirroring {@link #initializeRanges()} which tolerates a
   * missing dataset as well (this lets {@link #getRanges()} report
   * "Ranges not yet set." instead of failing with a NullPointerException).
   */
  protected void initializeAttributeIndices() {
    if (m_Data == null) {
      m_ActiveIndices = new boolean[0];
      return;
    }

    m_ActiveIndices = new boolean[m_Data.numAttributes()];
    for (int i = 0; i < m_ActiveIndices.length; i++) {
      m_ActiveIndices[i] = true;
    }
  }

  /**
   * Sets the instances and invalidates the distance function.
   *
   * @param insts 	the instances to use
   */
  public void setInstances(Instances insts) {
    m_Data = insts;
    invalidate();
  }

  /**
   * Returns the instances currently set.
   *
   * @return 		the current instances
   */
  public Instances getInstances() {
    return m_Data;
  }

  /**
   * Does nothing, derived classes may override it though.
   *
   * @param distances	the distances to post-process
   */
  public void postProcessDistances(double[] distances) {
  }

  /**
   * Updates the distance function (if necessary) for the newly added
   * instance: the stored ranges are widened to cover the instance's values.
   *
   * @param ins		the instance to add
   */
  public void update(Instance ins) {
    validate();

    m_Ranges = updateRanges(ins, m_Ranges);
  }

  /**
   * Calculates the distance between two instances, without any cut-off.
   *
   * @param first 	the first instance
   * @param second 	the second instance
   * @return 		the distance between the two given instances
   */
  public double distance(Instance first, Instance second) {
    return distance(first, second, Double.POSITIVE_INFINITY);
  }

  /**
   * Calculates the distance between two instances. Offers speed up (if the
   * distance function class in use supports it) in nearest neighbour search by
   * taking into account the cutOff or maximum distance. Depending on the
   * distance function class, post processing of the distances by
   * postProcessDistances(double []) may be required if this function is used.
   * <p>
   * Both instances are walked in parallel over their stored (sparse) values;
   * an attribute that is stored in only one of the two instances is compared
   * against the value 0. The class attribute and inactive attributes are
   * skipped.
   *
   * @param first 	the first instance
   * @param second 	the second instance
   * @param cutOffValue If the distance being calculated becomes larger than
   *                    cutOffValue then the rest of the calculation is
   *                    discarded.
   * @return 		the distance between the two given instances or
   * 			Double.POSITIVE_INFINITY if the distance being
   * 			calculated becomes larger than cutOffValue.
   */
  public double distance(Instance first, Instance second, double cutOffValue) {
    validate();

    double distance = 0;
    int firstNumValues = first.numValues();
    int secondNumValues = second.numValues();
    int numAttributes = m_Data.numAttributes();
    int classIndex = m_Data.classIndex();

    int p1 = 0;
    int p2 = 0;
    while (p1 < firstNumValues || p2 < secondNumValues) {
      // An exhausted instance is represented by the sentinel index
      // numAttributes, which is larger than every real attribute index.
      int firstI = (p1 >= firstNumValues) ? numAttributes : first.index(p1);
      int secondI = (p2 >= secondNumValues) ? numAttributes : second.index(p2);

      // Skip the class attribute and inactive attributes on either side.
      if (firstI == classIndex) {
        p1++;
        continue;
      }
      if ((firstI < numAttributes) && !m_ActiveIndices[firstI]) {
        p1++;
        continue;
      }
      if (secondI == classIndex) {
        p2++;
        continue;
      }
      if ((secondI < numAttributes) && !m_ActiveIndices[secondI]) {
        p2++;
        continue;
      }

      double diff;
      if (firstI == secondI) {
        // Attribute stored in both instances.
        diff = difference(firstI, first.valueSparse(p1), second.valueSparse(p2));
        p1++;
        p2++;
      }
      else if (firstI > secondI) {
        // Attribute stored only in the second instance.
        diff = difference(secondI, 0, second.valueSparse(p2));
        p2++;
      }
      else {
        // Attribute stored only in the first instance.
        diff = difference(firstI, first.valueSparse(p1), 0);
        p1++;
      }

      distance = updateDistance(distance, diff);
      if (distance > cutOffValue) {
        return Double.POSITIVE_INFINITY;
      }
    }

    return distance;
  }

  /**
   * Updates the current distance calculated so far with the new difference
   * between two attributes. The difference between the attributes was
   * calculated with the difference(int,double,double) method.
   *
   * @param currDist	the current distance calculated so far
   * @param diff	the difference between two new attributes
   * @return		the update distance
   * @see		#difference(int, double, double)
   */
  protected abstract double updateDistance(double currDist, double diff);

  /**
   * Normalizes a given value of a numeric attribute using the stored ranges.
   *
   * @param x 		the value to be normalized
   * @param i 		the attribute's index
   * @return		the normalized value, or 0 if the stored minimum is NaN
   * 			or the stored minimum and maximum are equal
   */
  protected double norm(double x, int i) {
    double[] range = m_Ranges[i];
    if (Double.isNaN(range[R_MIN]) || (range[R_MAX] == range[R_MIN])) {
      return 0;
    }
    return (x - range[R_MIN]) / range[R_WIDTH];
  }

  /**
   * Computes the difference between two given attribute values.
   * <p>
   * Nominal attributes yield 0 if both values are present and equal, and 1
   * otherwise. For other attributes the (optionally normalized) values are
   * subtracted; if one or both values are missing, a large difference
   * derived from the attribute's range is returned instead.
   *
   * @param index	the attribute index
   * @param val1	the first value
   * @param val2	the second value
   * @return		the difference
   */
  protected double difference(int index, double val1, double val2) {
    if (m_Data.attribute(index).isNominal()) {
      boolean differs = isMissingValue(val1)
                     || isMissingValue(val2)
                     || ((int) val1 != (int) val2);
      return differs ? 1 : 0;
    }

    boolean missing1 = isMissingValue(val1);
    boolean missing2 = isMissingValue(val2);

    // Both values present: plain (optionally normalized) difference.
    if (!missing1 && !missing2) {
      return m_DontNormalize
          ? (val1 - val2)
          : (norm(val1, index) - norm(val2, index));
    }

    // Both values missing: the full width of the attribute.
    if (missing1 && missing2) {
      return m_DontNormalize
          ? (m_Ranges[index][R_MAX] - m_Ranges[index][R_MIN])
          : 1;
    }

    // Exactly one value missing: take the larger of the two distances from
    // the known value to the ends of the attribute's range.
    double known = missing2 ? val1 : val2;
    if (m_DontNormalize) {
      double toMax = m_Ranges[index][R_MAX] - known;
      double toMin = known - m_Ranges[index][R_MIN];
      return (toMax > toMin) ? toMax : toMin;
    }

    double diff = norm(known, index);
    return (diff < 0.5) ? (1.0 - diff) : diff;
  }

  /**
   * Initializes the ranges using all instances of the dataset.
   * Sets m_Ranges (to null if no dataset has been set).
   *
   * @return 		the ranges
   */
  public double[][] initializeRanges() {
    if (m_Data == null) {
      m_Ranges = null;
      return m_Ranges;
    }

    int numAtt = m_Data.numAttributes();
    double[][] ranges = new double[numAtt][3];

    if (m_Data.numInstances() <= 0) {
      initializeRangesEmpty(numAtt, ranges);
    }
    else {
      // initialize ranges using the first instance ...
      updateRangesFirst(m_Data.instance(0), numAtt, ranges);
      // ... and widen them with all remaining instances
      for (int i = 1; i < m_Data.numInstances(); i++) {
        updateRanges(m_Data.instance(i), numAtt, ranges);
      }
    }

    m_Ranges = ranges;
    return m_Ranges;
  }

  /**
   * Used to initialize the ranges. For this the values of the first
   * instance are used to save time.
   * Sets low and high to the values of the first instance and
   * width to zero. Attributes whose value is missing get the same "empty"
   * range that {@link #initializeRangesEmpty(int, double[][])} assigns.
   *
   * @param instance 	the new instance
   * @param numAtt 	number of attributes in the model
   * @param ranges 	low, high and width values for all attributes
   */
  public void updateRangesFirst(Instance instance, int numAtt, double[][] ranges) {
    for (int j = 0; j < numAtt; j++) {
      if (instance.isMissing(j)) {
        ranges[j][R_MIN] = Double.POSITIVE_INFINITY;
        ranges[j][R_MAX] = -Double.POSITIVE_INFINITY;
        ranges[j][R_WIDTH] = Double.POSITIVE_INFINITY;
      }
      else {
        double value = instance.value(j);
        ranges[j][R_MIN] = value;
        ranges[j][R_MAX] = value;
        ranges[j][R_WIDTH] = 0.0;
      }
    }
  }

  /**
   * Updates the minimum and maximum and width values for all the attributes
   * based on a new instance. Missing values leave the range untouched.
   *
   * @param instance 	the new instance
   * @param numAtt 	number of attributes in the model
   * @param ranges 	low, high and width values for all attributes; must
   * 			have been initialized (e.g. by updateRangesFirst)
   */
  public void updateRanges(Instance instance, int numAtt, double[][] ranges) {
    for (int j = 0; j < numAtt; j++) {
      if (!instance.isMissing(j)) {
        widenRange(ranges[j], instance.value(j));
      }
    }
  }

  /**
   * Used to initialize the ranges when no values are available: the minimum
   * is set to +infinity, the maximum to -infinity and the width to
   * +infinity.
   *
   * @param numAtt 	number of attributes in the model
   * @param ranges 	low, high and width values for all attributes
   */
  public void initializeRangesEmpty(int numAtt, double[][] ranges) {
    for (int j = 0; j < numAtt; j++) {
      ranges[j][R_MIN] = Double.POSITIVE_INFINITY;
      ranges[j][R_MAX] = -Double.POSITIVE_INFINITY;
      ranges[j][R_WIDTH] = Double.POSITIVE_INFINITY;
    }
  }

  /**
   * Updates the ranges given a new instance. Behaves like
   * {@link #updateRanges(Instance, int, double[][])} applied to all rows of
   * the given ranges; in particular, a range that is still "empty"
   * (minimum +infinity, maximum -infinity) gets both its minimum and its
   * maximum set by the first non-missing value.
   *
   * @param instance 	the new instance
   * @param ranges 	low, high and width values for all attributes
   * @return		the updated ranges (the same array that was passed in)
   */
  public double[][] updateRanges(Instance instance, double[][] ranges) {
    for (int j = 0; j < ranges.length; j++) {
      if (!instance.isMissing(j)) {
        widenRange(ranges[j], instance.value(j));
      }
    }

    return ranges;
  }

  /**
   * Widens a single attribute range so that it covers the given value and
   * recomputes the width if the minimum or maximum changed. Both bounds are
   * checked independently, because for an "empty" range (minimum +infinity,
   * maximum -infinity) the first value is below the minimum and above the
   * maximum at the same time.
   *
   * @param range	the {R_MIN, R_MAX, R_WIDTH} triple of one attribute
   * @param value	the (non-missing) value to cover
   */
  private static void widenRange(double[] range, double value) {
    boolean changed = false;

    if (value < range[R_MIN]) {
      range[R_MIN] = value;
      changed = true;
    }
    if (value > range[R_MAX]) {
      range[R_MAX] = value;
      changed = true;
    }

    if (changed) {
      range[R_WIDTH] = range[R_MAX] - range[R_MIN];
    }
  }

  /**
   * Initializes the ranges of a subset of the instances of this dataset.
   * m_Ranges is not set by this method. If the dataset or the index list is
   * empty, "empty" ranges are returned.
   *
   * @param instList 	list of indexes of the subset
   * @return 		the ranges
   * @throws Exception	if no instances have been set
   */
  public double[][] initializeRanges(int[] instList) throws Exception {
    if (m_Data == null) {
      throw new Exception("No instances supplied.");
    }

    int numAtt = m_Data.numAttributes();
    double[][] ranges = new double[numAtt][3];

    // An empty index list is handled like an empty dataset instead of
    // failing on instList[0].
    if (m_Data.numInstances() <= 0 || instList.length == 0) {
      initializeRangesEmpty(numAtt, ranges);
      return ranges;
    }

    // initialize ranges using the first listed instance ...
    updateRangesFirst(m_Data.instance(instList[0]), numAtt, ranges);
    // ... and widen them with the remaining listed instances
    for (int i = 1; i < instList.length; i++) {
      updateRanges(m_Data.instance(instList[i]), numAtt, ranges);
    }

    return ranges;
  }

  /**
   * Initializes the ranges of a subset of the instances of this dataset.
   * m_Ranges is not set by this method.
   * The caller of this method should ensure that the supplied start and end
   * indices are valid (start <= end, end<instList.length etc) and
   * correct.
   *
   * @param instList 	list of indexes of the instances
   * @param startIdx 	start index of the subset of instances in the indices array
   * @param endIdx 	end index (inclusive) of the subset of instances in the indices array
   * @return 		the ranges
   * @throws Exception	if no instances have been set
   */
  public double[][] initializeRanges(int[] instList, int startIdx, int endIdx) throws Exception {
    if (m_Data == null) {
      throw new Exception("No instances supplied.");
    }

    int numAtt = m_Data.numAttributes();
    double[][] ranges = new double[numAtt][3];

    if (m_Data.numInstances() <= 0) {
      initializeRangesEmpty(numAtt, ranges);
      return ranges;
    }

    // initialize ranges using the first instance of the subset ...
    updateRangesFirst(m_Data.instance(instList[startIdx]), numAtt, ranges);
    // ... and widen them with the remaining instances up to endIdx
    for (int i = startIdx + 1; i <= endIdx; i++) {
      updateRanges(m_Data.instance(instList[i]), numAtt, ranges);
    }

    return ranges;
  }

  /**
   * Updates the stored ranges (m_Ranges) if a new instance comes.
   *
   * @param instance 	the new instance
   */
  public void updateRanges(Instance instance) {
    validate();

    m_Ranges = updateRanges(instance, m_Ranges);
  }

  /**
   * Tests if an instance is within the given ranges. Missing values are
   * ignored; every other value has to lie between the minimum and maximum
   * (both inclusive) of its attribute.
   *
   * @param instance 	the instance
   * @param ranges 	the ranges the instance is tested to be in
   * @return 		true if instance is within the ranges
   */
  public boolean inRanges(Instance instance, double[][] ranges) {
    for (int j = 0; j < ranges.length; j++) {
      if (instance.isMissing(j)) {
        continue;
      }

      double value = instance.value(j);
      boolean within = (value <= ranges[j][R_MAX]) && (value >= ranges[j][R_MIN]);
      if (!within) {
        return false;
      }
    }

    return true;
  }

  /**
   * Checks if ranges are set.
   *
   * @return 		true if ranges are set
   */
  public boolean rangesSet() {
    return (m_Ranges != null);
  }

  /**
   * Returns the ranges, initializing them first if necessary.
   *
   * @return 		the ranges
   * @throws Exception	if no ranges are set yet
   */
  public double[][] getRanges() throws Exception {
    validate();

    if (m_Ranges == null) {
      throw new Exception("Ranges not yet set.");
    }

    return m_Ranges;
  }

  /**
   * Returns an empty string.
   *
   * @return		an empty string
   */
  @Override
  public String toString() {
    return "";
  }

  /**
   * Tests if the given value codes "missing", i.e. is NaN.
   *
   * @param val the value to be tested
   * @return true if val codes "missing"
   */
  public static boolean isMissingValue(double val) {
    return Double.isNaN(val);
  }
}