All Downloads are FREE. Search and download functionalities are using the official Maven repository.

weka.core.pmml.MappingInfo Maven / Gradle / Ivy

/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    MappingInfo.java
 *    Copyright (C) 2008-2012 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.core.pmml;

import java.io.Serializable;
import java.util.ArrayList;

import weka.core.Attribute;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Utils;
import weka.gui.Logger;

/**
 * Class that maintains the mapping between incoming data set structure and that
 * of the mining schema.
 * 
 * @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
 * @version $Revision: 10203 $
 */
public class MappingInfo implements Serializable {

  /** For serialization */
  private static final long serialVersionUID = -475467721189397466L;

  /**
   * Index for incoming nominal values that are not defined in the mining
   * schema.
   */
  public static final int UNKNOWN_NOMINAL_VALUE = -1;

  /**
   * Map the incoming attributes to the mining schema attributes. Each entry
   * holds the index of the incoming attribute that corresponds to this mining
   * schema attribute.
   */
  private int[] m_fieldsMap = null;

  /**
   * Map indexes for nominal values in incoming structure to those in the mining
   * schema. There will be as many entries as there are attributes in this
   * array. Non-nominal attributes will have null entries. Each non-null entry
   * is an array of integer indexes. Each entry in a given array (for a given
   * attribute) holds the index of the mining schema value that corresponds to
   * this incoming value. UNKNOWN_NOMINAL_VALUE is used as the index for those
   * incoming values that are not defined in the mining schema.
   */
  private int[][] m_nominalValueMaps = null;

  /** Holds a textual description of the fields mapping */
  private String m_fieldsMappingText = null;

  /**
   * For logging. May be null, in which case warnings go to System.err.
   * NOTE(review): this field is not transient, so serializing a MappingInfo
   * also serializes the logger - confirm that the Logger implementations in
   * use support that.
   */
  private Logger m_log = null;

  /**
   * Builds the mapping between the attributes of an incoming data set and the
   * fields of a mining schema. Attributes are matched by name. For every
   * nominal or String mining schema field a value-index map is built as well,
   * and a warning is emitted for each incoming value that the mining schema
   * does not define.
   * <p>
   * Side effect: if the mining schema has a class attribute and the incoming
   * data set has none set, the attribute of the same name is set as the class
   * of <code>dataSet</code>.
   * 
   * @param dataSet the structure of the incoming data
   * @param miningSchema the mining schema to map the incoming data to
   * @param log the logger to send warnings to (may be null, in which case
   *          warnings are printed to System.err)
   * @throws Exception if a mining schema field has no matching incoming
   *           attribute, if the types of a matched pair are incompatible, or
   *           if the class attributes of the two structures do not agree
   */
  public MappingInfo(Instances dataSet, MiningSchema miningSchema, Logger log)
    throws Exception {
    m_log = log;
    Instances fieldsI = miningSchema.getMiningSchemaAsInstances();
    int numSchemaAtts = fieldsI.numAttributes();

    m_fieldsMap = new int[numSchemaAtts];
    m_nominalValueMaps = new int[numSchemaAtts][];

    for (int i = 0; i < numSchemaAtts; i++) {
      Attribute miningSchemaAtt = fieldsI.attribute(i);
      String schemaAttName = miningSchemaAtt.name();
      boolean found = false;

      for (int j = 0; j < dataSet.numAttributes(); j++) {
        Attribute incomingAtt = dataSet.attribute(j);
        if (!incomingAtt.name().equals(schemaAttName)) {
          continue;
        }

        checkTypesCompatible(schemaAttName, miningSchemaAtt, incomingAtt);

        // check nominal values (number, names...)
        if (miningSchemaAtt.numValues() != incomingAtt.numValues()) {
          logWarning("[MappingInfo] WARNING: incoming nominal attribute "
            + incomingAtt.name() + " does not have the same "
            + "number of values as the corresponding mining "
            + "schema attribute.");
        }
        if (miningSchemaAtt.isNominal() || miningSchemaAtt.isString()) {
          m_nominalValueMaps[i] = buildValuesMap(miningSchemaAtt, incomingAtt);
        }

        found = true;
        m_fieldsMap[i] = j;
      }

      if (!found) {
        throw new Exception(
          "[MappingInfo] Unable to find a match for mining schema "
            + "attribute " + schemaAttName + " in the " + "incoming instances!");
      }
    }

    // check class attribute (if set)
    if (fieldsI.classIndex() >= 0) {
      String className = fieldsI.classAttribute().name();
      if (dataSet.classIndex() < 0) {
        // no class set on the incoming data: look for one with a matching name
        Attribute classMatch = dataSet.attribute(className);
        if (classMatch == null) {
          throw new Exception(
            "[MappingInfo] Can't find match for target field " + className
              + " in incoming instances!");
        }
        dataSet.setClass(classMatch);
      } else if (!className.equals(dataSet.classAttribute().name())) {
        throw new Exception(
          "[MappingInfo] class attribute in mining schema does not match "
            + "class attribute in incoming instances!");
      }
    }

    // Set up the textual description of the mapping
    fieldsMappingString(fieldsI, dataSet);
  }

  /**
   * Sends a warning to the logger if one was supplied, otherwise to System.err.
   * 
   * @param message the warning text
   */
  private void logWarning(String message) {
    if (m_log != null) {
      m_log.logMessage(message);
    } else {
      System.err.println(message);
    }
  }

  /**
   * Verifies that a mining schema attribute and the incoming attribute of the
   * same name have compatible types. Differing types are only tolerated when
   * the mining schema attribute is a String and the incoming one is nominal.
   * 
   * @param schemaAttName the name of the field (used in the error message)
   * @param miningSchemaAtt the mining schema attribute
   * @param incomingAtt the matching incoming attribute
   * @throws Exception if the types are incompatible
   */
  private static void checkTypesCompatible(String schemaAttName,
    Attribute miningSchemaAtt, Attribute incomingAtt) throws Exception {
    if (miningSchemaAtt.type() == incomingAtt.type()) {
      return;
    }
    // don't worry about String attributes in the mining schema (as long as
    // the corresponding incoming is a String or nominal), since values for
    // the String attributes are more than likely revealed by FieldRef
    // elements in the actual model itself
    if (miningSchemaAtt.isString() && incomingAtt.isNominal()) {
      return;
    }
    throw new Exception("[MappingInfo] type mismatch for field "
      + schemaAttName + ". Mining schema type "
      + miningSchemaAtt.toString() + ". Incoming type "
      + incomingAtt.toString() + ".");
  }

  /**
   * Builds the value-index map for one categorical field: entry k holds the
   * index, in the mining schema attribute, of the k-th value of the incoming
   * attribute, or UNKNOWN_NOMINAL_VALUE (with a warning) if the mining schema
   * attribute does not define that value.
   * 
   * @param miningSchemaAtt the mining schema attribute
   * @param incomingAtt the matching incoming attribute
   * @return the map from incoming value indexes to mining schema value indexes
   */
  private int[] buildValuesMap(Attribute miningSchemaAtt, Attribute incomingAtt) {
    int[] valuesMap = new int[incomingAtt.numValues()];
    for (int k = 0; k < valuesMap.length; k++) {
      String incomingNomVal = incomingAtt.value(k);
      int indexInSchema = miningSchemaAtt.indexOfValue(incomingNomVal);
      if (indexInSchema < 0) {
        logWarning("[MappingInfo] WARNING: incoming nominal attribute "
          + incomingAtt.name() + " has value " + incomingNomVal
          + " that doesn't occur in the mining schema.");
        valuesMap[k] = UNKNOWN_NOMINAL_VALUE;
      } else {
        valuesMap[k] = indexInSchema;
      }
    }
    return valuesMap;
  }

  /**
   * Returns the "(numeric) name" / "(nominal) name" label used in the textual
   * mapping. Every attribute that is not numeric is labelled as nominal.
   * 
   * @param att the attribute to describe
   * @return the type label followed by the attribute name
   */
  private static String typeAndName(Attribute att) {
    return "(" + (att.isNumeric() ? "numeric)" : "nominal)") + " " + att.name();
  }

  /**
   * Builds the textual description of the fields mapping (one line per mining
   * schema field, pointing at the 1-based index, type and name of the incoming
   * attribute it is mapped to) and stores it in m_fieldsMappingText. Requires
   * m_fieldsMap to be filled in already.
   * 
   * @param miningSchemaI the mining schema as Instances
   * @param incomingI the structure of the incoming data
   */
  private void fieldsMappingString(Instances miningSchemaI, Instances incomingI) {
    StringBuilder result = new StringBuilder();

    int maxLength = 0;
    for (int i = 0; i < miningSchemaI.numAttributes(); i++) {
      maxLength = Math.max(maxLength, miningSchemaI.attribute(i).name().length());
    }
    // room for the "(nominal) "/"(numeric) " prefix (10 characters) plus 2 spare
    maxLength += 12;

    String headerS = "Mining schema";
    String sep = "-------------";

    // the first column is never narrower than its header
    maxLength = Math.max(maxLength, headerS.length());

    headerS = PMMLUtils.pad(headerS, " ", maxLength, false);
    sep = PMMLUtils.pad(sep, "-", maxLength, false);

    sep += "\t    ----------------\n";
    headerS += "\t    Incoming fields\n";
    result.append(headerS);
    result.append(sep);

    for (int i = 0; i < miningSchemaI.numAttributes(); i++) {
      String attName = PMMLUtils.pad(typeAndName(miningSchemaI.attribute(i)),
        " ", maxLength, false);
      result.append(attName).append("\t--> ");

      // incoming attribute indexes are shown 1-based
      Attribute incoming = incomingI.attribute(m_fieldsMap[i]);
      result.append(m_fieldsMap[i] + 1).append(" ")
        .append(typeAndName(incoming)).append("\n");
    }

    m_fieldsMappingText = result.toString();
  }

  /**
   * Convert an Instance to an array of values that matches the
   * format of the mining schema. First maps raw attribute values and then
   * applies rules for missing values, outliers etc. Finally, the values of any
   * derived fields are computed and appended after the mining schema fields.
   * 
   * @param inst the Instance to convert
   * @param miningSchema the mining schema incoming instance attributes
   * @return an array of doubles that are values from the incoming Instances,
   *         correspond to the format of the mining schema and have had missing
   *         values, outliers etc. dealt with.
   * @throws Exception if something goes wrong
   */
  public double[] instanceToSchema(Instance inst, MiningSchema miningSchema)
    throws Exception {
    Instances miningSchemaI = miningSchema.getMiningSchemaAsInstances();
    int numSchemaAtts = miningSchemaI.numAttributes();

    // allocate enough space for both mining schema fields and any derived
    // fields
    double[] result = new double[miningSchema.getFieldsAsInstances()
      .numAttributes()];

    // Copy over the values
    for (int i = 0; i < numSchemaAtts; i++) {
      int incomingIndex = m_fieldsMap[i];
      double incomingValue = inst.value(incomingIndex);
      result[i] = incomingValue;

      Attribute schemaAtt = miningSchemaI.attribute(i);
      boolean categorical = schemaAtt.isNominal() || schemaAtt.isString();
      if (!categorical || Utils.isMissingValue(incomingValue)) {
        // numeric values and missing values are passed through unchanged
        continue;
      }

      // translate the incoming value index into the mining schema's index
      int incomingValueIndex = (int) incomingValue;
      int index = m_nominalValueMaps[i][incomingValueIndex];
      if (index >= 0) {
        result[i] = index;
      } else {
        // set this to "unknown" (-1) for nominal valued attributes
        result[i] = UNKNOWN_NOMINAL_VALUE;
        String incomingAttValue = inst.attribute(incomingIndex).value(
          incomingValueIndex);
        logWarning("[MappingInfo] WARNING: Can't match nominal value "
          + incomingAttValue);
      }
    }

    // Now deal with missing values and outliers...
    miningSchema.applyMissingAndOutlierTreatments(result);

    // now fill in any derived values; they occupy the slots that follow the
    // mining schema fields
    ArrayList<DerivedFieldMetaInfo> derivedFields = miningSchema
      .getDerivedFields();
    for (int i = 0; i < derivedFields.size(); i++) {
      DerivedFieldMetaInfo derived = derivedFields.get(i);
      result[numSchemaAtts + i] = derived.getDerivedValue(result);
    }

    return result;
  }

  /**
   * Get a textual description of the mapping between mining schema fields and
   * incoming data fields.
   * 
   * @return a description of the fields mapping as a String
   */
  public String getFieldsMappingString() {
    if (m_fieldsMappingText == null) {
      return "No fields mapping constructed!";
    }
    return m_fieldsMappingText;
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy