// weka.core.pmml.MappingInfo Maven / Gradle / Ivy
/*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* MappingInfo.java
* Copyright (C) 2008-2012 University of Waikato, Hamilton, New Zealand
*
*/
package weka.core.pmml;
import java.io.Serializable;
import java.util.ArrayList;
import weka.core.Attribute;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Utils;
import weka.gui.Logger;
/**
* Class that maintains the mapping between incoming data set structure and that
* of the mining schema.
*
* @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
* @version $Revision: 10203 $
*/
public class MappingInfo implements Serializable {

  /** For serialization */
  private static final long serialVersionUID = -475467721189397466L;

  /**
   * Index for incoming nominal values that are not defined in the mining
   * schema.
   */
  public static final int UNKNOWN_NOMINAL_VALUE = -1;

  /**
   * Map the incoming attributes to the mining schema attributes. Each entry
   * holds the index of the incoming attribute that corresponds to this mining
   * schema attribute.
   */
  private int[] m_fieldsMap = null;

  /**
   * Map indexes for nominal values in incoming structure to those in the mining
   * schema. There will be as many entries as there are attributes in this
   * array. Non-nominal attributes will have null entries. Each non-null entry
   * is an array of integer indexes. Each entry in a given array (for a given
   * attribute) holds the index of the mining schema value that corresponds to
   * this incoming value. UNKNOWN_NOMINAL_VALUE is used as the index for those
   * incoming values that are not defined in the mining schema.
   */
  private int[][] m_nominalValueMaps = null;

  /** Holds a textual description of the fields mapping */
  private String m_fieldsMappingText = null;

  /** For logging; may be null, in which case warnings go to System.err */
  private Logger m_log = null;

  /**
   * Builds the mapping between the attributes (and nominal values) of an
   * incoming data set and those of a mining schema. Attributes are matched by
   * name.
   * <p>
   * Side effect: if the mining schema has a class attribute and the incoming
   * data set has no class set, the class of {@code dataSet} is set to the
   * attribute with the matching name.
   *
   * @param dataSet the structure of the incoming data
   * @param miningSchema the mining schema to map the incoming data to
   * @param log the log to write warnings to; if null, warnings are printed to
   *          System.err
   * @throws Exception if a mining schema attribute has no match in the
   *           incoming data, if matched attributes have incompatible types,
   *           or if the class attributes do not agree
   */
  public MappingInfo(Instances dataSet, MiningSchema miningSchema, Logger log)
    throws Exception {
    m_log = log;

    Instances fieldsI = miningSchema.getMiningSchemaAsInstances();
    int numSchemaAtts = fieldsI.numAttributes();
    m_fieldsMap = new int[numSchemaAtts];
    m_nominalValueMaps = new int[numSchemaAtts][];

    for (int i = 0; i < numSchemaAtts; i++) {
      Attribute miningSchemaAtt = fieldsI.attribute(i);
      String schemaAttName = miningSchemaAtt.name();
      boolean found = false;

      for (int j = 0; j < dataSet.numAttributes(); j++) {
        Attribute incomingAtt = dataSet.attribute(j);
        if (!incomingAtt.name().equals(schemaAttName)) {
          continue;
        }

        // Check type match. A String attribute in the mining schema paired
        // with a nominal incoming attribute is tolerated, since values for
        // the String attributes are more than likely revealed by FieldRef
        // elements in the actual model itself.
        boolean stringVsNominal = miningSchemaAtt.isString()
          && incomingAtt.isNominal();
        if (miningSchemaAtt.type() != incomingAtt.type() && !stringVsNominal) {
          throw new Exception("[MappingInfo] type mismatch for field "
            + schemaAttName + ". Mining schema type "
            + miningSchemaAtt.toString() + ". Incoming type "
            + incomingAtt.toString() + ".");
        }

        // check nominal values (number, names...)
        if (miningSchemaAtt.numValues() != incomingAtt.numValues()) {
          logWarning("[MappingInfo] WARNING: incoming nominal attribute "
            + incomingAtt.name() + " does not have the same "
            + "number of values as the corresponding mining "
            + "schema attribute.");
        }

        if (miningSchemaAtt.isNominal() || miningSchemaAtt.isString()) {
          m_nominalValueMaps[i] = buildNominalValueMap(miningSchemaAtt,
            incomingAtt);
        }

        found = true;
        m_fieldsMap[i] = j;
      }

      if (!found) {
        throw new Exception(
          "[MappingInfo] Unable to find a match for mining schema "
            + "attribute " + schemaAttName + " in the " + "incoming instances!");
      }
    }

    // check class attribute (if set)
    if (fieldsI.classIndex() >= 0) {
      String className = fieldsI.classAttribute().name();
      if (dataSet.classIndex() < 0) {
        // first see if we can find a matching class
        Attribute classMatch = dataSet.attribute(className);
        if (classMatch == null) {
          throw new Exception(
            "[MappingInfo] Can't find match for target field " + className
              + " in incoming instances!");
        }
        dataSet.setClass(classMatch);
      } else if (!className.equals(dataSet.classAttribute().name())) {
        throw new Exception(
          "[MappingInfo] class attribute in mining schema does not match "
            + "class attribute in incoming instances!");
      }
    }

    // Set up the textual description of the mapping
    fieldsMappingString(fieldsI, dataSet);
  }

  /**
   * Writes a warning to the logger if one was supplied, otherwise to
   * System.err.
   *
   * @param warning the warning text
   */
  private void logWarning(String warning) {
    if (m_log != null) {
      m_log.logMessage(warning);
    } else {
      System.err.println(warning);
    }
  }

  /**
   * Builds the lookup from value indexes of an incoming attribute to value
   * indexes of the corresponding mining schema attribute. Incoming values that
   * do not occur in the mining schema are mapped to
   * {@link #UNKNOWN_NOMINAL_VALUE} and a warning is logged for each.
   *
   * @param miningSchemaAtt the mining schema attribute
   * @param incomingAtt the matching incoming attribute
   * @return an array with one entry per incoming value
   */
  private int[] buildNominalValueMap(Attribute miningSchemaAtt,
    Attribute incomingAtt) {
    int[] valuesMap = new int[incomingAtt.numValues()];
    for (int k = 0; k < valuesMap.length; k++) {
      String incomingNomVal = incomingAtt.value(k);
      int indexInSchema = miningSchemaAtt.indexOfValue(incomingNomVal);
      if (indexInSchema < 0) {
        logWarning("[MappingInfo] WARNING: incoming nominal attribute "
          + incomingAtt.name() + " has value " + incomingNomVal
          + " that doesn't occur in the mining schema.");
        valuesMap[k] = UNKNOWN_NOMINAL_VALUE;
      } else {
        valuesMap[k] = indexInSchema;
      }
    }
    return valuesMap;
  }

  /**
   * Returns the type label used in the textual mapping description. Anything
   * that is not numeric is labelled as nominal.
   *
   * @param att the attribute to label
   * @return "(numeric)" or "(nominal)"
   */
  private static String typeLabel(Attribute att) {
    return "(" + (att.isNumeric() ? "numeric)" : "nominal)");
  }

  /**
   * Builds the two-column textual description of the mapping (mining schema
   * field --> incoming field) and stores it in m_fieldsMappingText. Requires
   * m_fieldsMap to be populated.
   *
   * @param miningSchemaI the mining schema as Instances
   * @param incomingI the structure of the incoming data
   */
  private void fieldsMappingString(Instances miningSchemaI, Instances incomingI) {
    StringBuilder result = new StringBuilder();

    // Width of the left-hand column: longest schema attribute name ...
    int maxLength = 0;
    for (int i = 0; i < miningSchemaI.numAttributes(); i++) {
      int nameLength = miningSchemaI.attribute(i).name().length();
      if (nameLength > maxLength) {
        maxLength = nameLength;
      }
    }
    // ... plus room for the "(nominal) "/"(numeric) " type prefix and some
    // slack
    maxLength += 12;

    String headerS = "Mining schema";
    String sep = "-------------";
    int minLength = headerS.length();
    if (maxLength < minLength) {
      maxLength = minLength;
    }

    headerS = PMMLUtils.pad(headerS, " ", maxLength, false);
    sep = PMMLUtils.pad(sep, "-", maxLength, false);
    sep += "\t ----------------\n";
    headerS += "\t Incoming fields\n";
    result.append(headerS);
    result.append(sep);

    for (int i = 0; i < miningSchemaI.numAttributes(); i++) {
      Attribute schemaAtt = miningSchemaI.attribute(i);
      String attName = typeLabel(schemaAtt) + " " + schemaAtt.name();
      attName = PMMLUtils.pad(attName, " ", maxLength, false);
      attName += "\t--> ";
      result.append(attName);

      // Incoming field is reported with its 1-based position
      Attribute incoming = incomingI.attribute(m_fieldsMap[i]);
      String fieldName = "" + (m_fieldsMap[i] + 1) + " "
        + typeLabel(incoming) + " " + incoming.name();
      result.append(fieldName + "\n");
    }

    m_fieldsMappingText = result.toString();
  }

  /**
   * Convert an {@code Instance} to an array of values that matches the format
   * of the mining schema. First maps raw attribute values and then applies
   * rules for missing values, outliers etc.
   *
   * @param inst the {@code Instance} to convert
   * @param miningSchema the mining schema incoming instance attributes
   * @return an array of doubles that are values from the incoming Instances,
   *         correspond to the format of the mining schema and have had missing
   *         values, outliers etc. dealt with.
   * @throws Exception if something goes wrong
   */
  public double[] instanceToSchema(Instance inst, MiningSchema miningSchema)
    throws Exception {
    Instances miningSchemaI = miningSchema.getMiningSchemaAsInstances();
    int numSchemaAtts = miningSchemaI.numAttributes();

    // allocate enough space for both mining schema fields and any derived
    // fields
    double[] result = new double[miningSchema.getFieldsAsInstances()
      .numAttributes()];

    // Copy over the values
    for (int i = 0; i < numSchemaAtts; i++) {
      int incomingIndex = m_fieldsMap[i];
      double incomingValue = inst.value(incomingIndex);
      result[i] = incomingValue;

      Attribute schemaAtt = miningSchemaI.attribute(i);
      boolean categorical = schemaAtt.isNominal() || schemaAtt.isString();
      if (!categorical || Utils.isMissingValue(incomingValue)) {
        continue;
      }

      // Not missing: look up the index of this incoming categorical value in
      // the mining schema
      int index = m_nominalValueMaps[i][(int) incomingValue];
      if (index >= 0) {
        result[i] = index;
      } else {
        // set this to "unknown" (-1) for nominal valued attributes
        result[i] = UNKNOWN_NOMINAL_VALUE;
        String incomingAttValue = inst.attribute(incomingIndex).value(
          (int) incomingValue);
        logWarning("[MappingInfo] WARNING: Can't match nominal value "
          + incomingAttValue);
      }
    }

    // Now deal with missing values and outliers...
    miningSchema.applyMissingAndOutlierTreatments(result);

    // now fill in any derived values; they are stored after the mining
    // schema fields
    ArrayList<DerivedFieldMetaInfo> derivedFields = miningSchema
      .getDerivedFields();
    for (int i = 0; i < derivedFields.size(); i++) {
      DerivedFieldMetaInfo derived = derivedFields.get(i);
      double r = derived.getDerivedValue(result);
      result[i + numSchemaAtts] = r;
    }

    return result;
  }

  /**
   * Get a textual description of the mapping between mining schema fields and
   * incoming data fields.
   *
   * @return a description of the fields mapping as a String
   */
  public String getFieldsMappingString() {
    if (m_fieldsMappingText == null) {
      return "No fields mapping constructed!";
    }
    return m_fieldsMappingText;
  }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy