// smile.feature.NumericAttributeFeature
/*******************************************************************************
* Copyright (c) 2010 Haifeng Li
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
package smile.feature;
import smile.math.Math;
import smile.data.Attribute;
import smile.data.NumericAttribute;
import smile.sort.QuickSelect;
/**
* Numeric attribute normalization/standardization feature generator.
* Many machine learning methods such as Neural Networks and SVM with Gaussian
* kernel also require the features properly scaled/standardized. For example,
* each variable is scaled into interval [0, 1] or to have mean 0 and standard
* deviation 1.
*
* @author Haifeng Li
*/
public class NumericAttributeFeature implements Feature {
    /**
     * The types of data scaling.
     */
    public static enum Scaling {
        /**
         * No scaling at all.
         */
        NONE,
        /**
         * Takes logarithms of input data when they contain order-of-magnitude
         * larger and smaller values. Note logarithms are defined only for
         * positive values.
         */
        LOGARITHM,
        /**
         * Normalization scales all numeric variables into the range [0, 1].
         * If the dataset has outliers, normalization will certainly scale
         * the "normal" data to a very small interval. In this case, the
         * Winsorization procedure should be applied: values greater than the
         * specified upper limit are replaced with the upper limit, and those
         * below the lower limit are replaced with the lower limit. Often, the
         * specified range is indicated in terms of percentiles of the original
         * distribution (like the 5th and 95th percentile).
         */
        NORMALIZATION,
        /**
         * Standardization transforms a variable to have zero mean and unit
         * variance. Standardization makes an assumption that the data follows
         * a Gaussian distribution and is not robust when outliers are present.
         * A robust alternative is to subtract the median and divide by the IQR.
         */
        STANDARDIZATION
    }

    /**
     * The variable attributes of the input data.
     */
    private Attribute[] attributes;
    /**
     * The attributes of the generated (scaled) features.
     */
    private Attribute[] features;
    /**
     * A map from feature id to the original attribute index.
     */
    private int[] map;
    /**
     * The type of scaling applied by {@link #f(double[], int)}.
     */
    private Scaling scaling;
    /**
     * For normalization, this is min (or lower limit).
     * For standardization, this is mean (or median).
     */
    private double[] a;
    /**
     * For normalization, this is max - min (or upper limit - lower limit).
     * For standardization, this is the standard deviation (or IQR).
     */
    private double[] b;

    /**
     * Returns the number of numeric attributes in the given array.
     */
    private static int countNumeric(Attribute[] attributes) {
        int p = 0;
        for (Attribute attribute : attributes) {
            if (attribute instanceof NumericAttribute) {
                p++;
            }
        }
        return p;
    }

    /**
     * Constructor. Scales numeric attributes into proper range. For logarithm
     * scaling, the attributes must have positive values.
     * @param attributes the variable attributes. Of which, numeric variables
     * will be scaled.
     * @param scaling the way of scaling. The scaling type must be NONE or
     * LOGARITHM because they do not need training data.
     * @throws IllegalArgumentException if the scaling type requires training data.
     */
    public NumericAttributeFeature(Attribute[] attributes, Scaling scaling) {
        // NORMALIZATION and STANDARDIZATION need statistics learned from data;
        // those are only available through the data-taking constructors.
        if (scaling != Scaling.NONE && scaling != Scaling.LOGARITHM) {
            throw new IllegalArgumentException("Invalid scaling operation without training data: " + scaling);
        }

        this.attributes = attributes;
        this.scaling = scaling;

        int p = countNumeric(attributes);
        features = new Attribute[p];
        map = new int[p];

        for (int i = 0, j = 0; j < attributes.length; j++) {
            Attribute attribute = attributes[j];
            if (attribute instanceof NumericAttribute) {
                if (scaling == Scaling.NONE) {
                    // Identity transform: reuse the original attribute.
                    features[i] = attribute;
                } else {
                    features[i] = new NumericAttribute(attribute.name + "_" + scaling, attribute.description, attribute.weight);
                }
                map[i++] = j;
            }
        }
    }

    /**
     * Constructor. Scales numeric attributes into proper range. For logarithm
     * scaling, the attributes must have positive values. In case of
     * normalization, the min and max values of attributes are used as lower
     * and upper limits. For standardization, variables are scaled to have zero
     * mean and unit variance.
     * @param attributes the variable attributes. Of which, numeric variables
     * will be scaled.
     * @param scaling the way of scaling.
     * @param data the training data to learn scaling parameters.
     * @throws IllegalArgumentException if a numeric attribute has constant values.
     */
    public NumericAttributeFeature(Attribute[] attributes, Scaling scaling, double[][] data) {
        this.attributes = attributes;
        this.scaling = scaling;

        int n = data.length;
        int p = countNumeric(attributes);

        features = new Attribute[p];
        map = new int[p];
        a = new double[p];
        b = new double[p];

        // Reusable column buffer, refilled for each numeric attribute.
        double[] x = new double[n];
        for (int i = 0, j = 0; j < attributes.length; j++) {
            Attribute attribute = attributes[j];
            if (attribute instanceof NumericAttribute) {
                if (scaling == Scaling.NONE) {
                    features[i] = attribute;
                } else {
                    features[i] = new NumericAttribute(attribute.name + "_" + scaling, attribute.description, attribute.weight);
                    if (scaling == Scaling.NORMALIZATION || scaling == Scaling.STANDARDIZATION) {
                        // Extract column j of the training data.
                        for (int k = 0; k < n; k++) {
                            x[k] = data[k][j];
                        }

                        if (scaling == Scaling.NORMALIZATION) {
                            a[i] = Math.min(x);
                            b[i] = Math.max(x) - a[i];
                            // A zero range would cause division by zero in f().
                            if (b[i] == 0.0) {
                                throw new IllegalArgumentException("Attribute " + attribute + " has constant values.");
                            }
                        }

                        if (scaling == Scaling.STANDARDIZATION) {
                            a[i] = Math.mean(x);
                            b[i] = Math.sd(x);
                            // A zero standard deviation would cause division by zero in f().
                            if (b[i] == 0.0) {
                                throw new IllegalArgumentException("Attribute " + attribute + " has constant values.");
                            }
                        }
                    }
                }
                map[i++] = j;
            }
        }
    }

    /**
     * Constructor. Normalizes numeric attributes with Winsorization: values
     * greater than the specified upper limit are replaced with the upper
     * limit, and those below the lower limit are replaced with the lower limit.
     * The specified lower/upper limits are indicated in terms of percentiles of
     * the original distribution.
     * @param attributes the variable attributes. Of which, numeric variables
     * will be normalized.
     * @param lower the lower limit in terms of percentiles of the original
     * distribution (say 5th percentile), in the range [0, 0.5].
     * @param upper the upper limit in terms of percentiles of the original
     * distribution (say 95th percentile), in the range [0.5, 1].
     * @param data the training data to learn scaling parameters.
     * @throws IllegalArgumentException if the limits are out of range or a
     * numeric attribute has constant values between the limits.
     */
    public NumericAttributeFeature(Attribute[] attributes, double lower, double upper, double[][] data) {
        if (lower < 0.0 || lower > 0.5) {
            throw new IllegalArgumentException("Invalid lower limit: " + lower);
        }

        // Bug fix: the original checked "lower > 1.0" here, which never
        // validated the upper limit at all (lower <= 0.5 was already enforced).
        if (upper < 0.5 || upper > 1.0) {
            throw new IllegalArgumentException("Invalid upper limit: " + upper);
        }

        if (upper <= lower) {
            throw new IllegalArgumentException("Invalid lower and upper limit pair: " + lower + " >= " + upper);
        }

        this.attributes = attributes;
        this.scaling = Scaling.NORMALIZATION;

        int n = data.length;
        int p = countNumeric(attributes);

        // Convert percentiles to order-statistic ranks.
        int i1 = (int) Math.round(lower * n);
        int i2 = (int) Math.round(upper * n);
        if (i2 == n) {
            // Clamp to the last valid index when upper rounds to n.
            i2 = n - 1;
        }

        features = new Attribute[p];
        map = new int[p];
        a = new double[p];
        b = new double[p];

        // Reusable column buffer; QuickSelect partially reorders it, but it is
        // refilled for each attribute and selections operate on the same multiset.
        double[] x = new double[n];
        for (int i = 0, j = 0; j < attributes.length; j++) {
            Attribute attribute = attributes[j];
            if (attribute instanceof NumericAttribute) {
                features[i] = new NumericAttribute(attribute.name + "_" + scaling, attribute.description, attribute.weight);
                for (int k = 0; k < n; k++) {
                    x[k] = data[k][j];
                }

                a[i] = QuickSelect.select(x, i1);
                b[i] = QuickSelect.select(x, i2) - a[i];
                if (b[i] == 0.0) {
                    throw new IllegalArgumentException("Attribute " + attribute + " has constant values in the given range.");
                }
                map[i++] = j;
            }
        }
    }

    /**
     * Constructor. Robustly standardizes numeric attributes by subtracting
     * the median and dividing by the IQR.
     * @param attributes the variable attributes. Of which, numeric variables
     * will be standardized.
     * @param data the training data to learn scaling parameters.
     * @throws IllegalArgumentException if a numeric attribute has constant
     * values between Q1 and Q3.
     */
    public NumericAttributeFeature(Attribute[] attributes, double[][] data) {
        this.attributes = attributes;
        this.scaling = Scaling.STANDARDIZATION;

        int n = data.length;
        int p = countNumeric(attributes);

        features = new Attribute[p];
        map = new int[p];
        a = new double[p];
        b = new double[p];

        // Reusable column buffer, refilled for each numeric attribute.
        double[] x = new double[n];
        for (int i = 0, j = 0; j < attributes.length; j++) {
            Attribute attribute = attributes[j];
            if (attribute instanceof NumericAttribute) {
                features[i] = new NumericAttribute(attribute.name + "_" + scaling, attribute.description, attribute.weight);
                for (int k = 0; k < n; k++) {
                    x[k] = data[k][j];
                }

                a[i] = QuickSelect.median(x);
                b[i] = QuickSelect.q3(x) - QuickSelect.q1(x);
                if (b[i] == 0.0) {
                    throw new IllegalArgumentException("Attribute " + attribute + " has constant values between Q1 and Q3.");
                }
                map[i++] = j;
            }
        }
    }

    @Override
    public Attribute[] attributes() {
        return features;
    }

    /**
     * Generates the feature value of the given object.
     * @param object the input instance; must have the same length as the
     * attribute array given to the constructor.
     * @param id the feature id.
     * @return the scaled value of the mapped numeric attribute. For
     * NORMALIZATION the result is clamped into [0, 1].
     * @throws IllegalArgumentException if the object size or feature id is
     * invalid, or the value is non-positive under LOGARITHM scaling.
     */
    @Override
    public double f(double[] object, int id) {
        if (object.length != attributes.length) {
            throw new IllegalArgumentException(String.format("Invalid object size %d, expected %d", object.length, attributes.length));
        }

        if (id < 0 || id >= features.length) {
            throw new IllegalArgumentException("Invalid feature id: " + id);
        }

        double x = object[map[id]];
        switch (scaling) {
        case NONE:
            return x;

        case LOGARITHM:
            if (x <= 0.0) {
                throw new IllegalArgumentException("Invalid value for logarithm: " + x);
            }
            return Math.log(x);

        case NORMALIZATION:
            // Clamp so that out-of-training-range (or winsorized) values
            // stay inside [0, 1].
            double y = (x - a[id]) / b[id];
            if (y < 0.0) y = 0.0;
            if (y > 1.0) y = 1.0;
            return y;

        case STANDARDIZATION:
            return (x - a[id]) / b[id];
        }

        throw new IllegalStateException("Impossible to reach here.");
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy