All Downloads are FREE. Search and download functionalities are using the official Maven repository.

smile.feature.selection.InformationValue Maven / Gradle / Ivy

The newest version!
/*
 * Copyright (c) 2010-2021 Haifeng Li. All rights reserved.
 *
 * Smile is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Smile is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Smile.  If not, see <https://www.gnu.org/licenses/>.
 */
package smile.feature.selection;

import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.stream.IntStream;

import smile.classification.ClassLabels;
import smile.data.DataFrame;
import smile.data.measure.NominalScale;
import smile.data.transform.ColumnTransform;
import smile.data.type.StructField;
import smile.data.type.StructType;
import smile.data.vector.BaseVector;
import smile.math.Function;
import smile.sort.QuickSort;

/**
 * Information Value (IV) measures the predictive strength of a feature
 * for a binary dependent variable. IV is essentially a weighted sum of
 * all the individual Weight of Evidence (WoE) values, where the weights
 * incorporate the absolute difference between the numerator and the
 * denominator (WoE captures the relative difference). Note that the
 * weight follows the same sign as WoE, hence ensuring that the IV is
 * always a positive number.
 * <p>
 * IV is a good measure of the predictive power of a feature. It also
 * helps point out suspicious features. Unlike other feature selection
 * methods, the features selected using IV might not be the best feature
 * set for non-linear model building.
 * <p>
 * Interpretation of Information Value:
 * <ul>
 * <li>&lt; 0.02 &mdash; useless</li>
 * <li>0.02 to 0.1 &mdash; weak predictor</li>
 * <li>0.1 to 0.3 &mdash; medium predictor</li>
 * <li>0.3 to 0.5 &mdash; strong predictor</li>
 * <li>&gt; 0.5 &mdash; suspicious</li>
 * </ul>
 * <p>
 * Weight of Evidence (WoE) measures the predictive power of every
 * bin/category of a feature for a binary dependent variable. As
 * implemented here, for each bin
 * <pre>
 *     WoE = ln (percentage of non-events / percentage of events)
 * </pre>
 * Note that the conditional log odds is exactly what a logistic
 * regression model tries to predict.
 * <p>
 * WoE values of a categorical variable can be used to convert a
 * categorical feature to a numerical feature. If a continuous feature
 * does not have a linear relationship with the log odds, the feature
 * can be binned into groups and a new feature created by replacing each
 * bin with its WoE value. Therefore, WoE is a good variable
 * transformation method for logistic regression.
 * <p>
 * On arranging a numerical feature in ascending order, if the WoE
 * values are all linear, we know that the feature has the right linear
 * relation with the target. However, if the feature's WoE is
 * non-linear, we should either discard it or consider some other
 * variable transformation to ensure the linearity. Hence, WoE helps
 * check the linear relationship of a feature with its dependent
 * variable to be used in the model. Though WoE and IV are highly
 * useful, always ensure that they are only used with logistic
 * regression.
 * <p>
 * WoE is better than one-hot encoding as it does not increase the
 * complexity of the model.
 *
 * @param feature The feature name.
 * @param iv The information value.
 * @param woe The weight of evidence.
 * @param breaks The breakpoints of intervals for numerical variables.
 * @author Haifeng Li
 */
public record InformationValue(String feature, double iv, double[] woe, double[] breaks) implements Comparable<InformationValue> {
    @Override
    public int compareTo(InformationValue other) {
        // Natural order: ascending information value.
        return Double.compare(iv, other.iv);
    }

    @Override
    public String toString() {
        return String.format("InformationValue(%s, %.4f)", feature, iv);
    }

    /**
     * Returns the description of the predictive power of an information value.
     * @param iv information value.
     * @return the description of predictive power.
     */
    private static String predictivePower(double iv) {
        if (Double.isNaN(iv)) return "";

        if (iv < 0.02) return "Not useful";
        else if (iv <= 0.1) return "Weak";
        else if (iv <= 0.3) return "Medium";
        else if (iv <= 0.5) return "Strong";
        else return "Suspicious";
    }

    /**
     * Returns a string representation of an array of information values.
     * @param ivs the array of information values.
     * @return a string representation of information values.
     */
    public static String toString(InformationValue[] ivs) {
        StringBuilder builder = new StringBuilder();
        // Header widths match the "%-25s %17.4f %16s" row format below.
        builder.append("Feature                   Information Value Predictive Power\n");
        for (var iv : ivs) {
            builder.append(String.format("%-25s %17.4f %16s%n", iv.feature, iv.iv, predictivePower(iv.iv)));
        }
        return builder.toString();
    }

    /**
     * Returns the data transformation that converts feature values
     * to their weight of evidence.
     * @param values the information value objects of features.
     * @return the transform.
     */
    public static ColumnTransform toTransform(InformationValue[] values) {
        Map<String, Function> transforms = new HashMap<>();
        for (InformationValue iv : values) {
            Function transform = new Function() {
                @Override
                public double f(double x) {
                    int i;
                    if (iv.breaks == null) {
                        // Nominal variable: x is the category index.
                        i = (int) x;
                        if (i < 0 || i >= iv.woe.length) {
                            throw new IllegalArgumentException("Invalid nominal value: " + i);
                        }
                        return iv.woe[i];
                    } else {
                        // Numeric variable: locate the bin containing x.
                        // binarySearch returns (-(insertion point) - 1) on a miss.
                        i = Arrays.binarySearch(iv.breaks, x);
                        if (i < 0) i = -i - 1;
                    }
                    return iv.woe[i];
                }

                @Override
                public String toString() {
                    return iv.feature + "_WoE";
                }
            };

            transforms.put(iv.feature, transform);
        }

        return new ColumnTransform("WoE", transforms);
    }

    /**
     * Calculates the information value, discretizing numeric variables
     * into 10 bins.
     *
     * @param data the data frame of the explanatory and response variables.
     * @param clazz the column name of binary class labels.
     * @return the information value.
     */
    public static InformationValue[] fit(DataFrame data, String clazz) {
        return fit(data, clazz, 10);
    }

    /**
     * Calculates the information value.
     *
     * @param data the data frame of the explanatory and response variables.
     * @param clazz the column name of binary class labels.
     * @param nbins the number of bins to discretize numeric variables in WoE calculation.
     * @return the information value.
     */
    public static InformationValue[] fit(DataFrame data, String clazz, int nbins) {
        if (nbins < 2) {
            throw new IllegalArgumentException("Invalid number of bins: " + nbins);
        }

        BaseVector y = data.column(clazz);
        ClassLabels codec = ClassLabels.fit(y);

        if (codec.k != 2) {
            throw new UnsupportedOperationException("Information Value is applicable only to binary classification");
        }

        int n = data.nrow();
        StructType schema = data.schema();

        return IntStream.range(0, schema.length()).mapToObj(i -> {
            int[] events;
            int[] nonevents;
            double[] breaks = null;

            StructField field = schema.field(i);
            if (field.measure instanceof NominalScale scale) {
                // Nominal variable: one bin per category.
                int k = scale.size();
                events = new int[k];
                nonevents = new int[k];

                int[] xi = data.column(i).toIntArray();
                for (int j = 0; j < n; j++) {
                    if (codec.y[j] == 1) {
                        events[xi[j]]++;
                    } else {
                        nonevents[xi[j]]++;
                    }
                }
            } else if (field.isNumeric()) {
                // Numeric variable: equal-frequency binning into nbins intervals.
                events = new int[nbins];
                nonevents = new int[nbins];
                breaks = new double[nbins - 1];

                double[] xi = data.column(i).toDoubleArray();
                int[] order = QuickSort.sort(xi);
                int begin = 0;
                for (int j = 0; j < nbins; j++) {
                    int end = (j + 1) * n / nbins;
                    if (j < nbins - 1) breaks[j] = xi[end];

                    for (int k = begin; k < end; k++) {
                        if (codec.y[order[k]] == 1) {
                            events[j]++;
                        } else {
                            nonevents[j]++;
                        }
                    }
                    begin = end;
                }
            } else {
                // Skip columns that are neither nominal nor numeric.
                return null;
            }

            int k = events.length;
            double[] woe = new double[k];
            double iv = 0.0;
            for (int j = 0; j < k; j++) {
                // Clamp zero counts to 0.5 to avoid log(0) and division by zero.
                double pnonevents = Math.max(nonevents[j], 0.5) / codec.ni[0];
                double pevents = Math.max(events[j], 0.5) / codec.ni[1];
                woe[j] = Math.log(pnonevents / pevents);
                iv += (pnonevents - pevents) * woe[j];
            }

            return new InformationValue(field.name, iv, woe, breaks);
        }).filter(iv -> iv != null && !iv.feature.equals(clazz)).toArray(InformationValue[]::new);
    }
}





© 2015 - 2025 Weber Informatics LLC | Privacy Policy