/*
* Copyright (c) 2010-2021 Haifeng Li. All rights reserved.
*
* Smile is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Smile is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
 * along with Smile. If not, see <https://www.gnu.org/licenses/>.
*/
package smile.feature.selection;
import java.util.function.BiFunction;
import smile.classification.Classifier;
import smile.classification.DataFrameClassifier;
import smile.data.DataFrame;
import smile.data.formula.Formula;
import smile.gap.*;
import smile.math.MathEx;
import smile.regression.DataFrameRegression;
import smile.regression.Regression;
import smile.validation.metric.*;
/**
* Genetic algorithm based feature selection. This method finds many (random)
* subsets of variables of expected classification power using a Genetic
* Algorithm. The "fitness" of each subset of variables is determined by its
* ability to classify the samples according to a given classification
* method. When many such subsets of variables are obtained, the one with the best
* performance may be used as selected features. Alternatively, the frequencies
* with which variables are selected may be analyzed further. The most
* frequently selected variables may be presumed to be the most relevant to
* sample distinction and are finally used for prediction. Although GA avoids
* brute-force search, it is still much slower than univariate feature selection.
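 * <p>
 * A minimal usage sketch (assuming training data {@code x, y} and hold-out
 * data {@code testx, testy} are already defined; the 3-NN trainer and the
 * accuracy metric are illustrative choices, not requirements):
 * <pre>{@code
 * GAFE gafe = new GAFE();
 * BitString[] population = gafe.apply(100, 20, x[0].length,
 *     GAFE.fitness(x, y, testx, testy, new Accuracy(),
 *                  (xi, yi) -> KNN.fit(xi, yi, 3)));
 * }</pre>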
*
 * <h2>References</h2>
 * <ol>
 * <li>Leping Li and Clarice R. Weinberg. Gene Selection and Sample Classification Using a Genetic Algorithm/k-Nearest Neighbor Method.</li>
 * </ol>
 *
* @author Haifeng Li
*/
public class GAFE {
/**
* Selection strategy.
*/
private final Selection selection;
    /**
     * The number of best chromosomes to copy to the new population. When
     * creating a new population by crossover and mutation, there is a big
     * chance that we will lose the best chromosome. Elitism first copies the
     * best chromosome (or a few best chromosomes) to the new population; the
     * rest is generated in the classical way. Elitism can rapidly increase
     * the performance of a GA because it prevents losing the best solution
     * found so far.
     */
private final int elitism;
/**
* Crossover strategy.
*/
private final Crossover crossover;
/**
* Crossover rate.
*/
private final double crossoverRate;
/**
* Mutation rate.
* The mutation parameters are set higher than usual to prevent premature convergence.
*/
private final double mutationRate;
/**
* Constructor.
*/
public GAFE() {
this(Selection.Tournament(3, 0.95), 1, Crossover.TWO_POINT, 1.0, 0.01);
}
/**
* Constructor.
* @param selection the selection strategy.
* @param elitism the number of best chromosomes to copy to new population.
* @param crossover the strategy of crossover operation.
* @param crossoverRate the crossover rate.
* @param mutationRate the mutation rate.
*/
public GAFE(Selection selection, int elitism, Crossover crossover, double crossoverRate, double mutationRate) {
this.selection = selection;
this.elitism = elitism;
this.crossover = crossover;
this.crossoverRate = crossoverRate;
this.mutationRate = mutationRate;
}
/**
* Genetic algorithm based feature selection for classification.
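     * For example, one may scan the returned population for the fittest bit
     * string (a sketch; {@code gafe}, {@code p} and {@code fitness} stand
     * for values defined elsewhere):
     * <pre>{@code
     * BitString[] population = gafe.apply(100, 20, p, fitness);
     * BitString best = population[0];
     * for (BitString b : population) {
     *     if (b.fitness() > best.fitness()) best = b;
     * }
     * }</pre>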
* @param size the population size of Genetic Algorithm.
* @param generation the maximum number of iterations.
* @param length the length of bit string, i.e. the number of features.
* @param fitness the fitness function.
* @return bit strings of last generation.
*/
    public BitString[] apply(int size, int generation, int length, Fitness<BitString> fitness) {
if (size <= 0) {
throw new IllegalArgumentException("Invalid population size: " + size);
}
BitString[] seeds = new BitString[size];
for (int i = 0; i < size; i++) {
seeds[i] = new BitString(length, fitness, crossover, crossoverRate, mutationRate);
}
        GeneticAlgorithm<BitString> ga = new GeneticAlgorithm<>(seeds, selection, elitism);
ga.evolve(generation);
return seeds;
}
    /**
     * Returns the indices of the set bits (1's).
     * @param bits the bit string of selected features.
     * @return the indices of the set bits, or null if no bit is set.
     */
private static int[] indexOf(byte[] bits) {
int p = MathEx.sum(bits);
if (p == 0) return null;
int[] index = new int[p];
for (int i = 0, ii = 0; i < bits.length; i++) {
if (bits[i] == 1) index[ii++] = i;
}
return index;
}
    /**
     * Returns the data with selected features.
     * @param x the data matrix.
     * @param features the indices of selected features.
     * @return the data with selected features.
     */
private static double[][] select(double[][] x, int[] features) {
int p = features.length;
int n = x.length;
double[][] xx = new double[n][p];
for (int i = 0; i < n; i++) {
for (int j = 0; j < p; j++) {
xx[i][j] = x[i][features[j]];
}
}
return xx;
}
/**
* Returns the fitness of the classification model.
*
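     * A sketch of wiring a trainer lambda (LDA is an illustrative choice;
     * any trainer producing a {@code Classifier<double[]>} works):
     * <pre>{@code
     * Fitness<BitString> fitness = GAFE.fitness(x, y, testx, testy,
     *     new Accuracy(), (xi, yi) -> LDA.fit(xi, yi));
     * }</pre>
     *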
* @param x training samples.
* @param y training labels.
* @param testx testing samples.
* @param testy testing labels.
* @param metric classification metric.
* @param trainer the lambda to train a model.
* @return the fitness of model.
*/
    public static Fitness<BitString> fitness(double[][] x, int[] y, double[][] testx, int[] testy, ClassificationMetric metric, BiFunction<double[][], int[], Classifier<double[]>> trainer) {
return chromosome -> {
byte[] bits = chromosome.bits();
int[] features = indexOf(bits);
if (features == null) return 0.0;
double[][] xx = select(x, features);
double[][] testxx = select(testx, features);
            Classifier<double[]> model = trainer.apply(xx, y);
return metric.score(testy, model.predict(testxx));
};
}
/**
* Returns the fitness of the regression model.
*
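     * The score is negated because the genetic algorithm maximizes fitness
     * while regression metrics such as RMSE measure error. A sketch, where
     * {@code MyModels.train} is a hypothetical helper returning a
     * {@code Regression<double[]>}:
     * <pre>{@code
     * Fitness<BitString> fitness = GAFE.fitness(x, y, testx, testy,
     *     new RMSE(), MyModels::train);
     * }</pre>
     *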
* @param x training samples.
* @param y training response.
* @param testx testing samples.
* @param testy testing response.
     * @param metric regression metric.
* @param trainer the lambda to train a model.
* @return the fitness of model.
*/
    public static Fitness<BitString> fitness(double[][] x, double[] y, double[][] testx, double[] testy, RegressionMetric metric, BiFunction<double[][], double[], Regression<double[]>> trainer) {
return chromosome -> {
byte[] bits = chromosome.bits();
int[] features = indexOf(bits);
if (features == null) return Double.NEGATIVE_INFINITY;
double[][] xx = select(x, features);
double[][] testxx = select(testx, features);
            Regression<double[]> model = trainer.apply(xx, y);
            // Negate the error so that the GA, which maximizes fitness, minimizes the metric.
            return -metric.score(testy, model.predict(testxx));
};
}
/** Returns the selected features. */
private static String[] selectedFeatures(byte[] bits, String[] names, String y) {
int p = MathEx.sum(bits);
if (p == 0) return null;
        // The bit string indexes predictors only. After the scan passes the
        // response column y, shift by one to skip it in the names array.
        int offset = 0;
String[] features = new String[p];
for (int i = 0, ii = 0; i < bits.length; i++) {
if (names[i].equals(y)) {
offset++;
}
if (bits[i] == 1) {
features[ii++] = names[i+offset];
}
}
return features;
}
/**
* Returns the fitness of the classification model.
*
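     * A sketch with a formula-based learner (the column name "class" and
     * the random forest trainer are illustrative assumptions):
     * <pre>{@code
     * Fitness<BitString> fitness = GAFE.fitness("class", train, test,
     *     new Accuracy(), RandomForest::fit);
     * }</pre>
     *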
* @param y the column name of class labels.
* @param train training data.
* @param test testing data.
* @param metric classification metric.
* @param trainer the lambda to train a model.
* @return the fitness of model.
*/
    public static Fitness<BitString> fitness(String y, DataFrame train, DataFrame test, ClassificationMetric metric, BiFunction<Formula, DataFrame, DataFrameClassifier> trainer) {
String[] names = train.names();
int[] testy = test.column(y).toIntArray();
return chromosome -> {
byte[] bits = chromosome.bits();
String[] features = selectedFeatures(bits, names, y);
if (features == null) return 0.0;
Formula formula = Formula.of(y, features);
DataFrameClassifier model = trainer.apply(formula, train);
return metric.score(testy, model.predict(test));
};
}
/**
* Returns the fitness of the regression model.
*
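     * A sketch (the column name "y" and the OLS trainer are illustrative
     * assumptions; the score is negated as in the array-based overload):
     * <pre>{@code
     * Fitness<BitString> fitness = GAFE.fitness("y", train, test,
     *     new RMSE(), OLS::fit);
     * }</pre>
     *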
* @param y the column name of response variable.
* @param train training data.
* @param test testing data.
     * @param metric regression metric.
* @param trainer the lambda to train a model.
* @return the fitness of model.
*/
    public static Fitness<BitString> fitness(String y, DataFrame train, DataFrame test, RegressionMetric metric, BiFunction<Formula, DataFrame, DataFrameRegression> trainer) {
String[] names = train.names();
double[] testy = test.column(y).toDoubleArray();
return chromosome -> {
byte[] bits = chromosome.bits();
String[] features = selectedFeatures(bits, names, y);
if (features == null) return Double.NEGATIVE_INFINITY;
Formula formula = Formula.of(y, features);
DataFrameRegression model = trainer.apply(formula, train);
return -metric.score(testy, model.predict(test));
};
}
}