/*
* Copyright (c) 2010-2021 Haifeng Li. All rights reserved.
*
* Smile is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Smile is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
 * along with Smile. If not, see <https://www.gnu.org/licenses/>.
*/
package smile.feature.selection;
import java.util.function.BiFunction;
import smile.classification.Classifier;
import smile.classification.DataFrameClassifier;
import smile.data.DataFrame;
import smile.data.formula.Formula;
import smile.gap.*;
import smile.math.MathEx;
import smile.regression.DataFrameRegression;
import smile.regression.Regression;
import smile.validation.metric.*;
/**
* Genetic algorithm based feature selection. This method finds many (random)
* subsets of variables of expected classification power using a Genetic
* Algorithm. The "fitness" of each subset of variables is determined by its
* ability to classify the samples according to a given classification
* method. When many such subsets of variables are obtained, the one with the best
* performance may be used as selected features. Alternatively, the frequencies
* with which variables are selected may be analyzed further. The most
* frequently selected variables may be presumed to be the most relevant to
* sample distinction and are finally used for prediction. Although GA avoids
* brute-force search, it is still much slower than univariate feature selection.
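 * <p>
 * A minimal usage sketch (assuming training data {@code x, y} and hold-out
 * data {@code testx, testy} are already defined; the 3-NN trainer and the
 * accuracy metric are illustrative choices, not requirements):
 * <pre>{@code
 * GAFE gafe = new GAFE();
 * BitString[] population = gafe.apply(100, 20, x[0].length,
 *     GAFE.fitness(x, y, testx, testy, new Accuracy(),
 *                  (xi, yi) -> KNN.fit(xi, yi, 3)));
 * }</pre>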
*
 * <h2>References</h2>
 * <ol>
 * <li>Leping Li and Clarice R. Weinberg. Gene Selection and Sample Classification Using a Genetic Algorithm/k-Nearest Neighbor Method.</li>
 * </ol>
 *
* @author Haifeng Li
*/
public class GAFE {
/**
* Selection strategy.
*/
private final Selection selection;
    /**
     * The number of best chromosomes to copy to the new population. When
     * creating a new population by crossover and mutation, there is a big
     * chance that we will lose the best chromosome. Elitism first copies the
     * best chromosome (or a few best chromosomes) to the new population; the
     * rest is generated in the classical way. Elitism can rapidly increase
     * the performance of a GA because it prevents losing the best solution
     * found so far.
     */
private final int elitism;
/**
* Crossover strategy.
*/
private final Crossover crossover;
/**
* Crossover rate.
*/
private final double crossoverRate;
/**
* Mutation rate.
* The mutation parameters are set higher than usual to prevent premature convergence.
*/
private final double mutationRate;
/**
* Constructor.
*/
public GAFE() {
this(Selection.Tournament(3, 0.95), 1, Crossover.TWO_POINT, 1.0, 0.01);
}
/**
* Constructor.
* @param selection the selection strategy.
* @param elitism the number of best chromosomes to copy to new population.
* @param crossover the strategy of crossover operation.
* @param crossoverRate the crossover rate.
* @param mutationRate the mutation rate.
*/
public GAFE(Selection selection, int elitism, Crossover crossover, double crossoverRate, double mutationRate) {
this.selection = selection;
this.elitism = elitism;
this.crossover = crossover;
this.crossoverRate = crossoverRate;
this.mutationRate = mutationRate;
}
/**
* Genetic algorithm based feature selection for classification.
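     * For example, one may scan the returned population for the fittest bit
     * string (a sketch; {@code gafe}, {@code p} and {@code fitness} stand
     * for values defined elsewhere):
     * <pre>{@code
     * BitString[] population = gafe.apply(100, 20, p, fitness);
     * BitString best = population[0];
     * for (BitString b : population) {
     *     if (b.fitness() > best.fitness()) best = b;
     * }
     * }</pre>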
* @param size the population size of Genetic Algorithm.
* @param generation the maximum number of iterations.
* @param length the length of bit string, i.e. the number of features.
* @param fitness the fitness function.
* @return bit strings of last generation.
*/
    public BitString[] apply(int size, int generation, int length, Fitness<BitString> fitness) {
if (size <= 0) {
throw new IllegalArgumentException("Invalid population size: " + size);
}
BitString[] seeds = new BitString[size];
for (int i = 0; i < size; i++) {
seeds[i] = new BitString(length, fitness, crossover, crossoverRate, mutationRate);
}
        GeneticAlgorithm<BitString> ga = new GeneticAlgorithm<>(seeds, selection, elitism);
ga.evolve(generation);
return seeds;
}
    /**
     * Returns the indices of the set bits (1's).
     * @param bits the bit string of selected features.
     * @return the indices of the set bits, or null if no bit is set.
     */
private static int[] indexOf(byte[] bits) {
int p = MathEx.sum(bits);
if (p == 0) return null;
int[] index = new int[p];
for (int i = 0, ii = 0; i < bits.length; i++) {
if (bits[i] == 1) index[ii++] = i;
}
return index;
}
    /**
     * Returns the data with selected features.
     * @param x the data matrix.
     * @param features the indices of selected features.
     * @return the data with selected features.
     */
private static double[][] select(double[][] x, int[] features) {
int p = features.length;
int n = x.length;
double[][] xx = new double[n][p];
for (int i = 0; i < n; i++) {
for (int j = 0; j < p; j++) {
xx[i][j] = x[i][features[j]];
}
}
return xx;
}
/**
* Returns the fitness of the classification model.
*
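     * A sketch of wiring a trainer lambda (LDA is an illustrative choice;
     * any trainer producing a {@code Classifier<double[]>} works):
     * <pre>{@code
     * Fitness<BitString> fitness = GAFE.fitness(x, y, testx, testy,
     *     new Accuracy(), (xi, yi) -> LDA.fit(xi, yi));
     * }</pre>
     *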
* @param x training samples.
* @param y training labels.
* @param testx testing samples.
* @param testy testing labels.
* @param metric classification metric.
* @param trainer the lambda to train a model.
* @return the fitness of model.
*/
    public static Fitness<BitString> fitness(double[][] x, int[] y, double[][] testx, int[] testy, ClassificationMetric metric, BiFunction<double[][], int[], Classifier<double[]>> trainer) {
return chromosome -> {
byte[] bits = chromosome.bits();
int[] features = indexOf(bits);
if (features == null) return 0.0;
double[][] xx = select(x, features);
double[][] testxx = select(testx, features);
            Classifier<double[]> model = trainer.apply(xx, y);
return metric.score(testy, model.predict(testxx));
};
}
/**
* Returns the fitness of the regression model.
*
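     * The score is negated because the genetic algorithm maximizes fitness
     * while regression metrics such as RMSE measure error. A sketch, where
     * {@code MyModels.train} is a hypothetical helper returning a
     * {@code Regression<double[]>}:
     * <pre>{@code
     * Fitness<BitString> fitness = GAFE.fitness(x, y, testx, testy,
     *     new RMSE(), MyModels::train);
     * }</pre>
     *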
* @param x training samples.
* @param y training response.
* @param testx testing samples.
* @param testy testing response.
     * @param metric regression metric.
* @param trainer the lambda to train a model.
* @return the fitness of model.
*/
    public static Fitness<BitString> fitness(double[][] x, double[] y, double[][] testx, double[] testy, RegressionMetric metric, BiFunction<double[][], double[], Regression<double[]>> trainer) {
return chromosome -> {
byte[] bits = chromosome.bits();
int[] features = indexOf(bits);
if (features == null) return Double.NEGATIVE_INFINITY;
double[][] xx = select(x, features);
double[][] testxx = select(testx, features);
            Regression<double[]> model = trainer.apply(xx, y);
            // Negate the error so that the GA, which maximizes fitness, minimizes the metric.
            return -metric.score(testy, model.predict(testxx));
};
}
/** Returns the selected features. */
private static String[] selectedFeatures(byte[] bits, String[] names, String y) {
int p = MathEx.sum(bits);
if (p == 0) return null;
        // The bit string indexes predictors only. After the scan passes the
        // response column y, shift by one to skip it in the names array.
        int offset = 0;
String[] features = new String[p];
for (int i = 0, ii = 0; i < bits.length; i++) {
if (names[i].equals(y)) {
offset++;
}
if (bits[i] == 1) {
features[ii++] = names[i+offset];
}
}
return features;
}
/**
* Returns the fitness of the classification model.
*
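     * A sketch with a formula-based learner (the column name "class" and
     * the random forest trainer are illustrative assumptions):
     * <pre>{@code
     * Fitness<BitString> fitness = GAFE.fitness("class", train, test,
     *     new Accuracy(), RandomForest::fit);
     * }</pre>
     *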
* @param y the column name of class labels.
* @param train training data.
* @param test testing data.
* @param metric classification metric.
* @param trainer the lambda to train a model.
* @return the fitness of model.
*/
    public static Fitness<BitString> fitness(String y, DataFrame train, DataFrame test, ClassificationMetric metric, BiFunction<Formula, DataFrame, DataFrameClassifier> trainer) {
String[] names = train.names();
int[] testy = test.column(y).toIntArray();
return chromosome -> {
byte[] bits = chromosome.bits();
String[] features = selectedFeatures(bits, names, y);
if (features == null) return 0.0;
Formula formula = Formula.of(y, features);
DataFrameClassifier model = trainer.apply(formula, train);
return metric.score(testy, model.predict(test));
};
}
/**
* Returns the fitness of the regression model.
*
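     * A sketch (the column name "y" and the OLS trainer are illustrative
     * assumptions; the score is negated as in the array-based overload):
     * <pre>{@code
     * Fitness<BitString> fitness = GAFE.fitness("y", train, test,
     *     new RMSE(), OLS::fit);
     * }</pre>
     *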
* @param y the column name of response variable.
* @param train training data.
* @param test testing data.
     * @param metric regression metric.
* @param trainer the lambda to train a model.
* @return the fitness of model.
*/
    public static Fitness<BitString> fitness(String y, DataFrame train, DataFrame test, RegressionMetric metric, BiFunction<Formula, DataFrame, DataFrameRegression> trainer) {
String[] names = train.names();
double[] testy = test.column(y).toDoubleArray();
return chromosome -> {
byte[] bits = chromosome.bits();
String[] features = selectedFeatures(bits, names, y);
if (features == null) return Double.NEGATIVE_INFINITY;
Formula formula = Formula.of(y, features);
DataFrameRegression model = trainer.apply(formula, train);
return -metric.score(testy, model.predict(test));
};
}
}