/*
* Copyright (c) 2010-2021 Haifeng Li. All rights reserved.
*
* Smile is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Smile is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Smile. If not, see <https://www.gnu.org/licenses/>.
*/
package smile.base.mlp;
import smile.math.MathEx;
import java.io.Serializable;
/**
* The activation function in hidden layers.
*
* @author Haifeng Li
*/
public interface ActivationFunction extends Serializable {
/**
* Returns the name of the activation function.
* @return the name of the activation function.
*/
String name();
/**
* The output function. The computation is done in place, i.e. the input
* vector is overwritten with the activated output.
* @param x the input vector.
*/
void f(double[] x);
/**
* The gradient function.
* @param g the gradient vector. On input, it holds W'*g, where W and g are
* the weight matrix and the gradient of the upper layer, respectively.
* On output, it is overwritten with the gradient of this layer.
* @param y the output vector.
*/
void g(double[] g, double[] y);
/**
* Linear/Identity activation function.
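*
* <p>A minimal in-place usage sketch (names and values are illustrative):
* the identity leaves the output unchanged and, since its derivative is 1,
* the back-propagated gradient also passes through unchanged.
* <pre>{@code
* ActivationFunction id = ActivationFunction.linear();
* double[] y = {-1.0, 2.0};
* id.f(y);                  // y stays {-1.0, 2.0}
* double[] g = {0.3, -0.7}; // on input: W' * g of the upper layer
* id.g(g, y);               // g stays {0.3, -0.7}
* }</pre>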
*
* @return the linear activation function.
*/
static ActivationFunction linear() {
return new ActivationFunction() {
@Override
public String name() {
return "LINEAR";
}
@Override
public void f(double[] x) {
// Identity function keeps the input as is.
}
@Override
public void g(double[] g, double[] y) {
// The derivative of the identity function is 1, so the gradient
// passes through unchanged.
}
};
}
/**
* The rectifier activation function {@code max(0, x)}.
* It was introduced with strong biological motivations and mathematical
* justifications. The rectifier is the most popular activation function
* for deep neural networks. A unit employing the rectifier is called a
* rectified linear unit (ReLU).
*
* <p>ReLU neurons can sometimes be pushed into states in which they become
* inactive for essentially all inputs. In this state, no gradients flow
* backward through the neuron, and so the neuron becomes stuck in a
* perpetually inactive state and "dies". This is a form of the vanishing
* gradient problem. In some cases, large numbers of neurons in a network
* can become stuck in dead states, effectively decreasing the model
* capacity. This problem typically arises when the learning rate is
* set too high. It may be mitigated by using leaky ReLUs instead,
* which assign a small positive slope for {@code x < 0}, although this
* may reduce performance.
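*
* <p>A minimal in-place usage sketch (names and values are illustrative).
* Note that units with non-positive output receive zero gradient:
* <pre>{@code
* ActivationFunction relu = ActivationFunction.rectifier();
* double[] y = {-2.0, 0.5, 3.0};
* relu.f(y);                    // y becomes {0.0, 0.5, 3.0}
* double[] g = {1.0, 1.0, 1.0}; // on input: W' * g of the upper layer
* relu.g(g, y);                 // g becomes {0.0, 1.0, 1.0}
* }</pre>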
*
* @return the rectifier activation function.
*/
static ActivationFunction rectifier() {
return new ActivationFunction() {
@Override
public String name() {
return "RECTIFIER";
}
@Override
public void f(double[] x) {
for (int i = 0; i < x.length; i++) {
x[i] = Math.max(0.0, x[i]);
}
}
@Override
public void g(double[] g, double[] y) {
for (int i = 0; i < g.length; i++) {
g[i] *= y[i] > 0 ? 1 : 0;
}
}
};
}
/**
* The leaky rectifier activation function {@code max(x, 0.01x)}.
*
* @return the leaky rectifier activation function.
*/
static ActivationFunction leaky() {
return leaky(0.01);
}
/**
* The leaky rectifier activation function {@code max(x, ax)} where
* {@code 0 <= a < 1}. By default {@code a = 0.01}. Leaky ReLUs allow
* a small, positive gradient when the unit is not active.
* They are also related to "maxout" networks.
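*
* <p>A minimal in-place usage sketch (names and values are illustrative).
* Inactive units pass a fraction {@code a} of the gradient instead of zero:
* <pre>{@code
* ActivationFunction leaky = ActivationFunction.leaky(0.01);
* double[] y = {-2.0, 3.0};
* leaky.f(y);               // y becomes {-0.02, 3.0}
* double[] g = {1.0, 1.0};  // on input: W' * g of the upper layer
* leaky.g(g, y);            // g becomes {0.01, 1.0}
* }</pre>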
*
* @param a the parameter of leaky ReLU.
* @return the leaky rectifier activation function.
*/
static ActivationFunction leaky(double a) {
if (a < 0 || a >= 1.0) {
throw new IllegalArgumentException("Invalid Leaky ReLU parameter: " + a);
}
return new ActivationFunction() {
@Override
public String name() {
return String.format("LEAKEY_RECTIFIER(%f)", a);
}
@Override
public void f(double[] x) {
for (int i = 0; i < x.length; i++) {
x[i] = Math.max(a * x[i], x[i]);
}
}
@Override
public void g(double[] g, double[] y) {
for (int i = 0; i < g.length; i++) {
g[i] *= y[i] > 0 ? 1 : a;
}
}
};
}
/**
* Logistic sigmoid function: {@code sigmoid(v) = 1 / (1 + exp(-v))}.
* For multi-class classification, each unit in the output layer
* corresponds to a class. For binary classification with the cross
* entropy error function, there is only one output unit, whose
* value can be regarded as the posterior probability.
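*
* <p>A minimal in-place usage sketch (names are illustrative, values rounded).
* The gradient uses the identity {@code sigmoid'(v) = y * (1 - y)}:
* <pre>{@code
* ActivationFunction sigmoid = ActivationFunction.sigmoid();
* double[] y = {0.0, 2.0};
* sigmoid.f(y);             // y becomes {0.5, 0.8808}
* double[] g = {1.0, 1.0};  // on input: W' * g of the upper layer
* sigmoid.g(g, y);          // g becomes {0.25, 0.105}
* }</pre>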
*
* @return the logistic sigmoid activation function.
*/
static ActivationFunction sigmoid() {
return new ActivationFunction() {
@Override
public String name() {
return "SIGMOID";
}
@Override
public void f(double[] x) {
for (int i = 0; i < x.length; i++) {
x[i] = MathEx.sigmoid(x[i]);
}
}
@Override
public void g(double[] g, double[] y) {
for (int i = 0; i < g.length; i++) {
g[i] *= y[i] * (1.0 - y[i]);
}
}
};
}
/**
* Hyperbolic tangent activation function. The tanh function is a
* rescaling of the logistic sigmoid, such that its outputs range
* from -1 to 1.
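*
* <p>A minimal in-place usage sketch (names are illustrative, values rounded).
* The gradient uses the identity {@code tanh'(v) = 1 - y * y}:
* <pre>{@code
* ActivationFunction tanh = ActivationFunction.tanh();
* double[] y = {0.0, 1.0};
* tanh.f(y);                // y becomes {0.0, 0.7616}
* double[] g = {1.0, 1.0};  // on input: W' * g of the upper layer
* tanh.g(g, y);             // g becomes {1.0, 0.42}
* }</pre>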
*
* @return the hyperbolic tangent activation function.
*/
static ActivationFunction tanh() {
return new ActivationFunction() {
@Override
public String name() {
return "TANH";
}
@Override
public void f(double[] x) {
for (int i = 0; i < x.length; i++) {
x[i] = Math.tanh(x[i]);
}
}
@Override
public void g(double[] g, double[] y) {
for (int i = 0; i < g.length; i++) {
// The derivative of tanh is 1 - tanh(x)^2, i.e. 1 - y^2, not (1 - y)^2.
g[i] *= 1.0 - y[i] * y[i];
}
}
};
}
}