smile.base.mlp.Layer Maven / Gradle / Ivy
The newest version!
/*
* Copyright (c) 2010-2021 Haifeng Li. All rights reserved.
*
* Smile is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Smile is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Smile. If not, see .
*/
package smile.base.mlp;
import java.io.IOException;
import java.io.Serial;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import smile.math.MathEx;
import smile.math.matrix.Matrix;
import smile.util.Regex;
/**
* A layer in the neural network.
*
* @author Haifeng Li
*/
public abstract class Layer implements Serializable {
@Serial
private static final long serialVersionUID = 2L;
/**
* The number of neurons in this layer
*/
protected final int n;
/**
* The number of input variables.
*/
protected final int p;
/**
* The dropout rate. Dropout randomly sets input units to 0 with this rate
* at each step during training time, which helps prevent overfitting.
*/
protected final double dropout;
/**
* The affine transformation matrix.
*/
protected Matrix weight;
/**
* The bias.
*/
protected double[] bias;
/**
* The output vector.
*/
protected transient ThreadLocal output;
/**
* The output gradient.
*/
protected transient ThreadLocal outputGradient;
/**
* The weight gradient.
*/
protected transient ThreadLocal weightGradient;
/**
* The bias gradient.
*/
protected transient ThreadLocal biasGradient;
/**
* The first moment of weight gradient.
*/
protected transient ThreadLocal weightGradientMoment1;
/**
* The second moment of weight gradient.
*/
protected transient ThreadLocal weightGradientMoment2;
/**
* The first moment of bias gradient.
*/
protected transient ThreadLocal biasGradientMoment1;
/**
* The second moment of bias gradient.
*/
protected transient ThreadLocal biasGradientMoment2;
/**
* The weight update.
*/
protected transient ThreadLocal weightUpdate;
/**
* The bias update.
*/
protected transient ThreadLocal biasUpdate;
/**
* The dropout mask.
*/
protected transient ThreadLocal mask;
/**
* Constructor for input layer.
*
* @param n the number of neurons.
* @param dropout the dropout rate.
*/
Layer(int n, double dropout) {
if (dropout < 0.0 || dropout >= 1.0) {
throw new IllegalArgumentException("Invalid dropout rate: " + dropout);
}
this.n = n;
this.p = n;
this.dropout = dropout;
output = ThreadLocal.withInitial(() -> new double[n]);
if (dropout > 0.0) {
mask = ThreadLocal.withInitial(() -> new byte[n]);
}
}
/**
* Constructor. Randomly initialized weights and zero bias.
*
* @param n the number of neurons.
* @param p the number of input variables (not including bias value).
*/
public Layer(int n, int p) {
this(n, p, 0.0);
}
/**
* Constructor. Randomly initialized weights and zero bias.
*
* @param n the number of neurons.
* @param p the number of input variables (not including bias value).
* @param dropout the dropout rate.
*/
public Layer(int n, int p, double dropout) {
this(Matrix.rand(n, p, -Math.sqrt(6.0 / (n+p)), Math.sqrt(6.0 / (n+p))), new double[n], dropout);
}
/**
* Constructor.
* @param weight the weight matrix.
* @param bias the bias vector.
*/
public Layer(Matrix weight, double[] bias) {
this(weight, bias, 0.0);
}
/**
* Constructor.
* @param weight the weight matrix.
* @param bias the bias vector.
* @param dropout the dropout rate.
*/
public Layer(Matrix weight, double[] bias, double dropout) {
if (dropout < 0.0 || dropout >= 1.0) {
throw new IllegalArgumentException("Invalid dropout rate: " + dropout);
}
this.n = weight.nrow();
this.p = weight.ncol();
this.weight = weight;
this.bias = bias;
this.dropout = dropout;
init();
}
/**
* Initializes the workspace when deserializing the object.
* @param in the input stream.
* @throws IOException when fails to read the stream.
* @throws ClassNotFoundException when fails to load the class.
*/
@Serial
private void readObject(java.io.ObjectInputStream in) throws IOException, ClassNotFoundException {
in.defaultReadObject();
init();
}
/**
* Initializes the workspace.
*/
private void init() {
output = ThreadLocal.withInitial(() -> new double[n]);
outputGradient = ThreadLocal.withInitial(() -> new double[n]);
weightGradient = ThreadLocal.withInitial(() -> new Matrix(n, p));
biasGradient = ThreadLocal.withInitial(() -> new double[n]);
weightGradientMoment1 = ThreadLocal.withInitial(() -> new Matrix(n, p));
weightGradientMoment2 = ThreadLocal.withInitial(() -> new Matrix(n, p));
biasGradientMoment1 = ThreadLocal.withInitial(() -> new double[n]);
biasGradientMoment2 = ThreadLocal.withInitial(() -> new double[n]);
weightUpdate = ThreadLocal.withInitial(() -> new Matrix(n, p));
biasUpdate = ThreadLocal.withInitial(() -> new double[n]);
if (dropout > 0.0) {
mask = ThreadLocal.withInitial(() -> new byte[n]);
}
}
/**
* Returns the dimension of output vector.
* @return the dimension of output vector.
*/
public int getOutputSize() {
return n;
}
/**
* Returns the dimension of input vector (not including bias value).
* @return the dimension of input vector.
*/
public int getInputSize() {
return p;
}
/**
* Returns the output vector.
* @return the output vector.
*/
public double[] output() {
return output.get();
}
/**
* Returns the output gradient vector.
* @return the output gradient vector.
*/
public double[] gradient() {
return outputGradient.get();
}
/**
* Propagates the signals from a lower layer to this layer.
* @param x the lower layer signals.
*/
public void propagate(double[] x) {
double[] output = this.output.get();
System.arraycopy(bias, 0, output, 0, n);
weight.mv(1.0, x, 1.0, output);
transform(output);
}
/**
* Propagates the output signals through the implicit dropout layer.
* Dropout randomly sets output units to 0. It should only be applied
* during training.
*/
public void propagateDropout() {
if (dropout > 0.0) {
double[] output = this.output.get();
byte[] mask = this.mask.get();
double scale = 1.0 / (1.0 - dropout);
for (int i = 0; i < n; i++) {
byte retain = (byte) (MathEx.random() < dropout ? 0 : 1);
mask[i] = retain;
output[i] *= retain * scale;
}
}
}
/**
* The activation or output function.
* @param x the input and output values.
*/
public abstract void transform(double[] x);
/**
* Propagates the errors back to a lower layer.
* @param lowerLayerGradient the gradient vector of lower layer.
*/
public abstract void backpropagate(double[] lowerLayerGradient);
/**
* Propagates the errors back through the (implicit) dropout layer.
*/
public void backpopagateDropout() {
if (dropout > 0.0) {
double[] gradient = this.outputGradient.get();
byte[] mask = this.mask.get();
double scale = 1.0 / (1.0 - dropout);
for (int i = 0; i < n; i++) {
gradient[i] *= mask[i] * scale;
}
}
}
/**
* Computes the parameter gradient and update the weights.
*
* @param x the input vector.
* @param learningRate the learning rate.
* @param momentum the momentum factor.
* @param decay weight decay factor.
*/
public void computeGradientUpdate(double[] x, double learningRate, double momentum, double decay) {
double[] outputGradient = this.outputGradient.get();
if (momentum > 0.0 && momentum < 1.0) {
Matrix weightUpdate = this.weightUpdate.get();
double[] biasUpdate = this.biasUpdate.get();
weightUpdate.mul(momentum);
weightUpdate.add(learningRate, outputGradient, x);
weight.add(weightUpdate);
for (int i = 0; i < n; i++) {
double b = momentum * biasUpdate[i] + learningRate * outputGradient[i];
biasUpdate[i] = b;
bias[i] += b;
}
} else {
weight.add(learningRate, outputGradient, x);
for (int i = 0; i < n; i++) {
bias[i] += learningRate * outputGradient[i];
}
}
if (decay > 0.9 && decay < 1.0) {
weight.mul(decay);
}
}
/**
* Computes the parameter gradient for a sample of (mini-)batch.
*
* @param x the input vector.
*/
public void computeGradient(double[] x) {
double[] outputGradient = this.outputGradient.get();
Matrix weightGradient = this.weightGradient.get();
double[] biasGradient = this.biasGradient.get();
weightGradient.add(1.0, outputGradient, x);
for (int i = 0; i < n; i++) {
biasGradient[i] += outputGradient[i];
}
}
/**
* Adjust network weights by back-propagation algorithm.
*
* @param m the size of mini-batch.
* @param learningRate the learning rate.
* @param momentum the momentum factor.
* @param decay weight decay factor.
* @param rho RMSProp discounting factor for the history/coming gradient.
* @param epsilon a small constant for numerical stability.
*/
public void update(int m, double learningRate, double momentum, double decay, double rho, double epsilon) {
Matrix weightGradient = this.weightGradient.get();
double[] biasGradient = this.biasGradient.get();
// Instead of computing the average gradient explicitly,
// we scale down the learning rate by the number of samples.
double eta = learningRate / m;
if (rho > 0.0 && rho < 1.0) {
// As gradient will be averaged and smoothed in RMSProp,
// we need to use the original learning rate.
eta = learningRate;
weightGradient.div(m);
for (int i = 0; i < n; i++) {
biasGradient[i] /= m;
}
Matrix rmsWeightGradient = this.weightGradientMoment2.get();
double[] rmsBiasGradient = this.biasGradientMoment2.get();
double rho1 = 1.0 - rho;
for (int j = 0; j < p; j++) {
for (int i = 0; i < n; i++) {
rmsWeightGradient.set(i, j, rho * rmsWeightGradient.get(i, j) + rho1 * MathEx.pow2(weightGradient.get(i, j)));
}
}
for (int i = 0; i < n; i++) {
rmsBiasGradient[i] = rho * rmsBiasGradient[i] + rho1 * MathEx.pow2(biasGradient[i]);
}
for (int j = 0; j < p; j++) {
for (int i = 0; i < n; i++) {
weightGradient.div(i, j, Math.sqrt(epsilon + rmsWeightGradient.get(i, j)));
}
}
for (int i = 0; i < n; i++) {
biasGradient[i] /= Math.sqrt(epsilon + rmsBiasGradient[i]);
}
}
if (momentum > 0.0 && momentum < 1.0) {
Matrix weightUpdate = this.weightUpdate.get();
double[] biasUpdate = this.biasUpdate.get();
weightUpdate.add(momentum, eta, weightGradient);
for (int i = 0; i < n; i++) {
biasUpdate[i] = momentum * biasUpdate[i] + eta * biasGradient[i];
}
weight.add(weightUpdate);
MathEx.add(bias, biasUpdate);
} else {
weight.add(eta, weightGradient);
for (int i = 0; i < n; i++) {
bias[i] += eta * biasGradient[i];
}
}
// Weight decay as the weights are multiplied
// by a factor slightly less than 1. This prevents the weights
// from growing too large, and can be seen as gradient descent
// on a quadratic regularization term.
if (decay > 0.9 && decay < 1.0) {
weight.mul(decay);
}
weightGradient.fill(0.0);
Arrays.fill(biasGradient, 0.0);
}
/**
* Returns a hidden layer.
* @param activation the activation function.
* @param neurons the number of neurons.
* @param dropout the dropout rate.
* @param param the optional activation function parameter.
* @return the layer builder.
*/
public static HiddenLayerBuilder builder(String activation, int neurons, double dropout, double param) {
switch(activation.toLowerCase(Locale.ROOT)) {
case "relu":
return rectifier(neurons, dropout);
case "sigmoid":
return sigmoid(neurons, dropout);
case "tanh":
return tanh(neurons, dropout);
case "linear":
return linear(neurons, dropout);
case "leaky":
if (Double.isNaN(param))
return leaky(neurons, dropout);
else
return leaky(neurons, dropout, param);
default:
throw new IllegalArgumentException("Unsupported activation function: " + activation);
}
}
/**
* Returns an input layer.
* @param neurons the number of neurons.
* @return the layer builder.
*/
public static LayerBuilder input(int neurons) {
return input(neurons, 0.0);
}
/**
* Returns an input layer.
* @param neurons the number of neurons.
* @param dropout the dropout rate.
* @return the layer builder.
*/
public static LayerBuilder input(int neurons, double dropout) {
return new LayerBuilder(neurons, dropout) {
@Override
public InputLayer build(int p) {
return new InputLayer(neurons, dropout);
}
};
}
/**
* Returns a hidden layer with linear activation function.
* @param neurons the number of neurons.
* @return the layer builder.
*/
public static HiddenLayerBuilder linear(int neurons) {
return linear(neurons, 0.0);
}
/**
* Returns a hidden layer with linear activation function.
* @param neurons the number of neurons.
* @param dropout the dropout rate.
* @return the layer builder.
*/
public static HiddenLayerBuilder linear(int neurons, double dropout) {
return new HiddenLayerBuilder(neurons, dropout, ActivationFunction.linear());
}
/**
* Returns a hidden layer with rectified linear activation function.
* @param neurons the number of neurons.
* @return the layer builder.
*/
public static HiddenLayerBuilder rectifier(int neurons) {
return rectifier(neurons, 0.0);
}
/**
* Returns a hidden layer with rectified linear activation function.
* @param neurons the number of neurons.
* @param dropout the dropout rate.
* @return the layer builder.
*/
public static HiddenLayerBuilder rectifier(int neurons, double dropout) {
return new HiddenLayerBuilder(neurons, dropout, ActivationFunction.rectifier());
}
/**
* Returns a hidden layer with leaky rectified linear activation function.
* @param neurons the number of neurons.
* @return the layer builder.
*/
public static HiddenLayerBuilder leaky(int neurons) {
return rectifier(neurons, 0.0);
}
/**
* Returns a hidden layer with leaky rectified linear activation function.
* @param neurons the number of neurons.
* @param dropout the dropout rate.
* @return the layer builder.
*/
public static HiddenLayerBuilder leaky(int neurons, double dropout) {
return new HiddenLayerBuilder(neurons, dropout, ActivationFunction.leaky());
}
/**
* Returns a hidden layer with leaky rectified linear activation function.
* @param neurons the number of neurons.
* @param dropout the dropout rate.
* @param a the parameter of leaky ReLU.
* @return the layer builder.
*/
public static HiddenLayerBuilder leaky(int neurons, double dropout, double a) {
return new HiddenLayerBuilder(neurons, dropout, ActivationFunction.leaky(a));
}
/**
* Returns a hidden layer with sigmoid activation function.
* @param neurons the number of neurons.
* @return the layer builder.
*/
public static HiddenLayerBuilder sigmoid(int neurons) {
return sigmoid(neurons, 0.0);
}
/**
* Returns a hidden layer with sigmoid activation function.
* @param neurons the number of neurons.
* @param dropout the dropout rate.
* @return the layer builder.
*/
public static HiddenLayerBuilder sigmoid(int neurons, double dropout) {
return new HiddenLayerBuilder(neurons, dropout, ActivationFunction.sigmoid());
}
/**
* Returns a hidden layer with hyperbolic tangent activation function.
* @param neurons the number of neurons.
* @return the layer builder.
*/
public static HiddenLayerBuilder tanh(int neurons) {
return tanh(neurons, 0.0);
}
/**
* Returns a hidden layer with hyperbolic tangent activation function.
* @param neurons the number of neurons.
* @param dropout the dropout rate.
* @return the layer builder.
*/
public static HiddenLayerBuilder tanh(int neurons, double dropout) {
return new HiddenLayerBuilder(neurons, dropout, ActivationFunction.tanh());
}
/**
* Returns an output layer with mean squared error cost function.
* @param neurons the number of neurons.
* @param output the output function.
* @return the layer builder.
*/
public static OutputLayerBuilder mse(int neurons, OutputFunction output) {
return new OutputLayerBuilder(neurons, output, Cost.MEAN_SQUARED_ERROR);
}
/**
* Returns an output layer with (log-)likelihood cost function.
* @param neurons the number of neurons.
* @param output the output function.
* @return the layer builder.
*/
public static OutputLayerBuilder mle(int neurons, OutputFunction output) {
return new OutputLayerBuilder(neurons, output, Cost.LIKELIHOOD);
}
/**
* Returns the layer builders given a string representation such as
* "Input(10, 0.2)|ReLU(50, 0.5)|Sigmoid(30, 0.5)|...".
*
* @param k the number of classes. {@code k < 2} for regression.
* @param p the number of input variables (not including bias value).
* @param spec the hidden layer specification.
* @return the layer builders.
*/
public static LayerBuilder[] of(int k, int p, String spec) {
Pattern regex = Pattern.compile(String.format("(\\w+)\\((%s)(,\\s*(%s))?(,\\s*(%s))?\\)", Regex.INTEGER_REGEX, Regex.DOUBLE_REGEX, Regex.DOUBLE_REGEX));
String[] layers = spec.split("\\|");
ArrayList builders = new ArrayList<>();
for (int i = 0; i < layers.length; i++) {
Matcher m = regex.matcher(layers[i]);
if (m.matches()) {
String activation = m.group(1);
int neurons = Integer.parseInt(m.group(2));
double dropout = 0.0;
if (m.group(3) != null) {
dropout = Double.parseDouble(m.group(4));
}
double param = Double.NaN; // activation function parameter
if (m.group(5) != null) {
param = Double.parseDouble(m.group(6));
}
if (i == 0) {
if (activation.equalsIgnoreCase("input")) {
builders.add(Layer.input(neurons, dropout));
} else {
builders.add(Layer.input(p));
builders.add(Layer.builder(activation, neurons, dropout, param));
}
} else {
builders.add(Layer.builder(activation, neurons, dropout, param));
}
} else {
throw new IllegalArgumentException("Invalid layer: " + layers[i]);
}
}
if (k < 2) {
builders.add(Layer.mse(1, OutputFunction.LINEAR));
} else if (k == 2) {
builders.add(Layer.mle(1, OutputFunction.SIGMOID));
} else {
builders.add(Layer.mle(k, OutputFunction.SOFTMAX));
}
return builders.toArray(new LayerBuilder[0]);
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy