
ai.djl.nn.transformer.ScaledDotProductAttentionBlock

/*
 * Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
 * with the License. A copy of the License is located at
 *
 * http://aws.amazon.com/apache2.0/
 *
 * or in the "license" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
 * OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions
 * and limitations under the License.
 */
package ai.djl.nn.transformer;

import ai.djl.ndarray.NDArray;
import ai.djl.ndarray.NDList;
import ai.djl.ndarray.NDManager;
import ai.djl.ndarray.types.DataType;
import ai.djl.ndarray.types.Shape;
import ai.djl.nn.AbstractBlock;
import ai.djl.nn.Block;
import ai.djl.nn.core.Linear;
import ai.djl.nn.norm.Dropout;
import ai.djl.training.ParameterStore;
import ai.djl.util.PairList;

/**
 * A Block implementing scaled dot-product attention according to Vaswani et al.
 *
 * <p>Abbreviations used:
 *
 * <ul>
 *   <li>E = embedding size
 *   <li>B = batch size
 *   <li>N = number of attention heads
 *   <li>F = "from" sequence length (key/value sequence), the input sequence
 *   <li>T = "to" sequence length (query sequence), e.g. the length of the output sequence
 *   <li>S = a sequence length, either F or T
 *   <li>H = attention head size (= E / N)
 * </ul>
 *
 * <p>In many use cases F = T. For self attention, the input is equal to the output.
 *
 * <p>This block can process input in four forms:
 *
 * <ul>
 *   <li>Input size one: [Values] = [(B, F, E)], the only input is used as key, query and value
 *       (unmasked self attention), e.g. BERT
 *   <li>Input size two: [Values, Mask] = [(B, F, E), (B, F, F)], the first input is used as key,
 *       query and value, masked self attention
 *   <li>Input size three: [Keys, Queries, Values] = [(B, F, E), (B, T, E), (B, F, E)], inputs are
 *       interpreted as keys, queries and values, unmasked attention
 *   <li>Input size four: [Keys, Queries, Values, Mask] = [(B, F, E), (B, T, E), (B, F, E),
 *       (B, T, F)], inputs are interpreted as keys, queries, values and an attention mask, fully
 *       masked attention
 * </ul>
 *
 * <p>Attention masks must contain a 1 for positions to keep and a 0 for positions to mask.
 */
// We name local variables for tensor dimensions as in the paper and the reference code.
// While against the general code style, it makes things much more readable here.
@SuppressWarnings({
    "LocalVariableName",
    "PMD.LocalVariableNamingConventions",
    "ParameterName",
    "PMD.FormalParameterNamingConventions"
})
public final class ScaledDotProductAttentionBlock extends AbstractBlock {

    private static final byte VERSION = 1;

    /** Size of the Word-/Token-embeddings we use the attention on. */
    private int embeddingSize;
    /** Number of attention heads. */
    private int headCount;

    /** Pointwise Linear projection of the keys. */
    private Linear keyProjection;
    /** Pointwise Linear projection of the queries. */
    private Linear queryProjection;
    /** Pointwise Linear projection of the values. */
    private Linear valueProjection;
    /** Pointwise Linear projection of the results. */
    private Linear resultProjection;

    /** Dropout operation to be applied after probability calculation. */
    private Dropout attentionProbsDropout;

    private ScaledDotProductAttentionBlock(Builder builder) {
        super(VERSION);
        this.embeddingSize = builder.embeddingSize;
        this.headCount = builder.headCount;
        this.keyProjection = addChildBlock("keyProjection", buildProjection());
        this.queryProjection = addChildBlock("queryProjection", buildProjection());
        this.valueProjection = addChildBlock("valueProjection", buildProjection());
        this.resultProjection = addChildBlock("resultProjection", buildProjection());
        this.attentionProbsDropout =
                addChildBlock(
                        "probabilityDropout",
                        Dropout.builder().optRate(builder.attentionProbsDropoutProb).build());
    }

    /**
     * Helper method to build a pointwise linear projection for the current embedding size.
     *
     * @return a linear projection with bias and an output size equal to the embedding size.
     */
    private Linear buildProjection() {
        return Linear.builder().setUnits(embeddingSize).optBias(true).build();
    }

    /**
     * Pointwise Linear projection of the keys.
     *
     * @return Pointwise Linear projection of the keys.
     */
    public Linear getKeyProjection() {
        return keyProjection;
    }

    /**
     * Pointwise Linear projection of the queries.
     *
     * @return Pointwise Linear projection of the queries.
     */
    public Linear getQueryProjection() {
        return queryProjection;
    }

    /**
     * Pointwise Linear projection of the values.
     *
     * @return Pointwise Linear projection of the values.
     */
    public Linear getValueProjection() {
        return valueProjection;
    }

    /**
     * Pointwise Linear projection of the results.
     *
     * @return Pointwise Linear projection of the results.
     */
    public Linear getResultProjection() {
        return resultProjection;
    }

    /** {@inheritDoc} */
    @Override
    public Shape[] getOutputShapes(Shape[] inputShapes) {
        // The return shape is the shape of the query. For 2 or fewer inputs we have
        // self-attention, i.e. the shape of the output is the shape of the input
        if (inputShapes.length == 1 || inputShapes.length == 2) {
            return new Shape[] {inputShapes[0]};
        } else if (inputShapes.length == 3 || inputShapes.length == 4) {
            // For attention with a dedicated query, the output shape is the query shape
            return new Shape[] {inputShapes[1]};
        } else {
            throw new IllegalArgumentException(
                    "Invalid number of input shapes: " + inputShapes.length + ", must be 1-4.");
        }
    }

    /** {@inheritDoc} */
    @Override
    public void initializeChildBlocks(NDManager manager, DataType dataType, Shape... inputShapes) {
        // The projections are fed reshaped input where the batch size is combined with the
        // sequence length.
        // The linear layers only care about the 2nd dimension, so we set the first to -1.
        Shape projectionShape = new Shape(-1L, embeddingSize);
        // We initialize each projection with that reshaped input shape
        for (Block projection : children.values()) {
            projection.initialize(manager, DataType.FLOAT32, projectionShape);
        }
    }

    /**
     * Utility function to reshape and transpose an input of the shape (B, S, E) into (B, N, S, H).
     *
     * @param projection projected embeddings
     * @param B batch size
     * @param S sequence length
     * @param N number of attention heads
     * @param H size of attention heads
     * @return the reshaped input
     */
    private NDArray createAttentionHeadsFromEmbeddings(
            NDArray projection, long B, long S, long N, long H) {
        // Reshape projection to sequence & heads: (B, S, E) -> (B, S, N, H)
        NDArray sequenceAndHeads = projection.reshape(B, S, N, H);
        // Switch sequence idx & head idx, so we have sequences of heads at the end
        return sequenceAndHeads.transpose(0, 2, 1, 3);
    }

    /** {@inheritDoc} */
    @Override
    protected NDList forwardInternal(
            ParameterStore parameterStore,
            NDList inputs,
            boolean training,
            PairList<String, Object> params) {
        // E = embedding size
        long E = embeddingSize;
        // B = batch size
        long B = inputs.head().getShape().get(0);
        // N = number of attention heads
        long N = headCount;
        // F = "from" sequence length
        long F;
        // T = "to" sequence length
        long T;
        // H = attention head size (= E / N)
        long H = E / N;

        // Create key, query & value input based on input size
        NDList flattenedKeyInput;
        NDList flattenedQueryInput;
        NDList flattenedValueInput;
        NDArray attentionMask;
        if (inputs.size() < 3) {
            // self attention, either masked or unmasked
            F = inputs.head().getShape().get(1);
            T = F;
            flattenedKeyInput = new NDList(inputs.head());
            flattenedQueryInput = flattenedKeyInput;
            flattenedValueInput = flattenedKeyInput;
        } else {
            // attention with separate key, query & value
            F = inputs.get(0).getShape().get(1);
            T = inputs.get(1).getShape().get(1);
            flattenedKeyInput = new NDList(inputs.get(0));
            flattenedQueryInput = new NDList(inputs.get(1));
            flattenedValueInput = new NDList(inputs.get(2));
        }
        if (inputs.size() == 2 || inputs.size() == 4) {
            // we have an additional attention mask
            attentionMask = inputs.get(inputs.size() - 1);
        } else {
            attentionMask = null;
        }

        // apply the projections to key, query and value; preserves shape: (B, S, E)
        NDList keys = keyProjection.forward(parameterStore, flattenedKeyInput, training, params);
        NDList queries =
                queryProjection.forward(parameterStore, flattenedQueryInput, training, params);
        NDList values =
                valueProjection.forward(parameterStore, flattenedValueInput, training, params);

        // reshape to (B, N, S, H) to create separate attention heads
        NDArray keyHeads = createAttentionHeadsFromEmbeddings(keys.head(), B, F, N, H);
        NDArray queryHeads = createAttentionHeadsFromEmbeddings(queries.head(), B, T, N, H);
        NDArray valueHeads = createAttentionHeadsFromEmbeddings(values.head(), B, F, N, H);

        // Apply attention by multiplying the query and key vectors: (B, N, T, F)
        // (for each position in the "to" sequence there is a weight for each position in the
        // "from" sequence)
        NDArray attentionScores = queryHeads.matMul(keyHeads.transpose(0, 1, 3, 2));
        // Normalize the scores with 1/sqrt(H)
        NDArray normalizedAttentionScores =
                attentionScores.mul(attentionScores.getManager().create(1f / (float) Math.sqrt(H)));

        // Apply masking if requested, mask has shape (B, T, F)
        if (attentionMask != null) {
            NDArray maskOffset;
            // The input mask is initially given as a list of integers with a 1 for each existing
            // token.
            // In order to apply it to the attention result, it needs to be expanded and the
            // values turned into offsets for the softmax calculation. For stacked models, this
            // can be done once and reused - hence we check the number of dimensions to see
            // whether we have to do this locally or whether it was already done for us.
            if (attentionMask.getShape().dimension() != 4) {
                // expand mask to be used on all heads at once
                NDArray expandedMask = attentionMask.reshape(B, 1, T, F);
                // we turn the mask from ints into floats and turn all 1s into 0s and all
                // 0s into a value of -100000. Adding this to the scores will push all unwanted
                // values towards -inf and keep the unmasked values unchanged
                maskOffset =
                        expandedMask
                                .toType(DataType.FLOAT32, false)
                                .mul(expandedMask.getManager().create(-1f)) // turn 1s into -1s
                                .add(expandedMask.getManager().create(1f)) // turn 0s into 1s,
                                                                           // -1s into 0s
                                .mul(expandedMask.getManager().create(-100000f)); // turn 1s
                                                                                  // (original 0s)
                                                                                  // into -100000
            } else {
                maskOffset = attentionMask;
            }
            // adding the mask offset to the scores removes the scores of unwanted positions
            normalizedAttentionScores = normalizedAttentionScores.add(maskOffset);
        }

        // Then apply softmax to get a probability distribution, shape (B, N, T, F)
        NDArray attentionProbs = normalizedAttentionScores.softmax(3);
        // We apply dropout to the attention probabilities - this will remove entire tokens from
        // the result of a position, as their probability will be set to 0
        NDArray attentionProbsAfterDropout =
                attentionProbsDropout
                        .forward(parameterStore, new NDList(attentionProbs), training)
                        .singletonOrThrow();

        // The result of the attention mechanism is created by a weighted sum using the attention
        // probs: the new head is the weighted sum of the value heads. (B, N, T, H)
        NDArray attentionResult = attentionProbsAfterDropout.matMul(valueHeads);

        // Finally, the heads are reshaped and concatenated into an embedding again
        NDArray resultEmbeddings =
                attentionResult // (B, N, T, H)
                        .transpose(0, 2, 1, 3) // -> (B, T, N, H)
                        .reshape(B, T, E); // -> (B, T, E)

        // As a last step, we apply another pointwise linear projection to each token embedding
        NDList projectedEmbeddings =
                resultProjection.forward(parameterStore, new NDList(resultEmbeddings), training);
        // done!
        return new NDList(projectedEmbeddings);
    }

    /**
     * Creates a new Builder to build an Attention Block with.
     *
     * @return a new Builder to build an Attention Block with.
     */
    public static Builder builder() {
        return new Builder();
    }

    /** A builder for {@link ScaledDotProductAttentionBlock}s. */
    public static final class Builder {

        private int embeddingSize;
        private int headCount;
        private float attentionProbsDropoutProb = 0.1f;

        private Builder() {}

        /**
         * Sets the embedding size to be used for the internal token representation.
         *
         * @param embeddingSize the embedding size to be used for the internal token representation
         * @return this builder
         */
        public Builder setEmbeddingSize(int embeddingSize) {
            this.embeddingSize = embeddingSize;
            return this;
        }

        /**
         * Sets the number of attention heads; it must divide the embedding size without
         * remainder. E.g. if embeddingSize = 10, a headCount of 3 would not be valid, but a
         * headCount of 1, 2 or 5 would be.
         *
         * @param headCount the number of attention heads
         * @return this builder
         */
        public Builder setHeadCount(int headCount) {
            this.headCount = headCount;
            return this;
        }

        /**
         * Sets the probability of applying dropout to the attention probability distribution.
         * This dropout can randomly remove a complete token from the result at a position.
         *
         * @param attentionProbsDropoutProb the probability of applying dropout to the attention
         *     probability distribution
         * @return this builder
         */
        public Builder optAttentionProbsDropoutProb(float attentionProbsDropoutProb) {
            this.attentionProbsDropoutProb = attentionProbsDropoutProb;
            return this;
        }

        /**
         * Creates a new {@code ScaledDotProductAttentionBlock} with the current configuration.
         *
         * @return a new {@code ScaledDotProductAttentionBlock} with the current configuration.
         */
        public ScaledDotProductAttentionBlock build() {
            if (embeddingSize < 1) {
                throw new IllegalStateException("Embedding size not initialized.");
            }
            if (headCount < 1) {
                throw new IllegalStateException("Head count not initialized.");
            }
            if (embeddingSize % headCount != 0) {
                throw new IllegalStateException(
                        "Embedding Size ("
                                + embeddingSize
                                + ") is not divisible by head count ("
                                + headCount
                                + ")");
            }
            return new ScaledDotProductAttentionBlock(this);
        }
    }
}
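
A minimal usage sketch, assuming unmasked self attention (input form one above): it builds the block, initializes it and runs a single forward pass on random data. The class name AttentionExample, the shapes and the hyperparameters are illustrative only, and the surrounding DJL calls (NDManager.newBaseManager(), setInitializer(Initializer, Parameter.Type), new ParameterStore(manager, false)) follow recent DJL releases, so the exact initializer wiring may differ in older versions of this class.

import ai.djl.ndarray.NDArray;
import ai.djl.ndarray.NDList;
import ai.djl.ndarray.NDManager;
import ai.djl.ndarray.types.DataType;
import ai.djl.ndarray.types.Shape;
import ai.djl.nn.Parameter;
import ai.djl.nn.transformer.ScaledDotProductAttentionBlock;
import ai.djl.training.ParameterStore;
import ai.djl.training.initializer.XavierInitializer;

/** Hypothetical example (not part of the DJL sources): unmasked self attention on random data. */
public final class AttentionExample {

    public static void main(String[] args) {
        try (NDManager manager = NDManager.newBaseManager()) {
            // E = 64, N = 8 => H = 8; batch B = 2, sequence length F = T = 10
            ScaledDotProductAttentionBlock block =
                    ScaledDotProductAttentionBlock.builder()
                            .setEmbeddingSize(64)
                            .setHeadCount(8)
                            .optAttentionProbsDropoutProb(0.1f)
                            .build();

            // Outside a Trainer the weights need an explicit initializer; this two-argument
            // overload is the one available in recent DJL releases.
            block.setInitializer(new XavierInitializer(), Parameter.Type.WEIGHT);
            block.initialize(manager, DataType.FLOAT32, new Shape(2, 10, 64));

            // Input form one: a single (B, F, E) array acts as key, query and value.
            NDArray embeddings = manager.randomNormal(new Shape(2, 10, 64));
            ParameterStore store = new ParameterStore(manager, false);
            NDList output = block.forward(store, new NDList(embeddings), false);

            // Self attention preserves the input shape: (2, 10, 64)
            System.out.println(output.singletonOrThrow().getShape());
        }
    }
}

Adding a second (B, F, F) mask array to the input NDList would turn the same call into masked self attention (input form two), as described in the class documentation above.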




