smile.llm.PositionalEncoding Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of smile-deep Show documentation
smile-deep
The newest version!
/*
 * Copyright (c) 2010-2024 Haifeng Li. All rights reserved.
 *
 * Smile is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Smile is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Smile.  If not, see <https://www.gnu.org/licenses/>.
 */
package smile.llm;

import org.bytedeco.pytorch.Module;
import smile.deep.layer.Layer;
import smile.deep.tensor.Device;
import smile.deep.tensor.ScalarType;
import smile.deep.tensor.Tensor;
import static smile.deep.tensor.Index.*;

/**
 * Positional encoding in the original Transformer. Positional encoding injects
 * some information about the absolute position of the tokens in the sequence.
 * The positional encodings have the same dimension as the embeddings, so that
 * the two can be summed. This class uses sine and cosine functions of
 * different frequencies: even columns of the encoding table hold the sine
 * terms and odd columns hold the cosine terms.
 *
 * @author Haifeng Li
 */
public class PositionalEncoding implements Layer {
    /** The torch module that owns the registered encoding buffer. */
    private final Module module = new Module("PositionalEncoding");
    /** The precomputed positional encoding table of shape (end, dim). */
    private final Tensor pe;

    /**
     * Constructor with the default frequency base of 10000.
     * @param dim the number of columns of the encoding table, i.e. the
     *            encoding dimension. Must be positive and even.
     * @param end the number of positions (rows) to precompute.
     * @throws IllegalArgumentException if dim is not a positive even number
     *         or end is negative.
     */
    public PositionalEncoding(int dim, int end) {
        this(dim, end, 10000.0);
    }

    /**
     * Constructor.
     * @param dim the number of columns of the encoding table, i.e. the
     *            encoding dimension. Must be positive and even.
     * @param end the number of positions (rows) to precompute.
     * @param theta the base of the geometric progression of frequencies.
     * @throws IllegalArgumentException if dim is not a positive even number
     *         or end is negative.
     */
    public PositionalEncoding(int dim, int end, double theta) {
        // The sine terms fill columns 0, 2, 4, ... and the cosine terms fill
        // columns 1, 3, 5, ...; both halves have the same width only if dim is even.
        if (dim <= 0 || dim % 2 != 0) {
            throw new IllegalArgumentException("dim must be a positive even number: " + dim);
        }
        if (end < 0) {
            throw new IllegalArgumentException("end must be non-negative: " + end);
        }

        try (Tensor position = Tensor.arange(0, end, 1).to(ScalarType.Float32).unsqueeze(1);
             Tensor divTerm = Tensor.arange(0, dim, 2).to(ScalarType.Float32).mul_(-Math.log(theta) / dim).exp_();
             // Out-of-place product: an in-place mul_ cannot broadcast the
             // (end, 1) position column into the (end, dim / 2) result.
             Tensor angle = position.mul(divTerm);
             Tensor sin = angle.sin();
             Tensor cos = angle.cos()) {
            pe = Tensor.zeros(end, dim);
            pe.put_(sin, Colon, slice(0, null, 2));
            pe.put_(cos, Colon, slice(1, null, 2));
            // The table is a fixed buffer, not a trainable parameter.
            pe.setRequireGrad(false);
            module.register_buffer("pe", pe.asTorch());
        }
    }

    /**
     * Adds the positional encodings to the input. The first
     * {@code input.size(0)} rows of the encoding table are added to the
     * input, and the sum is returned as a new tensor.
     * NOTE(review): this presumably expects an input of shape
     * (sequence length, dim) with sequence length no greater than
     * {@code end} - confirm against callers.
     *
     * @param input the input tensor.
     * @return the input plus the positional encodings.
     */
    @Override
    public Tensor forward(Tensor input) {
        try (Tensor p = pe.get(slice(null, input.size(0)), Colon)) {
            return input.add(p);
        }
    }

    @Override
    public Module asTorch() {
        return module;
    }

    /**
     * Moves the encoder to a device.
     * @param device the compute device.
     * @return this encoder.
     */
    public PositionalEncoding to(Device device) {
        // The result of pe.to(device) was previously discarded, so nothing
        // was moved. Moving the owning module relocates its registered
        // buffers instead.
        // NOTE(review): relies on the "pe" buffer sharing its native tensor
        // with the pe field, so that forward() sees the moved data - confirm.
        module.to(device.asTorch(), true);
        return this;
    }
}