org.bouncycastle.crypto.engines.AESLightEngine Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of bcprov-jdk15on Show documentation
The Bouncy Castle Crypto package is a Java implementation of cryptographic algorithms. This jar contains JCE provider and lightweight API for the Bouncy Castle Cryptography APIs for JDK 1.5 and up.
The newest version!
package org.bouncycastle.crypto.engines;

import org.bouncycastle.crypto.BlockCipher;
import org.bouncycastle.crypto.CipherParameters;
import org.bouncycastle.crypto.DataLengthException;
import org.bouncycastle.crypto.OutputLengthException;
import org.bouncycastle.crypto.params.KeyParameter;
import org.bouncycastle.util.Pack;

/**
 * an implementation of the AES (Rijndael), from FIPS-197.
 * 
 * For further details see: https://csrc.nist.gov/encryption/aes/.
 *
 * This implementation is based on optimizations from Dr. Brian Gladman's paper and C code at
 * https://fp.gladman.plus.com/cryptography_technology/rijndael/
 *
 * There are three levels of tradeoff of speed vs memory
 * Because java has no preprocessor, they are written as three separate classes from which to choose
 *
 * The fastest uses 8Kbytes of static tables to precompute round calculations, 4 256 word tables for encryption
 * and 4 for decryption.
 *
 * The middle performance version uses only one 256 word table for each, for a total of 2Kbytes,
 * adding 12 rotate operations per round to compute the values contained in the other tables from
 * the contents of the first
 *
 * The slowest version uses no static tables at all and computes the values
 * in each round.
 * 
 * This file contains the slowest performance version with no static tables
 * for round precomputation, but it has the smallest foot print.
 *
 */
public class AESLightEngine
    implements BlockCipher
{
    // The S box
    private static final byte[] S = {
        (byte)99, (byte)124, (byte)119, (byte)123, (byte)242, (byte)107, (byte)111, (byte)197,
        (byte)48,   (byte)1, (byte)103,  (byte)43, (byte)254, (byte)215, (byte)171, (byte)118,
        (byte)202, (byte)130, (byte)201, (byte)125, (byte)250,  (byte)89,  (byte)71, (byte)240,
        (byte)173, (byte)212, (byte)162, (byte)175, (byte)156, (byte)164, (byte)114, (byte)192,
        (byte)183, (byte)253, (byte)147,  (byte)38,  (byte)54,  (byte)63, (byte)247, (byte)204,
        (byte)52, (byte)165, (byte)229, (byte)241, (byte)113, (byte)216,  (byte)49,  (byte)21,
        (byte)4, (byte)199,  (byte)35, (byte)195,  (byte)24, (byte)150,   (byte)5, (byte)154,
        (byte)7,  (byte)18, (byte)128, (byte)226, (byte)235,  (byte)39, (byte)178, (byte)117,
        (byte)9, (byte)131,  (byte)44,  (byte)26,  (byte)27, (byte)110,  (byte)90, (byte)160,
        (byte)82,  (byte)59, (byte)214, (byte)179,  (byte)41, (byte)227,  (byte)47, (byte)132,
        (byte)83, (byte)209,   (byte)0, (byte)237,  (byte)32, (byte)252, (byte)177,  (byte)91,
        (byte)106, (byte)203, (byte)190,  (byte)57,  (byte)74,  (byte)76,  (byte)88, (byte)207,
        (byte)208, (byte)239, (byte)170, (byte)251,  (byte)67,  (byte)77,  (byte)51, (byte)133,
        (byte)69, (byte)249,   (byte)2, (byte)127,  (byte)80,  (byte)60, (byte)159, (byte)168,
        (byte)81, (byte)163,  (byte)64, (byte)143, (byte)146, (byte)157,  (byte)56, (byte)245,
        (byte)188, (byte)182, (byte)218,  (byte)33,  (byte)16, (byte)255, (byte)243, (byte)210,
        (byte)205,  (byte)12,  (byte)19, (byte)236,  (byte)95, (byte)151,  (byte)68,  (byte)23,
        (byte)196, (byte)167, (byte)126,  (byte)61, (byte)100,  (byte)93,  (byte)25, (byte)115,
        (byte)96, (byte)129,  (byte)79, (byte)220,  (byte)34,  (byte)42, (byte)144, (byte)136,
        (byte)70, (byte)238, (byte)184,  (byte)20, (byte)222,  (byte)94,  (byte)11, (byte)219,
        (byte)224,  (byte)50,  (byte)58,  (byte)10,  (byte)73,   (byte)6,  (byte)36,  (byte)92,
        (byte)194, (byte)211, (byte)172,  (byte)98, (byte)145, (byte)149, (byte)228, (byte)121,
        (byte)231, (byte)200,  (byte)55, (byte)109, (byte)141, (byte)213,  (byte)78, (byte)169,
        (byte)108,  (byte)86, (byte)244, (byte)234, (byte)101, (byte)122, (byte)174,   (byte)8,
        (byte)186, (byte)120,  (byte)37,  (byte)46,  (byte)28, (byte)166, (byte)180, (byte)198,
        (byte)232, (byte)221, (byte)116,  (byte)31,  (byte)75, (byte)189, (byte)139, (byte)138,
        (byte)112,  (byte)62, (byte)181, (byte)102,  (byte)72,   (byte)3, (byte)246,  (byte)14,
        (byte)97,  (byte)53,  (byte)87, (byte)185, (byte)134, (byte)193,  (byte)29, (byte)158,
        (byte)225, (byte)248, (byte)152,  (byte)17, (byte)105, (byte)217, (byte)142, (byte)148,
        (byte)155,  (byte)30, (byte)135, (byte)233, (byte)206,  (byte)85,  (byte)40, (byte)223,
        (byte)140, (byte)161, (byte)137,  (byte)13, (byte)191, (byte)230,  (byte)66, (byte)104,
        (byte)65, (byte)153,  (byte)45,  (byte)15, (byte)176,  (byte)84, (byte)187,  (byte)22,
    };

    // The inverse S-box
    private static final byte[] Si = {
        (byte)82,   (byte)9, (byte)106, (byte)213,  (byte)48,  (byte)54, (byte)165,  (byte)56,
        (byte)191,  (byte)64, (byte)163, (byte)158, (byte)129, (byte)243, (byte)215, (byte)251,
        (byte)124, (byte)227,  (byte)57, (byte)130, (byte)155,  (byte)47, (byte)255, (byte)135,
        (byte)52, (byte)142,  (byte)67,  (byte)68, (byte)196, (byte)222, (byte)233, (byte)203,
        (byte)84, (byte)123, (byte)148,  (byte)50, (byte)166, (byte)194,  (byte)35,  (byte)61,
        (byte)238,  (byte)76, (byte)149,  (byte)11,  (byte)66, (byte)250, (byte)195,  (byte)78,
        (byte)8,  (byte)46, (byte)161, (byte)102,  (byte)40, (byte)217,  (byte)36, (byte)178,
        (byte)118,  (byte)91, (byte)162,  (byte)73, (byte)109, (byte)139, (byte)209,  (byte)37,
        (byte)114, (byte)248, (byte)246, (byte)100, (byte)134, (byte)104, (byte)152,  (byte)22,
        (byte)212, (byte)164,  (byte)92, (byte)204,  (byte)93, (byte)101, (byte)182, (byte)146,
        (byte)108, (byte)112,  (byte)72,  (byte)80, (byte)253, (byte)237, (byte)185, (byte)218,
        (byte)94,  (byte)21,  (byte)70,  (byte)87, (byte)167, (byte)141, (byte)157, (byte)132,
        (byte)144, (byte)216, (byte)171,   (byte)0, (byte)140, (byte)188, (byte)211,  (byte)10,
        (byte)247, (byte)228,  (byte)88,   (byte)5, (byte)184, (byte)179,  (byte)69,   (byte)6,
        (byte)208,  (byte)44,  (byte)30, (byte)143, (byte)202,  (byte)63,  (byte)15,   (byte)2,
        (byte)193, (byte)175, (byte)189,   (byte)3,   (byte)1,  (byte)19, (byte)138, (byte)107,
        (byte)58, (byte)145,  (byte)17,  (byte)65,  (byte)79, (byte)103, (byte)220, (byte)234,
        (byte)151, (byte)242, (byte)207, (byte)206, (byte)240, (byte)180, (byte)230, (byte)115,
        (byte)150, (byte)172, (byte)116,  (byte)34, (byte)231, (byte)173,  (byte)53, (byte)133,
        (byte)226, (byte)249,  (byte)55, (byte)232,  (byte)28, (byte)117, (byte)223, (byte)110,
        (byte)71, (byte)241,  (byte)26, (byte)113,  (byte)29,  (byte)41, (byte)197, (byte)137,
        (byte)111, (byte)183,  (byte)98,  (byte)14, (byte)170,  (byte)24, (byte)190,  (byte)27,
        (byte)252,  (byte)86,  (byte)62,  (byte)75, (byte)198, (byte)210, (byte)121,  (byte)32,
        (byte)154, (byte)219, (byte)192, (byte)254, (byte)120, (byte)205,  (byte)90, (byte)244,
        (byte)31, (byte)221, (byte)168,  (byte)51, (byte)136,   (byte)7, (byte)199,  (byte)49,
        (byte)177,  (byte)18,  (byte)16,  (byte)89,  (byte)39, (byte)128, (byte)236,  (byte)95,
        (byte)96,  (byte)81, (byte)127, (byte)169,  (byte)25, (byte)181,  (byte)74,  (byte)13,
        (byte)45, (byte)229, (byte)122, (byte)159, (byte)147, (byte)201, (byte)156, (byte)239,
        (byte)160, (byte)224,  (byte)59,  (byte)77, (byte)174,  (byte)42, (byte)245, (byte)176,
        (byte)200, (byte)235, (byte)187,  (byte)60, (byte)131,  (byte)83, (byte)153,  (byte)97,
        (byte)23,  (byte)43,   (byte)4, (byte)126, (byte)186, (byte)119, (byte)214,  (byte)38,
        (byte)225, (byte)105,  (byte)20,  (byte)99,  (byte)85,  (byte)33,  (byte)12, (byte)125,
        };

    // vector used in calculating key schedule (powers of x in GF(256))
    private static final int[] rcon = {
         0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36, 0x6c, 0xd8, 0xab, 0x4d, 0x9a,
         0x2f, 0x5e, 0xbc, 0x63, 0xc6, 0x97, 0x35, 0x6a, 0xd4, 0xb3, 0x7d, 0xfa, 0xef, 0xc5, 0x91 };

    private static int shift(int r, int shift)
    {
        return (r >>> shift) | (r << -shift);
    }

    /* multiply four bytes in GF(2^8) by 'x' {02} in parallel */

    private static final int m1 = 0x80808080;
    private static final int m2 = 0x7f7f7f7f;
    private static final int m3 = 0x0000001b;
    private static final int m4 = 0xC0C0C0C0;
    private static final int m5 = 0x3f3f3f3f;

    private static int FFmulX(int x)
    {
        return (((x & m2) << 1) ^ (((x & m1) >>> 7) * m3));
    }

    private static int FFmulX2(int x)
    {
        int t0  = (x & m5) << 2;
        int t1  = (x & m4);
            t1 ^= (t1 >>> 1);
        return t0 ^ (t1 >>> 2) ^ (t1 >>> 5);
    }

    /* 
       The following defines provide alternative definitions of FFmulX that might
       give improved performance if a fast 32-bit multiply is not available.
       
       private int FFmulX(int x) { int u = x & m1; u |= (u >> 1); return ((x & m2) << 1) ^ ((u >>> 3) | (u >>> 6)); } 
       private static final int  m4 = 0x1b1b1b1b;
       private int FFmulX(int x) { int u = x & m1; return ((x & m2) << 1) ^ ((u - (u >>> 7)) & m4); } 

    */

    private static int mcol(int x)
    {
        int t0, t1;
        t0  = shift(x, 8);
        t1  = x ^ t0;
        return shift(t1, 16) ^ t0 ^ FFmulX(t1);
    }

    private static int inv_mcol(int x)
    {
        int t0, t1;
        t0  = x;
        t1  = t0 ^ shift(t0, 8);
        t0 ^= FFmulX(t1);
        t1 ^= FFmulX2(t0);
        t0 ^= t1 ^ shift(t1, 16);
        return t0;
    }


    private static int subWord(int x)
    {
        return (S[x&255]&255 | ((S[(x>>8)&255]&255)<<8) | ((S[(x>>16)&255]&255)<<16) | S[(x>>24)&255]<<24);
    }

    /**
     * Calculate the necessary round keys
     * The number of calculations depends on key size and block size
     * AES specified a fixed block size of 128 bits and key sizes 128/192/256 bits
     * This code is written assuming those are the only possible values
     */
    private int[][] generateWorkingKey(byte[] key, boolean forEncryption)
    {
        int keyLen = key.length;
        if (keyLen < 16 || keyLen > 32 || (keyLen & 7) != 0)
        {
            throw new IllegalArgumentException("Key length not 128/192/256 bits.");
        }

        int KC = keyLen >>> 2;
        ROUNDS = KC + 6;  // This is not always true for the generalized Rijndael that allows larger block sizes
        int[][] W = new int[ROUNDS+1][4];   // 4 words in a block

        switch (KC)
        {
        case 4:
        {
            int col0 = Pack.littleEndianToInt(key,  0);     W[0][0] = col0;
            int col1 = Pack.littleEndianToInt(key,  4);     W[0][1] = col1;
            int col2 = Pack.littleEndianToInt(key,  8);     W[0][2] = col2;
            int col3 = Pack.littleEndianToInt(key, 12);     W[0][3] = col3;

            for (int i = 1; i <= 10; ++i)
            {
                int colx = subWord(shift(col3, 8)) ^ rcon[i - 1];
                col0 ^= colx;       W[i][0] = col0;
                col1 ^= col0;       W[i][1] = col1;
                col2 ^= col1;       W[i][2] = col2;
                col3 ^= col2;       W[i][3] = col3;
            }

            break;
        }
        case 6:
        {
            int col0 = Pack.littleEndianToInt(key,  0);     W[0][0] = col0;
            int col1 = Pack.littleEndianToInt(key,  4);     W[0][1] = col1;
            int col2 = Pack.littleEndianToInt(key,  8);     W[0][2] = col2;
            int col3 = Pack.littleEndianToInt(key, 12);     W[0][3] = col3;

            int col4 = Pack.littleEndianToInt(key, 16);
            int col5 = Pack.littleEndianToInt(key, 20);

            int i = 1, rcon = 1, colx;
            for (;;)
            {
                                    W[i    ][0] = col4;
                                    W[i    ][1] = col5;
                colx = subWord(shift(col5, 8)) ^ rcon; rcon <<= 1;
                col0 ^= colx;       W[i    ][2] = col0;
                col1 ^= col0;       W[i    ][3] = col1;

                col2 ^= col1;       W[i + 1][0] = col2;
                col3 ^= col2;       W[i + 1][1] = col3;
                col4 ^= col3;       W[i + 1][2] = col4;
                col5 ^= col4;       W[i + 1][3] = col5;

                colx = subWord(shift(col5, 8)) ^ rcon; rcon <<= 1;
                col0 ^= colx;       W[i + 2][0] = col0;
                col1 ^= col0;       W[i + 2][1] = col1;
                col2 ^= col1;       W[i + 2][2] = col2;
                col3 ^= col2;       W[i + 2][3] = col3;

                if ((i += 3) >= 13)
                {
                    break;
                }

                col4 ^= col3;
                col5 ^= col4;
            }

            break;
        }
        case 8:
        {
            int col0 = Pack.littleEndianToInt(key,  0);     W[0][0] = col0;
            int col1 = Pack.littleEndianToInt(key,  4);     W[0][1] = col1;
            int col2 = Pack.littleEndianToInt(key,  8);     W[0][2] = col2;
            int col3 = Pack.littleEndianToInt(key, 12);     W[0][3] = col3;

            int col4 = Pack.littleEndianToInt(key, 16);     W[1][0] = col4;
            int col5 = Pack.littleEndianToInt(key, 20);     W[1][1] = col5;
            int col6 = Pack.littleEndianToInt(key, 24);     W[1][2] = col6;
            int col7 = Pack.littleEndianToInt(key, 28);     W[1][3] = col7;

            int i = 2, rcon = 1, colx;
            for (;;)
            {
                colx = subWord(shift(col7, 8)) ^ rcon; rcon <<= 1;
                col0 ^= colx;       W[i][0] = col0;
                col1 ^= col0;       W[i][1] = col1;
                col2 ^= col1;       W[i][2] = col2;
                col3 ^= col2;       W[i][3] = col3;
                ++i;

                if (i >= 15)
                {
                    break;
                }

                colx = subWord(col3);
                col4 ^= colx;       W[i][0] = col4;
                col5 ^= col4;       W[i][1] = col5;
                col6 ^= col5;       W[i][2] = col6;
                col7 ^= col6;       W[i][3] = col7;
                ++i;
            }

            break;
        }
        default:
        {
            throw new IllegalStateException("Should never get here");
        }
        }

        if (!forEncryption)
        {
            for (int j = 1; j < ROUNDS; j++)
            {
                for (int i = 0; i < 4; i++)
                {
                    W[j][i] = inv_mcol(W[j][i]);
                }
            }
        }

        return W;
    }

    private int         ROUNDS;
    private int[][]     WorkingKey = null;
    private boolean     forEncryption;

    private static final int BLOCK_SIZE = 16;

    /**
     * default constructor - 128 bit block size.
     */
    public AESLightEngine()
    {
    }

    /**
     * initialise an AES cipher.
     *
     * @param forEncryption whether or not we are for encryption.
     * @param params the parameters required to set up the cipher.
     * @exception IllegalArgumentException if the params argument is
     * inappropriate.
     */
    public void init(
        boolean           forEncryption,
        CipherParameters  params)
    {
        if (params instanceof KeyParameter)
        {
            WorkingKey = generateWorkingKey(((KeyParameter)params).getKey(), forEncryption);
            this.forEncryption = forEncryption;
            return;
        }

        throw new IllegalArgumentException("invalid parameter passed to AES init - " + params.getClass().getName());
    }

    public String getAlgorithmName()
    {
        return "AES";
    }

    public int getBlockSize()
    {
        return BLOCK_SIZE;
    }

    public int processBlock(byte[] in, int inOff, byte[] out, int outOff)
    {
        if (WorkingKey == null)
        {
            throw new IllegalStateException("AES engine not initialised");
        }

        if (inOff > (in.length - BLOCK_SIZE))
        {
            throw new DataLengthException("input buffer too short");
        }

        if (outOff > (out.length - BLOCK_SIZE))
        {
            throw new OutputLengthException("output buffer too short");
        }

        if (forEncryption)
        {
            encryptBlock(in, inOff, out, outOff, WorkingKey);
        }
        else
        {
            decryptBlock(in, inOff, out, outOff, WorkingKey);
        }

        return BLOCK_SIZE;
    }

    public void reset()
    {
    }

    private void encryptBlock(byte[] in, int inOff, byte[] out, int outOff, int[][] KW)
    {
        int C0 = Pack.littleEndianToInt(in, inOff +  0);
        int C1 = Pack.littleEndianToInt(in, inOff +  4);
        int C2 = Pack.littleEndianToInt(in, inOff +  8);
        int C3 = Pack.littleEndianToInt(in, inOff + 12);

        int t0 = C0 ^ KW[0][0];
        int t1 = C1 ^ KW[0][1];
        int t2 = C2 ^ KW[0][2];

        int r = 1, r0, r1, r2, r3 = C3 ^ KW[0][3];
        while (r < ROUNDS - 1)
        {
            r0 = mcol((S[t0&255]&255) ^ ((S[(t1>>8)&255]&255)<<8) ^ ((S[(t2>>16)&255]&255)<<16) ^ (S[(r3>>24)&255]<<24)) ^ KW[r][0];
            r1 = mcol((S[t1&255]&255) ^ ((S[(t2>>8)&255]&255)<<8) ^ ((S[(r3>>16)&255]&255)<<16) ^ (S[(t0>>24)&255]<<24)) ^ KW[r][1];
            r2 = mcol((S[t2&255]&255) ^ ((S[(r3>>8)&255]&255)<<8) ^ ((S[(t0>>16)&255]&255)<<16) ^ (S[(t1>>24)&255]<<24)) ^ KW[r][2];
            r3 = mcol((S[r3&255]&255) ^ ((S[(t0>>8)&255]&255)<<8) ^ ((S[(t1>>16)&255]&255)<<16) ^ (S[(t2>>24)&255]<<24)) ^ KW[r++][3];
            t0 = mcol((S[r0&255]&255) ^ ((S[(r1>>8)&255]&255)<<8) ^ ((S[(r2>>16)&255]&255)<<16) ^ (S[(r3>>24)&255]<<24)) ^ KW[r][0];
            t1 = mcol((S[r1&255]&255) ^ ((S[(r2>>8)&255]&255)<<8) ^ ((S[(r3>>16)&255]&255)<<16) ^ (S[(r0>>24)&255]<<24)) ^ KW[r][1];
            t2 = mcol((S[r2&255]&255) ^ ((S[(r3>>8)&255]&255)<<8) ^ ((S[(r0>>16)&255]&255)<<16) ^ (S[(r1>>24)&255]<<24)) ^ KW[r][2];
            r3 = mcol((S[r3&255]&255) ^ ((S[(r0>>8)&255]&255)<<8) ^ ((S[(r1>>16)&255]&255)<<16) ^ (S[(r2>>24)&255]<<24)) ^ KW[r++][3];
        }

        r0 = mcol((S[t0&255]&255) ^ ((S[(t1>>8)&255]&255)<<8) ^ ((S[(t2>>16)&255]&255)<<16) ^ (S[(r3>>24)&255]<<24)) ^ KW[r][0];
        r1 = mcol((S[t1&255]&255) ^ ((S[(t2>>8)&255]&255)<<8) ^ ((S[(r3>>16)&255]&255)<<16) ^ (S[(t0>>24)&255]<<24)) ^ KW[r][1];
        r2 = mcol((S[t2&255]&255) ^ ((S[(r3>>8)&255]&255)<<8) ^ ((S[(t0>>16)&255]&255)<<16) ^ (S[(t1>>24)&255]<<24)) ^ KW[r][2];
        r3 = mcol((S[r3&255]&255) ^ ((S[(t0>>8)&255]&255)<<8) ^ ((S[(t1>>16)&255]&255)<<16) ^ (S[(t2>>24)&255]<<24)) ^ KW[r++][3];

        // the final round is a simple function of S

        C0 = (S[r0&255]&255) ^ ((S[(r1>>8)&255]&255)<<8) ^ ((S[(r2>>16)&255]&255)<<16) ^ (S[(r3>>24)&255]<<24) ^ KW[r][0];
        C1 = (S[r1&255]&255) ^ ((S[(r2>>8)&255]&255)<<8) ^ ((S[(r3>>16)&255]&255)<<16) ^ (S[(r0>>24)&255]<<24) ^ KW[r][1];
        C2 = (S[r2&255]&255) ^ ((S[(r3>>8)&255]&255)<<8) ^ ((S[(r0>>16)&255]&255)<<16) ^ (S[(r1>>24)&255]<<24) ^ KW[r][2];
        C3 = (S[r3&255]&255) ^ ((S[(r0>>8)&255]&255)<<8) ^ ((S[(r1>>16)&255]&255)<<16) ^ (S[(r2>>24)&255]<<24) ^ KW[r][3];

        Pack.intToLittleEndian(C0, out, outOff +  0);
        Pack.intToLittleEndian(C1, out, outOff +  4);
        Pack.intToLittleEndian(C2, out, outOff +  8);
        Pack.intToLittleEndian(C3, out, outOff + 12);
    }

    private void decryptBlock(byte[] in, int inOff, byte[] out, int outOff, int[][] KW)
    {
        int C0 = Pack.littleEndianToInt(in, inOff +  0);
        int C1 = Pack.littleEndianToInt(in, inOff +  4);
        int C2 = Pack.littleEndianToInt(in, inOff +  8);
        int C3 = Pack.littleEndianToInt(in, inOff + 12);

        int t0 = C0 ^ KW[ROUNDS][0];
        int t1 = C1 ^ KW[ROUNDS][1];
        int t2 = C2 ^ KW[ROUNDS][2];

        int r = ROUNDS - 1, r0, r1, r2, r3 = C3 ^ KW[ROUNDS][3];
        while (r > 1)
        {
            r0 = inv_mcol((Si[t0&255]&255) ^ ((Si[(r3>>8)&255]&255)<<8) ^ ((Si[(t2>>16)&255]&255)<<16) ^ (Si[(t1>>24)&255]<<24)) ^ KW[r][0];
            r1 = inv_mcol((Si[t1&255]&255) ^ ((Si[(t0>>8)&255]&255)<<8) ^ ((Si[(r3>>16)&255]&255)<<16) ^ (Si[(t2>>24)&255]<<24)) ^ KW[r][1];
            r2 = inv_mcol((Si[t2&255]&255) ^ ((Si[(t1>>8)&255]&255)<<8) ^ ((Si[(t0>>16)&255]&255)<<16) ^ (Si[(r3>>24)&255]<<24)) ^ KW[r][2];
            r3 = inv_mcol((Si[r3&255]&255) ^ ((Si[(t2>>8)&255]&255)<<8) ^ ((Si[(t1>>16)&255]&255)<<16) ^ (Si[(t0>>24)&255]<<24)) ^ KW[r--][3];
            t0 = inv_mcol((Si[r0&255]&255) ^ ((Si[(r3>>8)&255]&255)<<8) ^ ((Si[(r2>>16)&255]&255)<<16) ^ (Si[(r1>>24)&255]<<24)) ^ KW[r][0];
            t1 = inv_mcol((Si[r1&255]&255) ^ ((Si[(r0>>8)&255]&255)<<8) ^ ((Si[(r3>>16)&255]&255)<<16) ^ (Si[(r2>>24)&255]<<24)) ^ KW[r][1];
            t2 = inv_mcol((Si[r2&255]&255) ^ ((Si[(r1>>8)&255]&255)<<8) ^ ((Si[(r0>>16)&255]&255)<<16) ^ (Si[(r3>>24)&255]<<24)) ^ KW[r][2];
            r3 = inv_mcol((Si[r3&255]&255) ^ ((Si[(r2>>8)&255]&255)<<8) ^ ((Si[(r1>>16)&255]&255)<<16) ^ (Si[(r0>>24)&255]<<24)) ^ KW[r--][3];
        }

        r0 = inv_mcol((Si[t0&255]&255) ^ ((Si[(r3>>8)&255]&255)<<8) ^ ((Si[(t2>>16)&255]&255)<<16) ^ (Si[(t1>>24)&255]<<24)) ^ KW[r][0];
        r1 = inv_mcol((Si[t1&255]&255) ^ ((Si[(t0>>8)&255]&255)<<8) ^ ((Si[(r3>>16)&255]&255)<<16) ^ (Si[(t2>>24)&255]<<24)) ^ KW[r][1];
        r2 = inv_mcol((Si[t2&255]&255) ^ ((Si[(t1>>8)&255]&255)<<8) ^ ((Si[(t0>>16)&255]&255)<<16) ^ (Si[(r3>>24)&255]<<24)) ^ KW[r][2];
        r3 = inv_mcol((Si[r3&255]&255) ^ ((Si[(t2>>8)&255]&255)<<8) ^ ((Si[(t1>>16)&255]&255)<<16) ^ (Si[(t0>>24)&255]<<24)) ^ KW[r][3];

        // the final round's table is a simple function of Si

        C0 = (Si[r0&255]&255) ^ ((Si[(r3>>8)&255]&255)<<8) ^ ((Si[(r2>>16)&255]&255)<<16) ^ (Si[(r1>>24)&255]<<24) ^ KW[0][0];
        C1 = (Si[r1&255]&255) ^ ((Si[(r0>>8)&255]&255)<<8) ^ ((Si[(r3>>16)&255]&255)<<16) ^ (Si[(r2>>24)&255]<<24) ^ KW[0][1];
        C2 = (Si[r2&255]&255) ^ ((Si[(r1>>8)&255]&255)<<8) ^ ((Si[(r0>>16)&255]&255)<<16) ^ (Si[(r3>>24)&255]<<24) ^ KW[0][2];
        C3 = (Si[r3&255]&255) ^ ((Si[(r2>>8)&255]&255)<<8) ^ ((Si[(r1>>16)&255]&255)<<16) ^ (Si[(r0>>24)&255]<<24) ^ KW[0][3];

        Pack.intToLittleEndian(C0, out, outOff +  0);
        Pack.intToLittleEndian(C1, out, outOff +  4);
        Pack.intToLittleEndian(C2, out, outOff +  8);
        Pack.intToLittleEndian(C3, out, outOff + 12);
    }
}