native.intel.packet.gcm_pc.gcm_pc128w.c Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of bcprov-lts8on Show documentation
Show all versions of bcprov-lts8on Show documentation
The Long Term Stable (LTS) Bouncy Castle Crypto package is a Java implementation of cryptographic algorithms. This jar contains the JCA/JCE provider and low-level API for the BC LTS version 2.73.7 for Java 8 and later.
#include
#include
#include "gcm_pc.h"
#include "gcm_pcHash128.h"
#include
/**
 * Pushes input through the GCM packet-cipher encryption pipeline in groups of four 16-byte blocks.
 *
 * Two stages run in order:
 *
 *  1. Fast path - taken only when `encryption` is set, nothing is buffered (*bufBlockIndex == 0) and
 *     both inlen and outputLen are at least FOUR_BLOCKS. Groups of four blocks are read straight from
 *     `in`. The hash reduction of one group of cipher texts is interleaved with the encryption of the
 *     next group, and a trailing reduction folds the last group into X.
 *
 *  2. Buffered path - always runs. Up to (bufBlockLen - *bufBlockIndex) of the remaining input bytes
 *     are appended to bufBlock. When that fills the buffer, the buffered four blocks are encrypted,
 *     written to `out` and folded into X.
 *
 * @param in              input bytes
 * @param inlen           number of bytes available at `in`
 * @param out             output buffer; may be NULL only if no output ends up being produced
 * @param outputLen       space available at `out`
 * @param read            set to the number of bytes consumed from `in`
 * @param written         set to the number of bytes stored to `out`
 * @param encryption      enables the fast path when true
 * @param bufBlockIndex   in/out: number of bytes currently held in bufBlock
 * @param blocksRemaining in/out: block budget, reduced by 4 for every group processed
 * @param hashKeys        hash key table; entries HASHKEY_0 .. HASHKEY_0 - 3 are used
 * @param ctr1            in/out: counter, left at the last counter value used
 * @param roundKeys       round key schedule, indices 0 .. num_rounds are read
 * @param num_rounds      index of the final round key
 * @param totalBytes      in/out: running count of bytes accepted (processed or buffered)
 * @param X               in/out: running hash state
 * @param bufBlockLen     capacity of bufBlock that triggers processing when reached
 * @param bufBlock        staging buffer for partial groups
 * @return NULL on success, otherwise the error produced by make_packet_error. On error the in/out
 *         state may already have been partially updated.
 */
packet_err *gcm_pc_process_buffer_enc(
        uint8_t *in,
        size_t inlen,
        uint8_t *out,
        size_t outputLen,
        size_t *read,
        size_t *written,
        bool encryption,
        size_t *bufBlockIndex,
        int64_t *blocksRemaining,
        __m128i *hashKeys,
        __m128i *ctr1,
        __m128i *roundKeys,
        int num_rounds,
        size_t *totalBytes,
        __m128i *X,
        size_t bufBlockLen,
        uint8_t *bufBlock) {

    *read = *written = 0;

    if (encryption && *bufBlockIndex == 0 && inlen >= FOUR_BLOCKS && outputLen >= FOUR_BLOCKS) {
        // Special case when nothing is buffered, and we have at least 4 blocks to process, and we are
        // doing encryption.
        //
        // The hash is calculated on the cipher text so if we are going to interleave reduction and
        // encryption then the reduction is always going to be on the previous cipher texts.
        // Eg:
        // 1. Create initial cipher texts
        // 2. Create subsequent cipher texts supplying previous cipher texts for reduction.
        // 3. Loop back to 2 until input is consumed.
        // 4. Final trailing reduction
        //
        if (out == NULL) {
            //
            // Java api may supply a null output array if it expects no output, however
            // if output does occur then we need to catch that here.
            //
            return make_packet_error("out is null, output generated when no output was expected by caller",
                                     ILLEGAL_ARGUMENT);
        }

        if (*blocksRemaining < 4) {
            return make_packet_error("attempt to process too many blocks in GCM", ILLEGAL_ARGUMENT);
        }
        *blocksRemaining -= 4;

        // Hash keys are constant throughout.
        const __m128i h4 = hashKeys[HASHKEY_0];
        const __m128i h3 = hashKeys[(HASHKEY_0 - 1)];
        const __m128i h2 = hashKeys[(HASHKEY_0 - 2)];
        const __m128i h1 = hashKeys[(HASHKEY_0 - 3)];

        // Initial set of 4 blocks.
        __m128i id0 = _mm_loadu_si128((__m128i *) &in[0 * 16]);
        __m128i id1 = _mm_loadu_si128((__m128i *) &in[1 * 16]);
        __m128i id2 = _mm_loadu_si128((__m128i *) &in[2 * 16]);
        __m128i id3 = _mm_loadu_si128((__m128i *) &in[3 * 16]);

        // Four consecutive counter values; *ctr1 is advanced first so the stored counter is never reused.
        *ctr1 = _mm_add_epi32(*ctr1, *ONE);
        __m128i ctr2 = _mm_add_epi32(*ctr1, *ONE);
        __m128i ctr3 = _mm_add_epi32(ctr2, *ONE);
        __m128i ctr4 = _mm_add_epi32(ctr3, *ONE);

        __m128i tmp1 = _mm_shuffle_epi8(*ctr1, *BSWAP_EPI64);
        __m128i tmp2 = _mm_shuffle_epi8(ctr2, *BSWAP_EPI64);
        __m128i tmp3 = _mm_shuffle_epi8(ctr3, *BSWAP_EPI64);
        __m128i tmp4 = _mm_shuffle_epi8(ctr4, *BSWAP_EPI64);

        // No previous cipher texts exist yet, so this first group is encrypted without a reduction.
        apply_aes_no_reduction(
                &id0, &id1, &id2, &id3,
                tmp1, tmp2, tmp3, tmp4,
                roundKeys, num_rounds
        );

        _mm_storeu_si128((__m128i *) &out[0 * 16], id0);
        _mm_storeu_si128((__m128i *) &out[1 * 16], id1);
        _mm_storeu_si128((__m128i *) &out[2 * 16], id2);
        _mm_storeu_si128((__m128i *) &out[3 * 16], id3);

        // id0..3 now hold the initial cipher texts shuffled with BSWAP_MASK, ready for hashing.
        id0 = _mm_shuffle_epi8(id0, *BSWAP_MASK);
        id1 = _mm_shuffle_epi8(id1, *BSWAP_MASK);
        id2 = _mm_shuffle_epi8(id2, *BSWAP_MASK);
        id3 = _mm_shuffle_epi8(id3, *BSWAP_MASK);

        *written += FOUR_BLOCKS;
        *read += FOUR_BLOCKS;
        *totalBytes += FOUR_BLOCKS;
        inlen -= FOUR_BLOCKS;
        outputLen -= FOUR_BLOCKS;
        in += FOUR_BLOCKS;
        out += FOUR_BLOCKS;
        *ctr1 = ctr4;

        while (inlen >= FOUR_BLOCKS && outputLen >= FOUR_BLOCKS) {
            if (*blocksRemaining < 4) {
                return make_packet_error("attempt to process too many blocks in GCM", ILLEGAL_ARGUMENT);
            }
            *blocksRemaining -= 4;

            // Encrypt next set of 4 blocks passing the result of the last encryption for reduction.
            __m128i d0 = _mm_loadu_si128((__m128i *) &in[0 * 16]);
            __m128i d1 = _mm_loadu_si128((__m128i *) &in[1 * 16]);
            __m128i d2 = _mm_loadu_si128((__m128i *) &in[2 * 16]);
            __m128i d3 = _mm_loadu_si128((__m128i *) &in[3 * 16]);

            *ctr1 = _mm_add_epi32(*ctr1, *ONE);
            ctr2 = _mm_add_epi32(*ctr1, *ONE);
            ctr3 = _mm_add_epi32(ctr2, *ONE);
            ctr4 = _mm_add_epi32(ctr3, *ONE);

            tmp1 = _mm_shuffle_epi8(*ctr1, *BSWAP_EPI64);
            tmp2 = _mm_shuffle_epi8(ctr2, *BSWAP_EPI64);
            tmp3 = _mm_shuffle_epi8(ctr3, *BSWAP_EPI64);
            tmp4 = _mm_shuffle_epi8(ctr4, *BSWAP_EPI64);

            // The running hash is mixed into the first of the previous cipher texts before reduction.
            id0 = _mm_xor_si128(id0, *X);
            apply_aes_with_reduction(&d0, &d1, &d2, &d3,
                                     id0, id1, id2, id3,
                                     h1, h2, h3, h4,
                                     tmp1, tmp2, tmp3, tmp4,
                                     roundKeys, X, num_rounds);

            _mm_storeu_si128((__m128i *) &out[0 * 16], d0);
            _mm_storeu_si128((__m128i *) &out[1 * 16], d1);
            _mm_storeu_si128((__m128i *) &out[2 * 16], d2);
            _mm_storeu_si128((__m128i *) &out[3 * 16], d3);

            // id0..3 are now the last cipher texts, shuffled with BSWAP_MASK.
            id0 = _mm_shuffle_epi8(d0, *BSWAP_MASK);
            id1 = _mm_shuffle_epi8(d1, *BSWAP_MASK);
            id2 = _mm_shuffle_epi8(d2, *BSWAP_MASK);
            id3 = _mm_shuffle_epi8(d3, *BSWAP_MASK);

            *written += FOUR_BLOCKS;
            *read += FOUR_BLOCKS;
            *totalBytes += FOUR_BLOCKS;
            inlen -= FOUR_BLOCKS;
            outputLen -= FOUR_BLOCKS;
            in += FOUR_BLOCKS;
            out += FOUR_BLOCKS;
            *ctr1 = ctr4;
        }

        //
        // Do trailing reduction: the last group of cipher texts has not been hashed yet.
        //
        id0 = _mm_xor_si128(id0, *X);
        gfmul_multi_reduce(
                id0, id1, id2, id3,
                h1, h2, h3, h4,
                X);

        // fall through to the buffering code below, which stores any trailing bytes
    }

    // Append as much of the remaining input as the buffer can take.
    size_t rem = bufBlockLen - *bufBlockIndex;
    size_t toCopy = inlen < rem ? inlen : rem;
    memcpy(bufBlock + *bufBlockIndex, in, toCopy);
    *bufBlockIndex += toCopy;
    *totalBytes += toCopy;

    if (*bufBlockIndex == bufBlockLen) {
        if (outputLen < FOUR_BLOCKS) {
            return make_packet_error("output len too short", OUTPUT_LENGTH);
        }

        if (out == NULL) {
            //
            // Java api may supply a null output array if it expects no output, however
            // if output does occur then we need to catch that here.
            //
            return make_packet_error("out is null, output generated when no output was expected by caller",
                                     ILLEGAL_ARGUMENT);
        }

        if (*blocksRemaining < 4) {
            return make_packet_error("attempt to process too many blocks in GCM", ILLEGAL_ARGUMENT);
        }
        *blocksRemaining -= 4;

        const __m128i h4 = hashKeys[HASHKEY_0];
        const __m128i h3 = hashKeys[(HASHKEY_0 - 1)];
        const __m128i h2 = hashKeys[(HASHKEY_0 - 2)];
        const __m128i h1 = hashKeys[(HASHKEY_0 - 3)];

        *ctr1 = _mm_add_epi32(*ctr1, *ONE);
        __m128i ctr2 = _mm_add_epi32(*ctr1, *ONE);
        __m128i ctr3 = _mm_add_epi32(ctr2, *ONE);
        __m128i ctr4 = _mm_add_epi32(ctr3, *ONE);

        __m128i tmp1 = _mm_shuffle_epi8(*ctr1, *BSWAP_EPI64);
        __m128i tmp2 = _mm_shuffle_epi8(ctr2, *BSWAP_EPI64);
        __m128i tmp3 = _mm_shuffle_epi8(ctr3, *BSWAP_EPI64);
        __m128i tmp4 = _mm_shuffle_epi8(ctr4, *BSWAP_EPI64);

        aes_xor(&tmp1, &tmp2, &tmp3, &tmp4, roundKeys[0]);

        // The blocks to encrypt are the buffered ones, not `in`: the buffer may start with bytes kept
        // from an earlier call, and `in` is only guaranteed to hold the toCopy bytes copied above.
        __m128i in1 = _mm_loadu_si128((__m128i *) &bufBlock[0 * 16]);
        __m128i in2 = _mm_loadu_si128((__m128i *) &bufBlock[1 * 16]);
        __m128i in3 = _mm_loadu_si128((__m128i *) &bufBlock[2 * 16]);
        __m128i in4 = _mm_loadu_si128((__m128i *) &bufBlock[3 * 16]);

        // j is declared outside the loop because the final round uses roundKeys[num_rounds].
        int j;
        for (j = 1; j < num_rounds; j++) {
            aes_enc(&tmp1, &tmp2, &tmp3, &tmp4, roundKeys[j]);
        }
        aes_enc_last(&tmp1, &tmp2, &tmp3, &tmp4, roundKeys[j]);

        // Combine the encrypted counters with the buffered data to form the cipher texts.
        tmp1 = _mm_xor_si128(tmp1, in1);
        tmp2 = _mm_xor_si128(tmp2, in2);
        tmp3 = _mm_xor_si128(tmp3, in3);
        tmp4 = _mm_xor_si128(tmp4, in4);

        _mm_storeu_si128((__m128i *) &out[0 * 16], tmp1);
        _mm_storeu_si128((__m128i *) &out[1 * 16], tmp2);
        _mm_storeu_si128((__m128i *) &out[2 * 16], tmp3);
        _mm_storeu_si128((__m128i *) &out[3 * 16], tmp4);

        // Fold the cipher texts just produced into the running hash.
        tmp1 = _mm_shuffle_epi8(tmp1, *BSWAP_MASK);
        tmp2 = _mm_shuffle_epi8(tmp2, *BSWAP_MASK);
        tmp3 = _mm_shuffle_epi8(tmp3, *BSWAP_MASK);
        tmp4 = _mm_shuffle_epi8(tmp4, *BSWAP_MASK);

        tmp1 = _mm_xor_si128(tmp1, *X);
        gfmul_multi_reduce(tmp1, tmp2, tmp3, tmp4,
                           h1, h2, h3, h4,
                           X);

        *ctr1 = ctr4;
        *bufBlockIndex -= FOUR_BLOCKS;
        *written += FOUR_BLOCKS;
    }

    *read += toCopy;
    return NULL;
}
/**
 * Processes exactly four 16-byte blocks read from `in` and stores the four resulting blocks to `out`.
 *
 * The four input blocks, four consecutive counter values (each shuffled with BSWAP_EPI64), the hash
 * keys and the running hash X are handed to apply_aes_with_reduction_dec, which updates the blocks
 * in place (presumably decrypting them and folding the cipher text into X - confirm against that
 * helper).
 *
 * @param in              source of the four input blocks (64 bytes are read)
 * @param out             destination for the four output blocks (64 bytes are written); must not be NULL
 * @param blocksRemaining in/out: block budget; must be at least 4 and is reduced by 4
 * @param hashKeys        hash key table; entries HASHKEY_0 .. HASHKEY_0 - 3 are used
 * @param num_rounds      round count forwarded to the helper
 * @param ctr1            in/out: counter; on success left at the fourth counter value used
 * @param roundKeys       round key schedule forwarded to the helper
 * @param X               in/out: running hash state forwarded to the helper
 * @return NULL on success, otherwise the error produced by make_packet_error
 */
packet_err *gcm_pc_processFourBlocks_dec(uint8_t *in, uint8_t *out, int64_t *blocksRemaining, __m128i *hashKeys,
                                         const int num_rounds, __m128i *ctr1, __m128i *roundKeys, __m128i *X) {
    if (out == NULL) {
        // The Java API may pass a null output array when it expects no output; reaching this
        // point means output is about to be produced, so that has to be rejected.
        return make_packet_error("out is null, output generated when no output was expected by caller",
                                 ILLEGAL_ARGUMENT);
    }

    // The four hash keys, highest index first.
    const __m128i key4 = hashKeys[HASHKEY_0];
    const __m128i key3 = hashKeys[(HASHKEY_0 - 1)];
    const __m128i key2 = hashKeys[(HASHKEY_0 - 2)];
    const __m128i key1 = hashKeys[(HASHKEY_0 - 3)];

    if (*blocksRemaining < 4) {
        return make_packet_error("attempt to process too many blocks in GCM", ILLEGAL_ARGUMENT);
    }
    *blocksRemaining -= 4;

    // Four consecutive counters following the stored one, plus their shuffled forms.
    __m128i counter[4];
    __m128i shuffled[4];
    counter[0] = _mm_add_epi32(*ctr1, *ONE);
    for (int i = 1; i < 4; i++) {
        counter[i] = _mm_add_epi32(counter[i - 1], *ONE);
    }
    for (int i = 0; i < 4; i++) {
        shuffled[i] = _mm_shuffle_epi8(counter[i], *BSWAP_EPI64);
    }

    // Unaligned loads of the four input blocks.
    __m128i block[4];
    for (int i = 0; i < 4; i++) {
        block[i] = _mm_loadu_si128((__m128i *) &in[i * 16]);
    }

    apply_aes_with_reduction_dec(&block[0], &block[1], &block[2], &block[3],
                                 key1, key2, key3, key4,
                                 shuffled[0], shuffled[1], shuffled[2], shuffled[3],
                                 roundKeys, X, num_rounds);

    for (int i = 0; i < 4; i++) {
        _mm_storeu_si128((__m128i *) &out[i * 16], block[i]);
    }

    // Remember the last counter used so the next call continues from it.
    *ctr1 = counter[3];
    return NULL;
}
/**
 * Pushes input through the GCM packet-cipher decryption pipeline in groups of four 16-byte blocks,
 * using bufBlock to hold back bytes that cannot be processed yet.
 *
 * Processing happens in up to three stages:
 *
 *  1. Drain: if bytes are already buffered and buffer plus input reach bufBlockLen, a full group
 *     held in the buffer is processed, then the buffer is topped up from `in` to one group and
 *     processed again, leaving the buffer empty.
 *  2. Bulk: with an empty buffer, groups are processed straight from `in` for as long as at least
 *     bufBlockLen input bytes and FOUR_BLOCKS bytes of output space remain.
 *  3. Buffer: otherwise input is appended to bufBlock; when the buffer reaches bufBlockLen its
 *     first group is processed and the trailing macBlockLen bytes are moved to the front.
 *
 * @param in              input bytes
 * @param inlen           number of bytes available at `in`
 * @param out             output buffer
 * @param outputLen       space available at `out`
 * @param read            set to the number of bytes consumed from `in`
 * @param written         set to the number of bytes stored to `out`
 * @param bufBlockIndex   in/out: number of bytes currently held in bufBlock
 * @param blocksRemaining in/out: block budget, reduced by 4 for every group processed
 * @param hashKeys        hash key table; entries HASHKEY_0 .. HASHKEY_0 - 3 are used
 * @param ctr1            in/out: counter, left at the last counter value used
 * @param roundKeys       round key schedule
 * @param num_rounds      round count forwarded to the AES helpers
 * @param totalBytes      in/out: running count of bytes accepted from `in`
 * @param X               in/out: running hash state
 * @param bufBlockLen     buffer fill level that triggers processing
 *                        (presumably FOUR_BLOCKS + macBlockLen - confirm with caller)
 * @param bufBlock        staging buffer
 * @param macBlockLen     number of trailing bytes kept in the buffer after a group is processed
 * @return NULL on success, otherwise the error produced by make_packet_error. On error the in/out
 *         state may already have been partially updated.
 */
packet_err *gcm_pc_process_buffer_dec(uint8_t *in, size_t inlen, uint8_t *out, size_t outputLen, size_t *read,
                                      size_t *written, size_t *bufBlockIndex, int64_t *blocksRemaining,
                                      __m128i *hashKeys, __m128i *ctr1, __m128i *roundKeys, int num_rounds,
                                      size_t *totalBytes, __m128i *X, size_t bufBlockLen, uint8_t *bufBlock,
                                      size_t macBlockLen) {
    *read = *written = 0;

    if (*bufBlockIndex > 0 && *bufBlockIndex + inlen >= bufBlockLen) {
        // The buffer already holds four or more blocks worth of data.
        // Process them now and copy any residual back to the start of the buffer.
        if (*bufBlockIndex >= FOUR_BLOCKS) {
            if (outputLen < FOUR_BLOCKS) {
                return make_packet_error("output len too short", OUTPUT_LENGTH);
            }
            packet_err *err = gcm_pc_processFourBlocks_dec(bufBlock, out, blocksRemaining, hashKeys,
                                                           num_rounds, ctr1, roundKeys, X);
            if (err != NULL) {
                return err;
            }
            *written += FOUR_BLOCKS;
            outputLen -= FOUR_BLOCKS;
            out += FOUR_BLOCKS;

            //
            // Move the bytes that followed the processed group to the start of the buffer.
            // They live at [FOUR_BLOCKS, *bufBlockIndex); memmove is used because source and
            // destination are in the same buffer. Internal copy so read does not change.
            //
            size_t toCopy = *bufBlockIndex - FOUR_BLOCKS;
            memmove(bufBlock, bufBlock + FOUR_BLOCKS, toCopy);
            *bufBlockIndex = toCopy;
        }

        //
        // There may still be data in the buffer but less than before; does
        // our condition for rounding the buffer out still hold with respect
        // to the available input?
        //
        if (*bufBlockIndex > 0 && *bufBlockIndex + inlen >= bufBlockLen) {
            size_t toCopy = FOUR_BLOCKS - *bufBlockIndex;

            // Copy from the input what we need to round out the buffer.
            memcpy(bufBlock + *bufBlockIndex, in, toCopy);
            if (outputLen < FOUR_BLOCKS) {
                return make_packet_error("output len too short", OUTPUT_LENGTH);
            }
            packet_err *err = gcm_pc_processFourBlocks_dec(bufBlock, out, blocksRemaining, hashKeys,
                                                           num_rounds, ctr1, roundKeys, X);
            if (err != NULL) {
                return err;
            }
            *bufBlockIndex = 0;
            *written += FOUR_BLOCKS;
            *read += toCopy;
            *totalBytes += toCopy;
            outputLen -= FOUR_BLOCKS;
            in += toCopy;
            // Keep the remaining-length in step with the advanced pointer, otherwise the stages
            // below would read past the end of the caller's input and over-report *read.
            inlen -= toCopy;
            out += FOUR_BLOCKS;
        }
    }

    if (*bufBlockIndex == 0 && inlen >= bufBlockLen && outputLen >= FOUR_BLOCKS) {
        //
        // Bulk decryption.
        //

        // Hash keys are constant throughout.
        const __m128i h4 = hashKeys[HASHKEY_0];
        const __m128i h3 = hashKeys[(HASHKEY_0 - 1)];
        const __m128i h2 = hashKeys[(HASHKEY_0 - 2)];
        const __m128i h1 = hashKeys[(HASHKEY_0 - 3)];

        __m128i d0, d1, d2, d3, tmp12, tmp34, tmp56, tmp78;

        // The loop insists on bufBlockLen (not FOUR_BLOCKS) input bytes, so the bytes beyond the
        // last full group are never consumed here and are left for a later call to buffer.
        while (inlen >= bufBlockLen && outputLen >= FOUR_BLOCKS) {
            if (*blocksRemaining < 4) {
                return make_packet_error("attempt to process too many blocks in GCM", ILLEGAL_ARGUMENT);
            }
            *blocksRemaining -= 4;

            d0 = _mm_loadu_si128((__m128i *) &in[0 * 16]);
            d1 = _mm_loadu_si128((__m128i *) &in[1 * 16]);
            d2 = _mm_loadu_si128((__m128i *) &in[2 * 16]);
            d3 = _mm_loadu_si128((__m128i *) &in[3 * 16]);

            *ctr1 = _mm_add_epi32(*ctr1, *ONE);
            __m128i ctr2 = _mm_add_epi32(*ctr1, *ONE);
            __m128i ctr3 = _mm_add_epi32(ctr2, *ONE);
            __m128i ctr4 = _mm_add_epi32(ctr3, *ONE);

            tmp12 = _mm_shuffle_epi8(*ctr1, *BSWAP_EPI64);
            tmp34 = _mm_shuffle_epi8(ctr2, *BSWAP_EPI64);
            tmp56 = _mm_shuffle_epi8(ctr3, *BSWAP_EPI64);
            tmp78 = _mm_shuffle_epi8(ctr4, *BSWAP_EPI64);
            *ctr1 = ctr4;

            apply_aes_with_reduction_dec(&d0, &d1, &d2, &d3,
                                         h1, h2, h3, h4,
                                         tmp12, tmp34, tmp56, tmp78,
                                         roundKeys, X, num_rounds);

            _mm_storeu_si128((__m128i *) &out[0 * 16], d0);
            _mm_storeu_si128((__m128i *) &out[1 * 16], d1);
            _mm_storeu_si128((__m128i *) &out[2 * 16], d2);
            _mm_storeu_si128((__m128i *) &out[3 * 16], d3);

            *written += FOUR_BLOCKS;
            *read += FOUR_BLOCKS;
            *totalBytes += FOUR_BLOCKS;
            inlen -= FOUR_BLOCKS;
            outputLen -= FOUR_BLOCKS;
            in += FOUR_BLOCKS;
            out += FOUR_BLOCKS;
        }
    } else if (*bufBlockIndex == 0 && inlen >= bufBlockLen) {
        // Enough input for a full group and nothing buffered, yet the bulk branch was not taken:
        // the only way to get here is outputLen < FOUR_BLOCKS.
        return make_packet_error("output len too short", OUTPUT_LENGTH);
    } else {
        // Not enough input for bulk processing (or data is already buffered): top up the buffer.
        size_t rem = bufBlockLen - *bufBlockIndex;
        size_t toCopy = inlen < rem ? inlen : rem;
        memcpy(bufBlock + *bufBlockIndex, in, toCopy);
        *bufBlockIndex += toCopy;
        *totalBytes += toCopy;

        if (*bufBlockIndex == bufBlockLen) {
            if (outputLen < FOUR_BLOCKS) {
                return make_packet_error("output len too short", OUTPUT_LENGTH);
            }
            packet_err *err = gcm_pc_processFourBlocks_dec(bufBlock, out, blocksRemaining, hashKeys,
                                                           num_rounds, ctr1, roundKeys, X);
            if (err != NULL) {
                return err;
            }

            // Slide the bytes that follow the processed group to the front of the buffer;
            // a single 128-bit move covers the common 16-byte case.
            if (macBlockLen == 16) {
                _mm_storeu_si128((__m128i *) bufBlock,
                                 _mm_loadu_si128((__m128i *) (bufBlock + FOUR_BLOCKS)));
            } else {
                memcpy(bufBlock, bufBlock + FOUR_BLOCKS, macBlockLen);
            }
            *bufBlockIndex -= FOUR_BLOCKS;
            *written += FOUR_BLOCKS;
        }
        *read += toCopy;
    }

    return NULL;
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy