// Package k12 implements the KangarooTwelve eXtendable-Output Function (XOF).
//
// KangarooTwelve is being standardised in the CFRG research group of the
// IRTF. This package implements draft 10.
//
// https://datatracker.ietf.org/doc/draft-irtf-cfrg-kangarootwelve/10/
package k12
import (
"encoding/binary"
"github.com/cloudflare/circl/internal/sha3"
"github.com/cloudflare/circl/simd/keccakf1600"
)
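// exampleStream is an illustrative helper (not part of the API) sketching the
// streaming usage of the State defined below: absorb a message with Write and
// squeeze a digest with Read. The 32-byte output length and the nil context
// string are arbitrary choices, not requirements of the draft.
func exampleStream(msg []byte) [32]byte {
	var out [32]byte
	s := NewDraft10(nil) // nil means an empty context string C
	_, _ = s.Write(msg)
	_, _ = s.Read(out[:])
	return out
}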
const chunkSize = 8192 // aka B
// KangarooTwelve splits the message into chunks of 8192 bytes each.
// The first chunk is absorbed directly in a TurboSHAKE128 instance, which
// we call the stalk. The subsequent chunks aren't absorbed directly, but
// instead their hash is absorbed: they're like leaves on a stalk.
// If we have a fast parallel TurboSHAKE128 available, we buffer chunks until
// we have enough to hash them in parallel. If not, we absorb directly into
// a separate TurboSHAKE128 state.
type State struct {
initialTodo int // Bytes left to absorb for the first chunk.
stalk sha3.State
context []byte // context string "C" provided by the user
// Buffer of incoming data so we can hash chunks in parallel:
// nil when we haven't absorbed the first chunk yet;
// empty if we have, but do not have a fast parallel TurboSHAKE128;
// and chunkSize*lanes bytes long if we do have one.
buf []byte
offset int // offset in buf or bytes written to leaf
// Number of chunk hashes ("CV_i") absorbed into the stalk.
chunk uint
// TurboSHAKE128 instance used to compute a leaf hash when we don't have
// a fast parallel TurboSHAKE128, i.e. when lanes == 1.
leaf *sha3.State
lanes uint8 // number of TurboSHAKE128s to compute in parallel
}
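// numLeafChunks is an illustrative helper (not used by the implementation)
// that mirrors the chunking described above: the input is cut into chunks of
// chunkSize bytes; the first chunk goes into the stalk and every subsequent
// chunk becomes a leaf whose 32-byte chaining value is absorbed into the stalk.
func numLeafChunks(msgLen int) int {
	if msgLen <= chunkSize {
		return 0 // the whole message fits in the first chunk; no leaves
	}
	// Bytes beyond the first chunk, rounded up to whole chunks.
	rest := msgLen - chunkSize
	return (rest + chunkSize - 1) / chunkSize
}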
// NewDraft10 creates a new instance of KangarooTwelve draft version 10
// (draft-irtf-cfrg-kangarootwelve-10).
func NewDraft10(c []byte) State {
var lanes byte = 1
if keccakf1600.IsEnabledX4() {
lanes = 4
} else if keccakf1600.IsEnabledX2() {
lanes = 2
}
return newDraft10(c, lanes)
}
func newDraft10(c []byte, lanes byte) State {
return State{
initialTodo: chunkSize,
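		// 0x07 is the TurboSHAKE128 domain-separation byte used when the
		// whole input fits in a single chunk; Write switches the stalk to
		// 0x06 (final node) once the input grows beyond the first chunk.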
stalk: sha3.NewTurboShake128(0x07),
context: c,
lanes: lanes,
}
}
// Reset restores the State to its initial configuration, ready to hash a new
// message with the same context string and number of lanes.
func (s *State) Reset() {
s.initialTodo = chunkSize
s.stalk.Reset()
s.stalk.SwitchDS(0x07)
s.buf = nil
s.offset = 0
s.chunk = 0
}
// Clone returns a deep copy of the State.
func (s *State) Clone() State {
stalk := s.stalk.Clone().(*sha3.State)
ret := State{
initialTodo: s.initialTodo,
stalk: *stalk,
context: s.context,
offset: s.offset,
chunk: s.chunk,
lanes: s.lanes,
}
if s.leaf != nil {
ret.leaf = s.leaf.Clone().(*sha3.State)
}
if s.buf != nil {
ret.buf = make([]byte, len(s.buf))
copy(ret.buf, s.buf)
}
return ret
}
// Draft10Sum computes the KangarooTwelve draft 10 hash of msg with context
// string c and writes len(hash) bytes of output into hash.
func Draft10Sum(hash []byte, msg []byte, c []byte) {
// TODO Tweak number of lanes depending on the length of the message
s := NewDraft10(c)
_, _ = s.Write(msg)
_, _ = s.Read(hash)
}
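// exampleSum is an illustrative one-shot usage sketch (not part of the API):
// Draft10Sum fills hash with len(hash) bytes of output. The 32-byte output
// and the nil context string below are arbitrary choices.
func exampleSum(msg []byte) [32]byte {
	var digest [32]byte
	Draft10Sum(digest[:], msg, nil) // nil: empty context string C
	return digest
}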
// Write absorbs more message data into the State.
func (s *State) Write(p []byte) (int, error) {
written := len(p)
// The first chunk is written directly to the stalk.
if s.initialTodo > 0 {
taken := s.initialTodo
if len(p) < taken {
taken = len(p)
}
headP := p[:taken]
_, _ = s.stalk.Write(headP)
s.initialTodo -= taken
p = p[taken:]
}
if len(p) == 0 {
return written, nil
}
// If this is the first bit of data written after the initial chunk,
// we're out of the fast-path and allocate some buffers.
if s.buf == nil {
if s.lanes != 1 {
s.buf = make([]byte, int(s.lanes)*chunkSize)
} else {
// We create the buffer to signal we're past the first chunk,
// but do not use it.
s.buf = make([]byte, 0)
h := sha3.NewTurboShake128(0x0B)
s.leaf = &h
}
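		// Absorb the eight-byte padding 0x03 0x00…0x00 (the bit string
		// 110^62 from the draft) that separates the first chunk from the
		// chaining values, and switch the stalk's domain-separation byte
		// from 0x07 (single node) to 0x06 (final node of the tree).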
_, _ = s.stalk.Write([]byte{0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00})
s.stalk.SwitchDS(0x06)
}
// If we're just using one lane, we don't need to cache in a buffer
// for parallel hashing. Instead, we feed directly to TurboSHAKE.
if s.lanes == 1 {
for len(p) > 0 {
// Write to current leaf.
to := chunkSize - s.offset
if len(p) < to {
to = len(p)
}
_, _ = s.leaf.Write(p[:to])
p = p[to:]
s.offset += to
// Did we fill the chunk?
if s.offset == chunkSize {
var cv [32]byte
_, _ = s.leaf.Read(cv[:])
_, _ = s.stalk.Write(cv[:])
s.leaf.Reset()
s.offset = 0
s.chunk++
}
}
return written, nil
}
// If we can't fill all our lanes or the buffer isn't empty, we write the
// data to the buffer.
if s.offset != 0 || len(p) < len(s.buf) {
to := len(s.buf) - s.offset
if len(p) < to {
to = len(p)
}
p2 := p[:to]
p = p[to:]
copy(s.buf[s.offset:], p2)
s.offset += to
}
// Absorb the buffer if we filled it
if s.offset == len(s.buf) {
s.writeX(s.buf)
s.offset = 0
}
// At this point we may assume that s.offset == 0 when len(p) != 0.
if len(p) != 0 && s.offset != 0 {
panic("shouldn't happen")
}
// Absorb a bunch of chunks at the same time.
if len(p) >= int(s.lanes)*chunkSize {
p = s.writeX(p)
}
// Put the remainder in the buffer.
if len(p) > 0 {
copy(s.buf, p)
s.offset = len(p)
}
return written, nil
}
// writeX absorbs as many whole batches of lanes * chunkSize bytes from p as
// possible and returns the remainder.
func (s *State) writeX(p []byte) []byte {
switch s.lanes {
case 4:
return s.writeX4(p)
default:
return s.writeX2(p)
}
}
func (s *State) writeX4(p []byte) []byte {
for len(p) >= 4*chunkSize {
var x4 keccakf1600.StateX4
a := x4.Initialize(true)
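		// TurboSHAKE128 has a rate of 168 bytes, so each 8192-byte chunk is
		// absorbed as 48 full 168-byte blocks followed by a final 128-byte
		// partial block. The x4 state interleaves the four lanes word by
		// word: a[4*i+j] is word i of lane j.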
for offset := 0; offset < 48*168; offset += 168 {
for i := 0; i < 21; i++ {
a[i*4] ^= binary.LittleEndian.Uint64(
p[8*i+offset:],
)
a[i*4+1] ^= binary.LittleEndian.Uint64(
p[chunkSize+8*i+offset:],
)
a[i*4+2] ^= binary.LittleEndian.Uint64(
p[chunkSize*2+8*i+offset:],
)
a[i*4+3] ^= binary.LittleEndian.Uint64(
p[chunkSize*3+8*i+offset:],
)
}
x4.Permute()
}
for i := 0; i < 16; i++ {
a[i*4] ^= binary.LittleEndian.Uint64(
p[8*i+48*168:],
)
a[i*4+1] ^= binary.LittleEndian.Uint64(
p[chunkSize+8*i+48*168:],
)
a[i*4+2] ^= binary.LittleEndian.Uint64(
p[chunkSize*2+8*i+48*168:],
)
a[i*4+3] ^= binary.LittleEndian.Uint64(
p[chunkSize*3+8*i+48*168:],
)
}
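		// Append the leaf domain-separation byte 0x0B right after the chunk
		// data and set the final padding bit in the last byte of the
		// 168-byte rate block, then permute once more to finish each leaf.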
a[16*4] ^= 0x0b
a[16*4+1] ^= 0x0b
a[16*4+2] ^= 0x0b
a[16*4+3] ^= 0x0b
a[20*4] ^= 0x80 << 56
a[20*4+1] ^= 0x80 << 56
a[20*4+2] ^= 0x80 << 56
a[20*4+3] ^= 0x80 << 56
x4.Permute()
var buf [32 * 4]byte
for i := 0; i < 4; i++ {
binary.LittleEndian.PutUint64(buf[8*i:], a[4*i])
binary.LittleEndian.PutUint64(buf[32+8*i:], a[4*i+1])
binary.LittleEndian.PutUint64(buf[32*2+8*i:], a[4*i+2])
binary.LittleEndian.PutUint64(buf[32*3+8*i:], a[4*i+3])
}
_, _ = s.stalk.Write(buf[:])
p = p[chunkSize*4:]
s.chunk += 4
}
return p
}
func (s *State) writeX2(p []byte) []byte {
// TODO On M2 Pro, 1/3 of the time is spent in this function (mostly in
// LittleEndian.Uint64), excluding the actual permutation. Rewriting it in
// assembler might be worthwhile.
for len(p) >= 2*chunkSize {
var x2 keccakf1600.StateX2
a := x2.Initialize(true)
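		// Same layout as writeX4, but with two interleaved lanes:
		// a[2*i+j] is word i of lane j.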
for offset := 0; offset < 48*168; offset += 168 {
for i := 0; i < 21; i++ {
a[i*2] ^= binary.LittleEndian.Uint64(
p[8*i+offset:],
)
a[i*2+1] ^= binary.LittleEndian.Uint64(
p[chunkSize+8*i+offset:],
)
}
x2.Permute()
}
for i := 0; i < 16; i++ {
a[i*2] ^= binary.LittleEndian.Uint64(
p[8*i+48*168:],
)
a[i*2+1] ^= binary.LittleEndian.Uint64(
p[chunkSize+8*i+48*168:],
)
}
a[16*2] ^= 0x0b
a[16*2+1] ^= 0x0b
a[20*2] ^= 0x80 << 56
a[20*2+1] ^= 0x80 << 56
x2.Permute()
var buf [32 * 2]byte
for i := 0; i < 4; i++ {
binary.LittleEndian.PutUint64(buf[8*i:], a[2*i])
binary.LittleEndian.PutUint64(buf[32+8*i:], a[2*i+1])
}
_, _ = s.stalk.Write(buf[:])
p = p[chunkSize*2:]
s.chunk += 2
}
return p
}
// Read finalizes the absorbing phase on its first call and then squeezes
// output from the stalk into p.
func (s *State) Read(p []byte) (int, error) {
if s.stalk.IsAbsorbing() {
// Write context string C
_, _ = s.Write(s.context)
// Write length_encode( |C| )
var buf [9]byte
binary.BigEndian.PutUint64(buf[:8], uint64(len(s.context)))
// Find the first non-zero byte in the big-endian encoding of the context length
i := 0
for buf[i] == 0 && i < 8 {
i++
}
buf[8] = byte(8 - i) // number of bytes to represent |C|
_, _ = s.Write(buf[i:])
// We need to write the chunk number if we're past the first chunk.
if s.buf != nil {
// Write last remaining chunk(s)
var cv [32]byte
if s.lanes == 1 {
if s.offset != 0 {
_, _ = s.leaf.Read(cv[:])
_, _ = s.stalk.Write(cv[:])
s.chunk++
}
} else {
remainingBuf := s.buf[:s.offset]
for len(remainingBuf) > 0 {
h := sha3.NewTurboShake128(0x0B)
to := chunkSize
if len(remainingBuf) < to {
to = len(remainingBuf)
}
_, _ = h.Write(remainingBuf[:to])
_, _ = h.Read(cv[:])
_, _ = s.stalk.Write(cv[:])
s.chunk++
remainingBuf = remainingBuf[to:]
}
}
// Write length_encode( chunk )
binary.BigEndian.PutUint64(buf[:8], uint64(s.chunk))
// Find the first non-zero byte in the big-endian encoding of the number of chunks
i = 0
for buf[i] == 0 && i < 8 {
i++
}
buf[8] = byte(8 - i) // number of bytes to represent number of chunks.
_, _ = s.stalk.Write(buf[i:])
_, _ = s.stalk.Write([]byte{0xff, 0xff})
}
}
return s.stalk.Read(p)
}
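// lengthEncode is an illustrative sketch (not used by the implementation,
// which inlines the same logic twice in Read above) of the draft's
// length_encode(x): the big-endian bytes of x with leading zeros stripped,
// followed by one byte giving how many bytes that encoding used.
func lengthEncode(x uint64) []byte {
	var buf [9]byte
	binary.BigEndian.PutUint64(buf[:8], x)
	i := 0
	for i < 8 && buf[i] == 0 {
		i++
	}
	buf[8] = byte(8 - i) // number of bytes used to represent x
	return buf[i:]       // e.g. lengthEncode(0) == []byte{0x00}
}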