
package.src._util.converter.mjs Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of float16 Show documentation
Show all versions of float16 Show documentation
IEEE 754 half-precision floating-point for JavaScript
The newest version!
import {
EPSILON,
MathAbs,
NativeArrayBuffer,
NativeFloat32Array,
NativeUint16Array,
NativeUint32Array,
NativeUint8Array,
NumberIsFinite,
NumberIsNaN,
} from "./primordials.mjs";
const INVERSE_OF_EPSILON = 1 / EPSILON;
/**
* rounds to the nearest value;
* if the number falls midway, it is rounded to the nearest value with an even least significant digit
* @param {number} num
* @returns {number}
*/
function roundTiesToEven(num) {
return (num + INVERSE_OF_EPSILON) - INVERSE_OF_EPSILON;
}
const FLOAT16_MIN_VALUE = 6.103515625e-05;
const FLOAT16_MAX_VALUE = 65504;
const FLOAT16_EPSILON = 0.0009765625;
const FLOAT16_EPSILON_MULTIPLIED_BY_FLOAT16_MIN_VALUE = FLOAT16_EPSILON * FLOAT16_MIN_VALUE;
const FLOAT16_EPSILON_DEVIDED_BY_EPSILON = FLOAT16_EPSILON * INVERSE_OF_EPSILON;
/**
* round a number to a half float number
* @param {unknown} num - double float
* @returns {number} half float number
*/
export function roundToFloat16(num) {
const number = +num;
// NaN, Infinity, -Infinity, 0, -0
if (!NumberIsFinite(number) || number === 0) {
return number;
}
// finite except 0, -0
const sign = number > 0 ? 1 : -1;
const absolute = MathAbs(number);
// small number
if (absolute < FLOAT16_MIN_VALUE) {
return sign * roundTiesToEven(absolute / FLOAT16_EPSILON_MULTIPLIED_BY_FLOAT16_MIN_VALUE) * FLOAT16_EPSILON_MULTIPLIED_BY_FLOAT16_MIN_VALUE;
}
const temp = (1 + FLOAT16_EPSILON_DEVIDED_BY_EPSILON) * absolute;
const result = temp - (temp - absolute);
// large number
if (result > FLOAT16_MAX_VALUE || NumberIsNaN(result)) {
return sign * Infinity;
}
return sign * result;
}
// base algorithm: http://fox-toolkit.org/ftp/fasthalffloatconversion.pdf
const buffer = new NativeArrayBuffer(4);
const floatView = new NativeFloat32Array(buffer);
const uint32View = new NativeUint32Array(buffer);
const baseTable = new NativeUint16Array(512);
const shiftTable = new NativeUint8Array(512);
for (let i = 0; i < 256; ++i) {
const e = i - 127;
// very small number (0, -0)
if (e < -24) {
baseTable[i] = 0x0000;
baseTable[i | 0x100] = 0x8000;
shiftTable[i] = 24;
shiftTable[i | 0x100] = 24;
// small number (denorm)
} else if (e < -14) {
baseTable[i] = 0x0400 >> (-e - 14);
baseTable[i | 0x100] = (0x0400 >> (-e - 14)) | 0x8000;
shiftTable[i] = -e - 1;
shiftTable[i | 0x100] = -e - 1;
// normal number
} else if (e <= 15) {
baseTable[i] = (e + 15) << 10;
baseTable[i | 0x100] = ((e + 15) << 10) | 0x8000;
shiftTable[i] = 13;
shiftTable[i | 0x100] = 13;
// large number (Infinity, -Infinity)
} else if (e < 128) {
baseTable[i] = 0x7c00;
baseTable[i | 0x100] = 0xfc00;
shiftTable[i] = 24;
shiftTable[i | 0x100] = 24;
// stay (NaN, Infinity, -Infinity)
} else {
baseTable[i] = 0x7c00;
baseTable[i | 0x100] = 0xfc00;
shiftTable[i] = 13;
shiftTable[i | 0x100] = 13;
}
}
/**
* round a number to a half float number bits
* @param {unknown} num - double float
* @returns {number} half float number bits
*/
export function roundToFloat16Bits(num) {
floatView[0] = roundToFloat16(num);
const f = uint32View[0];
const e = (f >> 23) & 0x1ff;
return baseTable[e] + ((f & 0x007fffff) >> shiftTable[e]);
}
const mantissaTable = new NativeUint32Array(2048);
for (let i = 1; i < 1024; ++i) {
let m = i << 13; // zero pad mantissa bits
let e = 0; // zero exponent
// normalized
while ((m & 0x00800000) === 0) {
m <<= 1;
e -= 0x00800000; // decrement exponent
}
m &= ~0x00800000; // clear leading 1 bit
e += 0x38800000; // adjust bias
mantissaTable[i] = m | e;
}
for (let i = 1024; i < 2048; ++i) {
mantissaTable[i] = 0x38000000 + ((i - 1024) << 13);
}
const exponentTable = new NativeUint32Array(64);
for (let i = 1; i < 31; ++i) {
exponentTable[i] = i << 23;
}
exponentTable[31] = 0x47800000;
exponentTable[32] = 0x80000000;
for (let i = 33; i < 63; ++i) {
exponentTable[i] = 0x80000000 + ((i - 32) << 23);
}
exponentTable[63] = 0xc7800000;
const offsetTable = new NativeUint16Array(64);
for (let i = 1; i < 64; ++i) {
if (i !== 32) {
offsetTable[i] = 1024;
}
}
/**
* convert a half float number bits to a number
* @param {number} float16bits - half float number bits
* @returns {number} double float
*/
export function convertToNumber(float16bits) {
const i = float16bits >> 10;
uint32View[0] = mantissaTable[offsetTable[i] + (float16bits & 0x3ff)] + exponentTable[i];
return floatView[0];
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy