org.graalvm.compiler.lir.amd64.AMD64MathCosOp Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of compiler Show documentation
Show all versions of compiler Show documentation
The GraalVM compiler and the Graal-truffle optimizer.
/*
* Copyright (c) 2018, 2022, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2016, Intel Corporation. All rights reserved.
* Intel Math Library (LIBM) Source Code
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package org.graalvm.compiler.lir.amd64;
import static jdk.vm.ci.amd64.AMD64.r10;
import static jdk.vm.ci.amd64.AMD64.r11;
import static jdk.vm.ci.amd64.AMD64.r8;
import static jdk.vm.ci.amd64.AMD64.r9;
import static jdk.vm.ci.amd64.AMD64.rax;
import static jdk.vm.ci.amd64.AMD64.rbx;
import static jdk.vm.ci.amd64.AMD64.rcx;
import static jdk.vm.ci.amd64.AMD64.rdi;
import static jdk.vm.ci.amd64.AMD64.rdx;
import static jdk.vm.ci.amd64.AMD64.rsi;
import static jdk.vm.ci.amd64.AMD64.rsp;
import static jdk.vm.ci.amd64.AMD64.xmm0;
import static jdk.vm.ci.amd64.AMD64.xmm1;
import static jdk.vm.ci.amd64.AMD64.xmm2;
import static jdk.vm.ci.amd64.AMD64.xmm3;
import static jdk.vm.ci.amd64.AMD64.xmm4;
import static jdk.vm.ci.amd64.AMD64.xmm5;
import static jdk.vm.ci.amd64.AMD64.xmm6;
import static jdk.vm.ci.amd64.AMD64.xmm7;
import static org.graalvm.compiler.lir.amd64.AMD64LIRHelper.pointerConstant;
import static org.graalvm.compiler.lir.amd64.AMD64LIRHelper.recordExternalAddress;
import org.graalvm.compiler.asm.Label;
import org.graalvm.compiler.asm.amd64.AMD64Address;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.ConditionFlag;
import org.graalvm.compiler.asm.amd64.AMD64MacroAssembler;
import org.graalvm.compiler.lir.LIRInstructionClass;
import org.graalvm.compiler.lir.SyncPort;
import org.graalvm.compiler.lir.asm.ArrayDataPointerConstant;
import org.graalvm.compiler.lir.asm.CompilationResultBuilder;
/**
*
* ALGORITHM DESCRIPTION - COS()
* ---------------------
*
* 1. RANGE REDUCTION
*
* We perform an initial range reduction from X to r with
*
* X =~= N * pi/32 + r
*
* so that |r| <= pi/64 + epsilon. We restrict inputs to those
* where |N| <= 932560. Beyond this, the range reduction is
* insufficiently accurate. For extremely small inputs,
* denormalization can occur internally, impacting performance.
* This means that the main path is actually only taken for
* 2^-252 <= |X| < 90112.
*
* To avoid branches, we perform the range reduction to full
* accuracy each time.
*
* X - N * (P_1 + P_2 + P_3)
*
* where P_1 and P_2 are 32-bit numbers (so multiplication by N
* is exact) and P_3 is a 53-bit number. Together, these
* approximate pi well enough for all cases in the restricted
* range.
*
* The main reduction sequence is:
*
* y = 32/pi * x
* N = integer(y)
* (computed by adding and subtracting off SHIFTER)
*
* m_1 = N * P_1
* m_2 = N * P_2
* r_1 = x - m_1
* r = r_1 - m_2
* (this r can be used for most of the calculation)
*
* c_1 = r_1 - r
* m_3 = N * P_3
* c_2 = c_1 - m_2
* c = c_2 - m_3
*
* 2. MAIN ALGORITHM
*
* The algorithm uses a table lookup based on B = M * pi / 32
* where M = N mod 64. The stored values are:
* sigma closest power of 2 to cos(B)
* C_hl 53-bit cos(B) - sigma
* S_hi + S_lo 2 * 53-bit sin(B)
*
* The computation is organized as follows:
*
* sin(B + r + c) = [sin(B) + sigma * r] +
* r * (cos(B) - sigma) +
* sin(B) * [cos(r + c) - 1] +
* cos(B) * [sin(r + c) - r]
*
* which is approximately:
*
* [S_hi + sigma * r] +
* C_hl * r +
* S_lo + S_hi * [(cos(r) - 1) - r * c] +
* (C_hl + sigma) * [(sin(r) - r) + c]
*
* and this is what is actually computed. We separate this sum
* into four parts:
*
* hi + med + pols + corr
*
* where
*
* hi = S_hi + sigma r
* med = C_hl * r
* pols = S_hi * (cos(r) - 1) + (C_hl + sigma) * (sin(r) - r)
* corr = S_lo + c * ((C_hl + sigma) - S_hi * r)
*
* 3. POLYNOMIAL
*
* The polynomial S_hi * (cos(r) - 1) + (C_hl + sigma) *
* (sin(r) - r) can be rearranged freely, since it is quite
* small, so we exploit parallelism to the fullest.
*
* psc4 = SC_4 * r_1
* msc4 = psc4 * r
* r2 = r * r
* msc2 = SC_2 * r2
* r4 = r2 * r2
* psc3 = SC_3 + msc4
* psc1 = SC_1 + msc2
* msc3 = r4 * psc3
* sincospols = psc1 + msc3
* pols = sincospols *
*
*
* 4. CORRECTION TERM
*
* This is where the "c" component of the range reduction is
* taken into account; recall that just "r" is used for most of
* the calculation.
*
* -c = m_3 - c_2
* -d = S_hi * r - (C_hl + sigma)
* corr = -c * -d + S_lo
*
* 5. COMPENSATED SUMMATIONS
*
* The two successive compensated summations add up the high
* and medium parts, leaving just the low parts to add up at
* the end.
*
* rs = sigma * r
* res_int = S_hi + rs
* k_0 = S_hi - res_int
* k_2 = k_0 + rs
* med = C_hl * r
* res_hi = res_int + med
* k_1 = res_int - res_hi
* k_3 = k_1 + med
*
* 6. FINAL SUMMATION
*
* We now add up all the small parts:
*
* res_lo = pols(hi) + pols(lo) + corr + k_1 + k_3
*
* Now the overall result is just:
*
* res_hi + res_lo
*
* 7. SMALL ARGUMENTS
*
* Inputs with |X| < 2^-252 are treated specially as
* 1 - |x|.
*
* Special cases:
* cos(NaN) = quiet NaN, and raise invalid exception
* cos(INF) = NaN and raise invalid exception
* cos(0) = 1
*
*/
// @formatter:off
@SyncPort(from = "https://github.com/openjdk/jdk/blob/12358e6c94bc96e618efc3ec5299a2cfe1b4669d/src/hotspot/cpu/x86/stubGenerator_x86_64_cos.cpp#L31-L623",
sha1 = "0de5f00f2586d20205fefb3f198dc10e23e0fd07")
@SyncPort(from = "https://github.com/openjdk/jdk/blob/12358e6c94bc96e618efc3ec5299a2cfe1b4669d/src/hotspot/cpu/x86/stubGenerator_x86_64_constants.cpp#L30-L235",
sha1 = "ce403a1b9833df093021585f62da6b58fd24cb4e")
// @formatter:on
public final class AMD64MathCosOp extends AMD64MathIntrinsicUnaryOp {
public static final LIRInstructionClass TYPE = LIRInstructionClass.create(AMD64MathCosOp.class);
public AMD64MathCosOp() {
super(TYPE, /* GPR */ rax, rcx, rdx, rbx, rsi, rdi, r8, r9, r10, r11,
/* XMM */ xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
}
private static ArrayDataPointerConstant onehalf = pointerConstant(16, new int[]{
// @formatter:off
0x00000000, 0x3fe00000, 0x00000000, 0x3fe00000
// @formatter:on
});
private static ArrayDataPointerConstant p2 = pointerConstant(16, new int[]{
// @formatter:off
0x1a600000, 0x3d90b461, 0x1a600000, 0x3d90b461
// @formatter:on
});
private static ArrayDataPointerConstant sc4 = pointerConstant(16, new int[]{
// @formatter:off
0xa556c734, 0x3ec71de3, 0x1a01a01a, 0x3efa01a0
// @formatter:on
});
private static ArrayDataPointerConstant ctable = pointerConstant(16, new int[]{
// @formatter:off
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x3ff00000, 0x176d6d31, 0xbf73b92e,
0xbc29b42c, 0x3fb917a6, 0xe0000000, 0xbc3e2718, 0x00000000,
0x3ff00000, 0x011469fb, 0xbf93ad06, 0x3c69a60b, 0x3fc8f8b8,
0xc0000000, 0xbc626d19, 0x00000000, 0x3ff00000, 0x939d225a,
0xbfa60bea, 0x2ed59f06, 0x3fd29406, 0xa0000000, 0xbc75d28d,
0x00000000, 0x3ff00000, 0x866b95cf, 0xbfb37ca1, 0xa6aea963,
0x3fd87de2, 0xe0000000, 0xbc672ced, 0x00000000, 0x3ff00000,
0x73fa1279, 0xbfbe3a68, 0x3806f63b, 0x3fde2b5d, 0x20000000,
0x3c5e0d89, 0x00000000, 0x3ff00000, 0x5bc57974, 0xbfc59267,
0x39ae68c8, 0x3fe1c73b, 0x20000000, 0x3c8b25dd, 0x00000000,
0x3ff00000, 0x53aba2fd, 0xbfcd0dfe, 0x25091dd6, 0x3fe44cf3,
0x20000000, 0x3c68076a, 0x00000000, 0x3ff00000, 0x99fcef32,
0x3fca8279, 0x667f3bcd, 0x3fe6a09e, 0x20000000, 0xbc8bdd34,
0x00000000, 0x3fe00000, 0x94247758, 0x3fc133cc, 0x6b151741,
0x3fe8bc80, 0x20000000, 0xbc82c5e1, 0x00000000, 0x3fe00000,
0x9ae68c87, 0x3fac73b3, 0x290ea1a3, 0x3fea9b66, 0xe0000000,
0x3c39f630, 0x00000000, 0x3fe00000, 0x7f909c4e, 0xbf9d4a2c,
0xf180bdb1, 0x3fec38b2, 0x80000000, 0xbc76e0b1, 0x00000000,
0x3fe00000, 0x65455a75, 0xbfbe0875, 0xcf328d46, 0x3fed906b,
0x20000000, 0x3c7457e6, 0x00000000, 0x3fe00000, 0x76acf82d,
0x3fa4a031, 0x56c62dda, 0x3fee9f41, 0xe0000000, 0x3c8760b1,
0x00000000, 0x3fd00000, 0x0e5967d5, 0xbfac1d1f, 0xcff75cb0,
0x3fef6297, 0x20000000, 0x3c756217, 0x00000000, 0x3fd00000,
0x0f592f50, 0xbf9ba165, 0xa3d12526, 0x3fefd88d, 0x40000000,
0xbc887df6, 0x00000000, 0x3fc00000, 0x00000000, 0x00000000,
0x00000000, 0x3ff00000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x0f592f50, 0x3f9ba165, 0xa3d12526, 0x3fefd88d,
0x40000000, 0xbc887df6, 0x00000000, 0xbfc00000, 0x0e5967d5,
0x3fac1d1f, 0xcff75cb0, 0x3fef6297, 0x20000000, 0x3c756217,
0x00000000, 0xbfd00000, 0x76acf82d, 0xbfa4a031, 0x56c62dda,
0x3fee9f41, 0xe0000000, 0x3c8760b1, 0x00000000, 0xbfd00000,
0x65455a75, 0x3fbe0875, 0xcf328d46, 0x3fed906b, 0x20000000,
0x3c7457e6, 0x00000000, 0xbfe00000, 0x7f909c4e, 0x3f9d4a2c,
0xf180bdb1, 0x3fec38b2, 0x80000000, 0xbc76e0b1, 0x00000000,
0xbfe00000, 0x9ae68c87, 0xbfac73b3, 0x290ea1a3, 0x3fea9b66,
0xe0000000, 0x3c39f630, 0x00000000, 0xbfe00000, 0x94247758,
0xbfc133cc, 0x6b151741, 0x3fe8bc80, 0x20000000, 0xbc82c5e1,
0x00000000, 0xbfe00000, 0x99fcef32, 0xbfca8279, 0x667f3bcd,
0x3fe6a09e, 0x20000000, 0xbc8bdd34, 0x00000000, 0xbfe00000,
0x53aba2fd, 0x3fcd0dfe, 0x25091dd6, 0x3fe44cf3, 0x20000000,
0x3c68076a, 0x00000000, 0xbff00000, 0x5bc57974, 0x3fc59267,
0x39ae68c8, 0x3fe1c73b, 0x20000000, 0x3c8b25dd, 0x00000000,
0xbff00000, 0x73fa1279, 0x3fbe3a68, 0x3806f63b, 0x3fde2b5d,
0x20000000, 0x3c5e0d89, 0x00000000, 0xbff00000, 0x866b95cf,
0x3fb37ca1, 0xa6aea963, 0x3fd87de2, 0xe0000000, 0xbc672ced,
0x00000000, 0xbff00000, 0x939d225a, 0x3fa60bea, 0x2ed59f06,
0x3fd29406, 0xa0000000, 0xbc75d28d, 0x00000000, 0xbff00000,
0x011469fb, 0x3f93ad06, 0x3c69a60b, 0x3fc8f8b8, 0xc0000000,
0xbc626d19, 0x00000000, 0xbff00000, 0x176d6d31, 0x3f73b92e,
0xbc29b42c, 0x3fb917a6, 0xe0000000, 0xbc3e2718, 0x00000000,
0xbff00000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0xbff00000, 0x176d6d31,
0x3f73b92e, 0xbc29b42c, 0xbfb917a6, 0xe0000000, 0x3c3e2718,
0x00000000, 0xbff00000, 0x011469fb, 0x3f93ad06, 0x3c69a60b,
0xbfc8f8b8, 0xc0000000, 0x3c626d19, 0x00000000, 0xbff00000,
0x939d225a, 0x3fa60bea, 0x2ed59f06, 0xbfd29406, 0xa0000000,
0x3c75d28d, 0x00000000, 0xbff00000, 0x866b95cf, 0x3fb37ca1,
0xa6aea963, 0xbfd87de2, 0xe0000000, 0x3c672ced, 0x00000000,
0xbff00000, 0x73fa1279, 0x3fbe3a68, 0x3806f63b, 0xbfde2b5d,
0x20000000, 0xbc5e0d89, 0x00000000, 0xbff00000, 0x5bc57974,
0x3fc59267, 0x39ae68c8, 0xbfe1c73b, 0x20000000, 0xbc8b25dd,
0x00000000, 0xbff00000, 0x53aba2fd, 0x3fcd0dfe, 0x25091dd6,
0xbfe44cf3, 0x20000000, 0xbc68076a, 0x00000000, 0xbff00000,
0x99fcef32, 0xbfca8279, 0x667f3bcd, 0xbfe6a09e, 0x20000000,
0x3c8bdd34, 0x00000000, 0xbfe00000, 0x94247758, 0xbfc133cc,
0x6b151741, 0xbfe8bc80, 0x20000000, 0x3c82c5e1, 0x00000000,
0xbfe00000, 0x9ae68c87, 0xbfac73b3, 0x290ea1a3, 0xbfea9b66,
0xe0000000, 0xbc39f630, 0x00000000, 0xbfe00000, 0x7f909c4e,
0x3f9d4a2c, 0xf180bdb1, 0xbfec38b2, 0x80000000, 0x3c76e0b1,
0x00000000, 0xbfe00000, 0x65455a75, 0x3fbe0875, 0xcf328d46,
0xbfed906b, 0x20000000, 0xbc7457e6, 0x00000000, 0xbfe00000,
0x76acf82d, 0xbfa4a031, 0x56c62dda, 0xbfee9f41, 0xe0000000,
0xbc8760b1, 0x00000000, 0xbfd00000, 0x0e5967d5, 0x3fac1d1f,
0xcff75cb0, 0xbfef6297, 0x20000000, 0xbc756217, 0x00000000,
0xbfd00000, 0x0f592f50, 0x3f9ba165, 0xa3d12526, 0xbfefd88d,
0x40000000, 0x3c887df6, 0x00000000, 0xbfc00000, 0x00000000,
0x00000000, 0x00000000, 0xbff00000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x0f592f50, 0xbf9ba165, 0xa3d12526,
0xbfefd88d, 0x40000000, 0x3c887df6, 0x00000000, 0x3fc00000,
0x0e5967d5, 0xbfac1d1f, 0xcff75cb0, 0xbfef6297, 0x20000000,
0xbc756217, 0x00000000, 0x3fd00000, 0x76acf82d, 0x3fa4a031,
0x56c62dda, 0xbfee9f41, 0xe0000000, 0xbc8760b1, 0x00000000,
0x3fd00000, 0x65455a75, 0xbfbe0875, 0xcf328d46, 0xbfed906b,
0x20000000, 0xbc7457e6, 0x00000000, 0x3fe00000, 0x7f909c4e,
0xbf9d4a2c, 0xf180bdb1, 0xbfec38b2, 0x80000000, 0x3c76e0b1,
0x00000000, 0x3fe00000, 0x9ae68c87, 0x3fac73b3, 0x290ea1a3,
0xbfea9b66, 0xe0000000, 0xbc39f630, 0x00000000, 0x3fe00000,
0x94247758, 0x3fc133cc, 0x6b151741, 0xbfe8bc80, 0x20000000,
0x3c82c5e1, 0x00000000, 0x3fe00000, 0x99fcef32, 0x3fca8279,
0x667f3bcd, 0xbfe6a09e, 0x20000000, 0x3c8bdd34, 0x00000000,
0x3fe00000, 0x53aba2fd, 0xbfcd0dfe, 0x25091dd6, 0xbfe44cf3,
0x20000000, 0xbc68076a, 0x00000000, 0x3ff00000, 0x5bc57974,
0xbfc59267, 0x39ae68c8, 0xbfe1c73b, 0x20000000, 0xbc8b25dd,
0x00000000, 0x3ff00000, 0x73fa1279, 0xbfbe3a68, 0x3806f63b,
0xbfde2b5d, 0x20000000, 0xbc5e0d89, 0x00000000, 0x3ff00000,
0x866b95cf, 0xbfb37ca1, 0xa6aea963, 0xbfd87de2, 0xe0000000,
0x3c672ced, 0x00000000, 0x3ff00000, 0x939d225a, 0xbfa60bea,
0x2ed59f06, 0xbfd29406, 0xa0000000, 0x3c75d28d, 0x00000000,
0x3ff00000, 0x011469fb, 0xbf93ad06, 0x3c69a60b, 0xbfc8f8b8,
0xc0000000, 0x3c626d19, 0x00000000, 0x3ff00000, 0x176d6d31,
0xbf73b92e, 0xbc29b42c, 0xbfb917a6, 0xe0000000, 0x3c3e2718,
0x00000000, 0x3ff00000
// @formatter:on
});
private static ArrayDataPointerConstant sc2 = pointerConstant(16, new int[]{
// @formatter:off
0x11111111, 0x3f811111, 0x55555555, 0x3fa55555
// @formatter:on
});
private static ArrayDataPointerConstant sc3 = pointerConstant(16, new int[]{
// @formatter:off
0x1a01a01a, 0xbf2a01a0, 0x16c16c17, 0xbf56c16c
// @formatter:on
});
private static ArrayDataPointerConstant sc1 = pointerConstant(16, new int[]{
// @formatter:off
0x55555555, 0xbfc55555, 0x00000000, 0xbfe00000
// @formatter:on
});
private static ArrayDataPointerConstant piInvTable = pointerConstant(16, new int[]{
// @formatter:off
0x00000000, 0x00000000, 0xa2f9836e, 0x4e441529, 0xfc2757d1,
0xf534ddc0, 0xdb629599, 0x3c439041, 0xfe5163ab, 0xdebbc561,
0xb7246e3a, 0x424dd2e0, 0x06492eea, 0x09d1921c, 0xfe1deb1c,
0xb129a73e, 0xe88235f5, 0x2ebb4484, 0xe99c7026, 0xb45f7e41,
0x3991d639, 0x835339f4, 0x9c845f8b, 0xbdf9283b, 0x1ff897ff,
0xde05980f, 0xef2f118b, 0x5a0a6d1f, 0x6d367ecf, 0x27cb09b7,
0x4f463f66, 0x9e5fea2d, 0x7527bac7, 0xebe5f17b, 0x3d0739f7,
0x8a5292ea, 0x6bfb5fb1, 0x1f8d5d08, 0x56033046, 0xfc7b6bab,
0xf0cfbc21
// @formatter:on
});
private static ArrayDataPointerConstant pi4 = pointerConstant(8, new int[]{
// @formatter:off
0x40000000, 0x3fe921fb, 0x18469899, 0x3e64442d
// @formatter:on
});
private static ArrayDataPointerConstant pi48 = pointerConstant(8, new int[]{
// @formatter:off
0x18469899, 0x3e64442d
// @formatter:on
});
private static ArrayDataPointerConstant pi32Inv = pointerConstant(8, new int[]{
// @formatter:off
0x6dc9c883, 0x40245f30
// @formatter:on
});
private static ArrayDataPointerConstant signMask = pointerConstant(8, new int[]{
// @formatter:off
0x00000000, 0x80000000
// @formatter:on
});
private static ArrayDataPointerConstant p3 = pointerConstant(8, new int[]{
// @formatter:off
0x2e037073, 0x3b63198a
// @formatter:on
});
private static ArrayDataPointerConstant p1 = pointerConstant(8, new int[]{
// @formatter:off
0x54400000, 0x3fb921fb
// @formatter:on
});
private static ArrayDataPointerConstant negZero = pointerConstant(8, new int[]{
// @formatter:off
0x00000000, 0x80000000
// @formatter:on
});
// The 64 bit code is at most SSE2 compliant
private static ArrayDataPointerConstant one = pointerConstant(8, new int[]{
// @formatter:off
0x00000000, 0x3ff00000
// @formatter:on
});
@Override
public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler masm) {
/*
* This code relies on recordExternalAddress providing the same address when called
* repeatedly. Especially for piInvTable.
*/
Label block0 = new Label();
Label block1 = new Label();
Label block2 = new Label();
Label block3 = new Label();
Label block4 = new Label();
Label block5 = new Label();
Label block6 = new Label();
Label block7 = new Label();
Label block8 = new Label();
Label block9 = new Label();
Label block10 = new Label();
Label block11 = new Label();
Label block12 = new Label();
Label block13 = new Label();
masm.push(rbx);
masm.subq(rsp, 16);
masm.movsd(new AMD64Address(rsp, 8), xmm0);
masm.movl(rax, new AMD64Address(rsp, 12));
masm.movq(xmm1, recordExternalAddress(crb, pi32Inv)); // 0x6dc9c883, 0x40245f30
masm.andl(rax, 2147418112);
masm.subl(rax, 808452096);
masm.cmplAndJcc(rax, 281346048, ConditionFlag.Above, block0, false);
masm.mulsd(xmm1, xmm0);
masm.movdqu(xmm5, recordExternalAddress(crb, onehalf)); // 0x00000000, 0x3fe00000,
// 0x00000000, 0x3fe00000
masm.movq(xmm4, recordExternalAddress(crb, signMask)); // 0x00000000, 0x80000000
masm.pand(xmm4, xmm0);
masm.por(xmm5, xmm4);
masm.addpd(xmm1, xmm5);
masm.cvttsd2sil(rdx, xmm1);
masm.cvtsi2sdl(xmm1, rdx);
masm.movdqu(xmm2, recordExternalAddress(crb, p2)); // 0x1a600000, 0x3d90b461,
// 0x1a600000, 0x3d90b461
masm.movq(xmm3, recordExternalAddress(crb, p1)); // 0x54400000, 0x3fb921fb
masm.mulsd(xmm3, xmm1);
masm.unpcklpd(xmm1, xmm1);
masm.addq(rdx, 1865232);
masm.movdqu(xmm4, xmm0);
masm.andq(rdx, 63);
masm.movdqu(xmm5, recordExternalAddress(crb, sc4)); // 0xa556c734, 0x3ec71de3,
// 0x1a01a01a, 0x3efa01a0
masm.leaq(rax, recordExternalAddress(crb, ctable));
masm.shlq(rdx, 5);
masm.addq(rax, rdx);
masm.mulpd(xmm2, xmm1);
masm.subsd(xmm0, xmm3);
masm.mulsd(xmm1, recordExternalAddress(crb, p3)); // 0x2e037073, 0x3b63198a
masm.subsd(xmm4, xmm3);
masm.movq(xmm7, new AMD64Address(rax, 8));
masm.unpcklpd(xmm0, xmm0);
masm.movdqu(xmm3, xmm4);
masm.subsd(xmm4, xmm2);
masm.mulpd(xmm5, xmm0);
masm.subpd(xmm0, xmm2);
masm.movdqu(xmm6, recordExternalAddress(crb, sc2)); // 0x11111111, 0x3f811111,
// 0x55555555, 0x3fa55555
masm.mulsd(xmm7, xmm4);
masm.subsd(xmm3, xmm4);
masm.mulpd(xmm5, xmm0);
masm.mulpd(xmm0, xmm0);
masm.subsd(xmm3, xmm2);
masm.movdqu(xmm2, new AMD64Address(rax, 0));
masm.subsd(xmm1, xmm3);
masm.movq(xmm3, new AMD64Address(rax, 24));
masm.addsd(xmm2, xmm3);
masm.subsd(xmm7, xmm2);
masm.mulsd(xmm2, xmm4);
masm.mulpd(xmm6, xmm0);
masm.mulsd(xmm3, xmm4);
masm.mulpd(xmm2, xmm0);
masm.mulpd(xmm0, xmm0);
masm.addpd(xmm5, recordExternalAddress(crb, sc3)); // 0x1a01a01a, 0xbf2a01a0,
// 0x16c16c17, 0xbf56c16c
masm.mulsd(xmm4, new AMD64Address(rax, 0));
masm.addpd(xmm6, recordExternalAddress(crb, sc1)); // 0x55555555, 0xbfc55555,
// 0x00000000, 0xbfe00000
masm.mulpd(xmm5, xmm0);
masm.movdqu(xmm0, xmm3);
masm.addsd(xmm3, new AMD64Address(rax, 8));
masm.mulpd(xmm1, xmm7);
masm.movdqu(xmm7, xmm4);
masm.addsd(xmm4, xmm3);
masm.addpd(xmm6, xmm5);
masm.movq(xmm5, new AMD64Address(rax, 8));
masm.subsd(xmm5, xmm3);
masm.subsd(xmm3, xmm4);
masm.addsd(xmm1, new AMD64Address(rax, 16));
masm.mulpd(xmm6, xmm2);
masm.addsd(xmm0, xmm5);
masm.addsd(xmm3, xmm7);
masm.addsd(xmm0, xmm1);
masm.addsd(xmm0, xmm3);
masm.addsd(xmm0, xmm6);
masm.unpckhpd(xmm6, xmm6);
masm.addsd(xmm0, xmm6);
masm.addsd(xmm0, xmm4);
masm.jmp(block13);
masm.bind(block0);
masm.jcc(ConditionFlag.Greater, block1);
masm.pextrw(rax, xmm0, 3);
masm.andl(rax, 32767);
masm.pinsrw(xmm0, rax, 3);
masm.movq(xmm1, recordExternalAddress(crb, one)); // 0x00000000, 0x3ff00000
masm.subsd(xmm1, xmm0);
masm.movdqu(xmm0, xmm1);
masm.jmp(block13);
masm.bind(block1);
masm.pextrw(rax, xmm0, 3);
masm.andl(rax, 32752);
masm.cmplAndJcc(rax, 32752, ConditionFlag.Equal, block2, false);
masm.pextrw(rcx, xmm0, 3);
masm.andl(rcx, 32752);
masm.subl(rcx, 16224);
masm.shrl(rcx, 7);
masm.andl(rcx, 65532);
masm.leaq(r11, recordExternalAddress(crb, piInvTable));
masm.addq(rcx, r11);
masm.movdq(rax, xmm0);
masm.movl(r10, new AMD64Address(rcx, 20));
masm.movl(r8, new AMD64Address(rcx, 24));
masm.movl(rdx, rax);
masm.shrq(rax, 21);
masm.orl(rax, Integer.MIN_VALUE);
masm.shrl(rax, 11);
masm.movl(r9, r10);
masm.imulq(r10, rdx);
masm.imulq(r9, rax);
masm.imulq(r8, rax);
masm.movl(rsi, new AMD64Address(rcx, 16));
masm.movl(rdi, new AMD64Address(rcx, 12));
masm.movl(r11, r10);
masm.shrq(r10, 32);
masm.addq(r9, r10);
masm.addq(r11, r8);
masm.movl(r8, r11);
masm.shrq(r11, 32);
masm.addq(r9, r11);
masm.movl(r10, rsi);
masm.imulq(rsi, rdx);
masm.imulq(r10, rax);
masm.movl(r11, rdi);
masm.imulq(rdi, rdx);
masm.movl(rbx, rsi);
masm.shrq(rsi, 32);
masm.addq(r9, rbx);
masm.movl(rbx, r9);
masm.shrq(r9, 32);
masm.addq(r10, rsi);
masm.addq(r10, r9);
masm.shlq(rbx, 32);
masm.orq(r8, rbx);
masm.imulq(r11, rax);
masm.movl(r9, new AMD64Address(rcx, 8));
masm.movl(rsi, new AMD64Address(rcx, 4));
masm.movl(rbx, rdi);
masm.shrq(rdi, 32);
masm.addq(r10, rbx);
masm.movl(rbx, r10);
masm.shrq(r10, 32);
masm.addq(r11, rdi);
masm.addq(r11, r10);
masm.movq(rdi, r9);
masm.imulq(r9, rdx);
masm.imulq(rdi, rax);
masm.movl(r10, r9);
masm.shrq(r9, 32);
masm.addq(r11, r10);
masm.movl(r10, r11);
masm.shrq(r11, 32);
masm.addq(rdi, r9);
masm.addq(rdi, r11);
masm.movq(r9, rsi);
masm.imulq(rsi, rdx);
masm.imulq(r9, rax);
masm.shlq(r10, 32);
masm.orq(r10, rbx);
masm.movl(rax, new AMD64Address(rcx, 0));
masm.movl(r11, rsi);
masm.shrq(rsi, 32);
masm.addq(rdi, r11);
masm.movl(r11, rdi);
masm.shrq(rdi, 32);
masm.addq(r9, rsi);
masm.addq(r9, rdi);
masm.imulq(rdx, rax);
masm.pextrw(rbx, xmm0, 3);
masm.leaq(rdi, recordExternalAddress(crb, piInvTable));
masm.subq(rcx, rdi);
masm.addl(rcx, rcx);
masm.addl(rcx, rcx);
masm.addl(rcx, rcx);
masm.addl(rcx, 19);
masm.movl(rsi, 32768);
masm.andl(rsi, rbx);
masm.shrl(rbx, 4);
masm.andl(rbx, 2047);
masm.subl(rbx, 1023);
masm.subl(rcx, rbx);
masm.addq(r9, rdx);
masm.movl(rdx, rcx);
masm.addl(rdx, 32);
masm.cmplAndJcc(rcx, 1, ConditionFlag.Less, block3, false);
masm.negl(rcx);
masm.addl(rcx, 29);
masm.shll(r9);
masm.movl(rdi, r9);
masm.andl(r9, 536870911);
masm.testlAndJcc(r9, 268435456, ConditionFlag.NotEqual, block4, false);
masm.shrl(r9);
masm.movl(rbx, 0);
masm.shlq(r9, 32);
masm.orq(r9, r11);
masm.bind(block5);
masm.bind(block6);
masm.cmpqAndJcc(r9, 0, ConditionFlag.Equal, block7, false);
masm.bind(block8);
masm.bsrq(r11, r9);
masm.movl(rcx, 29);
masm.sublAndJcc(rcx, r11, ConditionFlag.LessEqual, block9, false);
masm.shlq(r9);
masm.movq(rax, r10);
masm.shlq(r10);
masm.addl(rdx, rcx);
masm.negl(rcx);
masm.addl(rcx, 64);
masm.shrq(rax);
masm.shrq(r8);
masm.orq(r9, rax);
masm.orq(r10, r8);
masm.bind(block10);
masm.cvtsi2sdq(xmm0, r9);
masm.shrq(r10, 1);
masm.cvtsi2sdq(xmm3, r10);
masm.xorpd(xmm4, xmm4);
masm.shll(rdx, 4);
masm.negl(rdx);
masm.addl(rdx, 16368);
masm.orl(rdx, rsi);
masm.xorl(rdx, rbx);
masm.pinsrw(xmm4, rdx, 3);
masm.movq(xmm2, recordExternalAddress(crb, pi4)); // 0x40000000, 0x3fe921fb,
// 0x18469899, 0x3e64442d
masm.movq(xmm6, recordExternalAddress(crb, pi48)); // 0x3fe921fb, 0x18469899,
// 0x3e64442d
masm.xorpd(xmm5, xmm5);
masm.subl(rdx, 1008);
masm.pinsrw(xmm5, rdx, 3);
masm.mulsd(xmm0, xmm4);
masm.shll(rsi, 16);
masm.sarl(rsi, 31);
masm.mulsd(xmm3, xmm5);
masm.movdqu(xmm1, xmm0);
masm.mulsd(xmm0, xmm2);
masm.shrl(rdi, 29);
masm.addsd(xmm1, xmm3);
masm.mulsd(xmm3, xmm2);
masm.addl(rdi, rsi);
masm.xorl(rdi, rsi);
masm.mulsd(xmm6, xmm1);
masm.movl(rax, rdi);
masm.addsd(xmm6, xmm3);
masm.movdqu(xmm2, xmm0);
masm.addsd(xmm0, xmm6);
masm.subsd(xmm2, xmm0);
masm.addsd(xmm6, xmm2);
masm.bind(block11);
masm.movq(xmm1, recordExternalAddress(crb, pi32Inv)); // 0x6dc9c883, 0x40245f30
masm.mulsd(xmm1, xmm0);
masm.movq(xmm5, recordExternalAddress(crb, onehalf)); // 0x00000000, 0x3fe00000,
// 0x00000000, 0x3fe00000
masm.movq(xmm4, recordExternalAddress(crb, signMask)); // 0x00000000, 0x80000000
masm.pand(xmm4, xmm0);
masm.por(xmm5, xmm4);
masm.addpd(xmm1, xmm5);
masm.cvttsd2siq(rdx, xmm1);
masm.cvtsi2sdq(xmm1, rdx);
masm.movq(xmm3, recordExternalAddress(crb, p1)); // 0x54400000, 0x3fb921fb
masm.movdqu(xmm2, recordExternalAddress(crb, p2)); // 0x1a600000, 0x3d90b461,
// 0x1a600000, 0x3d90b461
masm.mulsd(xmm3, xmm1);
masm.unpcklpd(xmm1, xmm1);
masm.shll(rax, 3);
masm.addl(rdx, 1865232);
masm.movdqu(xmm4, xmm0);
masm.addl(rdx, rax);
masm.andl(rdx, 63);
masm.movdqu(xmm5, recordExternalAddress(crb, sc4)); // 0xa556c734, 0x3ec71de3,
// 0x1a01a01a, 0x3efa01a0
masm.leaq(rax, recordExternalAddress(crb, ctable));
masm.shll(rdx, 5);
masm.addq(rax, rdx);
masm.mulpd(xmm2, xmm1);
masm.subsd(xmm0, xmm3);
masm.mulsd(xmm1, recordExternalAddress(crb, p3)); // 0x2e037073, 0x3b63198a
masm.subsd(xmm4, xmm3);
masm.movq(xmm7, new AMD64Address(rax, 8));
masm.unpcklpd(xmm0, xmm0);
masm.movdqu(xmm3, xmm4);
masm.subsd(xmm4, xmm2);
masm.mulpd(xmm5, xmm0);
masm.subpd(xmm0, xmm2);
masm.mulsd(xmm7, xmm4);
masm.subsd(xmm3, xmm4);
masm.mulpd(xmm5, xmm0);
masm.mulpd(xmm0, xmm0);
masm.subsd(xmm3, xmm2);
masm.movdqu(xmm2, new AMD64Address(rax, 0));
masm.subsd(xmm1, xmm3);
masm.movq(xmm3, new AMD64Address(rax, 24));
masm.addsd(xmm2, xmm3);
masm.subsd(xmm7, xmm2);
masm.subsd(xmm1, xmm6);
masm.movdqu(xmm6, recordExternalAddress(crb, sc2)); // 0x11111111, 0x3f811111,
// 0x55555555, 0x3fa55555
masm.mulsd(xmm2, xmm4);
masm.mulpd(xmm6, xmm0);
masm.mulsd(xmm3, xmm4);
masm.mulpd(xmm2, xmm0);
masm.mulpd(xmm0, xmm0);
masm.addpd(xmm5, recordExternalAddress(crb, sc3)); // 0x1a01a01a, 0xbf2a01a0,
// 0x16c16c17, 0xbf56c16c
masm.mulsd(xmm4, new AMD64Address(rax, 0));
masm.addpd(xmm6, recordExternalAddress(crb, sc1)); // 0x55555555, 0xbfc55555,
// 0x00000000, 0xbfe00000
masm.mulpd(xmm5, xmm0);
masm.movdqu(xmm0, xmm3);
masm.addsd(xmm3, new AMD64Address(rax, 8));
masm.mulpd(xmm1, xmm7);
masm.movdqu(xmm7, xmm4);
masm.addsd(xmm4, xmm3);
masm.addpd(xmm6, xmm5);
masm.movq(xmm5, new AMD64Address(rax, 8));
masm.subsd(xmm5, xmm3);
masm.subsd(xmm3, xmm4);
masm.addsd(xmm1, new AMD64Address(rax, 16));
masm.mulpd(xmm6, xmm2);
masm.addsd(xmm5, xmm0);
masm.addsd(xmm3, xmm7);
masm.addsd(xmm1, xmm5);
masm.addsd(xmm1, xmm3);
masm.addsd(xmm1, xmm6);
masm.unpckhpd(xmm6, xmm6);
masm.movdqu(xmm0, xmm4);
masm.addsd(xmm1, xmm6);
masm.addsd(xmm0, xmm1);
masm.jmp(block13);
masm.bind(block7);
masm.addl(rdx, 64);
masm.movq(r9, r10);
masm.movq(r10, r8);
masm.movl(r8, 0);
masm.cmpqAndJcc(r9, 0, ConditionFlag.NotEqual, block8, false);
masm.addl(rdx, 64);
masm.movq(r9, r10);
masm.movq(r10, r8);
masm.cmpqAndJcc(r9, 0, ConditionFlag.NotEqual, block8, false);
masm.xorpd(xmm0, xmm0);
masm.xorpd(xmm6, xmm6);
masm.jmp(block11);
masm.bind(block9);
masm.jcc(ConditionFlag.Equal, block10);
masm.negl(rcx);
masm.shrq(r10);
masm.movq(rax, r9);
masm.shrq(r9);
masm.subl(rdx, rcx);
masm.negl(rcx);
masm.addl(rcx, 64);
masm.shlq(rax);
masm.orq(r10, rax);
masm.jmp(block10);
masm.bind(block3);
masm.negl(rcx);
masm.shlq(r9, 32);
masm.orq(r9, r11);
masm.shlq(r9);
masm.movq(rdi, r9);
masm.testlAndJcc(r9, Integer.MIN_VALUE, ConditionFlag.NotEqual, block12, false);
masm.shrl(r9);
masm.movl(rbx, 0);
masm.shrq(rdi, 3);
masm.jmp(block6);
masm.bind(block4);
masm.shrl(r9);
masm.movl(rbx, 536870912);
masm.shrl(rbx);
masm.shlq(r9, 32);
masm.orq(r9, r11);
masm.shlq(rbx, 32);
masm.addl(rdi, 536870912);
masm.movl(rcx, 0);
masm.movl(r11, 0);
masm.subq(rcx, r8);
masm.sbbq(r11, r10);
masm.sbbq(rbx, r9);
masm.movq(r8, rcx);
masm.movq(r10, r11);
masm.movq(r9, rbx);
masm.movl(rbx, 32768);
masm.jmp(block5);
masm.bind(block12);
masm.shrl(r9);
masm.movq(rbx, 0x100000000L);
masm.shrq(rbx);
masm.movl(rcx, 0);
masm.movl(r11, 0);
masm.subq(rcx, r8);
masm.sbbq(r11, r10);
masm.sbbq(rbx, r9);
masm.movq(r8, rcx);
masm.movq(r10, r11);
masm.movq(r9, rbx);
masm.movl(rbx, 32768);
masm.shrq(rdi, 3);
masm.addl(rdi, 536870912);
masm.jmp(block6);
masm.bind(block2);
masm.movsd(xmm0, new AMD64Address(rsp, 8));
masm.mulsd(xmm0, recordExternalAddress(crb, negZero)); // 0x00000000, 0x80000000
masm.movq(new AMD64Address(rsp, 0), xmm0);
masm.bind(block13);
masm.addq(rsp, 16);
masm.pop(rbx);
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy