// jdk.graal.compiler.lir.amd64.AMD64BigIntegerMultiplyToLenOp Maven / Gradle / Ivy
//
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of compiler Show documentation
// The GraalVM compiler and the Graal-truffle optimizer.
// There is a newer version: 24.1.1
/*
 * Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
package jdk.graal.compiler.lir.amd64;

import static jdk.vm.ci.amd64.AMD64.r10;
import static jdk.vm.ci.amd64.AMD64.r11;
import static jdk.vm.ci.amd64.AMD64.r12;
import static jdk.vm.ci.amd64.AMD64.r13;
import static jdk.vm.ci.amd64.AMD64.r14;
import static jdk.vm.ci.amd64.AMD64.r8;
import static jdk.vm.ci.amd64.AMD64.r9;
import static jdk.vm.ci.amd64.AMD64.rax;
import static jdk.vm.ci.amd64.AMD64.rbx;
import static jdk.vm.ci.amd64.AMD64.rcx;
import static jdk.vm.ci.amd64.AMD64.rdi;
import static jdk.vm.ci.amd64.AMD64.rdx;
import static jdk.vm.ci.amd64.AMD64.rsi;
import static jdk.vm.ci.amd64.AMD64.CPUFeature.ADX;
import static jdk.vm.ci.amd64.AMD64.CPUFeature.AVX;
import static jdk.vm.ci.amd64.AMD64.CPUFeature.BMI2;
import static jdk.vm.ci.code.ValueUtil.asRegister;

import jdk.graal.compiler.asm.Label;
import jdk.graal.compiler.asm.amd64.AMD64Address;
import jdk.graal.compiler.asm.amd64.AMD64Assembler.ConditionFlag;
import jdk.graal.compiler.asm.amd64.AMD64MacroAssembler;
import jdk.graal.compiler.core.common.Stride;
import jdk.graal.compiler.debug.GraalError;
import jdk.graal.compiler.lir.LIRInstructionClass;
import jdk.graal.compiler.lir.SyncPort;
import jdk.graal.compiler.lir.asm.CompilationResultBuilder;
import jdk.vm.ci.amd64.AMD64Kind;
import jdk.vm.ci.code.Register;
import jdk.vm.ci.meta.Value;

// @formatter:off
@SyncPort(from = "https://github.com/openjdk/jdk/blob/12a61bce8db5e6b152eb101de1662847bebb7997/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp#L3037-L3092",
          sha1 = "2bf2eb0a9feca080f99e6932d3750cdf3ce2ef3a")
@SyncPort(from = "https://github.com/openjdk/jdk/blob/fbe4cc96e223882a18c7ff666fe6f68b3fa2cfe4/src/hotspot/cpu/x86/macroAssembler_x86.cpp#L6692-L7149",
          sha1 = "0763af542cf9f40a1c542e4834a67fc4b2c74e1c")
// @formatter:on
public final class AMD64BigIntegerMultiplyToLenOp extends AMD64LIRInstruction {

    /** Descriptor of this instruction class; handed to the superclass constructor. */
    public static final LIRInstructionClass<AMD64BigIntegerMultiplyToLenOp> TYPE = LIRInstructionClass.create(AMD64BigIntegerMultiplyToLenOp.class);

    // Inputs: the x, y and z array pointers and their lengths. The constructor requires each of
    // them to be in one specific fixed register.
    @Use({OperandFlag.REG}) private Value xValue;
    @Use({OperandFlag.REG}) private Value xlenValue;
    @Use({OperandFlag.REG}) private Value yValue;
    @Use({OperandFlag.REG}) private Value ylenValue;
    @Use({OperandFlag.REG}) private Value zValue;
    @Use({OperandFlag.REG}) private Value zlenValue;

    // Scratch registers. tmp1Value is chosen in the constructor so that it never collides with
    // the heap base register; tmpValues also lists the input registers, which lets the emitted
    // code overwrite them.
    @Temp({OperandFlag.REG}) private Value tmp1Value;
    @Temp({OperandFlag.REG}) private Value[] tmpValues;

    public AMD64BigIntegerMultiplyToLenOp(
                    Value xValue,
                    Value xlenValue,
                    Value yValue,
                    Value ylenValue,
                    Value zValue,
                    Value zlenValue,
                    Register heapBaseRegister) {
        super(TYPE);

        // Due to lack of allocatable registers, we use fixed registers and mark them as @Use+@Temp.
        // This allows the fixed registers to be reused for hosting temporary values.
        GraalError.guarantee(asRegister(xValue).equals(rdi), "expect xValue at rdi, but was %s", xValue);
        GraalError.guarantee(asRegister(xlenValue).equals(rax), "expect xlenValue at rax, but was %s", xlenValue);
        GraalError.guarantee(asRegister(yValue).equals(rsi), "expect yValue at rsi, but was %s", yValue);
        GraalError.guarantee(asRegister(ylenValue).equals(rcx), "expect ylenValue at rcx, but was %s", ylenValue);
        GraalError.guarantee(asRegister(zValue).equals(r8), "expect zValue at r8, but was %s", zValue);
        GraalError.guarantee(asRegister(zlenValue).equals(r9), "expect zlenValue at r9, but was %s", zlenValue);

        this.xValue = xValue;
        this.xlenValue = xlenValue;
        this.yValue = yValue;
        this.ylenValue = ylenValue;
        this.zValue = zValue;
        this.zlenValue = zlenValue;

        this.tmp1Value = r12.equals(heapBaseRegister) ? r14.asValue() : r12.asValue();
        this.tmpValues = new Value[]{
                        rax.asValue(),
                        rcx.asValue(),
                        rdx.asValue(),
                        rbx.asValue(),
                        rsi.asValue(),
                        rdi.asValue(),
                        r8.asValue(),
                        r9.asValue(),
                        r10.asValue(),
                        r11.asValue(),
                        r13.asValue(),
        };
    }

    /**
     * Checks the operand kinds (pointers are QWORD, lengths are DWORD) and then emits the
     * multiplication routine using the fixed scratch registers r13, r11, r10 and rbx in addition
     * to the scratch register selected in the constructor.
     */
    @Override
    public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler masm) {
        GraalError.guarantee(xValue.getPlatformKind().equals(AMD64Kind.QWORD), "Invalid xValue kind: %s", xValue);
        GraalError.guarantee(xlenValue.getPlatformKind().equals(AMD64Kind.DWORD), "Invalid xlenValue kind: %s", xlenValue);
        GraalError.guarantee(yValue.getPlatformKind().equals(AMD64Kind.QWORD), "Invalid yValue kind: %s", yValue);
        GraalError.guarantee(ylenValue.getPlatformKind().equals(AMD64Kind.DWORD), "Invalid ylenValue kind: %s", ylenValue);
        GraalError.guarantee(zValue.getPlatformKind().equals(AMD64Kind.QWORD), "Invalid zValue kind: %s", zValue);
        GraalError.guarantee(zlenValue.getPlatformKind().equals(AMD64Kind.DWORD), "Invalid zlenValue kind: %s", zlenValue);

        multiplyToLen(masm,
                        asRegister(xValue),
                        asRegister(xlenValue),
                        asRegister(yValue),
                        asRegister(ylenValue),
                        asRegister(zValue),
                        asRegister(zlenValue),
                        asRegister(tmp1Value), // tmp1
                        r13,                   // tmp2
                        r11,                   // tmp3
                        r10,                   // tmp4
                        rbx);                  // tmp5
    }

    /**
     * Emits code that adds {@code src1} and then {@code src2} to {@code destLo}, propagating the
     * carry of each addition into {@code destHi}.
     */
    private static void add2WithCarry(AMD64MacroAssembler masm,
                    Register destHi,
                    Register destLo,
                    Register src1,
                    Register src2) {
        for (Register addend : new Register[]{src1, src2}) {
            masm.addq(destLo, addend);
            masm.adcq(destHi, 0); // destHi += carry out of the low word
        }
    }

    /**
     * Emits the first loop: multiplies {@code y} by one 64-bit chunk of {@code x} and stores the
     * product words into {@code z}, leaving the final carry in {@code carry}.
     * <p>
     * {@code idx} counts the remaining 32-bit words of {@code y}, {@code kdx} is the index of the
     * next {@code z} word pair to write. When only a single 32-bit word of {@code x} or
     * {@code y} is left, it is loaded from index 0 with a 32-bit move instead.
     */
    private static void multiply64x64Loop(AMD64MacroAssembler masm,
                    Register x,
                    Register xstart,
                    Register xAtXstart,
                    Register y,
                    Register yAtIdx,
                    Register z,
                    Register carry,
                    Register product,
                    Register idx,
                    Register kdx) {
        // @formatter:off
        //  Pseudo code, on 64-bit words:
        //
        //  jlong carry, x[], y[], z[];
        //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
        //    huge_128 product = y[idx] * x[xstart] + carry;
        //    z[kdx] = (jlong)product;
        //    carry  = (jlong)(product >>> 64);
        //  }
        //  z[xstart] = carry;
        // @formatter:on

        Label loopHead = new Label();
        Label loopExit = new Label();
        Label singleWordX = new Label();
        Label singleWordY = new Label();
        Label doMultiply = new Label();

        // The two 32-bit halves of the z slot written per iteration: the word at offset 0
        // receives the upper half of the 64-bit result, the word at offset 4 the lower half.
        AMD64Address zUpperHalf = new AMD64Address(z, kdx, Stride.S4, 0);
        AMD64Address zLowerHalf = new AMD64Address(z, kdx, Stride.S4, 4);

        // Fewer than two words of x left: take the single-word path.
        masm.declAndJcc(xstart, ConditionFlag.Negative, singleWordX, false);

        masm.movq(xAtXstart, new AMD64Address(x, xstart, Stride.S4, 0));
        masm.rorq(xAtXstart, 32); // swap the 32-bit halves (big-endian -> little-endian)

        masm.bind(loopHead);
        // No words of y left: finished.
        masm.declAndJcc(idx, ConditionFlag.Negative, loopExit, false);
        // Exactly one word of y left: load it on its own.
        masm.declAndJcc(idx, ConditionFlag.Negative, singleWordY, false);
        masm.movq(yAtIdx, new AMD64Address(y, idx, Stride.S4, 0));
        masm.rorq(yAtIdx, 32); // swap the 32-bit halves (big-endian -> little-endian)

        masm.bind(doMultiply);
        masm.movq(product, xAtXstart);
        masm.mulq(yAtIdx); // product(rax) * yAtIdx -> rdx:rax
        masm.addq(product, carry);
        masm.adcq(rdx, 0);
        masm.subl(kdx, 2);
        masm.movl(zLowerHalf, product);
        masm.shrq(product, 32);
        masm.movl(zUpperHalf, product);
        masm.movq(carry, rdx);
        masm.jmp(loopHead);

        masm.bind(singleWordY);
        masm.movl(yAtIdx, new AMD64Address(y));
        masm.jmp(doMultiply);

        masm.bind(singleWordX);
        masm.movl(xAtXstart, new AMD64Address(x));
        masm.jmp(loopHead);

        masm.bind(loopExit);
    }

    /**
     * Emits one multiply-accumulate step on the pair of 32-bit words found at byte
     * {@code offset} from index {@code idx}: the 64-bit value of {@code y} is multiplied by
     * {@code xAtXstart}, the corresponding 64-bit value of {@code z} and {@code carry} are added,
     * and the low 64 bits of the sum are written back to {@code z}. The high 64 bits are left in
     * rdx for the caller.
     */
    private static void multiplyAdd128x128(AMD64MacroAssembler masm,
                    Register xAtXstart,
                    Register y,
                    Register z,
                    Register yzAtIdx,
                    Register idx,
                    Register carry,
                    Register product,
                    int offset) {
        // huge_128 product = (y[idx] * xAtXstart) + z[kdx] + carry;
        // z[kdx] = (jlong)product;

        AMD64Address yPair = new AMD64Address(y, idx, Stride.S4, offset);
        // zPair addresses the 64-bit pair and, for 32-bit stores, its first (upper) word.
        AMD64Address zPair = new AMD64Address(z, idx, Stride.S4, offset);
        AMD64Address zPairSecondWord = new AMD64Address(z, idx, Stride.S4, offset + 4);

        // The scratch register first holds the y operand ...
        masm.movq(yzAtIdx, yPair);
        masm.rorq(yzAtIdx, 32); // swap the 32-bit halves (big-endian -> little-endian)
        masm.movq(product, xAtXstart);
        masm.mulq(yzAtIdx);     // product(rax) * yzAtIdx -> rdx:product(rax)

        // ... and is then reused for the current content of z.
        masm.movq(yzAtIdx, zPair);
        masm.rorq(yzAtIdx, 32); // swap the 32-bit halves (big-endian -> little-endian)

        add2WithCarry(masm, rdx, product, carry, yzAtIdx);

        // Write the low 64 bits back as two 32-bit words, lower half into the second word.
        masm.movl(zPairSecondWord, product);
        masm.shrq(product, 32);
        masm.movl(zPair, product);
    }

    /**
     * Multiply 128 bit by 128 bit. Unrolled inner loop.
     * <p>
     * Emits the inner ("third") loop for the non-BMI2 case. Each iteration of the unrolled loop
     * handles four 32-bit words of {@code y} and {@code z} through two calls to
     * {@link #multiplyAdd128x128}; the up to three words that remain afterwards are handled by
     * the tail code that follows the loop.
     *
     * @param xAtXstart register holding the multiplier taken from {@code x}
     * @param y base register used to address {@code y}
     * @param z base register used to address {@code z}
     * @param yzAtIdx scratch register that receives words of {@code y} and {@code z}
     * @param idx word index/count; decremented by four per unrolled iteration and clobbered by
     *            the tail code
     * @param jdx scratch register used as the iteration counter of the unrolled loop
     * @param carry carry passed in and, on exit, the carry of the last step
     * @param product multiplication operand/result register (rax according to the comments on
     *            the {@code mulq} instructions)
     * @param carry2 scratch register carrying the intermediate carry between the two steps of an
     *            unrolled iteration
     */
    private static void multiply128x128Loop(AMD64MacroAssembler masm,
                    Register xAtXstart,
                    Register y,
                    Register z,
                    Register yzAtIdx,
                    Register idx,
                    Register jdx,
                    Register carry,
                    Register product,
                    Register carry2) {
        // @formatter:off
        //   jlong carry, x[], y[], z[];
        //   int kdx = ystart+1;
        //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
        //     huge_128 product = (y[idx+1] * xAtXstart) + z[kdx+idx+1] + carry;
        //     z[kdx+idx+1] = (jlong)product;
        //     jlong carry2  = (jlong)(product >>> 64);
        //     product = (y[idx] * xAtXstart) + z[kdx+idx] + carry2;
        //     z[kdx+idx] = (jlong)product;
        //     carry  = (jlong)(product >>> 64);
        //   }
        //   idx += 2;
        //   if (idx > 0) {
        //     product = (y[idx] * xAtXstart) + z[kdx+idx] + carry;
        //     z[kdx+idx] = (jlong)product;
        //     carry  = (jlong)(product >>> 64);
        //   }
        // @formatter:on

        Label labelThirdLoop = new Label();
        Label labelThirdLoopExit = new Label();
        Label labelPostThirdLoopDone = new Label();
        Label labelCheck1 = new Label();

        // jdx = idx / 4: number of unrolled iterations, each consuming four 32-bit words.
        masm.movl(jdx, idx);
        masm.andl(jdx, 0xFFFFFFFC);
        masm.shrl(jdx, 2);

        masm.bind(labelThirdLoop);
        masm.sublAndJcc(jdx, 1, ConditionFlag.Negative, labelThirdLoopExit, false);
        masm.subl(idx, 4);

        // Second word pair of the group (byte offset 8) first; its high part feeds the next step.
        multiplyAdd128x128(masm, xAtXstart, y, z, yzAtIdx, idx, carry, product, 8);
        masm.movq(carry2, rdx);

        // Then the first word pair of the group (byte offset 0).
        multiplyAdd128x128(masm, xAtXstart, y, z, yzAtIdx, idx, carry2, product, 0);
        masm.movq(carry, rdx);
        masm.jmp(labelThirdLoop);

        masm.bind(labelThirdLoopExit);

        // idx &= 3: words left over after the unrolled loop; nothing to do if there are none.
        masm.andlAndJcc(idx, 0x3, ConditionFlag.Zero, labelPostThirdLoopDone, false);

        // Fewer than two words left: skip the word-pair step.
        masm.sublAndJcc(idx, 2, ConditionFlag.Negative, labelCheck1, false);

        multiplyAdd128x128(masm, xAtXstart, y, z, yzAtIdx, idx, carry, product, 0);
        masm.movq(carry, rdx);

        masm.bind(labelCheck1);
        // Undo the subtraction above and keep only the lowest bit: idx becomes 0 if a single
        // word is still pending, and negative (branch taken) otherwise.
        masm.addl(idx, 0x2);
        masm.andl(idx, 0x1);
        masm.sublAndJcc(idx, 1, ConditionFlag.Negative, labelPostThirdLoopDone, false);

        // Last single 32-bit word: 32-bit loads, so no half swap is needed.
        masm.movl(yzAtIdx, new AMD64Address(y, idx, Stride.S4, 0));
        masm.movq(product, xAtXstart);
        masm.mulq(yzAtIdx); // product(rax) * yzAtIdx -> rdx:product(rax)
        masm.movl(yzAtIdx, new AMD64Address(z, idx, Stride.S4, 0));

        add2WithCarry(masm, rdx, product, yzAtIdx, carry);

        // Only the low 32 bits are stored; carry = (rdx:product) >>> 32, truncated to 64 bits.
        masm.movl(new AMD64Address(z, idx, Stride.S4, 0), product);
        masm.shrq(product, 32);

        masm.shlq(rdx, 32);
        masm.orq(product, rdx);
        masm.movq(carry, product);

        masm.bind(labelPostThirdLoopDone);
    }

    /**
     * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
     * <p>
     * BMI2 variant of the inner ("third") loop. The multiplier is not passed as a parameter: per
     * the comments on the {@code mulxq} instructions it is taken from rdx, which the caller loads
     * beforehand. Each unrolled iteration handles four 32-bit words of {@code y} and {@code z};
     * the up to three words that remain afterwards are handled by the tail code that follows the
     * loop. When ADX is supported, the additions of the unrolled iteration use
     * {@code adcxq}/{@code adoxq} instead of {@link #add2WithCarry}.
     *
     * @param y base register used to address {@code y}
     * @param z base register used to address {@code z}
     * @param carry carry passed in and, on exit, the carry of the last step
     * @param carry2 scratch register receiving the high part of the second product
     * @param idx word index/count; decremented by four per unrolled iteration and clobbered by
     *            the tail code
     * @param jdx scratch register used as the iteration counter of the unrolled loop
     * @param yzAtIdx1 scratch register for words of {@code y} and {@code z}
     * @param yzAtIdx2 scratch register for words of {@code y} and {@code z}
     * @param tmp scratch register receiving the low part of the second product
     * @param tmp3 scratch register receiving the low part of the first product
     * @param tmp4 scratch register receiving the high part of the first product
     */
    private static void multiply128x128BMI2Loop(AMD64MacroAssembler masm,
                    Register y,
                    Register z,
                    Register carry,
                    Register carry2,
                    Register idx,
                    Register jdx,
                    Register yzAtIdx1,
                    Register yzAtIdx2,
                    Register tmp,
                    Register tmp3,
                    Register tmp4) {
        // Both BMI2 and AVX must be available (note that the message only names BMI2).
        GraalError.guarantee(masm.supports(BMI2) && masm.supports(AVX), "should be used only when BMI2 is available");

        // @formatter:off
        //   jlong carry, x[], y[], z[];
        //   int kdx = ystart+1;
        //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
        //     huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
        //     jlong carry2  = (jlong)(tmp3 >>> 64);
        //     huge_128 tmp4 = (y[idx]   * rdx) + z[kdx+idx] + carry2;
        //     carry  = (jlong)(tmp4 >>> 64);
        //     z[kdx+idx+1] = (jlong)tmp3;
        //     z[kdx+idx] = (jlong)tmp4;
        //   }
        //   idx += 2;
        //   if (idx > 0) {
        //     yzAtIdx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
        //     z[kdx+idx] = (jlong)yzAtIdx1;
        //     carry  = (jlong)(yzAtIdx1 >>> 64);
        //   }
        // @formatter:on

        Label labelThirdLoop = new Label();
        Label labelThirdLoopExit = new Label();
        Label labelPostThirdLoopDone = new Label();
        Label labelCheck1 = new Label();

        // jdx = idx / 4: number of unrolled iterations, each consuming four 32-bit words.
        masm.movl(jdx, idx);
        masm.andl(jdx, 0xFFFFFFFC);
        masm.shrl(jdx, 2);

        masm.bind(labelThirdLoop);
        masm.sublAndJcc(jdx, 1, ConditionFlag.Negative, labelThirdLoopExit, false);
        masm.subl(idx, 4);

        // Load both word pairs of y for this group: byte offset 8 into yzAtIdx1, offset 0 into
        // yzAtIdx2.
        masm.movq(yzAtIdx1, new AMD64Address(y, idx, Stride.S4, 8));
        masm.rorxq(yzAtIdx1, yzAtIdx1, 32); // convert big-endian to little-endian
        masm.movq(yzAtIdx2, new AMD64Address(y, idx, Stride.S4, 0));
        masm.rorxq(yzAtIdx2, yzAtIdx2, 32);

        masm.mulxq(tmp4, tmp3, yzAtIdx1);  // yzAtIdx1 * rdx -> tmp4:tmp3
        masm.mulxq(carry2, tmp, yzAtIdx2); // yzAtIdx2 * rdx -> carry2:tmp

        // The y values are consumed; reuse the same registers for the matching z word pairs.
        masm.movq(yzAtIdx1, new AMD64Address(z, idx, Stride.S4, 8));
        masm.rorxq(yzAtIdx1, yzAtIdx1, 32);
        masm.movq(yzAtIdx2, new AMD64Address(z, idx, Stride.S4, 0));
        masm.rorxq(yzAtIdx2, yzAtIdx2, 32);

        // Accumulate: tmp3 += carry + z pair at offset 8; tmp4 += tmp + z pair at offset 0;
        // carries of the second sum end up in carry2.
        if (masm.supports(ADX)) {
            masm.adcxq(tmp3, carry);
            masm.adoxq(tmp3, yzAtIdx1);

            masm.adcxq(tmp4, tmp);
            masm.adoxq(tmp4, yzAtIdx2);

            masm.movl(carry, 0); // does not affect flags
            masm.adcxq(carry2, carry);
            masm.adoxq(carry2, carry);
        } else {
            add2WithCarry(masm, tmp4, tmp3, carry, yzAtIdx1);
            add2WithCarry(masm, carry2, tmp4, tmp, yzAtIdx2);
        }
        masm.movq(carry, carry2);

        // Store tmp3 to the word pair at byte offset 8 (lower half at offset 12) ...
        masm.movl(new AMD64Address(z, idx, Stride.S4, 12), tmp3);
        masm.shrq(tmp3, 32);
        masm.movl(new AMD64Address(z, idx, Stride.S4, 8), tmp3);

        // ... and tmp4 to the word pair at byte offset 0 (lower half at offset 4).
        masm.movl(new AMD64Address(z, idx, Stride.S4, 4), tmp4);
        masm.shrq(tmp4, 32);
        masm.movl(new AMD64Address(z, idx, Stride.S4, 0), tmp4);

        masm.jmp(labelThirdLoop);

        masm.bind(labelThirdLoopExit);

        // idx &= 3: words left over after the unrolled loop; nothing to do if there are none.
        masm.andlAndJcc(idx, 0x3, ConditionFlag.Zero, labelPostThirdLoopDone, false);

        // Fewer than two words left: skip the word-pair step.
        masm.sublAndJcc(idx, 2, ConditionFlag.Negative, labelCheck1, false);

        // One remaining word pair.
        masm.movq(yzAtIdx1, new AMD64Address(y, idx, Stride.S4, 0));
        masm.rorxq(yzAtIdx1, yzAtIdx1, 32);
        masm.mulxq(tmp4, tmp3, yzAtIdx1); // yzAtIdx1 * rdx -> tmp4:tmp3
        masm.movq(yzAtIdx2, new AMD64Address(z, idx, Stride.S4, 0));
        masm.rorxq(yzAtIdx2, yzAtIdx2, 32);

        add2WithCarry(masm, tmp4, tmp3, carry, yzAtIdx2);

        masm.movl(new AMD64Address(z, idx, Stride.S4, 4), tmp3);
        masm.shrq(tmp3, 32);
        masm.movl(new AMD64Address(z, idx, Stride.S4, 0), tmp3);
        masm.movq(carry, tmp4);

        masm.bind(labelCheck1);
        // Undo the subtraction above and keep only the lowest bit: idx becomes 0 if a single
        // word is still pending, and negative (branch taken) otherwise.
        masm.addl(idx, 0x2);
        masm.andl(idx, 0x1);
        masm.sublAndJcc(idx, 1, ConditionFlag.Negative, labelPostThirdLoopDone, false);
        // Last single 32-bit word: 32-bit loads, so no half swap is needed.
        masm.movl(tmp4, new AMD64Address(y, idx, Stride.S4, 0));
        masm.mulxq(carry2, tmp3, tmp4);  // tmp4 * rdx -> carry2:tmp3
        masm.movl(tmp4, new AMD64Address(z, idx, Stride.S4, 0));

        add2WithCarry(masm, carry2, tmp3, tmp4, carry);

        // Only the low 32 bits are stored; carry = (carry2:tmp3) >>> 32, truncated to 64 bits.
        masm.movl(new AMD64Address(z, idx, Stride.S4, 0), tmp3);
        masm.shrq(tmp3, 32);

        masm.shlq(carry2, 32);
        masm.orq(tmp3, carry2);
        masm.movq(carry, tmp3);

        masm.bind(labelPostThirdLoopDone);
    }

    /**
     * Emits the complete multiplication routine: the first loop (see
     * {@link #multiply64x64Loop}), the store of its final carry, and the nested second/third
     * loops that multiply-accumulate the remaining words of {@code x} into {@code z}.
     * <p>
     * Registers are heavily reused: {@code xlen} doubles as {@code product} and {@code zlen} as
     * {@code xAtXstart}, so their incoming values are not preserved. Inside the second loop
     * {@code z}, {@code x}, {@code xstart} and {@code ylen} are saved on the stack because the
     * inner loop receives {@code x} and {@code ylen} as scratch registers and works on an
     * adjusted {@code z}. The BMI2 code path is selected when both BMI2 and AVX are supported and
     * keeps the current multiplier in rdx.
     */
    private static void multiplyToLen(AMD64MacroAssembler masm,
                    Register x,
                    Register xlen,
                    Register y,
                    Register ylen,
                    Register z,
                    Register zlen,
                    Register tmp1,
                    Register tmp2,
                    Register tmp3,
                    Register tmp4,
                    Register tmp5) {
        // Register aliases used by the first loop.
        Register idx = tmp1;
        Register kdx = tmp2;
        Register xstart = tmp3;
        Register yAtIdx = tmp4;
        Register carry = tmp5;

        // Reuse of argument registers: their original contents are overwritten.
        Register product = xlen;
        Register xAtXstart = zlen;

        Label labelDone = new Label();
        Label labelSecondLoop = new Label();
        Label labelCarry = new Label();
        Label labelLastX = new Label();
        Label labelThirdLoopPrologue = new Label();

        boolean useBMI2Instructions = masm.supports(BMI2) && masm.supports(AVX);

        // @formatter:off
        // First Loop.
        //
        //  final static long LONG_MASK = 0xffffffffL;
        //  int xstart = xlen - 1;
        //  int ystart = ylen - 1;
        //  long carry = 0;
        //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
        //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
        //    z[kdx] = (int)product;
        //    carry = product >>> 32;
        //  }
        //  z[xstart] = (int)carry;
        // @formatter:on

        masm.movl(idx, ylen);      // idx = ylen;
        masm.movl(kdx, zlen);      // kdx = xlen+ylen;
        masm.xorq(carry, carry);   // carry = 0;

        // xstart = xlen - 1; nothing to do if that is negative.
        masm.movl(xstart, xlen);
        masm.declAndJcc(xstart, ConditionFlag.Negative, labelDone, false);

        multiply64x64Loop(masm, x, xstart, xAtXstart, y, yAtIdx, z, carry, product, idx, kdx);

        // Store the carry of the first loop below the words written so far: nothing if kdx is
        // already 0, only its low 32 bits if a single slot is left, both halves otherwise.
        masm.testlAndJcc(kdx, kdx, ConditionFlag.Zero, labelSecondLoop, false);

        masm.sublAndJcc(kdx, 1, ConditionFlag.Zero, labelCarry, false);

        masm.movl(new AMD64Address(z, kdx, Stride.S4, 0), carry);
        masm.shrq(carry, 32);
        masm.subl(kdx, 1);

        masm.bind(labelCarry);
        masm.movl(new AMD64Address(z, kdx, Stride.S4, 0), carry);

        // @formatter:off
        // Second and third (nested) loops.
        //
        // for (int i = xstart-1; i >= 0; i--) { // Second loop
        //   carry = 0;
        //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
        //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
        //                    (z[k] & LONG_MASK) + carry;
        //     z[k] = (int)product;
        //     carry = product >>> 32;
        //   }
        //   z[i] = (int)carry;
        // }
        //
        // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
        // @formatter:on

        // Same register as idx; the first loop no longer needs it.
        Register jdx = tmp1;

        masm.bind(labelSecondLoop);
        masm.xorq(carry, carry);    // carry = 0;
        masm.movl(jdx, ylen);       // j = ystart+1
        // i = xstart-1;
        masm.sublAndJcc(xstart, 1, ConditionFlag.Negative, labelDone, false);

        // Saved because z is re-based for the inner loop; restored by the last pop below.
        masm.push(z);

        // z = z + k - j
        masm.leaq(z, new AMD64Address(z, xstart, Stride.S4, 4));
        // i = xstart-1;
        // A negative result means only one 32-bit word of x is left; it is loaded at labelLastX.
        masm.sublAndJcc(xstart, 1, ConditionFlag.Negative, labelLastX, false);

        // Load the next 64-bit chunk of x as the multiplier (rdx on the BMI2 path).
        if (useBMI2Instructions) {
            masm.movq(rdx, new AMD64Address(x, xstart, Stride.S4, 0));
            masm.rorxq(rdx, rdx, 32); // convert big-endian to little-endian
        } else {
            masm.movq(xAtXstart, new AMD64Address(x, xstart, Stride.S4, 0));
            masm.rorq(xAtXstart, 32);  // convert big-endian to little-endian
        }

        masm.bind(labelThirdLoopPrologue);

        // The inner loop gets x as its carry2 register and ylen as its loop counter, and the BMI2
        // variant also gets tmp3 (= xstart) as scratch, so all three are saved here.
        masm.push(x);
        masm.push(xstart);
        masm.push(ylen);

        if (useBMI2Instructions) {
            multiply128x128BMI2Loop(masm, y, z, carry, x, jdx, ylen, product, tmp2, xAtXstart, tmp3, tmp4);
        } else { // !UseBMI2Instructions
            multiply128x128Loop(masm, xAtXstart, y, z, yAtIdx, jdx, ylen, carry, product, x);
        }

        // Restore in reverse order. The saved xstart is deliberately popped into xlen and copied
        // to tmp3 (= xstart) just below.
        masm.pop(ylen);
        masm.pop(xlen);
        masm.pop(x);
        masm.pop(z);

        // Store the carry of this pass: low 32 bits at z[xstart + 1] and, unless xstart is
        // negative (last pass), the upper 32 bits at z[xstart].
        masm.movl(tmp3, xlen);
        masm.addl(tmp3, 1);
        masm.movl(new AMD64Address(z, tmp3, Stride.S4, 0), carry);
        masm.sublAndJcc(tmp3, 1, ConditionFlag.Negative, labelDone, false);

        masm.shrq(carry, 32);
        masm.movl(new AMD64Address(z, tmp3, Stride.S4, 0), carry);
        masm.jmp(labelSecondLoop);

        // Next infrequent code is moved outside loops.
        masm.bind(labelLastX);

        // Single remaining 32-bit word of x: 32-bit load from index 0, no half swap needed.
        if (useBMI2Instructions) {
            masm.movl(rdx, new AMD64Address(x));
        } else {
            masm.movl(xAtXstart, new AMD64Address(x));
        }
        masm.jmp(labelThirdLoopPrologue);

        masm.bind(labelDone);
    }

    /**
     * Always returns {@code true}: the code emitted by this op contains push and pop
     * instructions (see {@code multiplyToLen}).
     */
    @Override
    public boolean modifiesStackPointer() {
        return true;
    }
}