/*
 * Copyright (c) 2021, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2021, Arm Limited. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
package jdk.graal.compiler.lir.aarch64;

import static jdk.graal.compiler.asm.aarch64.AArch64ASIMDAssembler.ASIMDInstruction.LD2_MULTIPLE_2R;
import static jdk.graal.compiler.asm.aarch64.AArch64ASIMDAssembler.ASIMDInstruction.LD4_MULTIPLE_4R;
import static jdk.graal.compiler.asm.aarch64.AArch64ASIMDAssembler.ASIMDSize.FullReg;
import static jdk.graal.compiler.asm.aarch64.AArch64Address.createStructureImmediatePostIndexAddress;
import static jdk.graal.compiler.asm.aarch64.AArch64Assembler.ConditionFlag;
import static jdk.graal.compiler.lir.LIRInstruction.OperandFlag.REG;
import static jdk.vm.ci.aarch64.AArch64.zr;
import static jdk.vm.ci.code.ValueUtil.asRegister;

import java.util.Arrays;

import jdk.graal.compiler.asm.Label;
import jdk.graal.compiler.asm.aarch64.AArch64ASIMDAssembler.ElementSize;
import jdk.graal.compiler.asm.aarch64.AArch64Address;
import jdk.graal.compiler.asm.aarch64.AArch64Address.AddressingMode;
import jdk.graal.compiler.asm.aarch64.AArch64Assembler.ExtendType;
import jdk.graal.compiler.asm.aarch64.AArch64Assembler.ShiftType;
import jdk.graal.compiler.asm.aarch64.AArch64MacroAssembler;
import jdk.graal.compiler.asm.aarch64.AArch64MacroAssembler.ScratchRegister;
import jdk.graal.compiler.core.common.Stride;
import jdk.graal.compiler.debug.GraalError;
import jdk.graal.compiler.lir.LIRInstructionClass;
import jdk.graal.compiler.lir.Opcode;
import jdk.graal.compiler.lir.asm.CompilationResultBuilder;
import jdk.graal.compiler.lir.gen.LIRGeneratorTool;
import jdk.graal.compiler.lir.gen.LIRGeneratorTool.ArrayIndexOfVariant;
import jdk.vm.ci.aarch64.AArch64Kind;
import jdk.vm.ci.code.Register;
import jdk.vm.ci.meta.AllocatableValue;
import jdk.vm.ci.meta.Value;

@Opcode("AArch64_ARRAY_INDEX_OF")
public final class AArch64ArrayIndexOfOp extends AArch64ComplexVectorOp {
    public static final LIRInstructionClass<AArch64ArrayIndexOfOp> TYPE = LIRInstructionClass.create(AArch64ArrayIndexOfOp.class);

    private final ArrayIndexOfVariant variant;
    private final boolean findTwoConsecutive;
    private final boolean withMask;
    private final Stride stride;

    @Def({REG}) protected AllocatableValue resultValue;
    @Alive({REG}) protected AllocatableValue arrayPtrValue;
    @Alive({REG}) protected AllocatableValue arrayOffsetValue;
    @Alive({REG}) protected AllocatableValue arrayLengthValue;
    @Alive({REG}) protected AllocatableValue fromIndexValue;
    @Alive({REG}) protected AllocatableValue[] searchValues;

    @Temp({REG}) protected AllocatableValue[] temp;
    @Temp({REG}) protected Value[] vectorTemp;

    public AArch64ArrayIndexOfOp(Stride stride, ArrayIndexOfVariant variant, LIRGeneratorTool tool,
                    AllocatableValue result, AllocatableValue arrayPtr, AllocatableValue arrayOffset, AllocatableValue arrayLength, AllocatableValue fromIndex,
                    AllocatableValue[] searchValues) {
        super(TYPE);

        int nValues = searchValues.length;
        GraalError.guarantee(result.getPlatformKind() == AArch64Kind.DWORD, "int value expected");
        GraalError.guarantee(arrayPtr.getPlatformKind() == AArch64Kind.QWORD, "pointer value expected");
        GraalError.guarantee(arrayOffset.getPlatformKind() == AArch64Kind.QWORD, "long value expected");
        GraalError.guarantee(arrayLength.getPlatformKind() == AArch64Kind.DWORD, "int value expected");
        GraalError.guarantee(fromIndex.getPlatformKind() == AArch64Kind.DWORD, "int value expected");
        if (variant == ArrayIndexOfVariant.Table) {
            GraalError.guarantee(searchValues.length == 1 && searchValues[0].getPlatformKind() == AArch64Kind.QWORD, "single pointer value expected");
        } else {
            GraalError.guarantee(Arrays.stream(searchValues).allMatch(sv -> sv.getPlatformKind() == AArch64Kind.DWORD), "int values expected");
        }

        this.stride = stride;
        this.variant = variant;
        this.findTwoConsecutive = variant == ArrayIndexOfVariant.FindTwoConsecutive || variant == ArrayIndexOfVariant.FindTwoConsecutiveWithMask;
        this.withMask = variant == ArrayIndexOfVariant.WithMask || variant == ArrayIndexOfVariant.FindTwoConsecutiveWithMask;
        resultValue = result;
        arrayPtrValue = arrayPtr;
        arrayOffsetValue = arrayOffset;
        arrayLengthValue = arrayLength;
        fromIndexValue = fromIndex;
        this.searchValues = searchValues;

        temp = allocateTempRegisters(tool, 5);
        vectorTemp = allocateVectorRegisters(tool, getNumberOfRequiredVectorRegisters(variant, stride, nValues), variant == ArrayIndexOfVariant.Table);
    }

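    /**
     * Computes how many SIMD registers each variant needs: two registers for the loaded chunk, one
     * per broadcast search value, and temporaries for the match computation (the findTwoConsecutive
     * variants need one extra register to carry the previous chunk's tail). The Table variant
     * instead needs registers for the two lookup tables, the 0x0f mask, and the de-structured
     * LD2/LD4 loads used for the wider strides.
     */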
    private static int getNumberOfRequiredVectorRegisters(ArrayIndexOfVariant variant, Stride stride, int nValues) {
        switch (variant) {
            case MatchAny:
                return nValues + (nValues > 1 ? 6 : 3);
            case MatchRange:
                return nValues + 6;
            case WithMask:
                return nValues + 3;
            case FindTwoConsecutive:
            case FindTwoConsecutiveWithMask:
                return nValues + 5;
            case Table:
                switch (stride) {
                    case S1:
                        return 7;
                    case S2:
                        return 9;
                    case S4:
                        return 11;
                    default:
                        throw GraalError.shouldNotReachHereUnexpectedValue(stride); // ExcludeFromJacocoGeneratedReport
                }
            default:
                throw GraalError.shouldNotReachHereUnexpectedValue(variant); // ExcludeFromJacocoGeneratedReport
        }
    }

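    /**
     * Emits the element-by-element fallback loop used when the search range is smaller than one
     * SIMD chunk (see {@link #emitCode}).
     */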
    private void emitScalarCode(AArch64MacroAssembler masm, Register baseAddress, Register searchLength) {
        Register result = asRegister(resultValue);
        Register arrayLength = asRegister(arrayLengthValue);
        Register curValue = asRegister(temp[2]);

        Label match = new Label();
        Label searchByElementLoop = new Label();
        Label done = new Label();

        /*
         * AArch64 comparisons are at minimum 32 bits; since we are comparing against a
         * zero-extended value, the searchValue must also be zero-extended. This is already done by
         * the caller.
         */
        final int memAccessSize = (findTwoConsecutive ? 2 : 1) * stride.value * Byte.SIZE;
        Register searchValueReg;
        Register maskReg;
        int compareSize = Math.max(32, memAccessSize);
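        // e.g., findTwoConsecutive with stride S2: memAccessSize = 2 * 2 * 8 = 32 bits, so each
        // load reads both chars at once while compareSize stays at the 32-bit minimum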
        if (findTwoConsecutive) {
            searchValueReg = asRegister(temp[3]);
            masm.lsl(compareSize, searchValueReg, asRegister(searchValues[1]), (long) stride.value * Byte.SIZE);
            masm.orr(compareSize, searchValueReg, searchValueReg, asRegister(searchValues[0]));
            if (withMask) {
                maskReg = asRegister(temp[4]);
                masm.lsl(compareSize, maskReg, asRegister(searchValues[3]), (long) stride.value * Byte.SIZE);
                masm.orr(compareSize, maskReg, maskReg, asRegister(searchValues[2]));
            } else {
                maskReg = null;
            }
        } else {
            searchValueReg = asRegister(searchValues[0]);
            if (withMask) {
                maskReg = asRegister(searchValues[1]);
            } else {
                maskReg = null;
            }
        }

        /*
         * Search the array sequentially for the target value. Note that all indexing is done via a
         * negative offset from the first position beyond the end of the search region.
         */
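        /*
         * Example (stride S2, arrayLength = 8, fromIndex = 3): searchLength = 5, baseAddress is
         * advanced by 8 << 1 bytes, and curIndex starts at -(5 << 1). A match at curIndex = -6
         * then yields index = 8 + (-6 >> 1) = 5.
         */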
        Register endIndex = arrayLength;
        if (findTwoConsecutive) {
            /* The end of the region is the second-to-last element in the array */
            masm.sub(32, endIndex, endIndex, 1);
            masm.cbz(32, endIndex, done);
        }
        /*
         * Set the base address to be at the end of the region baseAddress + (endIndex << shift
         * size).
         */
        masm.add(64, baseAddress, baseAddress, endIndex, ExtendType.SXTW, stride.log2);
        /* Initial index is -searchLength << shift size */
        Register curIndex = searchLength;
        masm.sub(64, curIndex, zr, curIndex, ShiftType.LSL, stride.log2);
        /* Loop doing element-by-element search */
        masm.align(AArch64MacroAssembler.PREFERRED_LOOP_ALIGNMENT);
        masm.bind(searchByElementLoop);
        masm.ldr(memAccessSize, curValue, AArch64Address.createRegisterOffsetAddress(memAccessSize, baseAddress, curIndex, false));
        switch (variant) {
            case MatchAny:
                for (AllocatableValue searchValue : searchValues) {
                    masm.cmp(compareSize, asRegister(searchValue), curValue);
                    masm.branchConditionally(ConditionFlag.EQ, match);
                }
                break;
            case MatchRange:
                for (int i = 0; i < searchValues.length; i += 2) {
                    Label noMatch = new Label();
                    masm.cmp(compareSize, curValue, asRegister(searchValues[i]));
                    masm.branchConditionally(ConditionFlag.LO, noMatch);
                    masm.cmp(compareSize, curValue, asRegister(searchValues[i + 1]));
                    masm.branchConditionally(ConditionFlag.LS, match);
                    masm.bind(noMatch);
                }
                break;
            case WithMask:
            case FindTwoConsecutiveWithMask:
                masm.orr(compareSize, curValue, curValue, maskReg);
                masm.cmp(compareSize, searchValueReg, curValue);
                masm.branchConditionally(ConditionFlag.EQ, match);
                break;
            case FindTwoConsecutive:
                masm.cmp(compareSize, searchValueReg, curValue);
                masm.branchConditionally(ConditionFlag.EQ, match);
                break;
            case Table:
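                /*
                 * The 32-byte table referenced by searchValues[0] holds two 16-entry bitsets: the
                 * first 16 bytes are indexed by the high nibble of an array byte, the last 16 by
                 * its low nibble. A byte v matches iff (table[v >>> 4] & table[16 + (v & 0xf)]) != 0.
                 */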
                Label greaterThan0xff = new Label();
                try (ScratchRegister sc = masm.getScratchRegister()) {
                    Register tmp = sc.getRegister();
                    if (stride.value > 1) {
                        masm.compare(compareSize, curValue, 0xff);
                        masm.branchConditionally(ConditionFlag.HI, greaterThan0xff);
                    }
                    // get lower 4 bits
                    masm.and(compareSize, tmp, curValue, 0xf);
                    // get upper 4 bits
                    masm.asr(compareSize, curValue, curValue, 4);
                    // add offset to second lookup table
                    masm.add(compareSize, tmp, tmp, 16);
                    // load lookup table entries
                    masm.ldr(8, curValue, AArch64Address.createRegisterOffsetAddress(8, asRegister(searchValues[0]), curValue, false));
                    masm.ldr(8, tmp, AArch64Address.createRegisterOffsetAddress(8, asRegister(searchValues[0]), tmp, false));
                    // AND results
                    masm.tst(compareSize, curValue, tmp);
                    // if result is non-zero, a match was found
                    masm.branchConditionally(ConditionFlag.NE, match);
                    masm.bind(greaterThan0xff);
                }
                break;
        }
        /*
         * Add elementSize to the curIndex and retry if the end of the search region has not yet
         * been reached, i.e., the curIndex is still < 0.
         */
        masm.adds(64, curIndex, curIndex, stride.value);
        masm.branchConditionally(ConditionFlag.MI, searchByElementLoop);
        masm.jmp(done);

        masm.align(AArch64MacroAssembler.PREFERRED_BRANCH_TARGET_ALIGNMENT);
        masm.bind(match);
        /* index = endIndex + (curIndex >> shiftSize) */
        masm.add(32, result, endIndex, curIndex, ShiftType.ASR, stride.log2);
        masm.bind(done);
        if (findTwoConsecutive) {
            /* restore arrayLength since it is marked as @Alive */
            masm.add(32, endIndex, endIndex, 1);
        }
    }

    private void emitSIMDCode(AArch64MacroAssembler masm, Register baseAddress) {
        /*
         * @formatter:off
         *  Find a single char in a chunk:
         *  Chunk-based reading uses the following approach (shown for UTF-16), inspired by [1]. The steps describe the computation
         *  applied to one 8-byte lane; the SIMD implementation replicates them across all 4 lanes of 8 bytes each.
         *  1. Fill an 8-byte chunk with the search element ('searchElement'): 0x000000000000elem -> 0xelemelemelemelem
         *  2. Read and compare the array chunk-by-chunk.
         *   2.1 Store the address of the last chunk's start in 'refAddress'. This ensures that we don't read beyond the array boundary.
         *   2.2 The array is processed in chunks: the first unaligned chunk (head) -> all 32-byte aligned chunks -> the last unaligned
         *   chunk of 32 bytes (tail). Transitions between an unaligned and an aligned chunk repeat the element search on the
         *   overlapping part. However, this overhead shrinks for longer strings and is compensated by the gains from using larger
         *   (32-byte) chunks.
         *   2.3 After processing each chunk, align the address to a 32-byte boundary before reading the next chunk. Repeat this step
         *   until the last chunk is reached (address of the next chunk >= 'refAddress').
         *   2.4 On reaching the last chunk, reset the address so that the final read ends exactly at the array boundary.
         *  3. Check each chunk read to see whether 'searchElement' is present, and if so, calculate the match index with calcIndexOfFirstMatch.
         *
         *  Find two consecutive chars in a chunk:
         *  The findTwoConsecutive case applies the same steps as the single-character search, but to the two characters separately.
         *  To look for two consecutive characters 'c1c2', we search for 'c1' in a first chunk covering [-1..n-1] and for 'c2' in a
         *  second chunk covering [0..n]. Consequently, if the matching sequence is present, 'c1' and 'c2' are found at the same
         *  position in their respective chunks.
         *  The following list highlights the differences compared to the single-character search.
         *   1a. Use two registers, each repeating one of the two consecutive characters to search for.
         *   2a. Read the second chunk starting from the 32-byte aligned position. Form the first chunk by concatenating the last
         *       element of the previous iteration's second chunk with the current second chunk.
         *   3a. Compare each chunk for the presence of its corresponding char.
         *    3a.1 The first chunk is compared with the register repeating the first char, and the second chunk with the second.
         *    3a.2 Perform a logical AND on the comparison results for the first and second char.
         *   4a. Because the second chunk starts one char after the first, the AND from 3a.2 yields a register with all bits set at the
         *  position where the match is found. The steps to find the position of the match in the search string remain unchanged.
         *
         * Other variants are described in their implementation below.
         *
         *  [1] https://github.com/ARM-software/optimized-routines/blob/master/string/aarch64/strchr.S
         * @formatter:on
         */

        int chunkSize = getSIMDLoopChunkSize();

        Register result = asRegister(resultValue);
        Register arrayLength = asRegister(arrayLengthValue);
        Register fromIndex = asRegister(fromIndexValue);
        Register currOffset = asRegister(temp[1]);
        Register searchEnd = asRegister(temp[2]);
        Register refAddress = asRegister(temp[3]);
        Register array = asRegister(temp[4]);

        Register[] vecSearchValues = new Register[searchValues.length];
        Register[] vecTmp = new Register[vectorTemp.length - (searchValues.length + 2)];
        final Register vecArray1;
        final Register vecArray2;
        Register vecMask0x0f;
        Register vecTableHi;
        Register vecTableLo;

        if (variant == ArrayIndexOfVariant.Table) {
            vecMask0x0f = asRegister(vectorTemp[0]);
            vecTableHi = asRegister(vectorTemp[1]);
            vecTableLo = asRegister(vectorTemp[2]);
            switch (stride) {
                case S1:
                    vecArray1 = asRegister(vectorTemp[3]);
                    vecArray2 = asRegister(vectorTemp[4]);
                    vecTmp[0] = asRegister(vectorTemp[5]);
                    vecTmp[1] = asRegister(vectorTemp[6]);
                    break;
                case S2:
                    // consecutive register ordering required for LD2 instructions
                    vecArray1 = asRegister(vectorTemp[3]);
                    vecTmp[0] = asRegister(vectorTemp[4]);
                    vecArray2 = asRegister(vectorTemp[5]);
                    vecTmp[1] = asRegister(vectorTemp[6]);
                    vecTmp[2] = asRegister(vectorTemp[7]);
                    vecTmp[3] = asRegister(vectorTemp[8]);
                    break;
                case S4:
                    // consecutive register ordering required for LD4 instructions
                    vecArray1 = asRegister(vectorTemp[3]);
                    vecTmp[0] = asRegister(vectorTemp[4]);
                    vecTmp[1] = asRegister(vectorTemp[5]);
                    vecTmp[2] = asRegister(vectorTemp[6]);
                    vecArray2 = asRegister(vectorTemp[7]);
                    vecTmp[3] = asRegister(vectorTemp[8]);
                    vecTmp[4] = asRegister(vectorTemp[9]);
                    vecTmp[5] = asRegister(vectorTemp[10]);
                    break;
                default:
                    throw GraalError.shouldNotReachHereUnexpectedValue(stride); // ExcludeFromJacocoGeneratedReport
            }
        } else {
            vecArray1 = asRegister(vectorTemp[0]);
            vecArray2 = asRegister(vectorTemp[1]);
            for (int i = 0; i < searchValues.length; i++) {
                vecSearchValues[i] = asRegister(vectorTemp[i + 2]);
            }
            for (int i = searchValues.length + 2; i < vectorTemp.length - (findTwoConsecutive ? 1 : 0); i++) {
                vecTmp[i - (searchValues.length + 2)] = asRegister(vectorTemp[i]);
            }
            vecMask0x0f = null;
            vecTableHi = null;
            vecTableLo = null;
        }
        Register vecLastArray2 = findTwoConsecutive ? asRegister(vectorTemp[vectorTemp.length - 1]) : null;

        Label matchInChunk = new Label();
        Label searchByChunkLoopHead = new Label();
        Label searchByChunkLoopTail = new Label();
        Label processTail = new Label();
        Label end = new Label();

        ElementSize eSize = ElementSize.fromStride(stride);
        if (variant == ArrayIndexOfVariant.Table) {
            // in the table variant, searchValue0 is actually a pointer to a 32-byte array
            masm.fldp(128, vecTableHi, vecTableLo, AArch64Address.createPairBaseRegisterOnlyAddress(128, asRegister(searchValues[0])));
            masm.neon.moveVI(FullReg, ElementSize.Byte, vecMask0x0f, 0x0f);
        } else {
            /* 1. Duplicate the searchElement(s) to 16-bytes */
            for (int i = 0; i < searchValues.length; i++) {
                masm.neon.dupVG(FullReg, eSize, vecSearchValues[i], asRegister(searchValues[i]));
            }
        }
        /*
         * 2.1 Set searchEnd pointing to byte after the last valid element in the array and
         * 'refAddress' pointing to the beginning of the last chunk.
         */
        masm.add(64, searchEnd, baseAddress, arrayLength, ExtendType.SXTW, stride.log2);
        masm.sub(64, refAddress, searchEnd, chunkSize);
        /* Set 'array' pointing to the chunk from where the search begins. */
        masm.add(64, array, baseAddress, fromIndex, ExtendType.SXTW, stride.log2);

        if (findTwoConsecutive) {
            // initialize previous chunk tail vector to a value that can't match the first search
            // value.
            masm.neon.notVV(FullReg, vecLastArray2, vecSearchValues[0]);
            // peel first loop iteration because in this variant the loop body depends on the last
            // iteration's state, and we still want to align memory accesses
            masm.cmp(64, refAddress, array);
            masm.branchConditionally(ConditionFlag.LS, processTail);
            masm.sub(64, currOffset, array, baseAddress);
            masm.fldp(128, vecArray1, vecArray2, AArch64Address.createImmediateAddress(128, AddressingMode.IMMEDIATE_PAIR_POST_INDEXED, array, 32));
            emitSIMDMatch(masm, eSize, array, vecSearchValues, vecTmp, vecArray1, vecArray2, vecLastArray2, vecMask0x0f, vecTableHi, vecTableLo, matchInChunk);
            // align address to 32-byte boundary
            masm.bic(64, array, array, chunkSize - 1);
            // fix vecLastArray2: load from array - 1 and move the resulting vector's first element
            // to the last
            masm.fldr(128, vecArray2, AArch64Address.createImmediateAddress(128, AddressingMode.IMMEDIATE_SIGNED_UNSCALED, array, -stride.value));
            masm.neon.insXX(eSize, vecLastArray2, (FullReg.bytes() / stride.value) - 1, vecArray2, 0);
        }

        masm.align(AArch64MacroAssembler.PREFERRED_LOOP_ALIGNMENT);
        masm.bind(searchByChunkLoopHead);
        masm.cmp(64, refAddress, array);
        masm.branchConditionally(ConditionFlag.LS, processTail);
        masm.bind(searchByChunkLoopTail);
        masm.sub(64, currOffset, array, baseAddress);

        if (variant != ArrayIndexOfVariant.Table) {
            masm.fldp(128, vecArray1, vecArray2, AArch64Address.createImmediateAddress(128, AddressingMode.IMMEDIATE_PAIR_POST_INDEXED, array, 32));
        }
        emitSIMDMatch(masm, eSize, array, vecSearchValues, vecTmp, vecArray1, vecArray2, vecLastArray2, vecMask0x0f, vecTableHi, vecTableLo, matchInChunk);
        /* No match; jump to next loop iteration. */
        if (!findTwoConsecutive) {
            // align address to 32-byte boundary
            masm.bic(64, array, array, chunkSize - 1);
        }
        masm.jmp(searchByChunkLoopHead);

        masm.align(AArch64MacroAssembler.PREFERRED_BRANCH_TARGET_ALIGNMENT);
        masm.bind(processTail);
        masm.cmp(64, array, searchEnd);
        masm.branchConditionally(ConditionFlag.HS, end);
        if (findTwoConsecutive) {
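            // back up one extra element: the post-indexed load below advances 'array' by
            // stride.value again, so the final chunk still ends exactly at 'searchEnd'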
            masm.sub(64, array, searchEnd, chunkSize + stride.value);
            // fix vecLastArray2 for last iteration: load from last chunk index - 1 and move the
            // resulting vector's first element to the last
            masm.fldr(128, vecArray2, AArch64Address.createImmediateAddress(128, AddressingMode.IMMEDIATE_POST_INDEXED, array, stride.value));
            masm.neon.insXX(eSize, vecLastArray2, (FullReg.bytes() / stride.value) - 1, vecArray2, 0);
        } else {
            masm.sub(64, array, searchEnd, chunkSize);
        }
        /*
         * Set 'searchEnd' to zero because at the end of 'searchByChunkLoopTail', 'array' is rolled
         * back to a 32-byte aligned address. Thus, unless 'searchEnd' is adjusted, the
         * 'processTail' comparison condition 'array' >= 'searchEnd' may never become true.
         */
        masm.mov(64, searchEnd, zr);
        masm.jmp(searchByChunkLoopTail);

        /* 4. If the element is found in a 32-byte chunk then find its position. */
        masm.align(AArch64MacroAssembler.PREFERRED_BRANCH_TARGET_ALIGNMENT);
        masm.bind(matchInChunk);
        if (variant == ArrayIndexOfVariant.Table) {
            // convert matching bytes to 0xff
            masm.neon.cmtstVVV(FullReg, ElementSize.Byte, vecArray1, vecArray1, vecArray1);
            masm.neon.cmtstVVV(FullReg, ElementSize.Byte, vecArray2, vecArray2, vecArray2);
        }
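        // calcIndexOfFirstMatch (inherited from AArch64ComplexVectorOp) leaves twice the offset of
        // the first matching byte within the two match vectors in tmp, hence the ASR by 1 below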
        try (ScratchRegister scratchReg = masm.getScratchRegister()) {
            Register tmp = scratchReg.getRegister();
            initCalcIndexOfFirstMatchMask(masm, vecTmp[0], tmp);
            calcIndexOfFirstMatch(masm, tmp, vecArray1, vecArray2, vecTmp[0], false);
            if (variant == ArrayIndexOfVariant.Table) {
                masm.asr(64, currOffset, currOffset, stride.log2);
            }
            masm.add(64, result, currOffset, tmp, ShiftType.ASR, 1);
            if (findTwoConsecutive) {
                masm.sub(64, result, result, 1);
            }
        }
        if (getMatchResultStride().log2 != 0) {
            /* Convert byte offset of searchElement to its array index */
            masm.asr(64, result, result, getMatchResultStride().log2);
        }
        masm.bind(end);
    }

    private void emitSIMDMatch(AArch64MacroAssembler masm,
                    ElementSize eSize,
                    Register array,
                    Register[] vecSearchValues,
                    Register[] vecTmp,
                    Register vecArray1,
                    Register vecArray2,
                    Register vecLastArray2,
                    Register vecMask0x0f,
                    Register vecTableHi,
                    Register vecTableLo,
                    Label matchInChunk) {
        if (findTwoConsecutive) {
            /*
             * In the findTwoConsecutive case, the first search element is compared against the
             * current chunk shifted back by one element: vecTmp[0] holds
             * vecLastArray2[15]:vecArray1[0..14] and vecTmp[1] holds vecArray1[15]:vecArray2[0..14]
             * (byte indices shown for stride S1).
             */
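            // i.e., lane i of vecTmp[0] holds element i - 1 of the current chunk, so a 'c1' match
            // in vecTmp[0] lines up with a 'c2' match in the same lane of vecArray1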
            // setting vecTmp[0] = vecLastArray2[15]:vecArray1[0:15]
            masm.neon.extVVV(FullReg, vecTmp[0], vecLastArray2, vecArray1, FullReg.bytes() - stride.value);
            // setting vecTmp[1] = vecArray1[15]:vecArray2[0:15]
            masm.neon.extVVV(FullReg, vecTmp[1], vecArray1, vecArray2, FullReg.bytes() - stride.value);
            // save vecArray2 for next iteration
            masm.neon.moveVV(FullReg, vecLastArray2, vecArray2);
        }
        switch (variant) {
            case MatchAny:
                int nValues = vecSearchValues.length;
                if (nValues == 1) {
                    masm.neon.cmeqVVV(FullReg, eSize, vecArray1, vecArray1, vecSearchValues[0]);
                    masm.neon.cmeqVVV(FullReg, eSize, vecArray2, vecArray2, vecSearchValues[0]);
                } else {
                    masm.neon.cmeqVVV(FullReg, eSize, vecTmp[0], vecArray1, vecSearchValues[0]);
                    masm.neon.cmeqVVV(FullReg, eSize, vecTmp[1], vecArray2, vecSearchValues[0]);
                    for (int i = 1; i < nValues; i++) {
                        masm.neon.cmeqVVV(FullReg, eSize, vecTmp[2], vecArray1, vecSearchValues[i]);
                        masm.neon.cmeqVVV(FullReg, eSize, vecTmp[3], vecArray2, vecSearchValues[i]);
                        masm.neon.orrVVV(FullReg, i == nValues - 1 ? vecArray1 : vecTmp[0], vecTmp[0], vecTmp[2]);
                        masm.neon.orrVVV(FullReg, i == nValues - 1 ? vecArray2 : vecTmp[1], vecTmp[1], vecTmp[3]);
                    }
                }
                break;
            case MatchRange:
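                // cmhs is an unsigned "higher or same" compare, so each range check is an unsigned
                // lowerBound <= element <= upperBound test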
                // match first range
                // check if array elements are greater or equal to lower bound
                masm.neon.cmhsVVV(FullReg, eSize, vecTmp[0], vecArray1, vecSearchValues[0]);
                masm.neon.cmhsVVV(FullReg, eSize, vecTmp[1], vecArray2, vecSearchValues[0]);
                // check if upper bound is greater or equal to array elements
                masm.neon.cmhsVVV(FullReg, eSize, vecTmp[2], vecSearchValues[1], vecArray1);
                masm.neon.cmhsVVV(FullReg, eSize, vecTmp[3], vecSearchValues[1], vecArray2);
                if (searchValues.length == 4) {
                    // merge results of first range comparisons
                    masm.neon.andVVV(FullReg, vecTmp[0], vecTmp[0], vecTmp[2]);
                    masm.neon.andVVV(FullReg, vecTmp[1], vecTmp[1], vecTmp[3]);

                    // match second range
                    // check if array elements are greater or equal to lower bound
                    masm.neon.cmhsVVV(FullReg, eSize, vecTmp[2], vecArray1, vecSearchValues[2]);
                    masm.neon.cmhsVVV(FullReg, eSize, vecTmp[3], vecArray2, vecSearchValues[2]);
                    // check if upper bound is greater or equal to array elements
                    masm.neon.cmhsVVV(FullReg, eSize, vecArray1, vecSearchValues[3], vecArray1);
                    masm.neon.cmhsVVV(FullReg, eSize, vecArray2, vecSearchValues[3], vecArray2);
                    // merge results of second range comparisons
                    masm.neon.andVVV(FullReg, vecTmp[2], vecTmp[2], vecArray1);
                    masm.neon.andVVV(FullReg, vecTmp[3], vecTmp[3], vecArray2);

                    // merge results of both range comparisons
                    masm.neon.orrVVV(FullReg, vecArray1, vecTmp[0], vecTmp[2]);
                    masm.neon.orrVVV(FullReg, vecArray2, vecTmp[1], vecTmp[3]);
                } else {
                    // merge results of first range comparisons
                    masm.neon.andVVV(FullReg, vecArray1, vecTmp[0], vecTmp[2]);
                    masm.neon.andVVV(FullReg, vecArray2, vecTmp[1], vecTmp[3]);
                }
                break;
            case WithMask:
                masm.neon.orrVVV(FullReg, vecArray1, vecArray1, vecSearchValues[1]);
                masm.neon.orrVVV(FullReg, vecArray2, vecArray2, vecSearchValues[1]);
                masm.neon.cmeqVVV(FullReg, eSize, vecArray1, vecArray1, vecSearchValues[0]);
                masm.neon.cmeqVVV(FullReg, eSize, vecArray2, vecArray2, vecSearchValues[0]);
                break;
            case FindTwoConsecutive:
                masm.neon.cmeqVVV(FullReg, eSize, vecTmp[0], vecTmp[0], vecSearchValues[0]);
                masm.neon.cmeqVVV(FullReg, eSize, vecTmp[1], vecTmp[1], vecSearchValues[0]);
                masm.neon.cmeqVVV(FullReg, eSize, vecArray1, vecArray1, vecSearchValues[1]);
                masm.neon.cmeqVVV(FullReg, eSize, vecArray2, vecArray2, vecSearchValues[1]);
                masm.neon.andVVV(FullReg, vecArray1, vecArray1, vecTmp[0]);
                masm.neon.andVVV(FullReg, vecArray2, vecArray2, vecTmp[1]);
                break;
            case FindTwoConsecutiveWithMask:
                masm.neon.orrVVV(FullReg, vecTmp[0], vecTmp[0], vecSearchValues[2]);
                masm.neon.orrVVV(FullReg, vecTmp[1], vecTmp[1], vecSearchValues[2]);
                masm.neon.orrVVV(FullReg, vecArray1, vecArray1, vecSearchValues[3]);
                masm.neon.orrVVV(FullReg, vecArray2, vecArray2, vecSearchValues[3]);
                masm.neon.cmeqVVV(FullReg, eSize, vecTmp[0], vecTmp[0], vecSearchValues[0]);
                masm.neon.cmeqVVV(FullReg, eSize, vecTmp[1], vecTmp[1], vecSearchValues[0]);
                masm.neon.cmeqVVV(FullReg, eSize, vecArray1, vecArray1, vecSearchValues[1]);
                masm.neon.cmeqVVV(FullReg, eSize, vecArray2, vecArray2, vecSearchValues[1]);
                masm.neon.andVVV(FullReg, vecArray1, vecArray1, vecTmp[0]);
                masm.neon.andVVV(FullReg, vecArray2, vecArray2, vecTmp[1]);
                break;
            case Table:
                switch (stride) {
                    case S1:
                        masm.fldp(128, vecArray1, vecArray2, AArch64Address.createImmediateAddress(128, AddressingMode.IMMEDIATE_PAIR_POST_INDEXED, array, 32));
                        performTableLookup(masm, vecMask0x0f, vecTableHi, vecTableLo, vecArray1, vecArray2, vecTmp[0], vecTmp[1]);
                        break;
                    case S2:
                        // de-structuring load: get array element's upper and lower bytes into
                        // separate vectors
                        masm.neon.ld2MultipleVV(FullReg, ElementSize.Byte, vecArray1, vecTmp[0], createStructureImmediatePostIndexAddress(LD2_MULTIPLE_2R, FullReg, eSize, array, 32));
                        masm.neon.ld2MultipleVV(FullReg, ElementSize.Byte, vecArray2, vecTmp[1], createStructureImmediatePostIndexAddress(LD2_MULTIPLE_2R, FullReg, eSize, array, 32));
                        // compare upper bytes to zero
                        masm.neon.moveVI(FullReg, ElementSize.Byte, vecTmp[2], 0);
                        masm.neon.cmeqVVV(FullReg, ElementSize.Byte, vecTmp[0], vecTmp[0], vecTmp[2]);
                        masm.neon.cmeqVVV(FullReg, ElementSize.Byte, vecTmp[1], vecTmp[1], vecTmp[2]);
                        // perform table lookup on lower bytes
                        performTableLookup(masm, vecMask0x0f, vecTableHi, vecTableLo, vecArray1, vecArray2, vecTmp[2], vecTmp[3]);
                        // eliminate all matches where the corresponding high byte is not zero
                        masm.neon.andVVV(FullReg, vecArray1, vecArray1, vecTmp[0]);
                        masm.neon.andVVV(FullReg, vecArray2, vecArray2, vecTmp[1]);
                        break;
                    case S4:
                        // de-structuring load: get array element's upper and lower bytes into
                        // separate vectors
                        masm.neon.ld4MultipleVVVV(FullReg, ElementSize.Byte, vecArray1, vecTmp[0], vecTmp[1], vecTmp[2],
                                        createStructureImmediatePostIndexAddress(LD4_MULTIPLE_4R, FullReg, eSize, array, 64));
                        masm.neon.ld4MultipleVVVV(FullReg, ElementSize.Byte, vecArray2, vecTmp[3], vecTmp[4], vecTmp[5],
                                        createStructureImmediatePostIndexAddress(LD4_MULTIPLE_4R, FullReg, eSize, array, 64));
                        // merge upper bytes
                        masm.neon.orrVVV(FullReg, vecTmp[0], vecTmp[0], vecTmp[1]);
                        masm.neon.orrVVV(FullReg, vecTmp[3], vecTmp[3], vecTmp[4]);
                        masm.neon.orrVVV(FullReg, vecTmp[0], vecTmp[0], vecTmp[2]);
                        masm.neon.orrVVV(FullReg, vecTmp[1], vecTmp[3], vecTmp[5]);
                        // compare upper bytes to zero
                        masm.neon.moveVI(FullReg, ElementSize.Byte, vecTmp[2], 0);
                        masm.neon.cmeqVVV(FullReg, ElementSize.Byte, vecTmp[0], vecTmp[0], vecTmp[2]);
                        masm.neon.cmeqVVV(FullReg, ElementSize.Byte, vecTmp[1], vecTmp[1], vecTmp[2]);
                        // perform table lookup on lower bytes
                        performTableLookup(masm, vecMask0x0f, vecTableHi, vecTableLo, vecArray1, vecArray2, vecTmp[2], vecTmp[3]);
                        // eliminate all matches where the corresponding upper bytes are not zero
                        masm.neon.andVVV(FullReg, vecArray1, vecArray1, vecTmp[0]);
                        masm.neon.andVVV(FullReg, vecArray2, vecArray2, vecTmp[1]);
                        break;
                    default:
                        throw GraalError.shouldNotReachHereUnexpectedValue(stride); // ExcludeFromJacocoGeneratedReport
                }
                break;
        }
        masm.neon.orrVVV(FullReg, vecTmp[0], vecArray1, vecArray2);
        try (ScratchRegister sc = masm.getScratchRegister()) {
            Register tmp = sc.getRegister();
            /* If value != 0, then there was a match somewhere. */
            cbnzVector(masm, ElementSize.fromStride(getMatchResultStride()), vecTmp[0], vecTmp[0], tmp, variant != ArrayIndexOfVariant.Table, matchInChunk);
        }
    }

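    /** In the Table variant, each array element is reduced to a single match byte, so match results use stride S1. */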
    private Stride getMatchResultStride() {
        return variant == ArrayIndexOfVariant.Table ? Stride.S1 : stride;
    }

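    /**
     * The Table variant consumes 32 array elements (32 * stride bytes) per loop iteration via its
     * de-structuring LD2/LD4 loads; all other variants consume 32 bytes.
     */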
    private int getSIMDLoopChunkSize() {
        return variant == ArrayIndexOfVariant.Table ? 32 * stride.value : 32;
    }

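    /**
     * SIMD version of the nibble-based table lookup: TBL selects table bytes using the high and low
     * nibbles of each array byte as indices, and the two lookup results are ANDed; a non-zero byte
     * marks a match (the same bitset scheme as in the scalar Table code above).
     */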
    private static void performTableLookup(AArch64MacroAssembler masm,
                    Register vecMask0xf,
                    Register vecTableHi,
                    Register vecTableLo,
                    Register vecArray1,
                    Register vecArray2,
                    Register vecTmp1,
                    Register vecTmp2) {
        // split bytes into low and high nibbles (4-bit values)
        masm.neon.ushrVVI(FullReg, ElementSize.Byte, vecTmp1, vecArray1, 4);
        masm.neon.ushrVVI(FullReg, ElementSize.Byte, vecTmp2, vecArray2, 4);
        masm.neon.andVVV(FullReg, vecArray1, vecArray1, vecMask0xf);
        masm.neon.andVVV(FullReg, vecArray2, vecArray2, vecMask0xf);
        // perform table lookup using nibbles as indices
        masm.neon.tblVVV(FullReg, vecTmp1, vecTableHi, vecTmp1);
        masm.neon.tblVVV(FullReg, vecTmp2, vecTableHi, vecTmp2);
        masm.neon.tblVVV(FullReg, vecArray1, vecTableLo, vecArray1);
        masm.neon.tblVVV(FullReg, vecArray2, vecTableLo, vecArray2);
        // AND lookup results. If the result is non-zero, a match was found
        masm.neon.andVVV(FullReg, vecArray1, vecArray1, vecTmp1);
        masm.neon.andVVV(FullReg, vecArray2, vecArray2, vecTmp2);
    }

    @Override
    public void emitCode(CompilationResultBuilder crb, AArch64MacroAssembler masm) {
        Register result = asRegister(resultValue);
        Register arrayLength = asRegister(arrayLengthValue);
        Register fromIndex = asRegister(fromIndexValue);
        Register baseAddress = asRegister(temp[0]);
        Register searchLength = asRegister(temp[1]);

        Label done = new Label();

        /*
         * @formatter:off
         * The arguments satisfy the following constraints:
         *  1. 0 <= fromIndex <= arrayLength - 1 (or arrayLength - 2 when findTwoConsecutive is true)
         *  2. the number of search characters is 1 (or 2 when findTwoConsecutive is true)
         * @formatter:on
         */
        masm.mov(result, -1);   // default result: -1 (not found / empty search range)
        masm.subs(32, searchLength, arrayLength, fromIndex);
        if (findTwoConsecutive) {
            /*
             * Because we are looking for two consecutive elements, the effective search length is
             * one less.
             */
            masm.subs(32, searchLength, searchLength, 1);
        }
        masm.branchConditionally(ConditionFlag.LE, done);

        /* Load address of first array element */
        masm.add(64, baseAddress, asRegister(arrayPtrValue), asRegister(arrayOffsetValue));

        /*
         * Search element-by-element for small arrays (when fewer elements remain than one SIMD
         * loop chunk holds, e.g., 16 UTF-16 or 32 Latin1 elements for a 32-byte chunk); otherwise
         * search chunk-by-chunk.
         */
        Label searchByChunk = new Label();
        int chunkByteSize = getSIMDLoopChunkSize();
        masm.compare(32, searchLength, chunkByteSize / stride.value);
        masm.branchConditionally(ConditionFlag.GE, searchByChunk);

        /* Search sequentially for short arrays */
        emitScalarCode(masm, baseAddress, searchLength);
        masm.jmp(done);

        /* Search chunk-by-chunk for long arrays */
        masm.bind(searchByChunk);
        emitSIMDCode(masm, baseAddress);

        masm.bind(done);
    }
}