/*
 * Copyright (c) 2013, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
package jdk.graal.compiler.lir.amd64;

import static jdk.vm.ci.amd64.AMD64.r8;
import static jdk.vm.ci.amd64.AMD64.rax;
import static jdk.vm.ci.amd64.AMD64.rcx;
import static jdk.vm.ci.amd64.AMD64.rdi;
import static jdk.vm.ci.amd64.AMD64.rdx;
import static jdk.vm.ci.amd64.AMD64.rsi;
import static jdk.vm.ci.code.ValueUtil.asRegister;
import static jdk.vm.ci.code.ValueUtil.isIllegal;
import static jdk.graal.compiler.asm.amd64.AVXKind.AVXSize.XMM;
import static jdk.graal.compiler.asm.amd64.AVXKind.AVXSize.YMM;

import java.util.Arrays;
import java.util.EnumSet;

import jdk.graal.compiler.asm.Label;
import jdk.graal.compiler.asm.amd64.AMD64Address;
import jdk.graal.compiler.asm.amd64.AMD64Assembler.ConditionFlag;
import jdk.graal.compiler.asm.amd64.AMD64Assembler.VexRMIOp;
import jdk.graal.compiler.asm.amd64.AMD64Assembler.VexRVMOp;
import jdk.graal.compiler.asm.amd64.AMD64MacroAssembler;
import jdk.graal.compiler.asm.amd64.AVXKind.AVXSize;
import jdk.graal.compiler.core.common.LIRKind;
import jdk.graal.compiler.core.common.Stride;
import jdk.graal.compiler.core.common.StrideUtil;
import jdk.graal.compiler.debug.GraalError;
import jdk.graal.compiler.lir.LIRInstructionClass;
import jdk.graal.compiler.lir.Opcode;
import jdk.graal.compiler.lir.asm.CompilationResultBuilder;
import jdk.graal.compiler.lir.gen.LIRGeneratorTool;

import jdk.vm.ci.amd64.AMD64.CPUFeature;
import jdk.vm.ci.code.Register;
import jdk.vm.ci.code.RegisterValue;
import jdk.vm.ci.meta.JavaKind;
import jdk.vm.ci.meta.Value;

/**
 * Arraycopy operation for arbitrary source and destination arrays, with arbitrary byte offset, with
 * support for arbitrary compression and inflation of {@link Stride#S1 8 bit}, {@link Stride#S2 16
 * bit} or {@link Stride#S4 32 bit} array elements.
 * <p>
 * CAUTION: the compression implementation assumes that the upper bytes of {@code char}/{@code int}
 * values to be compressed are zero. If this assumption is broken, compression will yield incorrect
 * results!
 */
@Opcode("ARRAY_COPY_WITH_CONVERSIONS")
public final class AMD64ArrayCopyWithConversionsOp extends AMD64ComplexVectorOp {
    public static final LIRInstructionClass<AMD64ArrayCopyWithConversionsOp> TYPE = LIRInstructionClass.create(AMD64ArrayCopyWithConversionsOp.class);

    private enum Op {
        compressCharToByte,
        compressIntToChar,
        compressIntToByte,
        inflateByteToChar,
        inflateByteToInt,
        inflateCharToInt,
        copy
    }

    private static final Register REG_ARRAY_SRC = rsi;
    private static final Register REG_OFFSET_SRC = rax;
    private static final Register REG_ARRAY_DST = rdi;
    private static final Register REG_OFFSET_DST = rcx;
    private static final Register REG_LENGTH = rdx;
    private static final Register REG_STRIDE = r8;

    private final Stride strideSrcConst;
    private final Stride strideDstConst;
    private final AMD64MacroAssembler.ExtendMode extendMode;

    @Use({OperandFlag.REG}) private Value arraySrc;
    @Use({OperandFlag.REG}) private Value offsetSrc;
    @Use({OperandFlag.REG}) private Value arrayDst;
    @Use({OperandFlag.REG}) private Value offsetDst;
    @Use({OperandFlag.REG}) private Value length;
    @Use({OperandFlag.REG, OperandFlag.ILLEGAL}) private Value dynamicStrides;

    @Temp({OperandFlag.REG}) private Value arraySrcTmp;
    @Temp({OperandFlag.REG}) private Value offsetSrcTmp;
    @Temp({OperandFlag.REG}) private Value arrayDstTmp;
    @Temp({OperandFlag.REG}) private Value offsetDstTmp;
    @Temp({OperandFlag.REG}) private Value lengthTmp;
    @Temp({OperandFlag.REG, OperandFlag.ILLEGAL}) private Value dynamicStridesTmp;
    @Temp({OperandFlag.REG}) private Value[] vectorTemp;

    /**
     * Arraycopy operation for arbitrary source and destination arrays, with arbitrary byte offset,
     * with support for arbitrary compression and inflation of {@link JavaKind#Byte 8 bit},
     * {@link JavaKind#Char 16 bit} or {@link JavaKind#Int 32 bit} array elements.
     *
     * @param strideSrc source stride. May be {@link JavaKind#Byte 8 bit}, {@link JavaKind#Char 16
     *            bit} or {@link JavaKind#Int 32 bit}.
     * @param strideDst target stride. May be {@link JavaKind#Byte 8 bit}, {@link JavaKind#Char 16
     *            bit} or {@link JavaKind#Int 32 bit}.
     * @param arraySrc source array.
     * @param offsetSrc offset to be added to arraySrc, in bytes. Must include array base offset!
     * @param arrayDst destination array.
     * @param offsetDst offset to be added to arrayDst, in bytes. Must include array base offset!
     * @param length length of the region to copy, scaled to strideDst.
     * @param extendMode sign- or zero-extend array elements when inflating to a bigger stride.
     * @param dynamicStrides dynamic stride dispatch as described in {@link StrideUtil}.
     */
    private AMD64ArrayCopyWithConversionsOp(LIRGeneratorTool tool, Stride strideSrc, Stride strideDst, EnumSet<CPUFeature> runtimeCheckedCPUFeatures,
                    Value arraySrc, Value offsetSrc, Value arrayDst, Value offsetDst, Value length, Value dynamicStrides, AMD64MacroAssembler.ExtendMode extendMode) {
        super(TYPE, tool, runtimeCheckedCPUFeatures, YMM);
        this.extendMode = extendMode;

        GraalError.guarantee(supports(tool.target(), runtimeCheckedCPUFeatures, CPUFeature.SSE2), "needs at least SSE2 support");

        this.arraySrcTmp = this.arraySrc = arraySrc;
        this.offsetSrcTmp = this.offsetSrc = offsetSrc;
        this.arrayDstTmp = this.arrayDst = arrayDst;
        this.offsetDstTmp = this.offsetDst = offsetDst;
        this.lengthTmp = this.length = length;
        this.dynamicStridesTmp = this.dynamicStrides = dynamicStrides;

        if (StrideUtil.useConstantStrides(dynamicStrides)) {
            this.strideSrcConst = strideSrc;
            this.strideDstConst = strideDst;
            this.vectorTemp = new Value[getNumberOfRequiredVectorRegisters(getOp(strideDstConst, strideSrcConst))];
        } else {
            strideSrcConst = null;
            strideDstConst = null;
            this.vectorTemp = new Value[5];
        }
        for (int i = 0; i < vectorTemp.length; i++) {
            vectorTemp[i] = tool.newVariable(LIRKind.value(getVectorKind(JavaKind.Byte)));
        }
    }
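
    /*
     * The movParamsAndCreate factories below move all inputs into the fixed registers declared
     * above (rsi, rax, rdi, rcx, rdx and, for dynamic strides, r8) before constructing the op.
     */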
    public static AMD64ArrayCopyWithConversionsOp movParamsAndCreate(LIRGeneratorTool tool, Stride strideSrc, Stride strideDst, EnumSet<CPUFeature> runtimeCheckedCPUFeatures,
                    Value arraySrc, Value offsetSrc, Value arrayDst, Value offsetDst, Value length, AMD64MacroAssembler.ExtendMode extendMode) {
        return movParamsAndCreate(tool, strideSrc, strideDst, runtimeCheckedCPUFeatures, arraySrc, offsetSrc, arrayDst, offsetDst, length, Value.ILLEGAL, extendMode);
    }

    public static AMD64ArrayCopyWithConversionsOp movParamsAndCreate(LIRGeneratorTool tool, EnumSet<CPUFeature> runtimeCheckedCPUFeatures,
                    Value arraySrc, Value offsetSrc, Value arrayDst, Value offsetDst, Value length, Value stride, AMD64MacroAssembler.ExtendMode extendMode) {
        return movParamsAndCreate(tool, null, null, runtimeCheckedCPUFeatures, arraySrc, offsetSrc, arrayDst, offsetDst, length, stride, extendMode);
    }

    private static AMD64ArrayCopyWithConversionsOp movParamsAndCreate(LIRGeneratorTool tool, Stride strideSrc, Stride strideDst, EnumSet<CPUFeature> runtimeCheckedCPUFeatures,
                    Value arraySrc, Value offsetSrc, Value arrayDst, Value offsetDst, Value length, Value dynamicStrides, AMD64MacroAssembler.ExtendMode extendMode) {
        RegisterValue regArraySrc = REG_ARRAY_SRC.asValue(arraySrc.getValueKind());
        RegisterValue regOffsetSrc = REG_OFFSET_SRC.asValue(offsetSrc.getValueKind());
        RegisterValue regArrayDst = REG_ARRAY_DST.asValue(arrayDst.getValueKind());
        RegisterValue regOffsetDst = REG_OFFSET_DST.asValue(offsetDst.getValueKind());
        RegisterValue regLength = REG_LENGTH.asValue(length.getValueKind());
        final Value regDynamicStrides;

        tool.emitConvertNullToZero(regArraySrc, arraySrc);
        tool.emitMove(regOffsetSrc, offsetSrc);
        tool.emitConvertNullToZero(regArrayDst, arrayDst);
        tool.emitMove(regOffsetDst, offsetDst);
        tool.emitMove(regLength, length);
        if (isIllegal(dynamicStrides)) {
            regDynamicStrides = Value.ILLEGAL;
        } else {
            regDynamicStrides = REG_STRIDE.asValue(dynamicStrides.getValueKind());
            tool.emitMove((RegisterValue) regDynamicStrides, dynamicStrides);
        }

        return new AMD64ArrayCopyWithConversionsOp(tool, strideSrc, strideDst, runtimeCheckedCPUFeatures, regArraySrc, regOffsetSrc, regArrayDst, regOffsetDst, regLength, regDynamicStrides, extendMode);
    }
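
    /*
     * Maps a (destination stride, source stride) pair to the matching compress, inflate or copy
     * variant; unsupported combinations throw UnsupportedOperationException.
     */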
    private static Op getOp(Stride strideDst, Stride strideSrc) {
        if (strideDst.value == strideSrc.value) {
            return Op.copy;
        }
        if (strideDst.value < strideSrc.value) {
            switch (strideSrc) {
                case S2:
                    assert strideDst == Stride.S1 : strideDst;
                    return Op.compressCharToByte;
                case S4:
                    switch (strideDst) {
                        case S1:
                            return Op.compressIntToByte;
                        case S2:
                            return Op.compressIntToChar;
                        default:
                            throw new UnsupportedOperationException();
                    }
                default:
                    throw new UnsupportedOperationException();
            }
        }
        switch (strideSrc) {
            case S1:
                switch (strideDst) {
                    case S2:
                        return Op.inflateByteToChar;
                    case S4:
                        return Op.inflateByteToInt;
                    default:
                        throw new UnsupportedOperationException();
                }
            case S2:
                assert strideDst == Stride.S4 : strideDst;
                return Op.inflateCharToInt;
            default:
                throw new UnsupportedOperationException();
        }
    }

    private static int getNumberOfRequiredVectorRegisters(Op op) {
        switch (op) {
            case compressCharToByte:
            case compressIntToChar:
                return 2;
            case compressIntToByte:
                return 5;
            default:
                return 1;
        }
    }
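
    /*
     * With constant strides, emitCode emits the single matching variant directly. With dynamic
     * strides, it emits a jump table over all nine stride combinations and dispatches on the
     * dynamicStrides value (see StrideUtil.getDirectStubCallIndex).
     */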
    @Override
    public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler asm) {
        Register src = asRegister(arraySrc);
        Register sro = asRegister(offsetSrc);
        Register dst = asRegister(arrayDst);
        Register dso = asRegister(offsetDst);
        Register len = asRegister(length);

        asm.leaq(src, new AMD64Address(src, sro, Stride.S1));
        asm.leaq(dst, new AMD64Address(dst, dso, Stride.S1));

        if (isIllegal(dynamicStrides)) {
            emitOp(crb, asm, this.strideSrcConst, this.strideDstConst, src, sro, dst, len);
        } else {
            Label[] variants = new Label[9];
            Label end = new Label();
            for (int i = 0; i < variants.length; i++) {
                variants[i] = new Label();
            }
            AMD64ControlFlow.RangeTableSwitchOp.emitJumpTable(crb, asm, dso, asRegister(dynamicStrides), 0, 8, Arrays.stream(variants));

            // use the 1-byte-1-byte stride variant for the 2-2 and 4-4 cases by simply shifting
            // the length
            asm.align(preferredBranchTargetAlignment(crb));
            asm.bind(variants[StrideUtil.getDirectStubCallIndex(Stride.S4, Stride.S4)]);
            asm.maybeEmitIndirectTargetMarker();
            asm.shll(len, 1);
            asm.align(preferredBranchTargetAlignment(crb));
            asm.bind(variants[StrideUtil.getDirectStubCallIndex(Stride.S2, Stride.S2)]);
            asm.maybeEmitIndirectTargetMarker();
            asm.shll(len, 1);
            asm.align(preferredBranchTargetAlignment(crb));
            asm.bind(variants[StrideUtil.getDirectStubCallIndex(Stride.S1, Stride.S1)]);
            asm.maybeEmitIndirectTargetMarker();
            emitOp(crb, asm, Stride.S1, Stride.S1, src, sro, dst, len);
            asm.jmp(end);

            for (Stride strideSrc : new Stride[]{Stride.S1, Stride.S2, Stride.S4}) {
                for (Stride strideDst : new Stride[]{Stride.S1, Stride.S2, Stride.S4}) {
                    if (strideSrc == strideDst) {
                        continue;
                    }
                    asm.align(preferredBranchTargetAlignment(crb));
                    asm.bind(variants[StrideUtil.getDirectStubCallIndex(strideSrc, strideDst)]);
                    asm.maybeEmitIndirectTargetMarker();
                    emitOp(crb, asm, strideSrc, strideDst, src, sro, dst, len);
                    asm.jmp(end);
                }
            }
            asm.bind(end);
        }
    }

    private void emitOp(CompilationResultBuilder crb, AMD64MacroAssembler asm, Stride strideSrc, Stride strideDst, Register src, Register sro, Register dst, Register len) {
        if (strideSrc.value < strideDst.value) {
            emitInflate(crb, asm, strideSrc, strideDst, src, dst, len, sro);
        } else if (strideSrc.value == strideDst.value) {
            emitCopy(crb, asm, strideSrc, strideDst, src, dst, len, sro);
        } else {
            emitCompress(crb, asm, strideSrc, strideDst, src, dst, len, sro);
        }
    }

    /**
     * Compress array {@code src} into array {@code dst} using {@code PACKUSWB}/{@code PACKUSDW}.
     * CAUTION: This operation assumes that the upper bytes (which are removed by the compression)
     * of {@code src} are zero. Breaking this assumption will lead to incorrect results.
     */
    private void emitCompress(CompilationResultBuilder crb, AMD64MacroAssembler asm, Stride strideSrc, Stride strideDst, Register src, Register dst, Register len, Register tmp) {
        Op op = getOp(strideDst, strideSrc);
        Label labelScalarLoop = new Label();
        Label labelDone = new Label();

        if (asm.supports(CPUFeature.SSE4_2)) {
            Label labelPackVectorLoop = new Label();
            Label labelPack16Bytes = new Label();
            Label labelPack8Bytes = new Label();
            Label labelCopyTail = new Label();

            int vectorLength = vectorSize.getBytes() / strideDst.value;

            asm.movl(tmp, len);
            // tmp = tail count (in strideDst)
            asm.andl(tmp, vectorLength - 1);
            // len = vector count (in strideDst)
            asm.andlAndJcc(len, -vectorLength, ConditionFlag.Zero, supportsAVX2AndYMM() ? labelPack16Bytes : labelPack8Bytes, false);

            if (supportsAVX2AndYMM() && op == Op.compressIntToByte) {
                loadMask(crb, asm, asRegister(vectorTemp[4]), getAVX2IntToBytePackingUnscrambleMap());
            }

            asm.leaq(src, new AMD64Address(src, len, strideSrc));
            asm.leaq(dst, new AMD64Address(dst, len, strideDst));
            asm.negq(len);

            // vectorized loop
            asm.align(preferredLoopAlignment(crb));
            asm.bind(labelPackVectorLoop);
            packVector(asm, vectorSize, op, strideSrc, strideDst, src, dst, len, 0, false);
            asm.addqAndJcc(len, vectorLength, ConditionFlag.NotZero, labelPackVectorLoop, true);

            // vectorized tail
            packVector(asm, vectorSize, op, strideSrc, strideDst, src, dst, tmp, -vectorSize.getBytes(), false);
            asm.jmp(labelDone);

            if (supportsAVX2AndYMM()) {
                asm.bind(labelPack16Bytes);
                int vectorSizeXMM = XMM.getBytes() / strideDst.value;
                asm.cmplAndJcc(tmp, vectorSizeXMM, ConditionFlag.Less, labelPack8Bytes, true);
                packVector(asm, XMM, op, strideSrc, strideDst, src, dst, len, 0, true);
                packVector(asm, XMM, op, strideSrc, strideDst, src, dst, tmp, -XMM.getBytes(), false);
                asm.jmpb(labelDone);
            }

            asm.bind(labelPack8Bytes);
            int vectorSizeQ = 8 / strideDst.value;
            asm.cmplAndJcc(tmp, vectorSizeQ, ConditionFlag.Less, labelCopyTail, true);
            // array is too small for vectorized loop, use half vectors
            // pack aligned to beginning
            pack8Bytes(asm, op, strideSrc, strideDst, src, dst, len, 0, true);
            // pack aligned to end
            pack8Bytes(asm, op, strideSrc, strideDst, src, dst, tmp, -8, false);
            asm.jmpb(labelDone);

            asm.bind(labelCopyTail);
            asm.movl(len, tmp);
        }
        asm.testlAndJcc(len, len, ConditionFlag.Zero, labelDone, true);

        asm.leaq(src, new AMD64Address(src, len, strideSrc));
        asm.leaq(dst, new AMD64Address(dst, len, strideDst));
        asm.negq(len);

        // scalar loop
        asm.bind(labelScalarLoop);
        switch (op) {
            case compressCharToByte:
                asm.movzwl(tmp, new AMD64Address(src, len, strideSrc));
                asm.movb(new AMD64Address(dst, len, strideDst), tmp);
                break;
            case compressIntToChar:
                asm.movl(tmp, new AMD64Address(src, len, strideSrc));
                asm.movw(new AMD64Address(dst, len, strideDst), tmp);
                break;
            case compressIntToByte:
                asm.movl(tmp, new AMD64Address(src, len, strideSrc));
                asm.movb(new AMD64Address(dst, len, strideDst), tmp);
                break;
        }
        asm.incqAndJcc(len, ConditionFlag.NotZero, labelScalarLoop, true);
        asm.bind(labelDone);
    }
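
    /*
     * packVector loads two source vectors (four for int-to-byte) and packs them into a single
     * destination vector with PACKUSWB/PACKUSDW; on YMM, a VPERMD/VPERMQ shuffle restores the
     * lane order afterwards.
     */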
    private void packVector(AMD64MacroAssembler asm, AVXSize vecSize, Op op, Stride strideSrc, Stride strideDst, Register src, Register dst, Register index, int displacement, boolean direct) {
        int displacementSrc = displacement << strideSrc.log2 - strideDst.log2;
        Register vec1 = asRegister(vectorTemp[0]);
        Register vec2 = asRegister(vectorTemp[1]);
        asm.movdqu(vecSize, vec1, indexAddressOrDirect(strideSrc, src, index, displacementSrc, 0, direct));
        asm.movdqu(vecSize, vec2, indexAddressOrDirect(strideSrc, src, index, displacementSrc, vecSize.getBytes(), direct));
        switch (op) {
            case compressCharToByte:
                asm.packuswb(vecSize, vec1, vec2);
                break;
            case compressIntToChar:
                asm.packusdw(vecSize, vec1, vec2);
                break;
            case compressIntToByte:
                Register vec3 = asRegister(vectorTemp[2]);
                Register vec4 = asRegister(vectorTemp[3]);
                asm.movdqu(vecSize, vec3, indexAddressOrDirect(strideSrc, src, index, displacementSrc, vecSize.getBytes() * 2, direct));
                asm.movdqu(vecSize, vec4, indexAddressOrDirect(strideSrc, src, index, displacementSrc, vecSize.getBytes() * 3, direct));
                asm.packusdw(vecSize, vec1, vec2);
                asm.packusdw(vecSize, vec3, vec4);
                asm.packuswb(vecSize, vec1, vec3);
                break;
        }
        if (vecSize == YMM) {
            if (op == Op.compressIntToByte) {
                VexRVMOp.VPERMD.emit(asm, vecSize, vec1, asRegister(vectorTemp[4]), vec1);
            } else {
                VexRMIOp.VPERMQ.emit(asm, vecSize, vec1, vec1, 0b11011000);
            }
        }
        asm.movdqu(vecSize, new AMD64Address(dst, index, strideDst, displacement), vec1);
    }

    private void pack8Bytes(AMD64MacroAssembler masm, Op op, Stride strideSrc, Stride strideDst, Register src, Register dst, Register index, int displacement, boolean direct) {
        Register vec1 = asRegister(vectorTemp[0]);
        Register vec2 = asRegister(vectorTemp[1]);
        int displacementSrc = displacement << strideSrc.log2 - strideDst.log2;
        masm.movdqu(vec1, indexAddressOrDirect(strideSrc, src, index, displacementSrc, 0, direct));
        switch (op) {
            case compressCharToByte:
                masm.pxor(vec2, vec2);
                masm.packuswb(vec1, vec2);
                break;
            case compressIntToChar:
                masm.pxor(vec2, vec2);
                masm.packusdw(vec1, vec2);
                break;
            case compressIntToByte:
                masm.movdqu(vec2, indexAddressOrDirect(strideSrc, src, index, displacementSrc, 16, direct));
                masm.packusdw(vec1, vec2);
                masm.packuswb(vec1, vec2);
                break;
        }
        masm.movq(new AMD64Address(dst, index, strideDst, displacement), vec1);
    }

    private static AMD64Address indexAddressOrDirect(Stride strideSrc, Register array, Register index, int baseOffset, int displacement, boolean direct) {
        return direct ? new AMD64Address(array, displacement) : new AMD64Address(array, index, strideSrc, baseOffset + displacement);
    }
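
    /*
     * emitInflate widens each element via AMD64MacroAssembler.pmovSZx (sign- or zero-extension,
     * selected by extendMode), falling back to a scalar loop for short arrays or when SSE4.2 is
     * not available.
     */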
    private void emitInflate(CompilationResultBuilder crb, AMD64MacroAssembler asm, Stride strideSrc, Stride strideDst, Register src, Register dst, Register len, Register tmp) {
        Op op = getOp(strideDst, strideSrc);
        Register vec = asRegister(vectorTemp[0]);

        Label labelScalarLoop = new Label();
        Label labelDone = new Label();

        asm.movl(tmp, len);

        if (asm.supports(CPUFeature.SSE4_2)) {
            Label labelMainLoop = new Label();
            Label labelSkipXMMHalf = new Label();
            Label labelXMMTail = new Label();
            Label labelTail = new Label();

            int vectorLength = vectorSize.getBytes() / strideDst.value;
            int vectorLengthXMM = XMM.getBytes() / strideDst.value;
            int scaleDelta = strideDst.log2 - strideSrc.log2;

            asm.andl(tmp, vectorLength - 1);
            asm.andlAndJcc(len, -vectorLength, ConditionFlag.Zero, supportsAVX2AndYMM() ? labelXMMTail : labelTail, true);

            asm.leaq(src, new AMD64Address(src, len, strideSrc));
            asm.leaq(dst, new AMD64Address(dst, len, strideDst));
            asm.negq(len);

            // vectorized loop
            asm.align(preferredLoopAlignment(crb));
            asm.bind(labelMainLoop);
            asm.pmovSZx(vectorSize, extendMode, vec, strideDst, new AMD64Address(src, len, strideSrc), strideSrc);
            asm.movdqu(vectorSize, new AMD64Address(dst, len, strideDst), vec);
            asm.addqAndJcc(len, vectorLength, ConditionFlag.NotZero, labelMainLoop, true);

            // vectorized tail
            asm.pmovSZx(vectorSize, extendMode, vec, strideDst, new AMD64Address(src, tmp, strideSrc, -vectorSize.getBytes() >> scaleDelta), strideSrc);
            asm.movdqu(vectorSize, new AMD64Address(dst, tmp, strideDst, -vectorSize.getBytes()), vec);
            asm.jmpb(labelDone);

            if (supportsAVX2AndYMM()) {
                asm.bind(labelXMMTail);
                asm.cmplAndJcc(tmp, vectorLengthXMM, ConditionFlag.Less, labelTail, true);

                // half vector size
                asm.pmovSZx(XMM, extendMode, vec, strideDst, new AMD64Address(src), strideSrc);
                asm.movdqu(new AMD64Address(dst), vec);

                // half vector size tail
                asm.pmovSZx(XMM, extendMode, vec, strideDst, new AMD64Address(src, tmp, strideSrc, -16 >> scaleDelta), strideSrc);
                asm.movdqu(new AMD64Address(dst, tmp, strideDst, -16), vec);
                asm.jmpb(labelDone);
            }

            asm.bind(labelTail);
            asm.movl(len, tmp);

            // xmm half vector size
            if (op != Op.inflateByteToInt) {
                assert scaleDelta == 1 : scaleDelta;
                asm.cmplAndJcc(len, 4 >> strideSrc.log2, ConditionFlag.Less, labelSkipXMMHalf, true);
                asm.movdl(vec, new AMD64Address(src));
                asm.pmovSZx(XMM, extendMode, vec, strideDst, vec, strideSrc);
                asm.movq(new AMD64Address(dst), vec);
                asm.movdl(vec, new AMD64Address(src, len, strideSrc, -4));
                asm.pmovSZx(XMM, extendMode, vec, strideDst, vec, strideSrc);
                asm.movq(new AMD64Address(dst, len, strideDst, -8), vec);
                asm.jmpb(labelDone);
            }
            asm.bind(labelSkipXMMHalf);
        }
        asm.testlAndJcc(len, len, ConditionFlag.Zero, labelDone, true);

        asm.leaq(src, new AMD64Address(src, len, strideSrc));
        asm.leaq(dst, new AMD64Address(dst, len, strideDst));
        asm.negq(len);

        // scalar loop
        asm.bind(labelScalarLoop);
        switch (op) {
            case inflateByteToChar:
                asm.movzbl(tmp, new AMD64Address(src, len, strideSrc));
                asm.movw(new AMD64Address(dst, len, strideDst), tmp);
                break;
            case inflateByteToInt:
                asm.movzbl(tmp, new AMD64Address(src, len, strideSrc));
                asm.movl(new AMD64Address(dst, len, strideDst), tmp);
                break;
            case inflateCharToInt:
                asm.movzwl(tmp, new AMD64Address(src, len, strideSrc));
                asm.movl(new AMD64Address(dst, len, strideDst), tmp);
                break;
        }
        asm.incqAndJcc(len, ConditionFlag.NotZero, labelScalarLoop, true);
        asm.bind(labelDone);
    }
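
    /*
     * emitCopy handles the equal-stride case: a YMM or XMM main loop followed by overlapping tail
     * copies that step down through QWORD, DWORD, WORD and BYTE sizes as needed.
     */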
    private void emitCopy(CompilationResultBuilder crb, AMD64MacroAssembler masm, Stride strideSrc, Stride strideDst, Register src, Register dst, Register len, Register tmp) {
        Register vec = asRegister(vectorTemp[0]);

        Label labelTailXMM = new Label();
        Label labelTailQWORD = new Label();
        Label labelTailDWORD = new Label();
        Label labelTailWORD = new Label();
        Label labelTailBYTE = new Label();
        Label labelDone = new Label();

        int vectorLength = vectorSize.getBytes() / strideDst.value;
        masm.movl(tmp, len);
        masm.andl(tmp, vectorLength - 1);
        masm.andlAndJcc(len, -vectorLength, ConditionFlag.Zero, supportsAVX2AndYMM() ? labelTailXMM : labelTailQWORD, true);

        masm.leaq(src, new AMD64Address(src, len, strideSrc));
        masm.leaq(dst, new AMD64Address(dst, len, strideDst));
        masm.negq(len);

        Label labelYMMLoop = new Label();
        Label labelXMMLoop = new Label();
        if (supportsAVX2AndYMM()) {
            // vectorized loop
            masm.align(preferredLoopAlignment(crb));
            masm.bind(labelYMMLoop);
            masm.vmovdqu(vec, new AMD64Address(src, len, strideSrc));
            masm.vmovdqu(new AMD64Address(dst, len, strideDst), vec);
            masm.addqAndJcc(len, vectorLength, ConditionFlag.NotZero, labelYMMLoop, true);

            // vectorized tail
            masm.vmovdqu(vec, new AMD64Address(src, tmp, strideSrc, -32));
            masm.vmovdqu(new AMD64Address(dst, tmp, strideDst, -32), vec);
            masm.jmpb(labelDone);

            // half vector size
            masm.bind(labelTailXMM);
            masm.cmplAndJcc(tmp, 16 / strideDst.value, ConditionFlag.Less, labelTailQWORD, true);
            masm.movdqu(vec, new AMD64Address(src));
            masm.movdqu(new AMD64Address(dst), vec);

            // half vector size tail
            masm.movdqu(vec, new AMD64Address(src, tmp, strideSrc, -16));
            masm.movdqu(new AMD64Address(dst, tmp, strideDst, -16), vec);
            masm.jmpb(labelDone);
        } else {
            // xmm vectorized loop
            masm.align(preferredLoopAlignment(crb));
            masm.bind(labelXMMLoop);
            masm.movdqu(vec, new AMD64Address(src, len, strideSrc));
            masm.movdqu(new AMD64Address(dst, len, strideDst), vec);
            masm.addqAndJcc(len, vectorLength, ConditionFlag.NotZero, labelXMMLoop, true);

            // xmm vectorized tail
            masm.movdqu(vec, new AMD64Address(src, tmp, strideSrc, -16));
            masm.movdqu(new AMD64Address(dst, tmp, strideDst, -16), vec);
            masm.jmpb(labelDone);
        }

        /*
         * Tails for less than XMM size
         *
         * Since the initial vector length check (len & -vectorLength) sets len to zero, and tmp
         * was set to the tail length (tmp & (vectorLength - 1), equal to array length if this
         * code path is hit), from this point on, tmp holds the array length, and len is used as
         * the temporary register for copying data.
         */
        masm.bind(labelTailQWORD);
        masm.cmplAndJcc(tmp, 8 / strideDst.value, ConditionFlag.Less, labelTailDWORD, true);
        masm.movq(len, new AMD64Address(src));
        masm.movq(new AMD64Address(dst), len);
        masm.movq(len, new AMD64Address(src, tmp, strideSrc, -8));
        masm.movq(new AMD64Address(dst, tmp, strideDst, -8), len);
        masm.jmpb(labelDone);

        masm.bind(labelTailDWORD);
        if (strideDst.value < 4) {
            masm.cmplAndJcc(tmp, 4 / strideDst.value, ConditionFlag.Less, labelTailWORD, true);
        } else {
            masm.testlAndJcc(tmp, tmp, ConditionFlag.Zero, labelDone, true);
        }
        masm.movl(len, new AMD64Address(src));
        masm.movl(new AMD64Address(dst), len);
        masm.movl(len, new AMD64Address(src, tmp, strideSrc, -4));
        masm.movl(new AMD64Address(dst, tmp, strideDst, -4), len);
        if (strideDst.value < 4) {
            masm.jmpb(labelDone);

            masm.bind(labelTailWORD);
            if (strideDst.value < 2) {
                masm.cmplAndJcc(tmp, 2 / strideDst.value, ConditionFlag.Less, labelTailBYTE, true);
            } else {
                masm.testlAndJcc(tmp, tmp, ConditionFlag.Zero, labelDone, true);
            }
            masm.movw(len, new AMD64Address(src));
            masm.movw(new AMD64Address(dst), len);
            masm.movw(len, new AMD64Address(src, tmp, strideSrc, -2));
            masm.movw(new AMD64Address(dst, tmp, strideDst, -2), len);
        }
        if (strideDst.value < 2) {
            masm.jmpb(labelDone);

            masm.bind(labelTailBYTE);
            masm.testlAndJcc(tmp, tmp, ConditionFlag.Zero, labelDone, true);
            masm.movb(len, new AMD64Address(src));
            masm.movb(new AMD64Address(dst), len);
        }
        masm.bind(labelDone);
    }
}