
// cuda_kernels.matrixVectorMultiplicationKernel.ptx (Maven / Gradle / Ivy artifact listing)
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-31833905
// Cuda compilation tools, release 11.8, V11.8.89
// Based on NVVM 7.0.1
//
.version 7.8
.target sm_52
.address_size 64
// .globl matrixVectorMultiplicationKernel
//
// matrixVectorMultiplicationKernel — dense double-precision matrix-vector
// product, one thread per output row:
//     y[row] = sum_{j=0..cols-1} A[row*cols + j] * x[j]
//
// Parameters (roles derived from the index arithmetic below):
//   param_0 : A — matrix, f64, addressed as A[row*cols + j] (row-major, rows x cols)
//   param_1 : x — input vector, f64, length cols
//   param_2 : y — output vector, f64, length rows
//   param_3 : rows (s32)
//   param_4 : cols (s32)
//
// The dot product is unrolled 4x in the main loop ($L__BB0_4); the trailing
// cols % 4 elements are handled by a scalar remainder loop ($L__BB0_7).
// Accumulation uses fused multiply-add (fma.rn.f64, round-to-nearest-even).
//
.visible .entry matrixVectorMultiplicationKernel(
.param .u64 matrixVectorMultiplicationKernel_param_0,
.param .u64 matrixVectorMultiplicationKernel_param_1,
.param .u64 matrixVectorMultiplicationKernel_param_2,
.param .u32 matrixVectorMultiplicationKernel_param_3,
.param .u32 matrixVectorMultiplicationKernel_param_4
)
{
.reg .pred %p<7>;
.reg .b32 %r<25>;
.reg .f64 %fd<30>;
.reg .b64 %rd<28>;
// Load kernel parameters.
ld.param.u64 %rd15, [matrixVectorMultiplicationKernel_param_0]; // %rd15 = A
ld.param.u64 %rd16, [matrixVectorMultiplicationKernel_param_1]; // %rd16 = x
ld.param.u64 %rd14, [matrixVectorMultiplicationKernel_param_2]; // %rd14 = y (output)
ld.param.u32 %r12, [matrixVectorMultiplicationKernel_param_3]; // %r12 = rows
ld.param.u32 %r11, [matrixVectorMultiplicationKernel_param_4]; // %r11 = cols
// Convert generic pointers to global-space addresses.
cvta.to.global.u64 %rd1, %rd16; // %rd1 = x (global)
cvta.to.global.u64 %rd2, %rd15; // %rd2 = A (global)
// row = blockIdx.x * blockDim.x + threadIdx.x
mov.u32 %r13, %ntid.x;
mov.u32 %r14, %ctaid.x;
mov.u32 %r15, %tid.x;
mad.lo.s32 %r1, %r14, %r13, %r15; // %r1 = row
setp.ge.s32 %p1, %r1, %r12; // row >= rows: thread has no work
@%p1 bra $L__BB0_9;
setp.lt.s32 %p2, %r11, 1; // cols < 1: dot product is empty
mov.f64 %fd29, 0d0000000000000000; // %fd29 = running sum, init 0.0
@%p2 bra $L__BB0_8; // store 0.0 and return
add.s32 %r17, %r11, -1; // cols - 1
and.b32 %r24, %r11, 3; // %r24 = cols % 4 (remainder element count)
setp.lt.u32 %p3, %r17, 3; // cols < 4: skip the 4x-unrolled loop entirely
mov.f64 %fd29, 0d0000000000000000;
mov.u32 %r23, 0; // %r23 = j (elements consumed so far)
@%p3 bra $L__BB0_5;
// Set up cursors for the 4x-unrolled main loop.
sub.s32 %r22, %r11, %r24; // %r22 = cols - cols%4 (unrolled trip count, multiple of 4)
mul.lo.s32 %r19, %r11, %r1; // row * cols
mul.wide.s32 %rd17, %r19, 8; // byte offset of row start (8 bytes per f64)
add.s64 %rd18, %rd2, %rd17; // &A[row*cols]
add.s64 %rd25, %rd18, 16; // matrix cursor, biased +16 B (hence -16/-8 offsets below)
mov.u64 %rd24, %rd1; // vector cursor = &x[0]
$L__BB0_4:
// Unrolled body: sum += A[row][j..j+3] * x[j..j+3]
ld.global.f64 %fd12, [%rd24]; // x[j]
ld.global.f64 %fd13, [%rd25+-16]; // A[row][j]
fma.rn.f64 %fd14, %fd13, %fd12, %fd29;
ld.global.f64 %fd15, [%rd24+8]; // x[j+1]
ld.global.f64 %fd16, [%rd25+-8]; // A[row][j+1]
fma.rn.f64 %fd17, %fd16, %fd15, %fd14;
ld.global.f64 %fd18, [%rd24+16]; // x[j+2]
ld.global.f64 %fd19, [%rd25]; // A[row][j+2]
fma.rn.f64 %fd20, %fd19, %fd18, %fd17;
ld.global.f64 %fd21, [%rd24+24]; // x[j+3]
ld.global.f64 %fd22, [%rd25+8]; // A[row][j+3]
fma.rn.f64 %fd29, %fd22, %fd21, %fd20;
add.s32 %r23, %r23, 4; // j += 4
add.s64 %rd25, %rd25, 32; // advance matrix cursor by 4 elements
add.s64 %rd24, %rd24, 32; // advance vector cursor by 4 elements
add.s32 %r22, %r22, -4; // unrolled iterations remaining
setp.ne.s32 %p4, %r22, 0;
@%p4 bra $L__BB0_4;
$L__BB0_5:
// Remainder loop: the final cols % 4 elements (if any).
setp.eq.s32 %p5, %r24, 0; // no remainder: go straight to the store
@%p5 bra $L__BB0_8;
mul.wide.s32 %rd19, %r23, 8;
add.s64 %rd27, %rd1, %rd19; // &x[j]
mad.lo.s32 %r20, %r11, %r1, %r23; // row*cols + j
mul.wide.s32 %rd20, %r20, 8;
add.s64 %rd26, %rd2, %rd20; // &A[row*cols + j]
$L__BB0_7:
.pragma "nounroll";
ld.global.f64 %fd23, [%rd27]; // x[j]
ld.global.f64 %fd24, [%rd26]; // A[row][j]
fma.rn.f64 %fd29, %fd24, %fd23, %fd29; // sum += A[row][j] * x[j]
add.s64 %rd27, %rd27, 8; // next vector element
add.s64 %rd26, %rd26, 8; // next matrix element
add.s32 %r24, %r24, -1; // remainder count--
setp.ne.s32 %p6, %r24, 0;
@%p6 bra $L__BB0_7;
$L__BB0_8:
// y[row] = sum
cvta.to.global.u64 %rd21, %rd14; // y (global)
mul.wide.s32 %rd22, %r1, 8; // byte offset of y[row]
add.s64 %rd23, %rd21, %rd22; // &y[row]
st.global.f64 [%rd23], %fd29;
$L__BB0_9:
ret;
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy