cuda_kernels.matrixVectorMultiplicationKernel.ptx
Educational library for machine learning challenges
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-31833905
// Cuda compilation tools, release 11.8, V11.8.89
// Based on NVVM 7.0.1
//
.version 7.8
.target sm_52
.address_size 64
// .globl matrixVectorMultiplicationKernel
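// Kernel: one thread per output row. Each thread computes the dot product of
// one row of a row-major double-precision matrix with the input vector.
// Parameters (names inferred from the access pattern below): param_0 = matrix,
// param_1 = input vector, param_2 = result vector, param_3 = number of rows,
// param_4 = number of columns.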
.visible .entry matrixVectorMultiplicationKernel(
.param .u64 matrixVectorMultiplicationKernel_param_0,
.param .u64 matrixVectorMultiplicationKernel_param_1,
.param .u64 matrixVectorMultiplicationKernel_param_2,
.param .u32 matrixVectorMultiplicationKernel_param_3,
.param .u32 matrixVectorMultiplicationKernel_param_4
)
{
.reg .pred %p<7>;
.reg .b32 %r<25>;
.reg .f64 %fd<30>;
.reg .b64 %rd<28>;
ld.param.u64 %rd15, [matrixVectorMultiplicationKernel_param_0];
ld.param.u64 %rd16, [matrixVectorMultiplicationKernel_param_1];
ld.param.u64 %rd14, [matrixVectorMultiplicationKernel_param_2];
ld.param.u32 %r12, [matrixVectorMultiplicationKernel_param_3];
ld.param.u32 %r11, [matrixVectorMultiplicationKernel_param_4];
cvta.to.global.u64 %rd1, %rd16;
cvta.to.global.u64 %rd2, %rd15;
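// Global row index = blockIdx.x * blockDim.x + threadIdx.x.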
mov.u32 %r13, %ntid.x;
mov.u32 %r14, %ctaid.x;
mov.u32 %r15, %tid.x;
mad.lo.s32 %r1, %r14, %r13, %r15;
setp.ge.s32 %p1, %r1, %r12;
@%p1 bra $L__BB0_9;
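// Threads with row >= numRows exit; if numColumns < 1 the accumulator stays
// 0.0 and control branches straight to the final store.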
setp.lt.s32 %p2, %r11, 1;
mov.f64 %fd29, 0d0000000000000000;
@%p2 bra $L__BB0_8;
add.s32 %r17, %r11, -1;
and.b32 %r24, %r11, 3;
setp.lt.u32 %p3, %r17, 3;
mov.f64 %fd29, 0d0000000000000000;
mov.u32 %r23, 0;
@%p3 bra $L__BB0_5;
sub.s32 %r22, %r11, %r24;
mul.lo.s32 %r19, %r11, %r1;
mul.wide.s32 %rd17, %r19, 8;
add.s64 %rd18, %rd2, %rd17;
add.s64 %rd25, %rd18, 16;
mov.u64 %rd24, %rd1;
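// Main loop, unrolled by the compiler to four columns per iteration:
// sum = fma(matrix[row][col], vector[col], sum).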
$L__BB0_4:
ld.global.f64 %fd12, [%rd24];
ld.global.f64 %fd13, [%rd25+-16];
fma.rn.f64 %fd14, %fd13, %fd12, %fd29;
ld.global.f64 %fd15, [%rd24+8];
ld.global.f64 %fd16, [%rd25+-8];
fma.rn.f64 %fd17, %fd16, %fd15, %fd14;
ld.global.f64 %fd18, [%rd24+16];
ld.global.f64 %fd19, [%rd25];
fma.rn.f64 %fd20, %fd19, %fd18, %fd17;
ld.global.f64 %fd21, [%rd24+24];
ld.global.f64 %fd22, [%rd25+8];
fma.rn.f64 %fd29, %fd22, %fd21, %fd20;
add.s32 %r23, %r23, 4;
add.s64 %rd25, %rd25, 32;
add.s64 %rd24, %rd24, 32;
add.s32 %r22, %r22, -4;
setp.ne.s32 %p4, %r22, 0;
@%p4 bra $L__BB0_4;
$L__BB0_5:
setp.eq.s32 %p5, %r24, 0;
@%p5 bra $L__BB0_8;
mul.wide.s32 %rd19, %r23, 8;
add.s64 %rd27, %rd1, %rd19;
mad.lo.s32 %r20, %r11, %r1, %r23;
mul.wide.s32 %rd20, %r20, 8;
add.s64 %rd26, %rd2, %rd20;
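// Remainder loop for the last numColumns % 4 columns.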
$L__BB0_7:
.pragma "nounroll";
ld.global.f64 %fd23, [%rd27];
ld.global.f64 %fd24, [%rd26];
fma.rn.f64 %fd29, %fd24, %fd23, %fd29;
add.s64 %rd27, %rd27, 8;
add.s64 %rd26, %rd26, 8;
add.s32 %r24, %r24, -1;
setp.ne.s32 %p6, %r24, 0;
@%p6 bra $L__BB0_7;
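// Write the accumulated dot product to result[row].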
$L__BB0_8:
cvta.to.global.u64 %rd21, %rd14;
mul.wide.s32 %rd22, %r1, 8;
add.s64 %rd23, %rd21, %rd22;
st.global.f64 [%rd23], %fd29;
$L__BB0_9:
ret;
}
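For reference, below is a plausible CUDA C++ source that compiles to PTX of this shape. The parameter names (matrix, vector, result, numRows, numColumns) are assumptions inferred from the access pattern; the four-way unrolling and the remainder loop visible above are introduced by the compiler, not written by hand.

// Hypothetical reconstruction of the kernel source; only the access pattern,
// types, and bounds checks are taken from the PTX above.
extern "C" __global__ void matrixVectorMultiplicationKernel(
        const double *matrix,   // param_0: numRows x numColumns, row-major
        const double *vector,   // param_1: length numColumns
        double *result,         // param_2: length numRows
        int numRows,            // param_3
        int numColumns)         // param_4
{
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < numRows) {
        double sum = 0.0;
        for (int col = 0; col < numColumns; col++) {
            sum += matrix[row * numColumns + col] * vector[col];
        }
        result[row] = sum;
    }
}

A one-dimensional launch with gridDim.x * blockDim.x >= numRows covers every output row; the guard on row makes over-provisioned grids safe, which corresponds to the early exit to $L__BB0_9 in the PTX.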