cuda_kernels.matrixMultiplicationKernel.ptx Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of lib Show documentation
Show all versions of lib Show documentation
Educational library for machine learning challenges
The newest version!
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-31833905
// Cuda compilation tools, release 11.8, V11.8.89
// Based on NVVM 7.0.1
//
.version 7.8
.target sm_52
.address_size 64
// .globl matrixMultiply
.visible .entry matrixMultiply(
.param .u64 matrixMultiply_param_0,
.param .u64 matrixMultiply_param_1,
.param .u64 matrixMultiply_param_2,
.param .u32 matrixMultiply_param_3,
.param .u32 matrixMultiply_param_4,
.param .u32 matrixMultiply_param_5
)
{
.reg .pred %p<9>;
.reg .b32 %r<32>;
.reg .f64 %fd<30>;
.reg .b64 %rd<35>;
ld.param.u64 %rd18, [matrixMultiply_param_0];
ld.param.u64 %rd19, [matrixMultiply_param_1];
ld.param.u64 %rd17, [matrixMultiply_param_2];
ld.param.u32 %r14, [matrixMultiply_param_3];
ld.param.u32 %r12, [matrixMultiply_param_4];
ld.param.u32 %r13, [matrixMultiply_param_5];
cvta.to.global.u64 %rd1, %rd19;
cvta.to.global.u64 %rd2, %rd18;
mov.u32 %r15, %ntid.y;
mov.u32 %r16, %ctaid.y;
mov.u32 %r17, %tid.y;
mad.lo.s32 %r1, %r16, %r15, %r17;
mov.u32 %r18, %ntid.x;
mov.u32 %r19, %ctaid.x;
mov.u32 %r20, %tid.x;
mad.lo.s32 %r2, %r19, %r18, %r20;
setp.ge.s32 %p1, %r1, %r14;
setp.ge.s32 %p2, %r2, %r13;
or.pred %p3, %p1, %p2;
@%p3 bra $L__BB0_9;
setp.lt.s32 %p4, %r12, 1;
mov.f64 %fd29, 0d0000000000000000;
@%p4 bra $L__BB0_8;
add.s32 %r22, %r12, -1;
and.b32 %r31, %r12, 3;
setp.lt.u32 %p5, %r22, 3;
mov.f64 %fd29, 0d0000000000000000;
mov.u32 %r30, 0;
@%p5 bra $L__BB0_5;
sub.s32 %r29, %r12, %r31;
mul.wide.s32 %rd20, %r2, 8;
add.s64 %rd32, %rd1, %rd20;
mul.lo.s32 %r24, %r12, %r1;
mul.wide.s32 %rd21, %r24, 8;
add.s64 %rd22, %rd2, %rd21;
add.s64 %rd31, %rd22, 16;
mul.wide.s32 %rd5, %r13, 8;
$L__BB0_4:
ld.global.f64 %fd12, [%rd32];
ld.global.f64 %fd13, [%rd31+-16];
fma.rn.f64 %fd14, %fd13, %fd12, %fd29;
add.s64 %rd23, %rd32, %rd5;
ld.global.f64 %fd15, [%rd23];
ld.global.f64 %fd16, [%rd31+-8];
fma.rn.f64 %fd17, %fd16, %fd15, %fd14;
add.s64 %rd24, %rd23, %rd5;
ld.global.f64 %fd18, [%rd24];
ld.global.f64 %fd19, [%rd31];
fma.rn.f64 %fd20, %fd19, %fd18, %fd17;
add.s64 %rd25, %rd24, %rd5;
add.s64 %rd32, %rd25, %rd5;
ld.global.f64 %fd21, [%rd25];
ld.global.f64 %fd22, [%rd31+8];
fma.rn.f64 %fd29, %fd22, %fd21, %fd20;
add.s32 %r30, %r30, 4;
add.s64 %rd31, %rd31, 32;
add.s32 %r29, %r29, -4;
setp.ne.s32 %p6, %r29, 0;
@%p6 bra $L__BB0_4;
$L__BB0_5:
setp.eq.s32 %p7, %r31, 0;
@%p7 bra $L__BB0_8;
mad.lo.s32 %r25, %r30, %r13, %r2;
mul.wide.s32 %rd26, %r25, 8;
add.s64 %rd34, %rd1, %rd26;
mul.wide.s32 %rd11, %r13, 8;
mad.lo.s32 %r26, %r12, %r1, %r30;
mul.wide.s32 %rd27, %r26, 8;
add.s64 %rd33, %rd2, %rd27;
$L__BB0_7:
.pragma "nounroll";
ld.global.f64 %fd23, [%rd34];
ld.global.f64 %fd24, [%rd33];
fma.rn.f64 %fd29, %fd24, %fd23, %fd29;
add.s64 %rd34, %rd34, %rd11;
add.s64 %rd33, %rd33, 8;
add.s32 %r31, %r31, -1;
setp.ne.s32 %p8, %r31, 0;
@%p8 bra $L__BB0_7;
$L__BB0_8:
mad.lo.s32 %r27, %r1, %r13, %r2;
cvta.to.global.u64 %rd28, %rd17;
mul.wide.s32 %rd29, %r27, 8;
add.s64 %rd30, %rd28, %rd29;
st.global.f64 [%rd30], %fd29;
$L__BB0_9:
ret;
}