
// gust.linalg.cuda.enforceLUFloat.ptx (Maven / Gradle / Ivy)
// The newest version!
//
// Generated by NVIDIA NVVM Compiler
// Compiler built on Thu Sep 5 10:08:11 2013 (1378400891)
// Cuda compilation tools, release 5.5, V5.5.0
//
// PTX ISA version this module is written against.
.version 3.2
// Target architecture the module is compiled for.
.target sm_30
// Addresses are 64 bits wide throughout this module.
.address_size 64
// Source file #1 for debug info; the `.loc 1 <line> <col>` directives in the
// kernels below refer to it. The two trailing integers are the optional
// timestamp and file-size fields of the directive.
.file 1 "/Users/dlwh/src/gust/src/main/resources/gust/linalg/cuda/enforceLUFloat.cu", 1413672495, 1095
//-----------------------------------------------------------------------
// Kernel: enforceLU
//   enforceLU_param_0 (u64): generic address of an f32 buffer; converted
//                            to a global-space address before use.
//   enforceLU_param_1 (u32): multiplier applied to the block index when
//                            forming the element index (presumably the
//                            leading dimension of a matrix - confirm
//                            against the .cu source / caller).
//
// With b = %ctaid.x and t = %tid.x, every thread with t <= b stores
//     buffer[b * param_1 + t] = (t == b) ? 1.0f : 0.0f
// Threads with t > b do nothing. No other bounds check is performed.
//-----------------------------------------------------------------------
.visible .entry enforceLU(
    .param .u64 enforceLU_param_0,
    .param .u32 enforceLU_param_1
)
{
    .reg .pred  %skip, %on_diag;
    .reg .f32   %val;
    .reg .s32   %blk, %thr, %stride, %idx;
    .reg .s64   %gen_ptr, %base, %byte_off, %addr;

    // Fetch the kernel arguments and move the pointer into global space.
    ld.param.u64        %gen_ptr, [enforceLU_param_0];
    ld.param.u32        %stride, [enforceLU_param_1];
    cvta.to.global.u64  %base, %gen_ptr;

    .loc 1 8 1
    mov.u32             %blk, %ctaid.x;
    .loc 1 7 1
    mov.u32             %thr, %tid.x;

    // Threads whose index exceeds the block index have nothing to write.
    .loc 1 9 1
    setp.gt.s32         %skip, %thr, %blk;
    @%skip bra          ENFORCE_LU_DONE;

    // val = (thr == blk) ? 1.0f : 0.0f
    .loc 1 10 1
    setp.eq.s32         %on_diag, %thr, %blk;
    selp.f32            %val, 0f3F800000, 0f00000000, %on_diag;

    // addr = base + 4 * (blk * stride + thr)
    mad.lo.s32          %idx, %blk, %stride, %thr;
    mul.wide.s32        %byte_off, %idx, 4;
    add.s64             %addr, %base, %byte_off;
    .loc 1 10 1
    st.global.f32       [%addr], %val;

ENFORCE_LU_DONE:
    .loc 1 11 2
    ret;
}
//-----------------------------------------------------------------------
// Kernel: zerosU
//   zerosU_param_0 (u32): exclusive upper bound on the global x index.
//   zerosU_param_1 (u32): exclusive upper bound on the global y index.
//   zerosU_param_2 (u64): generic address of a buffer of 4-byte elements;
//                         converted to a global-space address before use.
//   zerosU_param_3 (u32): multiplier applied to y when forming the
//                         element index (presumably the leading
//                         dimension - confirm against the caller).
//   zerosU_param_4 (u32): when non-zero, elements with x == y are
//                         cleared as well.
//
// With x = %ntid.x * %ctaid.x + %tid.x and y likewise on the .y axis,
// a thread inside both bounds writes a 32-bit zero to
//     buffer[y * param_3 + x]
// when x < y, or when x == y and param_4 != 0. All other threads leave
// memory untouched.
//-----------------------------------------------------------------------
.visible .entry zerosU(
    .param .u32 zerosU_param_0,
    .param .u32 zerosU_param_1,
    .param .u64 zerosU_param_2,
    .param .u32 zerosU_param_3,
    .param .u32 zerosU_param_4
)
{
    .reg .pred  %y_oob, %x_oob, %oob;
    .reg .pred  %x_lt_y, %x_eq_y, %flag_on, %do_diag;
    .reg .s32   %x_lim, %y_lim, %stride, %diag_flag;
    .reg .s32   %bdim_x, %bid_x, %loc_x, %gx;
    .reg .s32   %bdim_y, %bid_y, %loc_y, %gy;
    .reg .s32   %idx, %zero;
    .reg .s64   %gen_ptr, %base, %byte_off, %addr;

    // Fetch the kernel arguments and move the pointer into global space.
    ld.param.u32        %x_lim, [zerosU_param_0];
    ld.param.u32        %y_lim, [zerosU_param_1];
    ld.param.u64        %gen_ptr, [zerosU_param_2];
    ld.param.u32        %stride, [zerosU_param_3];
    ld.param.u32        %diag_flag, [zerosU_param_4];
    cvta.to.global.u64  %base, %gen_ptr;

    // gx = ntid.x * ctaid.x + tid.x
    .loc 1 21 1
    mov.u32             %bdim_x, %ntid.x;
    mov.u32             %bid_x, %ctaid.x;
    mov.u32             %loc_x, %tid.x;
    mad.lo.s32          %gx, %bdim_x, %bid_x, %loc_x;

    // gy = ntid.y * ctaid.y + tid.y
    .loc 1 22 1
    mov.u32             %bdim_y, %ntid.y;
    mov.u32             %bid_y, %ctaid.y;
    mov.u32             %loc_y, %tid.y;
    mad.lo.s32          %gy, %bdim_y, %bid_y, %loc_y;

    // Leave immediately if either index is outside its bound.
    .loc 1 24 1
    setp.ge.s32         %y_oob, %gy, %y_lim;
    setp.ge.s32         %x_oob, %gx, %x_lim;
    or.pred             %oob, %x_oob, %y_oob;
    .loc 1 24 1
    @%oob bra           ZEROS_U_DONE;

    .loc 1 26 1
    setp.lt.s32         %x_lt_y, %gx, %gy;

    // addr = base + 4 * (gy * stride + gx); shared by both store paths.
    .loc 1 27 1
    mad.lo.s32          %idx, %gy, %stride, %gx;
    mul.wide.s32        %byte_off, %idx, 4;
    add.s64             %addr, %base, %byte_off;

    .loc 1 26 1
    @%x_lt_y bra        ZEROS_U_OFFDIAG;

    // Not x < y: only the x == y element is cleared, and only on request.
    .loc 1 28 1
    setp.eq.s32         %x_eq_y, %gx, %gy;
    setp.ne.s32         %flag_on, %diag_flag, 0;
    and.pred            %do_diag, %x_eq_y, %flag_on;
    .loc 1 28 1
    @!%do_diag bra      ZEROS_U_DONE;

    // x == y with the flag set: fall through and clear the element.
    mov.u32             %zero, 0;
    .loc 1 29 1
    st.global.u32       [%addr], %zero;
    bra.uni             ZEROS_U_DONE;

ZEROS_U_OFFDIAG:
    // x < y: clear unconditionally.
    mov.u32             %zero, 0;
    .loc 1 27 1
    st.global.u32       [%addr], %zero;

ZEROS_U_DONE:
    .loc 1 30 2
    ret;
}
//-----------------------------------------------------------------------
// Kernel: zerosL
//   zerosL_param_0 (u32): exclusive upper bound on the global x index.
//   zerosL_param_1 (u32): exclusive upper bound on the global y index.
//   zerosL_param_2 (u64): generic address of a buffer of 4-byte elements;
//                         converted to a global-space address before use.
//   zerosL_param_3 (u32): multiplier applied to y when forming the
//                         element index (presumably the leading
//                         dimension - confirm against the caller).
//   zerosL_param_4 (u32): when non-zero, elements with x == y are
//                         cleared as well.
//
// With x = %ntid.x * %ctaid.x + %tid.x and y likewise on the .y axis,
// a thread inside both bounds writes a 32-bit zero to
//     buffer[y * param_3 + x]
// when x > y, or when x == y and param_4 != 0. All other threads leave
// memory untouched.
//-----------------------------------------------------------------------
.visible .entry zerosL(
    .param .u32 zerosL_param_0,
    .param .u32 zerosL_param_1,
    .param .u64 zerosL_param_2,
    .param .u32 zerosL_param_3,
    .param .u32 zerosL_param_4
)
{
    .reg .pred  %y_oob, %x_oob, %oob;
    .reg .pred  %x_gt_y, %x_eq_y, %flag_on, %do_diag;
    .reg .s32   %x_lim, %y_lim, %stride, %diag_flag;
    .reg .s32   %bdim_x, %bid_x, %loc_x, %gx;
    .reg .s32   %bdim_y, %bid_y, %loc_y, %gy;
    .reg .s32   %idx, %zero;
    .reg .s64   %gen_ptr, %base, %byte_off, %addr;

    // Fetch the kernel arguments and move the pointer into global space.
    ld.param.u32        %x_lim, [zerosL_param_0];
    ld.param.u32        %y_lim, [zerosL_param_1];
    ld.param.u64        %gen_ptr, [zerosL_param_2];
    ld.param.u32        %stride, [zerosL_param_3];
    ld.param.u32        %diag_flag, [zerosL_param_4];
    cvta.to.global.u64  %base, %gen_ptr;

    // gx = ntid.x * ctaid.x + tid.x
    .loc 1 39 1
    mov.u32             %bdim_x, %ntid.x;
    mov.u32             %bid_x, %ctaid.x;
    mov.u32             %loc_x, %tid.x;
    mad.lo.s32          %gx, %bdim_x, %bid_x, %loc_x;

    // gy = ntid.y * ctaid.y + tid.y
    .loc 1 40 1
    mov.u32             %bdim_y, %ntid.y;
    mov.u32             %bid_y, %ctaid.y;
    mov.u32             %loc_y, %tid.y;
    mad.lo.s32          %gy, %bdim_y, %bid_y, %loc_y;

    // Leave immediately if either index is outside its bound.
    .loc 1 42 1
    setp.ge.s32         %y_oob, %gy, %y_lim;
    setp.ge.s32         %x_oob, %gx, %x_lim;
    or.pred             %oob, %x_oob, %y_oob;
    .loc 1 42 1
    @%oob bra           ZEROS_L_DONE;

    .loc 1 44 1
    setp.gt.s32         %x_gt_y, %gx, %gy;

    // addr = base + 4 * (gy * stride + gx); shared by both store paths.
    .loc 1 45 1
    mad.lo.s32          %idx, %gy, %stride, %gx;
    mul.wide.s32        %byte_off, %idx, 4;
    add.s64             %addr, %base, %byte_off;

    .loc 1 44 1
    @%x_gt_y bra        ZEROS_L_OFFDIAG;

    // Not x > y: only the x == y element is cleared, and only on request.
    .loc 1 46 1
    setp.eq.s32         %x_eq_y, %gx, %gy;
    setp.ne.s32         %flag_on, %diag_flag, 0;
    and.pred            %do_diag, %x_eq_y, %flag_on;
    .loc 1 46 1
    @!%do_diag bra      ZEROS_L_DONE;

    // x == y with the flag set: fall through and clear the element.
    mov.u32             %zero, 0;
    .loc 1 47 1
    st.global.u32       [%addr], %zero;
    bra.uni             ZEROS_L_DONE;

ZEROS_L_OFFDIAG:
    // x > y: clear unconditionally.
    mov.u32             %zero, 0;
    .loc 1 45 1
    st.global.u32       [%addr], %zero;

ZEROS_L_DONE:
    .loc 1 48 2
    ret;
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy