
// gust.linalg.cuda.elemWiseFloat.ptx Maven / Gradle / Ivy
// The newest version!
//
// Generated by NVIDIA NVVM Compiler
// Compiler built on Thu Sep 5 10:08:11 2013 (1378400891)
// Cuda compilation tools, release 5.5, V5.5.0
//
// PTX ISA version this module is written against.
.version 3.2
// Target architecture the module is generated for.
.target sm_30
// Addresses are 64 bits wide throughout this module.
.address_size 64
// Source file with index 1; the ".loc 1 <line> <col>" directives below refer to it.
// The two trailing integers are the optional timestamp and file-size fields of .file.
.file 1 "/Users/dlwh/src/gust/src/main/resources/gust/linalg/cuda/elemWiseFloat.cu", 1413672495, 1071
.visible .entry hadamard(
.param .u32 hadamard_param_0,
.param .u32 hadamard_param_1,
.param .u64 hadamard_param_2,
.param .u32 hadamard_param_3,
.param .u64 hadamard_param_4,
.param .u32 hadamard_param_5,
.param .u64 hadamard_param_6,
.param .u32 hadamard_param_7
)
{
    // hadamard: element-wise float product of two buffers into a third.
    //
    // Each thread derives an index pair
    //     ix = %ntid.x * %ctaid.x + %tid.x
    //     iy = %ntid.y * %ctaid.y + %tid.y
    // and returns without touching memory when ix >= param_0 or iy >= param_1
    // (signed 32-bit comparisons). Otherwise it performs
    //     param_6[iy*param_7 + ix] = param_2[iy*param_3 + ix] * param_4[iy*param_5 + ix]
    // Element indices are computed in 32-bit signed arithmetic and then widened
    // to a 64-bit byte offset (index * 4, one .f32 per element).
    .reg .pred %iy_past, %ix_past, %skip;
    .reg .f32 %val1, %val0, %prod;
    .reg .s32 %lim_x, %lim_y, %ld0, %ld1, %ld_dst;
    .reg .s32 %bdim_x, %blk_x, %thr_x, %ix;
    .reg .s32 %bdim_y, %blk_y, %thr_y, %iy;
    .reg .s32 %idx0, %idx1, %idx_dst;
    .reg .s64 %gen0, %gen1, %gen_dst;
    .reg .s64 %base0, %base1, %base_dst;
    .reg .s64 %off0, %off1, %off_dst;
    .reg .s64 %ptr0, %ptr1, %ptr_dst;

    // Read the kernel arguments: two limits, then (pointer, stride) triples.
    ld.param.u32 %lim_x, [hadamard_param_0];
    ld.param.u32 %lim_y, [hadamard_param_1];
    ld.param.u64 %gen0, [hadamard_param_2];
    ld.param.u32 %ld0, [hadamard_param_3];
    ld.param.u64 %gen1, [hadamard_param_4];
    ld.param.u32 %ld1, [hadamard_param_5];
    ld.param.u64 %gen_dst, [hadamard_param_6];
    ld.param.u32 %ld_dst, [hadamard_param_7];

    // Convert the three generic pointers to global-space addresses.
    cvta.to.global.u64 %base0, %gen0;
    cvta.to.global.u64 %base1, %gen1;
    cvta.to.global.u64 %base_dst, %gen_dst;

    // ix = %ntid.x * %ctaid.x + %tid.x
    .loc 1 8 1
    mov.u32 %bdim_x, %ntid.x;
    mov.u32 %blk_x, %ctaid.x;
    mov.u32 %thr_x, %tid.x;
    mad.lo.s32 %ix, %bdim_x, %blk_x, %thr_x;

    // iy = %ntid.y * %ctaid.y + %tid.y
    .loc 1 9 1
    mov.u32 %bdim_y, %ntid.y;
    mov.u32 %blk_y, %ctaid.y;
    mov.u32 %thr_y, %tid.y;
    mad.lo.s32 %iy, %bdim_y, %blk_y, %thr_y;

    // Leave early when either index is at or beyond its limit.
    .loc 1 11 1
    setp.ge.s32 %iy_past, %iy, %lim_y;
    setp.ge.s32 %ix_past, %ix, %lim_x;
    or.pred %skip, %ix_past, %iy_past;
    .loc 1 11 1
    @%skip bra HADAMARD_EXIT;

    // Address of the first operand: base0 + 4 * (iy*ld0 + ix).
    .loc 1 13 1
    mad.lo.s32 %idx0, %iy, %ld0, %ix;
    mul.wide.s32 %off0, %idx0, 4;
    add.s64 %ptr0, %base0, %off0;

    // Address of the second operand: base1 + 4 * (iy*ld1 + ix).
    .loc 1 13 1
    mad.lo.s32 %idx1, %iy, %ld1, %ix;
    mul.wide.s32 %off1, %idx1, 4;
    add.s64 %ptr1, %base1, %off1;

    // Load both operands, multiply, and form the destination address.
    .loc 1 13 1
    ld.global.f32 %val1, [%ptr1];
    ld.global.f32 %val0, [%ptr0];
    mul.f32 %prod, %val0, %val1;
    mad.lo.s32 %idx_dst, %iy, %ld_dst, %ix;
    mul.wide.s32 %off_dst, %idx_dst, 4;
    add.s64 %ptr_dst, %base_dst, %off_dst;

    .loc 1 13 1
    st.global.f32 [%ptr_dst], %prod;

HADAMARD_EXIT:
    .loc 1 14 2
    ret;
}
.visible .entry matrix_sum(
.param .u32 matrix_sum_param_0,
.param .u32 matrix_sum_param_1,
.param .u64 matrix_sum_param_2,
.param .u32 matrix_sum_param_3,
.param .u64 matrix_sum_param_4,
.param .u32 matrix_sum_param_5,
.param .u64 matrix_sum_param_6,
.param .u32 matrix_sum_param_7
)
{
    // matrix_sum: element-wise float sum of two buffers into a third.
    //
    // Each thread derives an index pair
    //     ix = %ntid.x * %ctaid.x + %tid.x
    //     iy = %ntid.y * %ctaid.y + %tid.y
    // and returns without touching memory when ix >= param_0 or iy >= param_1
    // (signed 32-bit comparisons). Otherwise it performs
    //     param_6[iy*param_7 + ix] = param_2[iy*param_3 + ix] + param_4[iy*param_5 + ix]
    // Element indices are computed in 32-bit signed arithmetic and then widened
    // to a 64-bit byte offset (index * 4, one .f32 per element).
    .reg .pred %iy_past, %ix_past, %skip;
    .reg .f32 %val1, %val0, %total;
    .reg .s32 %lim_x, %lim_y, %ld0, %ld1, %ld_dst;
    .reg .s32 %bdim_x, %blk_x, %thr_x, %ix;
    .reg .s32 %bdim_y, %blk_y, %thr_y, %iy;
    .reg .s32 %idx0, %idx1, %idx_dst;
    .reg .s64 %gen0, %gen1, %gen_dst;
    .reg .s64 %base0, %base1, %base_dst;
    .reg .s64 %off0, %off1, %off_dst;
    .reg .s64 %ptr0, %ptr1, %ptr_dst;

    // Read the kernel arguments: two limits, then (pointer, stride) triples.
    ld.param.u32 %lim_x, [matrix_sum_param_0];
    ld.param.u32 %lim_y, [matrix_sum_param_1];
    ld.param.u64 %gen0, [matrix_sum_param_2];
    ld.param.u32 %ld0, [matrix_sum_param_3];
    ld.param.u64 %gen1, [matrix_sum_param_4];
    ld.param.u32 %ld1, [matrix_sum_param_5];
    ld.param.u64 %gen_dst, [matrix_sum_param_6];
    ld.param.u32 %ld_dst, [matrix_sum_param_7];

    // Convert the three generic pointers to global-space addresses.
    cvta.to.global.u64 %base0, %gen0;
    cvta.to.global.u64 %base1, %gen1;
    cvta.to.global.u64 %base_dst, %gen_dst;

    // ix = %ntid.x * %ctaid.x + %tid.x
    .loc 1 23 1
    mov.u32 %bdim_x, %ntid.x;
    mov.u32 %blk_x, %ctaid.x;
    mov.u32 %thr_x, %tid.x;
    mad.lo.s32 %ix, %bdim_x, %blk_x, %thr_x;

    // iy = %ntid.y * %ctaid.y + %tid.y
    .loc 1 24 1
    mov.u32 %bdim_y, %ntid.y;
    mov.u32 %blk_y, %ctaid.y;
    mov.u32 %thr_y, %tid.y;
    mad.lo.s32 %iy, %bdim_y, %blk_y, %thr_y;

    // Leave early when either index is at or beyond its limit.
    .loc 1 26 1
    setp.ge.s32 %iy_past, %iy, %lim_y;
    setp.ge.s32 %ix_past, %ix, %lim_x;
    or.pred %skip, %ix_past, %iy_past;
    .loc 1 26 1
    @%skip bra MATRIX_SUM_EXIT;

    // Address of the first operand: base0 + 4 * (iy*ld0 + ix).
    .loc 1 28 1
    mad.lo.s32 %idx0, %iy, %ld0, %ix;
    mul.wide.s32 %off0, %idx0, 4;
    add.s64 %ptr0, %base0, %off0;

    // Address of the second operand: base1 + 4 * (iy*ld1 + ix).
    .loc 1 28 1
    mad.lo.s32 %idx1, %iy, %ld1, %ix;
    mul.wide.s32 %off1, %idx1, 4;
    add.s64 %ptr1, %base1, %off1;

    // Load both operands, add, and form the destination address.
    .loc 1 28 1
    ld.global.f32 %val1, [%ptr1];
    ld.global.f32 %val0, [%ptr0];
    add.f32 %total, %val0, %val1;
    mad.lo.s32 %idx_dst, %iy, %ld_dst, %ix;
    mul.wide.s32 %off_dst, %idx_dst, 4;
    add.s64 %ptr_dst, %base_dst, %off_dst;

    .loc 1 28 1
    st.global.f32 [%ptr_dst], %total;

MATRIX_SUM_EXIT:
    .loc 1 29 2
    ret;
}
.visible .entry copy(
.param .u32 copy_param_0,
.param .u32 copy_param_1,
.param .u64 copy_param_2,
.param .u32 copy_param_3,
.param .u64 copy_param_4,
.param .u32 copy_param_5
)
{
    // copy: element-wise float copy from one buffer into another.
    //
    // Each thread derives an index pair
    //     ix = %ntid.x * %ctaid.x + %tid.x
    //     iy = %ntid.y * %ctaid.y + %tid.y
    // and returns without touching memory when ix >= param_0 or iy >= param_1
    // (signed 32-bit comparisons). Otherwise it performs
    //     param_2[iy*param_3 + ix] = param_4[iy*param_5 + ix]
    // i.e. the buffer passed as param_4 is read and the one passed as param_2
    // is written. Element indices are computed in 32-bit signed arithmetic and
    // then widened to a 64-bit byte offset (index * 4, one .f32 per element).
    .reg .pred %iy_past, %ix_past, %skip;
    .reg .f32 %val;
    .reg .s32 %lim_x, %lim_y, %ld_dst, %ld_src;
    .reg .s32 %bdim_x, %blk_x, %thr_x, %ix;
    .reg .s32 %bdim_y, %blk_y, %thr_y, %iy;
    .reg .s32 %idx_src, %idx_dst;
    .reg .s64 %gen_dst, %gen_src;
    .reg .s64 %base_dst, %base_src;
    .reg .s64 %off_src, %off_dst;
    .reg .s64 %ptr_src, %ptr_dst;

    // Read the kernel arguments: two limits, then (pointer, stride) pairs.
    ld.param.u32 %lim_x, [copy_param_0];
    ld.param.u32 %lim_y, [copy_param_1];
    ld.param.u64 %gen_dst, [copy_param_2];
    ld.param.u32 %ld_dst, [copy_param_3];
    ld.param.u64 %gen_src, [copy_param_4];
    ld.param.u32 %ld_src, [copy_param_5];

    // Convert both generic pointers to global-space addresses.
    cvta.to.global.u64 %base_src, %gen_src;
    cvta.to.global.u64 %base_dst, %gen_dst;

    // ix = %ntid.x * %ctaid.x + %tid.x
    .loc 1 38 1
    mov.u32 %bdim_x, %ntid.x;
    mov.u32 %blk_x, %ctaid.x;
    mov.u32 %thr_x, %tid.x;
    mad.lo.s32 %ix, %bdim_x, %blk_x, %thr_x;

    // iy = %ntid.y * %ctaid.y + %tid.y
    .loc 1 39 1
    mov.u32 %bdim_y, %ntid.y;
    mov.u32 %blk_y, %ctaid.y;
    mov.u32 %thr_y, %tid.y;
    mad.lo.s32 %iy, %bdim_y, %blk_y, %thr_y;

    // Leave early when either index is at or beyond its limit.
    .loc 1 41 1
    setp.ge.s32 %iy_past, %iy, %lim_y;
    setp.ge.s32 %ix_past, %ix, %lim_x;
    or.pred %skip, %ix_past, %iy_past;
    .loc 1 41 1
    @%skip bra COPY_EXIT;

    // Source address: base_src + 4 * (iy*ld_src + ix).
    .loc 1 43 1
    mad.lo.s32 %idx_src, %iy, %ld_src, %ix;
    mul.wide.s32 %off_src, %idx_src, 4;
    add.s64 %ptr_src, %base_src, %off_src;

    // Destination address: base_dst + 4 * (iy*ld_dst + ix).
    .loc 1 43 1
    mad.lo.s32 %idx_dst, %iy, %ld_dst, %ix;
    mul.wide.s32 %off_dst, %idx_dst, 4;
    add.s64 %ptr_dst, %base_dst, %off_dst;

    // Move one float from source to destination.
    .loc 1 43 1
    ld.global.f32 %val, [%ptr_src];
    st.global.f32 [%ptr_dst], %val;

COPY_EXIT:
    .loc 1 44 2
    ret;
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy