//
// kernels.SystemML.ptx — GPU kernels from the Apache SystemML artifact
// (systemml, "Declarative Machine Learning"; listing header from the
// Maven/Gradle/Ivy artifact page converted to comments so the file parses as PTX)
//
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-21124049
// Cuda compilation tools, release 8.0, V8.0.44
// Based on LLVM 3.4svn
//
// Module header: PTX ISA 5.0, minimum target sm_30, 64-bit addressing.
.version 5.0
.target sm_30
.address_size 64
// .globl double2float_f
// Forward declarations of internal device helpers whose definitions appear
// elsewhere in this module (trig argument reduction and accurate pow).
.func (.param .b64 func_retval0) __internal_trig_reduction_slowpathd
(
.param .b64 __internal_trig_reduction_slowpathd_param_0,
.param .b64 __internal_trig_reduction_slowpathd_param_1
)
;
.func (.param .b64 func_retval0) __internal_accurate_pow
(
.param .b64 __internal_accurate_pow_param_0,
.param .b64 __internal_accurate_pow_param_1
)
;
// Dynamically-sized shared-memory buffer (extern: actual size is supplied at
// kernel launch). NOTE(review): not referenced by the kernels visible in this
// chunk; presumably used by reduction kernels later in the file — confirm.
.extern .shared .align 1 .b8 my_sdata[];
// Byte-level constant tables used by CUDA's internal math routines
// (2/pi bit patterns for trig reduction, and sin/cos polynomial coefficients).
.const .align 4 .b8 __cudart_i2opi_f[24] = {65, 144, 67, 60, 153, 149, 98, 219, 192, 221, 52, 245, 209, 87, 39, 252, 41, 21, 68, 78, 110, 131, 249, 162};
.const .align 8 .b8 __cudart_i2opi_d[144] = {8, 93, 141, 31, 177, 95, 251, 107, 234, 146, 82, 138, 247, 57, 7, 61, 123, 241, 229, 235, 199, 186, 39, 117, 45, 234, 95, 158, 102, 63, 70, 79, 183, 9, 203, 39, 207, 126, 54, 109, 31, 109, 10, 90, 139, 17, 47, 239, 15, 152, 5, 222, 255, 151, 248, 31, 59, 40, 249, 189, 139, 95, 132, 156, 244, 57, 83, 131, 57, 214, 145, 57, 65, 126, 95, 180, 38, 112, 156, 233, 132, 68, 187, 46, 245, 53, 130, 232, 62, 167, 41, 177, 28, 235, 29, 254, 28, 146, 209, 9, 234, 46, 73, 6, 224, 210, 77, 66, 58, 110, 36, 183, 97, 197, 187, 222, 171, 99, 81, 254, 65, 144, 67, 60, 153, 149, 98, 219, 192, 221, 52, 245, 209, 87, 39, 252, 41, 21, 68, 78, 110, 131, 249, 162};
.const .align 8 .b8 __cudart_sin_cos_coeffs[128] = {186, 94, 120, 249, 101, 219, 229, 61, 70, 210, 176, 44, 241, 229, 90, 190, 146, 227, 172, 105, 227, 29, 199, 62, 161, 98, 219, 25, 160, 1, 42, 191, 24, 8, 17, 17, 17, 17, 129, 63, 84, 85, 85, 85, 85, 85, 197, 191, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 100, 129, 253, 32, 131, 255, 168, 189, 40, 133, 239, 193, 167, 238, 33, 62, 217, 230, 6, 142, 79, 126, 146, 190, 233, 188, 221, 25, 160, 1, 250, 62, 71, 93, 193, 22, 108, 193, 86, 191, 81, 85, 85, 85, 85, 85, 165, 63, 0, 0, 0, 0, 0, 0, 224, 191, 0, 0, 0, 0, 0, 0, 240, 63};
// Kernel: elementwise cast double -> float.
// param_0 = input f64 array, param_1 = output f32 array, param_2 = element count n.
// One thread per element; threads with index >= n do nothing.
.visible .entry double2float_f(
.param .u64 double2float_f_param_0,
.param .u64 double2float_f_param_1,
.param .u32 double2float_f_param_2
)
{
.reg .pred %p<2>;
.reg .f32 %f<2>;
.reg .b32 %r<6>;
.reg .f64 %fd<2>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [double2float_f_param_0];
ld.param.u64 %rd2, [double2float_f_param_1];
ld.param.u32 %r2, [double2float_f_param_2];
// Global thread index: r1 = ntid.x * ctaid.x + tid.x.
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
// Bounds guard: skip when index >= n.
setp.ge.s32 %p1, %r1, %r2;
@%p1 bra BB0_2;
cvta.to.global.u64 %rd3, %rd1;
// Byte offset = index * 8 (f64 input).
mul.wide.s32 %rd4, %r1, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd1, [%rd5];
// Convert with round-to-nearest-even.
cvt.rn.f32.f64 %f1, %fd1;
cvta.to.global.u64 %rd6, %rd2;
// Byte offset = index * 4 (f32 output).
mul.wide.s32 %rd7, %r1, 4;
add.s64 %rd8, %rd6, %rd7;
st.global.f32 [%rd8], %f1;
BB0_2:
ret;
}
// .globl float2double_f
// Kernel: elementwise cast float -> double (exact widening conversion).
// param_0 = input f32 array, param_1 = output f64 array, param_2 = element count n.
.visible .entry float2double_f(
.param .u64 float2double_f_param_0,
.param .u64 float2double_f_param_1,
.param .u32 float2double_f_param_2
)
{
.reg .pred %p<2>;
.reg .f32 %f<2>;
.reg .b32 %r<6>;
.reg .f64 %fd<2>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [float2double_f_param_0];
ld.param.u64 %rd2, [float2double_f_param_1];
ld.param.u32 %r2, [float2double_f_param_2];
// Global thread index.
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
// Bounds guard: skip when index >= n.
setp.ge.s32 %p1, %r1, %r2;
@%p1 bra BB1_2;
cvta.to.global.u64 %rd3, %rd1;
// f32 input: 4-byte stride.
mul.wide.s32 %rd4, %r1, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f1, [%rd5];
cvt.f64.f32 %fd1, %f1;
cvta.to.global.u64 %rd6, %rd2;
// f64 output: 8-byte stride.
mul.wide.s32 %rd7, %r1, 8;
add.s64 %rd8, %rd6, %rd7;
st.global.f64 [%rd8], %fd1;
BB1_2:
ret;
}
// .globl slice_sparse_dense_row_d
// Kernel: slice a sparse matrix (CSR-style arrays) into a dense f64 output,
// one thread per sliced row.
// Observed roles (inferred from access pattern and kernel name — confirm against
// the SystemML host launch code):
//   param_0 (rd9)  = values array (f64)
//   param_1 (rd10) = row-pointer array (i32; reads [row] and [row+1])
//   param_2 (rd11) = column-index array (i32)
//   param_3 (rd12) = dense output (f64)
//   param_4 (r15)  = first row of slice, param_5 (r16) = last row (inclusive)
//   param_6 (r12)  = first column, param_7 (r13) = last column (inclusive)
//   param_8 (r14)  = output row length
.visible .entry slice_sparse_dense_row_d(
.param .u64 slice_sparse_dense_row_d_param_0,
.param .u64 slice_sparse_dense_row_d_param_1,
.param .u64 slice_sparse_dense_row_d_param_2,
.param .u64 slice_sparse_dense_row_d_param_3,
.param .u32 slice_sparse_dense_row_d_param_4,
.param .u32 slice_sparse_dense_row_d_param_5,
.param .u32 slice_sparse_dense_row_d_param_6,
.param .u32 slice_sparse_dense_row_d_param_7,
.param .u32 slice_sparse_dense_row_d_param_8
)
{
.reg .pred %p<7>;
.reg .b32 %r<24>;
.reg .f64 %fd<2>;
.reg .b64 %rd<23>;
ld.param.u64 %rd9, [slice_sparse_dense_row_d_param_0];
ld.param.u64 %rd10, [slice_sparse_dense_row_d_param_1];
ld.param.u64 %rd11, [slice_sparse_dense_row_d_param_2];
ld.param.u64 %rd12, [slice_sparse_dense_row_d_param_3];
ld.param.u32 %r15, [slice_sparse_dense_row_d_param_4];
ld.param.u32 %r16, [slice_sparse_dense_row_d_param_5];
ld.param.u32 %r12, [slice_sparse_dense_row_d_param_6];
ld.param.u32 %r13, [slice_sparse_dense_row_d_param_7];
ld.param.u32 %r14, [slice_sparse_dense_row_d_param_8];
mov.u32 %r17, %ntid.x;
mov.u32 %r18, %ctaid.x;
mov.u32 %r19, %tid.x;
mad.lo.s32 %r1, %r17, %r18, %r19;
// r2 = source row = first-row + thread index; exit if past last row.
add.s32 %r2, %r1, %r15;
setp.gt.s32 %p1, %r2, %r16;
@%p1 bra BB2_6;
cvta.to.global.u64 %rd13, %rd10;
mul.wide.s32 %rd14, %r2, 4;
add.s64 %rd1, %rd13, %rd14;
// r23 = rowPtr[row], r22 = rowPtr[row+1]; empty row => exit.
ld.global.u32 %r23, [%rd1];
ld.global.u32 %r22, [%rd1+4];
setp.ge.s32 %p2, %r23, %r22;
@%p2 bra BB2_6;
cvta.to.global.u64 %rd2, %rd12;
cvta.to.global.u64 %rd15, %rd9;
cvta.to.global.u64 %rd16, %rd11;
// r5 = (thread row index * output-row-length) - first-column: base for the
// dense destination index (column offset is added per nonzero below).
mul.lo.s32 %r20, %r1, %r14;
sub.s32 %r5, %r20, %r12;
// rd22 / rd21 = running pointers into values / column indices for this row.
mul.wide.s32 %rd17, %r23, 8;
add.s64 %rd22, %rd15, %rd17;
mul.wide.s32 %rd18, %r23, 4;
add.s64 %rd21, %rd16, %rd18;
// Loop over the row's nonzeros.
BB2_3:
ld.global.u32 %r8, [%rd21];
// Skip entries whose column index lies outside [first-col, last-col].
setp.lt.s32 %p3, %r8, %r12;
setp.gt.s32 %p4, %r8, %r13;
or.pred %p5, %p3, %p4;
@%p5 bra BB2_5;
// In range: out[r5 + col] = value.
ld.global.f64 %fd1, [%rd22];
add.s32 %r21, %r5, %r8;
mul.wide.s32 %rd19, %r21, 8;
add.s64 %rd20, %rd2, %rd19;
st.global.f64 [%rd20], %fd1;
// Re-read the row's end pointer (compiler-generated reload).
ld.global.u32 %r22, [%rd1+4];
BB2_5:
// Advance to the next nonzero; loop while position < rowPtr[row+1].
add.s64 %rd22, %rd22, 8;
add.s64 %rd21, %rd21, 4;
add.s32 %r23, %r23, 1;
setp.lt.s32 %p6, %r23, %r22;
@%p6 bra BB2_3;
BB2_6:
ret;
}
// .globl slice_sparse_dense_row_f
// Kernel: f32 variant of slice_sparse_dense_row_d (see that kernel for the
// detailed parameter roles). Identical control flow; values and output use
// 4-byte f32 elements instead of 8-byte f64.
.visible .entry slice_sparse_dense_row_f(
.param .u64 slice_sparse_dense_row_f_param_0,
.param .u64 slice_sparse_dense_row_f_param_1,
.param .u64 slice_sparse_dense_row_f_param_2,
.param .u64 slice_sparse_dense_row_f_param_3,
.param .u32 slice_sparse_dense_row_f_param_4,
.param .u32 slice_sparse_dense_row_f_param_5,
.param .u32 slice_sparse_dense_row_f_param_6,
.param .u32 slice_sparse_dense_row_f_param_7,
.param .u32 slice_sparse_dense_row_f_param_8
)
{
.reg .pred %p<7>;
.reg .f32 %f<2>;
.reg .b32 %r<24>;
.reg .b64 %rd<22>;
ld.param.u64 %rd9, [slice_sparse_dense_row_f_param_0];
ld.param.u64 %rd10, [slice_sparse_dense_row_f_param_1];
ld.param.u64 %rd11, [slice_sparse_dense_row_f_param_2];
ld.param.u64 %rd12, [slice_sparse_dense_row_f_param_3];
ld.param.u32 %r15, [slice_sparse_dense_row_f_param_4];
ld.param.u32 %r16, [slice_sparse_dense_row_f_param_5];
ld.param.u32 %r12, [slice_sparse_dense_row_f_param_6];
ld.param.u32 %r13, [slice_sparse_dense_row_f_param_7];
ld.param.u32 %r14, [slice_sparse_dense_row_f_param_8];
mov.u32 %r17, %ntid.x;
mov.u32 %r18, %ctaid.x;
mov.u32 %r19, %tid.x;
mad.lo.s32 %r1, %r17, %r18, %r19;
// Source row = first-row + thread index; exit if past last row.
add.s32 %r2, %r1, %r15;
setp.gt.s32 %p1, %r2, %r16;
@%p1 bra BB3_6;
cvta.to.global.u64 %rd13, %rd10;
mul.wide.s32 %rd14, %r2, 4;
add.s64 %rd1, %rd13, %rd14;
// r23 = rowPtr[row], r22 = rowPtr[row+1]; empty row => exit.
ld.global.u32 %r23, [%rd1];
ld.global.u32 %r22, [%rd1+4];
setp.ge.s32 %p2, %r23, %r22;
@%p2 bra BB3_6;
cvta.to.global.u64 %rd2, %rd12;
cvta.to.global.u64 %rd15, %rd9;
cvta.to.global.u64 %rd16, %rd11;
// Destination base index = thread-row * output-row-length - first-column.
mul.lo.s32 %r20, %r1, %r14;
sub.s32 %r5, %r20, %r12;
// Both values (f32) and column indices (i32) use 4-byte stride here,
// so a single offset serves both running pointers.
mul.wide.s32 %rd17, %r23, 4;
add.s64 %rd21, %rd15, %rd17;
add.s64 %rd20, %rd16, %rd17;
// Loop over the row's nonzeros.
BB3_3:
ld.global.u32 %r8, [%rd20];
// Skip columns outside [first-col, last-col].
setp.lt.s32 %p3, %r8, %r12;
setp.gt.s32 %p4, %r8, %r13;
or.pred %p5, %p3, %p4;
@%p5 bra BB3_5;
ld.global.f32 %f1, [%rd21];
add.s32 %r21, %r5, %r8;
mul.wide.s32 %rd18, %r21, 4;
add.s64 %rd19, %rd2, %rd18;
st.global.f32 [%rd19], %f1;
ld.global.u32 %r22, [%rd1+4];
BB3_5:
// Advance; loop while position < rowPtr[row+1].
add.s64 %rd21, %rd21, 4;
add.s64 %rd20, %rd20, 4;
add.s32 %r23, %r23, 1;
setp.lt.s32 %p6, %r23, %r22;
@%p6 bra BB3_3;
BB3_6:
ret;
}
// .globl slice_sparse_dense_nnz_d
// Kernel: slice a sparse matrix into a dense f64 output, one thread per
// NONZERO (rather than per row as in slice_sparse_dense_row_d).
// Observed roles (inferred — confirm against host launch code):
//   param_0 (rd5) = values (f64), param_1 (rd8) = row pointers (i32),
//   param_2 (rd6) = column indices (i32), param_3 (rd7) = dense output (f64),
//   param_4 (r5)  = first row, param_5 (r9) = last row,
//   param_6 (r6)  = first column, param_7 (r7) = last column,
//   param_8 (r8)  = output row length.
.visible .entry slice_sparse_dense_nnz_d(
.param .u64 slice_sparse_dense_nnz_d_param_0,
.param .u64 slice_sparse_dense_nnz_d_param_1,
.param .u64 slice_sparse_dense_nnz_d_param_2,
.param .u64 slice_sparse_dense_nnz_d_param_3,
.param .u32 slice_sparse_dense_nnz_d_param_4,
.param .u32 slice_sparse_dense_nnz_d_param_5,
.param .u32 slice_sparse_dense_nnz_d_param_6,
.param .u32 slice_sparse_dense_nnz_d_param_7,
.param .u32 slice_sparse_dense_nnz_d_param_8
)
{
.reg .pred %p<6>;
.reg .b32 %r<22>;
.reg .f64 %fd<2>;
.reg .b64 %rd<22>;
ld.param.u64 %rd5, [slice_sparse_dense_nnz_d_param_0];
ld.param.u64 %rd8, [slice_sparse_dense_nnz_d_param_1];
ld.param.u64 %rd6, [slice_sparse_dense_nnz_d_param_2];
ld.param.u64 %rd7, [slice_sparse_dense_nnz_d_param_3];
ld.param.u32 %r5, [slice_sparse_dense_nnz_d_param_4];
ld.param.u32 %r9, [slice_sparse_dense_nnz_d_param_5];
ld.param.u32 %r6, [slice_sparse_dense_nnz_d_param_6];
ld.param.u32 %r7, [slice_sparse_dense_nnz_d_param_7];
ld.param.u32 %r8, [slice_sparse_dense_nnz_d_param_8];
mov.u32 %r10, %ntid.x;
mov.u32 %r11, %ctaid.x;
mov.u32 %r12, %tid.x;
mad.lo.s32 %r13, %r10, %r11, %r12;
cvta.to.global.u64 %rd1, %rd8;
// r1 = absolute nonzero index = thread index + rowPtr[first-row].
mul.wide.s32 %rd9, %r5, 4;
add.s64 %rd10, %rd1, %rd9;
ld.global.u32 %r14, [%rd10];
add.s32 %r1, %r13, %r14;
// Exit when past the slice's last nonzero, rowPtr[last-row + 1].
mul.wide.s32 %rd11, %r9, 4;
add.s64 %rd12, %rd1, %rd11;
ld.global.u32 %r15, [%rd12+4];
setp.ge.s32 %p1, %r1, %r15;
@%p1 bra BB4_5;
cvta.to.global.u64 %rd2, %rd7;
cvta.to.global.u64 %rd3, %rd5;
cvta.to.global.u64 %rd13, %rd6;
cvt.s64.s32 %rd4, %r1;
// r2 = column index of this nonzero; skip if outside [first-col, last-col].
mul.wide.s32 %rd14, %r1, 4;
add.s64 %rd15, %rd13, %rd14;
ld.global.u32 %r2, [%rd15];
setp.lt.s32 %p2, %r2, %r6;
setp.gt.s32 %p3, %r2, %r7;
or.pred %p4, %p2, %p3;
@%p4 bra BB4_5;
// Find the owning row: linearly scan rowPtr from first-row until
// rowPtr[row+1] > nnz-index. r3 ends as the owning row.
mov.u32 %r21, %r5;
BB4_3:
mov.u32 %r3, %r21;
add.s32 %r4, %r3, 1;
mul.wide.s32 %rd16, %r4, 4;
add.s64 %rd17, %rd1, %rd16;
ld.global.u32 %r16, [%rd17];
setp.le.s32 %p5, %r16, %r1;
mov.u32 %r21, %r4;
@%p5 bra BB4_3;
// Load the value (nnz-index * 8 bytes).
shl.b64 %rd18, %rd4, 3;
add.s64 %rd19, %rd3, %rd18;
ld.global.f64 %fd1, [%rd19];
// Dense destination index = (row - first-row) * out-row-length
//                           - first-column + column.
sub.s32 %r17, %r3, %r5;
mul.lo.s32 %r18, %r17, %r8;
sub.s32 %r19, %r18, %r6;
add.s32 %r20, %r19, %r2;
mul.wide.s32 %rd20, %r20, 8;
add.s64 %rd21, %rd2, %rd20;
st.global.f64 [%rd21], %fd1;
BB4_5:
ret;
}
// .globl slice_sparse_dense_nnz_f
// Kernel: f32 variant of slice_sparse_dense_nnz_d (see that kernel for the
// detailed parameter roles). One thread per nonzero; identical control flow,
// with 4-byte f32 values/output instead of 8-byte f64.
.visible .entry slice_sparse_dense_nnz_f(
.param .u64 slice_sparse_dense_nnz_f_param_0,
.param .u64 slice_sparse_dense_nnz_f_param_1,
.param .u64 slice_sparse_dense_nnz_f_param_2,
.param .u64 slice_sparse_dense_nnz_f_param_3,
.param .u32 slice_sparse_dense_nnz_f_param_4,
.param .u32 slice_sparse_dense_nnz_f_param_5,
.param .u32 slice_sparse_dense_nnz_f_param_6,
.param .u32 slice_sparse_dense_nnz_f_param_7,
.param .u32 slice_sparse_dense_nnz_f_param_8
)
{
.reg .pred %p<6>;
.reg .f32 %f<2>;
.reg .b32 %r<22>;
.reg .b64 %rd<22>;
ld.param.u64 %rd5, [slice_sparse_dense_nnz_f_param_0];
ld.param.u64 %rd8, [slice_sparse_dense_nnz_f_param_1];
ld.param.u64 %rd6, [slice_sparse_dense_nnz_f_param_2];
ld.param.u64 %rd7, [slice_sparse_dense_nnz_f_param_3];
ld.param.u32 %r5, [slice_sparse_dense_nnz_f_param_4];
ld.param.u32 %r9, [slice_sparse_dense_nnz_f_param_5];
ld.param.u32 %r6, [slice_sparse_dense_nnz_f_param_6];
ld.param.u32 %r7, [slice_sparse_dense_nnz_f_param_7];
ld.param.u32 %r8, [slice_sparse_dense_nnz_f_param_8];
mov.u32 %r10, %ntid.x;
mov.u32 %r11, %ctaid.x;
mov.u32 %r12, %tid.x;
mad.lo.s32 %r13, %r10, %r11, %r12;
cvta.to.global.u64 %rd1, %rd8;
// Absolute nonzero index = thread index + rowPtr[first-row].
mul.wide.s32 %rd9, %r5, 4;
add.s64 %rd10, %rd1, %rd9;
ld.global.u32 %r14, [%rd10];
add.s32 %r1, %r13, %r14;
// Exit when past rowPtr[last-row + 1].
mul.wide.s32 %rd11, %r9, 4;
add.s64 %rd12, %rd1, %rd11;
ld.global.u32 %r15, [%rd12+4];
setp.ge.s32 %p1, %r1, %r15;
@%p1 bra BB5_5;
cvta.to.global.u64 %rd2, %rd7;
cvta.to.global.u64 %rd3, %rd5;
cvta.to.global.u64 %rd13, %rd6;
cvt.s64.s32 %rd4, %r1;
// Column index of this nonzero; skip if outside [first-col, last-col].
mul.wide.s32 %rd14, %r1, 4;
add.s64 %rd15, %rd13, %rd14;
ld.global.u32 %r2, [%rd15];
setp.lt.s32 %p2, %r2, %r6;
setp.gt.s32 %p3, %r2, %r7;
or.pred %p4, %p2, %p3;
@%p4 bra BB5_5;
// Linear scan of rowPtr to find the owning row (r3).
mov.u32 %r21, %r5;
BB5_3:
mov.u32 %r3, %r21;
add.s32 %r4, %r3, 1;
mul.wide.s32 %rd16, %r4, 4;
add.s64 %rd17, %rd1, %rd16;
ld.global.u32 %r16, [%rd17];
setp.le.s32 %p5, %r16, %r1;
mov.u32 %r21, %r4;
@%p5 bra BB5_3;
// Load value (nnz-index * 4 bytes for f32).
shl.b64 %rd18, %rd4, 2;
add.s64 %rd19, %rd3, %rd18;
ld.global.f32 %f1, [%rd19];
// Dense destination index = (row - first-row) * out-row-length
//                           - first-column + column.
sub.s32 %r17, %r3, %r5;
mul.lo.s32 %r18, %r17, %r8;
sub.s32 %r19, %r18, %r6;
add.s32 %r20, %r19, %r2;
mul.wide.s32 %rd20, %r20, 4;
add.s64 %rd21, %rd2, %rd20;
st.global.f32 [%rd21], %f1;
BB5_5:
ret;
}
// .globl slice_dense_dense_d
// Kernel: copy a rectangular slice of a dense f64 matrix into a dense output,
// one thread per output element.
// Observed roles (inferred — confirm against host code): param_0 = input,
// param_1 = output, param_2 (r3) = row offset, param_4 (r4) = column offset,
// param_6 (r5) = input row length, param_7 (r7) = output row count,
// param_8 (r6) = output row length. params 3 and 5 are declared but never read.
.visible .entry slice_dense_dense_d(
.param .u64 slice_dense_dense_d_param_0,
.param .u64 slice_dense_dense_d_param_1,
.param .u32 slice_dense_dense_d_param_2,
.param .u32 slice_dense_dense_d_param_3,
.param .u32 slice_dense_dense_d_param_4,
.param .u32 slice_dense_dense_d_param_5,
.param .u32 slice_dense_dense_d_param_6,
.param .u32 slice_dense_dense_d_param_7,
.param .u32 slice_dense_dense_d_param_8
)
{
.reg .pred %p<4>;
.reg .b32 %r<15>;
.reg .f64 %fd<2>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [slice_dense_dense_d_param_0];
ld.param.u64 %rd2, [slice_dense_dense_d_param_1];
ld.param.u32 %r3, [slice_dense_dense_d_param_2];
ld.param.u32 %r4, [slice_dense_dense_d_param_4];
ld.param.u32 %r5, [slice_dense_dense_d_param_6];
ld.param.u32 %r7, [slice_dense_dense_d_param_7];
ld.param.u32 %r6, [slice_dense_dense_d_param_8];
mov.u32 %r8, %ctaid.x;
mov.u32 %r9, %ntid.x;
mov.u32 %r10, %tid.x;
mad.lo.s32 %r1, %r9, %r8, %r10;
// Output row r2 = index / out-row-length; guard also requires the row
// length to be non-negative (compiler-generated division guard).
div.s32 %r2, %r1, %r6;
setp.lt.s32 %p1, %r2, %r7;
setp.gt.s32 %p2, %r6, -1;
and.pred %p3, %p1, %p2;
@!%p3 bra BB6_2;
bra.uni BB6_1;
BB6_1:
// Output column = index % out-row-length; source index =
// (row + row-offset) * in-row-length + (col + col-offset).
rem.s32 %r11, %r1, %r6;
cvta.to.global.u64 %rd3, %rd1;
add.s32 %r12, %r2, %r3;
add.s32 %r13, %r11, %r4;
mad.lo.s32 %r14, %r12, %r5, %r13;
mul.wide.s32 %rd4, %r14, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd1, [%rd5];
cvta.to.global.u64 %rd6, %rd2;
mul.wide.s32 %rd7, %r1, 8;
add.s64 %rd8, %rd6, %rd7;
st.global.f64 [%rd8], %fd1;
BB6_2:
ret;
}
// .globl slice_dense_dense_f
// Kernel: f32 variant of slice_dense_dense_d (see that kernel for parameter
// roles). Identical control flow with 4-byte f32 elements.
.visible .entry slice_dense_dense_f(
.param .u64 slice_dense_dense_f_param_0,
.param .u64 slice_dense_dense_f_param_1,
.param .u32 slice_dense_dense_f_param_2,
.param .u32 slice_dense_dense_f_param_3,
.param .u32 slice_dense_dense_f_param_4,
.param .u32 slice_dense_dense_f_param_5,
.param .u32 slice_dense_dense_f_param_6,
.param .u32 slice_dense_dense_f_param_7,
.param .u32 slice_dense_dense_f_param_8
)
{
.reg .pred %p<4>;
.reg .f32 %f<2>;
.reg .b32 %r<15>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [slice_dense_dense_f_param_0];
ld.param.u64 %rd2, [slice_dense_dense_f_param_1];
ld.param.u32 %r3, [slice_dense_dense_f_param_2];
ld.param.u32 %r4, [slice_dense_dense_f_param_4];
ld.param.u32 %r5, [slice_dense_dense_f_param_6];
ld.param.u32 %r7, [slice_dense_dense_f_param_7];
ld.param.u32 %r6, [slice_dense_dense_f_param_8];
mov.u32 %r8, %ctaid.x;
mov.u32 %r9, %ntid.x;
mov.u32 %r10, %tid.x;
mad.lo.s32 %r1, %r9, %r8, %r10;
// Output row = index / out-row-length; guarded against out-of-range rows
// and negative row length.
div.s32 %r2, %r1, %r6;
setp.lt.s32 %p1, %r2, %r7;
setp.gt.s32 %p2, %r6, -1;
and.pred %p3, %p1, %p2;
@!%p3 bra BB7_2;
bra.uni BB7_1;
BB7_1:
// Source index = (row + row-offset) * in-row-length + (col + col-offset).
rem.s32 %r11, %r1, %r6;
cvta.to.global.u64 %rd3, %rd1;
add.s32 %r12, %r2, %r3;
add.s32 %r13, %r11, %r4;
mad.lo.s32 %r14, %r12, %r5, %r13;
mul.wide.s32 %rd4, %r14, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f1, [%rd5];
cvta.to.global.u64 %rd6, %rd2;
mul.wide.s32 %rd7, %r1, 4;
add.s64 %rd8, %rd6, %rd7;
st.global.f32 [%rd8], %f1;
BB7_2:
ret;
}
// .globl copy_u2l_dense_d
// Kernel: mirror the upper triangle of a square dense f64 matrix into the
// lower triangle, in place. One thread per source element.
// param_0 = matrix, param_1 (r3) = matrix dimension (row length),
// param_2 (r4) = total element count bound for the destination index.
.visible .entry copy_u2l_dense_d(
.param .u64 copy_u2l_dense_d_param_0,
.param .u32 copy_u2l_dense_d_param_1,
.param .u32 copy_u2l_dense_d_param_2
)
{
.reg .pred %p<4>;
.reg .b32 %r<10>;
.reg .f64 %fd<2>;
.reg .b64 %rd<7>;
ld.param.u64 %rd1, [copy_u2l_dense_d_param_0];
ld.param.u32 %r3, [copy_u2l_dense_d_param_1];
ld.param.u32 %r4, [copy_u2l_dense_d_param_2];
mov.u32 %r5, %ntid.x;
mov.u32 %r6, %ctaid.x;
mov.u32 %r7, %tid.x;
mad.lo.s32 %r1, %r5, %r6, %r7;
// Decompose linear index: row r8 = idx / dim, col r9 = idx % dim;
// transposed destination index r2 = col * dim + row.
div.s32 %r8, %r1, %r3;
rem.s32 %r9, %r1, %r3;
mad.lo.s32 %r2, %r9, %r3, %r8;
// Copy only strictly-upper elements (col > row) whose mirrored
// destination is within bounds.
setp.gt.s32 %p1, %r9, %r8;
setp.lt.s32 %p2, %r2, %r4;
and.pred %p3, %p1, %p2;
@!%p3 bra BB8_2;
bra.uni BB8_1;
BB8_1:
// mat[col*dim + row] = mat[row*dim + col]
cvta.to.global.u64 %rd2, %rd1;
mul.wide.s32 %rd3, %r1, 8;
add.s64 %rd4, %rd2, %rd3;
ld.global.f64 %fd1, [%rd4];
mul.wide.s32 %rd5, %r2, 8;
add.s64 %rd6, %rd2, %rd5;
st.global.f64 [%rd6], %fd1;
BB8_2:
ret;
}
// .globl copy_u2l_dense_f
// Kernel: f32 variant of copy_u2l_dense_d — mirror the upper triangle of a
// square dense matrix into the lower triangle in place.
.visible .entry copy_u2l_dense_f(
.param .u64 copy_u2l_dense_f_param_0,
.param .u32 copy_u2l_dense_f_param_1,
.param .u32 copy_u2l_dense_f_param_2
)
{
.reg .pred %p<4>;
.reg .f32 %f<2>;
.reg .b32 %r<10>;
.reg .b64 %rd<7>;
ld.param.u64 %rd1, [copy_u2l_dense_f_param_0];
ld.param.u32 %r3, [copy_u2l_dense_f_param_1];
ld.param.u32 %r4, [copy_u2l_dense_f_param_2];
mov.u32 %r5, %ntid.x;
mov.u32 %r6, %ctaid.x;
mov.u32 %r7, %tid.x;
mad.lo.s32 %r1, %r5, %r6, %r7;
// row = idx / dim, col = idx % dim, destination = col * dim + row.
div.s32 %r8, %r1, %r3;
rem.s32 %r9, %r1, %r3;
mad.lo.s32 %r2, %r9, %r3, %r8;
// Only strictly-upper elements with in-bounds destination.
setp.gt.s32 %p1, %r9, %r8;
setp.lt.s32 %p2, %r2, %r4;
and.pred %p3, %p1, %p2;
@!%p3 bra BB9_2;
bra.uni BB9_1;
BB9_1:
cvta.to.global.u64 %rd2, %rd1;
mul.wide.s32 %rd3, %r1, 4;
add.s64 %rd4, %rd2, %rd3;
ld.global.f32 %f1, [%rd4];
mul.wide.s32 %rd5, %r2, 4;
add.s64 %rd6, %rd2, %rd5;
st.global.f32 [%rd6], %f1;
BB9_2:
ret;
}
// .globl relu_d
// Kernel: elementwise ReLU on f64 data: out[i] = max(0.0, in[i]).
// param_0 = input, param_1 = output, param_2 (r2) = row count,
// param_3 (r3) = row length; guard is (idx / rowLen) < rows && rowLen >= 0.
.visible .entry relu_d(
.param .u64 relu_d_param_0,
.param .u64 relu_d_param_1,
.param .u32 relu_d_param_2,
.param .u32 relu_d_param_3
)
{
.reg .pred %p<4>;
.reg .b32 %r<8>;
.reg .f64 %fd<4>;
.reg .b64 %rd<8>;
ld.param.u64 %rd1, [relu_d_param_0];
ld.param.u64 %rd2, [relu_d_param_1];
ld.param.u32 %r2, [relu_d_param_2];
ld.param.u32 %r3, [relu_d_param_3];
mov.u32 %r4, %ctaid.x;
mov.u32 %r5, %ntid.x;
mov.u32 %r6, %tid.x;
mad.lo.s32 %r1, %r5, %r4, %r6;
// Bounds guard via row decomposition.
div.s32 %r7, %r1, %r3;
setp.lt.s32 %p1, %r7, %r2;
setp.gt.s32 %p2, %r3, -1;
and.pred %p3, %p1, %p2;
@!%p3 bra BB10_2;
bra.uni BB10_1;
BB10_1:
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd1, [%rd5];
// max(0.0, x)
mov.f64 %fd2, 0d0000000000000000;
max.f64 %fd3, %fd2, %fd1;
cvta.to.global.u64 %rd6, %rd2;
add.s64 %rd7, %rd6, %rd4;
st.global.f64 [%rd7], %fd3;
BB10_2:
ret;
}
// .globl relu_f
// Kernel: elementwise ReLU on f32 data: out[i] = max(0.0f, in[i]).
// NOTE(review): the generated code widens to f64, takes max.f64, then narrows
// back to f32 (cvt.f64.f32 / max.f64 / cvt.rn.f32.f64) instead of using
// max.f32 directly — harmless for ReLU but worth knowing when reading.
.visible .entry relu_f(
.param .u64 relu_f_param_0,
.param .u64 relu_f_param_1,
.param .u32 relu_f_param_2,
.param .u32 relu_f_param_3
)
{
.reg .pred %p<4>;
.reg .f32 %f<3>;
.reg .b32 %r<8>;
.reg .f64 %fd<4>;
.reg .b64 %rd<8>;
ld.param.u64 %rd1, [relu_f_param_0];
ld.param.u64 %rd2, [relu_f_param_1];
ld.param.u32 %r2, [relu_f_param_2];
ld.param.u32 %r3, [relu_f_param_3];
mov.u32 %r4, %ctaid.x;
mov.u32 %r5, %ntid.x;
mov.u32 %r6, %tid.x;
mad.lo.s32 %r1, %r5, %r4, %r6;
// Bounds guard: (idx / rowLen) < rows && rowLen >= 0.
div.s32 %r7, %r1, %r3;
setp.lt.s32 %p1, %r7, %r2;
setp.gt.s32 %p2, %r3, -1;
and.pred %p3, %p1, %p2;
@!%p3 bra BB11_2;
bra.uni BB11_1;
BB11_1:
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f1, [%rd5];
// Widen, max against 0.0, narrow back.
cvt.f64.f32 %fd1, %f1;
mov.f64 %fd2, 0d0000000000000000;
max.f64 %fd3, %fd2, %fd1;
cvt.rn.f32.f64 %f2, %fd3;
cvta.to.global.u64 %rd6, %rd2;
add.s64 %rd7, %rd6, %rd4;
st.global.f32 [%rd7], %f2;
BB11_2:
ret;
}
// .globl relu_backward_d
// Kernel: ReLU backward pass on f64 data:
//   out[i] = (X[i] > 0.0) ? dout[i] : 0.0
// param_0 = X (forward input), param_1 = dout (upstream gradient),
// param_2 = output gradient, param_3 (r2) = rows, param_4 (r3) = row length.
// The leu (<= or unordered) predicate makes NaN inputs produce 0.0.
.visible .entry relu_backward_d(
.param .u64 relu_backward_d_param_0,
.param .u64 relu_backward_d_param_1,
.param .u64 relu_backward_d_param_2,
.param .u32 relu_backward_d_param_3,
.param .u32 relu_backward_d_param_4
)
{
.reg .pred %p<5>;
.reg .b32 %r<8>;
.reg .f64 %fd<6>;
.reg .b64 %rd<14>;
ld.param.u64 %rd2, [relu_backward_d_param_0];
ld.param.u64 %rd3, [relu_backward_d_param_1];
ld.param.u64 %rd4, [relu_backward_d_param_2];
ld.param.u32 %r2, [relu_backward_d_param_3];
ld.param.u32 %r3, [relu_backward_d_param_4];
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %ctaid.x;
mov.u32 %r6, %tid.x;
mad.lo.s32 %r1, %r4, %r5, %r6;
// Bounds guard via row decomposition.
div.s32 %r7, %r1, %r3;
setp.lt.s32 %p1, %r7, %r2;
setp.gt.s32 %p2, %r3, -1;
and.pred %p3, %p1, %p2;
@!%p3 bra BB12_4;
bra.uni BB12_1;
BB12_1:
cvta.to.global.u64 %rd5, %rd2;
cvt.s64.s32 %rd1, %r1;
mul.wide.s32 %rd6, %r1, 8;
add.s64 %rd7, %rd5, %rd6;
ld.global.f64 %fd4, [%rd7];
// Default result 0.0; overwritten with dout[i] only when X[i] > 0.
mov.f64 %fd5, 0d0000000000000000;
setp.leu.f64 %p4, %fd4, 0d0000000000000000;
@%p4 bra BB12_3;
cvta.to.global.u64 %rd8, %rd3;
shl.b64 %rd9, %rd1, 3;
add.s64 %rd10, %rd8, %rd9;
ld.global.f64 %fd5, [%rd10];
BB12_3:
cvta.to.global.u64 %rd11, %rd4;
shl.b64 %rd12, %rd1, 3;
add.s64 %rd13, %rd11, %rd12;
st.global.f64 [%rd13], %fd5;
BB12_4:
ret;
}
// .globl relu_backward_f
// Kernel: f32 variant of relu_backward_d:
//   out[i] = (X[i] > 0.0f) ? dout[i] : 0.0f
// NaN inputs yield 0.0f via the leu (<= or unordered) predicate.
.visible .entry relu_backward_f(
.param .u64 relu_backward_f_param_0,
.param .u64 relu_backward_f_param_1,
.param .u64 relu_backward_f_param_2,
.param .u32 relu_backward_f_param_3,
.param .u32 relu_backward_f_param_4
)
{
.reg .pred %p<5>;
.reg .f32 %f<6>;
.reg .b32 %r<8>;
.reg .b64 %rd<14>;
ld.param.u64 %rd2, [relu_backward_f_param_0];
ld.param.u64 %rd3, [relu_backward_f_param_1];
ld.param.u64 %rd4, [relu_backward_f_param_2];
ld.param.u32 %r2, [relu_backward_f_param_3];
ld.param.u32 %r3, [relu_backward_f_param_4];
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %ctaid.x;
mov.u32 %r6, %tid.x;
mad.lo.s32 %r1, %r4, %r5, %r6;
// Bounds guard via row decomposition.
div.s32 %r7, %r1, %r3;
setp.lt.s32 %p1, %r7, %r2;
setp.gt.s32 %p2, %r3, -1;
and.pred %p3, %p1, %p2;
@!%p3 bra BB13_4;
bra.uni BB13_1;
BB13_1:
cvta.to.global.u64 %rd5, %rd2;
cvt.s64.s32 %rd1, %r1;
mul.wide.s32 %rd6, %r1, 4;
add.s64 %rd7, %rd5, %rd6;
ld.global.f32 %f4, [%rd7];
// Default 0.0f; replaced by dout[i] only when X[i] > 0.
mov.f32 %f5, 0f00000000;
setp.leu.f32 %p4, %f4, 0f00000000;
@%p4 bra BB13_3;
cvta.to.global.u64 %rd8, %rd3;
shl.b64 %rd9, %rd1, 2;
add.s64 %rd10, %rd8, %rd9;
ld.global.f32 %f5, [%rd10];
BB13_3:
cvta.to.global.u64 %rd11, %rd4;
shl.b64 %rd12, %rd1, 2;
add.s64 %rd13, %rd11, %rd12;
st.global.f32 [%rd13], %f5;
BB13_4:
ret;
}
// .globl inplace_add_d
// Kernel: in-place elementwise addition on f64 data: B[i] = B[i] + A[i].
// param_0 = A (read-only here), param_1 = B (read and written),
// param_2 (r2) = rows, param_3 (r3) = row length.
.visible .entry inplace_add_d(
.param .u64 inplace_add_d_param_0,
.param .u64 inplace_add_d_param_1,
.param .u32 inplace_add_d_param_2,
.param .u32 inplace_add_d_param_3
)
{
.reg .pred %p<4>;
.reg .b32 %r<8>;
.reg .f64 %fd<4>;
.reg .b64 %rd<8>;
ld.param.u64 %rd1, [inplace_add_d_param_0];
ld.param.u64 %rd2, [inplace_add_d_param_1];
ld.param.u32 %r2, [inplace_add_d_param_2];
ld.param.u32 %r3, [inplace_add_d_param_3];
mov.u32 %r4, %ctaid.x;
mov.u32 %r5, %ntid.x;
mov.u32 %r6, %tid.x;
mad.lo.s32 %r1, %r5, %r4, %r6;
// Bounds guard: (idx / rowLen) < rows && rowLen >= 0.
div.s32 %r7, %r1, %r3;
setp.lt.s32 %p1, %r7, %r2;
setp.gt.s32 %p2, %r3, -1;
and.pred %p3, %p1, %p2;
@!%p3 bra BB14_2;
bra.uni BB14_1;
BB14_1:
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 8;
add.s64 %rd5, %rd3, %rd4;
cvta.to.global.u64 %rd6, %rd2;
add.s64 %rd7, %rd6, %rd4;
// B[i] += A[i]
ld.global.f64 %fd1, [%rd7];
ld.global.f64 %fd2, [%rd5];
add.f64 %fd3, %fd2, %fd1;
st.global.f64 [%rd7], %fd3;
BB14_2:
ret;
}
// .globl inplace_add_f
// Kernel: f32 variant of inplace_add_d: B[i] = B[i] + A[i].
.visible .entry inplace_add_f(
.param .u64 inplace_add_f_param_0,
.param .u64 inplace_add_f_param_1,
.param .u32 inplace_add_f_param_2,
.param .u32 inplace_add_f_param_3
)
{
.reg .pred %p<4>;
.reg .f32 %f<4>;
.reg .b32 %r<8>;
.reg .b64 %rd<8>;
ld.param.u64 %rd1, [inplace_add_f_param_0];
ld.param.u64 %rd2, [inplace_add_f_param_1];
ld.param.u32 %r2, [inplace_add_f_param_2];
ld.param.u32 %r3, [inplace_add_f_param_3];
mov.u32 %r4, %ctaid.x;
mov.u32 %r5, %ntid.x;
mov.u32 %r6, %tid.x;
mad.lo.s32 %r1, %r5, %r4, %r6;
// Bounds guard: (idx / rowLen) < rows && rowLen >= 0.
div.s32 %r7, %r1, %r3;
setp.lt.s32 %p1, %r7, %r2;
setp.gt.s32 %p2, %r3, -1;
and.pred %p3, %p1, %p2;
@!%p3 bra BB15_2;
bra.uni BB15_1;
BB15_1:
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 4;
add.s64 %rd5, %rd3, %rd4;
cvta.to.global.u64 %rd6, %rd2;
add.s64 %rd7, %rd6, %rd4;
// B[i] += A[i]
ld.global.f32 %f1, [%rd7];
ld.global.f32 %f2, [%rd5];
add.f32 %f3, %f2, %f1;
st.global.f32 [%rd7], %f3;
BB15_2:
ret;
}
// .globl bias_add_d
// Kernel: broadcast bias addition on f64 data:
//   out[i] = in[i] + bias[(i % param_4) / param_5]
// param_0 = input, param_1 = bias vector, param_2 = output,
// param_3 (r4) = rows, param_4 (r2) = row length, param_5 (r3) = divisor that
// groups columns onto bias entries (presumably channel size PQ in the
// SystemML bias_add — confirm against host code).
.visible .entry bias_add_d(
.param .u64 bias_add_d_param_0,
.param .u64 bias_add_d_param_1,
.param .u64 bias_add_d_param_2,
.param .u32 bias_add_d_param_3,
.param .u32 bias_add_d_param_4,
.param .u32 bias_add_d_param_5
)
{
.reg .pred %p<4>;
.reg .b32 %r<11>;
.reg .f64 %fd<4>;
.reg .b64 %rd<12>;
ld.param.u64 %rd1, [bias_add_d_param_0];
ld.param.u64 %rd2, [bias_add_d_param_1];
ld.param.u64 %rd3, [bias_add_d_param_2];
ld.param.u32 %r4, [bias_add_d_param_3];
ld.param.u32 %r2, [bias_add_d_param_4];
ld.param.u32 %r3, [bias_add_d_param_5];
mov.u32 %r5, %ctaid.x;
mov.u32 %r6, %ntid.x;
mov.u32 %r7, %tid.x;
mad.lo.s32 %r1, %r6, %r5, %r7;
// Bounds guard: (idx / rowLen) < rows && rowLen >= 0.
div.s32 %r8, %r1, %r2;
setp.lt.s32 %p1, %r8, %r4;
setp.gt.s32 %p2, %r2, -1;
and.pred %p3, %p1, %p2;
@!%p3 bra BB16_2;
bra.uni BB16_1;
BB16_1:
// Column within row, then bias index = column / param_5.
rem.s32 %r9, %r1, %r2;
cvta.to.global.u64 %rd4, %rd1;
mul.wide.s32 %rd5, %r1, 8;
add.s64 %rd6, %rd4, %rd5;
div.s32 %r10, %r9, %r3;
cvta.to.global.u64 %rd7, %rd2;
mul.wide.s32 %rd8, %r10, 8;
add.s64 %rd9, %rd7, %rd8;
ld.global.f64 %fd1, [%rd9];
ld.global.f64 %fd2, [%rd6];
add.f64 %fd3, %fd2, %fd1;
cvta.to.global.u64 %rd10, %rd3;
add.s64 %rd11, %rd10, %rd5;
st.global.f64 [%rd11], %fd3;
BB16_2:
ret;
}
// .globl bias_add_f
// Kernel: f32 variant of bias_add_d:
//   out[i] = in[i] + bias[(i % rowLen) / param_5]
.visible .entry bias_add_f(
.param .u64 bias_add_f_param_0,
.param .u64 bias_add_f_param_1,
.param .u64 bias_add_f_param_2,
.param .u32 bias_add_f_param_3,
.param .u32 bias_add_f_param_4,
.param .u32 bias_add_f_param_5
)
{
.reg .pred %p<4>;
.reg .f32 %f<4>;
.reg .b32 %r<11>;
.reg .b64 %rd<12>;
ld.param.u64 %rd1, [bias_add_f_param_0];
ld.param.u64 %rd2, [bias_add_f_param_1];
ld.param.u64 %rd3, [bias_add_f_param_2];
ld.param.u32 %r4, [bias_add_f_param_3];
ld.param.u32 %r2, [bias_add_f_param_4];
ld.param.u32 %r3, [bias_add_f_param_5];
mov.u32 %r5, %ctaid.x;
mov.u32 %r6, %ntid.x;
mov.u32 %r7, %tid.x;
mad.lo.s32 %r1, %r6, %r5, %r7;
// Bounds guard: (idx / rowLen) < rows && rowLen >= 0.
div.s32 %r8, %r1, %r2;
setp.lt.s32 %p1, %r8, %r4;
setp.gt.s32 %p2, %r2, -1;
and.pred %p3, %p1, %p2;
@!%p3 bra BB17_2;
bra.uni BB17_1;
BB17_1:
// Bias index = (idx % rowLen) / param_5.
rem.s32 %r9, %r1, %r2;
cvta.to.global.u64 %rd4, %rd1;
mul.wide.s32 %rd5, %r1, 4;
add.s64 %rd6, %rd4, %rd5;
div.s32 %r10, %r9, %r3;
cvta.to.global.u64 %rd7, %rd2;
mul.wide.s32 %rd8, %r10, 4;
add.s64 %rd9, %rd7, %rd8;
ld.global.f32 %f1, [%rd9];
ld.global.f32 %f2, [%rd6];
add.f32 %f3, %f2, %f1;
cvta.to.global.u64 %rd10, %rd3;
add.s64 %rd11, %rd10, %rd5;
st.global.f32 [%rd11], %f3;
BB17_2:
ret;
}
// .globl daxpy_matrix_vector_d
// Kernel: axpy between a dense f64 matrix and a broadcast vector:
//   ret[i] = x[k] * alpha + A[i]
// where k = column (i % rowLen) when param_6 == 1, else k = row (i / rowLen).
// param_0 = A, param_1 = x, param_2 = alpha (f64), param_3 = ret,
// param_4 (r5) = rows, param_5 (r3) = row length, param_6 (r4) = broadcast
// mode selector. NOTE(review): param_7 is declared but never read here.
.visible .entry daxpy_matrix_vector_d(
.param .u64 daxpy_matrix_vector_d_param_0,
.param .u64 daxpy_matrix_vector_d_param_1,
.param .f64 daxpy_matrix_vector_d_param_2,
.param .u64 daxpy_matrix_vector_d_param_3,
.param .u32 daxpy_matrix_vector_d_param_4,
.param .u32 daxpy_matrix_vector_d_param_5,
.param .u32 daxpy_matrix_vector_d_param_6,
.param .u32 daxpy_matrix_vector_d_param_7
)
{
.reg .pred %p<5>;
.reg .b32 %r<11>;
.reg .f64 %fd<7>;
.reg .b64 %rd<14>;
ld.param.u64 %rd3, [daxpy_matrix_vector_d_param_0];
ld.param.u64 %rd5, [daxpy_matrix_vector_d_param_1];
ld.param.f64 %fd2, [daxpy_matrix_vector_d_param_2];
ld.param.u64 %rd4, [daxpy_matrix_vector_d_param_3];
ld.param.u32 %r5, [daxpy_matrix_vector_d_param_4];
ld.param.u32 %r3, [daxpy_matrix_vector_d_param_5];
ld.param.u32 %r4, [daxpy_matrix_vector_d_param_6];
cvta.to.global.u64 %rd1, %rd5;
mov.u32 %r6, %ntid.x;
mov.u32 %r7, %ctaid.x;
mov.u32 %r8, %tid.x;
mad.lo.s32 %r9, %r6, %r7, %r8;
// Decompose: r1 = row, r2 = column.
div.s32 %r1, %r9, %r3;
rem.s32 %r2, %r9, %r3;
setp.lt.s32 %p1, %r1, %r5;
setp.gt.s32 %p2, %r3, -1;
and.pred %p3, %p1, %p2;
@!%p3 bra BB18_4;
bra.uni BB18_1;
BB18_1:
// Load A[row*rowLen + col]; rd2 = matching output address.
cvta.to.global.u64 %rd6, %rd4;
mad.lo.s32 %r10, %r1, %r3, %r2;
cvta.to.global.u64 %rd7, %rd3;
mul.wide.s32 %rd8, %r10, 8;
add.s64 %rd9, %rd7, %rd8;
ld.global.f64 %fd1, [%rd9];
add.s64 %rd2, %rd6, %rd8;
// Select vector index: param_6 == 1 -> by column, else by row.
setp.eq.s32 %p4, %r4, 1;
@%p4 bra BB18_3;
bra.uni BB18_2;
BB18_3:
// ret = x[col] * alpha + A[i]
mul.wide.s32 %rd12, %r2, 8;
add.s64 %rd13, %rd1, %rd12;
ld.global.f64 %fd5, [%rd13];
fma.rn.f64 %fd6, %fd5, %fd2, %fd1;
st.global.f64 [%rd2], %fd6;
bra.uni BB18_4;
BB18_2:
// ret = x[row] * alpha + A[i]
mul.wide.s32 %rd10, %r1, 8;
add.s64 %rd11, %rd1, %rd10;
ld.global.f64 %fd3, [%rd11];
fma.rn.f64 %fd4, %fd3, %fd2, %fd1;
st.global.f64 [%rd2], %fd4;
BB18_4:
ret;
}
// .globl daxpy_matrix_vector_f
// Kernel: f32 variant of daxpy_matrix_vector_d. alpha stays f64 and the fma
// is computed in f64 (operands widened with cvt.f64.f32, result narrowed with
// cvt.rn.f32.f64). param_7 is declared but never read, as in the f64 variant.
.visible .entry daxpy_matrix_vector_f(
.param .u64 daxpy_matrix_vector_f_param_0,
.param .u64 daxpy_matrix_vector_f_param_1,
.param .f64 daxpy_matrix_vector_f_param_2,
.param .u64 daxpy_matrix_vector_f_param_3,
.param .u32 daxpy_matrix_vector_f_param_4,
.param .u32 daxpy_matrix_vector_f_param_5,
.param .u32 daxpy_matrix_vector_f_param_6,
.param .u32 daxpy_matrix_vector_f_param_7
)
{
.reg .pred %p<5>;
.reg .f32 %f<6>;
.reg .b32 %r<11>;
.reg .f64 %fd<7>;
.reg .b64 %rd<14>;
ld.param.u64 %rd3, [daxpy_matrix_vector_f_param_0];
ld.param.u64 %rd5, [daxpy_matrix_vector_f_param_1];
ld.param.f64 %fd2, [daxpy_matrix_vector_f_param_2];
ld.param.u64 %rd4, [daxpy_matrix_vector_f_param_3];
ld.param.u32 %r5, [daxpy_matrix_vector_f_param_4];
ld.param.u32 %r3, [daxpy_matrix_vector_f_param_5];
ld.param.u32 %r4, [daxpy_matrix_vector_f_param_6];
cvta.to.global.u64 %rd1, %rd5;
mov.u32 %r6, %ntid.x;
mov.u32 %r7, %ctaid.x;
mov.u32 %r8, %tid.x;
mad.lo.s32 %r9, %r6, %r7, %r8;
// Decompose: r1 = row, r2 = column.
div.s32 %r1, %r9, %r3;
rem.s32 %r2, %r9, %r3;
setp.lt.s32 %p1, %r1, %r5;
setp.gt.s32 %p2, %r3, -1;
and.pred %p3, %p1, %p2;
@!%p3 bra BB19_4;
bra.uni BB19_1;
BB19_1:
// Load A[row*rowLen + col] and widen to f64.
cvta.to.global.u64 %rd6, %rd4;
mad.lo.s32 %r10, %r1, %r3, %r2;
cvta.to.global.u64 %rd7, %rd3;
mul.wide.s32 %rd8, %r10, 4;
add.s64 %rd9, %rd7, %rd8;
ld.global.f32 %f1, [%rd9];
cvt.f64.f32 %fd1, %f1;
add.s64 %rd2, %rd6, %rd8;
// Select vector index: param_6 == 1 -> by column, else by row.
setp.eq.s32 %p4, %r4, 1;
@%p4 bra BB19_3;
bra.uni BB19_2;
BB19_3:
// ret = (f32)(x[col] * alpha + A[i]) computed in f64.
mul.wide.s32 %rd12, %r2, 4;
add.s64 %rd13, %rd1, %rd12;
ld.global.f32 %f4, [%rd13];
cvt.f64.f32 %fd5, %f4;
fma.rn.f64 %fd6, %fd5, %fd2, %fd1;
cvt.rn.f32.f64 %f5, %fd6;
st.global.f32 [%rd2], %f5;
bra.uni BB19_4;
BB19_2:
// ret = (f32)(x[row] * alpha + A[i]) computed in f64.
mul.wide.s32 %rd10, %r1, 4;
add.s64 %rd11, %rd1, %rd10;
ld.global.f32 %f2, [%rd11];
cvt.f64.f32 %fd3, %f2;
fma.rn.f64 %fd4, %fd3, %fd2, %fd1;
cvt.rn.f32.f64 %f3, %fd4;
st.global.f32 [%rd2], %f3;
BB19_4:
ret;
}
// .globl bias_multiply_d
// Kernel: broadcast bias multiplication on f64 data — identical structure to
// bias_add_d but with mul.f64 instead of add.f64:
//   out[i] = in[i] * bias[(i % rowLen) / param_5]
.visible .entry bias_multiply_d(
.param .u64 bias_multiply_d_param_0,
.param .u64 bias_multiply_d_param_1,
.param .u64 bias_multiply_d_param_2,
.param .u32 bias_multiply_d_param_3,
.param .u32 bias_multiply_d_param_4,
.param .u32 bias_multiply_d_param_5
)
{
.reg .pred %p<4>;
.reg .b32 %r<11>;
.reg .f64 %fd<4>;
.reg .b64 %rd<12>;
ld.param.u64 %rd1, [bias_multiply_d_param_0];
ld.param.u64 %rd2, [bias_multiply_d_param_1];
ld.param.u64 %rd3, [bias_multiply_d_param_2];
ld.param.u32 %r4, [bias_multiply_d_param_3];
ld.param.u32 %r2, [bias_multiply_d_param_4];
ld.param.u32 %r3, [bias_multiply_d_param_5];
mov.u32 %r5, %ctaid.x;
mov.u32 %r6, %ntid.x;
mov.u32 %r7, %tid.x;
mad.lo.s32 %r1, %r6, %r5, %r7;
// Bounds guard: (idx / rowLen) < rows && rowLen >= 0.
div.s32 %r8, %r1, %r2;
setp.lt.s32 %p1, %r8, %r4;
setp.gt.s32 %p2, %r2, -1;
and.pred %p3, %p1, %p2;
@!%p3 bra BB20_2;
bra.uni BB20_1;
BB20_1:
// Bias index = (idx % rowLen) / param_5.
rem.s32 %r9, %r1, %r2;
cvta.to.global.u64 %rd4, %rd1;
mul.wide.s32 %rd5, %r1, 8;
add.s64 %rd6, %rd4, %rd5;
div.s32 %r10, %r9, %r3;
cvta.to.global.u64 %rd7, %rd2;
mul.wide.s32 %rd8, %r10, 8;
add.s64 %rd9, %rd7, %rd8;
ld.global.f64 %fd1, [%rd9];
ld.global.f64 %fd2, [%rd6];
mul.f64 %fd3, %fd2, %fd1;
cvta.to.global.u64 %rd10, %rd3;
add.s64 %rd11, %rd10, %rd5;
st.global.f64 [%rd11], %fd3;
BB20_2:
ret;
}
// .globl bias_multiply_f
// Kernel: f32 variant of bias_multiply_d:
//   out[i] = in[i] * bias[(i % rowLen) / param_5]
.visible .entry bias_multiply_f(
.param .u64 bias_multiply_f_param_0,
.param .u64 bias_multiply_f_param_1,
.param .u64 bias_multiply_f_param_2,
.param .u32 bias_multiply_f_param_3,
.param .u32 bias_multiply_f_param_4,
.param .u32 bias_multiply_f_param_5
)
{
.reg .pred %p<4>;
.reg .f32 %f<4>;
.reg .b32 %r<11>;
.reg .b64 %rd<12>;
ld.param.u64 %rd1, [bias_multiply_f_param_0];
ld.param.u64 %rd2, [bias_multiply_f_param_1];
ld.param.u64 %rd3, [bias_multiply_f_param_2];
ld.param.u32 %r4, [bias_multiply_f_param_3];
ld.param.u32 %r2, [bias_multiply_f_param_4];
ld.param.u32 %r3, [bias_multiply_f_param_5];
mov.u32 %r5, %ctaid.x;
mov.u32 %r6, %ntid.x;
mov.u32 %r7, %tid.x;
mad.lo.s32 %r1, %r6, %r5, %r7;
// Bounds guard: (idx / rowLen) < rows && rowLen >= 0.
div.s32 %r8, %r1, %r2;
setp.lt.s32 %p1, %r8, %r4;
setp.gt.s32 %p2, %r2, -1;
and.pred %p3, %p1, %p2;
@!%p3 bra BB21_2;
bra.uni BB21_1;
BB21_1:
// Bias index = (idx % rowLen) / param_5.
rem.s32 %r9, %r1, %r2;
cvta.to.global.u64 %rd4, %rd1;
mul.wide.s32 %rd5, %r1, 4;
add.s64 %rd6, %rd4, %rd5;
div.s32 %r10, %r9, %r3;
cvta.to.global.u64 %rd7, %rd2;
mul.wide.s32 %rd8, %r10, 4;
add.s64 %rd9, %rd7, %rd8;
ld.global.f32 %f1, [%rd9];
ld.global.f32 %f2, [%rd6];
mul.f32 %f3, %f2, %f1;
cvta.to.global.u64 %rd10, %rd3;
add.s64 %rd11, %rd10, %rd5;
st.global.f32 [%rd11], %f3;
BB21_2:
ret;
}
// .globl matrix_matrix_cellwise_op_d
// matrix_matrix_cellwise_op_d: double-precision cell-wise binary operation
//   C[cell] = A[aIdx] <op> B[bIdx]
// with optional row/column-vector broadcasting of either operand and the
// operation selected at runtime by an integer op code (param_7, %r9).
//
// Index computation (one cell per thread):
//   rowIdx  %r2  = tid / param_4
//   colIdx  %r14 = tid % param_4
//   cell    %r3  = rowIdx * param_4 + colIdx
// Broadcasting selectors: param_5 (%r7) picks A's index, param_6 (%r8)
// picks B's index — value 1 selects rowIdx, value 2 selects colIdx,
// anything else selects the full cell index (see the selp chains below).
//
// Op-code dispatch (a binary branch tree over %r9; mapping grounded in the
// per-label instructions below):
//   0 add   1 sub   2 mul   3 div   4 pow (__internal_accurate_pow + fixups)
//   5 <     6 <=    7 >     8 >=    9 ==   10 !=
//   11 min  12 max  13 AND  14 OR (operands rounded to int first)
//   15 1-a*b   16 (a!=0 ? a-b : 0)   17 a - b*floor(a/b) (NaN for b==+-0)
//   18 floor(a/b)
// Comparison ops produce 1.0 / 0.0. An unmatched op code leaves the default
// result DBL_MAX (0d7FEFFFFFFFFFFFFF) in %fd54.
.visible .entry matrix_matrix_cellwise_op_d(
.param .u64 matrix_matrix_cellwise_op_d_param_0,
.param .u64 matrix_matrix_cellwise_op_d_param_1,
.param .u64 matrix_matrix_cellwise_op_d_param_2,
.param .u32 matrix_matrix_cellwise_op_d_param_3,
.param .u32 matrix_matrix_cellwise_op_d_param_4,
.param .u32 matrix_matrix_cellwise_op_d_param_5,
.param .u32 matrix_matrix_cellwise_op_d_param_6,
.param .u32 matrix_matrix_cellwise_op_d_param_7
)
{
.reg .pred %p<77>;
.reg .b32 %r<56>;
.reg .f64 %fd<55>;
.reg .b64 %rd<19>;
ld.param.u64 %rd2, [matrix_matrix_cellwise_op_d_param_0];
ld.param.u64 %rd3, [matrix_matrix_cellwise_op_d_param_1];
ld.param.u64 %rd4, [matrix_matrix_cellwise_op_d_param_2];
ld.param.u32 %r10, [matrix_matrix_cellwise_op_d_param_3];
ld.param.u32 %r6, [matrix_matrix_cellwise_op_d_param_4];
ld.param.u32 %r7, [matrix_matrix_cellwise_op_d_param_5];
ld.param.u32 %r8, [matrix_matrix_cellwise_op_d_param_6];
ld.param.u32 %r9, [matrix_matrix_cellwise_op_d_param_7];
// Global thread index and bounds guard: (tid / param_4) < param_3 and
// param_4 >= 0; out-of-range threads exit at BB22_65.
mov.u32 %r11, %ctaid.x;
mov.u32 %r12, %ntid.x;
mov.u32 %r13, %tid.x;
mad.lo.s32 %r1, %r12, %r11, %r13;
div.s32 %r2, %r1, %r6;
setp.lt.s32 %p2, %r2, %r10;
setp.gt.s32 %p3, %r6, -1;
and.pred %p4, %p2, %p3;
@!%p4 bra BB22_65;
bra.uni BB22_1;
BB22_1:
// Compute broadcast-aware load indices for A (%r16) and B (%r18).
rem.s32 %r14, %r1, %r6;
cvta.to.global.u64 %rd5, %rd2;
mad.lo.s32 %r3, %r2, %r6, %r14;
setp.eq.s32 %p5, %r7, 2;
selp.b32 %r15, %r14, %r3, %p5;
setp.eq.s32 %p6, %r7, 1;
selp.b32 %r16, %r2, %r15, %p6;
setp.eq.s32 %p7, %r8, 2;
selp.b32 %r17, %r14, %r3, %p7;
setp.eq.s32 %p8, %r8, 1;
selp.b32 %r18, %r2, %r17, %p8;
// Load the two operands: %fd1 = A[aIdx], %fd2 = B[bIdx].
mul.wide.s32 %rd6, %r16, 8;
add.s64 %rd7, %rd5, %rd6;
ld.global.f64 %fd1, [%rd7];
cvta.to.global.u64 %rd8, %rd3;
mul.wide.s32 %rd9, %r18, 8;
add.s64 %rd10, %rd8, %rd9;
ld.global.f64 %fd2, [%rd10];
// Default result DBL_MAX; dispatch tree on op code %r9 starts here.
mov.f64 %fd54, 0d7FEFFFFFFFFFFFFF;
setp.gt.s32 %p9, %r9, 8;
@%p9 bra BB22_18;
setp.gt.s32 %p23, %r9, 3;
@%p23 bra BB22_10;
setp.gt.s32 %p30, %r9, 1;
@%p30 bra BB22_7;
setp.eq.s32 %p33, %r9, 0;
@%p33 bra BB22_63;
bra.uni BB22_5;
// op 0: addition
BB22_63:
add.f64 %fd54, %fd1, %fd2;
bra.uni BB22_64;
BB22_18:
setp.gt.s32 %p10, %r9, 13;
@%p10 bra BB22_27;
setp.gt.s32 %p17, %r9, 10;
@%p17 bra BB22_23;
setp.eq.s32 %p21, %r9, 9;
@%p21 bra BB22_45;
bra.uni BB22_21;
// op 9: equality -> 1.0 / 0.0
BB22_45:
setp.eq.f64 %p50, %fd1, %fd2;
selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p50;
bra.uni BB22_64;
BB22_10:
setp.gt.s32 %p24, %r9, 5;
@%p24 bra BB22_14;
setp.eq.s32 %p28, %r9, 4;
@%p28 bra BB22_48;
bra.uni BB22_12;
// op 4: pow(a, b). Calls __internal_accurate_pow on (|a|, b), then patches
// sign and IEEE special cases (negative base, zero base, inf/nan operands)
// in BB22_49..BB22_61.
BB22_48:
{
.reg .b32 %temp;
mov.b64 {%temp, %r4}, %fd1;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r5}, %fd2;
}
// NOTE(review): the shift/compare below appears to be the standard CUDA
// odd-integer-exponent test (shifted mantissa == 0x8000000000000000) —
// confirm against the CUDA math library pow implementation.
bfe.u32 %r31, %r5, 20, 11;
add.s32 %r32, %r31, -1012;
mov.b64 %rd15, %fd2;
shl.b64 %rd1, %rd15, %r32;
setp.eq.s64 %p55, %rd1, -9223372036854775808;
abs.f64 %fd19, %fd1;
// Callseq Start 0
{
.reg .b32 temp_param_reg;
// }
.param .b64 param0;
st.param.f64 [param0+0], %fd19;
.param .b64 param1;
st.param.f64 [param1+0], %fd2;
.param .b64 retval0;
call.uni (retval0),
__internal_accurate_pow,
(
param0,
param1
);
ld.param.f64 %fd53, [retval0+0];
//{
}// Callseq End 0
// %p1 = (a < 0) && (b is odd integer): result sign must be negated.
setp.lt.s32 %p56, %r4, 0;
and.pred %p1, %p56, %p55;
@!%p1 bra BB22_50;
bra.uni BB22_49;
BB22_49:
// Flip the sign bit of the pow result.
{
.reg .b32 %temp;
mov.b64 {%temp, %r33}, %fd53;
}
xor.b32 %r34, %r33, -2147483648;
{
.reg .b32 %temp;
mov.b64 {%r35, %temp}, %fd53;
}
mov.b64 %fd53, {%r35, %r34};
BB22_50:
mov.f64 %fd52, %fd53;
setp.eq.f64 %p57, %fd1, 0d0000000000000000;
@%p57 bra BB22_53;
bra.uni BB22_51;
// Zero base: build +/-0 or +/-inf depending on exponent sign/oddness.
BB22_53:
selp.b32 %r36, %r4, 0, %p55;
or.b32 %r37, %r36, 2146435072;
setp.lt.s32 %p61, %r5, 0;
selp.b32 %r38, %r37, %r36, %p61;
mov.u32 %r39, 0;
mov.b64 %fd52, {%r39, %r38};
bra.uni BB22_54;
BB22_27:
setp.gt.s32 %p11, %r9, 15;
@%p11 bra BB22_31;
setp.eq.s32 %p15, %r9, 14;
@%p15 bra BB22_42;
bra.uni BB22_29;
// op 14: logical OR — operands rounded to nearest int, result 1.0 if either
// is nonzero.
BB22_42:
cvt.rni.s64.f64 %rd11, %fd1;
cvt.rni.s64.f64 %rd12, %fd2;
cvt.u32.u64 %r25, %rd11;
cvt.u32.u64 %r26, %rd12;
or.b32 %r27, %r26, %r25;
setp.eq.s32 %p47, %r27, 0;
selp.f64 %fd54, 0d0000000000000000, 0d3FF0000000000000, %p47;
bra.uni BB22_64;
BB22_7:
setp.eq.s32 %p31, %r9, 2;
@%p31 bra BB22_62;
bra.uni BB22_8;
// op 2: multiplication
BB22_62:
mul.f64 %fd54, %fd1, %fd2;
bra.uni BB22_64;
BB22_23:
setp.eq.s32 %p18, %r9, 11;
@%p18 bra BB22_44;
setp.eq.s32 %p19, %r9, 12;
@%p19 bra BB22_43;
bra.uni BB22_25;
// op 12: maximum
BB22_43:
max.f64 %fd54, %fd1, %fd2;
bra.uni BB22_64;
BB22_14:
setp.eq.s32 %p25, %r9, 6;
@%p25 bra BB22_47;
setp.eq.s32 %p26, %r9, 7;
@%p26 bra BB22_46;
bra.uni BB22_16;
// op 7: greater-than -> 1.0 / 0.0
BB22_46:
setp.gt.f64 %p52, %fd1, %fd2;
selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p52;
bra.uni BB22_64;
BB22_31:
setp.eq.s32 %p12, %r9, 16;
@%p12 bra BB22_41;
setp.eq.s32 %p13, %r9, 17;
@%p13 bra BB22_37;
bra.uni BB22_33;
// op 17: remainder a - b*floor(a/b); NaN when b is +0 or -0; if a/b is
// inf/nan the raw quotient is returned unchanged.
BB22_37:
setp.eq.f64 %p39, %fd2, 0d0000000000000000;
setp.eq.f64 %p40, %fd2, 0d8000000000000000;
or.pred %p41, %p39, %p40;
mov.f64 %fd54, 0d7FF8000000000000;
@%p41 bra BB22_64;
div.rn.f64 %fd54, %fd1, %fd2;
abs.f64 %fd39, %fd54;
setp.gtu.f64 %p42, %fd39, 0d7FF0000000000000;
@%p42 bra BB22_64;
{
.reg .b32 %temp;
mov.b64 {%r22, %temp}, %fd54;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r23}, %fd54;
}
// Skip the floor-correction when the quotient is exactly +/-inf.
and.b32 %r24, %r23, 2147483647;
setp.ne.s32 %p43, %r24, 2146435072;
setp.ne.s32 %p44, %r22, 0;
or.pred %p45, %p43, %p44;
@!%p45 bra BB22_64;
bra.uni BB22_40;
BB22_40:
cvt.rmi.f64.f64 %fd40, %fd54;
mul.f64 %fd41, %fd2, %fd40;
sub.f64 %fd54, %fd1, %fd41;
bra.uni BB22_64;
BB22_5:
setp.eq.s32 %p34, %r9, 1;
@%p34 bra BB22_6;
bra.uni BB22_64;
// op 1: subtraction
BB22_6:
sub.f64 %fd54, %fd1, %fd2;
bra.uni BB22_64;
BB22_21:
setp.eq.s32 %p22, %r9, 10;
@%p22 bra BB22_22;
bra.uni BB22_64;
// op 10: not-equal (unordered) -> 1.0 / 0.0
BB22_22:
setp.neu.f64 %p49, %fd1, %fd2;
selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p49;
bra.uni BB22_64;
BB22_12:
setp.eq.s32 %p29, %r9, 5;
@%p29 bra BB22_13;
bra.uni BB22_64;
// op 5: less-than -> 1.0 / 0.0
BB22_13:
setp.lt.f64 %p54, %fd1, %fd2;
selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p54;
bra.uni BB22_64;
BB22_29:
setp.eq.s32 %p16, %r9, 15;
@%p16 bra BB22_30;
bra.uni BB22_64;
// op 15: 1 - a*b
BB22_30:
mul.f64 %fd43, %fd1, %fd2;
mov.f64 %fd44, 0d3FF0000000000000;
sub.f64 %fd54, %fd44, %fd43;
bra.uni BB22_64;
BB22_8:
setp.eq.s32 %p32, %r9, 3;
@%p32 bra BB22_9;
bra.uni BB22_64;
// op 3: division
BB22_9:
div.rn.f64 %fd54, %fd1, %fd2;
bra.uni BB22_64;
// op 11: minimum
BB22_44:
min.f64 %fd54, %fd1, %fd2;
bra.uni BB22_64;
BB22_25:
setp.eq.s32 %p20, %r9, 13;
@%p20 bra BB22_26;
bra.uni BB22_64;
// op 13: logical AND — operands rounded to nearest int, result 1.0 only if
// both are nonzero.
BB22_26:
cvt.rni.s64.f64 %rd13, %fd1;
cvt.rni.s64.f64 %rd14, %fd2;
cvt.u32.u64 %r28, %rd13;
cvt.u32.u64 %r29, %rd14;
and.b32 %r30, %r29, %r28;
setp.eq.s32 %p48, %r30, 0;
selp.f64 %fd54, 0d0000000000000000, 0d3FF0000000000000, %p48;
bra.uni BB22_64;
// op 6: less-or-equal -> 1.0 / 0.0
BB22_47:
setp.le.f64 %p53, %fd1, %fd2;
selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p53;
bra.uni BB22_64;
BB22_16:
setp.eq.s32 %p27, %r9, 8;
@%p27 bra BB22_17;
bra.uni BB22_64;
// op 8: greater-or-equal -> 1.0 / 0.0
BB22_17:
setp.ge.f64 %p51, %fd1, %fd2;
selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p51;
bra.uni BB22_64;
// op 16: (a != 0) ? a - b : 0
BB22_41:
setp.neu.f64 %p46, %fd1, 0d0000000000000000;
sub.f64 %fd42, %fd1, %fd2;
selp.f64 %fd54, %fd42, 0d0000000000000000, %p46;
bra.uni BB22_64;
// op 18: floor division floor(a/b); inf/nan quotients pass through.
BB22_33:
setp.ne.s32 %p14, %r9, 18;
@%p14 bra BB22_64;
div.rn.f64 %fd54, %fd1, %fd2;
abs.f64 %fd37, %fd54;
setp.gtu.f64 %p35, %fd37, 0d7FF0000000000000;
@%p35 bra BB22_64;
{
.reg .b32 %temp;
mov.b64 {%r19, %temp}, %fd54;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r20}, %fd54;
}
and.b32 %r21, %r20, 2147483647;
setp.ne.s32 %p36, %r21, 2146435072;
setp.ne.s32 %p37, %r19, 0;
or.pred %p38, %p36, %p37;
@!%p38 bra BB22_64;
bra.uni BB22_36;
BB22_36:
cvt.rmi.f64.f64 %fd54, %fd54;
bra.uni BB22_64;
// pow fixups continued: negative base with non-integer exponent -> NaN.
BB22_51:
setp.gt.s32 %p58, %r4, -1;
@%p58 bra BB22_54;
cvt.rzi.f64.f64 %fd45, %fd2;
setp.neu.f64 %p59, %fd45, %fd2;
selp.f64 %fd52, 0dFFF8000000000000, %fd52, %p59;
BB22_54:
// pow fixups: handle inf/nan in either operand; a+b propagates NaNs.
mov.f64 %fd25, %fd52;
add.f64 %fd26, %fd1, %fd2;
{
.reg .b32 %temp;
mov.b64 {%temp, %r40}, %fd26;
}
and.b32 %r41, %r40, 2146435072;
setp.ne.s32 %p62, %r41, 2146435072;
mov.f64 %fd51, %fd25;
@%p62 bra BB22_61;
setp.gtu.f64 %p63, %fd19, 0d7FF0000000000000;
mov.f64 %fd51, %fd26;
@%p63 bra BB22_61;
abs.f64 %fd46, %fd2;
setp.gtu.f64 %p64, %fd46, 0d7FF0000000000000;
mov.f64 %fd50, %fd26;
mov.f64 %fd51, %fd50;
@%p64 bra BB22_61;
{
.reg .b32 %temp;
mov.b64 {%r42, %temp}, %fd2;
}
and.b32 %r43, %r5, 2147483647;
setp.eq.s32 %p65, %r43, 2146435072;
setp.eq.s32 %p66, %r42, 0;
and.pred %p67, %p65, %p66;
@%p67 bra BB22_60;
bra.uni BB22_58;
// Exponent is +/-inf: result 0, 1 or inf depending on |a| vs 1.
BB22_60:
setp.gt.f64 %p71, %fd19, 0d3FF0000000000000;
selp.b32 %r51, 2146435072, 0, %p71;
xor.b32 %r52, %r51, 2146435072;
setp.lt.s32 %p72, %r5, 0;
selp.b32 %r53, %r52, %r51, %p72;
setp.eq.f64 %p73, %fd1, 0dBFF0000000000000;
selp.b32 %r54, 1072693248, %r53, %p73;
mov.u32 %r55, 0;
mov.b64 %fd51, {%r55, %r54};
bra.uni BB22_61;
BB22_58:
// Base is +/-inf: construct signed 0 or inf from exponent sign/oddness.
{
.reg .b32 %temp;
mov.b64 {%r44, %temp}, %fd1;
}
and.b32 %r45, %r4, 2147483647;
setp.eq.s32 %p68, %r45, 2146435072;
setp.eq.s32 %p69, %r44, 0;
and.pred %p70, %p68, %p69;
mov.f64 %fd51, %fd25;
@!%p70 bra BB22_61;
bra.uni BB22_59;
BB22_59:
shr.s32 %r46, %r5, 31;
and.b32 %r47, %r46, -2146435072;
selp.b32 %r48, -1048576, 2146435072, %p1;
add.s32 %r49, %r48, %r47;
mov.u32 %r50, 0;
mov.b64 %fd51, {%r50, %r49};
BB22_61:
// pow final override: pow(1, y) == 1 and pow(x, 0) == 1.
setp.eq.f64 %p74, %fd2, 0d0000000000000000;
setp.eq.f64 %p75, %fd1, 0d3FF0000000000000;
or.pred %p76, %p75, %p74;
selp.f64 %fd54, 0d3FF0000000000000, %fd51, %p76;
BB22_64:
// Store C[cell] = result and synchronize the block.
cvta.to.global.u64 %rd16, %rd4;
mul.wide.s32 %rd17, %r3, 8;
add.s64 %rd18, %rd16, %rd17;
st.global.f64 [%rd18], %fd54;
bar.sync 0;
BB22_65:
ret;
}
// .globl matrix_matrix_cellwise_op_f
// matrix_matrix_cellwise_op_f: single-precision variant of the cell-wise
// binary op kernel: C[cell] = A[aIdx] <op> B[bIdx] with row/column-vector
// broadcasting of either operand (param_5 / param_6 select the index form)
// and the op chosen by the op code in param_7 (%r7).
//
// Op-code table (same as the double kernel; grounded in the labels below):
//   0 add   1 sub   2 mul   3 div   4 pow (inlined expansion, BB23_46..59)
//   5 <     6 <=    7 >     8 >=    9 ==   10 !=
//   11 min  12 max  13 AND  14 OR (operands rounded to int)
//   15 1-a*b   16 (a!=0 ? a-b : 0)   17 remainder   18 floor(a/b)
// Comparison ops produce 1.0f / 0.0f. Unmatched op codes leave the default
// FLT_MAX (0f7F7FFFFF) in %f133.
.visible .entry matrix_matrix_cellwise_op_f(
.param .u64 matrix_matrix_cellwise_op_f_param_0,
.param .u64 matrix_matrix_cellwise_op_f_param_1,
.param .u64 matrix_matrix_cellwise_op_f_param_2,
.param .u32 matrix_matrix_cellwise_op_f_param_3,
.param .u32 matrix_matrix_cellwise_op_f_param_4,
.param .u32 matrix_matrix_cellwise_op_f_param_5,
.param .u32 matrix_matrix_cellwise_op_f_param_6,
.param .u32 matrix_matrix_cellwise_op_f_param_7
)
{
.reg .pred %p<76>;
.reg .f32 %f<134>;
.reg .b32 %r<42>;
.reg .b64 %rd<17>;
ld.param.u64 %rd1, [matrix_matrix_cellwise_op_f_param_0];
ld.param.u64 %rd2, [matrix_matrix_cellwise_op_f_param_1];
ld.param.u64 %rd3, [matrix_matrix_cellwise_op_f_param_2];
ld.param.u32 %r8, [matrix_matrix_cellwise_op_f_param_3];
ld.param.u32 %r4, [matrix_matrix_cellwise_op_f_param_4];
ld.param.u32 %r5, [matrix_matrix_cellwise_op_f_param_5];
ld.param.u32 %r6, [matrix_matrix_cellwise_op_f_param_6];
ld.param.u32 %r7, [matrix_matrix_cellwise_op_f_param_7];
// Global thread index and bounds guard: (tid / param_4) < param_3,
// param_4 >= 0; failing threads exit at BB23_63.
mov.u32 %r9, %ntid.x;
mov.u32 %r10, %ctaid.x;
mov.u32 %r11, %tid.x;
mad.lo.s32 %r1, %r9, %r10, %r11;
div.s32 %r2, %r1, %r4;
setp.lt.s32 %p2, %r2, %r8;
setp.gt.s32 %p3, %r4, -1;
and.pred %p4, %p2, %p3;
@!%p4 bra BB23_63;
bra.uni BB23_1;
BB23_1:
// Broadcast-aware indices: selector 1 -> rowIdx (%r2), 2 -> colIdx (%r12),
// else full cell index (%r3 = rowIdx * param_4 + colIdx).
rem.s32 %r12, %r1, %r4;
cvta.to.global.u64 %rd4, %rd1;
mad.lo.s32 %r3, %r2, %r4, %r12;
setp.eq.s32 %p5, %r5, 2;
selp.b32 %r13, %r12, %r3, %p5;
setp.eq.s32 %p6, %r5, 1;
selp.b32 %r14, %r2, %r13, %p6;
setp.eq.s32 %p7, %r6, 2;
selp.b32 %r15, %r12, %r3, %p7;
setp.eq.s32 %p8, %r6, 1;
selp.b32 %r16, %r2, %r15, %p8;
// Load operands: %f1 = A[aIdx], %f2 = B[bIdx].
mul.wide.s32 %rd5, %r14, 4;
add.s64 %rd6, %rd4, %rd5;
ld.global.f32 %f1, [%rd6];
cvta.to.global.u64 %rd7, %rd2;
mul.wide.s32 %rd8, %r16, 4;
add.s64 %rd9, %rd7, %rd8;
ld.global.f32 %f2, [%rd9];
// Default result FLT_MAX; dispatch tree over op code %r7 starts here.
mov.f32 %f133, 0f7F7FFFFF;
setp.gt.s32 %p9, %r7, 8;
@%p9 bra BB23_18;
setp.gt.s32 %p23, %r7, 3;
@%p23 bra BB23_10;
setp.gt.s32 %p30, %r7, 1;
@%p30 bra BB23_7;
setp.eq.s32 %p33, %r7, 0;
@%p33 bra BB23_61;
bra.uni BB23_5;
// op 0: addition
BB23_61:
add.f32 %f133, %f1, %f2;
bra.uni BB23_62;
BB23_18:
setp.gt.s32 %p10, %r7, 13;
@%p10 bra BB23_27;
setp.gt.s32 %p17, %r7, 10;
@%p17 bra BB23_23;
setp.eq.s32 %p21, %r7, 9;
@%p21 bra BB23_43;
bra.uni BB23_21;
// op 9: equality -> 1.0f / 0.0f
BB23_43:
setp.eq.f32 %p44, %f1, %f2;
selp.f32 %f133, 0f3F800000, 0f00000000, %p44;
bra.uni BB23_62;
BB23_10:
setp.gt.s32 %p24, %r7, 5;
@%p24 bra BB23_14;
setp.eq.s32 %p28, %r7, 4;
@%p28 bra BB23_46;
bra.uni BB23_12;
// op 4: pow(a, b), fully inlined by the compiler. NOTE(review): the long
// sequence below appears to be the standard CUDA single-precision powf
// expansion (extended-precision log2 of |a|, multiply by b, exp2 via
// ex2.approx, then IEEE special-case fixups in BB23_48..BB23_59) — confirm
// against the CUDA math library if exact semantics matter.
BB23_46:
mul.f32 %f53, %f2, 0f3F000000;
cvt.rzi.f32.f32 %f54, %f53;
fma.rn.f32 %f55, %f54, 0fC0000000, %f2;
abs.f32 %f19, %f55;
abs.f32 %f20, %f1;
// Normalize subnormal |a| by scaling before extracting the exponent bits.
setp.lt.f32 %p49, %f20, 0f00800000;
mul.f32 %f56, %f20, 0f4B800000;
selp.f32 %f57, 0fC3170000, 0fC2FE0000, %p49;
selp.f32 %f58, %f56, %f20, %p49;
mov.b32 %r23, %f58;
and.b32 %r24, %r23, 8388607;
or.b32 %r25, %r24, 1065353216;
mov.b32 %f59, %r25;
shr.u32 %r26, %r23, 23;
cvt.rn.f32.u32 %f60, %r26;
add.f32 %f61, %f57, %f60;
setp.gt.f32 %p50, %f59, 0f3FB504F3;
mul.f32 %f62, %f59, 0f3F000000;
add.f32 %f63, %f61, 0f3F800000;
selp.f32 %f64, %f62, %f59, %p50;
selp.f32 %f65, %f63, %f61, %p50;
add.f32 %f66, %f64, 0fBF800000;
add.f32 %f50, %f64, 0f3F800000;
// inline asm
rcp.approx.ftz.f32 %f49,%f50;
// inline asm
add.f32 %f67, %f66, %f66;
mul.f32 %f68, %f49, %f67;
mul.f32 %f69, %f68, %f68;
mov.f32 %f70, 0f3C4CAF63;
mov.f32 %f71, 0f3B18F0FE;
fma.rn.f32 %f72, %f71, %f69, %f70;
mov.f32 %f73, 0f3DAAAABD;
fma.rn.f32 %f74, %f72, %f69, %f73;
mul.rn.f32 %f75, %f74, %f69;
mul.rn.f32 %f76, %f75, %f68;
sub.f32 %f77, %f66, %f68;
neg.f32 %f78, %f68;
add.f32 %f79, %f77, %f77;
fma.rn.f32 %f80, %f78, %f66, %f79;
mul.rn.f32 %f81, %f49, %f80;
add.f32 %f82, %f76, %f68;
sub.f32 %f83, %f68, %f82;
add.f32 %f84, %f76, %f83;
add.f32 %f85, %f81, %f84;
add.f32 %f86, %f82, %f85;
sub.f32 %f87, %f82, %f86;
add.f32 %f88, %f85, %f87;
mov.f32 %f89, 0f3F317200;
mul.rn.f32 %f90, %f65, %f89;
mov.f32 %f91, 0f35BFBE8E;
mul.rn.f32 %f92, %f65, %f91;
add.f32 %f93, %f90, %f86;
sub.f32 %f94, %f90, %f93;
add.f32 %f95, %f86, %f94;
add.f32 %f96, %f88, %f95;
add.f32 %f97, %f92, %f96;
add.f32 %f98, %f93, %f97;
sub.f32 %f99, %f93, %f98;
add.f32 %f100, %f97, %f99;
abs.f32 %f21, %f2;
setp.gt.f32 %p51, %f21, 0f77F684DF;
mul.f32 %f101, %f2, 0f39000000;
selp.f32 %f102, %f101, %f2, %p51;
mul.rn.f32 %f103, %f102, %f98;
neg.f32 %f104, %f103;
fma.rn.f32 %f105, %f102, %f98, %f104;
fma.rn.f32 %f106, %f102, %f100, %f105;
mov.f32 %f107, 0f00000000;
fma.rn.f32 %f108, %f107, %f98, %f106;
add.rn.f32 %f109, %f103, %f108;
neg.f32 %f110, %f109;
add.rn.f32 %f111, %f103, %f110;
add.rn.f32 %f112, %f111, %f108;
mov.b32 %r27, %f109;
setp.eq.s32 %p52, %r27, 1118925336;
add.s32 %r28, %r27, -1;
mov.b32 %f113, %r28;
add.f32 %f114, %f112, 0f37000000;
selp.f32 %f115, %f113, %f109, %p52;
selp.f32 %f22, %f114, %f112, %p52;
mul.f32 %f116, %f115, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f117, %f116;
mov.f32 %f118, 0fBF317200;
fma.rn.f32 %f119, %f117, %f118, %f115;
mov.f32 %f120, 0fB5BFBE8E;
fma.rn.f32 %f121, %f117, %f120, %f119;
mul.f32 %f52, %f121, 0f3FB8AA3B;
// inline asm
ex2.approx.ftz.f32 %f51,%f52;
// inline asm
add.f32 %f122, %f117, 0f00000000;
ex2.approx.f32 %f123, %f122;
mul.f32 %f124, %f51, %f123;
// Clamp: underflow to 0, overflow to +inf.
setp.lt.f32 %p53, %f115, 0fC2D20000;
selp.f32 %f125, 0f00000000, %f124, %p53;
setp.gt.f32 %p54, %f115, 0f42D20000;
selp.f32 %f131, 0f7F800000, %f125, %p54;
setp.eq.f32 %p55, %f131, 0f7F800000;
@%p55 bra BB23_48;
fma.rn.f32 %f131, %f131, %f22, %f131;
BB23_48:
// Sign fixup: negate when a < 0 and b is an odd integer (%p1).
setp.lt.f32 %p56, %f1, 0f00000000;
setp.eq.f32 %p57, %f19, 0f3F800000;
and.pred %p1, %p56, %p57;
mov.b32 %r29, %f131;
xor.b32 %r30, %r29, -2147483648;
mov.b32 %f126, %r30;
selp.f32 %f132, %f126, %f131, %p1;
setp.eq.f32 %p58, %f1, 0f00000000;
@%p58 bra BB23_51;
bra.uni BB23_49;
// Zero base: build +/-0 or +/-inf depending on exponent sign/oddness.
BB23_51:
add.f32 %f128, %f1, %f1;
mov.b32 %r31, %f128;
selp.b32 %r32, %r31, 0, %p57;
or.b32 %r33, %r32, 2139095040;
setp.lt.f32 %p62, %f2, 0f00000000;
selp.b32 %r34, %r33, %r32, %p62;
mov.b32 %f132, %r34;
bra.uni BB23_52;
BB23_27:
setp.gt.s32 %p11, %r7, 15;
@%p11 bra BB23_31;
setp.eq.s32 %p15, %r7, 14;
@%p15 bra BB23_40;
bra.uni BB23_29;
// op 14: logical OR of operands rounded to nearest int.
BB23_40:
cvt.rni.s64.f32 %rd10, %f1;
cvt.rni.s64.f32 %rd11, %f2;
cvt.u32.u64 %r17, %rd10;
cvt.u32.u64 %r18, %rd11;
or.b32 %r19, %r18, %r17;
setp.eq.s32 %p41, %r19, 0;
selp.f32 %f133, 0f00000000, 0f3F800000, %p41;
bra.uni BB23_62;
BB23_7:
setp.eq.s32 %p31, %r7, 2;
@%p31 bra BB23_60;
bra.uni BB23_8;
// op 2: multiplication
BB23_60:
mul.f32 %f133, %f1, %f2;
bra.uni BB23_62;
BB23_23:
setp.eq.s32 %p18, %r7, 11;
@%p18 bra BB23_42;
setp.eq.s32 %p19, %r7, 12;
@%p19 bra BB23_41;
bra.uni BB23_25;
// op 12: maximum
BB23_41:
max.f32 %f133, %f1, %f2;
bra.uni BB23_62;
BB23_14:
setp.eq.s32 %p25, %r7, 6;
@%p25 bra BB23_45;
setp.eq.s32 %p26, %r7, 7;
@%p26 bra BB23_44;
bra.uni BB23_16;
// op 7: greater-than -> 1.0f / 0.0f
BB23_44:
setp.gt.f32 %p46, %f1, %f2;
selp.f32 %f133, 0f3F800000, 0f00000000, %p46;
bra.uni BB23_62;
BB23_31:
setp.eq.s32 %p12, %r7, 16;
@%p12 bra BB23_39;
setp.eq.s32 %p13, %r7, 17;
@%p13 bra BB23_36;
bra.uni BB23_33;
// op 17: remainder a - b*floor(a/b); NaN when b is +0 or -0; inf/nan
// quotients pass through unchanged.
BB23_36:
setp.eq.f32 %p36, %f2, 0f00000000;
setp.eq.f32 %p37, %f2, 0f80000000;
or.pred %p38, %p36, %p37;
mov.f32 %f133, 0f7FC00000;
@%p38 bra BB23_62;
div.rn.f32 %f133, %f1, %f2;
abs.f32 %f43, %f133;
setp.geu.f32 %p39, %f43, 0f7F800000;
@%p39 bra BB23_62;
cvt.rmi.f32.f32 %f44, %f133;
mul.f32 %f45, %f2, %f44;
sub.f32 %f133, %f1, %f45;
bra.uni BB23_62;
BB23_5:
setp.eq.s32 %p34, %r7, 1;
@%p34 bra BB23_6;
bra.uni BB23_62;
// op 1: subtraction
BB23_6:
sub.f32 %f133, %f1, %f2;
bra.uni BB23_62;
BB23_21:
setp.eq.s32 %p22, %r7, 10;
@%p22 bra BB23_22;
bra.uni BB23_62;
// op 10: not-equal (unordered) -> 1.0f / 0.0f
BB23_22:
setp.neu.f32 %p43, %f1, %f2;
selp.f32 %f133, 0f3F800000, 0f00000000, %p43;
bra.uni BB23_62;
BB23_12:
setp.eq.s32 %p29, %r7, 5;
@%p29 bra BB23_13;
bra.uni BB23_62;
// op 5: less-than -> 1.0f / 0.0f
BB23_13:
setp.lt.f32 %p48, %f1, %f2;
selp.f32 %f133, 0f3F800000, 0f00000000, %p48;
bra.uni BB23_62;
BB23_29:
setp.eq.s32 %p16, %r7, 15;
@%p16 bra BB23_30;
bra.uni BB23_62;
// op 15: 1 - a*b
BB23_30:
mul.f32 %f47, %f1, %f2;
mov.f32 %f48, 0f3F800000;
sub.f32 %f133, %f48, %f47;
bra.uni BB23_62;
BB23_8:
setp.eq.s32 %p32, %r7, 3;
@%p32 bra BB23_9;
bra.uni BB23_62;
// op 3: division
BB23_9:
div.rn.f32 %f133, %f1, %f2;
bra.uni BB23_62;
// op 11: minimum
BB23_42:
min.f32 %f133, %f1, %f2;
bra.uni BB23_62;
BB23_25:
setp.eq.s32 %p20, %r7, 13;
@%p20 bra BB23_26;
bra.uni BB23_62;
// op 13: logical AND of operands rounded to nearest int.
BB23_26:
cvt.rni.s64.f32 %rd12, %f1;
cvt.rni.s64.f32 %rd13, %f2;
cvt.u32.u64 %r20, %rd12;
cvt.u32.u64 %r21, %rd13;
and.b32 %r22, %r21, %r20;
setp.eq.s32 %p42, %r22, 0;
selp.f32 %f133, 0f00000000, 0f3F800000, %p42;
bra.uni BB23_62;
// op 6: less-or-equal -> 1.0f / 0.0f
BB23_45:
setp.le.f32 %p47, %f1, %f2;
selp.f32 %f133, 0f3F800000, 0f00000000, %p47;
bra.uni BB23_62;
BB23_16:
setp.eq.s32 %p27, %r7, 8;
@%p27 bra BB23_17;
bra.uni BB23_62;
// op 8: greater-or-equal -> 1.0f / 0.0f
BB23_17:
setp.ge.f32 %p45, %f1, %f2;
selp.f32 %f133, 0f3F800000, 0f00000000, %p45;
bra.uni BB23_62;
// op 16: (a != 0) ? a - b : 0
BB23_39:
setp.neu.f32 %p40, %f1, 0f00000000;
sub.f32 %f46, %f1, %f2;
selp.f32 %f133, %f46, 0f00000000, %p40;
bra.uni BB23_62;
// op 18: floor division floor(a/b); inf/nan quotients pass through.
BB23_33:
setp.ne.s32 %p14, %r7, 18;
@%p14 bra BB23_62;
div.rn.f32 %f133, %f1, %f2;
abs.f32 %f41, %f133;
setp.geu.f32 %p35, %f41, 0f7F800000;
@%p35 bra BB23_62;
cvt.rmi.f32.f32 %f133, %f133;
bra.uni BB23_62;
// pow fixups: negative base with non-integer exponent -> NaN.
BB23_49:
setp.geu.f32 %p59, %f1, 0f00000000;
@%p59 bra BB23_52;
cvt.rzi.f32.f32 %f127, %f2;
setp.neu.f32 %p60, %f127, %f2;
selp.f32 %f132, 0f7FFFFFFF, %f132, %p60;
BB23_52:
// pow fixups: inf/nan handling when |a|+|b| is not finite.
add.f32 %f129, %f20, %f21;
mov.b32 %r35, %f129;
setp.lt.s32 %p63, %r35, 2139095040;
@%p63 bra BB23_59;
setp.gtu.f32 %p64, %f20, 0f7F800000;
setp.gtu.f32 %p65, %f21, 0f7F800000;
or.pred %p66, %p64, %p65;
@%p66 bra BB23_58;
bra.uni BB23_54;
// NaN operand: propagate via a+b.
BB23_58:
add.f32 %f132, %f1, %f2;
bra.uni BB23_59;
BB23_54:
setp.eq.f32 %p67, %f21, 0f7F800000;
@%p67 bra BB23_57;
bra.uni BB23_55;
// Exponent is +/-inf: result 0, 1 or inf depending on |a| vs 1.
BB23_57:
setp.gt.f32 %p70, %f20, 0f3F800000;
selp.b32 %r39, 2139095040, 0, %p70;
xor.b32 %r40, %r39, 2139095040;
setp.lt.f32 %p71, %f2, 0f00000000;
selp.b32 %r41, %r40, %r39, %p71;
mov.b32 %f130, %r41;
setp.eq.f32 %p72, %f1, 0fBF800000;
selp.f32 %f132, 0f3F800000, %f130, %p72;
bra.uni BB23_59;
// Base is +/-inf: construct signed 0 or inf from exponent sign/oddness.
BB23_55:
setp.neu.f32 %p68, %f20, 0f7F800000;
@%p68 bra BB23_59;
setp.ge.f32 %p69, %f2, 0f00000000;
selp.b32 %r36, 2139095040, 0, %p69;
or.b32 %r37, %r36, -2147483648;
selp.b32 %r38, %r37, %r36, %p1;
mov.b32 %f132, %r38;
BB23_59:
// pow final override: pow(1, y) == 1 and pow(x, 0) == 1.
setp.eq.f32 %p73, %f2, 0f00000000;
setp.eq.f32 %p74, %f1, 0f3F800000;
or.pred %p75, %p74, %p73;
selp.f32 %f133, 0f3F800000, %f132, %p75;
BB23_62:
// Store C[cell] = result and synchronize the block.
cvta.to.global.u64 %rd14, %rd3;
mul.wide.s32 %rd15, %r3, 4;
add.s64 %rd16, %rd14, %rd15;
st.global.f32 [%rd16], %f133;
bar.sync 0;
BB23_63:
ret;
}
// .globl matrix_scalar_op_d
.visible .entry matrix_scalar_op_d(
.param .u64 matrix_scalar_op_d_param_0,
.param .f64 matrix_scalar_op_d_param_1,
.param .u64 matrix_scalar_op_d_param_2,
.param .u32 matrix_scalar_op_d_param_3,
.param .u32 matrix_scalar_op_d_param_4,
.param .u32 matrix_scalar_op_d_param_5
)
{
.reg .pred %p<141>;
.reg .b32 %r<86>;
.reg .f64 %fd<107>;
.reg .b64 %rd<20>;
ld.param.u64 %rd4, [matrix_scalar_op_d_param_0];
ld.param.f64 %fd68, [matrix_scalar_op_d_param_1];
ld.param.u64 %rd5, [matrix_scalar_op_d_param_2];
ld.param.u32 %r8, [matrix_scalar_op_d_param_3];
ld.param.u32 %r6, [matrix_scalar_op_d_param_4];
ld.param.u32 %r7, [matrix_scalar_op_d_param_5];
mov.u32 %r9, %ntid.x;
mov.u32 %r10, %ctaid.x;
mov.u32 %r11, %tid.x;
mad.lo.s32 %r1, %r9, %r10, %r11;
setp.ge.s32 %p3, %r1, %r8;
@%p3 bra BB24_130;
cvta.to.global.u64 %rd6, %rd5;
cvta.to.global.u64 %rd7, %rd4;
mul.wide.s32 %rd8, %r1, 8;
add.s64 %rd9, %rd7, %rd8;
ld.global.f64 %fd1, [%rd9];
add.s64 %rd1, %rd6, %rd8;
setp.eq.s32 %p4, %r7, 0;
@%p4 bra BB24_66;
mov.f64 %fd98, 0d7FEFFFFFFFFFFFFF;
setp.gt.s32 %p5, %r6, 8;
@%p5 bra BB24_19;
setp.gt.s32 %p19, %r6, 3;
@%p19 bra BB24_11;
setp.gt.s32 %p26, %r6, 1;
@%p26 bra BB24_8;
setp.eq.s32 %p29, %r6, 0;
@%p29 bra BB24_64;
bra.uni BB24_6;
BB24_64:
add.f64 %fd98, %fd1, %fd68;
bra.uni BB24_65;
BB24_66:
mov.f64 %fd106, 0d7FEFFFFFFFFFFFFF;
setp.gt.s32 %p73, %r6, 8;
@%p73 bra BB24_83;
setp.gt.s32 %p87, %r6, 3;
@%p87 bra BB24_75;
setp.gt.s32 %p94, %r6, 1;
@%p94 bra BB24_72;
setp.eq.s32 %p97, %r6, 0;
@%p97 bra BB24_128;
bra.uni BB24_70;
BB24_128:
add.f64 %fd106, %fd1, %fd68;
bra.uni BB24_129;
BB24_19:
setp.gt.s32 %p6, %r6, 13;
@%p6 bra BB24_28;
setp.gt.s32 %p13, %r6, 10;
@%p13 bra BB24_24;
setp.eq.s32 %p17, %r6, 9;
@%p17 bra BB24_46;
bra.uni BB24_22;
BB24_46:
setp.eq.f64 %p46, %fd1, %fd68;
selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p46;
bra.uni BB24_65;
BB24_83:
setp.gt.s32 %p74, %r6, 13;
@%p74 bra BB24_92;
setp.gt.s32 %p81, %r6, 10;
@%p81 bra BB24_88;
setp.eq.s32 %p85, %r6, 9;
@%p85 bra BB24_110;
bra.uni BB24_86;
BB24_110:
setp.eq.f64 %p114, %fd1, %fd68;
selp.f64 %fd106, 0d3FF0000000000000, 0d0000000000000000, %p114;
bra.uni BB24_129;
BB24_11:
setp.gt.s32 %p20, %r6, 5;
@%p20 bra BB24_15;
setp.eq.s32 %p24, %r6, 4;
@%p24 bra BB24_49;
bra.uni BB24_13;
BB24_49:
{
.reg .b32 %temp;
mov.b64 {%temp, %r2}, %fd68;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r3}, %fd1;
}
bfe.u32 %r24, %r3, 20, 11;
add.s32 %r25, %r24, -1012;
mov.b64 %rd14, %fd1;
shl.b64 %rd2, %rd14, %r25;
setp.eq.s64 %p51, %rd2, -9223372036854775808;
abs.f64 %fd18, %fd68;
// Callseq Start 1
{
.reg .b32 temp_param_reg;
// }
.param .b64 param0;
st.param.f64 [param0+0], %fd18;
.param .b64 param1;
st.param.f64 [param1+0], %fd1;
.param .b64 retval0;
call.uni (retval0),
__internal_accurate_pow,
(
param0,
param1
);
ld.param.f64 %fd97, [retval0+0];
//{
}// Callseq End 1
setp.lt.s32 %p52, %r2, 0;
and.pred %p1, %p52, %p51;
@!%p1 bra BB24_51;
bra.uni BB24_50;
BB24_50:
{
.reg .b32 %temp;
mov.b64 {%temp, %r26}, %fd97;
}
xor.b32 %r27, %r26, -2147483648;
{
.reg .b32 %temp;
mov.b64 {%r28, %temp}, %fd97;
}
mov.b64 %fd97, {%r28, %r27};
BB24_51:
mov.f64 %fd96, %fd97;
setp.eq.f64 %p53, %fd68, 0d0000000000000000;
@%p53 bra BB24_54;
bra.uni BB24_52;
BB24_54:
selp.b32 %r29, %r2, 0, %p51;
or.b32 %r30, %r29, 2146435072;
setp.lt.s32 %p57, %r3, 0;
selp.b32 %r31, %r30, %r29, %p57;
mov.u32 %r32, 0;
mov.b64 %fd96, {%r32, %r31};
bra.uni BB24_55;
BB24_28:
setp.gt.s32 %p7, %r6, 15;
@%p7 bra BB24_32;
setp.eq.s32 %p11, %r6, 14;
@%p11 bra BB24_43;
bra.uni BB24_30;
BB24_43:
cvt.rni.s64.f64 %rd10, %fd68;
cvt.rni.s64.f64 %rd11, %fd1;
cvt.u32.u64 %r18, %rd10;
cvt.u32.u64 %r19, %rd11;
or.b32 %r20, %r19, %r18;
setp.eq.s32 %p43, %r20, 0;
selp.f64 %fd98, 0d0000000000000000, 0d3FF0000000000000, %p43;
bra.uni BB24_65;
BB24_75:
setp.gt.s32 %p88, %r6, 5;
@%p88 bra BB24_79;
setp.eq.s32 %p92, %r6, 4;
@%p92 bra BB24_113;
bra.uni BB24_77;
BB24_113:
{
.reg .b32 %temp;
mov.b64 {%temp, %r4}, %fd1;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r5}, %fd68;
}
bfe.u32 %r61, %r5, 20, 11;
add.s32 %r62, %r61, -1012;
mov.b64 %rd19, %fd68;
shl.b64 %rd3, %rd19, %r62;
setp.eq.s64 %p119, %rd3, -9223372036854775808;
abs.f64 %fd51, %fd1;
// Callseq Start 2
{
.reg .b32 temp_param_reg;
// }
.param .b64 param0;
st.param.f64 [param0+0], %fd51;
.param .b64 param1;
st.param.f64 [param1+0], %fd68;
.param .b64 retval0;
call.uni (retval0),
__internal_accurate_pow,
(
param0,
param1
);
ld.param.f64 %fd105, [retval0+0];
//{
}// Callseq End 2
setp.lt.s32 %p120, %r4, 0;
and.pred %p2, %p120, %p119;
@!%p2 bra BB24_115;
bra.uni BB24_114;
BB24_114:
{
.reg .b32 %temp;
mov.b64 {%temp, %r63}, %fd105;
}
xor.b32 %r64, %r63, -2147483648;
{
.reg .b32 %temp;
mov.b64 {%r65, %temp}, %fd105;
}
mov.b64 %fd105, {%r65, %r64};
BB24_115:
mov.f64 %fd104, %fd105;
setp.eq.f64 %p121, %fd1, 0d0000000000000000;
@%p121 bra BB24_118;
bra.uni BB24_116;
BB24_118:
selp.b32 %r66, %r4, 0, %p119;
or.b32 %r67, %r66, 2146435072;
setp.lt.s32 %p125, %r5, 0;
selp.b32 %r68, %r67, %r66, %p125;
mov.u32 %r69, 0;
mov.b64 %fd104, {%r69, %r68};
bra.uni BB24_119;
BB24_92:
setp.gt.s32 %p75, %r6, 15;
@%p75 bra BB24_96;
setp.eq.s32 %p79, %r6, 14;
@%p79 bra BB24_107;
bra.uni BB24_94;
BB24_107:
cvt.rni.s64.f64 %rd15, %fd1;
cvt.rni.s64.f64 %rd16, %fd68;
cvt.u32.u64 %r55, %rd15;
cvt.u32.u64 %r56, %rd16;
or.b32 %r57, %r56, %r55;
setp.eq.s32 %p111, %r57, 0;
selp.f64 %fd106, 0d0000000000000000, 0d3FF0000000000000, %p111;
bra.uni BB24_129;
BB24_8:
setp.eq.s32 %p27, %r6, 2;
@%p27 bra BB24_63;
bra.uni BB24_9;
BB24_63:
mul.f64 %fd98, %fd1, %fd68;
bra.uni BB24_65;
BB24_24:
setp.eq.s32 %p14, %r6, 11;
@%p14 bra BB24_45;
setp.eq.s32 %p15, %r6, 12;
@%p15 bra BB24_44;
bra.uni BB24_26;
BB24_44:
max.f64 %fd98, %fd68, %fd1;
bra.uni BB24_65;
BB24_15:
setp.eq.s32 %p21, %r6, 6;
@%p21 bra BB24_48;
setp.eq.s32 %p22, %r6, 7;
@%p22 bra BB24_47;
bra.uni BB24_17;
BB24_47:
setp.lt.f64 %p48, %fd1, %fd68;
selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p48;
bra.uni BB24_65;
BB24_32:
setp.eq.s32 %p8, %r6, 16;
@%p8 bra BB24_42;
setp.eq.s32 %p9, %r6, 17;
@%p9 bra BB24_38;
bra.uni BB24_34;
BB24_38:
setp.eq.f64 %p35, %fd1, 0d0000000000000000;
setp.eq.f64 %p36, %fd1, 0d8000000000000000;
or.pred %p37, %p35, %p36;
mov.f64 %fd98, 0d7FF8000000000000;
@%p37 bra BB24_65;
div.rn.f64 %fd98, %fd68, %fd1;
abs.f64 %fd72, %fd98;
setp.gtu.f64 %p38, %fd72, 0d7FF0000000000000;
@%p38 bra BB24_65;
{
.reg .b32 %temp;
mov.b64 {%r15, %temp}, %fd98;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r16}, %fd98;
}
and.b32 %r17, %r16, 2147483647;
setp.ne.s32 %p39, %r17, 2146435072;
setp.ne.s32 %p40, %r15, 0;
or.pred %p41, %p39, %p40;
@!%p41 bra BB24_65;
bra.uni BB24_41;
BB24_41:
cvt.rmi.f64.f64 %fd73, %fd98;
mul.f64 %fd74, %fd1, %fd73;
sub.f64 %fd98, %fd68, %fd74;
bra.uni BB24_65;
BB24_72:
setp.eq.s32 %p95, %r6, 2;
@%p95 bra BB24_127;
bra.uni BB24_73;
BB24_127:
mul.f64 %fd106, %fd1, %fd68;
bra.uni BB24_129;
BB24_88:
setp.eq.s32 %p82, %r6, 11;
@%p82 bra BB24_109;
setp.eq.s32 %p83, %r6, 12;
@%p83 bra BB24_108;
bra.uni BB24_90;
BB24_108:
max.f64 %fd106, %fd1, %fd68;
bra.uni BB24_129;
BB24_79:
setp.eq.s32 %p89, %r6, 6;
@%p89 bra BB24_112;
setp.eq.s32 %p90, %r6, 7;
@%p90 bra BB24_111;
bra.uni BB24_81;
BB24_111:
setp.gt.f64 %p116, %fd1, %fd68;
selp.f64 %fd106, 0d3FF0000000000000, 0d0000000000000000, %p116;
bra.uni BB24_129;
BB24_96:
setp.eq.s32 %p76, %r6, 16;
@%p76 bra BB24_106;
setp.eq.s32 %p77, %r6, 17;
@%p77 bra BB24_102;
bra.uni BB24_98;
BB24_102:
setp.eq.f64 %p103, %fd68, 0d0000000000000000;
setp.eq.f64 %p104, %fd68, 0d8000000000000000;
or.pred %p105, %p103, %p104;
mov.f64 %fd106, 0d7FF8000000000000;
@%p105 bra BB24_129;
div.rn.f64 %fd106, %fd1, %fd68;
abs.f64 %fd83, %fd106;
setp.gtu.f64 %p106, %fd83, 0d7FF0000000000000;
@%p106 bra BB24_129;
{
.reg .b32 %temp;
mov.b64 {%r52, %temp}, %fd106;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r53}, %fd106;
}
and.b32 %r54, %r53, 2147483647;
setp.ne.s32 %p107, %r54, 2146435072;
setp.ne.s32 %p108, %r52, 0;
or.pred %p109, %p107, %p108;
@!%p109 bra BB24_129;
bra.uni BB24_105;
BB24_105:
cvt.rmi.f64.f64 %fd84, %fd106;
mul.f64 %fd85, %fd84, %fd68;
sub.f64 %fd106, %fd1, %fd85;
bra.uni BB24_129;
BB24_6:
setp.eq.s32 %p30, %r6, 1;
@%p30 bra BB24_7;
bra.uni BB24_65;
BB24_7:
sub.f64 %fd98, %fd68, %fd1;
bra.uni BB24_65;
BB24_22:
setp.eq.s32 %p18, %r6, 10;
@%p18 bra BB24_23;
bra.uni BB24_65;
BB24_23:
setp.neu.f64 %p45, %fd1, %fd68;
selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p45;
bra.uni BB24_65;
BB24_13:
setp.eq.s32 %p25, %r6, 5;
@%p25 bra BB24_14;
bra.uni BB24_65;
BB24_14:
setp.gt.f64 %p50, %fd1, %fd68;
selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p50;
bra.uni BB24_65;
BB24_30:
setp.eq.s32 %p12, %r6, 15;
@%p12 bra BB24_31;
bra.uni BB24_65;
BB24_31:
mul.f64 %fd76, %fd1, %fd68;
mov.f64 %fd77, 0d3FF0000000000000;
sub.f64 %fd98, %fd77, %fd76;
bra.uni BB24_65;
BB24_9:
setp.eq.s32 %p28, %r6, 3;
@%p28 bra BB24_10;
bra.uni BB24_65;
BB24_10:
div.rn.f64 %fd98, %fd68, %fd1;
bra.uni BB24_65;
BB24_45:
min.f64 %fd98, %fd68, %fd1;
bra.uni BB24_65;
BB24_26:
setp.eq.s32 %p16, %r6, 13;
@%p16 bra BB24_27;
bra.uni BB24_65;
BB24_27:
cvt.rni.s64.f64 %rd12, %fd68;
cvt.rni.s64.f64 %rd13, %fd1;
cvt.u32.u64 %r21, %rd12;
cvt.u32.u64 %r22, %rd13;
and.b32 %r23, %r22, %r21;
setp.eq.s32 %p44, %r23, 0;
selp.f64 %fd98, 0d0000000000000000, 0d3FF0000000000000, %p44;
bra.uni BB24_65;
BB24_48:
setp.ge.f64 %p49, %fd1, %fd68;
selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p49;
bra.uni BB24_65;
BB24_17:
setp.eq.s32 %p23, %r6, 8;
@%p23 bra BB24_18;
bra.uni BB24_65;
BB24_18:
setp.le.f64 %p47, %fd1, %fd68;
selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p47;
bra.uni BB24_65;
BB24_42:
setp.neu.f64 %p42, %fd68, 0d0000000000000000;
sub.f64 %fd75, %fd68, %fd1;
selp.f64 %fd98, %fd75, 0d0000000000000000, %p42;
bra.uni BB24_65;
BB24_34:
setp.ne.s32 %p10, %r6, 18;
@%p10 bra BB24_65;
div.rn.f64 %fd98, %fd68, %fd1;
abs.f64 %fd70, %fd98;
setp.gtu.f64 %p31, %fd70, 0d7FF0000000000000;
@%p31 bra BB24_65;
{
.reg .b32 %temp;
mov.b64 {%r12, %temp}, %fd98;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r13}, %fd98;
}
and.b32 %r14, %r13, 2147483647;
setp.ne.s32 %p32, %r14, 2146435072;
setp.ne.s32 %p33, %r12, 0;
or.pred %p34, %p32, %p33;
@!%p34 bra BB24_65;
bra.uni BB24_37;
BB24_37:
cvt.rmi.f64.f64 %fd98, %fd98;
bra.uni BB24_65;
BB24_70:
setp.eq.s32 %p98, %r6, 1;
@%p98 bra BB24_71;
bra.uni BB24_129;
BB24_71:
sub.f64 %fd106, %fd1, %fd68;
bra.uni BB24_129;
BB24_86:
setp.eq.s32 %p86, %r6, 10;
@%p86 bra BB24_87;
bra.uni BB24_129;
BB24_87:
setp.neu.f64 %p113, %fd1, %fd68;
selp.f64 %fd106, 0d3FF0000000000000, 0d0000000000000000, %p113;
bra.uni BB24_129;
BB24_77:
setp.eq.s32 %p93, %r6, 5;
@%p93 bra BB24_78;
bra.uni BB24_129;
BB24_78:
setp.lt.f64 %p118, %fd1, %fd68;
selp.f64 %fd106, 0d3FF0000000000000, 0d0000000000000000, %p118;
bra.uni BB24_129;
BB24_94:
setp.eq.s32 %p80, %r6, 15;
@%p80 bra BB24_95;
bra.uni BB24_129;
BB24_95:
mul.f64 %fd87, %fd1, %fd68;
mov.f64 %fd88, 0d3FF0000000000000;
sub.f64 %fd106, %fd88, %fd87;
bra.uni BB24_129;
BB24_73:
setp.eq.s32 %p96, %r6, 3;
@%p96 bra BB24_74;
bra.uni BB24_129;
BB24_74:
div.rn.f64 %fd106, %fd1, %fd68;
bra.uni BB24_129;
BB24_109:
min.f64 %fd106, %fd1, %fd68;
bra.uni BB24_129;
BB24_90:
setp.eq.s32 %p84, %r6, 13;
@%p84 bra BB24_91;
bra.uni BB24_129;
BB24_91:
cvt.rni.s64.f64 %rd17, %fd1;
cvt.rni.s64.f64 %rd18, %fd68;
cvt.u32.u64 %r58, %rd17;
cvt.u32.u64 %r59, %rd18;
and.b32 %r60, %r59, %r58;
setp.eq.s32 %p112, %r60, 0;
selp.f64 %fd106, 0d0000000000000000, 0d3FF0000000000000, %p112;
bra.uni BB24_129;
BB24_112:
setp.le.f64 %p117, %fd1, %fd68;
selp.f64 %fd106, 0d3FF0000000000000, 0d0000000000000000, %p117;
bra.uni BB24_129;
BB24_81:
setp.eq.s32 %p91, %r6, 8;
@%p91 bra BB24_82;
bra.uni BB24_129;
BB24_82:
setp.ge.f64 %p115, %fd1, %fd68;
selp.f64 %fd106, 0d3FF0000000000000, 0d0000000000000000, %p115;
bra.uni BB24_129;
BB24_106:
setp.neu.f64 %p110, %fd1, 0d0000000000000000;
sub.f64 %fd86, %fd1, %fd68;
selp.f64 %fd106, %fd86, 0d0000000000000000, %p110;
bra.uni BB24_129;
BB24_98:
setp.ne.s32 %p78, %r6, 18;
@%p78 bra BB24_129;
div.rn.f64 %fd106, %fd1, %fd68;
abs.f64 %fd81, %fd106;
setp.gtu.f64 %p99, %fd81, 0d7FF0000000000000;
@%p99 bra BB24_129;
{
.reg .b32 %temp;
mov.b64 {%r49, %temp}, %fd106;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r50}, %fd106;
}
and.b32 %r51, %r50, 2147483647;
setp.ne.s32 %p100, %r51, 2146435072;
setp.ne.s32 %p101, %r49, 0;
or.pred %p102, %p100, %p101;
@!%p102 bra BB24_129;
bra.uni BB24_101;
BB24_101:
cvt.rmi.f64.f64 %fd106, %fd106;
bra.uni BB24_129;
BB24_52:
setp.gt.s32 %p54, %r2, -1;
@%p54 bra BB24_55;
cvt.rzi.f64.f64 %fd78, %fd1;
setp.neu.f64 %p55, %fd78, %fd1;
selp.f64 %fd96, 0dFFF8000000000000, %fd96, %p55;
BB24_55:
mov.f64 %fd24, %fd96;
add.f64 %fd25, %fd1, %fd68;
{
.reg .b32 %temp;
mov.b64 {%temp, %r33}, %fd25;
}
and.b32 %r34, %r33, 2146435072;
setp.ne.s32 %p58, %r34, 2146435072;
mov.f64 %fd95, %fd24;
@%p58 bra BB24_62;
setp.gtu.f64 %p59, %fd18, 0d7FF0000000000000;
mov.f64 %fd95, %fd25;
@%p59 bra BB24_62;
abs.f64 %fd79, %fd1;
setp.gtu.f64 %p60, %fd79, 0d7FF0000000000000;
mov.f64 %fd94, %fd25;
mov.f64 %fd95, %fd94;
@%p60 bra BB24_62;
{
.reg .b32 %temp;
mov.b64 {%r35, %temp}, %fd1;
}
and.b32 %r36, %r3, 2147483647;
setp.eq.s32 %p61, %r36, 2146435072;
setp.eq.s32 %p62, %r35, 0;
and.pred %p63, %p61, %p62;
@%p63 bra BB24_61;
bra.uni BB24_59;
BB24_61:
setp.gt.f64 %p67, %fd18, 0d3FF0000000000000;
selp.b32 %r44, 2146435072, 0, %p67;
xor.b32 %r45, %r44, 2146435072;
setp.lt.s32 %p68, %r3, 0;
selp.b32 %r46, %r45, %r44, %p68;
setp.eq.f64 %p69, %fd68, 0dBFF0000000000000;
selp.b32 %r47, 1072693248, %r46, %p69;
mov.u32 %r48, 0;
mov.b64 %fd95, {%r48, %r47};
bra.uni BB24_62;
BB24_116:
setp.gt.s32 %p122, %r4, -1;
@%p122 bra BB24_119;
cvt.rzi.f64.f64 %fd89, %fd68;
setp.neu.f64 %p123, %fd89, %fd68;
selp.f64 %fd104, 0dFFF8000000000000, %fd104, %p123;
BB24_119:
mov.f64 %fd57, %fd104;
add.f64 %fd58, %fd1, %fd68;
{
.reg .b32 %temp;
mov.b64 {%temp, %r70}, %fd58;
}
and.b32 %r71, %r70, 2146435072;
setp.ne.s32 %p126, %r71, 2146435072;
mov.f64 %fd103, %fd57;
@%p126 bra BB24_126;
setp.gtu.f64 %p127, %fd51, 0d7FF0000000000000;
mov.f64 %fd103, %fd58;
@%p127 bra BB24_126;
abs.f64 %fd90, %fd68;
setp.gtu.f64 %p128, %fd90, 0d7FF0000000000000;
mov.f64 %fd102, %fd58;
mov.f64 %fd103, %fd102;
@%p128 bra BB24_126;
{
.reg .b32 %temp;
mov.b64 {%r72, %temp}, %fd68;
}
and.b32 %r73, %r5, 2147483647;
setp.eq.s32 %p129, %r73, 2146435072;
setp.eq.s32 %p130, %r72, 0;
and.pred %p131, %p129, %p130;
@%p131 bra BB24_125;
bra.uni BB24_123;
BB24_125:
setp.gt.f64 %p135, %fd51, 0d3FF0000000000000;
selp.b32 %r81, 2146435072, 0, %p135;
xor.b32 %r82, %r81, 2146435072;
setp.lt.s32 %p136, %r5, 0;
selp.b32 %r83, %r82, %r81, %p136;
setp.eq.f64 %p137, %fd1, 0dBFF0000000000000;
selp.b32 %r84, 1072693248, %r83, %p137;
mov.u32 %r85, 0;
mov.b64 %fd103, {%r85, %r84};
bra.uni BB24_126;
BB24_59:
{
.reg .b32 %temp;
mov.b64 {%r37, %temp}, %fd68;
}
and.b32 %r38, %r2, 2147483647;
setp.eq.s32 %p64, %r38, 2146435072;
setp.eq.s32 %p65, %r37, 0;
and.pred %p66, %p64, %p65;
mov.f64 %fd95, %fd24;
@!%p66 bra BB24_62;
bra.uni BB24_60;
BB24_60:
shr.s32 %r39, %r3, 31;
and.b32 %r40, %r39, -2146435072;
selp.b32 %r41, -1048576, 2146435072, %p1;
add.s32 %r42, %r41, %r40;
mov.u32 %r43, 0;
mov.b64 %fd95, {%r43, %r42};
BB24_62:
setp.eq.f64 %p70, %fd1, 0d0000000000000000;
setp.eq.f64 %p71, %fd68, 0d3FF0000000000000;
or.pred %p72, %p71, %p70;
selp.f64 %fd98, 0d3FF0000000000000, %fd95, %p72;
BB24_65:
st.global.f64 [%rd1], %fd98;
bra.uni BB24_130;
BB24_123:
{
.reg .b32 %temp;
mov.b64 {%r74, %temp}, %fd1;
}
and.b32 %r75, %r4, 2147483647;
setp.eq.s32 %p132, %r75, 2146435072;
setp.eq.s32 %p133, %r74, 0;
and.pred %p134, %p132, %p133;
mov.f64 %fd103, %fd57;
@!%p134 bra BB24_126;
bra.uni BB24_124;
BB24_124:
shr.s32 %r76, %r5, 31;
and.b32 %r77, %r76, -2146435072;
selp.b32 %r78, -1048576, 2146435072, %p2;
add.s32 %r79, %r78, %r77;
mov.u32 %r80, 0;
mov.b64 %fd103, {%r80, %r79};
BB24_126:
setp.eq.f64 %p138, %fd68, 0d0000000000000000;
setp.eq.f64 %p139, %fd1, 0d3FF0000000000000;
or.pred %p140, %p139, %p138;
selp.f64 %fd106, 0d3FF0000000000000, %fd103, %p140;
BB24_129:
st.global.f64 [%rd1], %fd106;
BB24_130:
bar.sync 0;
ret;
}
// .globl matrix_scalar_op_f
//
// matrix_scalar_op_f: elementwise binary op between a scalar and a float
// matrix: out[i] = op(scalar, in[i]) or op(in[i], scalar), selected by the
// operand-order parameter.
//   param_0: in  (f32*)        param_1: scalar (f64, narrowed to f32)
//   param_2: out (f32*)        param_3: element count
//   param_4: opcode (0..18)    param_5: operand-order selector
// NOTE(review): the symbolic opcode meanings in the comments below are
// inferred from the arithmetic each branch performs — confirm against the
// generating CUDA source.
//
.visible .entry matrix_scalar_op_f(
.param .u64 matrix_scalar_op_f_param_0,
.param .f64 matrix_scalar_op_f_param_1,
.param .u64 matrix_scalar_op_f_param_2,
.param .u32 matrix_scalar_op_f_param_3,
.param .u32 matrix_scalar_op_f_param_4,
.param .u32 matrix_scalar_op_f_param_5
)
{
.reg .pred %p<139>;
.reg .f32 %f<265>;
.reg .b32 %r<58>;
.reg .f64 %fd<2>;
.reg .b64 %rd<16>;
ld.param.u64 %rd2, [matrix_scalar_op_f_param_0];
ld.param.f64 %fd1, [matrix_scalar_op_f_param_1];
ld.param.u64 %rd3, [matrix_scalar_op_f_param_2];
ld.param.u32 %r4, [matrix_scalar_op_f_param_3];
ld.param.u32 %r2, [matrix_scalar_op_f_param_4];
ld.param.u32 %r3, [matrix_scalar_op_f_param_5];
cvt.rn.f32.f64 %f1, %fd1;  // %f1 = (float) scalar
mov.u32 %r5, %ntid.x;
mov.u32 %r6, %ctaid.x;
mov.u32 %r7, %tid.x;
mad.lo.s32 %r1, %r5, %r6, %r7;  // global thread index
setp.ge.s32 %p3, %r1, %r4;
@%p3 bra BB25_126;  // index out of range -> exit
cvta.to.global.u64 %rd4, %rd3;
cvta.to.global.u64 %rd5, %rd2;
mul.wide.s32 %rd6, %r1, 4;
add.s64 %rd7, %rd5, %rd6;
ld.global.f32 %f2, [%rd7];  // %f2 = in[i]
add.s64 %rd1, %rd4, %rd6;   // %rd1 = &out[i]
setp.eq.s32 %p4, %r3, 0;
@%p4 bra BB25_64;  // order selector 0 -> op(in[i], scalar) family
// ---- op(scalar, in[i]) family: binary search on opcode %r2 ----
mov.f32 %f261, 0f7F7FFFFF;
setp.gt.s32 %p5, %r2, 8;
@%p5 bra BB25_19;
setp.gt.s32 %p19, %r2, 3;
@%p19 bra BB25_11;
setp.gt.s32 %p26, %r2, 1;
@%p26 bra BB25_8;
setp.eq.s32 %p29, %r2, 0;
@%p29 bra BB25_62;
bra.uni BB25_6;
BB25_62:  // opcode 0: scalar + x
add.f32 %f261, %f1, %f2;
bra.uni BB25_63;
BB25_64:
// ---- op(in[i], scalar) family: same opcode dispatch, operands swapped ----
mov.f32 %f264, 0f7F7FFFFF;
setp.gt.s32 %p72, %r2, 8;
@%p72 bra BB25_81;
setp.gt.s32 %p86, %r2, 3;
@%p86 bra BB25_73;
setp.gt.s32 %p93, %r2, 1;
@%p93 bra BB25_70;
setp.eq.s32 %p96, %r2, 0;
@%p96 bra BB25_124;
bra.uni BB25_68;
BB25_124:  // opcode 0: x + scalar
add.f32 %f264, %f1, %f2;
bra.uni BB25_125;
BB25_19:
setp.gt.s32 %p6, %r2, 13;
@%p6 bra BB25_28;
setp.gt.s32 %p13, %r2, 10;
@%p13 bra BB25_24;
setp.eq.s32 %p17, %r2, 9;
@%p17 bra BB25_44;
bra.uni BB25_22;
BB25_44:  // opcode 9: equality -> 1.0 / 0.0
setp.eq.f32 %p40, %f1, %f2;
selp.f32 %f261, 0f3F800000, 0f00000000, %p40;
bra.uni BB25_63;
BB25_81:
setp.gt.s32 %p73, %r2, 13;
@%p73 bra BB25_90;
setp.gt.s32 %p80, %r2, 10;
@%p80 bra BB25_86;
setp.eq.s32 %p84, %r2, 9;
@%p84 bra BB25_106;
bra.uni BB25_84;
BB25_106:  // opcode 9: equality (symmetric)
setp.eq.f32 %p107, %f2, %f1;
selp.f32 %f264, 0f3F800000, 0f00000000, %p107;
bra.uni BB25_125;
BB25_11:
setp.gt.s32 %p20, %r2, 5;
@%p20 bra BB25_15;
setp.eq.s32 %p24, %r2, 4;
@%p24 bra BB25_47;
bra.uni BB25_13;
BB25_47:
// opcode 4: pow(scalar, x) — software powf expansion:
// extended-precision log2 of |scalar|, multiply by x, exp2, then
// sign/special-case fixups (BB25_49..BB25_60).
mul.f32 %f90, %f2, 0f3F000000;
cvt.rzi.f32.f32 %f91, %f90;
fma.rn.f32 %f92, %f91, 0fC0000000, %f2;
abs.f32 %f19, %f92;  // |x - 2*trunc(x/2)| — odd/even integer test input
abs.f32 %f20, %f1;
setp.lt.f32 %p45, %f20, 0f00800000;  // denormal base? rescale by 2^24
mul.f32 %f93, %f20, 0f4B800000;
selp.f32 %f94, 0fC3170000, 0fC2FE0000, %p45;
selp.f32 %f95, %f93, %f20, %p45;
mov.b32 %r14, %f95;
and.b32 %r15, %r14, 8388607;
or.b32 %r16, %r15, 1065353216;  // mantissa normalized to [1,2)
mov.b32 %f96, %r16;
shr.u32 %r17, %r14, 23;
cvt.rn.f32.u32 %f97, %r17;
add.f32 %f98, %f94, %f97;  // unbiased exponent
setp.gt.f32 %p46, %f96, 0f3FB504F3;  // > sqrt(2)? halve mantissa
mul.f32 %f99, %f96, 0f3F000000;
add.f32 %f100, %f98, 0f3F800000;
selp.f32 %f101, %f99, %f96, %p46;
selp.f32 %f102, %f100, %f98, %p46;
add.f32 %f103, %f101, 0fBF800000;
add.f32 %f87, %f101, 0f3F800000;
// inline asm
rcp.approx.ftz.f32 %f86,%f87;
// inline asm
add.f32 %f104, %f103, %f103;
mul.f32 %f105, %f86, %f104;  // atanh-style log argument
mul.f32 %f106, %f105, %f105;
mov.f32 %f107, 0f3C4CAF63;
mov.f32 %f108, 0f3B18F0FE;
fma.rn.f32 %f109, %f108, %f106, %f107;  // log polynomial
mov.f32 %f110, 0f3DAAAABD;
fma.rn.f32 %f111, %f109, %f106, %f110;
mul.rn.f32 %f112, %f111, %f106;
mul.rn.f32 %f113, %f112, %f105;
sub.f32 %f114, %f103, %f105;
neg.f32 %f115, %f105;
add.f32 %f116, %f114, %f114;
fma.rn.f32 %f117, %f115, %f103, %f116;
mul.rn.f32 %f118, %f86, %f117;
add.f32 %f119, %f113, %f105;  // hi/lo compensated accumulation
sub.f32 %f120, %f105, %f119;
add.f32 %f121, %f113, %f120;
add.f32 %f122, %f118, %f121;
add.f32 %f123, %f119, %f122;
sub.f32 %f124, %f119, %f123;
add.f32 %f125, %f122, %f124;
mov.f32 %f126, 0f3F317200;  // ln(2) hi
mul.rn.f32 %f127, %f102, %f126;
mov.f32 %f128, 0f35BFBE8E;  // ln(2) lo
mul.rn.f32 %f129, %f102, %f128;
add.f32 %f130, %f127, %f123;
sub.f32 %f131, %f127, %f130;
add.f32 %f132, %f123, %f131;
add.f32 %f133, %f125, %f132;
add.f32 %f134, %f129, %f133;
add.f32 %f135, %f130, %f134;  // log(|scalar|) hi
sub.f32 %f136, %f130, %f135;
add.f32 %f137, %f134, %f136;  // log(|scalar|) lo
abs.f32 %f21, %f2;
setp.gt.f32 %p47, %f21, 0f77F684DF;  // huge exponent? prescale
mul.f32 %f138, %f2, 0f39000000;
selp.f32 %f139, %f138, %f2, %p47;
mul.rn.f32 %f140, %f139, %f135;  // y*log — extended precision product
neg.f32 %f141, %f140;
fma.rn.f32 %f142, %f139, %f135, %f141;
fma.rn.f32 %f143, %f139, %f137, %f142;
mov.f32 %f144, 0f00000000;
fma.rn.f32 %f145, %f144, %f135, %f143;
add.rn.f32 %f146, %f140, %f145;
neg.f32 %f147, %f146;
add.rn.f32 %f148, %f140, %f147;
add.rn.f32 %f149, %f148, %f145;
mov.b32 %r18, %f146;
setp.eq.s32 %p48, %r18, 1118925336;  // exact-overflow-boundary fixup
add.s32 %r19, %r18, -1;
mov.b32 %f150, %r19;
add.f32 %f151, %f149, 0f37000000;
selp.f32 %f152, %f150, %f146, %p48;
selp.f32 %f22, %f151, %f149, %p48;
mul.f32 %f153, %f152, 0f3FB8AA3B;  // * log2(e)
cvt.rzi.f32.f32 %f154, %f153;
mov.f32 %f155, 0fBF317200;
fma.rn.f32 %f156, %f154, %f155, %f152;
mov.f32 %f157, 0fB5BFBE8E;
fma.rn.f32 %f158, %f154, %f157, %f156;  // range-reduced exp argument
mul.f32 %f89, %f158, 0f3FB8AA3B;
// inline asm
ex2.approx.ftz.f32 %f88,%f89;
// inline asm
add.f32 %f159, %f154, 0f00000000;
ex2.approx.f32 %f160, %f159;
mul.f32 %f161, %f88, %f160;  // 2^frac * 2^int
setp.lt.f32 %p49, %f152, 0fC2D20000;  // underflow -> 0
selp.f32 %f162, 0f00000000, %f161, %p49;
setp.gt.f32 %p50, %f152, 0f42D20000;  // overflow -> +inf
selp.f32 %f259, 0f7F800000, %f162, %p50;
setp.eq.f32 %p51, %f259, 0f7F800000;
@%p51 bra BB25_49;
fma.rn.f32 %f259, %f259, %f22, %f259;  // apply low-order correction
BB25_49:
setp.lt.f32 %p52, %f1, 0f00000000;
setp.eq.f32 %p53, %f19, 0f3F800000;
and.pred %p1, %p52, %p53;  // negative base, odd integer exponent -> negate
mov.b32 %r20, %f259;
xor.b32 %r21, %r20, -2147483648;
mov.b32 %f163, %r21;
selp.f32 %f260, %f163, %f259, %p1;
setp.eq.f32 %p54, %f1, 0f00000000;
@%p54 bra BB25_52;
bra.uni BB25_50;
BB25_52:  // base == 0: result is +-0 or +-inf by sign of exponent
add.f32 %f165, %f1, %f1;
mov.b32 %r22, %f165;
selp.b32 %r23, %r22, 0, %p53;
or.b32 %r24, %r23, 2139095040;
setp.lt.f32 %p58, %f2, 0f00000000;
selp.b32 %r25, %r24, %r23, %p58;
mov.b32 %f260, %r25;
bra.uni BB25_53;
BB25_28:
setp.gt.s32 %p7, %r2, 15;
@%p7 bra BB25_32;
setp.eq.s32 %p11, %r2, 14;
@%p11 bra BB25_41;
bra.uni BB25_30;
BB25_41:  // opcode 14: logical OR of values rounded to integers
cvt.rni.s64.f32 %rd8, %f1;
cvt.rni.s64.f32 %rd9, %f2;
cvt.u32.u64 %r8, %rd8;
cvt.u32.u64 %r9, %rd9;
or.b32 %r10, %r9, %r8;
setp.eq.s32 %p37, %r10, 0;
selp.f32 %f261, 0f00000000, 0f3F800000, %p37;
bra.uni BB25_63;
BB25_73:
setp.gt.s32 %p87, %r2, 5;
@%p87 bra BB25_77;
setp.eq.s32 %p91, %r2, 4;
@%p91 bra BB25_109;
bra.uni BB25_75;
BB25_109:
// opcode 4 (swapped): pow(x, scalar) — same software powf expansion as
// BB25_47 with the operand roles exchanged.
mul.f32 %f181, %f1, 0f3F000000;
cvt.rzi.f32.f32 %f182, %f181;
fma.rn.f32 %f183, %f182, 0fC0000000, %f1;
abs.f32 %f56, %f183;
abs.f32 %f57, %f2;
setp.lt.f32 %p112, %f57, 0f00800000;
mul.f32 %f184, %f57, 0f4B800000;
selp.f32 %f185, 0fC3170000, 0fC2FE0000, %p112;
selp.f32 %f186, %f184, %f57, %p112;
mov.b32 %r39, %f186;
and.b32 %r40, %r39, 8388607;
or.b32 %r41, %r40, 1065353216;
mov.b32 %f187, %r41;
shr.u32 %r42, %r39, 23;
cvt.rn.f32.u32 %f188, %r42;
add.f32 %f189, %f185, %f188;
setp.gt.f32 %p113, %f187, 0f3FB504F3;
mul.f32 %f190, %f187, 0f3F000000;
add.f32 %f191, %f189, 0f3F800000;
selp.f32 %f192, %f190, %f187, %p113;
selp.f32 %f193, %f191, %f189, %p113;
add.f32 %f194, %f192, 0fBF800000;
add.f32 %f178, %f192, 0f3F800000;
// inline asm
rcp.approx.ftz.f32 %f177,%f178;
// inline asm
add.f32 %f195, %f194, %f194;
mul.f32 %f196, %f177, %f195;
mul.f32 %f197, %f196, %f196;
mov.f32 %f198, 0f3C4CAF63;
mov.f32 %f199, 0f3B18F0FE;
fma.rn.f32 %f200, %f199, %f197, %f198;
mov.f32 %f201, 0f3DAAAABD;
fma.rn.f32 %f202, %f200, %f197, %f201;
mul.rn.f32 %f203, %f202, %f197;
mul.rn.f32 %f204, %f203, %f196;
sub.f32 %f205, %f194, %f196;
neg.f32 %f206, %f196;
add.f32 %f207, %f205, %f205;
fma.rn.f32 %f208, %f206, %f194, %f207;
mul.rn.f32 %f209, %f177, %f208;
add.f32 %f210, %f204, %f196;
sub.f32 %f211, %f196, %f210;
add.f32 %f212, %f204, %f211;
add.f32 %f213, %f209, %f212;
add.f32 %f214, %f210, %f213;
sub.f32 %f215, %f210, %f214;
add.f32 %f216, %f213, %f215;
mov.f32 %f217, 0f3F317200;
mul.rn.f32 %f218, %f193, %f217;
mov.f32 %f219, 0f35BFBE8E;
mul.rn.f32 %f220, %f193, %f219;
add.f32 %f221, %f218, %f214;
sub.f32 %f222, %f218, %f221;
add.f32 %f223, %f214, %f222;
add.f32 %f224, %f216, %f223;
add.f32 %f225, %f220, %f224;
add.f32 %f226, %f221, %f225;
sub.f32 %f227, %f221, %f226;
add.f32 %f228, %f225, %f227;
abs.f32 %f58, %f1;
setp.gt.f32 %p114, %f58, 0f77F684DF;
mul.f32 %f229, %f1, 0f39000000;
selp.f32 %f230, %f229, %f1, %p114;
mul.rn.f32 %f231, %f230, %f226;
neg.f32 %f232, %f231;
fma.rn.f32 %f233, %f230, %f226, %f232;
fma.rn.f32 %f234, %f230, %f228, %f233;
mov.f32 %f235, 0f00000000;
fma.rn.f32 %f236, %f235, %f226, %f234;
add.rn.f32 %f237, %f231, %f236;
neg.f32 %f238, %f237;
add.rn.f32 %f239, %f231, %f238;
add.rn.f32 %f240, %f239, %f236;
mov.b32 %r43, %f237;
setp.eq.s32 %p115, %r43, 1118925336;
add.s32 %r44, %r43, -1;
mov.b32 %f241, %r44;
add.f32 %f242, %f240, 0f37000000;
selp.f32 %f243, %f241, %f237, %p115;
selp.f32 %f59, %f242, %f240, %p115;
mul.f32 %f244, %f243, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f245, %f244;
mov.f32 %f246, 0fBF317200;
fma.rn.f32 %f247, %f245, %f246, %f243;
mov.f32 %f248, 0fB5BFBE8E;
fma.rn.f32 %f249, %f245, %f248, %f247;
mul.f32 %f180, %f249, 0f3FB8AA3B;
// inline asm
ex2.approx.ftz.f32 %f179,%f180;
// inline asm
add.f32 %f250, %f245, 0f00000000;
ex2.approx.f32 %f251, %f250;
mul.f32 %f252, %f179, %f251;
setp.lt.f32 %p116, %f243, 0fC2D20000;
selp.f32 %f253, 0f00000000, %f252, %p116;
setp.gt.f32 %p117, %f243, 0f42D20000;
selp.f32 %f262, 0f7F800000, %f253, %p117;
setp.eq.f32 %p118, %f262, 0f7F800000;
@%p118 bra BB25_111;
fma.rn.f32 %f262, %f262, %f59, %f262;
BB25_111:
setp.lt.f32 %p119, %f2, 0f00000000;
setp.eq.f32 %p120, %f56, 0f3F800000;
and.pred %p2, %p119, %p120;
mov.b32 %r45, %f262;
xor.b32 %r46, %r45, -2147483648;
mov.b32 %f254, %r46;
selp.f32 %f263, %f254, %f262, %p2;
setp.eq.f32 %p121, %f2, 0f00000000;
@%p121 bra BB25_114;
bra.uni BB25_112;
BB25_114:
add.f32 %f256, %f2, %f2;
mov.b32 %r47, %f256;
selp.b32 %r48, %r47, 0, %p120;
or.b32 %r49, %r48, 2139095040;
setp.lt.f32 %p125, %f1, 0f00000000;
selp.b32 %r50, %r49, %r48, %p125;
mov.b32 %f263, %r50;
bra.uni BB25_115;
BB25_90:
setp.gt.s32 %p74, %r2, 15;
@%p74 bra BB25_94;
setp.eq.s32 %p78, %r2, 14;
@%p78 bra BB25_103;
bra.uni BB25_92;
BB25_103:  // opcode 14 (swapped): logical OR
cvt.rni.s64.f32 %rd12, %f2;
cvt.rni.s64.f32 %rd13, %f1;
cvt.u32.u64 %r33, %rd12;
cvt.u32.u64 %r34, %rd13;
or.b32 %r35, %r34, %r33;
setp.eq.s32 %p104, %r35, 0;
selp.f32 %f264, 0f00000000, 0f3F800000, %p104;
bra.uni BB25_125;
BB25_8:
setp.eq.s32 %p27, %r2, 2;
@%p27 bra BB25_61;
bra.uni BB25_9;
BB25_61:  // opcode 2: multiply
mul.f32 %f261, %f1, %f2;
bra.uni BB25_63;
BB25_24:
setp.eq.s32 %p14, %r2, 11;
@%p14 bra BB25_43;
setp.eq.s32 %p15, %r2, 12;
@%p15 bra BB25_42;
bra.uni BB25_26;
BB25_42:  // opcode 12: max
max.f32 %f261, %f1, %f2;
bra.uni BB25_63;
BB25_15:
setp.eq.s32 %p21, %r2, 6;
@%p21 bra BB25_46;
setp.eq.s32 %p22, %r2, 7;
@%p22 bra BB25_45;
bra.uni BB25_17;
BB25_45:  // opcode 7: scalar > x
setp.gt.f32 %p42, %f1, %f2;
selp.f32 %f261, 0f3F800000, 0f00000000, %p42;
bra.uni BB25_63;
BB25_32:
setp.eq.s32 %p8, %r2, 16;
@%p8 bra BB25_40;
setp.eq.s32 %p9, %r2, 17;
@%p9 bra BB25_37;
bra.uni BB25_34;
BB25_37:  // opcode 17: modulus scalar mod x (NaN when x is +-0)
setp.eq.f32 %p32, %f2, 0f00000000;
setp.eq.f32 %p33, %f2, 0f80000000;
or.pred %p34, %p32, %p33;
mov.f32 %f261, 0f7FC00000;
@%p34 bra BB25_63;
div.rn.f32 %f261, %f1, %f2;
abs.f32 %f80, %f261;
setp.geu.f32 %p35, %f80, 0f7F800000;
@%p35 bra BB25_63;
cvt.rmi.f32.f32 %f81, %f261;  // floor of quotient
mul.f32 %f82, %f2, %f81;
sub.f32 %f261, %f1, %f82;
bra.uni BB25_63;
BB25_70:
setp.eq.s32 %p94, %r2, 2;
@%p94 bra BB25_123;
bra.uni BB25_71;
BB25_123:  // opcode 2 (swapped): multiply
mul.f32 %f264, %f1, %f2;
bra.uni BB25_125;
BB25_86:
setp.eq.s32 %p81, %r2, 11;
@%p81 bra BB25_105;
setp.eq.s32 %p82, %r2, 12;
@%p82 bra BB25_104;
bra.uni BB25_88;
BB25_104:  // opcode 12 (swapped): max
max.f32 %f264, %f2, %f1;
bra.uni BB25_125;
BB25_77:
setp.eq.s32 %p88, %r2, 6;
@%p88 bra BB25_108;
setp.eq.s32 %p89, %r2, 7;
@%p89 bra BB25_107;
bra.uni BB25_79;
BB25_107:  // opcode 7 (swapped): x > scalar
setp.gt.f32 %p109, %f2, %f1;
selp.f32 %f264, 0f3F800000, 0f00000000, %p109;
bra.uni BB25_125;
BB25_94:
setp.eq.s32 %p75, %r2, 16;
@%p75 bra BB25_102;
setp.eq.s32 %p76, %r2, 17;
@%p76 bra BB25_99;
bra.uni BB25_96;
BB25_99:  // opcode 17 (swapped): x mod scalar
setp.eq.f32 %p99, %f1, 0f00000000;
setp.eq.f32 %p100, %f1, 0f80000000;
or.pred %p101, %p99, %p100;
mov.f32 %f264, 0f7FC00000;
@%p101 bra BB25_125;
div.rn.f32 %f264, %f2, %f1;
abs.f32 %f171, %f264;
setp.geu.f32 %p102, %f171, 0f7F800000;
@%p102 bra BB25_125;
cvt.rmi.f32.f32 %f172, %f264;
mul.f32 %f173, %f1, %f172;
sub.f32 %f264, %f2, %f173;
bra.uni BB25_125;
BB25_6:
setp.eq.s32 %p30, %r2, 1;
@%p30 bra BB25_7;
bra.uni BB25_63;
BB25_7:  // opcode 1: scalar - x
sub.f32 %f261, %f1, %f2;
bra.uni BB25_63;
BB25_22:
setp.eq.s32 %p18, %r2, 10;
@%p18 bra BB25_23;
bra.uni BB25_63;
BB25_23:  // opcode 10: not-equal
setp.neu.f32 %p39, %f1, %f2;
selp.f32 %f261, 0f3F800000, 0f00000000, %p39;
bra.uni BB25_63;
BB25_13:
setp.eq.s32 %p25, %r2, 5;
@%p25 bra BB25_14;
bra.uni BB25_63;
BB25_14:  // opcode 5: scalar < x
setp.lt.f32 %p44, %f1, %f2;
selp.f32 %f261, 0f3F800000, 0f00000000, %p44;
bra.uni BB25_63;
BB25_30:
setp.eq.s32 %p12, %r2, 15;
@%p12 bra BB25_31;
bra.uni BB25_63;
BB25_31:  // opcode 15: 1 - scalar*x
mul.f32 %f84, %f1, %f2;
mov.f32 %f85, 0f3F800000;
sub.f32 %f261, %f85, %f84;
bra.uni BB25_63;
BB25_9:
setp.eq.s32 %p28, %r2, 3;
@%p28 bra BB25_10;
bra.uni BB25_63;
BB25_10:  // opcode 3: scalar / x
div.rn.f32 %f261, %f1, %f2;
bra.uni BB25_63;
BB25_43:  // opcode 11: min
min.f32 %f261, %f1, %f2;
bra.uni BB25_63;
BB25_26:
setp.eq.s32 %p16, %r2, 13;
@%p16 bra BB25_27;
bra.uni BB25_63;
BB25_27:  // opcode 13: logical AND of values rounded to integers
cvt.rni.s64.f32 %rd10, %f1;
cvt.rni.s64.f32 %rd11, %f2;
cvt.u32.u64 %r11, %rd10;
cvt.u32.u64 %r12, %rd11;
and.b32 %r13, %r12, %r11;
setp.eq.s32 %p38, %r13, 0;
selp.f32 %f261, 0f00000000, 0f3F800000, %p38;
bra.uni BB25_63;
BB25_46:  // opcode 6: scalar <= x
setp.le.f32 %p43, %f1, %f2;
selp.f32 %f261, 0f3F800000, 0f00000000, %p43;
bra.uni BB25_63;
BB25_17:
setp.eq.s32 %p23, %r2, 8;
@%p23 bra BB25_18;
bra.uni BB25_63;
BB25_18:  // opcode 8: scalar >= x
setp.ge.f32 %p41, %f1, %f2;
selp.f32 %f261, 0f3F800000, 0f00000000, %p41;
bra.uni BB25_63;
BB25_40:  // opcode 16: (scalar != 0) ? scalar - x : 0
setp.neu.f32 %p36, %f1, 0f00000000;
sub.f32 %f83, %f1, %f2;
selp.f32 %f261, %f83, 0f00000000, %p36;
bra.uni BB25_63;
BB25_34:
setp.ne.s32 %p10, %r2, 18;
@%p10 bra BB25_63;
// opcode 18: floor division scalar / x
div.rn.f32 %f261, %f1, %f2;
abs.f32 %f78, %f261;
setp.geu.f32 %p31, %f78, 0f7F800000;
@%p31 bra BB25_63;
cvt.rmi.f32.f32 %f261, %f261;
bra.uni BB25_63;
BB25_68:
setp.eq.s32 %p97, %r2, 1;
@%p97 bra BB25_69;
bra.uni BB25_125;
BB25_69:  // opcode 1 (swapped): x - scalar
sub.f32 %f264, %f2, %f1;
bra.uni BB25_125;
BB25_84:
setp.eq.s32 %p85, %r2, 10;
@%p85 bra BB25_85;
bra.uni BB25_125;
BB25_85:  // opcode 10 (swapped): not-equal
setp.neu.f32 %p106, %f2, %f1;
selp.f32 %f264, 0f3F800000, 0f00000000, %p106;
bra.uni BB25_125;
BB25_75:
setp.eq.s32 %p92, %r2, 5;
@%p92 bra BB25_76;
bra.uni BB25_125;
BB25_76:  // opcode 5 (swapped): x < scalar
setp.lt.f32 %p111, %f2, %f1;
selp.f32 %f264, 0f3F800000, 0f00000000, %p111;
bra.uni BB25_125;
BB25_92:
setp.eq.s32 %p79, %r2, 15;
@%p79 bra BB25_93;
bra.uni BB25_125;
BB25_93:  // opcode 15 (swapped): 1 - x*scalar
mul.f32 %f175, %f1, %f2;
mov.f32 %f176, 0f3F800000;
sub.f32 %f264, %f176, %f175;
bra.uni BB25_125;
BB25_71:
setp.eq.s32 %p95, %r2, 3;
@%p95 bra BB25_72;
bra.uni BB25_125;
BB25_72:  // opcode 3 (swapped): x / scalar
div.rn.f32 %f264, %f2, %f1;
bra.uni BB25_125;
BB25_105:  // opcode 11 (swapped): min
min.f32 %f264, %f2, %f1;
bra.uni BB25_125;
BB25_88:
setp.eq.s32 %p83, %r2, 13;
@%p83 bra BB25_89;
bra.uni BB25_125;
BB25_89:  // opcode 13 (swapped): logical AND
cvt.rni.s64.f32 %rd14, %f2;
cvt.rni.s64.f32 %rd15, %f1;
cvt.u32.u64 %r36, %rd14;
cvt.u32.u64 %r37, %rd15;
and.b32 %r38, %r37, %r36;
setp.eq.s32 %p105, %r38, 0;
selp.f32 %f264, 0f00000000, 0f3F800000, %p105;
bra.uni BB25_125;
BB25_108:  // opcode 6 (swapped): x <= scalar
setp.le.f32 %p110, %f2, %f1;
selp.f32 %f264, 0f3F800000, 0f00000000, %p110;
bra.uni BB25_125;
BB25_79:
setp.eq.s32 %p90, %r2, 8;
@%p90 bra BB25_80;
bra.uni BB25_125;
BB25_80:  // opcode 8 (swapped): x >= scalar
setp.ge.f32 %p108, %f2, %f1;
selp.f32 %f264, 0f3F800000, 0f00000000, %p108;
bra.uni BB25_125;
BB25_102:  // opcode 16 (swapped): (x != 0) ? x - scalar : 0
setp.neu.f32 %p103, %f2, 0f00000000;
sub.f32 %f174, %f2, %f1;
selp.f32 %f264, %f174, 0f00000000, %p103;
bra.uni BB25_125;
BB25_96:
setp.ne.s32 %p77, %r2, 18;
@%p77 bra BB25_125;
// opcode 18 (swapped): floor division x / scalar
div.rn.f32 %f264, %f2, %f1;
abs.f32 %f169, %f264;
setp.geu.f32 %p98, %f169, 0f7F800000;
@%p98 bra BB25_125;
cvt.rmi.f32.f32 %f264, %f264;
bra.uni BB25_125;
BB25_50:  // pow special case: negative base, non-integer exponent -> NaN
setp.geu.f32 %p55, %f1, 0f00000000;
@%p55 bra BB25_53;
cvt.rzi.f32.f32 %f164, %f2;
setp.neu.f32 %p56, %f164, %f2;
selp.f32 %f260, 0f7FFFFFFF, %f260, %p56;
BB25_53:
// pow inf/NaN handling: |base|+|exp| finite -> done, else fix up result
add.f32 %f166, %f20, %f21;
mov.b32 %r26, %f166;
setp.lt.s32 %p59, %r26, 2139095040;
@%p59 bra BB25_60;
setp.gtu.f32 %p60, %f20, 0f7F800000;
setp.gtu.f32 %p61, %f21, 0f7F800000;
or.pred %p62, %p60, %p61;
@%p62 bra BB25_59;
bra.uni BB25_55;
BB25_59:  // NaN operand -> propagate
add.f32 %f260, %f1, %f2;
bra.uni BB25_60;
BB25_55:
setp.eq.f32 %p63, %f21, 0f7F800000;
@%p63 bra BB25_58;
bra.uni BB25_56;
BB25_58:  // exponent is +-inf
setp.gt.f32 %p66, %f20, 0f3F800000;
selp.b32 %r30, 2139095040, 0, %p66;
xor.b32 %r31, %r30, 2139095040;
setp.lt.f32 %p67, %f2, 0f00000000;
selp.b32 %r32, %r31, %r30, %p67;
mov.b32 %f167, %r32;
setp.eq.f32 %p68, %f1, 0fBF800000;  // (-1)^inf = 1
selp.f32 %f260, 0f3F800000, %f167, %p68;
bra.uni BB25_60;
BB25_112:  // pow (swapped) special case: negative base, non-integer exponent
setp.geu.f32 %p122, %f2, 0f00000000;
@%p122 bra BB25_115;
cvt.rzi.f32.f32 %f255, %f1;
setp.neu.f32 %p123, %f255, %f1;
selp.f32 %f263, 0f7FFFFFFF, %f263, %p123;
BB25_115:
add.f32 %f257, %f57, %f58;
mov.b32 %r51, %f257;
setp.lt.s32 %p126, %r51, 2139095040;
@%p126 bra BB25_122;
setp.gtu.f32 %p127, %f57, 0f7F800000;
setp.gtu.f32 %p128, %f58, 0f7F800000;
or.pred %p129, %p127, %p128;
@%p129 bra BB25_121;
bra.uni BB25_117;
BB25_121:
add.f32 %f263, %f1, %f2;
bra.uni BB25_122;
BB25_117:
setp.eq.f32 %p130, %f58, 0f7F800000;
@%p130 bra BB25_120;
bra.uni BB25_118;
BB25_120:
setp.gt.f32 %p133, %f57, 0f3F800000;
selp.b32 %r55, 2139095040, 0, %p133;
xor.b32 %r56, %r55, 2139095040;
setp.lt.f32 %p134, %f1, 0f00000000;
selp.b32 %r57, %r56, %r55, %p134;
mov.b32 %f258, %r57;
setp.eq.f32 %p135, %f2, 0fBF800000;
selp.f32 %f263, 0f3F800000, %f258, %p135;
bra.uni BB25_122;
BB25_56:  // base is +-inf
setp.neu.f32 %p64, %f20, 0f7F800000;
@%p64 bra BB25_60;
setp.ge.f32 %p65, %f2, 0f00000000;
selp.b32 %r27, 2139095040, 0, %p65;
or.b32 %r28, %r27, -2147483648;
selp.b32 %r29, %r28, %r27, %p1;
mov.b32 %f260, %r29;
BB25_60:
// pow: x == 0 with base 1, or base^0 -> 1
setp.eq.f32 %p69, %f2, 0f00000000;
setp.eq.f32 %p70, %f1, 0f3F800000;
or.pred %p71, %p70, %p69;
selp.f32 %f261, 0f3F800000, %f260, %p71;
BB25_63:  // store result for op(scalar, x) family
st.global.f32 [%rd1], %f261;
bra.uni BB25_126;
BB25_118:
setp.neu.f32 %p131, %f57, 0f7F800000;
@%p131 bra BB25_122;
setp.ge.f32 %p132, %f1, 0f00000000;
selp.b32 %r52, 2139095040, 0, %p132;
or.b32 %r53, %r52, -2147483648;
selp.b32 %r54, %r53, %r52, %p2;
mov.b32 %f263, %r54;
BB25_122:
setp.eq.f32 %p136, %f1, 0f00000000;
setp.eq.f32 %p137, %f2, 0f3F800000;
or.pred %p138, %p137, %p136;
selp.f32 %f264, 0f3F800000, %f263, %p138;
BB25_125:  // store result for op(x, scalar) family
st.global.f32 [%rd1], %f264;
BB25_126:
bar.sync 0;  // block-wide barrier before exit (matches generating source)
ret;
}
// .globl fill_d
//
// fill_d: set each of the first <len> elements of a double array to a
// constant. One thread per element; threads past <len> do nothing.
//   param_0: output pointer (f64*)
//   param_1: fill value
//   param_2: element count <len>
//
.visible .entry fill_d(
.param .u64 fill_d_param_0,
.param .f64 fill_d_param_1,
.param .u32 fill_d_param_2
)
{
.reg .pred %p<2>;
.reg .b32 %r<6>;
.reg .f64 %fd<2>;
.reg .b64 %rd<5>;
ld.param.u64 %rd1, [fill_d_param_0];
ld.param.f64 %fd1, [fill_d_param_1];
ld.param.u32 %r2, [fill_d_param_2];
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;  // global thread index
setp.ge.s32 %p1, %r1, %r2;
@%p1 bra BB26_2;  // out of range -> exit
cvta.to.global.u64 %rd2, %rd1;
mul.wide.s32 %rd3, %r1, 8;  // byte offset (8 = sizeof double)
add.s64 %rd4, %rd2, %rd3;
st.global.f64 [%rd4], %fd1;  // out[i] = value
BB26_2:
ret;
}
// .globl fill_f
//
// fill_f: set each of the first <len> elements of a float array to a
// constant. The value arrives as f64 and is narrowed to f32 before the
// store. One thread per element; threads past <len> do nothing.
//   param_0: output pointer (f32*)
//   param_1: fill value (f64)
//   param_2: element count <len>
//
.visible .entry fill_f(
.param .u64 fill_f_param_0,
.param .f64 fill_f_param_1,
.param .u32 fill_f_param_2
)
{
.reg .pred %p<2>;
.reg .f32 %f<2>;
.reg .b32 %r<6>;
.reg .f64 %fd<2>;
.reg .b64 %rd<5>;
ld.param.u64 %rd1, [fill_f_param_0];
ld.param.f64 %fd1, [fill_f_param_1];
ld.param.u32 %r2, [fill_f_param_2];
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;  // global thread index
setp.ge.s32 %p1, %r1, %r2;
@%p1 bra BB27_2;  // out of range -> exit
cvt.rn.f32.f64 %f1, %fd1;  // narrow double value to float
cvta.to.global.u64 %rd2, %rd1;
mul.wide.s32 %rd3, %r1, 4;  // byte offset (4 = sizeof float)
add.s64 %rd4, %rd2, %rd3;
st.global.f32 [%rd4], %f1;  // out[i] = value
BB27_2:
ret;
}
// .globl cbind_d
//
// cbind_d: column-bind two double matrices, C = [A B].
//   param_0: A   param_1: B   param_2: C (output)
//   param_3: rowsA   param_4: colsA   param_5: rowsB   param_6: colsB
// Each thread derives a (row, col) pair from its flat index using
// max(colsA, colsB) as the column divisor; it copies the A element (if
// (row,col) is inside A) and the B element (if inside B, shifted right by
// colsA) into C, whose row stride is colsA + colsB.
//
.visible .entry cbind_d(
.param .u64 cbind_d_param_0,
.param .u64 cbind_d_param_1,
.param .u64 cbind_d_param_2,
.param .u32 cbind_d_param_3,
.param .u32 cbind_d_param_4,
.param .u32 cbind_d_param_5,
.param .u32 cbind_d_param_6
)
{
.reg .pred %p<7>;
.reg .b32 %r<18>;
.reg .f64 %fd<3>;
.reg .b64 %rd<15>;
ld.param.u64 %rd2, [cbind_d_param_0];
ld.param.u64 %rd3, [cbind_d_param_1];
ld.param.u64 %rd4, [cbind_d_param_2];
ld.param.u32 %r7, [cbind_d_param_3];
ld.param.u32 %r4, [cbind_d_param_4];
ld.param.u32 %r5, [cbind_d_param_5];
ld.param.u32 %r6, [cbind_d_param_6];
cvta.to.global.u64 %rd1, %rd4;
mov.u32 %r8, %ntid.x;
mov.u32 %r9, %ctaid.x;
mov.u32 %r10, %tid.x;
mad.lo.s32 %r11, %r8, %r9, %r10;  // flat thread index
max.s32 %r12, %r4, %r6;           // divisor = max(colsA, colsB)
div.s32 %r1, %r11, %r12;          // row
rem.s32 %r2, %r11, %r12;          // col
add.s32 %r3, %r6, %r4;            // output row stride = colsA + colsB
setp.lt.s32 %p1, %r1, %r7;
setp.lt.s32 %p2, %r2, %r4;
and.pred %p3, %p1, %p2;
@!%p3 bra BB28_2;  // (row,col) outside A -> skip A copy
bra.uni BB28_1;
BB28_1:
// C[row, col] = A[row, col]
cvta.to.global.u64 %rd5, %rd2;
mad.lo.s32 %r13, %r1, %r4, %r2;
mul.wide.s32 %rd6, %r13, 8;
add.s64 %rd7, %rd5, %rd6;
ld.global.f64 %fd1, [%rd7];
mad.lo.s32 %r14, %r1, %r3, %r2;
mul.wide.s32 %rd8, %r14, 8;
add.s64 %rd9, %rd1, %rd8;
st.global.f64 [%rd9], %fd1;
BB28_2:
setp.lt.s32 %p4, %r1, %r5;
setp.lt.s32 %p5, %r2, %r6;
and.pred %p6, %p4, %p5;
@!%p6 bra BB28_4;  // (row,col) outside B -> skip B copy
bra.uni BB28_3;
BB28_3:
// C[row, col + colsA] = B[row, col]
cvta.to.global.u64 %rd10, %rd3;
mad.lo.s32 %r15, %r1, %r6, %r2;
mul.wide.s32 %rd11, %r15, 8;
add.s64 %rd12, %rd10, %rd11;
ld.global.f64 %fd2, [%rd12];
add.s32 %r16, %r2, %r4;
mad.lo.s32 %r17, %r1, %r3, %r16;
mul.wide.s32 %rd13, %r17, 8;
add.s64 %rd14, %rd1, %rd13;
st.global.f64 [%rd14], %fd2;
BB28_4:
ret;
}
// .globl cbind_f
//
// cbind_f: column-bind two float matrices, C = [A B]. Identical structure
// to cbind_d with 4-byte elements.
//   param_0: A   param_1: B   param_2: C (output)
//   param_3: rowsA   param_4: colsA   param_5: rowsB   param_6: colsB
//
.visible .entry cbind_f(
.param .u64 cbind_f_param_0,
.param .u64 cbind_f_param_1,
.param .u64 cbind_f_param_2,
.param .u32 cbind_f_param_3,
.param .u32 cbind_f_param_4,
.param .u32 cbind_f_param_5,
.param .u32 cbind_f_param_6
)
{
.reg .pred %p<7>;
.reg .f32 %f<3>;
.reg .b32 %r<18>;
.reg .b64 %rd<15>;
ld.param.u64 %rd2, [cbind_f_param_0];
ld.param.u64 %rd3, [cbind_f_param_1];
ld.param.u64 %rd4, [cbind_f_param_2];
ld.param.u32 %r7, [cbind_f_param_3];
ld.param.u32 %r4, [cbind_f_param_4];
ld.param.u32 %r5, [cbind_f_param_5];
ld.param.u32 %r6, [cbind_f_param_6];
cvta.to.global.u64 %rd1, %rd4;
mov.u32 %r8, %ntid.x;
mov.u32 %r9, %ctaid.x;
mov.u32 %r10, %tid.x;
mad.lo.s32 %r11, %r8, %r9, %r10;  // flat thread index
max.s32 %r12, %r4, %r6;           // divisor = max(colsA, colsB)
div.s32 %r1, %r11, %r12;          // row
rem.s32 %r2, %r11, %r12;          // col
add.s32 %r3, %r6, %r4;            // output row stride = colsA + colsB
setp.lt.s32 %p1, %r1, %r7;
setp.lt.s32 %p2, %r2, %r4;
and.pred %p3, %p1, %p2;
@!%p3 bra BB29_2;  // (row,col) outside A -> skip A copy
bra.uni BB29_1;
BB29_1:
// C[row, col] = A[row, col]
cvta.to.global.u64 %rd5, %rd2;
mad.lo.s32 %r13, %r1, %r4, %r2;
mul.wide.s32 %rd6, %r13, 4;
add.s64 %rd7, %rd5, %rd6;
ld.global.f32 %f1, [%rd7];
mad.lo.s32 %r14, %r1, %r3, %r2;
mul.wide.s32 %rd8, %r14, 4;
add.s64 %rd9, %rd1, %rd8;
st.global.f32 [%rd9], %f1;
BB29_2:
setp.lt.s32 %p4, %r1, %r5;
setp.lt.s32 %p5, %r2, %r6;
and.pred %p6, %p4, %p5;
@!%p6 bra BB29_4;  // (row,col) outside B -> skip B copy
bra.uni BB29_3;
BB29_3:
// C[row, col + colsA] = B[row, col]
cvta.to.global.u64 %rd10, %rd3;
mad.lo.s32 %r15, %r1, %r6, %r2;
mul.wide.s32 %rd11, %r15, 4;
add.s64 %rd12, %rd10, %rd11;
ld.global.f32 %f2, [%rd12];
add.s32 %r16, %r2, %r4;
mad.lo.s32 %r17, %r1, %r3, %r16;
mul.wide.s32 %rd13, %r17, 4;
add.s64 %rd14, %rd1, %rd13;
st.global.f32 [%rd14], %f2;
BB29_4:
ret;
}
// .globl rbind_d
//
// rbind_d: row-bind two double matrices, C = [A; B] (B stacked below A).
//   param_0: A   param_1: B   param_2: C (output)
//   param_3: rowsA   param_4: colsA   param_5: rowsB   param_6: colsB
// Each thread derives (row, col) from its flat index using
// max(colsA, colsB) as the column divisor; A copies to the same offset in
// C, B copies with its row shifted down by rowsA.
// NOTE(review): the B store indexes C with row stride colsA (%r4), which
// assumes colsA == colsB for row-bind — confirm against the caller.
//
.visible .entry rbind_d(
.param .u64 rbind_d_param_0,
.param .u64 rbind_d_param_1,
.param .u64 rbind_d_param_2,
.param .u32 rbind_d_param_3,
.param .u32 rbind_d_param_4,
.param .u32 rbind_d_param_5,
.param .u32 rbind_d_param_6
)
{
.reg .pred %p<7>;
.reg .b32 %r<16>;
.reg .f64 %fd<3>;
.reg .b64 %rd<14>;
ld.param.u64 %rd2, [rbind_d_param_0];
ld.param.u64 %rd3, [rbind_d_param_1];
ld.param.u64 %rd4, [rbind_d_param_2];
ld.param.u32 %r3, [rbind_d_param_3];
ld.param.u32 %r4, [rbind_d_param_4];
ld.param.u32 %r5, [rbind_d_param_5];
ld.param.u32 %r6, [rbind_d_param_6];
cvta.to.global.u64 %rd1, %rd4;
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %ctaid.x;
mov.u32 %r9, %tid.x;
mad.lo.s32 %r10, %r7, %r8, %r9;  // flat thread index
max.s32 %r11, %r4, %r6;          // divisor = max(colsA, colsB)
div.s32 %r1, %r10, %r11;         // row
rem.s32 %r2, %r10, %r11;         // col
setp.lt.s32 %p1, %r1, %r3;
setp.lt.s32 %p2, %r2, %r4;
and.pred %p3, %p1, %p2;
@!%p3 bra BB30_2;  // (row,col) outside A -> skip A copy
bra.uni BB30_1;
BB30_1:
// C[row, col] = A[row, col] (same flat offset)
cvta.to.global.u64 %rd5, %rd2;
mad.lo.s32 %r12, %r1, %r4, %r2;
mul.wide.s32 %rd6, %r12, 8;
add.s64 %rd7, %rd5, %rd6;
ld.global.f64 %fd1, [%rd7];
add.s64 %rd8, %rd1, %rd6;
st.global.f64 [%rd8], %fd1;
BB30_2:
setp.lt.s32 %p4, %r1, %r5;
setp.lt.s32 %p5, %r2, %r6;
and.pred %p6, %p4, %p5;
@!%p6 bra BB30_4;  // (row,col) outside B -> skip B copy
bra.uni BB30_3;
BB30_3:
// C[row + rowsA, col] = B[row, col]
cvta.to.global.u64 %rd9, %rd3;
mad.lo.s32 %r13, %r1, %r6, %r2;
mul.wide.s32 %rd10, %r13, 8;
add.s64 %rd11, %rd9, %rd10;
ld.global.f64 %fd2, [%rd11];
add.s32 %r14, %r1, %r3;
mad.lo.s32 %r15, %r14, %r4, %r2;
mul.wide.s32 %rd12, %r15, 8;
add.s64 %rd13, %rd1, %rd12;
st.global.f64 [%rd13], %fd2;
BB30_4:
ret;
}
// .globl rbind_f
//
// rbind_f: row-bind two float matrices, C = [A; B]. Identical structure to
// rbind_d with 4-byte elements.
//   param_0: A   param_1: B   param_2: C (output)
//   param_3: rowsA   param_4: colsA   param_5: rowsB   param_6: colsB
// NOTE(review): as in rbind_d, the B store uses row stride colsA (%r4),
// which assumes colsA == colsB — confirm against the caller.
//
.visible .entry rbind_f(
.param .u64 rbind_f_param_0,
.param .u64 rbind_f_param_1,
.param .u64 rbind_f_param_2,
.param .u32 rbind_f_param_3,
.param .u32 rbind_f_param_4,
.param .u32 rbind_f_param_5,
.param .u32 rbind_f_param_6
)
{
.reg .pred %p<7>;
.reg .f32 %f<3>;
.reg .b32 %r<16>;
.reg .b64 %rd<14>;
ld.param.u64 %rd2, [rbind_f_param_0];
ld.param.u64 %rd3, [rbind_f_param_1];
ld.param.u64 %rd4, [rbind_f_param_2];
ld.param.u32 %r3, [rbind_f_param_3];
ld.param.u32 %r4, [rbind_f_param_4];
ld.param.u32 %r5, [rbind_f_param_5];
ld.param.u32 %r6, [rbind_f_param_6];
cvta.to.global.u64 %rd1, %rd4;
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %ctaid.x;
mov.u32 %r9, %tid.x;
mad.lo.s32 %r10, %r7, %r8, %r9;  // flat thread index
max.s32 %r11, %r4, %r6;          // divisor = max(colsA, colsB)
div.s32 %r1, %r10, %r11;         // row
rem.s32 %r2, %r10, %r11;         // col
setp.lt.s32 %p1, %r1, %r3;
setp.lt.s32 %p2, %r2, %r4;
and.pred %p3, %p1, %p2;
@!%p3 bra BB31_2;  // (row,col) outside A -> skip A copy
bra.uni BB31_1;
BB31_1:
// C[row, col] = A[row, col] (same flat offset)
cvta.to.global.u64 %rd5, %rd2;
mad.lo.s32 %r12, %r1, %r4, %r2;
mul.wide.s32 %rd6, %r12, 4;
add.s64 %rd7, %rd5, %rd6;
ld.global.f32 %f1, [%rd7];
add.s64 %rd8, %rd1, %rd6;
st.global.f32 [%rd8], %f1;
BB31_2:
setp.lt.s32 %p4, %r1, %r5;
setp.lt.s32 %p5, %r2, %r6;
and.pred %p6, %p4, %p5;
@!%p6 bra BB31_4;  // (row,col) outside B -> skip B copy
bra.uni BB31_3;
BB31_3:
// C[row + rowsA, col] = B[row, col]
cvta.to.global.u64 %rd9, %rd3;
mad.lo.s32 %r13, %r1, %r6, %r2;
mul.wide.s32 %rd10, %r13, 4;
add.s64 %rd11, %rd9, %rd10;
ld.global.f32 %f2, [%rd11];
add.s32 %r14, %r1, %r3;
mad.lo.s32 %r15, %r14, %r4, %r2;
mul.wide.s32 %rd12, %r15, 4;
add.s64 %rd13, %rd1, %rd12;
st.global.f32 [%rd13], %f2;
BB31_4:
ret;
}
// .globl reduce_sum_d
//
// reduce_sum_d: partial sum reduction over a double array. Each thread
// accumulates pairs of elements in a strided global loop, partial sums are
// combined in dynamic shared memory (my_sdata) by a halving tree with
// bar.sync, the final 32 lanes use volatile warp-synchronous steps, and
// thread 0 writes one partial sum per block to out[blockIdx.x].
//   param_0: input (f64*)   param_1: per-block output (f64*)
//   param_2: element count
//
.visible .entry reduce_sum_d(
.param .u64 reduce_sum_d_param_0,
.param .u64 reduce_sum_d_param_1,
.param .u32 reduce_sum_d_param_2
)
{
.reg .pred %p<20>;
.reg .b32 %r<33>;
.reg .f64 %fd<79>;
.reg .b64 %rd<15>;
ld.param.u64 %rd2, [reduce_sum_d_param_0];
ld.param.u64 %rd3, [reduce_sum_d_param_1];
ld.param.u32 %r5, [reduce_sum_d_param_2];
mov.u32 %r6, %tid.x;
mov.u32 %r7, %ctaid.x;
shl.b32 %r8, %r7, 1;
mov.u32 %r9, %ntid.x;
mad.lo.s32 %r32, %r8, %r9, %r6;  // i = 2*blockIdx*blockDim + tid
mov.f64 %fd76, 0d0000000000000000;
mov.f64 %fd77, %fd76;
setp.ge.u32 %p1, %r32, %r5;
@%p1 bra BB32_4;  // no elements for this thread -> straight to reduction
BB32_1:
// Accumulation loop: add in[i] and in[i + blockDim], then advance i by
// 2*blockDim*gridDim until past the end.
mov.f64 %fd1, %fd77;
cvta.to.global.u64 %rd4, %rd2;
mul.wide.u32 %rd5, %r32, 8;
add.s64 %rd6, %rd4, %rd5;
ld.global.f64 %fd30, [%rd6];
add.f64 %fd78, %fd1, %fd30;  // sum += in[i]
add.s32 %r3, %r32, %r9;
setp.ge.u32 %p2, %r3, %r5;
@%p2 bra BB32_3;  // second element out of range -> skip
mul.wide.u32 %rd8, %r3, 8;
add.s64 %rd9, %rd4, %rd8;
ld.global.f64 %fd31, [%rd9];
add.f64 %fd78, %fd78, %fd31;  // sum += in[i + blockDim]
BB32_3:
mov.f64 %fd77, %fd78;
shl.b32 %r12, %r9, 1;
mov.u32 %r13, %nctaid.x;
mad.lo.s32 %r32, %r12, %r13, %r32;  // i += 2*blockDim*gridDim
setp.lt.u32 %p3, %r32, %r5;
mov.f64 %fd76, %fd77;
@%p3 bra BB32_1;
BB32_4:
// Publish this thread's partial sum to shared memory.
mov.f64 %fd74, %fd76;
mul.wide.u32 %rd10, %r6, 8;
mov.u64 %rd11, my_sdata;
add.s64 %rd1, %rd11, %rd10;  // &my_sdata[tid]
st.shared.f64 [%rd1], %fd74;
bar.sync 0;
// Tree reduction: each stage runs only when blockDim is large enough,
// folds the upper half onto the lower half, then barriers.
setp.lt.u32 %p4, %r9, 1024;
@%p4 bra BB32_8;
setp.gt.u32 %p5, %r6, 511;
mov.f64 %fd75, %fd74;
@%p5 bra BB32_7;
ld.shared.f64 %fd32, [%rd1+4096];  // += sdata[tid + 512]
add.f64 %fd75, %fd74, %fd32;
st.shared.f64 [%rd1], %fd75;
BB32_7:
mov.f64 %fd74, %fd75;
bar.sync 0;
BB32_8:
mov.f64 %fd72, %fd74;
setp.lt.u32 %p6, %r9, 512;
@%p6 bra BB32_12;
setp.gt.u32 %p7, %r6, 255;
mov.f64 %fd73, %fd72;
@%p7 bra BB32_11;
ld.shared.f64 %fd33, [%rd1+2048];  // += sdata[tid + 256]
add.f64 %fd73, %fd72, %fd33;
st.shared.f64 [%rd1], %fd73;
BB32_11:
mov.f64 %fd72, %fd73;
bar.sync 0;
BB32_12:
mov.f64 %fd70, %fd72;
setp.lt.u32 %p8, %r9, 256;
@%p8 bra BB32_16;
setp.gt.u32 %p9, %r6, 127;
mov.f64 %fd71, %fd70;
@%p9 bra BB32_15;
ld.shared.f64 %fd34, [%rd1+1024];  // += sdata[tid + 128]
add.f64 %fd71, %fd70, %fd34;
st.shared.f64 [%rd1], %fd71;
BB32_15:
mov.f64 %fd70, %fd71;
bar.sync 0;
BB32_16:
mov.f64 %fd68, %fd70;
setp.lt.u32 %p10, %r9, 128;
@%p10 bra BB32_20;
setp.gt.u32 %p11, %r6, 63;
mov.f64 %fd69, %fd68;
@%p11 bra BB32_19;
ld.shared.f64 %fd35, [%rd1+512];  // += sdata[tid + 64]
add.f64 %fd69, %fd68, %fd35;
st.shared.f64 [%rd1], %fd69;
BB32_19:
mov.f64 %fd68, %fd69;
bar.sync 0;
BB32_20:
// Final 32 lanes: volatile shared accesses, no barriers (relies on
// warp-synchronous execution of the generating source).
mov.f64 %fd67, %fd68;
setp.gt.u32 %p12, %r6, 31;
@%p12 bra BB32_33;  // lanes >= 32 are done
setp.lt.u32 %p13, %r9, 64;
@%p13 bra BB32_23;
ld.volatile.shared.f64 %fd36, [%rd1+256];  // += sdata[tid + 32]
add.f64 %fd67, %fd67, %fd36;
st.volatile.shared.f64 [%rd1], %fd67;
BB32_23:
mov.f64 %fd66, %fd67;
setp.lt.u32 %p14, %r9, 32;
@%p14 bra BB32_25;
ld.volatile.shared.f64 %fd37, [%rd1+128];  // += sdata[tid + 16]
add.f64 %fd66, %fd66, %fd37;
st.volatile.shared.f64 [%rd1], %fd66;
BB32_25:
mov.f64 %fd65, %fd66;
setp.lt.u32 %p15, %r9, 16;
@%p15 bra BB32_27;
ld.volatile.shared.f64 %fd38, [%rd1+64];  // += sdata[tid + 8]
add.f64 %fd65, %fd65, %fd38;
st.volatile.shared.f64 [%rd1], %fd65;
BB32_27:
mov.f64 %fd64, %fd65;
setp.lt.u32 %p16, %r9, 8;
@%p16 bra BB32_29;
ld.volatile.shared.f64 %fd39, [%rd1+32];  // += sdata[tid + 4]
add.f64 %fd64, %fd64, %fd39;
st.volatile.shared.f64 [%rd1], %fd64;
BB32_29:
mov.f64 %fd63, %fd64;
setp.lt.u32 %p17, %r9, 4;
@%p17 bra BB32_31;
ld.volatile.shared.f64 %fd40, [%rd1+16];  // += sdata[tid + 2]
add.f64 %fd63, %fd63, %fd40;
st.volatile.shared.f64 [%rd1], %fd63;
BB32_31:
setp.lt.u32 %p18, %r9, 2;
@%p18 bra BB32_33;
ld.volatile.shared.f64 %fd41, [%rd1+8];  // += sdata[tid + 1]
add.f64 %fd42, %fd63, %fd41;
st.volatile.shared.f64 [%rd1], %fd42;
BB32_33:
// Thread 0 writes the block's partial sum to out[blockIdx.x].
setp.ne.s32 %p19, %r6, 0;
@%p19 bra BB32_35;
ld.shared.f64 %fd43, [my_sdata];
cvta.to.global.u64 %rd12, %rd3;
mul.wide.u32 %rd13, %r7, 8;
add.s64 %rd14, %rd12, %rd13;
st.global.f64 [%rd14], %fd43;
BB32_35:
ret;
}
// .globl reduce_sum_f
//
// reduce_sum_f: block-wise sum reduction of an f32 array.
//   param_0 : input array (global f32*)
//   param_1 : output array (global f32*), one partial sum per block
//   param_2 : n, number of input elements
// Each thread accumulates input[i] and input[i+ntid.x] while i advances
// by 2*ntid.x*nctaid.x per iteration; the block then tree-reduces the
// per-thread partials in the extern shared buffer my_sdata (4-byte f32
// slots) and thread 0 stores the block total to output[ctaid.x].
//
.visible .entry reduce_sum_f(
.param .u64 reduce_sum_f_param_0,
.param .u64 reduce_sum_f_param_1,
.param .u32 reduce_sum_f_param_2
)
{
.reg .pred %p<20>;
.reg .f32 %f<79>;
.reg .b32 %r<33>;
.reg .b64 %rd<15>;
ld.param.u64 %rd2, [reduce_sum_f_param_0];
ld.param.u64 %rd3, [reduce_sum_f_param_1];
ld.param.u32 %r5, [reduce_sum_f_param_2];
mov.u32 %r6, %tid.x;
mov.u32 %r7, %ctaid.x;
shl.b32 %r8, %r7, 1;
mov.u32 %r9, %ntid.x;
// initial index: i = (2*ctaid.x)*ntid.x + tid.x; accumulator = 0.0f
mad.lo.s32 %r32, %r8, %r9, %r6;
mov.f32 %f76, 0f00000000;
mov.f32 %f77, %f76;
setp.ge.u32 %p1, %r32, %r5;
@%p1 bra BB33_4;
// accumulation loop: add input[i], and input[i+ntid.x] when in range,
// then advance i by 2*ntid.x*nctaid.x until i >= n
BB33_1:
mov.f32 %f1, %f77;
cvta.to.global.u64 %rd4, %rd2;
mul.wide.u32 %rd5, %r32, 4;
add.s64 %rd6, %rd4, %rd5;
ld.global.f32 %f30, [%rd6];
add.f32 %f78, %f1, %f30;
add.s32 %r3, %r32, %r9;
setp.ge.u32 %p2, %r3, %r5;
@%p2 bra BB33_3;
mul.wide.u32 %rd8, %r3, 4;
add.s64 %rd9, %rd4, %rd8;
ld.global.f32 %f31, [%rd9];
add.f32 %f78, %f78, %f31;
BB33_3:
mov.f32 %f77, %f78;
shl.b32 %r12, %r9, 1;
mov.u32 %r13, %nctaid.x;
mad.lo.s32 %r32, %r12, %r13, %r32;
setp.lt.u32 %p3, %r32, %r5;
mov.f32 %f76, %f77;
@%p3 bra BB33_1;
// store this thread's partial to shared memory slot my_sdata[tid.x]
BB33_4:
mov.f32 %f74, %f76;
mul.wide.u32 %rd10, %r6, 4;
mov.u64 %rd11, my_sdata;
add.s64 %rd1, %rd11, %rd10;
st.shared.f32 [%rd1], %f74;
bar.sync 0;
// tree reduction: if ntid.x >= 1024, threads < 512 fold in the upper
// half (byte offset 2048 = 512 f32 slots), then barrier
setp.lt.u32 %p4, %r9, 1024;
@%p4 bra BB33_8;
setp.gt.u32 %p5, %r6, 511;
mov.f32 %f75, %f74;
@%p5 bra BB33_7;
ld.shared.f32 %f32, [%rd1+2048];
add.f32 %f75, %f74, %f32;
st.shared.f32 [%rd1], %f75;
BB33_7:
mov.f32 %f74, %f75;
bar.sync 0;
// same halving step for block sizes >= 512 (offset 1024 bytes)
BB33_8:
mov.f32 %f72, %f74;
setp.lt.u32 %p6, %r9, 512;
@%p6 bra BB33_12;
setp.gt.u32 %p7, %r6, 255;
mov.f32 %f73, %f72;
@%p7 bra BB33_11;
ld.shared.f32 %f33, [%rd1+1024];
add.f32 %f73, %f72, %f33;
st.shared.f32 [%rd1], %f73;
BB33_11:
mov.f32 %f72, %f73;
bar.sync 0;
// block sizes >= 256 (offset 512 bytes)
BB33_12:
mov.f32 %f70, %f72;
setp.lt.u32 %p8, %r9, 256;
@%p8 bra BB33_16;
setp.gt.u32 %p9, %r6, 127;
mov.f32 %f71, %f70;
@%p9 bra BB33_15;
ld.shared.f32 %f34, [%rd1+512];
add.f32 %f71, %f70, %f34;
st.shared.f32 [%rd1], %f71;
BB33_15:
mov.f32 %f70, %f71;
bar.sync 0;
// block sizes >= 128 (offset 256 bytes)
BB33_16:
mov.f32 %f68, %f70;
setp.lt.u32 %p10, %r9, 128;
@%p10 bra BB33_20;
setp.gt.u32 %p11, %r6, 63;
mov.f32 %f69, %f68;
@%p11 bra BB33_19;
ld.shared.f32 %f35, [%rd1+256];
add.f32 %f69, %f68, %f35;
st.shared.f32 [%rd1], %f69;
BB33_19:
mov.f32 %f68, %f69;
bar.sync 0;
// final 32 slots: only tid.x < 32 continues; each step uses volatile
// shared loads/stores with no bar.sync between them (relies on the
// threads of one warp executing these steps together)
BB33_20:
mov.f32 %f67, %f68;
setp.gt.u32 %p12, %r6, 31;
@%p12 bra BB33_33;
setp.lt.u32 %p13, %r9, 64;
@%p13 bra BB33_23;
ld.volatile.shared.f32 %f36, [%rd1+128];
add.f32 %f67, %f67, %f36;
st.volatile.shared.f32 [%rd1], %f67;
BB33_23:
mov.f32 %f66, %f67;
setp.lt.u32 %p14, %r9, 32;
@%p14 bra BB33_25;
ld.volatile.shared.f32 %f37, [%rd1+64];
add.f32 %f66, %f66, %f37;
st.volatile.shared.f32 [%rd1], %f66;
BB33_25:
mov.f32 %f65, %f66;
setp.lt.u32 %p15, %r9, 16;
@%p15 bra BB33_27;
ld.volatile.shared.f32 %f38, [%rd1+32];
add.f32 %f65, %f65, %f38;
st.volatile.shared.f32 [%rd1], %f65;
BB33_27:
mov.f32 %f64, %f65;
setp.lt.u32 %p16, %r9, 8;
@%p16 bra BB33_29;
ld.volatile.shared.f32 %f39, [%rd1+16];
add.f32 %f64, %f64, %f39;
st.volatile.shared.f32 [%rd1], %f64;
BB33_29:
mov.f32 %f63, %f64;
setp.lt.u32 %p17, %r9, 4;
@%p17 bra BB33_31;
ld.volatile.shared.f32 %f40, [%rd1+8];
add.f32 %f63, %f63, %f40;
st.volatile.shared.f32 [%rd1], %f63;
BB33_31:
setp.lt.u32 %p18, %r9, 2;
@%p18 bra BB33_33;
ld.volatile.shared.f32 %f41, [%rd1+4];
add.f32 %f42, %f63, %f41;
st.volatile.shared.f32 [%rd1], %f42;
// thread 0 publishes the block total: output[ctaid.x] = my_sdata[0]
BB33_33:
setp.ne.s32 %p19, %r6, 0;
@%p19 bra BB33_35;
ld.shared.f32 %f43, [my_sdata];
cvta.to.global.u64 %rd12, %rd3;
mul.wide.u32 %rd13, %r7, 4;
add.s64 %rd14, %rd12, %rd13;
st.global.f32 [%rd14], %f43;
BB33_35:
ret;
}
// .globl reduce_row_sum_d
//
// reduce_row_sum_d: per-row sum of an f64 matrix (row length = param_3).
//   param_0 : input matrix (global f64*), indexed as A[row*cols + col]
//   param_1 : output vector (global f64*), one sum per row
//   param_2 : rows, param_3 : cols
// Block ctaid.x handles one row; each thread sums elements at columns
// tid.x, tid.x+ntid.x, ..., then the block tree-reduces in the extern
// shared buffer my_sdata (8-byte f64 slots) and thread 0 stores the
// row total to output[ctaid.x].
//
.visible .entry reduce_row_sum_d(
.param .u64 reduce_row_sum_d_param_0,
.param .u64 reduce_row_sum_d_param_1,
.param .u32 reduce_row_sum_d_param_2,
.param .u32 reduce_row_sum_d_param_3
)
{
.reg .pred %p<20>;
.reg .b32 %r<39>;
.reg .f64 %fd<74>;
.reg .b64 %rd<42>;
ld.param.u64 %rd1, [reduce_row_sum_d_param_0];
ld.param.u64 %rd2, [reduce_row_sum_d_param_1];
ld.param.u32 %r5, [reduce_row_sum_d_param_2];
ld.param.u32 %r4, [reduce_row_sum_d_param_3];
mov.u32 %r6, %ctaid.x;
// whole block exits if its row index is out of range
setp.ge.u32 %p1, %r6, %r5;
@%p1 bra BB34_35;
mov.u32 %r38, %tid.x;
mov.f64 %fd72, 0d0000000000000000;
mov.f64 %fd73, %fd72;
setp.ge.u32 %p2, %r38, %r4;
@%p2 bra BB34_4;
cvta.to.global.u64 %rd3, %rd1;
// accumulation loop over this row: col advances by ntid.x
BB34_3:
mad.lo.s32 %r8, %r6, %r4, %r38;
mul.wide.u32 %rd4, %r8, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd28, [%rd5];
add.f64 %fd73, %fd73, %fd28;
mov.u32 %r9, %ntid.x;
add.s32 %r38, %r9, %r38;
setp.lt.u32 %p3, %r38, %r4;
mov.f64 %fd72, %fd73;
@%p3 bra BB34_3;
// store this thread's partial to shared memory slot my_sdata[tid.x]
BB34_4:
mov.f64 %fd70, %fd72;
mov.u32 %r10, %tid.x;
mul.wide.u32 %rd6, %r10, 8;
mov.u64 %rd7, my_sdata;
add.s64 %rd8, %rd7, %rd6;
st.shared.f64 [%rd8], %fd70;
bar.sync 0;
mov.u32 %r11, %ntid.x;
// tree reduction: if ntid.x >= 1024, threads < 512 fold in the upper
// half (byte offset 4096 = 512 f64 slots), then barrier
setp.lt.u32 %p4, %r11, 1024;
@%p4 bra BB34_8;
setp.gt.u32 %p5, %r10, 511;
mov.f64 %fd71, %fd70;
@%p5 bra BB34_7;
ld.shared.f64 %fd29, [%rd8+4096];
add.f64 %fd71, %fd70, %fd29;
st.shared.f64 [%rd8], %fd71;
BB34_7:
mov.f64 %fd70, %fd71;
bar.sync 0;
// same halving step for block sizes >= 512 (offset 2048 bytes)
BB34_8:
mov.f64 %fd68, %fd70;
setp.lt.u32 %p6, %r11, 512;
@%p6 bra BB34_12;
setp.gt.u32 %p7, %r10, 255;
mov.f64 %fd69, %fd68;
@%p7 bra BB34_11;
ld.shared.f64 %fd30, [%rd8+2048];
add.f64 %fd69, %fd68, %fd30;
st.shared.f64 [%rd8], %fd69;
BB34_11:
mov.f64 %fd68, %fd69;
bar.sync 0;
// block sizes >= 256 (offset 1024 bytes)
BB34_12:
mov.f64 %fd66, %fd68;
setp.lt.u32 %p8, %r11, 256;
@%p8 bra BB34_16;
setp.gt.u32 %p9, %r10, 127;
mov.f64 %fd67, %fd66;
@%p9 bra BB34_15;
ld.shared.f64 %fd31, [%rd8+1024];
add.f64 %fd67, %fd66, %fd31;
st.shared.f64 [%rd8], %fd67;
BB34_15:
mov.f64 %fd66, %fd67;
bar.sync 0;
// block sizes >= 128 (offset 512 bytes)
BB34_16:
mov.f64 %fd64, %fd66;
setp.lt.u32 %p10, %r11, 128;
@%p10 bra BB34_20;
setp.gt.u32 %p11, %r10, 63;
mov.f64 %fd65, %fd64;
@%p11 bra BB34_19;
ld.shared.f64 %fd32, [%rd8+512];
add.f64 %fd65, %fd64, %fd32;
st.shared.f64 [%rd8], %fd65;
BB34_19:
mov.f64 %fd64, %fd65;
bar.sync 0;
// final 32 slots: only tid.x < 32 continues; volatile shared accesses,
// no bar.sync between steps (warp-level phase of the reduction)
BB34_20:
mov.f64 %fd63, %fd64;
setp.gt.u32 %p12, %r10, 31;
@%p12 bra BB34_33;
setp.lt.u32 %p13, %r11, 64;
@%p13 bra BB34_23;
ld.volatile.shared.f64 %fd33, [%rd8+256];
add.f64 %fd63, %fd63, %fd33;
st.volatile.shared.f64 [%rd8], %fd63;
BB34_23:
mov.f64 %fd62, %fd63;
setp.lt.u32 %p14, %r11, 32;
@%p14 bra BB34_25;
ld.volatile.shared.f64 %fd34, [%rd8+128];
add.f64 %fd62, %fd62, %fd34;
st.volatile.shared.f64 [%rd8], %fd62;
BB34_25:
mov.f64 %fd61, %fd62;
setp.lt.u32 %p15, %r11, 16;
@%p15 bra BB34_27;
ld.volatile.shared.f64 %fd35, [%rd8+64];
add.f64 %fd61, %fd61, %fd35;
st.volatile.shared.f64 [%rd8], %fd61;
BB34_27:
mov.f64 %fd60, %fd61;
setp.lt.u32 %p16, %r11, 8;
@%p16 bra BB34_29;
ld.volatile.shared.f64 %fd36, [%rd8+32];
add.f64 %fd60, %fd60, %fd36;
st.volatile.shared.f64 [%rd8], %fd60;
BB34_29:
mov.f64 %fd59, %fd60;
setp.lt.u32 %p17, %r11, 4;
@%p17 bra BB34_31;
ld.volatile.shared.f64 %fd37, [%rd8+16];
add.f64 %fd59, %fd59, %fd37;
st.volatile.shared.f64 [%rd8], %fd59;
BB34_31:
setp.lt.u32 %p18, %r11, 2;
@%p18 bra BB34_33;
ld.volatile.shared.f64 %fd38, [%rd8+8];
add.f64 %fd39, %fd59, %fd38;
st.volatile.shared.f64 [%rd8], %fd39;
// thread 0 publishes the row total: output[ctaid.x] = my_sdata[0]
BB34_33:
setp.ne.s32 %p19, %r10, 0;
@%p19 bra BB34_35;
ld.shared.f64 %fd40, [my_sdata];
cvta.to.global.u64 %rd39, %rd2;
mul.wide.u32 %rd40, %r6, 8;
add.s64 %rd41, %rd39, %rd40;
st.global.f64 [%rd41], %fd40;
BB34_35:
ret;
}
// .globl reduce_row_sum_f
//
// reduce_row_sum_f: per-row sum of an f32 matrix (row length = param_3).
//   param_0 : input matrix (global f32*), indexed as A[row*cols + col]
//   param_1 : output vector (global f32*), one sum per row
//   param_2 : rows, param_3 : cols
// Block ctaid.x handles one row; each thread sums elements at columns
// tid.x, tid.x+ntid.x, ..., then the block tree-reduces in the extern
// shared buffer my_sdata (4-byte f32 slots) and thread 0 stores the
// row total to output[ctaid.x].
//
.visible .entry reduce_row_sum_f(
.param .u64 reduce_row_sum_f_param_0,
.param .u64 reduce_row_sum_f_param_1,
.param .u32 reduce_row_sum_f_param_2,
.param .u32 reduce_row_sum_f_param_3
)
{
.reg .pred %p<20>;
.reg .f32 %f<74>;
.reg .b32 %r<39>;
.reg .b64 %rd<42>;
ld.param.u64 %rd1, [reduce_row_sum_f_param_0];
ld.param.u64 %rd2, [reduce_row_sum_f_param_1];
ld.param.u32 %r5, [reduce_row_sum_f_param_2];
ld.param.u32 %r4, [reduce_row_sum_f_param_3];
mov.u32 %r6, %ctaid.x;
// whole block exits if its row index is out of range
setp.ge.u32 %p1, %r6, %r5;
@%p1 bra BB35_35;
mov.u32 %r38, %tid.x;
mov.f32 %f72, 0f00000000;
mov.f32 %f73, %f72;
setp.ge.u32 %p2, %r38, %r4;
@%p2 bra BB35_4;
cvta.to.global.u64 %rd3, %rd1;
// accumulation loop over this row: col advances by ntid.x
BB35_3:
mad.lo.s32 %r8, %r6, %r4, %r38;
mul.wide.u32 %rd4, %r8, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f28, [%rd5];
add.f32 %f73, %f73, %f28;
mov.u32 %r9, %ntid.x;
add.s32 %r38, %r9, %r38;
setp.lt.u32 %p3, %r38, %r4;
mov.f32 %f72, %f73;
@%p3 bra BB35_3;
// store this thread's partial to shared memory slot my_sdata[tid.x]
BB35_4:
mov.f32 %f70, %f72;
mov.u32 %r10, %tid.x;
mul.wide.u32 %rd6, %r10, 4;
mov.u64 %rd7, my_sdata;
add.s64 %rd8, %rd7, %rd6;
st.shared.f32 [%rd8], %f70;
bar.sync 0;
mov.u32 %r11, %ntid.x;
// tree reduction: if ntid.x >= 1024, threads < 512 fold in the upper
// half (byte offset 2048 = 512 f32 slots), then barrier
setp.lt.u32 %p4, %r11, 1024;
@%p4 bra BB35_8;
setp.gt.u32 %p5, %r10, 511;
mov.f32 %f71, %f70;
@%p5 bra BB35_7;
ld.shared.f32 %f29, [%rd8+2048];
add.f32 %f71, %f70, %f29;
st.shared.f32 [%rd8], %f71;
BB35_7:
mov.f32 %f70, %f71;
bar.sync 0;
// same halving step for block sizes >= 512 (offset 1024 bytes)
BB35_8:
mov.f32 %f68, %f70;
setp.lt.u32 %p6, %r11, 512;
@%p6 bra BB35_12;
setp.gt.u32 %p7, %r10, 255;
mov.f32 %f69, %f68;
@%p7 bra BB35_11;
ld.shared.f32 %f30, [%rd8+1024];
add.f32 %f69, %f68, %f30;
st.shared.f32 [%rd8], %f69;
BB35_11:
mov.f32 %f68, %f69;
bar.sync 0;
// block sizes >= 256 (offset 512 bytes)
BB35_12:
mov.f32 %f66, %f68;
setp.lt.u32 %p8, %r11, 256;
@%p8 bra BB35_16;
setp.gt.u32 %p9, %r10, 127;
mov.f32 %f67, %f66;
@%p9 bra BB35_15;
ld.shared.f32 %f31, [%rd8+512];
add.f32 %f67, %f66, %f31;
st.shared.f32 [%rd8], %f67;
BB35_15:
mov.f32 %f66, %f67;
bar.sync 0;
// block sizes >= 128 (offset 256 bytes)
BB35_16:
mov.f32 %f64, %f66;
setp.lt.u32 %p10, %r11, 128;
@%p10 bra BB35_20;
setp.gt.u32 %p11, %r10, 63;
mov.f32 %f65, %f64;
@%p11 bra BB35_19;
ld.shared.f32 %f32, [%rd8+256];
add.f32 %f65, %f64, %f32;
st.shared.f32 [%rd8], %f65;
BB35_19:
mov.f32 %f64, %f65;
bar.sync 0;
// final 32 slots: only tid.x < 32 continues; volatile shared accesses,
// no bar.sync between steps (warp-level phase of the reduction)
BB35_20:
mov.f32 %f63, %f64;
setp.gt.u32 %p12, %r10, 31;
@%p12 bra BB35_33;
setp.lt.u32 %p13, %r11, 64;
@%p13 bra BB35_23;
ld.volatile.shared.f32 %f33, [%rd8+128];
add.f32 %f63, %f63, %f33;
st.volatile.shared.f32 [%rd8], %f63;
BB35_23:
mov.f32 %f62, %f63;
setp.lt.u32 %p14, %r11, 32;
@%p14 bra BB35_25;
ld.volatile.shared.f32 %f34, [%rd8+64];
add.f32 %f62, %f62, %f34;
st.volatile.shared.f32 [%rd8], %f62;
BB35_25:
mov.f32 %f61, %f62;
setp.lt.u32 %p15, %r11, 16;
@%p15 bra BB35_27;
ld.volatile.shared.f32 %f35, [%rd8+32];
add.f32 %f61, %f61, %f35;
st.volatile.shared.f32 [%rd8], %f61;
BB35_27:
mov.f32 %f60, %f61;
setp.lt.u32 %p16, %r11, 8;
@%p16 bra BB35_29;
ld.volatile.shared.f32 %f36, [%rd8+16];
add.f32 %f60, %f60, %f36;
st.volatile.shared.f32 [%rd8], %f60;
BB35_29:
mov.f32 %f59, %f60;
setp.lt.u32 %p17, %r11, 4;
@%p17 bra BB35_31;
ld.volatile.shared.f32 %f37, [%rd8+8];
add.f32 %f59, %f59, %f37;
st.volatile.shared.f32 [%rd8], %f59;
BB35_31:
setp.lt.u32 %p18, %r11, 2;
@%p18 bra BB35_33;
ld.volatile.shared.f32 %f38, [%rd8+4];
add.f32 %f39, %f59, %f38;
st.volatile.shared.f32 [%rd8], %f39;
// thread 0 publishes the row total: output[ctaid.x] = my_sdata[0]
BB35_33:
setp.ne.s32 %p19, %r10, 0;
@%p19 bra BB35_35;
ld.shared.f32 %f40, [my_sdata];
cvta.to.global.u64 %rd39, %rd2;
mul.wide.u32 %rd40, %r6, 4;
add.s64 %rd41, %rd39, %rd40;
st.global.f32 [%rd41], %f40;
BB35_35:
ret;
}
// .globl reduce_col_sum_d
//
// reduce_col_sum_d: per-column sum of an f64 matrix.
//   param_0 : input matrix (global f64*), indexed as A[r*cols + c]
//   param_1 : output vector (global f64*), one sum per column
//   param_2 : rows, param_3 : cols
// Each thread owns one column c = ntid.x*ctaid.x + tid.x and serially
// sums the elements at linear indices c, c+cols, c+2*cols, ...
// (all < rows*cols), then stores the result to output[c].
//
.visible .entry reduce_col_sum_d(
.param .u64 reduce_col_sum_d_param_0,
.param .u64 reduce_col_sum_d_param_1,
.param .u32 reduce_col_sum_d_param_2,
.param .u32 reduce_col_sum_d_param_3
)
{
.reg .pred %p<4>;
.reg .b32 %r<11>;
.reg .f64 %fd<10>;
.reg .b64 %rd<9>;
ld.param.u64 %rd2, [reduce_col_sum_d_param_0];
ld.param.u64 %rd3, [reduce_col_sum_d_param_1];
ld.param.u32 %r5, [reduce_col_sum_d_param_2];
ld.param.u32 %r6, [reduce_col_sum_d_param_3];
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %ctaid.x;
mov.u32 %r9, %tid.x;
// global column index; exit if beyond the number of columns
mad.lo.s32 %r1, %r7, %r8, %r9;
setp.ge.u32 %p1, %r1, %r6;
@%p1 bra BB36_5;
cvta.to.global.u64 %rd1, %rd2;
// %r2 = rows*cols = total element count; accumulator = 0.0
mul.lo.s32 %r2, %r6, %r5;
mov.f64 %fd8, 0d0000000000000000;
mov.f64 %fd9, %fd8;
setp.ge.u32 %p2, %r1, %r2;
@%p2 bra BB36_4;
mov.u32 %r10, %r1;
// walk down the column: linear index advances by cols each step
BB36_3:
mov.u32 %r3, %r10;
mul.wide.u32 %rd4, %r3, 8;
add.s64 %rd5, %rd1, %rd4;
ld.global.f64 %fd6, [%rd5];
add.f64 %fd9, %fd9, %fd6;
add.s32 %r4, %r3, %r6;
setp.lt.u32 %p3, %r4, %r2;
mov.u32 %r10, %r4;
mov.f64 %fd8, %fd9;
@%p3 bra BB36_3;
// store the column total: output[c] = accumulator
BB36_4:
cvta.to.global.u64 %rd6, %rd3;
mul.wide.u32 %rd7, %r1, 8;
add.s64 %rd8, %rd6, %rd7;
st.global.f64 [%rd8], %fd8;
BB36_5:
ret;
}
// .globl reduce_col_sum_f
//
// reduce_col_sum_f: per-column sum of an f32 matrix.
//   param_0 : input matrix (global f32*), indexed as A[r*cols + c]
//   param_1 : output vector (global f32*), one sum per column
//   param_2 : rows, param_3 : cols
// Each thread owns one column c = ntid.x*ctaid.x + tid.x and serially
// sums the elements at linear indices c, c+cols, c+2*cols, ...
// (all < rows*cols), then stores the result to output[c].
//
.visible .entry reduce_col_sum_f(
.param .u64 reduce_col_sum_f_param_0,
.param .u64 reduce_col_sum_f_param_1,
.param .u32 reduce_col_sum_f_param_2,
.param .u32 reduce_col_sum_f_param_3
)
{
.reg .pred %p<4>;
.reg .f32 %f<10>;
.reg .b32 %r<11>;
.reg .b64 %rd<9>;
ld.param.u64 %rd2, [reduce_col_sum_f_param_0];
ld.param.u64 %rd3, [reduce_col_sum_f_param_1];
ld.param.u32 %r5, [reduce_col_sum_f_param_2];
ld.param.u32 %r6, [reduce_col_sum_f_param_3];
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %ctaid.x;
mov.u32 %r9, %tid.x;
// global column index; exit if beyond the number of columns
mad.lo.s32 %r1, %r7, %r8, %r9;
setp.ge.u32 %p1, %r1, %r6;
@%p1 bra BB37_5;
cvta.to.global.u64 %rd1, %rd2;
// %r2 = rows*cols = total element count; accumulator = 0.0f
mul.lo.s32 %r2, %r6, %r5;
mov.f32 %f8, 0f00000000;
mov.f32 %f9, %f8;
setp.ge.u32 %p2, %r1, %r2;
@%p2 bra BB37_4;
mov.u32 %r10, %r1;
// walk down the column: linear index advances by cols each step
BB37_3:
mov.u32 %r3, %r10;
mul.wide.u32 %rd4, %r3, 4;
add.s64 %rd5, %rd1, %rd4;
ld.global.f32 %f6, [%rd5];
add.f32 %f9, %f9, %f6;
add.s32 %r4, %r3, %r6;
setp.lt.u32 %p3, %r4, %r2;
mov.u32 %r10, %r4;
mov.f32 %f8, %f9;
@%p3 bra BB37_3;
// store the column total: output[c] = accumulator
BB37_4:
cvta.to.global.u64 %rd6, %rd3;
mul.wide.u32 %rd7, %r1, 4;
add.s64 %rd8, %rd6, %rd7;
st.global.f32 [%rd8], %f8;
BB37_5:
ret;
}
// .globl reduce_max_d
//
// reduce_max_d: block-wise max reduction of an f64 array.
//   param_0 : input array (global f64*)
//   param_1 : output array (global f64*), one partial max per block
//   param_2 : n, number of input elements
// Same shape as the sum kernels but with max.f64 as the combiner and
// the accumulator seeded with 0dFFEFFFFFFFFFFFFF (-DBL_MAX, the most
// negative finite double). Each thread folds input[i] and
// input[i+ntid.x] while i advances by 2*ntid.x*nctaid.x; the block
// tree-reduces in my_sdata and thread 0 stores output[ctaid.x].
//
.visible .entry reduce_max_d(
.param .u64 reduce_max_d_param_0,
.param .u64 reduce_max_d_param_1,
.param .u32 reduce_max_d_param_2
)
{
.reg .pred %p<20>;
.reg .b32 %r<33>;
.reg .f64 %fd<79>;
.reg .b64 %rd<15>;
ld.param.u64 %rd2, [reduce_max_d_param_0];
ld.param.u64 %rd3, [reduce_max_d_param_1];
ld.param.u32 %r5, [reduce_max_d_param_2];
mov.u32 %r6, %tid.x;
mov.u32 %r7, %ctaid.x;
shl.b32 %r8, %r7, 1;
mov.u32 %r9, %ntid.x;
// initial index: i = (2*ctaid.x)*ntid.x + tid.x; accumulator = -DBL_MAX
mad.lo.s32 %r32, %r8, %r9, %r6;
mov.f64 %fd76, 0dFFEFFFFFFFFFFFFF;
mov.f64 %fd77, %fd76;
setp.ge.u32 %p1, %r32, %r5;
@%p1 bra BB38_4;
// accumulation loop: fold input[i], and input[i+ntid.x] when in range,
// then advance i by 2*ntid.x*nctaid.x until i >= n
BB38_1:
mov.f64 %fd1, %fd77;
cvta.to.global.u64 %rd4, %rd2;
mul.wide.u32 %rd5, %r32, 8;
add.s64 %rd6, %rd4, %rd5;
ld.global.f64 %fd30, [%rd6];
max.f64 %fd78, %fd1, %fd30;
add.s32 %r3, %r32, %r9;
setp.ge.u32 %p2, %r3, %r5;
@%p2 bra BB38_3;
mul.wide.u32 %rd8, %r3, 8;
add.s64 %rd9, %rd4, %rd8;
ld.global.f64 %fd31, [%rd9];
max.f64 %fd78, %fd78, %fd31;
BB38_3:
mov.f64 %fd77, %fd78;
shl.b32 %r12, %r9, 1;
mov.u32 %r13, %nctaid.x;
mad.lo.s32 %r32, %r12, %r13, %r32;
setp.lt.u32 %p3, %r32, %r5;
mov.f64 %fd76, %fd77;
@%p3 bra BB38_1;
// store this thread's partial to shared memory slot my_sdata[tid.x]
BB38_4:
mov.f64 %fd74, %fd76;
mul.wide.u32 %rd10, %r6, 8;
mov.u64 %rd11, my_sdata;
add.s64 %rd1, %rd11, %rd10;
st.shared.f64 [%rd1], %fd74;
bar.sync 0;
// tree reduction: if ntid.x >= 1024, threads < 512 fold in the upper
// half (byte offset 4096 = 512 f64 slots), then barrier
setp.lt.u32 %p4, %r9, 1024;
@%p4 bra BB38_8;
setp.gt.u32 %p5, %r6, 511;
mov.f64 %fd75, %fd74;
@%p5 bra BB38_7;
ld.shared.f64 %fd32, [%rd1+4096];
max.f64 %fd75, %fd74, %fd32;
st.shared.f64 [%rd1], %fd75;
BB38_7:
mov.f64 %fd74, %fd75;
bar.sync 0;
// same halving step for block sizes >= 512 (offset 2048 bytes)
BB38_8:
mov.f64 %fd72, %fd74;
setp.lt.u32 %p6, %r9, 512;
@%p6 bra BB38_12;
setp.gt.u32 %p7, %r6, 255;
mov.f64 %fd73, %fd72;
@%p7 bra BB38_11;
ld.shared.f64 %fd33, [%rd1+2048];
max.f64 %fd73, %fd72, %fd33;
st.shared.f64 [%rd1], %fd73;
BB38_11:
mov.f64 %fd72, %fd73;
bar.sync 0;
// block sizes >= 256 (offset 1024 bytes)
BB38_12:
mov.f64 %fd70, %fd72;
setp.lt.u32 %p8, %r9, 256;
@%p8 bra BB38_16;
setp.gt.u32 %p9, %r6, 127;
mov.f64 %fd71, %fd70;
@%p9 bra BB38_15;
ld.shared.f64 %fd34, [%rd1+1024];
max.f64 %fd71, %fd70, %fd34;
st.shared.f64 [%rd1], %fd71;
BB38_15:
mov.f64 %fd70, %fd71;
bar.sync 0;
// block sizes >= 128 (offset 512 bytes)
BB38_16:
mov.f64 %fd68, %fd70;
setp.lt.u32 %p10, %r9, 128;
@%p10 bra BB38_20;
setp.gt.u32 %p11, %r6, 63;
mov.f64 %fd69, %fd68;
@%p11 bra BB38_19;
ld.shared.f64 %fd35, [%rd1+512];
max.f64 %fd69, %fd68, %fd35;
st.shared.f64 [%rd1], %fd69;
BB38_19:
mov.f64 %fd68, %fd69;
bar.sync 0;
// final 32 slots: only tid.x < 32 continues; volatile shared accesses,
// no bar.sync between steps (warp-level phase of the reduction)
BB38_20:
mov.f64 %fd67, %fd68;
setp.gt.u32 %p12, %r6, 31;
@%p12 bra BB38_33;
setp.lt.u32 %p13, %r9, 64;
@%p13 bra BB38_23;
ld.volatile.shared.f64 %fd36, [%rd1+256];
max.f64 %fd67, %fd67, %fd36;
st.volatile.shared.f64 [%rd1], %fd67;
BB38_23:
mov.f64 %fd66, %fd67;
setp.lt.u32 %p14, %r9, 32;
@%p14 bra BB38_25;
ld.volatile.shared.f64 %fd37, [%rd1+128];
max.f64 %fd66, %fd66, %fd37;
st.volatile.shared.f64 [%rd1], %fd66;
BB38_25:
mov.f64 %fd65, %fd66;
setp.lt.u32 %p15, %r9, 16;
@%p15 bra BB38_27;
ld.volatile.shared.f64 %fd38, [%rd1+64];
max.f64 %fd65, %fd65, %fd38;
st.volatile.shared.f64 [%rd1], %fd65;
BB38_27:
mov.f64 %fd64, %fd65;
setp.lt.u32 %p16, %r9, 8;
@%p16 bra BB38_29;
ld.volatile.shared.f64 %fd39, [%rd1+32];
max.f64 %fd64, %fd64, %fd39;
st.volatile.shared.f64 [%rd1], %fd64;
BB38_29:
mov.f64 %fd63, %fd64;
setp.lt.u32 %p17, %r9, 4;
@%p17 bra BB38_31;
ld.volatile.shared.f64 %fd40, [%rd1+16];
max.f64 %fd63, %fd63, %fd40;
st.volatile.shared.f64 [%rd1], %fd63;
BB38_31:
setp.lt.u32 %p18, %r9, 2;
@%p18 bra BB38_33;
ld.volatile.shared.f64 %fd41, [%rd1+8];
max.f64 %fd42, %fd63, %fd41;
st.volatile.shared.f64 [%rd1], %fd42;
// thread 0 publishes the block max: output[ctaid.x] = my_sdata[0]
BB38_33:
setp.ne.s32 %p19, %r6, 0;
@%p19 bra BB38_35;
ld.shared.f64 %fd43, [my_sdata];
cvta.to.global.u64 %rd12, %rd3;
mul.wide.u32 %rd13, %r7, 8;
add.s64 %rd14, %rd12, %rd13;
st.global.f64 [%rd14], %fd43;
BB38_35:
ret;
}
// .globl reduce_max_f
//
// reduce_max_f: block-wise max reduction of an f32 array.
//   param_0 : input array (global f32*)
//   param_1 : output array (global f32*), one partial max per block
//   param_2 : n, number of input elements
// Same shape as reduce_sum_f but with max.f32 as the combiner and the
// accumulator seeded with 0fFF7FFFFF (-FLT_MAX, the most negative
// finite float). Each thread folds input[i] and input[i+ntid.x] while
// i advances by 2*ntid.x*nctaid.x; the block tree-reduces in my_sdata
// and thread 0 stores output[ctaid.x].
//
.visible .entry reduce_max_f(
.param .u64 reduce_max_f_param_0,
.param .u64 reduce_max_f_param_1,
.param .u32 reduce_max_f_param_2
)
{
.reg .pred %p<20>;
.reg .f32 %f<79>;
.reg .b32 %r<33>;
.reg .b64 %rd<15>;
ld.param.u64 %rd2, [reduce_max_f_param_0];
ld.param.u64 %rd3, [reduce_max_f_param_1];
ld.param.u32 %r5, [reduce_max_f_param_2];
mov.u32 %r6, %tid.x;
mov.u32 %r7, %ctaid.x;
shl.b32 %r8, %r7, 1;
mov.u32 %r9, %ntid.x;
// initial index: i = (2*ctaid.x)*ntid.x + tid.x; accumulator = -FLT_MAX
mad.lo.s32 %r32, %r8, %r9, %r6;
mov.f32 %f76, 0fFF7FFFFF;
mov.f32 %f77, %f76;
setp.ge.u32 %p1, %r32, %r5;
@%p1 bra BB39_4;
// accumulation loop: fold input[i], and input[i+ntid.x] when in range,
// then advance i by 2*ntid.x*nctaid.x until i >= n
BB39_1:
mov.f32 %f1, %f77;
cvta.to.global.u64 %rd4, %rd2;
mul.wide.u32 %rd5, %r32, 4;
add.s64 %rd6, %rd4, %rd5;
ld.global.f32 %f30, [%rd6];
max.f32 %f78, %f1, %f30;
add.s32 %r3, %r32, %r9;
setp.ge.u32 %p2, %r3, %r5;
@%p2 bra BB39_3;
mul.wide.u32 %rd8, %r3, 4;
add.s64 %rd9, %rd4, %rd8;
ld.global.f32 %f31, [%rd9];
max.f32 %f78, %f78, %f31;
BB39_3:
mov.f32 %f77, %f78;
shl.b32 %r12, %r9, 1;
mov.u32 %r13, %nctaid.x;
mad.lo.s32 %r32, %r12, %r13, %r32;
setp.lt.u32 %p3, %r32, %r5;
mov.f32 %f76, %f77;
@%p3 bra BB39_1;
// store this thread's partial to shared memory slot my_sdata[tid.x]
BB39_4:
mov.f32 %f74, %f76;
mul.wide.u32 %rd10, %r6, 4;
mov.u64 %rd11, my_sdata;
add.s64 %rd1, %rd11, %rd10;
st.shared.f32 [%rd1], %f74;
bar.sync 0;
// tree reduction: if ntid.x >= 1024, threads < 512 fold in the upper
// half (byte offset 2048 = 512 f32 slots), then barrier
setp.lt.u32 %p4, %r9, 1024;
@%p4 bra BB39_8;
setp.gt.u32 %p5, %r6, 511;
mov.f32 %f75, %f74;
@%p5 bra BB39_7;
ld.shared.f32 %f32, [%rd1+2048];
max.f32 %f75, %f74, %f32;
st.shared.f32 [%rd1], %f75;
BB39_7:
mov.f32 %f74, %f75;
bar.sync 0;
// same halving step for block sizes >= 512 (offset 1024 bytes)
BB39_8:
mov.f32 %f72, %f74;
setp.lt.u32 %p6, %r9, 512;
@%p6 bra BB39_12;
setp.gt.u32 %p7, %r6, 255;
mov.f32 %f73, %f72;
@%p7 bra BB39_11;
ld.shared.f32 %f33, [%rd1+1024];
max.f32 %f73, %f72, %f33;
st.shared.f32 [%rd1], %f73;
BB39_11:
mov.f32 %f72, %f73;
bar.sync 0;
// block sizes >= 256 (offset 512 bytes)
BB39_12:
mov.f32 %f70, %f72;
setp.lt.u32 %p8, %r9, 256;
@%p8 bra BB39_16;
setp.gt.u32 %p9, %r6, 127;
mov.f32 %f71, %f70;
@%p9 bra BB39_15;
ld.shared.f32 %f34, [%rd1+512];
max.f32 %f71, %f70, %f34;
st.shared.f32 [%rd1], %f71;
BB39_15:
mov.f32 %f70, %f71;
bar.sync 0;
// block sizes >= 128 (offset 256 bytes)
BB39_16:
mov.f32 %f68, %f70;
setp.lt.u32 %p10, %r9, 128;
@%p10 bra BB39_20;
setp.gt.u32 %p11, %r6, 63;
mov.f32 %f69, %f68;
@%p11 bra BB39_19;
ld.shared.f32 %f35, [%rd1+256];
max.f32 %f69, %f68, %f35;
st.shared.f32 [%rd1], %f69;
BB39_19:
mov.f32 %f68, %f69;
bar.sync 0;
// final 32 slots: only tid.x < 32 continues; volatile shared accesses,
// no bar.sync between steps (warp-level phase of the reduction)
BB39_20:
mov.f32 %f67, %f68;
setp.gt.u32 %p12, %r6, 31;
@%p12 bra BB39_33;
setp.lt.u32 %p13, %r9, 64;
@%p13 bra BB39_23;
ld.volatile.shared.f32 %f36, [%rd1+128];
max.f32 %f67, %f67, %f36;
st.volatile.shared.f32 [%rd1], %f67;
BB39_23:
mov.f32 %f66, %f67;
setp.lt.u32 %p14, %r9, 32;
@%p14 bra BB39_25;
ld.volatile.shared.f32 %f37, [%rd1+64];
max.f32 %f66, %f66, %f37;
st.volatile.shared.f32 [%rd1], %f66;
BB39_25:
mov.f32 %f65, %f66;
setp.lt.u32 %p15, %r9, 16;
@%p15 bra BB39_27;
ld.volatile.shared.f32 %f38, [%rd1+32];
max.f32 %f65, %f65, %f38;
st.volatile.shared.f32 [%rd1], %f65;
BB39_27:
mov.f32 %f64, %f65;
setp.lt.u32 %p16, %r9, 8;
@%p16 bra BB39_29;
ld.volatile.shared.f32 %f39, [%rd1+16];
max.f32 %f64, %f64, %f39;
st.volatile.shared.f32 [%rd1], %f64;
BB39_29:
mov.f32 %f63, %f64;
setp.lt.u32 %p17, %r9, 4;
@%p17 bra BB39_31;
ld.volatile.shared.f32 %f40, [%rd1+8];
max.f32 %f63, %f63, %f40;
st.volatile.shared.f32 [%rd1], %f63;
BB39_31:
setp.lt.u32 %p18, %r9, 2;
@%p18 bra BB39_33;
ld.volatile.shared.f32 %f41, [%rd1+4];
max.f32 %f42, %f63, %f41;
st.volatile.shared.f32 [%rd1], %f42;
// thread 0 publishes the block max: output[ctaid.x] = my_sdata[0]
BB39_33:
setp.ne.s32 %p19, %r6, 0;
@%p19 bra BB39_35;
ld.shared.f32 %f43, [my_sdata];
cvta.to.global.u64 %rd12, %rd3;
mul.wide.u32 %rd13, %r7, 4;
add.s64 %rd14, %rd12, %rd13;
st.global.f32 [%rd14], %f43;
BB39_35:
ret;
}
// .globl reduce_row_max_d
//
// reduce_row_max_d: per-row max of an f64 matrix (row length = param_3).
//   param_0 : input matrix (global f64*), indexed as A[row*cols + col]
//   param_1 : output vector (global f64*), one max per row
//   param_2 : rows, param_3 : cols
// Block ctaid.x handles one row; the accumulator is seeded with
// 0dFFEFFFFFFFFFFFFF (-DBL_MAX). Each thread folds elements at columns
// tid.x, tid.x+ntid.x, ...; the block tree-reduces in my_sdata and
// thread 0 stores the row max to output[ctaid.x].
//
.visible .entry reduce_row_max_d(
.param .u64 reduce_row_max_d_param_0,
.param .u64 reduce_row_max_d_param_1,
.param .u32 reduce_row_max_d_param_2,
.param .u32 reduce_row_max_d_param_3
)
{
.reg .pred %p<20>;
.reg .b32 %r<39>;
.reg .f64 %fd<74>;
.reg .b64 %rd<42>;
ld.param.u64 %rd1, [reduce_row_max_d_param_0];
ld.param.u64 %rd2, [reduce_row_max_d_param_1];
ld.param.u32 %r5, [reduce_row_max_d_param_2];
ld.param.u32 %r4, [reduce_row_max_d_param_3];
mov.u32 %r6, %ctaid.x;
// whole block exits if its row index is out of range
setp.ge.u32 %p1, %r6, %r5;
@%p1 bra BB40_35;
mov.u32 %r38, %tid.x;
mov.f64 %fd72, 0dFFEFFFFFFFFFFFFF;
mov.f64 %fd73, %fd72;
setp.ge.u32 %p2, %r38, %r4;
@%p2 bra BB40_4;
cvta.to.global.u64 %rd3, %rd1;
// accumulation loop over this row: col advances by ntid.x
BB40_3:
mad.lo.s32 %r8, %r6, %r4, %r38;
mul.wide.u32 %rd4, %r8, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd28, [%rd5];
max.f64 %fd73, %fd73, %fd28;
mov.u32 %r9, %ntid.x;
add.s32 %r38, %r9, %r38;
setp.lt.u32 %p3, %r38, %r4;
mov.f64 %fd72, %fd73;
@%p3 bra BB40_3;
// store this thread's partial to shared memory slot my_sdata[tid.x]
BB40_4:
mov.f64 %fd70, %fd72;
mov.u32 %r10, %tid.x;
mul.wide.u32 %rd6, %r10, 8;
mov.u64 %rd7, my_sdata;
add.s64 %rd8, %rd7, %rd6;
st.shared.f64 [%rd8], %fd70;
bar.sync 0;
mov.u32 %r11, %ntid.x;
// tree reduction: if ntid.x >= 1024, threads < 512 fold in the upper
// half (byte offset 4096 = 512 f64 slots), then barrier
setp.lt.u32 %p4, %r11, 1024;
@%p4 bra BB40_8;
setp.gt.u32 %p5, %r10, 511;
mov.f64 %fd71, %fd70;
@%p5 bra BB40_7;
ld.shared.f64 %fd29, [%rd8+4096];
max.f64 %fd71, %fd70, %fd29;
st.shared.f64 [%rd8], %fd71;
BB40_7:
mov.f64 %fd70, %fd71;
bar.sync 0;
// same halving step for block sizes >= 512 (offset 2048 bytes)
BB40_8:
mov.f64 %fd68, %fd70;
setp.lt.u32 %p6, %r11, 512;
@%p6 bra BB40_12;
setp.gt.u32 %p7, %r10, 255;
mov.f64 %fd69, %fd68;
@%p7 bra BB40_11;
ld.shared.f64 %fd30, [%rd8+2048];
max.f64 %fd69, %fd68, %fd30;
st.shared.f64 [%rd8], %fd69;
BB40_11:
mov.f64 %fd68, %fd69;
bar.sync 0;
// block sizes >= 256 (offset 1024 bytes)
BB40_12:
mov.f64 %fd66, %fd68;
setp.lt.u32 %p8, %r11, 256;
@%p8 bra BB40_16;
setp.gt.u32 %p9, %r10, 127;
mov.f64 %fd67, %fd66;
@%p9 bra BB40_15;
ld.shared.f64 %fd31, [%rd8+1024];
max.f64 %fd67, %fd66, %fd31;
st.shared.f64 [%rd8], %fd67;
BB40_15:
mov.f64 %fd66, %fd67;
bar.sync 0;
// block sizes >= 128 (offset 512 bytes)
BB40_16:
mov.f64 %fd64, %fd66;
setp.lt.u32 %p10, %r11, 128;
@%p10 bra BB40_20;
setp.gt.u32 %p11, %r10, 63;
mov.f64 %fd65, %fd64;
@%p11 bra BB40_19;
ld.shared.f64 %fd32, [%rd8+512];
max.f64 %fd65, %fd64, %fd32;
st.shared.f64 [%rd8], %fd65;
BB40_19:
mov.f64 %fd64, %fd65;
bar.sync 0;
// final 32 slots: only tid.x < 32 continues; volatile shared accesses,
// no bar.sync between steps (warp-level phase of the reduction)
BB40_20:
mov.f64 %fd63, %fd64;
setp.gt.u32 %p12, %r10, 31;
@%p12 bra BB40_33;
setp.lt.u32 %p13, %r11, 64;
@%p13 bra BB40_23;
ld.volatile.shared.f64 %fd33, [%rd8+256];
max.f64 %fd63, %fd63, %fd33;
st.volatile.shared.f64 [%rd8], %fd63;
BB40_23:
mov.f64 %fd62, %fd63;
setp.lt.u32 %p14, %r11, 32;
@%p14 bra BB40_25;
ld.volatile.shared.f64 %fd34, [%rd8+128];
max.f64 %fd62, %fd62, %fd34;
st.volatile.shared.f64 [%rd8], %fd62;
BB40_25:
mov.f64 %fd61, %fd62;
setp.lt.u32 %p15, %r11, 16;
@%p15 bra BB40_27;
ld.volatile.shared.f64 %fd35, [%rd8+64];
max.f64 %fd61, %fd61, %fd35;
st.volatile.shared.f64 [%rd8], %fd61;
BB40_27:
mov.f64 %fd60, %fd61;
setp.lt.u32 %p16, %r11, 8;
@%p16 bra BB40_29;
ld.volatile.shared.f64 %fd36, [%rd8+32];
max.f64 %fd60, %fd60, %fd36;
st.volatile.shared.f64 [%rd8], %fd60;
BB40_29:
mov.f64 %fd59, %fd60;
setp.lt.u32 %p17, %r11, 4;
@%p17 bra BB40_31;
ld.volatile.shared.f64 %fd37, [%rd8+16];
max.f64 %fd59, %fd59, %fd37;
st.volatile.shared.f64 [%rd8], %fd59;
BB40_31:
setp.lt.u32 %p18, %r11, 2;
@%p18 bra BB40_33;
ld.volatile.shared.f64 %fd38, [%rd8+8];
max.f64 %fd39, %fd59, %fd38;
st.volatile.shared.f64 [%rd8], %fd39;
// thread 0 publishes the row max: output[ctaid.x] = my_sdata[0]
BB40_33:
setp.ne.s32 %p19, %r10, 0;
@%p19 bra BB40_35;
ld.shared.f64 %fd40, [my_sdata];
cvta.to.global.u64 %rd39, %rd2;
mul.wide.u32 %rd40, %r6, 8;
add.s64 %rd41, %rd39, %rd40;
st.global.f64 [%rd41], %fd40;
BB40_35:
ret;
}
// .globl reduce_row_max_f
//
// reduce_row_max_f: per-row max of an f32 matrix (row length = param_3).
//   param_0 : input matrix (global f32*), indexed as A[row*cols + col]
//   param_1 : output vector (global f32*), one max per row
//   param_2 : rows, param_3 : cols
// Block ctaid.x handles one row; the accumulator is seeded with
// 0fFF7FFFFF (-FLT_MAX). Each thread folds elements at columns
// tid.x, tid.x+ntid.x, ...; the block tree-reduces in my_sdata and
// thread 0 stores the row max to output[ctaid.x].
//
.visible .entry reduce_row_max_f(
.param .u64 reduce_row_max_f_param_0,
.param .u64 reduce_row_max_f_param_1,
.param .u32 reduce_row_max_f_param_2,
.param .u32 reduce_row_max_f_param_3
)
{
.reg .pred %p<20>;
.reg .f32 %f<74>;
.reg .b32 %r<39>;
.reg .b64 %rd<42>;
ld.param.u64 %rd1, [reduce_row_max_f_param_0];
ld.param.u64 %rd2, [reduce_row_max_f_param_1];
ld.param.u32 %r5, [reduce_row_max_f_param_2];
ld.param.u32 %r4, [reduce_row_max_f_param_3];
mov.u32 %r6, %ctaid.x;
// whole block exits if its row index is out of range
setp.ge.u32 %p1, %r6, %r5;
@%p1 bra BB41_35;
mov.u32 %r38, %tid.x;
mov.f32 %f72, 0fFF7FFFFF;
mov.f32 %f73, %f72;
setp.ge.u32 %p2, %r38, %r4;
@%p2 bra BB41_4;
cvta.to.global.u64 %rd3, %rd1;
// accumulation loop over this row: col advances by ntid.x
BB41_3:
mad.lo.s32 %r8, %r6, %r4, %r38;
mul.wide.u32 %rd4, %r8, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f28, [%rd5];
max.f32 %f73, %f73, %f28;
mov.u32 %r9, %ntid.x;
add.s32 %r38, %r9, %r38;
setp.lt.u32 %p3, %r38, %r4;
mov.f32 %f72, %f73;
@%p3 bra BB41_3;
// store this thread's partial to shared memory slot my_sdata[tid.x]
BB41_4:
mov.f32 %f70, %f72;
mov.u32 %r10, %tid.x;
mul.wide.u32 %rd6, %r10, 4;
mov.u64 %rd7, my_sdata;
add.s64 %rd8, %rd7, %rd6;
st.shared.f32 [%rd8], %f70;
bar.sync 0;
mov.u32 %r11, %ntid.x;
// tree reduction: if ntid.x >= 1024, threads < 512 fold in the upper
// half (byte offset 2048 = 512 f32 slots), then barrier
setp.lt.u32 %p4, %r11, 1024;
@%p4 bra BB41_8;
setp.gt.u32 %p5, %r10, 511;
mov.f32 %f71, %f70;
@%p5 bra BB41_7;
ld.shared.f32 %f29, [%rd8+2048];
max.f32 %f71, %f70, %f29;
st.shared.f32 [%rd8], %f71;
BB41_7:
mov.f32 %f70, %f71;
bar.sync 0;
// same halving step for block sizes >= 512 (offset 1024 bytes)
BB41_8:
mov.f32 %f68, %f70;
setp.lt.u32 %p6, %r11, 512;
@%p6 bra BB41_12;
setp.gt.u32 %p7, %r10, 255;
mov.f32 %f69, %f68;
@%p7 bra BB41_11;
ld.shared.f32 %f30, [%rd8+1024];
max.f32 %f69, %f68, %f30;
st.shared.f32 [%rd8], %f69;
BB41_11:
mov.f32 %f68, %f69;
bar.sync 0;
// block sizes >= 256 (offset 512 bytes)
BB41_12:
mov.f32 %f66, %f68;
setp.lt.u32 %p8, %r11, 256;
@%p8 bra BB41_16;
setp.gt.u32 %p9, %r10, 127;
mov.f32 %f67, %f66;
@%p9 bra BB41_15;
ld.shared.f32 %f31, [%rd8+512];
max.f32 %f67, %f66, %f31;
st.shared.f32 [%rd8], %f67;
BB41_15:
mov.f32 %f66, %f67;
bar.sync 0;
// block sizes >= 128 (offset 256 bytes)
BB41_16:
mov.f32 %f64, %f66;
setp.lt.u32 %p10, %r11, 128;
@%p10 bra BB41_20;
setp.gt.u32 %p11, %r10, 63;
mov.f32 %f65, %f64;
@%p11 bra BB41_19;
ld.shared.f32 %f32, [%rd8+256];
max.f32 %f65, %f64, %f32;
st.shared.f32 [%rd8], %f65;
BB41_19:
mov.f32 %f64, %f65;
bar.sync 0;
// final 32 slots: only tid.x < 32 continues; volatile shared accesses,
// no bar.sync between steps (warp-level phase of the reduction)
BB41_20:
mov.f32 %f63, %f64;
setp.gt.u32 %p12, %r10, 31;
@%p12 bra BB41_33;
setp.lt.u32 %p13, %r11, 64;
@%p13 bra BB41_23;
ld.volatile.shared.f32 %f33, [%rd8+128];
max.f32 %f63, %f63, %f33;
st.volatile.shared.f32 [%rd8], %f63;
BB41_23:
mov.f32 %f62, %f63;
setp.lt.u32 %p14, %r11, 32;
@%p14 bra BB41_25;
ld.volatile.shared.f32 %f34, [%rd8+64];
max.f32 %f62, %f62, %f34;
st.volatile.shared.f32 [%rd8], %f62;
BB41_25:
mov.f32 %f61, %f62;
setp.lt.u32 %p15, %r11, 16;
@%p15 bra BB41_27;
ld.volatile.shared.f32 %f35, [%rd8+32];
max.f32 %f61, %f61, %f35;
st.volatile.shared.f32 [%rd8], %f61;
BB41_27:
mov.f32 %f60, %f61;
setp.lt.u32 %p16, %r11, 8;
@%p16 bra BB41_29;
ld.volatile.shared.f32 %f36, [%rd8+16];
max.f32 %f60, %f60, %f36;
st.volatile.shared.f32 [%rd8], %f60;
BB41_29:
mov.f32 %f59, %f60;
setp.lt.u32 %p17, %r11, 4;
@%p17 bra BB41_31;
ld.volatile.shared.f32 %f37, [%rd8+8];
max.f32 %f59, %f59, %f37;
st.volatile.shared.f32 [%rd8], %f59;
BB41_31:
setp.lt.u32 %p18, %r11, 2;
@%p18 bra BB41_33;
ld.volatile.shared.f32 %f38, [%rd8+4];
max.f32 %f39, %f59, %f38;
st.volatile.shared.f32 [%rd8], %f39;
// thread 0 publishes the row max: output[ctaid.x] = my_sdata[0]
BB41_33:
setp.ne.s32 %p19, %r10, 0;
@%p19 bra BB41_35;
ld.shared.f32 %f40, [my_sdata];
cvta.to.global.u64 %rd39, %rd2;
mul.wide.u32 %rd40, %r6, 4;
add.s64 %rd41, %rd39, %rd40;
st.global.f32 [%rd41], %f40;
BB41_35:
ret;
}
// .globl reduce_col_max_d
//
// reduce_col_max_d: per-column max of an f64 matrix.
//   param_0 : input matrix (global f64*), indexed as A[r*cols + c]
//   param_1 : output vector (global f64*), one max per column
//   param_2 : rows, param_3 : cols
// Each thread owns one column c = ntid.x*ctaid.x + tid.x; the
// accumulator is seeded with 0dFFEFFFFFFFFFFFFF (-DBL_MAX) and folded
// with elements at linear indices c, c+cols, c+2*cols, ...
// (all < rows*cols); the result is stored to output[c].
//
.visible .entry reduce_col_max_d(
.param .u64 reduce_col_max_d_param_0,
.param .u64 reduce_col_max_d_param_1,
.param .u32 reduce_col_max_d_param_2,
.param .u32 reduce_col_max_d_param_3
)
{
.reg .pred %p<4>;
.reg .b32 %r<11>;
.reg .f64 %fd<10>;
.reg .b64 %rd<9>;
ld.param.u64 %rd2, [reduce_col_max_d_param_0];
ld.param.u64 %rd3, [reduce_col_max_d_param_1];
ld.param.u32 %r5, [reduce_col_max_d_param_2];
ld.param.u32 %r6, [reduce_col_max_d_param_3];
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %ctaid.x;
mov.u32 %r9, %tid.x;
// global column index; exit if beyond the number of columns
mad.lo.s32 %r1, %r7, %r8, %r9;
setp.ge.u32 %p1, %r1, %r6;
@%p1 bra BB42_5;
cvta.to.global.u64 %rd1, %rd2;
// %r2 = rows*cols = total element count; accumulator = -DBL_MAX
mul.lo.s32 %r2, %r6, %r5;
mov.f64 %fd8, 0dFFEFFFFFFFFFFFFF;
mov.f64 %fd9, %fd8;
setp.ge.u32 %p2, %r1, %r2;
@%p2 bra BB42_4;
mov.u32 %r10, %r1;
// walk down the column: linear index advances by cols each step
BB42_3:
mov.u32 %r3, %r10;
mul.wide.u32 %rd4, %r3, 8;
add.s64 %rd5, %rd1, %rd4;
ld.global.f64 %fd6, [%rd5];
max.f64 %fd9, %fd9, %fd6;
add.s32 %r4, %r3, %r6;
setp.lt.u32 %p3, %r4, %r2;
mov.u32 %r10, %r4;
mov.f64 %fd8, %fd9;
@%p3 bra BB42_3;
// store the column max: output[c] = accumulator
BB42_4:
cvta.to.global.u64 %rd6, %rd3;
mul.wide.u32 %rd7, %r1, 8;
add.s64 %rd8, %rd6, %rd7;
st.global.f64 [%rd8], %fd8;
BB42_5:
ret;
}
// .globl reduce_col_max_f
//
// reduce_col_max_f: per-column max of an f32 matrix.
//   param_0 : input matrix (global f32*), indexed as A[r*cols + c]
//   param_1 : output vector (global f32*), one max per column
//   param_2 : rows, param_3 : cols
// Each thread owns one column c = ntid.x*ctaid.x + tid.x; the
// accumulator is seeded with 0fFF7FFFFF (-FLT_MAX) and folded with
// elements at linear indices c, c+cols, c+2*cols, ...
// (all < rows*cols); the result is stored to output[c].
//
.visible .entry reduce_col_max_f(
.param .u64 reduce_col_max_f_param_0,
.param .u64 reduce_col_max_f_param_1,
.param .u32 reduce_col_max_f_param_2,
.param .u32 reduce_col_max_f_param_3
)
{
.reg .pred %p<4>;
.reg .f32 %f<10>;
.reg .b32 %r<11>;
.reg .b64 %rd<9>;
ld.param.u64 %rd2, [reduce_col_max_f_param_0];
ld.param.u64 %rd3, [reduce_col_max_f_param_1];
ld.param.u32 %r5, [reduce_col_max_f_param_2];
ld.param.u32 %r6, [reduce_col_max_f_param_3];
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %ctaid.x;
mov.u32 %r9, %tid.x;
// global column index; exit if beyond the number of columns
mad.lo.s32 %r1, %r7, %r8, %r9;
setp.ge.u32 %p1, %r1, %r6;
@%p1 bra BB43_5;
cvta.to.global.u64 %rd1, %rd2;
// %r2 = rows*cols = total element count; accumulator = -FLT_MAX
mul.lo.s32 %r2, %r6, %r5;
mov.f32 %f8, 0fFF7FFFFF;
mov.f32 %f9, %f8;
setp.ge.u32 %p2, %r1, %r2;
@%p2 bra BB43_4;
mov.u32 %r10, %r1;
// walk down the column: linear index advances by cols each step
BB43_3:
mov.u32 %r3, %r10;
mul.wide.u32 %rd4, %r3, 4;
add.s64 %rd5, %rd1, %rd4;
ld.global.f32 %f6, [%rd5];
max.f32 %f9, %f9, %f6;
add.s32 %r4, %r3, %r6;
setp.lt.u32 %p3, %r4, %r2;
mov.u32 %r10, %r4;
mov.f32 %f8, %f9;
@%p3 bra BB43_3;
// store the column max: output[c] = accumulator
BB43_4:
cvta.to.global.u64 %rd6, %rd3;
mul.wide.u32 %rd7, %r1, 4;
add.s64 %rd8, %rd6, %rd7;
st.global.f32 [%rd8], %f8;
BB43_5:
ret;
}
// .globl reduce_min_d
// reduce_min_d: grid-wide min reduction over an array of doubles.
//   param_0: input pointer (global memory, param_2 doubles)
//   param_1: output pointer (global memory); each block stores one partial
//            result at index %ctaid.x (a follow-up pass presumably reduces
//            those partials — not visible in this kernel)
//   param_2: element count n
// Phase 1 (BB44_1..BB44_3): each thread folds a strided slice of the input
//   into a register, reading two elements per iteration.
// Phase 2 (BB44_4..BB44_33): block-level tree reduction in shared memory
//   (my_sdata, extern .shared declared at file top). The unrolled stages for
//   blockDim >= 1024/512/256/128 use bar.sync; the final stages (offsets
//   256 bytes and below, i.e. tid < 32) use volatile shared accesses with no
//   barrier — the compiler relies on warp-synchronous execution there.
.visible .entry reduce_min_d(
.param .u64 reduce_min_d_param_0,
.param .u64 reduce_min_d_param_1,
.param .u32 reduce_min_d_param_2
)
{
.reg .pred %p<20>;
.reg .b32 %r<33>;
.reg .f64 %fd<79>;
.reg .b64 %rd<15>;
ld.param.u64 %rd2, [reduce_min_d_param_0];
ld.param.u64 %rd3, [reduce_min_d_param_1];
ld.param.u32 %r5, [reduce_min_d_param_2];
mov.u32 %r6, %tid.x;
mov.u32 %r7, %ctaid.x;
shl.b32 %r8, %r7, 1;
mov.u32 %r9, %ntid.x;
// start index i = (2*ctaid)*ntid + tid: each block owns a 2*blockDim window
mad.lo.s32 %r32, %r8, %r9, %r6;
// identity element for min: DBL_MAX (bit pattern 0x7FEFFFFFFFFFFFFF)
mov.f64 %fd76, 0d7FEFFFFFFFFFFFFF;
mov.f64 %fd77, %fd76;
setp.ge.u32 %p1, %r32, %r5;
@%p1 bra BB44_4;
BB44_1:
// acc = min(acc, in[i])
mov.f64 %fd1, %fd77;
cvta.to.global.u64 %rd4, %rd2;
mul.wide.u32 %rd5, %r32, 8;
add.s64 %rd6, %rd4, %rd5;
ld.global.f64 %fd30, [%rd6];
min.f64 %fd78, %fd1, %fd30;
// second element of the pair at i + ntid, only if still in range
add.s32 %r3, %r32, %r9;
setp.ge.u32 %p2, %r3, %r5;
@%p2 bra BB44_3;
mul.wide.u32 %rd8, %r3, 8;
add.s64 %rd9, %rd4, %rd8;
ld.global.f64 %fd31, [%rd9];
min.f64 %fd78, %fd78, %fd31;
BB44_3:
mov.f64 %fd77, %fd78;
// advance by the full grid stride: 2*ntid*nctaid
shl.b32 %r12, %r9, 1;
mov.u32 %r13, %nctaid.x;
mad.lo.s32 %r32, %r12, %r13, %r32;
setp.lt.u32 %p3, %r32, %r5;
mov.f64 %fd76, %fd77;
@%p3 bra BB44_1;
BB44_4:
// publish this thread's partial min to shared memory at my_sdata[tid]
mov.f64 %fd74, %fd76;
mul.wide.u32 %rd10, %r6, 8;
mov.u64 %rd11, my_sdata;
add.s64 %rd1, %rd11, %rd10;
st.shared.f64 [%rd1], %fd74;
bar.sync 0;
// stage: if blockDim >= 1024, threads tid < 512 fold in sdata[tid+512]
setp.lt.u32 %p4, %r9, 1024;
@%p4 bra BB44_8;
setp.gt.u32 %p5, %r6, 511;
mov.f64 %fd75, %fd74;
@%p5 bra BB44_7;
ld.shared.f64 %fd32, [%rd1+4096];
min.f64 %fd75, %fd74, %fd32;
st.shared.f64 [%rd1], %fd75;
BB44_7:
mov.f64 %fd74, %fd75;
bar.sync 0;
BB44_8:
// stage: blockDim >= 512 -> fold sdata[tid+256]
mov.f64 %fd72, %fd74;
setp.lt.u32 %p6, %r9, 512;
@%p6 bra BB44_12;
setp.gt.u32 %p7, %r6, 255;
mov.f64 %fd73, %fd72;
@%p7 bra BB44_11;
ld.shared.f64 %fd33, [%rd1+2048];
min.f64 %fd73, %fd72, %fd33;
st.shared.f64 [%rd1], %fd73;
BB44_11:
mov.f64 %fd72, %fd73;
bar.sync 0;
BB44_12:
// stage: blockDim >= 256 -> fold sdata[tid+128]
mov.f64 %fd70, %fd72;
setp.lt.u32 %p8, %r9, 256;
@%p8 bra BB44_16;
setp.gt.u32 %p9, %r6, 127;
mov.f64 %fd71, %fd70;
@%p9 bra BB44_15;
ld.shared.f64 %fd34, [%rd1+1024];
min.f64 %fd71, %fd70, %fd34;
st.shared.f64 [%rd1], %fd71;
BB44_15:
mov.f64 %fd70, %fd71;
bar.sync 0;
BB44_16:
// stage: blockDim >= 128 -> fold sdata[tid+64]
mov.f64 %fd68, %fd70;
setp.lt.u32 %p10, %r9, 128;
@%p10 bra BB44_20;
setp.gt.u32 %p11, %r6, 63;
mov.f64 %fd69, %fd68;
@%p11 bra BB44_19;
ld.shared.f64 %fd35, [%rd1+512];
min.f64 %fd69, %fd68, %fd35;
st.shared.f64 [%rd1], %fd69;
BB44_19:
mov.f64 %fd68, %fd69;
bar.sync 0;
BB44_20:
// warp-synchronous tail: only tid < 32 continues; volatile shared
// accesses, no bar.sync between steps (offsets 32..1 elements)
mov.f64 %fd67, %fd68;
setp.gt.u32 %p12, %r6, 31;
@%p12 bra BB44_33;
setp.lt.u32 %p13, %r9, 64;
@%p13 bra BB44_23;
ld.volatile.shared.f64 %fd36, [%rd1+256];
min.f64 %fd67, %fd67, %fd36;
st.volatile.shared.f64 [%rd1], %fd67;
BB44_23:
mov.f64 %fd66, %fd67;
setp.lt.u32 %p14, %r9, 32;
@%p14 bra BB44_25;
ld.volatile.shared.f64 %fd37, [%rd1+128];
min.f64 %fd66, %fd66, %fd37;
st.volatile.shared.f64 [%rd1], %fd66;
BB44_25:
mov.f64 %fd65, %fd66;
setp.lt.u32 %p15, %r9, 16;
@%p15 bra BB44_27;
ld.volatile.shared.f64 %fd38, [%rd1+64];
min.f64 %fd65, %fd65, %fd38;
st.volatile.shared.f64 [%rd1], %fd65;
BB44_27:
mov.f64 %fd64, %fd65;
setp.lt.u32 %p16, %r9, 8;
@%p16 bra BB44_29;
ld.volatile.shared.f64 %fd39, [%rd1+32];
min.f64 %fd64, %fd64, %fd39;
st.volatile.shared.f64 [%rd1], %fd64;
BB44_29:
mov.f64 %fd63, %fd64;
setp.lt.u32 %p17, %r9, 4;
@%p17 bra BB44_31;
ld.volatile.shared.f64 %fd40, [%rd1+16];
min.f64 %fd63, %fd63, %fd40;
st.volatile.shared.f64 [%rd1], %fd63;
BB44_31:
setp.lt.u32 %p18, %r9, 2;
@%p18 bra BB44_33;
ld.volatile.shared.f64 %fd41, [%rd1+8];
min.f64 %fd42, %fd63, %fd41;
st.volatile.shared.f64 [%rd1], %fd42;
BB44_33:
// thread 0 writes the block's result to out[ctaid]
setp.ne.s32 %p19, %r6, 0;
@%p19 bra BB44_35;
ld.shared.f64 %fd43, [my_sdata];
cvta.to.global.u64 %rd12, %rd3;
mul.wide.u32 %rd13, %r7, 8;
add.s64 %rd14, %rd12, %rd13;
st.global.f64 [%rd14], %fd43;
BB44_35:
ret;
}
// .globl reduce_min_f
// reduce_min_f: single-precision variant of reduce_min_d (same structure,
// 4-byte elements, so all shared-memory offsets are halved).
//   param_0: input pointer (global, param_2 floats)
//   param_1: output pointer (global); one partial min per block at %ctaid.x
//   param_2: element count n
.visible .entry reduce_min_f(
.param .u64 reduce_min_f_param_0,
.param .u64 reduce_min_f_param_1,
.param .u32 reduce_min_f_param_2
)
{
.reg .pred %p<20>;
.reg .f32 %f<79>;
.reg .b32 %r<33>;
.reg .b64 %rd<15>;
ld.param.u64 %rd2, [reduce_min_f_param_0];
ld.param.u64 %rd3, [reduce_min_f_param_1];
ld.param.u32 %r5, [reduce_min_f_param_2];
mov.u32 %r6, %tid.x;
mov.u32 %r7, %ctaid.x;
shl.b32 %r8, %r7, 1;
mov.u32 %r9, %ntid.x;
// start index i = (2*ctaid)*ntid + tid
mad.lo.s32 %r32, %r8, %r9, %r6;
// identity for min: FLT_MAX (0x7F7FFFFF)
mov.f32 %f76, 0f7F7FFFFF;
mov.f32 %f77, %f76;
setp.ge.u32 %p1, %r32, %r5;
@%p1 bra BB45_4;
BB45_1:
// grid-stride accumulation, two elements per iteration
mov.f32 %f1, %f77;
cvta.to.global.u64 %rd4, %rd2;
mul.wide.u32 %rd5, %r32, 4;
add.s64 %rd6, %rd4, %rd5;
ld.global.f32 %f30, [%rd6];
min.f32 %f78, %f1, %f30;
add.s32 %r3, %r32, %r9;
setp.ge.u32 %p2, %r3, %r5;
@%p2 bra BB45_3;
mul.wide.u32 %rd8, %r3, 4;
add.s64 %rd9, %rd4, %rd8;
ld.global.f32 %f31, [%rd9];
min.f32 %f78, %f78, %f31;
BB45_3:
mov.f32 %f77, %f78;
// advance by 2*ntid*nctaid
shl.b32 %r12, %r9, 1;
mov.u32 %r13, %nctaid.x;
mad.lo.s32 %r32, %r12, %r13, %r32;
setp.lt.u32 %p3, %r32, %r5;
mov.f32 %f76, %f77;
@%p3 bra BB45_1;
BB45_4:
// shared-memory tree reduction in my_sdata[tid]
mov.f32 %f74, %f76;
mul.wide.u32 %rd10, %r6, 4;
mov.u64 %rd11, my_sdata;
add.s64 %rd1, %rd11, %rd10;
st.shared.f32 [%rd1], %f74;
bar.sync 0;
setp.lt.u32 %p4, %r9, 1024;
@%p4 bra BB45_8;
setp.gt.u32 %p5, %r6, 511;
mov.f32 %f75, %f74;
@%p5 bra BB45_7;
ld.shared.f32 %f32, [%rd1+2048];
min.f32 %f75, %f74, %f32;
st.shared.f32 [%rd1], %f75;
BB45_7:
mov.f32 %f74, %f75;
bar.sync 0;
BB45_8:
mov.f32 %f72, %f74;
setp.lt.u32 %p6, %r9, 512;
@%p6 bra BB45_12;
setp.gt.u32 %p7, %r6, 255;
mov.f32 %f73, %f72;
@%p7 bra BB45_11;
ld.shared.f32 %f33, [%rd1+1024];
min.f32 %f73, %f72, %f33;
st.shared.f32 [%rd1], %f73;
BB45_11:
mov.f32 %f72, %f73;
bar.sync 0;
BB45_12:
mov.f32 %f70, %f72;
setp.lt.u32 %p8, %r9, 256;
@%p8 bra BB45_16;
setp.gt.u32 %p9, %r6, 127;
mov.f32 %f71, %f70;
@%p9 bra BB45_15;
ld.shared.f32 %f34, [%rd1+512];
min.f32 %f71, %f70, %f34;
st.shared.f32 [%rd1], %f71;
BB45_15:
mov.f32 %f70, %f71;
bar.sync 0;
BB45_16:
mov.f32 %f68, %f70;
setp.lt.u32 %p10, %r9, 128;
@%p10 bra BB45_20;
setp.gt.u32 %p11, %r6, 63;
mov.f32 %f69, %f68;
@%p11 bra BB45_19;
ld.shared.f32 %f35, [%rd1+256];
min.f32 %f69, %f68, %f35;
st.shared.f32 [%rd1], %f69;
BB45_19:
mov.f32 %f68, %f69;
bar.sync 0;
BB45_20:
// warp-synchronous tail (tid < 32): volatile accesses, no barriers
mov.f32 %f67, %f68;
setp.gt.u32 %p12, %r6, 31;
@%p12 bra BB45_33;
setp.lt.u32 %p13, %r9, 64;
@%p13 bra BB45_23;
ld.volatile.shared.f32 %f36, [%rd1+128];
min.f32 %f67, %f67, %f36;
st.volatile.shared.f32 [%rd1], %f67;
BB45_23:
mov.f32 %f66, %f67;
setp.lt.u32 %p14, %r9, 32;
@%p14 bra BB45_25;
ld.volatile.shared.f32 %f37, [%rd1+64];
min.f32 %f66, %f66, %f37;
st.volatile.shared.f32 [%rd1], %f66;
BB45_25:
mov.f32 %f65, %f66;
setp.lt.u32 %p15, %r9, 16;
@%p15 bra BB45_27;
ld.volatile.shared.f32 %f38, [%rd1+32];
min.f32 %f65, %f65, %f38;
st.volatile.shared.f32 [%rd1], %f65;
BB45_27:
mov.f32 %f64, %f65;
setp.lt.u32 %p16, %r9, 8;
@%p16 bra BB45_29;
ld.volatile.shared.f32 %f39, [%rd1+16];
min.f32 %f64, %f64, %f39;
st.volatile.shared.f32 [%rd1], %f64;
BB45_29:
mov.f32 %f63, %f64;
setp.lt.u32 %p17, %r9, 4;
@%p17 bra BB45_31;
ld.volatile.shared.f32 %f40, [%rd1+8];
min.f32 %f63, %f63, %f40;
st.volatile.shared.f32 [%rd1], %f63;
BB45_31:
setp.lt.u32 %p18, %r9, 2;
@%p18 bra BB45_33;
ld.volatile.shared.f32 %f41, [%rd1+4];
min.f32 %f42, %f63, %f41;
st.volatile.shared.f32 [%rd1], %f42;
BB45_33:
// thread 0 writes the block result to out[ctaid]
setp.ne.s32 %p19, %r6, 0;
@%p19 bra BB45_35;
ld.shared.f32 %f43, [my_sdata];
cvta.to.global.u64 %rd12, %rd3;
mul.wide.u32 %rd13, %r7, 4;
add.s64 %rd14, %rd12, %rd13;
st.global.f32 [%rd14], %f43;
BB45_35:
ret;
}
// .globl reduce_row_min_d
// reduce_row_min_d: per-row min of a row-major double matrix.
// One block per row (%ctaid.x = row), threads stride across that row's
// columns, then a block-level shared-memory tree reduction produces the
// row's min, written to out[row] by thread 0.
//   param_0: input matrix (global, rows*cols doubles, row-major — indexing
//            is row*cols + col, see the mad.lo at BB46_3)
//   param_1: output vector (global, one double per row)
//   param_2: number of rows
//   param_3: number of columns
.visible .entry reduce_row_min_d(
.param .u64 reduce_row_min_d_param_0,
.param .u64 reduce_row_min_d_param_1,
.param .u32 reduce_row_min_d_param_2,
.param .u32 reduce_row_min_d_param_3
)
{
.reg .pred %p<20>;
.reg .b32 %r<39>;
.reg .f64 %fd<74>;
.reg .b64 %rd<42>;
ld.param.u64 %rd1, [reduce_row_min_d_param_0];
ld.param.u64 %rd2, [reduce_row_min_d_param_1];
ld.param.u32 %r5, [reduce_row_min_d_param_2];
ld.param.u32 %r4, [reduce_row_min_d_param_3];
// blocks beyond the row count exit immediately
mov.u32 %r6, %ctaid.x;
setp.ge.u32 %p1, %r6, %r5;
@%p1 bra BB46_35;
mov.u32 %r38, %tid.x;
// identity for min: DBL_MAX
mov.f64 %fd72, 0d7FEFFFFFFFFFFFFF;
mov.f64 %fd73, %fd72;
setp.ge.u32 %p2, %r38, %r4;
@%p2 bra BB46_4;
cvta.to.global.u64 %rd3, %rd1;
BB46_3:
// acc = min(acc, in[row*cols + col]); col advances by ntid per iteration
mad.lo.s32 %r8, %r6, %r4, %r38;
mul.wide.u32 %rd4, %r8, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd28, [%rd5];
min.f64 %fd73, %fd73, %fd28;
mov.u32 %r9, %ntid.x;
add.s32 %r38, %r9, %r38;
setp.lt.u32 %p3, %r38, %r4;
mov.f64 %fd72, %fd73;
@%p3 bra BB46_3;
BB46_4:
// shared-memory tree reduction (same staged pattern as reduce_min_d)
mov.f64 %fd70, %fd72;
mov.u32 %r10, %tid.x;
mul.wide.u32 %rd6, %r10, 8;
mov.u64 %rd7, my_sdata;
add.s64 %rd8, %rd7, %rd6;
st.shared.f64 [%rd8], %fd70;
bar.sync 0;
mov.u32 %r11, %ntid.x;
setp.lt.u32 %p4, %r11, 1024;
@%p4 bra BB46_8;
setp.gt.u32 %p5, %r10, 511;
mov.f64 %fd71, %fd70;
@%p5 bra BB46_7;
ld.shared.f64 %fd29, [%rd8+4096];
min.f64 %fd71, %fd70, %fd29;
st.shared.f64 [%rd8], %fd71;
BB46_7:
mov.f64 %fd70, %fd71;
bar.sync 0;
BB46_8:
mov.f64 %fd68, %fd70;
setp.lt.u32 %p6, %r11, 512;
@%p6 bra BB46_12;
setp.gt.u32 %p7, %r10, 255;
mov.f64 %fd69, %fd68;
@%p7 bra BB46_11;
ld.shared.f64 %fd30, [%rd8+2048];
min.f64 %fd69, %fd68, %fd30;
st.shared.f64 [%rd8], %fd69;
BB46_11:
mov.f64 %fd68, %fd69;
bar.sync 0;
BB46_12:
mov.f64 %fd66, %fd68;
setp.lt.u32 %p8, %r11, 256;
@%p8 bra BB46_16;
setp.gt.u32 %p9, %r10, 127;
mov.f64 %fd67, %fd66;
@%p9 bra BB46_15;
ld.shared.f64 %fd31, [%rd8+1024];
min.f64 %fd67, %fd66, %fd31;
st.shared.f64 [%rd8], %fd67;
BB46_15:
mov.f64 %fd66, %fd67;
bar.sync 0;
BB46_16:
mov.f64 %fd64, %fd66;
setp.lt.u32 %p10, %r11, 128;
@%p10 bra BB46_20;
setp.gt.u32 %p11, %r10, 63;
mov.f64 %fd65, %fd64;
@%p11 bra BB46_19;
ld.shared.f64 %fd32, [%rd8+512];
min.f64 %fd65, %fd64, %fd32;
st.shared.f64 [%rd8], %fd65;
BB46_19:
mov.f64 %fd64, %fd65;
bar.sync 0;
BB46_20:
// warp-synchronous tail: tid < 32, volatile shared, no barriers
mov.f64 %fd63, %fd64;
setp.gt.u32 %p12, %r10, 31;
@%p12 bra BB46_33;
setp.lt.u32 %p13, %r11, 64;
@%p13 bra BB46_23;
ld.volatile.shared.f64 %fd33, [%rd8+256];
min.f64 %fd63, %fd63, %fd33;
st.volatile.shared.f64 [%rd8], %fd63;
BB46_23:
mov.f64 %fd62, %fd63;
setp.lt.u32 %p14, %r11, 32;
@%p14 bra BB46_25;
ld.volatile.shared.f64 %fd34, [%rd8+128];
min.f64 %fd62, %fd62, %fd34;
st.volatile.shared.f64 [%rd8], %fd62;
BB46_25:
mov.f64 %fd61, %fd62;
setp.lt.u32 %p15, %r11, 16;
@%p15 bra BB46_27;
ld.volatile.shared.f64 %fd35, [%rd8+64];
min.f64 %fd61, %fd61, %fd35;
st.volatile.shared.f64 [%rd8], %fd61;
BB46_27:
mov.f64 %fd60, %fd61;
setp.lt.u32 %p16, %r11, 8;
@%p16 bra BB46_29;
ld.volatile.shared.f64 %fd36, [%rd8+32];
min.f64 %fd60, %fd60, %fd36;
st.volatile.shared.f64 [%rd8], %fd60;
BB46_29:
mov.f64 %fd59, %fd60;
setp.lt.u32 %p17, %r11, 4;
@%p17 bra BB46_31;
ld.volatile.shared.f64 %fd37, [%rd8+16];
min.f64 %fd59, %fd59, %fd37;
st.volatile.shared.f64 [%rd8], %fd59;
BB46_31:
setp.lt.u32 %p18, %r11, 2;
@%p18 bra BB46_33;
ld.volatile.shared.f64 %fd38, [%rd8+8];
min.f64 %fd39, %fd59, %fd38;
st.volatile.shared.f64 [%rd8], %fd39;
BB46_33:
// thread 0 writes the row min to out[row]
setp.ne.s32 %p19, %r10, 0;
@%p19 bra BB46_35;
ld.shared.f64 %fd40, [my_sdata];
cvta.to.global.u64 %rd39, %rd2;
mul.wide.u32 %rd40, %r6, 8;
add.s64 %rd41, %rd39, %rd40;
st.global.f64 [%rd41], %fd40;
BB46_35:
ret;
}
// .globl reduce_row_min_f
// reduce_row_min_f: single-precision variant of reduce_row_min_d.
// One block per row; threads stride the row's columns, then a shared-memory
// tree reduction; thread 0 writes out[row]. Offsets are halved vs. the
// double version (4-byte elements).
//   param_0: input matrix (global, rows*cols floats, row-major)
//   param_1: output vector (global, one float per row)
//   param_2: rows, param_3: cols
.visible .entry reduce_row_min_f(
.param .u64 reduce_row_min_f_param_0,
.param .u64 reduce_row_min_f_param_1,
.param .u32 reduce_row_min_f_param_2,
.param .u32 reduce_row_min_f_param_3
)
{
.reg .pred %p<20>;
.reg .f32 %f<74>;
.reg .b32 %r<39>;
.reg .b64 %rd<42>;
ld.param.u64 %rd1, [reduce_row_min_f_param_0];
ld.param.u64 %rd2, [reduce_row_min_f_param_1];
ld.param.u32 %r5, [reduce_row_min_f_param_2];
ld.param.u32 %r4, [reduce_row_min_f_param_3];
mov.u32 %r6, %ctaid.x;
setp.ge.u32 %p1, %r6, %r5;
@%p1 bra BB47_35;
mov.u32 %r38, %tid.x;
// identity for min: FLT_MAX
mov.f32 %f72, 0f7F7FFFFF;
mov.f32 %f73, %f72;
setp.ge.u32 %p2, %r38, %r4;
@%p2 bra BB47_4;
cvta.to.global.u64 %rd3, %rd1;
BB47_3:
// acc = min(acc, in[row*cols + col]); col += ntid
mad.lo.s32 %r8, %r6, %r4, %r38;
mul.wide.u32 %rd4, %r8, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f28, [%rd5];
min.f32 %f73, %f73, %f28;
mov.u32 %r9, %ntid.x;
add.s32 %r38, %r9, %r38;
setp.lt.u32 %p3, %r38, %r4;
mov.f32 %f72, %f73;
@%p3 bra BB47_3;
BB47_4:
// block-level tree reduction in my_sdata
mov.f32 %f70, %f72;
mov.u32 %r10, %tid.x;
mul.wide.u32 %rd6, %r10, 4;
mov.u64 %rd7, my_sdata;
add.s64 %rd8, %rd7, %rd6;
st.shared.f32 [%rd8], %f70;
bar.sync 0;
mov.u32 %r11, %ntid.x;
setp.lt.u32 %p4, %r11, 1024;
@%p4 bra BB47_8;
setp.gt.u32 %p5, %r10, 511;
mov.f32 %f71, %f70;
@%p5 bra BB47_7;
ld.shared.f32 %f29, [%rd8+2048];
min.f32 %f71, %f70, %f29;
st.shared.f32 [%rd8], %f71;
BB47_7:
mov.f32 %f70, %f71;
bar.sync 0;
BB47_8:
mov.f32 %f68, %f70;
setp.lt.u32 %p6, %r11, 512;
@%p6 bra BB47_12;
setp.gt.u32 %p7, %r10, 255;
mov.f32 %f69, %f68;
@%p7 bra BB47_11;
ld.shared.f32 %f30, [%rd8+1024];
min.f32 %f69, %f68, %f30;
st.shared.f32 [%rd8], %f69;
BB47_11:
mov.f32 %f68, %f69;
bar.sync 0;
BB47_12:
mov.f32 %f66, %f68;
setp.lt.u32 %p8, %r11, 256;
@%p8 bra BB47_16;
setp.gt.u32 %p9, %r10, 127;
mov.f32 %f67, %f66;
@%p9 bra BB47_15;
ld.shared.f32 %f31, [%rd8+512];
min.f32 %f67, %f66, %f31;
st.shared.f32 [%rd8], %f67;
BB47_15:
mov.f32 %f66, %f67;
bar.sync 0;
BB47_16:
mov.f32 %f64, %f66;
setp.lt.u32 %p10, %r11, 128;
@%p10 bra BB47_20;
setp.gt.u32 %p11, %r10, 63;
mov.f32 %f65, %f64;
@%p11 bra BB47_19;
ld.shared.f32 %f32, [%rd8+256];
min.f32 %f65, %f64, %f32;
st.shared.f32 [%rd8], %f65;
BB47_19:
mov.f32 %f64, %f65;
bar.sync 0;
BB47_20:
// warp-synchronous tail (tid < 32)
mov.f32 %f63, %f64;
setp.gt.u32 %p12, %r10, 31;
@%p12 bra BB47_33;
setp.lt.u32 %p13, %r11, 64;
@%p13 bra BB47_23;
ld.volatile.shared.f32 %f33, [%rd8+128];
min.f32 %f63, %f63, %f33;
st.volatile.shared.f32 [%rd8], %f63;
BB47_23:
mov.f32 %f62, %f63;
setp.lt.u32 %p14, %r11, 32;
@%p14 bra BB47_25;
ld.volatile.shared.f32 %f34, [%rd8+64];
min.f32 %f62, %f62, %f34;
st.volatile.shared.f32 [%rd8], %f62;
BB47_25:
mov.f32 %f61, %f62;
setp.lt.u32 %p15, %r11, 16;
@%p15 bra BB47_27;
ld.volatile.shared.f32 %f35, [%rd8+32];
min.f32 %f61, %f61, %f35;
st.volatile.shared.f32 [%rd8], %f61;
BB47_27:
mov.f32 %f60, %f61;
setp.lt.u32 %p16, %r11, 8;
@%p16 bra BB47_29;
ld.volatile.shared.f32 %f36, [%rd8+16];
min.f32 %f60, %f60, %f36;
st.volatile.shared.f32 [%rd8], %f60;
BB47_29:
mov.f32 %f59, %f60;
setp.lt.u32 %p17, %r11, 4;
@%p17 bra BB47_31;
ld.volatile.shared.f32 %f37, [%rd8+8];
min.f32 %f59, %f59, %f37;
st.volatile.shared.f32 [%rd8], %f59;
BB47_31:
setp.lt.u32 %p18, %r11, 2;
@%p18 bra BB47_33;
ld.volatile.shared.f32 %f38, [%rd8+4];
min.f32 %f39, %f59, %f38;
st.volatile.shared.f32 [%rd8], %f39;
BB47_33:
// thread 0 writes the row min to out[row]
setp.ne.s32 %p19, %r10, 0;
@%p19 bra BB47_35;
ld.shared.f32 %f40, [my_sdata];
cvta.to.global.u64 %rd39, %rd2;
mul.wide.u32 %rd40, %r6, 4;
add.s64 %rd41, %rd39, %rd40;
st.global.f32 [%rd41], %f40;
BB47_35:
ret;
}
// .globl reduce_col_min_d
// reduce_col_min_d: per-column min of a row-major double matrix.
// One thread per column (global thread id = column); each thread walks its
// column by striding through the flat array in steps of `cols`, folding with
// min, then writes out[col]. No shared memory needed.
//   param_0: input matrix (global, rows*cols doubles)
//   param_1: output vector (global, one double per column)
//   param_2: rows, param_3: cols
// Note: if rows == 0 the loop is skipped and the identity (DBL_MAX) is stored.
.visible .entry reduce_col_min_d(
.param .u64 reduce_col_min_d_param_0,
.param .u64 reduce_col_min_d_param_1,
.param .u32 reduce_col_min_d_param_2,
.param .u32 reduce_col_min_d_param_3
)
{
.reg .pred %p<4>;
.reg .b32 %r<11>;
.reg .f64 %fd<10>;
.reg .b64 %rd<9>;
ld.param.u64 %rd2, [reduce_col_min_d_param_0];
ld.param.u64 %rd3, [reduce_col_min_d_param_1];
ld.param.u32 %r5, [reduce_col_min_d_param_2];
ld.param.u32 %r6, [reduce_col_min_d_param_3];
// col = ntid*ctaid + tid; threads past the last column exit
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %ctaid.x;
mov.u32 %r9, %tid.x;
mad.lo.s32 %r1, %r7, %r8, %r9;
setp.ge.u32 %p1, %r1, %r6;
@%p1 bra BB48_5;
cvta.to.global.u64 %rd1, %rd2;
// total element count = cols * rows; loop bound for the column walk
mul.lo.s32 %r2, %r6, %r5;
// identity for min: DBL_MAX
mov.f64 %fd8, 0d7FEFFFFFFFFFFFFF;
mov.f64 %fd9, %fd8;
setp.ge.u32 %p2, %r1, %r2;
@%p2 bra BB48_4;
mov.u32 %r10, %r1;
BB48_3:
// acc = min(acc, in[i]); i += cols (next row, same column)
mov.u32 %r3, %r10;
mul.wide.u32 %rd4, %r3, 8;
add.s64 %rd5, %rd1, %rd4;
ld.global.f64 %fd6, [%rd5];
min.f64 %fd9, %fd9, %fd6;
add.s32 %r4, %r3, %r6;
setp.lt.u32 %p3, %r4, %r2;
mov.u32 %r10, %r4;
mov.f64 %fd8, %fd9;
@%p3 bra BB48_3;
BB48_4:
// store the column min at out[col]
cvta.to.global.u64 %rd6, %rd3;
mul.wide.u32 %rd7, %r1, 8;
add.s64 %rd8, %rd6, %rd7;
st.global.f64 [%rd8], %fd8;
BB48_5:
ret;
}
// .globl reduce_col_min_f
// reduce_col_min_f: single-precision variant of reduce_col_min_d.
// One thread per column; walk the column with stride `cols`, fold with min,
// write out[col].
//   param_0: input matrix (global, rows*cols floats)
//   param_1: output vector (global, one float per column)
//   param_2: rows, param_3: cols
.visible .entry reduce_col_min_f(
.param .u64 reduce_col_min_f_param_0,
.param .u64 reduce_col_min_f_param_1,
.param .u32 reduce_col_min_f_param_2,
.param .u32 reduce_col_min_f_param_3
)
{
.reg .pred %p<4>;
.reg .f32 %f<10>;
.reg .b32 %r<11>;
.reg .b64 %rd<9>;
ld.param.u64 %rd2, [reduce_col_min_f_param_0];
ld.param.u64 %rd3, [reduce_col_min_f_param_1];
ld.param.u32 %r5, [reduce_col_min_f_param_2];
ld.param.u32 %r6, [reduce_col_min_f_param_3];
// col = ntid*ctaid + tid
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %ctaid.x;
mov.u32 %r9, %tid.x;
mad.lo.s32 %r1, %r7, %r8, %r9;
setp.ge.u32 %p1, %r1, %r6;
@%p1 bra BB49_5;
cvta.to.global.u64 %rd1, %rd2;
mul.lo.s32 %r2, %r6, %r5;
// identity for min: FLT_MAX
mov.f32 %f8, 0f7F7FFFFF;
mov.f32 %f9, %f8;
setp.ge.u32 %p2, %r1, %r2;
@%p2 bra BB49_4;
mov.u32 %r10, %r1;
BB49_3:
// acc = min(acc, in[i]); i += cols
mov.u32 %r3, %r10;
mul.wide.u32 %rd4, %r3, 4;
add.s64 %rd5, %rd1, %rd4;
ld.global.f32 %f6, [%rd5];
min.f32 %f9, %f9, %f6;
add.s32 %r4, %r3, %r6;
setp.lt.u32 %p3, %r4, %r2;
mov.u32 %r10, %r4;
mov.f32 %f8, %f9;
@%p3 bra BB49_3;
BB49_4:
// store the column min at out[col]
cvta.to.global.u64 %rd6, %rd3;
mul.wide.u32 %rd7, %r1, 4;
add.s64 %rd8, %rd6, %rd7;
st.global.f32 [%rd8], %f8;
BB49_5:
ret;
}
// .globl reduce_prod_d
// reduce_prod_d: grid-wide product reduction over an array of doubles.
// Same two-phase structure as reduce_min_d, but with mul.f64 as the fold
// operator and 1.0 as the identity.
//   param_0: input pointer (global, param_2 doubles)
//   param_1: output pointer (global); one partial product per block at %ctaid.x
//   param_2: element count n
.visible .entry reduce_prod_d(
.param .u64 reduce_prod_d_param_0,
.param .u64 reduce_prod_d_param_1,
.param .u32 reduce_prod_d_param_2
)
{
.reg .pred %p<20>;
.reg .b32 %r<33>;
.reg .f64 %fd<79>;
.reg .b64 %rd<15>;
ld.param.u64 %rd2, [reduce_prod_d_param_0];
ld.param.u64 %rd3, [reduce_prod_d_param_1];
ld.param.u32 %r5, [reduce_prod_d_param_2];
mov.u32 %r6, %tid.x;
mov.u32 %r7, %ctaid.x;
shl.b32 %r8, %r7, 1;
mov.u32 %r9, %ntid.x;
// start index i = (2*ctaid)*ntid + tid
mad.lo.s32 %r32, %r8, %r9, %r6;
// identity for product: 1.0
mov.f64 %fd76, 0d3FF0000000000000;
mov.f64 %fd77, %fd76;
setp.ge.u32 %p1, %r32, %r5;
@%p1 bra BB50_4;
BB50_1:
// acc *= in[i], then optionally acc *= in[i + ntid]
mov.f64 %fd1, %fd77;
cvta.to.global.u64 %rd4, %rd2;
mul.wide.u32 %rd5, %r32, 8;
add.s64 %rd6, %rd4, %rd5;
ld.global.f64 %fd30, [%rd6];
mul.f64 %fd78, %fd1, %fd30;
add.s32 %r3, %r32, %r9;
setp.ge.u32 %p2, %r3, %r5;
@%p2 bra BB50_3;
mul.wide.u32 %rd8, %r3, 8;
add.s64 %rd9, %rd4, %rd8;
ld.global.f64 %fd31, [%rd9];
mul.f64 %fd78, %fd78, %fd31;
BB50_3:
mov.f64 %fd77, %fd78;
// advance by 2*ntid*nctaid
shl.b32 %r12, %r9, 1;
mov.u32 %r13, %nctaid.x;
mad.lo.s32 %r32, %r12, %r13, %r32;
setp.lt.u32 %p3, %r32, %r5;
mov.f64 %fd76, %fd77;
@%p3 bra BB50_1;
BB50_4:
// shared-memory tree reduction (multiplicative) in my_sdata
mov.f64 %fd74, %fd76;
mul.wide.u32 %rd10, %r6, 8;
mov.u64 %rd11, my_sdata;
add.s64 %rd1, %rd11, %rd10;
st.shared.f64 [%rd1], %fd74;
bar.sync 0;
setp.lt.u32 %p4, %r9, 1024;
@%p4 bra BB50_8;
setp.gt.u32 %p5, %r6, 511;
mov.f64 %fd75, %fd74;
@%p5 bra BB50_7;
ld.shared.f64 %fd32, [%rd1+4096];
mul.f64 %fd75, %fd74, %fd32;
st.shared.f64 [%rd1], %fd75;
BB50_7:
mov.f64 %fd74, %fd75;
bar.sync 0;
BB50_8:
mov.f64 %fd72, %fd74;
setp.lt.u32 %p6, %r9, 512;
@%p6 bra BB50_12;
setp.gt.u32 %p7, %r6, 255;
mov.f64 %fd73, %fd72;
@%p7 bra BB50_11;
ld.shared.f64 %fd33, [%rd1+2048];
mul.f64 %fd73, %fd72, %fd33;
st.shared.f64 [%rd1], %fd73;
BB50_11:
mov.f64 %fd72, %fd73;
bar.sync 0;
BB50_12:
mov.f64 %fd70, %fd72;
setp.lt.u32 %p8, %r9, 256;
@%p8 bra BB50_16;
setp.gt.u32 %p9, %r6, 127;
mov.f64 %fd71, %fd70;
@%p9 bra BB50_15;
ld.shared.f64 %fd34, [%rd1+1024];
mul.f64 %fd71, %fd70, %fd34;
st.shared.f64 [%rd1], %fd71;
BB50_15:
mov.f64 %fd70, %fd71;
bar.sync 0;
BB50_16:
mov.f64 %fd68, %fd70;
setp.lt.u32 %p10, %r9, 128;
@%p10 bra BB50_20;
setp.gt.u32 %p11, %r6, 63;
mov.f64 %fd69, %fd68;
@%p11 bra BB50_19;
ld.shared.f64 %fd35, [%rd1+512];
mul.f64 %fd69, %fd68, %fd35;
st.shared.f64 [%rd1], %fd69;
BB50_19:
mov.f64 %fd68, %fd69;
bar.sync 0;
BB50_20:
// warp-synchronous tail (tid < 32): volatile shared, no barriers
mov.f64 %fd67, %fd68;
setp.gt.u32 %p12, %r6, 31;
@%p12 bra BB50_33;
setp.lt.u32 %p13, %r9, 64;
@%p13 bra BB50_23;
ld.volatile.shared.f64 %fd36, [%rd1+256];
mul.f64 %fd67, %fd67, %fd36;
st.volatile.shared.f64 [%rd1], %fd67;
BB50_23:
mov.f64 %fd66, %fd67;
setp.lt.u32 %p14, %r9, 32;
@%p14 bra BB50_25;
ld.volatile.shared.f64 %fd37, [%rd1+128];
mul.f64 %fd66, %fd66, %fd37;
st.volatile.shared.f64 [%rd1], %fd66;
BB50_25:
mov.f64 %fd65, %fd66;
setp.lt.u32 %p15, %r9, 16;
@%p15 bra BB50_27;
ld.volatile.shared.f64 %fd38, [%rd1+64];
mul.f64 %fd65, %fd65, %fd38;
st.volatile.shared.f64 [%rd1], %fd65;
BB50_27:
mov.f64 %fd64, %fd65;
setp.lt.u32 %p16, %r9, 8;
@%p16 bra BB50_29;
ld.volatile.shared.f64 %fd39, [%rd1+32];
mul.f64 %fd64, %fd64, %fd39;
st.volatile.shared.f64 [%rd1], %fd64;
BB50_29:
mov.f64 %fd63, %fd64;
setp.lt.u32 %p17, %r9, 4;
@%p17 bra BB50_31;
ld.volatile.shared.f64 %fd40, [%rd1+16];
mul.f64 %fd63, %fd63, %fd40;
st.volatile.shared.f64 [%rd1], %fd63;
BB50_31:
setp.lt.u32 %p18, %r9, 2;
@%p18 bra BB50_33;
ld.volatile.shared.f64 %fd41, [%rd1+8];
mul.f64 %fd42, %fd63, %fd41;
st.volatile.shared.f64 [%rd1], %fd42;
BB50_33:
// thread 0 writes the block's partial product to out[ctaid]
setp.ne.s32 %p19, %r6, 0;
@%p19 bra BB50_35;
ld.shared.f64 %fd43, [my_sdata];
cvta.to.global.u64 %rd12, %rd3;
mul.wide.u32 %rd13, %r7, 8;
add.s64 %rd14, %rd12, %rd13;
st.global.f64 [%rd14], %fd43;
BB50_35:
ret;
}
// .globl reduce_prod_f
// reduce_prod_f: single-precision variant of reduce_prod_d.
// Grid-wide product reduction; identity 1.0f; offsets halved (4-byte floats).
//   param_0: input pointer, param_1: per-block output at %ctaid.x,
//   param_2: element count n
.visible .entry reduce_prod_f(
.param .u64 reduce_prod_f_param_0,
.param .u64 reduce_prod_f_param_1,
.param .u32 reduce_prod_f_param_2
)
{
.reg .pred %p<20>;
.reg .f32 %f<79>;
.reg .b32 %r<33>;
.reg .b64 %rd<15>;
ld.param.u64 %rd2, [reduce_prod_f_param_0];
ld.param.u64 %rd3, [reduce_prod_f_param_1];
ld.param.u32 %r5, [reduce_prod_f_param_2];
mov.u32 %r6, %tid.x;
mov.u32 %r7, %ctaid.x;
shl.b32 %r8, %r7, 1;
mov.u32 %r9, %ntid.x;
// start index i = (2*ctaid)*ntid + tid
mad.lo.s32 %r32, %r8, %r9, %r6;
// identity for product: 1.0f
mov.f32 %f76, 0f3F800000;
mov.f32 %f77, %f76;
setp.ge.u32 %p1, %r32, %r5;
@%p1 bra BB51_4;
BB51_1:
// acc *= in[i], then optionally acc *= in[i + ntid]
mov.f32 %f1, %f77;
cvta.to.global.u64 %rd4, %rd2;
mul.wide.u32 %rd5, %r32, 4;
add.s64 %rd6, %rd4, %rd5;
ld.global.f32 %f30, [%rd6];
mul.f32 %f78, %f1, %f30;
add.s32 %r3, %r32, %r9;
setp.ge.u32 %p2, %r3, %r5;
@%p2 bra BB51_3;
mul.wide.u32 %rd8, %r3, 4;
add.s64 %rd9, %rd4, %rd8;
ld.global.f32 %f31, [%rd9];
mul.f32 %f78, %f78, %f31;
BB51_3:
mov.f32 %f77, %f78;
// advance by 2*ntid*nctaid
shl.b32 %r12, %r9, 1;
mov.u32 %r13, %nctaid.x;
mad.lo.s32 %r32, %r12, %r13, %r32;
setp.lt.u32 %p3, %r32, %r5;
mov.f32 %f76, %f77;
@%p3 bra BB51_1;
BB51_4:
// shared-memory tree reduction (multiplicative) in my_sdata
mov.f32 %f74, %f76;
mul.wide.u32 %rd10, %r6, 4;
mov.u64 %rd11, my_sdata;
add.s64 %rd1, %rd11, %rd10;
st.shared.f32 [%rd1], %f74;
bar.sync 0;
setp.lt.u32 %p4, %r9, 1024;
@%p4 bra BB51_8;
setp.gt.u32 %p5, %r6, 511;
mov.f32 %f75, %f74;
@%p5 bra BB51_7;
ld.shared.f32 %f32, [%rd1+2048];
mul.f32 %f75, %f74, %f32;
st.shared.f32 [%rd1], %f75;
BB51_7:
mov.f32 %f74, %f75;
bar.sync 0;
BB51_8:
mov.f32 %f72, %f74;
setp.lt.u32 %p6, %r9, 512;
@%p6 bra BB51_12;
setp.gt.u32 %p7, %r6, 255;
mov.f32 %f73, %f72;
@%p7 bra BB51_11;
ld.shared.f32 %f33, [%rd1+1024];
mul.f32 %f73, %f72, %f33;
st.shared.f32 [%rd1], %f73;
BB51_11:
mov.f32 %f72, %f73;
bar.sync 0;
BB51_12:
mov.f32 %f70, %f72;
setp.lt.u32 %p8, %r9, 256;
@%p8 bra BB51_16;
setp.gt.u32 %p9, %r6, 127;
mov.f32 %f71, %f70;
@%p9 bra BB51_15;
ld.shared.f32 %f34, [%rd1+512];
mul.f32 %f71, %f70, %f34;
st.shared.f32 [%rd1], %f71;
BB51_15:
mov.f32 %f70, %f71;
bar.sync 0;
BB51_16:
mov.f32 %f68, %f70;
setp.lt.u32 %p10, %r9, 128;
@%p10 bra BB51_20;
setp.gt.u32 %p11, %r6, 63;
mov.f32 %f69, %f68;
@%p11 bra BB51_19;
ld.shared.f32 %f35, [%rd1+256];
mul.f32 %f69, %f68, %f35;
st.shared.f32 [%rd1], %f69;
BB51_19:
mov.f32 %f68, %f69;
bar.sync 0;
BB51_20:
// warp-synchronous tail (tid < 32)
mov.f32 %f67, %f68;
setp.gt.u32 %p12, %r6, 31;
@%p12 bra BB51_33;
setp.lt.u32 %p13, %r9, 64;
@%p13 bra BB51_23;
ld.volatile.shared.f32 %f36, [%rd1+128];
mul.f32 %f67, %f67, %f36;
st.volatile.shared.f32 [%rd1], %f67;
BB51_23:
mov.f32 %f66, %f67;
setp.lt.u32 %p14, %r9, 32;
@%p14 bra BB51_25;
ld.volatile.shared.f32 %f37, [%rd1+64];
mul.f32 %f66, %f66, %f37;
st.volatile.shared.f32 [%rd1], %f66;
BB51_25:
mov.f32 %f65, %f66;
setp.lt.u32 %p15, %r9, 16;
@%p15 bra BB51_27;
ld.volatile.shared.f32 %f38, [%rd1+32];
mul.f32 %f65, %f65, %f38;
st.volatile.shared.f32 [%rd1], %f65;
BB51_27:
mov.f32 %f64, %f65;
setp.lt.u32 %p16, %r9, 8;
@%p16 bra BB51_29;
ld.volatile.shared.f32 %f39, [%rd1+16];
mul.f32 %f64, %f64, %f39;
st.volatile.shared.f32 [%rd1], %f64;
BB51_29:
mov.f32 %f63, %f64;
setp.lt.u32 %p17, %r9, 4;
@%p17 bra BB51_31;
ld.volatile.shared.f32 %f40, [%rd1+8];
mul.f32 %f63, %f63, %f40;
st.volatile.shared.f32 [%rd1], %f63;
BB51_31:
setp.lt.u32 %p18, %r9, 2;
@%p18 bra BB51_33;
ld.volatile.shared.f32 %f41, [%rd1+4];
mul.f32 %f42, %f63, %f41;
st.volatile.shared.f32 [%rd1], %f42;
BB51_33:
// thread 0 writes the block's partial product to out[ctaid]
setp.ne.s32 %p19, %r6, 0;
@%p19 bra BB51_35;
ld.shared.f32 %f43, [my_sdata];
cvta.to.global.u64 %rd12, %rd3;
mul.wide.u32 %rd13, %r7, 4;
add.s64 %rd14, %rd12, %rd13;
st.global.f32 [%rd14], %f43;
BB51_35:
ret;
}
// .globl reduce_row_mean_d
// reduce_row_mean_d: per-row mean of a row-major double matrix.
// Same structure as reduce_row_min_d but folds with add.f64 (identity 0.0)
// and, at the end, thread 0 divides the row sum by the column count before
// storing out[row].
//   param_0: input matrix (global, rows*cols doubles, row-major)
//   param_1: output vector (global, one double per row)
//   param_2: rows, param_3: cols
.visible .entry reduce_row_mean_d(
.param .u64 reduce_row_mean_d_param_0,
.param .u64 reduce_row_mean_d_param_1,
.param .u32 reduce_row_mean_d_param_2,
.param .u32 reduce_row_mean_d_param_3
)
{
.reg .pred %p<20>;
.reg .b32 %r<39>;
.reg .f64 %fd<76>;
.reg .b64 %rd<43>;
ld.param.u64 %rd1, [reduce_row_mean_d_param_0];
ld.param.u64 %rd2, [reduce_row_mean_d_param_1];
ld.param.u32 %r5, [reduce_row_mean_d_param_2];
ld.param.u32 %r4, [reduce_row_mean_d_param_3];
// one block per row; surplus blocks exit
mov.u32 %r6, %ctaid.x;
setp.ge.u32 %p1, %r6, %r5;
@%p1 bra BB52_35;
mov.u32 %r38, %tid.x;
// identity for sum: 0.0
mov.f64 %fd74, 0d0000000000000000;
mov.f64 %fd75, %fd74;
setp.ge.u32 %p2, %r38, %r4;
@%p2 bra BB52_4;
cvta.to.global.u64 %rd3, %rd1;
BB52_3:
// acc += in[row*cols + col]; col += ntid
mad.lo.s32 %r8, %r6, %r4, %r38;
mul.wide.u32 %rd4, %r8, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd28, [%rd5];
add.f64 %fd75, %fd75, %fd28;
mov.u32 %r9, %ntid.x;
add.s32 %r38, %r9, %r38;
setp.lt.u32 %p3, %r38, %r4;
mov.f64 %fd74, %fd75;
@%p3 bra BB52_3;
BB52_4:
// shared-memory tree reduction (additive) in my_sdata
mov.f64 %fd72, %fd74;
mov.u32 %r10, %tid.x;
mul.wide.u32 %rd6, %r10, 8;
mov.u64 %rd7, my_sdata;
add.s64 %rd8, %rd7, %rd6;
st.shared.f64 [%rd8], %fd72;
bar.sync 0;
mov.u32 %r11, %ntid.x;
setp.lt.u32 %p4, %r11, 1024;
@%p4 bra BB52_8;
setp.gt.u32 %p5, %r10, 511;
mov.f64 %fd73, %fd72;
@%p5 bra BB52_7;
ld.shared.f64 %fd29, [%rd8+4096];
add.f64 %fd73, %fd72, %fd29;
st.shared.f64 [%rd8], %fd73;
BB52_7:
mov.f64 %fd72, %fd73;
bar.sync 0;
BB52_8:
mov.f64 %fd70, %fd72;
setp.lt.u32 %p6, %r11, 512;
@%p6 bra BB52_12;
setp.gt.u32 %p7, %r10, 255;
mov.f64 %fd71, %fd70;
@%p7 bra BB52_11;
ld.shared.f64 %fd30, [%rd8+2048];
add.f64 %fd71, %fd70, %fd30;
st.shared.f64 [%rd8], %fd71;
BB52_11:
mov.f64 %fd70, %fd71;
bar.sync 0;
BB52_12:
mov.f64 %fd68, %fd70;
setp.lt.u32 %p8, %r11, 256;
@%p8 bra BB52_16;
setp.gt.u32 %p9, %r10, 127;
mov.f64 %fd69, %fd68;
@%p9 bra BB52_15;
ld.shared.f64 %fd31, [%rd8+1024];
add.f64 %fd69, %fd68, %fd31;
st.shared.f64 [%rd8], %fd69;
BB52_15:
mov.f64 %fd68, %fd69;
bar.sync 0;
BB52_16:
mov.f64 %fd66, %fd68;
setp.lt.u32 %p10, %r11, 128;
@%p10 bra BB52_20;
setp.gt.u32 %p11, %r10, 63;
mov.f64 %fd67, %fd66;
@%p11 bra BB52_19;
ld.shared.f64 %fd32, [%rd8+512];
add.f64 %fd67, %fd66, %fd32;
st.shared.f64 [%rd8], %fd67;
BB52_19:
mov.f64 %fd66, %fd67;
bar.sync 0;
BB52_20:
// warp-synchronous tail (tid < 32): volatile shared, no barriers
mov.f64 %fd65, %fd66;
setp.gt.u32 %p12, %r10, 31;
@%p12 bra BB52_33;
setp.lt.u32 %p13, %r11, 64;
@%p13 bra BB52_23;
ld.volatile.shared.f64 %fd33, [%rd8+256];
add.f64 %fd65, %fd65, %fd33;
st.volatile.shared.f64 [%rd8], %fd65;
BB52_23:
mov.f64 %fd64, %fd65;
setp.lt.u32 %p14, %r11, 32;
@%p14 bra BB52_25;
ld.volatile.shared.f64 %fd34, [%rd8+128];
add.f64 %fd64, %fd64, %fd34;
st.volatile.shared.f64 [%rd8], %fd64;
BB52_25:
mov.f64 %fd63, %fd64;
setp.lt.u32 %p15, %r11, 16;
@%p15 bra BB52_27;
ld.volatile.shared.f64 %fd35, [%rd8+64];
add.f64 %fd63, %fd63, %fd35;
st.volatile.shared.f64 [%rd8], %fd63;
BB52_27:
mov.f64 %fd62, %fd63;
setp.lt.u32 %p16, %r11, 8;
@%p16 bra BB52_29;
ld.volatile.shared.f64 %fd36, [%rd8+32];
add.f64 %fd62, %fd62, %fd36;
st.volatile.shared.f64 [%rd8], %fd62;
BB52_29:
mov.f64 %fd61, %fd62;
setp.lt.u32 %p17, %r11, 4;
@%p17 bra BB52_31;
ld.volatile.shared.f64 %fd37, [%rd8+16];
add.f64 %fd61, %fd61, %fd37;
st.volatile.shared.f64 [%rd8], %fd61;
BB52_31:
setp.lt.u32 %p18, %r11, 2;
@%p18 bra BB52_33;
ld.volatile.shared.f64 %fd38, [%rd8+8];
add.f64 %fd39, %fd61, %fd38;
st.volatile.shared.f64 [%rd8], %fd39;
BB52_33:
// thread 0: mean = row_sum / cols, store at out[row]
setp.ne.s32 %p19, %r10, 0;
@%p19 bra BB52_35;
ld.shared.f64 %fd40, [my_sdata];
cvt.u64.u32 %rd39, %r4;
cvt.rn.f64.s64 %fd41, %rd39;
div.rn.f64 %fd42, %fd40, %fd41;
cvta.to.global.u64 %rd40, %rd2;
mul.wide.u32 %rd41, %r6, 8;
add.s64 %rd42, %rd40, %rd41;
st.global.f64 [%rd42], %fd42;
BB52_35:
ret;
}
// .globl reduce_row_mean_f
// reduce_row_mean_f: single-precision variant of reduce_row_mean_d.
// One block per row; additive tree reduction in shared memory; thread 0
// divides the row sum by the column count and stores out[row].
//   param_0: input matrix (global, rows*cols floats, row-major)
//   param_1: output vector (global, one float per row)
//   param_2: rows, param_3: cols
.visible .entry reduce_row_mean_f(
.param .u64 reduce_row_mean_f_param_0,
.param .u64 reduce_row_mean_f_param_1,
.param .u32 reduce_row_mean_f_param_2,
.param .u32 reduce_row_mean_f_param_3
)
{
.reg .pred %p<20>;
.reg .f32 %f<76>;
.reg .b32 %r<39>;
.reg .b64 %rd<43>;
ld.param.u64 %rd1, [reduce_row_mean_f_param_0];
ld.param.u64 %rd2, [reduce_row_mean_f_param_1];
ld.param.u32 %r5, [reduce_row_mean_f_param_2];
ld.param.u32 %r4, [reduce_row_mean_f_param_3];
mov.u32 %r6, %ctaid.x;
setp.ge.u32 %p1, %r6, %r5;
@%p1 bra BB53_35;
mov.u32 %r38, %tid.x;
// identity for sum: 0.0f
mov.f32 %f74, 0f00000000;
mov.f32 %f75, %f74;
setp.ge.u32 %p2, %r38, %r4;
@%p2 bra BB53_4;
cvta.to.global.u64 %rd3, %rd1;
BB53_3:
// acc += in[row*cols + col]; col += ntid
mad.lo.s32 %r8, %r6, %r4, %r38;
mul.wide.u32 %rd4, %r8, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f28, [%rd5];
add.f32 %f75, %f75, %f28;
mov.u32 %r9, %ntid.x;
add.s32 %r38, %r9, %r38;
setp.lt.u32 %p3, %r38, %r4;
mov.f32 %f74, %f75;
@%p3 bra BB53_3;
BB53_4:
// shared-memory tree reduction (additive)
mov.f32 %f72, %f74;
mov.u32 %r10, %tid.x;
mul.wide.u32 %rd6, %r10, 4;
mov.u64 %rd7, my_sdata;
add.s64 %rd8, %rd7, %rd6;
st.shared.f32 [%rd8], %f72;
bar.sync 0;
mov.u32 %r11, %ntid.x;
setp.lt.u32 %p4, %r11, 1024;
@%p4 bra BB53_8;
setp.gt.u32 %p5, %r10, 511;
mov.f32 %f73, %f72;
@%p5 bra BB53_7;
ld.shared.f32 %f29, [%rd8+2048];
add.f32 %f73, %f72, %f29;
st.shared.f32 [%rd8], %f73;
BB53_7:
mov.f32 %f72, %f73;
bar.sync 0;
BB53_8:
mov.f32 %f70, %f72;
setp.lt.u32 %p6, %r11, 512;
@%p6 bra BB53_12;
setp.gt.u32 %p7, %r10, 255;
mov.f32 %f71, %f70;
@%p7 bra BB53_11;
ld.shared.f32 %f30, [%rd8+1024];
add.f32 %f71, %f70, %f30;
st.shared.f32 [%rd8], %f71;
BB53_11:
mov.f32 %f70, %f71;
bar.sync 0;
BB53_12:
mov.f32 %f68, %f70;
setp.lt.u32 %p8, %r11, 256;
@%p8 bra BB53_16;
setp.gt.u32 %p9, %r10, 127;
mov.f32 %f69, %f68;
@%p9 bra BB53_15;
ld.shared.f32 %f31, [%rd8+512];
add.f32 %f69, %f68, %f31;
st.shared.f32 [%rd8], %f69;
BB53_15:
mov.f32 %f68, %f69;
bar.sync 0;
BB53_16:
mov.f32 %f66, %f68;
setp.lt.u32 %p10, %r11, 128;
@%p10 bra BB53_20;
setp.gt.u32 %p11, %r10, 63;
mov.f32 %f67, %f66;
@%p11 bra BB53_19;
ld.shared.f32 %f32, [%rd8+256];
add.f32 %f67, %f66, %f32;
st.shared.f32 [%rd8], %f67;
BB53_19:
mov.f32 %f66, %f67;
bar.sync 0;
BB53_20:
// warp-synchronous tail (tid < 32)
mov.f32 %f65, %f66;
setp.gt.u32 %p12, %r10, 31;
@%p12 bra BB53_33;
setp.lt.u32 %p13, %r11, 64;
@%p13 bra BB53_23;
ld.volatile.shared.f32 %f33, [%rd8+128];
add.f32 %f65, %f65, %f33;
st.volatile.shared.f32 [%rd8], %f65;
BB53_23:
mov.f32 %f64, %f65;
setp.lt.u32 %p14, %r11, 32;
@%p14 bra BB53_25;
ld.volatile.shared.f32 %f34, [%rd8+64];
add.f32 %f64, %f64, %f34;
st.volatile.shared.f32 [%rd8], %f64;
BB53_25:
mov.f32 %f63, %f64;
setp.lt.u32 %p15, %r11, 16;
@%p15 bra BB53_27;
ld.volatile.shared.f32 %f35, [%rd8+32];
add.f32 %f63, %f63, %f35;
st.volatile.shared.f32 [%rd8], %f63;
BB53_27:
mov.f32 %f62, %f63;
setp.lt.u32 %p16, %r11, 8;
@%p16 bra BB53_29;
ld.volatile.shared.f32 %f36, [%rd8+16];
add.f32 %f62, %f62, %f36;
st.volatile.shared.f32 [%rd8], %f62;
BB53_29:
mov.f32 %f61, %f62;
setp.lt.u32 %p17, %r11, 4;
@%p17 bra BB53_31;
ld.volatile.shared.f32 %f37, [%rd8+8];
add.f32 %f61, %f61, %f37;
st.volatile.shared.f32 [%rd8], %f61;
BB53_31:
setp.lt.u32 %p18, %r11, 2;
@%p18 bra BB53_33;
ld.volatile.shared.f32 %f38, [%rd8+4];
add.f32 %f39, %f61, %f38;
st.volatile.shared.f32 [%rd8], %f39;
BB53_33:
// thread 0: mean = row_sum / cols, store at out[row]
setp.ne.s32 %p19, %r10, 0;
@%p19 bra BB53_35;
ld.shared.f32 %f40, [my_sdata];
cvt.u64.u32 %rd39, %r4;
cvt.rn.f32.s64 %f41, %rd39;
div.rn.f32 %f42, %f40, %f41;
cvta.to.global.u64 %rd40, %rd2;
mul.wide.u32 %rd41, %r6, 4;
add.s64 %rd42, %rd40, %rd41;
st.global.f32 [%rd42], %f42;
BB53_35:
ret;
}
// .globl reduce_col_mean_d
// reduce_col_mean_d: per-column mean of a row-major double matrix.
// One thread per column; each thread sums its column by striding the flat
// array in steps of `cols`, then divides by the row count and writes out[col].
//   param_0: input matrix (global, rows*cols doubles)
//   param_1: output vector (global, one double per column)
//   param_2: rows (the divisor), param_3: cols
.visible .entry reduce_col_mean_d(
.param .u64 reduce_col_mean_d_param_0,
.param .u64 reduce_col_mean_d_param_1,
.param .u32 reduce_col_mean_d_param_2,
.param .u32 reduce_col_mean_d_param_3
)
{
.reg .pred %p<4>;
.reg .b32 %r<11>;
.reg .f64 %fd<12>;
.reg .b64 %rd<10>;
ld.param.u64 %rd2, [reduce_col_mean_d_param_0];
ld.param.u64 %rd3, [reduce_col_mean_d_param_1];
ld.param.u32 %r5, [reduce_col_mean_d_param_2];
ld.param.u32 %r6, [reduce_col_mean_d_param_3];
// col = ntid*ctaid + tid; threads past the last column exit
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %ctaid.x;
mov.u32 %r9, %tid.x;
mad.lo.s32 %r1, %r7, %r8, %r9;
setp.ge.u32 %p1, %r1, %r6;
@%p1 bra BB54_5;
cvta.to.global.u64 %rd1, %rd2;
// loop bound: total elements = cols * rows
mul.lo.s32 %r2, %r6, %r5;
// identity for sum: 0.0
mov.f64 %fd10, 0d0000000000000000;
mov.f64 %fd11, %fd10;
setp.ge.u32 %p2, %r1, %r2;
@%p2 bra BB54_4;
mov.u32 %r10, %r1;
BB54_3:
// acc += in[i]; i += cols (next row, same column)
mov.u32 %r3, %r10;
mul.wide.u32 %rd4, %r3, 8;
add.s64 %rd5, %rd1, %rd4;
ld.global.f64 %fd6, [%rd5];
add.f64 %fd11, %fd11, %fd6;
add.s32 %r4, %r3, %r6;
setp.lt.u32 %p3, %r4, %r2;
mov.u32 %r10, %r4;
mov.f64 %fd10, %fd11;
@%p3 bra BB54_3;
BB54_4:
// mean = column_sum / rows; store at out[col]
cvta.to.global.u64 %rd6, %rd3;
cvt.u64.u32 %rd7, %r5;
cvt.rn.f64.s64 %fd7, %rd7;
div.rn.f64 %fd8, %fd10, %fd7;
mul.wide.u32 %rd8, %r1, 8;
add.s64 %rd9, %rd6, %rd8;
st.global.f64 [%rd9], %fd8;
BB54_5:
ret;
}
// .globl reduce_col_mean_f
// reduce_col_mean_f: single-precision variant of reduce_col_mean_d.
// One thread per column; sum the column with stride `cols`, divide by rows,
// store out[col].
//   param_0: input matrix (global, rows*cols floats)
//   param_1: output vector (global, one float per column)
//   param_2: rows (the divisor), param_3: cols
.visible .entry reduce_col_mean_f(
.param .u64 reduce_col_mean_f_param_0,
.param .u64 reduce_col_mean_f_param_1,
.param .u32 reduce_col_mean_f_param_2,
.param .u32 reduce_col_mean_f_param_3
)
{
.reg .pred %p<4>;
.reg .f32 %f<12>;
.reg .b32 %r<11>;
.reg .b64 %rd<10>;
ld.param.u64 %rd2, [reduce_col_mean_f_param_0];
ld.param.u64 %rd3, [reduce_col_mean_f_param_1];
ld.param.u32 %r5, [reduce_col_mean_f_param_2];
ld.param.u32 %r6, [reduce_col_mean_f_param_3];
// col = ntid*ctaid + tid
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %ctaid.x;
mov.u32 %r9, %tid.x;
mad.lo.s32 %r1, %r7, %r8, %r9;
setp.ge.u32 %p1, %r1, %r6;
@%p1 bra BB55_5;
cvta.to.global.u64 %rd1, %rd2;
mul.lo.s32 %r2, %r6, %r5;
// identity for sum: 0.0f
mov.f32 %f10, 0f00000000;
mov.f32 %f11, %f10;
setp.ge.u32 %p2, %r1, %r2;
@%p2 bra BB55_4;
mov.u32 %r10, %r1;
BB55_3:
// acc += in[i]; i += cols
mov.u32 %r3, %r10;
mul.wide.u32 %rd4, %r3, 4;
add.s64 %rd5, %rd1, %rd4;
ld.global.f32 %f6, [%rd5];
add.f32 %f11, %f11, %f6;
add.s32 %r4, %r3, %r6;
setp.lt.u32 %p3, %r4, %r2;
mov.u32 %r10, %r4;
mov.f32 %f10, %f11;
@%p3 bra BB55_3;
BB55_4:
// mean = column_sum / rows; store at out[col]
cvta.to.global.u64 %rd6, %rd3;
cvt.u64.u32 %rd7, %r5;
cvt.rn.f32.s64 %f7, %rd7;
div.rn.f32 %f8, %f10, %f7;
mul.wide.u32 %rd8, %r1, 4;
add.s64 %rd9, %rd6, %rd8;
st.global.f32 [%rd9], %f8;
BB55_5:
ret;
}
// .globl matrix_exp_d
//---------------------------------------------------------------------------
// matrix_exp_d(double *in, double *out, u32 n)
// Elementwise out[i] = exp(in[i]) for i < n, one element per thread.
// Inlined double-precision exp: k = round-to-int(x * log2(e)) via the
// 2^52 magic-constant trick, Cody-Waite subtraction of k*ln2 (hi/lo parts),
// a degree-~11 polynomial in FMA form, then scaling by 2^k through direct
// manipulation of the result's high exponent word. A slow path handles
// arguments near the overflow/underflow limits by splitting the 2^k scale
// into two halves to avoid intermediate overflow.
//---------------------------------------------------------------------------
.visible .entry matrix_exp_d(
.param .u64 matrix_exp_d_param_0,
.param .u64 matrix_exp_d_param_1,
.param .u32 matrix_exp_d_param_2
)
{
.reg .pred %p<5>;
.reg .f32 %f<3>;
.reg .b32 %r<21>;
.reg .f64 %fd<41>;
.reg .b64 %rd<10>;
ld.param.u64 %rd2, [matrix_exp_d_param_0];
ld.param.u64 %rd3, [matrix_exp_d_param_1];
ld.param.u32 %r5, [matrix_exp_d_param_2];
// global thread id; exit if id >= n
mov.u32 %r6, %ctaid.x;
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %tid.x;
mad.lo.s32 %r1, %r7, %r6, %r8;
setp.ge.u32 %p1, %r1, %r5;
@%p1 bra BB56_5;
cvta.to.global.u64 %rd4, %rd2;
cvt.s64.s32 %rd1, %r1;
mul.wide.s32 %rd5, %r1, 8;
add.s64 %rd6, %rd4, %rd5;
ld.global.f64 %fd1, [%rd6];
// k = rint(x * log2(e)) extracted from low word after adding 2^52+2^51
mov.f64 %fd6, 0d4338000000000000;
mov.f64 %fd7, 0d3FF71547652B82FE;
fma.rn.f64 %fd8, %fd1, %fd7, %fd6;
{
.reg .b32 %temp;
mov.b64 {%r2, %temp}, %fd8;
}
mov.f64 %fd9, 0dC338000000000000;
add.rn.f64 %fd10, %fd8, %fd9;
// r = x - k*ln2 using hi/lo split of ln2 (Cody-Waite)
mov.f64 %fd11, 0dBFE62E42FEFA39EF;
fma.rn.f64 %fd12, %fd10, %fd11, %fd1;
mov.f64 %fd13, 0dBC7ABC9E3B39803F;
fma.rn.f64 %fd14, %fd10, %fd13, %fd12;
// polynomial approximation of exp(r) in Horner/FMA form
mov.f64 %fd15, 0d3E928AF3FCA213EA;
mov.f64 %fd16, 0d3E5ADE1569CE2BDF;
fma.rn.f64 %fd17, %fd16, %fd14, %fd15;
mov.f64 %fd18, 0d3EC71DEE62401315;
fma.rn.f64 %fd19, %fd17, %fd14, %fd18;
mov.f64 %fd20, 0d3EFA01997C89EB71;
fma.rn.f64 %fd21, %fd19, %fd14, %fd20;
mov.f64 %fd22, 0d3F2A01A014761F65;
fma.rn.f64 %fd23, %fd21, %fd14, %fd22;
mov.f64 %fd24, 0d3F56C16C1852B7AF;
fma.rn.f64 %fd25, %fd23, %fd14, %fd24;
mov.f64 %fd26, 0d3F81111111122322;
fma.rn.f64 %fd27, %fd25, %fd14, %fd26;
mov.f64 %fd28, 0d3FA55555555502A1;
fma.rn.f64 %fd29, %fd27, %fd14, %fd28;
mov.f64 %fd30, 0d3FC5555555555511;
fma.rn.f64 %fd31, %fd29, %fd14, %fd30;
mov.f64 %fd32, 0d3FE000000000000B;
fma.rn.f64 %fd33, %fd31, %fd14, %fd32;
mov.f64 %fd34, 0d3FF0000000000000;
fma.rn.f64 %fd35, %fd33, %fd14, %fd34;
fma.rn.f64 %fd36, %fd35, %fd14, %fd34;
{
.reg .b32 %temp;
mov.b64 {%r3, %temp}, %fd36;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r4}, %fd36;
}
// fast-path scaling: add k (r2) directly into the exponent field
shl.b32 %r9, %r2, 20;
add.s32 %r10, %r4, %r9;
mov.b64 %fd40, {%r3, %r10};
// magnitude check on the high word of x reinterpreted as float;
// threshold 0f4086232B marks where the fast path may over/underflow
{
.reg .b32 %temp;
mov.b64 {%temp, %r11}, %fd1;
}
mov.b32 %f2, %r11;
abs.f32 %f1, %f2;
setp.lt.f32 %p2, %f1, 0f4086232B;
@%p2 bra BB56_4;
// special cases: x very negative -> 0, x very positive -> +inf
// (fd1 + inf propagates NaN and yields +inf for large positive x)
setp.lt.f64 %p3, %fd1, 0d0000000000000000;
add.f64 %fd37, %fd1, 0d7FF0000000000000;
selp.f64 %fd40, 0d0000000000000000, %fd37, %p3;
setp.geu.f32 %p4, %f1, 0f40874800;
@%p4 bra BB56_4;
// borderline range: apply 2^k as 2^(k/2) * 2^(k-k/2) to avoid overflow
shr.u32 %r12, %r2, 31;
add.s32 %r13, %r2, %r12;
shr.s32 %r14, %r13, 1;
shl.b32 %r15, %r14, 20;
add.s32 %r16, %r15, %r4;
mov.b64 %fd38, {%r3, %r16};
sub.s32 %r17, %r2, %r14;
shl.b32 %r18, %r17, 20;
add.s32 %r19, %r18, 1072693248;
mov.u32 %r20, 0;
mov.b64 %fd39, {%r20, %r19};
mul.f64 %fd40, %fd38, %fd39;
BB56_4:
// out[i] = result
cvta.to.global.u64 %rd7, %rd3;
shl.b64 %rd8, %rd1, 3;
add.s64 %rd9, %rd7, %rd8;
st.global.f64 [%rd9], %fd40;
BB56_5:
ret;
}
// .globl matrix_exp_f
//---------------------------------------------------------------------------
// matrix_exp_f(float *in, float *out, u32 n)
// Elementwise out[i] = expf(in[i]) for i < n, one element per thread.
// Uses k = trunc(x * log2(e)), residual reduction against log(2) hi/lo,
// the hardware ex2.approx instruction for 2^r, then multiplies by 2^k.
// Explicit clamps: x < -105.0 (0fC2D20000) -> 0, x > 105.0 -> +inf.
//---------------------------------------------------------------------------
.visible .entry matrix_exp_f(
.param .u64 matrix_exp_f_param_0,
.param .u64 matrix_exp_f_param_1,
.param .u32 matrix_exp_f_param_2
)
{
.reg .pred %p<4>;
.reg .f32 %f<15>;
.reg .b32 %r<6>;
.reg .b64 %rd<8>;
ld.param.u64 %rd1, [matrix_exp_f_param_0];
ld.param.u64 %rd2, [matrix_exp_f_param_1];
ld.param.u32 %r2, [matrix_exp_f_param_2];
// global thread id; exit if id >= n
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
@%p1 bra BB57_2;
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f3, [%rd5];
// k = trunc(x * log2(e)); r = x - k*ln2 (hi 0fBF317200, lo 0fB5BFBE8E)
mul.f32 %f4, %f3, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f5, %f4;
mov.f32 %f6, 0fBF317200;
fma.rn.f32 %f7, %f5, %f6, %f3;
mov.f32 %f8, 0fB5BFBE8E;
fma.rn.f32 %f9, %f5, %f8, %f7;
mul.f32 %f2, %f9, 0f3FB8AA3B;
// inline asm
ex2.approx.ftz.f32 %f1,%f2;
// inline asm
// multiply by 2^k (computed via a second ex2)
add.f32 %f10, %f5, 0f00000000;
ex2.approx.f32 %f11, %f10;
mul.f32 %f12, %f1, %f11;
// saturate the extremes: underflow -> 0, overflow -> +inf
setp.lt.f32 %p2, %f3, 0fC2D20000;
selp.f32 %f13, 0f00000000, %f12, %p2;
setp.gt.f32 %p3, %f3, 0f42D20000;
selp.f32 %f14, 0f7F800000, %f13, %p3;
cvta.to.global.u64 %rd6, %rd2;
add.s64 %rd7, %rd6, %rd4;
st.global.f32 [%rd7], %f14;
BB57_2:
ret;
}
// .globl matrix_sqrt_d
//---------------------------------------------------------------------------
// matrix_sqrt_d(double *in, double *out, u32 n)
// Elementwise out[i] = sqrt(in[i]) (round-to-nearest) for i < n,
// one element per thread.
//---------------------------------------------------------------------------
.visible .entry matrix_sqrt_d(
.param .u64 matrix_sqrt_d_param_0,
.param .u64 matrix_sqrt_d_param_1,
.param .u32 matrix_sqrt_d_param_2
)
{
.reg .pred %p<2>;
.reg .b32 %r<6>;
.reg .f64 %fd<3>;
.reg .b64 %rd<8>;
ld.param.u64 %rd1, [matrix_sqrt_d_param_0];
ld.param.u64 %rd2, [matrix_sqrt_d_param_1];
ld.param.u32 %r2, [matrix_sqrt_d_param_2];
// global thread id; exit if id >= n
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
@%p1 bra BB58_2;
// out[i] = sqrt(in[i]); rd4 (byte offset) is reused for the store
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd1, [%rd5];
sqrt.rn.f64 %fd2, %fd1;
cvta.to.global.u64 %rd6, %rd2;
add.s64 %rd7, %rd6, %rd4;
st.global.f64 [%rd7], %fd2;
BB58_2:
ret;
}
// .globl matrix_sqrt_f
//---------------------------------------------------------------------------
// matrix_sqrt_f(float *in, float *out, u32 n)
// Elementwise out[i] = sqrtf(in[i]) (round-to-nearest) for i < n,
// one element per thread.
//---------------------------------------------------------------------------
.visible .entry matrix_sqrt_f(
.param .u64 matrix_sqrt_f_param_0,
.param .u64 matrix_sqrt_f_param_1,
.param .u32 matrix_sqrt_f_param_2
)
{
.reg .pred %p<2>;
.reg .f32 %f<3>;
.reg .b32 %r<6>;
.reg .b64 %rd<8>;
ld.param.u64 %rd1, [matrix_sqrt_f_param_0];
ld.param.u64 %rd2, [matrix_sqrt_f_param_1];
ld.param.u32 %r2, [matrix_sqrt_f_param_2];
// global thread id; exit if id >= n
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
@%p1 bra BB59_2;
// out[i] = sqrtf(in[i])
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f1, [%rd5];
sqrt.rn.f32 %f2, %f1;
cvta.to.global.u64 %rd6, %rd2;
add.s64 %rd7, %rd6, %rd4;
st.global.f32 [%rd7], %f2;
BB59_2:
ret;
}
// .globl matrix_round_d
//---------------------------------------------------------------------------
// matrix_round_d(double *in, double *out, u32 n)
// Elementwise rounding for i < n, one element per thread.
// Rounds half away from zero: |x| < 2^52 takes add-0.5-then-truncate with
// the original sign bit reattached; |x| >= 2^52 (already integral) passes
// through. The result is then converted to a signed 64-bit integer and
// back to double before the store (so the stored value is an
// integer-valued double within s64 range).
//---------------------------------------------------------------------------
.visible .entry matrix_round_d(
.param .u64 matrix_round_d_param_0,
.param .u64 matrix_round_d_param_1,
.param .u32 matrix_round_d_param_2
)
{
.reg .pred %p<4>;
.reg .b32 %r<11>;
.reg .f64 %fd<10>;
.reg .b64 %rd<11>;
ld.param.u64 %rd2, [matrix_round_d_param_0];
ld.param.u64 %rd3, [matrix_round_d_param_1];
ld.param.u32 %r2, [matrix_round_d_param_2];
// global thread id; exit if id >= n
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
@%p1 bra BB60_4;
cvta.to.global.u64 %rd4, %rd2;
cvt.s64.s32 %rd1, %r1;
mul.wide.s32 %rd5, %r1, 8;
add.s64 %rd6, %rd4, %rd5;
ld.global.f64 %fd9, [%rd6];
// |x| >= 2^52 means x is already an integer: skip the rounding step
abs.f64 %fd2, %fd9;
setp.ge.f64 %p2, %fd2, 0d4330000000000000;
@%p2 bra BB60_3;
// trunc(|x| + 0.5); values with |x| < 0.5 round to 0
add.f64 %fd5, %fd2, 0d3FE0000000000000;
cvt.rzi.f64.f64 %fd6, %fd5;
setp.lt.f64 %p3, %fd2, 0d3FE0000000000000;
selp.f64 %fd7, 0d0000000000000000, %fd6, %p3;
{
.reg .b32 %temp;
mov.b64 {%r6, %temp}, %fd7;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r7}, %fd7;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r8}, %fd9;
}
// copy the original sign bit onto the rounded magnitude
and.b32 %r9, %r8, -2147483648;
or.b32 %r10, %r7, %r9;
mov.b64 %fd9, {%r6, %r10};
BB60_3:
// store (double)(long)(rounded value)
cvta.to.global.u64 %rd7, %rd3;
cvt.rzi.s64.f64 %rd8, %fd9;
cvt.rn.f64.s64 %fd8, %rd8;
shl.b64 %rd9, %rd1, 3;
add.s64 %rd10, %rd7, %rd9;
st.global.f64 [%rd10], %fd8;
BB60_4:
ret;
}
// .globl matrix_round_f
//---------------------------------------------------------------------------
// matrix_round_f(float *in, float *out, u32 n)
// Elementwise rounding for i < n, one element per thread. This is an
// integer-only rounding path: the float's mantissa is widened to a 64-bit
// fixed-point value, shifted by the (unbiased) exponent, rounded, negated
// for negative inputs, and converted back to float. Inputs outside the
// representable range are clamped: NaN and very large negative magnitudes
// yield 0fDF000000 (-2^63 as float), large positive yield 0f5F000000
// (+2^63 as float) — consistent with an llroundf-then-convert scheme
// (NOTE(review): exact saturation semantics inferred from the constants;
// confirm against the CUDA source).
//---------------------------------------------------------------------------
.visible .entry matrix_round_f(
.param .u64 matrix_round_f_param_0,
.param .u64 matrix_round_f_param_1,
.param .u32 matrix_round_f_param_2
)
{
.reg .pred %p<8>;
.reg .f32 %f<8>;
.reg .b32 %r<17>;
.reg .b64 %rd<26>;
ld.param.u64 %rd7, [matrix_round_f_param_0];
ld.param.u64 %rd8, [matrix_round_f_param_1];
ld.param.u32 %r5, [matrix_round_f_param_2];
// global thread id; exit if id >= n
mov.u32 %r6, %ctaid.x;
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %tid.x;
mad.lo.s32 %r1, %r7, %r6, %r8;
setp.ge.u32 %p1, %r1, %r5;
@%p1 bra BB61_8;
cvta.to.global.u64 %rd9, %rd7;
cvt.s64.s32 %rd1, %r1;
mul.wide.s32 %rd10, %r1, 4;
add.s64 %rd11, %rd9, %rd10;
// load the input's raw bits (r2)
ld.global.u32 %r2, [%rd11];
// NaN check: |bits| > 0x7F800000
and.b32 %r9, %r2, 2147483647;
setp.gt.u32 %p2, %r9, 2139095040;
mov.f32 %f3, 0fDF000000;
mov.f32 %f7, %f3;
@%p2 bra BB61_7;
// positive overflow: x >= 2^63 -> +2^63 as float
setp.gt.s32 %p3, %r2, 1593835519;
mov.f32 %f4, 0f5F000000;
mov.f32 %f7, %f4;
@%p3 bra BB61_7;
// negative overflow: x <= -2^63 -> -2^63 as float
setp.gt.u32 %p4, %r2, -553648129;
mov.f32 %f7, %f3;
@%p4 bra BB61_7;
// r3 = biased exponent; r4 = right-shift amount for the fixed-point value
bfe.u32 %r3, %r2, 23, 8;
mov.u32 %r10, 189;
sub.s32 %r4, %r10, %r3;
// build the 64-bit fixed-point mantissa with the implicit leading 1
shl.b32 %r11, %r2, 8;
shr.u32 %r12, %r11, 1;
or.b32 %r13, %r12, 1073741824;
cvt.u64.u32 %rd13, %r13;
shl.b64 %rd25, %rd13, 32;
// shift > 63 means the value rounds to 0 in the integer part
setp.gt.s32 %p5, %r4, 63;
mov.u64 %rd24, 0;
@%p5 bra BB61_6;
// split into integer part (rd24) and fraction bits (rd25)
setp.ne.s32 %p6, %r3, 189;
mov.u32 %r14, 64;
sub.s32 %r15, %r14, %r4;
shl.b64 %rd14, %rd25, %r15;
cvt.u64.u32 %rd15, %r4;
selp.b64 %rd16, %rd15, 0, %p6;
cvt.u32.u64 %r16, %rd16;
shr.u64 %rd24, %rd25, %r16;
selp.b64 %rd25, %rd14, 0, %p6;
BB61_6:
// round: add the fraction's top bit, then restore the sign and convert
shr.u64 %rd17, %rd25, 63;
add.s64 %rd18, %rd17, %rd24;
neg.s64 %rd19, %rd18;
setp.lt.s32 %p7, %r2, 0;
selp.b64 %rd20, %rd19, %rd18, %p7;
cvt.rn.f32.s64 %f7, %rd20;
BB61_7:
// out[i] = rounded value
cvta.to.global.u64 %rd21, %rd8;
shl.b64 %rd22, %rd1, 2;
add.s64 %rd23, %rd21, %rd22;
st.global.f32 [%rd23], %f7;
BB61_8:
ret;
}
// .globl matrix_abs_d
//---------------------------------------------------------------------------
// matrix_abs_d(double *in, double *out, u32 n)
// Elementwise out[i] = fabs(in[i]) for i < n, one element per thread.
//---------------------------------------------------------------------------
.visible .entry matrix_abs_d(
.param .u64 matrix_abs_d_param_0,
.param .u64 matrix_abs_d_param_1,
.param .u32 matrix_abs_d_param_2
)
{
.reg .pred %p<2>;
.reg .b32 %r<6>;
.reg .f64 %fd<3>;
.reg .b64 %rd<8>;
ld.param.u64 %rd1, [matrix_abs_d_param_0];
ld.param.u64 %rd2, [matrix_abs_d_param_1];
ld.param.u32 %r2, [matrix_abs_d_param_2];
// global thread id; exit if id >= n
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
@%p1 bra BB62_2;
// out[i] = |in[i]|
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd1, [%rd5];
abs.f64 %fd2, %fd1;
cvta.to.global.u64 %rd6, %rd2;
add.s64 %rd7, %rd6, %rd4;
st.global.f64 [%rd7], %fd2;
BB62_2:
ret;
}
// .globl matrix_abs_f
//---------------------------------------------------------------------------
// matrix_abs_f(float *in, float *out, u32 n)
// Elementwise out[i] = fabsf(in[i]) for i < n, one element per thread.
//---------------------------------------------------------------------------
.visible .entry matrix_abs_f(
.param .u64 matrix_abs_f_param_0,
.param .u64 matrix_abs_f_param_1,
.param .u32 matrix_abs_f_param_2
)
{
.reg .pred %p<2>;
.reg .f32 %f<3>;
.reg .b32 %r<6>;
.reg .b64 %rd<8>;
ld.param.u64 %rd1, [matrix_abs_f_param_0];
ld.param.u64 %rd2, [matrix_abs_f_param_1];
ld.param.u32 %r2, [matrix_abs_f_param_2];
// global thread id; exit if id >= n
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
@%p1 bra BB63_2;
// out[i] = |in[i]|
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f1, [%rd5];
abs.f32 %f2, %f1;
cvta.to.global.u64 %rd6, %rd2;
add.s64 %rd7, %rd6, %rd4;
st.global.f32 [%rd7], %f2;
BB63_2:
ret;
}
// .globl matrix_log_d
//---------------------------------------------------------------------------
// matrix_log_d(double *in, double *out, u32 n)
// Elementwise out[i] = log(in[i]) for i < n, one element per thread.
// Standard double log decomposition: subnormals are first scaled by 2^53
// (with an exponent-bias correction of -1077 instead of -1023); the
// argument is split into 2^e * m with m near 1; log(m) is computed from
// u = (m-1)/(m+1)-style reduction using a reciprocal refined by two FMAs
// plus an odd polynomial; the result is e*ln2 + log(m) with ln2 applied
// in hi/lo parts. BB64_4 handles the special cases: +/-0 -> -inf, while
// negative, inf and NaN inputs propagate through fma(x, inf, inf).
//---------------------------------------------------------------------------
.visible .entry matrix_log_d(
.param .u64 matrix_log_d_param_0,
.param .u64 matrix_log_d_param_1,
.param .u32 matrix_log_d_param_2
)
{
.reg .pred %p<6>;
.reg .f32 %f<2>;
.reg .b32 %r<33>;
.reg .f64 %fd<59>;
.reg .b64 %rd<10>;
ld.param.u64 %rd2, [matrix_log_d_param_0];
ld.param.u64 %rd3, [matrix_log_d_param_1];
ld.param.u32 %r12, [matrix_log_d_param_2];
// global thread id; exit if id >= n
mov.u32 %r13, %ctaid.x;
mov.u32 %r14, %ntid.x;
mov.u32 %r15, %tid.x;
mad.lo.s32 %r1, %r14, %r13, %r15;
setp.ge.u32 %p1, %r1, %r12;
@%p1 bra BB64_9;
cvta.to.global.u64 %rd4, %rd2;
cvt.s64.s32 %rd1, %r1;
mul.wide.s32 %rd5, %r1, 8;
add.s64 %rd6, %rd4, %rd5;
ld.global.f64 %fd56, [%rd6];
// r29/r30 = high/low 32-bit words of x; r31 = exponent bias offset
{
.reg .b32 %temp;
mov.b64 {%temp, %r29}, %fd56;
}
{
.reg .b32 %temp;
mov.b64 {%r30, %temp}, %fd56;
}
mov.u32 %r31, -1023;
setp.gt.s32 %p2, %r29, 1048575;
@%p2 bra BB64_3;
// subnormal (or zero/negative) path: scale by 2^53, adjust bias to -1077
mul.f64 %fd56, %fd56, 0d4350000000000000;
{
.reg .b32 %temp;
mov.b64 {%temp, %r29}, %fd56;
}
{
.reg .b32 %temp;
mov.b64 {%r30, %temp}, %fd56;
}
mov.u32 %r31, -1077;
BB64_3:
// out-of-range high word (<=0 or inf/NaN) goes to the special-case path
add.s32 %r18, %r29, -1;
setp.lt.u32 %p3, %r18, 2146435071;
@%p3 bra BB64_5;
bra.uni BB64_4;
BB64_5:
// extract exponent e (r32) and rebuild mantissa m in [1, 2)
shr.u32 %r20, %r29, 20;
add.s32 %r32, %r31, %r20;
and.b32 %r21, %r29, -2146435073;
or.b32 %r22, %r21, 1072693248;
mov.b64 %fd57, {%r30, %r22};
setp.lt.s32 %p5, %r22, 1073127583;
@%p5 bra BB64_7;
// m >= sqrt(2): halve m and bump e so m stays near 1
{
.reg .b32 %temp;
mov.b64 {%r23, %temp}, %fd57;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r24}, %fd57;
}
add.s32 %r25, %r24, -1048576;
mov.b64 %fd57, {%r23, %r25};
add.s32 %r32, %r32, 1;
BB64_7:
// 1/(m+1) via rcp.approx refined with two Newton-style FMAs
add.f64 %fd13, %fd57, 0d3FF0000000000000;
// inline asm
rcp.approx.ftz.f64 %fd12,%fd13;
// inline asm
neg.f64 %fd14, %fd13;
mov.f64 %fd15, 0d3FF0000000000000;
fma.rn.f64 %fd16, %fd14, %fd12, %fd15;
fma.rn.f64 %fd17, %fd16, %fd16, %fd16;
fma.rn.f64 %fd18, %fd17, %fd12, %fd12;
// u = 2*(m-1)/(m+1); log(m) ~= u + u^3 * poly(u^2)
add.f64 %fd19, %fd57, 0dBFF0000000000000;
mul.f64 %fd20, %fd19, %fd18;
fma.rn.f64 %fd21, %fd19, %fd18, %fd20;
mul.f64 %fd22, %fd21, %fd21;
mov.f64 %fd23, 0d3ED0EE258B7A8B04;
mov.f64 %fd24, 0d3EB1380B3AE80F1E;
fma.rn.f64 %fd25, %fd24, %fd22, %fd23;
mov.f64 %fd26, 0d3EF3B2669F02676F;
fma.rn.f64 %fd27, %fd25, %fd22, %fd26;
mov.f64 %fd28, 0d3F1745CBA9AB0956;
fma.rn.f64 %fd29, %fd27, %fd22, %fd28;
mov.f64 %fd30, 0d3F3C71C72D1B5154;
fma.rn.f64 %fd31, %fd29, %fd22, %fd30;
mov.f64 %fd32, 0d3F624924923BE72D;
fma.rn.f64 %fd33, %fd31, %fd22, %fd32;
mov.f64 %fd34, 0d3F8999999999A3C4;
fma.rn.f64 %fd35, %fd33, %fd22, %fd34;
mov.f64 %fd36, 0d3FB5555555555554;
fma.rn.f64 %fd37, %fd35, %fd22, %fd36;
// compensated correction term for the u reduction error
sub.f64 %fd38, %fd19, %fd21;
add.f64 %fd39, %fd38, %fd38;
neg.f64 %fd40, %fd21;
fma.rn.f64 %fd41, %fd40, %fd19, %fd39;
mul.f64 %fd42, %fd18, %fd41;
mul.f64 %fd43, %fd22, %fd37;
fma.rn.f64 %fd44, %fd43, %fd21, %fd42;
// e as a double via the 0x43300000 (2^52) bit-pattern subtraction trick
xor.b32 %r26, %r32, -2147483648;
mov.u32 %r27, 1127219200;
mov.b64 %fd45, {%r26, %r27};
mov.u32 %r28, -2147483648;
mov.b64 %fd46, {%r28, %r27};
sub.f64 %fd47, %fd45, %fd46;
// result = e*ln2_hi + log(m) + e*ln2_lo (compensated)
mov.f64 %fd48, 0d3FE62E42FEFA39EF;
fma.rn.f64 %fd49, %fd47, %fd48, %fd21;
neg.f64 %fd50, %fd47;
fma.rn.f64 %fd51, %fd50, %fd48, %fd49;
sub.f64 %fd52, %fd51, %fd21;
sub.f64 %fd53, %fd44, %fd52;
mov.f64 %fd54, 0d3C7ABC9E3B39803F;
fma.rn.f64 %fd55, %fd47, %fd54, %fd53;
add.f64 %fd58, %fd49, %fd55;
bra.uni BB64_8;
BB64_4:
// special cases: zero -> -inf; negative/inf/NaN via fma(x, inf, inf)
mov.f64 %fd10, 0d7FF0000000000000;
fma.rn.f64 %fd11, %fd56, %fd10, %fd10;
{
.reg .b32 %temp;
mov.b64 {%temp, %r19}, %fd56;
}
mov.b32 %f1, %r19;
setp.eq.f32 %p4, %f1, 0f00000000;
selp.f64 %fd58, 0dFFF0000000000000, %fd11, %p4;
BB64_8:
// out[i] = result
cvta.to.global.u64 %rd7, %rd3;
shl.b64 %rd8, %rd1, 3;
add.s64 %rd9, %rd7, %rd8;
st.global.f64 [%rd9], %fd58;
BB64_9:
ret;
}
// .globl matrix_log_f
//---------------------------------------------------------------------------
// matrix_log_f(float *in, float *out, u32 n)
// Elementwise out[i] = logf(in[i]) for i < n, one element per thread.
// Subnormals are scaled by 2^23 (0f4B000000) with a -23 exponent
// correction (0fC1B80000). The exponent is peeled off with integer bit
// tricks, log of the mantissa comes from a degree-9 polynomial in
// (m - 1), and e*ln2 (0f3F317218) is added back. Non-finite / non-positive
// inputs: x = 0 -> -inf; inf/NaN handled via fma(x, inf, inf).
//---------------------------------------------------------------------------
.visible .entry matrix_log_f(
.param .u64 matrix_log_f_param_0,
.param .u64 matrix_log_f_param_1,
.param .u32 matrix_log_f_param_2
)
{
.reg .pred %p<5>;
.reg .f32 %f<36>;
.reg .b32 %r<10>;
.reg .b64 %rd<10>;
ld.param.u64 %rd2, [matrix_log_f_param_0];
ld.param.u64 %rd3, [matrix_log_f_param_1];
ld.param.u32 %r2, [matrix_log_f_param_2];
// global thread id; exit if id >= n
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
@%p1 bra BB65_4;
cvta.to.global.u64 %rd4, %rd2;
cvt.s64.s32 %rd1, %r1;
mul.wide.s32 %rd5, %r1, 4;
add.s64 %rd6, %rd4, %rd5;
ld.global.f32 %f5, [%rd6];
// scale subnormals (x < 2^-126) up by 2^23
setp.lt.f32 %p2, %f5, 0f00800000;
mul.f32 %f6, %f5, 0f4B000000;
selp.f32 %f1, %f6, %f5, %p2;
// split bits into exponent part (r8) and mantissa remainder (r9)
mov.b32 %r6, %f1;
add.s32 %r7, %r6, -1059760811;
and.b32 %r8, %r7, -8388608;
sub.s32 %r9, %r6, %r8;
mov.b32 %f7, %r9;
// f11 = effective exponent (with -23 correction on the subnormal path)
cvt.rn.f32.s32 %f8, %r8;
selp.f32 %f9, 0fC1B80000, 0f00000000, %p2;
mov.f32 %f10, 0f34000000;
fma.rn.f32 %f11, %f8, %f10, %f9;
// polynomial in t = m - 1 approximating log(1 + t)
add.f32 %f12, %f7, 0fBF800000;
mov.f32 %f13, 0f3E1039F6;
mov.f32 %f14, 0fBE055027;
fma.rn.f32 %f15, %f14, %f12, %f13;
mov.f32 %f16, 0fBDF8CDCC;
fma.rn.f32 %f17, %f15, %f12, %f16;
mov.f32 %f18, 0f3E0F2955;
fma.rn.f32 %f19, %f17, %f12, %f18;
mov.f32 %f20, 0fBE2AD8B9;
fma.rn.f32 %f21, %f19, %f12, %f20;
mov.f32 %f22, 0f3E4CED0B;
fma.rn.f32 %f23, %f21, %f12, %f22;
mov.f32 %f24, 0fBE7FFF22;
fma.rn.f32 %f25, %f23, %f12, %f24;
mov.f32 %f26, 0f3EAAAA78;
fma.rn.f32 %f27, %f25, %f12, %f26;
mov.f32 %f28, 0fBF000000;
fma.rn.f32 %f29, %f27, %f12, %f28;
mul.f32 %f30, %f12, %f29;
fma.rn.f32 %f31, %f30, %f12, %f12;
// result = e * ln2 + log(m)
mov.f32 %f32, 0f3F317218;
fma.rn.f32 %f35, %f11, %f32, %f31;
// non-finite input (bits >= 0x7F800000): propagate via fma(x, inf, inf)
setp.lt.u32 %p3, %r6, 2139095040;
@%p3 bra BB65_3;
mov.f32 %f33, 0f7F800000;
fma.rn.f32 %f35, %f1, %f33, %f33;
BB65_3:
// x == 0 -> -inf; then store
cvta.to.global.u64 %rd7, %rd3;
setp.eq.f32 %p4, %f1, 0f00000000;
selp.f32 %f34, 0fFF800000, %f35, %p4;
shl.b64 %rd8, %rd1, 2;
add.s64 %rd9, %rd7, %rd8;
st.global.f32 [%rd9], %f34;
BB65_4:
ret;
}
// .globl matrix_floor_d
//---------------------------------------------------------------------------
// matrix_floor_d(double *in, double *out, u32 n)
// Elementwise out[i] = floor(in[i]) for i < n, one element per thread
// (cvt.rmi = round toward minus infinity to integral value).
//---------------------------------------------------------------------------
.visible .entry matrix_floor_d(
.param .u64 matrix_floor_d_param_0,
.param .u64 matrix_floor_d_param_1,
.param .u32 matrix_floor_d_param_2
)
{
.reg .pred %p<2>;
.reg .b32 %r<6>;
.reg .f64 %fd<3>;
.reg .b64 %rd<8>;
ld.param.u64 %rd1, [matrix_floor_d_param_0];
ld.param.u64 %rd2, [matrix_floor_d_param_1];
ld.param.u32 %r2, [matrix_floor_d_param_2];
// global thread id; exit if id >= n
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
@%p1 bra BB66_2;
// out[i] = floor(in[i])
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd1, [%rd5];
cvt.rmi.f64.f64 %fd2, %fd1;
cvta.to.global.u64 %rd6, %rd2;
add.s64 %rd7, %rd6, %rd4;
st.global.f64 [%rd7], %fd2;
BB66_2:
ret;
}
// .globl matrix_floor_f
//---------------------------------------------------------------------------
// matrix_floor_f(float *in, float *out, u32 n)
// Elementwise out[i] = floorf(in[i]) for i < n, one element per thread
// (cvt.rmi = round toward minus infinity to integral value).
//---------------------------------------------------------------------------
.visible .entry matrix_floor_f(
.param .u64 matrix_floor_f_param_0,
.param .u64 matrix_floor_f_param_1,
.param .u32 matrix_floor_f_param_2
)
{
.reg .pred %p<2>;
.reg .f32 %f<3>;
.reg .b32 %r<6>;
.reg .b64 %rd<8>;
ld.param.u64 %rd1, [matrix_floor_f_param_0];
ld.param.u64 %rd2, [matrix_floor_f_param_1];
ld.param.u32 %r2, [matrix_floor_f_param_2];
// global thread id; exit if id >= n
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
@%p1 bra BB67_2;
// out[i] = floorf(in[i])
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f1, [%rd5];
cvt.rmi.f32.f32 %f2, %f1;
cvta.to.global.u64 %rd6, %rd2;
add.s64 %rd7, %rd6, %rd4;
st.global.f32 [%rd7], %f2;
BB67_2:
ret;
}
// .globl matrix_ceil_d
//---------------------------------------------------------------------------
// matrix_ceil_d(double *in, double *out, u32 n)
// Elementwise out[i] = ceil(in[i]) for i < n, one element per thread
// (cvt.rpi = round toward plus infinity to integral value).
//---------------------------------------------------------------------------
.visible .entry matrix_ceil_d(
.param .u64 matrix_ceil_d_param_0,
.param .u64 matrix_ceil_d_param_1,
.param .u32 matrix_ceil_d_param_2
)
{
.reg .pred %p<2>;
.reg .b32 %r<6>;
.reg .f64 %fd<3>;
.reg .b64 %rd<8>;
ld.param.u64 %rd1, [matrix_ceil_d_param_0];
ld.param.u64 %rd2, [matrix_ceil_d_param_1];
ld.param.u32 %r2, [matrix_ceil_d_param_2];
// global thread id; exit if id >= n
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
@%p1 bra BB68_2;
// out[i] = ceil(in[i])
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd1, [%rd5];
cvt.rpi.f64.f64 %fd2, %fd1;
cvta.to.global.u64 %rd6, %rd2;
add.s64 %rd7, %rd6, %rd4;
st.global.f64 [%rd7], %fd2;
BB68_2:
ret;
}
// .globl matrix_ceil_f
//---------------------------------------------------------------------------
// matrix_ceil_f(float *in, float *out, u32 n)
// Elementwise out[i] = ceilf(in[i]) for i < n, one element per thread
// (cvt.rpi = round toward plus infinity to integral value).
//---------------------------------------------------------------------------
.visible .entry matrix_ceil_f(
.param .u64 matrix_ceil_f_param_0,
.param .u64 matrix_ceil_f_param_1,
.param .u32 matrix_ceil_f_param_2
)
{
.reg .pred %p<2>;
.reg .f32 %f<3>;
.reg .b32 %r<6>;
.reg .b64 %rd<8>;
ld.param.u64 %rd1, [matrix_ceil_f_param_0];
ld.param.u64 %rd2, [matrix_ceil_f_param_1];
ld.param.u32 %r2, [matrix_ceil_f_param_2];
// global thread id; exit if id >= n
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
@%p1 bra BB69_2;
// out[i] = ceilf(in[i])
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f1, [%rd5];
cvt.rpi.f32.f32 %f2, %f1;
cvta.to.global.u64 %rd6, %rd2;
add.s64 %rd7, %rd6, %rd4;
st.global.f32 [%rd7], %f2;
BB69_2:
ret;
}
// .globl matrix_sin_d
//---------------------------------------------------------------------------
// matrix_sin_d(double *in, double *out, u32 n)
// Elementwise out[i] = sin(in[i]) for i < n, one element per thread.
// Argument reduction: q = rint(x * 2/pi), r = x - q*pi/2 applied in three
// FMA steps with a split pi/2 constant; arguments with large magnitude
// call the shared __internal_trig_reduction_slowpathd helper (the
// quadrant q round-trips through a 4-byte local slot). The polynomial
// coefficients come from the __cudart_sin_cos_coeffs table, selected by
// q's low bit (sin vs cos branch); q's bit 1 flips the sign. +/-inf is
// mapped to NaN up front via x * 0.
//---------------------------------------------------------------------------
.visible .entry matrix_sin_d(
.param .u64 matrix_sin_d_param_0,
.param .u64 matrix_sin_d_param_1,
.param .u32 matrix_sin_d_param_2
)
{
// 4-byte local slot used to pass the quadrant to/from the slow path
.local .align 4 .b8 __local_depot70[4];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<9>;
.reg .b32 %r<18>;
.reg .f64 %fd<41>;
.reg .b64 %rd<17>;
mov.u64 %rd16, __local_depot70;
cvta.local.u64 %SP, %rd16;
ld.param.u64 %rd3, [matrix_sin_d_param_0];
ld.param.u64 %rd4, [matrix_sin_d_param_1];
ld.param.u32 %r5, [matrix_sin_d_param_2];
add.u64 %rd5, %SP, 0;
cvta.to.local.u64 %rd1, %rd5;
// global thread id; exit if id >= n
mov.u32 %r6, %ntid.x;
mov.u32 %r7, %ctaid.x;
mov.u32 %r8, %tid.x;
mad.lo.s32 %r1, %r6, %r7, %r8;
setp.ge.u32 %p1, %r1, %r5;
@%p1 bra BB70_10;
cvta.to.global.u64 %rd6, %rd3;
cvt.s64.s32 %rd2, %r1;
mul.wide.s32 %rd7, %r1, 8;
add.s64 %rd8, %rd6, %rd7;
ld.global.f64 %fd38, [%rd8];
// detect +/-inf (abs high word == 0x7FF00000, low word == 0)
{
.reg .b32 %temp;
mov.b64 {%r9, %temp}, %fd38;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r10}, %fd38;
}
and.b32 %r11, %r10, 2147483647;
setp.eq.s32 %p2, %r11, 2146435072;
setp.eq.s32 %p3, %r9, 0;
and.pred %p4, %p2, %p3;
@!%p4 bra BB70_3;
bra.uni BB70_2;
BB70_2:
// inf * 0 -> NaN
mov.f64 %fd14, 0d0000000000000000;
mul.rn.f64 %fd38, %fd38, %fd14;
BB70_3:
// q = rint(x * 2/pi); r = x - q*pi/2 in three compensated FMA steps
mul.f64 %fd15, %fd38, 0d3FE45F306DC9C883;
cvt.rni.s32.f64 %r17, %fd15;
st.local.u32 [%rd1], %r17;
cvt.rn.f64.s32 %fd16, %r17;
neg.f64 %fd17, %fd16;
mov.f64 %fd18, 0d3FF921FB54442D18;
fma.rn.f64 %fd19, %fd17, %fd18, %fd38;
mov.f64 %fd20, 0d3C91A62633145C00;
fma.rn.f64 %fd21, %fd17, %fd20, %fd19;
mov.f64 %fd22, 0d397B839A252049C0;
fma.rn.f64 %fd39, %fd17, %fd22, %fd21;
// large-magnitude arguments take the slow reduction path
{
.reg .b32 %temp;
mov.b64 {%temp, %r12}, %fd38;
}
and.b32 %r13, %r12, 2145386496;
setp.lt.u32 %p5, %r13, 1105199104;
@%p5 bra BB70_5;
// Callseq Start 3
{
.reg .b32 temp_param_reg;
// }
.param .b64 param0;
st.param.f64 [param0+0], %fd38;
.param .b64 param1;
st.param.b64 [param1+0], %rd5;
.param .b64 retval0;
call.uni (retval0),
__internal_trig_reduction_slowpathd,
(
param0,
param1
);
ld.param.f64 %fd39, [retval0+0];
//{
}// Callseq End 3
// reload the quadrant written by the slow path
ld.local.u32 %r17, [%rd1];
BB70_5:
// choose sin/cos coefficient row by q's low bit; evaluate polynomial
and.b32 %r14, %r17, 1;
shl.b32 %r15, %r14, 3;
setp.eq.b32 %p6, %r14, 1;
selp.f64 %fd23, 0dBDA8FF8320FD8164, 0d3DE5DB65F9785EBA, %p6;
mul.wide.u32 %rd10, %r15, 8;
mov.u64 %rd11, __cudart_sin_cos_coeffs;
add.s64 %rd12, %rd10, %rd11;
ld.const.f64 %fd24, [%rd12+8];
mul.rn.f64 %fd7, %fd39, %fd39;
fma.rn.f64 %fd25, %fd23, %fd7, %fd24;
ld.const.f64 %fd26, [%rd12+16];
fma.rn.f64 %fd27, %fd25, %fd7, %fd26;
ld.const.f64 %fd28, [%rd12+24];
fma.rn.f64 %fd29, %fd27, %fd7, %fd28;
ld.const.f64 %fd30, [%rd12+32];
fma.rn.f64 %fd31, %fd29, %fd7, %fd30;
ld.const.f64 %fd32, [%rd12+40];
fma.rn.f64 %fd33, %fd31, %fd7, %fd32;
ld.const.f64 %fd34, [%rd12+48];
fma.rn.f64 %fd8, %fd33, %fd7, %fd34;
// sin branch: r + r^3*poly; cos branch (below): 1 + r^2*poly
fma.rn.f64 %fd40, %fd8, %fd39, %fd39;
setp.eq.s32 %p7, %r14, 0;
@%p7 bra BB70_7;
mov.f64 %fd35, 0d3FF0000000000000;
fma.rn.f64 %fd40, %fd8, %fd7, %fd35;
BB70_7:
// q's bit 1 selects quadrants where the result is negated
and.b32 %r16, %r17, 2;
setp.eq.s32 %p8, %r16, 0;
@%p8 bra BB70_9;
mov.f64 %fd36, 0d0000000000000000;
mov.f64 %fd37, 0dBFF0000000000000;
fma.rn.f64 %fd40, %fd40, %fd37, %fd36;
BB70_9:
// out[i] = result
cvta.to.global.u64 %rd13, %rd4;
shl.b64 %rd14, %rd2, 3;
add.s64 %rd15, %rd13, %rd14;
st.global.f64 [%rd15], %fd40;
BB70_10:
ret;
}
// .globl matrix_sin_f
//---------------------------------------------------------------------------
// matrix_sin_f(float *in, float *out, u32 n)
// Elementwise out[i] = sinf(in[i]) for i < n, one element per thread.
// Fast path: q = rint(x * 2/pi), r = x - q*pi/2 via three FMAs with a
// split pi/2. For |x| > 0f47CE4780 a full Payne-Hanek reduction is done
// inline: the mantissa is multiplied against the __cudart_i2opi_f table
// of 2/pi bits into a 28-byte local buffer, the relevant 64 bits are
// extracted and renormalized back to a float remainder. q's low bit picks
// the sin or cos polynomial; bit 1 negates. +/-inf -> NaN via x * 0.
//---------------------------------------------------------------------------
.visible .entry matrix_sin_f(
.param .u64 matrix_sin_f_param_0,
.param .u64 matrix_sin_f_param_1,
.param .u32 matrix_sin_f_param_2
)
{
// 28-byte local buffer holding the 2/pi product limbs
.local .align 4 .b8 __local_depot71[28];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<15>;
.reg .f32 %f<48>;
.reg .b32 %r<95>;
.reg .b64 %rd<22>;
mov.u64 %rd21, __local_depot71;
cvta.local.u64 %SP, %rd21;
ld.param.u64 %rd8, [matrix_sin_f_param_0];
ld.param.u64 %rd9, [matrix_sin_f_param_1];
ld.param.u32 %r30, [matrix_sin_f_param_2];
// global thread id; exit if id >= n
mov.u32 %r31, %ntid.x;
mov.u32 %r32, %ctaid.x;
mov.u32 %r33, %tid.x;
mad.lo.s32 %r1, %r31, %r32, %r33;
setp.ge.u32 %p1, %r1, %r30;
@%p1 bra BB71_22;
cvta.to.global.u64 %rd10, %rd8;
cvt.s64.s32 %rd1, %r1;
mul.wide.s32 %rd11, %r1, 4;
add.s64 %rd12, %rd10, %rd11;
add.u64 %rd13, %SP, 0;
cvta.to.local.u64 %rd2, %rd13;
ld.global.f32 %f43, [%rd12];
// +/-inf -> NaN via multiply by zero
abs.f32 %f19, %f43;
setp.neu.f32 %p2, %f19, 0f7F800000;
@%p2 bra BB71_3;
mov.f32 %f20, 0f00000000;
mul.rn.f32 %f43, %f43, %f20;
BB71_3:
// fast reduction: q = rint(x * 2/pi), r = x - q*pi/2 (split constant)
mul.f32 %f21, %f43, 0f3F22F983;
cvt.rni.s32.f32 %r94, %f21;
cvt.rn.f32.s32 %f22, %r94;
neg.f32 %f23, %f22;
mov.f32 %f24, 0f3FC90FDA;
fma.rn.f32 %f25, %f23, %f24, %f43;
mov.f32 %f26, 0f33A22168;
fma.rn.f32 %f27, %f23, %f26, %f25;
mov.f32 %f28, 0f27C234C5;
fma.rn.f32 %f44, %f23, %f28, %f27;
// |x| <= 0f47CE4780: fast reduction is accurate enough
abs.f32 %f29, %f43;
setp.leu.f32 %p3, %f29, 0f47CE4780;
@%p3 bra BB71_11;
// --- Payne-Hanek slow path ---
// r5 = mantissa with implicit bit at the top; r4 = exponent byte
mov.b32 %r3, %f43;
shr.u32 %r4, %r3, 23;
shl.b32 %r36, %r3, 8;
or.b32 %r5, %r36, -2147483648;
mov.u32 %r88, 0;
mov.u64 %rd19, __cudart_i2opi_f;
mov.u32 %r87, -6;
mov.u64 %rd20, %rd2;
BB71_5:
// multiply mantissa by six 32-bit limbs of 2/pi, storing partial
// products (with carry) into the local buffer
.pragma "nounroll";
mov.u64 %rd4, %rd20;
ld.const.u32 %r39, [%rd19];
// inline asm
{
mad.lo.cc.u32 %r37, %r39, %r5, %r88;
madc.hi.u32 %r88, %r39, %r5, 0;
}
// inline asm
st.local.u32 [%rd4], %r37;
add.s64 %rd5, %rd4, 4;
add.s64 %rd19, %rd19, 4;
add.s32 %r87, %r87, 1;
setp.ne.s32 %p4, %r87, 0;
mov.u64 %rd20, %rd5;
@%p4 bra BB71_5;
// select the two limbs holding the quadrant and fraction bits,
// based on the exponent
and.b32 %r42, %r4, 255;
add.s32 %r43, %r42, -128;
shr.u32 %r44, %r43, 5;
and.b32 %r10, %r3, -2147483648;
st.local.u32 [%rd2+24], %r88;
mov.u32 %r45, 6;
sub.s32 %r46, %r45, %r44;
mul.wide.s32 %rd15, %r46, 4;
add.s64 %rd7, %rd2, %rd15;
ld.local.u32 %r89, [%rd7];
ld.local.u32 %r90, [%rd7+-4];
// sub-word alignment by (exponent mod 32) bits
and.b32 %r13, %r4, 31;
setp.eq.s32 %p5, %r13, 0;
@%p5 bra BB71_8;
mov.u32 %r47, 32;
sub.s32 %r48, %r47, %r13;
shr.u32 %r49, %r90, %r48;
shl.b32 %r50, %r89, %r13;
add.s32 %r89, %r49, %r50;
ld.local.u32 %r51, [%rd7+-8];
shr.u32 %r52, %r51, %r48;
shl.b32 %r53, %r90, %r13;
add.s32 %r90, %r52, %r53;
BB71_8:
// peel off the quadrant (top 2 bits) and round the fraction
shr.u32 %r54, %r90, 30;
shl.b32 %r55, %r89, 2;
add.s32 %r91, %r54, %r55;
shl.b32 %r19, %r90, 2;
shr.u32 %r56, %r91, 31;
shr.u32 %r57, %r89, 30;
add.s32 %r20, %r56, %r57;
setp.eq.s32 %p6, %r56, 0;
mov.u32 %r92, %r10;
mov.u32 %r93, %r19;
@%p6 bra BB71_10;
// fraction >= 0.5: complement and flip the sign
not.b32 %r58, %r91;
neg.s32 %r21, %r19;
setp.eq.s32 %p7, %r19, 0;
selp.u32 %r59, 1, 0, %p7;
add.s32 %r91, %r59, %r58;
xor.b32 %r23, %r10, -2147483648;
mov.u32 %r92, %r23;
mov.u32 %r93, %r21;
BB71_10:
// normalize the fraction, multiply by pi/2 in fixed point, and
// rebuild the reduced float remainder with the proper sign
mov.u32 %r25, %r92;
neg.s32 %r60, %r20;
setp.ne.s32 %p8, %r10, 0;
selp.b32 %r94, %r60, %r20, %p8;
clz.b32 %r61, %r91;
setp.ne.s32 %p9, %r61, 0;
shl.b32 %r62, %r91, %r61;
mov.u32 %r63, 32;
sub.s32 %r64, %r63, %r61;
shr.u32 %r65, %r93, %r64;
add.s32 %r66, %r65, %r62;
selp.b32 %r67, %r66, %r91, %p9;
mul.lo.s32 %r68, %r67, -921707870;
mov.u32 %r69, -921707870;
mul.hi.u32 %r70, %r67, %r69;
setp.gt.s32 %p10, %r70, 0;
shl.b32 %r71, %r70, 1;
shr.u32 %r72, %r68, 31;
add.s32 %r73, %r72, %r71;
selp.b32 %r74, %r73, %r70, %p10;
selp.b32 %r75, -1, 0, %p10;
mov.u32 %r76, 126;
sub.s32 %r77, %r76, %r61;
add.s32 %r78, %r77, %r75;
shl.b32 %r79, %r78, 23;
add.s32 %r80, %r74, 1;
shr.u32 %r81, %r80, 7;
add.s32 %r82, %r81, 1;
shr.u32 %r83, %r82, 1;
add.s32 %r84, %r83, %r79;
or.b32 %r85, %r84, %r25;
mov.b32 %f44, %r85;
BB71_11:
// polynomial evaluation: q's low bit selects sin vs cos coefficients
mul.rn.f32 %f7, %f44, %f44;
and.b32 %r29, %r94, 1;
setp.eq.s32 %p11, %r29, 0;
@%p11 bra BB71_13;
mov.f32 %f30, 0fBAB6061A;
mov.f32 %f31, 0f37CCF5CE;
fma.rn.f32 %f45, %f31, %f7, %f30;
bra.uni BB71_14;
BB71_13:
mov.f32 %f32, 0f3C08839E;
mov.f32 %f33, 0fB94CA1F9;
fma.rn.f32 %f45, %f33, %f7, %f32;
BB71_14:
@%p11 bra BB71_16;
mov.f32 %f34, 0f3D2AAAA5;
fma.rn.f32 %f35, %f45, %f7, %f34;
mov.f32 %f36, 0fBF000000;
fma.rn.f32 %f46, %f35, %f7, %f36;
bra.uni BB71_17;
BB71_16:
mov.f32 %f37, 0fBE2AAAA3;
fma.rn.f32 %f38, %f45, %f7, %f37;
mov.f32 %f39, 0f00000000;
fma.rn.f32 %f46, %f38, %f7, %f39;
BB71_17:
// sin branch: r + r^3*poly; cos branch: 1 + r^2*poly
fma.rn.f32 %f47, %f46, %f44, %f44;
@%p11 bra BB71_19;
mov.f32 %f40, 0f3F800000;
fma.rn.f32 %f47, %f46, %f7, %f40;
BB71_19:
// q's bit 1 selects quadrants where the result is negated
and.b32 %r86, %r94, 2;
setp.eq.s32 %p14, %r86, 0;
@%p14 bra BB71_21;
mov.f32 %f41, 0f00000000;
mov.f32 %f42, 0fBF800000;
fma.rn.f32 %f47, %f47, %f42, %f41;
BB71_21:
// out[i] = result
cvta.to.global.u64 %rd16, %rd9;
shl.b64 %rd17, %rd1, 2;
add.s64 %rd18, %rd16, %rd17;
st.global.f32 [%rd18], %f47;
BB71_22:
ret;
}
// .globl matrix_sinh_d
//---------------------------------------------------------------------------
// matrix_sinh_d(double *in, double *out, u32 n)
// Elementwise out[i] = sinh(in[i]) for i < n, one element per thread.
// Works on |x| (sign stripped from the high word and restored at the
// store). For |x| < 1.0: odd Taylor-style polynomial x + x^3*poly(x^2).
// For |x| >= 1.0: computes t = expm1(|x|)-like quantity via an inlined
// exp kernel (Cody-Waite reduction + polynomial + exponent scaling with
// a half-exponent bias to delay overflow), then sinh = t + t/(2t+1)
// (= (e^x - e^-x)/2 rewritten to avoid a second exp), saturating to
// +inf for |x| >= 0d408633CE8FB9F87E (~710.48).
//---------------------------------------------------------------------------
.visible .entry matrix_sinh_d(
.param .u64 matrix_sinh_d_param_0,
.param .u64 matrix_sinh_d_param_1,
.param .u32 matrix_sinh_d_param_2
)
{
.reg .pred %p<7>;
.reg .b32 %r<24>;
.reg .f64 %fd<68>;
.reg .b64 %rd<10>;
ld.param.u64 %rd2, [matrix_sinh_d_param_0];
ld.param.u64 %rd3, [matrix_sinh_d_param_1];
ld.param.u32 %r3, [matrix_sinh_d_param_2];
// global thread id; exit if id >= n
mov.u32 %r4, %ctaid.x;
mov.u32 %r5, %ntid.x;
mov.u32 %r6, %tid.x;
mad.lo.s32 %r1, %r5, %r4, %r6;
setp.ge.u32 %p1, %r1, %r3;
@%p1 bra BB72_5;
cvta.to.global.u64 %rd4, %rd2;
cvt.s64.s32 %rd1, %r1;
mul.wide.s32 %rd5, %r1, 8;
add.s64 %rd6, %rd4, %rd5;
ld.global.f64 %fd5, [%rd6];
// fd1 = |x| (sign bit cleared in the high word; r2 keeps original bits)
{
.reg .b32 %temp;
mov.b64 {%temp, %r2}, %fd5;
}
and.b32 %r7, %r2, 2147483647;
{
.reg .b32 %temp;
mov.b64 {%r8, %temp}, %fd5;
}
mov.b64 %fd1, {%r8, %r7};
// |x| < 1.0 takes the polynomial path
setp.lt.u32 %p2, %r7, 1072693248;
@%p2 bra BB72_3;
bra.uni BB72_2;
BB72_3:
// small path: sinh(x) ~= x + x^3 * poly(x^2)
mul.f64 %fd51, %fd1, %fd1;
mov.f64 %fd52, 0d3DE611A561D87DEF;
mov.f64 %fd53, 0d3D6B4C75AB274C53;
fma.rn.f64 %fd54, %fd53, %fd51, %fd52;
mov.f64 %fd55, 0d3E5AE64671B18F5C;
fma.rn.f64 %fd56, %fd54, %fd51, %fd55;
mov.f64 %fd57, 0d3EC71DE3A465B1E4;
fma.rn.f64 %fd58, %fd56, %fd51, %fd57;
mov.f64 %fd59, 0d3F2A01A01A02899D;
fma.rn.f64 %fd60, %fd58, %fd51, %fd59;
mov.f64 %fd61, 0d3F811111111110A6;
fma.rn.f64 %fd62, %fd60, %fd51, %fd61;
mov.f64 %fd63, 0d3FC5555555555556;
fma.rn.f64 %fd64, %fd62, %fd51, %fd63;
mul.f64 %fd65, %fd51, %fd64;
fma.rn.f64 %fd67, %fd65, %fd1, %fd1;
bra.uni BB72_4;
BB72_2:
// large path: inlined exp-style evaluation on |x|
{
.reg .b32 %temp;
mov.b64 {%temp, %r9}, %fd1;
}
// k = rint(|x| * log2(e)) via the 2^52 magic constant
mov.f64 %fd6, 0d4338000000000000;
mov.f64 %fd7, 0d3FF71547652B82FE;
fma.rn.f64 %fd8, %fd1, %fd7, %fd6;
{
.reg .b32 %temp;
mov.b64 {%r10, %temp}, %fd8;
}
add.s32 %r11, %r10, -1;
mov.f64 %fd9, 0dC338000000000000;
add.rn.f64 %fd10, %fd8, %fd9;
// r = |x| - k*ln2 (hi/lo Cody-Waite)
mov.f64 %fd11, 0dBFE62E42FEFA39EF;
fma.rn.f64 %fd12, %fd10, %fd11, %fd1;
mov.f64 %fd13, 0dBC7ABC9E3B39803F;
fma.rn.f64 %fd14, %fd10, %fd13, %fd12;
// tiny-argument shortcut: skip the reduction for small |x|
add.s32 %r12, %r9, %r9;
setp.lt.u32 %p3, %r12, 2142496327;
selp.b32 %r13, 0, %r11, %p3;
selp.f64 %fd15, %fd1, %fd14, %p3;
// expm1-style polynomial in r
mov.f64 %fd16, 0d3E5AF86D8EBD13CD;
mov.f64 %fd17, 0d3E21F4076ACD15B6;
fma.rn.f64 %fd18, %fd17, %fd15, %fd16;
mov.f64 %fd19, 0d3E927E5092BA033D;
fma.rn.f64 %fd20, %fd18, %fd15, %fd19;
mov.f64 %fd21, 0d3EC71DDE6C5F9DA1;
fma.rn.f64 %fd22, %fd20, %fd15, %fd21;
mov.f64 %fd23, 0d3EFA01A018D034E6;
fma.rn.f64 %fd24, %fd22, %fd15, %fd23;
mov.f64 %fd25, 0d3F2A01A01B3B6940;
fma.rn.f64 %fd26, %fd24, %fd15, %fd25;
mov.f64 %fd27, 0d3F56C16C16C1B5DD;
fma.rn.f64 %fd28, %fd26, %fd15, %fd27;
mov.f64 %fd29, 0d3F8111111110F74D;
fma.rn.f64 %fd30, %fd28, %fd15, %fd29;
mov.f64 %fd31, 0d3FA555555555554D;
fma.rn.f64 %fd32, %fd30, %fd15, %fd31;
mov.f64 %fd33, 0d3FC5555555555557;
fma.rn.f64 %fd34, %fd32, %fd15, %fd33;
mov.f64 %fd35, 0d3FE0000000000000;
fma.rn.f64 %fd36, %fd34, %fd15, %fd35;
mul.f64 %fd37, %fd15, %fd36;
fma.rn.f64 %fd38, %fd37, %fd15, %fd15;
// scale by 2^k built from exponent bits; k == 1024 handled by
// scaling down then doubling to avoid premature overflow
setp.eq.s32 %p4, %r13, 1024;
selp.b32 %r14, -1, 0, %p4;
add.s32 %r15, %r14, %r13;
shl.b32 %r16, %r15, 20;
add.s32 %r17, %r16, 1072693248;
mov.u32 %r18, 0;
mov.b64 %fd39, {%r18, %r17};
mov.u32 %r19, 1071644672;
mov.b64 %fd40, {%r18, %r19};
sub.f64 %fd41, %fd39, %fd40;
fma.rn.f64 %fd42, %fd38, %fd39, %fd41;
add.f64 %fd43, %fd42, %fd42;
selp.f64 %fd44, %fd43, %fd42, %p4;
setp.eq.s32 %p5, %r12, 0;
selp.f64 %fd45, %fd15, %fd44, %p5;
// sinh(|x|) = t + t/(2t + 1); saturate to +inf past the overflow limit
mov.f64 %fd46, 0d3FF0000000000000;
mov.f64 %fd47, 0d4000000000000000;
fma.rn.f64 %fd48, %fd47, %fd45, %fd46;
div.rn.f64 %fd49, %fd45, %fd48;
add.f64 %fd50, %fd49, %fd45;
setp.ge.f64 %p6, %fd1, 0d408633CE8FB9F87E;
selp.f64 %fd67, 0d7FF0000000000000, %fd50, %p6;
BB72_4:
// reattach the original sign bit and store
cvta.to.global.u64 %rd7, %rd3;
and.b32 %r20, %r2, -2147483648;
{
.reg .b32 %temp;
mov.b64 {%temp, %r21}, %fd67;
}
or.b32 %r22, %r21, %r20;
{
.reg .b32 %temp;
mov.b64 {%r23, %temp}, %fd67;
}
mov.b64 %fd66, {%r23, %r22};
shl.b64 %rd8, %rd1, 3;
add.s64 %rd9, %rd7, %rd8;
st.global.f64 [%rd9], %fd66;
BB72_5:
ret;
}
// .globl matrix_sinh_f
//---------------------------------------------------------------------------
// matrix_sinh_f(float *in, float *out, u32 n)
// Elementwise out[i] = sinhf(in[i]) for i < n, one element per thread.
// For |x| < 1.0: odd polynomial x + x^3*poly(x^2). Otherwise computes
// t = e^|x|/4 via ex2.approx (2^(k-2) * 2^r), and forms
// sinh(|x|) = 2t - 0.125/t = (e^|x| - e^-|x|)/2; the original sign bit
// is OR'd back onto the result. |x| >= 90.0 (0f42B40000) saturates the
// magnitude to +inf (0x7F800000) before the sign is applied.
//---------------------------------------------------------------------------
.visible .entry matrix_sinh_f(
.param .u64 matrix_sinh_f_param_0,
.param .u64 matrix_sinh_f_param_1,
.param .u32 matrix_sinh_f_param_2
)
{
.reg .pred %p<4>;
.reg .f32 %f<32>;
.reg .b32 %r<11>;
.reg .b64 %rd<10>;
ld.param.u64 %rd2, [matrix_sinh_f_param_0];
ld.param.u64 %rd3, [matrix_sinh_f_param_1];
ld.param.u32 %r2, [matrix_sinh_f_param_2];
// global thread id; exit if id >= n
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
@%p1 bra BB73_5;
cvta.to.global.u64 %rd4, %rd2;
cvt.s64.s32 %rd1, %r1;
mul.wide.s32 %rd5, %r1, 4;
add.s64 %rd6, %rd4, %rd5;
ld.global.f32 %f1, [%rd6];
// branch on |x| < 1.0 (polynomial) vs >= 1.0 (exp-based)
abs.f32 %f2, %f1;
setp.ltu.f32 %p2, %f2, 0f3F800000;
@%p2 bra BB73_3;
bra.uni BB73_2;
BB73_3:
// small path: sinh(x) ~= x + x^3 * poly(x^2)
mul.f32 %f22, %f1, %f1;
mov.f32 %f23, 0f394FFF49;
mov.f32 %f24, 0f363D0ADA;
fma.rn.f32 %f25, %f24, %f22, %f23;
mov.f32 %f26, 0f3C08889A;
fma.rn.f32 %f27, %f25, %f22, %f26;
mov.f32 %f28, 0f3E2AAAAB;
fma.rn.f32 %f29, %f27, %f22, %f28;
mul.f32 %f30, %f22, %f29;
fma.rn.f32 %f31, %f30, %f1, %f1;
bra.uni BB73_4;
BB73_2:
// large path: k = trunc(|x| * log2(e)); r = |x| - k*ln2 (hi/lo)
mul.f32 %f8, %f2, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f9, %f8;
mov.f32 %f10, 0fBF317200;
fma.rn.f32 %f11, %f9, %f10, %f2;
mov.f32 %f12, 0fB5BFBE8E;
fma.rn.f32 %f13, %f9, %f12, %f11;
mul.f32 %f7, %f13, 0f3FB8AA3B;
// inline asm
ex2.approx.ftz.f32 %f6,%f7;
// inline asm
// f16 = 2^(k-2) * 2^r = e^|x| / 4
add.f32 %f14, %f9, 0fC0000000;
ex2.approx.f32 %f15, %f14;
mul.f32 %f16, %f6, %f15;
// sinh(|x|) = 2*t - 0.125/t with t = e^|x|/4
mov.f32 %f17, 0f3E000000;
div.approx.f32 %f18, %f17, %f16;
neg.f32 %f19, %f18;
mov.f32 %f20, 0f40000000;
fma.rn.f32 %f21, %f20, %f16, %f19;
mov.b32 %r6, %f21;
// saturate to +inf for |x| >= 90, then restore the sign bit
setp.ge.f32 %p3, %f2, 0f42B40000;
selp.b32 %r7, 2139095040, %r6, %p3;
mov.b32 %r8, %f1;
and.b32 %r9, %r8, -2147483648;
or.b32 %r10, %r7, %r9;
mov.b32 %f31, %r10;
BB73_4:
// out[i] = result
cvta.to.global.u64 %rd7, %rd3;
shl.b64 %rd8, %rd1, 2;
add.s64 %rd9, %rd7, %rd8;
st.global.f32 [%rd9], %f31;
BB73_5:
ret;
}
// .globl matrix_cos_d
// Elementwise double-precision cosine kernel (per name):
// out[i] = cos(in[i]) for i in [0, n).
//   param_0 = input pointer, param_1 = output pointer, param_2 = count n.
// Uses a 4-byte local slot to pass the quadrant index to/from the
// double-precision trig-reduction slow path for huge arguments.
.visible .entry matrix_cos_d(
.param .u64 matrix_cos_d_param_0,
.param .u64 matrix_cos_d_param_1,
.param .u32 matrix_cos_d_param_2
)
{
.local .align 4 .b8 __local_depot74[4];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<9>;
.reg .b32 %r<19>;
.reg .f64 %fd<41>;
.reg .b64 %rd<17>;
mov.u64 %rd16, __local_depot74;
cvta.local.u64 %SP, %rd16;
ld.param.u64 %rd3, [matrix_cos_d_param_0];
ld.param.u64 %rd4, [matrix_cos_d_param_1];
ld.param.u32 %r6, [matrix_cos_d_param_2];
add.u64 %rd5, %SP, 0;
cvta.to.local.u64 %rd1, %rd5;
// Global element index and bounds check (one thread per element).
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %ctaid.x;
mov.u32 %r9, %tid.x;
mad.lo.s32 %r1, %r7, %r8, %r9;
setp.ge.u32 %p1, %r1, %r6;
@%p1 bra BB74_10;
cvta.to.global.u64 %rd6, %rd3;
cvt.s64.s32 %rd2, %r1;
mul.wide.s32 %rd7, %r1, 8;
add.s64 %rd8, %rd6, %rd7;
ld.global.f64 %fd38, [%rd8];
// Split the double into low (%r10) and high (%r11) 32-bit words.
{
.reg .b32 %temp;
mov.b64 {%r10, %temp}, %fd38;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r11}, %fd38;
}
// If the input is +/-Inf (exponent bits 2146435072, zero mantissa),
// multiply by 0 to turn it into NaN before continuing.
and.b32 %r12, %r11, 2147483647;
setp.eq.s32 %p2, %r12, 2146435072;
setp.eq.s32 %p3, %r10, 0;
and.pred %p4, %p2, %p3;
@!%p4 bra BB74_3;
bra.uni BB74_2;
BB74_2:
mov.f64 %fd14, 0d0000000000000000;
mul.rn.f64 %fd38, %fd38, %fd14;
BB74_3:
// Fast range reduction: q = round(x * 0d3FE45F306DC9C883) — the
// constant is presumably 2/pi — then r = x - q*(pi/2) computed with a
// three-term representation of pi/2 for extra precision.
mul.f64 %fd15, %fd38, 0d3FE45F306DC9C883;
cvt.rni.s32.f64 %r18, %fd15;
st.local.u32 [%rd1], %r18;
cvt.rn.f64.s32 %fd16, %r18;
neg.f64 %fd17, %fd16;
mov.f64 %fd18, 0d3FF921FB54442D18;
fma.rn.f64 %fd19, %fd17, %fd18, %fd38;
mov.f64 %fd20, 0d3C91A62633145C00;
fma.rn.f64 %fd21, %fd17, %fd20, %fd19;
mov.f64 %fd22, 0d397B839A252049C0;
fma.rn.f64 %fd39, %fd17, %fd22, %fd21;
{
.reg .b32 %temp;
mov.b64 {%temp, %r13}, %fd38;
}
// Large-magnitude inputs (masked exponent >= threshold) need the
// accurate slow-path reduction.
and.b32 %r14, %r13, 2145386496;
setp.lt.u32 %p5, %r14, 1105199104;
@%p5 bra BB74_5;
// Callseq Start 4
{
.reg .b32 temp_param_reg;
// }
.param .b64 param0;
st.param.f64 [param0+0], %fd38;
.param .b64 param1;
st.param.b64 [param1+0], %rd5;
.param .b64 retval0;
call.uni (retval0),
__internal_trig_reduction_slowpathd,
(
param0,
param1
);
ld.param.f64 %fd39, [retval0+0];
//{
}// Callseq End 4
// Reload the quadrant index written by the slow path into local memory.
ld.local.u32 %r18, [%rd1];
BB74_5:
// cos(x) = sin(x + pi/2): use quadrant q+1, pick sin or cos polynomial
// coefficients from __cudart_sin_cos_coeffs based on the low quadrant bit.
add.s32 %r5, %r18, 1;
and.b32 %r15, %r5, 1;
shl.b32 %r16, %r15, 3;
setp.eq.b32 %p6, %r15, 1;
selp.f64 %fd23, 0dBDA8FF8320FD8164, 0d3DE5DB65F9785EBA, %p6;
mul.wide.u32 %rd10, %r16, 8;
mov.u64 %rd11, __cudart_sin_cos_coeffs;
add.s64 %rd12, %rd10, %rd11;
ld.const.f64 %fd24, [%rd12+8];
// Evaluate the polynomial in r^2 via an FMA chain over the table.
mul.rn.f64 %fd7, %fd39, %fd39;
fma.rn.f64 %fd25, %fd23, %fd7, %fd24;
ld.const.f64 %fd26, [%rd12+16];
fma.rn.f64 %fd27, %fd25, %fd7, %fd26;
ld.const.f64 %fd28, [%rd12+24];
fma.rn.f64 %fd29, %fd27, %fd7, %fd28;
ld.const.f64 %fd30, [%rd12+32];
fma.rn.f64 %fd31, %fd29, %fd7, %fd30;
ld.const.f64 %fd32, [%rd12+40];
fma.rn.f64 %fd33, %fd31, %fd7, %fd32;
ld.const.f64 %fd34, [%rd12+48];
fma.rn.f64 %fd8, %fd33, %fd7, %fd34;
// Odd quadrant: result = poly*r + r (sine form); even quadrant
// overwrites below with poly*r^2 + 1 (cosine form).
fma.rn.f64 %fd40, %fd8, %fd39, %fd39;
setp.eq.s32 %p7, %r15, 0;
@%p7 bra BB74_7;
mov.f64 %fd35, 0d3FF0000000000000;
fma.rn.f64 %fd40, %fd8, %fd7, %fd35;
BB74_7:
// Quadrant bit 1 set -> negate the result (multiply by -1.0).
and.b32 %r17, %r5, 2;
setp.eq.s32 %p8, %r17, 0;
@%p8 bra BB74_9;
mov.f64 %fd36, 0d0000000000000000;
mov.f64 %fd37, 0dBFF0000000000000;
fma.rn.f64 %fd40, %fd40, %fd37, %fd36;
BB74_9:
// Store the result at the same element index in the output array.
cvta.to.global.u64 %rd13, %rd4;
shl.b64 %rd14, %rd2, 3;
add.s64 %rd15, %rd13, %rd14;
st.global.f64 [%rd15], %fd40;
BB74_10:
ret;
}
// .globl matrix_cos_f
// Elementwise single-precision cosine kernel (per name):
// out[i] = cosf(in[i]) for i in [0, n).
//   param_0 = input pointer, param_1 = output pointer, param_2 = count n.
// Huge arguments are reduced with an inline multiword multiply against
// the 2/pi table __cudart_i2opi_f, using a 28-byte local scratch buffer.
.visible .entry matrix_cos_f(
.param .u64 matrix_cos_f_param_0,
.param .u64 matrix_cos_f_param_1,
.param .u32 matrix_cos_f_param_2
)
{
.local .align 4 .b8 __local_depot75[28];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<15>;
.reg .f32 %f<48>;
.reg .b32 %r<96>;
.reg .b64 %rd<22>;
mov.u64 %rd21, __local_depot75;
cvta.local.u64 %SP, %rd21;
ld.param.u64 %rd8, [matrix_cos_f_param_0];
ld.param.u64 %rd9, [matrix_cos_f_param_1];
ld.param.u32 %r31, [matrix_cos_f_param_2];
// Global element index and bounds check (one thread per element).
mov.u32 %r32, %ntid.x;
mov.u32 %r33, %ctaid.x;
mov.u32 %r34, %tid.x;
mad.lo.s32 %r1, %r32, %r33, %r34;
setp.ge.u32 %p1, %r1, %r31;
@%p1 bra BB75_22;
cvta.to.global.u64 %rd10, %rd8;
cvt.s64.s32 %rd1, %r1;
mul.wide.s32 %rd11, %r1, 4;
add.s64 %rd12, %rd10, %rd11;
add.u64 %rd13, %SP, 0;
cvta.to.local.u64 %rd2, %rd13;
ld.global.f32 %f43, [%rd12];
// Inf input -> multiply by 0 to yield NaN.
abs.f32 %f19, %f43;
setp.neu.f32 %p2, %f19, 0f7F800000;
@%p2 bra BB75_3;
mov.f32 %f20, 0f00000000;
mul.rn.f32 %f43, %f43, %f20;
BB75_3:
// Fast range reduction: q = round(x * 0f3F22F983) — presumably 2/pi —
// then r = x - q*(pi/2) using a three-term pi/2 representation.
mul.f32 %f21, %f43, 0f3F22F983;
cvt.rni.s32.f32 %r95, %f21;
cvt.rn.f32.s32 %f22, %r95;
neg.f32 %f23, %f22;
mov.f32 %f24, 0f3FC90FDA;
fma.rn.f32 %f25, %f23, %f24, %f43;
mov.f32 %f26, 0f33A22168;
fma.rn.f32 %f27, %f23, %f26, %f25;
mov.f32 %f28, 0f27C234C5;
fma.rn.f32 %f44, %f23, %f28, %f27;
// |x| <= 0f47CE4780 -> fast reduction suffices; otherwise run the
// inline multiword reduction below.
abs.f32 %f29, %f43;
setp.leu.f32 %p3, %f29, 0f47CE4780;
@%p3 bra BB75_11;
// Slow path: decompose x into sign/exponent/mantissa; %r5 = mantissa
// with implicit bit restored, %r4 = raw exponent byte + sign bit.
mov.b32 %r3, %f43;
shr.u32 %r4, %r3, 23;
shl.b32 %r37, %r3, 8;
or.b32 %r5, %r37, -2147483648;
mov.u32 %r89, 0;
mov.u64 %rd19, __cudart_i2opi_f;
mov.u32 %r88, -6;
mov.u64 %rd20, %rd2;
BB75_5:
// 6-iteration loop: multiply the mantissa by the 6-word 2/pi table
// with carry (mad.lo.cc/madc.hi), storing partial products to local.
.pragma "nounroll";
mov.u64 %rd4, %rd20;
ld.const.u32 %r40, [%rd19];
// inline asm
{
mad.lo.cc.u32 %r38, %r40, %r5, %r89;
madc.hi.u32 %r89, %r40, %r5, 0;
}
// inline asm
st.local.u32 [%rd4], %r38;
add.s64 %rd5, %rd4, 4;
add.s64 %rd19, %rd19, 4;
add.s32 %r88, %r88, 1;
setp.ne.s32 %p4, %r88, 0;
mov.u64 %rd20, %rd5;
@%p4 bra BB75_5;
// Select the two 32-bit words of the product that straddle the binary
// point, based on the exponent (%r45 = word shift, %r13 = bit shift).
and.b32 %r43, %r4, 255;
add.s32 %r44, %r43, -128;
shr.u32 %r45, %r44, 5;
and.b32 %r10, %r3, -2147483648;
st.local.u32 [%rd2+24], %r89;
mov.u32 %r46, 6;
sub.s32 %r47, %r46, %r45;
mul.wide.s32 %rd15, %r47, 4;
add.s64 %rd7, %rd2, %rd15;
ld.local.u32 %r90, [%rd7];
ld.local.u32 %r91, [%rd7+-4];
and.b32 %r13, %r4, 31;
setp.eq.s32 %p5, %r13, 0;
@%p5 bra BB75_8;
// Non-zero bit shift: funnel-shift the 96-bit window left by %r13 bits.
mov.u32 %r48, 32;
sub.s32 %r49, %r48, %r13;
shr.u32 %r50, %r91, %r49;
shl.b32 %r51, %r90, %r13;
add.s32 %r90, %r50, %r51;
ld.local.u32 %r52, [%rd7+-8];
shr.u32 %r53, %r52, %r49;
shl.b32 %r54, %r91, %r13;
add.s32 %r91, %r53, %r54;
BB75_8:
// Extract the quadrant (top 2 bits) and the 0.25-rotation fraction.
shr.u32 %r55, %r91, 30;
shl.b32 %r56, %r90, 2;
add.s32 %r92, %r55, %r56;
shl.b32 %r19, %r91, 2;
shr.u32 %r57, %r92, 31;
shr.u32 %r58, %r90, 30;
add.s32 %r20, %r57, %r58;
setp.eq.s32 %p6, %r57, 0;
mov.u32 %r93, %r10;
mov.u32 %r94, %r19;
@%p6 bra BB75_10;
// Fraction >= 0.5: negate the fraction and flip the sign bit so the
// remainder stays in [-0.25, 0.25) of a rotation.
not.b32 %r59, %r92;
neg.s32 %r21, %r19;
setp.eq.s32 %p7, %r19, 0;
selp.u32 %r60, 1, 0, %p7;
add.s32 %r92, %r60, %r59;
xor.b32 %r23, %r10, -2147483648;
mov.u32 %r93, %r23;
mov.u32 %r94, %r21;
BB75_10:
// Negative input: negate the quadrant count.
mov.u32 %r25, %r93;
neg.s32 %r61, %r20;
setp.ne.s32 %p8, %r10, 0;
selp.b32 %r95, %r61, %r20, %p8;
// Normalize the fixed-point fraction (clz + shift), multiply by pi/2
// (fixed-point constant -921707870), and reassemble an f32 remainder
// with the proper exponent and sign.
clz.b32 %r62, %r92;
setp.ne.s32 %p9, %r62, 0;
shl.b32 %r63, %r92, %r62;
mov.u32 %r64, 32;
sub.s32 %r65, %r64, %r62;
shr.u32 %r66, %r94, %r65;
add.s32 %r67, %r66, %r63;
selp.b32 %r68, %r67, %r92, %p9;
mul.lo.s32 %r69, %r68, -921707870;
mov.u32 %r70, -921707870;
mul.hi.u32 %r71, %r68, %r70;
setp.gt.s32 %p10, %r71, 0;
shl.b32 %r72, %r71, 1;
shr.u32 %r73, %r69, 31;
add.s32 %r74, %r73, %r72;
selp.b32 %r75, %r74, %r71, %p10;
selp.b32 %r76, -1, 0, %p10;
mov.u32 %r77, 126;
sub.s32 %r78, %r77, %r62;
add.s32 %r79, %r78, %r76;
shl.b32 %r80, %r79, 23;
add.s32 %r81, %r75, 1;
shr.u32 %r82, %r81, 7;
add.s32 %r83, %r82, 1;
shr.u32 %r84, %r83, 1;
add.s32 %r85, %r84, %r80;
or.b32 %r86, %r85, %r25;
mov.b32 %f44, %r86;
BB75_11:
// Polynomial evaluation: cos(x) = sin(x + pi/2), so use quadrant q+1;
// low bit selects the sine vs cosine coefficient set.
mul.rn.f32 %f7, %f44, %f44;
add.s32 %r29, %r95, 1;
and.b32 %r30, %r29, 1;
setp.eq.s32 %p11, %r30, 0;
@%p11 bra BB75_13;
mov.f32 %f30, 0fBAB6061A;
mov.f32 %f31, 0f37CCF5CE;
fma.rn.f32 %f45, %f31, %f7, %f30;
bra.uni BB75_14;
BB75_13:
mov.f32 %f32, 0f3C08839E;
mov.f32 %f33, 0fB94CA1F9;
fma.rn.f32 %f45, %f33, %f7, %f32;
BB75_14:
@%p11 bra BB75_16;
mov.f32 %f34, 0f3D2AAAA5;
fma.rn.f32 %f35, %f45, %f7, %f34;
mov.f32 %f36, 0fBF000000;
fma.rn.f32 %f46, %f35, %f7, %f36;
bra.uni BB75_17;
BB75_16:
mov.f32 %f37, 0fBE2AAAA3;
fma.rn.f32 %f38, %f45, %f7, %f37;
mov.f32 %f39, 0f00000000;
fma.rn.f32 %f46, %f38, %f7, %f39;
BB75_17:
// Odd quadrant -> sine form poly*r + r; even quadrant overwrites with
// cosine form poly*r^2 + 1.
fma.rn.f32 %f47, %f46, %f44, %f44;
@%p11 bra BB75_19;
mov.f32 %f40, 0f3F800000;
fma.rn.f32 %f47, %f46, %f7, %f40;
BB75_19:
// Quadrant bit 1 set -> negate the result.
and.b32 %r87, %r29, 2;
setp.eq.s32 %p14, %r87, 0;
@%p14 bra BB75_21;
mov.f32 %f41, 0f00000000;
mov.f32 %f42, 0fBF800000;
fma.rn.f32 %f47, %f47, %f42, %f41;
BB75_21:
// Store the result at the same element index in the output array.
cvta.to.global.u64 %rd16, %rd9;
shl.b64 %rd17, %rd1, 2;
add.s64 %rd18, %rd16, %rd17;
st.global.f32 [%rd18], %f47;
BB75_22:
ret;
}
// .globl matrix_cosh_d
// Elementwise double-precision hyperbolic cosine kernel (per name):
// out[i] = cosh(in[i]) for i in [0, n).
//   param_0 = input pointer, param_1 = output pointer, param_2 = count n.
.visible .entry matrix_cosh_d(
.param .u64 matrix_cosh_d_param_0,
.param .u64 matrix_cosh_d_param_1,
.param .u32 matrix_cosh_d_param_2
)
{
.reg .pred %p<4>;
.reg .b32 %r<16>;
.reg .f64 %fd<46>;
.reg .b64 %rd<10>;
ld.param.u64 %rd2, [matrix_cosh_d_param_0];
ld.param.u64 %rd3, [matrix_cosh_d_param_1];
ld.param.u32 %r2, [matrix_cosh_d_param_2];
// Global element index and bounds check (one thread per element).
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
@%p1 bra BB76_5;
cvta.to.global.u64 %rd4, %rd2;
cvt.s64.s32 %rd1, %r1;
mul.wide.s32 %rd5, %r1, 8;
add.s64 %rd6, %rd4, %rd5;
ld.global.f64 %fd1, [%rd6];
// Build |x| by masking the sign bit out of the high word.
{
.reg .b32 %temp;
mov.b64 {%temp, %r6}, %fd1;
}
and.b32 %r7, %r6, 2147483647;
{
.reg .b32 %temp;
mov.b64 {%r8, %temp}, %fd1;
}
mov.b64 %fd2, {%r8, %r7};
{
.reg .b32 %temp;
mov.b64 {%temp, %r9}, %fd2;
}
// High word of |x| below 1082536911 -> normal exp-based path BB76_3;
// otherwise the overflow path BB76_2.
setp.lt.u32 %p2, %r9, 1082536911;
@%p2 bra BB76_3;
bra.uni BB76_2;
BB76_3:
// Compute e = exp(|x|) (scaled): extract the integer part of
// |x|*log2(e) via the 2^52 shifter trick (0d4338000000000000), then
// evaluate exp of the remainder with a long FMA polynomial chain.
mov.f64 %fd8, 0d4338000000000000;
mov.f64 %fd9, 0d3FF71547652B82FE;
fma.rn.f64 %fd10, %fd2, %fd9, %fd8;
{
.reg .b32 %temp;
mov.b64 {%r10, %temp}, %fd10;
}
mov.f64 %fd11, 0dC338000000000000;
add.rn.f64 %fd12, %fd10, %fd11;
mov.f64 %fd13, 0dBFE62E42FEFA39EF;
fma.rn.f64 %fd14, %fd12, %fd13, %fd2;
mov.f64 %fd15, 0dBC7ABC9E3B39803F;
fma.rn.f64 %fd16, %fd12, %fd15, %fd14;
mov.f64 %fd17, 0d3E928AF3FCA213EA;
mov.f64 %fd18, 0d3E5ADE1569CE2BDF;
fma.rn.f64 %fd19, %fd18, %fd16, %fd17;
mov.f64 %fd20, 0d3EC71DEE62401315;
fma.rn.f64 %fd21, %fd19, %fd16, %fd20;
mov.f64 %fd22, 0d3EFA01997C89EB71;
fma.rn.f64 %fd23, %fd21, %fd16, %fd22;
mov.f64 %fd24, 0d3F2A01A014761F65;
fma.rn.f64 %fd25, %fd23, %fd16, %fd24;
mov.f64 %fd26, 0d3F56C16C1852B7AF;
fma.rn.f64 %fd27, %fd25, %fd16, %fd26;
mov.f64 %fd28, 0d3F81111111122322;
fma.rn.f64 %fd29, %fd27, %fd16, %fd28;
mov.f64 %fd30, 0d3FA55555555502A1;
fma.rn.f64 %fd31, %fd29, %fd16, %fd30;
mov.f64 %fd32, 0d3FC5555555555511;
fma.rn.f64 %fd33, %fd31, %fd16, %fd32;
mov.f64 %fd34, 0d3FE000000000000B;
fma.rn.f64 %fd35, %fd33, %fd16, %fd34;
mov.f64 %fd36, 0d3FF0000000000000;
fma.rn.f64 %fd37, %fd35, %fd16, %fd36;
fma.rn.f64 %fd38, %fd37, %fd16, %fd36;
// Scale by 2^k: inject the integer exponent into the high word
// (shl by 20), with a -2097152 bias adjustment.
shl.b32 %r11, %r10, 20;
{
.reg .b32 %temp;
mov.b64 {%r12, %temp}, %fd38;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r13}, %fd38;
}
add.s32 %r14, %r11, %r13;
add.s32 %r15, %r14, -2097152;
mov.b64 %fd7, {%r12, %r15};
// Compute the reciprocal exp(-|x|) term: rcp.approx refined with two
// Newton-Raphson-style FMA steps, then combine e/2 + (1/16)/e.
// inline asm
rcp.approx.ftz.f64 %fd6,%fd7;
// inline asm
neg.f64 %fd39, %fd7;
fma.rn.f64 %fd40, %fd39, %fd6, %fd36;
fma.rn.f64 %fd41, %fd40, %fd40, %fd40;
fma.rn.f64 %fd42, %fd41, %fd6, %fd6;
mov.f64 %fd43, 0d3FB0000000000000;
fma.rn.f64 %fd45, %fd42, %fd43, %fd7;
bra.uni BB76_4;
BB76_2:
// Overflow path: finite large input -> +Inf; NaN passes through.
setp.le.f64 %p3, %fd1, 0d7FF0000000000000;
selp.f64 %fd45, 0d7FF0000000000000, %fd1, %p3;
BB76_4:
// Final doubling (the paths above compute cosh/2), then store.
cvta.to.global.u64 %rd7, %rd3;
shl.b64 %rd8, %rd1, 3;
add.s64 %rd9, %rd7, %rd8;
add.f64 %fd44, %fd45, %fd45;
st.global.f64 [%rd9], %fd44;
BB76_5:
ret;
}
// .globl matrix_cosh_f
// Elementwise single-precision hyperbolic cosine kernel (per name):
// out[i] = coshf(in[i]) for i in [0, n).
//   param_0 = input pointer, param_1 = output pointer, param_2 = count n.
// Single straight-line path: cosh is even, so only |x| is used.
.visible .entry matrix_cosh_f(
.param .u64 matrix_cosh_f_param_0,
.param .u64 matrix_cosh_f_param_1,
.param .u32 matrix_cosh_f_param_2
)
{
.reg .pred %p<3>;
.reg .f32 %f<19>;
.reg .b32 %r<6>;
.reg .b64 %rd<8>;
ld.param.u64 %rd1, [matrix_cosh_f_param_0];
ld.param.u64 %rd2, [matrix_cosh_f_param_1];
ld.param.u32 %r2, [matrix_cosh_f_param_2];
// Global element index and bounds check (one thread per element).
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
@%p1 bra BB77_2;
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f3, [%rd5];
// exp(|x|) via 2^(|x|*log2(e)): integer part %f6, fractional part
// refined with two FMA correction terms, ex2.approx for the fraction.
abs.f32 %f4, %f3;
mul.f32 %f5, %f4, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f6, %f5;
mov.f32 %f7, 0fBF317200;
fma.rn.f32 %f8, %f6, %f7, %f4;
mov.f32 %f9, 0fB5BFBE8E;
fma.rn.f32 %f10, %f6, %f9, %f8;
mul.f32 %f2, %f10, 0f3FB8AA3B;
// inline asm
ex2.approx.ftz.f32 %f1,%f2;
// inline asm
add.f32 %f11, %f6, 0fC0000000;
ex2.approx.f32 %f12, %f11;
mul.f32 %f13, %f1, %f12;
// Combine: 2*e + (1/8)/e with e = exp(|x|)/4 (scaled), giving
// (exp(|x|) + exp(-|x|))/2.
mov.f32 %f14, 0f3E000000;
div.approx.f32 %f15, %f14, %f13;
mov.f32 %f16, 0f40000000;
fma.rn.f32 %f17, %f16, %f13, %f15;
// Overflow clamp: |x| >= 90.0f (0f42B40000) -> +Inf.
setp.ge.f32 %p2, %f4, 0f42B40000;
selp.f32 %f18, 0f7F800000, %f17, %p2;
// Store at the same element offset in the output array.
cvta.to.global.u64 %rd6, %rd2;
add.s64 %rd7, %rd6, %rd4;
st.global.f32 [%rd7], %f18;
BB77_2:
ret;
}
// .globl matrix_tan_d
// Elementwise double-precision tangent kernel (per name):
// out[i] = tan(in[i]) for i in [0, n).
//   param_0 = input pointer, param_1 = output pointer, param_2 = count n.
// Uses a 4-byte local slot to exchange the quadrant index with the
// double-precision trig-reduction slow path for huge arguments.
.visible .entry matrix_tan_d(
.param .u64 matrix_tan_d_param_0,
.param .u64 matrix_tan_d_param_1,
.param .u32 matrix_tan_d_param_2
)
{
.local .align 4 .b8 __local_depot78[4];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<7>;
.reg .b32 %r<16>;
.reg .f64 %fd<66>;
.reg .b64 %rd<14>;
mov.u64 %rd13, __local_depot78;
cvta.local.u64 %SP, %rd13;
ld.param.u64 %rd3, [matrix_tan_d_param_0];
ld.param.u64 %rd4, [matrix_tan_d_param_1];
ld.param.u32 %r5, [matrix_tan_d_param_2];
add.u64 %rd5, %SP, 0;
cvta.to.local.u64 %rd1, %rd5;
// Global element index and bounds check (one thread per element).
mov.u32 %r6, %ntid.x;
mov.u32 %r7, %ctaid.x;
mov.u32 %r8, %tid.x;
mad.lo.s32 %r1, %r6, %r7, %r8;
setp.ge.u32 %p1, %r1, %r5;
@%p1 bra BB78_8;
cvta.to.global.u64 %rd6, %rd3;
cvt.s64.s32 %rd2, %r1;
mul.wide.s32 %rd7, %r1, 8;
add.s64 %rd8, %rd6, %rd7;
ld.global.f64 %fd63, [%rd8];
{
.reg .b32 %temp;
mov.b64 {%r9, %temp}, %fd63;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r10}, %fd63;
}
// +/-Inf input -> multiply by 0 to produce NaN.
and.b32 %r11, %r10, 2147483647;
setp.eq.s32 %p2, %r11, 2146435072;
setp.eq.s32 %p3, %r9, 0;
and.pred %p4, %p2, %p3;
@!%p4 bra BB78_3;
bra.uni BB78_2;
BB78_2:
mov.f64 %fd11, 0d0000000000000000;
mul.rn.f64 %fd63, %fd63, %fd11;
BB78_3:
// Fast range reduction: q = round(x * 0d3FE45F306DC9C883) — presumably
// 2/pi — then r = x - q*(pi/2) via a three-term pi/2 representation.
mul.f64 %fd12, %fd63, 0d3FE45F306DC9C883;
cvt.rni.s32.f64 %r15, %fd12;
st.local.u32 [%rd1], %r15;
cvt.rn.f64.s32 %fd13, %r15;
neg.f64 %fd14, %fd13;
mov.f64 %fd15, 0d3FF921FB54442D18;
fma.rn.f64 %fd16, %fd14, %fd15, %fd63;
mov.f64 %fd17, 0d3C91A62633145C00;
fma.rn.f64 %fd18, %fd14, %fd17, %fd16;
mov.f64 %fd19, 0d397B839A252049C0;
fma.rn.f64 %fd64, %fd14, %fd19, %fd18;
{
.reg .b32 %temp;
mov.b64 {%temp, %r12}, %fd63;
}
// Large-magnitude inputs need the accurate slow-path reduction.
and.b32 %r13, %r12, 2145386496;
setp.lt.u32 %p5, %r13, 1105199104;
@%p5 bra BB78_5;
// Callseq Start 5
{
.reg .b32 temp_param_reg;
// }
.param .b64 param0;
st.param.f64 [param0+0], %fd63;
.param .b64 param1;
st.param.b64 [param1+0], %rd5;
.param .b64 retval0;
call.uni (retval0),
__internal_trig_reduction_slowpathd,
(
param0,
param1
);
ld.param.f64 %fd64, [retval0+0];
//{
}// Callseq End 5
// Reload the quadrant index the slow path wrote to local memory.
ld.local.u32 %r15, [%rd1];
BB78_5:
// Evaluate tan on the reduced argument r: long odd-polynomial FMA
// chain in r^2, then result = r + r*r^2*poly.
mul.f64 %fd20, %fd64, %fd64;
mov.f64 %fd21, 0dBEF9757C5B27EBB1;
mov.f64 %fd22, 0d3EE48DAC2799BCB9;
fma.rn.f64 %fd23, %fd22, %fd20, %fd21;
mov.f64 %fd24, 0d3F0980E90FD91E04;
fma.rn.f64 %fd25, %fd23, %fd20, %fd24;
mov.f64 %fd26, 0dBEFAE2B0417D7E1D;
fma.rn.f64 %fd27, %fd25, %fd20, %fd26;
mov.f64 %fd28, 0d3F119F5341BFBA57;
fma.rn.f64 %fd29, %fd27, %fd20, %fd28;
mov.f64 %fd30, 0d3F15E791A00F6919;
fma.rn.f64 %fd31, %fd29, %fd20, %fd30;
mov.f64 %fd32, 0d3F2FF2E7FADEC73A;
fma.rn.f64 %fd33, %fd31, %fd20, %fd32;
mov.f64 %fd34, 0d3F434BC1B206DA62;
fma.rn.f64 %fd35, %fd33, %fd20, %fd34;
mov.f64 %fd36, 0d3F57DB18EF2F83F9;
fma.rn.f64 %fd37, %fd35, %fd20, %fd36;
mov.f64 %fd38, 0d3F6D6D2E7AE49FBC;
fma.rn.f64 %fd39, %fd37, %fd20, %fd38;
mov.f64 %fd40, 0d3F8226E3A816A776;
fma.rn.f64 %fd41, %fd39, %fd20, %fd40;
mov.f64 %fd42, 0d3F9664F485D25660;
fma.rn.f64 %fd43, %fd41, %fd20, %fd42;
mov.f64 %fd44, 0d3FABA1BA1BABF31D;
fma.rn.f64 %fd45, %fd43, %fd20, %fd44;
mov.f64 %fd46, 0d3FC11111111105D2;
fma.rn.f64 %fd47, %fd45, %fd20, %fd46;
mov.f64 %fd48, 0d3FD555555555555E;
fma.rn.f64 %fd49, %fd47, %fd20, %fd48;
mul.f64 %fd7, %fd20, %fd49;
fma.rn.f64 %fd65, %fd7, %fd64, %fd64;
// Odd quadrant: tan(x) = -1/tan(r); take the refined-reciprocal path.
and.b32 %r14, %r15, 1;
setp.eq.b32 %p6, %r14, 1;
@!%p6 bra BB78_7;
bra.uni BB78_6;
BB78_6:
// Accurate negative reciprocal: rcp.approx seed refined with
// Newton-Raphson-style FMA steps plus a residual correction term.
sub.f64 %fd52, %fd65, %fd64;
neg.f64 %fd53, %fd52;
fma.rn.f64 %fd54, %fd7, %fd64, %fd53;
// inline asm
rcp.approx.ftz.f64 %fd50,%fd65;
// inline asm
neg.f64 %fd55, %fd65;
mov.f64 %fd56, 0d3FF0000000000000;
fma.rn.f64 %fd57, %fd55, %fd50, %fd56;
fma.rn.f64 %fd58, %fd57, %fd57, %fd57;
fma.rn.f64 %fd59, %fd58, %fd50, %fd50;
neg.f64 %fd60, %fd59;
fma.rn.f64 %fd61, %fd65, %fd60, %fd56;
fma.rn.f64 %fd62, %fd60, %fd54, %fd61;
fma.rn.f64 %fd65, %fd62, %fd60, %fd60;
BB78_7:
// Store the result at the same element index in the output array.
cvta.to.global.u64 %rd10, %rd4;
shl.b64 %rd11, %rd2, 3;
add.s64 %rd12, %rd10, %rd11;
st.global.f64 [%rd12], %fd65;
BB78_8:
ret;
}
// .globl matrix_tan_f
// Elementwise single-precision tangent kernel (per name):
// out[i] = tanf(in[i]) for i in [0, n).
//   param_0 = input pointer, param_1 = output pointer, param_2 = count n.
// Huge arguments are reduced with the same inline multiword multiply
// against __cudart_i2opi_f as matrix_cos_f, using 28 bytes of local scratch.
.visible .entry matrix_tan_f(
.param .u64 matrix_tan_f_param_0,
.param .u64 matrix_tan_f_param_1,
.param .u32 matrix_tan_f_param_2
)
{
.local .align 4 .b8 __local_depot79[28];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<12>;
.reg .f32 %f<33>;
.reg .b32 %r<94>;
.reg .b64 %rd<22>;
mov.u64 %rd21, __local_depot79;
cvta.local.u64 %SP, %rd21;
ld.param.u64 %rd8, [matrix_tan_f_param_0];
ld.param.u64 %rd9, [matrix_tan_f_param_1];
ld.param.u32 %r29, [matrix_tan_f_param_2];
// Global element index and bounds check (one thread per element).
mov.u32 %r30, %ntid.x;
mov.u32 %r31, %ctaid.x;
mov.u32 %r32, %tid.x;
mad.lo.s32 %r1, %r30, %r31, %r32;
setp.ge.u32 %p1, %r1, %r29;
@%p1 bra BB79_14;
cvta.to.global.u64 %rd10, %rd8;
cvt.s64.s32 %rd1, %r1;
mul.wide.s32 %rd11, %r1, 4;
add.s64 %rd12, %rd10, %rd11;
add.u64 %rd13, %SP, 0;
cvta.to.local.u64 %rd2, %rd13;
ld.global.f32 %f30, [%rd12];
// Inf input -> multiply by 0 to yield NaN.
abs.f32 %f10, %f30;
setp.neu.f32 %p2, %f10, 0f7F800000;
@%p2 bra BB79_3;
mov.f32 %f11, 0f00000000;
mul.rn.f32 %f30, %f30, %f11;
BB79_3:
// Fast range reduction: q = round(x * 0f3F22F983) — presumably 2/pi —
// then r = x - q*(pi/2) using a three-term pi/2 representation.
mul.f32 %f12, %f30, 0f3F22F983;
cvt.rni.s32.f32 %r93, %f12;
cvt.rn.f32.s32 %f13, %r93;
neg.f32 %f14, %f13;
mov.f32 %f15, 0f3FC90FDA;
fma.rn.f32 %f16, %f14, %f15, %f30;
mov.f32 %f17, 0f33A22168;
fma.rn.f32 %f18, %f14, %f17, %f16;
mov.f32 %f19, 0f27C234C5;
fma.rn.f32 %f31, %f14, %f19, %f18;
// |x| <= 0f47CE4780 -> fast reduction suffices; else multiword path.
abs.f32 %f20, %f30;
setp.leu.f32 %p3, %f20, 0f47CE4780;
@%p3 bra BB79_11;
// Slow path: split x into sign/exponent/mantissa (%r5 = mantissa with
// implicit bit, %r4 = exponent byte + sign bit).
mov.b32 %r3, %f30;
shr.u32 %r4, %r3, 23;
shl.b32 %r35, %r3, 8;
or.b32 %r5, %r35, -2147483648;
mov.u32 %r87, 0;
mov.u64 %rd19, __cudart_i2opi_f;
mov.u32 %r86, -6;
mov.u64 %rd20, %rd2;
BB79_5:
// 6-iteration carry-propagating multiply of the mantissa by the
// 6-word 2/pi table; partial products go to local memory.
.pragma "nounroll";
mov.u64 %rd4, %rd20;
ld.const.u32 %r38, [%rd19];
// inline asm
{
mad.lo.cc.u32 %r36, %r38, %r5, %r87;
madc.hi.u32 %r87, %r38, %r5, 0;
}
// inline asm
st.local.u32 [%rd4], %r36;
add.s64 %rd5, %rd4, 4;
add.s64 %rd19, %rd19, 4;
add.s32 %r86, %r86, 1;
setp.ne.s32 %p4, %r86, 0;
mov.u64 %rd20, %rd5;
@%p4 bra BB79_5;
// Pick the product words straddling the binary point from the
// exponent (%r43 = word shift, %r13 = residual bit shift).
and.b32 %r41, %r4, 255;
add.s32 %r42, %r41, -128;
shr.u32 %r43, %r42, 5;
and.b32 %r10, %r3, -2147483648;
st.local.u32 [%rd2+24], %r87;
mov.u32 %r44, 6;
sub.s32 %r45, %r44, %r43;
mul.wide.s32 %rd15, %r45, 4;
add.s64 %rd7, %rd2, %rd15;
ld.local.u32 %r88, [%rd7];
ld.local.u32 %r89, [%rd7+-4];
and.b32 %r13, %r4, 31;
setp.eq.s32 %p5, %r13, 0;
@%p5 bra BB79_8;
// Funnel-shift the 96-bit window left by the residual bit count.
mov.u32 %r46, 32;
sub.s32 %r47, %r46, %r13;
shr.u32 %r48, %r89, %r47;
shl.b32 %r49, %r88, %r13;
add.s32 %r88, %r48, %r49;
ld.local.u32 %r50, [%rd7+-8];
shr.u32 %r51, %r50, %r47;
shl.b32 %r52, %r89, %r13;
add.s32 %r89, %r51, %r52;
BB79_8:
// Extract quadrant (top 2 bits) and the fractional rotation part.
shr.u32 %r53, %r89, 30;
shl.b32 %r54, %r88, 2;
add.s32 %r90, %r53, %r54;
shl.b32 %r19, %r89, 2;
shr.u32 %r55, %r90, 31;
shr.u32 %r56, %r88, 30;
add.s32 %r20, %r55, %r56;
setp.eq.s32 %p6, %r55, 0;
mov.u32 %r91, %r10;
mov.u32 %r92, %r19;
@%p6 bra BB79_10;
// Fraction >= 0.5: negate fraction and flip the sign bit so the
// remainder stays within a quarter rotation.
not.b32 %r57, %r90;
neg.s32 %r21, %r19;
setp.eq.s32 %p7, %r19, 0;
selp.u32 %r58, 1, 0, %p7;
add.s32 %r90, %r58, %r57;
xor.b32 %r23, %r10, -2147483648;
mov.u32 %r91, %r23;
mov.u32 %r92, %r21;
BB79_10:
// Negative input negates the quadrant count; then normalize the
// fixed-point fraction, multiply by the pi/2 fixed-point constant
// (-921707870) and reassemble an f32 remainder with exponent + sign.
mov.u32 %r25, %r91;
neg.s32 %r59, %r20;
setp.ne.s32 %p8, %r10, 0;
selp.b32 %r93, %r59, %r20, %p8;
clz.b32 %r60, %r90;
setp.ne.s32 %p9, %r60, 0;
shl.b32 %r61, %r90, %r60;
mov.u32 %r62, 32;
sub.s32 %r63, %r62, %r60;
shr.u32 %r64, %r92, %r63;
add.s32 %r65, %r64, %r61;
selp.b32 %r66, %r65, %r90, %p9;
mul.lo.s32 %r67, %r66, -921707870;
mov.u32 %r68, -921707870;
mul.hi.u32 %r69, %r66, %r68;
setp.gt.s32 %p10, %r69, 0;
shl.b32 %r70, %r69, 1;
shr.u32 %r71, %r67, 31;
add.s32 %r72, %r71, %r70;
selp.b32 %r73, %r72, %r69, %p10;
selp.b32 %r74, -1, 0, %p10;
mov.u32 %r75, 126;
sub.s32 %r76, %r75, %r60;
add.s32 %r77, %r76, %r74;
shl.b32 %r78, %r77, 23;
add.s32 %r79, %r73, 1;
shr.u32 %r80, %r79, 7;
add.s32 %r81, %r80, 1;
shr.u32 %r82, %r81, 1;
add.s32 %r83, %r82, %r78;
or.b32 %r84, %r83, %r25;
mov.b32 %f31, %r84;
BB79_11:
// Core approximation on the reduced argument r: rational form
// r + r*r^2 * poly(r^2)/(r^2 + c), via FMA chain and rcp.rn.
mul.f32 %f21, %f31, %f31;
mov.f32 %f22, 0fBF52B7F4;
mov.f32 %f23, 0f3B86D46D;
fma.rn.f32 %f24, %f23, %f21, %f22;
add.f32 %f25, %f21, 0fC01E09D0;
rcp.rn.f32 %f26, %f25;
mul.f32 %f27, %f24, %f26;
mul.f32 %f28, %f21, %f27;
fma.rn.f32 %f32, %f28, %f31, %f31;
// Odd quadrant: tan(x) = -1/tan(r).
and.b32 %r85, %r93, 1;
setp.eq.b32 %p11, %r85, 1;
@!%p11 bra BB79_13;
bra.uni BB79_12;
BB79_12:
mov.f32 %f29, 0fBF800000;
div.rn.f32 %f32, %f29, %f32;
BB79_13:
// Store the result at the same element index in the output array.
cvta.to.global.u64 %rd16, %rd9;
shl.b64 %rd17, %rd1, 2;
add.s64 %rd18, %rd16, %rd17;
st.global.f32 [%rd18], %f32;
BB79_14:
ret;
}
// .globl matrix_tanh_d
// Elementwise double-precision hyperbolic tangent kernel (per name):
// out[i] = tanh(in[i]) for i in [0, n).
//   param_0 = input pointer, param_1 = output pointer, param_2 = count n.
.visible .entry matrix_tanh_d(
.param .u64 matrix_tanh_d_param_0,
.param .u64 matrix_tanh_d_param_1,
.param .u32 matrix_tanh_d_param_2
)
{
.reg .pred %p<4>;
.reg .b32 %r<17>;
.reg .f64 %fd<74>;
.reg .b64 %rd<10>;
ld.param.u64 %rd2, [matrix_tanh_d_param_0];
ld.param.u64 %rd3, [matrix_tanh_d_param_1];
ld.param.u32 %r4, [matrix_tanh_d_param_2];
// Global element index and bounds check (one thread per element).
mov.u32 %r5, %ctaid.x;
mov.u32 %r6, %ntid.x;
mov.u32 %r7, %tid.x;
mad.lo.s32 %r1, %r6, %r5, %r7;
setp.ge.u32 %p1, %r1, %r4;
@%p1 bra BB80_5;
cvta.to.global.u64 %rd4, %rd2;
cvt.s64.s32 %rd1, %r1;
mul.wide.s32 %rd5, %r1, 8;
add.s64 %rd6, %rd4, %rd5;
ld.global.f64 %fd1, [%rd6];
// Build |x| by clearing the sign bit of the high word.
{
.reg .b32 %temp;
mov.b64 {%temp, %r2}, %fd1;
}
and.b32 %r3, %r2, 2147483647;
{
.reg .b32 %temp;
mov.b64 {%r8, %temp}, %fd1;
}
mov.b64 %fd2, {%r8, %r3};
// Small |x| (< 0d3FE1C7A398201CD6) -> polynomial path BB80_3;
// otherwise the exp-based path BB80_2.
setp.ltu.f64 %p2, %fd2, 0d3FE1C7A398201CD6;
@%p2 bra BB80_3;
bra.uni BB80_2;
BB80_3:
// Small-|x| path: odd polynomial in x^2 via an FMA chain,
// result = x + x*x^2*poly(x^2).
mul.f64 %fd51, %fd1, %fd1;
mov.f64 %fd52, 0dBF2B9093D89F0E23;
mov.f64 %fd53, 0d3F0ABFFC9B5786C4;
fma.rn.f64 %fd54, %fd53, %fd51, %fd52;
mov.f64 %fd55, 0d3F42FA2744C30B61;
fma.rn.f64 %fd56, %fd54, %fd51, %fd55;
mov.f64 %fd57, 0dBF57CF3B9C1E491D;
fma.rn.f64 %fd58, %fd56, %fd51, %fd57;
mov.f64 %fd59, 0d3F6D6C61D450119A;
fma.rn.f64 %fd60, %fd58, %fd51, %fd59;
mov.f64 %fd61, 0dBF8226DDD44294F5;
fma.rn.f64 %fd62, %fd60, %fd51, %fd61;
mov.f64 %fd63, 0d3F9664F45C2B04A6;
fma.rn.f64 %fd64, %fd62, %fd51, %fd63;
mov.f64 %fd65, 0dBFABA1BA1AD70754;
fma.rn.f64 %fd66, %fd64, %fd51, %fd65;
mov.f64 %fd67, 0d3FC111111110295E;
fma.rn.f64 %fd68, %fd66, %fd51, %fd67;
mov.f64 %fd69, 0dBFD555555555549F;
fma.rn.f64 %fd70, %fd68, %fd51, %fd69;
mul.f64 %fd71, %fd51, %fd70;
fma.rn.f64 %fd73, %fd71, %fd1, %fd1;
bra.uni BB80_4;
BB80_2:
// Large-|x| path: tanh(|x|) = 1 - 2/(exp(2|x|)+1).  First compute
// exp(2|x|) with the 2^52 shifter trick and a long FMA polynomial.
add.f64 %fd8, %fd2, %fd2;
mov.f64 %fd9, 0d4338000000000000;
mov.f64 %fd10, 0d3FF71547652B82FE;
fma.rn.f64 %fd11, %fd8, %fd10, %fd9;
{
.reg .b32 %temp;
mov.b64 {%r9, %temp}, %fd11;
}
mov.f64 %fd12, 0dC338000000000000;
add.rn.f64 %fd13, %fd11, %fd12;
mov.f64 %fd14, 0dBFE62E42FEFA39EF;
fma.rn.f64 %fd15, %fd13, %fd14, %fd8;
mov.f64 %fd16, 0dBC7ABC9E3B39803F;
fma.rn.f64 %fd17, %fd13, %fd16, %fd15;
mov.f64 %fd18, 0d3E5AF86D8EBD13CD;
mov.f64 %fd19, 0d3E21F4076ACD15B6;
fma.rn.f64 %fd20, %fd19, %fd17, %fd18;
mov.f64 %fd21, 0d3E927E5092BA033D;
fma.rn.f64 %fd22, %fd20, %fd17, %fd21;
mov.f64 %fd23, 0d3EC71DDE6C5F9DA1;
fma.rn.f64 %fd24, %fd22, %fd17, %fd23;
mov.f64 %fd25, 0d3EFA01A018D034E6;
fma.rn.f64 %fd26, %fd24, %fd17, %fd25;
mov.f64 %fd27, 0d3F2A01A01B3B6940;
fma.rn.f64 %fd28, %fd26, %fd17, %fd27;
mov.f64 %fd29, 0d3F56C16C16C1B5DD;
fma.rn.f64 %fd30, %fd28, %fd17, %fd29;
mov.f64 %fd31, 0d3F8111111110F74D;
fma.rn.f64 %fd32, %fd30, %fd17, %fd31;
mov.f64 %fd33, 0d3FA555555555554D;
fma.rn.f64 %fd34, %fd32, %fd17, %fd33;
mov.f64 %fd35, 0d3FC5555555555557;
fma.rn.f64 %fd36, %fd34, %fd17, %fd35;
mov.f64 %fd37, 0d3FE0000000000000;
fma.rn.f64 %fd38, %fd36, %fd17, %fd37;
mul.f64 %fd39, %fd17, %fd38;
fma.rn.f64 %fd40, %fd39, %fd17, %fd17;
// Scale by 2^k by constructing the power of two directly in the
// exponent field (shl 20 + bias 1072693248).
shl.b32 %r10, %r9, 20;
add.s32 %r11, %r10, 1072693248;
mov.u32 %r12, 0;
mov.b64 %fd41, {%r12, %r11};
fma.rn.f64 %fd42, %fd40, %fd41, %fd41;
add.f64 %fd7, %fd42, 0d3FF0000000000000;
// Refined reciprocal of exp(2|x|)+1, then 1 - 2*recip.
// inline asm
rcp.approx.ftz.f64 %fd6,%fd7;
// inline asm
neg.f64 %fd43, %fd7;
mov.f64 %fd44, 0d3FF0000000000000;
fma.rn.f64 %fd45, %fd43, %fd6, %fd44;
fma.rn.f64 %fd46, %fd45, %fd45, %fd45;
fma.rn.f64 %fd47, %fd46, %fd6, %fd6;
neg.f64 %fd48, %fd47;
mov.f64 %fd49, 0d4000000000000000;
fma.rn.f64 %fd50, %fd49, %fd48, %fd44;
// Saturate: high word of |x| above threshold -> exactly 1.0.
setp.gt.u32 %p3, %r3, 1077936127;
selp.f64 %fd73, 0d3FF0000000000000, %fd50, %p3;
BB80_4:
// Reapply the sign of x (tanh is odd) by copying x's sign bit, then store.
cvta.to.global.u64 %rd7, %rd3;
and.b32 %r13, %r2, -2147483648;
{
.reg .b32 %temp;
mov.b64 {%temp, %r14}, %fd73;
}
or.b32 %r15, %r14, %r13;
{
.reg .b32 %temp;
mov.b64 {%r16, %temp}, %fd73;
}
mov.b64 %fd72, {%r16, %r15};
shl.b64 %rd8, %rd1, 3;
add.s64 %rd9, %rd7, %rd8;
st.global.f64 [%rd9], %fd72;
BB80_5:
ret;
}
// .globl matrix_tanh_f
// Elementwise single-precision hyperbolic tangent kernel (per name):
// out[i] = tanhf(in[i]) for i in [0, n).
//   param_0 = input pointer, param_1 = output pointer, param_2 = count n.
.visible .entry matrix_tanh_f(
.param .u64 matrix_tanh_f_param_0,
.param .u64 matrix_tanh_f_param_1,
.param .u32 matrix_tanh_f_param_2
)
{
.reg .pred %p<5>;
.reg .f32 %f<33>;
.reg .b32 %r<11>;
.reg .b64 %rd<10>;
ld.param.u64 %rd2, [matrix_tanh_f_param_0];
ld.param.u64 %rd3, [matrix_tanh_f_param_1];
ld.param.u32 %r2, [matrix_tanh_f_param_2];
// Global element index and bounds check (one thread per element).
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
@%p1 bra BB81_5;
cvta.to.global.u64 %rd4, %rd2;
cvt.s64.s32 %rd1, %r1;
mul.wide.s32 %rd5, %r1, 4;
add.s64 %rd6, %rd4, %rd5;
ld.global.f32 %f1, [%rd6];
abs.f32 %f2, %f1;
// Small |x| (< 0f3F0CCCCD) -> polynomial path BB81_3; else exp path.
setp.ltu.f32 %p2, %f2, 0f3F0CCCCD;
@%p2 bra BB81_3;
bra.uni BB81_2;
BB81_3:
// Small-|x| path: odd polynomial in x^2, result = x + x*x^2*poly(x^2);
// x == 0 selects 2x (preserves signed zero).
mul.f32 %f21, %f1, %f1;
mov.f32 %f22, 0fBD57BE66;
mov.f32 %f23, 0f3C86A81B;
fma.rn.f32 %f24, %f23, %f21, %f22;
mov.f32 %f25, 0f3E08677B;
fma.rn.f32 %f26, %f24, %f21, %f25;
mov.f32 %f27, 0fBEAAAA29;
fma.rn.f32 %f28, %f26, %f21, %f27;
mul.f32 %f29, %f21, %f28;
fma.rn.f32 %f30, %f29, %f1, %f1;
add.f32 %f31, %f1, %f1;
setp.eq.f32 %p4, %f1, 0f00000000;
selp.f32 %f32, %f31, %f30, %p4;
bra.uni BB81_4;
BB81_2:
// Large-|x| path: tanh(|x|) = 1 - 2/(exp(2|x|)+1), with exp built from
// ex2.approx on the integer/fraction split of 2|x|*log2(e).
add.f32 %f10, %f2, %f2;
mul.f32 %f11, %f10, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f12, %f11;
mov.f32 %f13, 0fBF317200;
fma.rn.f32 %f14, %f12, %f13, %f10;
mov.f32 %f15, 0fB5BFBE8E;
fma.rn.f32 %f16, %f12, %f15, %f14;
mul.f32 %f7, %f16, 0f3FB8AA3B;
// inline asm
ex2.approx.ftz.f32 %f6,%f7;
// inline asm
ex2.approx.f32 %f17, %f12;
mov.f32 %f18, 0f3F800000;
fma.rn.f32 %f9, %f6, %f17, %f18;
// inline asm
rcp.approx.ftz.f32 %f8,%f9;
// inline asm
mov.f32 %f19, 0fC0000000;
fma.rn.f32 %f20, %f8, %f19, %f18;
// Saturate: |x| >= 88.0f (0f42B00000) -> exactly 1.0f (1065353216).
mov.b32 %r6, %f20;
setp.ge.f32 %p3, %f2, 0f42B00000;
selp.b32 %r7, 1065353216, %r6, %p3;
// Reapply the sign of x (tanh is odd).
mov.b32 %r8, %f1;
and.b32 %r9, %r8, -2147483648;
or.b32 %r10, %r7, %r9;
mov.b32 %f32, %r10;
BB81_4:
// Store the result at the same element index in the output array.
cvta.to.global.u64 %rd7, %rd3;
shl.b64 %rd8, %rd1, 2;
add.s64 %rd9, %rd7, %rd8;
st.global.f32 [%rd9], %f32;
BB81_5:
ret;
}
// .globl matrix_asin_d
// Elementwise double-precision arcsine kernel (per name):
// out[i] = asin(in[i]) for i in [0, n).
//   param_0 = input pointer, param_1 = output pointer, param_2 = count n.
// The branch test reinterprets the high 32 bits of the double as an
// f32 to compare |x| cheaply against the ~0.55 threshold.
.visible .entry matrix_asin_d(
.param .u64 matrix_asin_d_param_0,
.param .u64 matrix_asin_d_param_1,
.param .u32 matrix_asin_d_param_2
)
{
.reg .pred %p<5>;
.reg .f32 %f<3>;
.reg .b32 %r<15>;
.reg .f64 %fd<83>;
.reg .b64 %rd<10>;
ld.param.u64 %rd2, [matrix_asin_d_param_0];
ld.param.u64 %rd3, [matrix_asin_d_param_1];
ld.param.u32 %r3, [matrix_asin_d_param_2];
// Global element index and bounds check (one thread per element).
mov.u32 %r4, %ctaid.x;
mov.u32 %r5, %ntid.x;
mov.u32 %r6, %tid.x;
mad.lo.s32 %r1, %r5, %r4, %r6;
setp.ge.u32 %p1, %r1, %r3;
@%p1 bra BB82_5;
cvta.to.global.u64 %rd4, %rd2;
cvt.s64.s32 %rd1, %r1;
mul.wide.s32 %rd5, %r1, 8;
add.s64 %rd6, %rd4, %rd5;
ld.global.f64 %fd1, [%rd6];
// Reinterpret the high word of x as f32 for a cheap magnitude test.
{
.reg .b32 %temp;
mov.b64 {%temp, %r2}, %fd1;
}
mov.b32 %f1, %r2;
abs.f32 %f2, %f1;
// Small |x| -> direct polynomial path BB82_3; near-1 -> sqrt path BB82_2.
setp.lt.f32 %p2, %f2, 0f3FE26666;
@%p2 bra BB82_3;
bra.uni BB82_2;
BB82_3:
// Small-|x| path: asin(x) ~= x + x*s*poly(s), s = x*x, via FMA chain.
mul.f64 %fd55, %fd1, %fd1;
mov.f64 %fd56, 0dBFB3823B180754AF;
mov.f64 %fd57, 0d3FB0066BDC1895E9;
fma.rn.f64 %fd58, %fd57, %fd55, %fd56;
mov.f64 %fd59, 0d3FB11E52CC2F79AE;
fma.rn.f64 %fd60, %fd58, %fd55, %fd59;
mov.f64 %fd61, 0dBF924EAF3526861B;
fma.rn.f64 %fd62, %fd60, %fd55, %fd61;
mov.f64 %fd63, 0d3F91DF02A31E6CB7;
fma.rn.f64 %fd64, %fd62, %fd55, %fd63;
mov.f64 %fd65, 0d3F847D18B0EEC6CC;
fma.rn.f64 %fd66, %fd64, %fd55, %fd65;
mov.f64 %fd67, 0d3F8D0AF961BA53B0;
fma.rn.f64 %fd68, %fd66, %fd55, %fd67;
mov.f64 %fd69, 0d3F91BF7734CF1C48;
fma.rn.f64 %fd70, %fd68, %fd55, %fd69;
mov.f64 %fd71, 0d3F96E91483144EF7;
fma.rn.f64 %fd72, %fd70, %fd55, %fd71;
mov.f64 %fd73, 0d3F9F1C6E0A4F9F81;
fma.rn.f64 %fd74, %fd72, %fd55, %fd73;
mov.f64 %fd75, 0d3FA6DB6DC27FA92B;
fma.rn.f64 %fd76, %fd74, %fd55, %fd75;
mov.f64 %fd77, 0d3FB333333320F91B;
fma.rn.f64 %fd78, %fd76, %fd55, %fd77;
mov.f64 %fd79, 0d3FC5555555555F4D;
fma.rn.f64 %fd80, %fd78, %fd55, %fd79;
mul.f64 %fd81, %fd55, %fd80;
fma.rn.f64 %fd82, %fd81, %fd1, %fd1;
bra.uni BB82_4;
BB82_2:
// Near-|x|=1 path using asin(x) = pi/2 - 2*asin(sqrt((1-|x|)/2)):
// s = (1-|x|)/2, then sqrt(s) by rsqrt.approx + Newton refinement.
abs.f64 %fd7, %fd1;
mov.f64 %fd8, 0d3FE0000000000000;
mov.f64 %fd9, 0dBFE0000000000000;
fma.rn.f64 %fd6, %fd9, %fd7, %fd8;
// inline asm
rsqrt.approx.ftz.f64 %fd5, %fd6;
// inline asm
{
.reg .b32 %temp;
mov.b64 {%r7, %temp}, %fd5;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r8}, %fd5;
}
// Halve the rsqrt estimate by subtracting 1 from the exponent field.
add.s32 %r9, %r8, -1048576;
mov.b64 %fd10, {%r7, %r9};
mul.f64 %fd11, %fd6, %fd5;
neg.f64 %fd12, %fd11;
fma.rn.f64 %fd13, %fd11, %fd12, %fd6;
fma.rn.f64 %fd14, %fd13, %fd10, %fd11;
neg.f64 %fd15, %fd14;
mov.f64 %fd16, 0d3FF0000000000000;
fma.rn.f64 %fd17, %fd5, %fd15, %fd16;
fma.rn.f64 %fd18, %fd17, %fd10, %fd10;
fma.rn.f64 %fd19, %fd14, %fd15, %fd6;
fma.rn.f64 %fd20, %fd19, %fd18, %fd14;
// Special cases: s < 0 -> NaN; s == 0 -> 0.
{
.reg .b32 %temp;
mov.b64 {%temp, %r10}, %fd6;
}
setp.lt.s32 %p3, %r10, 0;
selp.f64 %fd21, 0dFFF8000000000000, %fd20, %p3;
setp.ne.f64 %p4, %fd6, 0d0000000000000000;
selp.f64 %fd22, %fd21, %fd6, %p4;
// Same asin core polynomial, evaluated at s instead of x^2.
mov.f64 %fd23, 0dBFB3823B180754AF;
mov.f64 %fd24, 0d3FB0066BDC1895E9;
fma.rn.f64 %fd25, %fd24, %fd6, %fd23;
mov.f64 %fd26, 0d3FB11E52CC2F79AE;
fma.rn.f64 %fd27, %fd25, %fd6, %fd26;
mov.f64 %fd28, 0dBF924EAF3526861B;
fma.rn.f64 %fd29, %fd27, %fd6, %fd28;
mov.f64 %fd30, 0d3F91DF02A31E6CB7;
fma.rn.f64 %fd31, %fd29, %fd6, %fd30;
mov.f64 %fd32, 0d3F847D18B0EEC6CC;
fma.rn.f64 %fd33, %fd31, %fd6, %fd32;
mov.f64 %fd34, 0d3F8D0AF961BA53B0;
fma.rn.f64 %fd35, %fd33, %fd6, %fd34;
mov.f64 %fd36, 0d3F91BF7734CF1C48;
fma.rn.f64 %fd37, %fd35, %fd6, %fd36;
mov.f64 %fd38, 0d3F96E91483144EF7;
fma.rn.f64 %fd39, %fd37, %fd6, %fd38;
mov.f64 %fd40, 0d3F9F1C6E0A4F9F81;
fma.rn.f64 %fd41, %fd39, %fd6, %fd40;
mov.f64 %fd42, 0d3FA6DB6DC27FA92B;
fma.rn.f64 %fd43, %fd41, %fd6, %fd42;
mov.f64 %fd44, 0d3FB333333320F91B;
fma.rn.f64 %fd45, %fd43, %fd6, %fd44;
mov.f64 %fd46, 0d3FC5555555555F4D;
fma.rn.f64 %fd47, %fd45, %fd6, %fd46;
mul.f64 %fd48, %fd6, %fd47;
// Assemble pi/2 - 2*(sqrt(s) + sqrt(s)*s*poly(s)) using split pi/4
// constants (0d3FE921FB54442D18 + tail 0d3C91A62633145C07).
mul.f64 %fd49, %fd22, 0dC000000000000000;
mov.f64 %fd50, 0d3C91A62633145C07;
fma.rn.f64 %fd51, %fd49, %fd48, %fd50;
add.f64 %fd52, %fd49, 0d3FE921FB54442D18;
add.f64 %fd53, %fd52, %fd51;
add.f64 %fd54, %fd53, 0d3FE921FB54442D18;
// Reapply the sign of x (asin is odd).
{
.reg .b32 %temp;
mov.b64 {%r11, %temp}, %fd54;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r12}, %fd54;
}
and.b32 %r13, %r2, -2147483648;
or.b32 %r14, %r12, %r13;
mov.b64 %fd82, {%r11, %r14};
BB82_4:
// Store the result at the same element index in the output array.
cvta.to.global.u64 %rd7, %rd3;
shl.b64 %rd8, %rd1, 3;
add.s64 %rd9, %rd7, %rd8;
st.global.f64 [%rd9], %fd82;
BB82_5:
ret;
}
// .globl matrix_asin_f
// Elementwise single-precision arcsine kernel (per name):
// out[i] = asinf(in[i]) for i in [0, n).
//   param_0 = input pointer, param_1 = output pointer, param_2 = count n.
// Branch-free: uses selp to pick between the small-|x| argument and the
// sqrt((1-|x|)/2) substitution, sharing one polynomial evaluation.
.visible .entry matrix_asin_f(
.param .u64 matrix_asin_f_param_0,
.param .u64 matrix_asin_f_param_1,
.param .u32 matrix_asin_f_param_2
)
{
.reg .pred %p<4>;
.reg .f32 %f<26>;
.reg .b32 %r<10>;
.reg .b64 %rd<8>;
ld.param.u64 %rd1, [matrix_asin_f_param_0];
ld.param.u64 %rd2, [matrix_asin_f_param_1];
ld.param.u32 %r2, [matrix_asin_f_param_2];
// Global element index and bounds check (one thread per element).
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
@%p1 bra BB83_2;
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f1, [%rd5];
// For |x| > 0f3F11EB85 use t = sqrt((1-|x|)/2), otherwise t = |x|.
abs.f32 %f2, %f1;
mov.f32 %f3, 0f3F800000;
sub.f32 %f4, %f3, %f2;
mul.f32 %f5, %f4, 0f3F000000;
sqrt.rn.f32 %f6, %f5;
setp.gt.f32 %p2, %f2, 0f3F11EB85;
selp.f32 %f7, %f6, %f2, %p2;
// Core polynomial in t^2 via FMA chain: r = t + t*t^2*poly(t^2).
mul.f32 %f8, %f7, %f7;
mov.f32 %f9, 0f3C94D2E9;
mov.f32 %f10, 0f3D53F941;
fma.rn.f32 %f11, %f10, %f8, %f9;
mov.f32 %f12, 0f3D3F841F;
fma.rn.f32 %f13, %f11, %f8, %f12;
mov.f32 %f14, 0f3D994929;
fma.rn.f32 %f15, %f13, %f8, %f14;
mov.f32 %f16, 0f3E2AAB94;
fma.rn.f32 %f17, %f15, %f8, %f16;
mul.f32 %f18, %f8, %f17;
fma.rn.f32 %f19, %f18, %f7, %f7;
// Large-|x| case: asin(|x|) = pi/2 - 2*r (0f3FC90FDB = pi/2).
mov.f32 %f20, 0f3FC90FDB;
mov.f32 %f21, 0fC0000000;
fma.rn.f32 %f22, %f21, %f19, %f20;
selp.f32 %f23, %f22, %f19, %p2;
// Reapply the sign of x (asin is odd); NaN results (> +Inf compare
// false) keep the unsigned value.
setp.le.f32 %p3, %f23, 0f7F800000;
mov.b32 %r6, %f23;
mov.b32 %r7, %f1;
and.b32 %r8, %r7, -2147483648;
or.b32 %r9, %r6, %r8;
mov.b32 %f24, %r9;
selp.f32 %f25, %f24, %f23, %p3;
// Store at the same element offset in the output array.
cvta.to.global.u64 %rd6, %rd2;
add.s64 %rd7, %rd6, %rd4;
st.global.f32 [%rd7], %f25;
BB83_2:
ret;
}
// .globl matrix_acos_d
//
// matrix_acos_d(in, out, n): element-wise double-precision kernel.
//   param_0 = input f64 array, param_1 = output f64 array, param_2 = count n.
// Per the kernel name this computes arccos; the body is a two-path
// approximation split on |x| and finished as pi/2 -/+ asin(|x|).
// NOTE(review): presumably matches CUDA's acos -- confirm vs libdevice.
.visible .entry matrix_acos_d(
.param .u64 matrix_acos_d_param_0,
.param .u64 matrix_acos_d_param_1,
.param .u32 matrix_acos_d_param_2
)
{
.reg .pred %p<7>;
.reg .b32 %r<17>;
.reg .f64 %fd<95>;
.reg .b64 %rd<10>;
ld.param.u64 %rd2, [matrix_acos_d_param_0];
ld.param.u64 %rd3, [matrix_acos_d_param_1];
ld.param.u32 %r4, [matrix_acos_d_param_2];
// global thread index and bounds check
mov.u32 %r5, %ctaid.x;
mov.u32 %r6, %ntid.x;
mov.u32 %r7, %tid.x;
mad.lo.s32 %r1, %r6, %r5, %r7;
setp.ge.u32 %p1, %r1, %r4;
@%p1 bra BB84_14;
cvta.to.global.u64 %rd4, %rd2;
cvt.s64.s32 %rd1, %r1;
mul.wide.s32 %rd5, %r1, 8;
add.s64 %rd6, %rd4, %rd5;
ld.global.f64 %fd16, [%rd6];
// %r2 = high 32 bits of x (carries the sign); %r8 = high 32 bits of |x|
{
.reg .b32 %temp;
mov.b64 {%temp, %r2}, %fd16;
}
abs.f64 %fd1, %fd16;
{
.reg .b32 %temp;
mov.b64 {%temp, %r8}, %fd1;
}
// split: |x| below ~0.575 (high word 0x3FE26666) takes the direct
// polynomial path BB84_9; larger |x| takes the sqrt path BB84_2
setp.lt.s32 %p2, %r8, 1071801958;
@%p2 bra BB84_9;
bra.uni BB84_2;
BB84_9:
// small-|x| path: odd polynomial asin(|x|) = |x| + |x|*t*poly(t), t = x^2
mul.f64 %fd62, %fd1, %fd1;
mov.f64 %fd63, 0dBFB3823B180754AF;
mov.f64 %fd64, 0d3FB0066BDC1895E9;
fma.rn.f64 %fd65, %fd64, %fd62, %fd63;
mov.f64 %fd66, 0d3FB11E52CC2F79AE;
fma.rn.f64 %fd67, %fd65, %fd62, %fd66;
mov.f64 %fd68, 0dBF924EAF3526861B;
fma.rn.f64 %fd69, %fd67, %fd62, %fd68;
mov.f64 %fd70, 0d3F91DF02A31E6CB7;
fma.rn.f64 %fd71, %fd69, %fd62, %fd70;
mov.f64 %fd72, 0d3F847D18B0EEC6CC;
fma.rn.f64 %fd73, %fd71, %fd62, %fd72;
mov.f64 %fd74, 0d3F8D0AF961BA53B0;
fma.rn.f64 %fd75, %fd73, %fd62, %fd74;
mov.f64 %fd76, 0d3F91BF7734CF1C48;
fma.rn.f64 %fd77, %fd75, %fd62, %fd76;
mov.f64 %fd78, 0d3F96E91483144EF7;
fma.rn.f64 %fd79, %fd77, %fd62, %fd78;
mov.f64 %fd80, 0d3F9F1C6E0A4F9F81;
fma.rn.f64 %fd81, %fd79, %fd62, %fd80;
mov.f64 %fd82, 0d3FA6DB6DC27FA92B;
fma.rn.f64 %fd83, %fd81, %fd62, %fd82;
mov.f64 %fd84, 0d3FB333333320F91B;
fma.rn.f64 %fd85, %fd83, %fd62, %fd84;
mov.f64 %fd86, 0d3FC5555555555F4D;
fma.rn.f64 %fd87, %fd85, %fd62, %fd86;
mul.f64 %fd88, %fd62, %fd87;
fma.rn.f64 %fd10, %fd88, %fd1, %fd1;
// sign of x selects acos = pi/2 - asin (x >= 0) or pi/2 + asin (x < 0);
// the 0d3C91A62633145C07 addend is a low-order pi/2 correction term
setp.lt.s32 %p6, %r2, 0;
@%p6 bra BB84_11;
mov.f64 %fd89, 0dBC91A62633145C07;
add.rn.f64 %fd90, %fd10, %fd89;
neg.f64 %fd93, %fd90;
bra.uni BB84_12;
BB84_2:
// large-|x| path: z = 1 - |x|; halve by decrementing the exponent field
// (high-word -1048576 = -2^20 subtracts 1 from the biased exponent)
mov.f64 %fd19, 0d3FF0000000000000;
sub.f64 %fd2, %fd19, %fd1;
{
.reg .b32 %temp;
mov.b64 {%r9, %temp}, %fd2;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r3}, %fd2;
}
add.s32 %r10, %r3, -1048576;
mov.b64 %fd18, {%r9, %r10};
// inline asm
rsqrt.approx.ftz.f64 %fd17, %fd18;
// inline asm
// Newton-Raphson refinement of rsqrt to a correctly-refined sqrt(z/2)
{
.reg .b32 %temp;
mov.b64 {%r11, %temp}, %fd17;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r12}, %fd17;
}
add.s32 %r13, %r12, -1048576;
mov.b64 %fd20, {%r11, %r13};
mul.f64 %fd21, %fd18, %fd17;
neg.f64 %fd22, %fd21;
fma.rn.f64 %fd23, %fd21, %fd22, %fd18;
fma.rn.f64 %fd24, %fd23, %fd20, %fd21;
neg.f64 %fd25, %fd24;
fma.rn.f64 %fd26, %fd17, %fd25, %fd19;
fma.rn.f64 %fd27, %fd26, %fd20, %fd20;
fma.rn.f64 %fd28, %fd24, %fd25, %fd18;
fma.rn.f64 %fd3, %fd28, %fd27, %fd24;
// if the high word of z = 1-|x| is < 1 (z is zero, subnormal, or
// negative, i.e. |x| >= 1) take the special-case path BB84_4
setp.lt.s32 %p3, %r3, 1;
@%p3 bra BB84_4;
// normal case: double the sqrt (exponent +2^20) and apply the
// polynomial correction in z: result = 2*s + 2*s*z*poly(z)
{
.reg .b32 %temp;
mov.b64 {%r14, %temp}, %fd3;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r15}, %fd3;
}
add.s32 %r16, %r15, 1048576;
mov.b64 %fd29, {%r14, %r16};
mov.f64 %fd30, 0dBEBAC2FE66FAAC4B;
mov.f64 %fd31, 0d3EC715B371155F70;
fma.rn.f64 %fd32, %fd31, %fd2, %fd30;
mov.f64 %fd33, 0d3ED9A9B88EFCD9B8;
fma.rn.f64 %fd34, %fd32, %fd2, %fd33;
mov.f64 %fd35, 0d3EDD0F40A8A0C4C3;
fma.rn.f64 %fd36, %fd34, %fd2, %fd35;
mov.f64 %fd37, 0d3EF46D4CFA9E0E1F;
fma.rn.f64 %fd38, %fd36, %fd2, %fd37;
mov.f64 %fd39, 0d3F079C168D1E2422;
fma.rn.f64 %fd40, %fd38, %fd2, %fd39;
mov.f64 %fd41, 0d3F1C9A88C3BCA540;
fma.rn.f64 %fd42, %fd40, %fd2, %fd41;
mov.f64 %fd43, 0d3F31C4E64BD476DF;
fma.rn.f64 %fd44, %fd42, %fd2, %fd43;
mov.f64 %fd45, 0d3F46E8BA60009C8F;
fma.rn.f64 %fd46, %fd44, %fd2, %fd45;
mov.f64 %fd47, 0d3F5F1C71C62B05A2;
fma.rn.f64 %fd48, %fd46, %fd2, %fd47;
mov.f64 %fd49, 0d3F76DB6DB6DC9F2C;
fma.rn.f64 %fd50, %fd48, %fd2, %fd49;
mov.f64 %fd51, 0d3F9333333333329C;
fma.rn.f64 %fd52, %fd50, %fd2, %fd51;
mov.f64 %fd53, 0d3FB5555555555555;
fma.rn.f64 %fd54, %fd52, %fd2, %fd53;
mul.f64 %fd55, %fd2, %fd54;
fma.rn.f64 %fd94, %fd55, %fd29, %fd29;
bra.uni BB84_5;
BB84_11:
// x < 0 on the small path: acos = pi/2 + asin(|x|)
mov.f64 %fd91, 0d3C91A62633145C07;
add.rn.f64 %fd93, %fd10, %fd91;
BB84_12:
// 0d3FF921FB54442D18 = pi/2
mov.f64 %fd92, 0d3FF921FB54442D18;
add.rn.f64 %fd94, %fd92, %fd93;
bra.uni BB84_13;
BB84_4:
// |x| >= 1 (or z subnormal): start from |x| * 0
mov.f64 %fd56, 0d0000000000000000;
mul.rn.f64 %fd94, %fd1, %fd56;
BB84_5:
// z negative (|x| > 1): multiply by +inf to produce NaN via 0 * inf
setp.gt.s32 %p4, %r3, -1;
@%p4 bra BB84_7;
mov.f64 %fd57, 0d7FF0000000000000;
mul.rn.f64 %fd94, %fd94, %fd57;
BB84_7:
// x < 0 on the large path: result = pi - result
// (0d400921FB54442D18 = pi; 0dBCA1A62633145C07 is a low-order pi term)
setp.gt.s32 %p5, %r2, -1;
@%p5 bra BB84_13;
mov.f64 %fd58, 0dBCA1A62633145C07;
add.rn.f64 %fd59, %fd94, %fd58;
neg.f64 %fd60, %fd59;
mov.f64 %fd61, 0d400921FB54442D18;
add.rn.f64 %fd94, %fd61, %fd60;
BB84_13:
cvta.to.global.u64 %rd7, %rd3;
shl.b64 %rd8, %rd1, 3;
add.s64 %rd9, %rd7, %rd8;
st.global.f64 [%rd9], %fd94;
BB84_14:
ret;
}
// .globl matrix_acos_f
//
// matrix_acos_f(in, out, n): element-wise single-precision kernel.
//   param_0 = input f32 array, param_1 = output f32 array, param_2 = count n.
// Shares the asin core polynomial with matrix_asin_f, finished as an
// arccos: 2*r on the sqrt branch, pi/2 - r otherwise, and pi - result
// for negative inputs. NOTE(review): presumably matches CUDA's acosf.
.visible .entry matrix_acos_f(
.param .u64 matrix_acos_f_param_0,
.param .u64 matrix_acos_f_param_1,
.param .u32 matrix_acos_f_param_2
)
{
.reg .pred %p<4>;
.reg .f32 %f<27>;
.reg .b32 %r<6>;
.reg .b64 %rd<8>;
ld.param.u64 %rd1, [matrix_acos_f_param_0];
ld.param.u64 %rd2, [matrix_acos_f_param_1];
ld.param.u32 %r2, [matrix_acos_f_param_2];
// global thread index and bounds check
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
@%p1 bra BB85_2;
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f1, [%rd5];
abs.f32 %f2, %f1;
// %f6 = sqrt((1 - |x|) * 0.5); used as the polynomial argument when
// |x| > ~0.5695 (0f3F11EB85), otherwise evaluate at |x| directly
mov.f32 %f3, 0f3F800000;
sub.f32 %f4, %f3, %f2;
mul.f32 %f5, %f4, 0f3F000000;
sqrt.rn.f32 %f6, %f5;
setp.gt.f32 %p2, %f2, 0f3F11EB85;
selp.f32 %f7, %f6, %f2, %p2;
// odd minimax polynomial: r = t + t * t^2 * poly(t^2)
mul.f32 %f8, %f7, %f7;
mov.f32 %f9, 0f3C94D2E9;
mov.f32 %f10, 0f3D53F941;
fma.rn.f32 %f11, %f10, %f8, %f9;
mov.f32 %f12, 0f3D3F841F;
fma.rn.f32 %f13, %f11, %f8, %f12;
mov.f32 %f14, 0f3D994929;
fma.rn.f32 %f15, %f13, %f8, %f14;
mov.f32 %f16, 0f3E2AAB94;
fma.rn.f32 %f17, %f15, %f8, %f16;
mul.f32 %f18, %f8, %f17;
fma.rn.f32 %f19, %f18, %f7, %f7;
// acos(|x|) = 2*r on the sqrt branch, pi/2 - r (0f3FC90FDB) otherwise
add.f32 %f20, %f19, %f19;
mov.f32 %f21, 0f3FC90FDB;
sub.f32 %f22, %f21, %f19;
selp.f32 %f23, %f20, %f22, %p2;
// negative input: acos(x) = pi - acos(|x|)   (0f40490FDB = pi as f32)
setp.lt.f32 %p3, %f1, 0f00000000;
mov.f32 %f24, 0f40490FDB;
sub.f32 %f25, %f24, %f23;
selp.f32 %f26, %f25, %f23, %p3;
cvta.to.global.u64 %rd6, %rd2;
add.s64 %rd7, %rd6, %rd4;
st.global.f32 [%rd7], %f26;
BB85_2:
ret;
}
// .globl matrix_atan_d
//
// matrix_atan_d(in, out, n): element-wise double-precision kernel.
//   param_0 = input f64 array, param_1 = output f64 array, param_2 = count n.
// Per the kernel name this computes arctan: for |x| > 1 it reduces via the
// reciprocal (atan(x) = pi/2 - atan(1/x)), evaluates an odd polynomial,
// then copies the input's sign bit onto the result.
// NOTE(review): presumably matches CUDA's atan -- confirm vs libdevice.
.visible .entry matrix_atan_d(
.param .u64 matrix_atan_d_param_0,
.param .u64 matrix_atan_d_param_1,
.param .u32 matrix_atan_d_param_2
)
{
.reg .pred %p<5>;
.reg .b32 %r<11>;
.reg .f64 %fd<57>;
.reg .b64 %rd<10>;
ld.param.u64 %rd2, [matrix_atan_d_param_0];
ld.param.u64 %rd3, [matrix_atan_d_param_1];
ld.param.u32 %r2, [matrix_atan_d_param_2];
// global thread index and bounds check
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
@%p1 bra BB86_4;
cvta.to.global.u64 %rd4, %rd2;
cvt.s64.s32 %rd1, %r1;
mul.wide.s32 %rd5, %r1, 8;
add.s64 %rd6, %rd4, %rd5;
ld.global.f64 %fd1, [%rd6];
abs.f64 %fd2, %fd1;
// |x| <= 1 (or NaN): evaluate the polynomial at |x| directly
setp.leu.f64 %p2, %fd2, 0d3FF0000000000000;
mov.f64 %fd56, %fd2;
@%p2 bra BB86_3;
// |x| > 1: t = 1/|x| via rcp.approx + one Newton-Raphson step;
// force t = 0 when |x| is +inf so that atan(inf) -> pi/2 exactly
// inline asm
rcp.approx.ftz.f64 %fd5,%fd2;
// inline asm
neg.f64 %fd7, %fd2;
mov.f64 %fd8, 0d3FF0000000000000;
fma.rn.f64 %fd9, %fd7, %fd5, %fd8;
fma.rn.f64 %fd10, %fd9, %fd9, %fd9;
fma.rn.f64 %fd11, %fd10, %fd5, %fd5;
setp.eq.f64 %p3, %fd2, 0d7FF0000000000000;
selp.f64 %fd3, 0d0000000000000000, %fd11, %p3;
mov.f64 %fd56, %fd3;
BB86_3:
mov.f64 %fd4, %fd56;
cvta.to.global.u64 %rd7, %rd3;
// odd polynomial in t: r = t + t * t^2 * poly(t^2)
mul.f64 %fd12, %fd4, %fd4;
mov.f64 %fd13, 0d3F2D3B63DBB65B49;
mov.f64 %fd14, 0dBEF53E1D2A25FF7E;
fma.rn.f64 %fd15, %fd14, %fd12, %fd13;
mov.f64 %fd16, 0dBF5312788DDE082E;
fma.rn.f64 %fd17, %fd15, %fd12, %fd16;
mov.f64 %fd18, 0d3F6F9690C8249315;
fma.rn.f64 %fd19, %fd17, %fd12, %fd18;
mov.f64 %fd20, 0dBF82CF5AABC7CF0D;
fma.rn.f64 %fd21, %fd19, %fd12, %fd20;
mov.f64 %fd22, 0d3F9162B0B2A3BFDE;
fma.rn.f64 %fd23, %fd21, %fd12, %fd22;
mov.f64 %fd24, 0dBF9A7256FEB6FC6B;
fma.rn.f64 %fd25, %fd23, %fd12, %fd24;
mov.f64 %fd26, 0d3FA171560CE4A489;
fma.rn.f64 %fd27, %fd25, %fd12, %fd26;
mov.f64 %fd28, 0dBFA4F44D841450E4;
fma.rn.f64 %fd29, %fd27, %fd12, %fd28;
mov.f64 %fd30, 0d3FA7EE3D3F36BB95;
fma.rn.f64 %fd31, %fd29, %fd12, %fd30;
mov.f64 %fd32, 0dBFAAD32AE04A9FD1;
fma.rn.f64 %fd33, %fd31, %fd12, %fd32;
mov.f64 %fd34, 0d3FAE17813D66954F;
fma.rn.f64 %fd35, %fd33, %fd12, %fd34;
mov.f64 %fd36, 0dBFB11089CA9A5BCD;
fma.rn.f64 %fd37, %fd35, %fd12, %fd36;
mov.f64 %fd38, 0d3FB3B12B2DB51738;
fma.rn.f64 %fd39, %fd37, %fd12, %fd38;
mov.f64 %fd40, 0dBFB745D022F8DC5C;
fma.rn.f64 %fd41, %fd39, %fd12, %fd40;
mov.f64 %fd42, 0d3FBC71C709DFE927;
fma.rn.f64 %fd43, %fd41, %fd12, %fd42;
mov.f64 %fd44, 0dBFC2492491FA1744;
fma.rn.f64 %fd45, %fd43, %fd12, %fd44;
mov.f64 %fd46, 0d3FC99999999840D2;
fma.rn.f64 %fd47, %fd45, %fd12, %fd46;
mov.f64 %fd48, 0dBFD555555555544C;
fma.rn.f64 %fd49, %fd47, %fd12, %fd48;
mul.f64 %fd50, %fd12, %fd49;
fma.rn.f64 %fd51, %fd50, %fd4, %fd4;
// |x| > 1 branch: result = pi/2 - r   (0d3FF921FB54442D18 = pi/2)
mov.f64 %fd52, 0d3FF921FB54442D18;
sub.f64 %fd53, %fd52, %fd51;
setp.gt.f64 %p4, %fd2, 0d3FF0000000000000;
selp.f64 %fd54, %fd53, %fd51, %p4;
// copy the input's sign bit (mask 0x80000000 on the high word)
{
.reg .b32 %temp;
mov.b64 {%r6, %temp}, %fd54;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r7}, %fd54;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r8}, %fd1;
}
and.b32 %r9, %r8, -2147483648;
or.b32 %r10, %r7, %r9;
mov.b64 %fd55, {%r6, %r10};
shl.b64 %rd8, %rd1, 3;
add.s64 %rd9, %rd7, %rd8;
st.global.f64 [%rd9], %fd55;
BB86_4:
ret;
}
// .globl matrix_atan_f
//
// matrix_atan_f(in, out, n): element-wise single-precision kernel.
//   param_0 = input f32 array, param_1 = output f32 array, param_2 = count n.
// Per the kernel name this computes arctan with a rational (Pade-style)
// approximation; |x| > 1 is reduced via t = 1/|x| and pi/2 - r, and the
// input's sign bit is copied back at the end.
// NOTE(review): presumably matches CUDA's atanf -- confirm vs libdevice.
.visible .entry matrix_atan_f(
.param .u64 matrix_atan_f_param_0,
.param .u64 matrix_atan_f_param_1,
.param .u32 matrix_atan_f_param_2
)
{
.reg .pred %p<5>;
.reg .f32 %f<26>;
.reg .b32 %r<10>;
.reg .b64 %rd<10>;
ld.param.u64 %rd2, [matrix_atan_f_param_0];
ld.param.u64 %rd3, [matrix_atan_f_param_1];
ld.param.u32 %r2, [matrix_atan_f_param_2];
// global thread index and bounds check
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
@%p1 bra BB87_4;
cvta.to.global.u64 %rd4, %rd2;
cvt.s64.s32 %rd1, %r1;
mul.wide.s32 %rd5, %r1, 4;
add.s64 %rd6, %rd4, %rd5;
ld.global.f32 %f1, [%rd6];
abs.f32 %f2, %f1;
// argument reduction: t = |x| if |x| <= 1 (or NaN), else t = 1/|x|
setp.leu.f32 %p2, %f2, 0f3F800000;
mov.f32 %f25, %f2;
@%p2 bra BB87_3;
rcp.rn.f32 %f3, %f2;
mov.f32 %f25, %f3;
BB87_3:
mov.f32 %f4, %f25;
cvta.to.global.u64 %rd7, %rd3;
// rational approximation: r = t + t*s*P(s) / Q(s), s = t^2
mul.rn.f32 %f5, %f4, %f4;
mov.f32 %f6, 0fC0B59883;
mov.f32 %f7, 0fBF52C7EA;
fma.rn.f32 %f8, %f5, %f7, %f6;
mov.f32 %f9, 0fC0D21907;
fma.rn.f32 %f10, %f8, %f5, %f9;
mul.f32 %f11, %f5, %f10;
mul.f32 %f12, %f4, %f11;
add.f32 %f13, %f5, 0f41355DC0;
mov.f32 %f14, 0f41E6BD60;
fma.rn.f32 %f15, %f13, %f5, %f14;
mov.f32 %f16, 0f419D92C8;
fma.rn.f32 %f17, %f15, %f5, %f16;
rcp.rn.f32 %f18, %f17;
fma.rn.f32 %f19, %f12, %f18, %f4;
// |x| > 1 branch: result = pi/2 - r   (0f3FC90FDB = pi/2f)
mov.f32 %f20, 0f3FC90FDB;
sub.f32 %f21, %f20, %f19;
setp.gt.f32 %p3, %f2, 0f3F800000;
selp.f32 %f22, %f21, %f19, %p3;
// copy the input's sign bit (mask 0x80000000) onto the result, but
// keep the unsigned result when |x| is NaN (> 0f7F800000 check)
mov.b32 %r6, %f22;
mov.b32 %r7, %f1;
and.b32 %r8, %r7, -2147483648;
or.b32 %r9, %r6, %r8;
mov.b32 %f23, %r9;
setp.le.f32 %p4, %f2, 0f7F800000;
selp.f32 %f24, %f23, %f22, %p4;
shl.b64 %rd8, %rd1, 2;
add.s64 %rd9, %rd7, %rd8;
st.global.f32 [%rd9], %f24;
BB87_4:
ret;
}
// .globl matrix_sign_d
//
// matrix_sign_d(in, out, n): element-wise double-precision signum kernel.
//   param_0 = input f64 array, param_1 = output f64 array, param_2 = count n.
// Stores 0.0 when x == 0.0 (also for -0.0, since 0d0 compares equal),
// otherwise copysign(1.0, x) assembled with bit operations.
.visible .entry matrix_sign_d(
.param .u64 matrix_sign_d_param_0,
.param .u64 matrix_sign_d_param_1,
.param .u32 matrix_sign_d_param_2
)
{
.reg .pred %p<3>;
.reg .b32 %r<12>;
.reg .f64 %fd<4>;
.reg .b64 %rd<9>;
ld.param.u64 %rd2, [matrix_sign_d_param_0];
ld.param.u64 %rd3, [matrix_sign_d_param_1];
ld.param.u32 %r2, [matrix_sign_d_param_2];
// global thread index and bounds check
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
@%p1 bra BB88_4;
cvta.to.global.u64 %rd4, %rd2;
mul.wide.s32 %rd5, %r1, 8;
add.s64 %rd6, %rd4, %rd5;
ld.global.f64 %fd1, [%rd6];
setp.eq.f64 %p2, %fd1, 0d0000000000000000;
cvta.to.global.u64 %rd7, %rd3;
add.s64 %rd1, %rd7, %rd5;
@%p2 bra BB88_3;
bra.uni BB88_2;
BB88_3:
// x == 0: store the 64-bit pattern of +0.0
mov.u64 %rd8, 0;
st.global.u64 [%rd1], %rd8;
bra.uni BB88_4;
BB88_2:
// x != 0: build copysign(1.0, x): take x's sign bit (0x80000000 mask on
// the high word) and OR it into the high word of the constant 1.0
{
.reg .b32 %temp;
mov.b64 {%temp, %r6}, %fd1;
}
and.b32 %r7, %r6, -2147483648;
mov.f64 %fd2, 0d3FF0000000000000;
{
.reg .b32 %temp;
mov.b64 {%temp, %r8}, %fd2;
}
and.b32 %r9, %r8, 2147483647;
or.b32 %r10, %r9, %r7;
{
.reg .b32 %temp;
mov.b64 {%r11, %temp}, %fd2;
}
mov.b64 %fd3, {%r11, %r10};
st.global.f64 [%rd1], %fd3;
BB88_4:
ret;
}
// .globl matrix_sign_f
//
// matrix_sign_f(in, out, n): element-wise single-precision signum kernel.
//   param_0 = input f32 array, param_1 = output f32 array, param_2 = count n.
// Stores 0.0f when x == 0.0f, otherwise copysign(1.0, x) computed on the
// f64 representation and converted back to f32.
.visible .entry matrix_sign_f(
.param .u64 matrix_sign_f_param_0,
.param .u64 matrix_sign_f_param_1,
.param .u32 matrix_sign_f_param_2
)
{
.reg .pred %p<3>;
.reg .f32 %f<3>;
.reg .b32 %r<13>;
.reg .f64 %fd<4>;
.reg .b64 %rd<8>;
ld.param.u64 %rd2, [matrix_sign_f_param_0];
ld.param.u64 %rd3, [matrix_sign_f_param_1];
ld.param.u32 %r2, [matrix_sign_f_param_2];
// global thread index and bounds check
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
@%p1 bra BB89_4;
cvta.to.global.u64 %rd4, %rd2;
mul.wide.s32 %rd5, %r1, 4;
add.s64 %rd6, %rd4, %rd5;
ld.global.f32 %f1, [%rd6];
setp.eq.f32 %p2, %f1, 0f00000000;
cvta.to.global.u64 %rd7, %rd3;
add.s64 %rd1, %rd7, %rd5;
@%p2 bra BB89_3;
bra.uni BB89_2;
BB89_3:
// x == 0: store the 32-bit pattern of +0.0f
mov.u32 %r12, 0;
st.global.u32 [%rd1], %r12;
bra.uni BB89_4;
BB89_2:
// x != 0: widen to f64, build copysign(1.0, x) via the high-word sign
// bit (0x80000000 mask), then narrow the +/-1.0 back to f32
cvt.f64.f32 %fd1, %f1;
{
.reg .b32 %temp;
mov.b64 {%temp, %r6}, %fd1;
}
and.b32 %r7, %r6, -2147483648;
mov.f64 %fd2, 0d3FF0000000000000;
{
.reg .b32 %temp;
mov.b64 {%temp, %r8}, %fd2;
}
and.b32 %r9, %r8, 2147483647;
or.b32 %r10, %r9, %r7;
{
.reg .b32 %temp;
mov.b64 {%r11, %temp}, %fd2;
}
mov.b64 %fd3, {%r11, %r10};
cvt.rn.f32.f64 %f2, %fd3;
st.global.f32 [%rd1], %f2;
BB89_4:
ret;
}
// .globl matrix_sigmoid_d
//
// matrix_sigmoid_d(in, out, n): element-wise double-precision sigmoid.
//   param_0 = input f64 array, param_1 = output f64 array, param_2 = count n.
// Uses the identity sigmoid(x) = 0.5 * tanh(x/2) + 0.5 (the final fma with
// 0d3FE0000000000000 = 0.5 makes this explicit): computes t = x/2, then
// tanh(|t|) by polynomial (small |t|) or via exp (large |t|), re-applies
// t's sign, and finishes with 0.5*tanh + 0.5.
.visible .entry matrix_sigmoid_d(
.param .u64 matrix_sigmoid_d_param_0,
.param .u64 matrix_sigmoid_d_param_1,
.param .u32 matrix_sigmoid_d_param_2
)
{
.reg .pred %p<4>;
.reg .b32 %r<17>;
.reg .f64 %fd<76>;
.reg .b64 %rd<10>;
ld.param.u64 %rd2, [matrix_sigmoid_d_param_0];
ld.param.u64 %rd3, [matrix_sigmoid_d_param_1];
ld.param.u32 %r4, [matrix_sigmoid_d_param_2];
// global thread index and bounds check
mov.u32 %r5, %ctaid.x;
mov.u32 %r6, %ntid.x;
mov.u32 %r7, %tid.x;
mad.lo.s32 %r1, %r6, %r5, %r7;
setp.ge.u32 %p1, %r1, %r4;
@%p1 bra BB90_5;
cvta.to.global.u64 %rd4, %rd2;
cvt.s64.s32 %rd1, %r1;
mul.wide.s32 %rd5, %r1, 8;
add.s64 %rd6, %rd4, %rd5;
ld.global.f64 %fd6, [%rd6];
// t = x * 0.5; fd2 = |t| built by clearing the sign bit (mask 0x7FFFFFFF)
mul.f64 %fd1, %fd6, 0d3FE0000000000000;
{
.reg .b32 %temp;
mov.b64 {%temp, %r2}, %fd1;
}
and.b32 %r3, %r2, 2147483647;
{
.reg .b32 %temp;
mov.b64 {%r8, %temp}, %fd1;
}
mov.b64 %fd2, {%r8, %r3};
// split at |t| < ~0.5556 (0d3FE1C7A398201CD6): small -> polynomial path
setp.ltu.f64 %p2, %fd2, 0d3FE1C7A398201CD6;
@%p2 bra BB90_3;
bra.uni BB90_2;
BB90_3:
// small-|t| path: odd tanh series, tanh(t) = t + t*s*poly(s), s = t^2
mul.f64 %fd52, %fd1, %fd1;
mov.f64 %fd53, 0dBF2B9093D89F0E23;
mov.f64 %fd54, 0d3F0ABFFC9B5786C4;
fma.rn.f64 %fd55, %fd54, %fd52, %fd53;
mov.f64 %fd56, 0d3F42FA2744C30B61;
fma.rn.f64 %fd57, %fd55, %fd52, %fd56;
mov.f64 %fd58, 0dBF57CF3B9C1E491D;
fma.rn.f64 %fd59, %fd57, %fd52, %fd58;
mov.f64 %fd60, 0d3F6D6C61D450119A;
fma.rn.f64 %fd61, %fd59, %fd52, %fd60;
mov.f64 %fd62, 0dBF8226DDD44294F5;
fma.rn.f64 %fd63, %fd61, %fd52, %fd62;
mov.f64 %fd64, 0d3F9664F45C2B04A6;
fma.rn.f64 %fd65, %fd63, %fd52, %fd64;
mov.f64 %fd66, 0dBFABA1BA1AD70754;
fma.rn.f64 %fd67, %fd65, %fd52, %fd66;
mov.f64 %fd68, 0d3FC111111110295E;
fma.rn.f64 %fd69, %fd67, %fd52, %fd68;
mov.f64 %fd70, 0dBFD555555555549F;
fma.rn.f64 %fd71, %fd69, %fd52, %fd70;
mul.f64 %fd72, %fd52, %fd71;
fma.rn.f64 %fd75, %fd72, %fd1, %fd1;
bra.uni BB90_4;
BB90_2:
// large-|t| path: tanh(|t|) = 1 - 2/(exp(2|t|) + 1).
// exp(2|t|) is computed inline: Cody-Waite reduction with
// log2(e) = 0d3FF71547652B82FE and the 2^52*1.5 shifter
// 0d4338000000000000, a degree-11 polynomial, then scaling by 2^k
// through direct exponent-field construction (shl 20 bits).
add.f64 %fd9, %fd2, %fd2;
mov.f64 %fd10, 0d4338000000000000;
mov.f64 %fd11, 0d3FF71547652B82FE;
fma.rn.f64 %fd12, %fd9, %fd11, %fd10;
{
.reg .b32 %temp;
mov.b64 {%r9, %temp}, %fd12;
}
mov.f64 %fd13, 0dC338000000000000;
add.rn.f64 %fd14, %fd12, %fd13;
mov.f64 %fd15, 0dBFE62E42FEFA39EF;
fma.rn.f64 %fd16, %fd14, %fd15, %fd9;
mov.f64 %fd17, 0dBC7ABC9E3B39803F;
fma.rn.f64 %fd18, %fd14, %fd17, %fd16;
mov.f64 %fd19, 0d3E5AF86D8EBD13CD;
mov.f64 %fd20, 0d3E21F4076ACD15B6;
fma.rn.f64 %fd21, %fd20, %fd18, %fd19;
mov.f64 %fd22, 0d3E927E5092BA033D;
fma.rn.f64 %fd23, %fd21, %fd18, %fd22;
mov.f64 %fd24, 0d3EC71DDE6C5F9DA1;
fma.rn.f64 %fd25, %fd23, %fd18, %fd24;
mov.f64 %fd26, 0d3EFA01A018D034E6;
fma.rn.f64 %fd27, %fd25, %fd18, %fd26;
mov.f64 %fd28, 0d3F2A01A01B3B6940;
fma.rn.f64 %fd29, %fd27, %fd18, %fd28;
mov.f64 %fd30, 0d3F56C16C16C1B5DD;
fma.rn.f64 %fd31, %fd29, %fd18, %fd30;
mov.f64 %fd32, 0d3F8111111110F74D;
fma.rn.f64 %fd33, %fd31, %fd18, %fd32;
mov.f64 %fd34, 0d3FA555555555554D;
fma.rn.f64 %fd35, %fd33, %fd18, %fd34;
mov.f64 %fd36, 0d3FC5555555555557;
fma.rn.f64 %fd37, %fd35, %fd18, %fd36;
mov.f64 %fd38, 0d3FE0000000000000;
fma.rn.f64 %fd39, %fd37, %fd18, %fd38;
mul.f64 %fd40, %fd18, %fd39;
fma.rn.f64 %fd41, %fd40, %fd18, %fd18;
shl.b32 %r10, %r9, 20;
add.s32 %r11, %r10, 1072693248;
mov.u32 %r12, 0;
mov.b64 %fd42, {%r12, %r11};
fma.rn.f64 %fd43, %fd41, %fd42, %fd42;
// 1/(exp(2|t|) + 1) via rcp.approx + Newton-Raphson refinement
add.f64 %fd8, %fd43, 0d3FF0000000000000;
// inline asm
rcp.approx.ftz.f64 %fd7,%fd8;
// inline asm
neg.f64 %fd44, %fd8;
mov.f64 %fd45, 0d3FF0000000000000;
fma.rn.f64 %fd46, %fd44, %fd7, %fd45;
fma.rn.f64 %fd47, %fd46, %fd46, %fd46;
fma.rn.f64 %fd48, %fd47, %fd7, %fd7;
neg.f64 %fd49, %fd48;
mov.f64 %fd50, 0d4000000000000000;
fma.rn.f64 %fd51, %fd50, %fd49, %fd45;
// saturate tanh to 1.0 for very large |t| (high word > 0x403FFFFF)
setp.gt.u32 %p3, %r3, 1077936127;
selp.f64 %fd75, 0d3FF0000000000000, %fd51, %p3;
BB90_4:
// re-apply t's sign to tanh(|t|), then sigmoid = 0.5*tanh + 0.5
cvta.to.global.u64 %rd7, %rd3;
and.b32 %r13, %r2, -2147483648;
{
.reg .b32 %temp;
mov.b64 {%temp, %r14}, %fd75;
}
or.b32 %r15, %r14, %r13;
{
.reg .b32 %temp;
mov.b64 {%r16, %temp}, %fd75;
}
mov.b64 %fd73, {%r16, %r15};
fma.rn.f64 %fd74, %fd73, 0d3FE0000000000000, 0d3FE0000000000000;
shl.b64 %rd8, %rd1, 3;
add.s64 %rd9, %rd7, %rd8;
st.global.f64 [%rd9], %fd74;
BB90_5:
ret;
}
// .globl matrix_sigmoid_f
//
// matrix_sigmoid_f(in, out, n): element-wise single-precision sigmoid.
//   param_0 = input f32 array, param_1 = output f32 array, param_2 = count n.
// Identical internals to matrix_sigmoid_d -- the f32 input is widened to
// f64, sigmoid(x) = 0.5 * tanh(x/2) + 0.5 is evaluated in double
// precision, and the result is rounded back to f32 on store.
.visible .entry matrix_sigmoid_f(
.param .u64 matrix_sigmoid_f_param_0,
.param .u64 matrix_sigmoid_f_param_1,
.param .u32 matrix_sigmoid_f_param_2
)
{
.reg .pred %p<4>;
.reg .f32 %f<3>;
.reg .b32 %r<17>;
.reg .f64 %fd<76>;
.reg .b64 %rd<10>;
ld.param.u64 %rd2, [matrix_sigmoid_f_param_0];
ld.param.u64 %rd3, [matrix_sigmoid_f_param_1];
ld.param.u32 %r4, [matrix_sigmoid_f_param_2];
// global thread index and bounds check
mov.u32 %r5, %ctaid.x;
mov.u32 %r6, %ntid.x;
mov.u32 %r7, %tid.x;
mad.lo.s32 %r1, %r6, %r5, %r7;
setp.ge.u32 %p1, %r1, %r4;
@%p1 bra BB91_5;
cvta.to.global.u64 %rd4, %rd2;
cvt.s64.s32 %rd1, %r1;
mul.wide.s32 %rd5, %r1, 4;
add.s64 %rd6, %rd4, %rd5;
ld.global.f32 %f1, [%rd6];
// widen to double; t = x * 0.5; fd2 = |t| (sign bit cleared)
cvt.f64.f32 %fd6, %f1;
mul.f64 %fd1, %fd6, 0d3FE0000000000000;
{
.reg .b32 %temp;
mov.b64 {%temp, %r2}, %fd1;
}
and.b32 %r3, %r2, 2147483647;
{
.reg .b32 %temp;
mov.b64 {%r8, %temp}, %fd1;
}
mov.b64 %fd2, {%r8, %r3};
// split at |t| < ~0.5556: small -> polynomial, large -> exp-based tanh
setp.ltu.f64 %p2, %fd2, 0d3FE1C7A398201CD6;
@%p2 bra BB91_3;
bra.uni BB91_2;
BB91_3:
// small-|t| path: odd tanh series, tanh(t) = t + t*s*poly(s), s = t^2
mul.f64 %fd52, %fd1, %fd1;
mov.f64 %fd53, 0dBF2B9093D89F0E23;
mov.f64 %fd54, 0d3F0ABFFC9B5786C4;
fma.rn.f64 %fd55, %fd54, %fd52, %fd53;
mov.f64 %fd56, 0d3F42FA2744C30B61;
fma.rn.f64 %fd57, %fd55, %fd52, %fd56;
mov.f64 %fd58, 0dBF57CF3B9C1E491D;
fma.rn.f64 %fd59, %fd57, %fd52, %fd58;
mov.f64 %fd60, 0d3F6D6C61D450119A;
fma.rn.f64 %fd61, %fd59, %fd52, %fd60;
mov.f64 %fd62, 0dBF8226DDD44294F5;
fma.rn.f64 %fd63, %fd61, %fd52, %fd62;
mov.f64 %fd64, 0d3F9664F45C2B04A6;
fma.rn.f64 %fd65, %fd63, %fd52, %fd64;
mov.f64 %fd66, 0dBFABA1BA1AD70754;
fma.rn.f64 %fd67, %fd65, %fd52, %fd66;
mov.f64 %fd68, 0d3FC111111110295E;
fma.rn.f64 %fd69, %fd67, %fd52, %fd68;
mov.f64 %fd70, 0dBFD555555555549F;
fma.rn.f64 %fd71, %fd69, %fd52, %fd70;
mul.f64 %fd72, %fd52, %fd71;
fma.rn.f64 %fd75, %fd72, %fd1, %fd1;
bra.uni BB91_4;
BB91_2:
// large-|t| path: tanh(|t|) = 1 - 2/(exp(2|t|) + 1); inline exp with
// Cody-Waite reduction (log2(e), +/-ln2 hi/lo), degree-11 polynomial,
// and 2^k scaling by exponent-field construction
add.f64 %fd9, %fd2, %fd2;
mov.f64 %fd10, 0d4338000000000000;
mov.f64 %fd11, 0d3FF71547652B82FE;
fma.rn.f64 %fd12, %fd9, %fd11, %fd10;
{
.reg .b32 %temp;
mov.b64 {%r9, %temp}, %fd12;
}
mov.f64 %fd13, 0dC338000000000000;
add.rn.f64 %fd14, %fd12, %fd13;
mov.f64 %fd15, 0dBFE62E42FEFA39EF;
fma.rn.f64 %fd16, %fd14, %fd15, %fd9;
mov.f64 %fd17, 0dBC7ABC9E3B39803F;
fma.rn.f64 %fd18, %fd14, %fd17, %fd16;
mov.f64 %fd19, 0d3E5AF86D8EBD13CD;
mov.f64 %fd20, 0d3E21F4076ACD15B6;
fma.rn.f64 %fd21, %fd20, %fd18, %fd19;
mov.f64 %fd22, 0d3E927E5092BA033D;
fma.rn.f64 %fd23, %fd21, %fd18, %fd22;
mov.f64 %fd24, 0d3EC71DDE6C5F9DA1;
fma.rn.f64 %fd25, %fd23, %fd18, %fd24;
mov.f64 %fd26, 0d3EFA01A018D034E6;
fma.rn.f64 %fd27, %fd25, %fd18, %fd26;
mov.f64 %fd28, 0d3F2A01A01B3B6940;
fma.rn.f64 %fd29, %fd27, %fd18, %fd28;
mov.f64 %fd30, 0d3F56C16C16C1B5DD;
fma.rn.f64 %fd31, %fd29, %fd18, %fd30;
mov.f64 %fd32, 0d3F8111111110F74D;
fma.rn.f64 %fd33, %fd31, %fd18, %fd32;
mov.f64 %fd34, 0d3FA555555555554D;
fma.rn.f64 %fd35, %fd33, %fd18, %fd34;
mov.f64 %fd36, 0d3FC5555555555557;
fma.rn.f64 %fd37, %fd35, %fd18, %fd36;
mov.f64 %fd38, 0d3FE0000000000000;
fma.rn.f64 %fd39, %fd37, %fd18, %fd38;
mul.f64 %fd40, %fd18, %fd39;
fma.rn.f64 %fd41, %fd40, %fd18, %fd18;
shl.b32 %r10, %r9, 20;
add.s32 %r11, %r10, 1072693248;
mov.u32 %r12, 0;
mov.b64 %fd42, {%r12, %r11};
fma.rn.f64 %fd43, %fd41, %fd42, %fd42;
// 1/(exp(2|t|) + 1) via rcp.approx + Newton-Raphson refinement
add.f64 %fd8, %fd43, 0d3FF0000000000000;
// inline asm
rcp.approx.ftz.f64 %fd7,%fd8;
// inline asm
neg.f64 %fd44, %fd8;
mov.f64 %fd45, 0d3FF0000000000000;
fma.rn.f64 %fd46, %fd44, %fd7, %fd45;
fma.rn.f64 %fd47, %fd46, %fd46, %fd46;
fma.rn.f64 %fd48, %fd47, %fd7, %fd7;
neg.f64 %fd49, %fd48;
mov.f64 %fd50, 0d4000000000000000;
fma.rn.f64 %fd51, %fd50, %fd49, %fd45;
// saturate tanh to 1.0 for very large |t| (high word > 0x403FFFFF)
setp.gt.u32 %p3, %r3, 1077936127;
selp.f64 %fd75, 0d3FF0000000000000, %fd51, %p3;
BB91_4:
// re-apply t's sign, sigmoid = 0.5*tanh + 0.5, round to f32 and store
cvta.to.global.u64 %rd7, %rd3;
and.b32 %r13, %r2, -2147483648;
{
.reg .b32 %temp;
mov.b64 {%temp, %r14}, %fd75;
}
or.b32 %r15, %r14, %r13;
{
.reg .b32 %temp;
mov.b64 {%r16, %temp}, %fd75;
}
mov.b64 %fd73, {%r16, %r15};
fma.rn.f64 %fd74, %fd73, 0d3FE0000000000000, 0d3FE0000000000000;
cvt.rn.f32.f64 %f2, %fd74;
shl.b64 %rd8, %rd1, 2;
add.s64 %rd9, %rd7, %rd8;
st.global.f32 [%rd9], %f2;
BB91_5:
ret;
}
// __internal_trig_reduction_slowpathd(x, quadrant_ptr):
// slow-path trigonometric argument reduction for large |x|.
//   param_0 = f64 angle x, param_1 = pointer to a 32-bit quadrant output.
// Returns the reduced angle as f64; stores the quadrant (top 2 bits of
// the fixed-point product, sign-adjusted) through param_1.
// Multiplies the 64-bit mantissa of x by chunks of 2/pi from the
// __cudart_i2opi_d table (declared in the file header) with 128-bit
// accumulation in local memory -- Payne-Hanek-style reduction.
// NOTE(review): constant decodes below are reviewer readings of the hex
// literals; confirm against the CUDA math-library sources if exactness
// matters.
.func (.param .b64 func_retval0) __internal_trig_reduction_slowpathd(
.param .b64 __internal_trig_reduction_slowpathd_param_0,
.param .b64 __internal_trig_reduction_slowpathd_param_1
)
{
.local .align 8 .b8 __local_depot92[40];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<9>;
.reg .b32 %r<42>;
.reg .f64 %fd<5>;
.reg .b64 %rd<101>;
mov.u64 %rd100, __local_depot92;
cvta.local.u64 %SP, %rd100;
ld.param.f64 %fd4, [__internal_trig_reduction_slowpathd_param_0];
ld.param.u64 %rd37, [__internal_trig_reduction_slowpathd_param_1];
add.u64 %rd38, %SP, 0;
cvta.to.local.u64 %rd1, %rd38;
// %r1 = high word of x; %r40 = its sign bit; %r4 = biased exponent
{
.reg .b32 %temp;
mov.b64 {%temp, %r1}, %fd4;
}
and.b32 %r40, %r1, -2147483648;
shr.u32 %r3, %r1, 20;
bfe.u32 %r4, %r1, 20, 11;
// exponent 2047 => inf/NaN: return the input unchanged
setp.eq.s32 %p1, %r4, 2047;
@%p1 bra BB92_13;
// select the window of 2/pi table entries relevant to x's exponent
add.s32 %r16, %r4, -1024;
shr.u32 %r17, %r16, 6;
mov.u32 %r18, 16;
sub.s32 %r5, %r18, %r17;
mov.u32 %r19, 19;
sub.s32 %r20, %r19, %r17;
mov.u32 %r21, 18;
min.s32 %r6, %r21, %r20;
setp.gt.s32 %p2, %r5, %r6;
mov.u64 %rd94, 0;
mov.u64 %rd93, %rd1;
@%p2 bra BB92_4;
// %rd3 = x's mantissa with the implicit bit, left-aligned in 64 bits
mov.b64 %rd41, %fd4;
shl.b64 %rd42, %rd41, 11;
or.b64 %rd3, %rd42, -9223372036854775808;
add.s32 %r7, %r5, -1;
mov.u64 %rd92, %rd1;
bfe.u32 %r22, %r1, 20, 11;
add.s32 %r23, %r22, -1024;
shr.u32 %r24, %r23, 6;
neg.s32 %r25, %r24;
mul.wide.s32 %rd43, %r25, 8;
mov.u64 %rd44, __cudart_i2opi_d;
add.s64 %rd45, %rd43, %rd44;
add.s64 %rd90, %rd45, 120;
mov.u64 %rd94, 0;
mov.u64 %rd91, %rd1;
mov.u32 %r39, %r7;
BB92_3:
// loop: 64x64->128-bit multiply-accumulate of mantissa * table chunk;
// low 64 bits go to the local buffer, high 64 carry into the next step
.pragma "nounroll";
mov.u32 %r8, %r39;
mov.u64 %rd7, %rd91;
ld.const.u64 %rd48, [%rd90];
// inline asm
{
.reg .u32 r0, r1, r2, r3, alo, ahi, blo, bhi, clo, chi;
mov.b64 {alo,ahi}, %rd48;
mov.b64 {blo,bhi}, %rd3;
mov.b64 {clo,chi}, %rd94;
mad.lo.cc.u32 r0, alo, blo, clo;
madc.hi.cc.u32 r1, alo, blo, chi;
madc.hi.u32 r2, alo, bhi, 0;
mad.lo.cc.u32 r1, alo, bhi, r1;
madc.hi.cc.u32 r2, ahi, blo, r2;
madc.hi.u32 r3, ahi, bhi, 0;
mad.lo.cc.u32 r1, ahi, blo, r1;
madc.lo.cc.u32 r2, ahi, bhi, r2;
addc.u32 r3, r3, 0;
mov.b64 %rd46, {r0,r1};
mov.b64 %rd94, {r2,r3};
}
// inline asm
st.local.u64 [%rd92], %rd46;
add.s32 %r9, %r8, 1;
sub.s32 %r26, %r9, %r7;
mul.wide.s32 %rd51, %r26, 8;
add.s64 %rd92, %rd1, %rd51;
add.s64 %rd13, %rd7, 8;
mov.u64 %rd93, %rd13;
add.s64 %rd90, %rd90, 8;
setp.lt.s32 %p3, %r9, %r6;
mov.u64 %rd91, %rd13;
mov.u32 %r39, %r9;
@%p3 bra BB92_3;
BB92_4:
// store final carry; load the two 64-bit words holding the result
st.local.u64 [%rd93], %rd94;
ld.local.u64 %rd95, [%rd1+16];
ld.local.u64 %rd96, [%rd1+24];
// align the 128-bit product by the within-word bit offset (exp mod 64)
and.b32 %r10, %r3, 63;
setp.eq.s32 %p4, %r10, 0;
@%p4 bra BB92_6;
mov.u32 %r27, 64;
sub.s32 %r28, %r27, %r10;
shl.b64 %rd52, %rd96, %r10;
shr.u64 %rd53, %rd95, %r28;
or.b64 %rd96, %rd52, %rd53;
shl.b64 %rd54, %rd95, %r10;
ld.local.u64 %rd55, [%rd1+8];
shr.u64 %rd56, %rd55, %r28;
or.b64 %rd95, %rd56, %rd54;
BB92_6:
// quadrant = top 2 bits (rounded by the next bit); negate for negative
// x; store the (possibly negated) quadrant through param_1
cvta.to.local.u64 %rd57, %rd37;
shr.u64 %rd58, %rd96, 62;
cvt.u32.u64 %r29, %rd58;
shr.u64 %rd59, %rd95, 62;
shl.b64 %rd60, %rd96, 2;
or.b64 %rd98, %rd60, %rd59;
shl.b64 %rd97, %rd95, 2;
shr.u64 %rd61, %rd96, 61;
cvt.u32.u64 %r30, %rd61;
and.b32 %r31, %r30, 1;
add.s32 %r32, %r31, %r29;
neg.s32 %r33, %r32;
setp.ne.s32 %p5, %r40, 0;
selp.b32 %r34, %r33, %r32, %p5;
st.local.u32 [%rd57], %r34;
// if the rounding bit was set, take the complement of the fraction
// (128-bit negate) and flip the result's sign
setp.eq.s32 %p6, %r31, 0;
@%p6 bra BB92_8;
mov.u64 %rd65, 0;
// inline asm
{
.reg .u32 r0, r1, r2, r3, a0, a1, a2, a3, b0, b1, b2, b3;
mov.b64 {a0,a1}, %rd65;
mov.b64 {a2,a3}, %rd65;
mov.b64 {b0,b1}, %rd97;
mov.b64 {b2,b3}, %rd98;
sub.cc.u32 r0, a0, b0;
subc.cc.u32 r1, a1, b1;
subc.cc.u32 r2, a2, b2;
subc.u32 r3, a3, b3;
mov.b64 %rd97, {r0,r1};
mov.b64 %rd98, {r2,r3};
}
// inline asm
xor.b32 %r40, %r40, -2147483648;
BB92_8:
// normalize: shift out leading zeros of the 128-bit fraction
clz.b64 %r41, %rd98;
setp.eq.s32 %p7, %r41, 0;
@%p7 bra BB92_10;
shl.b64 %rd68, %rd98, %r41;
mov.u32 %r35, 64;
sub.s32 %r36, %r35, %r41;
shr.u64 %rd69, %rd97, %r36;
or.b64 %rd98, %rd69, %rd68;
BB92_10:
// multiply the fraction by fixed-point pi/4
// (0xC90FDAA22168C235 = -3958705157555305931 ~= pi/4 * 2^64)
mov.u64 %rd73, -3958705157555305931;
// inline asm
{
.reg .u32 r0, r1, r2, r3, alo, ahi, blo, bhi;
mov.b64 {alo,ahi}, %rd98;
mov.b64 {blo,bhi}, %rd73;
mul.lo.u32 r0, alo, blo;
mul.hi.u32 r1, alo, blo;
mad.lo.cc.u32 r1, alo, bhi, r1;
madc.hi.u32 r2, alo, bhi, 0;
mad.lo.cc.u32 r1, ahi, blo, r1;
madc.hi.cc.u32 r2, ahi, blo, r2;
madc.hi.u32 r3, ahi, bhi, 0;
mad.lo.cc.u32 r2, ahi, bhi, r2;
addc.u32 r3, r3, 0;
mov.b64 %rd70, {r0,r1};
mov.b64 %rd99, {r2,r3};
}
// inline asm
// renormalize once if the product lost its leading bit
setp.lt.s64 %p8, %rd99, 1;
@%p8 bra BB92_12;
// inline asm
{
.reg .u32 r0, r1, r2, r3, a0, a1, a2, a3, b0, b1, b2, b3;
mov.b64 {a0,a1}, %rd70;
mov.b64 {a2,a3}, %rd99;
mov.b64 {b0,b1}, %rd70;
mov.b64 {b2,b3}, %rd99;
add.cc.u32 r0, a0, b0;
addc.cc.u32 r1, a1, b1;
addc.cc.u32 r2, a2, b2;
addc.u32 r3, a3, b3;
mov.b64 %rd74, {r0,r1};
mov.b64 %rd99, {r2,r3};
}
// inline asm
add.s32 %r41, %r41, 1;
BB92_12:
// assemble the f64 result: sign from %r40, biased exponent
// 1022 - shift count, rounded high bits of the product as mantissa
cvt.u64.u32 %rd80, %r40;
shl.b64 %rd81, %rd80, 32;
mov.u32 %r37, 1022;
sub.s32 %r38, %r37, %r41;
cvt.u64.u32 %rd82, %r38;
shl.b64 %rd83, %rd82, 52;
add.s64 %rd84, %rd99, 1;
shr.u64 %rd85, %rd84, 10;
add.s64 %rd86, %rd85, 1;
shr.u64 %rd87, %rd86, 1;
add.s64 %rd88, %rd87, %rd83;
or.b64 %rd89, %rd88, %rd81;
mov.b64 %fd4, %rd89;
BB92_13:
st.param.f64 [func_retval0+0], %fd4;
ret;
}
// __internal_accurate_pow(x, y): extended-precision pow helper.
//   param_0 = f64 base x, param_1 = f64 exponent y; returns f64 x^y.
// Computes ln(x) in double-double (head/tail) precision, multiplies by a
// truncated y (also head/tail), then evaluates exp of the product with a
// polynomial and exponent-field scaling, including overflow/underflow
// handling and a final low-order correction.
// NOTE(review): structure matches the CUDA math-library pow slow path;
// constant decodes below are reviewer readings of the hex literals.
.func (.param .b64 func_retval0) __internal_accurate_pow(
.param .b64 __internal_accurate_pow_param_0,
.param .b64 __internal_accurate_pow_param_1
)
{
.reg .pred %p<10>;
.reg .f32 %f<3>;
.reg .b32 %r<52>;
.reg .f64 %fd<134>;
ld.param.f64 %fd12, [__internal_accurate_pow_param_0];
ld.param.f64 %fd13, [__internal_accurate_pow_param_1];
// split x into high word %r49 / low word %r48
{
.reg .b32 %temp;
mov.b64 {%temp, %r49}, %fd12;
}
{
.reg .b32 %temp;
mov.b64 {%r48, %temp}, %fd12;
}
// subnormal x (biased exponent 0): scale by 2^54 (0d4350000000000000)
// and compensate the exponent by -54
shr.u32 %r50, %r49, 20;
setp.ne.s32 %p1, %r50, 0;
@%p1 bra BB93_2;
mul.f64 %fd14, %fd12, 0d4350000000000000;
{
.reg .b32 %temp;
mov.b64 {%temp, %r49}, %fd14;
}
{
.reg .b32 %temp;
mov.b64 {%r48, %temp}, %fd14;
}
shr.u32 %r16, %r49, 20;
add.s32 %r50, %r16, -54;
BB93_2:
// normalize the mantissa into [1, 2): force the exponent field to
// 0x3FF (1072693248); %r51 accumulates the true binary exponent
add.s32 %r51, %r50, -1023;
and.b32 %r17, %r49, -2146435073;
or.b32 %r18, %r17, 1072693248;
mov.b64 %fd132, {%r48, %r18};
// if the normalized mantissa is >= ~1.5 halve it (exponent -1) so the
// log argument stays near 1
setp.lt.u32 %p2, %r18, 1073127583;
@%p2 bra BB93_4;
{
.reg .b32 %temp;
mov.b64 {%r19, %temp}, %fd132;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r20}, %fd132;
}
add.s32 %r21, %r20, -1048576;
mov.b64 %fd132, {%r19, %r21};
add.s32 %r51, %r50, -1022;
BB93_4:
// ln(m) via the atanh identity: u = (m-1)/(m+1) computed with a refined
// reciprocal, then an odd polynomial in u^2, all carried in
// double-double (head %fd76 / tail %fd78) precision
add.f64 %fd16, %fd132, 0d3FF0000000000000;
// inline asm
rcp.approx.ftz.f64 %fd15,%fd16;
// inline asm
neg.f64 %fd17, %fd16;
mov.f64 %fd18, 0d3FF0000000000000;
fma.rn.f64 %fd19, %fd17, %fd15, %fd18;
fma.rn.f64 %fd20, %fd19, %fd19, %fd19;
fma.rn.f64 %fd21, %fd20, %fd15, %fd15;
add.f64 %fd22, %fd132, 0dBFF0000000000000;
mul.f64 %fd23, %fd22, %fd21;
fma.rn.f64 %fd24, %fd22, %fd21, %fd23;
mul.f64 %fd25, %fd24, %fd24;
mov.f64 %fd26, 0d3ED0F5D241AD3B5A;
mov.f64 %fd27, 0d3EB0F5FF7D2CAFE2;
fma.rn.f64 %fd28, %fd27, %fd25, %fd26;
mov.f64 %fd29, 0d3EF3B20A75488A3F;
fma.rn.f64 %fd30, %fd28, %fd25, %fd29;
mov.f64 %fd31, 0d3F1745CDE4FAECD5;
fma.rn.f64 %fd32, %fd30, %fd25, %fd31;
mov.f64 %fd33, 0d3F3C71C7258A578B;
fma.rn.f64 %fd34, %fd32, %fd25, %fd33;
mov.f64 %fd35, 0d3F6249249242B910;
fma.rn.f64 %fd36, %fd34, %fd25, %fd35;
mov.f64 %fd37, 0d3F89999999999DFB;
fma.rn.f64 %fd38, %fd36, %fd25, %fd37;
sub.f64 %fd39, %fd22, %fd24;
add.f64 %fd40, %fd39, %fd39;
neg.f64 %fd41, %fd24;
fma.rn.f64 %fd42, %fd41, %fd22, %fd40;
mul.f64 %fd43, %fd21, %fd42;
fma.rn.f64 %fd44, %fd25, %fd38, 0d3FB5555555555555;
mov.f64 %fd45, 0d3FB5555555555555;
sub.f64 %fd46, %fd45, %fd44;
fma.rn.f64 %fd47, %fd25, %fd38, %fd46;
add.f64 %fd48, %fd47, 0d0000000000000000;
add.f64 %fd49, %fd48, 0dBC46A4CB00B9E7B0;
add.f64 %fd50, %fd44, %fd49;
sub.f64 %fd51, %fd44, %fd50;
add.f64 %fd52, %fd49, %fd51;
// exact u^3 and u^2 products via fma-based error terms
mul.rn.f64 %fd53, %fd24, %fd24;
neg.f64 %fd54, %fd53;
fma.rn.f64 %fd55, %fd24, %fd24, %fd54;
{
.reg .b32 %temp;
mov.b64 {%r22, %temp}, %fd43;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r23}, %fd43;
}
add.s32 %r24, %r23, 1048576;
mov.b64 %fd56, {%r22, %r24};
fma.rn.f64 %fd57, %fd24, %fd56, %fd55;
mul.rn.f64 %fd58, %fd53, %fd24;
neg.f64 %fd59, %fd58;
fma.rn.f64 %fd60, %fd53, %fd24, %fd59;
fma.rn.f64 %fd61, %fd53, %fd43, %fd60;
fma.rn.f64 %fd62, %fd57, %fd24, %fd61;
mul.rn.f64 %fd63, %fd50, %fd58;
neg.f64 %fd64, %fd63;
fma.rn.f64 %fd65, %fd50, %fd58, %fd64;
fma.rn.f64 %fd66, %fd50, %fd62, %fd65;
fma.rn.f64 %fd67, %fd52, %fd58, %fd66;
// double-double accumulation of ln(m) = 2u + u^3*poly terms
add.f64 %fd68, %fd63, %fd67;
sub.f64 %fd69, %fd63, %fd68;
add.f64 %fd70, %fd67, %fd69;
add.f64 %fd71, %fd24, %fd68;
sub.f64 %fd72, %fd24, %fd71;
add.f64 %fd73, %fd68, %fd72;
add.f64 %fd74, %fd70, %fd73;
add.f64 %fd75, %fd43, %fd74;
add.f64 %fd76, %fd71, %fd75;
sub.f64 %fd77, %fd71, %fd76;
add.f64 %fd78, %fd75, %fd77;
// add k*ln2: exponent k converted to f64 via the 0x43300000 bias
// trick; ln2 split into 0d3FE62E42FEFA39EF (hi) + 0d3C7ABC9E3B39803F (lo)
xor.b32 %r25, %r51, -2147483648;
mov.u32 %r26, 1127219200;
mov.b64 %fd79, {%r25, %r26};
mov.u32 %r27, -2147483648;
mov.b64 %fd80, {%r27, %r26};
sub.f64 %fd81, %fd79, %fd80;
mov.f64 %fd82, 0d3FE62E42FEFA39EF;
fma.rn.f64 %fd83, %fd81, %fd82, %fd76;
neg.f64 %fd84, %fd81;
fma.rn.f64 %fd85, %fd84, %fd82, %fd83;
sub.f64 %fd86, %fd85, %fd76;
sub.f64 %fd87, %fd78, %fd86;
mov.f64 %fd88, 0d3C7ABC9E3B39803F;
fma.rn.f64 %fd89, %fd81, %fd88, %fd87;
add.f64 %fd90, %fd83, %fd89;
sub.f64 %fd91, %fd83, %fd90;
add.f64 %fd92, %fd89, %fd91;
// e = y * ln(x) in double-double; y's mantissa is truncated when |y| is
// large so the head product stays exact
{
.reg .b32 %temp;
mov.b64 {%temp, %r28}, %fd13;
}
add.s32 %r29, %r28, %r28;
setp.gt.u32 %p3, %r29, -33554433;
and.b32 %r30, %r28, -15728641;
selp.b32 %r31, %r30, %r28, %p3;
{
.reg .b32 %temp;
mov.b64 {%r32, %temp}, %fd13;
}
mov.b64 %fd93, {%r32, %r31};
mul.rn.f64 %fd94, %fd90, %fd93;
neg.f64 %fd95, %fd94;
fma.rn.f64 %fd96, %fd90, %fd93, %fd95;
fma.rn.f64 %fd97, %fd92, %fd93, %fd96;
add.f64 %fd4, %fd94, %fd97;
sub.f64 %fd98, %fd94, %fd4;
add.f64 %fd5, %fd97, %fd98;
// exp(e): Cody-Waite reduction (log2(e), 2^52*1.5 shifter, +/-ln2
// hi/lo), degree-11 polynomial, then 2^k scaling via the exponent field
mov.f64 %fd99, 0d4338000000000000;
mov.f64 %fd100, 0d3FF71547652B82FE;
fma.rn.f64 %fd101, %fd4, %fd100, %fd99;
{
.reg .b32 %temp;
mov.b64 {%r13, %temp}, %fd101;
}
mov.f64 %fd102, 0dC338000000000000;
add.rn.f64 %fd103, %fd101, %fd102;
mov.f64 %fd104, 0dBFE62E42FEFA39EF;
fma.rn.f64 %fd105, %fd103, %fd104, %fd4;
mov.f64 %fd106, 0dBC7ABC9E3B39803F;
fma.rn.f64 %fd107, %fd103, %fd106, %fd105;
mov.f64 %fd108, 0d3E928AF3FCA213EA;
mov.f64 %fd109, 0d3E5ADE1569CE2BDF;
fma.rn.f64 %fd110, %fd109, %fd107, %fd108;
mov.f64 %fd111, 0d3EC71DEE62401315;
fma.rn.f64 %fd112, %fd110, %fd107, %fd111;
mov.f64 %fd113, 0d3EFA01997C89EB71;
fma.rn.f64 %fd114, %fd112, %fd107, %fd113;
mov.f64 %fd115, 0d3F2A01A014761F65;
fma.rn.f64 %fd116, %fd114, %fd107, %fd115;
mov.f64 %fd117, 0d3F56C16C1852B7AF;
fma.rn.f64 %fd118, %fd116, %fd107, %fd117;
mov.f64 %fd119, 0d3F81111111122322;
fma.rn.f64 %fd120, %fd118, %fd107, %fd119;
mov.f64 %fd121, 0d3FA55555555502A1;
fma.rn.f64 %fd122, %fd120, %fd107, %fd121;
mov.f64 %fd123, 0d3FC5555555555511;
fma.rn.f64 %fd124, %fd122, %fd107, %fd123;
mov.f64 %fd125, 0d3FE000000000000B;
fma.rn.f64 %fd126, %fd124, %fd107, %fd125;
fma.rn.f64 %fd127, %fd126, %fd107, %fd18;
fma.rn.f64 %fd128, %fd127, %fd107, %fd18;
{
.reg .b32 %temp;
mov.b64 {%r14, %temp}, %fd128;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r15}, %fd128;
}
shl.b32 %r33, %r13, 20;
add.s32 %r34, %r15, %r33;
mov.b64 %fd133, {%r14, %r34};
// range check on |e| using the double's high word reinterpreted as f32
// (0f4086232B / 0f40874800 correspond to the exp overflow thresholds
// around |e| ~ 708..746 -- reviewer reading, confirm)
{
.reg .b32 %temp;
mov.b64 {%temp, %r35}, %fd4;
}
mov.b32 %f2, %r35;
abs.f32 %f1, %f2;
setp.lt.f32 %p4, %f1, 0f4086232B;
@%p4 bra BB93_7;
// out of primary range: e + inf gives +inf for e > 0; underflow to 0
// for e < 0
setp.lt.f64 %p5, %fd4, 0d0000000000000000;
add.f64 %fd129, %fd4, 0d7FF0000000000000;
selp.f64 %fd133, 0d0000000000000000, %fd129, %p5;
setp.geu.f32 %p6, %f1, 0f40874800;
@%p6 bra BB93_7;
// borderline range: split the 2^k scale into two halves to avoid
// intermediate overflow
shr.u32 %r36, %r13, 31;
add.s32 %r37, %r13, %r36;
shr.s32 %r38, %r37, 1;
shl.b32 %r39, %r38, 20;
add.s32 %r40, %r39, %r15;
mov.b64 %fd130, {%r14, %r40};
sub.s32 %r41, %r13, %r38;
shl.b32 %r42, %r41, 20;
add.s32 %r43, %r42, 1072693248;
mov.u32 %r44, 0;
mov.b64 %fd131, {%r44, %r43};
mul.f64 %fd133, %fd130, %fd131;
BB93_7:
// if the result is finite (not +/-inf: high word != 0x7FF00000 or low
// word != 0), apply the low-order correction r = r + r*e_lo
{
.reg .b32 %temp;
mov.b64 {%temp, %r45}, %fd133;
}
and.b32 %r46, %r45, 2147483647;
setp.ne.s32 %p7, %r46, 2146435072;
{
.reg .b32 %temp;
mov.b64 {%r47, %temp}, %fd133;
}
setp.ne.s32 %p8, %r47, 0;
or.pred %p9, %p8, %p7;
@!%p9 bra BB93_9;
bra.uni BB93_8;
BB93_8:
fma.rn.f64 %fd133, %fd133, %fd5, %fd133;
BB93_9:
st.param.f64 [func_retval0+0], %fd133;
ret;
}