kernels.SystemML.ptx Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of systemml Show documentation
Show all versions of systemml Show documentation
Declarative Machine Learning
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-21112126
// Cuda compilation tools, release 8.0, V8.0.43
// Based on LLVM 3.4svn
//
.version 5.0
.target sm_30
.address_size 64
// .globl copy_u2l_dense
.func (.param .b64 func_retval0) __internal_accurate_pow
(
.param .b64 __internal_accurate_pow_param_0,
.param .b64 __internal_accurate_pow_param_1
)
;
.extern .shared .align 8 .b8 sdata[];
.visible .entry copy_u2l_dense(
.param .u64 copy_u2l_dense_param_0,
.param .u32 copy_u2l_dense_param_1,
.param .u32 copy_u2l_dense_param_2
)
{
.reg .pred %p<4>;
.reg .b32 %r<13>;
.reg .f64 %fd<2>;
.reg .b64 %rd<7>;
ld.param.u64 %rd1, [copy_u2l_dense_param_0];
ld.param.u32 %r4, [copy_u2l_dense_param_1];
ld.param.u32 %r5, [copy_u2l_dense_param_2];
mov.u32 %r6, %ntid.x;
mov.u32 %r7, %ctaid.x;
mov.u32 %r8, %tid.x;
mad.lo.s32 %r1, %r6, %r7, %r8;
mov.u32 %r9, %ntid.y;
mov.u32 %r10, %ctaid.y;
mov.u32 %r11, %tid.y;
mad.lo.s32 %r2, %r9, %r10, %r11;
mad.lo.s32 %r3, %r2, %r4, %r1;
setp.gt.s32 %p1, %r2, %r1;
setp.lt.s32 %p2, %r3, %r5;
and.pred %p3, %p1, %p2;
@!%p3 bra BB0_2;
bra.uni BB0_1;
BB0_1:
cvta.to.global.u64 %rd2, %rd1;
mad.lo.s32 %r12, %r1, %r4, %r2;
mul.wide.s32 %rd3, %r12, 8;
add.s64 %rd4, %rd2, %rd3;
ld.global.f64 %fd1, [%rd4];
mul.wide.s32 %rd5, %r3, 8;
add.s64 %rd6, %rd2, %rd5;
st.global.f64 [%rd6], %fd1;
BB0_2:
ret;
}
// .globl relu
.visible .entry relu(
.param .u64 relu_param_0,
.param .u64 relu_param_1,
.param .u32 relu_param_2,
.param .u32 relu_param_3
)
{
.reg .pred %p<4>;
.reg .b32 %r<12>;
.reg .f64 %fd<4>;
.reg .b64 %rd<8>;
ld.param.u64 %rd1, [relu_param_0];
ld.param.u64 %rd2, [relu_param_1];
ld.param.u32 %r4, [relu_param_2];
ld.param.u32 %r3, [relu_param_3];
mov.u32 %r5, %ctaid.x;
mov.u32 %r6, %ntid.x;
mov.u32 %r7, %tid.x;
mad.lo.s32 %r1, %r6, %r5, %r7;
mov.u32 %r8, %ntid.y;
mov.u32 %r9, %ctaid.y;
mov.u32 %r10, %tid.y;
mad.lo.s32 %r2, %r8, %r9, %r10;
setp.lt.s32 %p1, %r1, %r4;
setp.lt.s32 %p2, %r2, %r3;
and.pred %p3, %p1, %p2;
@!%p3 bra BB1_2;
bra.uni BB1_1;
BB1_1:
cvta.to.global.u64 %rd3, %rd1;
mad.lo.s32 %r11, %r1, %r3, %r2;
mul.wide.s32 %rd4, %r11, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd1, [%rd5];
mov.f64 %fd2, 0d0000000000000000;
max.f64 %fd3, %fd2, %fd1;
cvta.to.global.u64 %rd6, %rd2;
add.s64 %rd7, %rd6, %rd4;
st.global.f64 [%rd7], %fd3;
BB1_2:
ret;
}
// .globl relu_backward
.visible .entry relu_backward(
.param .u64 relu_backward_param_0,
.param .u64 relu_backward_param_1,
.param .u64 relu_backward_param_2,
.param .u32 relu_backward_param_3,
.param .u32 relu_backward_param_4
)
{
.reg .pred %p<5>;
.reg .b32 %r<12>;
.reg .f64 %fd<6>;
.reg .b64 %rd<14>;
ld.param.u64 %rd2, [relu_backward_param_0];
ld.param.u64 %rd3, [relu_backward_param_1];
ld.param.u64 %rd4, [relu_backward_param_2];
ld.param.u32 %r4, [relu_backward_param_3];
ld.param.u32 %r3, [relu_backward_param_4];
mov.u32 %r5, %ntid.x;
mov.u32 %r6, %ctaid.x;
mov.u32 %r7, %tid.x;
mad.lo.s32 %r1, %r5, %r6, %r7;
mov.u32 %r8, %ntid.y;
mov.u32 %r9, %ctaid.y;
mov.u32 %r10, %tid.y;
mad.lo.s32 %r2, %r8, %r9, %r10;
setp.lt.s32 %p1, %r1, %r4;
setp.lt.s32 %p2, %r2, %r3;
and.pred %p3, %p1, %p2;
@!%p3 bra BB2_4;
bra.uni BB2_1;
BB2_1:
cvta.to.global.u64 %rd5, %rd2;
mad.lo.s32 %r11, %r1, %r3, %r2;
cvt.s64.s32 %rd1, %r11;
mul.wide.s32 %rd6, %r11, 8;
add.s64 %rd7, %rd5, %rd6;
ld.global.f64 %fd4, [%rd7];
mov.f64 %fd5, 0d0000000000000000;
setp.leu.f64 %p4, %fd4, 0d0000000000000000;
@%p4 bra BB2_3;
cvta.to.global.u64 %rd8, %rd3;
shl.b64 %rd9, %rd1, 3;
add.s64 %rd10, %rd8, %rd9;
ld.global.f64 %fd5, [%rd10];
BB2_3:
cvta.to.global.u64 %rd11, %rd4;
shl.b64 %rd12, %rd1, 3;
add.s64 %rd13, %rd11, %rd12;
st.global.f64 [%rd13], %fd5;
BB2_4:
ret;
}
// .globl bias_add
.visible .entry bias_add(
.param .u64 bias_add_param_0,
.param .u64 bias_add_param_1,
.param .u64 bias_add_param_2,
.param .u32 bias_add_param_3,
.param .u32 bias_add_param_4,
.param .u32 bias_add_param_5
)
{
.reg .pred %p<4>;
.reg .b32 %r<14>;
.reg .f64 %fd<4>;
.reg .b64 %rd<12>;
ld.param.u64 %rd1, [bias_add_param_0];
ld.param.u64 %rd2, [bias_add_param_1];
ld.param.u64 %rd3, [bias_add_param_2];
ld.param.u32 %r5, [bias_add_param_3];
ld.param.u32 %r3, [bias_add_param_4];
ld.param.u32 %r4, [bias_add_param_5];
mov.u32 %r6, %ctaid.x;
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %tid.x;
mad.lo.s32 %r1, %r7, %r6, %r8;
mov.u32 %r9, %ntid.y;
mov.u32 %r10, %ctaid.y;
mov.u32 %r11, %tid.y;
mad.lo.s32 %r2, %r9, %r10, %r11;
setp.lt.s32 %p1, %r1, %r5;
setp.lt.s32 %p2, %r2, %r3;
and.pred %p3, %p1, %p2;
@!%p3 bra BB3_2;
bra.uni BB3_1;
BB3_1:
cvta.to.global.u64 %rd4, %rd1;
mad.lo.s32 %r12, %r1, %r3, %r2;
mul.wide.s32 %rd5, %r12, 8;
add.s64 %rd6, %rd4, %rd5;
div.s32 %r13, %r2, %r4;
cvta.to.global.u64 %rd7, %rd2;
mul.wide.s32 %rd8, %r13, 8;
add.s64 %rd9, %rd7, %rd8;
ld.global.f64 %fd1, [%rd9];
ld.global.f64 %fd2, [%rd6];
add.f64 %fd3, %fd2, %fd1;
cvta.to.global.u64 %rd10, %rd3;
add.s64 %rd11, %rd10, %rd5;
st.global.f64 [%rd11], %fd3;
BB3_2:
ret;
}
// .globl daxpy_matrix_vector
.visible .entry daxpy_matrix_vector(
.param .u64 daxpy_matrix_vector_param_0,
.param .u64 daxpy_matrix_vector_param_1,
.param .f64 daxpy_matrix_vector_param_2,
.param .u64 daxpy_matrix_vector_param_3,
.param .u32 daxpy_matrix_vector_param_4,
.param .u32 daxpy_matrix_vector_param_5,
.param .u32 daxpy_matrix_vector_param_6,
.param .u32 daxpy_matrix_vector_param_7
)
{
.reg .pred %p<5>;
.reg .b32 %r<13>;
.reg .f64 %fd<7>;
.reg .b64 %rd<14>;
ld.param.u64 %rd3, [daxpy_matrix_vector_param_0];
ld.param.u64 %rd5, [daxpy_matrix_vector_param_1];
ld.param.f64 %fd2, [daxpy_matrix_vector_param_2];
ld.param.u64 %rd4, [daxpy_matrix_vector_param_3];
ld.param.u32 %r5, [daxpy_matrix_vector_param_4];
ld.param.u32 %r3, [daxpy_matrix_vector_param_5];
ld.param.u32 %r4, [daxpy_matrix_vector_param_6];
cvta.to.global.u64 %rd1, %rd5;
mov.u32 %r6, %ntid.x;
mov.u32 %r7, %ctaid.x;
mov.u32 %r8, %tid.x;
mad.lo.s32 %r1, %r6, %r7, %r8;
mov.u32 %r9, %ntid.y;
mov.u32 %r10, %ctaid.y;
mov.u32 %r11, %tid.y;
mad.lo.s32 %r2, %r9, %r10, %r11;
setp.lt.s32 %p1, %r1, %r5;
setp.lt.s32 %p2, %r2, %r3;
and.pred %p3, %p1, %p2;
@!%p3 bra BB4_4;
bra.uni BB4_1;
BB4_1:
cvta.to.global.u64 %rd6, %rd4;
mad.lo.s32 %r12, %r1, %r3, %r2;
cvta.to.global.u64 %rd7, %rd3;
mul.wide.s32 %rd8, %r12, 8;
add.s64 %rd9, %rd7, %rd8;
ld.global.f64 %fd1, [%rd9];
add.s64 %rd2, %rd6, %rd8;
setp.eq.s32 %p4, %r4, 1;
@%p4 bra BB4_3;
bra.uni BB4_2;
BB4_3:
mul.wide.s32 %rd12, %r2, 8;
add.s64 %rd13, %rd1, %rd12;
ld.global.f64 %fd5, [%rd13];
fma.rn.f64 %fd6, %fd5, %fd2, %fd1;
st.global.f64 [%rd2], %fd6;
bra.uni BB4_4;
BB4_2:
mul.wide.s32 %rd10, %r1, 8;
add.s64 %rd11, %rd1, %rd10;
ld.global.f64 %fd3, [%rd11];
fma.rn.f64 %fd4, %fd3, %fd2, %fd1;
st.global.f64 [%rd2], %fd4;
BB4_4:
ret;
}
// .globl bias_multiply
.visible .entry bias_multiply(
.param .u64 bias_multiply_param_0,
.param .u64 bias_multiply_param_1,
.param .u64 bias_multiply_param_2,
.param .u32 bias_multiply_param_3,
.param .u32 bias_multiply_param_4,
.param .u32 bias_multiply_param_5
)
{
.reg .pred %p<4>;
.reg .b32 %r<14>;
.reg .f64 %fd<4>;
.reg .b64 %rd<12>;
ld.param.u64 %rd1, [bias_multiply_param_0];
ld.param.u64 %rd2, [bias_multiply_param_1];
ld.param.u64 %rd3, [bias_multiply_param_2];
ld.param.u32 %r5, [bias_multiply_param_3];
ld.param.u32 %r3, [bias_multiply_param_4];
ld.param.u32 %r4, [bias_multiply_param_5];
mov.u32 %r6, %ctaid.x;
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %tid.x;
mad.lo.s32 %r1, %r7, %r6, %r8;
mov.u32 %r9, %ntid.y;
mov.u32 %r10, %ctaid.y;
mov.u32 %r11, %tid.y;
mad.lo.s32 %r2, %r9, %r10, %r11;
setp.lt.s32 %p1, %r1, %r5;
setp.lt.s32 %p2, %r2, %r3;
and.pred %p3, %p1, %p2;
@!%p3 bra BB5_2;
bra.uni BB5_1;
BB5_1:
cvta.to.global.u64 %rd4, %rd1;
mad.lo.s32 %r12, %r1, %r3, %r2;
mul.wide.s32 %rd5, %r12, 8;
add.s64 %rd6, %rd4, %rd5;
div.s32 %r13, %r2, %r4;
cvta.to.global.u64 %rd7, %rd2;
mul.wide.s32 %rd8, %r13, 8;
add.s64 %rd9, %rd7, %rd8;
ld.global.f64 %fd1, [%rd9];
ld.global.f64 %fd2, [%rd6];
mul.f64 %fd3, %fd2, %fd1;
cvta.to.global.u64 %rd10, %rd3;
add.s64 %rd11, %rd10, %rd5;
st.global.f64 [%rd11], %fd3;
BB5_2:
ret;
}
// .globl compare_and_set
.visible .entry compare_and_set(
.param .u64 compare_and_set_param_0,
.param .u64 compare_and_set_param_1,
.param .u32 compare_and_set_param_2,
.param .u32 compare_and_set_param_3,
.param .f64 compare_and_set_param_4,
.param .f64 compare_and_set_param_5,
.param .f64 compare_and_set_param_6,
.param .f64 compare_and_set_param_7,
.param .f64 compare_and_set_param_8
)
{
.reg .pred %p<6>;
.reg .b32 %r<12>;
.reg .f64 %fd<9>;
.reg .b64 %rd<8>;
ld.param.u64 %rd2, [compare_and_set_param_0];
ld.param.u64 %rd3, [compare_and_set_param_1];
ld.param.u32 %r2, [compare_and_set_param_2];
ld.param.u32 %r3, [compare_and_set_param_3];
ld.param.f64 %fd2, [compare_and_set_param_4];
ld.param.f64 %fd3, [compare_and_set_param_5];
ld.param.f64 %fd4, [compare_and_set_param_6];
ld.param.f64 %fd5, [compare_and_set_param_7];
ld.param.f64 %fd6, [compare_and_set_param_8];
mov.u32 %r4, %ctaid.x;
mov.u32 %r5, %ntid.x;
mov.u32 %r6, %tid.x;
mad.lo.s32 %r7, %r5, %r4, %r6;
mov.u32 %r8, %ntid.y;
mov.u32 %r9, %ctaid.y;
mov.u32 %r10, %tid.y;
mad.lo.s32 %r11, %r8, %r9, %r10;
mad.lo.s32 %r1, %r7, %r3, %r11;
setp.lt.s32 %p1, %r7, %r2;
setp.lt.s32 %p2, %r11, %r3;
and.pred %p3, %p1, %p2;
@!%p3 bra BB6_6;
bra.uni BB6_1;
BB6_1:
cvta.to.global.u64 %rd4, %rd2;
mul.wide.s32 %rd5, %r1, 8;
add.s64 %rd6, %rd4, %rd5;
ld.global.f64 %fd1, [%rd6];
sub.f64 %fd7, %fd1, %fd2;
abs.f64 %fd8, %fd7;
setp.lt.f64 %p4, %fd8, %fd3;
cvta.to.global.u64 %rd7, %rd3;
add.s64 %rd1, %rd7, %rd5;
@%p4 bra BB6_5;
bra.uni BB6_2;
BB6_5:
st.global.f64 [%rd1], %fd4;
bra.uni BB6_6;
BB6_2:
setp.lt.f64 %p5, %fd1, %fd2;
@%p5 bra BB6_4;
bra.uni BB6_3;
BB6_4:
st.global.f64 [%rd1], %fd5;
bra.uni BB6_6;
BB6_3:
st.global.f64 [%rd1], %fd6;
BB6_6:
ret;
}
// .globl matrix_matrix_cellwise_op
.visible .entry matrix_matrix_cellwise_op(
.param .u64 matrix_matrix_cellwise_op_param_0,
.param .u64 matrix_matrix_cellwise_op_param_1,
.param .u64 matrix_matrix_cellwise_op_param_2,
.param .u32 matrix_matrix_cellwise_op_param_3,
.param .u32 matrix_matrix_cellwise_op_param_4,
.param .u32 matrix_matrix_cellwise_op_param_5,
.param .u32 matrix_matrix_cellwise_op_param_6,
.param .u32 matrix_matrix_cellwise_op_param_7
)
{
.reg .pred %p<54>;
.reg .b32 %r<55>;
.reg .f64 %fd<39>;
.reg .b64 %rd<15>;
ld.param.u64 %rd2, [matrix_matrix_cellwise_op_param_0];
ld.param.u64 %rd3, [matrix_matrix_cellwise_op_param_1];
ld.param.u64 %rd4, [matrix_matrix_cellwise_op_param_2];
ld.param.u32 %r14, [matrix_matrix_cellwise_op_param_3];
ld.param.u32 %r10, [matrix_matrix_cellwise_op_param_4];
ld.param.u32 %r11, [matrix_matrix_cellwise_op_param_5];
ld.param.u32 %r12, [matrix_matrix_cellwise_op_param_6];
ld.param.u32 %r13, [matrix_matrix_cellwise_op_param_7];
mov.u32 %r15, %ntid.x;
mov.u32 %r16, %ctaid.x;
mov.u32 %r17, %tid.x;
mad.lo.s32 %r1, %r15, %r16, %r17;
mov.u32 %r18, %ntid.y;
mov.u32 %r19, %ctaid.y;
mov.u32 %r20, %tid.y;
mad.lo.s32 %r2, %r18, %r19, %r20;
setp.lt.s32 %p2, %r1, %r14;
setp.lt.s32 %p3, %r2, %r10;
and.pred %p4, %p2, %p3;
@!%p4 bra BB7_53;
bra.uni BB7_1;
BB7_1:
mad.lo.s32 %r3, %r1, %r10, %r2;
setp.eq.s32 %p5, %r11, 1;
mov.u32 %r53, %r1;
@%p5 bra BB7_5;
setp.ne.s32 %p6, %r11, 2;
mov.u32 %r54, %r3;
@%p6 bra BB7_4;
mov.u32 %r54, %r2;
BB7_4:
mov.u32 %r48, %r54;
mov.u32 %r4, %r48;
mov.u32 %r53, %r4;
BB7_5:
mov.u32 %r5, %r53;
setp.eq.s32 %p7, %r12, 1;
mov.u32 %r51, %r1;
@%p7 bra BB7_9;
setp.ne.s32 %p8, %r12, 2;
mov.u32 %r52, %r3;
@%p8 bra BB7_8;
mov.u32 %r52, %r2;
BB7_8:
mov.u32 %r51, %r52;
BB7_9:
cvta.to.global.u64 %rd5, %rd3;
cvta.to.global.u64 %rd6, %rd2;
mul.wide.s32 %rd7, %r5, 8;
add.s64 %rd8, %rd6, %rd7;
ld.global.f64 %fd1, [%rd8];
mul.wide.s32 %rd9, %r51, 8;
add.s64 %rd10, %rd5, %rd9;
ld.global.f64 %fd2, [%rd10];
mov.f64 %fd38, 0d7FEFFFFFFFFFFFFF;
setp.gt.s32 %p9, %r13, 5;
@%p9 bra BB7_19;
setp.gt.s32 %p19, %r13, 2;
@%p19 bra BB7_15;
setp.eq.s32 %p23, %r13, 0;
@%p23 bra BB7_51;
setp.eq.s32 %p24, %r13, 1;
@%p24 bra BB7_50;
bra.uni BB7_13;
BB7_50:
sub.f64 %fd38, %fd1, %fd2;
bra.uni BB7_52;
BB7_19:
setp.gt.s32 %p10, %r13, 8;
@%p10 bra BB7_24;
setp.eq.s32 %p16, %r13, 6;
@%p16 bra BB7_34;
setp.eq.s32 %p17, %r13, 7;
@%p17 bra BB7_33;
bra.uni BB7_22;
BB7_33:
setp.gt.f64 %p29, %fd1, %fd2;
selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p29;
bra.uni BB7_52;
BB7_15:
setp.eq.s32 %p20, %r13, 3;
@%p20 bra BB7_49;
setp.eq.s32 %p21, %r13, 4;
@%p21 bra BB7_35;
bra.uni BB7_17;
BB7_35:
{
.reg .b32 %temp;
mov.b64 {%temp, %r8}, %fd1;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r9}, %fd2;
}
bfe.u32 %r21, %r9, 20, 11;
add.s32 %r22, %r21, -1012;
mov.b64 %rd11, %fd2;
shl.b64 %rd1, %rd11, %r22;
setp.eq.s64 %p32, %rd1, -9223372036854775808;
abs.f64 %fd11, %fd1;
// Callseq Start 0
{
.reg .b32 temp_param_reg;
// }
.param .b64 param0;
st.param.f64 [param0+0], %fd11;
.param .b64 param1;
st.param.f64 [param1+0], %fd2;
.param .b64 retval0;
call.uni (retval0),
__internal_accurate_pow,
(
param0,
param1
);
ld.param.f64 %fd37, [retval0+0];
//{
}// Callseq End 0
setp.lt.s32 %p33, %r8, 0;
and.pred %p1, %p33, %p32;
@!%p1 bra BB7_37;
bra.uni BB7_36;
BB7_36:
{
.reg .b32 %temp;
mov.b64 {%temp, %r23}, %fd37;
}
xor.b32 %r24, %r23, -2147483648;
{
.reg .b32 %temp;
mov.b64 {%r25, %temp}, %fd37;
}
mov.b64 %fd37, {%r25, %r24};
BB7_37:
mov.f64 %fd36, %fd37;
setp.eq.f64 %p34, %fd1, 0d0000000000000000;
@%p34 bra BB7_40;
bra.uni BB7_38;
BB7_40:
selp.b32 %r26, %r8, 0, %p32;
or.b32 %r27, %r26, 2146435072;
setp.lt.s32 %p38, %r9, 0;
selp.b32 %r28, %r27, %r26, %p38;
mov.u32 %r29, 0;
mov.b64 %fd36, {%r29, %r28};
bra.uni BB7_41;
BB7_24:
setp.gt.s32 %p11, %r13, 10;
@%p11 bra BB7_28;
setp.eq.s32 %p14, %r13, 9;
@%p14 bra BB7_32;
bra.uni BB7_26;
BB7_32:
setp.eq.f64 %p27, %fd1, %fd2;
selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p27;
bra.uni BB7_52;
BB7_28:
setp.eq.s32 %p12, %r13, 11;
@%p12 bra BB7_31;
bra.uni BB7_29;
BB7_31:
min.f64 %fd38, %fd1, %fd2;
bra.uni BB7_52;
BB7_51:
add.f64 %fd38, %fd1, %fd2;
bra.uni BB7_52;
BB7_13:
setp.eq.s32 %p25, %r13, 2;
@%p25 bra BB7_14;
bra.uni BB7_52;
BB7_14:
mul.f64 %fd38, %fd1, %fd2;
bra.uni BB7_52;
BB7_34:
setp.le.f64 %p30, %fd1, %fd2;
selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p30;
bra.uni BB7_52;
BB7_22:
setp.eq.s32 %p18, %r13, 8;
@%p18 bra BB7_23;
bra.uni BB7_52;
BB7_23:
setp.ge.f64 %p28, %fd1, %fd2;
selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p28;
bra.uni BB7_52;
BB7_49:
div.rn.f64 %fd38, %fd1, %fd2;
bra.uni BB7_52;
BB7_17:
setp.eq.s32 %p22, %r13, 5;
@%p22 bra BB7_18;
bra.uni BB7_52;
BB7_18:
setp.lt.f64 %p31, %fd1, %fd2;
selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p31;
bra.uni BB7_52;
BB7_26:
setp.eq.s32 %p15, %r13, 10;
@%p15 bra BB7_27;
bra.uni BB7_52;
BB7_27:
setp.neu.f64 %p26, %fd1, %fd2;
selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p26;
bra.uni BB7_52;
BB7_29:
setp.ne.s32 %p13, %r13, 12;
@%p13 bra BB7_52;
max.f64 %fd38, %fd1, %fd2;
bra.uni BB7_52;
BB7_38:
setp.gt.s32 %p35, %r8, -1;
@%p35 bra BB7_41;
cvt.rzi.f64.f64 %fd29, %fd2;
setp.neu.f64 %p36, %fd29, %fd2;
selp.f64 %fd36, 0dFFF8000000000000, %fd36, %p36;
BB7_41:
mov.f64 %fd17, %fd36;
add.f64 %fd18, %fd1, %fd2;
{
.reg .b32 %temp;
mov.b64 {%temp, %r30}, %fd18;
}
and.b32 %r31, %r30, 2146435072;
setp.ne.s32 %p39, %r31, 2146435072;
mov.f64 %fd35, %fd17;
@%p39 bra BB7_48;
setp.gtu.f64 %p40, %fd11, 0d7FF0000000000000;
mov.f64 %fd35, %fd18;
@%p40 bra BB7_48;
abs.f64 %fd30, %fd2;
setp.gtu.f64 %p41, %fd30, 0d7FF0000000000000;
mov.f64 %fd34, %fd18;
mov.f64 %fd35, %fd34;
@%p41 bra BB7_48;
{
.reg .b32 %temp;
mov.b64 {%r32, %temp}, %fd2;
}
and.b32 %r33, %r9, 2147483647;
setp.eq.s32 %p42, %r33, 2146435072;
setp.eq.s32 %p43, %r32, 0;
and.pred %p44, %p42, %p43;
@%p44 bra BB7_47;
bra.uni BB7_45;
BB7_47:
setp.gt.f64 %p48, %fd11, 0d3FF0000000000000;
selp.b32 %r41, 2146435072, 0, %p48;
xor.b32 %r42, %r41, 2146435072;
setp.lt.s32 %p49, %r9, 0;
selp.b32 %r43, %r42, %r41, %p49;
setp.eq.f64 %p50, %fd1, 0dBFF0000000000000;
selp.b32 %r44, 1072693248, %r43, %p50;
mov.u32 %r45, 0;
mov.b64 %fd35, {%r45, %r44};
bra.uni BB7_48;
BB7_45:
{
.reg .b32 %temp;
mov.b64 {%r34, %temp}, %fd1;
}
and.b32 %r35, %r8, 2147483647;
setp.eq.s32 %p45, %r35, 2146435072;
setp.eq.s32 %p46, %r34, 0;
and.pred %p47, %p45, %p46;
mov.f64 %fd35, %fd17;
@!%p47 bra BB7_48;
bra.uni BB7_46;
BB7_46:
shr.s32 %r36, %r9, 31;
and.b32 %r37, %r36, -2146435072;
selp.b32 %r38, -1048576, 2146435072, %p1;
add.s32 %r39, %r38, %r37;
mov.u32 %r40, 0;
mov.b64 %fd35, {%r40, %r39};
BB7_48:
setp.eq.f64 %p51, %fd2, 0d0000000000000000;
setp.eq.f64 %p52, %fd1, 0d3FF0000000000000;
or.pred %p53, %p52, %p51;
selp.f64 %fd38, 0d3FF0000000000000, %fd35, %p53;
BB7_52:
cvta.to.global.u64 %rd12, %rd4;
mul.wide.s32 %rd13, %r3, 8;
add.s64 %rd14, %rd12, %rd13;
st.global.f64 [%rd14], %fd38;
bar.sync 0;
BB7_53:
ret;
}
// .globl matrix_scalar_op
.visible .entry matrix_scalar_op(
.param .u64 matrix_scalar_op_param_0,
.param .f64 matrix_scalar_op_param_1,
.param .u64 matrix_scalar_op_param_2,
.param .u32 matrix_scalar_op_param_3,
.param .u32 matrix_scalar_op_param_4,
.param .u32 matrix_scalar_op_param_5
)
{
.reg .pred %p<95>;
.reg .b32 %r<62>;
.reg .f64 %fd<75>;
.reg .b64 %rd<12>;
ld.param.u64 %rd4, [matrix_scalar_op_param_0];
ld.param.f64 %fd52, [matrix_scalar_op_param_1];
ld.param.u64 %rd5, [matrix_scalar_op_param_2];
ld.param.u32 %r8, [matrix_scalar_op_param_3];
ld.param.u32 %r6, [matrix_scalar_op_param_4];
ld.param.u32 %r7, [matrix_scalar_op_param_5];
mov.u32 %r9, %ctaid.x;
mov.u32 %r10, %ntid.x;
mov.u32 %r11, %tid.x;
mad.lo.s32 %r1, %r10, %r9, %r11;
setp.ge.s32 %p3, %r1, %r8;
@%p3 bra BB8_90;
cvta.to.global.u64 %rd6, %rd5;
cvta.to.global.u64 %rd7, %rd4;
mul.wide.s32 %rd8, %r1, 8;
add.s64 %rd9, %rd7, %rd8;
ld.global.f64 %fd1, [%rd9];
add.s64 %rd1, %rd6, %rd8;
setp.eq.s32 %p4, %r7, 0;
@%p4 bra BB8_46;
mov.f64 %fd66, 0d7FEFFFFFFFFFFFFF;
setp.gt.s32 %p5, %r6, 5;
@%p5 bra BB8_12;
setp.gt.s32 %p15, %r6, 2;
@%p15 bra BB8_8;
setp.eq.s32 %p19, %r6, 0;
@%p19 bra BB8_44;
setp.eq.s32 %p20, %r6, 1;
@%p20 bra BB8_43;
bra.uni BB8_6;
BB8_43:
sub.f64 %fd66, %fd52, %fd1;
bra.uni BB8_45;
BB8_46:
mov.f64 %fd74, 0d7FEFFFFFFFFFFFFF;
setp.gt.s32 %p50, %r6, 5;
@%p50 bra BB8_56;
setp.gt.s32 %p60, %r6, 2;
@%p60 bra BB8_52;
setp.eq.s32 %p64, %r6, 0;
@%p64 bra BB8_88;
setp.eq.s32 %p65, %r6, 1;
@%p65 bra BB8_87;
bra.uni BB8_50;
BB8_87:
sub.f64 %fd74, %fd1, %fd52;
bra.uni BB8_89;
BB8_12:
setp.gt.s32 %p6, %r6, 8;
@%p6 bra BB8_17;
setp.eq.s32 %p12, %r6, 6;
@%p12 bra BB8_27;
setp.eq.s32 %p13, %r6, 7;
@%p13 bra BB8_26;
bra.uni BB8_15;
BB8_26:
setp.lt.f64 %p25, %fd1, %fd52;
selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p25;
bra.uni BB8_45;
BB8_56:
setp.gt.s32 %p51, %r6, 8;
@%p51 bra BB8_61;
setp.eq.s32 %p57, %r6, 6;
@%p57 bra BB8_71;
setp.eq.s32 %p58, %r6, 7;
@%p58 bra BB8_70;
bra.uni BB8_59;
BB8_70:
setp.gt.f64 %p70, %fd1, %fd52;
selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p70;
bra.uni BB8_89;
BB8_8:
setp.eq.s32 %p16, %r6, 3;
@%p16 bra BB8_42;
setp.eq.s32 %p17, %r6, 4;
@%p17 bra BB8_28;
bra.uni BB8_10;
BB8_28:
{
.reg .b32 %temp;
mov.b64 {%temp, %r2}, %fd52;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r3}, %fd1;
}
bfe.u32 %r12, %r3, 20, 11;
add.s32 %r13, %r12, -1012;
mov.b64 %rd10, %fd1;
shl.b64 %rd2, %rd10, %r13;
setp.eq.s64 %p28, %rd2, -9223372036854775808;
abs.f64 %fd10, %fd52;
// Callseq Start 1
{
.reg .b32 temp_param_reg;
// }
.param .b64 param0;
st.param.f64 [param0+0], %fd10;
.param .b64 param1;
st.param.f64 [param1+0], %fd1;
.param .b64 retval0;
call.uni (retval0),
__internal_accurate_pow,
(
param0,
param1
);
ld.param.f64 %fd65, [retval0+0];
//{
}// Callseq End 1
setp.lt.s32 %p29, %r2, 0;
and.pred %p1, %p29, %p28;
@!%p1 bra BB8_30;
bra.uni BB8_29;
BB8_29:
{
.reg .b32 %temp;
mov.b64 {%temp, %r14}, %fd65;
}
xor.b32 %r15, %r14, -2147483648;
{
.reg .b32 %temp;
mov.b64 {%r16, %temp}, %fd65;
}
mov.b64 %fd65, {%r16, %r15};
BB8_30:
mov.f64 %fd64, %fd65;
setp.eq.f64 %p30, %fd52, 0d0000000000000000;
@%p30 bra BB8_33;
bra.uni BB8_31;
BB8_33:
selp.b32 %r17, %r2, 0, %p28;
or.b32 %r18, %r17, 2146435072;
setp.lt.s32 %p34, %r3, 0;
selp.b32 %r19, %r18, %r17, %p34;
mov.u32 %r20, 0;
mov.b64 %fd64, {%r20, %r19};
bra.uni BB8_34;
BB8_17:
setp.gt.s32 %p7, %r6, 10;
@%p7 bra BB8_21;
setp.eq.s32 %p10, %r6, 9;
@%p10 bra BB8_25;
bra.uni BB8_19;
BB8_25:
setp.eq.f64 %p23, %fd1, %fd52;
selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p23;
bra.uni BB8_45;
BB8_52:
setp.eq.s32 %p61, %r6, 3;
@%p61 bra BB8_86;
setp.eq.s32 %p62, %r6, 4;
@%p62 bra BB8_72;
bra.uni BB8_54;
BB8_72:
{
.reg .b32 %temp;
mov.b64 {%temp, %r4}, %fd1;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r5}, %fd52;
}
bfe.u32 %r37, %r5, 20, 11;
add.s32 %r38, %r37, -1012;
mov.b64 %rd11, %fd52;
shl.b64 %rd3, %rd11, %r38;
setp.eq.s64 %p73, %rd3, -9223372036854775808;
abs.f64 %fd35, %fd1;
// Callseq Start 2
{
.reg .b32 temp_param_reg;
// }
.param .b64 param0;
st.param.f64 [param0+0], %fd35;
.param .b64 param1;
st.param.f64 [param1+0], %fd52;
.param .b64 retval0;
call.uni (retval0),
__internal_accurate_pow,
(
param0,
param1
);
ld.param.f64 %fd73, [retval0+0];
//{
}// Callseq End 2
setp.lt.s32 %p74, %r4, 0;
and.pred %p2, %p74, %p73;
@!%p2 bra BB8_74;
bra.uni BB8_73;
BB8_73:
{
.reg .b32 %temp;
mov.b64 {%temp, %r39}, %fd73;
}
xor.b32 %r40, %r39, -2147483648;
{
.reg .b32 %temp;
mov.b64 {%r41, %temp}, %fd73;
}
mov.b64 %fd73, {%r41, %r40};
BB8_74:
mov.f64 %fd72, %fd73;
setp.eq.f64 %p75, %fd1, 0d0000000000000000;
@%p75 bra BB8_77;
bra.uni BB8_75;
BB8_77:
selp.b32 %r42, %r4, 0, %p73;
or.b32 %r43, %r42, 2146435072;
setp.lt.s32 %p79, %r5, 0;
selp.b32 %r44, %r43, %r42, %p79;
mov.u32 %r45, 0;
mov.b64 %fd72, {%r45, %r44};
bra.uni BB8_78;
BB8_61:
setp.gt.s32 %p52, %r6, 10;
@%p52 bra BB8_65;
setp.eq.s32 %p55, %r6, 9;
@%p55 bra BB8_69;
bra.uni BB8_63;
BB8_69:
setp.eq.f64 %p68, %fd1, %fd52;
selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p68;
bra.uni BB8_89;
BB8_21:
setp.eq.s32 %p8, %r6, 11;
@%p8 bra BB8_24;
bra.uni BB8_22;
BB8_24:
min.f64 %fd66, %fd52, %fd1;
bra.uni BB8_45;
BB8_44:
add.f64 %fd66, %fd1, %fd52;
bra.uni BB8_45;
BB8_6:
setp.eq.s32 %p21, %r6, 2;
@%p21 bra BB8_7;
bra.uni BB8_45;
BB8_7:
mul.f64 %fd66, %fd1, %fd52;
bra.uni BB8_45;
BB8_27:
setp.ge.f64 %p26, %fd1, %fd52;
selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p26;
bra.uni BB8_45;
BB8_15:
setp.eq.s32 %p14, %r6, 8;
@%p14 bra BB8_16;
bra.uni BB8_45;
BB8_16:
setp.le.f64 %p24, %fd1, %fd52;
selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p24;
bra.uni BB8_45;
BB8_42:
div.rn.f64 %fd66, %fd52, %fd1;
bra.uni BB8_45;
BB8_10:
setp.eq.s32 %p18, %r6, 5;
@%p18 bra BB8_11;
bra.uni BB8_45;
BB8_11:
setp.gt.f64 %p27, %fd1, %fd52;
selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p27;
bra.uni BB8_45;
BB8_65:
setp.eq.s32 %p53, %r6, 11;
@%p53 bra BB8_68;
bra.uni BB8_66;
BB8_68:
min.f64 %fd74, %fd1, %fd52;
bra.uni BB8_89;
BB8_19:
setp.eq.s32 %p11, %r6, 10;
@%p11 bra BB8_20;
bra.uni BB8_45;
BB8_20:
setp.neu.f64 %p22, %fd1, %fd52;
selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p22;
bra.uni BB8_45;
BB8_22:
setp.ne.s32 %p9, %r6, 12;
@%p9 bra BB8_45;
max.f64 %fd66, %fd52, %fd1;
bra.uni BB8_45;
BB8_88:
add.f64 %fd74, %fd1, %fd52;
bra.uni BB8_89;
BB8_50:
setp.eq.s32 %p66, %r6, 2;
@%p66 bra BB8_51;
bra.uni BB8_89;
BB8_51:
mul.f64 %fd74, %fd1, %fd52;
bra.uni BB8_89;
BB8_71:
setp.le.f64 %p71, %fd1, %fd52;
selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p71;
bra.uni BB8_89;
BB8_59:
setp.eq.s32 %p59, %r6, 8;
@%p59 bra BB8_60;
bra.uni BB8_89;
BB8_60:
setp.ge.f64 %p69, %fd1, %fd52;
selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p69;
bra.uni BB8_89;
BB8_86:
div.rn.f64 %fd74, %fd1, %fd52;
bra.uni BB8_89;
BB8_54:
setp.eq.s32 %p63, %r6, 5;
@%p63 bra BB8_55;
bra.uni BB8_89;
BB8_55:
setp.lt.f64 %p72, %fd1, %fd52;
selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p72;
bra.uni BB8_89;
BB8_63:
setp.eq.s32 %p56, %r6, 10;
@%p56 bra BB8_64;
bra.uni BB8_89;
BB8_64:
setp.neu.f64 %p67, %fd1, %fd52;
selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p67;
bra.uni BB8_89;
BB8_66:
setp.ne.s32 %p54, %r6, 12;
@%p54 bra BB8_89;
max.f64 %fd74, %fd1, %fd52;
bra.uni BB8_89;
BB8_31:
setp.gt.s32 %p31, %r2, -1;
@%p31 bra BB8_34;
cvt.rzi.f64.f64 %fd54, %fd1;
setp.neu.f64 %p32, %fd54, %fd1;
selp.f64 %fd64, 0dFFF8000000000000, %fd64, %p32;
BB8_34:
mov.f64 %fd16, %fd64;
add.f64 %fd17, %fd1, %fd52;
{
.reg .b32 %temp;
mov.b64 {%temp, %r21}, %fd17;
}
and.b32 %r22, %r21, 2146435072;
setp.ne.s32 %p35, %r22, 2146435072;
mov.f64 %fd63, %fd16;
@%p35 bra BB8_41;
setp.gtu.f64 %p36, %fd10, 0d7FF0000000000000;
mov.f64 %fd63, %fd17;
@%p36 bra BB8_41;
abs.f64 %fd55, %fd1;
setp.gtu.f64 %p37, %fd55, 0d7FF0000000000000;
mov.f64 %fd62, %fd17;
mov.f64 %fd63, %fd62;
@%p37 bra BB8_41;
{
.reg .b32 %temp;
mov.b64 {%r23, %temp}, %fd1;
}
and.b32 %r24, %r3, 2147483647;
setp.eq.s32 %p38, %r24, 2146435072;
setp.eq.s32 %p39, %r23, 0;
and.pred %p40, %p38, %p39;
@%p40 bra BB8_40;
bra.uni BB8_38;
BB8_40:
setp.gt.f64 %p44, %fd10, 0d3FF0000000000000;
selp.b32 %r32, 2146435072, 0, %p44;
xor.b32 %r33, %r32, 2146435072;
setp.lt.s32 %p45, %r3, 0;
selp.b32 %r34, %r33, %r32, %p45;
setp.eq.f64 %p46, %fd52, 0dBFF0000000000000;
selp.b32 %r35, 1072693248, %r34, %p46;
mov.u32 %r36, 0;
mov.b64 %fd63, {%r36, %r35};
bra.uni BB8_41;
BB8_75:
setp.gt.s32 %p76, %r4, -1;
@%p76 bra BB8_78;
cvt.rzi.f64.f64 %fd57, %fd52;
setp.neu.f64 %p77, %fd57, %fd52;
selp.f64 %fd72, 0dFFF8000000000000, %fd72, %p77;
BB8_78:
mov.f64 %fd41, %fd72;
add.f64 %fd42, %fd1, %fd52;
{
.reg .b32 %temp;
mov.b64 {%temp, %r46}, %fd42;
}
and.b32 %r47, %r46, 2146435072;
setp.ne.s32 %p80, %r47, 2146435072;
mov.f64 %fd71, %fd41;
@%p80 bra BB8_85;
setp.gtu.f64 %p81, %fd35, 0d7FF0000000000000;
mov.f64 %fd71, %fd42;
@%p81 bra BB8_85;
abs.f64 %fd58, %fd52;
setp.gtu.f64 %p82, %fd58, 0d7FF0000000000000;
mov.f64 %fd70, %fd42;
mov.f64 %fd71, %fd70;
@%p82 bra BB8_85;
{
.reg .b32 %temp;
mov.b64 {%r48, %temp}, %fd52;
}
and.b32 %r49, %r5, 2147483647;
setp.eq.s32 %p83, %r49, 2146435072;
setp.eq.s32 %p84, %r48, 0;
and.pred %p85, %p83, %p84;
@%p85 bra BB8_84;
bra.uni BB8_82;
BB8_84:
setp.gt.f64 %p89, %fd35, 0d3FF0000000000000;
selp.b32 %r57, 2146435072, 0, %p89;
xor.b32 %r58, %r57, 2146435072;
setp.lt.s32 %p90, %r5, 0;
selp.b32 %r59, %r58, %r57, %p90;
setp.eq.f64 %p91, %fd1, 0dBFF0000000000000;
selp.b32 %r60, 1072693248, %r59, %p91;
mov.u32 %r61, 0;
mov.b64 %fd71, {%r61, %r60};
bra.uni BB8_85;
BB8_38:
{
.reg .b32 %temp;
mov.b64 {%r25, %temp}, %fd52;
}
and.b32 %r26, %r2, 2147483647;
setp.eq.s32 %p41, %r26, 2146435072;
setp.eq.s32 %p42, %r25, 0;
and.pred %p43, %p41, %p42;
mov.f64 %fd63, %fd16;
@!%p43 bra BB8_41;
bra.uni BB8_39;
BB8_39:
shr.s32 %r27, %r3, 31;
and.b32 %r28, %r27, -2146435072;
selp.b32 %r29, -1048576, 2146435072, %p1;
add.s32 %r30, %r29, %r28;
mov.u32 %r31, 0;
mov.b64 %fd63, {%r31, %r30};
BB8_41:
setp.eq.f64 %p47, %fd1, 0d0000000000000000;
setp.eq.f64 %p48, %fd52, 0d3FF0000000000000;
or.pred %p49, %p48, %p47;
selp.f64 %fd66, 0d3FF0000000000000, %fd63, %p49;
BB8_45:
st.global.f64 [%rd1], %fd66;
bra.uni BB8_90;
BB8_82:
{
.reg .b32 %temp;
mov.b64 {%r50, %temp}, %fd1;
}
and.b32 %r51, %r4, 2147483647;
setp.eq.s32 %p86, %r51, 2146435072;
setp.eq.s32 %p87, %r50, 0;
and.pred %p88, %p86, %p87;
mov.f64 %fd71, %fd41;
@!%p88 bra BB8_85;
bra.uni BB8_83;
BB8_83:
shr.s32 %r52, %r5, 31;
and.b32 %r53, %r52, -2146435072;
selp.b32 %r54, -1048576, 2146435072, %p2;
add.s32 %r55, %r54, %r53;
mov.u32 %r56, 0;
mov.b64 %fd71, {%r56, %r55};
BB8_85:
setp.eq.f64 %p92, %fd52, 0d0000000000000000;
setp.eq.f64 %p93, %fd1, 0d3FF0000000000000;
or.pred %p94, %p93, %p92;
selp.f64 %fd74, 0d3FF0000000000000, %fd71, %p94;
BB8_89:
st.global.f64 [%rd1], %fd74;
BB8_90:
bar.sync 0;
ret;
}
// .globl fill
.visible .entry fill(
.param .u64 fill_param_0,
.param .f64 fill_param_1,
.param .u32 fill_param_2
)
{
.reg .pred %p<2>;
.reg .b32 %r<6>;
.reg .f64 %fd<2>;
.reg .b64 %rd<5>;
ld.param.u64 %rd1, [fill_param_0];
ld.param.f64 %fd1, [fill_param_1];
ld.param.u32 %r2, [fill_param_2];
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.s32 %p1, %r1, %r2;
@%p1 bra BB9_2;
cvta.to.global.u64 %rd2, %rd1;
mul.wide.s32 %rd3, %r1, 8;
add.s64 %rd4, %rd2, %rd3;
st.global.f64 [%rd4], %fd1;
BB9_2:
ret;
}
// .globl reduce_sum
.visible .entry reduce_sum(
.param .u64 reduce_sum_param_0,
.param .u64 reduce_sum_param_1,
.param .u32 reduce_sum_param_2
)
{
.reg .pred %p<20>;
.reg .b32 %r<33>;
.reg .f64 %fd<79>;
.reg .b64 %rd<15>;
ld.param.u64 %rd2, [reduce_sum_param_0];
ld.param.u64 %rd3, [reduce_sum_param_1];
ld.param.u32 %r5, [reduce_sum_param_2];
mov.u32 %r6, %tid.x;
mov.u32 %r7, %ctaid.x;
shl.b32 %r8, %r7, 1;
mov.u32 %r9, %ntid.x;
mad.lo.s32 %r32, %r8, %r9, %r6;
mov.f64 %fd76, 0d0000000000000000;
mov.f64 %fd77, %fd76;
setp.ge.u32 %p1, %r32, %r5;
@%p1 bra BB10_4;
BB10_1:
mov.f64 %fd1, %fd77;
cvta.to.global.u64 %rd4, %rd2;
mul.wide.u32 %rd5, %r32, 8;
add.s64 %rd6, %rd4, %rd5;
ld.global.f64 %fd30, [%rd6];
add.f64 %fd78, %fd1, %fd30;
add.s32 %r3, %r32, %r9;
setp.ge.u32 %p2, %r3, %r5;
@%p2 bra BB10_3;
mul.wide.u32 %rd8, %r3, 8;
add.s64 %rd9, %rd4, %rd8;
ld.global.f64 %fd31, [%rd9];
add.f64 %fd78, %fd78, %fd31;
BB10_3:
mov.f64 %fd77, %fd78;
shl.b32 %r12, %r9, 1;
mov.u32 %r13, %nctaid.x;
mad.lo.s32 %r32, %r12, %r13, %r32;
setp.lt.u32 %p3, %r32, %r5;
mov.f64 %fd76, %fd77;
@%p3 bra BB10_1;
BB10_4:
mov.f64 %fd74, %fd76;
mul.wide.u32 %rd10, %r6, 8;
mov.u64 %rd11, sdata;
add.s64 %rd1, %rd11, %rd10;
st.shared.f64 [%rd1], %fd74;
bar.sync 0;
setp.lt.u32 %p4, %r9, 1024;
@%p4 bra BB10_8;
setp.gt.u32 %p5, %r6, 511;
mov.f64 %fd75, %fd74;
@%p5 bra BB10_7;
ld.shared.f64 %fd32, [%rd1+4096];
add.f64 %fd75, %fd74, %fd32;
st.shared.f64 [%rd1], %fd75;
BB10_7:
mov.f64 %fd74, %fd75;
bar.sync 0;
BB10_8:
mov.f64 %fd72, %fd74;
setp.lt.u32 %p6, %r9, 512;
@%p6 bra BB10_12;
setp.gt.u32 %p7, %r6, 255;
mov.f64 %fd73, %fd72;
@%p7 bra BB10_11;
ld.shared.f64 %fd33, [%rd1+2048];
add.f64 %fd73, %fd72, %fd33;
st.shared.f64 [%rd1], %fd73;
BB10_11:
mov.f64 %fd72, %fd73;
bar.sync 0;
BB10_12:
mov.f64 %fd70, %fd72;
setp.lt.u32 %p8, %r9, 256;
@%p8 bra BB10_16;
setp.gt.u32 %p9, %r6, 127;
mov.f64 %fd71, %fd70;
@%p9 bra BB10_15;
ld.shared.f64 %fd34, [%rd1+1024];
add.f64 %fd71, %fd70, %fd34;
st.shared.f64 [%rd1], %fd71;
BB10_15:
mov.f64 %fd70, %fd71;
bar.sync 0;
BB10_16:
mov.f64 %fd68, %fd70;
setp.lt.u32 %p10, %r9, 128;
@%p10 bra BB10_20;
setp.gt.u32 %p11, %r6, 63;
mov.f64 %fd69, %fd68;
@%p11 bra BB10_19;
ld.shared.f64 %fd35, [%rd1+512];
add.f64 %fd69, %fd68, %fd35;
st.shared.f64 [%rd1], %fd69;
BB10_19:
mov.f64 %fd68, %fd69;
bar.sync 0;
BB10_20:
mov.f64 %fd67, %fd68;
setp.gt.u32 %p12, %r6, 31;
@%p12 bra BB10_33;
setp.lt.u32 %p13, %r9, 64;
@%p13 bra BB10_23;
ld.volatile.shared.f64 %fd36, [%rd1+256];
add.f64 %fd67, %fd67, %fd36;
st.volatile.shared.f64 [%rd1], %fd67;
BB10_23:
mov.f64 %fd66, %fd67;
setp.lt.u32 %p14, %r9, 32;
@%p14 bra BB10_25;
ld.volatile.shared.f64 %fd37, [%rd1+128];
add.f64 %fd66, %fd66, %fd37;
st.volatile.shared.f64 [%rd1], %fd66;
BB10_25:
mov.f64 %fd65, %fd66;
setp.lt.u32 %p15, %r9, 16;
@%p15 bra BB10_27;
ld.volatile.shared.f64 %fd38, [%rd1+64];
add.f64 %fd65, %fd65, %fd38;
st.volatile.shared.f64 [%rd1], %fd65;
BB10_27:
mov.f64 %fd64, %fd65;
setp.lt.u32 %p16, %r9, 8;
@%p16 bra BB10_29;
ld.volatile.shared.f64 %fd39, [%rd1+32];
add.f64 %fd64, %fd64, %fd39;
st.volatile.shared.f64 [%rd1], %fd64;
BB10_29:
mov.f64 %fd63, %fd64;
setp.lt.u32 %p17, %r9, 4;
@%p17 bra BB10_31;
ld.volatile.shared.f64 %fd40, [%rd1+16];
add.f64 %fd63, %fd63, %fd40;
st.volatile.shared.f64 [%rd1], %fd63;
BB10_31:
setp.lt.u32 %p18, %r9, 2;
@%p18 bra BB10_33;
ld.volatile.shared.f64 %fd41, [%rd1+8];
add.f64 %fd42, %fd63, %fd41;
st.volatile.shared.f64 [%rd1], %fd42;
BB10_33:
setp.ne.s32 %p19, %r6, 0;
@%p19 bra BB10_35;
ld.shared.f64 %fd43, [sdata];
cvta.to.global.u64 %rd12, %rd3;
mul.wide.u32 %rd13, %r7, 8;
add.s64 %rd14, %rd12, %rd13;
st.global.f64 [%rd14], %fd43;
BB10_35:
ret;
}
// .globl reduce_row_sum
.visible .entry reduce_row_sum(
.param .u64 reduce_row_sum_param_0,
.param .u64 reduce_row_sum_param_1,
.param .u32 reduce_row_sum_param_2,
.param .u32 reduce_row_sum_param_3
)
{
.reg .pred %p<20>;
.reg .b32 %r<39>;
.reg .f64 %fd<74>;
.reg .b64 %rd<42>;
ld.param.u64 %rd1, [reduce_row_sum_param_0];
ld.param.u64 %rd2, [reduce_row_sum_param_1];
ld.param.u32 %r5, [reduce_row_sum_param_2];
ld.param.u32 %r4, [reduce_row_sum_param_3];
mov.u32 %r6, %ctaid.x;
setp.ge.u32 %p1, %r6, %r5;
@%p1 bra BB11_35;
mov.u32 %r38, %tid.x;
mov.f64 %fd72, 0d0000000000000000;
mov.f64 %fd73, %fd72;
setp.ge.u32 %p2, %r38, %r4;
@%p2 bra BB11_4;
cvta.to.global.u64 %rd3, %rd1;
BB11_3:
mad.lo.s32 %r8, %r6, %r4, %r38;
mul.wide.u32 %rd4, %r8, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd28, [%rd5];
add.f64 %fd73, %fd73, %fd28;
mov.u32 %r9, %ntid.x;
add.s32 %r38, %r9, %r38;
setp.lt.u32 %p3, %r38, %r4;
mov.f64 %fd72, %fd73;
@%p3 bra BB11_3;
BB11_4:
mov.f64 %fd70, %fd72;
mov.u32 %r10, %tid.x;
mul.wide.u32 %rd6, %r10, 8;
mov.u64 %rd7, sdata;
add.s64 %rd8, %rd7, %rd6;
st.shared.f64 [%rd8], %fd70;
bar.sync 0;
mov.u32 %r11, %ntid.x;
setp.lt.u32 %p4, %r11, 1024;
@%p4 bra BB11_8;
setp.gt.u32 %p5, %r10, 511;
mov.f64 %fd71, %fd70;
@%p5 bra BB11_7;
ld.shared.f64 %fd29, [%rd8+4096];
add.f64 %fd71, %fd70, %fd29;
st.shared.f64 [%rd8], %fd71;
BB11_7:
mov.f64 %fd70, %fd71;
bar.sync 0;
BB11_8:
mov.f64 %fd68, %fd70;
setp.lt.u32 %p6, %r11, 512;
@%p6 bra BB11_12;
setp.gt.u32 %p7, %r10, 255;
mov.f64 %fd69, %fd68;
@%p7 bra BB11_11;
ld.shared.f64 %fd30, [%rd8+2048];
add.f64 %fd69, %fd68, %fd30;
st.shared.f64 [%rd8], %fd69;
BB11_11:
mov.f64 %fd68, %fd69;
bar.sync 0;
BB11_12:
mov.f64 %fd66, %fd68;
setp.lt.u32 %p8, %r11, 256;
@%p8 bra BB11_16;
setp.gt.u32 %p9, %r10, 127;
mov.f64 %fd67, %fd66;
@%p9 bra BB11_15;
ld.shared.f64 %fd31, [%rd8+1024];
add.f64 %fd67, %fd66, %fd31;
st.shared.f64 [%rd8], %fd67;
BB11_15:
mov.f64 %fd66, %fd67;
bar.sync 0;
BB11_16:
mov.f64 %fd64, %fd66;
setp.lt.u32 %p10, %r11, 128;
@%p10 bra BB11_20;
setp.gt.u32 %p11, %r10, 63;
mov.f64 %fd65, %fd64;
@%p11 bra BB11_19;
ld.shared.f64 %fd32, [%rd8+512];
add.f64 %fd65, %fd64, %fd32;
st.shared.f64 [%rd8], %fd65;
BB11_19:
mov.f64 %fd64, %fd65;
bar.sync 0;
BB11_20:
mov.f64 %fd63, %fd64;
setp.gt.u32 %p12, %r10, 31;
@%p12 bra BB11_33;
setp.lt.u32 %p13, %r11, 64;
@%p13 bra BB11_23;
ld.volatile.shared.f64 %fd33, [%rd8+256];
add.f64 %fd63, %fd63, %fd33;
st.volatile.shared.f64 [%rd8], %fd63;
BB11_23:
mov.f64 %fd62, %fd63;
setp.lt.u32 %p14, %r11, 32;
@%p14 bra BB11_25;
ld.volatile.shared.f64 %fd34, [%rd8+128];
add.f64 %fd62, %fd62, %fd34;
st.volatile.shared.f64 [%rd8], %fd62;
BB11_25:
mov.f64 %fd61, %fd62;
setp.lt.u32 %p15, %r11, 16;
@%p15 bra BB11_27;
ld.volatile.shared.f64 %fd35, [%rd8+64];
add.f64 %fd61, %fd61, %fd35;
st.volatile.shared.f64 [%rd8], %fd61;
BB11_27:
mov.f64 %fd60, %fd61;
setp.lt.u32 %p16, %r11, 8;
@%p16 bra BB11_29;
ld.volatile.shared.f64 %fd36, [%rd8+32];
add.f64 %fd60, %fd60, %fd36;
st.volatile.shared.f64 [%rd8], %fd60;
BB11_29:
mov.f64 %fd59, %fd60;
setp.lt.u32 %p17, %r11, 4;
@%p17 bra BB11_31;
ld.volatile.shared.f64 %fd37, [%rd8+16];
add.f64 %fd59, %fd59, %fd37;
st.volatile.shared.f64 [%rd8], %fd59;
BB11_31:
setp.lt.u32 %p18, %r11, 2;
@%p18 bra BB11_33;
ld.volatile.shared.f64 %fd38, [%rd8+8];
add.f64 %fd39, %fd59, %fd38;
st.volatile.shared.f64 [%rd8], %fd39;
BB11_33:
setp.ne.s32 %p19, %r10, 0;
@%p19 bra BB11_35;
ld.shared.f64 %fd40, [sdata];
cvta.to.global.u64 %rd39, %rd2;
mul.wide.u32 %rd40, %r6, 8;
add.s64 %rd41, %rd39, %rd40;
st.global.f64 [%rd41], %fd40;
BB11_35:
ret;
}
// .globl reduce_col_sum
.visible .entry reduce_col_sum(
.param .u64 reduce_col_sum_param_0,
.param .u64 reduce_col_sum_param_1,
.param .u32 reduce_col_sum_param_2,
.param .u32 reduce_col_sum_param_3
)
{
.reg .pred %p<4>;
.reg .b32 %r<11>;
.reg .f64 %fd<10>;
.reg .b64 %rd<9>;
ld.param.u64 %rd2, [reduce_col_sum_param_0];
ld.param.u64 %rd3, [reduce_col_sum_param_1];
ld.param.u32 %r5, [reduce_col_sum_param_2];
ld.param.u32 %r6, [reduce_col_sum_param_3];
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %ctaid.x;
mov.u32 %r9, %tid.x;
mad.lo.s32 %r1, %r7, %r8, %r9;
setp.ge.u32 %p1, %r1, %r6;
@%p1 bra BB12_5;
cvta.to.global.u64 %rd1, %rd2;
mul.lo.s32 %r2, %r6, %r5;
mov.f64 %fd8, 0d0000000000000000;
mov.f64 %fd9, %fd8;
setp.ge.u32 %p2, %r1, %r2;
@%p2 bra BB12_4;
mov.u32 %r10, %r1;
BB12_3:
mov.u32 %r3, %r10;
mul.wide.u32 %rd4, %r3, 8;
add.s64 %rd5, %rd1, %rd4;
ld.global.f64 %fd6, [%rd5];
add.f64 %fd9, %fd9, %fd6;
add.s32 %r4, %r3, %r6;
setp.lt.u32 %p3, %r4, %r2;
mov.u32 %r10, %r4;
mov.f64 %fd8, %fd9;
@%p3 bra BB12_3;
BB12_4:
cvta.to.global.u64 %rd6, %rd3;
mul.wide.u32 %rd7, %r1, 8;
add.s64 %rd8, %rd6, %rd7;
st.global.f64 [%rd8], %fd8;
BB12_5:
ret;
}
// .globl reduce_max
.visible .entry reduce_max(
.param .u64 reduce_max_param_0,
.param .u64 reduce_max_param_1,
.param .u32 reduce_max_param_2
)
{
.reg .pred %p<20>;
.reg .b32 %r<33>;
.reg .f64 %fd<79>;
.reg .b64 %rd<15>;
ld.param.u64 %rd2, [reduce_max_param_0];
ld.param.u64 %rd3, [reduce_max_param_1];
ld.param.u32 %r5, [reduce_max_param_2];
mov.u32 %r6, %tid.x;
mov.u32 %r7, %ctaid.x;
shl.b32 %r8, %r7, 1;
mov.u32 %r9, %ntid.x;
mad.lo.s32 %r32, %r8, %r9, %r6;
mov.f64 %fd76, 0dFFEFFFFFFFFFFFFF;
mov.f64 %fd77, %fd76;
setp.ge.u32 %p1, %r32, %r5;
@%p1 bra BB13_4;
BB13_1:
mov.f64 %fd1, %fd77;
cvta.to.global.u64 %rd4, %rd2;
mul.wide.u32 %rd5, %r32, 8;
add.s64 %rd6, %rd4, %rd5;
ld.global.f64 %fd30, [%rd6];
max.f64 %fd78, %fd1, %fd30;
add.s32 %r3, %r32, %r9;
setp.ge.u32 %p2, %r3, %r5;
@%p2 bra BB13_3;
mul.wide.u32 %rd8, %r3, 8;
add.s64 %rd9, %rd4, %rd8;
ld.global.f64 %fd31, [%rd9];
max.f64 %fd78, %fd78, %fd31;
BB13_3:
mov.f64 %fd77, %fd78;
shl.b32 %r12, %r9, 1;
mov.u32 %r13, %nctaid.x;
mad.lo.s32 %r32, %r12, %r13, %r32;
setp.lt.u32 %p3, %r32, %r5;
mov.f64 %fd76, %fd77;
@%p3 bra BB13_1;
BB13_4:
mov.f64 %fd74, %fd76;
mul.wide.u32 %rd10, %r6, 8;
mov.u64 %rd11, sdata;
add.s64 %rd1, %rd11, %rd10;
st.shared.f64 [%rd1], %fd74;
bar.sync 0;
setp.lt.u32 %p4, %r9, 1024;
@%p4 bra BB13_8;
setp.gt.u32 %p5, %r6, 511;
mov.f64 %fd75, %fd74;
@%p5 bra BB13_7;
ld.shared.f64 %fd32, [%rd1+4096];
max.f64 %fd75, %fd74, %fd32;
st.shared.f64 [%rd1], %fd75;
BB13_7:
mov.f64 %fd74, %fd75;
bar.sync 0;
BB13_8:
mov.f64 %fd72, %fd74;
setp.lt.u32 %p6, %r9, 512;
@%p6 bra BB13_12;
setp.gt.u32 %p7, %r6, 255;
mov.f64 %fd73, %fd72;
@%p7 bra BB13_11;
ld.shared.f64 %fd33, [%rd1+2048];
max.f64 %fd73, %fd72, %fd33;
st.shared.f64 [%rd1], %fd73;
BB13_11:
mov.f64 %fd72, %fd73;
bar.sync 0;
BB13_12:
mov.f64 %fd70, %fd72;
setp.lt.u32 %p8, %r9, 256;
@%p8 bra BB13_16;
setp.gt.u32 %p9, %r6, 127;
mov.f64 %fd71, %fd70;
@%p9 bra BB13_15;
ld.shared.f64 %fd34, [%rd1+1024];
max.f64 %fd71, %fd70, %fd34;
st.shared.f64 [%rd1], %fd71;
BB13_15:
mov.f64 %fd70, %fd71;
bar.sync 0;
BB13_16:
mov.f64 %fd68, %fd70;
setp.lt.u32 %p10, %r9, 128;
@%p10 bra BB13_20;
setp.gt.u32 %p11, %r6, 63;
mov.f64 %fd69, %fd68;
@%p11 bra BB13_19;
ld.shared.f64 %fd35, [%rd1+512];
max.f64 %fd69, %fd68, %fd35;
st.shared.f64 [%rd1], %fd69;
BB13_19:
mov.f64 %fd68, %fd69;
bar.sync 0;
BB13_20:
mov.f64 %fd67, %fd68;
setp.gt.u32 %p12, %r6, 31;
@%p12 bra BB13_33;
setp.lt.u32 %p13, %r9, 64;
@%p13 bra BB13_23;
ld.volatile.shared.f64 %fd36, [%rd1+256];
max.f64 %fd67, %fd67, %fd36;
st.volatile.shared.f64 [%rd1], %fd67;
BB13_23:
mov.f64 %fd66, %fd67;
setp.lt.u32 %p14, %r9, 32;
@%p14 bra BB13_25;
ld.volatile.shared.f64 %fd37, [%rd1+128];
max.f64 %fd66, %fd66, %fd37;
st.volatile.shared.f64 [%rd1], %fd66;
BB13_25:
mov.f64 %fd65, %fd66;
setp.lt.u32 %p15, %r9, 16;
@%p15 bra BB13_27;
ld.volatile.shared.f64 %fd38, [%rd1+64];
max.f64 %fd65, %fd65, %fd38;
st.volatile.shared.f64 [%rd1], %fd65;
BB13_27:
mov.f64 %fd64, %fd65;
setp.lt.u32 %p16, %r9, 8;
@%p16 bra BB13_29;
ld.volatile.shared.f64 %fd39, [%rd1+32];
max.f64 %fd64, %fd64, %fd39;
st.volatile.shared.f64 [%rd1], %fd64;
BB13_29:
mov.f64 %fd63, %fd64;
setp.lt.u32 %p17, %r9, 4;
@%p17 bra BB13_31;
ld.volatile.shared.f64 %fd40, [%rd1+16];
max.f64 %fd63, %fd63, %fd40;
st.volatile.shared.f64 [%rd1], %fd63;
BB13_31:
setp.lt.u32 %p18, %r9, 2;
@%p18 bra BB13_33;
ld.volatile.shared.f64 %fd41, [%rd1+8];
max.f64 %fd42, %fd63, %fd41;
st.volatile.shared.f64 [%rd1], %fd42;
BB13_33:
setp.ne.s32 %p19, %r6, 0;
@%p19 bra BB13_35;
ld.shared.f64 %fd43, [sdata];
cvta.to.global.u64 %rd12, %rd3;
mul.wide.u32 %rd13, %r7, 8;
add.s64 %rd14, %rd12, %rd13;
st.global.f64 [%rd14], %fd43;
BB13_35:
ret;
}
// .globl reduce_row_max
.visible .entry reduce_row_max(
.param .u64 reduce_row_max_param_0,
.param .u64 reduce_row_max_param_1,
.param .u32 reduce_row_max_param_2,
.param .u32 reduce_row_max_param_3
)
{
.reg .pred %p<20>;
.reg .b32 %r<39>;
.reg .f64 %fd<74>;
.reg .b64 %rd<42>;
ld.param.u64 %rd1, [reduce_row_max_param_0];
ld.param.u64 %rd2, [reduce_row_max_param_1];
ld.param.u32 %r5, [reduce_row_max_param_2];
ld.param.u32 %r4, [reduce_row_max_param_3];
mov.u32 %r6, %ctaid.x;
setp.ge.u32 %p1, %r6, %r5;
@%p1 bra BB14_35;
mov.u32 %r38, %tid.x;
mov.f64 %fd72, 0dFFEFFFFFFFFFFFFF;
mov.f64 %fd73, %fd72;
setp.ge.u32 %p2, %r38, %r4;
@%p2 bra BB14_4;
cvta.to.global.u64 %rd3, %rd1;
BB14_3:
mad.lo.s32 %r8, %r6, %r4, %r38;
mul.wide.u32 %rd4, %r8, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd28, [%rd5];
max.f64 %fd73, %fd73, %fd28;
mov.u32 %r9, %ntid.x;
add.s32 %r38, %r9, %r38;
setp.lt.u32 %p3, %r38, %r4;
mov.f64 %fd72, %fd73;
@%p3 bra BB14_3;
BB14_4:
mov.f64 %fd70, %fd72;
mov.u32 %r10, %tid.x;
mul.wide.u32 %rd6, %r10, 8;
mov.u64 %rd7, sdata;
add.s64 %rd8, %rd7, %rd6;
st.shared.f64 [%rd8], %fd70;
bar.sync 0;
mov.u32 %r11, %ntid.x;
setp.lt.u32 %p4, %r11, 1024;
@%p4 bra BB14_8;
setp.gt.u32 %p5, %r10, 511;
mov.f64 %fd71, %fd70;
@%p5 bra BB14_7;
ld.shared.f64 %fd29, [%rd8+4096];
max.f64 %fd71, %fd70, %fd29;
st.shared.f64 [%rd8], %fd71;
BB14_7:
mov.f64 %fd70, %fd71;
bar.sync 0;
BB14_8:
mov.f64 %fd68, %fd70;
setp.lt.u32 %p6, %r11, 512;
@%p6 bra BB14_12;
setp.gt.u32 %p7, %r10, 255;
mov.f64 %fd69, %fd68;
@%p7 bra BB14_11;
ld.shared.f64 %fd30, [%rd8+2048];
max.f64 %fd69, %fd68, %fd30;
st.shared.f64 [%rd8], %fd69;
BB14_11:
mov.f64 %fd68, %fd69;
bar.sync 0;
BB14_12:
mov.f64 %fd66, %fd68;
setp.lt.u32 %p8, %r11, 256;
@%p8 bra BB14_16;
setp.gt.u32 %p9, %r10, 127;
mov.f64 %fd67, %fd66;
@%p9 bra BB14_15;
ld.shared.f64 %fd31, [%rd8+1024];
max.f64 %fd67, %fd66, %fd31;
st.shared.f64 [%rd8], %fd67;
BB14_15:
mov.f64 %fd66, %fd67;
bar.sync 0;
BB14_16:
mov.f64 %fd64, %fd66;
setp.lt.u32 %p10, %r11, 128;
@%p10 bra BB14_20;
setp.gt.u32 %p11, %r10, 63;
mov.f64 %fd65, %fd64;
@%p11 bra BB14_19;
ld.shared.f64 %fd32, [%rd8+512];
max.f64 %fd65, %fd64, %fd32;
st.shared.f64 [%rd8], %fd65;
BB14_19:
mov.f64 %fd64, %fd65;
bar.sync 0;
BB14_20:
mov.f64 %fd63, %fd64;
setp.gt.u32 %p12, %r10, 31;
@%p12 bra BB14_33;
setp.lt.u32 %p13, %r11, 64;
@%p13 bra BB14_23;
ld.volatile.shared.f64 %fd33, [%rd8+256];
max.f64 %fd63, %fd63, %fd33;
st.volatile.shared.f64 [%rd8], %fd63;
BB14_23:
mov.f64 %fd62, %fd63;
setp.lt.u32 %p14, %r11, 32;
@%p14 bra BB14_25;
ld.volatile.shared.f64 %fd34, [%rd8+128];
max.f64 %fd62, %fd62, %fd34;
st.volatile.shared.f64 [%rd8], %fd62;
BB14_25:
mov.f64 %fd61, %fd62;
setp.lt.u32 %p15, %r11, 16;
@%p15 bra BB14_27;
ld.volatile.shared.f64 %fd35, [%rd8+64];
max.f64 %fd61, %fd61, %fd35;
st.volatile.shared.f64 [%rd8], %fd61;
BB14_27:
mov.f64 %fd60, %fd61;
setp.lt.u32 %p16, %r11, 8;
@%p16 bra BB14_29;
ld.volatile.shared.f64 %fd36, [%rd8+32];
max.f64 %fd60, %fd60, %fd36;
st.volatile.shared.f64 [%rd8], %fd60;
BB14_29:
mov.f64 %fd59, %fd60;
setp.lt.u32 %p17, %r11, 4;
@%p17 bra BB14_31;
ld.volatile.shared.f64 %fd37, [%rd8+16];
max.f64 %fd59, %fd59, %fd37;
st.volatile.shared.f64 [%rd8], %fd59;
BB14_31:
setp.lt.u32 %p18, %r11, 2;
@%p18 bra BB14_33;
ld.volatile.shared.f64 %fd38, [%rd8+8];
max.f64 %fd39, %fd59, %fd38;
st.volatile.shared.f64 [%rd8], %fd39;
BB14_33:
setp.ne.s32 %p19, %r10, 0;
@%p19 bra BB14_35;
ld.shared.f64 %fd40, [sdata];
cvta.to.global.u64 %rd39, %rd2;
mul.wide.u32 %rd40, %r6, 8;
add.s64 %rd41, %rd39, %rd40;
st.global.f64 [%rd41], %fd40;
BB14_35:
ret;
}
// .globl reduce_col_max
.visible .entry reduce_col_max(
.param .u64 reduce_col_max_param_0,
.param .u64 reduce_col_max_param_1,
.param .u32 reduce_col_max_param_2,
.param .u32 reduce_col_max_param_3
)
{
.reg .pred %p<4>;
.reg .b32 %r<11>;
.reg .f64 %fd<10>;
.reg .b64 %rd<9>;
ld.param.u64 %rd2, [reduce_col_max_param_0];
ld.param.u64 %rd3, [reduce_col_max_param_1];
ld.param.u32 %r5, [reduce_col_max_param_2];
ld.param.u32 %r6, [reduce_col_max_param_3];
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %ctaid.x;
mov.u32 %r9, %tid.x;
mad.lo.s32 %r1, %r7, %r8, %r9;
setp.ge.u32 %p1, %r1, %r6;
@%p1 bra BB15_5;
cvta.to.global.u64 %rd1, %rd2;
mul.lo.s32 %r2, %r6, %r5;
mov.f64 %fd8, 0dFFEFFFFFFFFFFFFF;
mov.f64 %fd9, %fd8;
setp.ge.u32 %p2, %r1, %r2;
@%p2 bra BB15_4;
mov.u32 %r10, %r1;
BB15_3:
mov.u32 %r3, %r10;
mul.wide.u32 %rd4, %r3, 8;
add.s64 %rd5, %rd1, %rd4;
ld.global.f64 %fd6, [%rd5];
max.f64 %fd9, %fd9, %fd6;
add.s32 %r4, %r3, %r6;
setp.lt.u32 %p3, %r4, %r2;
mov.u32 %r10, %r4;
mov.f64 %fd8, %fd9;
@%p3 bra BB15_3;
BB15_4:
cvta.to.global.u64 %rd6, %rd3;
mul.wide.u32 %rd7, %r1, 8;
add.s64 %rd8, %rd6, %rd7;
st.global.f64 [%rd8], %fd8;
BB15_5:
ret;
}
// .globl reduce_min
.visible .entry reduce_min(
.param .u64 reduce_min_param_0,
.param .u64 reduce_min_param_1,
.param .u32 reduce_min_param_2
)
{
.reg .pred %p<20>;
.reg .b32 %r<33>;
.reg .f64 %fd<79>;
.reg .b64 %rd<15>;
ld.param.u64 %rd2, [reduce_min_param_0];
ld.param.u64 %rd3, [reduce_min_param_1];
ld.param.u32 %r5, [reduce_min_param_2];
mov.u32 %r6, %tid.x;
mov.u32 %r7, %ctaid.x;
shl.b32 %r8, %r7, 1;
mov.u32 %r9, %ntid.x;
mad.lo.s32 %r32, %r8, %r9, %r6;
mov.f64 %fd76, 0d7FEFFFFFFFFFFFFF;
mov.f64 %fd77, %fd76;
setp.ge.u32 %p1, %r32, %r5;
@%p1 bra BB16_4;
BB16_1:
mov.f64 %fd1, %fd77;
cvta.to.global.u64 %rd4, %rd2;
mul.wide.u32 %rd5, %r32, 8;
add.s64 %rd6, %rd4, %rd5;
ld.global.f64 %fd30, [%rd6];
min.f64 %fd78, %fd1, %fd30;
add.s32 %r3, %r32, %r9;
setp.ge.u32 %p2, %r3, %r5;
@%p2 bra BB16_3;
mul.wide.u32 %rd8, %r3, 8;
add.s64 %rd9, %rd4, %rd8;
ld.global.f64 %fd31, [%rd9];
min.f64 %fd78, %fd78, %fd31;
BB16_3:
mov.f64 %fd77, %fd78;
shl.b32 %r12, %r9, 1;
mov.u32 %r13, %nctaid.x;
mad.lo.s32 %r32, %r12, %r13, %r32;
setp.lt.u32 %p3, %r32, %r5;
mov.f64 %fd76, %fd77;
@%p3 bra BB16_1;
BB16_4:
mov.f64 %fd74, %fd76;
mul.wide.u32 %rd10, %r6, 8;
mov.u64 %rd11, sdata;
add.s64 %rd1, %rd11, %rd10;
st.shared.f64 [%rd1], %fd74;
bar.sync 0;
setp.lt.u32 %p4, %r9, 1024;
@%p4 bra BB16_8;
setp.gt.u32 %p5, %r6, 511;
mov.f64 %fd75, %fd74;
@%p5 bra BB16_7;
ld.shared.f64 %fd32, [%rd1+4096];
min.f64 %fd75, %fd74, %fd32;
st.shared.f64 [%rd1], %fd75;
BB16_7:
mov.f64 %fd74, %fd75;
bar.sync 0;
BB16_8:
mov.f64 %fd72, %fd74;
setp.lt.u32 %p6, %r9, 512;
@%p6 bra BB16_12;
setp.gt.u32 %p7, %r6, 255;
mov.f64 %fd73, %fd72;
@%p7 bra BB16_11;
ld.shared.f64 %fd33, [%rd1+2048];
min.f64 %fd73, %fd72, %fd33;
st.shared.f64 [%rd1], %fd73;
BB16_11:
mov.f64 %fd72, %fd73;
bar.sync 0;
BB16_12:
mov.f64 %fd70, %fd72;
setp.lt.u32 %p8, %r9, 256;
@%p8 bra BB16_16;
setp.gt.u32 %p9, %r6, 127;
mov.f64 %fd71, %fd70;
@%p9 bra BB16_15;
ld.shared.f64 %fd34, [%rd1+1024];
min.f64 %fd71, %fd70, %fd34;
st.shared.f64 [%rd1], %fd71;
BB16_15:
mov.f64 %fd70, %fd71;
bar.sync 0;
BB16_16:
mov.f64 %fd68, %fd70;
setp.lt.u32 %p10, %r9, 128;
@%p10 bra BB16_20;
setp.gt.u32 %p11, %r6, 63;
mov.f64 %fd69, %fd68;
@%p11 bra BB16_19;
ld.shared.f64 %fd35, [%rd1+512];
min.f64 %fd69, %fd68, %fd35;
st.shared.f64 [%rd1], %fd69;
BB16_19:
mov.f64 %fd68, %fd69;
bar.sync 0;
BB16_20:
mov.f64 %fd67, %fd68;
setp.gt.u32 %p12, %r6, 31;
@%p12 bra BB16_33;
setp.lt.u32 %p13, %r9, 64;
@%p13 bra BB16_23;
ld.volatile.shared.f64 %fd36, [%rd1+256];
min.f64 %fd67, %fd67, %fd36;
st.volatile.shared.f64 [%rd1], %fd67;
BB16_23:
mov.f64 %fd66, %fd67;
setp.lt.u32 %p14, %r9, 32;
@%p14 bra BB16_25;
ld.volatile.shared.f64 %fd37, [%rd1+128];
min.f64 %fd66, %fd66, %fd37;
st.volatile.shared.f64 [%rd1], %fd66;
BB16_25:
mov.f64 %fd65, %fd66;
setp.lt.u32 %p15, %r9, 16;
@%p15 bra BB16_27;
ld.volatile.shared.f64 %fd38, [%rd1+64];
min.f64 %fd65, %fd65, %fd38;
st.volatile.shared.f64 [%rd1], %fd65;
BB16_27:
mov.f64 %fd64, %fd65;
setp.lt.u32 %p16, %r9, 8;
@%p16 bra BB16_29;
ld.volatile.shared.f64 %fd39, [%rd1+32];
min.f64 %fd64, %fd64, %fd39;
st.volatile.shared.f64 [%rd1], %fd64;
BB16_29:
mov.f64 %fd63, %fd64;
setp.lt.u32 %p17, %r9, 4;
@%p17 bra BB16_31;
ld.volatile.shared.f64 %fd40, [%rd1+16];
min.f64 %fd63, %fd63, %fd40;
st.volatile.shared.f64 [%rd1], %fd63;
BB16_31:
setp.lt.u32 %p18, %r9, 2;
@%p18 bra BB16_33;
ld.volatile.shared.f64 %fd41, [%rd1+8];
min.f64 %fd42, %fd63, %fd41;
st.volatile.shared.f64 [%rd1], %fd42;
BB16_33:
setp.ne.s32 %p19, %r6, 0;
@%p19 bra BB16_35;
ld.shared.f64 %fd43, [sdata];
cvta.to.global.u64 %rd12, %rd3;
mul.wide.u32 %rd13, %r7, 8;
add.s64 %rd14, %rd12, %rd13;
st.global.f64 [%rd14], %fd43;
BB16_35:
ret;
}
// .globl reduce_row_min
.visible .entry reduce_row_min(
.param .u64 reduce_row_min_param_0,
.param .u64 reduce_row_min_param_1,
.param .u32 reduce_row_min_param_2,
.param .u32 reduce_row_min_param_3
)
{
.reg .pred %p<20>;
.reg .b32 %r<39>;
.reg .f64 %fd<74>;
.reg .b64 %rd<42>;
ld.param.u64 %rd1, [reduce_row_min_param_0];
ld.param.u64 %rd2, [reduce_row_min_param_1];
ld.param.u32 %r5, [reduce_row_min_param_2];
ld.param.u32 %r4, [reduce_row_min_param_3];
mov.u32 %r6, %ctaid.x;
setp.ge.u32 %p1, %r6, %r5;
@%p1 bra BB17_35;
mov.u32 %r38, %tid.x;
mov.f64 %fd72, 0d7FEFFFFFFFFFFFFF;
mov.f64 %fd73, %fd72;
setp.ge.u32 %p2, %r38, %r4;
@%p2 bra BB17_4;
cvta.to.global.u64 %rd3, %rd1;
BB17_3:
mad.lo.s32 %r8, %r6, %r4, %r38;
mul.wide.u32 %rd4, %r8, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd28, [%rd5];
min.f64 %fd73, %fd73, %fd28;
mov.u32 %r9, %ntid.x;
add.s32 %r38, %r9, %r38;
setp.lt.u32 %p3, %r38, %r4;
mov.f64 %fd72, %fd73;
@%p3 bra BB17_3;
BB17_4:
mov.f64 %fd70, %fd72;
mov.u32 %r10, %tid.x;
mul.wide.u32 %rd6, %r10, 8;
mov.u64 %rd7, sdata;
add.s64 %rd8, %rd7, %rd6;
st.shared.f64 [%rd8], %fd70;
bar.sync 0;
mov.u32 %r11, %ntid.x;
setp.lt.u32 %p4, %r11, 1024;
@%p4 bra BB17_8;
setp.gt.u32 %p5, %r10, 511;
mov.f64 %fd71, %fd70;
@%p5 bra BB17_7;
ld.shared.f64 %fd29, [%rd8+4096];
min.f64 %fd71, %fd70, %fd29;
st.shared.f64 [%rd8], %fd71;
BB17_7:
mov.f64 %fd70, %fd71;
bar.sync 0;
BB17_8:
mov.f64 %fd68, %fd70;
setp.lt.u32 %p6, %r11, 512;
@%p6 bra BB17_12;
setp.gt.u32 %p7, %r10, 255;
mov.f64 %fd69, %fd68;
@%p7 bra BB17_11;
ld.shared.f64 %fd30, [%rd8+2048];
min.f64 %fd69, %fd68, %fd30;
st.shared.f64 [%rd8], %fd69;
BB17_11:
mov.f64 %fd68, %fd69;
bar.sync 0;
BB17_12:
mov.f64 %fd66, %fd68;
setp.lt.u32 %p8, %r11, 256;
@%p8 bra BB17_16;
setp.gt.u32 %p9, %r10, 127;
mov.f64 %fd67, %fd66;
@%p9 bra BB17_15;
ld.shared.f64 %fd31, [%rd8+1024];
min.f64 %fd67, %fd66, %fd31;
st.shared.f64 [%rd8], %fd67;
BB17_15:
mov.f64 %fd66, %fd67;
bar.sync 0;
BB17_16:
mov.f64 %fd64, %fd66;
setp.lt.u32 %p10, %r11, 128;
@%p10 bra BB17_20;
setp.gt.u32 %p11, %r10, 63;
mov.f64 %fd65, %fd64;
@%p11 bra BB17_19;
ld.shared.f64 %fd32, [%rd8+512];
min.f64 %fd65, %fd64, %fd32;
st.shared.f64 [%rd8], %fd65;
BB17_19:
mov.f64 %fd64, %fd65;
bar.sync 0;
BB17_20:
mov.f64 %fd63, %fd64;
setp.gt.u32 %p12, %r10, 31;
@%p12 bra BB17_33;
setp.lt.u32 %p13, %r11, 64;
@%p13 bra BB17_23;
ld.volatile.shared.f64 %fd33, [%rd8+256];
min.f64 %fd63, %fd63, %fd33;
st.volatile.shared.f64 [%rd8], %fd63;
BB17_23:
mov.f64 %fd62, %fd63;
setp.lt.u32 %p14, %r11, 32;
@%p14 bra BB17_25;
ld.volatile.shared.f64 %fd34, [%rd8+128];
min.f64 %fd62, %fd62, %fd34;
st.volatile.shared.f64 [%rd8], %fd62;
BB17_25:
mov.f64 %fd61, %fd62;
setp.lt.u32 %p15, %r11, 16;
@%p15 bra BB17_27;
ld.volatile.shared.f64 %fd35, [%rd8+64];
min.f64 %fd61, %fd61, %fd35;
st.volatile.shared.f64 [%rd8], %fd61;
BB17_27:
mov.f64 %fd60, %fd61;
setp.lt.u32 %p16, %r11, 8;
@%p16 bra BB17_29;
ld.volatile.shared.f64 %fd36, [%rd8+32];
min.f64 %fd60, %fd60, %fd36;
st.volatile.shared.f64 [%rd8], %fd60;
BB17_29:
mov.f64 %fd59, %fd60;
setp.lt.u32 %p17, %r11, 4;
@%p17 bra BB17_31;
ld.volatile.shared.f64 %fd37, [%rd8+16];
min.f64 %fd59, %fd59, %fd37;
st.volatile.shared.f64 [%rd8], %fd59;
BB17_31:
setp.lt.u32 %p18, %r11, 2;
@%p18 bra BB17_33;
ld.volatile.shared.f64 %fd38, [%rd8+8];
min.f64 %fd39, %fd59, %fd38;
st.volatile.shared.f64 [%rd8], %fd39;
BB17_33:
setp.ne.s32 %p19, %r10, 0;
@%p19 bra BB17_35;
ld.shared.f64 %fd40, [sdata];
cvta.to.global.u64 %rd39, %rd2;
mul.wide.u32 %rd40, %r6, 8;
add.s64 %rd41, %rd39, %rd40;
st.global.f64 [%rd41], %fd40;
BB17_35:
ret;
}
// .globl reduce_col_min
.visible .entry reduce_col_min(
.param .u64 reduce_col_min_param_0,
.param .u64 reduce_col_min_param_1,
.param .u32 reduce_col_min_param_2,
.param .u32 reduce_col_min_param_3
)
{
.reg .pred %p<4>;
.reg .b32 %r<11>;
.reg .f64 %fd<10>;
.reg .b64 %rd<9>;
ld.param.u64 %rd2, [reduce_col_min_param_0];
ld.param.u64 %rd3, [reduce_col_min_param_1];
ld.param.u32 %r5, [reduce_col_min_param_2];
ld.param.u32 %r6, [reduce_col_min_param_3];
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %ctaid.x;
mov.u32 %r9, %tid.x;
mad.lo.s32 %r1, %r7, %r8, %r9;
setp.ge.u32 %p1, %r1, %r6;
@%p1 bra BB18_5;
cvta.to.global.u64 %rd1, %rd2;
mul.lo.s32 %r2, %r6, %r5;
mov.f64 %fd8, 0d7FEFFFFFFFFFFFFF;
mov.f64 %fd9, %fd8;
setp.ge.u32 %p2, %r1, %r2;
@%p2 bra BB18_4;
mov.u32 %r10, %r1;
BB18_3:
mov.u32 %r3, %r10;
mul.wide.u32 %rd4, %r3, 8;
add.s64 %rd5, %rd1, %rd4;
ld.global.f64 %fd6, [%rd5];
min.f64 %fd9, %fd9, %fd6;
add.s32 %r4, %r3, %r6;
setp.lt.u32 %p3, %r4, %r2;
mov.u32 %r10, %r4;
mov.f64 %fd8, %fd9;
@%p3 bra BB18_3;
BB18_4:
cvta.to.global.u64 %rd6, %rd3;
mul.wide.u32 %rd7, %r1, 8;
add.s64 %rd8, %rd6, %rd7;
st.global.f64 [%rd8], %fd8;
BB18_5:
ret;
}
// .globl reduce_prod
.visible .entry reduce_prod(
.param .u64 reduce_prod_param_0,
.param .u64 reduce_prod_param_1,
.param .u32 reduce_prod_param_2
)
{
.reg .pred %p<20>;
.reg .b32 %r<33>;
.reg .f64 %fd<79>;
.reg .b64 %rd<15>;
ld.param.u64 %rd2, [reduce_prod_param_0];
ld.param.u64 %rd3, [reduce_prod_param_1];
ld.param.u32 %r5, [reduce_prod_param_2];
mov.u32 %r6, %tid.x;
mov.u32 %r7, %ctaid.x;
shl.b32 %r8, %r7, 1;
mov.u32 %r9, %ntid.x;
mad.lo.s32 %r32, %r8, %r9, %r6;
mov.f64 %fd76, 0d3FF0000000000000;
mov.f64 %fd77, %fd76;
setp.ge.u32 %p1, %r32, %r5;
@%p1 bra BB19_4;
BB19_1:
mov.f64 %fd1, %fd77;
cvta.to.global.u64 %rd4, %rd2;
mul.wide.u32 %rd5, %r32, 8;
add.s64 %rd6, %rd4, %rd5;
ld.global.f64 %fd30, [%rd6];
mul.f64 %fd78, %fd1, %fd30;
add.s32 %r3, %r32, %r9;
setp.ge.u32 %p2, %r3, %r5;
@%p2 bra BB19_3;
mul.wide.u32 %rd8, %r3, 8;
add.s64 %rd9, %rd4, %rd8;
ld.global.f64 %fd31, [%rd9];
mul.f64 %fd78, %fd78, %fd31;
BB19_3:
mov.f64 %fd77, %fd78;
shl.b32 %r12, %r9, 1;
mov.u32 %r13, %nctaid.x;
mad.lo.s32 %r32, %r12, %r13, %r32;
setp.lt.u32 %p3, %r32, %r5;
mov.f64 %fd76, %fd77;
@%p3 bra BB19_1;
BB19_4:
mov.f64 %fd74, %fd76;
mul.wide.u32 %rd10, %r6, 8;
mov.u64 %rd11, sdata;
add.s64 %rd1, %rd11, %rd10;
st.shared.f64 [%rd1], %fd74;
bar.sync 0;
setp.lt.u32 %p4, %r9, 1024;
@%p4 bra BB19_8;
setp.gt.u32 %p5, %r6, 511;
mov.f64 %fd75, %fd74;
@%p5 bra BB19_7;
ld.shared.f64 %fd32, [%rd1+4096];
mul.f64 %fd75, %fd74, %fd32;
st.shared.f64 [%rd1], %fd75;
BB19_7:
mov.f64 %fd74, %fd75;
bar.sync 0;
BB19_8:
mov.f64 %fd72, %fd74;
setp.lt.u32 %p6, %r9, 512;
@%p6 bra BB19_12;
setp.gt.u32 %p7, %r6, 255;
mov.f64 %fd73, %fd72;
@%p7 bra BB19_11;
ld.shared.f64 %fd33, [%rd1+2048];
mul.f64 %fd73, %fd72, %fd33;
st.shared.f64 [%rd1], %fd73;
BB19_11:
mov.f64 %fd72, %fd73;
bar.sync 0;
BB19_12:
mov.f64 %fd70, %fd72;
setp.lt.u32 %p8, %r9, 256;
@%p8 bra BB19_16;
setp.gt.u32 %p9, %r6, 127;
mov.f64 %fd71, %fd70;
@%p9 bra BB19_15;
ld.shared.f64 %fd34, [%rd1+1024];
mul.f64 %fd71, %fd70, %fd34;
st.shared.f64 [%rd1], %fd71;
BB19_15:
mov.f64 %fd70, %fd71;
bar.sync 0;
BB19_16:
mov.f64 %fd68, %fd70;
setp.lt.u32 %p10, %r9, 128;
@%p10 bra BB19_20;
setp.gt.u32 %p11, %r6, 63;
mov.f64 %fd69, %fd68;
@%p11 bra BB19_19;
ld.shared.f64 %fd35, [%rd1+512];
mul.f64 %fd69, %fd68, %fd35;
st.shared.f64 [%rd1], %fd69;
BB19_19:
mov.f64 %fd68, %fd69;
bar.sync 0;
BB19_20:
mov.f64 %fd67, %fd68;
setp.gt.u32 %p12, %r6, 31;
@%p12 bra BB19_33;
setp.lt.u32 %p13, %r9, 64;
@%p13 bra BB19_23;
ld.volatile.shared.f64 %fd36, [%rd1+256];
mul.f64 %fd67, %fd67, %fd36;
st.volatile.shared.f64 [%rd1], %fd67;
BB19_23:
mov.f64 %fd66, %fd67;
setp.lt.u32 %p14, %r9, 32;
@%p14 bra BB19_25;
ld.volatile.shared.f64 %fd37, [%rd1+128];
mul.f64 %fd66, %fd66, %fd37;
st.volatile.shared.f64 [%rd1], %fd66;
BB19_25:
mov.f64 %fd65, %fd66;
setp.lt.u32 %p15, %r9, 16;
@%p15 bra BB19_27;
ld.volatile.shared.f64 %fd38, [%rd1+64];
mul.f64 %fd65, %fd65, %fd38;
st.volatile.shared.f64 [%rd1], %fd65;
BB19_27:
mov.f64 %fd64, %fd65;
setp.lt.u32 %p16, %r9, 8;
@%p16 bra BB19_29;
ld.volatile.shared.f64 %fd39, [%rd1+32];
mul.f64 %fd64, %fd64, %fd39;
st.volatile.shared.f64 [%rd1], %fd64;
BB19_29:
mov.f64 %fd63, %fd64;
setp.lt.u32 %p17, %r9, 4;
@%p17 bra BB19_31;
ld.volatile.shared.f64 %fd40, [%rd1+16];
mul.f64 %fd63, %fd63, %fd40;
st.volatile.shared.f64 [%rd1], %fd63;
BB19_31:
setp.lt.u32 %p18, %r9, 2;
@%p18 bra BB19_33;
ld.volatile.shared.f64 %fd41, [%rd1+8];
mul.f64 %fd42, %fd63, %fd41;
st.volatile.shared.f64 [%rd1], %fd42;
BB19_33:
setp.ne.s32 %p19, %r6, 0;
@%p19 bra BB19_35;
ld.shared.f64 %fd43, [sdata];
cvta.to.global.u64 %rd12, %rd3;
mul.wide.u32 %rd13, %r7, 8;
add.s64 %rd14, %rd12, %rd13;
st.global.f64 [%rd14], %fd43;
BB19_35:
ret;
}
// .globl reduce_row_mean
.visible .entry reduce_row_mean(
.param .u64 reduce_row_mean_param_0,
.param .u64 reduce_row_mean_param_1,
.param .u32 reduce_row_mean_param_2,
.param .u32 reduce_row_mean_param_3
)
{
.reg .pred %p<20>;
.reg .b32 %r<39>;
.reg .f64 %fd<76>;
.reg .b64 %rd<42>;
ld.param.u64 %rd1, [reduce_row_mean_param_0];
ld.param.u64 %rd2, [reduce_row_mean_param_1];
ld.param.u32 %r5, [reduce_row_mean_param_2];
ld.param.u32 %r4, [reduce_row_mean_param_3];
mov.u32 %r6, %ctaid.x;
setp.ge.u32 %p1, %r6, %r5;
@%p1 bra BB20_35;
mov.u32 %r38, %tid.x;
mov.f64 %fd74, 0d0000000000000000;
mov.f64 %fd75, %fd74;
setp.ge.u32 %p2, %r38, %r4;
@%p2 bra BB20_4;
cvta.to.global.u64 %rd3, %rd1;
BB20_3:
mad.lo.s32 %r8, %r6, %r4, %r38;
mul.wide.u32 %rd4, %r8, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd28, [%rd5];
add.f64 %fd75, %fd75, %fd28;
mov.u32 %r9, %ntid.x;
add.s32 %r38, %r9, %r38;
setp.lt.u32 %p3, %r38, %r4;
mov.f64 %fd74, %fd75;
@%p3 bra BB20_3;
BB20_4:
mov.f64 %fd72, %fd74;
mov.u32 %r10, %tid.x;
mul.wide.u32 %rd6, %r10, 8;
mov.u64 %rd7, sdata;
add.s64 %rd8, %rd7, %rd6;
st.shared.f64 [%rd8], %fd72;
bar.sync 0;
mov.u32 %r11, %ntid.x;
setp.lt.u32 %p4, %r11, 1024;
@%p4 bra BB20_8;
setp.gt.u32 %p5, %r10, 511;
mov.f64 %fd73, %fd72;
@%p5 bra BB20_7;
ld.shared.f64 %fd29, [%rd8+4096];
add.f64 %fd73, %fd72, %fd29;
st.shared.f64 [%rd8], %fd73;
BB20_7:
mov.f64 %fd72, %fd73;
bar.sync 0;
BB20_8:
mov.f64 %fd70, %fd72;
setp.lt.u32 %p6, %r11, 512;
@%p6 bra BB20_12;
setp.gt.u32 %p7, %r10, 255;
mov.f64 %fd71, %fd70;
@%p7 bra BB20_11;
ld.shared.f64 %fd30, [%rd8+2048];
add.f64 %fd71, %fd70, %fd30;
st.shared.f64 [%rd8], %fd71;
BB20_11:
mov.f64 %fd70, %fd71;
bar.sync 0;
BB20_12:
mov.f64 %fd68, %fd70;
setp.lt.u32 %p8, %r11, 256;
@%p8 bra BB20_16;
setp.gt.u32 %p9, %r10, 127;
mov.f64 %fd69, %fd68;
@%p9 bra BB20_15;
ld.shared.f64 %fd31, [%rd8+1024];
add.f64 %fd69, %fd68, %fd31;
st.shared.f64 [%rd8], %fd69;
BB20_15:
mov.f64 %fd68, %fd69;
bar.sync 0;
BB20_16:
mov.f64 %fd66, %fd68;
setp.lt.u32 %p10, %r11, 128;
@%p10 bra BB20_20;
setp.gt.u32 %p11, %r10, 63;
mov.f64 %fd67, %fd66;
@%p11 bra BB20_19;
ld.shared.f64 %fd32, [%rd8+512];
add.f64 %fd67, %fd66, %fd32;
st.shared.f64 [%rd8], %fd67;
BB20_19:
mov.f64 %fd66, %fd67;
bar.sync 0;
BB20_20:
mov.f64 %fd65, %fd66;
setp.gt.u32 %p12, %r10, 31;
@%p12 bra BB20_33;
setp.lt.u32 %p13, %r11, 64;
@%p13 bra BB20_23;
ld.volatile.shared.f64 %fd33, [%rd8+256];
add.f64 %fd65, %fd65, %fd33;
st.volatile.shared.f64 [%rd8], %fd65;
BB20_23:
mov.f64 %fd64, %fd65;
setp.lt.u32 %p14, %r11, 32;
@%p14 bra BB20_25;
ld.volatile.shared.f64 %fd34, [%rd8+128];
add.f64 %fd64, %fd64, %fd34;
st.volatile.shared.f64 [%rd8], %fd64;
BB20_25:
mov.f64 %fd63, %fd64;
setp.lt.u32 %p15, %r11, 16;
@%p15 bra BB20_27;
ld.volatile.shared.f64 %fd35, [%rd8+64];
add.f64 %fd63, %fd63, %fd35;
st.volatile.shared.f64 [%rd8], %fd63;
BB20_27:
mov.f64 %fd62, %fd63;
setp.lt.u32 %p16, %r11, 8;
@%p16 bra BB20_29;
ld.volatile.shared.f64 %fd36, [%rd8+32];
add.f64 %fd62, %fd62, %fd36;
st.volatile.shared.f64 [%rd8], %fd62;
BB20_29:
mov.f64 %fd61, %fd62;
setp.lt.u32 %p17, %r11, 4;
@%p17 bra BB20_31;
ld.volatile.shared.f64 %fd37, [%rd8+16];
add.f64 %fd61, %fd61, %fd37;
st.volatile.shared.f64 [%rd8], %fd61;
BB20_31:
setp.lt.u32 %p18, %r11, 2;
@%p18 bra BB20_33;
ld.volatile.shared.f64 %fd38, [%rd8+8];
add.f64 %fd39, %fd61, %fd38;
st.volatile.shared.f64 [%rd8], %fd39;
BB20_33:
setp.ne.s32 %p19, %r10, 0;
@%p19 bra BB20_35;
ld.shared.f64 %fd40, [sdata];
cvt.rn.f64.s32 %fd41, %r4;
div.rn.f64 %fd42, %fd40, %fd41;
cvta.to.global.u64 %rd39, %rd2;
mul.wide.u32 %rd40, %r6, 8;
add.s64 %rd41, %rd39, %rd40;
st.global.f64 [%rd41], %fd42;
BB20_35:
ret;
}
// .globl reduce_col_mean
.visible .entry reduce_col_mean(
.param .u64 reduce_col_mean_param_0,
.param .u64 reduce_col_mean_param_1,
.param .u32 reduce_col_mean_param_2,
.param .u32 reduce_col_mean_param_3
)
{
.reg .pred %p<4>;
.reg .b32 %r<11>;
.reg .f64 %fd<12>;
.reg .b64 %rd<9>;
ld.param.u64 %rd2, [reduce_col_mean_param_0];
ld.param.u64 %rd3, [reduce_col_mean_param_1];
ld.param.u32 %r5, [reduce_col_mean_param_2];
ld.param.u32 %r6, [reduce_col_mean_param_3];
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %ctaid.x;
mov.u32 %r9, %tid.x;
mad.lo.s32 %r1, %r7, %r8, %r9;
setp.ge.u32 %p1, %r1, %r6;
@%p1 bra BB21_5;
cvta.to.global.u64 %rd1, %rd2;
mul.lo.s32 %r2, %r6, %r5;
mov.f64 %fd10, 0d0000000000000000;
mov.f64 %fd11, %fd10;
setp.ge.u32 %p2, %r1, %r2;
@%p2 bra BB21_4;
mov.u32 %r10, %r1;
BB21_3:
mov.u32 %r3, %r10;
mul.wide.u32 %rd4, %r3, 8;
add.s64 %rd5, %rd1, %rd4;
ld.global.f64 %fd6, [%rd5];
add.f64 %fd11, %fd11, %fd6;
add.s32 %r4, %r3, %r6;
setp.lt.u32 %p3, %r4, %r2;
mov.u32 %r10, %r4;
mov.f64 %fd10, %fd11;
@%p3 bra BB21_3;
BB21_4:
cvta.to.global.u64 %rd6, %rd3;
cvt.rn.f64.s32 %fd7, %r5;
div.rn.f64 %fd8, %fd10, %fd7;
mul.wide.u32 %rd7, %r1, 8;
add.s64 %rd8, %rd6, %rd7;
st.global.f64 [%rd8], %fd8;
BB21_5:
ret;
}
// .globl matrix_exp
.visible .entry matrix_exp(
.param .u64 matrix_exp_param_0,
.param .u64 matrix_exp_param_1,
.param .u32 matrix_exp_param_2
)
{
.reg .pred %p<5>;
.reg .f32 %f<3>;
.reg .b32 %r<21>;
.reg .f64 %fd<41>;
.reg .b64 %rd<10>;
ld.param.u64 %rd2, [matrix_exp_param_0];
ld.param.u64 %rd3, [matrix_exp_param_1];
ld.param.u32 %r5, [matrix_exp_param_2];
mov.u32 %r6, %ctaid.x;
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %tid.x;
mad.lo.s32 %r1, %r7, %r6, %r8;
setp.ge.u32 %p1, %r1, %r5;
@%p1 bra BB22_5;
cvta.to.global.u64 %rd4, %rd2;
cvt.s64.s32 %rd1, %r1;
mul.wide.s32 %rd5, %r1, 8;
add.s64 %rd6, %rd4, %rd5;
ld.global.f64 %fd1, [%rd6];
mov.f64 %fd6, 0d4338000000000000;
mov.f64 %fd7, 0d3FF71547652B82FE;
fma.rn.f64 %fd8, %fd1, %fd7, %fd6;
{
.reg .b32 %temp;
mov.b64 {%r2, %temp}, %fd8;
}
mov.f64 %fd9, 0dC338000000000000;
add.rn.f64 %fd10, %fd8, %fd9;
mov.f64 %fd11, 0dBFE62E42FEFA39EF;
fma.rn.f64 %fd12, %fd10, %fd11, %fd1;
mov.f64 %fd13, 0dBC7ABC9E3B39803F;
fma.rn.f64 %fd14, %fd10, %fd13, %fd12;
mov.f64 %fd15, 0d3E928AF3FCA213EA;
mov.f64 %fd16, 0d3E5ADE1569CE2BDF;
fma.rn.f64 %fd17, %fd16, %fd14, %fd15;
mov.f64 %fd18, 0d3EC71DEE62401315;
fma.rn.f64 %fd19, %fd17, %fd14, %fd18;
mov.f64 %fd20, 0d3EFA01997C89EB71;
fma.rn.f64 %fd21, %fd19, %fd14, %fd20;
mov.f64 %fd22, 0d3F2A01A014761F65;
fma.rn.f64 %fd23, %fd21, %fd14, %fd22;
mov.f64 %fd24, 0d3F56C16C1852B7AF;
fma.rn.f64 %fd25, %fd23, %fd14, %fd24;
mov.f64 %fd26, 0d3F81111111122322;
fma.rn.f64 %fd27, %fd25, %fd14, %fd26;
mov.f64 %fd28, 0d3FA55555555502A1;
fma.rn.f64 %fd29, %fd27, %fd14, %fd28;
mov.f64 %fd30, 0d3FC5555555555511;
fma.rn.f64 %fd31, %fd29, %fd14, %fd30;
mov.f64 %fd32, 0d3FE000000000000B;
fma.rn.f64 %fd33, %fd31, %fd14, %fd32;
mov.f64 %fd34, 0d3FF0000000000000;
fma.rn.f64 %fd35, %fd33, %fd14, %fd34;
fma.rn.f64 %fd36, %fd35, %fd14, %fd34;
{
.reg .b32 %temp;
mov.b64 {%r3, %temp}, %fd36;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r4}, %fd36;
}
shl.b32 %r9, %r2, 20;
add.s32 %r10, %r4, %r9;
mov.b64 %fd40, {%r3, %r10};
{
.reg .b32 %temp;
mov.b64 {%temp, %r11}, %fd1;
}
mov.b32 %f2, %r11;
abs.f32 %f1, %f2;
setp.lt.f32 %p2, %f1, 0f4086232B;
@%p2 bra BB22_4;
setp.lt.f64 %p3, %fd1, 0d0000000000000000;
add.f64 %fd37, %fd1, 0d7FF0000000000000;
selp.f64 %fd40, 0d0000000000000000, %fd37, %p3;
setp.geu.f32 %p4, %f1, 0f40874800;
@%p4 bra BB22_4;
shr.u32 %r12, %r2, 31;
add.s32 %r13, %r2, %r12;
shr.s32 %r14, %r13, 1;
shl.b32 %r15, %r14, 20;
add.s32 %r16, %r15, %r4;
mov.b64 %fd38, {%r3, %r16};
sub.s32 %r17, %r2, %r14;
shl.b32 %r18, %r17, 20;
add.s32 %r19, %r18, 1072693248;
mov.u32 %r20, 0;
mov.b64 %fd39, {%r20, %r19};
mul.f64 %fd40, %fd38, %fd39;
BB22_4:
cvta.to.global.u64 %rd7, %rd3;
shl.b64 %rd8, %rd1, 3;
add.s64 %rd9, %rd7, %rd8;
st.global.f64 [%rd9], %fd40;
BB22_5:
ret;
}
.func (.param .b64 func_retval0) __internal_accurate_pow(
.param .b64 __internal_accurate_pow_param_0,
.param .b64 __internal_accurate_pow_param_1
)
{
.reg .pred %p<10>;
.reg .f32 %f<3>;
.reg .b32 %r<52>;
.reg .f64 %fd<134>;
ld.param.f64 %fd12, [__internal_accurate_pow_param_0];
ld.param.f64 %fd13, [__internal_accurate_pow_param_1];
{
.reg .b32 %temp;
mov.b64 {%temp, %r49}, %fd12;
}
{
.reg .b32 %temp;
mov.b64 {%r48, %temp}, %fd12;
}
shr.u32 %r50, %r49, 20;
setp.ne.s32 %p1, %r50, 0;
@%p1 bra BB23_2;
mul.f64 %fd14, %fd12, 0d4350000000000000;
{
.reg .b32 %temp;
mov.b64 {%temp, %r49}, %fd14;
}
{
.reg .b32 %temp;
mov.b64 {%r48, %temp}, %fd14;
}
shr.u32 %r16, %r49, 20;
add.s32 %r50, %r16, -54;
BB23_2:
add.s32 %r51, %r50, -1023;
and.b32 %r17, %r49, -2146435073;
or.b32 %r18, %r17, 1072693248;
mov.b64 %fd132, {%r48, %r18};
setp.lt.u32 %p2, %r18, 1073127583;
@%p2 bra BB23_4;
{
.reg .b32 %temp;
mov.b64 {%r19, %temp}, %fd132;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r20}, %fd132;
}
add.s32 %r21, %r20, -1048576;
mov.b64 %fd132, {%r19, %r21};
add.s32 %r51, %r50, -1022;
BB23_4:
add.f64 %fd16, %fd132, 0d3FF0000000000000;
// inline asm
rcp.approx.ftz.f64 %fd15,%fd16;
// inline asm
neg.f64 %fd17, %fd16;
mov.f64 %fd18, 0d3FF0000000000000;
fma.rn.f64 %fd19, %fd17, %fd15, %fd18;
fma.rn.f64 %fd20, %fd19, %fd19, %fd19;
fma.rn.f64 %fd21, %fd20, %fd15, %fd15;
add.f64 %fd22, %fd132, 0dBFF0000000000000;
mul.f64 %fd23, %fd22, %fd21;
fma.rn.f64 %fd24, %fd22, %fd21, %fd23;
mul.f64 %fd25, %fd24, %fd24;
mov.f64 %fd26, 0d3ED0F5D241AD3B5A;
mov.f64 %fd27, 0d3EB0F5FF7D2CAFE2;
fma.rn.f64 %fd28, %fd27, %fd25, %fd26;
mov.f64 %fd29, 0d3EF3B20A75488A3F;
fma.rn.f64 %fd30, %fd28, %fd25, %fd29;
mov.f64 %fd31, 0d3F1745CDE4FAECD5;
fma.rn.f64 %fd32, %fd30, %fd25, %fd31;
mov.f64 %fd33, 0d3F3C71C7258A578B;
fma.rn.f64 %fd34, %fd32, %fd25, %fd33;
mov.f64 %fd35, 0d3F6249249242B910;
fma.rn.f64 %fd36, %fd34, %fd25, %fd35;
mov.f64 %fd37, 0d3F89999999999DFB;
fma.rn.f64 %fd38, %fd36, %fd25, %fd37;
sub.f64 %fd39, %fd22, %fd24;
add.f64 %fd40, %fd39, %fd39;
neg.f64 %fd41, %fd24;
fma.rn.f64 %fd42, %fd41, %fd22, %fd40;
mul.f64 %fd43, %fd21, %fd42;
fma.rn.f64 %fd44, %fd25, %fd38, 0d3FB5555555555555;
mov.f64 %fd45, 0d3FB5555555555555;
sub.f64 %fd46, %fd45, %fd44;
fma.rn.f64 %fd47, %fd25, %fd38, %fd46;
add.f64 %fd48, %fd47, 0d0000000000000000;
add.f64 %fd49, %fd48, 0dBC46A4CB00B9E7B0;
add.f64 %fd50, %fd44, %fd49;
sub.f64 %fd51, %fd44, %fd50;
add.f64 %fd52, %fd49, %fd51;
mul.rn.f64 %fd53, %fd24, %fd24;
neg.f64 %fd54, %fd53;
fma.rn.f64 %fd55, %fd24, %fd24, %fd54;
{
.reg .b32 %temp;
mov.b64 {%r22, %temp}, %fd43;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r23}, %fd43;
}
add.s32 %r24, %r23, 1048576;
mov.b64 %fd56, {%r22, %r24};
fma.rn.f64 %fd57, %fd24, %fd56, %fd55;
mul.rn.f64 %fd58, %fd53, %fd24;
neg.f64 %fd59, %fd58;
fma.rn.f64 %fd60, %fd53, %fd24, %fd59;
fma.rn.f64 %fd61, %fd53, %fd43, %fd60;
fma.rn.f64 %fd62, %fd57, %fd24, %fd61;
mul.rn.f64 %fd63, %fd50, %fd58;
neg.f64 %fd64, %fd63;
fma.rn.f64 %fd65, %fd50, %fd58, %fd64;
fma.rn.f64 %fd66, %fd50, %fd62, %fd65;
fma.rn.f64 %fd67, %fd52, %fd58, %fd66;
add.f64 %fd68, %fd63, %fd67;
sub.f64 %fd69, %fd63, %fd68;
add.f64 %fd70, %fd67, %fd69;
add.f64 %fd71, %fd24, %fd68;
sub.f64 %fd72, %fd24, %fd71;
add.f64 %fd73, %fd68, %fd72;
add.f64 %fd74, %fd70, %fd73;
add.f64 %fd75, %fd43, %fd74;
add.f64 %fd76, %fd71, %fd75;
sub.f64 %fd77, %fd71, %fd76;
add.f64 %fd78, %fd75, %fd77;
xor.b32 %r25, %r51, -2147483648;
mov.u32 %r26, 1127219200;
mov.b64 %fd79, {%r25, %r26};
mov.u32 %r27, -2147483648;
mov.b64 %fd80, {%r27, %r26};
sub.f64 %fd81, %fd79, %fd80;
mov.f64 %fd82, 0d3FE62E42FEFA39EF;
fma.rn.f64 %fd83, %fd81, %fd82, %fd76;
neg.f64 %fd84, %fd81;
fma.rn.f64 %fd85, %fd84, %fd82, %fd83;
sub.f64 %fd86, %fd85, %fd76;
sub.f64 %fd87, %fd78, %fd86;
mov.f64 %fd88, 0d3C7ABC9E3B39803F;
fma.rn.f64 %fd89, %fd81, %fd88, %fd87;
add.f64 %fd90, %fd83, %fd89;
sub.f64 %fd91, %fd83, %fd90;
add.f64 %fd92, %fd89, %fd91;
{
.reg .b32 %temp;
mov.b64 {%temp, %r28}, %fd13;
}
add.s32 %r29, %r28, %r28;
setp.gt.u32 %p3, %r29, -33554433;
and.b32 %r30, %r28, -15728641;
selp.b32 %r31, %r30, %r28, %p3;
{
.reg .b32 %temp;
mov.b64 {%r32, %temp}, %fd13;
}
mov.b64 %fd93, {%r32, %r31};
mul.rn.f64 %fd94, %fd90, %fd93;
neg.f64 %fd95, %fd94;
fma.rn.f64 %fd96, %fd90, %fd93, %fd95;
fma.rn.f64 %fd97, %fd92, %fd93, %fd96;
add.f64 %fd4, %fd94, %fd97;
sub.f64 %fd98, %fd94, %fd4;
add.f64 %fd5, %fd97, %fd98;
mov.f64 %fd99, 0d4338000000000000;
mov.f64 %fd100, 0d3FF71547652B82FE;
fma.rn.f64 %fd101, %fd4, %fd100, %fd99;
{
.reg .b32 %temp;
mov.b64 {%r13, %temp}, %fd101;
}
mov.f64 %fd102, 0dC338000000000000;
add.rn.f64 %fd103, %fd101, %fd102;
mov.f64 %fd104, 0dBFE62E42FEFA39EF;
fma.rn.f64 %fd105, %fd103, %fd104, %fd4;
mov.f64 %fd106, 0dBC7ABC9E3B39803F;
fma.rn.f64 %fd107, %fd103, %fd106, %fd105;
mov.f64 %fd108, 0d3E928AF3FCA213EA;
mov.f64 %fd109, 0d3E5ADE1569CE2BDF;
fma.rn.f64 %fd110, %fd109, %fd107, %fd108;
mov.f64 %fd111, 0d3EC71DEE62401315;
fma.rn.f64 %fd112, %fd110, %fd107, %fd111;
mov.f64 %fd113, 0d3EFA01997C89EB71;
fma.rn.f64 %fd114, %fd112, %fd107, %fd113;
mov.f64 %fd115, 0d3F2A01A014761F65;
fma.rn.f64 %fd116, %fd114, %fd107, %fd115;
mov.f64 %fd117, 0d3F56C16C1852B7AF;
fma.rn.f64 %fd118, %fd116, %fd107, %fd117;
mov.f64 %fd119, 0d3F81111111122322;
fma.rn.f64 %fd120, %fd118, %fd107, %fd119;
mov.f64 %fd121, 0d3FA55555555502A1;
fma.rn.f64 %fd122, %fd120, %fd107, %fd121;
mov.f64 %fd123, 0d3FC5555555555511;
fma.rn.f64 %fd124, %fd122, %fd107, %fd123;
mov.f64 %fd125, 0d3FE000000000000B;
fma.rn.f64 %fd126, %fd124, %fd107, %fd125;
fma.rn.f64 %fd127, %fd126, %fd107, %fd18;
fma.rn.f64 %fd128, %fd127, %fd107, %fd18;
{
.reg .b32 %temp;
mov.b64 {%r14, %temp}, %fd128;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r15}, %fd128;
}
shl.b32 %r33, %r13, 20;
add.s32 %r34, %r15, %r33;
mov.b64 %fd133, {%r14, %r34};
{
.reg .b32 %temp;
mov.b64 {%temp, %r35}, %fd4;
}
mov.b32 %f2, %r35;
abs.f32 %f1, %f2;
setp.lt.f32 %p4, %f1, 0f4086232B;
@%p4 bra BB23_7;
setp.lt.f64 %p5, %fd4, 0d0000000000000000;
add.f64 %fd129, %fd4, 0d7FF0000000000000;
selp.f64 %fd133, 0d0000000000000000, %fd129, %p5;
setp.geu.f32 %p6, %f1, 0f40874800;
@%p6 bra BB23_7;
shr.u32 %r36, %r13, 31;
add.s32 %r37, %r13, %r36;
shr.s32 %r38, %r37, 1;
shl.b32 %r39, %r38, 20;
add.s32 %r40, %r39, %r15;
mov.b64 %fd130, {%r14, %r40};
sub.s32 %r41, %r13, %r38;
shl.b32 %r42, %r41, 20;
add.s32 %r43, %r42, 1072693248;
mov.u32 %r44, 0;
mov.b64 %fd131, {%r44, %r43};
mul.f64 %fd133, %fd130, %fd131;
BB23_7:
{
.reg .b32 %temp;
mov.b64 {%temp, %r45}, %fd133;
}
and.b32 %r46, %r45, 2147483647;
setp.ne.s32 %p7, %r46, 2146435072;
{
.reg .b32 %temp;
mov.b64 {%r47, %temp}, %fd133;
}
setp.ne.s32 %p8, %r47, 0;
or.pred %p9, %p8, %p7;
@!%p9 bra BB23_9;
bra.uni BB23_8;
BB23_8:
fma.rn.f64 %fd133, %fd133, %fd5, %fd133;
BB23_9:
st.param.f64 [func_retval0+0], %fd133;
ret;
}