// kernels/SystemML.ptx
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-21112126
// Cuda compilation tools, release 8.0, V8.0.43
// Based on LLVM 3.4svn
//

.version 5.0
.target sm_30
.address_size 64

	// .globl	copy_u2l_dense
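// Forward declaration of the device-side double-precision pow routine; it is
// called from the pow opcode (op == 4) of matrix_matrix_cellwise_op and
// matrix_scalar_op below.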
.func  (.param .b64 func_retval0) __internal_accurate_pow
(
	.param .b64 __internal_accurate_pow_param_0,
	.param .b64 __internal_accurate_pow_param_1
)
;
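// Dynamically sized shared-memory scratch buffer (extent fixed at launch);
// the reduce_* kernels below stage their per-block partial results here.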
.extern .shared .align 8 .b8 sdata[];

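// copy_u2l_dense(A, dim, N): one thread per element of a dense, row-major
// dim x dim matrix. For each strictly-lower-triangular destination
// (iy > ix, with linear index iy*dim+ix < N) the kernel copies the mirrored
// upper-triangle element A[ix*dim+iy] into A[iy*dim+ix], i.e. it
// symmetrizes A in place from its upper triangle.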
.visible .entry copy_u2l_dense(
	.param .u64 copy_u2l_dense_param_0,
	.param .u32 copy_u2l_dense_param_1,
	.param .u32 copy_u2l_dense_param_2
)
{
	.reg .pred 	%p<4>;
	.reg .b32 	%r<13>;
	.reg .f64 	%fd<2>;
	.reg .b64 	%rd<7>;


	ld.param.u64 	%rd1, [copy_u2l_dense_param_0];
	ld.param.u32 	%r4, [copy_u2l_dense_param_1];
	ld.param.u32 	%r5, [copy_u2l_dense_param_2];
	mov.u32 	%r6, %ntid.x;
	mov.u32 	%r7, %ctaid.x;
	mov.u32 	%r8, %tid.x;
	mad.lo.s32 	%r1, %r6, %r7, %r8;
	mov.u32 	%r9, %ntid.y;
	mov.u32 	%r10, %ctaid.y;
	mov.u32 	%r11, %tid.y;
	mad.lo.s32 	%r2, %r9, %r10, %r11;
	mad.lo.s32 	%r3, %r2, %r4, %r1;
	setp.gt.s32	%p1, %r2, %r1;
	setp.lt.s32	%p2, %r3, %r5;
	and.pred  	%p3, %p1, %p2;
	@!%p3 bra 	BB0_2;
	bra.uni 	BB0_1;

BB0_1:
	cvta.to.global.u64 	%rd2, %rd1;
	mad.lo.s32 	%r12, %r1, %r4, %r2;
	mul.wide.s32 	%rd3, %r12, 8;
	add.s64 	%rd4, %rd2, %rd3;
	ld.global.f64 	%fd1, [%rd4];
	mul.wide.s32 	%rd5, %r3, 8;
	add.s64 	%rd6, %rd2, %rd5;
	st.global.f64 	[%rd6], %fd1;

BB0_2:
	ret;
}

	// .globl	relu
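// relu(in, out, rows, cols): elementwise out[k] = max(0.0, in[k]) over a
// row-major rows x cols matrix, one thread per element on a 2-D grid.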
.visible .entry relu(
	.param .u64 relu_param_0,
	.param .u64 relu_param_1,
	.param .u32 relu_param_2,
	.param .u32 relu_param_3
)
{
	.reg .pred 	%p<4>;
	.reg .b32 	%r<12>;
	.reg .f64 	%fd<4>;
	.reg .b64 	%rd<8>;


	ld.param.u64 	%rd1, [relu_param_0];
	ld.param.u64 	%rd2, [relu_param_1];
	ld.param.u32 	%r4, [relu_param_2];
	ld.param.u32 	%r3, [relu_param_3];
	mov.u32 	%r5, %ctaid.x;
	mov.u32 	%r6, %ntid.x;
	mov.u32 	%r7, %tid.x;
	mad.lo.s32 	%r1, %r6, %r5, %r7;
	mov.u32 	%r8, %ntid.y;
	mov.u32 	%r9, %ctaid.y;
	mov.u32 	%r10, %tid.y;
	mad.lo.s32 	%r2, %r8, %r9, %r10;
	setp.lt.s32	%p1, %r1, %r4;
	setp.lt.s32	%p2, %r2, %r3;
	and.pred  	%p3, %p1, %p2;
	@!%p3 bra 	BB1_2;
	bra.uni 	BB1_1;

BB1_1:
	cvta.to.global.u64 	%rd3, %rd1;
	mad.lo.s32 	%r11, %r1, %r3, %r2;
	mul.wide.s32 	%rd4, %r11, 8;
	add.s64 	%rd5, %rd3, %rd4;
	ld.global.f64 	%fd1, [%rd5];
	mov.f64 	%fd2, 0d0000000000000000;
	max.f64 	%fd3, %fd2, %fd1;
	cvta.to.global.u64 	%rd6, %rd2;
	add.s64 	%rd7, %rd6, %rd4;
	st.global.f64 	[%rd7], %fd3;

BB1_2:
	ret;
}

	// .globl	relu_backward
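// relu_backward(X, dout, out, rows, cols): out[k] = dout[k] where X[k] > 0.0
// and 0.0 otherwise; the setp.leu (unordered <=) predicate sends NaN inputs
// to the 0.0 branch as well.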
.visible .entry relu_backward(
	.param .u64 relu_backward_param_0,
	.param .u64 relu_backward_param_1,
	.param .u64 relu_backward_param_2,
	.param .u32 relu_backward_param_3,
	.param .u32 relu_backward_param_4
)
{
	.reg .pred 	%p<5>;
	.reg .b32 	%r<12>;
	.reg .f64 	%fd<6>;
	.reg .b64 	%rd<14>;


	ld.param.u64 	%rd2, [relu_backward_param_0];
	ld.param.u64 	%rd3, [relu_backward_param_1];
	ld.param.u64 	%rd4, [relu_backward_param_2];
	ld.param.u32 	%r4, [relu_backward_param_3];
	ld.param.u32 	%r3, [relu_backward_param_4];
	mov.u32 	%r5, %ntid.x;
	mov.u32 	%r6, %ctaid.x;
	mov.u32 	%r7, %tid.x;
	mad.lo.s32 	%r1, %r5, %r6, %r7;
	mov.u32 	%r8, %ntid.y;
	mov.u32 	%r9, %ctaid.y;
	mov.u32 	%r10, %tid.y;
	mad.lo.s32 	%r2, %r8, %r9, %r10;
	setp.lt.s32	%p1, %r1, %r4;
	setp.lt.s32	%p2, %r2, %r3;
	and.pred  	%p3, %p1, %p2;
	@!%p3 bra 	BB2_4;
	bra.uni 	BB2_1;

BB2_1:
	cvta.to.global.u64 	%rd5, %rd2;
	mad.lo.s32 	%r11, %r1, %r3, %r2;
	cvt.s64.s32	%rd1, %r11;
	mul.wide.s32 	%rd6, %r11, 8;
	add.s64 	%rd7, %rd5, %rd6;
	ld.global.f64 	%fd4, [%rd7];
	mov.f64 	%fd5, 0d0000000000000000;
	setp.leu.f64	%p4, %fd4, 0d0000000000000000;
	@%p4 bra 	BB2_3;

	cvta.to.global.u64 	%rd8, %rd3;
	shl.b64 	%rd9, %rd1, 3;
	add.s64 	%rd10, %rd8, %rd9;
	ld.global.f64 	%fd5, [%rd10];

BB2_3:
	cvta.to.global.u64 	%rd11, %rd4;
	shl.b64 	%rd12, %rd1, 3;
	add.s64 	%rd13, %rd11, %rd12;
	st.global.f64 	[%rd13], %fd5;

BB2_4:
	ret;
}

	// .globl	bias_add
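// bias_add(in, bias, out, rows, cols, biasSpan): out[r*cols+c] =
// in[r*cols+c] + bias[c / biasSpan], i.e. one bias value is shared by each
// group of biasSpan consecutive columns ("biasSpan" is a descriptive name
// for param_5, not one taken from the source).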
.visible .entry bias_add(
	.param .u64 bias_add_param_0,
	.param .u64 bias_add_param_1,
	.param .u64 bias_add_param_2,
	.param .u32 bias_add_param_3,
	.param .u32 bias_add_param_4,
	.param .u32 bias_add_param_5
)
{
	.reg .pred 	%p<4>;
	.reg .b32 	%r<14>;
	.reg .f64 	%fd<4>;
	.reg .b64 	%rd<12>;


	ld.param.u64 	%rd1, [bias_add_param_0];
	ld.param.u64 	%rd2, [bias_add_param_1];
	ld.param.u64 	%rd3, [bias_add_param_2];
	ld.param.u32 	%r5, [bias_add_param_3];
	ld.param.u32 	%r3, [bias_add_param_4];
	ld.param.u32 	%r4, [bias_add_param_5];
	mov.u32 	%r6, %ctaid.x;
	mov.u32 	%r7, %ntid.x;
	mov.u32 	%r8, %tid.x;
	mad.lo.s32 	%r1, %r7, %r6, %r8;
	mov.u32 	%r9, %ntid.y;
	mov.u32 	%r10, %ctaid.y;
	mov.u32 	%r11, %tid.y;
	mad.lo.s32 	%r2, %r9, %r10, %r11;
	setp.lt.s32	%p1, %r1, %r5;
	setp.lt.s32	%p2, %r2, %r3;
	and.pred  	%p3, %p1, %p2;
	@!%p3 bra 	BB3_2;
	bra.uni 	BB3_1;

BB3_1:
	cvta.to.global.u64 	%rd4, %rd1;
	mad.lo.s32 	%r12, %r1, %r3, %r2;
	mul.wide.s32 	%rd5, %r12, 8;
	add.s64 	%rd6, %rd4, %rd5;
	div.s32 	%r13, %r2, %r4;
	cvta.to.global.u64 	%rd7, %rd2;
	mul.wide.s32 	%rd8, %r13, 8;
	add.s64 	%rd9, %rd7, %rd8;
	ld.global.f64 	%fd1, [%rd9];
	ld.global.f64 	%fd2, [%rd6];
	add.f64 	%fd3, %fd2, %fd1;
	cvta.to.global.u64 	%rd10, %rd3;
	add.s64 	%rd11, %rd10, %rd5;
	st.global.f64 	[%rd11], %fd3;

BB3_2:
	ret;
}

	// .globl	daxpy_matrix_vector
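// daxpy_matrix_vector(A, x, alpha, out, rows, cols, flag, _): computes
// out[i*cols+j] = A[i*cols+j] + alpha * v, where v = x[j] when flag
// (param_6) == 1 and v = x[i] otherwise, i.e. the vector appears to be
// broadcast along one dimension or the other. param_7 is declared but never
// loaded in this body.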
.visible .entry daxpy_matrix_vector(
	.param .u64 daxpy_matrix_vector_param_0,
	.param .u64 daxpy_matrix_vector_param_1,
	.param .f64 daxpy_matrix_vector_param_2,
	.param .u64 daxpy_matrix_vector_param_3,
	.param .u32 daxpy_matrix_vector_param_4,
	.param .u32 daxpy_matrix_vector_param_5,
	.param .u32 daxpy_matrix_vector_param_6,
	.param .u32 daxpy_matrix_vector_param_7
)
{
	.reg .pred 	%p<5>;
	.reg .b32 	%r<13>;
	.reg .f64 	%fd<7>;
	.reg .b64 	%rd<14>;


	ld.param.u64 	%rd3, [daxpy_matrix_vector_param_0];
	ld.param.u64 	%rd5, [daxpy_matrix_vector_param_1];
	ld.param.f64 	%fd2, [daxpy_matrix_vector_param_2];
	ld.param.u64 	%rd4, [daxpy_matrix_vector_param_3];
	ld.param.u32 	%r5, [daxpy_matrix_vector_param_4];
	ld.param.u32 	%r3, [daxpy_matrix_vector_param_5];
	ld.param.u32 	%r4, [daxpy_matrix_vector_param_6];
	cvta.to.global.u64 	%rd1, %rd5;
	mov.u32 	%r6, %ntid.x;
	mov.u32 	%r7, %ctaid.x;
	mov.u32 	%r8, %tid.x;
	mad.lo.s32 	%r1, %r6, %r7, %r8;
	mov.u32 	%r9, %ntid.y;
	mov.u32 	%r10, %ctaid.y;
	mov.u32 	%r11, %tid.y;
	mad.lo.s32 	%r2, %r9, %r10, %r11;
	setp.lt.s32	%p1, %r1, %r5;
	setp.lt.s32	%p2, %r2, %r3;
	and.pred  	%p3, %p1, %p2;
	@!%p3 bra 	BB4_4;
	bra.uni 	BB4_1;

BB4_1:
	cvta.to.global.u64 	%rd6, %rd4;
	mad.lo.s32 	%r12, %r1, %r3, %r2;
	cvta.to.global.u64 	%rd7, %rd3;
	mul.wide.s32 	%rd8, %r12, 8;
	add.s64 	%rd9, %rd7, %rd8;
	ld.global.f64 	%fd1, [%rd9];
	add.s64 	%rd2, %rd6, %rd8;
	setp.eq.s32	%p4, %r4, 1;
	@%p4 bra 	BB4_3;
	bra.uni 	BB4_2;

BB4_3:
	mul.wide.s32 	%rd12, %r2, 8;
	add.s64 	%rd13, %rd1, %rd12;
	ld.global.f64 	%fd5, [%rd13];
	fma.rn.f64 	%fd6, %fd5, %fd2, %fd1;
	st.global.f64 	[%rd2], %fd6;
	bra.uni 	BB4_4;

BB4_2:
	mul.wide.s32 	%rd10, %r1, 8;
	add.s64 	%rd11, %rd1, %rd10;
	ld.global.f64 	%fd3, [%rd11];
	fma.rn.f64 	%fd4, %fd3, %fd2, %fd1;
	st.global.f64 	[%rd2], %fd4;

BB4_4:
	ret;
}

	// .globl	bias_multiply
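// bias_multiply(in, bias, out, rows, cols, biasSpan): identical indexing to
// bias_add above, but multiplies:
// out[r*cols+c] = in[r*cols+c] * bias[c / biasSpan].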
.visible .entry bias_multiply(
	.param .u64 bias_multiply_param_0,
	.param .u64 bias_multiply_param_1,
	.param .u64 bias_multiply_param_2,
	.param .u32 bias_multiply_param_3,
	.param .u32 bias_multiply_param_4,
	.param .u32 bias_multiply_param_5
)
{
	.reg .pred 	%p<4>;
	.reg .b32 	%r<14>;
	.reg .f64 	%fd<4>;
	.reg .b64 	%rd<12>;


	ld.param.u64 	%rd1, [bias_multiply_param_0];
	ld.param.u64 	%rd2, [bias_multiply_param_1];
	ld.param.u64 	%rd3, [bias_multiply_param_2];
	ld.param.u32 	%r5, [bias_multiply_param_3];
	ld.param.u32 	%r3, [bias_multiply_param_4];
	ld.param.u32 	%r4, [bias_multiply_param_5];
	mov.u32 	%r6, %ctaid.x;
	mov.u32 	%r7, %ntid.x;
	mov.u32 	%r8, %tid.x;
	mad.lo.s32 	%r1, %r7, %r6, %r8;
	mov.u32 	%r9, %ntid.y;
	mov.u32 	%r10, %ctaid.y;
	mov.u32 	%r11, %tid.y;
	mad.lo.s32 	%r2, %r9, %r10, %r11;
	setp.lt.s32	%p1, %r1, %r5;
	setp.lt.s32	%p2, %r2, %r3;
	and.pred  	%p3, %p1, %p2;
	@!%p3 bra 	BB5_2;
	bra.uni 	BB5_1;

BB5_1:
	cvta.to.global.u64 	%rd4, %rd1;
	mad.lo.s32 	%r12, %r1, %r3, %r2;
	mul.wide.s32 	%rd5, %r12, 8;
	add.s64 	%rd6, %rd4, %rd5;
	div.s32 	%r13, %r2, %r4;
	cvta.to.global.u64 	%rd7, %rd2;
	mul.wide.s32 	%rd8, %r13, 8;
	add.s64 	%rd9, %rd7, %rd8;
	ld.global.f64 	%fd1, [%rd9];
	ld.global.f64 	%fd2, [%rd6];
	mul.f64 	%fd3, %fd2, %fd1;
	cvta.to.global.u64 	%rd10, %rd3;
	add.s64 	%rd11, %rd10, %rd5;
	st.global.f64 	[%rd11], %fd3;

BB5_2:
	ret;
}

	// .globl	compare_and_set
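// compare_and_set(in, out, rows, cols, compareVal, tol, ifEqualVal,
// ifLessThanVal, ifGreaterThanVal): for x = in[r*cols+c], stores ifEqualVal
// when |x - compareVal| < tol, ifLessThanVal when x < compareVal, and
// ifGreaterThanVal otherwise (parameter names here are descriptive, taken
// from the roles of the stored constants).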
.visible .entry compare_and_set(
	.param .u64 compare_and_set_param_0,
	.param .u64 compare_and_set_param_1,
	.param .u32 compare_and_set_param_2,
	.param .u32 compare_and_set_param_3,
	.param .f64 compare_and_set_param_4,
	.param .f64 compare_and_set_param_5,
	.param .f64 compare_and_set_param_6,
	.param .f64 compare_and_set_param_7,
	.param .f64 compare_and_set_param_8
)
{
	.reg .pred 	%p<6>;
	.reg .b32 	%r<12>;
	.reg .f64 	%fd<9>;
	.reg .b64 	%rd<8>;


	ld.param.u64 	%rd2, [compare_and_set_param_0];
	ld.param.u64 	%rd3, [compare_and_set_param_1];
	ld.param.u32 	%r2, [compare_and_set_param_2];
	ld.param.u32 	%r3, [compare_and_set_param_3];
	ld.param.f64 	%fd2, [compare_and_set_param_4];
	ld.param.f64 	%fd3, [compare_and_set_param_5];
	ld.param.f64 	%fd4, [compare_and_set_param_6];
	ld.param.f64 	%fd5, [compare_and_set_param_7];
	ld.param.f64 	%fd6, [compare_and_set_param_8];
	mov.u32 	%r4, %ctaid.x;
	mov.u32 	%r5, %ntid.x;
	mov.u32 	%r6, %tid.x;
	mad.lo.s32 	%r7, %r5, %r4, %r6;
	mov.u32 	%r8, %ntid.y;
	mov.u32 	%r9, %ctaid.y;
	mov.u32 	%r10, %tid.y;
	mad.lo.s32 	%r11, %r8, %r9, %r10;
	mad.lo.s32 	%r1, %r7, %r3, %r11;
	setp.lt.s32	%p1, %r7, %r2;
	setp.lt.s32	%p2, %r11, %r3;
	and.pred  	%p3, %p1, %p2;
	@!%p3 bra 	BB6_6;
	bra.uni 	BB6_1;

BB6_1:
	cvta.to.global.u64 	%rd4, %rd2;
	mul.wide.s32 	%rd5, %r1, 8;
	add.s64 	%rd6, %rd4, %rd5;
	ld.global.f64 	%fd1, [%rd6];
	sub.f64 	%fd7, %fd1, %fd2;
	abs.f64 	%fd8, %fd7;
	setp.lt.f64	%p4, %fd8, %fd3;
	cvta.to.global.u64 	%rd7, %rd3;
	add.s64 	%rd1, %rd7, %rd5;
	@%p4 bra 	BB6_5;
	bra.uni 	BB6_2;

BB6_5:
	st.global.f64 	[%rd1], %fd4;
	bra.uni 	BB6_6;

BB6_2:
	setp.lt.f64	%p5, %fd1, %fd2;
	@%p5 bra 	BB6_4;
	bra.uni 	BB6_3;

BB6_4:
	st.global.f64 	[%rd1], %fd5;
	bra.uni 	BB6_6;

BB6_3:
	st.global.f64 	[%rd1], %fd6;

BB6_6:
	ret;
}

	// .globl	matrix_matrix_cellwise_op
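// matrix_matrix_cellwise_op(A, B, C, maxX, maxY, vecStatusA, vecStatusB, op):
// elementwise C[ix*maxY+iy] = A' op B', with broadcasting: an operand's index
// collapses to ix when its vecStatus parameter is 1 and to iy when it is 2
// (full linear index otherwise). Opcodes: 0 add, 1 sub, 2 mul, 3 div, 4 pow
// (via __internal_accurate_pow plus inlined edge-case fixups), 5 <, 6 <=,
// 7 >, 8 >=, 9 ==, 10 !=, 11 min, 12 max; comparisons yield 1.0/0.0. The
// result register is pre-seeded with DBL_MAX, which is what any unrecognized
// opcode ends up storing.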
.visible .entry matrix_matrix_cellwise_op(
	.param .u64 matrix_matrix_cellwise_op_param_0,
	.param .u64 matrix_matrix_cellwise_op_param_1,
	.param .u64 matrix_matrix_cellwise_op_param_2,
	.param .u32 matrix_matrix_cellwise_op_param_3,
	.param .u32 matrix_matrix_cellwise_op_param_4,
	.param .u32 matrix_matrix_cellwise_op_param_5,
	.param .u32 matrix_matrix_cellwise_op_param_6,
	.param .u32 matrix_matrix_cellwise_op_param_7
)
{
	.reg .pred 	%p<54>;
	.reg .b32 	%r<55>;
	.reg .f64 	%fd<39>;
	.reg .b64 	%rd<15>;


	ld.param.u64 	%rd2, [matrix_matrix_cellwise_op_param_0];
	ld.param.u64 	%rd3, [matrix_matrix_cellwise_op_param_1];
	ld.param.u64 	%rd4, [matrix_matrix_cellwise_op_param_2];
	ld.param.u32 	%r14, [matrix_matrix_cellwise_op_param_3];
	ld.param.u32 	%r10, [matrix_matrix_cellwise_op_param_4];
	ld.param.u32 	%r11, [matrix_matrix_cellwise_op_param_5];
	ld.param.u32 	%r12, [matrix_matrix_cellwise_op_param_6];
	ld.param.u32 	%r13, [matrix_matrix_cellwise_op_param_7];
	mov.u32 	%r15, %ntid.x;
	mov.u32 	%r16, %ctaid.x;
	mov.u32 	%r17, %tid.x;
	mad.lo.s32 	%r1, %r15, %r16, %r17;
	mov.u32 	%r18, %ntid.y;
	mov.u32 	%r19, %ctaid.y;
	mov.u32 	%r20, %tid.y;
	mad.lo.s32 	%r2, %r18, %r19, %r20;
	setp.lt.s32	%p2, %r1, %r14;
	setp.lt.s32	%p3, %r2, %r10;
	and.pred  	%p4, %p2, %p3;
	@!%p4 bra 	BB7_53;
	bra.uni 	BB7_1;

BB7_1:
	mad.lo.s32 	%r3, %r1, %r10, %r2;
	setp.eq.s32	%p5, %r11, 1;
	mov.u32 	%r53, %r1;
	@%p5 bra 	BB7_5;

	setp.ne.s32	%p6, %r11, 2;
	mov.u32 	%r54, %r3;
	@%p6 bra 	BB7_4;

	mov.u32 	%r54, %r2;

BB7_4:
	mov.u32 	%r48, %r54;
	mov.u32 	%r4, %r48;
	mov.u32 	%r53, %r4;

BB7_5:
	mov.u32 	%r5, %r53;
	setp.eq.s32	%p7, %r12, 1;
	mov.u32 	%r51, %r1;
	@%p7 bra 	BB7_9;

	setp.ne.s32	%p8, %r12, 2;
	mov.u32 	%r52, %r3;
	@%p8 bra 	BB7_8;

	mov.u32 	%r52, %r2;

BB7_8:
	mov.u32 	%r51, %r52;

BB7_9:
	cvta.to.global.u64 	%rd5, %rd3;
	cvta.to.global.u64 	%rd6, %rd2;
	mul.wide.s32 	%rd7, %r5, 8;
	add.s64 	%rd8, %rd6, %rd7;
	ld.global.f64 	%fd1, [%rd8];
	mul.wide.s32 	%rd9, %r51, 8;
	add.s64 	%rd10, %rd5, %rd9;
	ld.global.f64 	%fd2, [%rd10];
	mov.f64 	%fd38, 0d7FEFFFFFFFFFFFFF;
	setp.gt.s32	%p9, %r13, 5;
	@%p9 bra 	BB7_19;

	setp.gt.s32	%p19, %r13, 2;
	@%p19 bra 	BB7_15;

	setp.eq.s32	%p23, %r13, 0;
	@%p23 bra 	BB7_51;

	setp.eq.s32	%p24, %r13, 1;
	@%p24 bra 	BB7_50;
	bra.uni 	BB7_13;

BB7_50:
	sub.f64 	%fd38, %fd1, %fd2;
	bra.uni 	BB7_52;

BB7_19:
	setp.gt.s32	%p10, %r13, 8;
	@%p10 bra 	BB7_24;

	setp.eq.s32	%p16, %r13, 6;
	@%p16 bra 	BB7_34;

	setp.eq.s32	%p17, %r13, 7;
	@%p17 bra 	BB7_33;
	bra.uni 	BB7_22;

BB7_33:
	setp.gt.f64	%p29, %fd1, %fd2;
	selp.f64	%fd38, 0d3FF0000000000000, 0d0000000000000000, %p29;
	bra.uni 	BB7_52;

BB7_15:
	setp.eq.s32	%p20, %r13, 3;
	@%p20 bra 	BB7_49;

	setp.eq.s32	%p21, %r13, 4;
	@%p21 bra 	BB7_35;
	bra.uni 	BB7_17;

BB7_35:
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r8}, %fd1;
	}
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r9}, %fd2;
	}
	bfe.u32 	%r21, %r9, 20, 11;
	add.s32 	%r22, %r21, -1012;
	mov.b64 	 %rd11, %fd2;
	shl.b64 	%rd1, %rd11, %r22;
	setp.eq.s64	%p32, %rd1, -9223372036854775808;
	abs.f64 	%fd11, %fd1;
	// Callseq Start 0
	{
	.reg .b32 temp_param_reg;
	// }
	.param .b64 param0;
	st.param.f64	[param0+0], %fd11;
	.param .b64 param1;
	st.param.f64	[param1+0], %fd2;
	.param .b64 retval0;
	call.uni (retval0), 
	__internal_accurate_pow, 
	(
	param0, 
	param1
	);
	ld.param.f64	%fd37, [retval0+0];
	
	//{
	}// Callseq End 0
	setp.lt.s32	%p33, %r8, 0;
	and.pred  	%p1, %p33, %p32;
	@!%p1 bra 	BB7_37;
	bra.uni 	BB7_36;

BB7_36:
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r23}, %fd37;
	}
	xor.b32  	%r24, %r23, -2147483648;
	{
	.reg .b32 %temp; 
	mov.b64 	{%r25, %temp}, %fd37;
	}
	mov.b64 	%fd37, {%r25, %r24};

BB7_37:
	mov.f64 	%fd36, %fd37;
	setp.eq.f64	%p34, %fd1, 0d0000000000000000;
	@%p34 bra 	BB7_40;
	bra.uni 	BB7_38;

BB7_40:
	selp.b32	%r26, %r8, 0, %p32;
	or.b32  	%r27, %r26, 2146435072;
	setp.lt.s32	%p38, %r9, 0;
	selp.b32	%r28, %r27, %r26, %p38;
	mov.u32 	%r29, 0;
	mov.b64 	%fd36, {%r29, %r28};
	bra.uni 	BB7_41;

BB7_24:
	setp.gt.s32	%p11, %r13, 10;
	@%p11 bra 	BB7_28;

	setp.eq.s32	%p14, %r13, 9;
	@%p14 bra 	BB7_32;
	bra.uni 	BB7_26;

BB7_32:
	setp.eq.f64	%p27, %fd1, %fd2;
	selp.f64	%fd38, 0d3FF0000000000000, 0d0000000000000000, %p27;
	bra.uni 	BB7_52;

BB7_28:
	setp.eq.s32	%p12, %r13, 11;
	@%p12 bra 	BB7_31;
	bra.uni 	BB7_29;

BB7_31:
	min.f64 	%fd38, %fd1, %fd2;
	bra.uni 	BB7_52;

BB7_51:
	add.f64 	%fd38, %fd1, %fd2;
	bra.uni 	BB7_52;

BB7_13:
	setp.eq.s32	%p25, %r13, 2;
	@%p25 bra 	BB7_14;
	bra.uni 	BB7_52;

BB7_14:
	mul.f64 	%fd38, %fd1, %fd2;
	bra.uni 	BB7_52;

BB7_34:
	setp.le.f64	%p30, %fd1, %fd2;
	selp.f64	%fd38, 0d3FF0000000000000, 0d0000000000000000, %p30;
	bra.uni 	BB7_52;

BB7_22:
	setp.eq.s32	%p18, %r13, 8;
	@%p18 bra 	BB7_23;
	bra.uni 	BB7_52;

BB7_23:
	setp.ge.f64	%p28, %fd1, %fd2;
	selp.f64	%fd38, 0d3FF0000000000000, 0d0000000000000000, %p28;
	bra.uni 	BB7_52;

BB7_49:
	div.rn.f64 	%fd38, %fd1, %fd2;
	bra.uni 	BB7_52;

BB7_17:
	setp.eq.s32	%p22, %r13, 5;
	@%p22 bra 	BB7_18;
	bra.uni 	BB7_52;

BB7_18:
	setp.lt.f64	%p31, %fd1, %fd2;
	selp.f64	%fd38, 0d3FF0000000000000, 0d0000000000000000, %p31;
	bra.uni 	BB7_52;

BB7_26:
	setp.eq.s32	%p15, %r13, 10;
	@%p15 bra 	BB7_27;
	bra.uni 	BB7_52;

BB7_27:
	setp.neu.f64	%p26, %fd1, %fd2;
	selp.f64	%fd38, 0d3FF0000000000000, 0d0000000000000000, %p26;
	bra.uni 	BB7_52;

BB7_29:
	setp.ne.s32	%p13, %r13, 12;
	@%p13 bra 	BB7_52;

	max.f64 	%fd38, %fd1, %fd2;
	bra.uni 	BB7_52;

BB7_38:
	setp.gt.s32	%p35, %r8, -1;
	@%p35 bra 	BB7_41;

	cvt.rzi.f64.f64	%fd29, %fd2;
	setp.neu.f64	%p36, %fd29, %fd2;
	selp.f64	%fd36, 0dFFF8000000000000, %fd36, %p36;

BB7_41:
	mov.f64 	%fd17, %fd36;
	add.f64 	%fd18, %fd1, %fd2;
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r30}, %fd18;
	}
	and.b32  	%r31, %r30, 2146435072;
	setp.ne.s32	%p39, %r31, 2146435072;
	mov.f64 	%fd35, %fd17;
	@%p39 bra 	BB7_48;

	setp.gtu.f64	%p40, %fd11, 0d7FF0000000000000;
	mov.f64 	%fd35, %fd18;
	@%p40 bra 	BB7_48;

	abs.f64 	%fd30, %fd2;
	setp.gtu.f64	%p41, %fd30, 0d7FF0000000000000;
	mov.f64 	%fd34, %fd18;
	mov.f64 	%fd35, %fd34;
	@%p41 bra 	BB7_48;

	{
	.reg .b32 %temp; 
	mov.b64 	{%r32, %temp}, %fd2;
	}
	and.b32  	%r33, %r9, 2147483647;
	setp.eq.s32	%p42, %r33, 2146435072;
	setp.eq.s32	%p43, %r32, 0;
	and.pred  	%p44, %p42, %p43;
	@%p44 bra 	BB7_47;
	bra.uni 	BB7_45;

BB7_47:
	setp.gt.f64	%p48, %fd11, 0d3FF0000000000000;
	selp.b32	%r41, 2146435072, 0, %p48;
	xor.b32  	%r42, %r41, 2146435072;
	setp.lt.s32	%p49, %r9, 0;
	selp.b32	%r43, %r42, %r41, %p49;
	setp.eq.f64	%p50, %fd1, 0dBFF0000000000000;
	selp.b32	%r44, 1072693248, %r43, %p50;
	mov.u32 	%r45, 0;
	mov.b64 	%fd35, {%r45, %r44};
	bra.uni 	BB7_48;

BB7_45:
	{
	.reg .b32 %temp; 
	mov.b64 	{%r34, %temp}, %fd1;
	}
	and.b32  	%r35, %r8, 2147483647;
	setp.eq.s32	%p45, %r35, 2146435072;
	setp.eq.s32	%p46, %r34, 0;
	and.pred  	%p47, %p45, %p46;
	mov.f64 	%fd35, %fd17;
	@!%p47 bra 	BB7_48;
	bra.uni 	BB7_46;

BB7_46:
	shr.s32 	%r36, %r9, 31;
	and.b32  	%r37, %r36, -2146435072;
	selp.b32	%r38, -1048576, 2146435072, %p1;
	add.s32 	%r39, %r38, %r37;
	mov.u32 	%r40, 0;
	mov.b64 	%fd35, {%r40, %r39};

BB7_48:
	setp.eq.f64	%p51, %fd2, 0d0000000000000000;
	setp.eq.f64	%p52, %fd1, 0d3FF0000000000000;
	or.pred  	%p53, %p52, %p51;
	selp.f64	%fd38, 0d3FF0000000000000, %fd35, %p53;

BB7_52:
	cvta.to.global.u64 	%rd12, %rd4;
	mul.wide.s32 	%rd13, %r3, 8;
	add.s64 	%rd14, %rd12, %rd13;
	st.global.f64 	[%rd14], %fd38;
	bar.sync 	0;

BB7_53:
	ret;
}

	// .globl	matrix_scalar_op
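// matrix_scalar_op(A, scalar, C, size, op, side): 1-D kernel over size
// elements. param_5 picks the operand order: 0 computes A[i] op scalar, any
// other value computes scalar op A[i] (the two large branches below are
// mirror images). Opcodes are the same 0..12 set as in
// matrix_matrix_cellwise_op.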
.visible .entry matrix_scalar_op(
	.param .u64 matrix_scalar_op_param_0,
	.param .f64 matrix_scalar_op_param_1,
	.param .u64 matrix_scalar_op_param_2,
	.param .u32 matrix_scalar_op_param_3,
	.param .u32 matrix_scalar_op_param_4,
	.param .u32 matrix_scalar_op_param_5
)
{
	.reg .pred 	%p<95>;
	.reg .b32 	%r<62>;
	.reg .f64 	%fd<75>;
	.reg .b64 	%rd<12>;


	ld.param.u64 	%rd4, [matrix_scalar_op_param_0];
	ld.param.f64 	%fd52, [matrix_scalar_op_param_1];
	ld.param.u64 	%rd5, [matrix_scalar_op_param_2];
	ld.param.u32 	%r8, [matrix_scalar_op_param_3];
	ld.param.u32 	%r6, [matrix_scalar_op_param_4];
	ld.param.u32 	%r7, [matrix_scalar_op_param_5];
	mov.u32 	%r9, %ctaid.x;
	mov.u32 	%r10, %ntid.x;
	mov.u32 	%r11, %tid.x;
	mad.lo.s32 	%r1, %r10, %r9, %r11;
	setp.ge.s32	%p3, %r1, %r8;
	@%p3 bra 	BB8_90;

	cvta.to.global.u64 	%rd6, %rd5;
	cvta.to.global.u64 	%rd7, %rd4;
	mul.wide.s32 	%rd8, %r1, 8;
	add.s64 	%rd9, %rd7, %rd8;
	ld.global.f64 	%fd1, [%rd9];
	add.s64 	%rd1, %rd6, %rd8;
	setp.eq.s32	%p4, %r7, 0;
	@%p4 bra 	BB8_46;

	mov.f64 	%fd66, 0d7FEFFFFFFFFFFFFF;
	setp.gt.s32	%p5, %r6, 5;
	@%p5 bra 	BB8_12;

	setp.gt.s32	%p15, %r6, 2;
	@%p15 bra 	BB8_8;

	setp.eq.s32	%p19, %r6, 0;
	@%p19 bra 	BB8_44;

	setp.eq.s32	%p20, %r6, 1;
	@%p20 bra 	BB8_43;
	bra.uni 	BB8_6;

BB8_43:
	sub.f64 	%fd66, %fd52, %fd1;
	bra.uni 	BB8_45;

BB8_46:
	mov.f64 	%fd74, 0d7FEFFFFFFFFFFFFF;
	setp.gt.s32	%p50, %r6, 5;
	@%p50 bra 	BB8_56;

	setp.gt.s32	%p60, %r6, 2;
	@%p60 bra 	BB8_52;

	setp.eq.s32	%p64, %r6, 0;
	@%p64 bra 	BB8_88;

	setp.eq.s32	%p65, %r6, 1;
	@%p65 bra 	BB8_87;
	bra.uni 	BB8_50;

BB8_87:
	sub.f64 	%fd74, %fd1, %fd52;
	bra.uni 	BB8_89;

BB8_12:
	setp.gt.s32	%p6, %r6, 8;
	@%p6 bra 	BB8_17;

	setp.eq.s32	%p12, %r6, 6;
	@%p12 bra 	BB8_27;

	setp.eq.s32	%p13, %r6, 7;
	@%p13 bra 	BB8_26;
	bra.uni 	BB8_15;

BB8_26:
	setp.lt.f64	%p25, %fd1, %fd52;
	selp.f64	%fd66, 0d3FF0000000000000, 0d0000000000000000, %p25;
	bra.uni 	BB8_45;

BB8_56:
	setp.gt.s32	%p51, %r6, 8;
	@%p51 bra 	BB8_61;

	setp.eq.s32	%p57, %r6, 6;
	@%p57 bra 	BB8_71;

	setp.eq.s32	%p58, %r6, 7;
	@%p58 bra 	BB8_70;
	bra.uni 	BB8_59;

BB8_70:
	setp.gt.f64	%p70, %fd1, %fd52;
	selp.f64	%fd74, 0d3FF0000000000000, 0d0000000000000000, %p70;
	bra.uni 	BB8_89;

BB8_8:
	setp.eq.s32	%p16, %r6, 3;
	@%p16 bra 	BB8_42;

	setp.eq.s32	%p17, %r6, 4;
	@%p17 bra 	BB8_28;
	bra.uni 	BB8_10;

BB8_28:
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r2}, %fd52;
	}
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r3}, %fd1;
	}
	bfe.u32 	%r12, %r3, 20, 11;
	add.s32 	%r13, %r12, -1012;
	mov.b64 	 %rd10, %fd1;
	shl.b64 	%rd2, %rd10, %r13;
	setp.eq.s64	%p28, %rd2, -9223372036854775808;
	abs.f64 	%fd10, %fd52;
	// Callseq Start 1
	{
	.reg .b32 temp_param_reg;
	// }
	.param .b64 param0;
	st.param.f64	[param0+0], %fd10;
	.param .b64 param1;
	st.param.f64	[param1+0], %fd1;
	.param .b64 retval0;
	call.uni (retval0), 
	__internal_accurate_pow, 
	(
	param0, 
	param1
	);
	ld.param.f64	%fd65, [retval0+0];
	
	//{
	}// Callseq End 1
	setp.lt.s32	%p29, %r2, 0;
	and.pred  	%p1, %p29, %p28;
	@!%p1 bra 	BB8_30;
	bra.uni 	BB8_29;

BB8_29:
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r14}, %fd65;
	}
	xor.b32  	%r15, %r14, -2147483648;
	{
	.reg .b32 %temp; 
	mov.b64 	{%r16, %temp}, %fd65;
	}
	mov.b64 	%fd65, {%r16, %r15};

BB8_30:
	mov.f64 	%fd64, %fd65;
	setp.eq.f64	%p30, %fd52, 0d0000000000000000;
	@%p30 bra 	BB8_33;
	bra.uni 	BB8_31;

BB8_33:
	selp.b32	%r17, %r2, 0, %p28;
	or.b32  	%r18, %r17, 2146435072;
	setp.lt.s32	%p34, %r3, 0;
	selp.b32	%r19, %r18, %r17, %p34;
	mov.u32 	%r20, 0;
	mov.b64 	%fd64, {%r20, %r19};
	bra.uni 	BB8_34;

BB8_17:
	setp.gt.s32	%p7, %r6, 10;
	@%p7 bra 	BB8_21;

	setp.eq.s32	%p10, %r6, 9;
	@%p10 bra 	BB8_25;
	bra.uni 	BB8_19;

BB8_25:
	setp.eq.f64	%p23, %fd1, %fd52;
	selp.f64	%fd66, 0d3FF0000000000000, 0d0000000000000000, %p23;
	bra.uni 	BB8_45;

BB8_52:
	setp.eq.s32	%p61, %r6, 3;
	@%p61 bra 	BB8_86;

	setp.eq.s32	%p62, %r6, 4;
	@%p62 bra 	BB8_72;
	bra.uni 	BB8_54;

BB8_72:
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r4}, %fd1;
	}
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r5}, %fd52;
	}
	bfe.u32 	%r37, %r5, 20, 11;
	add.s32 	%r38, %r37, -1012;
	mov.b64 	 %rd11, %fd52;
	shl.b64 	%rd3, %rd11, %r38;
	setp.eq.s64	%p73, %rd3, -9223372036854775808;
	abs.f64 	%fd35, %fd1;
	// Callseq Start 2
	{
	.reg .b32 temp_param_reg;
	// }
	.param .b64 param0;
	st.param.f64	[param0+0], %fd35;
	.param .b64 param1;
	st.param.f64	[param1+0], %fd52;
	.param .b64 retval0;
	call.uni (retval0), 
	__internal_accurate_pow, 
	(
	param0, 
	param1
	);
	ld.param.f64	%fd73, [retval0+0];
	
	//{
	}// Callseq End 2
	setp.lt.s32	%p74, %r4, 0;
	and.pred  	%p2, %p74, %p73;
	@!%p2 bra 	BB8_74;
	bra.uni 	BB8_73;

BB8_73:
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r39}, %fd73;
	}
	xor.b32  	%r40, %r39, -2147483648;
	{
	.reg .b32 %temp; 
	mov.b64 	{%r41, %temp}, %fd73;
	}
	mov.b64 	%fd73, {%r41, %r40};

BB8_74:
	mov.f64 	%fd72, %fd73;
	setp.eq.f64	%p75, %fd1, 0d0000000000000000;
	@%p75 bra 	BB8_77;
	bra.uni 	BB8_75;

BB8_77:
	selp.b32	%r42, %r4, 0, %p73;
	or.b32  	%r43, %r42, 2146435072;
	setp.lt.s32	%p79, %r5, 0;
	selp.b32	%r44, %r43, %r42, %p79;
	mov.u32 	%r45, 0;
	mov.b64 	%fd72, {%r45, %r44};
	bra.uni 	BB8_78;

BB8_61:
	setp.gt.s32	%p52, %r6, 10;
	@%p52 bra 	BB8_65;

	setp.eq.s32	%p55, %r6, 9;
	@%p55 bra 	BB8_69;
	bra.uni 	BB8_63;

BB8_69:
	setp.eq.f64	%p68, %fd1, %fd52;
	selp.f64	%fd74, 0d3FF0000000000000, 0d0000000000000000, %p68;
	bra.uni 	BB8_89;

BB8_21:
	setp.eq.s32	%p8, %r6, 11;
	@%p8 bra 	BB8_24;
	bra.uni 	BB8_22;

BB8_24:
	min.f64 	%fd66, %fd52, %fd1;
	bra.uni 	BB8_45;

BB8_44:
	add.f64 	%fd66, %fd1, %fd52;
	bra.uni 	BB8_45;

BB8_6:
	setp.eq.s32	%p21, %r6, 2;
	@%p21 bra 	BB8_7;
	bra.uni 	BB8_45;

BB8_7:
	mul.f64 	%fd66, %fd1, %fd52;
	bra.uni 	BB8_45;

BB8_27:
	setp.ge.f64	%p26, %fd1, %fd52;
	selp.f64	%fd66, 0d3FF0000000000000, 0d0000000000000000, %p26;
	bra.uni 	BB8_45;

BB8_15:
	setp.eq.s32	%p14, %r6, 8;
	@%p14 bra 	BB8_16;
	bra.uni 	BB8_45;

BB8_16:
	setp.le.f64	%p24, %fd1, %fd52;
	selp.f64	%fd66, 0d3FF0000000000000, 0d0000000000000000, %p24;
	bra.uni 	BB8_45;

BB8_42:
	div.rn.f64 	%fd66, %fd52, %fd1;
	bra.uni 	BB8_45;

BB8_10:
	setp.eq.s32	%p18, %r6, 5;
	@%p18 bra 	BB8_11;
	bra.uni 	BB8_45;

BB8_11:
	setp.gt.f64	%p27, %fd1, %fd52;
	selp.f64	%fd66, 0d3FF0000000000000, 0d0000000000000000, %p27;
	bra.uni 	BB8_45;

BB8_65:
	setp.eq.s32	%p53, %r6, 11;
	@%p53 bra 	BB8_68;
	bra.uni 	BB8_66;

BB8_68:
	min.f64 	%fd74, %fd1, %fd52;
	bra.uni 	BB8_89;

BB8_19:
	setp.eq.s32	%p11, %r6, 10;
	@%p11 bra 	BB8_20;
	bra.uni 	BB8_45;

BB8_20:
	setp.neu.f64	%p22, %fd1, %fd52;
	selp.f64	%fd66, 0d3FF0000000000000, 0d0000000000000000, %p22;
	bra.uni 	BB8_45;

BB8_22:
	setp.ne.s32	%p9, %r6, 12;
	@%p9 bra 	BB8_45;

	max.f64 	%fd66, %fd52, %fd1;
	bra.uni 	BB8_45;

BB8_88:
	add.f64 	%fd74, %fd1, %fd52;
	bra.uni 	BB8_89;

BB8_50:
	setp.eq.s32	%p66, %r6, 2;
	@%p66 bra 	BB8_51;
	bra.uni 	BB8_89;

BB8_51:
	mul.f64 	%fd74, %fd1, %fd52;
	bra.uni 	BB8_89;

BB8_71:
	setp.le.f64	%p71, %fd1, %fd52;
	selp.f64	%fd74, 0d3FF0000000000000, 0d0000000000000000, %p71;
	bra.uni 	BB8_89;

BB8_59:
	setp.eq.s32	%p59, %r6, 8;
	@%p59 bra 	BB8_60;
	bra.uni 	BB8_89;

BB8_60:
	setp.ge.f64	%p69, %fd1, %fd52;
	selp.f64	%fd74, 0d3FF0000000000000, 0d0000000000000000, %p69;
	bra.uni 	BB8_89;

BB8_86:
	div.rn.f64 	%fd74, %fd1, %fd52;
	bra.uni 	BB8_89;

BB8_54:
	setp.eq.s32	%p63, %r6, 5;
	@%p63 bra 	BB8_55;
	bra.uni 	BB8_89;

BB8_55:
	setp.lt.f64	%p72, %fd1, %fd52;
	selp.f64	%fd74, 0d3FF0000000000000, 0d0000000000000000, %p72;
	bra.uni 	BB8_89;

BB8_63:
	setp.eq.s32	%p56, %r6, 10;
	@%p56 bra 	BB8_64;
	bra.uni 	BB8_89;

BB8_64:
	setp.neu.f64	%p67, %fd1, %fd52;
	selp.f64	%fd74, 0d3FF0000000000000, 0d0000000000000000, %p67;
	bra.uni 	BB8_89;

BB8_66:
	setp.ne.s32	%p54, %r6, 12;
	@%p54 bra 	BB8_89;

	max.f64 	%fd74, %fd1, %fd52;
	bra.uni 	BB8_89;

BB8_31:
	setp.gt.s32	%p31, %r2, -1;
	@%p31 bra 	BB8_34;

	cvt.rzi.f64.f64	%fd54, %fd1;
	setp.neu.f64	%p32, %fd54, %fd1;
	selp.f64	%fd64, 0dFFF8000000000000, %fd64, %p32;

BB8_34:
	mov.f64 	%fd16, %fd64;
	add.f64 	%fd17, %fd1, %fd52;
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r21}, %fd17;
	}
	and.b32  	%r22, %r21, 2146435072;
	setp.ne.s32	%p35, %r22, 2146435072;
	mov.f64 	%fd63, %fd16;
	@%p35 bra 	BB8_41;

	setp.gtu.f64	%p36, %fd10, 0d7FF0000000000000;
	mov.f64 	%fd63, %fd17;
	@%p36 bra 	BB8_41;

	abs.f64 	%fd55, %fd1;
	setp.gtu.f64	%p37, %fd55, 0d7FF0000000000000;
	mov.f64 	%fd62, %fd17;
	mov.f64 	%fd63, %fd62;
	@%p37 bra 	BB8_41;

	{
	.reg .b32 %temp; 
	mov.b64 	{%r23, %temp}, %fd1;
	}
	and.b32  	%r24, %r3, 2147483647;
	setp.eq.s32	%p38, %r24, 2146435072;
	setp.eq.s32	%p39, %r23, 0;
	and.pred  	%p40, %p38, %p39;
	@%p40 bra 	BB8_40;
	bra.uni 	BB8_38;

BB8_40:
	setp.gt.f64	%p44, %fd10, 0d3FF0000000000000;
	selp.b32	%r32, 2146435072, 0, %p44;
	xor.b32  	%r33, %r32, 2146435072;
	setp.lt.s32	%p45, %r3, 0;
	selp.b32	%r34, %r33, %r32, %p45;
	setp.eq.f64	%p46, %fd52, 0dBFF0000000000000;
	selp.b32	%r35, 1072693248, %r34, %p46;
	mov.u32 	%r36, 0;
	mov.b64 	%fd63, {%r36, %r35};
	bra.uni 	BB8_41;

BB8_75:
	setp.gt.s32	%p76, %r4, -1;
	@%p76 bra 	BB8_78;

	cvt.rzi.f64.f64	%fd57, %fd52;
	setp.neu.f64	%p77, %fd57, %fd52;
	selp.f64	%fd72, 0dFFF8000000000000, %fd72, %p77;

BB8_78:
	mov.f64 	%fd41, %fd72;
	add.f64 	%fd42, %fd1, %fd52;
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r46}, %fd42;
	}
	and.b32  	%r47, %r46, 2146435072;
	setp.ne.s32	%p80, %r47, 2146435072;
	mov.f64 	%fd71, %fd41;
	@%p80 bra 	BB8_85;

	setp.gtu.f64	%p81, %fd35, 0d7FF0000000000000;
	mov.f64 	%fd71, %fd42;
	@%p81 bra 	BB8_85;

	abs.f64 	%fd58, %fd52;
	setp.gtu.f64	%p82, %fd58, 0d7FF0000000000000;
	mov.f64 	%fd70, %fd42;
	mov.f64 	%fd71, %fd70;
	@%p82 bra 	BB8_85;

	{
	.reg .b32 %temp; 
	mov.b64 	{%r48, %temp}, %fd52;
	}
	and.b32  	%r49, %r5, 2147483647;
	setp.eq.s32	%p83, %r49, 2146435072;
	setp.eq.s32	%p84, %r48, 0;
	and.pred  	%p85, %p83, %p84;
	@%p85 bra 	BB8_84;
	bra.uni 	BB8_82;

BB8_84:
	setp.gt.f64	%p89, %fd35, 0d3FF0000000000000;
	selp.b32	%r57, 2146435072, 0, %p89;
	xor.b32  	%r58, %r57, 2146435072;
	setp.lt.s32	%p90, %r5, 0;
	selp.b32	%r59, %r58, %r57, %p90;
	setp.eq.f64	%p91, %fd1, 0dBFF0000000000000;
	selp.b32	%r60, 1072693248, %r59, %p91;
	mov.u32 	%r61, 0;
	mov.b64 	%fd71, {%r61, %r60};
	bra.uni 	BB8_85;

BB8_38:
	{
	.reg .b32 %temp; 
	mov.b64 	{%r25, %temp}, %fd52;
	}
	and.b32  	%r26, %r2, 2147483647;
	setp.eq.s32	%p41, %r26, 2146435072;
	setp.eq.s32	%p42, %r25, 0;
	and.pred  	%p43, %p41, %p42;
	mov.f64 	%fd63, %fd16;
	@!%p43 bra 	BB8_41;
	bra.uni 	BB8_39;

BB8_39:
	shr.s32 	%r27, %r3, 31;
	and.b32  	%r28, %r27, -2146435072;
	selp.b32	%r29, -1048576, 2146435072, %p1;
	add.s32 	%r30, %r29, %r28;
	mov.u32 	%r31, 0;
	mov.b64 	%fd63, {%r31, %r30};

BB8_41:
	setp.eq.f64	%p47, %fd1, 0d0000000000000000;
	setp.eq.f64	%p48, %fd52, 0d3FF0000000000000;
	or.pred  	%p49, %p48, %p47;
	selp.f64	%fd66, 0d3FF0000000000000, %fd63, %p49;

BB8_45:
	st.global.f64 	[%rd1], %fd66;
	bra.uni 	BB8_90;

BB8_82:
	{
	.reg .b32 %temp; 
	mov.b64 	{%r50, %temp}, %fd1;
	}
	and.b32  	%r51, %r4, 2147483647;
	setp.eq.s32	%p86, %r51, 2146435072;
	setp.eq.s32	%p87, %r50, 0;
	and.pred  	%p88, %p86, %p87;
	mov.f64 	%fd71, %fd41;
	@!%p88 bra 	BB8_85;
	bra.uni 	BB8_83;

BB8_83:
	shr.s32 	%r52, %r5, 31;
	and.b32  	%r53, %r52, -2146435072;
	selp.b32	%r54, -1048576, 2146435072, %p2;
	add.s32 	%r55, %r54, %r53;
	mov.u32 	%r56, 0;
	mov.b64 	%fd71, {%r56, %r55};

BB8_85:
	setp.eq.f64	%p92, %fd52, 0d0000000000000000;
	setp.eq.f64	%p93, %fd1, 0d3FF0000000000000;
	or.pred  	%p94, %p93, %p92;
	selp.f64	%fd74, 0d3FF0000000000000, %fd71, %p94;

BB8_89:
	st.global.f64 	[%rd1], %fd74;

BB8_90:
	bar.sync 	0;
	ret;
}

	// .globl	fill
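// fill(out, value, size): out[i] = value for every i < size, one thread per
// element.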
.visible .entry fill(
	.param .u64 fill_param_0,
	.param .f64 fill_param_1,
	.param .u32 fill_param_2
)
{
	.reg .pred 	%p<2>;
	.reg .b32 	%r<6>;
	.reg .f64 	%fd<2>;
	.reg .b64 	%rd<5>;


	ld.param.u64 	%rd1, [fill_param_0];
	ld.param.f64 	%fd1, [fill_param_1];
	ld.param.u32 	%r2, [fill_param_2];
	mov.u32 	%r3, %ctaid.x;
	mov.u32 	%r4, %ntid.x;
	mov.u32 	%r5, %tid.x;
	mad.lo.s32 	%r1, %r4, %r3, %r5;
	setp.ge.s32	%p1, %r1, %r2;
	@%p1 bra 	BB9_2;

	cvta.to.global.u64 	%rd2, %rd1;
	mul.wide.s32 	%rd3, %r1, 8;
	add.s64 	%rd4, %rd2, %rd3;
	st.global.f64 	[%rd4], %fd1;

BB9_2:
	ret;
}

	// .globl	reduce_sum
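// reduce_sum(in, out, n): block-level sum reduction. Each thread accumulates
// in[i] and in[i+blockDim] starting at i = 2*blockIdx*blockDim + tid and
// striding by 2*blockDim*gridDim; the block then folds its partials in the
// shared sdata buffer (barrier-synchronized halving steps down to 64
// threads, then an unrolled warp tail with volatile accesses), and thread 0
// writes one partial sum per block to out[blockIdx]. Reducing to a single
// value presumably requires relaunching over the per-block results.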
.visible .entry reduce_sum(
	.param .u64 reduce_sum_param_0,
	.param .u64 reduce_sum_param_1,
	.param .u32 reduce_sum_param_2
)
{
	.reg .pred 	%p<20>;
	.reg .b32 	%r<33>;
	.reg .f64 	%fd<79>;
	.reg .b64 	%rd<15>;


	ld.param.u64 	%rd2, [reduce_sum_param_0];
	ld.param.u64 	%rd3, [reduce_sum_param_1];
	ld.param.u32 	%r5, [reduce_sum_param_2];
	mov.u32 	%r6, %tid.x;
	mov.u32 	%r7, %ctaid.x;
	shl.b32 	%r8, %r7, 1;
	mov.u32 	%r9, %ntid.x;
	mad.lo.s32 	%r32, %r8, %r9, %r6;
	mov.f64 	%fd76, 0d0000000000000000;
	mov.f64 	%fd77, %fd76;
	setp.ge.u32	%p1, %r32, %r5;
	@%p1 bra 	BB10_4;

BB10_1:
	mov.f64 	%fd1, %fd77;
	cvta.to.global.u64 	%rd4, %rd2;
	mul.wide.u32 	%rd5, %r32, 8;
	add.s64 	%rd6, %rd4, %rd5;
	ld.global.f64 	%fd30, [%rd6];
	add.f64 	%fd78, %fd1, %fd30;
	add.s32 	%r3, %r32, %r9;
	setp.ge.u32	%p2, %r3, %r5;
	@%p2 bra 	BB10_3;

	mul.wide.u32 	%rd8, %r3, 8;
	add.s64 	%rd9, %rd4, %rd8;
	ld.global.f64 	%fd31, [%rd9];
	add.f64 	%fd78, %fd78, %fd31;

BB10_3:
	mov.f64 	%fd77, %fd78;
	shl.b32 	%r12, %r9, 1;
	mov.u32 	%r13, %nctaid.x;
	mad.lo.s32 	%r32, %r12, %r13, %r32;
	setp.lt.u32	%p3, %r32, %r5;
	mov.f64 	%fd76, %fd77;
	@%p3 bra 	BB10_1;

BB10_4:
	mov.f64 	%fd74, %fd76;
	mul.wide.u32 	%rd10, %r6, 8;
	mov.u64 	%rd11, sdata;
	add.s64 	%rd1, %rd11, %rd10;
	st.shared.f64 	[%rd1], %fd74;
	bar.sync 	0;
	setp.lt.u32	%p4, %r9, 1024;
	@%p4 bra 	BB10_8;

	setp.gt.u32	%p5, %r6, 511;
	mov.f64 	%fd75, %fd74;
	@%p5 bra 	BB10_7;

	ld.shared.f64 	%fd32, [%rd1+4096];
	add.f64 	%fd75, %fd74, %fd32;
	st.shared.f64 	[%rd1], %fd75;

BB10_7:
	mov.f64 	%fd74, %fd75;
	bar.sync 	0;

BB10_8:
	mov.f64 	%fd72, %fd74;
	setp.lt.u32	%p6, %r9, 512;
	@%p6 bra 	BB10_12;

	setp.gt.u32	%p7, %r6, 255;
	mov.f64 	%fd73, %fd72;
	@%p7 bra 	BB10_11;

	ld.shared.f64 	%fd33, [%rd1+2048];
	add.f64 	%fd73, %fd72, %fd33;
	st.shared.f64 	[%rd1], %fd73;

BB10_11:
	mov.f64 	%fd72, %fd73;
	bar.sync 	0;

BB10_12:
	mov.f64 	%fd70, %fd72;
	setp.lt.u32	%p8, %r9, 256;
	@%p8 bra 	BB10_16;

	setp.gt.u32	%p9, %r6, 127;
	mov.f64 	%fd71, %fd70;
	@%p9 bra 	BB10_15;

	ld.shared.f64 	%fd34, [%rd1+1024];
	add.f64 	%fd71, %fd70, %fd34;
	st.shared.f64 	[%rd1], %fd71;

BB10_15:
	mov.f64 	%fd70, %fd71;
	bar.sync 	0;

BB10_16:
	mov.f64 	%fd68, %fd70;
	setp.lt.u32	%p10, %r9, 128;
	@%p10 bra 	BB10_20;

	setp.gt.u32	%p11, %r6, 63;
	mov.f64 	%fd69, %fd68;
	@%p11 bra 	BB10_19;

	ld.shared.f64 	%fd35, [%rd1+512];
	add.f64 	%fd69, %fd68, %fd35;
	st.shared.f64 	[%rd1], %fd69;

BB10_19:
	mov.f64 	%fd68, %fd69;
	bar.sync 	0;

BB10_20:
	mov.f64 	%fd67, %fd68;
	setp.gt.u32	%p12, %r6, 31;
	@%p12 bra 	BB10_33;

	setp.lt.u32	%p13, %r9, 64;
	@%p13 bra 	BB10_23;

	ld.volatile.shared.f64 	%fd36, [%rd1+256];
	add.f64 	%fd67, %fd67, %fd36;
	st.volatile.shared.f64 	[%rd1], %fd67;

BB10_23:
	mov.f64 	%fd66, %fd67;
	setp.lt.u32	%p14, %r9, 32;
	@%p14 bra 	BB10_25;

	ld.volatile.shared.f64 	%fd37, [%rd1+128];
	add.f64 	%fd66, %fd66, %fd37;
	st.volatile.shared.f64 	[%rd1], %fd66;

BB10_25:
	mov.f64 	%fd65, %fd66;
	setp.lt.u32	%p15, %r9, 16;
	@%p15 bra 	BB10_27;

	ld.volatile.shared.f64 	%fd38, [%rd1+64];
	add.f64 	%fd65, %fd65, %fd38;
	st.volatile.shared.f64 	[%rd1], %fd65;

BB10_27:
	mov.f64 	%fd64, %fd65;
	setp.lt.u32	%p16, %r9, 8;
	@%p16 bra 	BB10_29;

	ld.volatile.shared.f64 	%fd39, [%rd1+32];
	add.f64 	%fd64, %fd64, %fd39;
	st.volatile.shared.f64 	[%rd1], %fd64;

BB10_29:
	mov.f64 	%fd63, %fd64;
	setp.lt.u32	%p17, %r9, 4;
	@%p17 bra 	BB10_31;

	ld.volatile.shared.f64 	%fd40, [%rd1+16];
	add.f64 	%fd63, %fd63, %fd40;
	st.volatile.shared.f64 	[%rd1], %fd63;

BB10_31:
	setp.lt.u32	%p18, %r9, 2;
	@%p18 bra 	BB10_33;

	ld.volatile.shared.f64 	%fd41, [%rd1+8];
	add.f64 	%fd42, %fd63, %fd41;
	st.volatile.shared.f64 	[%rd1], %fd42;

BB10_33:
	setp.ne.s32	%p19, %r6, 0;
	@%p19 bra 	BB10_35;

	ld.shared.f64 	%fd43, [sdata];
	cvta.to.global.u64 	%rd12, %rd3;
	mul.wide.u32 	%rd13, %r7, 8;
	add.s64 	%rd14, %rd12, %rd13;
	st.global.f64 	[%rd14], %fd43;

BB10_35:
	ret;
}

	// .globl	reduce_row_sum
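// reduce_row_sum(in, out, rows, cols): one thread block per row (blockIdx =
// row index). Threads stride across that row's columns accumulating partial
// sums, reduce them in shared memory as in reduce_sum, and thread 0 writes
// out[row].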
.visible .entry reduce_row_sum(
	.param .u64 reduce_row_sum_param_0,
	.param .u64 reduce_row_sum_param_1,
	.param .u32 reduce_row_sum_param_2,
	.param .u32 reduce_row_sum_param_3
)
{
	.reg .pred 	%p<20>;
	.reg .b32 	%r<39>;
	.reg .f64 	%fd<74>;
	.reg .b64 	%rd<42>;


	ld.param.u64 	%rd1, [reduce_row_sum_param_0];
	ld.param.u64 	%rd2, [reduce_row_sum_param_1];
	ld.param.u32 	%r5, [reduce_row_sum_param_2];
	ld.param.u32 	%r4, [reduce_row_sum_param_3];
	mov.u32 	%r6, %ctaid.x;
	setp.ge.u32	%p1, %r6, %r5;
	@%p1 bra 	BB11_35;

	mov.u32 	%r38, %tid.x;
	mov.f64 	%fd72, 0d0000000000000000;
	mov.f64 	%fd73, %fd72;
	setp.ge.u32	%p2, %r38, %r4;
	@%p2 bra 	BB11_4;

	cvta.to.global.u64 	%rd3, %rd1;

BB11_3:
	mad.lo.s32 	%r8, %r6, %r4, %r38;
	mul.wide.u32 	%rd4, %r8, 8;
	add.s64 	%rd5, %rd3, %rd4;
	ld.global.f64 	%fd28, [%rd5];
	add.f64 	%fd73, %fd73, %fd28;
	mov.u32 	%r9, %ntid.x;
	add.s32 	%r38, %r9, %r38;
	setp.lt.u32	%p3, %r38, %r4;
	mov.f64 	%fd72, %fd73;
	@%p3 bra 	BB11_3;

BB11_4:
	mov.f64 	%fd70, %fd72;
	mov.u32 	%r10, %tid.x;
	mul.wide.u32 	%rd6, %r10, 8;
	mov.u64 	%rd7, sdata;
	add.s64 	%rd8, %rd7, %rd6;
	st.shared.f64 	[%rd8], %fd70;
	bar.sync 	0;
	mov.u32 	%r11, %ntid.x;
	setp.lt.u32	%p4, %r11, 1024;
	@%p4 bra 	BB11_8;

	setp.gt.u32	%p5, %r10, 511;
	mov.f64 	%fd71, %fd70;
	@%p5 bra 	BB11_7;

	ld.shared.f64 	%fd29, [%rd8+4096];
	add.f64 	%fd71, %fd70, %fd29;
	st.shared.f64 	[%rd8], %fd71;

BB11_7:
	mov.f64 	%fd70, %fd71;
	bar.sync 	0;

BB11_8:
	mov.f64 	%fd68, %fd70;
	setp.lt.u32	%p6, %r11, 512;
	@%p6 bra 	BB11_12;

	setp.gt.u32	%p7, %r10, 255;
	mov.f64 	%fd69, %fd68;
	@%p7 bra 	BB11_11;

	ld.shared.f64 	%fd30, [%rd8+2048];
	add.f64 	%fd69, %fd68, %fd30;
	st.shared.f64 	[%rd8], %fd69;

BB11_11:
	mov.f64 	%fd68, %fd69;
	bar.sync 	0;

BB11_12:
	mov.f64 	%fd66, %fd68;
	setp.lt.u32	%p8, %r11, 256;
	@%p8 bra 	BB11_16;

	setp.gt.u32	%p9, %r10, 127;
	mov.f64 	%fd67, %fd66;
	@%p9 bra 	BB11_15;

	ld.shared.f64 	%fd31, [%rd8+1024];
	add.f64 	%fd67, %fd66, %fd31;
	st.shared.f64 	[%rd8], %fd67;

BB11_15:
	mov.f64 	%fd66, %fd67;
	bar.sync 	0;

BB11_16:
	mov.f64 	%fd64, %fd66;
	setp.lt.u32	%p10, %r11, 128;
	@%p10 bra 	BB11_20;

	setp.gt.u32	%p11, %r10, 63;
	mov.f64 	%fd65, %fd64;
	@%p11 bra 	BB11_19;

	ld.shared.f64 	%fd32, [%rd8+512];
	add.f64 	%fd65, %fd64, %fd32;
	st.shared.f64 	[%rd8], %fd65;

BB11_19:
	mov.f64 	%fd64, %fd65;
	bar.sync 	0;

BB11_20:
	mov.f64 	%fd63, %fd64;
	setp.gt.u32	%p12, %r10, 31;
	@%p12 bra 	BB11_33;

	setp.lt.u32	%p13, %r11, 64;
	@%p13 bra 	BB11_23;

	ld.volatile.shared.f64 	%fd33, [%rd8+256];
	add.f64 	%fd63, %fd63, %fd33;
	st.volatile.shared.f64 	[%rd8], %fd63;

BB11_23:
	mov.f64 	%fd62, %fd63;
	setp.lt.u32	%p14, %r11, 32;
	@%p14 bra 	BB11_25;

	ld.volatile.shared.f64 	%fd34, [%rd8+128];
	add.f64 	%fd62, %fd62, %fd34;
	st.volatile.shared.f64 	[%rd8], %fd62;

BB11_25:
	mov.f64 	%fd61, %fd62;
	setp.lt.u32	%p15, %r11, 16;
	@%p15 bra 	BB11_27;

	ld.volatile.shared.f64 	%fd35, [%rd8+64];
	add.f64 	%fd61, %fd61, %fd35;
	st.volatile.shared.f64 	[%rd8], %fd61;

BB11_27:
	mov.f64 	%fd60, %fd61;
	setp.lt.u32	%p16, %r11, 8;
	@%p16 bra 	BB11_29;

	ld.volatile.shared.f64 	%fd36, [%rd8+32];
	add.f64 	%fd60, %fd60, %fd36;
	st.volatile.shared.f64 	[%rd8], %fd60;

BB11_29:
	mov.f64 	%fd59, %fd60;
	setp.lt.u32	%p17, %r11, 4;
	@%p17 bra 	BB11_31;

	ld.volatile.shared.f64 	%fd37, [%rd8+16];
	add.f64 	%fd59, %fd59, %fd37;
	st.volatile.shared.f64 	[%rd8], %fd59;

BB11_31:
	setp.lt.u32	%p18, %r11, 2;
	@%p18 bra 	BB11_33;

	ld.volatile.shared.f64 	%fd38, [%rd8+8];
	add.f64 	%fd39, %fd59, %fd38;
	st.volatile.shared.f64 	[%rd8], %fd39;

BB11_33:
	setp.ne.s32	%p19, %r10, 0;
	@%p19 bra 	BB11_35;

	ld.shared.f64 	%fd40, [sdata];
	cvta.to.global.u64 	%rd39, %rd2;
	mul.wide.u32 	%rd40, %r6, 8;
	add.s64 	%rd41, %rd39, %rd40;
	st.global.f64 	[%rd41], %fd40;

BB11_35:
	ret;
}

	// .globl	reduce_col_sum
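// reduce_col_sum(in, out, rows, cols): one thread per column. Each thread
// serially sums in[col], in[col+cols], ... across all rows and writes
// out[col]; no shared memory is involved.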
.visible .entry reduce_col_sum(
	.param .u64 reduce_col_sum_param_0,
	.param .u64 reduce_col_sum_param_1,
	.param .u32 reduce_col_sum_param_2,
	.param .u32 reduce_col_sum_param_3
)
{
	.reg .pred 	%p<4>;
	.reg .b32 	%r<11>;
	.reg .f64 	%fd<10>;
	.reg .b64 	%rd<9>;


	ld.param.u64 	%rd2, [reduce_col_sum_param_0];
	ld.param.u64 	%rd3, [reduce_col_sum_param_1];
	ld.param.u32 	%r5, [reduce_col_sum_param_2];
	ld.param.u32 	%r6, [reduce_col_sum_param_3];
	mov.u32 	%r7, %ntid.x;
	mov.u32 	%r8, %ctaid.x;
	mov.u32 	%r9, %tid.x;
	mad.lo.s32 	%r1, %r7, %r8, %r9;
	setp.ge.u32	%p1, %r1, %r6;
	@%p1 bra 	BB12_5;

	cvta.to.global.u64 	%rd1, %rd2;
	mul.lo.s32 	%r2, %r6, %r5;
	mov.f64 	%fd8, 0d0000000000000000;
	mov.f64 	%fd9, %fd8;
	setp.ge.u32	%p2, %r1, %r2;
	@%p2 bra 	BB12_4;

	mov.u32 	%r10, %r1;

BB12_3:
	mov.u32 	%r3, %r10;
	mul.wide.u32 	%rd4, %r3, 8;
	add.s64 	%rd5, %rd1, %rd4;
	ld.global.f64 	%fd6, [%rd5];
	add.f64 	%fd9, %fd9, %fd6;
	add.s32 	%r4, %r3, %r6;
	setp.lt.u32	%p3, %r4, %r2;
	mov.u32 	%r10, %r4;
	mov.f64 	%fd8, %fd9;
	@%p3 bra 	BB12_3;

BB12_4:
	cvta.to.global.u64 	%rd6, %rd3;
	mul.wide.u32 	%rd7, %r1, 8;
	add.s64 	%rd8, %rd6, %rd7;
	st.global.f64 	[%rd8], %fd8;

BB12_5:
	ret;
}

	// .globl	reduce_max
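// reduce_max(in, out, n): same structure as reduce_sum, with max as the
// combiner and -DBL_MAX (0dFFEFFFFFFFFFFFFF) as the identity.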
.visible .entry reduce_max(
	.param .u64 reduce_max_param_0,
	.param .u64 reduce_max_param_1,
	.param .u32 reduce_max_param_2
)
{
	.reg .pred 	%p<20>;
	.reg .b32 	%r<33>;
	.reg .f64 	%fd<79>;
	.reg .b64 	%rd<15>;


	ld.param.u64 	%rd2, [reduce_max_param_0];
	ld.param.u64 	%rd3, [reduce_max_param_1];
	ld.param.u32 	%r5, [reduce_max_param_2];
	mov.u32 	%r6, %tid.x;
	mov.u32 	%r7, %ctaid.x;
	shl.b32 	%r8, %r7, 1;
	mov.u32 	%r9, %ntid.x;
	mad.lo.s32 	%r32, %r8, %r9, %r6;
	mov.f64 	%fd76, 0dFFEFFFFFFFFFFFFF;
	mov.f64 	%fd77, %fd76;
	setp.ge.u32	%p1, %r32, %r5;
	@%p1 bra 	BB13_4;

BB13_1:
	mov.f64 	%fd1, %fd77;
	cvta.to.global.u64 	%rd4, %rd2;
	mul.wide.u32 	%rd5, %r32, 8;
	add.s64 	%rd6, %rd4, %rd5;
	ld.global.f64 	%fd30, [%rd6];
	max.f64 	%fd78, %fd1, %fd30;
	add.s32 	%r3, %r32, %r9;
	setp.ge.u32	%p2, %r3, %r5;
	@%p2 bra 	BB13_3;

	mul.wide.u32 	%rd8, %r3, 8;
	add.s64 	%rd9, %rd4, %rd8;
	ld.global.f64 	%fd31, [%rd9];
	max.f64 	%fd78, %fd78, %fd31;

BB13_3:
	mov.f64 	%fd77, %fd78;
	shl.b32 	%r12, %r9, 1;
	mov.u32 	%r13, %nctaid.x;
	mad.lo.s32 	%r32, %r12, %r13, %r32;
	setp.lt.u32	%p3, %r32, %r5;
	mov.f64 	%fd76, %fd77;
	@%p3 bra 	BB13_1;

BB13_4:
	mov.f64 	%fd74, %fd76;
	mul.wide.u32 	%rd10, %r6, 8;
	mov.u64 	%rd11, sdata;
	add.s64 	%rd1, %rd11, %rd10;
	st.shared.f64 	[%rd1], %fd74;
	bar.sync 	0;
	setp.lt.u32	%p4, %r9, 1024;
	@%p4 bra 	BB13_8;

	setp.gt.u32	%p5, %r6, 511;
	mov.f64 	%fd75, %fd74;
	@%p5 bra 	BB13_7;

	ld.shared.f64 	%fd32, [%rd1+4096];
	max.f64 	%fd75, %fd74, %fd32;
	st.shared.f64 	[%rd1], %fd75;

BB13_7:
	mov.f64 	%fd74, %fd75;
	bar.sync 	0;

BB13_8:
	mov.f64 	%fd72, %fd74;
	setp.lt.u32	%p6, %r9, 512;
	@%p6 bra 	BB13_12;

	setp.gt.u32	%p7, %r6, 255;
	mov.f64 	%fd73, %fd72;
	@%p7 bra 	BB13_11;

	ld.shared.f64 	%fd33, [%rd1+2048];
	max.f64 	%fd73, %fd72, %fd33;
	st.shared.f64 	[%rd1], %fd73;

BB13_11:
	mov.f64 	%fd72, %fd73;
	bar.sync 	0;

BB13_12:
	mov.f64 	%fd70, %fd72;
	setp.lt.u32	%p8, %r9, 256;
	@%p8 bra 	BB13_16;

	setp.gt.u32	%p9, %r6, 127;
	mov.f64 	%fd71, %fd70;
	@%p9 bra 	BB13_15;

	ld.shared.f64 	%fd34, [%rd1+1024];
	max.f64 	%fd71, %fd70, %fd34;
	st.shared.f64 	[%rd1], %fd71;

BB13_15:
	mov.f64 	%fd70, %fd71;
	bar.sync 	0;

BB13_16:
	mov.f64 	%fd68, %fd70;
	setp.lt.u32	%p10, %r9, 128;
	@%p10 bra 	BB13_20;

	setp.gt.u32	%p11, %r6, 63;
	mov.f64 	%fd69, %fd68;
	@%p11 bra 	BB13_19;

	ld.shared.f64 	%fd35, [%rd1+512];
	max.f64 	%fd69, %fd68, %fd35;
	st.shared.f64 	[%rd1], %fd69;

BB13_19:
	mov.f64 	%fd68, %fd69;
	bar.sync 	0;

BB13_20:
	mov.f64 	%fd67, %fd68;
	setp.gt.u32	%p12, %r6, 31;
	@%p12 bra 	BB13_33;

	setp.lt.u32	%p13, %r9, 64;
	@%p13 bra 	BB13_23;

	ld.volatile.shared.f64 	%fd36, [%rd1+256];
	max.f64 	%fd67, %fd67, %fd36;
	st.volatile.shared.f64 	[%rd1], %fd67;

BB13_23:
	mov.f64 	%fd66, %fd67;
	setp.lt.u32	%p14, %r9, 32;
	@%p14 bra 	BB13_25;

	ld.volatile.shared.f64 	%fd37, [%rd1+128];
	max.f64 	%fd66, %fd66, %fd37;
	st.volatile.shared.f64 	[%rd1], %fd66;

BB13_25:
	mov.f64 	%fd65, %fd66;
	setp.lt.u32	%p15, %r9, 16;
	@%p15 bra 	BB13_27;

	ld.volatile.shared.f64 	%fd38, [%rd1+64];
	max.f64 	%fd65, %fd65, %fd38;
	st.volatile.shared.f64 	[%rd1], %fd65;

BB13_27:
	mov.f64 	%fd64, %fd65;
	setp.lt.u32	%p16, %r9, 8;
	@%p16 bra 	BB13_29;

	ld.volatile.shared.f64 	%fd39, [%rd1+32];
	max.f64 	%fd64, %fd64, %fd39;
	st.volatile.shared.f64 	[%rd1], %fd64;

BB13_29:
	mov.f64 	%fd63, %fd64;
	setp.lt.u32	%p17, %r9, 4;
	@%p17 bra 	BB13_31;

	ld.volatile.shared.f64 	%fd40, [%rd1+16];
	max.f64 	%fd63, %fd63, %fd40;
	st.volatile.shared.f64 	[%rd1], %fd63;

BB13_31:
	setp.lt.u32	%p18, %r9, 2;
	@%p18 bra 	BB13_33;

	ld.volatile.shared.f64 	%fd41, [%rd1+8];
	max.f64 	%fd42, %fd63, %fd41;
	st.volatile.shared.f64 	[%rd1], %fd42;

BB13_33:
	setp.ne.s32	%p19, %r6, 0;
	@%p19 bra 	BB13_35;

	ld.shared.f64 	%fd43, [sdata];
	cvta.to.global.u64 	%rd12, %rd3;
	mul.wide.u32 	%rd13, %r7, 8;
	add.s64 	%rd14, %rd12, %rd13;
	st.global.f64 	[%rd14], %fd43;

BB13_35:
	ret;
}

	// .globl	reduce_row_max
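// reduce_row_max(in, out, rows, cols): same structure as reduce_row_sum,
// with max as the combiner and -DBL_MAX as the identity.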
.visible .entry reduce_row_max(
	.param .u64 reduce_row_max_param_0,
	.param .u64 reduce_row_max_param_1,
	.param .u32 reduce_row_max_param_2,
	.param .u32 reduce_row_max_param_3
)
{
	.reg .pred 	%p<20>;
	.reg .b32 	%r<39>;
	.reg .f64 	%fd<74>;
	.reg .b64 	%rd<42>;


	ld.param.u64 	%rd1, [reduce_row_max_param_0];
	ld.param.u64 	%rd2, [reduce_row_max_param_1];
	ld.param.u32 	%r5, [reduce_row_max_param_2];
	ld.param.u32 	%r4, [reduce_row_max_param_3];
	mov.u32 	%r6, %ctaid.x;
	setp.ge.u32	%p1, %r6, %r5;
	@%p1 bra 	BB14_35;

	mov.u32 	%r38, %tid.x;
	mov.f64 	%fd72, 0dFFEFFFFFFFFFFFFF;
	mov.f64 	%fd73, %fd72;
	setp.ge.u32	%p2, %r38, %r4;
	@%p2 bra 	BB14_4;

	cvta.to.global.u64 	%rd3, %rd1;

BB14_3:
	mad.lo.s32 	%r8, %r6, %r4, %r38;
	mul.wide.u32 	%rd4, %r8, 8;
	add.s64 	%rd5, %rd3, %rd4;
	ld.global.f64 	%fd28, [%rd5];
	max.f64 	%fd73, %fd73, %fd28;
	mov.u32 	%r9, %ntid.x;
	add.s32 	%r38, %r9, %r38;
	setp.lt.u32	%p3, %r38, %r4;
	mov.f64 	%fd72, %fd73;
	@%p3 bra 	BB14_3;

BB14_4:
	mov.f64 	%fd70, %fd72;
	mov.u32 	%r10, %tid.x;
	mul.wide.u32 	%rd6, %r10, 8;
	mov.u64 	%rd7, sdata;
	add.s64 	%rd8, %rd7, %rd6;
	st.shared.f64 	[%rd8], %fd70;
	bar.sync 	0;
	mov.u32 	%r11, %ntid.x;
	setp.lt.u32	%p4, %r11, 1024;
	@%p4 bra 	BB14_8;

	setp.gt.u32	%p5, %r10, 511;
	mov.f64 	%fd71, %fd70;
	@%p5 bra 	BB14_7;

	ld.shared.f64 	%fd29, [%rd8+4096];
	max.f64 	%fd71, %fd70, %fd29;
	st.shared.f64 	[%rd8], %fd71;

BB14_7:
	mov.f64 	%fd70, %fd71;
	bar.sync 	0;

BB14_8:
	mov.f64 	%fd68, %fd70;
	setp.lt.u32	%p6, %r11, 512;
	@%p6 bra 	BB14_12;

	setp.gt.u32	%p7, %r10, 255;
	mov.f64 	%fd69, %fd68;
	@%p7 bra 	BB14_11;

	ld.shared.f64 	%fd30, [%rd8+2048];
	max.f64 	%fd69, %fd68, %fd30;
	st.shared.f64 	[%rd8], %fd69;

BB14_11:
	mov.f64 	%fd68, %fd69;
	bar.sync 	0;

BB14_12:
	mov.f64 	%fd66, %fd68;
	setp.lt.u32	%p8, %r11, 256;
	@%p8 bra 	BB14_16;

	setp.gt.u32	%p9, %r10, 127;
	mov.f64 	%fd67, %fd66;
	@%p9 bra 	BB14_15;

	ld.shared.f64 	%fd31, [%rd8+1024];
	max.f64 	%fd67, %fd66, %fd31;
	st.shared.f64 	[%rd8], %fd67;

BB14_15:
	mov.f64 	%fd66, %fd67;
	bar.sync 	0;

BB14_16:
	mov.f64 	%fd64, %fd66;
	setp.lt.u32	%p10, %r11, 128;
	@%p10 bra 	BB14_20;

	setp.gt.u32	%p11, %r10, 63;
	mov.f64 	%fd65, %fd64;
	@%p11 bra 	BB14_19;

	ld.shared.f64 	%fd32, [%rd8+512];
	max.f64 	%fd65, %fd64, %fd32;
	st.shared.f64 	[%rd8], %fd65;

BB14_19:
	mov.f64 	%fd64, %fd65;
	bar.sync 	0;

BB14_20:
	mov.f64 	%fd63, %fd64;
	setp.gt.u32	%p12, %r10, 31;
	@%p12 bra 	BB14_33;

	setp.lt.u32	%p13, %r11, 64;
	@%p13 bra 	BB14_23;

	ld.volatile.shared.f64 	%fd33, [%rd8+256];
	max.f64 	%fd63, %fd63, %fd33;
	st.volatile.shared.f64 	[%rd8], %fd63;

BB14_23:
	mov.f64 	%fd62, %fd63;
	setp.lt.u32	%p14, %r11, 32;
	@%p14 bra 	BB14_25;

	ld.volatile.shared.f64 	%fd34, [%rd8+128];
	max.f64 	%fd62, %fd62, %fd34;
	st.volatile.shared.f64 	[%rd8], %fd62;

BB14_25:
	mov.f64 	%fd61, %fd62;
	setp.lt.u32	%p15, %r11, 16;
	@%p15 bra 	BB14_27;

	ld.volatile.shared.f64 	%fd35, [%rd8+64];
	max.f64 	%fd61, %fd61, %fd35;
	st.volatile.shared.f64 	[%rd8], %fd61;

BB14_27:
	mov.f64 	%fd60, %fd61;
	setp.lt.u32	%p16, %r11, 8;
	@%p16 bra 	BB14_29;

	ld.volatile.shared.f64 	%fd36, [%rd8+32];
	max.f64 	%fd60, %fd60, %fd36;
	st.volatile.shared.f64 	[%rd8], %fd60;

BB14_29:
	mov.f64 	%fd59, %fd60;
	setp.lt.u32	%p17, %r11, 4;
	@%p17 bra 	BB14_31;

	ld.volatile.shared.f64 	%fd37, [%rd8+16];
	max.f64 	%fd59, %fd59, %fd37;
	st.volatile.shared.f64 	[%rd8], %fd59;

BB14_31:
	setp.lt.u32	%p18, %r11, 2;
	@%p18 bra 	BB14_33;

	ld.volatile.shared.f64 	%fd38, [%rd8+8];
	max.f64 	%fd39, %fd59, %fd38;
	st.volatile.shared.f64 	[%rd8], %fd39;

BB14_33:
	setp.ne.s32	%p19, %r10, 0;
	@%p19 bra 	BB14_35;

	ld.shared.f64 	%fd40, [sdata];
	cvta.to.global.u64 	%rd39, %rd2;
	mul.wide.u32 	%rd40, %r6, 8;
	add.s64 	%rd41, %rd39, %rd40;
	st.global.f64 	[%rd41], %fd40;

BB14_35:
	ret;
}

	// .globl	reduce_col_max
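// reduce_col_max(in, out, rows, cols): same structure as reduce_col_sum,
// with max as the combiner and -DBL_MAX as the identity.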
.visible .entry reduce_col_max(
	.param .u64 reduce_col_max_param_0,
	.param .u64 reduce_col_max_param_1,
	.param .u32 reduce_col_max_param_2,
	.param .u32 reduce_col_max_param_3
)
{
	.reg .pred 	%p<4>;
	.reg .b32 	%r<11>;
	.reg .f64 	%fd<10>;
	.reg .b64 	%rd<9>;


	ld.param.u64 	%rd2, [reduce_col_max_param_0];
	ld.param.u64 	%rd3, [reduce_col_max_param_1];
	ld.param.u32 	%r5, [reduce_col_max_param_2];
	ld.param.u32 	%r6, [reduce_col_max_param_3];
	mov.u32 	%r7, %ntid.x;
	mov.u32 	%r8, %ctaid.x;
	mov.u32 	%r9, %tid.x;
	mad.lo.s32 	%r1, %r7, %r8, %r9;
	setp.ge.u32	%p1, %r1, %r6;
	@%p1 bra 	BB15_5;

	cvta.to.global.u64 	%rd1, %rd2;
	mul.lo.s32 	%r2, %r6, %r5;
	mov.f64 	%fd8, 0dFFEFFFFFFFFFFFFF;
	mov.f64 	%fd9, %fd8;
	setp.ge.u32	%p2, %r1, %r2;
	@%p2 bra 	BB15_4;

	mov.u32 	%r10, %r1;

BB15_3:
	mov.u32 	%r3, %r10;
	mul.wide.u32 	%rd4, %r3, 8;
	add.s64 	%rd5, %rd1, %rd4;
	ld.global.f64 	%fd6, [%rd5];
	max.f64 	%fd9, %fd9, %fd6;
	add.s32 	%r4, %r3, %r6;
	setp.lt.u32	%p3, %r4, %r2;
	mov.u32 	%r10, %r4;
	mov.f64 	%fd8, %fd9;
	@%p3 bra 	BB15_3;

BB15_4:
	cvta.to.global.u64 	%rd6, %rd3;
	mul.wide.u32 	%rd7, %r1, 8;
	add.s64 	%rd8, %rd6, %rd7;
	st.global.f64 	[%rd8], %fd8;

BB15_5:
	ret;
}

	// .globl	reduce_min
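// reduce_min(in, out, n): same structure as reduce_sum, with min as the
// combiner and +DBL_MAX (0d7FEFFFFFFFFFFFFF) as the identity.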
.visible .entry reduce_min(
	.param .u64 reduce_min_param_0,
	.param .u64 reduce_min_param_1,
	.param .u32 reduce_min_param_2
)
{
	.reg .pred 	%p<20>;
	.reg .b32 	%r<33>;
	.reg .f64 	%fd<79>;
	.reg .b64 	%rd<15>;


	ld.param.u64 	%rd2, [reduce_min_param_0];
	ld.param.u64 	%rd3, [reduce_min_param_1];
	ld.param.u32 	%r5, [reduce_min_param_2];
	mov.u32 	%r6, %tid.x;
	mov.u32 	%r7, %ctaid.x;
	shl.b32 	%r8, %r7, 1;
	mov.u32 	%r9, %ntid.x;
	mad.lo.s32 	%r32, %r8, %r9, %r6;
	mov.f64 	%fd76, 0d7FEFFFFFFFFFFFFF;
	mov.f64 	%fd77, %fd76;
	setp.ge.u32	%p1, %r32, %r5;
	@%p1 bra 	BB16_4;

BB16_1:
	mov.f64 	%fd1, %fd77;
	cvta.to.global.u64 	%rd4, %rd2;
	mul.wide.u32 	%rd5, %r32, 8;
	add.s64 	%rd6, %rd4, %rd5;
	ld.global.f64 	%fd30, [%rd6];
	min.f64 	%fd78, %fd1, %fd30;
	add.s32 	%r3, %r32, %r9;
	setp.ge.u32	%p2, %r3, %r5;
	@%p2 bra 	BB16_3;

	mul.wide.u32 	%rd8, %r3, 8;
	add.s64 	%rd9, %rd4, %rd8;
	ld.global.f64 	%fd31, [%rd9];
	min.f64 	%fd78, %fd78, %fd31;

BB16_3:
	mov.f64 	%fd77, %fd78;
	shl.b32 	%r12, %r9, 1;
	mov.u32 	%r13, %nctaid.x;
	mad.lo.s32 	%r32, %r12, %r13, %r32;
	setp.lt.u32	%p3, %r32, %r5;
	mov.f64 	%fd76, %fd77;
	@%p3 bra 	BB16_1;

BB16_4:
	mov.f64 	%fd74, %fd76;
	mul.wide.u32 	%rd10, %r6, 8;
	mov.u64 	%rd11, sdata;
	add.s64 	%rd1, %rd11, %rd10;
	st.shared.f64 	[%rd1], %fd74;
	bar.sync 	0;
	setp.lt.u32	%p4, %r9, 1024;
	@%p4 bra 	BB16_8;

	setp.gt.u32	%p5, %r6, 511;
	mov.f64 	%fd75, %fd74;
	@%p5 bra 	BB16_7;

	ld.shared.f64 	%fd32, [%rd1+4096];
	min.f64 	%fd75, %fd74, %fd32;
	st.shared.f64 	[%rd1], %fd75;

BB16_7:
	mov.f64 	%fd74, %fd75;
	bar.sync 	0;

BB16_8:
	mov.f64 	%fd72, %fd74;
	setp.lt.u32	%p6, %r9, 512;
	@%p6 bra 	BB16_12;

	setp.gt.u32	%p7, %r6, 255;
	mov.f64 	%fd73, %fd72;
	@%p7 bra 	BB16_11;

	ld.shared.f64 	%fd33, [%rd1+2048];
	min.f64 	%fd73, %fd72, %fd33;
	st.shared.f64 	[%rd1], %fd73;

BB16_11:
	mov.f64 	%fd72, %fd73;
	bar.sync 	0;

BB16_12:
	mov.f64 	%fd70, %fd72;
	setp.lt.u32	%p8, %r9, 256;
	@%p8 bra 	BB16_16;

	setp.gt.u32	%p9, %r6, 127;
	mov.f64 	%fd71, %fd70;
	@%p9 bra 	BB16_15;

	ld.shared.f64 	%fd34, [%rd1+1024];
	min.f64 	%fd71, %fd70, %fd34;
	st.shared.f64 	[%rd1], %fd71;

BB16_15:
	mov.f64 	%fd70, %fd71;
	bar.sync 	0;

BB16_16:
	mov.f64 	%fd68, %fd70;
	setp.lt.u32	%p10, %r9, 128;
	@%p10 bra 	BB16_20;

	setp.gt.u32	%p11, %r6, 63;
	mov.f64 	%fd69, %fd68;
	@%p11 bra 	BB16_19;

	ld.shared.f64 	%fd35, [%rd1+512];
	min.f64 	%fd69, %fd68, %fd35;
	st.shared.f64 	[%rd1], %fd69;

BB16_19:
	mov.f64 	%fd68, %fd69;
	bar.sync 	0;

BB16_20:
	mov.f64 	%fd67, %fd68;
	setp.gt.u32	%p12, %r6, 31;
	@%p12 bra 	BB16_33;

	setp.lt.u32	%p13, %r9, 64;
	@%p13 bra 	BB16_23;

	ld.volatile.shared.f64 	%fd36, [%rd1+256];
	min.f64 	%fd67, %fd67, %fd36;
	st.volatile.shared.f64 	[%rd1], %fd67;

BB16_23:
	mov.f64 	%fd66, %fd67;
	setp.lt.u32	%p14, %r9, 32;
	@%p14 bra 	BB16_25;

	ld.volatile.shared.f64 	%fd37, [%rd1+128];
	min.f64 	%fd66, %fd66, %fd37;
	st.volatile.shared.f64 	[%rd1], %fd66;

BB16_25:
	mov.f64 	%fd65, %fd66;
	setp.lt.u32	%p15, %r9, 16;
	@%p15 bra 	BB16_27;

	ld.volatile.shared.f64 	%fd38, [%rd1+64];
	min.f64 	%fd65, %fd65, %fd38;
	st.volatile.shared.f64 	[%rd1], %fd65;

BB16_27:
	mov.f64 	%fd64, %fd65;
	setp.lt.u32	%p16, %r9, 8;
	@%p16 bra 	BB16_29;

	ld.volatile.shared.f64 	%fd39, [%rd1+32];
	min.f64 	%fd64, %fd64, %fd39;
	st.volatile.shared.f64 	[%rd1], %fd64;

BB16_29:
	mov.f64 	%fd63, %fd64;
	setp.lt.u32	%p17, %r9, 4;
	@%p17 bra 	BB16_31;

	ld.volatile.shared.f64 	%fd40, [%rd1+16];
	min.f64 	%fd63, %fd63, %fd40;
	st.volatile.shared.f64 	[%rd1], %fd63;

BB16_31:
	setp.lt.u32	%p18, %r9, 2;
	@%p18 bra 	BB16_33;

	ld.volatile.shared.f64 	%fd41, [%rd1+8];
	min.f64 	%fd42, %fd63, %fd41;
	st.volatile.shared.f64 	[%rd1], %fd42;

BB16_33:
	setp.ne.s32	%p19, %r6, 0;
	@%p19 bra 	BB16_35;

	ld.shared.f64 	%fd43, [sdata];
	cvta.to.global.u64 	%rd12, %rd3;
	mul.wide.u32 	%rd13, %r7, 8;
	add.s64 	%rd14, %rd12, %rd13;
	st.global.f64 	[%rd14], %fd43;

BB16_35:
	ret;
}

	// .globl	reduce_row_min
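// reduce_row_min(in, out, rows, cols): same structure as reduce_row_sum,
// with min as the combiner and +DBL_MAX as the identity.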
.visible .entry reduce_row_min(
	.param .u64 reduce_row_min_param_0,
	.param .u64 reduce_row_min_param_1,
	.param .u32 reduce_row_min_param_2,
	.param .u32 reduce_row_min_param_3
)
{
	.reg .pred 	%p<20>;
	.reg .b32 	%r<39>;
	.reg .f64 	%fd<74>;
	.reg .b64 	%rd<42>;


	ld.param.u64 	%rd1, [reduce_row_min_param_0];
	ld.param.u64 	%rd2, [reduce_row_min_param_1];
	ld.param.u32 	%r5, [reduce_row_min_param_2];
	ld.param.u32 	%r4, [reduce_row_min_param_3];
	mov.u32 	%r6, %ctaid.x;
	setp.ge.u32	%p1, %r6, %r5;
	@%p1 bra 	BB17_35;

	mov.u32 	%r38, %tid.x;
	mov.f64 	%fd72, 0d7FEFFFFFFFFFFFFF;
	mov.f64 	%fd73, %fd72;
	setp.ge.u32	%p2, %r38, %r4;
	@%p2 bra 	BB17_4;

	cvta.to.global.u64 	%rd3, %rd1;

BB17_3:
	mad.lo.s32 	%r8, %r6, %r4, %r38;
	mul.wide.u32 	%rd4, %r8, 8;
	add.s64 	%rd5, %rd3, %rd4;
	ld.global.f64 	%fd28, [%rd5];
	min.f64 	%fd73, %fd73, %fd28;
	mov.u32 	%r9, %ntid.x;
	add.s32 	%r38, %r9, %r38;
	setp.lt.u32	%p3, %r38, %r4;
	mov.f64 	%fd72, %fd73;
	@%p3 bra 	BB17_3;

BB17_4:
	mov.f64 	%fd70, %fd72;
	mov.u32 	%r10, %tid.x;
	mul.wide.u32 	%rd6, %r10, 8;
	mov.u64 	%rd7, sdata;
	add.s64 	%rd8, %rd7, %rd6;
	st.shared.f64 	[%rd8], %fd70;
	bar.sync 	0;
	mov.u32 	%r11, %ntid.x;
	setp.lt.u32	%p4, %r11, 1024;
	@%p4 bra 	BB17_8;

	setp.gt.u32	%p5, %r10, 511;
	mov.f64 	%fd71, %fd70;
	@%p5 bra 	BB17_7;

	ld.shared.f64 	%fd29, [%rd8+4096];
	min.f64 	%fd71, %fd70, %fd29;
	st.shared.f64 	[%rd8], %fd71;

BB17_7:
	mov.f64 	%fd70, %fd71;
	bar.sync 	0;

BB17_8:
	mov.f64 	%fd68, %fd70;
	setp.lt.u32	%p6, %r11, 512;
	@%p6 bra 	BB17_12;

	setp.gt.u32	%p7, %r10, 255;
	mov.f64 	%fd69, %fd68;
	@%p7 bra 	BB17_11;

	ld.shared.f64 	%fd30, [%rd8+2048];
	min.f64 	%fd69, %fd68, %fd30;
	st.shared.f64 	[%rd8], %fd69;

BB17_11:
	mov.f64 	%fd68, %fd69;
	bar.sync 	0;

BB17_12:
	mov.f64 	%fd66, %fd68;
	setp.lt.u32	%p8, %r11, 256;
	@%p8 bra 	BB17_16;

	setp.gt.u32	%p9, %r10, 127;
	mov.f64 	%fd67, %fd66;
	@%p9 bra 	BB17_15;

	ld.shared.f64 	%fd31, [%rd8+1024];
	min.f64 	%fd67, %fd66, %fd31;
	st.shared.f64 	[%rd8], %fd67;

BB17_15:
	mov.f64 	%fd66, %fd67;
	bar.sync 	0;

BB17_16:
	mov.f64 	%fd64, %fd66;
	setp.lt.u32	%p10, %r11, 128;
	@%p10 bra 	BB17_20;

	setp.gt.u32	%p11, %r10, 63;
	mov.f64 	%fd65, %fd64;
	@%p11 bra 	BB17_19;

	ld.shared.f64 	%fd32, [%rd8+512];
	min.f64 	%fd65, %fd64, %fd32;
	st.shared.f64 	[%rd8], %fd65;

BB17_19:
	mov.f64 	%fd64, %fd65;
	bar.sync 	0;

BB17_20:
	mov.f64 	%fd63, %fd64;
	setp.gt.u32	%p12, %r10, 31;
	@%p12 bra 	BB17_33;

	setp.lt.u32	%p13, %r11, 64;
	@%p13 bra 	BB17_23;

	ld.volatile.shared.f64 	%fd33, [%rd8+256];
	min.f64 	%fd63, %fd63, %fd33;
	st.volatile.shared.f64 	[%rd8], %fd63;

BB17_23:
	mov.f64 	%fd62, %fd63;
	setp.lt.u32	%p14, %r11, 32;
	@%p14 bra 	BB17_25;

	ld.volatile.shared.f64 	%fd34, [%rd8+128];
	min.f64 	%fd62, %fd62, %fd34;
	st.volatile.shared.f64 	[%rd8], %fd62;

BB17_25:
	mov.f64 	%fd61, %fd62;
	setp.lt.u32	%p15, %r11, 16;
	@%p15 bra 	BB17_27;

	ld.volatile.shared.f64 	%fd35, [%rd8+64];
	min.f64 	%fd61, %fd61, %fd35;
	st.volatile.shared.f64 	[%rd8], %fd61;

BB17_27:
	mov.f64 	%fd60, %fd61;
	setp.lt.u32	%p16, %r11, 8;
	@%p16 bra 	BB17_29;

	ld.volatile.shared.f64 	%fd36, [%rd8+32];
	min.f64 	%fd60, %fd60, %fd36;
	st.volatile.shared.f64 	[%rd8], %fd60;

BB17_29:
	mov.f64 	%fd59, %fd60;
	setp.lt.u32	%p17, %r11, 4;
	@%p17 bra 	BB17_31;

	ld.volatile.shared.f64 	%fd37, [%rd8+16];
	min.f64 	%fd59, %fd59, %fd37;
	st.volatile.shared.f64 	[%rd8], %fd59;

BB17_31:
	setp.lt.u32	%p18, %r11, 2;
	@%p18 bra 	BB17_33;

	ld.volatile.shared.f64 	%fd38, [%rd8+8];
	min.f64 	%fd39, %fd59, %fd38;
	st.volatile.shared.f64 	[%rd8], %fd39;

BB17_33:
	setp.ne.s32	%p19, %r10, 0;
	@%p19 bra 	BB17_35;

	ld.shared.f64 	%fd40, [sdata];
	cvta.to.global.u64 	%rd39, %rd2;
	mul.wide.u32 	%rd40, %r6, 8;
	add.s64 	%rd41, %rd39, %rd40;
	st.global.f64 	[%rd41], %fd40;

BB17_35:
	ret;
}

	// .globl	reduce_col_min
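// reduce_col_min(in, out, rows, cols): same structure as reduce_col_sum,
// with min as the combiner and +DBL_MAX as the identity.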
.visible .entry reduce_col_min(
	.param .u64 reduce_col_min_param_0,
	.param .u64 reduce_col_min_param_1,
	.param .u32 reduce_col_min_param_2,
	.param .u32 reduce_col_min_param_3
)
{
	.reg .pred 	%p<4>;
	.reg .b32 	%r<11>;
	.reg .f64 	%fd<10>;
	.reg .b64 	%rd<9>;


	ld.param.u64 	%rd2, [reduce_col_min_param_0];
	ld.param.u64 	%rd3, [reduce_col_min_param_1];
	ld.param.u32 	%r5, [reduce_col_min_param_2];
	ld.param.u32 	%r6, [reduce_col_min_param_3];
	mov.u32 	%r7, %ntid.x;
	mov.u32 	%r8, %ctaid.x;
	mov.u32 	%r9, %tid.x;
	mad.lo.s32 	%r1, %r7, %r8, %r9;
	setp.ge.u32	%p1, %r1, %r6;
	@%p1 bra 	BB18_5;

	cvta.to.global.u64 	%rd1, %rd2;
	mul.lo.s32 	%r2, %r6, %r5;
	mov.f64 	%fd8, 0d7FEFFFFFFFFFFFFF;
	mov.f64 	%fd9, %fd8;
	setp.ge.u32	%p2, %r1, %r2;
	@%p2 bra 	BB18_4;

	mov.u32 	%r10, %r1;

BB18_3:
	mov.u32 	%r3, %r10;
	mul.wide.u32 	%rd4, %r3, 8;
	add.s64 	%rd5, %rd1, %rd4;
	ld.global.f64 	%fd6, [%rd5];
	min.f64 	%fd9, %fd9, %fd6;
	add.s32 	%r4, %r3, %r6;
	setp.lt.u32	%p3, %r4, %r2;
	mov.u32 	%r10, %r4;
	mov.f64 	%fd8, %fd9;
	@%p3 bra 	BB18_3;

BB18_4:
	cvta.to.global.u64 	%rd6, %rd3;
	mul.wide.u32 	%rd7, %r1, 8;
	add.s64 	%rd8, %rd6, %rd7;
	st.global.f64 	[%rd8], %fd8;

BB18_5:
	ret;
}
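// Editor's note: reduce_col_min assigns one thread per column and walks the
// row-major matrix with a stride of one full row, accumulating from DBL_MAX
// (0d7FEFFFFFFFFFFFFF). No shared memory is needed since columns never share
// a reducer. A hedged CUDA C sketch (illustrative names):
//
//   #include <float.h>
//   __global__ void reduce_col_min(double *in, double *out,
//                                  unsigned rows, unsigned cols) {
//     unsigned col = blockIdx.x * blockDim.x + threadIdx.x;
//     if (col >= cols) return;             // guard at BB18 entry
//     double v = DBL_MAX;
//     for (unsigned i = col; i < rows * cols; i += cols)
//       v = fmin(v, in[i]);                // BB18_3 loop body
//     out[col] = v;                        // BB18_4 store
//   }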

	// .globl	reduce_prod
.visible .entry reduce_prod(
	.param .u64 reduce_prod_param_0,
	.param .u64 reduce_prod_param_1,
	.param .u32 reduce_prod_param_2
)
{
	.reg .pred 	%p<20>;
	.reg .b32 	%r<33>;
	.reg .f64 	%fd<79>;
	.reg .b64 	%rd<15>;


	ld.param.u64 	%rd2, [reduce_prod_param_0];
	ld.param.u64 	%rd3, [reduce_prod_param_1];
	ld.param.u32 	%r5, [reduce_prod_param_2];
	mov.u32 	%r6, %tid.x;
	mov.u32 	%r7, %ctaid.x;
	shl.b32 	%r8, %r7, 1;
	mov.u32 	%r9, %ntid.x;
	mad.lo.s32 	%r32, %r8, %r9, %r6;
	mov.f64 	%fd76, 0d3FF0000000000000;
	mov.f64 	%fd77, %fd76;
	setp.ge.u32	%p1, %r32, %r5;
	@%p1 bra 	BB19_4;

BB19_1:
	mov.f64 	%fd1, %fd77;
	cvta.to.global.u64 	%rd4, %rd2;
	mul.wide.u32 	%rd5, %r32, 8;
	add.s64 	%rd6, %rd4, %rd5;
	ld.global.f64 	%fd30, [%rd6];
	mul.f64 	%fd78, %fd1, %fd30;
	add.s32 	%r3, %r32, %r9;
	setp.ge.u32	%p2, %r3, %r5;
	@%p2 bra 	BB19_3;

	mul.wide.u32 	%rd8, %r3, 8;
	add.s64 	%rd9, %rd4, %rd8;
	ld.global.f64 	%fd31, [%rd9];
	mul.f64 	%fd78, %fd78, %fd31;

BB19_3:
	mov.f64 	%fd77, %fd78;
	shl.b32 	%r12, %r9, 1;
	mov.u32 	%r13, %nctaid.x;
	mad.lo.s32 	%r32, %r12, %r13, %r32;
	setp.lt.u32	%p3, %r32, %r5;
	mov.f64 	%fd76, %fd77;
	@%p3 bra 	BB19_1;

BB19_4:
	mov.f64 	%fd74, %fd76;
	mul.wide.u32 	%rd10, %r6, 8;
	mov.u64 	%rd11, sdata;
	add.s64 	%rd1, %rd11, %rd10;
	st.shared.f64 	[%rd1], %fd74;
	bar.sync 	0;
	setp.lt.u32	%p4, %r9, 1024;
	@%p4 bra 	BB19_8;

	setp.gt.u32	%p5, %r6, 511;
	mov.f64 	%fd75, %fd74;
	@%p5 bra 	BB19_7;

	ld.shared.f64 	%fd32, [%rd1+4096];
	mul.f64 	%fd75, %fd74, %fd32;
	st.shared.f64 	[%rd1], %fd75;

BB19_7:
	mov.f64 	%fd74, %fd75;
	bar.sync 	0;

BB19_8:
	mov.f64 	%fd72, %fd74;
	setp.lt.u32	%p6, %r9, 512;
	@%p6 bra 	BB19_12;

	setp.gt.u32	%p7, %r6, 255;
	mov.f64 	%fd73, %fd72;
	@%p7 bra 	BB19_11;

	ld.shared.f64 	%fd33, [%rd1+2048];
	mul.f64 	%fd73, %fd72, %fd33;
	st.shared.f64 	[%rd1], %fd73;

BB19_11:
	mov.f64 	%fd72, %fd73;
	bar.sync 	0;

BB19_12:
	mov.f64 	%fd70, %fd72;
	setp.lt.u32	%p8, %r9, 256;
	@%p8 bra 	BB19_16;

	setp.gt.u32	%p9, %r6, 127;
	mov.f64 	%fd71, %fd70;
	@%p9 bra 	BB19_15;

	ld.shared.f64 	%fd34, [%rd1+1024];
	mul.f64 	%fd71, %fd70, %fd34;
	st.shared.f64 	[%rd1], %fd71;

BB19_15:
	mov.f64 	%fd70, %fd71;
	bar.sync 	0;

BB19_16:
	mov.f64 	%fd68, %fd70;
	setp.lt.u32	%p10, %r9, 128;
	@%p10 bra 	BB19_20;

	setp.gt.u32	%p11, %r6, 63;
	mov.f64 	%fd69, %fd68;
	@%p11 bra 	BB19_19;

	ld.shared.f64 	%fd35, [%rd1+512];
	mul.f64 	%fd69, %fd68, %fd35;
	st.shared.f64 	[%rd1], %fd69;

BB19_19:
	mov.f64 	%fd68, %fd69;
	bar.sync 	0;

BB19_20:
	mov.f64 	%fd67, %fd68;
	setp.gt.u32	%p12, %r6, 31;
	@%p12 bra 	BB19_33;

	setp.lt.u32	%p13, %r9, 64;
	@%p13 bra 	BB19_23;

	ld.volatile.shared.f64 	%fd36, [%rd1+256];
	mul.f64 	%fd67, %fd67, %fd36;
	st.volatile.shared.f64 	[%rd1], %fd67;

BB19_23:
	mov.f64 	%fd66, %fd67;
	setp.lt.u32	%p14, %r9, 32;
	@%p14 bra 	BB19_25;

	ld.volatile.shared.f64 	%fd37, [%rd1+128];
	mul.f64 	%fd66, %fd66, %fd37;
	st.volatile.shared.f64 	[%rd1], %fd66;

BB19_25:
	mov.f64 	%fd65, %fd66;
	setp.lt.u32	%p15, %r9, 16;
	@%p15 bra 	BB19_27;

	ld.volatile.shared.f64 	%fd38, [%rd1+64];
	mul.f64 	%fd65, %fd65, %fd38;
	st.volatile.shared.f64 	[%rd1], %fd65;

BB19_27:
	mov.f64 	%fd64, %fd65;
	setp.lt.u32	%p16, %r9, 8;
	@%p16 bra 	BB19_29;

	ld.volatile.shared.f64 	%fd39, [%rd1+32];
	mul.f64 	%fd64, %fd64, %fd39;
	st.volatile.shared.f64 	[%rd1], %fd64;

BB19_29:
	mov.f64 	%fd63, %fd64;
	setp.lt.u32	%p17, %r9, 4;
	@%p17 bra 	BB19_31;

	ld.volatile.shared.f64 	%fd40, [%rd1+16];
	mul.f64 	%fd63, %fd63, %fd40;
	st.volatile.shared.f64 	[%rd1], %fd63;

BB19_31:
	setp.lt.u32	%p18, %r9, 2;
	@%p18 bra 	BB19_33;

	ld.volatile.shared.f64 	%fd41, [%rd1+8];
	mul.f64 	%fd42, %fd63, %fd41;
	st.volatile.shared.f64 	[%rd1], %fd42;

BB19_33:
	setp.ne.s32	%p19, %r6, 0;
	@%p19 bra 	BB19_35;

	ld.shared.f64 	%fd43, [sdata];
	cvta.to.global.u64 	%rd12, %rd3;
	mul.wide.u32 	%rd13, %r7, 8;
	add.s64 	%rd14, %rd12, %rd13;
	st.global.f64 	[%rd14], %fd43;

BB19_35:
	ret;
}
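// Editor's note: reduce_prod is the classic two-loads-per-thread grid-stride
// reduction (as in the CUDA SDK reduction sample), specialized to a product
// with identity 1.0 (0d3FF0000000000000). Each block emits one partial
// product to out[blockIdx.x], so a follow-up launch or host pass presumably
// finishes the reduction; note that the parallel tree reorders the
// multiplications, which need not be bit-identical to a sequential product.
// A hedged CUDA C sketch (illustrative names; the generated code unrolls the
// tree loop below and drops the syncs inside the final warp):
//
//   __global__ void reduce_prod(double *in, double *out, unsigned n) {
//     extern __shared__ double sdata[];
//     unsigned tid = threadIdx.x;
//     unsigned i = blockIdx.x * 2 * blockDim.x + tid;    // BB19 entry index
//     double v = 1.0;
//     while (i < n) {
//       v *= in[i];
//       if (i + blockDim.x < n) v *= in[i + blockDim.x]; // second load, bounds-checked
//       i += 2 * blockDim.x * gridDim.x;                 // BB19_3 grid stride
//     }
//     sdata[tid] = v;
//     __syncthreads();
//     for (unsigned s = blockDim.x / 2; s > 0; s >>= 1) {
//       if (tid < s) sdata[tid] *= sdata[tid + s];       // halving tree
//       __syncthreads();
//     }
//     if (tid == 0) out[blockIdx.x] = sdata[0];          // BB19_33 store
//   }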

	// .globl	reduce_row_mean
.visible .entry reduce_row_mean(
	.param .u64 reduce_row_mean_param_0,
	.param .u64 reduce_row_mean_param_1,
	.param .u32 reduce_row_mean_param_2,
	.param .u32 reduce_row_mean_param_3
)
{
	.reg .pred 	%p<20>;
	.reg .b32 	%r<39>;
	.reg .f64 	%fd<76>;
	.reg .b64 	%rd<42>;


	ld.param.u64 	%rd1, [reduce_row_mean_param_0];
	ld.param.u64 	%rd2, [reduce_row_mean_param_1];
	ld.param.u32 	%r5, [reduce_row_mean_param_2];
	ld.param.u32 	%r4, [reduce_row_mean_param_3];
	mov.u32 	%r6, %ctaid.x;
	setp.ge.u32	%p1, %r6, %r5;
	@%p1 bra 	BB20_35;

	mov.u32 	%r38, %tid.x;
	mov.f64 	%fd74, 0d0000000000000000;
	mov.f64 	%fd75, %fd74;
	setp.ge.u32	%p2, %r38, %r4;
	@%p2 bra 	BB20_4;

	cvta.to.global.u64 	%rd3, %rd1;

BB20_3:
	mad.lo.s32 	%r8, %r6, %r4, %r38;
	mul.wide.u32 	%rd4, %r8, 8;
	add.s64 	%rd5, %rd3, %rd4;
	ld.global.f64 	%fd28, [%rd5];
	add.f64 	%fd75, %fd75, %fd28;
	mov.u32 	%r9, %ntid.x;
	add.s32 	%r38, %r9, %r38;
	setp.lt.u32	%p3, %r38, %r4;
	mov.f64 	%fd74, %fd75;
	@%p3 bra 	BB20_3;

BB20_4:
	mov.f64 	%fd72, %fd74;
	mov.u32 	%r10, %tid.x;
	mul.wide.u32 	%rd6, %r10, 8;
	mov.u64 	%rd7, sdata;
	add.s64 	%rd8, %rd7, %rd6;
	st.shared.f64 	[%rd8], %fd72;
	bar.sync 	0;
	mov.u32 	%r11, %ntid.x;
	setp.lt.u32	%p4, %r11, 1024;
	@%p4 bra 	BB20_8;

	setp.gt.u32	%p5, %r10, 511;
	mov.f64 	%fd73, %fd72;
	@%p5 bra 	BB20_7;

	ld.shared.f64 	%fd29, [%rd8+4096];
	add.f64 	%fd73, %fd72, %fd29;
	st.shared.f64 	[%rd8], %fd73;

BB20_7:
	mov.f64 	%fd72, %fd73;
	bar.sync 	0;

BB20_8:
	mov.f64 	%fd70, %fd72;
	setp.lt.u32	%p6, %r11, 512;
	@%p6 bra 	BB20_12;

	setp.gt.u32	%p7, %r10, 255;
	mov.f64 	%fd71, %fd70;
	@%p7 bra 	BB20_11;

	ld.shared.f64 	%fd30, [%rd8+2048];
	add.f64 	%fd71, %fd70, %fd30;
	st.shared.f64 	[%rd8], %fd71;

BB20_11:
	mov.f64 	%fd70, %fd71;
	bar.sync 	0;

BB20_12:
	mov.f64 	%fd68, %fd70;
	setp.lt.u32	%p8, %r11, 256;
	@%p8 bra 	BB20_16;

	setp.gt.u32	%p9, %r10, 127;
	mov.f64 	%fd69, %fd68;
	@%p9 bra 	BB20_15;

	ld.shared.f64 	%fd31, [%rd8+1024];
	add.f64 	%fd69, %fd68, %fd31;
	st.shared.f64 	[%rd8], %fd69;

BB20_15:
	mov.f64 	%fd68, %fd69;
	bar.sync 	0;

BB20_16:
	mov.f64 	%fd66, %fd68;
	setp.lt.u32	%p10, %r11, 128;
	@%p10 bra 	BB20_20;

	setp.gt.u32	%p11, %r10, 63;
	mov.f64 	%fd67, %fd66;
	@%p11 bra 	BB20_19;

	ld.shared.f64 	%fd32, [%rd8+512];
	add.f64 	%fd67, %fd66, %fd32;
	st.shared.f64 	[%rd8], %fd67;

BB20_19:
	mov.f64 	%fd66, %fd67;
	bar.sync 	0;

BB20_20:
	mov.f64 	%fd65, %fd66;
	setp.gt.u32	%p12, %r10, 31;
	@%p12 bra 	BB20_33;

	setp.lt.u32	%p13, %r11, 64;
	@%p13 bra 	BB20_23;

	ld.volatile.shared.f64 	%fd33, [%rd8+256];
	add.f64 	%fd65, %fd65, %fd33;
	st.volatile.shared.f64 	[%rd8], %fd65;

BB20_23:
	mov.f64 	%fd64, %fd65;
	setp.lt.u32	%p14, %r11, 32;
	@%p14 bra 	BB20_25;

	ld.volatile.shared.f64 	%fd34, [%rd8+128];
	add.f64 	%fd64, %fd64, %fd34;
	st.volatile.shared.f64 	[%rd8], %fd64;

BB20_25:
	mov.f64 	%fd63, %fd64;
	setp.lt.u32	%p15, %r11, 16;
	@%p15 bra 	BB20_27;

	ld.volatile.shared.f64 	%fd35, [%rd8+64];
	add.f64 	%fd63, %fd63, %fd35;
	st.volatile.shared.f64 	[%rd8], %fd63;

BB20_27:
	mov.f64 	%fd62, %fd63;
	setp.lt.u32	%p16, %r11, 8;
	@%p16 bra 	BB20_29;

	ld.volatile.shared.f64 	%fd36, [%rd8+32];
	add.f64 	%fd62, %fd62, %fd36;
	st.volatile.shared.f64 	[%rd8], %fd62;

BB20_29:
	mov.f64 	%fd61, %fd62;
	setp.lt.u32	%p17, %r11, 4;
	@%p17 bra 	BB20_31;

	ld.volatile.shared.f64 	%fd37, [%rd8+16];
	add.f64 	%fd61, %fd61, %fd37;
	st.volatile.shared.f64 	[%rd8], %fd61;

BB20_31:
	setp.lt.u32	%p18, %r11, 2;
	@%p18 bra 	BB20_33;

	ld.volatile.shared.f64 	%fd38, [%rd8+8];
	add.f64 	%fd39, %fd61, %fd38;
	st.volatile.shared.f64 	[%rd8], %fd39;

BB20_33:
	setp.ne.s32	%p19, %r10, 0;
	@%p19 bra 	BB20_35;

	ld.shared.f64 	%fd40, [sdata];
	cvt.rn.f64.s32	%fd41, %r4;
	div.rn.f64 	%fd42, %fd40, %fd41;
	cvta.to.global.u64 	%rd39, %rd2;
	mul.wide.u32 	%rd40, %r6, 8;
	add.s64 	%rd41, %rd39, %rd40;
	st.global.f64 	[%rd41], %fd42;

BB20_35:
	ret;
}
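// Editor's note: reduce_row_mean follows the same one-block-per-row pattern
// as the min reduction above, accumulating with + from identity 0.0; thread 0
// (BB20_33) divides the row sum by the signed-converted column count
// (cvt.rn.f64.s32) before the store. A hedged CUDA C sketch (illustrative
// names; the generated code unrolls the tree and uses volatile accesses in
// the last warp):
//
//   __global__ void reduce_row_mean(double *in, double *out,
//                                   unsigned rows, unsigned cols) {
//     extern __shared__ double sdata[];
//     unsigned row = blockIdx.x, tid = threadIdx.x;
//     if (row >= rows) return;             // whole block exits together
//     double v = 0.0;
//     for (unsigned c = tid; c < cols; c += blockDim.x)
//       v += in[row * cols + c];           // BB20_3 partial sum
//     sdata[tid] = v;
//     __syncthreads();
//     for (unsigned s = blockDim.x / 2; s > 0; s >>= 1) {
//       if (tid < s) sdata[tid] += sdata[tid + s];
//       __syncthreads();
//     }
//     if (tid == 0) out[row] = sdata[0] / (double)cols;  // BB20_33 mean
//   }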

	// .globl	reduce_col_mean
.visible .entry reduce_col_mean(
	.param .u64 reduce_col_mean_param_0,
	.param .u64 reduce_col_mean_param_1,
	.param .u32 reduce_col_mean_param_2,
	.param .u32 reduce_col_mean_param_3
)
{
	.reg .pred 	%p<4>;
	.reg .b32 	%r<11>;
	.reg .f64 	%fd<12>;
	.reg .b64 	%rd<9>;


	ld.param.u64 	%rd2, [reduce_col_mean_param_0];
	ld.param.u64 	%rd3, [reduce_col_mean_param_1];
	ld.param.u32 	%r5, [reduce_col_mean_param_2];
	ld.param.u32 	%r6, [reduce_col_mean_param_3];
	mov.u32 	%r7, %ntid.x;
	mov.u32 	%r8, %ctaid.x;
	mov.u32 	%r9, %tid.x;
	mad.lo.s32 	%r1, %r7, %r8, %r9;
	setp.ge.u32	%p1, %r1, %r6;
	@%p1 bra 	BB21_5;

	cvta.to.global.u64 	%rd1, %rd2;
	mul.lo.s32 	%r2, %r6, %r5;
	mov.f64 	%fd10, 0d0000000000000000;
	mov.f64 	%fd11, %fd10;
	setp.ge.u32	%p2, %r1, %r2;
	@%p2 bra 	BB21_4;

	mov.u32 	%r10, %r1;

BB21_3:
	mov.u32 	%r3, %r10;
	mul.wide.u32 	%rd4, %r3, 8;
	add.s64 	%rd5, %rd1, %rd4;
	ld.global.f64 	%fd6, [%rd5];
	add.f64 	%fd11, %fd11, %fd6;
	add.s32 	%r4, %r3, %r6;
	setp.lt.u32	%p3, %r4, %r2;
	mov.u32 	%r10, %r4;
	mov.f64 	%fd10, %fd11;
	@%p3 bra 	BB21_3;

BB21_4:
	cvta.to.global.u64 	%rd6, %rd3;
	cvt.rn.f64.s32	%fd7, %r5;
	div.rn.f64 	%fd8, %fd10, %fd7;
	mul.wide.u32 	%rd7, %r1, 8;
	add.s64 	%rd8, %rd6, %rd7;
	st.global.f64 	[%rd8], %fd8;

BB21_5:
	ret;
}
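// Editor's note: reduce_col_mean mirrors reduce_col_min with + in place of
// min and a final division by the row count (the cvt.rn.f64.s32 /
// div.rn.f64 pair in BB21_4). Hedged sketch (illustrative names):
//
//   __global__ void reduce_col_mean(double *in, double *out,
//                                   unsigned rows, unsigned cols) {
//     unsigned col = blockIdx.x * blockDim.x + threadIdx.x;
//     if (col >= cols) return;
//     double v = 0.0;
//     for (unsigned i = col; i < rows * cols; i += cols)
//       v += in[i];                        // BB21_3 column walk
//     out[col] = v / (double)rows;         // BB21_4 mean
//   }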

	// .globl	matrix_exp
.visible .entry matrix_exp(
	.param .u64 matrix_exp_param_0,
	.param .u64 matrix_exp_param_1,
	.param .u32 matrix_exp_param_2
)
{
	.reg .pred 	%p<5>;
	.reg .f32 	%f<3>;
	.reg .b32 	%r<21>;
	.reg .f64 	%fd<41>;
	.reg .b64 	%rd<10>;


	ld.param.u64 	%rd2, [matrix_exp_param_0];
	ld.param.u64 	%rd3, [matrix_exp_param_1];
	ld.param.u32 	%r5, [matrix_exp_param_2];
	mov.u32 	%r6, %ctaid.x;
	mov.u32 	%r7, %ntid.x;
	mov.u32 	%r8, %tid.x;
	mad.lo.s32 	%r1, %r7, %r6, %r8;
	setp.ge.u32	%p1, %r1, %r5;
	@%p1 bra 	BB22_5;

	cvta.to.global.u64 	%rd4, %rd2;
	cvt.s64.s32	%rd1, %r1;
	mul.wide.s32 	%rd5, %r1, 8;
	add.s64 	%rd6, %rd4, %rd5;
	ld.global.f64 	%fd1, [%rd6];
	mov.f64 	%fd6, 0d4338000000000000;
	mov.f64 	%fd7, 0d3FF71547652B82FE;
	fma.rn.f64 	%fd8, %fd1, %fd7, %fd6;
	{
	.reg .b32 %temp; 
	mov.b64 	{%r2, %temp}, %fd8;
	}
	mov.f64 	%fd9, 0dC338000000000000;
	add.rn.f64 	%fd10, %fd8, %fd9;
	mov.f64 	%fd11, 0dBFE62E42FEFA39EF;
	fma.rn.f64 	%fd12, %fd10, %fd11, %fd1;
	mov.f64 	%fd13, 0dBC7ABC9E3B39803F;
	fma.rn.f64 	%fd14, %fd10, %fd13, %fd12;
	mov.f64 	%fd15, 0d3E928AF3FCA213EA;
	mov.f64 	%fd16, 0d3E5ADE1569CE2BDF;
	fma.rn.f64 	%fd17, %fd16, %fd14, %fd15;
	mov.f64 	%fd18, 0d3EC71DEE62401315;
	fma.rn.f64 	%fd19, %fd17, %fd14, %fd18;
	mov.f64 	%fd20, 0d3EFA01997C89EB71;
	fma.rn.f64 	%fd21, %fd19, %fd14, %fd20;
	mov.f64 	%fd22, 0d3F2A01A014761F65;
	fma.rn.f64 	%fd23, %fd21, %fd14, %fd22;
	mov.f64 	%fd24, 0d3F56C16C1852B7AF;
	fma.rn.f64 	%fd25, %fd23, %fd14, %fd24;
	mov.f64 	%fd26, 0d3F81111111122322;
	fma.rn.f64 	%fd27, %fd25, %fd14, %fd26;
	mov.f64 	%fd28, 0d3FA55555555502A1;
	fma.rn.f64 	%fd29, %fd27, %fd14, %fd28;
	mov.f64 	%fd30, 0d3FC5555555555511;
	fma.rn.f64 	%fd31, %fd29, %fd14, %fd30;
	mov.f64 	%fd32, 0d3FE000000000000B;
	fma.rn.f64 	%fd33, %fd31, %fd14, %fd32;
	mov.f64 	%fd34, 0d3FF0000000000000;
	fma.rn.f64 	%fd35, %fd33, %fd14, %fd34;
	fma.rn.f64 	%fd36, %fd35, %fd14, %fd34;
	{
	.reg .b32 %temp; 
	mov.b64 	{%r3, %temp}, %fd36;
	}
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r4}, %fd36;
	}
	shl.b32 	%r9, %r2, 20;
	add.s32 	%r10, %r4, %r9;
	mov.b64 	%fd40, {%r3, %r10};
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r11}, %fd1;
	}
	mov.b32 	 %f2, %r11;
	abs.f32 	%f1, %f2;
	setp.lt.f32	%p2, %f1, 0f4086232B;
	@%p2 bra 	BB22_4;

	setp.lt.f64	%p3, %fd1, 0d0000000000000000;
	add.f64 	%fd37, %fd1, 0d7FF0000000000000;
	selp.f64	%fd40, 0d0000000000000000, %fd37, %p3;
	setp.geu.f32	%p4, %f1, 0f40874800;
	@%p4 bra 	BB22_4;

	shr.u32 	%r12, %r2, 31;
	add.s32 	%r13, %r2, %r12;
	shr.s32 	%r14, %r13, 1;
	shl.b32 	%r15, %r14, 20;
	add.s32 	%r16, %r15, %r4;
	mov.b64 	%fd38, {%r3, %r16};
	sub.s32 	%r17, %r2, %r14;
	shl.b32 	%r18, %r17, 20;
	add.s32 	%r19, %r18, 1072693248;
	mov.u32 	%r20, 0;
	mov.b64 	%fd39, {%r20, %r19};
	mul.f64 	%fd40, %fd38, %fd39;

BB22_4:
	cvta.to.global.u64 	%rd7, %rd3;
	shl.b64 	%rd8, %rd1, 3;
	add.s64 	%rd9, %rd7, %rd8;
	st.global.f64 	[%rd9], %fd40;

BB22_5:
	ret;
}
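// Editor's note: matrix_exp is an element-wise exp over a linearized matrix;
// the long body above is the double-precision exp inlined from the device
// math library. It rounds x*log2(e) (0d3FF71547652B82FE) to an integer j via
// the 2^52 shifter constant (0d4338000000000000), subtracts j*ln2 in hi/lo
// pieces (Cody-Waite), evaluates a Horner polynomial for e^r, then scales by
// 2^j by adding j<<20 to the exponent field. The float compare against
// 0f4086232B reinterprets the high 32 bits of x as a float, a cheap screen
// for |x| near 708.4, where exp leaves normal double range; the fall-through
// slow path splits the 2^j scaling in two to avoid intermediate overflow.
// The presumed CUDA source is simply:
//
//   __global__ void matrix_exp(double *in, double *out, unsigned n) {
//     unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i < n) out[i] = exp(in[i]);
//   }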

.func  (.param .b64 func_retval0) __internal_accurate_pow(
	.param .b64 __internal_accurate_pow_param_0,
	.param .b64 __internal_accurate_pow_param_1
)
{
	.reg .pred 	%p<10>;
	.reg .f32 	%f<3>;
	.reg .b32 	%r<52>;
	.reg .f64 	%fd<134>;


	ld.param.f64 	%fd12, [__internal_accurate_pow_param_0];
	ld.param.f64 	%fd13, [__internal_accurate_pow_param_1];
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r49}, %fd12;
	}
	{
	.reg .b32 %temp; 
	mov.b64 	{%r48, %temp}, %fd12;
	}
	shr.u32 	%r50, %r49, 20;
	setp.ne.s32	%p1, %r50, 0;
	@%p1 bra 	BB23_2;

	mul.f64 	%fd14, %fd12, 0d4350000000000000;
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r49}, %fd14;
	}
	{
	.reg .b32 %temp; 
	mov.b64 	{%r48, %temp}, %fd14;
	}
	shr.u32 	%r16, %r49, 20;
	add.s32 	%r50, %r16, -54;

BB23_2:
	add.s32 	%r51, %r50, -1023;
	and.b32  	%r17, %r49, -2146435073;
	or.b32  	%r18, %r17, 1072693248;
	mov.b64 	%fd132, {%r48, %r18};
	setp.lt.u32	%p2, %r18, 1073127583;
	@%p2 bra 	BB23_4;

	{
	.reg .b32 %temp; 
	mov.b64 	{%r19, %temp}, %fd132;
	}
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r20}, %fd132;
	}
	add.s32 	%r21, %r20, -1048576;
	mov.b64 	%fd132, {%r19, %r21};
	add.s32 	%r51, %r50, -1022;

BB23_4:
	add.f64 	%fd16, %fd132, 0d3FF0000000000000;
	// inline asm
	rcp.approx.ftz.f64 %fd15,%fd16;
	// inline asm
	neg.f64 	%fd17, %fd16;
	mov.f64 	%fd18, 0d3FF0000000000000;
	fma.rn.f64 	%fd19, %fd17, %fd15, %fd18;
	fma.rn.f64 	%fd20, %fd19, %fd19, %fd19;
	fma.rn.f64 	%fd21, %fd20, %fd15, %fd15;
	add.f64 	%fd22, %fd132, 0dBFF0000000000000;
	mul.f64 	%fd23, %fd22, %fd21;
	fma.rn.f64 	%fd24, %fd22, %fd21, %fd23;
	mul.f64 	%fd25, %fd24, %fd24;
	mov.f64 	%fd26, 0d3ED0F5D241AD3B5A;
	mov.f64 	%fd27, 0d3EB0F5FF7D2CAFE2;
	fma.rn.f64 	%fd28, %fd27, %fd25, %fd26;
	mov.f64 	%fd29, 0d3EF3B20A75488A3F;
	fma.rn.f64 	%fd30, %fd28, %fd25, %fd29;
	mov.f64 	%fd31, 0d3F1745CDE4FAECD5;
	fma.rn.f64 	%fd32, %fd30, %fd25, %fd31;
	mov.f64 	%fd33, 0d3F3C71C7258A578B;
	fma.rn.f64 	%fd34, %fd32, %fd25, %fd33;
	mov.f64 	%fd35, 0d3F6249249242B910;
	fma.rn.f64 	%fd36, %fd34, %fd25, %fd35;
	mov.f64 	%fd37, 0d3F89999999999DFB;
	fma.rn.f64 	%fd38, %fd36, %fd25, %fd37;
	sub.f64 	%fd39, %fd22, %fd24;
	add.f64 	%fd40, %fd39, %fd39;
	neg.f64 	%fd41, %fd24;
	fma.rn.f64 	%fd42, %fd41, %fd22, %fd40;
	mul.f64 	%fd43, %fd21, %fd42;
	fma.rn.f64 	%fd44, %fd25, %fd38, 0d3FB5555555555555;
	mov.f64 	%fd45, 0d3FB5555555555555;
	sub.f64 	%fd46, %fd45, %fd44;
	fma.rn.f64 	%fd47, %fd25, %fd38, %fd46;
	add.f64 	%fd48, %fd47, 0d0000000000000000;
	add.f64 	%fd49, %fd48, 0dBC46A4CB00B9E7B0;
	add.f64 	%fd50, %fd44, %fd49;
	sub.f64 	%fd51, %fd44, %fd50;
	add.f64 	%fd52, %fd49, %fd51;
	mul.rn.f64 	%fd53, %fd24, %fd24;
	neg.f64 	%fd54, %fd53;
	fma.rn.f64 	%fd55, %fd24, %fd24, %fd54;
	{
	.reg .b32 %temp; 
	mov.b64 	{%r22, %temp}, %fd43;
	}
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r23}, %fd43;
	}
	add.s32 	%r24, %r23, 1048576;
	mov.b64 	%fd56, {%r22, %r24};
	fma.rn.f64 	%fd57, %fd24, %fd56, %fd55;
	mul.rn.f64 	%fd58, %fd53, %fd24;
	neg.f64 	%fd59, %fd58;
	fma.rn.f64 	%fd60, %fd53, %fd24, %fd59;
	fma.rn.f64 	%fd61, %fd53, %fd43, %fd60;
	fma.rn.f64 	%fd62, %fd57, %fd24, %fd61;
	mul.rn.f64 	%fd63, %fd50, %fd58;
	neg.f64 	%fd64, %fd63;
	fma.rn.f64 	%fd65, %fd50, %fd58, %fd64;
	fma.rn.f64 	%fd66, %fd50, %fd62, %fd65;
	fma.rn.f64 	%fd67, %fd52, %fd58, %fd66;
	add.f64 	%fd68, %fd63, %fd67;
	sub.f64 	%fd69, %fd63, %fd68;
	add.f64 	%fd70, %fd67, %fd69;
	add.f64 	%fd71, %fd24, %fd68;
	sub.f64 	%fd72, %fd24, %fd71;
	add.f64 	%fd73, %fd68, %fd72;
	add.f64 	%fd74, %fd70, %fd73;
	add.f64 	%fd75, %fd43, %fd74;
	add.f64 	%fd76, %fd71, %fd75;
	sub.f64 	%fd77, %fd71, %fd76;
	add.f64 	%fd78, %fd75, %fd77;
	xor.b32  	%r25, %r51, -2147483648;
	mov.u32 	%r26, 1127219200;
	mov.b64 	%fd79, {%r25, %r26};
	mov.u32 	%r27, -2147483648;
	mov.b64 	%fd80, {%r27, %r26};
	sub.f64 	%fd81, %fd79, %fd80;
	mov.f64 	%fd82, 0d3FE62E42FEFA39EF;
	fma.rn.f64 	%fd83, %fd81, %fd82, %fd76;
	neg.f64 	%fd84, %fd81;
	fma.rn.f64 	%fd85, %fd84, %fd82, %fd83;
	sub.f64 	%fd86, %fd85, %fd76;
	sub.f64 	%fd87, %fd78, %fd86;
	mov.f64 	%fd88, 0d3C7ABC9E3B39803F;
	fma.rn.f64 	%fd89, %fd81, %fd88, %fd87;
	add.f64 	%fd90, %fd83, %fd89;
	sub.f64 	%fd91, %fd83, %fd90;
	add.f64 	%fd92, %fd89, %fd91;
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r28}, %fd13;
	}
	add.s32 	%r29, %r28, %r28;
	setp.gt.u32	%p3, %r29, -33554433;
	and.b32  	%r30, %r28, -15728641;
	selp.b32	%r31, %r30, %r28, %p3;
	{
	.reg .b32 %temp; 
	mov.b64 	{%r32, %temp}, %fd13;
	}
	mov.b64 	%fd93, {%r32, %r31};
	mul.rn.f64 	%fd94, %fd90, %fd93;
	neg.f64 	%fd95, %fd94;
	fma.rn.f64 	%fd96, %fd90, %fd93, %fd95;
	fma.rn.f64 	%fd97, %fd92, %fd93, %fd96;
	add.f64 	%fd4, %fd94, %fd97;
	sub.f64 	%fd98, %fd94, %fd4;
	add.f64 	%fd5, %fd97, %fd98;
	mov.f64 	%fd99, 0d4338000000000000;
	mov.f64 	%fd100, 0d3FF71547652B82FE;
	fma.rn.f64 	%fd101, %fd4, %fd100, %fd99;
	{
	.reg .b32 %temp; 
	mov.b64 	{%r13, %temp}, %fd101;
	}
	mov.f64 	%fd102, 0dC338000000000000;
	add.rn.f64 	%fd103, %fd101, %fd102;
	mov.f64 	%fd104, 0dBFE62E42FEFA39EF;
	fma.rn.f64 	%fd105, %fd103, %fd104, %fd4;
	mov.f64 	%fd106, 0dBC7ABC9E3B39803F;
	fma.rn.f64 	%fd107, %fd103, %fd106, %fd105;
	mov.f64 	%fd108, 0d3E928AF3FCA213EA;
	mov.f64 	%fd109, 0d3E5ADE1569CE2BDF;
	fma.rn.f64 	%fd110, %fd109, %fd107, %fd108;
	mov.f64 	%fd111, 0d3EC71DEE62401315;
	fma.rn.f64 	%fd112, %fd110, %fd107, %fd111;
	mov.f64 	%fd113, 0d3EFA01997C89EB71;
	fma.rn.f64 	%fd114, %fd112, %fd107, %fd113;
	mov.f64 	%fd115, 0d3F2A01A014761F65;
	fma.rn.f64 	%fd116, %fd114, %fd107, %fd115;
	mov.f64 	%fd117, 0d3F56C16C1852B7AF;
	fma.rn.f64 	%fd118, %fd116, %fd107, %fd117;
	mov.f64 	%fd119, 0d3F81111111122322;
	fma.rn.f64 	%fd120, %fd118, %fd107, %fd119;
	mov.f64 	%fd121, 0d3FA55555555502A1;
	fma.rn.f64 	%fd122, %fd120, %fd107, %fd121;
	mov.f64 	%fd123, 0d3FC5555555555511;
	fma.rn.f64 	%fd124, %fd122, %fd107, %fd123;
	mov.f64 	%fd125, 0d3FE000000000000B;
	fma.rn.f64 	%fd126, %fd124, %fd107, %fd125;
	fma.rn.f64 	%fd127, %fd126, %fd107, %fd18;
	fma.rn.f64 	%fd128, %fd127, %fd107, %fd18;
	{
	.reg .b32 %temp; 
	mov.b64 	{%r14, %temp}, %fd128;
	}
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r15}, %fd128;
	}
	shl.b32 	%r33, %r13, 20;
	add.s32 	%r34, %r15, %r33;
	mov.b64 	%fd133, {%r14, %r34};
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r35}, %fd4;
	}
	mov.b32 	 %f2, %r35;
	abs.f32 	%f1, %f2;
	setp.lt.f32	%p4, %f1, 0f4086232B;
	@%p4 bra 	BB23_7;

	setp.lt.f64	%p5, %fd4, 0d0000000000000000;
	add.f64 	%fd129, %fd4, 0d7FF0000000000000;
	selp.f64	%fd133, 0d0000000000000000, %fd129, %p5;
	setp.geu.f32	%p6, %f1, 0f40874800;
	@%p6 bra 	BB23_7;

	shr.u32 	%r36, %r13, 31;
	add.s32 	%r37, %r13, %r36;
	shr.s32 	%r38, %r37, 1;
	shl.b32 	%r39, %r38, 20;
	add.s32 	%r40, %r39, %r15;
	mov.b64 	%fd130, {%r14, %r40};
	sub.s32 	%r41, %r13, %r38;
	shl.b32 	%r42, %r41, 20;
	add.s32 	%r43, %r42, 1072693248;
	mov.u32 	%r44, 0;
	mov.b64 	%fd131, {%r44, %r43};
	mul.f64 	%fd133, %fd130, %fd131;

BB23_7:
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r45}, %fd133;
	}
	and.b32  	%r46, %r45, 2147483647;
	setp.ne.s32	%p7, %r46, 2146435072;
	{
	.reg .b32 %temp; 
	mov.b64 	{%r47, %temp}, %fd133;
	}
	setp.ne.s32	%p8, %r47, 0;
	or.pred  	%p9, %p8, %p7;
	@!%p9 bra 	BB23_9;
	bra.uni 	BB23_8;

BB23_8:
	fma.rn.f64 	%fd133, %fd133, %fd5, %fd133;

BB23_9:
	st.param.f64	[func_retval0+0], %fd133;
	ret;
}
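// Editor's note: __internal_accurate_pow is not SystemML code; it appears to
// be the device math library's pow(double, double) core, inlined here by
// NVVM. Under that reading it decomposes |a| into m * 2^e with m in
// [sqrt(2)/2, sqrt(2)) by exponent-field surgery (rescaling subnormals by
// 2^54, 0d4350000000000000, and adjusting e by -54), computes log(m) from an
// atanh-style series in t = (m-1)/(m+1) carried as an unevaluated
// double-double (hi, lo) pair, folds in e*ln2, multiplies by b (trimmed when
// |b| is huge), and finishes with the same inlined exp used by matrix_exp
// above; BB23_8 applies the low-order term as a final correction when the
// result is finite. Conceptually, as a hedged one-liner:
//
//   // pow(a, b) ~= exp(b * log(a)), with log(a) and b*log(a) tracked in
//   // double-double precision so the final result stays near 1 ulp.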
