
kernels/SystemML.ptx

//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-21124049
// Cuda compilation tools, release 8.0, V8.0.44
// Based on LLVM 3.4svn
//
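// kernels/SystemML.ptx: Apache SystemML's GPU kernels compiled to PTX.
// Every kernel below comes in a double-precision ("_d") and a
// single-precision ("_f") variant. Each computes one global thread index
// (blockIdx.x * blockDim.x + threadIdx.x) and handles at most one
// element, row, or nonzero after a bounds check. Parameter names used in
// the comments are inferred from the code, not taken from the original
// CUDA source.
//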

.version 5.0
.target sm_30
.address_size 64
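
// Math-library support: forward declarations of an internal trig-
// reduction helper and an accurate pow routine (the latter is called by
// the pow operator in the cellwise kernels below), plus constant tables
// emitted by the CUDA math headers: __cudart_i2opi_f/_d hold bits of
// 2/pi for trigonometric range reduction, and __cudart_sin_cos_coeffs
// holds sin/cos polynomial coefficients.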

	// .globl	double2float_f
.func  (.param .b64 func_retval0) __internal_trig_reduction_slowpathd
(
	.param .b64 __internal_trig_reduction_slowpathd_param_0,
	.param .b64 __internal_trig_reduction_slowpathd_param_1
)
;
.func  (.param .b64 func_retval0) __internal_accurate_pow
(
	.param .b64 __internal_accurate_pow_param_0,
	.param .b64 __internal_accurate_pow_param_1
)
;
.extern .shared .align 1 .b8 my_sdata[];
.const .align 4 .b8 __cudart_i2opi_f[24] = {65, 144, 67, 60, 153, 149, 98, 219, 192, 221, 52, 245, 209, 87, 39, 252, 41, 21, 68, 78, 110, 131, 249, 162};
.const .align 8 .b8 __cudart_i2opi_d[144] = {8, 93, 141, 31, 177, 95, 251, 107, 234, 146, 82, 138, 247, 57, 7, 61, 123, 241, 229, 235, 199, 186, 39, 117, 45, 234, 95, 158, 102, 63, 70, 79, 183, 9, 203, 39, 207, 126, 54, 109, 31, 109, 10, 90, 139, 17, 47, 239, 15, 152, 5, 222, 255, 151, 248, 31, 59, 40, 249, 189, 139, 95, 132, 156, 244, 57, 83, 131, 57, 214, 145, 57, 65, 126, 95, 180, 38, 112, 156, 233, 132, 68, 187, 46, 245, 53, 130, 232, 62, 167, 41, 177, 28, 235, 29, 254, 28, 146, 209, 9, 234, 46, 73, 6, 224, 210, 77, 66, 58, 110, 36, 183, 97, 197, 187, 222, 171, 99, 81, 254, 65, 144, 67, 60, 153, 149, 98, 219, 192, 221, 52, 245, 209, 87, 39, 252, 41, 21, 68, 78, 110, 131, 249, 162};
.const .align 8 .b8 __cudart_sin_cos_coeffs[128] = {186, 94, 120, 249, 101, 219, 229, 61, 70, 210, 176, 44, 241, 229, 90, 190, 146, 227, 172, 105, 227, 29, 199, 62, 161, 98, 219, 25, 160, 1, 42, 191, 24, 8, 17, 17, 17, 17, 129, 63, 84, 85, 85, 85, 85, 85, 197, 191, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 100, 129, 253, 32, 131, 255, 168, 189, 40, 133, 239, 193, 167, 238, 33, 62, 217, 230, 6, 142, 79, 126, 146, 190, 233, 188, 221, 25, 160, 1, 250, 62, 71, 93, 193, 22, 108, 193, 86, 191, 81, 85, 85, 85, 85, 85, 165, 63, 0, 0, 0, 0, 0, 0, 224, 191, 0, 0, 0, 0, 0, 0, 240, 63};

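// double2float_f(in, out, n): elementwise narrowing cast. For each
// global thread index i < n, out[i] = (float)in[i], rounded to nearest
// even (cvt.rn.f32.f64).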
.visible .entry double2float_f(
	.param .u64 double2float_f_param_0,
	.param .u64 double2float_f_param_1,
	.param .u32 double2float_f_param_2
)
{
	.reg .pred 	%p<2>;
	.reg .f32 	%f<2>;
	.reg .b32 	%r<6>;
	.reg .f64 	%fd<2>;
	.reg .b64 	%rd<9>;


	ld.param.u64 	%rd1, [double2float_f_param_0];
	ld.param.u64 	%rd2, [double2float_f_param_1];
	ld.param.u32 	%r2, [double2float_f_param_2];
	mov.u32 	%r3, %ctaid.x;
	mov.u32 	%r4, %ntid.x;
	mov.u32 	%r5, %tid.x;
	mad.lo.s32 	%r1, %r4, %r3, %r5;
	setp.ge.s32	%p1, %r1, %r2;
	@%p1 bra 	BB0_2;

	cvta.to.global.u64 	%rd3, %rd1;
	mul.wide.s32 	%rd4, %r1, 8;
	add.s64 	%rd5, %rd3, %rd4;
	ld.global.f64 	%fd1, [%rd5];
	cvt.rn.f32.f64	%f1, %fd1;
	cvta.to.global.u64 	%rd6, %rd2;
	mul.wide.s32 	%rd7, %r1, 4;
	add.s64 	%rd8, %rd6, %rd7;
	st.global.f32 	[%rd8], %f1;

BB0_2:
	ret;
}

	// .globl	float2double_f
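// float2double_f(in, out, n): elementwise widening cast; for i < n,
// out[i] = (double)in[i] (exact).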
.visible .entry float2double_f(
	.param .u64 float2double_f_param_0,
	.param .u64 float2double_f_param_1,
	.param .u32 float2double_f_param_2
)
{
	.reg .pred 	%p<2>;
	.reg .f32 	%f<2>;
	.reg .b32 	%r<6>;
	.reg .f64 	%fd<2>;
	.reg .b64 	%rd<9>;


	ld.param.u64 	%rd1, [float2double_f_param_0];
	ld.param.u64 	%rd2, [float2double_f_param_1];
	ld.param.u32 	%r2, [float2double_f_param_2];
	mov.u32 	%r3, %ctaid.x;
	mov.u32 	%r4, %ntid.x;
	mov.u32 	%r5, %tid.x;
	mad.lo.s32 	%r1, %r4, %r3, %r5;
	setp.ge.s32	%p1, %r1, %r2;
	@%p1 bra 	BB1_2;

	cvta.to.global.u64 	%rd3, %rd1;
	mul.wide.s32 	%rd4, %r1, 4;
	add.s64 	%rd5, %rd3, %rd4;
	ld.global.f32 	%f1, [%rd5];
	cvt.f64.f32	%fd1, %f1;
	cvta.to.global.u64 	%rd6, %rd2;
	mul.wide.s32 	%rd7, %r1, 8;
	add.s64 	%rd8, %rd6, %rd7;
	st.global.f64 	[%rd8], %fd1;

BB1_2:
	ret;
}

	// .globl	slice_sparse_dense_row_d
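// slice_sparse_dense_row_d(vals, rowPtr, colInd, ret, rl, ru, cl, cu,
// retClen): slices rows [rl, ru] and columns [cl, cu] (all inclusive) of
// a CSR sparse matrix into the dense row-major output ret, one thread
// per output row. Thread t scans the nonzeros of row rl + t and writes
// each value whose column c lies in [cl, cu] to ret[t*retClen + c - cl].
// A hypothetical CUDA reconstruction (names and layout are guesses, not
// the shipped source):
//
//   extern "C" __global__ void slice_sparse_dense_row_d(
//       double *vals, int *rowPtr, int *colInd, double *ret,
//       int rl, int ru, int cl, int cu, int retClen) {
//     int t = blockIdx.x * blockDim.x + threadIdx.x;
//     int r = rl + t;                          // one thread per row
//     if (r > ru) return;
//     for (int j = rowPtr[r]; j < rowPtr[r + 1]; j++) {
//       int c = colInd[j];
//       if (c >= cl && c <= cu)                // column inside the slice
//         ret[t * retClen + c - cl] = vals[j];
//     }
//   }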
.visible .entry slice_sparse_dense_row_d(
	.param .u64 slice_sparse_dense_row_d_param_0,
	.param .u64 slice_sparse_dense_row_d_param_1,
	.param .u64 slice_sparse_dense_row_d_param_2,
	.param .u64 slice_sparse_dense_row_d_param_3,
	.param .u32 slice_sparse_dense_row_d_param_4,
	.param .u32 slice_sparse_dense_row_d_param_5,
	.param .u32 slice_sparse_dense_row_d_param_6,
	.param .u32 slice_sparse_dense_row_d_param_7,
	.param .u32 slice_sparse_dense_row_d_param_8
)
{
	.reg .pred 	%p<7>;
	.reg .b32 	%r<24>;
	.reg .f64 	%fd<2>;
	.reg .b64 	%rd<23>;


	ld.param.u64 	%rd9, [slice_sparse_dense_row_d_param_0];
	ld.param.u64 	%rd10, [slice_sparse_dense_row_d_param_1];
	ld.param.u64 	%rd11, [slice_sparse_dense_row_d_param_2];
	ld.param.u64 	%rd12, [slice_sparse_dense_row_d_param_3];
	ld.param.u32 	%r15, [slice_sparse_dense_row_d_param_4];
	ld.param.u32 	%r16, [slice_sparse_dense_row_d_param_5];
	ld.param.u32 	%r12, [slice_sparse_dense_row_d_param_6];
	ld.param.u32 	%r13, [slice_sparse_dense_row_d_param_7];
	ld.param.u32 	%r14, [slice_sparse_dense_row_d_param_8];
	mov.u32 	%r17, %ntid.x;
	mov.u32 	%r18, %ctaid.x;
	mov.u32 	%r19, %tid.x;
	mad.lo.s32 	%r1, %r17, %r18, %r19;
	add.s32 	%r2, %r1, %r15;
	setp.gt.s32	%p1, %r2, %r16;
	@%p1 bra 	BB2_6;

	cvta.to.global.u64 	%rd13, %rd10;
	mul.wide.s32 	%rd14, %r2, 4;
	add.s64 	%rd1, %rd13, %rd14;
	ld.global.u32 	%r23, [%rd1];
	ld.global.u32 	%r22, [%rd1+4];
	setp.ge.s32	%p2, %r23, %r22;
	@%p2 bra 	BB2_6;

	cvta.to.global.u64 	%rd2, %rd12;
	cvta.to.global.u64 	%rd15, %rd9;
	cvta.to.global.u64 	%rd16, %rd11;
	mul.lo.s32 	%r20, %r1, %r14;
	sub.s32 	%r5, %r20, %r12;
	mul.wide.s32 	%rd17, %r23, 8;
	add.s64 	%rd22, %rd15, %rd17;
	mul.wide.s32 	%rd18, %r23, 4;
	add.s64 	%rd21, %rd16, %rd18;

BB2_3:
	ld.global.u32 	%r8, [%rd21];
	setp.lt.s32	%p3, %r8, %r12;
	setp.gt.s32	%p4, %r8, %r13;
	or.pred  	%p5, %p3, %p4;
	@%p5 bra 	BB2_5;

	ld.global.f64 	%fd1, [%rd22];
	add.s32 	%r21, %r5, %r8;
	mul.wide.s32 	%rd19, %r21, 8;
	add.s64 	%rd20, %rd2, %rd19;
	st.global.f64 	[%rd20], %fd1;
	ld.global.u32 	%r22, [%rd1+4];

BB2_5:
	add.s64 	%rd22, %rd22, 8;
	add.s64 	%rd21, %rd21, 4;
	add.s32 	%r23, %r23, 1;
	setp.lt.s32	%p6, %r23, %r22;
	@%p6 bra 	BB2_3;

BB2_6:
	ret;
}

	// .globl	slice_sparse_dense_row_f
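// slice_sparse_dense_row_f: single-precision variant of
// slice_sparse_dense_row_d above; identical control flow with 4-byte
// values.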
.visible .entry slice_sparse_dense_row_f(
	.param .u64 slice_sparse_dense_row_f_param_0,
	.param .u64 slice_sparse_dense_row_f_param_1,
	.param .u64 slice_sparse_dense_row_f_param_2,
	.param .u64 slice_sparse_dense_row_f_param_3,
	.param .u32 slice_sparse_dense_row_f_param_4,
	.param .u32 slice_sparse_dense_row_f_param_5,
	.param .u32 slice_sparse_dense_row_f_param_6,
	.param .u32 slice_sparse_dense_row_f_param_7,
	.param .u32 slice_sparse_dense_row_f_param_8
)
{
	.reg .pred 	%p<7>;
	.reg .f32 	%f<2>;
	.reg .b32 	%r<24>;
	.reg .b64 	%rd<22>;


	ld.param.u64 	%rd9, [slice_sparse_dense_row_f_param_0];
	ld.param.u64 	%rd10, [slice_sparse_dense_row_f_param_1];
	ld.param.u64 	%rd11, [slice_sparse_dense_row_f_param_2];
	ld.param.u64 	%rd12, [slice_sparse_dense_row_f_param_3];
	ld.param.u32 	%r15, [slice_sparse_dense_row_f_param_4];
	ld.param.u32 	%r16, [slice_sparse_dense_row_f_param_5];
	ld.param.u32 	%r12, [slice_sparse_dense_row_f_param_6];
	ld.param.u32 	%r13, [slice_sparse_dense_row_f_param_7];
	ld.param.u32 	%r14, [slice_sparse_dense_row_f_param_8];
	mov.u32 	%r17, %ntid.x;
	mov.u32 	%r18, %ctaid.x;
	mov.u32 	%r19, %tid.x;
	mad.lo.s32 	%r1, %r17, %r18, %r19;
	add.s32 	%r2, %r1, %r15;
	setp.gt.s32	%p1, %r2, %r16;
	@%p1 bra 	BB3_6;

	cvta.to.global.u64 	%rd13, %rd10;
	mul.wide.s32 	%rd14, %r2, 4;
	add.s64 	%rd1, %rd13, %rd14;
	ld.global.u32 	%r23, [%rd1];
	ld.global.u32 	%r22, [%rd1+4];
	setp.ge.s32	%p2, %r23, %r22;
	@%p2 bra 	BB3_6;

	cvta.to.global.u64 	%rd2, %rd12;
	cvta.to.global.u64 	%rd15, %rd9;
	cvta.to.global.u64 	%rd16, %rd11;
	mul.lo.s32 	%r20, %r1, %r14;
	sub.s32 	%r5, %r20, %r12;
	mul.wide.s32 	%rd17, %r23, 4;
	add.s64 	%rd21, %rd15, %rd17;
	add.s64 	%rd20, %rd16, %rd17;

BB3_3:
	ld.global.u32 	%r8, [%rd20];
	setp.lt.s32	%p3, %r8, %r12;
	setp.gt.s32	%p4, %r8, %r13;
	or.pred  	%p5, %p3, %p4;
	@%p5 bra 	BB3_5;

	ld.global.f32 	%f1, [%rd21];
	add.s32 	%r21, %r5, %r8;
	mul.wide.s32 	%rd18, %r21, 4;
	add.s64 	%rd19, %rd2, %rd18;
	st.global.f32 	[%rd19], %f1;
	ld.global.u32 	%r22, [%rd1+4];

BB3_5:
	add.s64 	%rd21, %rd21, 4;
	add.s64 	%rd20, %rd20, 4;
	add.s32 	%r23, %r23, 1;
	setp.lt.s32	%p6, %r23, %r22;
	@%p6 bra 	BB3_3;

BB3_6:
	ret;
}

	// .globl	slice_sparse_dense_nnz_d
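// slice_sparse_dense_nnz_d(vals, rowPtr, colInd, ret, rl, ru, cl, cu,
// retClen): same slice as slice_sparse_dense_row_d, but parallelized
// over nonzeros. Thread t handles nonzero i = rowPtr[rl] + t (skipped
// if i >= rowPtr[ru+1] or colInd[i] lies outside [cl, cu]), finds the
// owning row r by a linear scan of rowPtr, and writes
// ret[(r - rl)*retClen + colInd[i] - cl] = vals[i].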
.visible .entry slice_sparse_dense_nnz_d(
	.param .u64 slice_sparse_dense_nnz_d_param_0,
	.param .u64 slice_sparse_dense_nnz_d_param_1,
	.param .u64 slice_sparse_dense_nnz_d_param_2,
	.param .u64 slice_sparse_dense_nnz_d_param_3,
	.param .u32 slice_sparse_dense_nnz_d_param_4,
	.param .u32 slice_sparse_dense_nnz_d_param_5,
	.param .u32 slice_sparse_dense_nnz_d_param_6,
	.param .u32 slice_sparse_dense_nnz_d_param_7,
	.param .u32 slice_sparse_dense_nnz_d_param_8
)
{
	.reg .pred 	%p<6>;
	.reg .b32 	%r<22>;
	.reg .f64 	%fd<2>;
	.reg .b64 	%rd<22>;


	ld.param.u64 	%rd5, [slice_sparse_dense_nnz_d_param_0];
	ld.param.u64 	%rd8, [slice_sparse_dense_nnz_d_param_1];
	ld.param.u64 	%rd6, [slice_sparse_dense_nnz_d_param_2];
	ld.param.u64 	%rd7, [slice_sparse_dense_nnz_d_param_3];
	ld.param.u32 	%r5, [slice_sparse_dense_nnz_d_param_4];
	ld.param.u32 	%r9, [slice_sparse_dense_nnz_d_param_5];
	ld.param.u32 	%r6, [slice_sparse_dense_nnz_d_param_6];
	ld.param.u32 	%r7, [slice_sparse_dense_nnz_d_param_7];
	ld.param.u32 	%r8, [slice_sparse_dense_nnz_d_param_8];
	mov.u32 	%r10, %ntid.x;
	mov.u32 	%r11, %ctaid.x;
	mov.u32 	%r12, %tid.x;
	mad.lo.s32 	%r13, %r10, %r11, %r12;
	cvta.to.global.u64 	%rd1, %rd8;
	mul.wide.s32 	%rd9, %r5, 4;
	add.s64 	%rd10, %rd1, %rd9;
	ld.global.u32 	%r14, [%rd10];
	add.s32 	%r1, %r13, %r14;
	mul.wide.s32 	%rd11, %r9, 4;
	add.s64 	%rd12, %rd1, %rd11;
	ld.global.u32 	%r15, [%rd12+4];
	setp.ge.s32	%p1, %r1, %r15;
	@%p1 bra 	BB4_5;

	cvta.to.global.u64 	%rd2, %rd7;
	cvta.to.global.u64 	%rd3, %rd5;
	cvta.to.global.u64 	%rd13, %rd6;
	cvt.s64.s32	%rd4, %r1;
	mul.wide.s32 	%rd14, %r1, 4;
	add.s64 	%rd15, %rd13, %rd14;
	ld.global.u32 	%r2, [%rd15];
	setp.lt.s32	%p2, %r2, %r6;
	setp.gt.s32	%p3, %r2, %r7;
	or.pred  	%p4, %p2, %p3;
	@%p4 bra 	BB4_5;

	mov.u32 	%r21, %r5;

BB4_3:
	mov.u32 	%r3, %r21;
	add.s32 	%r4, %r3, 1;
	mul.wide.s32 	%rd16, %r4, 4;
	add.s64 	%rd17, %rd1, %rd16;
	ld.global.u32 	%r16, [%rd17];
	setp.le.s32	%p5, %r16, %r1;
	mov.u32 	%r21, %r4;
	@%p5 bra 	BB4_3;

	shl.b64 	%rd18, %rd4, 3;
	add.s64 	%rd19, %rd3, %rd18;
	ld.global.f64 	%fd1, [%rd19];
	sub.s32 	%r17, %r3, %r5;
	mul.lo.s32 	%r18, %r17, %r8;
	sub.s32 	%r19, %r18, %r6;
	add.s32 	%r20, %r19, %r2;
	mul.wide.s32 	%rd20, %r20, 8;
	add.s64 	%rd21, %rd2, %rd20;
	st.global.f64 	[%rd21], %fd1;

BB4_5:
	ret;
}

	// .globl	slice_sparse_dense_nnz_f
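// slice_sparse_dense_nnz_f: single-precision variant of
// slice_sparse_dense_nnz_d above.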
.visible .entry slice_sparse_dense_nnz_f(
	.param .u64 slice_sparse_dense_nnz_f_param_0,
	.param .u64 slice_sparse_dense_nnz_f_param_1,
	.param .u64 slice_sparse_dense_nnz_f_param_2,
	.param .u64 slice_sparse_dense_nnz_f_param_3,
	.param .u32 slice_sparse_dense_nnz_f_param_4,
	.param .u32 slice_sparse_dense_nnz_f_param_5,
	.param .u32 slice_sparse_dense_nnz_f_param_6,
	.param .u32 slice_sparse_dense_nnz_f_param_7,
	.param .u32 slice_sparse_dense_nnz_f_param_8
)
{
	.reg .pred 	%p<6>;
	.reg .f32 	%f<2>;
	.reg .b32 	%r<22>;
	.reg .b64 	%rd<22>;


	ld.param.u64 	%rd5, [slice_sparse_dense_nnz_f_param_0];
	ld.param.u64 	%rd8, [slice_sparse_dense_nnz_f_param_1];
	ld.param.u64 	%rd6, [slice_sparse_dense_nnz_f_param_2];
	ld.param.u64 	%rd7, [slice_sparse_dense_nnz_f_param_3];
	ld.param.u32 	%r5, [slice_sparse_dense_nnz_f_param_4];
	ld.param.u32 	%r9, [slice_sparse_dense_nnz_f_param_5];
	ld.param.u32 	%r6, [slice_sparse_dense_nnz_f_param_6];
	ld.param.u32 	%r7, [slice_sparse_dense_nnz_f_param_7];
	ld.param.u32 	%r8, [slice_sparse_dense_nnz_f_param_8];
	mov.u32 	%r10, %ntid.x;
	mov.u32 	%r11, %ctaid.x;
	mov.u32 	%r12, %tid.x;
	mad.lo.s32 	%r13, %r10, %r11, %r12;
	cvta.to.global.u64 	%rd1, %rd8;
	mul.wide.s32 	%rd9, %r5, 4;
	add.s64 	%rd10, %rd1, %rd9;
	ld.global.u32 	%r14, [%rd10];
	add.s32 	%r1, %r13, %r14;
	mul.wide.s32 	%rd11, %r9, 4;
	add.s64 	%rd12, %rd1, %rd11;
	ld.global.u32 	%r15, [%rd12+4];
	setp.ge.s32	%p1, %r1, %r15;
	@%p1 bra 	BB5_5;

	cvta.to.global.u64 	%rd2, %rd7;
	cvta.to.global.u64 	%rd3, %rd5;
	cvta.to.global.u64 	%rd13, %rd6;
	cvt.s64.s32	%rd4, %r1;
	mul.wide.s32 	%rd14, %r1, 4;
	add.s64 	%rd15, %rd13, %rd14;
	ld.global.u32 	%r2, [%rd15];
	setp.lt.s32	%p2, %r2, %r6;
	setp.gt.s32	%p3, %r2, %r7;
	or.pred  	%p4, %p2, %p3;
	@%p4 bra 	BB5_5;

	mov.u32 	%r21, %r5;

BB5_3:
	mov.u32 	%r3, %r21;
	add.s32 	%r4, %r3, 1;
	mul.wide.s32 	%rd16, %r4, 4;
	add.s64 	%rd17, %rd1, %rd16;
	ld.global.u32 	%r16, [%rd17];
	setp.le.s32	%p5, %r16, %r1;
	mov.u32 	%r21, %r4;
	@%p5 bra 	BB5_3;

	shl.b64 	%rd18, %rd4, 2;
	add.s64 	%rd19, %rd3, %rd18;
	ld.global.f32 	%f1, [%rd19];
	sub.s32 	%r17, %r3, %r5;
	mul.lo.s32 	%r18, %r17, %r8;
	sub.s32 	%r19, %r18, %r6;
	add.s32 	%r20, %r19, %r2;
	mul.wide.s32 	%rd20, %r20, 4;
	add.s64 	%rd21, %rd2, %rd20;
	st.global.f32 	[%rd21], %f1;

BB5_5:
	ret;
}

	// .globl	slice_dense_dense_d
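// slice_dense_dense_d(in, ret, rl, ru, cl, cu, inClen, retRlen,
// retClen): dense slice, one thread per output cell. For output index i
// with ix = i / retClen and iy = i % retClen (valid while ix < retRlen),
// ret[i] = in[(ix + rl)*inClen + (iy + cl)]. ru and cu are passed but
// unused; the output extent is given by retRlen/retClen.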
.visible .entry slice_dense_dense_d(
	.param .u64 slice_dense_dense_d_param_0,
	.param .u64 slice_dense_dense_d_param_1,
	.param .u32 slice_dense_dense_d_param_2,
	.param .u32 slice_dense_dense_d_param_3,
	.param .u32 slice_dense_dense_d_param_4,
	.param .u32 slice_dense_dense_d_param_5,
	.param .u32 slice_dense_dense_d_param_6,
	.param .u32 slice_dense_dense_d_param_7,
	.param .u32 slice_dense_dense_d_param_8
)
{
	.reg .pred 	%p<4>;
	.reg .b32 	%r<15>;
	.reg .f64 	%fd<2>;
	.reg .b64 	%rd<9>;


	ld.param.u64 	%rd1, [slice_dense_dense_d_param_0];
	ld.param.u64 	%rd2, [slice_dense_dense_d_param_1];
	ld.param.u32 	%r3, [slice_dense_dense_d_param_2];
	ld.param.u32 	%r4, [slice_dense_dense_d_param_4];
	ld.param.u32 	%r5, [slice_dense_dense_d_param_6];
	ld.param.u32 	%r7, [slice_dense_dense_d_param_7];
	ld.param.u32 	%r6, [slice_dense_dense_d_param_8];
	mov.u32 	%r8, %ctaid.x;
	mov.u32 	%r9, %ntid.x;
	mov.u32 	%r10, %tid.x;
	mad.lo.s32 	%r1, %r9, %r8, %r10;
	div.s32 	%r2, %r1, %r6;
	setp.lt.s32	%p1, %r2, %r7;
	setp.gt.s32	%p2, %r6, -1;
	and.pred  	%p3, %p1, %p2;
	@!%p3 bra 	BB6_2;
	bra.uni 	BB6_1;

BB6_1:
	rem.s32 	%r11, %r1, %r6;
	cvta.to.global.u64 	%rd3, %rd1;
	add.s32 	%r12, %r2, %r3;
	add.s32 	%r13, %r11, %r4;
	mad.lo.s32 	%r14, %r12, %r5, %r13;
	mul.wide.s32 	%rd4, %r14, 8;
	add.s64 	%rd5, %rd3, %rd4;
	ld.global.f64 	%fd1, [%rd5];
	cvta.to.global.u64 	%rd6, %rd2;
	mul.wide.s32 	%rd7, %r1, 8;
	add.s64 	%rd8, %rd6, %rd7;
	st.global.f64 	[%rd8], %fd1;

BB6_2:
	ret;
}

	// .globl	slice_dense_dense_f
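// slice_dense_dense_f: single-precision variant of slice_dense_dense_d
// above.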
.visible .entry slice_dense_dense_f(
	.param .u64 slice_dense_dense_f_param_0,
	.param .u64 slice_dense_dense_f_param_1,
	.param .u32 slice_dense_dense_f_param_2,
	.param .u32 slice_dense_dense_f_param_3,
	.param .u32 slice_dense_dense_f_param_4,
	.param .u32 slice_dense_dense_f_param_5,
	.param .u32 slice_dense_dense_f_param_6,
	.param .u32 slice_dense_dense_f_param_7,
	.param .u32 slice_dense_dense_f_param_8
)
{
	.reg .pred 	%p<4>;
	.reg .f32 	%f<2>;
	.reg .b32 	%r<15>;
	.reg .b64 	%rd<9>;


	ld.param.u64 	%rd1, [slice_dense_dense_f_param_0];
	ld.param.u64 	%rd2, [slice_dense_dense_f_param_1];
	ld.param.u32 	%r3, [slice_dense_dense_f_param_2];
	ld.param.u32 	%r4, [slice_dense_dense_f_param_4];
	ld.param.u32 	%r5, [slice_dense_dense_f_param_6];
	ld.param.u32 	%r7, [slice_dense_dense_f_param_7];
	ld.param.u32 	%r6, [slice_dense_dense_f_param_8];
	mov.u32 	%r8, %ctaid.x;
	mov.u32 	%r9, %ntid.x;
	mov.u32 	%r10, %tid.x;
	mad.lo.s32 	%r1, %r9, %r8, %r10;
	div.s32 	%r2, %r1, %r6;
	setp.lt.s32	%p1, %r2, %r7;
	setp.gt.s32	%p2, %r6, -1;
	and.pred  	%p3, %p1, %p2;
	@!%p3 bra 	BB7_2;
	bra.uni 	BB7_1;

BB7_1:
	rem.s32 	%r11, %r1, %r6;
	cvta.to.global.u64 	%rd3, %rd1;
	add.s32 	%r12, %r2, %r3;
	add.s32 	%r13, %r11, %r4;
	mad.lo.s32 	%r14, %r12, %r5, %r13;
	mul.wide.s32 	%rd4, %r14, 4;
	add.s64 	%rd5, %rd3, %rd4;
	ld.global.f32 	%f1, [%rd5];
	cvta.to.global.u64 	%rd6, %rd2;
	mul.wide.s32 	%rd7, %r1, 4;
	add.s64 	%rd8, %rd6, %rd7;
	st.global.f32 	[%rd8], %f1;

BB7_2:
	ret;
}

	// .globl	copy_u2l_dense_d
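// copy_u2l_dense_d(ret, dim, N): mirrors the upper triangle of a
// dim x dim row-major matrix into its lower triangle in place. Thread i
// computes ix = i / dim, iy = i % dim and, when iy > ix (strictly above
// the diagonal) and iy*dim + ix < N, copies ret[i] to ret[iy*dim + ix].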
.visible .entry copy_u2l_dense_d(
	.param .u64 copy_u2l_dense_d_param_0,
	.param .u32 copy_u2l_dense_d_param_1,
	.param .u32 copy_u2l_dense_d_param_2
)
{
	.reg .pred 	%p<4>;
	.reg .b32 	%r<10>;
	.reg .f64 	%fd<2>;
	.reg .b64 	%rd<7>;


	ld.param.u64 	%rd1, [copy_u2l_dense_d_param_0];
	ld.param.u32 	%r3, [copy_u2l_dense_d_param_1];
	ld.param.u32 	%r4, [copy_u2l_dense_d_param_2];
	mov.u32 	%r5, %ntid.x;
	mov.u32 	%r6, %ctaid.x;
	mov.u32 	%r7, %tid.x;
	mad.lo.s32 	%r1, %r5, %r6, %r7;
	div.s32 	%r8, %r1, %r3;
	rem.s32 	%r9, %r1, %r3;
	mad.lo.s32 	%r2, %r9, %r3, %r8;
	setp.gt.s32	%p1, %r9, %r8;
	setp.lt.s32	%p2, %r2, %r4;
	and.pred  	%p3, %p1, %p2;
	@!%p3 bra 	BB8_2;
	bra.uni 	BB8_1;

BB8_1:
	cvta.to.global.u64 	%rd2, %rd1;
	mul.wide.s32 	%rd3, %r1, 8;
	add.s64 	%rd4, %rd2, %rd3;
	ld.global.f64 	%fd1, [%rd4];
	mul.wide.s32 	%rd5, %r2, 8;
	add.s64 	%rd6, %rd2, %rd5;
	st.global.f64 	[%rd6], %fd1;

BB8_2:
	ret;
}

	// .globl	copy_u2l_dense_f
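// copy_u2l_dense_f: single-precision variant of copy_u2l_dense_d above.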
.visible .entry copy_u2l_dense_f(
	.param .u64 copy_u2l_dense_f_param_0,
	.param .u32 copy_u2l_dense_f_param_1,
	.param .u32 copy_u2l_dense_f_param_2
)
{
	.reg .pred 	%p<4>;
	.reg .f32 	%f<2>;
	.reg .b32 	%r<10>;
	.reg .b64 	%rd<7>;


	ld.param.u64 	%rd1, [copy_u2l_dense_f_param_0];
	ld.param.u32 	%r3, [copy_u2l_dense_f_param_1];
	ld.param.u32 	%r4, [copy_u2l_dense_f_param_2];
	mov.u32 	%r5, %ntid.x;
	mov.u32 	%r6, %ctaid.x;
	mov.u32 	%r7, %tid.x;
	mad.lo.s32 	%r1, %r5, %r6, %r7;
	div.s32 	%r8, %r1, %r3;
	rem.s32 	%r9, %r1, %r3;
	mad.lo.s32 	%r2, %r9, %r3, %r8;
	setp.gt.s32	%p1, %r9, %r8;
	setp.lt.s32	%p2, %r2, %r4;
	and.pred  	%p3, %p1, %p2;
	@!%p3 bra 	BB9_2;
	bra.uni 	BB9_1;

BB9_1:
	cvta.to.global.u64 	%rd2, %rd1;
	mul.wide.s32 	%rd3, %r1, 4;
	add.s64 	%rd4, %rd2, %rd3;
	ld.global.f32 	%f1, [%rd4];
	mul.wide.s32 	%rd5, %r2, 4;
	add.s64 	%rd6, %rd2, %rd5;
	st.global.f32 	[%rd6], %f1;

BB9_2:
	ret;
}

	// .globl	relu_d
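// relu_d(in, ret, rlen, clen): rectified linear unit over an rlen x clen
// matrix, one thread per cell: ret[i] = max(0.0, in[i]). A hypothetical
// CUDA reconstruction (sketch only; names are guesses):
//
//   extern "C" __global__ void relu_d(double *A, double *ret,
//                                     int rlen, int clen) {
//     int i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i / clen < rlen && clen > -1)   // same guard as the PTX below
//       ret[i] = fmax(0.0, A[i]);
//   }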
.visible .entry relu_d(
	.param .u64 relu_d_param_0,
	.param .u64 relu_d_param_1,
	.param .u32 relu_d_param_2,
	.param .u32 relu_d_param_3
)
{
	.reg .pred 	%p<4>;
	.reg .b32 	%r<8>;
	.reg .f64 	%fd<4>;
	.reg .b64 	%rd<8>;


	ld.param.u64 	%rd1, [relu_d_param_0];
	ld.param.u64 	%rd2, [relu_d_param_1];
	ld.param.u32 	%r2, [relu_d_param_2];
	ld.param.u32 	%r3, [relu_d_param_3];
	mov.u32 	%r4, %ctaid.x;
	mov.u32 	%r5, %ntid.x;
	mov.u32 	%r6, %tid.x;
	mad.lo.s32 	%r1, %r5, %r4, %r6;
	div.s32 	%r7, %r1, %r3;
	setp.lt.s32	%p1, %r7, %r2;
	setp.gt.s32	%p2, %r3, -1;
	and.pred  	%p3, %p1, %p2;
	@!%p3 bra 	BB10_2;
	bra.uni 	BB10_1;

BB10_1:
	cvta.to.global.u64 	%rd3, %rd1;
	mul.wide.s32 	%rd4, %r1, 8;
	add.s64 	%rd5, %rd3, %rd4;
	ld.global.f64 	%fd1, [%rd5];
	mov.f64 	%fd2, 0d0000000000000000;
	max.f64 	%fd3, %fd2, %fd1;
	cvta.to.global.u64 	%rd6, %rd2;
	add.s64 	%rd7, %rd6, %rd4;
	st.global.f64 	[%rd7], %fd3;

BB10_2:
	ret;
}

	// .globl	relu_f
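// relu_f: single-precision variant of relu_d. The max is carried out in
// double precision: the f32 input is widened, max.f64 is applied, and
// the result is rounded back to f32.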
.visible .entry relu_f(
	.param .u64 relu_f_param_0,
	.param .u64 relu_f_param_1,
	.param .u32 relu_f_param_2,
	.param .u32 relu_f_param_3
)
{
	.reg .pred 	%p<4>;
	.reg .f32 	%f<3>;
	.reg .b32 	%r<8>;
	.reg .f64 	%fd<4>;
	.reg .b64 	%rd<8>;


	ld.param.u64 	%rd1, [relu_f_param_0];
	ld.param.u64 	%rd2, [relu_f_param_1];
	ld.param.u32 	%r2, [relu_f_param_2];
	ld.param.u32 	%r3, [relu_f_param_3];
	mov.u32 	%r4, %ctaid.x;
	mov.u32 	%r5, %ntid.x;
	mov.u32 	%r6, %tid.x;
	mad.lo.s32 	%r1, %r5, %r4, %r6;
	div.s32 	%r7, %r1, %r3;
	setp.lt.s32	%p1, %r7, %r2;
	setp.gt.s32	%p2, %r3, -1;
	and.pred  	%p3, %p1, %p2;
	@!%p3 bra 	BB11_2;
	bra.uni 	BB11_1;

BB11_1:
	cvta.to.global.u64 	%rd3, %rd1;
	mul.wide.s32 	%rd4, %r1, 4;
	add.s64 	%rd5, %rd3, %rd4;
	ld.global.f32 	%f1, [%rd5];
	cvt.f64.f32	%fd1, %f1;
	mov.f64 	%fd2, 0d0000000000000000;
	max.f64 	%fd3, %fd2, %fd1;
	cvt.rn.f32.f64	%f2, %fd3;
	cvta.to.global.u64 	%rd6, %rd2;
	add.s64 	%rd7, %rd6, %rd4;
	st.global.f32 	[%rd7], %f2;

BB11_2:
	ret;
}

	// .globl	relu_backward_d
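// relu_backward_d(X, dout, ret, rlen, clen): ReLU backward pass. For
// each cell i, ret[i] = (X[i] > 0.0) ? dout[i] : 0.0; dout is only read
// when X[i] is strictly positive (setp.leu also sends NaN inputs to the
// zero branch).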
.visible .entry relu_backward_d(
	.param .u64 relu_backward_d_param_0,
	.param .u64 relu_backward_d_param_1,
	.param .u64 relu_backward_d_param_2,
	.param .u32 relu_backward_d_param_3,
	.param .u32 relu_backward_d_param_4
)
{
	.reg .pred 	%p<5>;
	.reg .b32 	%r<8>;
	.reg .f64 	%fd<6>;
	.reg .b64 	%rd<14>;


	ld.param.u64 	%rd2, [relu_backward_d_param_0];
	ld.param.u64 	%rd3, [relu_backward_d_param_1];
	ld.param.u64 	%rd4, [relu_backward_d_param_2];
	ld.param.u32 	%r2, [relu_backward_d_param_3];
	ld.param.u32 	%r3, [relu_backward_d_param_4];
	mov.u32 	%r4, %ntid.x;
	mov.u32 	%r5, %ctaid.x;
	mov.u32 	%r6, %tid.x;
	mad.lo.s32 	%r1, %r4, %r5, %r6;
	div.s32 	%r7, %r1, %r3;
	setp.lt.s32	%p1, %r7, %r2;
	setp.gt.s32	%p2, %r3, -1;
	and.pred  	%p3, %p1, %p2;
	@!%p3 bra 	BB12_4;
	bra.uni 	BB12_1;

BB12_1:
	cvta.to.global.u64 	%rd5, %rd2;
	cvt.s64.s32	%rd1, %r1;
	mul.wide.s32 	%rd6, %r1, 8;
	add.s64 	%rd7, %rd5, %rd6;
	ld.global.f64 	%fd4, [%rd7];
	mov.f64 	%fd5, 0d0000000000000000;
	setp.leu.f64	%p4, %fd4, 0d0000000000000000;
	@%p4 bra 	BB12_3;

	cvta.to.global.u64 	%rd8, %rd3;
	shl.b64 	%rd9, %rd1, 3;
	add.s64 	%rd10, %rd8, %rd9;
	ld.global.f64 	%fd5, [%rd10];

BB12_3:
	cvta.to.global.u64 	%rd11, %rd4;
	shl.b64 	%rd12, %rd1, 3;
	add.s64 	%rd13, %rd11, %rd12;
	st.global.f64 	[%rd13], %fd5;

BB12_4:
	ret;
}

	// .globl	relu_backward_f
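// relu_backward_f: single-precision variant of relu_backward_d above.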
.visible .entry relu_backward_f(
	.param .u64 relu_backward_f_param_0,
	.param .u64 relu_backward_f_param_1,
	.param .u64 relu_backward_f_param_2,
	.param .u32 relu_backward_f_param_3,
	.param .u32 relu_backward_f_param_4
)
{
	.reg .pred 	%p<5>;
	.reg .f32 	%f<6>;
	.reg .b32 	%r<8>;
	.reg .b64 	%rd<14>;


	ld.param.u64 	%rd2, [relu_backward_f_param_0];
	ld.param.u64 	%rd3, [relu_backward_f_param_1];
	ld.param.u64 	%rd4, [relu_backward_f_param_2];
	ld.param.u32 	%r2, [relu_backward_f_param_3];
	ld.param.u32 	%r3, [relu_backward_f_param_4];
	mov.u32 	%r4, %ntid.x;
	mov.u32 	%r5, %ctaid.x;
	mov.u32 	%r6, %tid.x;
	mad.lo.s32 	%r1, %r4, %r5, %r6;
	div.s32 	%r7, %r1, %r3;
	setp.lt.s32	%p1, %r7, %r2;
	setp.gt.s32	%p2, %r3, -1;
	and.pred  	%p3, %p1, %p2;
	@!%p3 bra 	BB13_4;
	bra.uni 	BB13_1;

BB13_1:
	cvta.to.global.u64 	%rd5, %rd2;
	cvt.s64.s32	%rd1, %r1;
	mul.wide.s32 	%rd6, %r1, 4;
	add.s64 	%rd7, %rd5, %rd6;
	ld.global.f32 	%f4, [%rd7];
	mov.f32 	%f5, 0f00000000;
	setp.leu.f32	%p4, %f4, 0f00000000;
	@%p4 bra 	BB13_3;

	cvta.to.global.u64 	%rd8, %rd3;
	shl.b64 	%rd9, %rd1, 2;
	add.s64 	%rd10, %rd8, %rd9;
	ld.global.f32 	%f5, [%rd10];

BB13_3:
	cvta.to.global.u64 	%rd11, %rd4;
	shl.b64 	%rd12, %rd1, 2;
	add.s64 	%rd13, %rd11, %rd12;
	st.global.f32 	[%rd13], %f5;

BB13_4:
	ret;
}

	// .globl	inplace_add_d
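// inplace_add_d(in, ret, rlen, clen): elementwise in-place accumulation,
// ret[i] += in[i] for every cell of an rlen x clen matrix.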
.visible .entry inplace_add_d(
	.param .u64 inplace_add_d_param_0,
	.param .u64 inplace_add_d_param_1,
	.param .u32 inplace_add_d_param_2,
	.param .u32 inplace_add_d_param_3
)
{
	.reg .pred 	%p<4>;
	.reg .b32 	%r<8>;
	.reg .f64 	%fd<4>;
	.reg .b64 	%rd<8>;


	ld.param.u64 	%rd1, [inplace_add_d_param_0];
	ld.param.u64 	%rd2, [inplace_add_d_param_1];
	ld.param.u32 	%r2, [inplace_add_d_param_2];
	ld.param.u32 	%r3, [inplace_add_d_param_3];
	mov.u32 	%r4, %ctaid.x;
	mov.u32 	%r5, %ntid.x;
	mov.u32 	%r6, %tid.x;
	mad.lo.s32 	%r1, %r5, %r4, %r6;
	div.s32 	%r7, %r1, %r3;
	setp.lt.s32	%p1, %r7, %r2;
	setp.gt.s32	%p2, %r3, -1;
	and.pred  	%p3, %p1, %p2;
	@!%p3 bra 	BB14_2;
	bra.uni 	BB14_1;

BB14_1:
	cvta.to.global.u64 	%rd3, %rd1;
	mul.wide.s32 	%rd4, %r1, 8;
	add.s64 	%rd5, %rd3, %rd4;
	cvta.to.global.u64 	%rd6, %rd2;
	add.s64 	%rd7, %rd6, %rd4;
	ld.global.f64 	%fd1, [%rd7];
	ld.global.f64 	%fd2, [%rd5];
	add.f64 	%fd3, %fd2, %fd1;
	st.global.f64 	[%rd7], %fd3;

BB14_2:
	ret;
}

	// .globl	inplace_add_f
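// inplace_add_f: single-precision variant of inplace_add_d above.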
.visible .entry inplace_add_f(
	.param .u64 inplace_add_f_param_0,
	.param .u64 inplace_add_f_param_1,
	.param .u32 inplace_add_f_param_2,
	.param .u32 inplace_add_f_param_3
)
{
	.reg .pred 	%p<4>;
	.reg .f32 	%f<4>;
	.reg .b32 	%r<8>;
	.reg .b64 	%rd<8>;


	ld.param.u64 	%rd1, [inplace_add_f_param_0];
	ld.param.u64 	%rd2, [inplace_add_f_param_1];
	ld.param.u32 	%r2, [inplace_add_f_param_2];
	ld.param.u32 	%r3, [inplace_add_f_param_3];
	mov.u32 	%r4, %ctaid.x;
	mov.u32 	%r5, %ntid.x;
	mov.u32 	%r6, %tid.x;
	mad.lo.s32 	%r1, %r5, %r4, %r6;
	div.s32 	%r7, %r1, %r3;
	setp.lt.s32	%p1, %r7, %r2;
	setp.gt.s32	%p2, %r3, -1;
	and.pred  	%p3, %p1, %p2;
	@!%p3 bra 	BB15_2;
	bra.uni 	BB15_1;

BB15_1:
	cvta.to.global.u64 	%rd3, %rd1;
	mul.wide.s32 	%rd4, %r1, 4;
	add.s64 	%rd5, %rd3, %rd4;
	cvta.to.global.u64 	%rd6, %rd2;
	add.s64 	%rd7, %rd6, %rd4;
	ld.global.f32 	%f1, [%rd7];
	ld.global.f32 	%f2, [%rd5];
	add.f32 	%f3, %f2, %f1;
	st.global.f32 	[%rd7], %f3;

BB15_2:
	ret;
}

	// .globl	bias_add_d
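// bias_add_d(input, bias, ret, rlen, clen, PQ): channelwise bias
// addition. With i % clen the position within a row and PQ the number of
// spatial positions per channel, the channel index is (i % clen) / PQ
// and ret[i] = input[i] + bias[(i % clen) / PQ].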
.visible .entry bias_add_d(
	.param .u64 bias_add_d_param_0,
	.param .u64 bias_add_d_param_1,
	.param .u64 bias_add_d_param_2,
	.param .u32 bias_add_d_param_3,
	.param .u32 bias_add_d_param_4,
	.param .u32 bias_add_d_param_5
)
{
	.reg .pred 	%p<4>;
	.reg .b32 	%r<11>;
	.reg .f64 	%fd<4>;
	.reg .b64 	%rd<12>;


	ld.param.u64 	%rd1, [bias_add_d_param_0];
	ld.param.u64 	%rd2, [bias_add_d_param_1];
	ld.param.u64 	%rd3, [bias_add_d_param_2];
	ld.param.u32 	%r4, [bias_add_d_param_3];
	ld.param.u32 	%r2, [bias_add_d_param_4];
	ld.param.u32 	%r3, [bias_add_d_param_5];
	mov.u32 	%r5, %ctaid.x;
	mov.u32 	%r6, %ntid.x;
	mov.u32 	%r7, %tid.x;
	mad.lo.s32 	%r1, %r6, %r5, %r7;
	div.s32 	%r8, %r1, %r2;
	setp.lt.s32	%p1, %r8, %r4;
	setp.gt.s32	%p2, %r2, -1;
	and.pred  	%p3, %p1, %p2;
	@!%p3 bra 	BB16_2;
	bra.uni 	BB16_1;

BB16_1:
	rem.s32 	%r9, %r1, %r2;
	cvta.to.global.u64 	%rd4, %rd1;
	mul.wide.s32 	%rd5, %r1, 8;
	add.s64 	%rd6, %rd4, %rd5;
	div.s32 	%r10, %r9, %r3;
	cvta.to.global.u64 	%rd7, %rd2;
	mul.wide.s32 	%rd8, %r10, 8;
	add.s64 	%rd9, %rd7, %rd8;
	ld.global.f64 	%fd1, [%rd9];
	ld.global.f64 	%fd2, [%rd6];
	add.f64 	%fd3, %fd2, %fd1;
	cvta.to.global.u64 	%rd10, %rd3;
	add.s64 	%rd11, %rd10, %rd5;
	st.global.f64 	[%rd11], %fd3;

BB16_2:
	ret;
}

	// .globl	bias_add_f
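// bias_add_f: single-precision variant of bias_add_d above.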
.visible .entry bias_add_f(
	.param .u64 bias_add_f_param_0,
	.param .u64 bias_add_f_param_1,
	.param .u64 bias_add_f_param_2,
	.param .u32 bias_add_f_param_3,
	.param .u32 bias_add_f_param_4,
	.param .u32 bias_add_f_param_5
)
{
	.reg .pred 	%p<4>;
	.reg .f32 	%f<4>;
	.reg .b32 	%r<11>;
	.reg .b64 	%rd<12>;


	ld.param.u64 	%rd1, [bias_add_f_param_0];
	ld.param.u64 	%rd2, [bias_add_f_param_1];
	ld.param.u64 	%rd3, [bias_add_f_param_2];
	ld.param.u32 	%r4, [bias_add_f_param_3];
	ld.param.u32 	%r2, [bias_add_f_param_4];
	ld.param.u32 	%r3, [bias_add_f_param_5];
	mov.u32 	%r5, %ctaid.x;
	mov.u32 	%r6, %ntid.x;
	mov.u32 	%r7, %tid.x;
	mad.lo.s32 	%r1, %r6, %r5, %r7;
	div.s32 	%r8, %r1, %r2;
	setp.lt.s32	%p1, %r8, %r4;
	setp.gt.s32	%p2, %r2, -1;
	and.pred  	%p3, %p1, %p2;
	@!%p3 bra 	BB17_2;
	bra.uni 	BB17_1;

BB17_1:
	rem.s32 	%r9, %r1, %r2;
	cvta.to.global.u64 	%rd4, %rd1;
	mul.wide.s32 	%rd5, %r1, 4;
	add.s64 	%rd6, %rd4, %rd5;
	div.s32 	%r10, %r9, %r3;
	cvta.to.global.u64 	%rd7, %rd2;
	mul.wide.s32 	%rd8, %r10, 4;
	add.s64 	%rd9, %rd7, %rd8;
	ld.global.f32 	%f1, [%rd9];
	ld.global.f32 	%f2, [%rd6];
	add.f32 	%f3, %f2, %f1;
	cvta.to.global.u64 	%rd10, %rd3;
	add.s64 	%rd11, %rd10, %rd5;
	st.global.f32 	[%rd11], %f3;

BB17_2:
	ret;
}

	// .globl	daxpy_matrix_vector_d
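// daxpy_matrix_vector_d(A, x, alpha, ret, rlenA, clenA, rlenx, clenx):
// ret = A + alpha * x with the vector x broadcast over the matrix. For
// cell (i, j), the fused multiply-add computes
// ret[i*clenA + j] = A[i*clenA + j] + alpha * (rlenx == 1 ? x[j] : x[i]),
// i.e. a row vector is broadcast down the rows and a column vector
// across the columns.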
.visible .entry daxpy_matrix_vector_d(
	.param .u64 daxpy_matrix_vector_d_param_0,
	.param .u64 daxpy_matrix_vector_d_param_1,
	.param .f64 daxpy_matrix_vector_d_param_2,
	.param .u64 daxpy_matrix_vector_d_param_3,
	.param .u32 daxpy_matrix_vector_d_param_4,
	.param .u32 daxpy_matrix_vector_d_param_5,
	.param .u32 daxpy_matrix_vector_d_param_6,
	.param .u32 daxpy_matrix_vector_d_param_7
)
{
	.reg .pred 	%p<5>;
	.reg .b32 	%r<11>;
	.reg .f64 	%fd<7>;
	.reg .b64 	%rd<14>;


	ld.param.u64 	%rd3, [daxpy_matrix_vector_d_param_0];
	ld.param.u64 	%rd5, [daxpy_matrix_vector_d_param_1];
	ld.param.f64 	%fd2, [daxpy_matrix_vector_d_param_2];
	ld.param.u64 	%rd4, [daxpy_matrix_vector_d_param_3];
	ld.param.u32 	%r5, [daxpy_matrix_vector_d_param_4];
	ld.param.u32 	%r3, [daxpy_matrix_vector_d_param_5];
	ld.param.u32 	%r4, [daxpy_matrix_vector_d_param_6];
	cvta.to.global.u64 	%rd1, %rd5;
	mov.u32 	%r6, %ntid.x;
	mov.u32 	%r7, %ctaid.x;
	mov.u32 	%r8, %tid.x;
	mad.lo.s32 	%r9, %r6, %r7, %r8;
	div.s32 	%r1, %r9, %r3;
	rem.s32 	%r2, %r9, %r3;
	setp.lt.s32	%p1, %r1, %r5;
	setp.gt.s32	%p2, %r3, -1;
	and.pred  	%p3, %p1, %p2;
	@!%p3 bra 	BB18_4;
	bra.uni 	BB18_1;

BB18_1:
	cvta.to.global.u64 	%rd6, %rd4;
	mad.lo.s32 	%r10, %r1, %r3, %r2;
	cvta.to.global.u64 	%rd7, %rd3;
	mul.wide.s32 	%rd8, %r10, 8;
	add.s64 	%rd9, %rd7, %rd8;
	ld.global.f64 	%fd1, [%rd9];
	add.s64 	%rd2, %rd6, %rd8;
	setp.eq.s32	%p4, %r4, 1;
	@%p4 bra 	BB18_3;
	bra.uni 	BB18_2;

BB18_3:
	mul.wide.s32 	%rd12, %r2, 8;
	add.s64 	%rd13, %rd1, %rd12;
	ld.global.f64 	%fd5, [%rd13];
	fma.rn.f64 	%fd6, %fd5, %fd2, %fd1;
	st.global.f64 	[%rd2], %fd6;
	bra.uni 	BB18_4;

BB18_2:
	mul.wide.s32 	%rd10, %r1, 8;
	add.s64 	%rd11, %rd1, %rd10;
	ld.global.f64 	%fd3, [%rd11];
	fma.rn.f64 	%fd4, %fd3, %fd2, %fd1;
	st.global.f64 	[%rd2], %fd4;

BB18_4:
	ret;
}

	// .globl	daxpy_matrix_vector_f
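// daxpy_matrix_vector_f: single-precision variant; alpha stays a double
// (.f64 parameter) and the FMA runs in double precision before the
// result is rounded back to f32.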
.visible .entry daxpy_matrix_vector_f(
	.param .u64 daxpy_matrix_vector_f_param_0,
	.param .u64 daxpy_matrix_vector_f_param_1,
	.param .f64 daxpy_matrix_vector_f_param_2,
	.param .u64 daxpy_matrix_vector_f_param_3,
	.param .u32 daxpy_matrix_vector_f_param_4,
	.param .u32 daxpy_matrix_vector_f_param_5,
	.param .u32 daxpy_matrix_vector_f_param_6,
	.param .u32 daxpy_matrix_vector_f_param_7
)
{
	.reg .pred 	%p<5>;
	.reg .f32 	%f<6>;
	.reg .b32 	%r<11>;
	.reg .f64 	%fd<7>;
	.reg .b64 	%rd<14>;


	ld.param.u64 	%rd3, [daxpy_matrix_vector_f_param_0];
	ld.param.u64 	%rd5, [daxpy_matrix_vector_f_param_1];
	ld.param.f64 	%fd2, [daxpy_matrix_vector_f_param_2];
	ld.param.u64 	%rd4, [daxpy_matrix_vector_f_param_3];
	ld.param.u32 	%r5, [daxpy_matrix_vector_f_param_4];
	ld.param.u32 	%r3, [daxpy_matrix_vector_f_param_5];
	ld.param.u32 	%r4, [daxpy_matrix_vector_f_param_6];
	cvta.to.global.u64 	%rd1, %rd5;
	mov.u32 	%r6, %ntid.x;
	mov.u32 	%r7, %ctaid.x;
	mov.u32 	%r8, %tid.x;
	mad.lo.s32 	%r9, %r6, %r7, %r8;
	div.s32 	%r1, %r9, %r3;
	rem.s32 	%r2, %r9, %r3;
	setp.lt.s32	%p1, %r1, %r5;
	setp.gt.s32	%p2, %r3, -1;
	and.pred  	%p3, %p1, %p2;
	@!%p3 bra 	BB19_4;
	bra.uni 	BB19_1;

BB19_1:
	cvta.to.global.u64 	%rd6, %rd4;
	mad.lo.s32 	%r10, %r1, %r3, %r2;
	cvta.to.global.u64 	%rd7, %rd3;
	mul.wide.s32 	%rd8, %r10, 4;
	add.s64 	%rd9, %rd7, %rd8;
	ld.global.f32 	%f1, [%rd9];
	cvt.f64.f32	%fd1, %f1;
	add.s64 	%rd2, %rd6, %rd8;
	setp.eq.s32	%p4, %r4, 1;
	@%p4 bra 	BB19_3;
	bra.uni 	BB19_2;

BB19_3:
	mul.wide.s32 	%rd12, %r2, 4;
	add.s64 	%rd13, %rd1, %rd12;
	ld.global.f32 	%f4, [%rd13];
	cvt.f64.f32	%fd5, %f4;
	fma.rn.f64 	%fd6, %fd5, %fd2, %fd1;
	cvt.rn.f32.f64	%f5, %fd6;
	st.global.f32 	[%rd2], %f5;
	bra.uni 	BB19_4;

BB19_2:
	mul.wide.s32 	%rd10, %r1, 4;
	add.s64 	%rd11, %rd1, %rd10;
	ld.global.f32 	%f2, [%rd11];
	cvt.f64.f32	%fd3, %f2;
	fma.rn.f64 	%fd4, %fd3, %fd2, %fd1;
	cvt.rn.f32.f64	%f3, %fd4;
	st.global.f32 	[%rd2], %f3;

BB19_4:
	ret;
}

	// .globl	bias_multiply_d
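// bias_multiply_d(input, bias, ret, rlen, clen, PQ): channelwise
// scaling; same indexing as bias_add_d, but
// ret[i] = input[i] * bias[(i % clen) / PQ].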
.visible .entry bias_multiply_d(
	.param .u64 bias_multiply_d_param_0,
	.param .u64 bias_multiply_d_param_1,
	.param .u64 bias_multiply_d_param_2,
	.param .u32 bias_multiply_d_param_3,
	.param .u32 bias_multiply_d_param_4,
	.param .u32 bias_multiply_d_param_5
)
{
	.reg .pred 	%p<4>;
	.reg .b32 	%r<11>;
	.reg .f64 	%fd<4>;
	.reg .b64 	%rd<12>;


	ld.param.u64 	%rd1, [bias_multiply_d_param_0];
	ld.param.u64 	%rd2, [bias_multiply_d_param_1];
	ld.param.u64 	%rd3, [bias_multiply_d_param_2];
	ld.param.u32 	%r4, [bias_multiply_d_param_3];
	ld.param.u32 	%r2, [bias_multiply_d_param_4];
	ld.param.u32 	%r3, [bias_multiply_d_param_5];
	mov.u32 	%r5, %ctaid.x;
	mov.u32 	%r6, %ntid.x;
	mov.u32 	%r7, %tid.x;
	mad.lo.s32 	%r1, %r6, %r5, %r7;
	div.s32 	%r8, %r1, %r2;
	setp.lt.s32	%p1, %r8, %r4;
	setp.gt.s32	%p2, %r2, -1;
	and.pred  	%p3, %p1, %p2;
	@!%p3 bra 	BB20_2;
	bra.uni 	BB20_1;

BB20_1:
	rem.s32 	%r9, %r1, %r2;
	cvta.to.global.u64 	%rd4, %rd1;
	mul.wide.s32 	%rd5, %r1, 8;
	add.s64 	%rd6, %rd4, %rd5;
	div.s32 	%r10, %r9, %r3;
	cvta.to.global.u64 	%rd7, %rd2;
	mul.wide.s32 	%rd8, %r10, 8;
	add.s64 	%rd9, %rd7, %rd8;
	ld.global.f64 	%fd1, [%rd9];
	ld.global.f64 	%fd2, [%rd6];
	mul.f64 	%fd3, %fd2, %fd1;
	cvta.to.global.u64 	%rd10, %rd3;
	add.s64 	%rd11, %rd10, %rd5;
	st.global.f64 	[%rd11], %fd3;

BB20_2:
	ret;
}

	// .globl	bias_multiply_f
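// bias_multiply_f: single-precision variant of bias_multiply_d above.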
.visible .entry bias_multiply_f(
	.param .u64 bias_multiply_f_param_0,
	.param .u64 bias_multiply_f_param_1,
	.param .u64 bias_multiply_f_param_2,
	.param .u32 bias_multiply_f_param_3,
	.param .u32 bias_multiply_f_param_4,
	.param .u32 bias_multiply_f_param_5
)
{
	.reg .pred 	%p<4>;
	.reg .f32 	%f<4>;
	.reg .b32 	%r<11>;
	.reg .b64 	%rd<12>;


	ld.param.u64 	%rd1, [bias_multiply_f_param_0];
	ld.param.u64 	%rd2, [bias_multiply_f_param_1];
	ld.param.u64 	%rd3, [bias_multiply_f_param_2];
	ld.param.u32 	%r4, [bias_multiply_f_param_3];
	ld.param.u32 	%r2, [bias_multiply_f_param_4];
	ld.param.u32 	%r3, [bias_multiply_f_param_5];
	mov.u32 	%r5, %ctaid.x;
	mov.u32 	%r6, %ntid.x;
	mov.u32 	%r7, %tid.x;
	mad.lo.s32 	%r1, %r6, %r5, %r7;
	div.s32 	%r8, %r1, %r2;
	setp.lt.s32	%p1, %r8, %r4;
	setp.gt.s32	%p2, %r2, -1;
	and.pred  	%p3, %p1, %p2;
	@!%p3 bra 	BB21_2;
	bra.uni 	BB21_1;

BB21_1:
	rem.s32 	%r9, %r1, %r2;
	cvta.to.global.u64 	%rd4, %rd1;
	mul.wide.s32 	%rd5, %r1, 4;
	add.s64 	%rd6, %rd4, %rd5;
	div.s32 	%r10, %r9, %r3;
	cvta.to.global.u64 	%rd7, %rd2;
	mul.wide.s32 	%rd8, %r10, 4;
	add.s64 	%rd9, %rd7, %rd8;
	ld.global.f32 	%f1, [%rd9];
	ld.global.f32 	%f2, [%rd6];
	mul.f32 	%f3, %f2, %f1;
	cvta.to.global.u64 	%rd10, %rd3;
	add.s64 	%rd11, %rd10, %rd5;
	st.global.f32 	[%rd11], %f3;

BB21_2:
	ret;
}

	// .globl	matrix_matrix_cellwise_op_d
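// matrix_matrix_cellwise_op_d(A, B, C, maxRlen, maxClen, vecStatusA,
// vecStatusB, op): cellwise binary operation C = A <op> B with vector
// broadcasting. For cell (ix, iy), a vector status of 1 indexes that
// operand by ix (column-vector broadcast), 2 indexes it by iy
// (row-vector broadcast), and 0 uses the full cell index. The branch
// tree below dispatches on the op code:
//   0 +     1 -     2 *     3 /     4 pow   5 <     6 <=    7 >
//   8 >=    9 ==    10 !=   11 min  12 max  13 and  14 or
//   15 1 - a*b      16 (a != 0 ? a - b : 0)
//   17 modulus (a - b*floor(a/b); NaN for b == +-0)   18 floor(a/b)
// Comparisons return 1.0/0.0; an unrecognized op stores DBL_MAX
// (0d7FEFFFFFFFFFFFFF). pow calls __internal_accurate_pow and patches
// sign and special cases inline afterwards.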
.visible .entry matrix_matrix_cellwise_op_d(
	.param .u64 matrix_matrix_cellwise_op_d_param_0,
	.param .u64 matrix_matrix_cellwise_op_d_param_1,
	.param .u64 matrix_matrix_cellwise_op_d_param_2,
	.param .u32 matrix_matrix_cellwise_op_d_param_3,
	.param .u32 matrix_matrix_cellwise_op_d_param_4,
	.param .u32 matrix_matrix_cellwise_op_d_param_5,
	.param .u32 matrix_matrix_cellwise_op_d_param_6,
	.param .u32 matrix_matrix_cellwise_op_d_param_7
)
{
	.reg .pred 	%p<77>;
	.reg .b32 	%r<56>;
	.reg .f64 	%fd<55>;
	.reg .b64 	%rd<19>;


	ld.param.u64 	%rd2, [matrix_matrix_cellwise_op_d_param_0];
	ld.param.u64 	%rd3, [matrix_matrix_cellwise_op_d_param_1];
	ld.param.u64 	%rd4, [matrix_matrix_cellwise_op_d_param_2];
	ld.param.u32 	%r10, [matrix_matrix_cellwise_op_d_param_3];
	ld.param.u32 	%r6, [matrix_matrix_cellwise_op_d_param_4];
	ld.param.u32 	%r7, [matrix_matrix_cellwise_op_d_param_5];
	ld.param.u32 	%r8, [matrix_matrix_cellwise_op_d_param_6];
	ld.param.u32 	%r9, [matrix_matrix_cellwise_op_d_param_7];
	mov.u32 	%r11, %ctaid.x;
	mov.u32 	%r12, %ntid.x;
	mov.u32 	%r13, %tid.x;
	mad.lo.s32 	%r1, %r12, %r11, %r13;
	div.s32 	%r2, %r1, %r6;
	setp.lt.s32	%p2, %r2, %r10;
	setp.gt.s32	%p3, %r6, -1;
	and.pred  	%p4, %p2, %p3;
	@!%p4 bra 	BB22_65;
	bra.uni 	BB22_1;

BB22_1:
	rem.s32 	%r14, %r1, %r6;
	cvta.to.global.u64 	%rd5, %rd2;
	mad.lo.s32 	%r3, %r2, %r6, %r14;
	setp.eq.s32	%p5, %r7, 2;
	selp.b32	%r15, %r14, %r3, %p5;
	setp.eq.s32	%p6, %r7, 1;
	selp.b32	%r16, %r2, %r15, %p6;
	setp.eq.s32	%p7, %r8, 2;
	selp.b32	%r17, %r14, %r3, %p7;
	setp.eq.s32	%p8, %r8, 1;
	selp.b32	%r18, %r2, %r17, %p8;
	mul.wide.s32 	%rd6, %r16, 8;
	add.s64 	%rd7, %rd5, %rd6;
	ld.global.f64 	%fd1, [%rd7];
	cvta.to.global.u64 	%rd8, %rd3;
	mul.wide.s32 	%rd9, %r18, 8;
	add.s64 	%rd10, %rd8, %rd9;
	ld.global.f64 	%fd2, [%rd10];
	mov.f64 	%fd54, 0d7FEFFFFFFFFFFFFF;
	setp.gt.s32	%p9, %r9, 8;
	@%p9 bra 	BB22_18;

	setp.gt.s32	%p23, %r9, 3;
	@%p23 bra 	BB22_10;

	setp.gt.s32	%p30, %r9, 1;
	@%p30 bra 	BB22_7;

	setp.eq.s32	%p33, %r9, 0;
	@%p33 bra 	BB22_63;
	bra.uni 	BB22_5;

BB22_63:
	add.f64 	%fd54, %fd1, %fd2;
	bra.uni 	BB22_64;

BB22_18:
	setp.gt.s32	%p10, %r9, 13;
	@%p10 bra 	BB22_27;

	setp.gt.s32	%p17, %r9, 10;
	@%p17 bra 	BB22_23;

	setp.eq.s32	%p21, %r9, 9;
	@%p21 bra 	BB22_45;
	bra.uni 	BB22_21;

BB22_45:
	setp.eq.f64	%p50, %fd1, %fd2;
	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p50;
	bra.uni 	BB22_64;

BB22_10:
	setp.gt.s32	%p24, %r9, 5;
	@%p24 bra 	BB22_14;

	setp.eq.s32	%p28, %r9, 4;
	@%p28 bra 	BB22_48;
	bra.uni 	BB22_12;

BB22_48:
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r4}, %fd1;
	}
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r5}, %fd2;
	}
	bfe.u32 	%r31, %r5, 20, 11;
	add.s32 	%r32, %r31, -1012;
	mov.b64 	 %rd15, %fd2;
	shl.b64 	%rd1, %rd15, %r32;
	setp.eq.s64	%p55, %rd1, -9223372036854775808;
	abs.f64 	%fd19, %fd1;
	// Callseq Start 0
	{
	.reg .b32 temp_param_reg;
	// }
	.param .b64 param0;
	st.param.f64	[param0+0], %fd19;
	.param .b64 param1;
	st.param.f64	[param1+0], %fd2;
	.param .b64 retval0;
	call.uni (retval0), 
	__internal_accurate_pow, 
	(
	param0, 
	param1
	);
	ld.param.f64	%fd53, [retval0+0];
	
	//{
	}// Callseq End 0
	setp.lt.s32	%p56, %r4, 0;
	and.pred  	%p1, %p56, %p55;
	@!%p1 bra 	BB22_50;
	bra.uni 	BB22_49;

BB22_49:
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r33}, %fd53;
	}
	xor.b32  	%r34, %r33, -2147483648;
	{
	.reg .b32 %temp; 
	mov.b64 	{%r35, %temp}, %fd53;
	}
	mov.b64 	%fd53, {%r35, %r34};

BB22_50:
	mov.f64 	%fd52, %fd53;
	setp.eq.f64	%p57, %fd1, 0d0000000000000000;
	@%p57 bra 	BB22_53;
	bra.uni 	BB22_51;

BB22_53:
	selp.b32	%r36, %r4, 0, %p55;
	or.b32  	%r37, %r36, 2146435072;
	setp.lt.s32	%p61, %r5, 0;
	selp.b32	%r38, %r37, %r36, %p61;
	mov.u32 	%r39, 0;
	mov.b64 	%fd52, {%r39, %r38};
	bra.uni 	BB22_54;

BB22_27:
	setp.gt.s32	%p11, %r9, 15;
	@%p11 bra 	BB22_31;

	setp.eq.s32	%p15, %r9, 14;
	@%p15 bra 	BB22_42;
	bra.uni 	BB22_29;

BB22_42:
	cvt.rni.s64.f64	%rd11, %fd1;
	cvt.rni.s64.f64	%rd12, %fd2;
	cvt.u32.u64	%r25, %rd11;
	cvt.u32.u64	%r26, %rd12;
	or.b32  	%r27, %r26, %r25;
	setp.eq.s32	%p47, %r27, 0;
	selp.f64	%fd54, 0d0000000000000000, 0d3FF0000000000000, %p47;
	bra.uni 	BB22_64;

BB22_7:
	setp.eq.s32	%p31, %r9, 2;
	@%p31 bra 	BB22_62;
	bra.uni 	BB22_8;

BB22_62:
	mul.f64 	%fd54, %fd1, %fd2;
	bra.uni 	BB22_64;

BB22_23:
	setp.eq.s32	%p18, %r9, 11;
	@%p18 bra 	BB22_44;

	setp.eq.s32	%p19, %r9, 12;
	@%p19 bra 	BB22_43;
	bra.uni 	BB22_25;

BB22_43:
	max.f64 	%fd54, %fd1, %fd2;
	bra.uni 	BB22_64;

BB22_14:
	setp.eq.s32	%p25, %r9, 6;
	@%p25 bra 	BB22_47;

	setp.eq.s32	%p26, %r9, 7;
	@%p26 bra 	BB22_46;
	bra.uni 	BB22_16;

BB22_46:
	setp.gt.f64	%p52, %fd1, %fd2;
	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p52;
	bra.uni 	BB22_64;

BB22_31:
	setp.eq.s32	%p12, %r9, 16;
	@%p12 bra 	BB22_41;

	setp.eq.s32	%p13, %r9, 17;
	@%p13 bra 	BB22_37;
	bra.uni 	BB22_33;

BB22_37:
	setp.eq.f64	%p39, %fd2, 0d0000000000000000;
	setp.eq.f64	%p40, %fd2, 0d8000000000000000;
	or.pred  	%p41, %p39, %p40;
	mov.f64 	%fd54, 0d7FF8000000000000;
	@%p41 bra 	BB22_64;

	div.rn.f64 	%fd54, %fd1, %fd2;
	abs.f64 	%fd39, %fd54;
	setp.gtu.f64	%p42, %fd39, 0d7FF0000000000000;
	@%p42 bra 	BB22_64;

	{
	.reg .b32 %temp; 
	mov.b64 	{%r22, %temp}, %fd54;
	}
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r23}, %fd54;
	}
	and.b32  	%r24, %r23, 2147483647;
	setp.ne.s32	%p43, %r24, 2146435072;
	setp.ne.s32	%p44, %r22, 0;
	or.pred  	%p45, %p43, %p44;
	@!%p45 bra 	BB22_64;
	bra.uni 	BB22_40;

BB22_40:
	cvt.rmi.f64.f64	%fd40, %fd54;
	mul.f64 	%fd41, %fd2, %fd40;
	sub.f64 	%fd54, %fd1, %fd41;
	bra.uni 	BB22_64;

BB22_5:
	setp.eq.s32	%p34, %r9, 1;
	@%p34 bra 	BB22_6;
	bra.uni 	BB22_64;

BB22_6:
	sub.f64 	%fd54, %fd1, %fd2;
	bra.uni 	BB22_64;

BB22_21:
	setp.eq.s32	%p22, %r9, 10;
	@%p22 bra 	BB22_22;
	bra.uni 	BB22_64;

BB22_22:
	setp.neu.f64	%p49, %fd1, %fd2;
	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p49;
	bra.uni 	BB22_64;

BB22_12:
	setp.eq.s32	%p29, %r9, 5;
	@%p29 bra 	BB22_13;
	bra.uni 	BB22_64;

BB22_13:
	setp.lt.f64	%p54, %fd1, %fd2;
	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p54;
	bra.uni 	BB22_64;

BB22_29:
	setp.eq.s32	%p16, %r9, 15;
	@%p16 bra 	BB22_30;
	bra.uni 	BB22_64;

BB22_30:
	mul.f64 	%fd43, %fd1, %fd2;
	mov.f64 	%fd44, 0d3FF0000000000000;
	sub.f64 	%fd54, %fd44, %fd43;
	bra.uni 	BB22_64;

BB22_8:
	setp.eq.s32	%p32, %r9, 3;
	@%p32 bra 	BB22_9;
	bra.uni 	BB22_64;

BB22_9:
	div.rn.f64 	%fd54, %fd1, %fd2;
	bra.uni 	BB22_64;

BB22_44:
	min.f64 	%fd54, %fd1, %fd2;
	bra.uni 	BB22_64;

BB22_25:
	setp.eq.s32	%p20, %r9, 13;
	@%p20 bra 	BB22_26;
	bra.uni 	BB22_64;

BB22_26:
	cvt.rni.s64.f64	%rd13, %fd1;
	cvt.rni.s64.f64	%rd14, %fd2;
	cvt.u32.u64	%r28, %rd13;
	cvt.u32.u64	%r29, %rd14;
	and.b32  	%r30, %r29, %r28;
	setp.eq.s32	%p48, %r30, 0;
	selp.f64	%fd54, 0d0000000000000000, 0d3FF0000000000000, %p48;
	bra.uni 	BB22_64;

BB22_47:
	setp.le.f64	%p53, %fd1, %fd2;
	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p53;
	bra.uni 	BB22_64;

BB22_16:
	setp.eq.s32	%p27, %r9, 8;
	@%p27 bra 	BB22_17;
	bra.uni 	BB22_64;

BB22_17:
	setp.ge.f64	%p51, %fd1, %fd2;
	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p51;
	bra.uni 	BB22_64;

BB22_41:
	setp.neu.f64	%p46, %fd1, 0d0000000000000000;
	sub.f64 	%fd42, %fd1, %fd2;
	selp.f64	%fd54, %fd42, 0d0000000000000000, %p46;
	bra.uni 	BB22_64;

BB22_33:
	setp.ne.s32	%p14, %r9, 18;
	@%p14 bra 	BB22_64;

	div.rn.f64 	%fd54, %fd1, %fd2;
	abs.f64 	%fd37, %fd54;
	setp.gtu.f64	%p35, %fd37, 0d7FF0000000000000;
	@%p35 bra 	BB22_64;

	{
	.reg .b32 %temp; 
	mov.b64 	{%r19, %temp}, %fd54;
	}
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r20}, %fd54;
	}
	and.b32  	%r21, %r20, 2147483647;
	setp.ne.s32	%p36, %r21, 2146435072;
	setp.ne.s32	%p37, %r19, 0;
	or.pred  	%p38, %p36, %p37;
	@!%p38 bra 	BB22_64;
	bra.uni 	BB22_36;

BB22_36:
	cvt.rmi.f64.f64	%fd54, %fd54;
	bra.uni 	BB22_64;

BB22_51:
	setp.gt.s32	%p58, %r4, -1;
	@%p58 bra 	BB22_54;

	cvt.rzi.f64.f64	%fd45, %fd2;
	setp.neu.f64	%p59, %fd45, %fd2;
	selp.f64	%fd52, 0dFFF8000000000000, %fd52, %p59;

BB22_54:
	mov.f64 	%fd25, %fd52;
	add.f64 	%fd26, %fd1, %fd2;
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r40}, %fd26;
	}
	and.b32  	%r41, %r40, 2146435072;
	setp.ne.s32	%p62, %r41, 2146435072;
	mov.f64 	%fd51, %fd25;
	@%p62 bra 	BB22_61;

	setp.gtu.f64	%p63, %fd19, 0d7FF0000000000000;
	mov.f64 	%fd51, %fd26;
	@%p63 bra 	BB22_61;

	abs.f64 	%fd46, %fd2;
	setp.gtu.f64	%p64, %fd46, 0d7FF0000000000000;
	mov.f64 	%fd50, %fd26;
	mov.f64 	%fd51, %fd50;
	@%p64 bra 	BB22_61;

	{
	.reg .b32 %temp; 
	mov.b64 	{%r42, %temp}, %fd2;
	}
	and.b32  	%r43, %r5, 2147483647;
	setp.eq.s32	%p65, %r43, 2146435072;
	setp.eq.s32	%p66, %r42, 0;
	and.pred  	%p67, %p65, %p66;
	@%p67 bra 	BB22_60;
	bra.uni 	BB22_58;

BB22_60:
	setp.gt.f64	%p71, %fd19, 0d3FF0000000000000;
	selp.b32	%r51, 2146435072, 0, %p71;
	xor.b32  	%r52, %r51, 2146435072;
	setp.lt.s32	%p72, %r5, 0;
	selp.b32	%r53, %r52, %r51, %p72;
	setp.eq.f64	%p73, %fd1, 0dBFF0000000000000;
	selp.b32	%r54, 1072693248, %r53, %p73;
	mov.u32 	%r55, 0;
	mov.b64 	%fd51, {%r55, %r54};
	bra.uni 	BB22_61;

BB22_58:
	{
	.reg .b32 %temp; 
	mov.b64 	{%r44, %temp}, %fd1;
	}
	and.b32  	%r45, %r4, 2147483647;
	setp.eq.s32	%p68, %r45, 2146435072;
	setp.eq.s32	%p69, %r44, 0;
	and.pred  	%p70, %p68, %p69;
	mov.f64 	%fd51, %fd25;
	@!%p70 bra 	BB22_61;
	bra.uni 	BB22_59;

BB22_59:
	shr.s32 	%r46, %r5, 31;
	and.b32  	%r47, %r46, -2146435072;
	selp.b32	%r48, -1048576, 2146435072, %p1;
	add.s32 	%r49, %r48, %r47;
	mov.u32 	%r50, 0;
	mov.b64 	%fd51, {%r50, %r49};

BB22_61:
	setp.eq.f64	%p74, %fd2, 0d0000000000000000;
	setp.eq.f64	%p75, %fd1, 0d3FF0000000000000;
	or.pred  	%p76, %p75, %p74;
	selp.f64	%fd54, 0d3FF0000000000000, %fd51, %p76;

BB22_64:
	cvta.to.global.u64 	%rd16, %rd4;
	mul.wide.s32 	%rd17, %r3, 8;
	add.s64 	%rd18, %rd16, %rd17;
	st.global.f64 	[%rd18], %fd54;
	bar.sync 	0;

BB22_65:
	ret;
}

	// .globl	matrix_matrix_cellwise_op_f
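// matrix_matrix_cellwise_op_f: single-precision variant of the kernel
// above, with the same broadcasting scheme and op codes; the
// unrecognized-op default is FLT_MAX (0f7F7FFFFF) and pow is expanded
// inline via log2/exp2 polynomials instead of a call.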
.visible .entry matrix_matrix_cellwise_op_f(
	.param .u64 matrix_matrix_cellwise_op_f_param_0,
	.param .u64 matrix_matrix_cellwise_op_f_param_1,
	.param .u64 matrix_matrix_cellwise_op_f_param_2,
	.param .u32 matrix_matrix_cellwise_op_f_param_3,
	.param .u32 matrix_matrix_cellwise_op_f_param_4,
	.param .u32 matrix_matrix_cellwise_op_f_param_5,
	.param .u32 matrix_matrix_cellwise_op_f_param_6,
	.param .u32 matrix_matrix_cellwise_op_f_param_7
)
{
	.reg .pred 	%p<76>;
	.reg .f32 	%f<134>;
	.reg .b32 	%r<42>;
	.reg .b64 	%rd<17>;


	ld.param.u64 	%rd1, [matrix_matrix_cellwise_op_f_param_0];
	ld.param.u64 	%rd2, [matrix_matrix_cellwise_op_f_param_1];
	ld.param.u64 	%rd3, [matrix_matrix_cellwise_op_f_param_2];
	ld.param.u32 	%r8, [matrix_matrix_cellwise_op_f_param_3];
	ld.param.u32 	%r4, [matrix_matrix_cellwise_op_f_param_4];
	ld.param.u32 	%r5, [matrix_matrix_cellwise_op_f_param_5];
	ld.param.u32 	%r6, [matrix_matrix_cellwise_op_f_param_6];
	ld.param.u32 	%r7, [matrix_matrix_cellwise_op_f_param_7];
	mov.u32 	%r9, %ntid.x;
	mov.u32 	%r10, %ctaid.x;
	mov.u32 	%r11, %tid.x;
	mad.lo.s32 	%r1, %r9, %r10, %r11;
	div.s32 	%r2, %r1, %r4;
	setp.lt.s32	%p2, %r2, %r8;
	setp.gt.s32	%p3, %r4, -1;
	and.pred  	%p4, %p2, %p3;
	@!%p4 bra 	BB23_63;
	bra.uni 	BB23_1;

BB23_1:
	rem.s32 	%r12, %r1, %r4;
	cvta.to.global.u64 	%rd4, %rd1;
	mad.lo.s32 	%r3, %r2, %r4, %r12;
	setp.eq.s32	%p5, %r5, 2;
	selp.b32	%r13, %r12, %r3, %p5;
	setp.eq.s32	%p6, %r5, 1;
	selp.b32	%r14, %r2, %r13, %p6;
	setp.eq.s32	%p7, %r6, 2;
	selp.b32	%r15, %r12, %r3, %p7;
	setp.eq.s32	%p8, %r6, 1;
	selp.b32	%r16, %r2, %r15, %p8;
	mul.wide.s32 	%rd5, %r14, 4;
	add.s64 	%rd6, %rd4, %rd5;
	ld.global.f32 	%f1, [%rd6];
	cvta.to.global.u64 	%rd7, %rd2;
	mul.wide.s32 	%rd8, %r16, 4;
	add.s64 	%rd9, %rd7, %rd8;
	ld.global.f32 	%f2, [%rd9];
	mov.f32 	%f133, 0f7F7FFFFF;
	setp.gt.s32	%p9, %r7, 8;
	@%p9 bra 	BB23_18;

	setp.gt.s32	%p23, %r7, 3;
	@%p23 bra 	BB23_10;

	setp.gt.s32	%p30, %r7, 1;
	@%p30 bra 	BB23_7;

	setp.eq.s32	%p33, %r7, 0;
	@%p33 bra 	BB23_61;
	bra.uni 	BB23_5;

BB23_61:
	add.f32 	%f133, %f1, %f2;
	bra.uni 	BB23_62;

BB23_18:
	setp.gt.s32	%p10, %r7, 13;
	@%p10 bra 	BB23_27;

	setp.gt.s32	%p17, %r7, 10;
	@%p17 bra 	BB23_23;

	setp.eq.s32	%p21, %r7, 9;
	@%p21 bra 	BB23_43;
	bra.uni 	BB23_21;

BB23_43:
	setp.eq.f32	%p44, %f1, %f2;
	selp.f32	%f133, 0f3F800000, 0f00000000, %p44;
	bra.uni 	BB23_62;

BB23_10:
	setp.gt.s32	%p24, %r7, 5;
	@%p24 bra 	BB23_14;

	setp.eq.s32	%p28, %r7, 4;
	@%p28 bra 	BB23_46;
	bra.uni 	BB23_12;

BB23_46:
	mul.f32 	%f53, %f2, 0f3F000000;
	cvt.rzi.f32.f32	%f54, %f53;
	fma.rn.f32 	%f55, %f54, 0fC0000000, %f2;
	abs.f32 	%f19, %f55;
	abs.f32 	%f20, %f1;
	setp.lt.f32	%p49, %f20, 0f00800000;
	mul.f32 	%f56, %f20, 0f4B800000;
	selp.f32	%f57, 0fC3170000, 0fC2FE0000, %p49;
	selp.f32	%f58, %f56, %f20, %p49;
	mov.b32 	 %r23, %f58;
	and.b32  	%r24, %r23, 8388607;
	or.b32  	%r25, %r24, 1065353216;
	mov.b32 	 %f59, %r25;
	shr.u32 	%r26, %r23, 23;
	cvt.rn.f32.u32	%f60, %r26;
	add.f32 	%f61, %f57, %f60;
	setp.gt.f32	%p50, %f59, 0f3FB504F3;
	mul.f32 	%f62, %f59, 0f3F000000;
	add.f32 	%f63, %f61, 0f3F800000;
	selp.f32	%f64, %f62, %f59, %p50;
	selp.f32	%f65, %f63, %f61, %p50;
	add.f32 	%f66, %f64, 0fBF800000;
	add.f32 	%f50, %f64, 0f3F800000;
	// inline asm
	rcp.approx.ftz.f32 %f49,%f50;
	// inline asm
	add.f32 	%f67, %f66, %f66;
	mul.f32 	%f68, %f49, %f67;
	mul.f32 	%f69, %f68, %f68;
	mov.f32 	%f70, 0f3C4CAF63;
	mov.f32 	%f71, 0f3B18F0FE;
	fma.rn.f32 	%f72, %f71, %f69, %f70;
	mov.f32 	%f73, 0f3DAAAABD;
	fma.rn.f32 	%f74, %f72, %f69, %f73;
	mul.rn.f32 	%f75, %f74, %f69;
	mul.rn.f32 	%f76, %f75, %f68;
	sub.f32 	%f77, %f66, %f68;
	neg.f32 	%f78, %f68;
	add.f32 	%f79, %f77, %f77;
	fma.rn.f32 	%f80, %f78, %f66, %f79;
	mul.rn.f32 	%f81, %f49, %f80;
	add.f32 	%f82, %f76, %f68;
	sub.f32 	%f83, %f68, %f82;
	add.f32 	%f84, %f76, %f83;
	add.f32 	%f85, %f81, %f84;
	add.f32 	%f86, %f82, %f85;
	sub.f32 	%f87, %f82, %f86;
	add.f32 	%f88, %f85, %f87;
	mov.f32 	%f89, 0f3F317200;
	mul.rn.f32 	%f90, %f65, %f89;
	mov.f32 	%f91, 0f35BFBE8E;
	mul.rn.f32 	%f92, %f65, %f91;
	add.f32 	%f93, %f90, %f86;
	sub.f32 	%f94, %f90, %f93;
	add.f32 	%f95, %f86, %f94;
	add.f32 	%f96, %f88, %f95;
	add.f32 	%f97, %f92, %f96;
	add.f32 	%f98, %f93, %f97;
	sub.f32 	%f99, %f93, %f98;
	add.f32 	%f100, %f97, %f99;
	abs.f32 	%f21, %f2;
	setp.gt.f32	%p51, %f21, 0f77F684DF;
	mul.f32 	%f101, %f2, 0f39000000;
	selp.f32	%f102, %f101, %f2, %p51;
	mul.rn.f32 	%f103, %f102, %f98;
	neg.f32 	%f104, %f103;
	fma.rn.f32 	%f105, %f102, %f98, %f104;
	fma.rn.f32 	%f106, %f102, %f100, %f105;
	mov.f32 	%f107, 0f00000000;
	fma.rn.f32 	%f108, %f107, %f98, %f106;
	add.rn.f32 	%f109, %f103, %f108;
	neg.f32 	%f110, %f109;
	add.rn.f32 	%f111, %f103, %f110;
	add.rn.f32 	%f112, %f111, %f108;
	mov.b32 	 %r27, %f109;
	setp.eq.s32	%p52, %r27, 1118925336;
	add.s32 	%r28, %r27, -1;
	mov.b32 	 %f113, %r28;
	add.f32 	%f114, %f112, 0f37000000;
	selp.f32	%f115, %f113, %f109, %p52;
	selp.f32	%f22, %f114, %f112, %p52;
	mul.f32 	%f116, %f115, 0f3FB8AA3B;
	cvt.rzi.f32.f32	%f117, %f116;
	mov.f32 	%f118, 0fBF317200;
	fma.rn.f32 	%f119, %f117, %f118, %f115;
	mov.f32 	%f120, 0fB5BFBE8E;
	fma.rn.f32 	%f121, %f117, %f120, %f119;
	mul.f32 	%f52, %f121, 0f3FB8AA3B;
	// inline asm
	ex2.approx.ftz.f32 %f51,%f52;
	// inline asm
	add.f32 	%f122, %f117, 0f00000000;
	ex2.approx.f32 	%f123, %f122;
	mul.f32 	%f124, %f51, %f123;
	setp.lt.f32	%p53, %f115, 0fC2D20000;
	selp.f32	%f125, 0f00000000, %f124, %p53;
	setp.gt.f32	%p54, %f115, 0f42D20000;
	selp.f32	%f131, 0f7F800000, %f125, %p54;
	setp.eq.f32	%p55, %f131, 0f7F800000;
	@%p55 bra 	BB23_48;

	fma.rn.f32 	%f131, %f131, %f22, %f131;

BB23_48:
	setp.lt.f32	%p56, %f1, 0f00000000;
	setp.eq.f32	%p57, %f19, 0f3F800000;
	and.pred  	%p1, %p56, %p57;
	mov.b32 	 %r29, %f131;
	xor.b32  	%r30, %r29, -2147483648;
	mov.b32 	 %f126, %r30;
	selp.f32	%f132, %f126, %f131, %p1;
	setp.eq.f32	%p58, %f1, 0f00000000;
	@%p58 bra 	BB23_51;
	bra.uni 	BB23_49;

BB23_51:
	add.f32 	%f128, %f1, %f1;
	mov.b32 	 %r31, %f128;
	selp.b32	%r32, %r31, 0, %p57;
	or.b32  	%r33, %r32, 2139095040;
	setp.lt.f32	%p62, %f2, 0f00000000;
	selp.b32	%r34, %r33, %r32, %p62;
	mov.b32 	 %f132, %r34;
	bra.uni 	BB23_52;

BB23_27:
	setp.gt.s32	%p11, %r7, 15;
	@%p11 bra 	BB23_31;

	setp.eq.s32	%p15, %r7, 14;
	@%p15 bra 	BB23_40;
	bra.uni 	BB23_29;

BB23_40:
	cvt.rni.s64.f32	%rd10, %f1;
	cvt.rni.s64.f32	%rd11, %f2;
	cvt.u32.u64	%r17, %rd10;
	cvt.u32.u64	%r18, %rd11;
	or.b32  	%r19, %r18, %r17;
	setp.eq.s32	%p41, %r19, 0;
	selp.f32	%f133, 0f00000000, 0f3F800000, %p41;
	bra.uni 	BB23_62;

BB23_7:
	setp.eq.s32	%p31, %r7, 2;
	@%p31 bra 	BB23_60;
	bra.uni 	BB23_8;

BB23_60:
	mul.f32 	%f133, %f1, %f2;
	bra.uni 	BB23_62;

BB23_23:
	setp.eq.s32	%p18, %r7, 11;
	@%p18 bra 	BB23_42;

	setp.eq.s32	%p19, %r7, 12;
	@%p19 bra 	BB23_41;
	bra.uni 	BB23_25;

BB23_41:
	max.f32 	%f133, %f1, %f2;
	bra.uni 	BB23_62;

BB23_14:
	setp.eq.s32	%p25, %r7, 6;
	@%p25 bra 	BB23_45;

	setp.eq.s32	%p26, %r7, 7;
	@%p26 bra 	BB23_44;
	bra.uni 	BB23_16;

BB23_44:
	setp.gt.f32	%p46, %f1, %f2;
	selp.f32	%f133, 0f3F800000, 0f00000000, %p46;
	bra.uni 	BB23_62;

BB23_31:
	setp.eq.s32	%p12, %r7, 16;
	@%p12 bra 	BB23_39;

	setp.eq.s32	%p13, %r7, 17;
	@%p13 bra 	BB23_36;
	bra.uni 	BB23_33;

BB23_36:
	setp.eq.f32	%p36, %f2, 0f00000000;
	setp.eq.f32	%p37, %f2, 0f80000000;
	or.pred  	%p38, %p36, %p37;
	mov.f32 	%f133, 0f7FC00000;
	@%p38 bra 	BB23_62;

	div.rn.f32 	%f133, %f1, %f2;
	abs.f32 	%f43, %f133;
	setp.geu.f32	%p39, %f43, 0f7F800000;
	@%p39 bra 	BB23_62;

	cvt.rmi.f32.f32	%f44, %f133;
	mul.f32 	%f45, %f2, %f44;
	sub.f32 	%f133, %f1, %f45;
	bra.uni 	BB23_62;

BB23_5:
	setp.eq.s32	%p34, %r7, 1;
	@%p34 bra 	BB23_6;
	bra.uni 	BB23_62;

BB23_6:
	sub.f32 	%f133, %f1, %f2;
	bra.uni 	BB23_62;

BB23_21:
	setp.eq.s32	%p22, %r7, 10;
	@%p22 bra 	BB23_22;
	bra.uni 	BB23_62;

BB23_22:
	setp.neu.f32	%p43, %f1, %f2;
	selp.f32	%f133, 0f3F800000, 0f00000000, %p43;
	bra.uni 	BB23_62;

BB23_12:
	setp.eq.s32	%p29, %r7, 5;
	@%p29 bra 	BB23_13;
	bra.uni 	BB23_62;

BB23_13:
	setp.lt.f32	%p48, %f1, %f2;
	selp.f32	%f133, 0f3F800000, 0f00000000, %p48;
	bra.uni 	BB23_62;

BB23_29:
	setp.eq.s32	%p16, %r7, 15;
	@%p16 bra 	BB23_30;
	bra.uni 	BB23_62;

BB23_30:
	mul.f32 	%f47, %f1, %f2;
	mov.f32 	%f48, 0f3F800000;
	sub.f32 	%f133, %f48, %f47;
	bra.uni 	BB23_62;

BB23_8:
	setp.eq.s32	%p32, %r7, 3;
	@%p32 bra 	BB23_9;
	bra.uni 	BB23_62;

BB23_9:
	div.rn.f32 	%f133, %f1, %f2;
	bra.uni 	BB23_62;

BB23_42:
	min.f32 	%f133, %f1, %f2;
	bra.uni 	BB23_62;

BB23_25:
	setp.eq.s32	%p20, %r7, 13;
	@%p20 bra 	BB23_26;
	bra.uni 	BB23_62;

BB23_26:
	cvt.rni.s64.f32	%rd12, %f1;
	cvt.rni.s64.f32	%rd13, %f2;
	cvt.u32.u64	%r20, %rd12;
	cvt.u32.u64	%r21, %rd13;
	and.b32  	%r22, %r21, %r20;
	setp.eq.s32	%p42, %r22, 0;
	selp.f32	%f133, 0f00000000, 0f3F800000, %p42;
	bra.uni 	BB23_62;

BB23_45:
	setp.le.f32	%p47, %f1, %f2;
	selp.f32	%f133, 0f3F800000, 0f00000000, %p47;
	bra.uni 	BB23_62;

BB23_16:
	setp.eq.s32	%p27, %r7, 8;
	@%p27 bra 	BB23_17;
	bra.uni 	BB23_62;

BB23_17:
	setp.ge.f32	%p45, %f1, %f2;
	selp.f32	%f133, 0f3F800000, 0f00000000, %p45;
	bra.uni 	BB23_62;

BB23_39:
	setp.neu.f32	%p40, %f1, 0f00000000;
	sub.f32 	%f46, %f1, %f2;
	selp.f32	%f133, %f46, 0f00000000, %p40;
	bra.uni 	BB23_62;

BB23_33:
	setp.ne.s32	%p14, %r7, 18;
	@%p14 bra 	BB23_62;

	div.rn.f32 	%f133, %f1, %f2;
	abs.f32 	%f41, %f133;
	setp.geu.f32	%p35, %f41, 0f7F800000;
	@%p35 bra 	BB23_62;

	cvt.rmi.f32.f32	%f133, %f133;
	bra.uni 	BB23_62;

BB23_49:
	setp.geu.f32	%p59, %f1, 0f00000000;
	@%p59 bra 	BB23_52;

	cvt.rzi.f32.f32	%f127, %f2;
	setp.neu.f32	%p60, %f127, %f2;
	selp.f32	%f132, 0f7FFFFFFF, %f132, %p60;

BB23_52:
	add.f32 	%f129, %f20, %f21;
	mov.b32 	 %r35, %f129;
	setp.lt.s32	%p63, %r35, 2139095040;
	@%p63 bra 	BB23_59;

	setp.gtu.f32	%p64, %f20, 0f7F800000;
	setp.gtu.f32	%p65, %f21, 0f7F800000;
	or.pred  	%p66, %p64, %p65;
	@%p66 bra 	BB23_58;
	bra.uni 	BB23_54;

BB23_58:
	add.f32 	%f132, %f1, %f2;
	bra.uni 	BB23_59;

BB23_54:
	setp.eq.f32	%p67, %f21, 0f7F800000;
	@%p67 bra 	BB23_57;
	bra.uni 	BB23_55;

BB23_57:
	setp.gt.f32	%p70, %f20, 0f3F800000;
	selp.b32	%r39, 2139095040, 0, %p70;
	xor.b32  	%r40, %r39, 2139095040;
	setp.lt.f32	%p71, %f2, 0f00000000;
	selp.b32	%r41, %r40, %r39, %p71;
	mov.b32 	 %f130, %r41;
	setp.eq.f32	%p72, %f1, 0fBF800000;
	selp.f32	%f132, 0f3F800000, %f130, %p72;
	bra.uni 	BB23_59;

BB23_55:
	setp.neu.f32	%p68, %f20, 0f7F800000;
	@%p68 bra 	BB23_59;

	setp.ge.f32	%p69, %f2, 0f00000000;
	selp.b32	%r36, 2139095040, 0, %p69;
	or.b32  	%r37, %r36, -2147483648;
	selp.b32	%r38, %r37, %r36, %p1;
	mov.b32 	 %f132, %r38;

BB23_59:
	setp.eq.f32	%p73, %f2, 0f00000000;
	setp.eq.f32	%p74, %f1, 0f3F800000;
	or.pred  	%p75, %p74, %p73;
	selp.f32	%f133, 0f3F800000, %f132, %p75;

BB23_62:
	cvta.to.global.u64 	%rd14, %rd3;
	mul.wide.s32 	%rd15, %r3, 4;
	add.s64 	%rd16, %rd14, %rd15;
	st.global.f32 	[%rd16], %f133;
	bar.sync 	0;

BB23_63:
	ret;
}

	// .globl	matrix_scalar_op_d
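// matrix_scalar_op_d(A, scalar, C, size, op, isLeftScalar): elementwise
// operation between a matrix and a scalar, one thread per element
// (guarded by tid < size), using the same op codes as
// matrix_matrix_cellwise_op_d. The final flag picks the operand order
// for non-commutative ops: nonzero computes scalar <op> A[i], zero
// computes A[i] <op> scalar.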
.visible .entry matrix_scalar_op_d(
	.param .u64 matrix_scalar_op_d_param_0,
	.param .f64 matrix_scalar_op_d_param_1,
	.param .u64 matrix_scalar_op_d_param_2,
	.param .u32 matrix_scalar_op_d_param_3,
	.param .u32 matrix_scalar_op_d_param_4,
	.param .u32 matrix_scalar_op_d_param_5
)
{
	.reg .pred 	%p<141>;
	.reg .b32 	%r<86>;
	.reg .f64 	%fd<107>;
	.reg .b64 	%rd<20>;


	ld.param.u64 	%rd4, [matrix_scalar_op_d_param_0];
	ld.param.f64 	%fd68, [matrix_scalar_op_d_param_1];
	ld.param.u64 	%rd5, [matrix_scalar_op_d_param_2];
	ld.param.u32 	%r8, [matrix_scalar_op_d_param_3];
	ld.param.u32 	%r6, [matrix_scalar_op_d_param_4];
	ld.param.u32 	%r7, [matrix_scalar_op_d_param_5];
	mov.u32 	%r9, %ntid.x;
	mov.u32 	%r10, %ctaid.x;
	mov.u32 	%r11, %tid.x;
	mad.lo.s32 	%r1, %r9, %r10, %r11;
	setp.ge.s32	%p3, %r1, %r8;
	@%p3 bra 	BB24_130;

	cvta.to.global.u64 	%rd6, %rd5;
	cvta.to.global.u64 	%rd7, %rd4;
	mul.wide.s32 	%rd8, %r1, 8;
	add.s64 	%rd9, %rd7, %rd8;
	ld.global.f64 	%fd1, [%rd9];
	add.s64 	%rd1, %rd6, %rd8;
	setp.eq.s32	%p4, %r7, 0;
	@%p4 bra 	BB24_66;

	mov.f64 	%fd98, 0d7FEFFFFFFFFFFFFF;
	setp.gt.s32	%p5, %r6, 8;
	@%p5 bra 	BB24_19;

	setp.gt.s32	%p19, %r6, 3;
	@%p19 bra 	BB24_11;

	setp.gt.s32	%p26, %r6, 1;
	@%p26 bra 	BB24_8;

	setp.eq.s32	%p29, %r6, 0;
	@%p29 bra 	BB24_64;
	bra.uni 	BB24_6;

BB24_64:
	add.f64 	%fd98, %fd1, %fd68;
	bra.uni 	BB24_65;

BB24_66:
	mov.f64 	%fd106, 0d7FEFFFFFFFFFFFFF;
	setp.gt.s32	%p73, %r6, 8;
	@%p73 bra 	BB24_83;

	setp.gt.s32	%p87, %r6, 3;
	@%p87 bra 	BB24_75;

	setp.gt.s32	%p94, %r6, 1;
	@%p94 bra 	BB24_72;

	setp.eq.s32	%p97, %r6, 0;
	@%p97 bra 	BB24_128;
	bra.uni 	BB24_70;

BB24_128:
	add.f64 	%fd106, %fd1, %fd68;
	bra.uni 	BB24_129;

BB24_19:
	setp.gt.s32	%p6, %r6, 13;
	@%p6 bra 	BB24_28;

	setp.gt.s32	%p13, %r6, 10;
	@%p13 bra 	BB24_24;

	setp.eq.s32	%p17, %r6, 9;
	@%p17 bra 	BB24_46;
	bra.uni 	BB24_22;

BB24_46:
	setp.eq.f64	%p46, %fd1, %fd68;
	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p46;
	bra.uni 	BB24_65;

BB24_83:
	setp.gt.s32	%p74, %r6, 13;
	@%p74 bra 	BB24_92;

	setp.gt.s32	%p81, %r6, 10;
	@%p81 bra 	BB24_88;

	setp.eq.s32	%p85, %r6, 9;
	@%p85 bra 	BB24_110;
	bra.uni 	BB24_86;

BB24_110:
	setp.eq.f64	%p114, %fd1, %fd68;
	selp.f64	%fd106, 0d3FF0000000000000, 0d0000000000000000, %p114;
	bra.uni 	BB24_129;

BB24_11:
	setp.gt.s32	%p20, %r6, 5;
	@%p20 bra 	BB24_15;

	setp.eq.s32	%p24, %r6, 4;
	@%p24 bra 	BB24_49;
	bra.uni 	BB24_13;

BB24_49:
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r2}, %fd68;
	}
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r3}, %fd1;
	}
	bfe.u32 	%r24, %r3, 20, 11;
	add.s32 	%r25, %r24, -1012;
	mov.b64 	 %rd14, %fd1;
	shl.b64 	%rd2, %rd14, %r25;
	setp.eq.s64	%p51, %rd2, -9223372036854775808;
	abs.f64 	%fd18, %fd68;
	// Callseq Start 1
	{
	.reg .b32 temp_param_reg;
	// }
	.param .b64 param0;
	st.param.f64	[param0+0], %fd18;
	.param .b64 param1;
	st.param.f64	[param1+0], %fd1;
	.param .b64 retval0;
	call.uni (retval0), 
	__internal_accurate_pow, 
	(
	param0, 
	param1
	);
	ld.param.f64	%fd97, [retval0+0];
	
	//{
	}// Callseq End 1
	setp.lt.s32	%p52, %r2, 0;
	and.pred  	%p1, %p52, %p51;
	@!%p1 bra 	BB24_51;
	bra.uni 	BB24_50;

BB24_50:
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r26}, %fd97;
	}
	xor.b32  	%r27, %r26, -2147483648;
	{
	.reg .b32 %temp; 
	mov.b64 	{%r28, %temp}, %fd97;
	}
	mov.b64 	%fd97, {%r28, %r27};

BB24_51:
	mov.f64 	%fd96, %fd97;
	setp.eq.f64	%p53, %fd68, 0d0000000000000000;
	@%p53 bra 	BB24_54;
	bra.uni 	BB24_52;

BB24_54:
	selp.b32	%r29, %r2, 0, %p51;
	or.b32  	%r30, %r29, 2146435072;
	setp.lt.s32	%p57, %r3, 0;
	selp.b32	%r31, %r30, %r29, %p57;
	mov.u32 	%r32, 0;
	mov.b64 	%fd96, {%r32, %r31};
	bra.uni 	BB24_55;

BB24_28:
	setp.gt.s32	%p7, %r6, 15;
	@%p7 bra 	BB24_32;

	setp.eq.s32	%p11, %r6, 14;
	@%p11 bra 	BB24_43;
	bra.uni 	BB24_30;

BB24_43:
	cvt.rni.s64.f64	%rd10, %fd68;
	cvt.rni.s64.f64	%rd11, %fd1;
	cvt.u32.u64	%r18, %rd10;
	cvt.u32.u64	%r19, %rd11;
	or.b32  	%r20, %r19, %r18;
	setp.eq.s32	%p43, %r20, 0;
	selp.f64	%fd98, 0d0000000000000000, 0d3FF0000000000000, %p43;
	bra.uni 	BB24_65;

BB24_75:
	setp.gt.s32	%p88, %r6, 5;
	@%p88 bra 	BB24_79;

	setp.eq.s32	%p92, %r6, 4;
	@%p92 bra 	BB24_113;
	bra.uni 	BB24_77;

BB24_113:
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r4}, %fd1;
	}
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r5}, %fd68;
	}
	bfe.u32 	%r61, %r5, 20, 11;
	add.s32 	%r62, %r61, -1012;
	mov.b64 	 %rd19, %fd68;
	shl.b64 	%rd3, %rd19, %r62;
	setp.eq.s64	%p119, %rd3, -9223372036854775808;
	abs.f64 	%fd51, %fd1;
	// Callseq Start 2
	{
	.reg .b32 temp_param_reg;
	// }
	.param .b64 param0;
	st.param.f64	[param0+0], %fd51;
	.param .b64 param1;
	st.param.f64	[param1+0], %fd68;
	.param .b64 retval0;
	call.uni (retval0), 
	__internal_accurate_pow, 
	(
	param0, 
	param1
	);
	ld.param.f64	%fd105, [retval0+0];
	
	//{
	}// Callseq End 2
	setp.lt.s32	%p120, %r4, 0;
	and.pred  	%p2, %p120, %p119;
	@!%p2 bra 	BB24_115;
	bra.uni 	BB24_114;

BB24_114:
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r63}, %fd105;
	}
	xor.b32  	%r64, %r63, -2147483648;
	{
	.reg .b32 %temp; 
	mov.b64 	{%r65, %temp}, %fd105;
	}
	mov.b64 	%fd105, {%r65, %r64};

BB24_115:
	mov.f64 	%fd104, %fd105;
	setp.eq.f64	%p121, %fd1, 0d0000000000000000;
	@%p121 bra 	BB24_118;
	bra.uni 	BB24_116;

BB24_118:
	selp.b32	%r66, %r4, 0, %p119;
	or.b32  	%r67, %r66, 2146435072;
	setp.lt.s32	%p125, %r5, 0;
	selp.b32	%r68, %r67, %r66, %p125;
	mov.u32 	%r69, 0;
	mov.b64 	%fd104, {%r69, %r68};
	bra.uni 	BB24_119;

BB24_92:
	setp.gt.s32	%p75, %r6, 15;
	@%p75 bra 	BB24_96;

	setp.eq.s32	%p79, %r6, 14;
	@%p79 bra 	BB24_107;
	bra.uni 	BB24_94;

BB24_107:
	cvt.rni.s64.f64	%rd15, %fd1;
	cvt.rni.s64.f64	%rd16, %fd68;
	cvt.u32.u64	%r55, %rd15;
	cvt.u32.u64	%r56, %rd16;
	or.b32  	%r57, %r56, %r55;
	setp.eq.s32	%p111, %r57, 0;
	selp.f64	%fd106, 0d0000000000000000, 0d3FF0000000000000, %p111;
	bra.uni 	BB24_129;

BB24_8:
	setp.eq.s32	%p27, %r6, 2;
	@%p27 bra 	BB24_63;
	bra.uni 	BB24_9;

BB24_63:
	mul.f64 	%fd98, %fd1, %fd68;
	bra.uni 	BB24_65;

BB24_24:
	setp.eq.s32	%p14, %r6, 11;
	@%p14 bra 	BB24_45;

	setp.eq.s32	%p15, %r6, 12;
	@%p15 bra 	BB24_44;
	bra.uni 	BB24_26;

BB24_44:
	max.f64 	%fd98, %fd68, %fd1;
	bra.uni 	BB24_65;

BB24_15:
	setp.eq.s32	%p21, %r6, 6;
	@%p21 bra 	BB24_48;

	setp.eq.s32	%p22, %r6, 7;
	@%p22 bra 	BB24_47;
	bra.uni 	BB24_17;

BB24_47:
	setp.lt.f64	%p48, %fd1, %fd68;
	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p48;
	bra.uni 	BB24_65;

BB24_32:
	setp.eq.s32	%p8, %r6, 16;
	@%p8 bra 	BB24_42;

	setp.eq.s32	%p9, %r6, 17;
	@%p9 bra 	BB24_38;
	bra.uni 	BB24_34;

BB24_38:
	setp.eq.f64	%p35, %fd1, 0d0000000000000000;
	setp.eq.f64	%p36, %fd1, 0d8000000000000000;
	or.pred  	%p37, %p35, %p36;
	mov.f64 	%fd98, 0d7FF8000000000000;
	@%p37 bra 	BB24_65;

	div.rn.f64 	%fd98, %fd68, %fd1;
	abs.f64 	%fd72, %fd98;
	setp.gtu.f64	%p38, %fd72, 0d7FF0000000000000;
	@%p38 bra 	BB24_65;

	{
	.reg .b32 %temp; 
	mov.b64 	{%r15, %temp}, %fd98;
	}
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r16}, %fd98;
	}
	and.b32  	%r17, %r16, 2147483647;
	setp.ne.s32	%p39, %r17, 2146435072;
	setp.ne.s32	%p40, %r15, 0;
	or.pred  	%p41, %p39, %p40;
	@!%p41 bra 	BB24_65;
	bra.uni 	BB24_41;

BB24_41:
	cvt.rmi.f64.f64	%fd73, %fd98;
	mul.f64 	%fd74, %fd1, %fd73;
	sub.f64 	%fd98, %fd68, %fd74;
	bra.uni 	BB24_65;

BB24_72:
	setp.eq.s32	%p95, %r6, 2;
	@%p95 bra 	BB24_127;
	bra.uni 	BB24_73;

BB24_127:
	mul.f64 	%fd106, %fd1, %fd68;
	bra.uni 	BB24_129;

BB24_88:
	setp.eq.s32	%p82, %r6, 11;
	@%p82 bra 	BB24_109;

	setp.eq.s32	%p83, %r6, 12;
	@%p83 bra 	BB24_108;
	bra.uni 	BB24_90;

BB24_108:
	max.f64 	%fd106, %fd1, %fd68;
	bra.uni 	BB24_129;

BB24_79:
	setp.eq.s32	%p89, %r6, 6;
	@%p89 bra 	BB24_112;

	setp.eq.s32	%p90, %r6, 7;
	@%p90 bra 	BB24_111;
	bra.uni 	BB24_81;

BB24_111:
	setp.gt.f64	%p116, %fd1, %fd68;
	selp.f64	%fd106, 0d3FF0000000000000, 0d0000000000000000, %p116;
	bra.uni 	BB24_129;

BB24_96:
	setp.eq.s32	%p76, %r6, 16;
	@%p76 bra 	BB24_106;

	setp.eq.s32	%p77, %r6, 17;
	@%p77 bra 	BB24_102;
	bra.uni 	BB24_98;

BB24_102:
	setp.eq.f64	%p103, %fd68, 0d0000000000000000;
	setp.eq.f64	%p104, %fd68, 0d8000000000000000;
	or.pred  	%p105, %p103, %p104;
	mov.f64 	%fd106, 0d7FF8000000000000;
	@%p105 bra 	BB24_129;

	div.rn.f64 	%fd106, %fd1, %fd68;
	abs.f64 	%fd83, %fd106;
	setp.gtu.f64	%p106, %fd83, 0d7FF0000000000000;
	@%p106 bra 	BB24_129;

	{
	.reg .b32 %temp; 
	mov.b64 	{%r52, %temp}, %fd106;
	}
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r53}, %fd106;
	}
	and.b32  	%r54, %r53, 2147483647;
	setp.ne.s32	%p107, %r54, 2146435072;
	setp.ne.s32	%p108, %r52, 0;
	or.pred  	%p109, %p107, %p108;
	@!%p109 bra 	BB24_129;
	bra.uni 	BB24_105;

BB24_105:
	cvt.rmi.f64.f64	%fd84, %fd106;
	mul.f64 	%fd85, %fd84, %fd68;
	sub.f64 	%fd106, %fd1, %fd85;
	bra.uni 	BB24_129;

BB24_6:
	setp.eq.s32	%p30, %r6, 1;
	@%p30 bra 	BB24_7;
	bra.uni 	BB24_65;

BB24_7:
	sub.f64 	%fd98, %fd68, %fd1;
	bra.uni 	BB24_65;

BB24_22:
	setp.eq.s32	%p18, %r6, 10;
	@%p18 bra 	BB24_23;
	bra.uni 	BB24_65;

BB24_23:
	setp.neu.f64	%p45, %fd1, %fd68;
	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p45;
	bra.uni 	BB24_65;

BB24_13:
	setp.eq.s32	%p25, %r6, 5;
	@%p25 bra 	BB24_14;
	bra.uni 	BB24_65;

BB24_14:
	setp.gt.f64	%p50, %fd1, %fd68;
	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p50;
	bra.uni 	BB24_65;

BB24_30:
	setp.eq.s32	%p12, %r6, 15;
	@%p12 bra 	BB24_31;
	bra.uni 	BB24_65;

BB24_31:
	mul.f64 	%fd76, %fd1, %fd68;
	mov.f64 	%fd77, 0d3FF0000000000000;
	sub.f64 	%fd98, %fd77, %fd76;
	bra.uni 	BB24_65;

BB24_9:
	setp.eq.s32	%p28, %r6, 3;
	@%p28 bra 	BB24_10;
	bra.uni 	BB24_65;

BB24_10:
	div.rn.f64 	%fd98, %fd68, %fd1;
	bra.uni 	BB24_65;

BB24_45:
	min.f64 	%fd98, %fd68, %fd1;
	bra.uni 	BB24_65;

BB24_26:
	setp.eq.s32	%p16, %r6, 13;
	@%p16 bra 	BB24_27;
	bra.uni 	BB24_65;

BB24_27:
	cvt.rni.s64.f64	%rd12, %fd68;
	cvt.rni.s64.f64	%rd13, %fd1;
	cvt.u32.u64	%r21, %rd12;
	cvt.u32.u64	%r22, %rd13;
	and.b32  	%r23, %r22, %r21;
	setp.eq.s32	%p44, %r23, 0;
	selp.f64	%fd98, 0d0000000000000000, 0d3FF0000000000000, %p44;
	bra.uni 	BB24_65;

BB24_48:
	setp.ge.f64	%p49, %fd1, %fd68;
	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p49;
	bra.uni 	BB24_65;

BB24_17:
	setp.eq.s32	%p23, %r6, 8;
	@%p23 bra 	BB24_18;
	bra.uni 	BB24_65;

BB24_18:
	setp.le.f64	%p47, %fd1, %fd68;
	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p47;
	bra.uni 	BB24_65;

BB24_42:
	setp.neu.f64	%p42, %fd68, 0d0000000000000000;
	sub.f64 	%fd75, %fd68, %fd1;
	selp.f64	%fd98, %fd75, 0d0000000000000000, %p42;
	bra.uni 	BB24_65;

BB24_34:
	setp.ne.s32	%p10, %r6, 18;
	@%p10 bra 	BB24_65;

	div.rn.f64 	%fd98, %fd68, %fd1;
	abs.f64 	%fd70, %fd98;
	setp.gtu.f64	%p31, %fd70, 0d7FF0000000000000;
	@%p31 bra 	BB24_65;

	{
	.reg .b32 %temp; 
	mov.b64 	{%r12, %temp}, %fd98;
	}
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r13}, %fd98;
	}
	and.b32  	%r14, %r13, 2147483647;
	setp.ne.s32	%p32, %r14, 2146435072;
	setp.ne.s32	%p33, %r12, 0;
	or.pred  	%p34, %p32, %p33;
	@!%p34 bra 	BB24_65;
	bra.uni 	BB24_37;

BB24_37:
	cvt.rmi.f64.f64	%fd98, %fd98;
	bra.uni 	BB24_65;

BB24_70:
	setp.eq.s32	%p98, %r6, 1;
	@%p98 bra 	BB24_71;
	bra.uni 	BB24_129;

BB24_71:
	sub.f64 	%fd106, %fd1, %fd68;
	bra.uni 	BB24_129;

BB24_86:
	setp.eq.s32	%p86, %r6, 10;
	@%p86 bra 	BB24_87;
	bra.uni 	BB24_129;

BB24_87:
	setp.neu.f64	%p113, %fd1, %fd68;
	selp.f64	%fd106, 0d3FF0000000000000, 0d0000000000000000, %p113;
	bra.uni 	BB24_129;

BB24_77:
	setp.eq.s32	%p93, %r6, 5;
	@%p93 bra 	BB24_78;
	bra.uni 	BB24_129;

BB24_78:
	setp.lt.f64	%p118, %fd1, %fd68;
	selp.f64	%fd106, 0d3FF0000000000000, 0d0000000000000000, %p118;
	bra.uni 	BB24_129;

BB24_94:
	setp.eq.s32	%p80, %r6, 15;
	@%p80 bra 	BB24_95;
	bra.uni 	BB24_129;

BB24_95:
	mul.f64 	%fd87, %fd1, %fd68;
	mov.f64 	%fd88, 0d3FF0000000000000;
	sub.f64 	%fd106, %fd88, %fd87;
	bra.uni 	BB24_129;

BB24_73:
	setp.eq.s32	%p96, %r6, 3;
	@%p96 bra 	BB24_74;
	bra.uni 	BB24_129;

BB24_74:
	div.rn.f64 	%fd106, %fd1, %fd68;
	bra.uni 	BB24_129;

BB24_109:
	min.f64 	%fd106, %fd1, %fd68;
	bra.uni 	BB24_129;

BB24_90:
	setp.eq.s32	%p84, %r6, 13;
	@%p84 bra 	BB24_91;
	bra.uni 	BB24_129;

BB24_91:
	cvt.rni.s64.f64	%rd17, %fd1;
	cvt.rni.s64.f64	%rd18, %fd68;
	cvt.u32.u64	%r58, %rd17;
	cvt.u32.u64	%r59, %rd18;
	and.b32  	%r60, %r59, %r58;
	setp.eq.s32	%p112, %r60, 0;
	selp.f64	%fd106, 0d0000000000000000, 0d3FF0000000000000, %p112;
	bra.uni 	BB24_129;

BB24_112:
	setp.le.f64	%p117, %fd1, %fd68;
	selp.f64	%fd106, 0d3FF0000000000000, 0d0000000000000000, %p117;
	bra.uni 	BB24_129;

BB24_81:
	setp.eq.s32	%p91, %r6, 8;
	@%p91 bra 	BB24_82;
	bra.uni 	BB24_129;

BB24_82:
	setp.ge.f64	%p115, %fd1, %fd68;
	selp.f64	%fd106, 0d3FF0000000000000, 0d0000000000000000, %p115;
	bra.uni 	BB24_129;

BB24_106:
	setp.neu.f64	%p110, %fd1, 0d0000000000000000;
	sub.f64 	%fd86, %fd1, %fd68;
	selp.f64	%fd106, %fd86, 0d0000000000000000, %p110;
	bra.uni 	BB24_129;

BB24_98:
	setp.ne.s32	%p78, %r6, 18;
	@%p78 bra 	BB24_129;

	div.rn.f64 	%fd106, %fd1, %fd68;
	abs.f64 	%fd81, %fd106;
	setp.gtu.f64	%p99, %fd81, 0d7FF0000000000000;
	@%p99 bra 	BB24_129;

	{
	.reg .b32 %temp; 
	mov.b64 	{%r49, %temp}, %fd106;
	}
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r50}, %fd106;
	}
	and.b32  	%r51, %r50, 2147483647;
	setp.ne.s32	%p100, %r51, 2146435072;
	setp.ne.s32	%p101, %r49, 0;
	or.pred  	%p102, %p100, %p101;
	@!%p102 bra 	BB24_129;
	bra.uni 	BB24_101;

BB24_101:
	cvt.rmi.f64.f64	%fd106, %fd106;
	bra.uni 	BB24_129;

BB24_52:
	setp.gt.s32	%p54, %r2, -1;
	@%p54 bra 	BB24_55;

	cvt.rzi.f64.f64	%fd78, %fd1;
	setp.neu.f64	%p55, %fd78, %fd1;
	selp.f64	%fd96, 0dFFF8000000000000, %fd96, %p55;

BB24_55:
	mov.f64 	%fd24, %fd96;
	add.f64 	%fd25, %fd1, %fd68;
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r33}, %fd25;
	}
	and.b32  	%r34, %r33, 2146435072;
	setp.ne.s32	%p58, %r34, 2146435072;
	mov.f64 	%fd95, %fd24;
	@%p58 bra 	BB24_62;

	setp.gtu.f64	%p59, %fd18, 0d7FF0000000000000;
	mov.f64 	%fd95, %fd25;
	@%p59 bra 	BB24_62;

	abs.f64 	%fd79, %fd1;
	setp.gtu.f64	%p60, %fd79, 0d7FF0000000000000;
	mov.f64 	%fd94, %fd25;
	mov.f64 	%fd95, %fd94;
	@%p60 bra 	BB24_62;

	{
	.reg .b32 %temp; 
	mov.b64 	{%r35, %temp}, %fd1;
	}
	and.b32  	%r36, %r3, 2147483647;
	setp.eq.s32	%p61, %r36, 2146435072;
	setp.eq.s32	%p62, %r35, 0;
	and.pred  	%p63, %p61, %p62;
	@%p63 bra 	BB24_61;
	bra.uni 	BB24_59;

BB24_61:
	setp.gt.f64	%p67, %fd18, 0d3FF0000000000000;
	selp.b32	%r44, 2146435072, 0, %p67;
	xor.b32  	%r45, %r44, 2146435072;
	setp.lt.s32	%p68, %r3, 0;
	selp.b32	%r46, %r45, %r44, %p68;
	setp.eq.f64	%p69, %fd68, 0dBFF0000000000000;
	selp.b32	%r47, 1072693248, %r46, %p69;
	mov.u32 	%r48, 0;
	mov.b64 	%fd95, {%r48, %r47};
	bra.uni 	BB24_62;

BB24_116:
	setp.gt.s32	%p122, %r4, -1;
	@%p122 bra 	BB24_119;

	cvt.rzi.f64.f64	%fd89, %fd68;
	setp.neu.f64	%p123, %fd89, %fd68;
	selp.f64	%fd104, 0dFFF8000000000000, %fd104, %p123;

BB24_119:
	mov.f64 	%fd57, %fd104;
	add.f64 	%fd58, %fd1, %fd68;
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r70}, %fd58;
	}
	and.b32  	%r71, %r70, 2146435072;
	setp.ne.s32	%p126, %r71, 2146435072;
	mov.f64 	%fd103, %fd57;
	@%p126 bra 	BB24_126;

	setp.gtu.f64	%p127, %fd51, 0d7FF0000000000000;
	mov.f64 	%fd103, %fd58;
	@%p127 bra 	BB24_126;

	abs.f64 	%fd90, %fd68;
	setp.gtu.f64	%p128, %fd90, 0d7FF0000000000000;
	mov.f64 	%fd102, %fd58;
	mov.f64 	%fd103, %fd102;
	@%p128 bra 	BB24_126;

	{
	.reg .b32 %temp; 
	mov.b64 	{%r72, %temp}, %fd68;
	}
	and.b32  	%r73, %r5, 2147483647;
	setp.eq.s32	%p129, %r73, 2146435072;
	setp.eq.s32	%p130, %r72, 0;
	and.pred  	%p131, %p129, %p130;
	@%p131 bra 	BB24_125;
	bra.uni 	BB24_123;

BB24_125:
	setp.gt.f64	%p135, %fd51, 0d3FF0000000000000;
	selp.b32	%r81, 2146435072, 0, %p135;
	xor.b32  	%r82, %r81, 2146435072;
	setp.lt.s32	%p136, %r5, 0;
	selp.b32	%r83, %r82, %r81, %p136;
	setp.eq.f64	%p137, %fd1, 0dBFF0000000000000;
	selp.b32	%r84, 1072693248, %r83, %p137;
	mov.u32 	%r85, 0;
	mov.b64 	%fd103, {%r85, %r84};
	bra.uni 	BB24_126;

BB24_59:
	{
	.reg .b32 %temp; 
	mov.b64 	{%r37, %temp}, %fd68;
	}
	and.b32  	%r38, %r2, 2147483647;
	setp.eq.s32	%p64, %r38, 2146435072;
	setp.eq.s32	%p65, %r37, 0;
	and.pred  	%p66, %p64, %p65;
	mov.f64 	%fd95, %fd24;
	@!%p66 bra 	BB24_62;
	bra.uni 	BB24_60;

BB24_60:
	shr.s32 	%r39, %r3, 31;
	and.b32  	%r40, %r39, -2146435072;
	selp.b32	%r41, -1048576, 2146435072, %p1;
	add.s32 	%r42, %r41, %r40;
	mov.u32 	%r43, 0;
	mov.b64 	%fd95, {%r43, %r42};

BB24_62:
	setp.eq.f64	%p70, %fd1, 0d0000000000000000;
	setp.eq.f64	%p71, %fd68, 0d3FF0000000000000;
	or.pred  	%p72, %p71, %p70;
	selp.f64	%fd98, 0d3FF0000000000000, %fd95, %p72;

BB24_65:
	st.global.f64 	[%rd1], %fd98;
	bra.uni 	BB24_130;

BB24_123:
	{
	.reg .b32 %temp; 
	mov.b64 	{%r74, %temp}, %fd1;
	}
	and.b32  	%r75, %r4, 2147483647;
	setp.eq.s32	%p132, %r75, 2146435072;
	setp.eq.s32	%p133, %r74, 0;
	and.pred  	%p134, %p132, %p133;
	mov.f64 	%fd103, %fd57;
	@!%p134 bra 	BB24_126;
	bra.uni 	BB24_124;

BB24_124:
	shr.s32 	%r76, %r5, 31;
	and.b32  	%r77, %r76, -2146435072;
	selp.b32	%r78, -1048576, 2146435072, %p2;
	add.s32 	%r79, %r78, %r77;
	mov.u32 	%r80, 0;
	mov.b64 	%fd103, {%r80, %r79};

BB24_126:
	setp.eq.f64	%p138, %fd68, 0d0000000000000000;
	setp.eq.f64	%p139, %fd1, 0d3FF0000000000000;
	or.pred  	%p140, %p139, %p138;
	selp.f64	%fd106, 0d3FF0000000000000, %fd103, %p140;

BB24_129:
	st.global.f64 	[%rd1], %fd106;

BB24_130:
	bar.sync 	0;
	ret;
}

	// .globl	matrix_scalar_op_f
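// matrix_scalar_op_f: float variant of matrix_scalar_op_d, with the same
// inferred op-code dispatch. The scalar is passed as a double and rounded
// to float on entry; the power path is inlined here as a log2/exp2
// approximation sequence rather than the __internal_accurate_pow call used
// by the double kernel.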
.visible .entry matrix_scalar_op_f(
	.param .u64 matrix_scalar_op_f_param_0,
	.param .f64 matrix_scalar_op_f_param_1,
	.param .u64 matrix_scalar_op_f_param_2,
	.param .u32 matrix_scalar_op_f_param_3,
	.param .u32 matrix_scalar_op_f_param_4,
	.param .u32 matrix_scalar_op_f_param_5
)
{
	.reg .pred 	%p<139>;
	.reg .f32 	%f<265>;
	.reg .b32 	%r<58>;
	.reg .f64 	%fd<2>;
	.reg .b64 	%rd<16>;


	ld.param.u64 	%rd2, [matrix_scalar_op_f_param_0];
	ld.param.f64 	%fd1, [matrix_scalar_op_f_param_1];
	ld.param.u64 	%rd3, [matrix_scalar_op_f_param_2];
	ld.param.u32 	%r4, [matrix_scalar_op_f_param_3];
	ld.param.u32 	%r2, [matrix_scalar_op_f_param_4];
	ld.param.u32 	%r3, [matrix_scalar_op_f_param_5];
	cvt.rn.f32.f64	%f1, %fd1;
	mov.u32 	%r5, %ntid.x;
	mov.u32 	%r6, %ctaid.x;
	mov.u32 	%r7, %tid.x;
	mad.lo.s32 	%r1, %r5, %r6, %r7;
	setp.ge.s32	%p3, %r1, %r4;
	@%p3 bra 	BB25_126;

	cvta.to.global.u64 	%rd4, %rd3;
	cvta.to.global.u64 	%rd5, %rd2;
	mul.wide.s32 	%rd6, %r1, 4;
	add.s64 	%rd7, %rd5, %rd6;
	ld.global.f32 	%f2, [%rd7];
	add.s64 	%rd1, %rd4, %rd6;
	setp.eq.s32	%p4, %r3, 0;
	@%p4 bra 	BB25_64;

	mov.f32 	%f261, 0f7F7FFFFF;
	setp.gt.s32	%p5, %r2, 8;
	@%p5 bra 	BB25_19;

	setp.gt.s32	%p19, %r2, 3;
	@%p19 bra 	BB25_11;

	setp.gt.s32	%p26, %r2, 1;
	@%p26 bra 	BB25_8;

	setp.eq.s32	%p29, %r2, 0;
	@%p29 bra 	BB25_62;
	bra.uni 	BB25_6;

BB25_62:
	add.f32 	%f261, %f1, %f2;
	bra.uni 	BB25_63;

BB25_64:
	mov.f32 	%f264, 0f7F7FFFFF;
	setp.gt.s32	%p72, %r2, 8;
	@%p72 bra 	BB25_81;

	setp.gt.s32	%p86, %r2, 3;
	@%p86 bra 	BB25_73;

	setp.gt.s32	%p93, %r2, 1;
	@%p93 bra 	BB25_70;

	setp.eq.s32	%p96, %r2, 0;
	@%p96 bra 	BB25_124;
	bra.uni 	BB25_68;

BB25_124:
	add.f32 	%f264, %f1, %f2;
	bra.uni 	BB25_125;

BB25_19:
	setp.gt.s32	%p6, %r2, 13;
	@%p6 bra 	BB25_28;

	setp.gt.s32	%p13, %r2, 10;
	@%p13 bra 	BB25_24;

	setp.eq.s32	%p17, %r2, 9;
	@%p17 bra 	BB25_44;
	bra.uni 	BB25_22;

BB25_44:
	setp.eq.f32	%p40, %f1, %f2;
	selp.f32	%f261, 0f3F800000, 0f00000000, %p40;
	bra.uni 	BB25_63;

BB25_81:
	setp.gt.s32	%p73, %r2, 13;
	@%p73 bra 	BB25_90;

	setp.gt.s32	%p80, %r2, 10;
	@%p80 bra 	BB25_86;

	setp.eq.s32	%p84, %r2, 9;
	@%p84 bra 	BB25_106;
	bra.uni 	BB25_84;

BB25_106:
	setp.eq.f32	%p107, %f2, %f1;
	selp.f32	%f264, 0f3F800000, 0f00000000, %p107;
	bra.uni 	BB25_125;

BB25_11:
	setp.gt.s32	%p20, %r2, 5;
	@%p20 bra 	BB25_15;

	setp.eq.s32	%p24, %r2, 4;
	@%p24 bra 	BB25_47;
	bra.uni 	BB25_13;

BB25_47:
	mul.f32 	%f90, %f2, 0f3F000000;
	cvt.rzi.f32.f32	%f91, %f90;
	fma.rn.f32 	%f92, %f91, 0fC0000000, %f2;
	abs.f32 	%f19, %f92;
	abs.f32 	%f20, %f1;
	setp.lt.f32	%p45, %f20, 0f00800000;
	mul.f32 	%f93, %f20, 0f4B800000;
	selp.f32	%f94, 0fC3170000, 0fC2FE0000, %p45;
	selp.f32	%f95, %f93, %f20, %p45;
	mov.b32 	 %r14, %f95;
	and.b32  	%r15, %r14, 8388607;
	or.b32  	%r16, %r15, 1065353216;
	mov.b32 	 %f96, %r16;
	shr.u32 	%r17, %r14, 23;
	cvt.rn.f32.u32	%f97, %r17;
	add.f32 	%f98, %f94, %f97;
	setp.gt.f32	%p46, %f96, 0f3FB504F3;
	mul.f32 	%f99, %f96, 0f3F000000;
	add.f32 	%f100, %f98, 0f3F800000;
	selp.f32	%f101, %f99, %f96, %p46;
	selp.f32	%f102, %f100, %f98, %p46;
	add.f32 	%f103, %f101, 0fBF800000;
	add.f32 	%f87, %f101, 0f3F800000;
	// inline asm
	rcp.approx.ftz.f32 %f86,%f87;
	// inline asm
	add.f32 	%f104, %f103, %f103;
	mul.f32 	%f105, %f86, %f104;
	mul.f32 	%f106, %f105, %f105;
	mov.f32 	%f107, 0f3C4CAF63;
	mov.f32 	%f108, 0f3B18F0FE;
	fma.rn.f32 	%f109, %f108, %f106, %f107;
	mov.f32 	%f110, 0f3DAAAABD;
	fma.rn.f32 	%f111, %f109, %f106, %f110;
	mul.rn.f32 	%f112, %f111, %f106;
	mul.rn.f32 	%f113, %f112, %f105;
	sub.f32 	%f114, %f103, %f105;
	neg.f32 	%f115, %f105;
	add.f32 	%f116, %f114, %f114;
	fma.rn.f32 	%f117, %f115, %f103, %f116;
	mul.rn.f32 	%f118, %f86, %f117;
	add.f32 	%f119, %f113, %f105;
	sub.f32 	%f120, %f105, %f119;
	add.f32 	%f121, %f113, %f120;
	add.f32 	%f122, %f118, %f121;
	add.f32 	%f123, %f119, %f122;
	sub.f32 	%f124, %f119, %f123;
	add.f32 	%f125, %f122, %f124;
	mov.f32 	%f126, 0f3F317200;
	mul.rn.f32 	%f127, %f102, %f126;
	mov.f32 	%f128, 0f35BFBE8E;
	mul.rn.f32 	%f129, %f102, %f128;
	add.f32 	%f130, %f127, %f123;
	sub.f32 	%f131, %f127, %f130;
	add.f32 	%f132, %f123, %f131;
	add.f32 	%f133, %f125, %f132;
	add.f32 	%f134, %f129, %f133;
	add.f32 	%f135, %f130, %f134;
	sub.f32 	%f136, %f130, %f135;
	add.f32 	%f137, %f134, %f136;
	abs.f32 	%f21, %f2;
	setp.gt.f32	%p47, %f21, 0f77F684DF;
	mul.f32 	%f138, %f2, 0f39000000;
	selp.f32	%f139, %f138, %f2, %p47;
	mul.rn.f32 	%f140, %f139, %f135;
	neg.f32 	%f141, %f140;
	fma.rn.f32 	%f142, %f139, %f135, %f141;
	fma.rn.f32 	%f143, %f139, %f137, %f142;
	mov.f32 	%f144, 0f00000000;
	fma.rn.f32 	%f145, %f144, %f135, %f143;
	add.rn.f32 	%f146, %f140, %f145;
	neg.f32 	%f147, %f146;
	add.rn.f32 	%f148, %f140, %f147;
	add.rn.f32 	%f149, %f148, %f145;
	mov.b32 	 %r18, %f146;
	setp.eq.s32	%p48, %r18, 1118925336;
	add.s32 	%r19, %r18, -1;
	mov.b32 	 %f150, %r19;
	add.f32 	%f151, %f149, 0f37000000;
	selp.f32	%f152, %f150, %f146, %p48;
	selp.f32	%f22, %f151, %f149, %p48;
	mul.f32 	%f153, %f152, 0f3FB8AA3B;
	cvt.rzi.f32.f32	%f154, %f153;
	mov.f32 	%f155, 0fBF317200;
	fma.rn.f32 	%f156, %f154, %f155, %f152;
	mov.f32 	%f157, 0fB5BFBE8E;
	fma.rn.f32 	%f158, %f154, %f157, %f156;
	mul.f32 	%f89, %f158, 0f3FB8AA3B;
	// inline asm
	ex2.approx.ftz.f32 %f88,%f89;
	// inline asm
	add.f32 	%f159, %f154, 0f00000000;
	ex2.approx.f32 	%f160, %f159;
	mul.f32 	%f161, %f88, %f160;
	setp.lt.f32	%p49, %f152, 0fC2D20000;
	selp.f32	%f162, 0f00000000, %f161, %p49;
	setp.gt.f32	%p50, %f152, 0f42D20000;
	selp.f32	%f259, 0f7F800000, %f162, %p50;
	setp.eq.f32	%p51, %f259, 0f7F800000;
	@%p51 bra 	BB25_49;

	fma.rn.f32 	%f259, %f259, %f22, %f259;

BB25_49:
	setp.lt.f32	%p52, %f1, 0f00000000;
	setp.eq.f32	%p53, %f19, 0f3F800000;
	and.pred  	%p1, %p52, %p53;
	mov.b32 	 %r20, %f259;
	xor.b32  	%r21, %r20, -2147483648;
	mov.b32 	 %f163, %r21;
	selp.f32	%f260, %f163, %f259, %p1;
	setp.eq.f32	%p54, %f1, 0f00000000;
	@%p54 bra 	BB25_52;
	bra.uni 	BB25_50;

BB25_52:
	add.f32 	%f165, %f1, %f1;
	mov.b32 	 %r22, %f165;
	selp.b32	%r23, %r22, 0, %p53;
	or.b32  	%r24, %r23, 2139095040;
	setp.lt.f32	%p58, %f2, 0f00000000;
	selp.b32	%r25, %r24, %r23, %p58;
	mov.b32 	 %f260, %r25;
	bra.uni 	BB25_53;

BB25_28:
	setp.gt.s32	%p7, %r2, 15;
	@%p7 bra 	BB25_32;

	setp.eq.s32	%p11, %r2, 14;
	@%p11 bra 	BB25_41;
	bra.uni 	BB25_30;

BB25_41:
	cvt.rni.s64.f32	%rd8, %f1;
	cvt.rni.s64.f32	%rd9, %f2;
	cvt.u32.u64	%r8, %rd8;
	cvt.u32.u64	%r9, %rd9;
	or.b32  	%r10, %r9, %r8;
	setp.eq.s32	%p37, %r10, 0;
	selp.f32	%f261, 0f00000000, 0f3F800000, %p37;
	bra.uni 	BB25_63;

BB25_73:
	setp.gt.s32	%p87, %r2, 5;
	@%p87 bra 	BB25_77;

	setp.eq.s32	%p91, %r2, 4;
	@%p91 bra 	BB25_109;
	bra.uni 	BB25_75;

BB25_109:
	mul.f32 	%f181, %f1, 0f3F000000;
	cvt.rzi.f32.f32	%f182, %f181;
	fma.rn.f32 	%f183, %f182, 0fC0000000, %f1;
	abs.f32 	%f56, %f183;
	abs.f32 	%f57, %f2;
	setp.lt.f32	%p112, %f57, 0f00800000;
	mul.f32 	%f184, %f57, 0f4B800000;
	selp.f32	%f185, 0fC3170000, 0fC2FE0000, %p112;
	selp.f32	%f186, %f184, %f57, %p112;
	mov.b32 	 %r39, %f186;
	and.b32  	%r40, %r39, 8388607;
	or.b32  	%r41, %r40, 1065353216;
	mov.b32 	 %f187, %r41;
	shr.u32 	%r42, %r39, 23;
	cvt.rn.f32.u32	%f188, %r42;
	add.f32 	%f189, %f185, %f188;
	setp.gt.f32	%p113, %f187, 0f3FB504F3;
	mul.f32 	%f190, %f187, 0f3F000000;
	add.f32 	%f191, %f189, 0f3F800000;
	selp.f32	%f192, %f190, %f187, %p113;
	selp.f32	%f193, %f191, %f189, %p113;
	add.f32 	%f194, %f192, 0fBF800000;
	add.f32 	%f178, %f192, 0f3F800000;
	// inline asm
	rcp.approx.ftz.f32 %f177,%f178;
	// inline asm
	add.f32 	%f195, %f194, %f194;
	mul.f32 	%f196, %f177, %f195;
	mul.f32 	%f197, %f196, %f196;
	mov.f32 	%f198, 0f3C4CAF63;
	mov.f32 	%f199, 0f3B18F0FE;
	fma.rn.f32 	%f200, %f199, %f197, %f198;
	mov.f32 	%f201, 0f3DAAAABD;
	fma.rn.f32 	%f202, %f200, %f197, %f201;
	mul.rn.f32 	%f203, %f202, %f197;
	mul.rn.f32 	%f204, %f203, %f196;
	sub.f32 	%f205, %f194, %f196;
	neg.f32 	%f206, %f196;
	add.f32 	%f207, %f205, %f205;
	fma.rn.f32 	%f208, %f206, %f194, %f207;
	mul.rn.f32 	%f209, %f177, %f208;
	add.f32 	%f210, %f204, %f196;
	sub.f32 	%f211, %f196, %f210;
	add.f32 	%f212, %f204, %f211;
	add.f32 	%f213, %f209, %f212;
	add.f32 	%f214, %f210, %f213;
	sub.f32 	%f215, %f210, %f214;
	add.f32 	%f216, %f213, %f215;
	mov.f32 	%f217, 0f3F317200;
	mul.rn.f32 	%f218, %f193, %f217;
	mov.f32 	%f219, 0f35BFBE8E;
	mul.rn.f32 	%f220, %f193, %f219;
	add.f32 	%f221, %f218, %f214;
	sub.f32 	%f222, %f218, %f221;
	add.f32 	%f223, %f214, %f222;
	add.f32 	%f224, %f216, %f223;
	add.f32 	%f225, %f220, %f224;
	add.f32 	%f226, %f221, %f225;
	sub.f32 	%f227, %f221, %f226;
	add.f32 	%f228, %f225, %f227;
	abs.f32 	%f58, %f1;
	setp.gt.f32	%p114, %f58, 0f77F684DF;
	mul.f32 	%f229, %f1, 0f39000000;
	selp.f32	%f230, %f229, %f1, %p114;
	mul.rn.f32 	%f231, %f230, %f226;
	neg.f32 	%f232, %f231;
	fma.rn.f32 	%f233, %f230, %f226, %f232;
	fma.rn.f32 	%f234, %f230, %f228, %f233;
	mov.f32 	%f235, 0f00000000;
	fma.rn.f32 	%f236, %f235, %f226, %f234;
	add.rn.f32 	%f237, %f231, %f236;
	neg.f32 	%f238, %f237;
	add.rn.f32 	%f239, %f231, %f238;
	add.rn.f32 	%f240, %f239, %f236;
	mov.b32 	 %r43, %f237;
	setp.eq.s32	%p115, %r43, 1118925336;
	add.s32 	%r44, %r43, -1;
	mov.b32 	 %f241, %r44;
	add.f32 	%f242, %f240, 0f37000000;
	selp.f32	%f243, %f241, %f237, %p115;
	selp.f32	%f59, %f242, %f240, %p115;
	mul.f32 	%f244, %f243, 0f3FB8AA3B;
	cvt.rzi.f32.f32	%f245, %f244;
	mov.f32 	%f246, 0fBF317200;
	fma.rn.f32 	%f247, %f245, %f246, %f243;
	mov.f32 	%f248, 0fB5BFBE8E;
	fma.rn.f32 	%f249, %f245, %f248, %f247;
	mul.f32 	%f180, %f249, 0f3FB8AA3B;
	// inline asm
	ex2.approx.ftz.f32 %f179,%f180;
	// inline asm
	add.f32 	%f250, %f245, 0f00000000;
	ex2.approx.f32 	%f251, %f250;
	mul.f32 	%f252, %f179, %f251;
	setp.lt.f32	%p116, %f243, 0fC2D20000;
	selp.f32	%f253, 0f00000000, %f252, %p116;
	setp.gt.f32	%p117, %f243, 0f42D20000;
	selp.f32	%f262, 0f7F800000, %f253, %p117;
	setp.eq.f32	%p118, %f262, 0f7F800000;
	@%p118 bra 	BB25_111;

	fma.rn.f32 	%f262, %f262, %f59, %f262;

BB25_111:
	setp.lt.f32	%p119, %f2, 0f00000000;
	setp.eq.f32	%p120, %f56, 0f3F800000;
	and.pred  	%p2, %p119, %p120;
	mov.b32 	 %r45, %f262;
	xor.b32  	%r46, %r45, -2147483648;
	mov.b32 	 %f254, %r46;
	selp.f32	%f263, %f254, %f262, %p2;
	setp.eq.f32	%p121, %f2, 0f00000000;
	@%p121 bra 	BB25_114;
	bra.uni 	BB25_112;

BB25_114:
	add.f32 	%f256, %f2, %f2;
	mov.b32 	 %r47, %f256;
	selp.b32	%r48, %r47, 0, %p120;
	or.b32  	%r49, %r48, 2139095040;
	setp.lt.f32	%p125, %f1, 0f00000000;
	selp.b32	%r50, %r49, %r48, %p125;
	mov.b32 	 %f263, %r50;
	bra.uni 	BB25_115;

BB25_90:
	setp.gt.s32	%p74, %r2, 15;
	@%p74 bra 	BB25_94;

	setp.eq.s32	%p78, %r2, 14;
	@%p78 bra 	BB25_103;
	bra.uni 	BB25_92;

BB25_103:
	cvt.rni.s64.f32	%rd12, %f2;
	cvt.rni.s64.f32	%rd13, %f1;
	cvt.u32.u64	%r33, %rd12;
	cvt.u32.u64	%r34, %rd13;
	or.b32  	%r35, %r34, %r33;
	setp.eq.s32	%p104, %r35, 0;
	selp.f32	%f264, 0f00000000, 0f3F800000, %p104;
	bra.uni 	BB25_125;

BB25_8:
	setp.eq.s32	%p27, %r2, 2;
	@%p27 bra 	BB25_61;
	bra.uni 	BB25_9;

BB25_61:
	mul.f32 	%f261, %f1, %f2;
	bra.uni 	BB25_63;

BB25_24:
	setp.eq.s32	%p14, %r2, 11;
	@%p14 bra 	BB25_43;

	setp.eq.s32	%p15, %r2, 12;
	@%p15 bra 	BB25_42;
	bra.uni 	BB25_26;

BB25_42:
	max.f32 	%f261, %f1, %f2;
	bra.uni 	BB25_63;

BB25_15:
	setp.eq.s32	%p21, %r2, 6;
	@%p21 bra 	BB25_46;

	setp.eq.s32	%p22, %r2, 7;
	@%p22 bra 	BB25_45;
	bra.uni 	BB25_17;

BB25_45:
	setp.gt.f32	%p42, %f1, %f2;
	selp.f32	%f261, 0f3F800000, 0f00000000, %p42;
	bra.uni 	BB25_63;

BB25_32:
	setp.eq.s32	%p8, %r2, 16;
	@%p8 bra 	BB25_40;

	setp.eq.s32	%p9, %r2, 17;
	@%p9 bra 	BB25_37;
	bra.uni 	BB25_34;

BB25_37:
	setp.eq.f32	%p32, %f2, 0f00000000;
	setp.eq.f32	%p33, %f2, 0f80000000;
	or.pred  	%p34, %p32, %p33;
	mov.f32 	%f261, 0f7FC00000;
	@%p34 bra 	BB25_63;

	div.rn.f32 	%f261, %f1, %f2;
	abs.f32 	%f80, %f261;
	setp.geu.f32	%p35, %f80, 0f7F800000;
	@%p35 bra 	BB25_63;

	cvt.rmi.f32.f32	%f81, %f261;
	mul.f32 	%f82, %f2, %f81;
	sub.f32 	%f261, %f1, %f82;
	bra.uni 	BB25_63;

BB25_70:
	setp.eq.s32	%p94, %r2, 2;
	@%p94 bra 	BB25_123;
	bra.uni 	BB25_71;

BB25_123:
	mul.f32 	%f264, %f1, %f2;
	bra.uni 	BB25_125;

BB25_86:
	setp.eq.s32	%p81, %r2, 11;
	@%p81 bra 	BB25_105;

	setp.eq.s32	%p82, %r2, 12;
	@%p82 bra 	BB25_104;
	bra.uni 	BB25_88;

BB25_104:
	max.f32 	%f264, %f2, %f1;
	bra.uni 	BB25_125;

BB25_77:
	setp.eq.s32	%p88, %r2, 6;
	@%p88 bra 	BB25_108;

	setp.eq.s32	%p89, %r2, 7;
	@%p89 bra 	BB25_107;
	bra.uni 	BB25_79;

BB25_107:
	setp.gt.f32	%p109, %f2, %f1;
	selp.f32	%f264, 0f3F800000, 0f00000000, %p109;
	bra.uni 	BB25_125;

BB25_94:
	setp.eq.s32	%p75, %r2, 16;
	@%p75 bra 	BB25_102;

	setp.eq.s32	%p76, %r2, 17;
	@%p76 bra 	BB25_99;
	bra.uni 	BB25_96;

BB25_99:
	setp.eq.f32	%p99, %f1, 0f00000000;
	setp.eq.f32	%p100, %f1, 0f80000000;
	or.pred  	%p101, %p99, %p100;
	mov.f32 	%f264, 0f7FC00000;
	@%p101 bra 	BB25_125;

	div.rn.f32 	%f264, %f2, %f1;
	abs.f32 	%f171, %f264;
	setp.geu.f32	%p102, %f171, 0f7F800000;
	@%p102 bra 	BB25_125;

	cvt.rmi.f32.f32	%f172, %f264;
	mul.f32 	%f173, %f1, %f172;
	sub.f32 	%f264, %f2, %f173;
	bra.uni 	BB25_125;

BB25_6:
	setp.eq.s32	%p30, %r2, 1;
	@%p30 bra 	BB25_7;
	bra.uni 	BB25_63;

BB25_7:
	sub.f32 	%f261, %f1, %f2;
	bra.uni 	BB25_63;

BB25_22:
	setp.eq.s32	%p18, %r2, 10;
	@%p18 bra 	BB25_23;
	bra.uni 	BB25_63;

BB25_23:
	setp.neu.f32	%p39, %f1, %f2;
	selp.f32	%f261, 0f3F800000, 0f00000000, %p39;
	bra.uni 	BB25_63;

BB25_13:
	setp.eq.s32	%p25, %r2, 5;
	@%p25 bra 	BB25_14;
	bra.uni 	BB25_63;

BB25_14:
	setp.lt.f32	%p44, %f1, %f2;
	selp.f32	%f261, 0f3F800000, 0f00000000, %p44;
	bra.uni 	BB25_63;

BB25_30:
	setp.eq.s32	%p12, %r2, 15;
	@%p12 bra 	BB25_31;
	bra.uni 	BB25_63;

BB25_31:
	mul.f32 	%f84, %f1, %f2;
	mov.f32 	%f85, 0f3F800000;
	sub.f32 	%f261, %f85, %f84;
	bra.uni 	BB25_63;

BB25_9:
	setp.eq.s32	%p28, %r2, 3;
	@%p28 bra 	BB25_10;
	bra.uni 	BB25_63;

BB25_10:
	div.rn.f32 	%f261, %f1, %f2;
	bra.uni 	BB25_63;

BB25_43:
	min.f32 	%f261, %f1, %f2;
	bra.uni 	BB25_63;

BB25_26:
	setp.eq.s32	%p16, %r2, 13;
	@%p16 bra 	BB25_27;
	bra.uni 	BB25_63;

BB25_27:
	cvt.rni.s64.f32	%rd10, %f1;
	cvt.rni.s64.f32	%rd11, %f2;
	cvt.u32.u64	%r11, %rd10;
	cvt.u32.u64	%r12, %rd11;
	and.b32  	%r13, %r12, %r11;
	setp.eq.s32	%p38, %r13, 0;
	selp.f32	%f261, 0f00000000, 0f3F800000, %p38;
	bra.uni 	BB25_63;

BB25_46:
	setp.le.f32	%p43, %f1, %f2;
	selp.f32	%f261, 0f3F800000, 0f00000000, %p43;
	bra.uni 	BB25_63;

BB25_17:
	setp.eq.s32	%p23, %r2, 8;
	@%p23 bra 	BB25_18;
	bra.uni 	BB25_63;

BB25_18:
	setp.ge.f32	%p41, %f1, %f2;
	selp.f32	%f261, 0f3F800000, 0f00000000, %p41;
	bra.uni 	BB25_63;

BB25_40:
	setp.neu.f32	%p36, %f1, 0f00000000;
	sub.f32 	%f83, %f1, %f2;
	selp.f32	%f261, %f83, 0f00000000, %p36;
	bra.uni 	BB25_63;

BB25_34:
	setp.ne.s32	%p10, %r2, 18;
	@%p10 bra 	BB25_63;

	div.rn.f32 	%f261, %f1, %f2;
	abs.f32 	%f78, %f261;
	setp.geu.f32	%p31, %f78, 0f7F800000;
	@%p31 bra 	BB25_63;

	cvt.rmi.f32.f32	%f261, %f261;
	bra.uni 	BB25_63;

BB25_68:
	setp.eq.s32	%p97, %r2, 1;
	@%p97 bra 	BB25_69;
	bra.uni 	BB25_125;

BB25_69:
	sub.f32 	%f264, %f2, %f1;
	bra.uni 	BB25_125;

BB25_84:
	setp.eq.s32	%p85, %r2, 10;
	@%p85 bra 	BB25_85;
	bra.uni 	BB25_125;

BB25_85:
	setp.neu.f32	%p106, %f2, %f1;
	selp.f32	%f264, 0f3F800000, 0f00000000, %p106;
	bra.uni 	BB25_125;

BB25_75:
	setp.eq.s32	%p92, %r2, 5;
	@%p92 bra 	BB25_76;
	bra.uni 	BB25_125;

BB25_76:
	setp.lt.f32	%p111, %f2, %f1;
	selp.f32	%f264, 0f3F800000, 0f00000000, %p111;
	bra.uni 	BB25_125;

BB25_92:
	setp.eq.s32	%p79, %r2, 15;
	@%p79 bra 	BB25_93;
	bra.uni 	BB25_125;

BB25_93:
	mul.f32 	%f175, %f1, %f2;
	mov.f32 	%f176, 0f3F800000;
	sub.f32 	%f264, %f176, %f175;
	bra.uni 	BB25_125;

BB25_71:
	setp.eq.s32	%p95, %r2, 3;
	@%p95 bra 	BB25_72;
	bra.uni 	BB25_125;

BB25_72:
	div.rn.f32 	%f264, %f2, %f1;
	bra.uni 	BB25_125;

BB25_105:
	min.f32 	%f264, %f2, %f1;
	bra.uni 	BB25_125;

BB25_88:
	setp.eq.s32	%p83, %r2, 13;
	@%p83 bra 	BB25_89;
	bra.uni 	BB25_125;

BB25_89:
	cvt.rni.s64.f32	%rd14, %f2;
	cvt.rni.s64.f32	%rd15, %f1;
	cvt.u32.u64	%r36, %rd14;
	cvt.u32.u64	%r37, %rd15;
	and.b32  	%r38, %r37, %r36;
	setp.eq.s32	%p105, %r38, 0;
	selp.f32	%f264, 0f00000000, 0f3F800000, %p105;
	bra.uni 	BB25_125;

BB25_108:
	setp.le.f32	%p110, %f2, %f1;
	selp.f32	%f264, 0f3F800000, 0f00000000, %p110;
	bra.uni 	BB25_125;

BB25_79:
	setp.eq.s32	%p90, %r2, 8;
	@%p90 bra 	BB25_80;
	bra.uni 	BB25_125;

BB25_80:
	setp.ge.f32	%p108, %f2, %f1;
	selp.f32	%f264, 0f3F800000, 0f00000000, %p108;
	bra.uni 	BB25_125;

BB25_102:
	setp.neu.f32	%p103, %f2, 0f00000000;
	sub.f32 	%f174, %f2, %f1;
	selp.f32	%f264, %f174, 0f00000000, %p103;
	bra.uni 	BB25_125;

BB25_96:
	setp.ne.s32	%p77, %r2, 18;
	@%p77 bra 	BB25_125;

	div.rn.f32 	%f264, %f2, %f1;
	abs.f32 	%f169, %f264;
	setp.geu.f32	%p98, %f169, 0f7F800000;
	@%p98 bra 	BB25_125;

	cvt.rmi.f32.f32	%f264, %f264;
	bra.uni 	BB25_125;

BB25_50:
	setp.geu.f32	%p55, %f1, 0f00000000;
	@%p55 bra 	BB25_53;

	cvt.rzi.f32.f32	%f164, %f2;
	setp.neu.f32	%p56, %f164, %f2;
	selp.f32	%f260, 0f7FFFFFFF, %f260, %p56;

BB25_53:
	add.f32 	%f166, %f20, %f21;
	mov.b32 	 %r26, %f166;
	setp.lt.s32	%p59, %r26, 2139095040;
	@%p59 bra 	BB25_60;

	setp.gtu.f32	%p60, %f20, 0f7F800000;
	setp.gtu.f32	%p61, %f21, 0f7F800000;
	or.pred  	%p62, %p60, %p61;
	@%p62 bra 	BB25_59;
	bra.uni 	BB25_55;

BB25_59:
	add.f32 	%f260, %f1, %f2;
	bra.uni 	BB25_60;

BB25_55:
	setp.eq.f32	%p63, %f21, 0f7F800000;
	@%p63 bra 	BB25_58;
	bra.uni 	BB25_56;

BB25_58:
	setp.gt.f32	%p66, %f20, 0f3F800000;
	selp.b32	%r30, 2139095040, 0, %p66;
	xor.b32  	%r31, %r30, 2139095040;
	setp.lt.f32	%p67, %f2, 0f00000000;
	selp.b32	%r32, %r31, %r30, %p67;
	mov.b32 	 %f167, %r32;
	setp.eq.f32	%p68, %f1, 0fBF800000;
	selp.f32	%f260, 0f3F800000, %f167, %p68;
	bra.uni 	BB25_60;

BB25_112:
	setp.geu.f32	%p122, %f2, 0f00000000;
	@%p122 bra 	BB25_115;

	cvt.rzi.f32.f32	%f255, %f1;
	setp.neu.f32	%p123, %f255, %f1;
	selp.f32	%f263, 0f7FFFFFFF, %f263, %p123;

BB25_115:
	add.f32 	%f257, %f57, %f58;
	mov.b32 	 %r51, %f257;
	setp.lt.s32	%p126, %r51, 2139095040;
	@%p126 bra 	BB25_122;

	setp.gtu.f32	%p127, %f57, 0f7F800000;
	setp.gtu.f32	%p128, %f58, 0f7F800000;
	or.pred  	%p129, %p127, %p128;
	@%p129 bra 	BB25_121;
	bra.uni 	BB25_117;

BB25_121:
	add.f32 	%f263, %f1, %f2;
	bra.uni 	BB25_122;

BB25_117:
	setp.eq.f32	%p130, %f58, 0f7F800000;
	@%p130 bra 	BB25_120;
	bra.uni 	BB25_118;

BB25_120:
	setp.gt.f32	%p133, %f57, 0f3F800000;
	selp.b32	%r55, 2139095040, 0, %p133;
	xor.b32  	%r56, %r55, 2139095040;
	setp.lt.f32	%p134, %f1, 0f00000000;
	selp.b32	%r57, %r56, %r55, %p134;
	mov.b32 	 %f258, %r57;
	setp.eq.f32	%p135, %f2, 0fBF800000;
	selp.f32	%f263, 0f3F800000, %f258, %p135;
	bra.uni 	BB25_122;

BB25_56:
	setp.neu.f32	%p64, %f20, 0f7F800000;
	@%p64 bra 	BB25_60;

	setp.ge.f32	%p65, %f2, 0f00000000;
	selp.b32	%r27, 2139095040, 0, %p65;
	or.b32  	%r28, %r27, -2147483648;
	selp.b32	%r29, %r28, %r27, %p1;
	mov.b32 	 %f260, %r29;

BB25_60:
	setp.eq.f32	%p69, %f2, 0f00000000;
	setp.eq.f32	%p70, %f1, 0f3F800000;
	or.pred  	%p71, %p70, %p69;
	selp.f32	%f261, 0f3F800000, %f260, %p71;

BB25_63:
	st.global.f32 	[%rd1], %f261;
	bra.uni 	BB25_126;

BB25_118:
	setp.neu.f32	%p131, %f57, 0f7F800000;
	@%p131 bra 	BB25_122;

	setp.ge.f32	%p132, %f1, 0f00000000;
	selp.b32	%r52, 2139095040, 0, %p132;
	or.b32  	%r53, %r52, -2147483648;
	selp.b32	%r54, %r53, %r52, %p2;
	mov.b32 	 %f263, %r54;

BB25_122:
	setp.eq.f32	%p136, %f1, 0f00000000;
	setp.eq.f32	%p137, %f2, 0f3F800000;
	or.pred  	%p138, %p137, %p136;
	selp.f32	%f264, 0f3F800000, %f263, %p138;

BB25_125:
	st.global.f32 	[%rd1], %f264;

BB25_126:
	bar.sync 	0;
	ret;
}

	// .globl	fill_d
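// fill_d: writes the double scalar param_1 to every element of a buffer of
// length param_2, one thread per element.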
.visible .entry fill_d(
	.param .u64 fill_d_param_0,
	.param .f64 fill_d_param_1,
	.param .u32 fill_d_param_2
)
{
	.reg .pred 	%p<2>;
	.reg .b32 	%r<6>;
	.reg .f64 	%fd<2>;
	.reg .b64 	%rd<5>;


	ld.param.u64 	%rd1, [fill_d_param_0];
	ld.param.f64 	%fd1, [fill_d_param_1];
	ld.param.u32 	%r2, [fill_d_param_2];
	mov.u32 	%r3, %ctaid.x;
	mov.u32 	%r4, %ntid.x;
	mov.u32 	%r5, %tid.x;
	mad.lo.s32 	%r1, %r4, %r3, %r5;
	setp.ge.s32	%p1, %r1, %r2;
	@%p1 bra 	BB26_2;

	cvta.to.global.u64 	%rd2, %rd1;
	mul.wide.s32 	%rd3, %r1, 8;
	add.s64 	%rd4, %rd2, %rd3;
	st.global.f64 	[%rd4], %fd1;

BB26_2:
	ret;
}

	// .globl	fill_f
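// fill_f: float variant of fill_d; the fill value arrives as a double and
// is rounded to float before the store.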
.visible .entry fill_f(
	.param .u64 fill_f_param_0,
	.param .f64 fill_f_param_1,
	.param .u32 fill_f_param_2
)
{
	.reg .pred 	%p<2>;
	.reg .f32 	%f<2>;
	.reg .b32 	%r<6>;
	.reg .f64 	%fd<2>;
	.reg .b64 	%rd<5>;


	ld.param.u64 	%rd1, [fill_f_param_0];
	ld.param.f64 	%fd1, [fill_f_param_1];
	ld.param.u32 	%r2, [fill_f_param_2];
	mov.u32 	%r3, %ctaid.x;
	mov.u32 	%r4, %ntid.x;
	mov.u32 	%r5, %tid.x;
	mad.lo.s32 	%r1, %r4, %r3, %r5;
	setp.ge.s32	%p1, %r1, %r2;
	@%p1 bra 	BB27_2;

	cvt.rn.f32.f64	%f1, %fd1;
	cvta.to.global.u64 	%rd2, %rd1;
	mul.wide.s32 	%rd3, %r1, 4;
	add.s64 	%rd4, %rd2, %rd3;
	st.global.f32 	[%rd4], %f1;

BB27_2:
	ret;
}

	// .globl	cbind_d
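// cbind_d: column-wise concatenation of two double matrices, which are
// presumed to share a row count. Parameter roles inferred from the index
// arithmetic: two inputs, the output, then rowsA, colsA, rowsB, colsB.
// Each thread maps its flat id to a (row, col) pair using max(colsA, colsB)
// as the row width and copies at most one element from each input into an
// output whose row stride is colsA + colsB.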
.visible .entry cbind_d(
	.param .u64 cbind_d_param_0,
	.param .u64 cbind_d_param_1,
	.param .u64 cbind_d_param_2,
	.param .u32 cbind_d_param_3,
	.param .u32 cbind_d_param_4,
	.param .u32 cbind_d_param_5,
	.param .u32 cbind_d_param_6
)
{
	.reg .pred 	%p<7>;
	.reg .b32 	%r<18>;
	.reg .f64 	%fd<3>;
	.reg .b64 	%rd<15>;


	ld.param.u64 	%rd2, [cbind_d_param_0];
	ld.param.u64 	%rd3, [cbind_d_param_1];
	ld.param.u64 	%rd4, [cbind_d_param_2];
	ld.param.u32 	%r7, [cbind_d_param_3];
	ld.param.u32 	%r4, [cbind_d_param_4];
	ld.param.u32 	%r5, [cbind_d_param_5];
	ld.param.u32 	%r6, [cbind_d_param_6];
	cvta.to.global.u64 	%rd1, %rd4;
	mov.u32 	%r8, %ntid.x;
	mov.u32 	%r9, %ctaid.x;
	mov.u32 	%r10, %tid.x;
	mad.lo.s32 	%r11, %r8, %r9, %r10;
	max.s32 	%r12, %r4, %r6;
	div.s32 	%r1, %r11, %r12;
	rem.s32 	%r2, %r11, %r12;
	add.s32 	%r3, %r6, %r4;
	setp.lt.s32	%p1, %r1, %r7;
	setp.lt.s32	%p2, %r2, %r4;
	and.pred  	%p3, %p1, %p2;
	@!%p3 bra 	BB28_2;
	bra.uni 	BB28_1;

BB28_1:
	cvta.to.global.u64 	%rd5, %rd2;
	mad.lo.s32 	%r13, %r1, %r4, %r2;
	mul.wide.s32 	%rd6, %r13, 8;
	add.s64 	%rd7, %rd5, %rd6;
	ld.global.f64 	%fd1, [%rd7];
	mad.lo.s32 	%r14, %r1, %r3, %r2;
	mul.wide.s32 	%rd8, %r14, 8;
	add.s64 	%rd9, %rd1, %rd8;
	st.global.f64 	[%rd9], %fd1;

BB28_2:
	setp.lt.s32	%p4, %r1, %r5;
	setp.lt.s32	%p5, %r2, %r6;
	and.pred  	%p6, %p4, %p5;
	@!%p6 bra 	BB28_4;
	bra.uni 	BB28_3;

BB28_3:
	cvta.to.global.u64 	%rd10, %rd3;
	mad.lo.s32 	%r15, %r1, %r6, %r2;
	mul.wide.s32 	%rd11, %r15, 8;
	add.s64 	%rd12, %rd10, %rd11;
	ld.global.f64 	%fd2, [%rd12];
	add.s32 	%r16, %r2, %r4;
	mad.lo.s32 	%r17, %r1, %r3, %r16;
	mul.wide.s32 	%rd13, %r17, 8;
	add.s64 	%rd14, %rd1, %rd13;
	st.global.f64 	[%rd14], %fd2;

BB28_4:
	ret;
}

	// .globl	cbind_f
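// cbind_f: float variant of cbind_d (4-byte element stride).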
.visible .entry cbind_f(
	.param .u64 cbind_f_param_0,
	.param .u64 cbind_f_param_1,
	.param .u64 cbind_f_param_2,
	.param .u32 cbind_f_param_3,
	.param .u32 cbind_f_param_4,
	.param .u32 cbind_f_param_5,
	.param .u32 cbind_f_param_6
)
{
	.reg .pred 	%p<7>;
	.reg .f32 	%f<3>;
	.reg .b32 	%r<18>;
	.reg .b64 	%rd<15>;


	ld.param.u64 	%rd2, [cbind_f_param_0];
	ld.param.u64 	%rd3, [cbind_f_param_1];
	ld.param.u64 	%rd4, [cbind_f_param_2];
	ld.param.u32 	%r7, [cbind_f_param_3];
	ld.param.u32 	%r4, [cbind_f_param_4];
	ld.param.u32 	%r5, [cbind_f_param_5];
	ld.param.u32 	%r6, [cbind_f_param_6];
	cvta.to.global.u64 	%rd1, %rd4;
	mov.u32 	%r8, %ntid.x;
	mov.u32 	%r9, %ctaid.x;
	mov.u32 	%r10, %tid.x;
	mad.lo.s32 	%r11, %r8, %r9, %r10;
	max.s32 	%r12, %r4, %r6;
	div.s32 	%r1, %r11, %r12;
	rem.s32 	%r2, %r11, %r12;
	add.s32 	%r3, %r6, %r4;
	setp.lt.s32	%p1, %r1, %r7;
	setp.lt.s32	%p2, %r2, %r4;
	and.pred  	%p3, %p1, %p2;
	@!%p3 bra 	BB29_2;
	bra.uni 	BB29_1;

BB29_1:
	cvta.to.global.u64 	%rd5, %rd2;
	mad.lo.s32 	%r13, %r1, %r4, %r2;
	mul.wide.s32 	%rd6, %r13, 4;
	add.s64 	%rd7, %rd5, %rd6;
	ld.global.f32 	%f1, [%rd7];
	mad.lo.s32 	%r14, %r1, %r3, %r2;
	mul.wide.s32 	%rd8, %r14, 4;
	add.s64 	%rd9, %rd1, %rd8;
	st.global.f32 	[%rd9], %f1;

BB29_2:
	setp.lt.s32	%p4, %r1, %r5;
	setp.lt.s32	%p5, %r2, %r6;
	and.pred  	%p6, %p4, %p5;
	@!%p6 bra 	BB29_4;
	bra.uni 	BB29_3;

BB29_3:
	cvta.to.global.u64 	%rd10, %rd3;
	mad.lo.s32 	%r15, %r1, %r6, %r2;
	mul.wide.s32 	%rd11, %r15, 4;
	add.s64 	%rd12, %rd10, %rd11;
	ld.global.f32 	%f2, [%rd12];
	add.s32 	%r16, %r2, %r4;
	mad.lo.s32 	%r17, %r1, %r3, %r16;
	mul.wide.s32 	%rd13, %r17, 4;
	add.s64 	%rd14, %rd1, %rd13;
	st.global.f32 	[%rd14], %f2;

BB29_4:
	ret;
}

	// .globl	rbind_d
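// rbind_d: row-wise concatenation of two double matrices, with the same
// inferred parameter layout as cbind_d (inputs, output, rowsA, colsA,
// rowsB, colsB). Rows of the second input land at row offset rowsA; the
// output row stride is colsA in both copies, which presumes colsA == colsB.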
.visible .entry rbind_d(
	.param .u64 rbind_d_param_0,
	.param .u64 rbind_d_param_1,
	.param .u64 rbind_d_param_2,
	.param .u32 rbind_d_param_3,
	.param .u32 rbind_d_param_4,
	.param .u32 rbind_d_param_5,
	.param .u32 rbind_d_param_6
)
{
	.reg .pred 	%p<7>;
	.reg .b32 	%r<16>;
	.reg .f64 	%fd<3>;
	.reg .b64 	%rd<14>;


	ld.param.u64 	%rd2, [rbind_d_param_0];
	ld.param.u64 	%rd3, [rbind_d_param_1];
	ld.param.u64 	%rd4, [rbind_d_param_2];
	ld.param.u32 	%r3, [rbind_d_param_3];
	ld.param.u32 	%r4, [rbind_d_param_4];
	ld.param.u32 	%r5, [rbind_d_param_5];
	ld.param.u32 	%r6, [rbind_d_param_6];
	cvta.to.global.u64 	%rd1, %rd4;
	mov.u32 	%r7, %ntid.x;
	mov.u32 	%r8, %ctaid.x;
	mov.u32 	%r9, %tid.x;
	mad.lo.s32 	%r10, %r7, %r8, %r9;
	max.s32 	%r11, %r4, %r6;
	div.s32 	%r1, %r10, %r11;
	rem.s32 	%r2, %r10, %r11;
	setp.lt.s32	%p1, %r1, %r3;
	setp.lt.s32	%p2, %r2, %r4;
	and.pred  	%p3, %p1, %p2;
	@!%p3 bra 	BB30_2;
	bra.uni 	BB30_1;

BB30_1:
	cvta.to.global.u64 	%rd5, %rd2;
	mad.lo.s32 	%r12, %r1, %r4, %r2;
	mul.wide.s32 	%rd6, %r12, 8;
	add.s64 	%rd7, %rd5, %rd6;
	ld.global.f64 	%fd1, [%rd7];
	add.s64 	%rd8, %rd1, %rd6;
	st.global.f64 	[%rd8], %fd1;

BB30_2:
	setp.lt.s32	%p4, %r1, %r5;
	setp.lt.s32	%p5, %r2, %r6;
	and.pred  	%p6, %p4, %p5;
	@!%p6 bra 	BB30_4;
	bra.uni 	BB30_3;

BB30_3:
	cvta.to.global.u64 	%rd9, %rd3;
	mad.lo.s32 	%r13, %r1, %r6, %r2;
	mul.wide.s32 	%rd10, %r13, 8;
	add.s64 	%rd11, %rd9, %rd10;
	ld.global.f64 	%fd2, [%rd11];
	add.s32 	%r14, %r1, %r3;
	mad.lo.s32 	%r15, %r14, %r4, %r2;
	mul.wide.s32 	%rd12, %r15, 8;
	add.s64 	%rd13, %rd1, %rd12;
	st.global.f64 	[%rd13], %fd2;

BB30_4:
	ret;
}

	// .globl	rbind_f
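// rbind_f: float variant of rbind_d.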
.visible .entry rbind_f(
	.param .u64 rbind_f_param_0,
	.param .u64 rbind_f_param_1,
	.param .u64 rbind_f_param_2,
	.param .u32 rbind_f_param_3,
	.param .u32 rbind_f_param_4,
	.param .u32 rbind_f_param_5,
	.param .u32 rbind_f_param_6
)
{
	.reg .pred 	%p<7>;
	.reg .f32 	%f<3>;
	.reg .b32 	%r<16>;
	.reg .b64 	%rd<14>;


	ld.param.u64 	%rd2, [rbind_f_param_0];
	ld.param.u64 	%rd3, [rbind_f_param_1];
	ld.param.u64 	%rd4, [rbind_f_param_2];
	ld.param.u32 	%r3, [rbind_f_param_3];
	ld.param.u32 	%r4, [rbind_f_param_4];
	ld.param.u32 	%r5, [rbind_f_param_5];
	ld.param.u32 	%r6, [rbind_f_param_6];
	cvta.to.global.u64 	%rd1, %rd4;
	mov.u32 	%r7, %ntid.x;
	mov.u32 	%r8, %ctaid.x;
	mov.u32 	%r9, %tid.x;
	mad.lo.s32 	%r10, %r7, %r8, %r9;
	max.s32 	%r11, %r4, %r6;
	div.s32 	%r1, %r10, %r11;
	rem.s32 	%r2, %r10, %r11;
	setp.lt.s32	%p1, %r1, %r3;
	setp.lt.s32	%p2, %r2, %r4;
	and.pred  	%p3, %p1, %p2;
	@!%p3 bra 	BB31_2;
	bra.uni 	BB31_1;

BB31_1:
	cvta.to.global.u64 	%rd5, %rd2;
	mad.lo.s32 	%r12, %r1, %r4, %r2;
	mul.wide.s32 	%rd6, %r12, 4;
	add.s64 	%rd7, %rd5, %rd6;
	ld.global.f32 	%f1, [%rd7];
	add.s64 	%rd8, %rd1, %rd6;
	st.global.f32 	[%rd8], %f1;

BB31_2:
	setp.lt.s32	%p4, %r1, %r5;
	setp.lt.s32	%p5, %r2, %r6;
	and.pred  	%p6, %p4, %p5;
	@!%p6 bra 	BB31_4;
	bra.uni 	BB31_3;

BB31_3:
	cvta.to.global.u64 	%rd9, %rd3;
	mad.lo.s32 	%r13, %r1, %r6, %r2;
	mul.wide.s32 	%rd10, %r13, 4;
	add.s64 	%rd11, %rd9, %rd10;
	ld.global.f32 	%f2, [%rd11];
	add.s32 	%r14, %r1, %r3;
	mad.lo.s32 	%r15, %r14, %r4, %r2;
	mul.wide.s32 	%rd12, %r15, 4;
	add.s64 	%rd13, %rd1, %rd12;
	st.global.f32 	[%rd13], %f2;

BB31_4:
	ret;
}

	// .globl	reduce_sum_d
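// reduce_sum_d: partial sum reduction over a double vector. Each block
// accumulates a grid-stride range (two elements per thread per step) into
// the dynamic shared buffer my_sdata, tree-reduces it with the last warp
// unrolled on volatile accesses, and thread 0 writes one partial sum per
// block, so a second pass over the per-block results is still required.
// A CUDA-level sketch of what this PTX appears to implement, assuming a
// power-of-two block size (reconstructed from the assembly, not the
// original source; names are hypothetical):
//
//   extern __shared__ double sdata[];
//   __global__ void reduce_sum(double *in, double *out, unsigned n) {
//       unsigned tid = threadIdx.x;
//       unsigned i = blockIdx.x * blockDim.x * 2 + tid;
//       double v = 0.0;
//       while (i < n) {                      // grid-stride accumulation
//           v += in[i];
//           if (i + blockDim.x < n) v += in[i + blockDim.x];
//           i += gridDim.x * blockDim.x * 2;
//       }
//       sdata[tid] = v;
//       __syncthreads();
//       for (unsigned s = blockDim.x / 2; s > 32; s >>= 1) {
//           if (tid < s) sdata[tid] += sdata[tid + s];   // tree reduce
//           __syncthreads();
//       }
//       if (tid < 32) {                      // unrolled warp-synchronous tail
//           volatile double *sm = sdata;
//           if (blockDim.x >= 64) sm[tid] += sm[tid + 32];
//           if (blockDim.x >= 32) sm[tid] += sm[tid + 16];
//           if (blockDim.x >= 16) sm[tid] += sm[tid + 8];
//           if (blockDim.x >= 8)  sm[tid] += sm[tid + 4];
//           if (blockDim.x >= 4)  sm[tid] += sm[tid + 2];
//           if (blockDim.x >= 2)  sm[tid] += sm[tid + 1];
//       }
//       if (tid == 0) out[blockIdx.x] = sdata[0];        // one value per block
//   }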
.visible .entry reduce_sum_d(
	.param .u64 reduce_sum_d_param_0,
	.param .u64 reduce_sum_d_param_1,
	.param .u32 reduce_sum_d_param_2
)
{
	.reg .pred 	%p<20>;
	.reg .b32 	%r<33>;
	.reg .f64 	%fd<79>;
	.reg .b64 	%rd<15>;


	ld.param.u64 	%rd2, [reduce_sum_d_param_0];
	ld.param.u64 	%rd3, [reduce_sum_d_param_1];
	ld.param.u32 	%r5, [reduce_sum_d_param_2];
	mov.u32 	%r6, %tid.x;
	mov.u32 	%r7, %ctaid.x;
	shl.b32 	%r8, %r7, 1;
	mov.u32 	%r9, %ntid.x;
	mad.lo.s32 	%r32, %r8, %r9, %r6;
	mov.f64 	%fd76, 0d0000000000000000;
	mov.f64 	%fd77, %fd76;
	setp.ge.u32	%p1, %r32, %r5;
	@%p1 bra 	BB32_4;

BB32_1:
	mov.f64 	%fd1, %fd77;
	cvta.to.global.u64 	%rd4, %rd2;
	mul.wide.u32 	%rd5, %r32, 8;
	add.s64 	%rd6, %rd4, %rd5;
	ld.global.f64 	%fd30, [%rd6];
	add.f64 	%fd78, %fd1, %fd30;
	add.s32 	%r3, %r32, %r9;
	setp.ge.u32	%p2, %r3, %r5;
	@%p2 bra 	BB32_3;

	mul.wide.u32 	%rd8, %r3, 8;
	add.s64 	%rd9, %rd4, %rd8;
	ld.global.f64 	%fd31, [%rd9];
	add.f64 	%fd78, %fd78, %fd31;

BB32_3:
	mov.f64 	%fd77, %fd78;
	shl.b32 	%r12, %r9, 1;
	mov.u32 	%r13, %nctaid.x;
	mad.lo.s32 	%r32, %r12, %r13, %r32;
	setp.lt.u32	%p3, %r32, %r5;
	mov.f64 	%fd76, %fd77;
	@%p3 bra 	BB32_1;

BB32_4:
	mov.f64 	%fd74, %fd76;
	mul.wide.u32 	%rd10, %r6, 8;
	mov.u64 	%rd11, my_sdata;
	add.s64 	%rd1, %rd11, %rd10;
	st.shared.f64 	[%rd1], %fd74;
	bar.sync 	0;
	setp.lt.u32	%p4, %r9, 1024;
	@%p4 bra 	BB32_8;

	setp.gt.u32	%p5, %r6, 511;
	mov.f64 	%fd75, %fd74;
	@%p5 bra 	BB32_7;

	ld.shared.f64 	%fd32, [%rd1+4096];
	add.f64 	%fd75, %fd74, %fd32;
	st.shared.f64 	[%rd1], %fd75;

BB32_7:
	mov.f64 	%fd74, %fd75;
	bar.sync 	0;

BB32_8:
	mov.f64 	%fd72, %fd74;
	setp.lt.u32	%p6, %r9, 512;
	@%p6 bra 	BB32_12;

	setp.gt.u32	%p7, %r6, 255;
	mov.f64 	%fd73, %fd72;
	@%p7 bra 	BB32_11;

	ld.shared.f64 	%fd33, [%rd1+2048];
	add.f64 	%fd73, %fd72, %fd33;
	st.shared.f64 	[%rd1], %fd73;

BB32_11:
	mov.f64 	%fd72, %fd73;
	bar.sync 	0;

BB32_12:
	mov.f64 	%fd70, %fd72;
	setp.lt.u32	%p8, %r9, 256;
	@%p8 bra 	BB32_16;

	setp.gt.u32	%p9, %r6, 127;
	mov.f64 	%fd71, %fd70;
	@%p9 bra 	BB32_15;

	ld.shared.f64 	%fd34, [%rd1+1024];
	add.f64 	%fd71, %fd70, %fd34;
	st.shared.f64 	[%rd1], %fd71;

BB32_15:
	mov.f64 	%fd70, %fd71;
	bar.sync 	0;

BB32_16:
	mov.f64 	%fd68, %fd70;
	setp.lt.u32	%p10, %r9, 128;
	@%p10 bra 	BB32_20;

	setp.gt.u32	%p11, %r6, 63;
	mov.f64 	%fd69, %fd68;
	@%p11 bra 	BB32_19;

	ld.shared.f64 	%fd35, [%rd1+512];
	add.f64 	%fd69, %fd68, %fd35;
	st.shared.f64 	[%rd1], %fd69;

BB32_19:
	mov.f64 	%fd68, %fd69;
	bar.sync 	0;

BB32_20:
	mov.f64 	%fd67, %fd68;
	setp.gt.u32	%p12, %r6, 31;
	@%p12 bra 	BB32_33;

	setp.lt.u32	%p13, %r9, 64;
	@%p13 bra 	BB32_23;

	ld.volatile.shared.f64 	%fd36, [%rd1+256];
	add.f64 	%fd67, %fd67, %fd36;
	st.volatile.shared.f64 	[%rd1], %fd67;

BB32_23:
	mov.f64 	%fd66, %fd67;
	setp.lt.u32	%p14, %r9, 32;
	@%p14 bra 	BB32_25;

	ld.volatile.shared.f64 	%fd37, [%rd1+128];
	add.f64 	%fd66, %fd66, %fd37;
	st.volatile.shared.f64 	[%rd1], %fd66;

BB32_25:
	mov.f64 	%fd65, %fd66;
	setp.lt.u32	%p15, %r9, 16;
	@%p15 bra 	BB32_27;

	ld.volatile.shared.f64 	%fd38, [%rd1+64];
	add.f64 	%fd65, %fd65, %fd38;
	st.volatile.shared.f64 	[%rd1], %fd65;

BB32_27:
	mov.f64 	%fd64, %fd65;
	setp.lt.u32	%p16, %r9, 8;
	@%p16 bra 	BB32_29;

	ld.volatile.shared.f64 	%fd39, [%rd1+32];
	add.f64 	%fd64, %fd64, %fd39;
	st.volatile.shared.f64 	[%rd1], %fd64;

BB32_29:
	mov.f64 	%fd63, %fd64;
	setp.lt.u32	%p17, %r9, 4;
	@%p17 bra 	BB32_31;

	ld.volatile.shared.f64 	%fd40, [%rd1+16];
	add.f64 	%fd63, %fd63, %fd40;
	st.volatile.shared.f64 	[%rd1], %fd63;

BB32_31:
	setp.lt.u32	%p18, %r9, 2;
	@%p18 bra 	BB32_33;

	ld.volatile.shared.f64 	%fd41, [%rd1+8];
	add.f64 	%fd42, %fd63, %fd41;
	st.volatile.shared.f64 	[%rd1], %fd42;

BB32_33:
	setp.ne.s32	%p19, %r6, 0;
	@%p19 bra 	BB32_35;

	ld.shared.f64 	%fd43, [my_sdata];
	cvta.to.global.u64 	%rd12, %rd3;
	mul.wide.u32 	%rd13, %r7, 8;
	add.s64 	%rd14, %rd12, %rd13;
	st.global.f64 	[%rd14], %fd43;

BB32_35:
	ret;
}

	// .globl	reduce_sum_f
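// reduce_sum_f: float variant of reduce_sum_d; identical reduction strategy
// with 4-byte shared-memory strides.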
.visible .entry reduce_sum_f(
	.param .u64 reduce_sum_f_param_0,
	.param .u64 reduce_sum_f_param_1,
	.param .u32 reduce_sum_f_param_2
)
{
	.reg .pred 	%p<20>;
	.reg .f32 	%f<79>;
	.reg .b32 	%r<33>;
	.reg .b64 	%rd<15>;


	ld.param.u64 	%rd2, [reduce_sum_f_param_0];
	ld.param.u64 	%rd3, [reduce_sum_f_param_1];
	ld.param.u32 	%r5, [reduce_sum_f_param_2];
	mov.u32 	%r6, %tid.x;
	mov.u32 	%r7, %ctaid.x;
	shl.b32 	%r8, %r7, 1;
	mov.u32 	%r9, %ntid.x;
	mad.lo.s32 	%r32, %r8, %r9, %r6;
	mov.f32 	%f76, 0f00000000;
	mov.f32 	%f77, %f76;
	setp.ge.u32	%p1, %r32, %r5;
	@%p1 bra 	BB33_4;

BB33_1:
	mov.f32 	%f1, %f77;
	cvta.to.global.u64 	%rd4, %rd2;
	mul.wide.u32 	%rd5, %r32, 4;
	add.s64 	%rd6, %rd4, %rd5;
	ld.global.f32 	%f30, [%rd6];
	add.f32 	%f78, %f1, %f30;
	add.s32 	%r3, %r32, %r9;
	setp.ge.u32	%p2, %r3, %r5;
	@%p2 bra 	BB33_3;

	mul.wide.u32 	%rd8, %r3, 4;
	add.s64 	%rd9, %rd4, %rd8;
	ld.global.f32 	%f31, [%rd9];
	add.f32 	%f78, %f78, %f31;

BB33_3:
	mov.f32 	%f77, %f78;
	shl.b32 	%r12, %r9, 1;
	mov.u32 	%r13, %nctaid.x;
	mad.lo.s32 	%r32, %r12, %r13, %r32;
	setp.lt.u32	%p3, %r32, %r5;
	mov.f32 	%f76, %f77;
	@%p3 bra 	BB33_1;

BB33_4:
	mov.f32 	%f74, %f76;
	mul.wide.u32 	%rd10, %r6, 4;
	mov.u64 	%rd11, my_sdata;
	add.s64 	%rd1, %rd11, %rd10;
	st.shared.f32 	[%rd1], %f74;
	bar.sync 	0;
	setp.lt.u32	%p4, %r9, 1024;
	@%p4 bra 	BB33_8;

	setp.gt.u32	%p5, %r6, 511;
	mov.f32 	%f75, %f74;
	@%p5 bra 	BB33_7;

	ld.shared.f32 	%f32, [%rd1+2048];
	add.f32 	%f75, %f74, %f32;
	st.shared.f32 	[%rd1], %f75;

BB33_7:
	mov.f32 	%f74, %f75;
	bar.sync 	0;

BB33_8:
	mov.f32 	%f72, %f74;
	setp.lt.u32	%p6, %r9, 512;
	@%p6 bra 	BB33_12;

	setp.gt.u32	%p7, %r6, 255;
	mov.f32 	%f73, %f72;
	@%p7 bra 	BB33_11;

	ld.shared.f32 	%f33, [%rd1+1024];
	add.f32 	%f73, %f72, %f33;
	st.shared.f32 	[%rd1], %f73;

BB33_11:
	mov.f32 	%f72, %f73;
	bar.sync 	0;

BB33_12:
	mov.f32 	%f70, %f72;
	setp.lt.u32	%p8, %r9, 256;
	@%p8 bra 	BB33_16;

	setp.gt.u32	%p9, %r6, 127;
	mov.f32 	%f71, %f70;
	@%p9 bra 	BB33_15;

	ld.shared.f32 	%f34, [%rd1+512];
	add.f32 	%f71, %f70, %f34;
	st.shared.f32 	[%rd1], %f71;

BB33_15:
	mov.f32 	%f70, %f71;
	bar.sync 	0;

BB33_16:
	mov.f32 	%f68, %f70;
	setp.lt.u32	%p10, %r9, 128;
	@%p10 bra 	BB33_20;

	setp.gt.u32	%p11, %r6, 63;
	mov.f32 	%f69, %f68;
	@%p11 bra 	BB33_19;

	ld.shared.f32 	%f35, [%rd1+256];
	add.f32 	%f69, %f68, %f35;
	st.shared.f32 	[%rd1], %f69;

BB33_19:
	mov.f32 	%f68, %f69;
	bar.sync 	0;

BB33_20:
	mov.f32 	%f67, %f68;
	setp.gt.u32	%p12, %r6, 31;
	@%p12 bra 	BB33_33;

	setp.lt.u32	%p13, %r9, 64;
	@%p13 bra 	BB33_23;

	ld.volatile.shared.f32 	%f36, [%rd1+128];
	add.f32 	%f67, %f67, %f36;
	st.volatile.shared.f32 	[%rd1], %f67;

BB33_23:
	mov.f32 	%f66, %f67;
	setp.lt.u32	%p14, %r9, 32;
	@%p14 bra 	BB33_25;

	ld.volatile.shared.f32 	%f37, [%rd1+64];
	add.f32 	%f66, %f66, %f37;
	st.volatile.shared.f32 	[%rd1], %f66;

BB33_25:
	mov.f32 	%f65, %f66;
	setp.lt.u32	%p15, %r9, 16;
	@%p15 bra 	BB33_27;

	ld.volatile.shared.f32 	%f38, [%rd1+32];
	add.f32 	%f65, %f65, %f38;
	st.volatile.shared.f32 	[%rd1], %f65;

BB33_27:
	mov.f32 	%f64, %f65;
	setp.lt.u32	%p16, %r9, 8;
	@%p16 bra 	BB33_29;

	ld.volatile.shared.f32 	%f39, [%rd1+16];
	add.f32 	%f64, %f64, %f39;
	st.volatile.shared.f32 	[%rd1], %f64;

BB33_29:
	mov.f32 	%f63, %f64;
	setp.lt.u32	%p17, %r9, 4;
	@%p17 bra 	BB33_31;

	ld.volatile.shared.f32 	%f40, [%rd1+8];
	add.f32 	%f63, %f63, %f40;
	st.volatile.shared.f32 	[%rd1], %f63;

BB33_31:
	setp.lt.u32	%p18, %r9, 2;
	@%p18 bra 	BB33_33;

	ld.volatile.shared.f32 	%f41, [%rd1+4];
	add.f32 	%f42, %f63, %f41;
	st.volatile.shared.f32 	[%rd1], %f42;

BB33_33:
	setp.ne.s32	%p19, %r6, 0;
	@%p19 bra 	BB33_35;

	ld.shared.f32 	%f43, [my_sdata];
	cvta.to.global.u64 	%rd12, %rd3;
	mul.wide.u32 	%rd13, %r7, 4;
	add.s64 	%rd14, %rd12, %rd13;
	st.global.f32 	[%rd14], %f43;

BB33_35:
	ret;
}

	// .globl	reduce_row_sum_d
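// reduce_row_sum_d: one block per row (blockIdx.x is the row index, bounded
// by param_2). The block's threads stride across that row's param_3 columns
// accumulating into my_sdata, run the same shared-memory tree reduction as
// reduce_sum_d, and thread 0 writes the row's sum.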
.visible .entry reduce_row_sum_d(
	.param .u64 reduce_row_sum_d_param_0,
	.param .u64 reduce_row_sum_d_param_1,
	.param .u32 reduce_row_sum_d_param_2,
	.param .u32 reduce_row_sum_d_param_3
)
{
	.reg .pred 	%p<20>;
	.reg .b32 	%r<39>;
	.reg .f64 	%fd<74>;
	.reg .b64 	%rd<42>;


	ld.param.u64 	%rd1, [reduce_row_sum_d_param_0];
	ld.param.u64 	%rd2, [reduce_row_sum_d_param_1];
	ld.param.u32 	%r5, [reduce_row_sum_d_param_2];
	ld.param.u32 	%r4, [reduce_row_sum_d_param_3];
	mov.u32 	%r6, %ctaid.x;
	setp.ge.u32	%p1, %r6, %r5;
	@%p1 bra 	BB34_35;

	mov.u32 	%r38, %tid.x;
	mov.f64 	%fd72, 0d0000000000000000;
	mov.f64 	%fd73, %fd72;
	setp.ge.u32	%p2, %r38, %r4;
	@%p2 bra 	BB34_4;

	cvta.to.global.u64 	%rd3, %rd1;

BB34_3:
	mad.lo.s32 	%r8, %r6, %r4, %r38;
	mul.wide.u32 	%rd4, %r8, 8;
	add.s64 	%rd5, %rd3, %rd4;
	ld.global.f64 	%fd28, [%rd5];
	add.f64 	%fd73, %fd73, %fd28;
	mov.u32 	%r9, %ntid.x;
	add.s32 	%r38, %r9, %r38;
	setp.lt.u32	%p3, %r38, %r4;
	mov.f64 	%fd72, %fd73;
	@%p3 bra 	BB34_3;

BB34_4:
	mov.f64 	%fd70, %fd72;
	mov.u32 	%r10, %tid.x;
	mul.wide.u32 	%rd6, %r10, 8;
	mov.u64 	%rd7, my_sdata;
	add.s64 	%rd8, %rd7, %rd6;
	st.shared.f64 	[%rd8], %fd70;
	bar.sync 	0;
	mov.u32 	%r11, %ntid.x;
	setp.lt.u32	%p4, %r11, 1024;
	@%p4 bra 	BB34_8;

	setp.gt.u32	%p5, %r10, 511;
	mov.f64 	%fd71, %fd70;
	@%p5 bra 	BB34_7;

	ld.shared.f64 	%fd29, [%rd8+4096];
	add.f64 	%fd71, %fd70, %fd29;
	st.shared.f64 	[%rd8], %fd71;

BB34_7:
	mov.f64 	%fd70, %fd71;
	bar.sync 	0;

BB34_8:
	mov.f64 	%fd68, %fd70;
	setp.lt.u32	%p6, %r11, 512;
	@%p6 bra 	BB34_12;

	setp.gt.u32	%p7, %r10, 255;
	mov.f64 	%fd69, %fd68;
	@%p7 bra 	BB34_11;

	ld.shared.f64 	%fd30, [%rd8+2048];
	add.f64 	%fd69, %fd68, %fd30;
	st.shared.f64 	[%rd8], %fd69;

BB34_11:
	mov.f64 	%fd68, %fd69;
	bar.sync 	0;

BB34_12:
	mov.f64 	%fd66, %fd68;
	setp.lt.u32	%p8, %r11, 256;
	@%p8 bra 	BB34_16;

	setp.gt.u32	%p9, %r10, 127;
	mov.f64 	%fd67, %fd66;
	@%p9 bra 	BB34_15;

	ld.shared.f64 	%fd31, [%rd8+1024];
	add.f64 	%fd67, %fd66, %fd31;
	st.shared.f64 	[%rd8], %fd67;

BB34_15:
	mov.f64 	%fd66, %fd67;
	bar.sync 	0;

BB34_16:
	mov.f64 	%fd64, %fd66;
	setp.lt.u32	%p10, %r11, 128;
	@%p10 bra 	BB34_20;

	setp.gt.u32	%p11, %r10, 63;
	mov.f64 	%fd65, %fd64;
	@%p11 bra 	BB34_19;

	ld.shared.f64 	%fd32, [%rd8+512];
	add.f64 	%fd65, %fd64, %fd32;
	st.shared.f64 	[%rd8], %fd65;

BB34_19:
	mov.f64 	%fd64, %fd65;
	bar.sync 	0;

BB34_20:
	mov.f64 	%fd63, %fd64;
	setp.gt.u32	%p12, %r10, 31;
	@%p12 bra 	BB34_33;

	setp.lt.u32	%p13, %r11, 64;
	@%p13 bra 	BB34_23;

	ld.volatile.shared.f64 	%fd33, [%rd8+256];
	add.f64 	%fd63, %fd63, %fd33;
	st.volatile.shared.f64 	[%rd8], %fd63;

BB34_23:
	mov.f64 	%fd62, %fd63;
	setp.lt.u32	%p14, %r11, 32;
	@%p14 bra 	BB34_25;

	ld.volatile.shared.f64 	%fd34, [%rd8+128];
	add.f64 	%fd62, %fd62, %fd34;
	st.volatile.shared.f64 	[%rd8], %fd62;

BB34_25:
	mov.f64 	%fd61, %fd62;
	setp.lt.u32	%p15, %r11, 16;
	@%p15 bra 	BB34_27;

	ld.volatile.shared.f64 	%fd35, [%rd8+64];
	add.f64 	%fd61, %fd61, %fd35;
	st.volatile.shared.f64 	[%rd8], %fd61;

BB34_27:
	mov.f64 	%fd60, %fd61;
	setp.lt.u32	%p16, %r11, 8;
	@%p16 bra 	BB34_29;

	ld.volatile.shared.f64 	%fd36, [%rd8+32];
	add.f64 	%fd60, %fd60, %fd36;
	st.volatile.shared.f64 	[%rd8], %fd60;

BB34_29:
	mov.f64 	%fd59, %fd60;
	setp.lt.u32	%p17, %r11, 4;
	@%p17 bra 	BB34_31;

	ld.volatile.shared.f64 	%fd37, [%rd8+16];
	add.f64 	%fd59, %fd59, %fd37;
	st.volatile.shared.f64 	[%rd8], %fd59;

BB34_31:
	setp.lt.u32	%p18, %r11, 2;
	@%p18 bra 	BB34_33;

	ld.volatile.shared.f64 	%fd38, [%rd8+8];
	add.f64 	%fd39, %fd59, %fd38;
	st.volatile.shared.f64 	[%rd8], %fd39;

BB34_33:
	setp.ne.s32	%p19, %r10, 0;
	@%p19 bra 	BB34_35;

	ld.shared.f64 	%fd40, [my_sdata];
	cvta.to.global.u64 	%rd39, %rd2;
	mul.wide.u32 	%rd40, %r6, 8;
	add.s64 	%rd41, %rd39, %rd40;
	st.global.f64 	[%rd41], %fd40;

BB34_35:
	ret;
}

	// .globl	reduce_row_sum_f
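// reduce_row_sum_f: float variant of reduce_row_sum_d.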
.visible .entry reduce_row_sum_f(
	.param .u64 reduce_row_sum_f_param_0,
	.param .u64 reduce_row_sum_f_param_1,
	.param .u32 reduce_row_sum_f_param_2,
	.param .u32 reduce_row_sum_f_param_3
)
{
	.reg .pred 	%p<20>;
	.reg .f32 	%f<74>;
	.reg .b32 	%r<39>;
	.reg .b64 	%rd<42>;


	ld.param.u64 	%rd1, [reduce_row_sum_f_param_0];
	ld.param.u64 	%rd2, [reduce_row_sum_f_param_1];
	ld.param.u32 	%r5, [reduce_row_sum_f_param_2];
	ld.param.u32 	%r4, [reduce_row_sum_f_param_3];
	mov.u32 	%r6, %ctaid.x;
	setp.ge.u32	%p1, %r6, %r5;
	@%p1 bra 	BB35_35;

	mov.u32 	%r38, %tid.x;
	mov.f32 	%f72, 0f00000000;
	mov.f32 	%f73, %f72;
	setp.ge.u32	%p2, %r38, %r4;
	@%p2 bra 	BB35_4;

	cvta.to.global.u64 	%rd3, %rd1;

BB35_3:
	mad.lo.s32 	%r8, %r6, %r4, %r38;
	mul.wide.u32 	%rd4, %r8, 4;
	add.s64 	%rd5, %rd3, %rd4;
	ld.global.f32 	%f28, [%rd5];
	add.f32 	%f73, %f73, %f28;
	mov.u32 	%r9, %ntid.x;
	add.s32 	%r38, %r9, %r38;
	setp.lt.u32	%p3, %r38, %r4;
	mov.f32 	%f72, %f73;
	@%p3 bra 	BB35_3;

BB35_4:
	mov.f32 	%f70, %f72;
	mov.u32 	%r10, %tid.x;
	mul.wide.u32 	%rd6, %r10, 4;
	mov.u64 	%rd7, my_sdata;
	add.s64 	%rd8, %rd7, %rd6;
	st.shared.f32 	[%rd8], %f70;
	bar.sync 	0;
	mov.u32 	%r11, %ntid.x;
	setp.lt.u32	%p4, %r11, 1024;
	@%p4 bra 	BB35_8;

	setp.gt.u32	%p5, %r10, 511;
	mov.f32 	%f71, %f70;
	@%p5 bra 	BB35_7;

	ld.shared.f32 	%f29, [%rd8+2048];
	add.f32 	%f71, %f70, %f29;
	st.shared.f32 	[%rd8], %f71;

BB35_7:
	mov.f32 	%f70, %f71;
	bar.sync 	0;

BB35_8:
	mov.f32 	%f68, %f70;
	setp.lt.u32	%p6, %r11, 512;
	@%p6 bra 	BB35_12;

	setp.gt.u32	%p7, %r10, 255;
	mov.f32 	%f69, %f68;
	@%p7 bra 	BB35_11;

	ld.shared.f32 	%f30, [%rd8+1024];
	add.f32 	%f69, %f68, %f30;
	st.shared.f32 	[%rd8], %f69;

BB35_11:
	mov.f32 	%f68, %f69;
	bar.sync 	0;

BB35_12:
	mov.f32 	%f66, %f68;
	setp.lt.u32	%p8, %r11, 256;
	@%p8 bra 	BB35_16;

	setp.gt.u32	%p9, %r10, 127;
	mov.f32 	%f67, %f66;
	@%p9 bra 	BB35_15;

	ld.shared.f32 	%f31, [%rd8+512];
	add.f32 	%f67, %f66, %f31;
	st.shared.f32 	[%rd8], %f67;

BB35_15:
	mov.f32 	%f66, %f67;
	bar.sync 	0;

BB35_16:
	mov.f32 	%f64, %f66;
	setp.lt.u32	%p10, %r11, 128;
	@%p10 bra 	BB35_20;

	setp.gt.u32	%p11, %r10, 63;
	mov.f32 	%f65, %f64;
	@%p11 bra 	BB35_19;

	ld.shared.f32 	%f32, [%rd8+256];
	add.f32 	%f65, %f64, %f32;
	st.shared.f32 	[%rd8], %f65;

BB35_19:
	mov.f32 	%f64, %f65;
	bar.sync 	0;

BB35_20:
	mov.f32 	%f63, %f64;
	setp.gt.u32	%p12, %r10, 31;
	@%p12 bra 	BB35_33;

	setp.lt.u32	%p13, %r11, 64;
	@%p13 bra 	BB35_23;

	ld.volatile.shared.f32 	%f33, [%rd8+128];
	add.f32 	%f63, %f63, %f33;
	st.volatile.shared.f32 	[%rd8], %f63;

BB35_23:
	mov.f32 	%f62, %f63;
	setp.lt.u32	%p14, %r11, 32;
	@%p14 bra 	BB35_25;

	ld.volatile.shared.f32 	%f34, [%rd8+64];
	add.f32 	%f62, %f62, %f34;
	st.volatile.shared.f32 	[%rd8], %f62;

BB35_25:
	mov.f32 	%f61, %f62;
	setp.lt.u32	%p15, %r11, 16;
	@%p15 bra 	BB35_27;

	ld.volatile.shared.f32 	%f35, [%rd8+32];
	add.f32 	%f61, %f61, %f35;
	st.volatile.shared.f32 	[%rd8], %f61;

BB35_27:
	mov.f32 	%f60, %f61;
	setp.lt.u32	%p16, %r11, 8;
	@%p16 bra 	BB35_29;

	ld.volatile.shared.f32 	%f36, [%rd8+16];
	add.f32 	%f60, %f60, %f36;
	st.volatile.shared.f32 	[%rd8], %f60;

BB35_29:
	mov.f32 	%f59, %f60;
	setp.lt.u32	%p17, %r11, 4;
	@%p17 bra 	BB35_31;

	ld.volatile.shared.f32 	%f37, [%rd8+8];
	add.f32 	%f59, %f59, %f37;
	st.volatile.shared.f32 	[%rd8], %f59;

BB35_31:
	setp.lt.u32	%p18, %r11, 2;
	@%p18 bra 	BB35_33;

	ld.volatile.shared.f32 	%f38, [%rd8+4];
	add.f32 	%f39, %f59, %f38;
	st.volatile.shared.f32 	[%rd8], %f39;

BB35_33:
	setp.ne.s32	%p19, %r10, 0;
	@%p19 bra 	BB35_35;

	ld.shared.f32 	%f40, [my_sdata];
	cvta.to.global.u64 	%rd39, %rd2;
	mul.wide.u32 	%rd40, %r6, 4;
	add.s64 	%rd41, %rd39, %rd40;
	st.global.f32 	[%rd41], %f40;

BB35_35:
	ret;
}

	// .globl	reduce_col_sum_d
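// reduce_col_sum_d: column-wise sum of a double matrix (rows x cols,
// row-major). No shared memory is needed: each thread owns one output column
// and walks down it with stride `cols`. A plausible CUDA-level equivalent
// (hypothetical reconstruction, not the original source):
//
//   __global__ void reduce_col_sum_d(double *g_in, double *g_out,
//                                    unsigned rows, unsigned cols) {
//     unsigned col = blockIdx.x * blockDim.x + threadIdx.x;
//     if (col >= cols) return;
//     double acc = 0.0;
//     for (unsigned i = col; i < rows * cols; i += cols)  // down the column
//       acc += g_in[i];
//     g_out[col] = acc;
//   }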
.visible .entry reduce_col_sum_d(
	.param .u64 reduce_col_sum_d_param_0,
	.param .u64 reduce_col_sum_d_param_1,
	.param .u32 reduce_col_sum_d_param_2,
	.param .u32 reduce_col_sum_d_param_3
)
{
	.reg .pred 	%p<4>;
	.reg .b32 	%r<11>;
	.reg .f64 	%fd<10>;
	.reg .b64 	%rd<9>;


	ld.param.u64 	%rd2, [reduce_col_sum_d_param_0];
	ld.param.u64 	%rd3, [reduce_col_sum_d_param_1];
	ld.param.u32 	%r5, [reduce_col_sum_d_param_2];
	ld.param.u32 	%r6, [reduce_col_sum_d_param_3];
	mov.u32 	%r7, %ntid.x;
	mov.u32 	%r8, %ctaid.x;
	mov.u32 	%r9, %tid.x;
	mad.lo.s32 	%r1, %r7, %r8, %r9;
	setp.ge.u32	%p1, %r1, %r6;
	@%p1 bra 	BB36_5;

	cvta.to.global.u64 	%rd1, %rd2;
	mul.lo.s32 	%r2, %r6, %r5;
	mov.f64 	%fd8, 0d0000000000000000;
	mov.f64 	%fd9, %fd8;
	setp.ge.u32	%p2, %r1, %r2;
	@%p2 bra 	BB36_4;

	mov.u32 	%r10, %r1;

BB36_3:
	mov.u32 	%r3, %r10;
	mul.wide.u32 	%rd4, %r3, 8;
	add.s64 	%rd5, %rd1, %rd4;
	ld.global.f64 	%fd6, [%rd5];
	add.f64 	%fd9, %fd9, %fd6;
	add.s32 	%r4, %r3, %r6;
	setp.lt.u32	%p3, %r4, %r2;
	mov.u32 	%r10, %r4;
	mov.f64 	%fd8, %fd9;
	@%p3 bra 	BB36_3;

BB36_4:
	cvta.to.global.u64 	%rd6, %rd3;
	mul.wide.u32 	%rd7, %r1, 8;
	add.s64 	%rd8, %rd6, %rd7;
	st.global.f64 	[%rd8], %fd8;

BB36_5:
	ret;
}

	// .globl	reduce_col_sum_f
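// reduce_col_sum_f: single-precision twin of reduce_col_sum_d above
// (float loads/stores, 4-byte element stride).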
.visible .entry reduce_col_sum_f(
	.param .u64 reduce_col_sum_f_param_0,
	.param .u64 reduce_col_sum_f_param_1,
	.param .u32 reduce_col_sum_f_param_2,
	.param .u32 reduce_col_sum_f_param_3
)
{
	.reg .pred 	%p<4>;
	.reg .f32 	%f<10>;
	.reg .b32 	%r<11>;
	.reg .b64 	%rd<9>;


	ld.param.u64 	%rd2, [reduce_col_sum_f_param_0];
	ld.param.u64 	%rd3, [reduce_col_sum_f_param_1];
	ld.param.u32 	%r5, [reduce_col_sum_f_param_2];
	ld.param.u32 	%r6, [reduce_col_sum_f_param_3];
	mov.u32 	%r7, %ntid.x;
	mov.u32 	%r8, %ctaid.x;
	mov.u32 	%r9, %tid.x;
	mad.lo.s32 	%r1, %r7, %r8, %r9;
	setp.ge.u32	%p1, %r1, %r6;
	@%p1 bra 	BB37_5;

	cvta.to.global.u64 	%rd1, %rd2;
	mul.lo.s32 	%r2, %r6, %r5;
	mov.f32 	%f8, 0f00000000;
	mov.f32 	%f9, %f8;
	setp.ge.u32	%p2, %r1, %r2;
	@%p2 bra 	BB37_4;

	mov.u32 	%r10, %r1;

BB37_3:
	mov.u32 	%r3, %r10;
	mul.wide.u32 	%rd4, %r3, 4;
	add.s64 	%rd5, %rd1, %rd4;
	ld.global.f32 	%f6, [%rd5];
	add.f32 	%f9, %f9, %f6;
	add.s32 	%r4, %r3, %r6;
	setp.lt.u32	%p3, %r4, %r2;
	mov.u32 	%r10, %r4;
	mov.f32 	%f8, %f9;
	@%p3 bra 	BB37_3;

BB37_4:
	cvta.to.global.u64 	%rd6, %rd3;
	mul.wide.u32 	%rd7, %r1, 4;
	add.s64 	%rd8, %rd6, %rd7;
	st.global.f32 	[%rd8], %f8;

BB37_5:
	ret;
}

	// .globl	reduce_max_d
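// reduce_max_d: block-level max reduction over a double vector of length n.
// Each thread grid-strides over the input taking two elements per step, the
// block then folds its partials through my_sdata exactly as in the row
// kernels above, and thread 0 emits one partial maximum per block
// (presumably reduced further by a follow-up launch). The accumulator starts
// at -DBL_MAX (0dFFEFFFFFFFFFFFFF). A plausible CUDA-level source, condensed
// into loops (hypothetical reconstruction):
//
//   #include <float.h>                      // DBL_MAX
//   extern __shared__ double my_sdata[];
//   __global__ void reduce_max_d(double *g_in, double *g_out, unsigned n) {
//     unsigned tid = threadIdx.x;
//     unsigned i = blockIdx.x * blockDim.x * 2 + tid;
//     double v = -DBL_MAX;
//     while (i < n) {                       // two loads per grid-stride step
//       v = fmax(v, g_in[i]);
//       if (i + blockDim.x < n) v = fmax(v, g_in[i + blockDim.x]);
//       i += 2 * blockDim.x * gridDim.x;
//     }
//     my_sdata[tid] = v;
//     __syncthreads();
//     for (unsigned s = 512; s > 32; s >>= 1) {
//       if (blockDim.x >= 2 * s && tid < s)
//         my_sdata[tid] = v = fmax(v, my_sdata[tid + s]);
//       __syncthreads();
//     }
//     if (tid < 32) {                       // warp-synchronous tail
//       volatile double *sm = my_sdata;
//       for (unsigned s = 32; s > 0; s >>= 1)
//         if (blockDim.x >= 2 * s) sm[tid] = v = fmax(v, sm[tid + s]);
//     }
//     if (tid == 0) g_out[blockIdx.x] = my_sdata[0];  // one partial per block
//   }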
.visible .entry reduce_max_d(
	.param .u64 reduce_max_d_param_0,
	.param .u64 reduce_max_d_param_1,
	.param .u32 reduce_max_d_param_2
)
{
	.reg .pred 	%p<20>;
	.reg .b32 	%r<33>;
	.reg .f64 	%fd<79>;
	.reg .b64 	%rd<15>;


	ld.param.u64 	%rd2, [reduce_max_d_param_0];
	ld.param.u64 	%rd3, [reduce_max_d_param_1];
	ld.param.u32 	%r5, [reduce_max_d_param_2];
	mov.u32 	%r6, %tid.x;
	mov.u32 	%r7, %ctaid.x;
	shl.b32 	%r8, %r7, 1;
	mov.u32 	%r9, %ntid.x;
	mad.lo.s32 	%r32, %r8, %r9, %r6;
	mov.f64 	%fd76, 0dFFEFFFFFFFFFFFFF;
	mov.f64 	%fd77, %fd76;
	setp.ge.u32	%p1, %r32, %r5;
	@%p1 bra 	BB38_4;

BB38_1:
	mov.f64 	%fd1, %fd77;
	cvta.to.global.u64 	%rd4, %rd2;
	mul.wide.u32 	%rd5, %r32, 8;
	add.s64 	%rd6, %rd4, %rd5;
	ld.global.f64 	%fd30, [%rd6];
	max.f64 	%fd78, %fd1, %fd30;
	add.s32 	%r3, %r32, %r9;
	setp.ge.u32	%p2, %r3, %r5;
	@%p2 bra 	BB38_3;

	mul.wide.u32 	%rd8, %r3, 8;
	add.s64 	%rd9, %rd4, %rd8;
	ld.global.f64 	%fd31, [%rd9];
	max.f64 	%fd78, %fd78, %fd31;

BB38_3:
	mov.f64 	%fd77, %fd78;
	shl.b32 	%r12, %r9, 1;
	mov.u32 	%r13, %nctaid.x;
	mad.lo.s32 	%r32, %r12, %r13, %r32;
	setp.lt.u32	%p3, %r32, %r5;
	mov.f64 	%fd76, %fd77;
	@%p3 bra 	BB38_1;

BB38_4:
	mov.f64 	%fd74, %fd76;
	mul.wide.u32 	%rd10, %r6, 8;
	mov.u64 	%rd11, my_sdata;
	add.s64 	%rd1, %rd11, %rd10;
	st.shared.f64 	[%rd1], %fd74;
	bar.sync 	0;
	setp.lt.u32	%p4, %r9, 1024;
	@%p4 bra 	BB38_8;

	setp.gt.u32	%p5, %r6, 511;
	mov.f64 	%fd75, %fd74;
	@%p5 bra 	BB38_7;

	ld.shared.f64 	%fd32, [%rd1+4096];
	max.f64 	%fd75, %fd74, %fd32;
	st.shared.f64 	[%rd1], %fd75;

BB38_7:
	mov.f64 	%fd74, %fd75;
	bar.sync 	0;

BB38_8:
	mov.f64 	%fd72, %fd74;
	setp.lt.u32	%p6, %r9, 512;
	@%p6 bra 	BB38_12;

	setp.gt.u32	%p7, %r6, 255;
	mov.f64 	%fd73, %fd72;
	@%p7 bra 	BB38_11;

	ld.shared.f64 	%fd33, [%rd1+2048];
	max.f64 	%fd73, %fd72, %fd33;
	st.shared.f64 	[%rd1], %fd73;

BB38_11:
	mov.f64 	%fd72, %fd73;
	bar.sync 	0;

BB38_12:
	mov.f64 	%fd70, %fd72;
	setp.lt.u32	%p8, %r9, 256;
	@%p8 bra 	BB38_16;

	setp.gt.u32	%p9, %r6, 127;
	mov.f64 	%fd71, %fd70;
	@%p9 bra 	BB38_15;

	ld.shared.f64 	%fd34, [%rd1+1024];
	max.f64 	%fd71, %fd70, %fd34;
	st.shared.f64 	[%rd1], %fd71;

BB38_15:
	mov.f64 	%fd70, %fd71;
	bar.sync 	0;

BB38_16:
	mov.f64 	%fd68, %fd70;
	setp.lt.u32	%p10, %r9, 128;
	@%p10 bra 	BB38_20;

	setp.gt.u32	%p11, %r6, 63;
	mov.f64 	%fd69, %fd68;
	@%p11 bra 	BB38_19;

	ld.shared.f64 	%fd35, [%rd1+512];
	max.f64 	%fd69, %fd68, %fd35;
	st.shared.f64 	[%rd1], %fd69;

BB38_19:
	mov.f64 	%fd68, %fd69;
	bar.sync 	0;

BB38_20:
	mov.f64 	%fd67, %fd68;
	setp.gt.u32	%p12, %r6, 31;
	@%p12 bra 	BB38_33;

	setp.lt.u32	%p13, %r9, 64;
	@%p13 bra 	BB38_23;

	ld.volatile.shared.f64 	%fd36, [%rd1+256];
	max.f64 	%fd67, %fd67, %fd36;
	st.volatile.shared.f64 	[%rd1], %fd67;

BB38_23:
	mov.f64 	%fd66, %fd67;
	setp.lt.u32	%p14, %r9, 32;
	@%p14 bra 	BB38_25;

	ld.volatile.shared.f64 	%fd37, [%rd1+128];
	max.f64 	%fd66, %fd66, %fd37;
	st.volatile.shared.f64 	[%rd1], %fd66;

BB38_25:
	mov.f64 	%fd65, %fd66;
	setp.lt.u32	%p15, %r9, 16;
	@%p15 bra 	BB38_27;

	ld.volatile.shared.f64 	%fd38, [%rd1+64];
	max.f64 	%fd65, %fd65, %fd38;
	st.volatile.shared.f64 	[%rd1], %fd65;

BB38_27:
	mov.f64 	%fd64, %fd65;
	setp.lt.u32	%p16, %r9, 8;
	@%p16 bra 	BB38_29;

	ld.volatile.shared.f64 	%fd39, [%rd1+32];
	max.f64 	%fd64, %fd64, %fd39;
	st.volatile.shared.f64 	[%rd1], %fd64;

BB38_29:
	mov.f64 	%fd63, %fd64;
	setp.lt.u32	%p17, %r9, 4;
	@%p17 bra 	BB38_31;

	ld.volatile.shared.f64 	%fd40, [%rd1+16];
	max.f64 	%fd63, %fd63, %fd40;
	st.volatile.shared.f64 	[%rd1], %fd63;

BB38_31:
	setp.lt.u32	%p18, %r9, 2;
	@%p18 bra 	BB38_33;

	ld.volatile.shared.f64 	%fd41, [%rd1+8];
	max.f64 	%fd42, %fd63, %fd41;
	st.volatile.shared.f64 	[%rd1], %fd42;

BB38_33:
	setp.ne.s32	%p19, %r6, 0;
	@%p19 bra 	BB38_35;

	ld.shared.f64 	%fd43, [my_sdata];
	cvta.to.global.u64 	%rd12, %rd3;
	mul.wide.u32 	%rd13, %r7, 8;
	add.s64 	%rd14, %rd12, %rd13;
	st.global.f64 	[%rd14], %fd43;

BB38_35:
	ret;
}

	// .globl	reduce_max_f
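// reduce_max_f: single-precision twin of reduce_max_d; the accumulator
// starts at -FLT_MAX (0fFF7FFFFF).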
.visible .entry reduce_max_f(
	.param .u64 reduce_max_f_param_0,
	.param .u64 reduce_max_f_param_1,
	.param .u32 reduce_max_f_param_2
)
{
	.reg .pred 	%p<20>;
	.reg .f32 	%f<79>;
	.reg .b32 	%r<33>;
	.reg .b64 	%rd<15>;


	ld.param.u64 	%rd2, [reduce_max_f_param_0];
	ld.param.u64 	%rd3, [reduce_max_f_param_1];
	ld.param.u32 	%r5, [reduce_max_f_param_2];
	mov.u32 	%r6, %tid.x;
	mov.u32 	%r7, %ctaid.x;
	shl.b32 	%r8, %r7, 1;
	mov.u32 	%r9, %ntid.x;
	mad.lo.s32 	%r32, %r8, %r9, %r6;
	mov.f32 	%f76, 0fFF7FFFFF;
	mov.f32 	%f77, %f76;
	setp.ge.u32	%p1, %r32, %r5;
	@%p1 bra 	BB39_4;

BB39_1:
	mov.f32 	%f1, %f77;
	cvta.to.global.u64 	%rd4, %rd2;
	mul.wide.u32 	%rd5, %r32, 4;
	add.s64 	%rd6, %rd4, %rd5;
	ld.global.f32 	%f30, [%rd6];
	max.f32 	%f78, %f1, %f30;
	add.s32 	%r3, %r32, %r9;
	setp.ge.u32	%p2, %r3, %r5;
	@%p2 bra 	BB39_3;

	mul.wide.u32 	%rd8, %r3, 4;
	add.s64 	%rd9, %rd4, %rd8;
	ld.global.f32 	%f31, [%rd9];
	max.f32 	%f78, %f78, %f31;

BB39_3:
	mov.f32 	%f77, %f78;
	shl.b32 	%r12, %r9, 1;
	mov.u32 	%r13, %nctaid.x;
	mad.lo.s32 	%r32, %r12, %r13, %r32;
	setp.lt.u32	%p3, %r32, %r5;
	mov.f32 	%f76, %f77;
	@%p3 bra 	BB39_1;

BB39_4:
	mov.f32 	%f74, %f76;
	mul.wide.u32 	%rd10, %r6, 4;
	mov.u64 	%rd11, my_sdata;
	add.s64 	%rd1, %rd11, %rd10;
	st.shared.f32 	[%rd1], %f74;
	bar.sync 	0;
	setp.lt.u32	%p4, %r9, 1024;
	@%p4 bra 	BB39_8;

	setp.gt.u32	%p5, %r6, 511;
	mov.f32 	%f75, %f74;
	@%p5 bra 	BB39_7;

	ld.shared.f32 	%f32, [%rd1+2048];
	max.f32 	%f75, %f74, %f32;
	st.shared.f32 	[%rd1], %f75;

BB39_7:
	mov.f32 	%f74, %f75;
	bar.sync 	0;

BB39_8:
	mov.f32 	%f72, %f74;
	setp.lt.u32	%p6, %r9, 512;
	@%p6 bra 	BB39_12;

	setp.gt.u32	%p7, %r6, 255;
	mov.f32 	%f73, %f72;
	@%p7 bra 	BB39_11;

	ld.shared.f32 	%f33, [%rd1+1024];
	max.f32 	%f73, %f72, %f33;
	st.shared.f32 	[%rd1], %f73;

BB39_11:
	mov.f32 	%f72, %f73;
	bar.sync 	0;

BB39_12:
	mov.f32 	%f70, %f72;
	setp.lt.u32	%p8, %r9, 256;
	@%p8 bra 	BB39_16;

	setp.gt.u32	%p9, %r6, 127;
	mov.f32 	%f71, %f70;
	@%p9 bra 	BB39_15;

	ld.shared.f32 	%f34, [%rd1+512];
	max.f32 	%f71, %f70, %f34;
	st.shared.f32 	[%rd1], %f71;

BB39_15:
	mov.f32 	%f70, %f71;
	bar.sync 	0;

BB39_16:
	mov.f32 	%f68, %f70;
	setp.lt.u32	%p10, %r9, 128;
	@%p10 bra 	BB39_20;

	setp.gt.u32	%p11, %r6, 63;
	mov.f32 	%f69, %f68;
	@%p11 bra 	BB39_19;

	ld.shared.f32 	%f35, [%rd1+256];
	max.f32 	%f69, %f68, %f35;
	st.shared.f32 	[%rd1], %f69;

BB39_19:
	mov.f32 	%f68, %f69;
	bar.sync 	0;

BB39_20:
	mov.f32 	%f67, %f68;
	setp.gt.u32	%p12, %r6, 31;
	@%p12 bra 	BB39_33;

	setp.lt.u32	%p13, %r9, 64;
	@%p13 bra 	BB39_23;

	ld.volatile.shared.f32 	%f36, [%rd1+128];
	max.f32 	%f67, %f67, %f36;
	st.volatile.shared.f32 	[%rd1], %f67;

BB39_23:
	mov.f32 	%f66, %f67;
	setp.lt.u32	%p14, %r9, 32;
	@%p14 bra 	BB39_25;

	ld.volatile.shared.f32 	%f37, [%rd1+64];
	max.f32 	%f66, %f66, %f37;
	st.volatile.shared.f32 	[%rd1], %f66;

BB39_25:
	mov.f32 	%f65, %f66;
	setp.lt.u32	%p15, %r9, 16;
	@%p15 bra 	BB39_27;

	ld.volatile.shared.f32 	%f38, [%rd1+32];
	max.f32 	%f65, %f65, %f38;
	st.volatile.shared.f32 	[%rd1], %f65;

BB39_27:
	mov.f32 	%f64, %f65;
	setp.lt.u32	%p16, %r9, 8;
	@%p16 bra 	BB39_29;

	ld.volatile.shared.f32 	%f39, [%rd1+16];
	max.f32 	%f64, %f64, %f39;
	st.volatile.shared.f32 	[%rd1], %f64;

BB39_29:
	mov.f32 	%f63, %f64;
	setp.lt.u32	%p17, %r9, 4;
	@%p17 bra 	BB39_31;

	ld.volatile.shared.f32 	%f40, [%rd1+8];
	max.f32 	%f63, %f63, %f40;
	st.volatile.shared.f32 	[%rd1], %f63;

BB39_31:
	setp.lt.u32	%p18, %r9, 2;
	@%p18 bra 	BB39_33;

	ld.volatile.shared.f32 	%f41, [%rd1+4];
	max.f32 	%f42, %f63, %f41;
	st.volatile.shared.f32 	[%rd1], %f42;

BB39_33:
	setp.ne.s32	%p19, %r6, 0;
	@%p19 bra 	BB39_35;

	ld.shared.f32 	%f43, [my_sdata];
	cvta.to.global.u64 	%rd12, %rd3;
	mul.wide.u32 	%rd13, %r7, 4;
	add.s64 	%rd14, %rd12, %rd13;
	st.global.f32 	[%rd14], %f43;

BB39_35:
	ret;
}

	// .globl	reduce_row_max_d
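// reduce_row_max_d: row-wise maximum; the one-block-per-row scheme of
// reduce_row_sum_f with the add replaced by max.f64 and identity -DBL_MAX.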
.visible .entry reduce_row_max_d(
	.param .u64 reduce_row_max_d_param_0,
	.param .u64 reduce_row_max_d_param_1,
	.param .u32 reduce_row_max_d_param_2,
	.param .u32 reduce_row_max_d_param_3
)
{
	.reg .pred 	%p<20>;
	.reg .b32 	%r<39>;
	.reg .f64 	%fd<74>;
	.reg .b64 	%rd<42>;


	ld.param.u64 	%rd1, [reduce_row_max_d_param_0];
	ld.param.u64 	%rd2, [reduce_row_max_d_param_1];
	ld.param.u32 	%r5, [reduce_row_max_d_param_2];
	ld.param.u32 	%r4, [reduce_row_max_d_param_3];
	mov.u32 	%r6, %ctaid.x;
	setp.ge.u32	%p1, %r6, %r5;
	@%p1 bra 	BB40_35;

	mov.u32 	%r38, %tid.x;
	mov.f64 	%fd72, 0dFFEFFFFFFFFFFFFF;
	mov.f64 	%fd73, %fd72;
	setp.ge.u32	%p2, %r38, %r4;
	@%p2 bra 	BB40_4;

	cvta.to.global.u64 	%rd3, %rd1;

BB40_3:
	mad.lo.s32 	%r8, %r6, %r4, %r38;
	mul.wide.u32 	%rd4, %r8, 8;
	add.s64 	%rd5, %rd3, %rd4;
	ld.global.f64 	%fd28, [%rd5];
	max.f64 	%fd73, %fd73, %fd28;
	mov.u32 	%r9, %ntid.x;
	add.s32 	%r38, %r9, %r38;
	setp.lt.u32	%p3, %r38, %r4;
	mov.f64 	%fd72, %fd73;
	@%p3 bra 	BB40_3;

BB40_4:
	mov.f64 	%fd70, %fd72;
	mov.u32 	%r10, %tid.x;
	mul.wide.u32 	%rd6, %r10, 8;
	mov.u64 	%rd7, my_sdata;
	add.s64 	%rd8, %rd7, %rd6;
	st.shared.f64 	[%rd8], %fd70;
	bar.sync 	0;
	mov.u32 	%r11, %ntid.x;
	setp.lt.u32	%p4, %r11, 1024;
	@%p4 bra 	BB40_8;

	setp.gt.u32	%p5, %r10, 511;
	mov.f64 	%fd71, %fd70;
	@%p5 bra 	BB40_7;

	ld.shared.f64 	%fd29, [%rd8+4096];
	max.f64 	%fd71, %fd70, %fd29;
	st.shared.f64 	[%rd8], %fd71;

BB40_7:
	mov.f64 	%fd70, %fd71;
	bar.sync 	0;

BB40_8:
	mov.f64 	%fd68, %fd70;
	setp.lt.u32	%p6, %r11, 512;
	@%p6 bra 	BB40_12;

	setp.gt.u32	%p7, %r10, 255;
	mov.f64 	%fd69, %fd68;
	@%p7 bra 	BB40_11;

	ld.shared.f64 	%fd30, [%rd8+2048];
	max.f64 	%fd69, %fd68, %fd30;
	st.shared.f64 	[%rd8], %fd69;

BB40_11:
	mov.f64 	%fd68, %fd69;
	bar.sync 	0;

BB40_12:
	mov.f64 	%fd66, %fd68;
	setp.lt.u32	%p8, %r11, 256;
	@%p8 bra 	BB40_16;

	setp.gt.u32	%p9, %r10, 127;
	mov.f64 	%fd67, %fd66;
	@%p9 bra 	BB40_15;

	ld.shared.f64 	%fd31, [%rd8+1024];
	max.f64 	%fd67, %fd66, %fd31;
	st.shared.f64 	[%rd8], %fd67;

BB40_15:
	mov.f64 	%fd66, %fd67;
	bar.sync 	0;

BB40_16:
	mov.f64 	%fd64, %fd66;
	setp.lt.u32	%p10, %r11, 128;
	@%p10 bra 	BB40_20;

	setp.gt.u32	%p11, %r10, 63;
	mov.f64 	%fd65, %fd64;
	@%p11 bra 	BB40_19;

	ld.shared.f64 	%fd32, [%rd8+512];
	max.f64 	%fd65, %fd64, %fd32;
	st.shared.f64 	[%rd8], %fd65;

BB40_19:
	mov.f64 	%fd64, %fd65;
	bar.sync 	0;

BB40_20:
	mov.f64 	%fd63, %fd64;
	setp.gt.u32	%p12, %r10, 31;
	@%p12 bra 	BB40_33;

	setp.lt.u32	%p13, %r11, 64;
	@%p13 bra 	BB40_23;

	ld.volatile.shared.f64 	%fd33, [%rd8+256];
	max.f64 	%fd63, %fd63, %fd33;
	st.volatile.shared.f64 	[%rd8], %fd63;

BB40_23:
	mov.f64 	%fd62, %fd63;
	setp.lt.u32	%p14, %r11, 32;
	@%p14 bra 	BB40_25;

	ld.volatile.shared.f64 	%fd34, [%rd8+128];
	max.f64 	%fd62, %fd62, %fd34;
	st.volatile.shared.f64 	[%rd8], %fd62;

BB40_25:
	mov.f64 	%fd61, %fd62;
	setp.lt.u32	%p15, %r11, 16;
	@%p15 bra 	BB40_27;

	ld.volatile.shared.f64 	%fd35, [%rd8+64];
	max.f64 	%fd61, %fd61, %fd35;
	st.volatile.shared.f64 	[%rd8], %fd61;

BB40_27:
	mov.f64 	%fd60, %fd61;
	setp.lt.u32	%p16, %r11, 8;
	@%p16 bra 	BB40_29;

	ld.volatile.shared.f64 	%fd36, [%rd8+32];
	max.f64 	%fd60, %fd60, %fd36;
	st.volatile.shared.f64 	[%rd8], %fd60;

BB40_29:
	mov.f64 	%fd59, %fd60;
	setp.lt.u32	%p17, %r11, 4;
	@%p17 bra 	BB40_31;

	ld.volatile.shared.f64 	%fd37, [%rd8+16];
	max.f64 	%fd59, %fd59, %fd37;
	st.volatile.shared.f64 	[%rd8], %fd59;

BB40_31:
	setp.lt.u32	%p18, %r11, 2;
	@%p18 bra 	BB40_33;

	ld.volatile.shared.f64 	%fd38, [%rd8+8];
	max.f64 	%fd39, %fd59, %fd38;
	st.volatile.shared.f64 	[%rd8], %fd39;

BB40_33:
	setp.ne.s32	%p19, %r10, 0;
	@%p19 bra 	BB40_35;

	ld.shared.f64 	%fd40, [my_sdata];
	cvta.to.global.u64 	%rd39, %rd2;
	mul.wide.u32 	%rd40, %r6, 8;
	add.s64 	%rd41, %rd39, %rd40;
	st.global.f64 	[%rd41], %fd40;

BB40_35:
	ret;
}

	// .globl	reduce_row_max_f
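// reduce_row_max_f: single-precision twin of reduce_row_max_d
// (max.f32, identity -FLT_MAX).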
.visible .entry reduce_row_max_f(
	.param .u64 reduce_row_max_f_param_0,
	.param .u64 reduce_row_max_f_param_1,
	.param .u32 reduce_row_max_f_param_2,
	.param .u32 reduce_row_max_f_param_3
)
{
	.reg .pred 	%p<20>;
	.reg .f32 	%f<74>;
	.reg .b32 	%r<39>;
	.reg .b64 	%rd<42>;


	ld.param.u64 	%rd1, [reduce_row_max_f_param_0];
	ld.param.u64 	%rd2, [reduce_row_max_f_param_1];
	ld.param.u32 	%r5, [reduce_row_max_f_param_2];
	ld.param.u32 	%r4, [reduce_row_max_f_param_3];
	mov.u32 	%r6, %ctaid.x;
	setp.ge.u32	%p1, %r6, %r5;
	@%p1 bra 	BB41_35;

	mov.u32 	%r38, %tid.x;
	mov.f32 	%f72, 0fFF7FFFFF;
	mov.f32 	%f73, %f72;
	setp.ge.u32	%p2, %r38, %r4;
	@%p2 bra 	BB41_4;

	cvta.to.global.u64 	%rd3, %rd1;

BB41_3:
	mad.lo.s32 	%r8, %r6, %r4, %r38;
	mul.wide.u32 	%rd4, %r8, 4;
	add.s64 	%rd5, %rd3, %rd4;
	ld.global.f32 	%f28, [%rd5];
	max.f32 	%f73, %f73, %f28;
	mov.u32 	%r9, %ntid.x;
	add.s32 	%r38, %r9, %r38;
	setp.lt.u32	%p3, %r38, %r4;
	mov.f32 	%f72, %f73;
	@%p3 bra 	BB41_3;

BB41_4:
	mov.f32 	%f70, %f72;
	mov.u32 	%r10, %tid.x;
	mul.wide.u32 	%rd6, %r10, 4;
	mov.u64 	%rd7, my_sdata;
	add.s64 	%rd8, %rd7, %rd6;
	st.shared.f32 	[%rd8], %f70;
	bar.sync 	0;
	mov.u32 	%r11, %ntid.x;
	setp.lt.u32	%p4, %r11, 1024;
	@%p4 bra 	BB41_8;

	setp.gt.u32	%p5, %r10, 511;
	mov.f32 	%f71, %f70;
	@%p5 bra 	BB41_7;

	ld.shared.f32 	%f29, [%rd8+2048];
	max.f32 	%f71, %f70, %f29;
	st.shared.f32 	[%rd8], %f71;

BB41_7:
	mov.f32 	%f70, %f71;
	bar.sync 	0;

BB41_8:
	mov.f32 	%f68, %f70;
	setp.lt.u32	%p6, %r11, 512;
	@%p6 bra 	BB41_12;

	setp.gt.u32	%p7, %r10, 255;
	mov.f32 	%f69, %f68;
	@%p7 bra 	BB41_11;

	ld.shared.f32 	%f30, [%rd8+1024];
	max.f32 	%f69, %f68, %f30;
	st.shared.f32 	[%rd8], %f69;

BB41_11:
	mov.f32 	%f68, %f69;
	bar.sync 	0;

BB41_12:
	mov.f32 	%f66, %f68;
	setp.lt.u32	%p8, %r11, 256;
	@%p8 bra 	BB41_16;

	setp.gt.u32	%p9, %r10, 127;
	mov.f32 	%f67, %f66;
	@%p9 bra 	BB41_15;

	ld.shared.f32 	%f31, [%rd8+512];
	max.f32 	%f67, %f66, %f31;
	st.shared.f32 	[%rd8], %f67;

BB41_15:
	mov.f32 	%f66, %f67;
	bar.sync 	0;

BB41_16:
	mov.f32 	%f64, %f66;
	setp.lt.u32	%p10, %r11, 128;
	@%p10 bra 	BB41_20;

	setp.gt.u32	%p11, %r10, 63;
	mov.f32 	%f65, %f64;
	@%p11 bra 	BB41_19;

	ld.shared.f32 	%f32, [%rd8+256];
	max.f32 	%f65, %f64, %f32;
	st.shared.f32 	[%rd8], %f65;

BB41_19:
	mov.f32 	%f64, %f65;
	bar.sync 	0;

BB41_20:
	mov.f32 	%f63, %f64;
	setp.gt.u32	%p12, %r10, 31;
	@%p12 bra 	BB41_33;

	setp.lt.u32	%p13, %r11, 64;
	@%p13 bra 	BB41_23;

	ld.volatile.shared.f32 	%f33, [%rd8+128];
	max.f32 	%f63, %f63, %f33;
	st.volatile.shared.f32 	[%rd8], %f63;

BB41_23:
	mov.f32 	%f62, %f63;
	setp.lt.u32	%p14, %r11, 32;
	@%p14 bra 	BB41_25;

	ld.volatile.shared.f32 	%f34, [%rd8+64];
	max.f32 	%f62, %f62, %f34;
	st.volatile.shared.f32 	[%rd8], %f62;

BB41_25:
	mov.f32 	%f61, %f62;
	setp.lt.u32	%p15, %r11, 16;
	@%p15 bra 	BB41_27;

	ld.volatile.shared.f32 	%f35, [%rd8+32];
	max.f32 	%f61, %f61, %f35;
	st.volatile.shared.f32 	[%rd8], %f61;

BB41_27:
	mov.f32 	%f60, %f61;
	setp.lt.u32	%p16, %r11, 8;
	@%p16 bra 	BB41_29;

	ld.volatile.shared.f32 	%f36, [%rd8+16];
	max.f32 	%f60, %f60, %f36;
	st.volatile.shared.f32 	[%rd8], %f60;

BB41_29:
	mov.f32 	%f59, %f60;
	setp.lt.u32	%p17, %r11, 4;
	@%p17 bra 	BB41_31;

	ld.volatile.shared.f32 	%f37, [%rd8+8];
	max.f32 	%f59, %f59, %f37;
	st.volatile.shared.f32 	[%rd8], %f59;

BB41_31:
	setp.lt.u32	%p18, %r11, 2;
	@%p18 bra 	BB41_33;

	ld.volatile.shared.f32 	%f38, [%rd8+4];
	max.f32 	%f39, %f59, %f38;
	st.volatile.shared.f32 	[%rd8], %f39;

BB41_33:
	setp.ne.s32	%p19, %r10, 0;
	@%p19 bra 	BB41_35;

	ld.shared.f32 	%f40, [my_sdata];
	cvta.to.global.u64 	%rd39, %rd2;
	mul.wide.u32 	%rd40, %r6, 4;
	add.s64 	%rd41, %rd39, %rd40;
	st.global.f32 	[%rd41], %f40;

BB41_35:
	ret;
}

	// .globl	reduce_col_max_d
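// reduce_col_max_d: column-wise maximum; one thread per column as in
// reduce_col_sum_d, accumulating with max.f64 from -DBL_MAX.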
.visible .entry reduce_col_max_d(
	.param .u64 reduce_col_max_d_param_0,
	.param .u64 reduce_col_max_d_param_1,
	.param .u32 reduce_col_max_d_param_2,
	.param .u32 reduce_col_max_d_param_3
)
{
	.reg .pred 	%p<4>;
	.reg .b32 	%r<11>;
	.reg .f64 	%fd<10>;
	.reg .b64 	%rd<9>;


	ld.param.u64 	%rd2, [reduce_col_max_d_param_0];
	ld.param.u64 	%rd3, [reduce_col_max_d_param_1];
	ld.param.u32 	%r5, [reduce_col_max_d_param_2];
	ld.param.u32 	%r6, [reduce_col_max_d_param_3];
	mov.u32 	%r7, %ntid.x;
	mov.u32 	%r8, %ctaid.x;
	mov.u32 	%r9, %tid.x;
	mad.lo.s32 	%r1, %r7, %r8, %r9;
	setp.ge.u32	%p1, %r1, %r6;
	@%p1 bra 	BB42_5;

	cvta.to.global.u64 	%rd1, %rd2;
	mul.lo.s32 	%r2, %r6, %r5;
	mov.f64 	%fd8, 0dFFEFFFFFFFFFFFFF;
	mov.f64 	%fd9, %fd8;
	setp.ge.u32	%p2, %r1, %r2;
	@%p2 bra 	BB42_4;

	mov.u32 	%r10, %r1;

BB42_3:
	mov.u32 	%r3, %r10;
	mul.wide.u32 	%rd4, %r3, 8;
	add.s64 	%rd5, %rd1, %rd4;
	ld.global.f64 	%fd6, [%rd5];
	max.f64 	%fd9, %fd9, %fd6;
	add.s32 	%r4, %r3, %r6;
	setp.lt.u32	%p3, %r4, %r2;
	mov.u32 	%r10, %r4;
	mov.f64 	%fd8, %fd9;
	@%p3 bra 	BB42_3;

BB42_4:
	cvta.to.global.u64 	%rd6, %rd3;
	mul.wide.u32 	%rd7, %r1, 8;
	add.s64 	%rd8, %rd6, %rd7;
	st.global.f64 	[%rd8], %fd8;

BB42_5:
	ret;
}

	// .globl	reduce_col_max_f
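// reduce_col_max_f: single-precision twin of reduce_col_max_d.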
.visible .entry reduce_col_max_f(
	.param .u64 reduce_col_max_f_param_0,
	.param .u64 reduce_col_max_f_param_1,
	.param .u32 reduce_col_max_f_param_2,
	.param .u32 reduce_col_max_f_param_3
)
{
	.reg .pred 	%p<4>;
	.reg .f32 	%f<10>;
	.reg .b32 	%r<11>;
	.reg .b64 	%rd<9>;


	ld.param.u64 	%rd2, [reduce_col_max_f_param_0];
	ld.param.u64 	%rd3, [reduce_col_max_f_param_1];
	ld.param.u32 	%r5, [reduce_col_max_f_param_2];
	ld.param.u32 	%r6, [reduce_col_max_f_param_3];
	mov.u32 	%r7, %ntid.x;
	mov.u32 	%r8, %ctaid.x;
	mov.u32 	%r9, %tid.x;
	mad.lo.s32 	%r1, %r7, %r8, %r9;
	setp.ge.u32	%p1, %r1, %r6;
	@%p1 bra 	BB43_5;

	cvta.to.global.u64 	%rd1, %rd2;
	mul.lo.s32 	%r2, %r6, %r5;
	mov.f32 	%f8, 0fFF7FFFFF;
	mov.f32 	%f9, %f8;
	setp.ge.u32	%p2, %r1, %r2;
	@%p2 bra 	BB43_4;

	mov.u32 	%r10, %r1;

BB43_3:
	mov.u32 	%r3, %r10;
	mul.wide.u32 	%rd4, %r3, 4;
	add.s64 	%rd5, %rd1, %rd4;
	ld.global.f32 	%f6, [%rd5];
	max.f32 	%f9, %f9, %f6;
	add.s32 	%r4, %r3, %r6;
	setp.lt.u32	%p3, %r4, %r2;
	mov.u32 	%r10, %r4;
	mov.f32 	%f8, %f9;
	@%p3 bra 	BB43_3;

BB43_4:
	cvta.to.global.u64 	%rd6, %rd3;
	mul.wide.u32 	%rd7, %r1, 4;
	add.s64 	%rd8, %rd6, %rd7;
	st.global.f32 	[%rd8], %f8;

BB43_5:
	ret;
}

	// .globl	reduce_min_d
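// reduce_min_d: full min reduction; reduce_max_d's skeleton with min.f64
// and identity +DBL_MAX (0d7FEFFFFFFFFFFFFF).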
.visible .entry reduce_min_d(
	.param .u64 reduce_min_d_param_0,
	.param .u64 reduce_min_d_param_1,
	.param .u32 reduce_min_d_param_2
)
{
	.reg .pred 	%p<20>;
	.reg .b32 	%r<33>;
	.reg .f64 	%fd<79>;
	.reg .b64 	%rd<15>;


	ld.param.u64 	%rd2, [reduce_min_d_param_0];
	ld.param.u64 	%rd3, [reduce_min_d_param_1];
	ld.param.u32 	%r5, [reduce_min_d_param_2];
	mov.u32 	%r6, %tid.x;
	mov.u32 	%r7, %ctaid.x;
	shl.b32 	%r8, %r7, 1;
	mov.u32 	%r9, %ntid.x;
	mad.lo.s32 	%r32, %r8, %r9, %r6;
	mov.f64 	%fd76, 0d7FEFFFFFFFFFFFFF;
	mov.f64 	%fd77, %fd76;
	setp.ge.u32	%p1, %r32, %r5;
	@%p1 bra 	BB44_4;

BB44_1:
	mov.f64 	%fd1, %fd77;
	cvta.to.global.u64 	%rd4, %rd2;
	mul.wide.u32 	%rd5, %r32, 8;
	add.s64 	%rd6, %rd4, %rd5;
	ld.global.f64 	%fd30, [%rd6];
	min.f64 	%fd78, %fd1, %fd30;
	add.s32 	%r3, %r32, %r9;
	setp.ge.u32	%p2, %r3, %r5;
	@%p2 bra 	BB44_3;

	mul.wide.u32 	%rd8, %r3, 8;
	add.s64 	%rd9, %rd4, %rd8;
	ld.global.f64 	%fd31, [%rd9];
	min.f64 	%fd78, %fd78, %fd31;

BB44_3:
	mov.f64 	%fd77, %fd78;
	shl.b32 	%r12, %r9, 1;
	mov.u32 	%r13, %nctaid.x;
	mad.lo.s32 	%r32, %r12, %r13, %r32;
	setp.lt.u32	%p3, %r32, %r5;
	mov.f64 	%fd76, %fd77;
	@%p3 bra 	BB44_1;

BB44_4:
	mov.f64 	%fd74, %fd76;
	mul.wide.u32 	%rd10, %r6, 8;
	mov.u64 	%rd11, my_sdata;
	add.s64 	%rd1, %rd11, %rd10;
	st.shared.f64 	[%rd1], %fd74;
	bar.sync 	0;
	setp.lt.u32	%p4, %r9, 1024;
	@%p4 bra 	BB44_8;

	setp.gt.u32	%p5, %r6, 511;
	mov.f64 	%fd75, %fd74;
	@%p5 bra 	BB44_7;

	ld.shared.f64 	%fd32, [%rd1+4096];
	min.f64 	%fd75, %fd74, %fd32;
	st.shared.f64 	[%rd1], %fd75;

BB44_7:
	mov.f64 	%fd74, %fd75;
	bar.sync 	0;

BB44_8:
	mov.f64 	%fd72, %fd74;
	setp.lt.u32	%p6, %r9, 512;
	@%p6 bra 	BB44_12;

	setp.gt.u32	%p7, %r6, 255;
	mov.f64 	%fd73, %fd72;
	@%p7 bra 	BB44_11;

	ld.shared.f64 	%fd33, [%rd1+2048];
	min.f64 	%fd73, %fd72, %fd33;
	st.shared.f64 	[%rd1], %fd73;

BB44_11:
	mov.f64 	%fd72, %fd73;
	bar.sync 	0;

BB44_12:
	mov.f64 	%fd70, %fd72;
	setp.lt.u32	%p8, %r9, 256;
	@%p8 bra 	BB44_16;

	setp.gt.u32	%p9, %r6, 127;
	mov.f64 	%fd71, %fd70;
	@%p9 bra 	BB44_15;

	ld.shared.f64 	%fd34, [%rd1+1024];
	min.f64 	%fd71, %fd70, %fd34;
	st.shared.f64 	[%rd1], %fd71;

BB44_15:
	mov.f64 	%fd70, %fd71;
	bar.sync 	0;

BB44_16:
	mov.f64 	%fd68, %fd70;
	setp.lt.u32	%p10, %r9, 128;
	@%p10 bra 	BB44_20;

	setp.gt.u32	%p11, %r6, 63;
	mov.f64 	%fd69, %fd68;
	@%p11 bra 	BB44_19;

	ld.shared.f64 	%fd35, [%rd1+512];
	min.f64 	%fd69, %fd68, %fd35;
	st.shared.f64 	[%rd1], %fd69;

BB44_19:
	mov.f64 	%fd68, %fd69;
	bar.sync 	0;

BB44_20:
	mov.f64 	%fd67, %fd68;
	setp.gt.u32	%p12, %r6, 31;
	@%p12 bra 	BB44_33;

	setp.lt.u32	%p13, %r9, 64;
	@%p13 bra 	BB44_23;

	ld.volatile.shared.f64 	%fd36, [%rd1+256];
	min.f64 	%fd67, %fd67, %fd36;
	st.volatile.shared.f64 	[%rd1], %fd67;

BB44_23:
	mov.f64 	%fd66, %fd67;
	setp.lt.u32	%p14, %r9, 32;
	@%p14 bra 	BB44_25;

	ld.volatile.shared.f64 	%fd37, [%rd1+128];
	min.f64 	%fd66, %fd66, %fd37;
	st.volatile.shared.f64 	[%rd1], %fd66;

BB44_25:
	mov.f64 	%fd65, %fd66;
	setp.lt.u32	%p15, %r9, 16;
	@%p15 bra 	BB44_27;

	ld.volatile.shared.f64 	%fd38, [%rd1+64];
	min.f64 	%fd65, %fd65, %fd38;
	st.volatile.shared.f64 	[%rd1], %fd65;

BB44_27:
	mov.f64 	%fd64, %fd65;
	setp.lt.u32	%p16, %r9, 8;
	@%p16 bra 	BB44_29;

	ld.volatile.shared.f64 	%fd39, [%rd1+32];
	min.f64 	%fd64, %fd64, %fd39;
	st.volatile.shared.f64 	[%rd1], %fd64;

BB44_29:
	mov.f64 	%fd63, %fd64;
	setp.lt.u32	%p17, %r9, 4;
	@%p17 bra 	BB44_31;

	ld.volatile.shared.f64 	%fd40, [%rd1+16];
	min.f64 	%fd63, %fd63, %fd40;
	st.volatile.shared.f64 	[%rd1], %fd63;

BB44_31:
	setp.lt.u32	%p18, %r9, 2;
	@%p18 bra 	BB44_33;

	ld.volatile.shared.f64 	%fd41, [%rd1+8];
	min.f64 	%fd42, %fd63, %fd41;
	st.volatile.shared.f64 	[%rd1], %fd42;

BB44_33:
	setp.ne.s32	%p19, %r6, 0;
	@%p19 bra 	BB44_35;

	ld.shared.f64 	%fd43, [my_sdata];
	cvta.to.global.u64 	%rd12, %rd3;
	mul.wide.u32 	%rd13, %r7, 8;
	add.s64 	%rd14, %rd12, %rd13;
	st.global.f64 	[%rd14], %fd43;

BB44_35:
	ret;
}

	// .globl	reduce_min_f
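// reduce_min_f: single-precision twin of reduce_min_d
// (min.f32, identity +FLT_MAX = 0f7F7FFFFF).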
.visible .entry reduce_min_f(
	.param .u64 reduce_min_f_param_0,
	.param .u64 reduce_min_f_param_1,
	.param .u32 reduce_min_f_param_2
)
{
	.reg .pred 	%p<20>;
	.reg .f32 	%f<79>;
	.reg .b32 	%r<33>;
	.reg .b64 	%rd<15>;


	ld.param.u64 	%rd2, [reduce_min_f_param_0];
	ld.param.u64 	%rd3, [reduce_min_f_param_1];
	ld.param.u32 	%r5, [reduce_min_f_param_2];
	mov.u32 	%r6, %tid.x;
	mov.u32 	%r7, %ctaid.x;
	shl.b32 	%r8, %r7, 1;
	mov.u32 	%r9, %ntid.x;
	mad.lo.s32 	%r32, %r8, %r9, %r6;
	mov.f32 	%f76, 0f7F7FFFFF;
	mov.f32 	%f77, %f76;
	setp.ge.u32	%p1, %r32, %r5;
	@%p1 bra 	BB45_4;

BB45_1:
	mov.f32 	%f1, %f77;
	cvta.to.global.u64 	%rd4, %rd2;
	mul.wide.u32 	%rd5, %r32, 4;
	add.s64 	%rd6, %rd4, %rd5;
	ld.global.f32 	%f30, [%rd6];
	min.f32 	%f78, %f1, %f30;
	add.s32 	%r3, %r32, %r9;
	setp.ge.u32	%p2, %r3, %r5;
	@%p2 bra 	BB45_3;

	mul.wide.u32 	%rd8, %r3, 4;
	add.s64 	%rd9, %rd4, %rd8;
	ld.global.f32 	%f31, [%rd9];
	min.f32 	%f78, %f78, %f31;

BB45_3:
	mov.f32 	%f77, %f78;
	shl.b32 	%r12, %r9, 1;
	mov.u32 	%r13, %nctaid.x;
	mad.lo.s32 	%r32, %r12, %r13, %r32;
	setp.lt.u32	%p3, %r32, %r5;
	mov.f32 	%f76, %f77;
	@%p3 bra 	BB45_1;

BB45_4:
	mov.f32 	%f74, %f76;
	mul.wide.u32 	%rd10, %r6, 4;
	mov.u64 	%rd11, my_sdata;
	add.s64 	%rd1, %rd11, %rd10;
	st.shared.f32 	[%rd1], %f74;
	bar.sync 	0;
	setp.lt.u32	%p4, %r9, 1024;
	@%p4 bra 	BB45_8;

	setp.gt.u32	%p5, %r6, 511;
	mov.f32 	%f75, %f74;
	@%p5 bra 	BB45_7;

	ld.shared.f32 	%f32, [%rd1+2048];
	min.f32 	%f75, %f74, %f32;
	st.shared.f32 	[%rd1], %f75;

BB45_7:
	mov.f32 	%f74, %f75;
	bar.sync 	0;

BB45_8:
	mov.f32 	%f72, %f74;
	setp.lt.u32	%p6, %r9, 512;
	@%p6 bra 	BB45_12;

	setp.gt.u32	%p7, %r6, 255;
	mov.f32 	%f73, %f72;
	@%p7 bra 	BB45_11;

	ld.shared.f32 	%f33, [%rd1+1024];
	min.f32 	%f73, %f72, %f33;
	st.shared.f32 	[%rd1], %f73;

BB45_11:
	mov.f32 	%f72, %f73;
	bar.sync 	0;

BB45_12:
	mov.f32 	%f70, %f72;
	setp.lt.u32	%p8, %r9, 256;
	@%p8 bra 	BB45_16;

	setp.gt.u32	%p9, %r6, 127;
	mov.f32 	%f71, %f70;
	@%p9 bra 	BB45_15;

	ld.shared.f32 	%f34, [%rd1+512];
	min.f32 	%f71, %f70, %f34;
	st.shared.f32 	[%rd1], %f71;

BB45_15:
	mov.f32 	%f70, %f71;
	bar.sync 	0;

BB45_16:
	mov.f32 	%f68, %f70;
	setp.lt.u32	%p10, %r9, 128;
	@%p10 bra 	BB45_20;

	setp.gt.u32	%p11, %r6, 63;
	mov.f32 	%f69, %f68;
	@%p11 bra 	BB45_19;

	ld.shared.f32 	%f35, [%rd1+256];
	min.f32 	%f69, %f68, %f35;
	st.shared.f32 	[%rd1], %f69;

BB45_19:
	mov.f32 	%f68, %f69;
	bar.sync 	0;

BB45_20:
	mov.f32 	%f67, %f68;
	setp.gt.u32	%p12, %r6, 31;
	@%p12 bra 	BB45_33;

	setp.lt.u32	%p13, %r9, 64;
	@%p13 bra 	BB45_23;

	ld.volatile.shared.f32 	%f36, [%rd1+128];
	min.f32 	%f67, %f67, %f36;
	st.volatile.shared.f32 	[%rd1], %f67;

BB45_23:
	mov.f32 	%f66, %f67;
	setp.lt.u32	%p14, %r9, 32;
	@%p14 bra 	BB45_25;

	ld.volatile.shared.f32 	%f37, [%rd1+64];
	min.f32 	%f66, %f66, %f37;
	st.volatile.shared.f32 	[%rd1], %f66;

BB45_25:
	mov.f32 	%f65, %f66;
	setp.lt.u32	%p15, %r9, 16;
	@%p15 bra 	BB45_27;

	ld.volatile.shared.f32 	%f38, [%rd1+32];
	min.f32 	%f65, %f65, %f38;
	st.volatile.shared.f32 	[%rd1], %f65;

BB45_27:
	mov.f32 	%f64, %f65;
	setp.lt.u32	%p16, %r9, 8;
	@%p16 bra 	BB45_29;

	ld.volatile.shared.f32 	%f39, [%rd1+16];
	min.f32 	%f64, %f64, %f39;
	st.volatile.shared.f32 	[%rd1], %f64;

BB45_29:
	mov.f32 	%f63, %f64;
	setp.lt.u32	%p17, %r9, 4;
	@%p17 bra 	BB45_31;

	ld.volatile.shared.f32 	%f40, [%rd1+8];
	min.f32 	%f63, %f63, %f40;
	st.volatile.shared.f32 	[%rd1], %f63;

BB45_31:
	setp.lt.u32	%p18, %r9, 2;
	@%p18 bra 	BB45_33;

	ld.volatile.shared.f32 	%f41, [%rd1+4];
	min.f32 	%f42, %f63, %f41;
	st.volatile.shared.f32 	[%rd1], %f42;

BB45_33:
	setp.ne.s32	%p19, %r6, 0;
	@%p19 bra 	BB45_35;

	ld.shared.f32 	%f43, [my_sdata];
	cvta.to.global.u64 	%rd12, %rd3;
	mul.wide.u32 	%rd13, %r7, 4;
	add.s64 	%rd14, %rd12, %rd13;
	st.global.f32 	[%rd14], %f43;

BB45_35:
	ret;
}

	// .globl	reduce_row_min_d
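// reduce_row_min_d: row-wise minimum via the one-block-per-row tree
// reduction; min.f64, identity +DBL_MAX.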
.visible .entry reduce_row_min_d(
	.param .u64 reduce_row_min_d_param_0,
	.param .u64 reduce_row_min_d_param_1,
	.param .u32 reduce_row_min_d_param_2,
	.param .u32 reduce_row_min_d_param_3
)
{
	.reg .pred 	%p<20>;
	.reg .b32 	%r<39>;
	.reg .f64 	%fd<74>;
	.reg .b64 	%rd<42>;


	ld.param.u64 	%rd1, [reduce_row_min_d_param_0];
	ld.param.u64 	%rd2, [reduce_row_min_d_param_1];
	ld.param.u32 	%r5, [reduce_row_min_d_param_2];
	ld.param.u32 	%r4, [reduce_row_min_d_param_3];
	mov.u32 	%r6, %ctaid.x;
	setp.ge.u32	%p1, %r6, %r5;
	@%p1 bra 	BB46_35;

	mov.u32 	%r38, %tid.x;
	mov.f64 	%fd72, 0d7FEFFFFFFFFFFFFF;
	mov.f64 	%fd73, %fd72;
	setp.ge.u32	%p2, %r38, %r4;
	@%p2 bra 	BB46_4;

	cvta.to.global.u64 	%rd3, %rd1;

BB46_3:
	mad.lo.s32 	%r8, %r6, %r4, %r38;
	mul.wide.u32 	%rd4, %r8, 8;
	add.s64 	%rd5, %rd3, %rd4;
	ld.global.f64 	%fd28, [%rd5];
	min.f64 	%fd73, %fd73, %fd28;
	mov.u32 	%r9, %ntid.x;
	add.s32 	%r38, %r9, %r38;
	setp.lt.u32	%p3, %r38, %r4;
	mov.f64 	%fd72, %fd73;
	@%p3 bra 	BB46_3;

BB46_4:
	mov.f64 	%fd70, %fd72;
	mov.u32 	%r10, %tid.x;
	mul.wide.u32 	%rd6, %r10, 8;
	mov.u64 	%rd7, my_sdata;
	add.s64 	%rd8, %rd7, %rd6;
	st.shared.f64 	[%rd8], %fd70;
	bar.sync 	0;
	mov.u32 	%r11, %ntid.x;
	setp.lt.u32	%p4, %r11, 1024;
	@%p4 bra 	BB46_8;

	setp.gt.u32	%p5, %r10, 511;
	mov.f64 	%fd71, %fd70;
	@%p5 bra 	BB46_7;

	ld.shared.f64 	%fd29, [%rd8+4096];
	min.f64 	%fd71, %fd70, %fd29;
	st.shared.f64 	[%rd8], %fd71;

BB46_7:
	mov.f64 	%fd70, %fd71;
	bar.sync 	0;

BB46_8:
	mov.f64 	%fd68, %fd70;
	setp.lt.u32	%p6, %r11, 512;
	@%p6 bra 	BB46_12;

	setp.gt.u32	%p7, %r10, 255;
	mov.f64 	%fd69, %fd68;
	@%p7 bra 	BB46_11;

	ld.shared.f64 	%fd30, [%rd8+2048];
	min.f64 	%fd69, %fd68, %fd30;
	st.shared.f64 	[%rd8], %fd69;

BB46_11:
	mov.f64 	%fd68, %fd69;
	bar.sync 	0;

BB46_12:
	mov.f64 	%fd66, %fd68;
	setp.lt.u32	%p8, %r11, 256;
	@%p8 bra 	BB46_16;

	setp.gt.u32	%p9, %r10, 127;
	mov.f64 	%fd67, %fd66;
	@%p9 bra 	BB46_15;

	ld.shared.f64 	%fd31, [%rd8+1024];
	min.f64 	%fd67, %fd66, %fd31;
	st.shared.f64 	[%rd8], %fd67;

BB46_15:
	mov.f64 	%fd66, %fd67;
	bar.sync 	0;

BB46_16:
	mov.f64 	%fd64, %fd66;
	setp.lt.u32	%p10, %r11, 128;
	@%p10 bra 	BB46_20;

	setp.gt.u32	%p11, %r10, 63;
	mov.f64 	%fd65, %fd64;
	@%p11 bra 	BB46_19;

	ld.shared.f64 	%fd32, [%rd8+512];
	min.f64 	%fd65, %fd64, %fd32;
	st.shared.f64 	[%rd8], %fd65;

BB46_19:
	mov.f64 	%fd64, %fd65;
	bar.sync 	0;

BB46_20:
	mov.f64 	%fd63, %fd64;
	setp.gt.u32	%p12, %r10, 31;
	@%p12 bra 	BB46_33;

	setp.lt.u32	%p13, %r11, 64;
	@%p13 bra 	BB46_23;

	ld.volatile.shared.f64 	%fd33, [%rd8+256];
	min.f64 	%fd63, %fd63, %fd33;
	st.volatile.shared.f64 	[%rd8], %fd63;

BB46_23:
	mov.f64 	%fd62, %fd63;
	setp.lt.u32	%p14, %r11, 32;
	@%p14 bra 	BB46_25;

	ld.volatile.shared.f64 	%fd34, [%rd8+128];
	min.f64 	%fd62, %fd62, %fd34;
	st.volatile.shared.f64 	[%rd8], %fd62;

BB46_25:
	mov.f64 	%fd61, %fd62;
	setp.lt.u32	%p15, %r11, 16;
	@%p15 bra 	BB46_27;

	ld.volatile.shared.f64 	%fd35, [%rd8+64];
	min.f64 	%fd61, %fd61, %fd35;
	st.volatile.shared.f64 	[%rd8], %fd61;

BB46_27:
	mov.f64 	%fd60, %fd61;
	setp.lt.u32	%p16, %r11, 8;
	@%p16 bra 	BB46_29;

	ld.volatile.shared.f64 	%fd36, [%rd8+32];
	min.f64 	%fd60, %fd60, %fd36;
	st.volatile.shared.f64 	[%rd8], %fd60;

BB46_29:
	mov.f64 	%fd59, %fd60;
	setp.lt.u32	%p17, %r11, 4;
	@%p17 bra 	BB46_31;

	ld.volatile.shared.f64 	%fd37, [%rd8+16];
	min.f64 	%fd59, %fd59, %fd37;
	st.volatile.shared.f64 	[%rd8], %fd59;

BB46_31:
	setp.lt.u32	%p18, %r11, 2;
	@%p18 bra 	BB46_33;

	ld.volatile.shared.f64 	%fd38, [%rd8+8];
	min.f64 	%fd39, %fd59, %fd38;
	st.volatile.shared.f64 	[%rd8], %fd39;

BB46_33:
	setp.ne.s32	%p19, %r10, 0;
	@%p19 bra 	BB46_35;

	ld.shared.f64 	%fd40, [my_sdata];
	cvta.to.global.u64 	%rd39, %rd2;
	mul.wide.u32 	%rd40, %r6, 8;
	add.s64 	%rd41, %rd39, %rd40;
	st.global.f64 	[%rd41], %fd40;

BB46_35:
	ret;
}

	// .globl	reduce_row_min_f
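// reduce_row_min_f: single-precision twin of reduce_row_min_d.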
.visible .entry reduce_row_min_f(
	.param .u64 reduce_row_min_f_param_0,
	.param .u64 reduce_row_min_f_param_1,
	.param .u32 reduce_row_min_f_param_2,
	.param .u32 reduce_row_min_f_param_3
)
{
	.reg .pred 	%p<20>;
	.reg .f32 	%f<74>;
	.reg .b32 	%r<39>;
	.reg .b64 	%rd<42>;


	ld.param.u64 	%rd1, [reduce_row_min_f_param_0];
	ld.param.u64 	%rd2, [reduce_row_min_f_param_1];
	ld.param.u32 	%r5, [reduce_row_min_f_param_2];
	ld.param.u32 	%r4, [reduce_row_min_f_param_3];
	mov.u32 	%r6, %ctaid.x;
	setp.ge.u32	%p1, %r6, %r5;
	@%p1 bra 	BB47_35;

	mov.u32 	%r38, %tid.x;
	mov.f32 	%f72, 0f7F7FFFFF;
	mov.f32 	%f73, %f72;
	setp.ge.u32	%p2, %r38, %r4;
	@%p2 bra 	BB47_4;

	cvta.to.global.u64 	%rd3, %rd1;

BB47_3:
	mad.lo.s32 	%r8, %r6, %r4, %r38;
	mul.wide.u32 	%rd4, %r8, 4;
	add.s64 	%rd5, %rd3, %rd4;
	ld.global.f32 	%f28, [%rd5];
	min.f32 	%f73, %f73, %f28;
	mov.u32 	%r9, %ntid.x;
	add.s32 	%r38, %r9, %r38;
	setp.lt.u32	%p3, %r38, %r4;
	mov.f32 	%f72, %f73;
	@%p3 bra 	BB47_3;

BB47_4:
	mov.f32 	%f70, %f72;
	mov.u32 	%r10, %tid.x;
	mul.wide.u32 	%rd6, %r10, 4;
	mov.u64 	%rd7, my_sdata;
	add.s64 	%rd8, %rd7, %rd6;
	st.shared.f32 	[%rd8], %f70;
	bar.sync 	0;
	mov.u32 	%r11, %ntid.x;
	setp.lt.u32	%p4, %r11, 1024;
	@%p4 bra 	BB47_8;

	setp.gt.u32	%p5, %r10, 511;
	mov.f32 	%f71, %f70;
	@%p5 bra 	BB47_7;

	ld.shared.f32 	%f29, [%rd8+2048];
	min.f32 	%f71, %f70, %f29;
	st.shared.f32 	[%rd8], %f71;

BB47_7:
	mov.f32 	%f70, %f71;
	bar.sync 	0;

BB47_8:
	mov.f32 	%f68, %f70;
	setp.lt.u32	%p6, %r11, 512;
	@%p6 bra 	BB47_12;

	setp.gt.u32	%p7, %r10, 255;
	mov.f32 	%f69, %f68;
	@%p7 bra 	BB47_11;

	ld.shared.f32 	%f30, [%rd8+1024];
	min.f32 	%f69, %f68, %f30;
	st.shared.f32 	[%rd8], %f69;

BB47_11:
	mov.f32 	%f68, %f69;
	bar.sync 	0;

BB47_12:
	mov.f32 	%f66, %f68;
	setp.lt.u32	%p8, %r11, 256;
	@%p8 bra 	BB47_16;

	setp.gt.u32	%p9, %r10, 127;
	mov.f32 	%f67, %f66;
	@%p9 bra 	BB47_15;

	ld.shared.f32 	%f31, [%rd8+512];
	min.f32 	%f67, %f66, %f31;
	st.shared.f32 	[%rd8], %f67;

BB47_15:
	mov.f32 	%f66, %f67;
	bar.sync 	0;

BB47_16:
	mov.f32 	%f64, %f66;
	setp.lt.u32	%p10, %r11, 128;
	@%p10 bra 	BB47_20;

	setp.gt.u32	%p11, %r10, 63;
	mov.f32 	%f65, %f64;
	@%p11 bra 	BB47_19;

	ld.shared.f32 	%f32, [%rd8+256];
	min.f32 	%f65, %f64, %f32;
	st.shared.f32 	[%rd8], %f65;

BB47_19:
	mov.f32 	%f64, %f65;
	bar.sync 	0;

BB47_20:
	mov.f32 	%f63, %f64;
	setp.gt.u32	%p12, %r10, 31;
	@%p12 bra 	BB47_33;

	setp.lt.u32	%p13, %r11, 64;
	@%p13 bra 	BB47_23;

	ld.volatile.shared.f32 	%f33, [%rd8+128];
	min.f32 	%f63, %f63, %f33;
	st.volatile.shared.f32 	[%rd8], %f63;

BB47_23:
	mov.f32 	%f62, %f63;
	setp.lt.u32	%p14, %r11, 32;
	@%p14 bra 	BB47_25;

	ld.volatile.shared.f32 	%f34, [%rd8+64];
	min.f32 	%f62, %f62, %f34;
	st.volatile.shared.f32 	[%rd8], %f62;

BB47_25:
	mov.f32 	%f61, %f62;
	setp.lt.u32	%p15, %r11, 16;
	@%p15 bra 	BB47_27;

	ld.volatile.shared.f32 	%f35, [%rd8+32];
	min.f32 	%f61, %f61, %f35;
	st.volatile.shared.f32 	[%rd8], %f61;

BB47_27:
	mov.f32 	%f60, %f61;
	setp.lt.u32	%p16, %r11, 8;
	@%p16 bra 	BB47_29;

	ld.volatile.shared.f32 	%f36, [%rd8+16];
	min.f32 	%f60, %f60, %f36;
	st.volatile.shared.f32 	[%rd8], %f60;

BB47_29:
	mov.f32 	%f59, %f60;
	setp.lt.u32	%p17, %r11, 4;
	@%p17 bra 	BB47_31;

	ld.volatile.shared.f32 	%f37, [%rd8+8];
	min.f32 	%f59, %f59, %f37;
	st.volatile.shared.f32 	[%rd8], %f59;

BB47_31:
	setp.lt.u32	%p18, %r11, 2;
	@%p18 bra 	BB47_33;

	ld.volatile.shared.f32 	%f38, [%rd8+4];
	min.f32 	%f39, %f59, %f38;
	st.volatile.shared.f32 	[%rd8], %f39;

BB47_33:
	setp.ne.s32	%p19, %r10, 0;
	@%p19 bra 	BB47_35;

	ld.shared.f32 	%f40, [my_sdata];
	cvta.to.global.u64 	%rd39, %rd2;
	mul.wide.u32 	%rd40, %r6, 4;
	add.s64 	%rd41, %rd39, %rd40;
	st.global.f32 	[%rd41], %f40;

BB47_35:
	ret;
}

	// .globl	reduce_col_min_d
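// reduce_col_min_d: column-wise minimum; per-thread column walk with
// min.f64, identity +DBL_MAX.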
.visible .entry reduce_col_min_d(
	.param .u64 reduce_col_min_d_param_0,
	.param .u64 reduce_col_min_d_param_1,
	.param .u32 reduce_col_min_d_param_2,
	.param .u32 reduce_col_min_d_param_3
)
{
	.reg .pred 	%p<4>;
	.reg .b32 	%r<11>;
	.reg .f64 	%fd<10>;
	.reg .b64 	%rd<9>;


	ld.param.u64 	%rd2, [reduce_col_min_d_param_0];
	ld.param.u64 	%rd3, [reduce_col_min_d_param_1];
	ld.param.u32 	%r5, [reduce_col_min_d_param_2];
	ld.param.u32 	%r6, [reduce_col_min_d_param_3];
	mov.u32 	%r7, %ntid.x;
	mov.u32 	%r8, %ctaid.x;
	mov.u32 	%r9, %tid.x;
	mad.lo.s32 	%r1, %r7, %r8, %r9;
	setp.ge.u32	%p1, %r1, %r6;
	@%p1 bra 	BB48_5;

	cvta.to.global.u64 	%rd1, %rd2;
	mul.lo.s32 	%r2, %r6, %r5;
	mov.f64 	%fd8, 0d7FEFFFFFFFFFFFFF;
	mov.f64 	%fd9, %fd8;
	setp.ge.u32	%p2, %r1, %r2;
	@%p2 bra 	BB48_4;

	mov.u32 	%r10, %r1;

BB48_3:
	mov.u32 	%r3, %r10;
	mul.wide.u32 	%rd4, %r3, 8;
	add.s64 	%rd5, %rd1, %rd4;
	ld.global.f64 	%fd6, [%rd5];
	min.f64 	%fd9, %fd9, %fd6;
	add.s32 	%r4, %r3, %r6;
	setp.lt.u32	%p3, %r4, %r2;
	mov.u32 	%r10, %r4;
	mov.f64 	%fd8, %fd9;
	@%p3 bra 	BB48_3;

BB48_4:
	cvta.to.global.u64 	%rd6, %rd3;
	mul.wide.u32 	%rd7, %r1, 8;
	add.s64 	%rd8, %rd6, %rd7;
	st.global.f64 	[%rd8], %fd8;

BB48_5:
	ret;
}

	// .globl	reduce_col_min_f
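// reduce_col_min_f: single-precision twin of reduce_col_min_d.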
.visible .entry reduce_col_min_f(
	.param .u64 reduce_col_min_f_param_0,
	.param .u64 reduce_col_min_f_param_1,
	.param .u32 reduce_col_min_f_param_2,
	.param .u32 reduce_col_min_f_param_3
)
{
	.reg .pred 	%p<4>;
	.reg .f32 	%f<10>;
	.reg .b32 	%r<11>;
	.reg .b64 	%rd<9>;


	ld.param.u64 	%rd2, [reduce_col_min_f_param_0];
	ld.param.u64 	%rd3, [reduce_col_min_f_param_1];
	ld.param.u32 	%r5, [reduce_col_min_f_param_2];
	ld.param.u32 	%r6, [reduce_col_min_f_param_3];
	mov.u32 	%r7, %ntid.x;
	mov.u32 	%r8, %ctaid.x;
	mov.u32 	%r9, %tid.x;
	mad.lo.s32 	%r1, %r7, %r8, %r9;
	setp.ge.u32	%p1, %r1, %r6;
	@%p1 bra 	BB49_5;

	cvta.to.global.u64 	%rd1, %rd2;
	mul.lo.s32 	%r2, %r6, %r5;
	mov.f32 	%f8, 0f7F7FFFFF;
	mov.f32 	%f9, %f8;
	setp.ge.u32	%p2, %r1, %r2;
	@%p2 bra 	BB49_4;

	mov.u32 	%r10, %r1;

BB49_3:
	mov.u32 	%r3, %r10;
	mul.wide.u32 	%rd4, %r3, 4;
	add.s64 	%rd5, %rd1, %rd4;
	ld.global.f32 	%f6, [%rd5];
	min.f32 	%f9, %f9, %f6;
	add.s32 	%r4, %r3, %r6;
	setp.lt.u32	%p3, %r4, %r2;
	mov.u32 	%r10, %r4;
	mov.f32 	%f8, %f9;
	@%p3 bra 	BB49_3;

BB49_4:
	cvta.to.global.u64 	%rd6, %rd3;
	mul.wide.u32 	%rd7, %r1, 4;
	add.s64 	%rd8, %rd6, %rd7;
	st.global.f32 	[%rd8], %f8;

BB49_5:
	ret;
}

	// .globl	reduce_prod_d
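// reduce_prod_d: full product reduction; reduce_max_d's skeleton with
// mul.f64 and identity 1.0 (0d3FF0000000000000). As with the other full
// reductions, each block emits only a partial product.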
.visible .entry reduce_prod_d(
	.param .u64 reduce_prod_d_param_0,
	.param .u64 reduce_prod_d_param_1,
	.param .u32 reduce_prod_d_param_2
)
{
	.reg .pred 	%p<20>;
	.reg .b32 	%r<33>;
	.reg .f64 	%fd<79>;
	.reg .b64 	%rd<15>;


	ld.param.u64 	%rd2, [reduce_prod_d_param_0];
	ld.param.u64 	%rd3, [reduce_prod_d_param_1];
	ld.param.u32 	%r5, [reduce_prod_d_param_2];
	mov.u32 	%r6, %tid.x;
	mov.u32 	%r7, %ctaid.x;
	shl.b32 	%r8, %r7, 1;
	mov.u32 	%r9, %ntid.x;
	mad.lo.s32 	%r32, %r8, %r9, %r6;
	mov.f64 	%fd76, 0d3FF0000000000000;
	mov.f64 	%fd77, %fd76;
	setp.ge.u32	%p1, %r32, %r5;
	@%p1 bra 	BB50_4;

BB50_1:
	mov.f64 	%fd1, %fd77;
	cvta.to.global.u64 	%rd4, %rd2;
	mul.wide.u32 	%rd5, %r32, 8;
	add.s64 	%rd6, %rd4, %rd5;
	ld.global.f64 	%fd30, [%rd6];
	mul.f64 	%fd78, %fd1, %fd30;
	add.s32 	%r3, %r32, %r9;
	setp.ge.u32	%p2, %r3, %r5;
	@%p2 bra 	BB50_3;

	mul.wide.u32 	%rd8, %r3, 8;
	add.s64 	%rd9, %rd4, %rd8;
	ld.global.f64 	%fd31, [%rd9];
	mul.f64 	%fd78, %fd78, %fd31;

BB50_3:
	mov.f64 	%fd77, %fd78;
	shl.b32 	%r12, %r9, 1;
	mov.u32 	%r13, %nctaid.x;
	mad.lo.s32 	%r32, %r12, %r13, %r32;
	setp.lt.u32	%p3, %r32, %r5;
	mov.f64 	%fd76, %fd77;
	@%p3 bra 	BB50_1;

BB50_4:
	mov.f64 	%fd74, %fd76;
	mul.wide.u32 	%rd10, %r6, 8;
	mov.u64 	%rd11, my_sdata;
	add.s64 	%rd1, %rd11, %rd10;
	st.shared.f64 	[%rd1], %fd74;
	bar.sync 	0;
	setp.lt.u32	%p4, %r9, 1024;
	@%p4 bra 	BB50_8;

	setp.gt.u32	%p5, %r6, 511;
	mov.f64 	%fd75, %fd74;
	@%p5 bra 	BB50_7;

	ld.shared.f64 	%fd32, [%rd1+4096];
	mul.f64 	%fd75, %fd74, %fd32;
	st.shared.f64 	[%rd1], %fd75;

BB50_7:
	mov.f64 	%fd74, %fd75;
	bar.sync 	0;

BB50_8:
	mov.f64 	%fd72, %fd74;
	setp.lt.u32	%p6, %r9, 512;
	@%p6 bra 	BB50_12;

	setp.gt.u32	%p7, %r6, 255;
	mov.f64 	%fd73, %fd72;
	@%p7 bra 	BB50_11;

	ld.shared.f64 	%fd33, [%rd1+2048];
	mul.f64 	%fd73, %fd72, %fd33;
	st.shared.f64 	[%rd1], %fd73;

BB50_11:
	mov.f64 	%fd72, %fd73;
	bar.sync 	0;

BB50_12:
	mov.f64 	%fd70, %fd72;
	setp.lt.u32	%p8, %r9, 256;
	@%p8 bra 	BB50_16;

	setp.gt.u32	%p9, %r6, 127;
	mov.f64 	%fd71, %fd70;
	@%p9 bra 	BB50_15;

	ld.shared.f64 	%fd34, [%rd1+1024];
	mul.f64 	%fd71, %fd70, %fd34;
	st.shared.f64 	[%rd1], %fd71;

BB50_15:
	mov.f64 	%fd70, %fd71;
	bar.sync 	0;

BB50_16:
	mov.f64 	%fd68, %fd70;
	setp.lt.u32	%p10, %r9, 128;
	@%p10 bra 	BB50_20;

	setp.gt.u32	%p11, %r6, 63;
	mov.f64 	%fd69, %fd68;
	@%p11 bra 	BB50_19;

	ld.shared.f64 	%fd35, [%rd1+512];
	mul.f64 	%fd69, %fd68, %fd35;
	st.shared.f64 	[%rd1], %fd69;

BB50_19:
	mov.f64 	%fd68, %fd69;
	bar.sync 	0;

BB50_20:
	mov.f64 	%fd67, %fd68;
	setp.gt.u32	%p12, %r6, 31;
	@%p12 bra 	BB50_33;

	setp.lt.u32	%p13, %r9, 64;
	@%p13 bra 	BB50_23;

	ld.volatile.shared.f64 	%fd36, [%rd1+256];
	mul.f64 	%fd67, %fd67, %fd36;
	st.volatile.shared.f64 	[%rd1], %fd67;

BB50_23:
	mov.f64 	%fd66, %fd67;
	setp.lt.u32	%p14, %r9, 32;
	@%p14 bra 	BB50_25;

	ld.volatile.shared.f64 	%fd37, [%rd1+128];
	mul.f64 	%fd66, %fd66, %fd37;
	st.volatile.shared.f64 	[%rd1], %fd66;

BB50_25:
	mov.f64 	%fd65, %fd66;
	setp.lt.u32	%p15, %r9, 16;
	@%p15 bra 	BB50_27;

	ld.volatile.shared.f64 	%fd38, [%rd1+64];
	mul.f64 	%fd65, %fd65, %fd38;
	st.volatile.shared.f64 	[%rd1], %fd65;

BB50_27:
	mov.f64 	%fd64, %fd65;
	setp.lt.u32	%p16, %r9, 8;
	@%p16 bra 	BB50_29;

	ld.volatile.shared.f64 	%fd39, [%rd1+32];
	mul.f64 	%fd64, %fd64, %fd39;
	st.volatile.shared.f64 	[%rd1], %fd64;

BB50_29:
	mov.f64 	%fd63, %fd64;
	setp.lt.u32	%p17, %r9, 4;
	@%p17 bra 	BB50_31;

	ld.volatile.shared.f64 	%fd40, [%rd1+16];
	mul.f64 	%fd63, %fd63, %fd40;
	st.volatile.shared.f64 	[%rd1], %fd63;

BB50_31:
	setp.lt.u32	%p18, %r9, 2;
	@%p18 bra 	BB50_33;

	ld.volatile.shared.f64 	%fd41, [%rd1+8];
	mul.f64 	%fd42, %fd63, %fd41;
	st.volatile.shared.f64 	[%rd1], %fd42;

BB50_33:
	setp.ne.s32	%p19, %r6, 0;
	@%p19 bra 	BB50_35;

	ld.shared.f64 	%fd43, [my_sdata];
	cvta.to.global.u64 	%rd12, %rd3;
	mul.wide.u32 	%rd13, %r7, 8;
	add.s64 	%rd14, %rd12, %rd13;
	st.global.f64 	[%rd14], %fd43;

BB50_35:
	ret;
}

	// .globl	reduce_prod_f
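// reduce_prod_f: single-precision twin of reduce_prod_d
// (identity 1.0f = 0f3F800000).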
.visible .entry reduce_prod_f(
	.param .u64 reduce_prod_f_param_0,
	.param .u64 reduce_prod_f_param_1,
	.param .u32 reduce_prod_f_param_2
)
{
	.reg .pred 	%p<20>;
	.reg .f32 	%f<79>;
	.reg .b32 	%r<33>;
	.reg .b64 	%rd<15>;


	ld.param.u64 	%rd2, [reduce_prod_f_param_0];
	ld.param.u64 	%rd3, [reduce_prod_f_param_1];
	ld.param.u32 	%r5, [reduce_prod_f_param_2];
	mov.u32 	%r6, %tid.x;
	mov.u32 	%r7, %ctaid.x;
	shl.b32 	%r8, %r7, 1;
	mov.u32 	%r9, %ntid.x;
	mad.lo.s32 	%r32, %r8, %r9, %r6;
	mov.f32 	%f76, 0f3F800000;
	mov.f32 	%f77, %f76;
	setp.ge.u32	%p1, %r32, %r5;
	@%p1 bra 	BB51_4;

BB51_1:
	mov.f32 	%f1, %f77;
	cvta.to.global.u64 	%rd4, %rd2;
	mul.wide.u32 	%rd5, %r32, 4;
	add.s64 	%rd6, %rd4, %rd5;
	ld.global.f32 	%f30, [%rd6];
	mul.f32 	%f78, %f1, %f30;
	add.s32 	%r3, %r32, %r9;
	setp.ge.u32	%p2, %r3, %r5;
	@%p2 bra 	BB51_3;

	mul.wide.u32 	%rd8, %r3, 4;
	add.s64 	%rd9, %rd4, %rd8;
	ld.global.f32 	%f31, [%rd9];
	mul.f32 	%f78, %f78, %f31;

BB51_3:
	mov.f32 	%f77, %f78;
	shl.b32 	%r12, %r9, 1;
	mov.u32 	%r13, %nctaid.x;
	mad.lo.s32 	%r32, %r12, %r13, %r32;
	setp.lt.u32	%p3, %r32, %r5;
	mov.f32 	%f76, %f77;
	@%p3 bra 	BB51_1;

BB51_4:
	mov.f32 	%f74, %f76;
	mul.wide.u32 	%rd10, %r6, 4;
	mov.u64 	%rd11, my_sdata;
	add.s64 	%rd1, %rd11, %rd10;
	st.shared.f32 	[%rd1], %f74;
	bar.sync 	0;
	setp.lt.u32	%p4, %r9, 1024;
	@%p4 bra 	BB51_8;

	setp.gt.u32	%p5, %r6, 511;
	mov.f32 	%f75, %f74;
	@%p5 bra 	BB51_7;

	ld.shared.f32 	%f32, [%rd1+2048];
	mul.f32 	%f75, %f74, %f32;
	st.shared.f32 	[%rd1], %f75;

BB51_7:
	mov.f32 	%f74, %f75;
	bar.sync 	0;

BB51_8:
	mov.f32 	%f72, %f74;
	setp.lt.u32	%p6, %r9, 512;
	@%p6 bra 	BB51_12;

	setp.gt.u32	%p7, %r6, 255;
	mov.f32 	%f73, %f72;
	@%p7 bra 	BB51_11;

	ld.shared.f32 	%f33, [%rd1+1024];
	mul.f32 	%f73, %f72, %f33;
	st.shared.f32 	[%rd1], %f73;

BB51_11:
	mov.f32 	%f72, %f73;
	bar.sync 	0;

BB51_12:
	mov.f32 	%f70, %f72;
	setp.lt.u32	%p8, %r9, 256;
	@%p8 bra 	BB51_16;

	setp.gt.u32	%p9, %r6, 127;
	mov.f32 	%f71, %f70;
	@%p9 bra 	BB51_15;

	ld.shared.f32 	%f34, [%rd1+512];
	mul.f32 	%f71, %f70, %f34;
	st.shared.f32 	[%rd1], %f71;

BB51_15:
	mov.f32 	%f70, %f71;
	bar.sync 	0;

BB51_16:
	mov.f32 	%f68, %f70;
	setp.lt.u32	%p10, %r9, 128;
	@%p10 bra 	BB51_20;

	setp.gt.u32	%p11, %r6, 63;
	mov.f32 	%f69, %f68;
	@%p11 bra 	BB51_19;

	ld.shared.f32 	%f35, [%rd1+256];
	mul.f32 	%f69, %f68, %f35;
	st.shared.f32 	[%rd1], %f69;

BB51_19:
	mov.f32 	%f68, %f69;
	bar.sync 	0;

BB51_20:
	mov.f32 	%f67, %f68;
	setp.gt.u32	%p12, %r6, 31;
	@%p12 bra 	BB51_33;

	setp.lt.u32	%p13, %r9, 64;
	@%p13 bra 	BB51_23;

	ld.volatile.shared.f32 	%f36, [%rd1+128];
	mul.f32 	%f67, %f67, %f36;
	st.volatile.shared.f32 	[%rd1], %f67;

BB51_23:
	mov.f32 	%f66, %f67;
	setp.lt.u32	%p14, %r9, 32;
	@%p14 bra 	BB51_25;

	ld.volatile.shared.f32 	%f37, [%rd1+64];
	mul.f32 	%f66, %f66, %f37;
	st.volatile.shared.f32 	[%rd1], %f66;

BB51_25:
	mov.f32 	%f65, %f66;
	setp.lt.u32	%p15, %r9, 16;
	@%p15 bra 	BB51_27;

	ld.volatile.shared.f32 	%f38, [%rd1+32];
	mul.f32 	%f65, %f65, %f38;
	st.volatile.shared.f32 	[%rd1], %f65;

BB51_27:
	mov.f32 	%f64, %f65;
	setp.lt.u32	%p16, %r9, 8;
	@%p16 bra 	BB51_29;

	ld.volatile.shared.f32 	%f39, [%rd1+16];
	mul.f32 	%f64, %f64, %f39;
	st.volatile.shared.f32 	[%rd1], %f64;

BB51_29:
	mov.f32 	%f63, %f64;
	setp.lt.u32	%p17, %r9, 4;
	@%p17 bra 	BB51_31;

	ld.volatile.shared.f32 	%f40, [%rd1+8];
	mul.f32 	%f63, %f63, %f40;
	st.volatile.shared.f32 	[%rd1], %f63;

BB51_31:
	setp.lt.u32	%p18, %r9, 2;
	@%p18 bra 	BB51_33;

	ld.volatile.shared.f32 	%f41, [%rd1+4];
	mul.f32 	%f42, %f63, %f41;
	st.volatile.shared.f32 	[%rd1], %f42;

BB51_33:
	setp.ne.s32	%p19, %r6, 0;
	@%p19 bra 	BB51_35;

	ld.shared.f32 	%f43, [my_sdata];
	cvta.to.global.u64 	%rd12, %rd3;
	mul.wide.u32 	%rd13, %r7, 4;
	add.s64 	%rd14, %rd12, %rd13;
	st.global.f32 	[%rd14], %f43;

BB51_35:
	ret;
}

	// .globl	reduce_row_mean_d
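// reduce_row_mean_d: row-wise mean; identical to the double row-sum
// reduction except that thread 0 converts the column count to f64
// (cvt.u64.u32 / cvt.rn.f64.s64) and divides the row total by it
// (div.rn.f64) before the global store.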
.visible .entry reduce_row_mean_d(
	.param .u64 reduce_row_mean_d_param_0,
	.param .u64 reduce_row_mean_d_param_1,
	.param .u32 reduce_row_mean_d_param_2,
	.param .u32 reduce_row_mean_d_param_3
)
{
	.reg .pred 	%p<20>;
	.reg .b32 	%r<39>;
	.reg .f64 	%fd<76>;
	.reg .b64 	%rd<43>;


	ld.param.u64 	%rd1, [reduce_row_mean_d_param_0];
	ld.param.u64 	%rd2, [reduce_row_mean_d_param_1];
	ld.param.u32 	%r5, [reduce_row_mean_d_param_2];
	ld.param.u32 	%r4, [reduce_row_mean_d_param_3];
	mov.u32 	%r6, %ctaid.x;
	setp.ge.u32	%p1, %r6, %r5;
	@%p1 bra 	BB52_35;

	mov.u32 	%r38, %tid.x;
	mov.f64 	%fd74, 0d0000000000000000;
	mov.f64 	%fd75, %fd74;
	setp.ge.u32	%p2, %r38, %r4;
	@%p2 bra 	BB52_4;

	cvta.to.global.u64 	%rd3, %rd1;

BB52_3:
	mad.lo.s32 	%r8, %r6, %r4, %r38;
	mul.wide.u32 	%rd4, %r8, 8;
	add.s64 	%rd5, %rd3, %rd4;
	ld.global.f64 	%fd28, [%rd5];
	add.f64 	%fd75, %fd75, %fd28;
	mov.u32 	%r9, %ntid.x;
	add.s32 	%r38, %r9, %r38;
	setp.lt.u32	%p3, %r38, %r4;
	mov.f64 	%fd74, %fd75;
	@%p3 bra 	BB52_3;

BB52_4:
	mov.f64 	%fd72, %fd74;
	mov.u32 	%r10, %tid.x;
	mul.wide.u32 	%rd6, %r10, 8;
	mov.u64 	%rd7, my_sdata;
	add.s64 	%rd8, %rd7, %rd6;
	st.shared.f64 	[%rd8], %fd72;
	bar.sync 	0;
	mov.u32 	%r11, %ntid.x;
	setp.lt.u32	%p4, %r11, 1024;
	@%p4 bra 	BB52_8;

	setp.gt.u32	%p5, %r10, 511;
	mov.f64 	%fd73, %fd72;
	@%p5 bra 	BB52_7;

	ld.shared.f64 	%fd29, [%rd8+4096];
	add.f64 	%fd73, %fd72, %fd29;
	st.shared.f64 	[%rd8], %fd73;

BB52_7:
	mov.f64 	%fd72, %fd73;
	bar.sync 	0;

BB52_8:
	mov.f64 	%fd70, %fd72;
	setp.lt.u32	%p6, %r11, 512;
	@%p6 bra 	BB52_12;

	setp.gt.u32	%p7, %r10, 255;
	mov.f64 	%fd71, %fd70;
	@%p7 bra 	BB52_11;

	ld.shared.f64 	%fd30, [%rd8+2048];
	add.f64 	%fd71, %fd70, %fd30;
	st.shared.f64 	[%rd8], %fd71;

BB52_11:
	mov.f64 	%fd70, %fd71;
	bar.sync 	0;

BB52_12:
	mov.f64 	%fd68, %fd70;
	setp.lt.u32	%p8, %r11, 256;
	@%p8 bra 	BB52_16;

	setp.gt.u32	%p9, %r10, 127;
	mov.f64 	%fd69, %fd68;
	@%p9 bra 	BB52_15;

	ld.shared.f64 	%fd31, [%rd8+1024];
	add.f64 	%fd69, %fd68, %fd31;
	st.shared.f64 	[%rd8], %fd69;

BB52_15:
	mov.f64 	%fd68, %fd69;
	bar.sync 	0;

BB52_16:
	mov.f64 	%fd66, %fd68;
	setp.lt.u32	%p10, %r11, 128;
	@%p10 bra 	BB52_20;

	setp.gt.u32	%p11, %r10, 63;
	mov.f64 	%fd67, %fd66;
	@%p11 bra 	BB52_19;

	ld.shared.f64 	%fd32, [%rd8+512];
	add.f64 	%fd67, %fd66, %fd32;
	st.shared.f64 	[%rd8], %fd67;

BB52_19:
	mov.f64 	%fd66, %fd67;
	bar.sync 	0;

BB52_20:
	mov.f64 	%fd65, %fd66;
	setp.gt.u32	%p12, %r10, 31;
	@%p12 bra 	BB52_33;

	setp.lt.u32	%p13, %r11, 64;
	@%p13 bra 	BB52_23;

	ld.volatile.shared.f64 	%fd33, [%rd8+256];
	add.f64 	%fd65, %fd65, %fd33;
	st.volatile.shared.f64 	[%rd8], %fd65;

BB52_23:
	mov.f64 	%fd64, %fd65;
	setp.lt.u32	%p14, %r11, 32;
	@%p14 bra 	BB52_25;

	ld.volatile.shared.f64 	%fd34, [%rd8+128];
	add.f64 	%fd64, %fd64, %fd34;
	st.volatile.shared.f64 	[%rd8], %fd64;

BB52_25:
	mov.f64 	%fd63, %fd64;
	setp.lt.u32	%p15, %r11, 16;
	@%p15 bra 	BB52_27;

	ld.volatile.shared.f64 	%fd35, [%rd8+64];
	add.f64 	%fd63, %fd63, %fd35;
	st.volatile.shared.f64 	[%rd8], %fd63;

BB52_27:
	mov.f64 	%fd62, %fd63;
	setp.lt.u32	%p16, %r11, 8;
	@%p16 bra 	BB52_29;

	ld.volatile.shared.f64 	%fd36, [%rd8+32];
	add.f64 	%fd62, %fd62, %fd36;
	st.volatile.shared.f64 	[%rd8], %fd62;

BB52_29:
	mov.f64 	%fd61, %fd62;
	setp.lt.u32	%p17, %r11, 4;
	@%p17 bra 	BB52_31;

	ld.volatile.shared.f64 	%fd37, [%rd8+16];
	add.f64 	%fd61, %fd61, %fd37;
	st.volatile.shared.f64 	[%rd8], %fd61;

BB52_31:
	setp.lt.u32	%p18, %r11, 2;
	@%p18 bra 	BB52_33;

	ld.volatile.shared.f64 	%fd38, [%rd8+8];
	add.f64 	%fd39, %fd61, %fd38;
	st.volatile.shared.f64 	[%rd8], %fd39;

BB52_33:
	setp.ne.s32	%p19, %r10, 0;
	@%p19 bra 	BB52_35;

	ld.shared.f64 	%fd40, [my_sdata];
	cvt.u64.u32	%rd39, %r4;
	cvt.rn.f64.s64	%fd41, %rd39;
	div.rn.f64 	%fd42, %fd40, %fd41;
	cvta.to.global.u64 	%rd40, %rd2;
	mul.wide.u32 	%rd41, %r6, 8;
	add.s64 	%rd42, %rd40, %rd41;
	st.global.f64 	[%rd42], %fd42;

BB52_35:
	ret;
}

	// .globl	reduce_row_mean_f
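// reduce_row_mean_f: single-precision twin of reduce_row_mean_d
// (div.rn.f32 by the column count).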
.visible .entry reduce_row_mean_f(
	.param .u64 reduce_row_mean_f_param_0,
	.param .u64 reduce_row_mean_f_param_1,
	.param .u32 reduce_row_mean_f_param_2,
	.param .u32 reduce_row_mean_f_param_3
)
{
	.reg .pred 	%p<20>;
	.reg .f32 	%f<76>;
	.reg .b32 	%r<39>;
	.reg .b64 	%rd<43>;


	ld.param.u64 	%rd1, [reduce_row_mean_f_param_0];
	ld.param.u64 	%rd2, [reduce_row_mean_f_param_1];
	ld.param.u32 	%r5, [reduce_row_mean_f_param_2];
	ld.param.u32 	%r4, [reduce_row_mean_f_param_3];
	mov.u32 	%r6, %ctaid.x;
	setp.ge.u32	%p1, %r6, %r5;
	@%p1 bra 	BB53_35;

	mov.u32 	%r38, %tid.x;
	mov.f32 	%f74, 0f00000000;
	mov.f32 	%f75, %f74;
	setp.ge.u32	%p2, %r38, %r4;
	@%p2 bra 	BB53_4;

	cvta.to.global.u64 	%rd3, %rd1;

BB53_3:
	mad.lo.s32 	%r8, %r6, %r4, %r38;
	mul.wide.u32 	%rd4, %r8, 4;
	add.s64 	%rd5, %rd3, %rd4;
	ld.global.f32 	%f28, [%rd5];
	add.f32 	%f75, %f75, %f28;
	mov.u32 	%r9, %ntid.x;
	add.s32 	%r38, %r9, %r38;
	setp.lt.u32	%p3, %r38, %r4;
	mov.f32 	%f74, %f75;
	@%p3 bra 	BB53_3;

BB53_4:
	mov.f32 	%f72, %f74;
	mov.u32 	%r10, %tid.x;
	mul.wide.u32 	%rd6, %r10, 4;
	mov.u64 	%rd7, my_sdata;
	add.s64 	%rd8, %rd7, %rd6;
	st.shared.f32 	[%rd8], %f72;
	bar.sync 	0;
	mov.u32 	%r11, %ntid.x;
	setp.lt.u32	%p4, %r11, 1024;
	@%p4 bra 	BB53_8;

	setp.gt.u32	%p5, %r10, 511;
	mov.f32 	%f73, %f72;
	@%p5 bra 	BB53_7;

	ld.shared.f32 	%f29, [%rd8+2048];
	add.f32 	%f73, %f72, %f29;
	st.shared.f32 	[%rd8], %f73;

BB53_7:
	mov.f32 	%f72, %f73;
	bar.sync 	0;

BB53_8:
	mov.f32 	%f70, %f72;
	setp.lt.u32	%p6, %r11, 512;
	@%p6 bra 	BB53_12;

	setp.gt.u32	%p7, %r10, 255;
	mov.f32 	%f71, %f70;
	@%p7 bra 	BB53_11;

	ld.shared.f32 	%f30, [%rd8+1024];
	add.f32 	%f71, %f70, %f30;
	st.shared.f32 	[%rd8], %f71;

BB53_11:
	mov.f32 	%f70, %f71;
	bar.sync 	0;

BB53_12:
	mov.f32 	%f68, %f70;
	setp.lt.u32	%p8, %r11, 256;
	@%p8 bra 	BB53_16;

	setp.gt.u32	%p9, %r10, 127;
	mov.f32 	%f69, %f68;
	@%p9 bra 	BB53_15;

	ld.shared.f32 	%f31, [%rd8+512];
	add.f32 	%f69, %f68, %f31;
	st.shared.f32 	[%rd8], %f69;

BB53_15:
	mov.f32 	%f68, %f69;
	bar.sync 	0;

BB53_16:
	mov.f32 	%f66, %f68;
	setp.lt.u32	%p10, %r11, 128;
	@%p10 bra 	BB53_20;

	setp.gt.u32	%p11, %r10, 63;
	mov.f32 	%f67, %f66;
	@%p11 bra 	BB53_19;

	ld.shared.f32 	%f32, [%rd8+256];
	add.f32 	%f67, %f66, %f32;
	st.shared.f32 	[%rd8], %f67;

BB53_19:
	mov.f32 	%f66, %f67;
	bar.sync 	0;

BB53_20:
	mov.f32 	%f65, %f66;
	setp.gt.u32	%p12, %r10, 31;
	@%p12 bra 	BB53_33;

	setp.lt.u32	%p13, %r11, 64;
	@%p13 bra 	BB53_23;

	ld.volatile.shared.f32 	%f33, [%rd8+128];
	add.f32 	%f65, %f65, %f33;
	st.volatile.shared.f32 	[%rd8], %f65;

BB53_23:
	mov.f32 	%f64, %f65;
	setp.lt.u32	%p14, %r11, 32;
	@%p14 bra 	BB53_25;

	ld.volatile.shared.f32 	%f34, [%rd8+64];
	add.f32 	%f64, %f64, %f34;
	st.volatile.shared.f32 	[%rd8], %f64;

BB53_25:
	mov.f32 	%f63, %f64;
	setp.lt.u32	%p15, %r11, 16;
	@%p15 bra 	BB53_27;

	ld.volatile.shared.f32 	%f35, [%rd8+32];
	add.f32 	%f63, %f63, %f35;
	st.volatile.shared.f32 	[%rd8], %f63;

BB53_27:
	mov.f32 	%f62, %f63;
	setp.lt.u32	%p16, %r11, 8;
	@%p16 bra 	BB53_29;

	ld.volatile.shared.f32 	%f36, [%rd8+16];
	add.f32 	%f62, %f62, %f36;
	st.volatile.shared.f32 	[%rd8], %f62;

BB53_29:
	mov.f32 	%f61, %f62;
	setp.lt.u32	%p17, %r11, 4;
	@%p17 bra 	BB53_31;

	ld.volatile.shared.f32 	%f37, [%rd8+8];
	add.f32 	%f61, %f61, %f37;
	st.volatile.shared.f32 	[%rd8], %f61;

BB53_31:
	setp.lt.u32	%p18, %r11, 2;
	@%p18 bra 	BB53_33;

	ld.volatile.shared.f32 	%f38, [%rd8+4];
	add.f32 	%f39, %f61, %f38;
	st.volatile.shared.f32 	[%rd8], %f39;

BB53_33:
	setp.ne.s32	%p19, %r10, 0;
	@%p19 bra 	BB53_35;

	ld.shared.f32 	%f40, [my_sdata];
	cvt.u64.u32	%rd39, %r4;
	cvt.rn.f32.s64	%f41, %rd39;
	div.rn.f32 	%f42, %f40, %f41;
	cvta.to.global.u64 	%rd40, %rd2;
	mul.wide.u32 	%rd41, %r6, 4;
	add.s64 	%rd42, %rd40, %rd41;
	st.global.f32 	[%rd42], %f42;

BB53_35:
	ret;
}

	// .globl	reduce_col_mean_d
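// reduce_col_mean_d: column-wise mean; the per-thread column sum of
// reduce_col_sum_d divided by the row count before the store.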
.visible .entry reduce_col_mean_d(
	.param .u64 reduce_col_mean_d_param_0,
	.param .u64 reduce_col_mean_d_param_1,
	.param .u32 reduce_col_mean_d_param_2,
	.param .u32 reduce_col_mean_d_param_3
)
{
	.reg .pred 	%p<4>;
	.reg .b32 	%r<11>;
	.reg .f64 	%fd<12>;
	.reg .b64 	%rd<10>;


	ld.param.u64 	%rd2, [reduce_col_mean_d_param_0];
	ld.param.u64 	%rd3, [reduce_col_mean_d_param_1];
	ld.param.u32 	%r5, [reduce_col_mean_d_param_2];
	ld.param.u32 	%r6, [reduce_col_mean_d_param_3];
	mov.u32 	%r7, %ntid.x;
	mov.u32 	%r8, %ctaid.x;
	mov.u32 	%r9, %tid.x;
	mad.lo.s32 	%r1, %r7, %r8, %r9;
	setp.ge.u32	%p1, %r1, %r6;
	@%p1 bra 	BB54_5;

	cvta.to.global.u64 	%rd1, %rd2;
	mul.lo.s32 	%r2, %r6, %r5;
	mov.f64 	%fd10, 0d0000000000000000;
	mov.f64 	%fd11, %fd10;
	setp.ge.u32	%p2, %r1, %r2;
	@%p2 bra 	BB54_4;

	mov.u32 	%r10, %r1;

BB54_3:
	mov.u32 	%r3, %r10;
	mul.wide.u32 	%rd4, %r3, 8;
	add.s64 	%rd5, %rd1, %rd4;
	ld.global.f64 	%fd6, [%rd5];
	add.f64 	%fd11, %fd11, %fd6;
	add.s32 	%r4, %r3, %r6;
	setp.lt.u32	%p3, %r4, %r2;
	mov.u32 	%r10, %r4;
	mov.f64 	%fd10, %fd11;
	@%p3 bra 	BB54_3;

BB54_4:
	cvta.to.global.u64 	%rd6, %rd3;
	cvt.u64.u32	%rd7, %r5;
	cvt.rn.f64.s64	%fd7, %rd7;
	div.rn.f64 	%fd8, %fd10, %fd7;
	mul.wide.u32 	%rd8, %r1, 8;
	add.s64 	%rd9, %rd6, %rd8;
	st.global.f64 	[%rd9], %fd8;

BB54_5:
	ret;
}

	// .globl	reduce_col_mean_f
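// reduce_col_mean_f: single-precision twin of reduce_col_mean_d.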
.visible .entry reduce_col_mean_f(
	.param .u64 reduce_col_mean_f_param_0,
	.param .u64 reduce_col_mean_f_param_1,
	.param .u32 reduce_col_mean_f_param_2,
	.param .u32 reduce_col_mean_f_param_3
)
{
	.reg .pred 	%p<4>;
	.reg .f32 	%f<12>;
	.reg .b32 	%r<11>;
	.reg .b64 	%rd<10>;


	ld.param.u64 	%rd2, [reduce_col_mean_f_param_0];
	ld.param.u64 	%rd3, [reduce_col_mean_f_param_1];
	ld.param.u32 	%r5, [reduce_col_mean_f_param_2];
	ld.param.u32 	%r6, [reduce_col_mean_f_param_3];
	mov.u32 	%r7, %ntid.x;
	mov.u32 	%r8, %ctaid.x;
	mov.u32 	%r9, %tid.x;
	mad.lo.s32 	%r1, %r7, %r8, %r9;
	setp.ge.u32	%p1, %r1, %r6;
	@%p1 bra 	BB55_5;

	cvta.to.global.u64 	%rd1, %rd2;
	mul.lo.s32 	%r2, %r6, %r5;
	mov.f32 	%f10, 0f00000000;
	mov.f32 	%f11, %f10;
	setp.ge.u32	%p2, %r1, %r2;
	@%p2 bra 	BB55_4;

	mov.u32 	%r10, %r1;

BB55_3:
	mov.u32 	%r3, %r10;
	mul.wide.u32 	%rd4, %r3, 4;
	add.s64 	%rd5, %rd1, %rd4;
	ld.global.f32 	%f6, [%rd5];
	add.f32 	%f11, %f11, %f6;
	add.s32 	%r4, %r3, %r6;
	setp.lt.u32	%p3, %r4, %r2;
	mov.u32 	%r10, %r4;
	mov.f32 	%f10, %f11;
	@%p3 bra 	BB55_3;

BB55_4:
	cvta.to.global.u64 	%rd6, %rd3;
	cvt.u64.u32	%rd7, %r5;
	cvt.rn.f32.s64	%f7, %rd7;
	div.rn.f32 	%f8, %f10, %f7;
	mul.wide.u32 	%rd8, %r1, 4;
	add.s64 	%rd9, %rd6, %rd8;
	st.global.f32 	[%rd9], %f8;

BB55_5:
	ret;
}
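
//
// A plausible CUDA C source for the reduce_col_mean_{d,f} kernels above,
// reconstructed from the PTX (parameter names are assumptions): one
// thread per column walks a row-major matrix with stride `cols`, sums
// the column, and divides by `rows`.
//
//   extern "C" __global__ void reduce_col_mean_f(float *in, float *out,
//                                                unsigned int rows,
//                                                unsigned int cols) {
//     unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
//     if (col >= cols) return;
//     float sum = 0.0f;
//     for (unsigned int i = col; i < rows * cols; i += cols)  // strided column walk
//       sum += in[i];
//     out[col] = sum / rows;   // rows is converted to float for the div.rn
//   }
//
//   The _d variant is identical with double elements and 8-byte strides.
//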

	// .globl	matrix_exp_d
.visible .entry matrix_exp_d(
	.param .u64 matrix_exp_d_param_0,
	.param .u64 matrix_exp_d_param_1,
	.param .u32 matrix_exp_d_param_2
)
{
	.reg .pred 	%p<5>;
	.reg .f32 	%f<3>;
	.reg .b32 	%r<21>;
	.reg .f64 	%fd<41>;
	.reg .b64 	%rd<10>;


	ld.param.u64 	%rd2, [matrix_exp_d_param_0];
	ld.param.u64 	%rd3, [matrix_exp_d_param_1];
	ld.param.u32 	%r5, [matrix_exp_d_param_2];
	mov.u32 	%r6, %ctaid.x;
	mov.u32 	%r7, %ntid.x;
	mov.u32 	%r8, %tid.x;
	mad.lo.s32 	%r1, %r7, %r6, %r8;
	setp.ge.u32	%p1, %r1, %r5;
	@%p1 bra 	BB56_5;

	cvta.to.global.u64 	%rd4, %rd2;
	cvt.s64.s32	%rd1, %r1;
	mul.wide.s32 	%rd5, %r1, 8;
	add.s64 	%rd6, %rd4, %rd5;
	ld.global.f64 	%fd1, [%rd6];
	mov.f64 	%fd6, 0d4338000000000000;
	mov.f64 	%fd7, 0d3FF71547652B82FE;
	fma.rn.f64 	%fd8, %fd1, %fd7, %fd6;
	{
	.reg .b32 %temp; 
	mov.b64 	{%r2, %temp}, %fd8;
	}
	mov.f64 	%fd9, 0dC338000000000000;
	add.rn.f64 	%fd10, %fd8, %fd9;
	mov.f64 	%fd11, 0dBFE62E42FEFA39EF;
	fma.rn.f64 	%fd12, %fd10, %fd11, %fd1;
	mov.f64 	%fd13, 0dBC7ABC9E3B39803F;
	fma.rn.f64 	%fd14, %fd10, %fd13, %fd12;
	mov.f64 	%fd15, 0d3E928AF3FCA213EA;
	mov.f64 	%fd16, 0d3E5ADE1569CE2BDF;
	fma.rn.f64 	%fd17, %fd16, %fd14, %fd15;
	mov.f64 	%fd18, 0d3EC71DEE62401315;
	fma.rn.f64 	%fd19, %fd17, %fd14, %fd18;
	mov.f64 	%fd20, 0d3EFA01997C89EB71;
	fma.rn.f64 	%fd21, %fd19, %fd14, %fd20;
	mov.f64 	%fd22, 0d3F2A01A014761F65;
	fma.rn.f64 	%fd23, %fd21, %fd14, %fd22;
	mov.f64 	%fd24, 0d3F56C16C1852B7AF;
	fma.rn.f64 	%fd25, %fd23, %fd14, %fd24;
	mov.f64 	%fd26, 0d3F81111111122322;
	fma.rn.f64 	%fd27, %fd25, %fd14, %fd26;
	mov.f64 	%fd28, 0d3FA55555555502A1;
	fma.rn.f64 	%fd29, %fd27, %fd14, %fd28;
	mov.f64 	%fd30, 0d3FC5555555555511;
	fma.rn.f64 	%fd31, %fd29, %fd14, %fd30;
	mov.f64 	%fd32, 0d3FE000000000000B;
	fma.rn.f64 	%fd33, %fd31, %fd14, %fd32;
	mov.f64 	%fd34, 0d3FF0000000000000;
	fma.rn.f64 	%fd35, %fd33, %fd14, %fd34;
	fma.rn.f64 	%fd36, %fd35, %fd14, %fd34;
	{
	.reg .b32 %temp; 
	mov.b64 	{%r3, %temp}, %fd36;
	}
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r4}, %fd36;
	}
	shl.b32 	%r9, %r2, 20;
	add.s32 	%r10, %r4, %r9;
	mov.b64 	%fd40, {%r3, %r10};
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r11}, %fd1;
	}
	mov.b32 	 %f2, %r11;
	abs.f32 	%f1, %f2;
	setp.lt.f32	%p2, %f1, 0f4086232B;
	@%p2 bra 	BB56_4;

	setp.lt.f64	%p3, %fd1, 0d0000000000000000;
	add.f64 	%fd37, %fd1, 0d7FF0000000000000;
	selp.f64	%fd40, 0d0000000000000000, %fd37, %p3;
	setp.geu.f32	%p4, %f1, 0f40874800;
	@%p4 bra 	BB56_4;

	shr.u32 	%r12, %r2, 31;
	add.s32 	%r13, %r2, %r12;
	shr.s32 	%r14, %r13, 1;
	shl.b32 	%r15, %r14, 20;
	add.s32 	%r16, %r15, %r4;
	mov.b64 	%fd38, {%r3, %r16};
	sub.s32 	%r17, %r2, %r14;
	shl.b32 	%r18, %r17, 20;
	add.s32 	%r19, %r18, 1072693248;
	mov.u32 	%r20, 0;
	mov.b64 	%fd39, {%r20, %r19};
	mul.f64 	%fd40, %fd38, %fd39;

BB56_4:
	cvta.to.global.u64 	%rd7, %rd3;
	shl.b64 	%rd8, %rd1, 3;
	add.s64 	%rd9, %rd7, %rd8;
	st.global.f64 	[%rd9], %fd40;

BB56_5:
	ret;
}

	// .globl	matrix_exp_f
.visible .entry matrix_exp_f(
	.param .u64 matrix_exp_f_param_0,
	.param .u64 matrix_exp_f_param_1,
	.param .u32 matrix_exp_f_param_2
)
{
	.reg .pred 	%p<4>;
	.reg .f32 	%f<15>;
	.reg .b32 	%r<6>;
	.reg .b64 	%rd<8>;


	ld.param.u64 	%rd1, [matrix_exp_f_param_0];
	ld.param.u64 	%rd2, [matrix_exp_f_param_1];
	ld.param.u32 	%r2, [matrix_exp_f_param_2];
	mov.u32 	%r3, %ctaid.x;
	mov.u32 	%r4, %ntid.x;
	mov.u32 	%r5, %tid.x;
	mad.lo.s32 	%r1, %r4, %r3, %r5;
	setp.ge.u32	%p1, %r1, %r2;
	@%p1 bra 	BB57_2;

	cvta.to.global.u64 	%rd3, %rd1;
	mul.wide.s32 	%rd4, %r1, 4;
	add.s64 	%rd5, %rd3, %rd4;
	ld.global.f32 	%f3, [%rd5];
	mul.f32 	%f4, %f3, 0f3FB8AA3B;
	cvt.rzi.f32.f32	%f5, %f4;
	mov.f32 	%f6, 0fBF317200;
	fma.rn.f32 	%f7, %f5, %f6, %f3;
	mov.f32 	%f8, 0fB5BFBE8E;
	fma.rn.f32 	%f9, %f5, %f8, %f7;
	mul.f32 	%f2, %f9, 0f3FB8AA3B;
	// inline asm
	ex2.approx.ftz.f32 %f1,%f2;
	// inline asm
	add.f32 	%f10, %f5, 0f00000000;
	ex2.approx.f32 	%f11, %f10;
	mul.f32 	%f12, %f1, %f11;
	setp.lt.f32	%p2, %f3, 0fC2D20000;
	selp.f32	%f13, 0f00000000, %f12, %p2;
	setp.gt.f32	%p3, %f3, 0f42D20000;
	selp.f32	%f14, 0f7F800000, %f13, %p3;
	cvta.to.global.u64 	%rd6, %rd2;
	add.s64 	%rd7, %rd6, %rd4;
	st.global.f32 	[%rd7], %f14;

BB57_2:
	ret;
}
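
//
// A plausible CUDA C source for the matrix_exp_{d,f} kernels above
// (a sketch; parameter names are assumptions). nvcc inlines exp(): the
// _d body is the polynomial exp with Cody-Waite argument splitting and
// explicit exponent scaling, while the _f body computes 2^(x*log2(e))
// via ex2.approx with overflow/underflow clamps to +inf and 0.
//
//   extern "C" __global__ void matrix_exp_f(float *in, float *out,
//                                           unsigned int size) {
//     unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;  // global index
//     if (i < size)
//       out[i] = expf(in[i]);   // _d variant: out[i] = exp(in[i])
//   }
//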

	// .globl	matrix_sqrt_d
.visible .entry matrix_sqrt_d(
	.param .u64 matrix_sqrt_d_param_0,
	.param .u64 matrix_sqrt_d_param_1,
	.param .u32 matrix_sqrt_d_param_2
)
{
	.reg .pred 	%p<2>;
	.reg .b32 	%r<6>;
	.reg .f64 	%fd<3>;
	.reg .b64 	%rd<8>;


	ld.param.u64 	%rd1, [matrix_sqrt_d_param_0];
	ld.param.u64 	%rd2, [matrix_sqrt_d_param_1];
	ld.param.u32 	%r2, [matrix_sqrt_d_param_2];
	mov.u32 	%r3, %ctaid.x;
	mov.u32 	%r4, %ntid.x;
	mov.u32 	%r5, %tid.x;
	mad.lo.s32 	%r1, %r4, %r3, %r5;
	setp.ge.u32	%p1, %r1, %r2;
	@%p1 bra 	BB58_2;

	cvta.to.global.u64 	%rd3, %rd1;
	mul.wide.s32 	%rd4, %r1, 8;
	add.s64 	%rd5, %rd3, %rd4;
	ld.global.f64 	%fd1, [%rd5];
	sqrt.rn.f64 	%fd2, %fd1;
	cvta.to.global.u64 	%rd6, %rd2;
	add.s64 	%rd7, %rd6, %rd4;
	st.global.f64 	[%rd7], %fd2;

BB58_2:
	ret;
}

	// .globl	matrix_sqrt_f
.visible .entry matrix_sqrt_f(
	.param .u64 matrix_sqrt_f_param_0,
	.param .u64 matrix_sqrt_f_param_1,
	.param .u32 matrix_sqrt_f_param_2
)
{
	.reg .pred 	%p<2>;
	.reg .f32 	%f<3>;
	.reg .b32 	%r<6>;
	.reg .b64 	%rd<8>;


	ld.param.u64 	%rd1, [matrix_sqrt_f_param_0];
	ld.param.u64 	%rd2, [matrix_sqrt_f_param_1];
	ld.param.u32 	%r2, [matrix_sqrt_f_param_2];
	mov.u32 	%r3, %ctaid.x;
	mov.u32 	%r4, %ntid.x;
	mov.u32 	%r5, %tid.x;
	mad.lo.s32 	%r1, %r4, %r3, %r5;
	setp.ge.u32	%p1, %r1, %r2;
	@%p1 bra 	BB59_2;

	cvta.to.global.u64 	%rd3, %rd1;
	mul.wide.s32 	%rd4, %r1, 4;
	add.s64 	%rd5, %rd3, %rd4;
	ld.global.f32 	%f1, [%rd5];
	sqrt.rn.f32 	%f2, %f1;
	cvta.to.global.u64 	%rd6, %rd2;
	add.s64 	%rd7, %rd6, %rd4;
	st.global.f32 	[%rd7], %f2;

BB59_2:
	ret;
}
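
//
// The matrix_sqrt_{d,f} kernels above follow the same elementwise
// template; a sketch (names are assumptions):
//
//   extern "C" __global__ void matrix_sqrt_f(float *in, float *out,
//                                            unsigned int size) {
//     unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i < size)
//       out[i] = sqrtf(in[i]);  // compiles to the correctly rounded sqrt.rn
//   }
//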

	// .globl	matrix_round_d
.visible .entry matrix_round_d(
	.param .u64 matrix_round_d_param_0,
	.param .u64 matrix_round_d_param_1,
	.param .u32 matrix_round_d_param_2
)
{
	.reg .pred 	%p<4>;
	.reg .b32 	%r<11>;
	.reg .f64 	%fd<10>;
	.reg .b64 	%rd<11>;


	ld.param.u64 	%rd2, [matrix_round_d_param_0];
	ld.param.u64 	%rd3, [matrix_round_d_param_1];
	ld.param.u32 	%r2, [matrix_round_d_param_2];
	mov.u32 	%r3, %ctaid.x;
	mov.u32 	%r4, %ntid.x;
	mov.u32 	%r5, %tid.x;
	mad.lo.s32 	%r1, %r4, %r3, %r5;
	setp.ge.u32	%p1, %r1, %r2;
	@%p1 bra 	BB60_4;

	cvta.to.global.u64 	%rd4, %rd2;
	cvt.s64.s32	%rd1, %r1;
	mul.wide.s32 	%rd5, %r1, 8;
	add.s64 	%rd6, %rd4, %rd5;
	ld.global.f64 	%fd9, [%rd6];
	abs.f64 	%fd2, %fd9;
	setp.ge.f64	%p2, %fd2, 0d4330000000000000;
	@%p2 bra 	BB60_3;

	add.f64 	%fd5, %fd2, 0d3FE0000000000000;
	cvt.rzi.f64.f64	%fd6, %fd5;
	setp.lt.f64	%p3, %fd2, 0d3FE0000000000000;
	selp.f64	%fd7, 0d0000000000000000, %fd6, %p3;
	{
	.reg .b32 %temp; 
	mov.b64 	{%r6, %temp}, %fd7;
	}
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r7}, %fd7;
	}
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r8}, %fd9;
	}
	and.b32  	%r9, %r8, -2147483648;
	or.b32  	%r10, %r7, %r9;
	mov.b64 	%fd9, {%r6, %r10};

BB60_3:
	cvta.to.global.u64 	%rd7, %rd3;
	cvt.rzi.s64.f64	%rd8, %fd9;
	cvt.rn.f64.s64	%fd8, %rd8;
	shl.b64 	%rd9, %rd1, 3;
	add.s64 	%rd10, %rd7, %rd9;
	st.global.f64 	[%rd10], %fd8;

BB60_4:
	ret;
}

	// .globl	matrix_round_f
.visible .entry matrix_round_f(
	.param .u64 matrix_round_f_param_0,
	.param .u64 matrix_round_f_param_1,
	.param .u32 matrix_round_f_param_2
)
{
	.reg .pred 	%p<8>;
	.reg .f32 	%f<8>;
	.reg .b32 	%r<17>;
	.reg .b64 	%rd<26>;


	ld.param.u64 	%rd7, [matrix_round_f_param_0];
	ld.param.u64 	%rd8, [matrix_round_f_param_1];
	ld.param.u32 	%r5, [matrix_round_f_param_2];
	mov.u32 	%r6, %ctaid.x;
	mov.u32 	%r7, %ntid.x;
	mov.u32 	%r8, %tid.x;
	mad.lo.s32 	%r1, %r7, %r6, %r8;
	setp.ge.u32	%p1, %r1, %r5;
	@%p1 bra 	BB61_8;

	cvta.to.global.u64 	%rd9, %rd7;
	cvt.s64.s32	%rd1, %r1;
	mul.wide.s32 	%rd10, %r1, 4;
	add.s64 	%rd11, %rd9, %rd10;
	ld.global.u32 	%r2, [%rd11];
	and.b32  	%r9, %r2, 2147483647;
	setp.gt.u32	%p2, %r9, 2139095040;
	mov.f32 	%f3, 0fDF000000;
	mov.f32 	%f7, %f3;
	@%p2 bra 	BB61_7;

	setp.gt.s32	%p3, %r2, 1593835519;
	mov.f32 	%f4, 0f5F000000;
	mov.f32 	%f7, %f4;
	@%p3 bra 	BB61_7;

	setp.gt.u32	%p4, %r2, -553648129;
	mov.f32 	%f7, %f3;
	@%p4 bra 	BB61_7;

	bfe.u32 	%r3, %r2, 23, 8;
	mov.u32 	%r10, 189;
	sub.s32 	%r4, %r10, %r3;
	shl.b32 	%r11, %r2, 8;
	shr.u32 	%r12, %r11, 1;
	or.b32  	%r13, %r12, 1073741824;
	cvt.u64.u32	%rd13, %r13;
	shl.b64 	%rd25, %rd13, 32;
	setp.gt.s32	%p5, %r4, 63;
	mov.u64 	%rd24, 0;
	@%p5 bra 	BB61_6;

	setp.ne.s32	%p6, %r3, 189;
	mov.u32 	%r14, 64;
	sub.s32 	%r15, %r14, %r4;
	shl.b64 	%rd14, %rd25, %r15;
	cvt.u64.u32	%rd15, %r4;
	selp.b64	%rd16, %rd15, 0, %p6;
	cvt.u32.u64	%r16, %rd16;
	shr.u64 	%rd24, %rd25, %r16;
	selp.b64	%rd25, %rd14, 0, %p6;

BB61_6:
	shr.u64 	%rd17, %rd25, 63;
	add.s64 	%rd18, %rd17, %rd24;
	neg.s64 	%rd19, %rd18;
	setp.lt.s32	%p7, %r2, 0;
	selp.b64	%rd20, %rd19, %rd18, %p7;
	cvt.rn.f32.s64	%f7, %rd20;

BB61_7:
	cvta.to.global.u64 	%rd21, %rd8;
	shl.b64 	%rd22, %rd1, 2;
	add.s64 	%rd23, %rd21, %rd22;
	st.global.f32 	[%rd23], %f7;

BB61_8:
	ret;
}
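
//
// In the matrix_round_{d,f} kernels above, the value is rounded half
// away from zero and then round-tripped through a signed 64-bit integer
// (cvt.rzi.s64 / cvt.rn from s64), which matches llround followed by a
// cast back to floating point. A sketch under that reading (names are
// assumptions):
//
//   extern "C" __global__ void matrix_round_f(float *in, float *out,
//                                             unsigned int size) {
//     unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i < size)
//       out[i] = (float)llroundf(in[i]);  // _f inlines the software llroundf path
//   }
//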

	// .globl	matrix_abs_d
.visible .entry matrix_abs_d(
	.param .u64 matrix_abs_d_param_0,
	.param .u64 matrix_abs_d_param_1,
	.param .u32 matrix_abs_d_param_2
)
{
	.reg .pred 	%p<2>;
	.reg .b32 	%r<6>;
	.reg .f64 	%fd<3>;
	.reg .b64 	%rd<8>;


	ld.param.u64 	%rd1, [matrix_abs_d_param_0];
	ld.param.u64 	%rd2, [matrix_abs_d_param_1];
	ld.param.u32 	%r2, [matrix_abs_d_param_2];
	mov.u32 	%r3, %ctaid.x;
	mov.u32 	%r4, %ntid.x;
	mov.u32 	%r5, %tid.x;
	mad.lo.s32 	%r1, %r4, %r3, %r5;
	setp.ge.u32	%p1, %r1, %r2;
	@%p1 bra 	BB62_2;

	cvta.to.global.u64 	%rd3, %rd1;
	mul.wide.s32 	%rd4, %r1, 8;
	add.s64 	%rd5, %rd3, %rd4;
	ld.global.f64 	%fd1, [%rd5];
	abs.f64 	%fd2, %fd1;
	cvta.to.global.u64 	%rd6, %rd2;
	add.s64 	%rd7, %rd6, %rd4;
	st.global.f64 	[%rd7], %fd2;

BB62_2:
	ret;
}

	// .globl	matrix_abs_f
.visible .entry matrix_abs_f(
	.param .u64 matrix_abs_f_param_0,
	.param .u64 matrix_abs_f_param_1,
	.param .u32 matrix_abs_f_param_2
)
{
	.reg .pred 	%p<2>;
	.reg .f32 	%f<3>;
	.reg .b32 	%r<6>;
	.reg .b64 	%rd<8>;


	ld.param.u64 	%rd1, [matrix_abs_f_param_0];
	ld.param.u64 	%rd2, [matrix_abs_f_param_1];
	ld.param.u32 	%r2, [matrix_abs_f_param_2];
	mov.u32 	%r3, %ctaid.x;
	mov.u32 	%r4, %ntid.x;
	mov.u32 	%r5, %tid.x;
	mad.lo.s32 	%r1, %r4, %r3, %r5;
	setp.ge.u32	%p1, %r1, %r2;
	@%p1 bra 	BB63_2;

	cvta.to.global.u64 	%rd3, %rd1;
	mul.wide.s32 	%rd4, %r1, 4;
	add.s64 	%rd5, %rd3, %rd4;
	ld.global.f32 	%f1, [%rd5];
	abs.f32 	%f2, %f1;
	cvta.to.global.u64 	%rd6, %rd2;
	add.s64 	%rd7, %rd6, %rd4;
	st.global.f32 	[%rd7], %f2;

BB63_2:
	ret;
}
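
//
// matrix_abs_{d,f} above are single-instruction elementwise kernels;
// a sketch (names are assumptions):
//
//   extern "C" __global__ void matrix_abs_f(float *in, float *out,
//                                           unsigned int size) {
//     unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i < size)
//       out[i] = fabsf(in[i]);  // abs.f32 / abs.f64 in the PTX
//   }
//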

	// .globl	matrix_log_d
.visible .entry matrix_log_d(
	.param .u64 matrix_log_d_param_0,
	.param .u64 matrix_log_d_param_1,
	.param .u32 matrix_log_d_param_2
)
{
	.reg .pred 	%p<6>;
	.reg .f32 	%f<2>;
	.reg .b32 	%r<33>;
	.reg .f64 	%fd<59>;
	.reg .b64 	%rd<10>;


	ld.param.u64 	%rd2, [matrix_log_d_param_0];
	ld.param.u64 	%rd3, [matrix_log_d_param_1];
	ld.param.u32 	%r12, [matrix_log_d_param_2];
	mov.u32 	%r13, %ctaid.x;
	mov.u32 	%r14, %ntid.x;
	mov.u32 	%r15, %tid.x;
	mad.lo.s32 	%r1, %r14, %r13, %r15;
	setp.ge.u32	%p1, %r1, %r12;
	@%p1 bra 	BB64_9;

	cvta.to.global.u64 	%rd4, %rd2;
	cvt.s64.s32	%rd1, %r1;
	mul.wide.s32 	%rd5, %r1, 8;
	add.s64 	%rd6, %rd4, %rd5;
	ld.global.f64 	%fd56, [%rd6];
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r29}, %fd56;
	}
	{
	.reg .b32 %temp; 
	mov.b64 	{%r30, %temp}, %fd56;
	}
	mov.u32 	%r31, -1023;
	setp.gt.s32	%p2, %r29, 1048575;
	@%p2 bra 	BB64_3;

	mul.f64 	%fd56, %fd56, 0d4350000000000000;
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r29}, %fd56;
	}
	{
	.reg .b32 %temp; 
	mov.b64 	{%r30, %temp}, %fd56;
	}
	mov.u32 	%r31, -1077;

BB64_3:
	add.s32 	%r18, %r29, -1;
	setp.lt.u32	%p3, %r18, 2146435071;
	@%p3 bra 	BB64_5;
	bra.uni 	BB64_4;

BB64_5:
	shr.u32 	%r20, %r29, 20;
	add.s32 	%r32, %r31, %r20;
	and.b32  	%r21, %r29, -2146435073;
	or.b32  	%r22, %r21, 1072693248;
	mov.b64 	%fd57, {%r30, %r22};
	setp.lt.s32	%p5, %r22, 1073127583;
	@%p5 bra 	BB64_7;

	{
	.reg .b32 %temp; 
	mov.b64 	{%r23, %temp}, %fd57;
	}
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r24}, %fd57;
	}
	add.s32 	%r25, %r24, -1048576;
	mov.b64 	%fd57, {%r23, %r25};
	add.s32 	%r32, %r32, 1;

BB64_7:
	add.f64 	%fd13, %fd57, 0d3FF0000000000000;
	// inline asm
	rcp.approx.ftz.f64 %fd12,%fd13;
	// inline asm
	neg.f64 	%fd14, %fd13;
	mov.f64 	%fd15, 0d3FF0000000000000;
	fma.rn.f64 	%fd16, %fd14, %fd12, %fd15;
	fma.rn.f64 	%fd17, %fd16, %fd16, %fd16;
	fma.rn.f64 	%fd18, %fd17, %fd12, %fd12;
	add.f64 	%fd19, %fd57, 0dBFF0000000000000;
	mul.f64 	%fd20, %fd19, %fd18;
	fma.rn.f64 	%fd21, %fd19, %fd18, %fd20;
	mul.f64 	%fd22, %fd21, %fd21;
	mov.f64 	%fd23, 0d3ED0EE258B7A8B04;
	mov.f64 	%fd24, 0d3EB1380B3AE80F1E;
	fma.rn.f64 	%fd25, %fd24, %fd22, %fd23;
	mov.f64 	%fd26, 0d3EF3B2669F02676F;
	fma.rn.f64 	%fd27, %fd25, %fd22, %fd26;
	mov.f64 	%fd28, 0d3F1745CBA9AB0956;
	fma.rn.f64 	%fd29, %fd27, %fd22, %fd28;
	mov.f64 	%fd30, 0d3F3C71C72D1B5154;
	fma.rn.f64 	%fd31, %fd29, %fd22, %fd30;
	mov.f64 	%fd32, 0d3F624924923BE72D;
	fma.rn.f64 	%fd33, %fd31, %fd22, %fd32;
	mov.f64 	%fd34, 0d3F8999999999A3C4;
	fma.rn.f64 	%fd35, %fd33, %fd22, %fd34;
	mov.f64 	%fd36, 0d3FB5555555555554;
	fma.rn.f64 	%fd37, %fd35, %fd22, %fd36;
	sub.f64 	%fd38, %fd19, %fd21;
	add.f64 	%fd39, %fd38, %fd38;
	neg.f64 	%fd40, %fd21;
	fma.rn.f64 	%fd41, %fd40, %fd19, %fd39;
	mul.f64 	%fd42, %fd18, %fd41;
	mul.f64 	%fd43, %fd22, %fd37;
	fma.rn.f64 	%fd44, %fd43, %fd21, %fd42;
	xor.b32  	%r26, %r32, -2147483648;
	mov.u32 	%r27, 1127219200;
	mov.b64 	%fd45, {%r26, %r27};
	mov.u32 	%r28, -2147483648;
	mov.b64 	%fd46, {%r28, %r27};
	sub.f64 	%fd47, %fd45, %fd46;
	mov.f64 	%fd48, 0d3FE62E42FEFA39EF;
	fma.rn.f64 	%fd49, %fd47, %fd48, %fd21;
	neg.f64 	%fd50, %fd47;
	fma.rn.f64 	%fd51, %fd50, %fd48, %fd49;
	sub.f64 	%fd52, %fd51, %fd21;
	sub.f64 	%fd53, %fd44, %fd52;
	mov.f64 	%fd54, 0d3C7ABC9E3B39803F;
	fma.rn.f64 	%fd55, %fd47, %fd54, %fd53;
	add.f64 	%fd58, %fd49, %fd55;
	bra.uni 	BB64_8;

BB64_4:
	mov.f64 	%fd10, 0d7FF0000000000000;
	fma.rn.f64 	%fd11, %fd56, %fd10, %fd10;
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r19}, %fd56;
	}
	mov.b32 	 %f1, %r19;
	setp.eq.f32	%p4, %f1, 0f00000000;
	selp.f64	%fd58, 0dFFF0000000000000, %fd11, %p4;

BB64_8:
	cvta.to.global.u64 	%rd7, %rd3;
	shl.b64 	%rd8, %rd1, 3;
	add.s64 	%rd9, %rd7, %rd8;
	st.global.f64 	[%rd9], %fd58;

BB64_9:
	ret;
}

	// .globl	matrix_log_f
.visible .entry matrix_log_f(
	.param .u64 matrix_log_f_param_0,
	.param .u64 matrix_log_f_param_1,
	.param .u32 matrix_log_f_param_2
)
{
	.reg .pred 	%p<5>;
	.reg .f32 	%f<36>;
	.reg .b32 	%r<10>;
	.reg .b64 	%rd<10>;


	ld.param.u64 	%rd2, [matrix_log_f_param_0];
	ld.param.u64 	%rd3, [matrix_log_f_param_1];
	ld.param.u32 	%r2, [matrix_log_f_param_2];
	mov.u32 	%r3, %ctaid.x;
	mov.u32 	%r4, %ntid.x;
	mov.u32 	%r5, %tid.x;
	mad.lo.s32 	%r1, %r4, %r3, %r5;
	setp.ge.u32	%p1, %r1, %r2;
	@%p1 bra 	BB65_4;

	cvta.to.global.u64 	%rd4, %rd2;
	cvt.s64.s32	%rd1, %r1;
	mul.wide.s32 	%rd5, %r1, 4;
	add.s64 	%rd6, %rd4, %rd5;
	ld.global.f32 	%f5, [%rd6];
	setp.lt.f32	%p2, %f5, 0f00800000;
	mul.f32 	%f6, %f5, 0f4B000000;
	selp.f32	%f1, %f6, %f5, %p2;
	mov.b32 	 %r6, %f1;
	add.s32 	%r7, %r6, -1059760811;
	and.b32  	%r8, %r7, -8388608;
	sub.s32 	%r9, %r6, %r8;
	mov.b32 	 %f7, %r9;
	cvt.rn.f32.s32	%f8, %r8;
	selp.f32	%f9, 0fC1B80000, 0f00000000, %p2;
	mov.f32 	%f10, 0f34000000;
	fma.rn.f32 	%f11, %f8, %f10, %f9;
	add.f32 	%f12, %f7, 0fBF800000;
	mov.f32 	%f13, 0f3E1039F6;
	mov.f32 	%f14, 0fBE055027;
	fma.rn.f32 	%f15, %f14, %f12, %f13;
	mov.f32 	%f16, 0fBDF8CDCC;
	fma.rn.f32 	%f17, %f15, %f12, %f16;
	mov.f32 	%f18, 0f3E0F2955;
	fma.rn.f32 	%f19, %f17, %f12, %f18;
	mov.f32 	%f20, 0fBE2AD8B9;
	fma.rn.f32 	%f21, %f19, %f12, %f20;
	mov.f32 	%f22, 0f3E4CED0B;
	fma.rn.f32 	%f23, %f21, %f12, %f22;
	mov.f32 	%f24, 0fBE7FFF22;
	fma.rn.f32 	%f25, %f23, %f12, %f24;
	mov.f32 	%f26, 0f3EAAAA78;
	fma.rn.f32 	%f27, %f25, %f12, %f26;
	mov.f32 	%f28, 0fBF000000;
	fma.rn.f32 	%f29, %f27, %f12, %f28;
	mul.f32 	%f30, %f12, %f29;
	fma.rn.f32 	%f31, %f30, %f12, %f12;
	mov.f32 	%f32, 0f3F317218;
	fma.rn.f32 	%f35, %f11, %f32, %f31;
	setp.lt.u32	%p3, %r6, 2139095040;
	@%p3 bra 	BB65_3;

	mov.f32 	%f33, 0f7F800000;
	fma.rn.f32 	%f35, %f1, %f33, %f33;

BB65_3:
	cvta.to.global.u64 	%rd7, %rd3;
	setp.eq.f32	%p4, %f1, 0f00000000;
	selp.f32	%f34, 0fFF800000, %f35, %p4;
	shl.b64 	%rd8, %rd1, 2;
	add.s64 	%rd9, %rd7, %rd8;
	st.global.f32 	[%rd9], %f34;

BB65_4:
	ret;
}
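
//
// The matrix_log_{d,f} kernels above compute the natural logarithm: the
// _d body splits the input into mantissa and exponent and accumulates
// k*ln(2) against a polynomial (0d3FE62E42FEFA39EF is ln 2), while the
// _f body is the inlined logf polynomial with a -inf result selected for
// a zero input. A sketch (names are assumptions):
//
//   extern "C" __global__ void matrix_log_f(float *in, float *out,
//                                           unsigned int size) {
//     unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i < size)
//       out[i] = logf(in[i]);   // logf(0) == -inf, as in the selp above
//   }
//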

	// .globl	matrix_floor_d
.visible .entry matrix_floor_d(
	.param .u64 matrix_floor_d_param_0,
	.param .u64 matrix_floor_d_param_1,
	.param .u32 matrix_floor_d_param_2
)
{
	.reg .pred 	%p<2>;
	.reg .b32 	%r<6>;
	.reg .f64 	%fd<3>;
	.reg .b64 	%rd<8>;


	ld.param.u64 	%rd1, [matrix_floor_d_param_0];
	ld.param.u64 	%rd2, [matrix_floor_d_param_1];
	ld.param.u32 	%r2, [matrix_floor_d_param_2];
	mov.u32 	%r3, %ctaid.x;
	mov.u32 	%r4, %ntid.x;
	mov.u32 	%r5, %tid.x;
	mad.lo.s32 	%r1, %r4, %r3, %r5;
	setp.ge.u32	%p1, %r1, %r2;
	@%p1 bra 	BB66_2;

	cvta.to.global.u64 	%rd3, %rd1;
	mul.wide.s32 	%rd4, %r1, 8;
	add.s64 	%rd5, %rd3, %rd4;
	ld.global.f64 	%fd1, [%rd5];
	cvt.rmi.f64.f64	%fd2, %fd1;
	cvta.to.global.u64 	%rd6, %rd2;
	add.s64 	%rd7, %rd6, %rd4;
	st.global.f64 	[%rd7], %fd2;

BB66_2:
	ret;
}

	// .globl	matrix_floor_f
.visible .entry matrix_floor_f(
	.param .u64 matrix_floor_f_param_0,
	.param .u64 matrix_floor_f_param_1,
	.param .u32 matrix_floor_f_param_2
)
{
	.reg .pred 	%p<2>;
	.reg .f32 	%f<3>;
	.reg .b32 	%r<6>;
	.reg .b64 	%rd<8>;


	ld.param.u64 	%rd1, [matrix_floor_f_param_0];
	ld.param.u64 	%rd2, [matrix_floor_f_param_1];
	ld.param.u32 	%r2, [matrix_floor_f_param_2];
	mov.u32 	%r3, %ctaid.x;
	mov.u32 	%r4, %ntid.x;
	mov.u32 	%r5, %tid.x;
	mad.lo.s32 	%r1, %r4, %r3, %r5;
	setp.ge.u32	%p1, %r1, %r2;
	@%p1 bra 	BB67_2;

	cvta.to.global.u64 	%rd3, %rd1;
	mul.wide.s32 	%rd4, %r1, 4;
	add.s64 	%rd5, %rd3, %rd4;
	ld.global.f32 	%f1, [%rd5];
	cvt.rmi.f32.f32	%f2, %f1;
	cvta.to.global.u64 	%rd6, %rd2;
	add.s64 	%rd7, %rd6, %rd4;
	st.global.f32 	[%rd7], %f2;

BB67_2:
	ret;
}
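
//
// matrix_floor_{d,f} above reduce to a single round-toward-minus-infinity
// conversion (cvt.rmi); a sketch (names are assumptions):
//
//   extern "C" __global__ void matrix_floor_f(float *in, float *out,
//                                             unsigned int size) {
//     unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i < size)
//       out[i] = floorf(in[i]);
//   }
//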

	// .globl	matrix_ceil_d
.visible .entry matrix_ceil_d(
	.param .u64 matrix_ceil_d_param_0,
	.param .u64 matrix_ceil_d_param_1,
	.param .u32 matrix_ceil_d_param_2
)
{
	.reg .pred 	%p<2>;
	.reg .b32 	%r<6>;
	.reg .f64 	%fd<3>;
	.reg .b64 	%rd<8>;


	ld.param.u64 	%rd1, [matrix_ceil_d_param_0];
	ld.param.u64 	%rd2, [matrix_ceil_d_param_1];
	ld.param.u32 	%r2, [matrix_ceil_d_param_2];
	mov.u32 	%r3, %ctaid.x;
	mov.u32 	%r4, %ntid.x;
	mov.u32 	%r5, %tid.x;
	mad.lo.s32 	%r1, %r4, %r3, %r5;
	setp.ge.u32	%p1, %r1, %r2;
	@%p1 bra 	BB68_2;

	cvta.to.global.u64 	%rd3, %rd1;
	mul.wide.s32 	%rd4, %r1, 8;
	add.s64 	%rd5, %rd3, %rd4;
	ld.global.f64 	%fd1, [%rd5];
	cvt.rpi.f64.f64	%fd2, %fd1;
	cvta.to.global.u64 	%rd6, %rd2;
	add.s64 	%rd7, %rd6, %rd4;
	st.global.f64 	[%rd7], %fd2;

BB68_2:
	ret;
}

	// .globl	matrix_ceil_f
.visible .entry matrix_ceil_f(
	.param .u64 matrix_ceil_f_param_0,
	.param .u64 matrix_ceil_f_param_1,
	.param .u32 matrix_ceil_f_param_2
)
{
	.reg .pred 	%p<2>;
	.reg .f32 	%f<3>;
	.reg .b32 	%r<6>;
	.reg .b64 	%rd<8>;


	ld.param.u64 	%rd1, [matrix_ceil_f_param_0];
	ld.param.u64 	%rd2, [matrix_ceil_f_param_1];
	ld.param.u32 	%r2, [matrix_ceil_f_param_2];
	mov.u32 	%r3, %ctaid.x;
	mov.u32 	%r4, %ntid.x;
	mov.u32 	%r5, %tid.x;
	mad.lo.s32 	%r1, %r4, %r3, %r5;
	setp.ge.u32	%p1, %r1, %r2;
	@%p1 bra 	BB69_2;

	cvta.to.global.u64 	%rd3, %rd1;
	mul.wide.s32 	%rd4, %r1, 4;
	add.s64 	%rd5, %rd3, %rd4;
	ld.global.f32 	%f1, [%rd5];
	cvt.rpi.f32.f32	%f2, %f1;
	cvta.to.global.u64 	%rd6, %rd2;
	add.s64 	%rd7, %rd6, %rd4;
	st.global.f32 	[%rd7], %f2;

BB69_2:
	ret;
}
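
//
// matrix_ceil_{d,f} above are the mirror image of the floor kernels,
// using the round-toward-plus-infinity conversion (cvt.rpi); a sketch
// (names are assumptions):
//
//   extern "C" __global__ void matrix_ceil_f(float *in, float *out,
//                                            unsigned int size) {
//     unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i < size)
//       out[i] = ceilf(in[i]);
//   }
//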

	// .globl	matrix_sin_d
.visible .entry matrix_sin_d(
	.param .u64 matrix_sin_d_param_0,
	.param .u64 matrix_sin_d_param_1,
	.param .u32 matrix_sin_d_param_2
)
{
	.local .align 4 .b8 	__local_depot70[4];
	.reg .b64 	%SP;
	.reg .b64 	%SPL;
	.reg .pred 	%p<9>;
	.reg .b32 	%r<18>;
	.reg .f64 	%fd<41>;
	.reg .b64 	%rd<17>;


	mov.u64 	%rd16, __local_depot70;
	cvta.local.u64 	%SP, %rd16;
	ld.param.u64 	%rd3, [matrix_sin_d_param_0];
	ld.param.u64 	%rd4, [matrix_sin_d_param_1];
	ld.param.u32 	%r5, [matrix_sin_d_param_2];
	add.u64 	%rd5, %SP, 0;
	cvta.to.local.u64 	%rd1, %rd5;
	mov.u32 	%r6, %ntid.x;
	mov.u32 	%r7, %ctaid.x;
	mov.u32 	%r8, %tid.x;
	mad.lo.s32 	%r1, %r6, %r7, %r8;
	setp.ge.u32	%p1, %r1, %r5;
	@%p1 bra 	BB70_10;

	cvta.to.global.u64 	%rd6, %rd3;
	cvt.s64.s32	%rd2, %r1;
	mul.wide.s32 	%rd7, %r1, 8;
	add.s64 	%rd8, %rd6, %rd7;
	ld.global.f64 	%fd38, [%rd8];
	{
	.reg .b32 %temp; 
	mov.b64 	{%r9, %temp}, %fd38;
	}
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r10}, %fd38;
	}
	and.b32  	%r11, %r10, 2147483647;
	setp.eq.s32	%p2, %r11, 2146435072;
	setp.eq.s32	%p3, %r9, 0;
	and.pred  	%p4, %p2, %p3;
	@!%p4 bra 	BB70_3;
	bra.uni 	BB70_2;

BB70_2:
	mov.f64 	%fd14, 0d0000000000000000;
	mul.rn.f64 	%fd38, %fd38, %fd14;

BB70_3:
	mul.f64 	%fd15, %fd38, 0d3FE45F306DC9C883;
	cvt.rni.s32.f64	%r17, %fd15;
	st.local.u32 	[%rd1], %r17;
	cvt.rn.f64.s32	%fd16, %r17;
	neg.f64 	%fd17, %fd16;
	mov.f64 	%fd18, 0d3FF921FB54442D18;
	fma.rn.f64 	%fd19, %fd17, %fd18, %fd38;
	mov.f64 	%fd20, 0d3C91A62633145C00;
	fma.rn.f64 	%fd21, %fd17, %fd20, %fd19;
	mov.f64 	%fd22, 0d397B839A252049C0;
	fma.rn.f64 	%fd39, %fd17, %fd22, %fd21;
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r12}, %fd38;
	}
	and.b32  	%r13, %r12, 2145386496;
	setp.lt.u32	%p5, %r13, 1105199104;
	@%p5 bra 	BB70_5;

	// Callseq Start 3
	{
	.reg .b32 temp_param_reg;
	// }
	.param .b64 param0;
	st.param.f64	[param0+0], %fd38;
	.param .b64 param1;
	st.param.b64	[param1+0], %rd5;
	.param .b64 retval0;
	call.uni (retval0), 
	__internal_trig_reduction_slowpathd, 
	(
	param0, 
	param1
	);
	ld.param.f64	%fd39, [retval0+0];
	
	//{
	}// Callseq End 3
	ld.local.u32 	%r17, [%rd1];

BB70_5:
	and.b32  	%r14, %r17, 1;
	shl.b32 	%r15, %r14, 3;
	setp.eq.b32	%p6, %r14, 1;
	selp.f64	%fd23, 0dBDA8FF8320FD8164, 0d3DE5DB65F9785EBA, %p6;
	mul.wide.u32 	%rd10, %r15, 8;
	mov.u64 	%rd11, __cudart_sin_cos_coeffs;
	add.s64 	%rd12, %rd10, %rd11;
	ld.const.f64 	%fd24, [%rd12+8];
	mul.rn.f64 	%fd7, %fd39, %fd39;
	fma.rn.f64 	%fd25, %fd23, %fd7, %fd24;
	ld.const.f64 	%fd26, [%rd12+16];
	fma.rn.f64 	%fd27, %fd25, %fd7, %fd26;
	ld.const.f64 	%fd28, [%rd12+24];
	fma.rn.f64 	%fd29, %fd27, %fd7, %fd28;
	ld.const.f64 	%fd30, [%rd12+32];
	fma.rn.f64 	%fd31, %fd29, %fd7, %fd30;
	ld.const.f64 	%fd32, [%rd12+40];
	fma.rn.f64 	%fd33, %fd31, %fd7, %fd32;
	ld.const.f64 	%fd34, [%rd12+48];
	fma.rn.f64 	%fd8, %fd33, %fd7, %fd34;
	fma.rn.f64 	%fd40, %fd8, %fd39, %fd39;
	setp.eq.s32	%p7, %r14, 0;
	@%p7 bra 	BB70_7;

	mov.f64 	%fd35, 0d3FF0000000000000;
	fma.rn.f64 	%fd40, %fd8, %fd7, %fd35;

BB70_7:
	and.b32  	%r16, %r17, 2;
	setp.eq.s32	%p8, %r16, 0;
	@%p8 bra 	BB70_9;

	mov.f64 	%fd36, 0d0000000000000000;
	mov.f64 	%fd37, 0dBFF0000000000000;
	fma.rn.f64 	%fd40, %fd40, %fd37, %fd36;

BB70_9:
	cvta.to.global.u64 	%rd13, %rd4;
	shl.b64 	%rd14, %rd2, 3;
	add.s64 	%rd15, %rd13, %rd14;
	st.global.f64 	[%rd15], %fd40;

BB70_10:
	ret;
}

	// .globl	matrix_sin_f
.visible .entry matrix_sin_f(
	.param .u64 matrix_sin_f_param_0,
	.param .u64 matrix_sin_f_param_1,
	.param .u32 matrix_sin_f_param_2
)
{
	.local .align 4 .b8 	__local_depot71[28];
	.reg .b64 	%SP;
	.reg .b64 	%SPL;
	.reg .pred 	%p<15>;
	.reg .f32 	%f<48>;
	.reg .b32 	%r<95>;
	.reg .b64 	%rd<22>;


	mov.u64 	%rd21, __local_depot71;
	cvta.local.u64 	%SP, %rd21;
	ld.param.u64 	%rd8, [matrix_sin_f_param_0];
	ld.param.u64 	%rd9, [matrix_sin_f_param_1];
	ld.param.u32 	%r30, [matrix_sin_f_param_2];
	mov.u32 	%r31, %ntid.x;
	mov.u32 	%r32, %ctaid.x;
	mov.u32 	%r33, %tid.x;
	mad.lo.s32 	%r1, %r31, %r32, %r33;
	setp.ge.u32	%p1, %r1, %r30;
	@%p1 bra 	BB71_22;

	cvta.to.global.u64 	%rd10, %rd8;
	cvt.s64.s32	%rd1, %r1;
	mul.wide.s32 	%rd11, %r1, 4;
	add.s64 	%rd12, %rd10, %rd11;
	add.u64 	%rd13, %SP, 0;
	cvta.to.local.u64 	%rd2, %rd13;
	ld.global.f32 	%f43, [%rd12];
	abs.f32 	%f19, %f43;
	setp.neu.f32	%p2, %f19, 0f7F800000;
	@%p2 bra 	BB71_3;

	mov.f32 	%f20, 0f00000000;
	mul.rn.f32 	%f43, %f43, %f20;

BB71_3:
	mul.f32 	%f21, %f43, 0f3F22F983;
	cvt.rni.s32.f32	%r94, %f21;
	cvt.rn.f32.s32	%f22, %r94;
	neg.f32 	%f23, %f22;
	mov.f32 	%f24, 0f3FC90FDA;
	fma.rn.f32 	%f25, %f23, %f24, %f43;
	mov.f32 	%f26, 0f33A22168;
	fma.rn.f32 	%f27, %f23, %f26, %f25;
	mov.f32 	%f28, 0f27C234C5;
	fma.rn.f32 	%f44, %f23, %f28, %f27;
	abs.f32 	%f29, %f43;
	setp.leu.f32	%p3, %f29, 0f47CE4780;
	@%p3 bra 	BB71_11;

	mov.b32 	 %r3, %f43;
	shr.u32 	%r4, %r3, 23;
	shl.b32 	%r36, %r3, 8;
	or.b32  	%r5, %r36, -2147483648;
	mov.u32 	%r88, 0;
	mov.u64 	%rd19, __cudart_i2opi_f;
	mov.u32 	%r87, -6;
	mov.u64 	%rd20, %rd2;

BB71_5:
	.pragma "nounroll";
	mov.u64 	%rd4, %rd20;
	ld.const.u32 	%r39, [%rd19];
	// inline asm
	{
	mad.lo.cc.u32   %r37, %r39, %r5, %r88;
	madc.hi.u32     %r88, %r39, %r5,  0;
	}
	// inline asm
	st.local.u32 	[%rd4], %r37;
	add.s64 	%rd5, %rd4, 4;
	add.s64 	%rd19, %rd19, 4;
	add.s32 	%r87, %r87, 1;
	setp.ne.s32	%p4, %r87, 0;
	mov.u64 	%rd20, %rd5;
	@%p4 bra 	BB71_5;

	and.b32  	%r42, %r4, 255;
	add.s32 	%r43, %r42, -128;
	shr.u32 	%r44, %r43, 5;
	and.b32  	%r10, %r3, -2147483648;
	st.local.u32 	[%rd2+24], %r88;
	mov.u32 	%r45, 6;
	sub.s32 	%r46, %r45, %r44;
	mul.wide.s32 	%rd15, %r46, 4;
	add.s64 	%rd7, %rd2, %rd15;
	ld.local.u32 	%r89, [%rd7];
	ld.local.u32 	%r90, [%rd7+-4];
	and.b32  	%r13, %r4, 31;
	setp.eq.s32	%p5, %r13, 0;
	@%p5 bra 	BB71_8;

	mov.u32 	%r47, 32;
	sub.s32 	%r48, %r47, %r13;
	shr.u32 	%r49, %r90, %r48;
	shl.b32 	%r50, %r89, %r13;
	add.s32 	%r89, %r49, %r50;
	ld.local.u32 	%r51, [%rd7+-8];
	shr.u32 	%r52, %r51, %r48;
	shl.b32 	%r53, %r90, %r13;
	add.s32 	%r90, %r52, %r53;

BB71_8:
	shr.u32 	%r54, %r90, 30;
	shl.b32 	%r55, %r89, 2;
	add.s32 	%r91, %r54, %r55;
	shl.b32 	%r19, %r90, 2;
	shr.u32 	%r56, %r91, 31;
	shr.u32 	%r57, %r89, 30;
	add.s32 	%r20, %r56, %r57;
	setp.eq.s32	%p6, %r56, 0;
	mov.u32 	%r92, %r10;
	mov.u32 	%r93, %r19;
	@%p6 bra 	BB71_10;

	not.b32 	%r58, %r91;
	neg.s32 	%r21, %r19;
	setp.eq.s32	%p7, %r19, 0;
	selp.u32	%r59, 1, 0, %p7;
	add.s32 	%r91, %r59, %r58;
	xor.b32  	%r23, %r10, -2147483648;
	mov.u32 	%r92, %r23;
	mov.u32 	%r93, %r21;

BB71_10:
	mov.u32 	%r25, %r92;
	neg.s32 	%r60, %r20;
	setp.ne.s32	%p8, %r10, 0;
	selp.b32	%r94, %r60, %r20, %p8;
	clz.b32 	%r61, %r91;
	setp.ne.s32	%p9, %r61, 0;
	shl.b32 	%r62, %r91, %r61;
	mov.u32 	%r63, 32;
	sub.s32 	%r64, %r63, %r61;
	shr.u32 	%r65, %r93, %r64;
	add.s32 	%r66, %r65, %r62;
	selp.b32	%r67, %r66, %r91, %p9;
	mul.lo.s32 	%r68, %r67, -921707870;
	mov.u32 	%r69, -921707870;
	mul.hi.u32 	%r70, %r67, %r69;
	setp.gt.s32	%p10, %r70, 0;
	shl.b32 	%r71, %r70, 1;
	shr.u32 	%r72, %r68, 31;
	add.s32 	%r73, %r72, %r71;
	selp.b32	%r74, %r73, %r70, %p10;
	selp.b32	%r75, -1, 0, %p10;
	mov.u32 	%r76, 126;
	sub.s32 	%r77, %r76, %r61;
	add.s32 	%r78, %r77, %r75;
	shl.b32 	%r79, %r78, 23;
	add.s32 	%r80, %r74, 1;
	shr.u32 	%r81, %r80, 7;
	add.s32 	%r82, %r81, 1;
	shr.u32 	%r83, %r82, 1;
	add.s32 	%r84, %r83, %r79;
	or.b32  	%r85, %r84, %r25;
	mov.b32 	 %f44, %r85;

BB71_11:
	mul.rn.f32 	%f7, %f44, %f44;
	and.b32  	%r29, %r94, 1;
	setp.eq.s32	%p11, %r29, 0;
	@%p11 bra 	BB71_13;

	mov.f32 	%f30, 0fBAB6061A;
	mov.f32 	%f31, 0f37CCF5CE;
	fma.rn.f32 	%f45, %f31, %f7, %f30;
	bra.uni 	BB71_14;

BB71_13:
	mov.f32 	%f32, 0f3C08839E;
	mov.f32 	%f33, 0fB94CA1F9;
	fma.rn.f32 	%f45, %f33, %f7, %f32;

BB71_14:
	@%p11 bra 	BB71_16;

	mov.f32 	%f34, 0f3D2AAAA5;
	fma.rn.f32 	%f35, %f45, %f7, %f34;
	mov.f32 	%f36, 0fBF000000;
	fma.rn.f32 	%f46, %f35, %f7, %f36;
	bra.uni 	BB71_17;

BB71_16:
	mov.f32 	%f37, 0fBE2AAAA3;
	fma.rn.f32 	%f38, %f45, %f7, %f37;
	mov.f32 	%f39, 0f00000000;
	fma.rn.f32 	%f46, %f38, %f7, %f39;

BB71_17:
	fma.rn.f32 	%f47, %f46, %f44, %f44;
	@%p11 bra 	BB71_19;

	mov.f32 	%f40, 0f3F800000;
	fma.rn.f32 	%f47, %f46, %f7, %f40;

BB71_19:
	and.b32  	%r86, %r94, 2;
	setp.eq.s32	%p14, %r86, 0;
	@%p14 bra 	BB71_21;

	mov.f32 	%f41, 0f00000000;
	mov.f32 	%f42, 0fBF800000;
	fma.rn.f32 	%f47, %f47, %f42, %f41;

BB71_21:
	cvta.to.global.u64 	%rd16, %rd9;
	shl.b64 	%rd17, %rd1, 2;
	add.s64 	%rd18, %rd16, %rd17;
	st.global.f32 	[%rd18], %f47;

BB71_22:
	ret;
}
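
//
// The matrix_sin_{d,f} kernels above inline the full CUDA sine: a fast
// path reduces the argument mod pi/2 with a multi-word Cody-Waite
// subtraction, huge arguments fall back to Payne-Hanek style reduction
// against the stored bits of 2/pi (__cudart_i2opi_f for floats,
// __internal_trig_reduction_slowpathd for doubles), and the quadrant
// selects a sine or cosine polynomial with a final sign fixup. A sketch
// of the source (names are assumptions):
//
//   extern "C" __global__ void matrix_sin_f(float *in, float *out,
//                                           unsigned int size) {
//     unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i < size)
//       out[i] = sinf(in[i]);   // _d variant: out[i] = sin(in[i])
//   }
//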

	// .globl	matrix_sinh_d
.visible .entry matrix_sinh_d(
	.param .u64 matrix_sinh_d_param_0,
	.param .u64 matrix_sinh_d_param_1,
	.param .u32 matrix_sinh_d_param_2
)
{
	.reg .pred 	%p<7>;
	.reg .b32 	%r<24>;
	.reg .f64 	%fd<68>;
	.reg .b64 	%rd<10>;


	ld.param.u64 	%rd2, [matrix_sinh_d_param_0];
	ld.param.u64 	%rd3, [matrix_sinh_d_param_1];
	ld.param.u32 	%r3, [matrix_sinh_d_param_2];
	mov.u32 	%r4, %ctaid.x;
	mov.u32 	%r5, %ntid.x;
	mov.u32 	%r6, %tid.x;
	mad.lo.s32 	%r1, %r5, %r4, %r6;
	setp.ge.u32	%p1, %r1, %r3;
	@%p1 bra 	BB72_5;

	cvta.to.global.u64 	%rd4, %rd2;
	cvt.s64.s32	%rd1, %r1;
	mul.wide.s32 	%rd5, %r1, 8;
	add.s64 	%rd6, %rd4, %rd5;
	ld.global.f64 	%fd5, [%rd6];
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r2}, %fd5;
	}
	and.b32  	%r7, %r2, 2147483647;
	{
	.reg .b32 %temp; 
	mov.b64 	{%r8, %temp}, %fd5;
	}
	mov.b64 	%fd1, {%r8, %r7};
	setp.lt.u32	%p2, %r7, 1072693248;
	@%p2 bra 	BB72_3;
	bra.uni 	BB72_2;

BB72_3:
	mul.f64 	%fd51, %fd1, %fd1;
	mov.f64 	%fd52, 0d3DE611A561D87DEF;
	mov.f64 	%fd53, 0d3D6B4C75AB274C53;
	fma.rn.f64 	%fd54, %fd53, %fd51, %fd52;
	mov.f64 	%fd55, 0d3E5AE64671B18F5C;
	fma.rn.f64 	%fd56, %fd54, %fd51, %fd55;
	mov.f64 	%fd57, 0d3EC71DE3A465B1E4;
	fma.rn.f64 	%fd58, %fd56, %fd51, %fd57;
	mov.f64 	%fd59, 0d3F2A01A01A02899D;
	fma.rn.f64 	%fd60, %fd58, %fd51, %fd59;
	mov.f64 	%fd61, 0d3F811111111110A6;
	fma.rn.f64 	%fd62, %fd60, %fd51, %fd61;
	mov.f64 	%fd63, 0d3FC5555555555556;
	fma.rn.f64 	%fd64, %fd62, %fd51, %fd63;
	mul.f64 	%fd65, %fd51, %fd64;
	fma.rn.f64 	%fd67, %fd65, %fd1, %fd1;
	bra.uni 	BB72_4;

BB72_2:
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r9}, %fd1;
	}
	mov.f64 	%fd6, 0d4338000000000000;
	mov.f64 	%fd7, 0d3FF71547652B82FE;
	fma.rn.f64 	%fd8, %fd1, %fd7, %fd6;
	{
	.reg .b32 %temp; 
	mov.b64 	{%r10, %temp}, %fd8;
	}
	add.s32 	%r11, %r10, -1;
	mov.f64 	%fd9, 0dC338000000000000;
	add.rn.f64 	%fd10, %fd8, %fd9;
	mov.f64 	%fd11, 0dBFE62E42FEFA39EF;
	fma.rn.f64 	%fd12, %fd10, %fd11, %fd1;
	mov.f64 	%fd13, 0dBC7ABC9E3B39803F;
	fma.rn.f64 	%fd14, %fd10, %fd13, %fd12;
	add.s32 	%r12, %r9, %r9;
	setp.lt.u32	%p3, %r12, 2142496327;
	selp.b32	%r13, 0, %r11, %p3;
	selp.f64	%fd15, %fd1, %fd14, %p3;
	mov.f64 	%fd16, 0d3E5AF86D8EBD13CD;
	mov.f64 	%fd17, 0d3E21F4076ACD15B6;
	fma.rn.f64 	%fd18, %fd17, %fd15, %fd16;
	mov.f64 	%fd19, 0d3E927E5092BA033D;
	fma.rn.f64 	%fd20, %fd18, %fd15, %fd19;
	mov.f64 	%fd21, 0d3EC71DDE6C5F9DA1;
	fma.rn.f64 	%fd22, %fd20, %fd15, %fd21;
	mov.f64 	%fd23, 0d3EFA01A018D034E6;
	fma.rn.f64 	%fd24, %fd22, %fd15, %fd23;
	mov.f64 	%fd25, 0d3F2A01A01B3B6940;
	fma.rn.f64 	%fd26, %fd24, %fd15, %fd25;
	mov.f64 	%fd27, 0d3F56C16C16C1B5DD;
	fma.rn.f64 	%fd28, %fd26, %fd15, %fd27;
	mov.f64 	%fd29, 0d3F8111111110F74D;
	fma.rn.f64 	%fd30, %fd28, %fd15, %fd29;
	mov.f64 	%fd31, 0d3FA555555555554D;
	fma.rn.f64 	%fd32, %fd30, %fd15, %fd31;
	mov.f64 	%fd33, 0d3FC5555555555557;
	fma.rn.f64 	%fd34, %fd32, %fd15, %fd33;
	mov.f64 	%fd35, 0d3FE0000000000000;
	fma.rn.f64 	%fd36, %fd34, %fd15, %fd35;
	mul.f64 	%fd37, %fd15, %fd36;
	fma.rn.f64 	%fd38, %fd37, %fd15, %fd15;
	setp.eq.s32	%p4, %r13, 1024;
	selp.b32	%r14, -1, 0, %p4;
	add.s32 	%r15, %r14, %r13;
	shl.b32 	%r16, %r15, 20;
	add.s32 	%r17, %r16, 1072693248;
	mov.u32 	%r18, 0;
	mov.b64 	%fd39, {%r18, %r17};
	mov.u32 	%r19, 1071644672;
	mov.b64 	%fd40, {%r18, %r19};
	sub.f64 	%fd41, %fd39, %fd40;
	fma.rn.f64 	%fd42, %fd38, %fd39, %fd41;
	add.f64 	%fd43, %fd42, %fd42;
	selp.f64	%fd44, %fd43, %fd42, %p4;
	setp.eq.s32	%p5, %r12, 0;
	selp.f64	%fd45, %fd15, %fd44, %p5;
	mov.f64 	%fd46, 0d3FF0000000000000;
	mov.f64 	%fd47, 0d4000000000000000;
	fma.rn.f64 	%fd48, %fd47, %fd45, %fd46;
	div.rn.f64 	%fd49, %fd45, %fd48;
	add.f64 	%fd50, %fd49, %fd45;
	setp.ge.f64	%p6, %fd1, 0d408633CE8FB9F87E;
	selp.f64	%fd67, 0d7FF0000000000000, %fd50, %p6;

BB72_4:
	cvta.to.global.u64 	%rd7, %rd3;
	and.b32  	%r20, %r2, -2147483648;
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r21}, %fd67;
	}
	or.b32  	%r22, %r21, %r20;
	{
	.reg .b32 %temp; 
	mov.b64 	{%r23, %temp}, %fd67;
	}
	mov.b64 	%fd66, {%r23, %r22};
	shl.b64 	%rd8, %rd1, 3;
	add.s64 	%rd9, %rd7, %rd8;
	st.global.f64 	[%rd9], %fd66;

BB72_5:
	ret;
}

	// .globl	matrix_sinh_f
.visible .entry matrix_sinh_f(
	.param .u64 matrix_sinh_f_param_0,
	.param .u64 matrix_sinh_f_param_1,
	.param .u32 matrix_sinh_f_param_2
)
{
	.reg .pred 	%p<4>;
	.reg .f32 	%f<32>;
	.reg .b32 	%r<11>;
	.reg .b64 	%rd<10>;


	ld.param.u64 	%rd2, [matrix_sinh_f_param_0];
	ld.param.u64 	%rd3, [matrix_sinh_f_param_1];
	ld.param.u32 	%r2, [matrix_sinh_f_param_2];
	mov.u32 	%r3, %ctaid.x;
	mov.u32 	%r4, %ntid.x;
	mov.u32 	%r5, %tid.x;
	mad.lo.s32 	%r1, %r4, %r3, %r5;
	setp.ge.u32	%p1, %r1, %r2;
	@%p1 bra 	BB73_5;

	cvta.to.global.u64 	%rd4, %rd2;
	cvt.s64.s32	%rd1, %r1;
	mul.wide.s32 	%rd5, %r1, 4;
	add.s64 	%rd6, %rd4, %rd5;
	ld.global.f32 	%f1, [%rd6];
	abs.f32 	%f2, %f1;
	setp.ltu.f32	%p2, %f2, 0f3F800000;
	@%p2 bra 	BB73_3;
	bra.uni 	BB73_2;

BB73_3:
	mul.f32 	%f22, %f1, %f1;
	mov.f32 	%f23, 0f394FFF49;
	mov.f32 	%f24, 0f363D0ADA;
	fma.rn.f32 	%f25, %f24, %f22, %f23;
	mov.f32 	%f26, 0f3C08889A;
	fma.rn.f32 	%f27, %f25, %f22, %f26;
	mov.f32 	%f28, 0f3E2AAAAB;
	fma.rn.f32 	%f29, %f27, %f22, %f28;
	mul.f32 	%f30, %f22, %f29;
	fma.rn.f32 	%f31, %f30, %f1, %f1;
	bra.uni 	BB73_4;

BB73_2:
	mul.f32 	%f8, %f2, 0f3FB8AA3B;
	cvt.rzi.f32.f32	%f9, %f8;
	mov.f32 	%f10, 0fBF317200;
	fma.rn.f32 	%f11, %f9, %f10, %f2;
	mov.f32 	%f12, 0fB5BFBE8E;
	fma.rn.f32 	%f13, %f9, %f12, %f11;
	mul.f32 	%f7, %f13, 0f3FB8AA3B;
	// inline asm
	ex2.approx.ftz.f32 %f6,%f7;
	// inline asm
	add.f32 	%f14, %f9, 0fC0000000;
	ex2.approx.f32 	%f15, %f14;
	mul.f32 	%f16, %f6, %f15;
	mov.f32 	%f17, 0f3E000000;
	div.approx.f32 	%f18, %f17, %f16;
	neg.f32 	%f19, %f18;
	mov.f32 	%f20, 0f40000000;
	fma.rn.f32 	%f21, %f20, %f16, %f19;
	mov.b32 	 %r6, %f21;
	setp.ge.f32	%p3, %f2, 0f42B40000;
	selp.b32	%r7, 2139095040, %r6, %p3;
	mov.b32 	 %r8, %f1;
	and.b32  	%r9, %r8, -2147483648;
	or.b32  	%r10, %r7, %r9;
	mov.b32 	 %f31, %r10;

BB73_4:
	cvta.to.global.u64 	%rd7, %rd3;
	shl.b64 	%rd8, %rd1, 2;
	add.s64 	%rd9, %rd7, %rd8;
	st.global.f32 	[%rd9], %f31;

BB73_5:
	ret;
}
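
//
// matrix_sinh_{d,f} above branch on |x|: small arguments (|x| < 1.0) use
// an odd polynomial x + x^3*p(x^2), larger ones use the exponential
// identity sinh(x) = (e^|x| - e^-|x|)/2 with the sign bit reattached at
// the end. A sketch (names are assumptions):
//
//   extern "C" __global__ void matrix_sinh_f(float *in, float *out,
//                                            unsigned int size) {
//     unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i < size)
//       out[i] = sinhf(in[i]);
//   }
//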

	// .globl	matrix_cos_d
.visible .entry matrix_cos_d(
	.param .u64 matrix_cos_d_param_0,
	.param .u64 matrix_cos_d_param_1,
	.param .u32 matrix_cos_d_param_2
)
{
	.local .align 4 .b8 	__local_depot74[4];
	.reg .b64 	%SP;
	.reg .b64 	%SPL;
	.reg .pred 	%p<9>;
	.reg .b32 	%r<19>;
	.reg .f64 	%fd<41>;
	.reg .b64 	%rd<17>;


	mov.u64 	%rd16, __local_depot74;
	cvta.local.u64 	%SP, %rd16;
	ld.param.u64 	%rd3, [matrix_cos_d_param_0];
	ld.param.u64 	%rd4, [matrix_cos_d_param_1];
	ld.param.u32 	%r6, [matrix_cos_d_param_2];
	add.u64 	%rd5, %SP, 0;
	cvta.to.local.u64 	%rd1, %rd5;
	mov.u32 	%r7, %ntid.x;
	mov.u32 	%r8, %ctaid.x;
	mov.u32 	%r9, %tid.x;
	mad.lo.s32 	%r1, %r7, %r8, %r9;
	setp.ge.u32	%p1, %r1, %r6;
	@%p1 bra 	BB74_10;

	cvta.to.global.u64 	%rd6, %rd3;
	cvt.s64.s32	%rd2, %r1;
	mul.wide.s32 	%rd7, %r1, 8;
	add.s64 	%rd8, %rd6, %rd7;
	ld.global.f64 	%fd38, [%rd8];
	{
	.reg .b32 %temp; 
	mov.b64 	{%r10, %temp}, %fd38;
	}
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r11}, %fd38;
	}
	and.b32  	%r12, %r11, 2147483647;
	setp.eq.s32	%p2, %r12, 2146435072;
	setp.eq.s32	%p3, %r10, 0;
	and.pred  	%p4, %p2, %p3;
	@!%p4 bra 	BB74_3;
	bra.uni 	BB74_2;

BB74_2:
	mov.f64 	%fd14, 0d0000000000000000;
	mul.rn.f64 	%fd38, %fd38, %fd14;

BB74_3:
	mul.f64 	%fd15, %fd38, 0d3FE45F306DC9C883;
	cvt.rni.s32.f64	%r18, %fd15;
	st.local.u32 	[%rd1], %r18;
	cvt.rn.f64.s32	%fd16, %r18;
	neg.f64 	%fd17, %fd16;
	mov.f64 	%fd18, 0d3FF921FB54442D18;
	fma.rn.f64 	%fd19, %fd17, %fd18, %fd38;
	mov.f64 	%fd20, 0d3C91A62633145C00;
	fma.rn.f64 	%fd21, %fd17, %fd20, %fd19;
	mov.f64 	%fd22, 0d397B839A252049C0;
	fma.rn.f64 	%fd39, %fd17, %fd22, %fd21;
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r13}, %fd38;
	}
	and.b32  	%r14, %r13, 2145386496;
	setp.lt.u32	%p5, %r14, 1105199104;
	@%p5 bra 	BB74_5;

	// Callseq Start 4
	{
	.reg .b32 temp_param_reg;
	// }
	.param .b64 param0;
	st.param.f64	[param0+0], %fd38;
	.param .b64 param1;
	st.param.b64	[param1+0], %rd5;
	.param .b64 retval0;
	call.uni (retval0), 
	__internal_trig_reduction_slowpathd, 
	(
	param0, 
	param1
	);
	ld.param.f64	%fd39, [retval0+0];
	
	//{
	}// Callseq End 4
	ld.local.u32 	%r18, [%rd1];

BB74_5:
	add.s32 	%r5, %r18, 1;
	and.b32  	%r15, %r5, 1;
	shl.b32 	%r16, %r15, 3;
	setp.eq.b32	%p6, %r15, 1;
	selp.f64	%fd23, 0dBDA8FF8320FD8164, 0d3DE5DB65F9785EBA, %p6;
	mul.wide.u32 	%rd10, %r16, 8;
	mov.u64 	%rd11, __cudart_sin_cos_coeffs;
	add.s64 	%rd12, %rd10, %rd11;
	ld.const.f64 	%fd24, [%rd12+8];
	mul.rn.f64 	%fd7, %fd39, %fd39;
	fma.rn.f64 	%fd25, %fd23, %fd7, %fd24;
	ld.const.f64 	%fd26, [%rd12+16];
	fma.rn.f64 	%fd27, %fd25, %fd7, %fd26;
	ld.const.f64 	%fd28, [%rd12+24];
	fma.rn.f64 	%fd29, %fd27, %fd7, %fd28;
	ld.const.f64 	%fd30, [%rd12+32];
	fma.rn.f64 	%fd31, %fd29, %fd7, %fd30;
	ld.const.f64 	%fd32, [%rd12+40];
	fma.rn.f64 	%fd33, %fd31, %fd7, %fd32;
	ld.const.f64 	%fd34, [%rd12+48];
	fma.rn.f64 	%fd8, %fd33, %fd7, %fd34;
	fma.rn.f64 	%fd40, %fd8, %fd39, %fd39;
	setp.eq.s32	%p7, %r15, 0;
	@%p7 bra 	BB74_7;

	mov.f64 	%fd35, 0d3FF0000000000000;
	fma.rn.f64 	%fd40, %fd8, %fd7, %fd35;

BB74_7:
	and.b32  	%r17, %r5, 2;
	setp.eq.s32	%p8, %r17, 0;
	@%p8 bra 	BB74_9;

	mov.f64 	%fd36, 0d0000000000000000;
	mov.f64 	%fd37, 0dBFF0000000000000;
	fma.rn.f64 	%fd40, %fd40, %fd37, %fd36;

BB74_9:
	cvta.to.global.u64 	%rd13, %rd4;
	shl.b64 	%rd14, %rd2, 3;
	add.s64 	%rd15, %rd13, %rd14;
	st.global.f64 	[%rd15], %fd40;

BB74_10:
	ret;
}

	// .globl	matrix_cos_f
.visible .entry matrix_cos_f(
	.param .u64 matrix_cos_f_param_0,
	.param .u64 matrix_cos_f_param_1,
	.param .u32 matrix_cos_f_param_2
)
{
	.local .align 4 .b8 	__local_depot75[28];
	.reg .b64 	%SP;
	.reg .b64 	%SPL;
	.reg .pred 	%p<15>;
	.reg .f32 	%f<48>;
	.reg .b32 	%r<96>;
	.reg .b64 	%rd<22>;


	mov.u64 	%rd21, __local_depot75;
	cvta.local.u64 	%SP, %rd21;
	ld.param.u64 	%rd8, [matrix_cos_f_param_0];
	ld.param.u64 	%rd9, [matrix_cos_f_param_1];
	ld.param.u32 	%r31, [matrix_cos_f_param_2];
	mov.u32 	%r32, %ntid.x;
	mov.u32 	%r33, %ctaid.x;
	mov.u32 	%r34, %tid.x;
	mad.lo.s32 	%r1, %r32, %r33, %r34;
	setp.ge.u32	%p1, %r1, %r31;
	@%p1 bra 	BB75_22;

	cvta.to.global.u64 	%rd10, %rd8;
	cvt.s64.s32	%rd1, %r1;
	mul.wide.s32 	%rd11, %r1, 4;
	add.s64 	%rd12, %rd10, %rd11;
	add.u64 	%rd13, %SP, 0;
	cvta.to.local.u64 	%rd2, %rd13;
	ld.global.f32 	%f43, [%rd12];
	abs.f32 	%f19, %f43;
	setp.neu.f32	%p2, %f19, 0f7F800000;
	@%p2 bra 	BB75_3;

	mov.f32 	%f20, 0f00000000;
	mul.rn.f32 	%f43, %f43, %f20;

BB75_3:
	mul.f32 	%f21, %f43, 0f3F22F983;
	cvt.rni.s32.f32	%r95, %f21;
	cvt.rn.f32.s32	%f22, %r95;
	neg.f32 	%f23, %f22;
	mov.f32 	%f24, 0f3FC90FDA;
	fma.rn.f32 	%f25, %f23, %f24, %f43;
	mov.f32 	%f26, 0f33A22168;
	fma.rn.f32 	%f27, %f23, %f26, %f25;
	mov.f32 	%f28, 0f27C234C5;
	fma.rn.f32 	%f44, %f23, %f28, %f27;
	abs.f32 	%f29, %f43;
	setp.leu.f32	%p3, %f29, 0f47CE4780;
	@%p3 bra 	BB75_11;

	mov.b32 	 %r3, %f43;
	shr.u32 	%r4, %r3, 23;
	shl.b32 	%r37, %r3, 8;
	or.b32  	%r5, %r37, -2147483648;
	mov.u32 	%r89, 0;
	mov.u64 	%rd19, __cudart_i2opi_f;
	mov.u32 	%r88, -6;
	mov.u64 	%rd20, %rd2;

BB75_5:
	.pragma "nounroll";
	mov.u64 	%rd4, %rd20;
	ld.const.u32 	%r40, [%rd19];
	// inline asm
	{
	mad.lo.cc.u32   %r38, %r40, %r5, %r89;
	madc.hi.u32     %r89, %r40, %r5,  0;
	}
	// inline asm
	st.local.u32 	[%rd4], %r38;
	add.s64 	%rd5, %rd4, 4;
	add.s64 	%rd19, %rd19, 4;
	add.s32 	%r88, %r88, 1;
	setp.ne.s32	%p4, %r88, 0;
	mov.u64 	%rd20, %rd5;
	@%p4 bra 	BB75_5;

	and.b32  	%r43, %r4, 255;
	add.s32 	%r44, %r43, -128;
	shr.u32 	%r45, %r44, 5;
	and.b32  	%r10, %r3, -2147483648;
	st.local.u32 	[%rd2+24], %r89;
	mov.u32 	%r46, 6;
	sub.s32 	%r47, %r46, %r45;
	mul.wide.s32 	%rd15, %r47, 4;
	add.s64 	%rd7, %rd2, %rd15;
	ld.local.u32 	%r90, [%rd7];
	ld.local.u32 	%r91, [%rd7+-4];
	and.b32  	%r13, %r4, 31;
	setp.eq.s32	%p5, %r13, 0;
	@%p5 bra 	BB75_8;

	mov.u32 	%r48, 32;
	sub.s32 	%r49, %r48, %r13;
	shr.u32 	%r50, %r91, %r49;
	shl.b32 	%r51, %r90, %r13;
	add.s32 	%r90, %r50, %r51;
	ld.local.u32 	%r52, [%rd7+-8];
	shr.u32 	%r53, %r52, %r49;
	shl.b32 	%r54, %r91, %r13;
	add.s32 	%r91, %r53, %r54;

BB75_8:
	shr.u32 	%r55, %r91, 30;
	shl.b32 	%r56, %r90, 2;
	add.s32 	%r92, %r55, %r56;
	shl.b32 	%r19, %r91, 2;
	shr.u32 	%r57, %r92, 31;
	shr.u32 	%r58, %r90, 30;
	add.s32 	%r20, %r57, %r58;
	setp.eq.s32	%p6, %r57, 0;
	mov.u32 	%r93, %r10;
	mov.u32 	%r94, %r19;
	@%p6 bra 	BB75_10;

	not.b32 	%r59, %r92;
	neg.s32 	%r21, %r19;
	setp.eq.s32	%p7, %r19, 0;
	selp.u32	%r60, 1, 0, %p7;
	add.s32 	%r92, %r60, %r59;
	xor.b32  	%r23, %r10, -2147483648;
	mov.u32 	%r93, %r23;
	mov.u32 	%r94, %r21;

BB75_10:
	mov.u32 	%r25, %r93;
	neg.s32 	%r61, %r20;
	setp.ne.s32	%p8, %r10, 0;
	selp.b32	%r95, %r61, %r20, %p8;
	clz.b32 	%r62, %r92;
	setp.ne.s32	%p9, %r62, 0;
	shl.b32 	%r63, %r92, %r62;
	mov.u32 	%r64, 32;
	sub.s32 	%r65, %r64, %r62;
	shr.u32 	%r66, %r94, %r65;
	add.s32 	%r67, %r66, %r63;
	selp.b32	%r68, %r67, %r92, %p9;
	mul.lo.s32 	%r69, %r68, -921707870;
	mov.u32 	%r70, -921707870;
	mul.hi.u32 	%r71, %r68, %r70;
	setp.gt.s32	%p10, %r71, 0;
	shl.b32 	%r72, %r71, 1;
	shr.u32 	%r73, %r69, 31;
	add.s32 	%r74, %r73, %r72;
	selp.b32	%r75, %r74, %r71, %p10;
	selp.b32	%r76, -1, 0, %p10;
	mov.u32 	%r77, 126;
	sub.s32 	%r78, %r77, %r62;
	add.s32 	%r79, %r78, %r76;
	shl.b32 	%r80, %r79, 23;
	add.s32 	%r81, %r75, 1;
	shr.u32 	%r82, %r81, 7;
	add.s32 	%r83, %r82, 1;
	shr.u32 	%r84, %r83, 1;
	add.s32 	%r85, %r84, %r80;
	or.b32  	%r86, %r85, %r25;
	mov.b32 	 %f44, %r86;

BB75_11:
	mul.rn.f32 	%f7, %f44, %f44;
	add.s32 	%r29, %r95, 1;
	and.b32  	%r30, %r29, 1;
	setp.eq.s32	%p11, %r30, 0;
	@%p11 bra 	BB75_13;

	mov.f32 	%f30, 0fBAB6061A;
	mov.f32 	%f31, 0f37CCF5CE;
	fma.rn.f32 	%f45, %f31, %f7, %f30;
	bra.uni 	BB75_14;

BB75_13:
	mov.f32 	%f32, 0f3C08839E;
	mov.f32 	%f33, 0fB94CA1F9;
	fma.rn.f32 	%f45, %f33, %f7, %f32;

BB75_14:
	@%p11 bra 	BB75_16;

	mov.f32 	%f34, 0f3D2AAAA5;
	fma.rn.f32 	%f35, %f45, %f7, %f34;
	mov.f32 	%f36, 0fBF000000;
	fma.rn.f32 	%f46, %f35, %f7, %f36;
	bra.uni 	BB75_17;

BB75_16:
	mov.f32 	%f37, 0fBE2AAAA3;
	fma.rn.f32 	%f38, %f45, %f7, %f37;
	mov.f32 	%f39, 0f00000000;
	fma.rn.f32 	%f46, %f38, %f7, %f39;

BB75_17:
	fma.rn.f32 	%f47, %f46, %f44, %f44;
	@%p11 bra 	BB75_19;

	mov.f32 	%f40, 0f3F800000;
	fma.rn.f32 	%f47, %f46, %f7, %f40;

BB75_19:
	and.b32  	%r87, %r29, 2;
	setp.eq.s32	%p14, %r87, 0;
	@%p14 bra 	BB75_21;

	mov.f32 	%f41, 0f00000000;
	mov.f32 	%f42, 0fBF800000;
	fma.rn.f32 	%f47, %f47, %f42, %f41;

BB75_21:
	cvta.to.global.u64 	%rd16, %rd9;
	shl.b64 	%rd17, %rd1, 2;
	add.s64 	%rd18, %rd16, %rd17;
	st.global.f32 	[%rd18], %f47;

BB75_22:
	ret;
}
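
//
// matrix_cos_{d,f} above reuse the sine machinery: the same argument
// reduction (including the __cudart_i2opi slow path), with the quadrant
// index offset by one so the cosine polynomial is selected. A sketch
// (names are assumptions):
//
//   extern "C" __global__ void matrix_cos_f(float *in, float *out,
//                                           unsigned int size) {
//     unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i < size)
//       out[i] = cosf(in[i]);
//   }
//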

	// .globl	matrix_cosh_d
.visible .entry matrix_cosh_d(
	.param .u64 matrix_cosh_d_param_0,
	.param .u64 matrix_cosh_d_param_1,
	.param .u32 matrix_cosh_d_param_2
)
{
	.reg .pred 	%p<4>;
	.reg .b32 	%r<16>;
	.reg .f64 	%fd<46>;
	.reg .b64 	%rd<10>;


	ld.param.u64 	%rd2, [matrix_cosh_d_param_0];
	ld.param.u64 	%rd3, [matrix_cosh_d_param_1];
	ld.param.u32 	%r2, [matrix_cosh_d_param_2];
	mov.u32 	%r3, %ctaid.x;
	mov.u32 	%r4, %ntid.x;
	mov.u32 	%r5, %tid.x;
	mad.lo.s32 	%r1, %r4, %r3, %r5;
	setp.ge.u32	%p1, %r1, %r2;
	@%p1 bra 	BB76_5;

	cvta.to.global.u64 	%rd4, %rd2;
	cvt.s64.s32	%rd1, %r1;
	mul.wide.s32 	%rd5, %r1, 8;
	add.s64 	%rd6, %rd4, %rd5;
	ld.global.f64 	%fd1, [%rd6];
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r6}, %fd1;
	}
	and.b32  	%r7, %r6, 2147483647;
	{
	.reg .b32 %temp; 
	mov.b64 	{%r8, %temp}, %fd1;
	}
	mov.b64 	%fd2, {%r8, %r7};
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r9}, %fd2;
	}
	setp.lt.u32	%p2, %r9, 1082536911;
	@%p2 bra 	BB76_3;
	bra.uni 	BB76_2;

BB76_3:
	mov.f64 	%fd8, 0d4338000000000000;
	mov.f64 	%fd9, 0d3FF71547652B82FE;
	fma.rn.f64 	%fd10, %fd2, %fd9, %fd8;
	{
	.reg .b32 %temp; 
	mov.b64 	{%r10, %temp}, %fd10;
	}
	mov.f64 	%fd11, 0dC338000000000000;
	add.rn.f64 	%fd12, %fd10, %fd11;
	mov.f64 	%fd13, 0dBFE62E42FEFA39EF;
	fma.rn.f64 	%fd14, %fd12, %fd13, %fd2;
	mov.f64 	%fd15, 0dBC7ABC9E3B39803F;
	fma.rn.f64 	%fd16, %fd12, %fd15, %fd14;
	mov.f64 	%fd17, 0d3E928AF3FCA213EA;
	mov.f64 	%fd18, 0d3E5ADE1569CE2BDF;
	fma.rn.f64 	%fd19, %fd18, %fd16, %fd17;
	mov.f64 	%fd20, 0d3EC71DEE62401315;
	fma.rn.f64 	%fd21, %fd19, %fd16, %fd20;
	mov.f64 	%fd22, 0d3EFA01997C89EB71;
	fma.rn.f64 	%fd23, %fd21, %fd16, %fd22;
	mov.f64 	%fd24, 0d3F2A01A014761F65;
	fma.rn.f64 	%fd25, %fd23, %fd16, %fd24;
	mov.f64 	%fd26, 0d3F56C16C1852B7AF;
	fma.rn.f64 	%fd27, %fd25, %fd16, %fd26;
	mov.f64 	%fd28, 0d3F81111111122322;
	fma.rn.f64 	%fd29, %fd27, %fd16, %fd28;
	mov.f64 	%fd30, 0d3FA55555555502A1;
	fma.rn.f64 	%fd31, %fd29, %fd16, %fd30;
	mov.f64 	%fd32, 0d3FC5555555555511;
	fma.rn.f64 	%fd33, %fd31, %fd16, %fd32;
	mov.f64 	%fd34, 0d3FE000000000000B;
	fma.rn.f64 	%fd35, %fd33, %fd16, %fd34;
	mov.f64 	%fd36, 0d3FF0000000000000;
	fma.rn.f64 	%fd37, %fd35, %fd16, %fd36;
	fma.rn.f64 	%fd38, %fd37, %fd16, %fd36;
	shl.b32 	%r11, %r10, 20;
	{
	.reg .b32 %temp; 
	mov.b64 	{%r12, %temp}, %fd38;
	}
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r13}, %fd38;
	}
	add.s32 	%r14, %r11, %r13;
	add.s32 	%r15, %r14, -2097152;
	mov.b64 	%fd7, {%r12, %r15};
	// inline asm
	rcp.approx.ftz.f64 %fd6,%fd7;
	// inline asm
	neg.f64 	%fd39, %fd7;
	fma.rn.f64 	%fd40, %fd39, %fd6, %fd36;
	fma.rn.f64 	%fd41, %fd40, %fd40, %fd40;
	fma.rn.f64 	%fd42, %fd41, %fd6, %fd6;
	mov.f64 	%fd43, 0d3FB0000000000000;
	fma.rn.f64 	%fd45, %fd42, %fd43, %fd7;
	bra.uni 	BB76_4;

BB76_2:
	setp.le.f64	%p3, %fd1, 0d7FF0000000000000;
	selp.f64	%fd45, 0d7FF0000000000000, %fd1, %p3;

BB76_4:
	cvta.to.global.u64 	%rd7, %rd3;
	shl.b64 	%rd8, %rd1, 3;
	add.s64 	%rd9, %rd7, %rd8;
	add.f64 	%fd44, %fd45, %fd45;
	st.global.f64 	[%rd9], %fd44;

BB76_5:
	ret;
}

	// .globl	matrix_cosh_f
.visible .entry matrix_cosh_f(
	.param .u64 matrix_cosh_f_param_0,
	.param .u64 matrix_cosh_f_param_1,
	.param .u32 matrix_cosh_f_param_2
)
{
	.reg .pred 	%p<3>;
	.reg .f32 	%f<19>;
	.reg .b32 	%r<6>;
	.reg .b64 	%rd<8>;


	ld.param.u64 	%rd1, [matrix_cosh_f_param_0];
	ld.param.u64 	%rd2, [matrix_cosh_f_param_1];
	ld.param.u32 	%r2, [matrix_cosh_f_param_2];
	mov.u32 	%r3, %ctaid.x;
	mov.u32 	%r4, %ntid.x;
	mov.u32 	%r5, %tid.x;
	mad.lo.s32 	%r1, %r4, %r3, %r5;
	setp.ge.u32	%p1, %r1, %r2;
	@%p1 bra 	BB77_2;

	cvta.to.global.u64 	%rd3, %rd1;
	mul.wide.s32 	%rd4, %r1, 4;
	add.s64 	%rd5, %rd3, %rd4;
	ld.global.f32 	%f3, [%rd5];
	abs.f32 	%f4, %f3;
	mul.f32 	%f5, %f4, 0f3FB8AA3B;
	cvt.rzi.f32.f32	%f6, %f5;
	mov.f32 	%f7, 0fBF317200;
	fma.rn.f32 	%f8, %f6, %f7, %f4;
	mov.f32 	%f9, 0fB5BFBE8E;
	fma.rn.f32 	%f10, %f6, %f9, %f8;
	mul.f32 	%f2, %f10, 0f3FB8AA3B;
	// inline asm
	ex2.approx.ftz.f32 %f1,%f2;
	// inline asm
	add.f32 	%f11, %f6, 0fC0000000;
	ex2.approx.f32 	%f12, %f11;
	mul.f32 	%f13, %f1, %f12;
	mov.f32 	%f14, 0f3E000000;
	div.approx.f32 	%f15, %f14, %f13;
	mov.f32 	%f16, 0f40000000;
	fma.rn.f32 	%f17, %f16, %f13, %f15;
	setp.ge.f32	%p2, %f4, 0f42B40000;
	selp.f32	%f18, 0f7F800000, %f17, %p2;
	cvta.to.global.u64 	%rd6, %rd2;
	add.s64 	%rd7, %rd6, %rd4;
	st.global.f32 	[%rd7], %f18;

BB77_2:
	ret;
}
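
//
// matrix_cosh_{d,f} above evaluate cosh(x) = (e^|x| + e^-|x|)/2 from one
// exponential plus a reciprocal (the 0f42B40000 = 90.0 test clamps the
// float result to +inf on overflow). A sketch (names are assumptions):
//
//   extern "C" __global__ void matrix_cosh_f(float *in, float *out,
//                                            unsigned int size) {
//     unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i < size)
//       out[i] = coshf(in[i]);
//   }
//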

	// .globl	matrix_tan_d
.visible .entry matrix_tan_d(
	.param .u64 matrix_tan_d_param_0,
	.param .u64 matrix_tan_d_param_1,
	.param .u32 matrix_tan_d_param_2
)
{
	.local .align 4 .b8 	__local_depot78[4];
	.reg .b64 	%SP;
	.reg .b64 	%SPL;
	.reg .pred 	%p<7>;
	.reg .b32 	%r<16>;
	.reg .f64 	%fd<66>;
	.reg .b64 	%rd<14>;


	mov.u64 	%rd13, __local_depot78;
	cvta.local.u64 	%SP, %rd13;
	ld.param.u64 	%rd3, [matrix_tan_d_param_0];
	ld.param.u64 	%rd4, [matrix_tan_d_param_1];
	ld.param.u32 	%r5, [matrix_tan_d_param_2];
	add.u64 	%rd5, %SP, 0;
	cvta.to.local.u64 	%rd1, %rd5;
	mov.u32 	%r6, %ntid.x;
	mov.u32 	%r7, %ctaid.x;
	mov.u32 	%r8, %tid.x;
	mad.lo.s32 	%r1, %r6, %r7, %r8;
	setp.ge.u32	%p1, %r1, %r5;
	@%p1 bra 	BB78_8;

	cvta.to.global.u64 	%rd6, %rd3;
	cvt.s64.s32	%rd2, %r1;
	mul.wide.s32 	%rd7, %r1, 8;
	add.s64 	%rd8, %rd6, %rd7;
	ld.global.f64 	%fd63, [%rd8];
	{
	.reg .b32 %temp; 
	mov.b64 	{%r9, %temp}, %fd63;
	}
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r10}, %fd63;
	}
	and.b32  	%r11, %r10, 2147483647;
	setp.eq.s32	%p2, %r11, 2146435072;
	setp.eq.s32	%p3, %r9, 0;
	and.pred  	%p4, %p2, %p3;
	@!%p4 bra 	BB78_3;
	bra.uni 	BB78_2;

BB78_2:
	mov.f64 	%fd11, 0d0000000000000000;
	mul.rn.f64 	%fd63, %fd63, %fd11;

BB78_3:
	mul.f64 	%fd12, %fd63, 0d3FE45F306DC9C883;
	cvt.rni.s32.f64	%r15, %fd12;
	st.local.u32 	[%rd1], %r15;
	cvt.rn.f64.s32	%fd13, %r15;
	neg.f64 	%fd14, %fd13;
	mov.f64 	%fd15, 0d3FF921FB54442D18;
	fma.rn.f64 	%fd16, %fd14, %fd15, %fd63;
	mov.f64 	%fd17, 0d3C91A62633145C00;
	fma.rn.f64 	%fd18, %fd14, %fd17, %fd16;
	mov.f64 	%fd19, 0d397B839A252049C0;
	fma.rn.f64 	%fd64, %fd14, %fd19, %fd18;
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r12}, %fd63;
	}
	and.b32  	%r13, %r12, 2145386496;
	setp.lt.u32	%p5, %r13, 1105199104;
	@%p5 bra 	BB78_5;

	// Callseq Start 5
	{
	.reg .b32 temp_param_reg;
	// }
	.param .b64 param0;
	st.param.f64	[param0+0], %fd63;
	.param .b64 param1;
	st.param.b64	[param1+0], %rd5;
	.param .b64 retval0;
	call.uni (retval0), 
	__internal_trig_reduction_slowpathd, 
	(
	param0, 
	param1
	);
	ld.param.f64	%fd64, [retval0+0];
	
	//{
	}// Callseq End 5
	ld.local.u32 	%r15, [%rd1];

BB78_5:
	mul.f64 	%fd20, %fd64, %fd64;
	mov.f64 	%fd21, 0dBEF9757C5B27EBB1;
	mov.f64 	%fd22, 0d3EE48DAC2799BCB9;
	fma.rn.f64 	%fd23, %fd22, %fd20, %fd21;
	mov.f64 	%fd24, 0d3F0980E90FD91E04;
	fma.rn.f64 	%fd25, %fd23, %fd20, %fd24;
	mov.f64 	%fd26, 0dBEFAE2B0417D7E1D;
	fma.rn.f64 	%fd27, %fd25, %fd20, %fd26;
	mov.f64 	%fd28, 0d3F119F5341BFBA57;
	fma.rn.f64 	%fd29, %fd27, %fd20, %fd28;
	mov.f64 	%fd30, 0d3F15E791A00F6919;
	fma.rn.f64 	%fd31, %fd29, %fd20, %fd30;
	mov.f64 	%fd32, 0d3F2FF2E7FADEC73A;
	fma.rn.f64 	%fd33, %fd31, %fd20, %fd32;
	mov.f64 	%fd34, 0d3F434BC1B206DA62;
	fma.rn.f64 	%fd35, %fd33, %fd20, %fd34;
	mov.f64 	%fd36, 0d3F57DB18EF2F83F9;
	fma.rn.f64 	%fd37, %fd35, %fd20, %fd36;
	mov.f64 	%fd38, 0d3F6D6D2E7AE49FBC;
	fma.rn.f64 	%fd39, %fd37, %fd20, %fd38;
	mov.f64 	%fd40, 0d3F8226E3A816A776;
	fma.rn.f64 	%fd41, %fd39, %fd20, %fd40;
	mov.f64 	%fd42, 0d3F9664F485D25660;
	fma.rn.f64 	%fd43, %fd41, %fd20, %fd42;
	mov.f64 	%fd44, 0d3FABA1BA1BABF31D;
	fma.rn.f64 	%fd45, %fd43, %fd20, %fd44;
	mov.f64 	%fd46, 0d3FC11111111105D2;
	fma.rn.f64 	%fd47, %fd45, %fd20, %fd46;
	mov.f64 	%fd48, 0d3FD555555555555E;
	fma.rn.f64 	%fd49, %fd47, %fd20, %fd48;
	mul.f64 	%fd7, %fd20, %fd49;
	fma.rn.f64 	%fd65, %fd7, %fd64, %fd64;
	and.b32  	%r14, %r15, 1;
	setp.eq.b32	%p6, %r14, 1;
	@!%p6 bra 	BB78_7;
	bra.uni 	BB78_6;

BB78_6:
	sub.f64 	%fd52, %fd65, %fd64;
	neg.f64 	%fd53, %fd52;
	fma.rn.f64 	%fd54, %fd7, %fd64, %fd53;
	// inline asm
	rcp.approx.ftz.f64 %fd50,%fd65;
	// inline asm
	neg.f64 	%fd55, %fd65;
	mov.f64 	%fd56, 0d3FF0000000000000;
	fma.rn.f64 	%fd57, %fd55, %fd50, %fd56;
	fma.rn.f64 	%fd58, %fd57, %fd57, %fd57;
	fma.rn.f64 	%fd59, %fd58, %fd50, %fd50;
	neg.f64 	%fd60, %fd59;
	fma.rn.f64 	%fd61, %fd65, %fd60, %fd56;
	fma.rn.f64 	%fd62, %fd60, %fd54, %fd61;
	fma.rn.f64 	%fd65, %fd62, %fd60, %fd60;

BB78_7:
	cvta.to.global.u64 	%rd10, %rd4;
	shl.b64 	%rd11, %rd2, 3;
	add.s64 	%rd12, %rd10, %rd11;
	st.global.f64 	[%rd12], %fd65;

BB78_8:
	ret;
}

	// .globl	matrix_tan_f
.visible .entry matrix_tan_f(
	.param .u64 matrix_tan_f_param_0,
	.param .u64 matrix_tan_f_param_1,
	.param .u32 matrix_tan_f_param_2
)
{
	.local .align 4 .b8 	__local_depot79[28];
	.reg .b64 	%SP;
	.reg .b64 	%SPL;
	.reg .pred 	%p<12>;
	.reg .f32 	%f<33>;
	.reg .b32 	%r<94>;
	.reg .b64 	%rd<22>;


	mov.u64 	%rd21, __local_depot79;
	cvta.local.u64 	%SP, %rd21;
	ld.param.u64 	%rd8, [matrix_tan_f_param_0];
	ld.param.u64 	%rd9, [matrix_tan_f_param_1];
	ld.param.u32 	%r29, [matrix_tan_f_param_2];
	mov.u32 	%r30, %ntid.x;
	mov.u32 	%r31, %ctaid.x;
	mov.u32 	%r32, %tid.x;
	mad.lo.s32 	%r1, %r30, %r31, %r32;
	setp.ge.u32	%p1, %r1, %r29;
	@%p1 bra 	BB79_14;

	cvta.to.global.u64 	%rd10, %rd8;
	cvt.s64.s32	%rd1, %r1;
	mul.wide.s32 	%rd11, %r1, 4;
	add.s64 	%rd12, %rd10, %rd11;
	add.u64 	%rd13, %SP, 0;
	cvta.to.local.u64 	%rd2, %rd13;
	ld.global.f32 	%f30, [%rd12];
	abs.f32 	%f10, %f30;
	setp.neu.f32	%p2, %f10, 0f7F800000;
	@%p2 bra 	BB79_3;

	mov.f32 	%f11, 0f00000000;
	mul.rn.f32 	%f30, %f30, %f11;

BB79_3:
	mul.f32 	%f12, %f30, 0f3F22F983;
	cvt.rni.s32.f32	%r93, %f12;
	cvt.rn.f32.s32	%f13, %r93;
	neg.f32 	%f14, %f13;
	mov.f32 	%f15, 0f3FC90FDA;
	fma.rn.f32 	%f16, %f14, %f15, %f30;
	mov.f32 	%f17, 0f33A22168;
	fma.rn.f32 	%f18, %f14, %f17, %f16;
	mov.f32 	%f19, 0f27C234C5;
	fma.rn.f32 	%f31, %f14, %f19, %f18;
	abs.f32 	%f20, %f30;
	setp.leu.f32	%p3, %f20, 0f47CE4780;
	@%p3 bra 	BB79_11;

	mov.b32 	 %r3, %f30;
	shr.u32 	%r4, %r3, 23;
	shl.b32 	%r35, %r3, 8;
	or.b32  	%r5, %r35, -2147483648;
	mov.u32 	%r87, 0;
	mov.u64 	%rd19, __cudart_i2opi_f;
	mov.u32 	%r86, -6;
	mov.u64 	%rd20, %rd2;

BB79_5:
	.pragma "nounroll";
	mov.u64 	%rd4, %rd20;
	ld.const.u32 	%r38, [%rd19];
	// inline asm
	{
	mad.lo.cc.u32   %r36, %r38, %r5, %r87;
	madc.hi.u32     %r87, %r38, %r5,  0;
	}
	// inline asm
	st.local.u32 	[%rd4], %r36;
	add.s64 	%rd5, %rd4, 4;
	add.s64 	%rd19, %rd19, 4;
	add.s32 	%r86, %r86, 1;
	setp.ne.s32	%p4, %r86, 0;
	mov.u64 	%rd20, %rd5;
	@%p4 bra 	BB79_5;

	and.b32  	%r41, %r4, 255;
	add.s32 	%r42, %r41, -128;
	shr.u32 	%r43, %r42, 5;
	and.b32  	%r10, %r3, -2147483648;
	st.local.u32 	[%rd2+24], %r87;
	mov.u32 	%r44, 6;
	sub.s32 	%r45, %r44, %r43;
	mul.wide.s32 	%rd15, %r45, 4;
	add.s64 	%rd7, %rd2, %rd15;
	ld.local.u32 	%r88, [%rd7];
	ld.local.u32 	%r89, [%rd7+-4];
	and.b32  	%r13, %r4, 31;
	setp.eq.s32	%p5, %r13, 0;
	@%p5 bra 	BB79_8;

	mov.u32 	%r46, 32;
	sub.s32 	%r47, %r46, %r13;
	shr.u32 	%r48, %r89, %r47;
	shl.b32 	%r49, %r88, %r13;
	add.s32 	%r88, %r48, %r49;
	ld.local.u32 	%r50, [%rd7+-8];
	shr.u32 	%r51, %r50, %r47;
	shl.b32 	%r52, %r89, %r13;
	add.s32 	%r89, %r51, %r52;

BB79_8:
	shr.u32 	%r53, %r89, 30;
	shl.b32 	%r54, %r88, 2;
	add.s32 	%r90, %r53, %r54;
	shl.b32 	%r19, %r89, 2;
	shr.u32 	%r55, %r90, 31;
	shr.u32 	%r56, %r88, 30;
	add.s32 	%r20, %r55, %r56;
	setp.eq.s32	%p6, %r55, 0;
	mov.u32 	%r91, %r10;
	mov.u32 	%r92, %r19;
	@%p6 bra 	BB79_10;

	not.b32 	%r57, %r90;
	neg.s32 	%r21, %r19;
	setp.eq.s32	%p7, %r19, 0;
	selp.u32	%r58, 1, 0, %p7;
	add.s32 	%r90, %r58, %r57;
	xor.b32  	%r23, %r10, -2147483648;
	mov.u32 	%r91, %r23;
	mov.u32 	%r92, %r21;

BB79_10:
	mov.u32 	%r25, %r91;
	neg.s32 	%r59, %r20;
	setp.ne.s32	%p8, %r10, 0;
	selp.b32	%r93, %r59, %r20, %p8;
	clz.b32 	%r60, %r90;
	setp.ne.s32	%p9, %r60, 0;
	shl.b32 	%r61, %r90, %r60;
	mov.u32 	%r62, 32;
	sub.s32 	%r63, %r62, %r60;
	shr.u32 	%r64, %r92, %r63;
	add.s32 	%r65, %r64, %r61;
	selp.b32	%r66, %r65, %r90, %p9;
	mul.lo.s32 	%r67, %r66, -921707870;
	mov.u32 	%r68, -921707870;
	mul.hi.u32 	%r69, %r66, %r68;
	setp.gt.s32	%p10, %r69, 0;
	shl.b32 	%r70, %r69, 1;
	shr.u32 	%r71, %r67, 31;
	add.s32 	%r72, %r71, %r70;
	selp.b32	%r73, %r72, %r69, %p10;
	selp.b32	%r74, -1, 0, %p10;
	mov.u32 	%r75, 126;
	sub.s32 	%r76, %r75, %r60;
	add.s32 	%r77, %r76, %r74;
	shl.b32 	%r78, %r77, 23;
	add.s32 	%r79, %r73, 1;
	shr.u32 	%r80, %r79, 7;
	add.s32 	%r81, %r80, 1;
	shr.u32 	%r82, %r81, 1;
	add.s32 	%r83, %r82, %r78;
	or.b32  	%r84, %r83, %r25;
	mov.b32 	 %f31, %r84;

BB79_11:
	mul.f32 	%f21, %f31, %f31;
	mov.f32 	%f22, 0fBF52B7F4;
	mov.f32 	%f23, 0f3B86D46D;
	fma.rn.f32 	%f24, %f23, %f21, %f22;
	add.f32 	%f25, %f21, 0fC01E09D0;
	rcp.rn.f32 	%f26, %f25;
	mul.f32 	%f27, %f24, %f26;
	mul.f32 	%f28, %f21, %f27;
	fma.rn.f32 	%f32, %f28, %f31, %f31;
	and.b32  	%r85, %r93, 1;
	setp.eq.b32	%p11, %r85, 1;
	@!%p11 bra 	BB79_13;
	bra.uni 	BB79_12;

BB79_12:
	mov.f32 	%f29, 0fBF800000;
	div.rn.f32 	%f32, %f29, %f32;

BB79_13:
	cvta.to.global.u64 	%rd16, %rd9;
	shl.b64 	%rd17, %rd1, 2;
	add.s64 	%rd18, %rd16, %rd17;
	st.global.f32 	[%rd18], %f32;

BB79_14:
	ret;
}

	// .globl	matrix_tanh_d
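	// matrix_tanh_d: elementwise out[i] = tanh(in[i]) on doubles, one
	// thread per element for i < n. |x| below ~0.555 takes an odd
	// polynomial in x^2 (BB80_3); otherwise BB80_2 computes
	// tanh(|x|) = 1 - 2/(exp(2|x|) + 1), saturating to 1 once the
	// high word exceeds 0x403FFFFF (|x| >= 32). The input's sign bit
	// is merged back in at BB80_4.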
.visible .entry matrix_tanh_d(
	.param .u64 matrix_tanh_d_param_0,
	.param .u64 matrix_tanh_d_param_1,
	.param .u32 matrix_tanh_d_param_2
)
{
	.reg .pred 	%p<4>;
	.reg .b32 	%r<17>;
	.reg .f64 	%fd<74>;
	.reg .b64 	%rd<10>;


	ld.param.u64 	%rd2, [matrix_tanh_d_param_0];
	ld.param.u64 	%rd3, [matrix_tanh_d_param_1];
	ld.param.u32 	%r4, [matrix_tanh_d_param_2];
	mov.u32 	%r5, %ctaid.x;
	mov.u32 	%r6, %ntid.x;
	mov.u32 	%r7, %tid.x;
	mad.lo.s32 	%r1, %r6, %r5, %r7;
	setp.ge.u32	%p1, %r1, %r4;
	@%p1 bra 	BB80_5;

	cvta.to.global.u64 	%rd4, %rd2;
	cvt.s64.s32	%rd1, %r1;
	mul.wide.s32 	%rd5, %r1, 8;
	add.s64 	%rd6, %rd4, %rd5;
	ld.global.f64 	%fd1, [%rd6];
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r2}, %fd1;
	}
	and.b32  	%r3, %r2, 2147483647;
	{
	.reg .b32 %temp; 
	mov.b64 	{%r8, %temp}, %fd1;
	}
	mov.b64 	%fd2, {%r8, %r3};
	setp.ltu.f64	%p2, %fd2, 0d3FE1C7A398201CD6;
	@%p2 bra 	BB80_3;
	bra.uni 	BB80_2;

BB80_3:
	mul.f64 	%fd51, %fd1, %fd1;
	mov.f64 	%fd52, 0dBF2B9093D89F0E23;
	mov.f64 	%fd53, 0d3F0ABFFC9B5786C4;
	fma.rn.f64 	%fd54, %fd53, %fd51, %fd52;
	mov.f64 	%fd55, 0d3F42FA2744C30B61;
	fma.rn.f64 	%fd56, %fd54, %fd51, %fd55;
	mov.f64 	%fd57, 0dBF57CF3B9C1E491D;
	fma.rn.f64 	%fd58, %fd56, %fd51, %fd57;
	mov.f64 	%fd59, 0d3F6D6C61D450119A;
	fma.rn.f64 	%fd60, %fd58, %fd51, %fd59;
	mov.f64 	%fd61, 0dBF8226DDD44294F5;
	fma.rn.f64 	%fd62, %fd60, %fd51, %fd61;
	mov.f64 	%fd63, 0d3F9664F45C2B04A6;
	fma.rn.f64 	%fd64, %fd62, %fd51, %fd63;
	mov.f64 	%fd65, 0dBFABA1BA1AD70754;
	fma.rn.f64 	%fd66, %fd64, %fd51, %fd65;
	mov.f64 	%fd67, 0d3FC111111110295E;
	fma.rn.f64 	%fd68, %fd66, %fd51, %fd67;
	mov.f64 	%fd69, 0dBFD555555555549F;
	fma.rn.f64 	%fd70, %fd68, %fd51, %fd69;
	mul.f64 	%fd71, %fd51, %fd70;
	fma.rn.f64 	%fd73, %fd71, %fd1, %fd1;
	bra.uni 	BB80_4;

BB80_2:
	add.f64 	%fd8, %fd2, %fd2;
	mov.f64 	%fd9, 0d4338000000000000;
	mov.f64 	%fd10, 0d3FF71547652B82FE;
	fma.rn.f64 	%fd11, %fd8, %fd10, %fd9;
	{
	.reg .b32 %temp; 
	mov.b64 	{%r9, %temp}, %fd11;
	}
	mov.f64 	%fd12, 0dC338000000000000;
	add.rn.f64 	%fd13, %fd11, %fd12;
	mov.f64 	%fd14, 0dBFE62E42FEFA39EF;
	fma.rn.f64 	%fd15, %fd13, %fd14, %fd8;
	mov.f64 	%fd16, 0dBC7ABC9E3B39803F;
	fma.rn.f64 	%fd17, %fd13, %fd16, %fd15;
	mov.f64 	%fd18, 0d3E5AF86D8EBD13CD;
	mov.f64 	%fd19, 0d3E21F4076ACD15B6;
	fma.rn.f64 	%fd20, %fd19, %fd17, %fd18;
	mov.f64 	%fd21, 0d3E927E5092BA033D;
	fma.rn.f64 	%fd22, %fd20, %fd17, %fd21;
	mov.f64 	%fd23, 0d3EC71DDE6C5F9DA1;
	fma.rn.f64 	%fd24, %fd22, %fd17, %fd23;
	mov.f64 	%fd25, 0d3EFA01A018D034E6;
	fma.rn.f64 	%fd26, %fd24, %fd17, %fd25;
	mov.f64 	%fd27, 0d3F2A01A01B3B6940;
	fma.rn.f64 	%fd28, %fd26, %fd17, %fd27;
	mov.f64 	%fd29, 0d3F56C16C16C1B5DD;
	fma.rn.f64 	%fd30, %fd28, %fd17, %fd29;
	mov.f64 	%fd31, 0d3F8111111110F74D;
	fma.rn.f64 	%fd32, %fd30, %fd17, %fd31;
	mov.f64 	%fd33, 0d3FA555555555554D;
	fma.rn.f64 	%fd34, %fd32, %fd17, %fd33;
	mov.f64 	%fd35, 0d3FC5555555555557;
	fma.rn.f64 	%fd36, %fd34, %fd17, %fd35;
	mov.f64 	%fd37, 0d3FE0000000000000;
	fma.rn.f64 	%fd38, %fd36, %fd17, %fd37;
	mul.f64 	%fd39, %fd17, %fd38;
	fma.rn.f64 	%fd40, %fd39, %fd17, %fd17;
	shl.b32 	%r10, %r9, 20;
	add.s32 	%r11, %r10, 1072693248;
	mov.u32 	%r12, 0;
	mov.b64 	%fd41, {%r12, %r11};
	fma.rn.f64 	%fd42, %fd40, %fd41, %fd41;
	add.f64 	%fd7, %fd42, 0d3FF0000000000000;
	// inline asm
	rcp.approx.ftz.f64 %fd6,%fd7;
	// inline asm
	neg.f64 	%fd43, %fd7;
	mov.f64 	%fd44, 0d3FF0000000000000;
	fma.rn.f64 	%fd45, %fd43, %fd6, %fd44;
	fma.rn.f64 	%fd46, %fd45, %fd45, %fd45;
	fma.rn.f64 	%fd47, %fd46, %fd6, %fd6;
	neg.f64 	%fd48, %fd47;
	mov.f64 	%fd49, 0d4000000000000000;
	fma.rn.f64 	%fd50, %fd49, %fd48, %fd44;
	setp.gt.u32	%p3, %r3, 1077936127;
	selp.f64	%fd73, 0d3FF0000000000000, %fd50, %p3;

BB80_4:
	cvta.to.global.u64 	%rd7, %rd3;
	and.b32  	%r13, %r2, -2147483648;
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r14}, %fd73;
	}
	or.b32  	%r15, %r14, %r13;
	{
	.reg .b32 %temp; 
	mov.b64 	{%r16, %temp}, %fd73;
	}
	mov.b64 	%fd72, {%r16, %r15};
	shl.b64 	%rd8, %rd1, 3;
	add.s64 	%rd9, %rd7, %rd8;
	st.global.f64 	[%rd9], %fd72;

BB80_5:
	ret;
}

	// .globl	matrix_tanh_f
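	// matrix_tanh_f: single-precision tanh. |x| < 0.55 uses a short
	// polynomial with a signed-zero fixup (BB81_3); otherwise BB81_2
	// forms 1 - 2/(exp(2|x|) + 1) via ex2, clamps to 1 for |x| >= 88,
	// and ORs the input's sign bit back onto the result.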
.visible .entry matrix_tanh_f(
	.param .u64 matrix_tanh_f_param_0,
	.param .u64 matrix_tanh_f_param_1,
	.param .u32 matrix_tanh_f_param_2
)
{
	.reg .pred 	%p<5>;
	.reg .f32 	%f<33>;
	.reg .b32 	%r<11>;
	.reg .b64 	%rd<10>;


	ld.param.u64 	%rd2, [matrix_tanh_f_param_0];
	ld.param.u64 	%rd3, [matrix_tanh_f_param_1];
	ld.param.u32 	%r2, [matrix_tanh_f_param_2];
	mov.u32 	%r3, %ctaid.x;
	mov.u32 	%r4, %ntid.x;
	mov.u32 	%r5, %tid.x;
	mad.lo.s32 	%r1, %r4, %r3, %r5;
	setp.ge.u32	%p1, %r1, %r2;
	@%p1 bra 	BB81_5;

	cvta.to.global.u64 	%rd4, %rd2;
	cvt.s64.s32	%rd1, %r1;
	mul.wide.s32 	%rd5, %r1, 4;
	add.s64 	%rd6, %rd4, %rd5;
	ld.global.f32 	%f1, [%rd6];
	abs.f32 	%f2, %f1;
	setp.ltu.f32	%p2, %f2, 0f3F0CCCCD;
	@%p2 bra 	BB81_3;
	bra.uni 	BB81_2;

BB81_3:
	mul.f32 	%f21, %f1, %f1;
	mov.f32 	%f22, 0fBD57BE66;
	mov.f32 	%f23, 0f3C86A81B;
	fma.rn.f32 	%f24, %f23, %f21, %f22;
	mov.f32 	%f25, 0f3E08677B;
	fma.rn.f32 	%f26, %f24, %f21, %f25;
	mov.f32 	%f27, 0fBEAAAA29;
	fma.rn.f32 	%f28, %f26, %f21, %f27;
	mul.f32 	%f29, %f21, %f28;
	fma.rn.f32 	%f30, %f29, %f1, %f1;
	add.f32 	%f31, %f1, %f1;
	setp.eq.f32	%p4, %f1, 0f00000000;
	selp.f32	%f32, %f31, %f30, %p4;
	bra.uni 	BB81_4;

BB81_2:
	add.f32 	%f10, %f2, %f2;
	mul.f32 	%f11, %f10, 0f3FB8AA3B;
	cvt.rzi.f32.f32	%f12, %f11;
	mov.f32 	%f13, 0fBF317200;
	fma.rn.f32 	%f14, %f12, %f13, %f10;
	mov.f32 	%f15, 0fB5BFBE8E;
	fma.rn.f32 	%f16, %f12, %f15, %f14;
	mul.f32 	%f7, %f16, 0f3FB8AA3B;
	// inline asm
	ex2.approx.ftz.f32 %f6,%f7;
	// inline asm
	ex2.approx.f32 	%f17, %f12;
	mov.f32 	%f18, 0f3F800000;
	fma.rn.f32 	%f9, %f6, %f17, %f18;
	// inline asm
	rcp.approx.ftz.f32 %f8,%f9;
	// inline asm
	mov.f32 	%f19, 0fC0000000;
	fma.rn.f32 	%f20, %f8, %f19, %f18;
	mov.b32 	 %r6, %f20;
	setp.ge.f32	%p3, %f2, 0f42B00000;
	selp.b32	%r7, 1065353216, %r6, %p3;
	mov.b32 	 %r8, %f1;
	and.b32  	%r9, %r8, -2147483648;
	or.b32  	%r10, %r7, %r9;
	mov.b32 	 %f32, %r10;

BB81_4:
	cvta.to.global.u64 	%rd7, %rd3;
	shl.b64 	%rd8, %rd1, 2;
	add.s64 	%rd9, %rd7, %rd8;
	st.global.f32 	[%rd9], %f32;

BB81_5:
	ret;
}

	// .globl	matrix_asin_d
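	// matrix_asin_d: elementwise arcsine on doubles. The branch test
	// reinterprets the input's high word as a float: roughly
	// |x| < 0.575 evaluates an odd polynomial directly (BB82_3);
	// otherwise BB82_2 builds sqrt((1 - |x|)/2) from rsqrt.approx plus
	// refinement and uses asin(|x|) = pi/2 - 2*asin(sqrt((1 - |x|)/2)),
	// restoring the sign at the end.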
.visible .entry matrix_asin_d(
	.param .u64 matrix_asin_d_param_0,
	.param .u64 matrix_asin_d_param_1,
	.param .u32 matrix_asin_d_param_2
)
{
	.reg .pred 	%p<5>;
	.reg .f32 	%f<3>;
	.reg .b32 	%r<15>;
	.reg .f64 	%fd<83>;
	.reg .b64 	%rd<10>;


	ld.param.u64 	%rd2, [matrix_asin_d_param_0];
	ld.param.u64 	%rd3, [matrix_asin_d_param_1];
	ld.param.u32 	%r3, [matrix_asin_d_param_2];
	mov.u32 	%r4, %ctaid.x;
	mov.u32 	%r5, %ntid.x;
	mov.u32 	%r6, %tid.x;
	mad.lo.s32 	%r1, %r5, %r4, %r6;
	setp.ge.u32	%p1, %r1, %r3;
	@%p1 bra 	BB82_5;

	cvta.to.global.u64 	%rd4, %rd2;
	cvt.s64.s32	%rd1, %r1;
	mul.wide.s32 	%rd5, %r1, 8;
	add.s64 	%rd6, %rd4, %rd5;
	ld.global.f64 	%fd1, [%rd6];
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r2}, %fd1;
	}
	mov.b32 	 %f1, %r2;
	abs.f32 	%f2, %f1;
	setp.lt.f32	%p2, %f2, 0f3FE26666;
	@%p2 bra 	BB82_3;
	bra.uni 	BB82_2;

BB82_3:
	mul.f64 	%fd55, %fd1, %fd1;
	mov.f64 	%fd56, 0dBFB3823B180754AF;
	mov.f64 	%fd57, 0d3FB0066BDC1895E9;
	fma.rn.f64 	%fd58, %fd57, %fd55, %fd56;
	mov.f64 	%fd59, 0d3FB11E52CC2F79AE;
	fma.rn.f64 	%fd60, %fd58, %fd55, %fd59;
	mov.f64 	%fd61, 0dBF924EAF3526861B;
	fma.rn.f64 	%fd62, %fd60, %fd55, %fd61;
	mov.f64 	%fd63, 0d3F91DF02A31E6CB7;
	fma.rn.f64 	%fd64, %fd62, %fd55, %fd63;
	mov.f64 	%fd65, 0d3F847D18B0EEC6CC;
	fma.rn.f64 	%fd66, %fd64, %fd55, %fd65;
	mov.f64 	%fd67, 0d3F8D0AF961BA53B0;
	fma.rn.f64 	%fd68, %fd66, %fd55, %fd67;
	mov.f64 	%fd69, 0d3F91BF7734CF1C48;
	fma.rn.f64 	%fd70, %fd68, %fd55, %fd69;
	mov.f64 	%fd71, 0d3F96E91483144EF7;
	fma.rn.f64 	%fd72, %fd70, %fd55, %fd71;
	mov.f64 	%fd73, 0d3F9F1C6E0A4F9F81;
	fma.rn.f64 	%fd74, %fd72, %fd55, %fd73;
	mov.f64 	%fd75, 0d3FA6DB6DC27FA92B;
	fma.rn.f64 	%fd76, %fd74, %fd55, %fd75;
	mov.f64 	%fd77, 0d3FB333333320F91B;
	fma.rn.f64 	%fd78, %fd76, %fd55, %fd77;
	mov.f64 	%fd79, 0d3FC5555555555F4D;
	fma.rn.f64 	%fd80, %fd78, %fd55, %fd79;
	mul.f64 	%fd81, %fd55, %fd80;
	fma.rn.f64 	%fd82, %fd81, %fd1, %fd1;
	bra.uni 	BB82_4;

BB82_2:
	abs.f64 	%fd7, %fd1;
	mov.f64 	%fd8, 0d3FE0000000000000;
	mov.f64 	%fd9, 0dBFE0000000000000;
	fma.rn.f64 	%fd6, %fd9, %fd7, %fd8;
	// inline asm
	rsqrt.approx.ftz.f64 %fd5, %fd6;
	// inline asm
	{
	.reg .b32 %temp; 
	mov.b64 	{%r7, %temp}, %fd5;
	}
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r8}, %fd5;
	}
	add.s32 	%r9, %r8, -1048576;
	mov.b64 	%fd10, {%r7, %r9};
	mul.f64 	%fd11, %fd6, %fd5;
	neg.f64 	%fd12, %fd11;
	fma.rn.f64 	%fd13, %fd11, %fd12, %fd6;
	fma.rn.f64 	%fd14, %fd13, %fd10, %fd11;
	neg.f64 	%fd15, %fd14;
	mov.f64 	%fd16, 0d3FF0000000000000;
	fma.rn.f64 	%fd17, %fd5, %fd15, %fd16;
	fma.rn.f64 	%fd18, %fd17, %fd10, %fd10;
	fma.rn.f64 	%fd19, %fd14, %fd15, %fd6;
	fma.rn.f64 	%fd20, %fd19, %fd18, %fd14;
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r10}, %fd6;
	}
	setp.lt.s32	%p3, %r10, 0;
	selp.f64	%fd21, 0dFFF8000000000000, %fd20, %p3;
	setp.ne.f64	%p4, %fd6, 0d0000000000000000;
	selp.f64	%fd22, %fd21, %fd6, %p4;
	mov.f64 	%fd23, 0dBFB3823B180754AF;
	mov.f64 	%fd24, 0d3FB0066BDC1895E9;
	fma.rn.f64 	%fd25, %fd24, %fd6, %fd23;
	mov.f64 	%fd26, 0d3FB11E52CC2F79AE;
	fma.rn.f64 	%fd27, %fd25, %fd6, %fd26;
	mov.f64 	%fd28, 0dBF924EAF3526861B;
	fma.rn.f64 	%fd29, %fd27, %fd6, %fd28;
	mov.f64 	%fd30, 0d3F91DF02A31E6CB7;
	fma.rn.f64 	%fd31, %fd29, %fd6, %fd30;
	mov.f64 	%fd32, 0d3F847D18B0EEC6CC;
	fma.rn.f64 	%fd33, %fd31, %fd6, %fd32;
	mov.f64 	%fd34, 0d3F8D0AF961BA53B0;
	fma.rn.f64 	%fd35, %fd33, %fd6, %fd34;
	mov.f64 	%fd36, 0d3F91BF7734CF1C48;
	fma.rn.f64 	%fd37, %fd35, %fd6, %fd36;
	mov.f64 	%fd38, 0d3F96E91483144EF7;
	fma.rn.f64 	%fd39, %fd37, %fd6, %fd38;
	mov.f64 	%fd40, 0d3F9F1C6E0A4F9F81;
	fma.rn.f64 	%fd41, %fd39, %fd6, %fd40;
	mov.f64 	%fd42, 0d3FA6DB6DC27FA92B;
	fma.rn.f64 	%fd43, %fd41, %fd6, %fd42;
	mov.f64 	%fd44, 0d3FB333333320F91B;
	fma.rn.f64 	%fd45, %fd43, %fd6, %fd44;
	mov.f64 	%fd46, 0d3FC5555555555F4D;
	fma.rn.f64 	%fd47, %fd45, %fd6, %fd46;
	mul.f64 	%fd48, %fd6, %fd47;
	mul.f64 	%fd49, %fd22, 0dC000000000000000;
	mov.f64 	%fd50, 0d3C91A62633145C07;
	fma.rn.f64 	%fd51, %fd49, %fd48, %fd50;
	add.f64 	%fd52, %fd49, 0d3FE921FB54442D18;
	add.f64 	%fd53, %fd52, %fd51;
	add.f64 	%fd54, %fd53, 0d3FE921FB54442D18;
	{
	.reg .b32 %temp; 
	mov.b64 	{%r11, %temp}, %fd54;
	}
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r12}, %fd54;
	}
	and.b32  	%r13, %r2, -2147483648;
	or.b32  	%r14, %r12, %r13;
	mov.b64 	%fd82, {%r11, %r14};

BB82_4:
	cvta.to.global.u64 	%rd7, %rd3;
	shl.b64 	%rd8, %rd1, 3;
	add.s64 	%rd9, %rd7, %rd8;
	st.global.f64 	[%rd9], %fd82;

BB82_5:
	ret;
}

	// .globl	matrix_asin_f
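	// matrix_asin_f: single-precision arcsine. For |x| > ~0.57 the
	// argument is folded to sqrt((1 - |x|)/2); a single polynomial in
	// the squared (possibly folded) argument serves both cases, with
	// the folded branch finishing as pi/2 - 2*p. The sign bit is ORed
	// back on unless the intermediate compares as NaN.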
.visible .entry matrix_asin_f(
	.param .u64 matrix_asin_f_param_0,
	.param .u64 matrix_asin_f_param_1,
	.param .u32 matrix_asin_f_param_2
)
{
	.reg .pred 	%p<4>;
	.reg .f32 	%f<26>;
	.reg .b32 	%r<10>;
	.reg .b64 	%rd<8>;


	ld.param.u64 	%rd1, [matrix_asin_f_param_0];
	ld.param.u64 	%rd2, [matrix_asin_f_param_1];
	ld.param.u32 	%r2, [matrix_asin_f_param_2];
	mov.u32 	%r3, %ctaid.x;
	mov.u32 	%r4, %ntid.x;
	mov.u32 	%r5, %tid.x;
	mad.lo.s32 	%r1, %r4, %r3, %r5;
	setp.ge.u32	%p1, %r1, %r2;
	@%p1 bra 	BB83_2;

	cvta.to.global.u64 	%rd3, %rd1;
	mul.wide.s32 	%rd4, %r1, 4;
	add.s64 	%rd5, %rd3, %rd4;
	ld.global.f32 	%f1, [%rd5];
	abs.f32 	%f2, %f1;
	mov.f32 	%f3, 0f3F800000;
	sub.f32 	%f4, %f3, %f2;
	mul.f32 	%f5, %f4, 0f3F000000;
	sqrt.rn.f32 	%f6, %f5;
	setp.gt.f32	%p2, %f2, 0f3F11EB85;
	selp.f32	%f7, %f6, %f2, %p2;
	mul.f32 	%f8, %f7, %f7;
	mov.f32 	%f9, 0f3C94D2E9;
	mov.f32 	%f10, 0f3D53F941;
	fma.rn.f32 	%f11, %f10, %f8, %f9;
	mov.f32 	%f12, 0f3D3F841F;
	fma.rn.f32 	%f13, %f11, %f8, %f12;
	mov.f32 	%f14, 0f3D994929;
	fma.rn.f32 	%f15, %f13, %f8, %f14;
	mov.f32 	%f16, 0f3E2AAB94;
	fma.rn.f32 	%f17, %f15, %f8, %f16;
	mul.f32 	%f18, %f8, %f17;
	fma.rn.f32 	%f19, %f18, %f7, %f7;
	mov.f32 	%f20, 0f3FC90FDB;
	mov.f32 	%f21, 0fC0000000;
	fma.rn.f32 	%f22, %f21, %f19, %f20;
	selp.f32	%f23, %f22, %f19, %p2;
	setp.le.f32	%p3, %f23, 0f7F800000;
	mov.b32 	 %r6, %f23;
	mov.b32 	 %r7, %f1;
	and.b32  	%r8, %r7, -2147483648;
	or.b32  	%r9, %r6, %r8;
	mov.b32 	 %f24, %r9;
	selp.f32	%f25, %f24, %f23, %p3;
	cvta.to.global.u64 	%rd6, %rd2;
	add.s64 	%rd7, %rd6, %rd4;
	st.global.f32 	[%rd7], %f25;

BB83_2:
	ret;
}

	// .globl	matrix_acos_d
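	// matrix_acos_d: elementwise arccosine on doubles. Small |x|
	// (high word below 0x3FE26666, ~0.575) reuses the asin polynomial
	// and returns pi/2 -/+ asin(|x|) by input sign (BB84_9..BB84_12);
	// larger |x| works from sqrt((1 - |x|)/2) with a correction series
	// (BB84_2), negative inputs finishing as pi - acos(|x|). The
	// |x| >= 1 edge cases resolve through multiplies by 0 and Inf
	// (BB84_4..BB84_7), yielding 0, pi, or NaN.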
.visible .entry matrix_acos_d(
	.param .u64 matrix_acos_d_param_0,
	.param .u64 matrix_acos_d_param_1,
	.param .u32 matrix_acos_d_param_2
)
{
	.reg .pred 	%p<7>;
	.reg .b32 	%r<17>;
	.reg .f64 	%fd<95>;
	.reg .b64 	%rd<10>;


	ld.param.u64 	%rd2, [matrix_acos_d_param_0];
	ld.param.u64 	%rd3, [matrix_acos_d_param_1];
	ld.param.u32 	%r4, [matrix_acos_d_param_2];
	mov.u32 	%r5, %ctaid.x;
	mov.u32 	%r6, %ntid.x;
	mov.u32 	%r7, %tid.x;
	mad.lo.s32 	%r1, %r6, %r5, %r7;
	setp.ge.u32	%p1, %r1, %r4;
	@%p1 bra 	BB84_14;

	cvta.to.global.u64 	%rd4, %rd2;
	cvt.s64.s32	%rd1, %r1;
	mul.wide.s32 	%rd5, %r1, 8;
	add.s64 	%rd6, %rd4, %rd5;
	ld.global.f64 	%fd16, [%rd6];
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r2}, %fd16;
	}
	abs.f64 	%fd1, %fd16;
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r8}, %fd1;
	}
	setp.lt.s32	%p2, %r8, 1071801958;
	@%p2 bra 	BB84_9;
	bra.uni 	BB84_2;

BB84_9:
	mul.f64 	%fd62, %fd1, %fd1;
	mov.f64 	%fd63, 0dBFB3823B180754AF;
	mov.f64 	%fd64, 0d3FB0066BDC1895E9;
	fma.rn.f64 	%fd65, %fd64, %fd62, %fd63;
	mov.f64 	%fd66, 0d3FB11E52CC2F79AE;
	fma.rn.f64 	%fd67, %fd65, %fd62, %fd66;
	mov.f64 	%fd68, 0dBF924EAF3526861B;
	fma.rn.f64 	%fd69, %fd67, %fd62, %fd68;
	mov.f64 	%fd70, 0d3F91DF02A31E6CB7;
	fma.rn.f64 	%fd71, %fd69, %fd62, %fd70;
	mov.f64 	%fd72, 0d3F847D18B0EEC6CC;
	fma.rn.f64 	%fd73, %fd71, %fd62, %fd72;
	mov.f64 	%fd74, 0d3F8D0AF961BA53B0;
	fma.rn.f64 	%fd75, %fd73, %fd62, %fd74;
	mov.f64 	%fd76, 0d3F91BF7734CF1C48;
	fma.rn.f64 	%fd77, %fd75, %fd62, %fd76;
	mov.f64 	%fd78, 0d3F96E91483144EF7;
	fma.rn.f64 	%fd79, %fd77, %fd62, %fd78;
	mov.f64 	%fd80, 0d3F9F1C6E0A4F9F81;
	fma.rn.f64 	%fd81, %fd79, %fd62, %fd80;
	mov.f64 	%fd82, 0d3FA6DB6DC27FA92B;
	fma.rn.f64 	%fd83, %fd81, %fd62, %fd82;
	mov.f64 	%fd84, 0d3FB333333320F91B;
	fma.rn.f64 	%fd85, %fd83, %fd62, %fd84;
	mov.f64 	%fd86, 0d3FC5555555555F4D;
	fma.rn.f64 	%fd87, %fd85, %fd62, %fd86;
	mul.f64 	%fd88, %fd62, %fd87;
	fma.rn.f64 	%fd10, %fd88, %fd1, %fd1;
	setp.lt.s32	%p6, %r2, 0;
	@%p6 bra 	BB84_11;

	mov.f64 	%fd89, 0dBC91A62633145C07;
	add.rn.f64 	%fd90, %fd10, %fd89;
	neg.f64 	%fd93, %fd90;
	bra.uni 	BB84_12;

BB84_2:
	mov.f64 	%fd19, 0d3FF0000000000000;
	sub.f64 	%fd2, %fd19, %fd1;
	{
	.reg .b32 %temp; 
	mov.b64 	{%r9, %temp}, %fd2;
	}
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r3}, %fd2;
	}
	add.s32 	%r10, %r3, -1048576;
	mov.b64 	%fd18, {%r9, %r10};
	// inline asm
	rsqrt.approx.ftz.f64 %fd17, %fd18;
	// inline asm
	{
	.reg .b32 %temp; 
	mov.b64 	{%r11, %temp}, %fd17;
	}
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r12}, %fd17;
	}
	add.s32 	%r13, %r12, -1048576;
	mov.b64 	%fd20, {%r11, %r13};
	mul.f64 	%fd21, %fd18, %fd17;
	neg.f64 	%fd22, %fd21;
	fma.rn.f64 	%fd23, %fd21, %fd22, %fd18;
	fma.rn.f64 	%fd24, %fd23, %fd20, %fd21;
	neg.f64 	%fd25, %fd24;
	fma.rn.f64 	%fd26, %fd17, %fd25, %fd19;
	fma.rn.f64 	%fd27, %fd26, %fd20, %fd20;
	fma.rn.f64 	%fd28, %fd24, %fd25, %fd18;
	fma.rn.f64 	%fd3, %fd28, %fd27, %fd24;
	setp.lt.s32	%p3, %r3, 1;
	@%p3 bra 	BB84_4;

	{
	.reg .b32 %temp; 
	mov.b64 	{%r14, %temp}, %fd3;
	}
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r15}, %fd3;
	}
	add.s32 	%r16, %r15, 1048576;
	mov.b64 	%fd29, {%r14, %r16};
	mov.f64 	%fd30, 0dBEBAC2FE66FAAC4B;
	mov.f64 	%fd31, 0d3EC715B371155F70;
	fma.rn.f64 	%fd32, %fd31, %fd2, %fd30;
	mov.f64 	%fd33, 0d3ED9A9B88EFCD9B8;
	fma.rn.f64 	%fd34, %fd32, %fd2, %fd33;
	mov.f64 	%fd35, 0d3EDD0F40A8A0C4C3;
	fma.rn.f64 	%fd36, %fd34, %fd2, %fd35;
	mov.f64 	%fd37, 0d3EF46D4CFA9E0E1F;
	fma.rn.f64 	%fd38, %fd36, %fd2, %fd37;
	mov.f64 	%fd39, 0d3F079C168D1E2422;
	fma.rn.f64 	%fd40, %fd38, %fd2, %fd39;
	mov.f64 	%fd41, 0d3F1C9A88C3BCA540;
	fma.rn.f64 	%fd42, %fd40, %fd2, %fd41;
	mov.f64 	%fd43, 0d3F31C4E64BD476DF;
	fma.rn.f64 	%fd44, %fd42, %fd2, %fd43;
	mov.f64 	%fd45, 0d3F46E8BA60009C8F;
	fma.rn.f64 	%fd46, %fd44, %fd2, %fd45;
	mov.f64 	%fd47, 0d3F5F1C71C62B05A2;
	fma.rn.f64 	%fd48, %fd46, %fd2, %fd47;
	mov.f64 	%fd49, 0d3F76DB6DB6DC9F2C;
	fma.rn.f64 	%fd50, %fd48, %fd2, %fd49;
	mov.f64 	%fd51, 0d3F9333333333329C;
	fma.rn.f64 	%fd52, %fd50, %fd2, %fd51;
	mov.f64 	%fd53, 0d3FB5555555555555;
	fma.rn.f64 	%fd54, %fd52, %fd2, %fd53;
	mul.f64 	%fd55, %fd2, %fd54;
	fma.rn.f64 	%fd94, %fd55, %fd29, %fd29;
	bra.uni 	BB84_5;

BB84_11:
	mov.f64 	%fd91, 0d3C91A62633145C07;
	add.rn.f64 	%fd93, %fd10, %fd91;

BB84_12:
	mov.f64 	%fd92, 0d3FF921FB54442D18;
	add.rn.f64 	%fd94, %fd92, %fd93;
	bra.uni 	BB84_13;

BB84_4:
	mov.f64 	%fd56, 0d0000000000000000;
	mul.rn.f64 	%fd94, %fd1, %fd56;

BB84_5:
	setp.gt.s32	%p4, %r3, -1;
	@%p4 bra 	BB84_7;

	mov.f64 	%fd57, 0d7FF0000000000000;
	mul.rn.f64 	%fd94, %fd94, %fd57;

BB84_7:
	setp.gt.s32	%p5, %r2, -1;
	@%p5 bra 	BB84_13;

	mov.f64 	%fd58, 0dBCA1A62633145C07;
	add.rn.f64 	%fd59, %fd94, %fd58;
	neg.f64 	%fd60, %fd59;
	mov.f64 	%fd61, 0d400921FB54442D18;
	add.rn.f64 	%fd94, %fd61, %fd60;

BB84_13:
	cvta.to.global.u64 	%rd7, %rd3;
	shl.b64 	%rd8, %rd1, 3;
	add.s64 	%rd9, %rd7, %rd8;
	st.global.f64 	[%rd9], %fd94;

BB84_14:
	ret;
}

	// .globl	matrix_acos_f
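	// matrix_acos_f: single-precision arccosine, same folding as
	// matrix_asin_f: p approximates asin of the (possibly folded)
	// argument, then acos(|x|) = pi/2 - p for small |x| or 2*p for
	// folded |x|, and acos(x) = pi - acos(|x|) when x < 0.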
.visible .entry matrix_acos_f(
	.param .u64 matrix_acos_f_param_0,
	.param .u64 matrix_acos_f_param_1,
	.param .u32 matrix_acos_f_param_2
)
{
	.reg .pred 	%p<4>;
	.reg .f32 	%f<27>;
	.reg .b32 	%r<6>;
	.reg .b64 	%rd<8>;


	ld.param.u64 	%rd1, [matrix_acos_f_param_0];
	ld.param.u64 	%rd2, [matrix_acos_f_param_1];
	ld.param.u32 	%r2, [matrix_acos_f_param_2];
	mov.u32 	%r3, %ctaid.x;
	mov.u32 	%r4, %ntid.x;
	mov.u32 	%r5, %tid.x;
	mad.lo.s32 	%r1, %r4, %r3, %r5;
	setp.ge.u32	%p1, %r1, %r2;
	@%p1 bra 	BB85_2;

	cvta.to.global.u64 	%rd3, %rd1;
	mul.wide.s32 	%rd4, %r1, 4;
	add.s64 	%rd5, %rd3, %rd4;
	ld.global.f32 	%f1, [%rd5];
	abs.f32 	%f2, %f1;
	mov.f32 	%f3, 0f3F800000;
	sub.f32 	%f4, %f3, %f2;
	mul.f32 	%f5, %f4, 0f3F000000;
	sqrt.rn.f32 	%f6, %f5;
	setp.gt.f32	%p2, %f2, 0f3F11EB85;
	selp.f32	%f7, %f6, %f2, %p2;
	mul.f32 	%f8, %f7, %f7;
	mov.f32 	%f9, 0f3C94D2E9;
	mov.f32 	%f10, 0f3D53F941;
	fma.rn.f32 	%f11, %f10, %f8, %f9;
	mov.f32 	%f12, 0f3D3F841F;
	fma.rn.f32 	%f13, %f11, %f8, %f12;
	mov.f32 	%f14, 0f3D994929;
	fma.rn.f32 	%f15, %f13, %f8, %f14;
	mov.f32 	%f16, 0f3E2AAB94;
	fma.rn.f32 	%f17, %f15, %f8, %f16;
	mul.f32 	%f18, %f8, %f17;
	fma.rn.f32 	%f19, %f18, %f7, %f7;
	add.f32 	%f20, %f19, %f19;
	mov.f32 	%f21, 0f3FC90FDB;
	sub.f32 	%f22, %f21, %f19;
	selp.f32	%f23, %f20, %f22, %p2;
	setp.lt.f32	%p3, %f1, 0f00000000;
	mov.f32 	%f24, 0f40490FDB;
	sub.f32 	%f25, %f24, %f23;
	selp.f32	%f26, %f25, %f23, %p3;
	cvta.to.global.u64 	%rd6, %rd2;
	add.s64 	%rd7, %rd6, %rd4;
	st.global.f32 	[%rd7], %f26;

BB85_2:
	ret;
}

	// .globl	matrix_atan_d
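	// matrix_atan_d: elementwise arctangent on doubles. |x| > 1 is
	// first inverted with rcp.approx plus refinement (Inf maps to 0,
	// so atan(Inf) lands on pi/2); a long odd polynomial in t^2 then
	// gives atan(t), with the inverted branch finishing as
	// pi/2 - atan(1/|x|). The input's sign bit is copied onto the
	// result before the store.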
.visible .entry matrix_atan_d(
	.param .u64 matrix_atan_d_param_0,
	.param .u64 matrix_atan_d_param_1,
	.param .u32 matrix_atan_d_param_2
)
{
	.reg .pred 	%p<5>;
	.reg .b32 	%r<11>;
	.reg .f64 	%fd<57>;
	.reg .b64 	%rd<10>;


	ld.param.u64 	%rd2, [matrix_atan_d_param_0];
	ld.param.u64 	%rd3, [matrix_atan_d_param_1];
	ld.param.u32 	%r2, [matrix_atan_d_param_2];
	mov.u32 	%r3, %ctaid.x;
	mov.u32 	%r4, %ntid.x;
	mov.u32 	%r5, %tid.x;
	mad.lo.s32 	%r1, %r4, %r3, %r5;
	setp.ge.u32	%p1, %r1, %r2;
	@%p1 bra 	BB86_4;

	cvta.to.global.u64 	%rd4, %rd2;
	cvt.s64.s32	%rd1, %r1;
	mul.wide.s32 	%rd5, %r1, 8;
	add.s64 	%rd6, %rd4, %rd5;
	ld.global.f64 	%fd1, [%rd6];
	abs.f64 	%fd2, %fd1;
	setp.leu.f64	%p2, %fd2, 0d3FF0000000000000;
	mov.f64 	%fd56, %fd2;
	@%p2 bra 	BB86_3;

	// inline asm
	rcp.approx.ftz.f64 %fd5,%fd2;
	// inline asm
	neg.f64 	%fd7, %fd2;
	mov.f64 	%fd8, 0d3FF0000000000000;
	fma.rn.f64 	%fd9, %fd7, %fd5, %fd8;
	fma.rn.f64 	%fd10, %fd9, %fd9, %fd9;
	fma.rn.f64 	%fd11, %fd10, %fd5, %fd5;
	setp.eq.f64	%p3, %fd2, 0d7FF0000000000000;
	selp.f64	%fd3, 0d0000000000000000, %fd11, %p3;
	mov.f64 	%fd56, %fd3;

BB86_3:
	mov.f64 	%fd4, %fd56;
	cvta.to.global.u64 	%rd7, %rd3;
	mul.f64 	%fd12, %fd4, %fd4;
	mov.f64 	%fd13, 0d3F2D3B63DBB65B49;
	mov.f64 	%fd14, 0dBEF53E1D2A25FF7E;
	fma.rn.f64 	%fd15, %fd14, %fd12, %fd13;
	mov.f64 	%fd16, 0dBF5312788DDE082E;
	fma.rn.f64 	%fd17, %fd15, %fd12, %fd16;
	mov.f64 	%fd18, 0d3F6F9690C8249315;
	fma.rn.f64 	%fd19, %fd17, %fd12, %fd18;
	mov.f64 	%fd20, 0dBF82CF5AABC7CF0D;
	fma.rn.f64 	%fd21, %fd19, %fd12, %fd20;
	mov.f64 	%fd22, 0d3F9162B0B2A3BFDE;
	fma.rn.f64 	%fd23, %fd21, %fd12, %fd22;
	mov.f64 	%fd24, 0dBF9A7256FEB6FC6B;
	fma.rn.f64 	%fd25, %fd23, %fd12, %fd24;
	mov.f64 	%fd26, 0d3FA171560CE4A489;
	fma.rn.f64 	%fd27, %fd25, %fd12, %fd26;
	mov.f64 	%fd28, 0dBFA4F44D841450E4;
	fma.rn.f64 	%fd29, %fd27, %fd12, %fd28;
	mov.f64 	%fd30, 0d3FA7EE3D3F36BB95;
	fma.rn.f64 	%fd31, %fd29, %fd12, %fd30;
	mov.f64 	%fd32, 0dBFAAD32AE04A9FD1;
	fma.rn.f64 	%fd33, %fd31, %fd12, %fd32;
	mov.f64 	%fd34, 0d3FAE17813D66954F;
	fma.rn.f64 	%fd35, %fd33, %fd12, %fd34;
	mov.f64 	%fd36, 0dBFB11089CA9A5BCD;
	fma.rn.f64 	%fd37, %fd35, %fd12, %fd36;
	mov.f64 	%fd38, 0d3FB3B12B2DB51738;
	fma.rn.f64 	%fd39, %fd37, %fd12, %fd38;
	mov.f64 	%fd40, 0dBFB745D022F8DC5C;
	fma.rn.f64 	%fd41, %fd39, %fd12, %fd40;
	mov.f64 	%fd42, 0d3FBC71C709DFE927;
	fma.rn.f64 	%fd43, %fd41, %fd12, %fd42;
	mov.f64 	%fd44, 0dBFC2492491FA1744;
	fma.rn.f64 	%fd45, %fd43, %fd12, %fd44;
	mov.f64 	%fd46, 0d3FC99999999840D2;
	fma.rn.f64 	%fd47, %fd45, %fd12, %fd46;
	mov.f64 	%fd48, 0dBFD555555555544C;
	fma.rn.f64 	%fd49, %fd47, %fd12, %fd48;
	mul.f64 	%fd50, %fd12, %fd49;
	fma.rn.f64 	%fd51, %fd50, %fd4, %fd4;
	mov.f64 	%fd52, 0d3FF921FB54442D18;
	sub.f64 	%fd53, %fd52, %fd51;
	setp.gt.f64	%p4, %fd2, 0d3FF0000000000000;
	selp.f64	%fd54, %fd53, %fd51, %p4;
	{
	.reg .b32 %temp; 
	mov.b64 	{%r6, %temp}, %fd54;
	}
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r7}, %fd54;
	}
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r8}, %fd1;
	}
	and.b32  	%r9, %r8, -2147483648;
	or.b32  	%r10, %r7, %r9;
	mov.b64 	%fd55, {%r6, %r10};
	shl.b64 	%rd8, %rd1, 3;
	add.s64 	%rd9, %rd7, %rd8;
	st.global.f64 	[%rd9], %fd55;

BB86_4:
	ret;
}

	// .globl	matrix_atan_f
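	// matrix_atan_f: single-precision arctangent. |x| > 1 is inverted
	// with rcp.rn; atan(t) comes from a rational fit of the form
	// t + t^3 * P(t^2)/Q(t^2), with the inverted branch finishing as
	// pi/2 - atan(1/|x|). The sign bit is ORed back on unless the
	// result compares as NaN.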
.visible .entry matrix_atan_f(
	.param .u64 matrix_atan_f_param_0,
	.param .u64 matrix_atan_f_param_1,
	.param .u32 matrix_atan_f_param_2
)
{
	.reg .pred 	%p<5>;
	.reg .f32 	%f<26>;
	.reg .b32 	%r<10>;
	.reg .b64 	%rd<10>;


	ld.param.u64 	%rd2, [matrix_atan_f_param_0];
	ld.param.u64 	%rd3, [matrix_atan_f_param_1];
	ld.param.u32 	%r2, [matrix_atan_f_param_2];
	mov.u32 	%r3, %ctaid.x;
	mov.u32 	%r4, %ntid.x;
	mov.u32 	%r5, %tid.x;
	mad.lo.s32 	%r1, %r4, %r3, %r5;
	setp.ge.u32	%p1, %r1, %r2;
	@%p1 bra 	BB87_4;

	cvta.to.global.u64 	%rd4, %rd2;
	cvt.s64.s32	%rd1, %r1;
	mul.wide.s32 	%rd5, %r1, 4;
	add.s64 	%rd6, %rd4, %rd5;
	ld.global.f32 	%f1, [%rd6];
	abs.f32 	%f2, %f1;
	setp.leu.f32	%p2, %f2, 0f3F800000;
	mov.f32 	%f25, %f2;
	@%p2 bra 	BB87_3;

	rcp.rn.f32 	%f3, %f2;
	mov.f32 	%f25, %f3;

BB87_3:
	mov.f32 	%f4, %f25;
	cvta.to.global.u64 	%rd7, %rd3;
	mul.rn.f32 	%f5, %f4, %f4;
	mov.f32 	%f6, 0fC0B59883;
	mov.f32 	%f7, 0fBF52C7EA;
	fma.rn.f32 	%f8, %f5, %f7, %f6;
	mov.f32 	%f9, 0fC0D21907;
	fma.rn.f32 	%f10, %f8, %f5, %f9;
	mul.f32 	%f11, %f5, %f10;
	mul.f32 	%f12, %f4, %f11;
	add.f32 	%f13, %f5, 0f41355DC0;
	mov.f32 	%f14, 0f41E6BD60;
	fma.rn.f32 	%f15, %f13, %f5, %f14;
	mov.f32 	%f16, 0f419D92C8;
	fma.rn.f32 	%f17, %f15, %f5, %f16;
	rcp.rn.f32 	%f18, %f17;
	fma.rn.f32 	%f19, %f12, %f18, %f4;
	mov.f32 	%f20, 0f3FC90FDB;
	sub.f32 	%f21, %f20, %f19;
	setp.gt.f32	%p3, %f2, 0f3F800000;
	selp.f32	%f22, %f21, %f19, %p3;
	mov.b32 	 %r6, %f22;
	mov.b32 	 %r7, %f1;
	and.b32  	%r8, %r7, -2147483648;
	or.b32  	%r9, %r6, %r8;
	mov.b32 	 %f23, %r9;
	setp.le.f32	%p4, %f2, 0f7F800000;
	selp.f32	%f24, %f23, %f22, %p4;
	shl.b64 	%rd8, %rd1, 2;
	add.s64 	%rd9, %rd7, %rd8;
	st.global.f32 	[%rd9], %f24;

BB87_4:
	ret;
}

	// .globl	matrix_sign_d
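	// matrix_sign_d: elementwise signum on doubles. An exact zero
	// stores zero bits; everything else stores 1.0 carrying the
	// input's sign bit, so NaN inputs come out as +/-1 (the equality
	// test is false for NaN).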
.visible .entry matrix_sign_d(
	.param .u64 matrix_sign_d_param_0,
	.param .u64 matrix_sign_d_param_1,
	.param .u32 matrix_sign_d_param_2
)
{
	.reg .pred 	%p<3>;
	.reg .b32 	%r<12>;
	.reg .f64 	%fd<4>;
	.reg .b64 	%rd<9>;


	ld.param.u64 	%rd2, [matrix_sign_d_param_0];
	ld.param.u64 	%rd3, [matrix_sign_d_param_1];
	ld.param.u32 	%r2, [matrix_sign_d_param_2];
	mov.u32 	%r3, %ctaid.x;
	mov.u32 	%r4, %ntid.x;
	mov.u32 	%r5, %tid.x;
	mad.lo.s32 	%r1, %r4, %r3, %r5;
	setp.ge.u32	%p1, %r1, %r2;
	@%p1 bra 	BB88_4;

	cvta.to.global.u64 	%rd4, %rd2;
	mul.wide.s32 	%rd5, %r1, 8;
	add.s64 	%rd6, %rd4, %rd5;
	ld.global.f64 	%fd1, [%rd6];
	setp.eq.f64	%p2, %fd1, 0d0000000000000000;
	cvta.to.global.u64 	%rd7, %rd3;
	add.s64 	%rd1, %rd7, %rd5;
	@%p2 bra 	BB88_3;
	bra.uni 	BB88_2;

BB88_3:
	mov.u64 	%rd8, 0;
	st.global.u64 	[%rd1], %rd8;
	bra.uni 	BB88_4;

BB88_2:
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r6}, %fd1;
	}
	and.b32  	%r7, %r6, -2147483648;
	mov.f64 	%fd2, 0d3FF0000000000000;
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r8}, %fd2;
	}
	and.b32  	%r9, %r8, 2147483647;
	or.b32  	%r10, %r9, %r7;
	{
	.reg .b32 %temp; 
	mov.b64 	{%r11, %temp}, %fd2;
	}
	mov.b64 	%fd3, {%r11, %r10};
	st.global.f64 	[%rd1], %fd3;

BB88_4:
	ret;
}

	// .globl	matrix_sign_f
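	// matrix_sign_f: single-precision signum, same scheme as
	// matrix_sign_d except the sign transfer is routed through a
	// float->double->float round trip; the result is still exactly
	// 0.0f or +/-1.0f.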
.visible .entry matrix_sign_f(
	.param .u64 matrix_sign_f_param_0,
	.param .u64 matrix_sign_f_param_1,
	.param .u32 matrix_sign_f_param_2
)
{
	.reg .pred 	%p<3>;
	.reg .f32 	%f<3>;
	.reg .b32 	%r<13>;
	.reg .f64 	%fd<4>;
	.reg .b64 	%rd<8>;


	ld.param.u64 	%rd2, [matrix_sign_f_param_0];
	ld.param.u64 	%rd3, [matrix_sign_f_param_1];
	ld.param.u32 	%r2, [matrix_sign_f_param_2];
	mov.u32 	%r3, %ctaid.x;
	mov.u32 	%r4, %ntid.x;
	mov.u32 	%r5, %tid.x;
	mad.lo.s32 	%r1, %r4, %r3, %r5;
	setp.ge.u32	%p1, %r1, %r2;
	@%p1 bra 	BB89_4;

	cvta.to.global.u64 	%rd4, %rd2;
	mul.wide.s32 	%rd5, %r1, 4;
	add.s64 	%rd6, %rd4, %rd5;
	ld.global.f32 	%f1, [%rd6];
	setp.eq.f32	%p2, %f1, 0f00000000;
	cvta.to.global.u64 	%rd7, %rd3;
	add.s64 	%rd1, %rd7, %rd5;
	@%p2 bra 	BB89_3;
	bra.uni 	BB89_2;

BB89_3:
	mov.u32 	%r12, 0;
	st.global.u32 	[%rd1], %r12;
	bra.uni 	BB89_4;

BB89_2:
	cvt.f64.f32	%fd1, %f1;
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r6}, %fd1;
	}
	and.b32  	%r7, %r6, -2147483648;
	mov.f64 	%fd2, 0d3FF0000000000000;
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r8}, %fd2;
	}
	and.b32  	%r9, %r8, 2147483647;
	or.b32  	%r10, %r9, %r7;
	{
	.reg .b32 %temp; 
	mov.b64 	{%r11, %temp}, %fd2;
	}
	mov.b64 	%fd3, {%r11, %r10};
	cvt.rn.f32.f64	%f2, %fd3;
	st.global.f32 	[%rd1], %f2;

BB89_4:
	ret;
}

	// .globl	matrix_sigmoid_d
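	// matrix_sigmoid_d: elementwise logistic function on doubles,
	// computed through tanh: sigmoid(x) = 0.5*tanh(x/2) + 0.5. The
	// body up to BB90_4 is the matrix_tanh_d evaluation applied to
	// x * 0.5; the final fma folds in the 0.5 scale and shift.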
.visible .entry matrix_sigmoid_d(
	.param .u64 matrix_sigmoid_d_param_0,
	.param .u64 matrix_sigmoid_d_param_1,
	.param .u32 matrix_sigmoid_d_param_2
)
{
	.reg .pred 	%p<4>;
	.reg .b32 	%r<17>;
	.reg .f64 	%fd<76>;
	.reg .b64 	%rd<10>;


	ld.param.u64 	%rd2, [matrix_sigmoid_d_param_0];
	ld.param.u64 	%rd3, [matrix_sigmoid_d_param_1];
	ld.param.u32 	%r4, [matrix_sigmoid_d_param_2];
	mov.u32 	%r5, %ctaid.x;
	mov.u32 	%r6, %ntid.x;
	mov.u32 	%r7, %tid.x;
	mad.lo.s32 	%r1, %r6, %r5, %r7;
	setp.ge.u32	%p1, %r1, %r4;
	@%p1 bra 	BB90_5;

	cvta.to.global.u64 	%rd4, %rd2;
	cvt.s64.s32	%rd1, %r1;
	mul.wide.s32 	%rd5, %r1, 8;
	add.s64 	%rd6, %rd4, %rd5;
	ld.global.f64 	%fd6, [%rd6];
	mul.f64 	%fd1, %fd6, 0d3FE0000000000000;
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r2}, %fd1;
	}
	and.b32  	%r3, %r2, 2147483647;
	{
	.reg .b32 %temp; 
	mov.b64 	{%r8, %temp}, %fd1;
	}
	mov.b64 	%fd2, {%r8, %r3};
	setp.ltu.f64	%p2, %fd2, 0d3FE1C7A398201CD6;
	@%p2 bra 	BB90_3;
	bra.uni 	BB90_2;

BB90_3:
	mul.f64 	%fd52, %fd1, %fd1;
	mov.f64 	%fd53, 0dBF2B9093D89F0E23;
	mov.f64 	%fd54, 0d3F0ABFFC9B5786C4;
	fma.rn.f64 	%fd55, %fd54, %fd52, %fd53;
	mov.f64 	%fd56, 0d3F42FA2744C30B61;
	fma.rn.f64 	%fd57, %fd55, %fd52, %fd56;
	mov.f64 	%fd58, 0dBF57CF3B9C1E491D;
	fma.rn.f64 	%fd59, %fd57, %fd52, %fd58;
	mov.f64 	%fd60, 0d3F6D6C61D450119A;
	fma.rn.f64 	%fd61, %fd59, %fd52, %fd60;
	mov.f64 	%fd62, 0dBF8226DDD44294F5;
	fma.rn.f64 	%fd63, %fd61, %fd52, %fd62;
	mov.f64 	%fd64, 0d3F9664F45C2B04A6;
	fma.rn.f64 	%fd65, %fd63, %fd52, %fd64;
	mov.f64 	%fd66, 0dBFABA1BA1AD70754;
	fma.rn.f64 	%fd67, %fd65, %fd52, %fd66;
	mov.f64 	%fd68, 0d3FC111111110295E;
	fma.rn.f64 	%fd69, %fd67, %fd52, %fd68;
	mov.f64 	%fd70, 0dBFD555555555549F;
	fma.rn.f64 	%fd71, %fd69, %fd52, %fd70;
	mul.f64 	%fd72, %fd52, %fd71;
	fma.rn.f64 	%fd75, %fd72, %fd1, %fd1;
	bra.uni 	BB90_4;

BB90_2:
	add.f64 	%fd9, %fd2, %fd2;
	mov.f64 	%fd10, 0d4338000000000000;
	mov.f64 	%fd11, 0d3FF71547652B82FE;
	fma.rn.f64 	%fd12, %fd9, %fd11, %fd10;
	{
	.reg .b32 %temp; 
	mov.b64 	{%r9, %temp}, %fd12;
	}
	mov.f64 	%fd13, 0dC338000000000000;
	add.rn.f64 	%fd14, %fd12, %fd13;
	mov.f64 	%fd15, 0dBFE62E42FEFA39EF;
	fma.rn.f64 	%fd16, %fd14, %fd15, %fd9;
	mov.f64 	%fd17, 0dBC7ABC9E3B39803F;
	fma.rn.f64 	%fd18, %fd14, %fd17, %fd16;
	mov.f64 	%fd19, 0d3E5AF86D8EBD13CD;
	mov.f64 	%fd20, 0d3E21F4076ACD15B6;
	fma.rn.f64 	%fd21, %fd20, %fd18, %fd19;
	mov.f64 	%fd22, 0d3E927E5092BA033D;
	fma.rn.f64 	%fd23, %fd21, %fd18, %fd22;
	mov.f64 	%fd24, 0d3EC71DDE6C5F9DA1;
	fma.rn.f64 	%fd25, %fd23, %fd18, %fd24;
	mov.f64 	%fd26, 0d3EFA01A018D034E6;
	fma.rn.f64 	%fd27, %fd25, %fd18, %fd26;
	mov.f64 	%fd28, 0d3F2A01A01B3B6940;
	fma.rn.f64 	%fd29, %fd27, %fd18, %fd28;
	mov.f64 	%fd30, 0d3F56C16C16C1B5DD;
	fma.rn.f64 	%fd31, %fd29, %fd18, %fd30;
	mov.f64 	%fd32, 0d3F8111111110F74D;
	fma.rn.f64 	%fd33, %fd31, %fd18, %fd32;
	mov.f64 	%fd34, 0d3FA555555555554D;
	fma.rn.f64 	%fd35, %fd33, %fd18, %fd34;
	mov.f64 	%fd36, 0d3FC5555555555557;
	fma.rn.f64 	%fd37, %fd35, %fd18, %fd36;
	mov.f64 	%fd38, 0d3FE0000000000000;
	fma.rn.f64 	%fd39, %fd37, %fd18, %fd38;
	mul.f64 	%fd40, %fd18, %fd39;
	fma.rn.f64 	%fd41, %fd40, %fd18, %fd18;
	shl.b32 	%r10, %r9, 20;
	add.s32 	%r11, %r10, 1072693248;
	mov.u32 	%r12, 0;
	mov.b64 	%fd42, {%r12, %r11};
	fma.rn.f64 	%fd43, %fd41, %fd42, %fd42;
	add.f64 	%fd8, %fd43, 0d3FF0000000000000;
	// inline asm
	rcp.approx.ftz.f64 %fd7,%fd8;
	// inline asm
	neg.f64 	%fd44, %fd8;
	mov.f64 	%fd45, 0d3FF0000000000000;
	fma.rn.f64 	%fd46, %fd44, %fd7, %fd45;
	fma.rn.f64 	%fd47, %fd46, %fd46, %fd46;
	fma.rn.f64 	%fd48, %fd47, %fd7, %fd7;
	neg.f64 	%fd49, %fd48;
	mov.f64 	%fd50, 0d4000000000000000;
	fma.rn.f64 	%fd51, %fd50, %fd49, %fd45;
	setp.gt.u32	%p3, %r3, 1077936127;
	selp.f64	%fd75, 0d3FF0000000000000, %fd51, %p3;

BB90_4:
	cvta.to.global.u64 	%rd7, %rd3;
	and.b32  	%r13, %r2, -2147483648;
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r14}, %fd75;
	}
	or.b32  	%r15, %r14, %r13;
	{
	.reg .b32 %temp; 
	mov.b64 	{%r16, %temp}, %fd75;
	}
	mov.b64 	%fd73, {%r16, %r15};
	fma.rn.f64 	%fd74, %fd73, 0d3FE0000000000000, 0d3FE0000000000000;
	shl.b64 	%rd8, %rd1, 3;
	add.s64 	%rd9, %rd7, %rd8;
	st.global.f64 	[%rd9], %fd74;

BB90_5:
	ret;
}

	// .globl	matrix_sigmoid_f
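	// matrix_sigmoid_f: single-precision logistic function. The input
	// is widened to double, pushed through the same
	// 0.5*tanh(x/2) + 0.5 evaluation as matrix_sigmoid_d, and rounded
	// back to float for the store.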
.visible .entry matrix_sigmoid_f(
	.param .u64 matrix_sigmoid_f_param_0,
	.param .u64 matrix_sigmoid_f_param_1,
	.param .u32 matrix_sigmoid_f_param_2
)
{
	.reg .pred 	%p<4>;
	.reg .f32 	%f<3>;
	.reg .b32 	%r<17>;
	.reg .f64 	%fd<76>;
	.reg .b64 	%rd<10>;


	ld.param.u64 	%rd2, [matrix_sigmoid_f_param_0];
	ld.param.u64 	%rd3, [matrix_sigmoid_f_param_1];
	ld.param.u32 	%r4, [matrix_sigmoid_f_param_2];
	mov.u32 	%r5, %ctaid.x;
	mov.u32 	%r6, %ntid.x;
	mov.u32 	%r7, %tid.x;
	mad.lo.s32 	%r1, %r6, %r5, %r7;
	setp.ge.u32	%p1, %r1, %r4;
	@%p1 bra 	BB91_5;

	cvta.to.global.u64 	%rd4, %rd2;
	cvt.s64.s32	%rd1, %r1;
	mul.wide.s32 	%rd5, %r1, 4;
	add.s64 	%rd6, %rd4, %rd5;
	ld.global.f32 	%f1, [%rd6];
	cvt.f64.f32	%fd6, %f1;
	mul.f64 	%fd1, %fd6, 0d3FE0000000000000;
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r2}, %fd1;
	}
	and.b32  	%r3, %r2, 2147483647;
	{
	.reg .b32 %temp; 
	mov.b64 	{%r8, %temp}, %fd1;
	}
	mov.b64 	%fd2, {%r8, %r3};
	setp.ltu.f64	%p2, %fd2, 0d3FE1C7A398201CD6;
	@%p2 bra 	BB91_3;
	bra.uni 	BB91_2;

BB91_3:
	mul.f64 	%fd52, %fd1, %fd1;
	mov.f64 	%fd53, 0dBF2B9093D89F0E23;
	mov.f64 	%fd54, 0d3F0ABFFC9B5786C4;
	fma.rn.f64 	%fd55, %fd54, %fd52, %fd53;
	mov.f64 	%fd56, 0d3F42FA2744C30B61;
	fma.rn.f64 	%fd57, %fd55, %fd52, %fd56;
	mov.f64 	%fd58, 0dBF57CF3B9C1E491D;
	fma.rn.f64 	%fd59, %fd57, %fd52, %fd58;
	mov.f64 	%fd60, 0d3F6D6C61D450119A;
	fma.rn.f64 	%fd61, %fd59, %fd52, %fd60;
	mov.f64 	%fd62, 0dBF8226DDD44294F5;
	fma.rn.f64 	%fd63, %fd61, %fd52, %fd62;
	mov.f64 	%fd64, 0d3F9664F45C2B04A6;
	fma.rn.f64 	%fd65, %fd63, %fd52, %fd64;
	mov.f64 	%fd66, 0dBFABA1BA1AD70754;
	fma.rn.f64 	%fd67, %fd65, %fd52, %fd66;
	mov.f64 	%fd68, 0d3FC111111110295E;
	fma.rn.f64 	%fd69, %fd67, %fd52, %fd68;
	mov.f64 	%fd70, 0dBFD555555555549F;
	fma.rn.f64 	%fd71, %fd69, %fd52, %fd70;
	mul.f64 	%fd72, %fd52, %fd71;
	fma.rn.f64 	%fd75, %fd72, %fd1, %fd1;
	bra.uni 	BB91_4;

BB91_2:
	add.f64 	%fd9, %fd2, %fd2;
	mov.f64 	%fd10, 0d4338000000000000;
	mov.f64 	%fd11, 0d3FF71547652B82FE;
	fma.rn.f64 	%fd12, %fd9, %fd11, %fd10;
	{
	.reg .b32 %temp; 
	mov.b64 	{%r9, %temp}, %fd12;
	}
	mov.f64 	%fd13, 0dC338000000000000;
	add.rn.f64 	%fd14, %fd12, %fd13;
	mov.f64 	%fd15, 0dBFE62E42FEFA39EF;
	fma.rn.f64 	%fd16, %fd14, %fd15, %fd9;
	mov.f64 	%fd17, 0dBC7ABC9E3B39803F;
	fma.rn.f64 	%fd18, %fd14, %fd17, %fd16;
	mov.f64 	%fd19, 0d3E5AF86D8EBD13CD;
	mov.f64 	%fd20, 0d3E21F4076ACD15B6;
	fma.rn.f64 	%fd21, %fd20, %fd18, %fd19;
	mov.f64 	%fd22, 0d3E927E5092BA033D;
	fma.rn.f64 	%fd23, %fd21, %fd18, %fd22;
	mov.f64 	%fd24, 0d3EC71DDE6C5F9DA1;
	fma.rn.f64 	%fd25, %fd23, %fd18, %fd24;
	mov.f64 	%fd26, 0d3EFA01A018D034E6;
	fma.rn.f64 	%fd27, %fd25, %fd18, %fd26;
	mov.f64 	%fd28, 0d3F2A01A01B3B6940;
	fma.rn.f64 	%fd29, %fd27, %fd18, %fd28;
	mov.f64 	%fd30, 0d3F56C16C16C1B5DD;
	fma.rn.f64 	%fd31, %fd29, %fd18, %fd30;
	mov.f64 	%fd32, 0d3F8111111110F74D;
	fma.rn.f64 	%fd33, %fd31, %fd18, %fd32;
	mov.f64 	%fd34, 0d3FA555555555554D;
	fma.rn.f64 	%fd35, %fd33, %fd18, %fd34;
	mov.f64 	%fd36, 0d3FC5555555555557;
	fma.rn.f64 	%fd37, %fd35, %fd18, %fd36;
	mov.f64 	%fd38, 0d3FE0000000000000;
	fma.rn.f64 	%fd39, %fd37, %fd18, %fd38;
	mul.f64 	%fd40, %fd18, %fd39;
	fma.rn.f64 	%fd41, %fd40, %fd18, %fd18;
	shl.b32 	%r10, %r9, 20;
	add.s32 	%r11, %r10, 1072693248;
	mov.u32 	%r12, 0;
	mov.b64 	%fd42, {%r12, %r11};
	fma.rn.f64 	%fd43, %fd41, %fd42, %fd42;
	add.f64 	%fd8, %fd43, 0d3FF0000000000000;
	// inline asm
	rcp.approx.ftz.f64 %fd7,%fd8;
	// inline asm
	neg.f64 	%fd44, %fd8;
	mov.f64 	%fd45, 0d3FF0000000000000;
	fma.rn.f64 	%fd46, %fd44, %fd7, %fd45;
	fma.rn.f64 	%fd47, %fd46, %fd46, %fd46;
	fma.rn.f64 	%fd48, %fd47, %fd7, %fd7;
	neg.f64 	%fd49, %fd48;
	mov.f64 	%fd50, 0d4000000000000000;
	fma.rn.f64 	%fd51, %fd50, %fd49, %fd45;
	setp.gt.u32	%p3, %r3, 1077936127;
	selp.f64	%fd75, 0d3FF0000000000000, %fd51, %p3;

BB91_4:
	cvta.to.global.u64 	%rd7, %rd3;
	and.b32  	%r13, %r2, -2147483648;
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r14}, %fd75;
	}
	or.b32  	%r15, %r14, %r13;
	{
	.reg .b32 %temp; 
	mov.b64 	{%r16, %temp}, %fd75;
	}
	mov.b64 	%fd73, {%r16, %r15};
	fma.rn.f64 	%fd74, %fd73, 0d3FE0000000000000, 0d3FE0000000000000;
	cvt.rn.f32.f64	%f2, %fd74;
	shl.b64 	%rd8, %rd1, 2;
	add.s64 	%rd9, %rd7, %rd8;
	st.global.f32 	[%rd9], %f2;

BB91_5:
	ret;
}

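	// __internal_trig_reduction_slowpathd: Payne-Hanek style argument
	// reduction for huge |x|. The 53-bit mantissa is multiplied
	// against the 2/pi bit table __cudart_i2opi_d in 64-bit limbs
	// staged through a 40-byte local window; the quadrant count goes
	// out through the pointer parameter, and the remaining fraction is
	// normalized, multiplied by the pi/2 mantissa (0xC90FDAA22168C235),
	// and repacked into a double with the original sign.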
.func  (.param .b64 func_retval0) __internal_trig_reduction_slowpathd(
	.param .b64 __internal_trig_reduction_slowpathd_param_0,
	.param .b64 __internal_trig_reduction_slowpathd_param_1
)
{
	.local .align 8 .b8 	__local_depot92[40];
	.reg .b64 	%SP;
	.reg .b64 	%SPL;
	.reg .pred 	%p<9>;
	.reg .b32 	%r<42>;
	.reg .f64 	%fd<5>;
	.reg .b64 	%rd<101>;


	mov.u64 	%rd100, __local_depot92;
	cvta.local.u64 	%SP, %rd100;
	ld.param.f64 	%fd4, [__internal_trig_reduction_slowpathd_param_0];
	ld.param.u64 	%rd37, [__internal_trig_reduction_slowpathd_param_1];
	add.u64 	%rd38, %SP, 0;
	cvta.to.local.u64 	%rd1, %rd38;
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r1}, %fd4;
	}
	and.b32  	%r40, %r1, -2147483648;
	shr.u32 	%r3, %r1, 20;
	bfe.u32 	%r4, %r1, 20, 11;
	setp.eq.s32	%p1, %r4, 2047;
	@%p1 bra 	BB92_13;

	add.s32 	%r16, %r4, -1024;
	shr.u32 	%r17, %r16, 6;
	mov.u32 	%r18, 16;
	sub.s32 	%r5, %r18, %r17;
	mov.u32 	%r19, 19;
	sub.s32 	%r20, %r19, %r17;
	mov.u32 	%r21, 18;
	min.s32 	%r6, %r21, %r20;
	setp.gt.s32	%p2, %r5, %r6;
	mov.u64 	%rd94, 0;
	mov.u64 	%rd93, %rd1;
	@%p2 bra 	BB92_4;

	mov.b64 	 %rd41, %fd4;
	shl.b64 	%rd42, %rd41, 11;
	or.b64  	%rd3, %rd42, -9223372036854775808;
	add.s32 	%r7, %r5, -1;
	mov.u64 	%rd92, %rd1;
	bfe.u32 	%r22, %r1, 20, 11;
	add.s32 	%r23, %r22, -1024;
	shr.u32 	%r24, %r23, 6;
	neg.s32 	%r25, %r24;
	mul.wide.s32 	%rd43, %r25, 8;
	mov.u64 	%rd44, __cudart_i2opi_d;
	add.s64 	%rd45, %rd43, %rd44;
	add.s64 	%rd90, %rd45, 120;
	mov.u64 	%rd94, 0;
	mov.u64 	%rd91, %rd1;
	mov.u32 	%r39, %r7;

BB92_3:
	.pragma "nounroll";
	mov.u32 	%r8, %r39;
	mov.u64 	%rd7, %rd91;
	ld.const.u64 	%rd48, [%rd90];
	// inline asm
	{
	.reg .u32 r0, r1, r2, r3, alo, ahi, blo, bhi, clo, chi;
	mov.b64         {alo,ahi}, %rd48;    
	mov.b64         {blo,bhi}, %rd3;    
	mov.b64         {clo,chi}, %rd94;    
	mad.lo.cc.u32   r0, alo, blo, clo;
	madc.hi.cc.u32  r1, alo, blo, chi;
	madc.hi.u32     r2, alo, bhi,   0;
	mad.lo.cc.u32   r1, alo, bhi,  r1;
	madc.hi.cc.u32  r2, ahi, blo,  r2;
	madc.hi.u32     r3, ahi, bhi,   0;
	mad.lo.cc.u32   r1, ahi, blo,  r1;
	madc.lo.cc.u32  r2, ahi, bhi,  r2;
	addc.u32        r3,  r3,   0;     
	mov.b64         %rd46, {r0,r1};      
	mov.b64         %rd94, {r2,r3};      
	}
	// inline asm
	st.local.u64 	[%rd92], %rd46;
	add.s32 	%r9, %r8, 1;
	sub.s32 	%r26, %r9, %r7;
	mul.wide.s32 	%rd51, %r26, 8;
	add.s64 	%rd92, %rd1, %rd51;
	add.s64 	%rd13, %rd7, 8;
	mov.u64 	%rd93, %rd13;
	add.s64 	%rd90, %rd90, 8;
	setp.lt.s32	%p3, %r9, %r6;
	mov.u64 	%rd91, %rd13;
	mov.u32 	%r39, %r9;
	@%p3 bra 	BB92_3;

BB92_4:
	st.local.u64 	[%rd93], %rd94;
	ld.local.u64 	%rd95, [%rd1+16];
	ld.local.u64 	%rd96, [%rd1+24];
	and.b32  	%r10, %r3, 63;
	setp.eq.s32	%p4, %r10, 0;
	@%p4 bra 	BB92_6;

	mov.u32 	%r27, 64;
	sub.s32 	%r28, %r27, %r10;
	shl.b64 	%rd52, %rd96, %r10;
	shr.u64 	%rd53, %rd95, %r28;
	or.b64  	%rd96, %rd52, %rd53;
	shl.b64 	%rd54, %rd95, %r10;
	ld.local.u64 	%rd55, [%rd1+8];
	shr.u64 	%rd56, %rd55, %r28;
	or.b64  	%rd95, %rd56, %rd54;

BB92_6:
	cvta.to.local.u64 	%rd57, %rd37;
	shr.u64 	%rd58, %rd96, 62;
	cvt.u32.u64	%r29, %rd58;
	shr.u64 	%rd59, %rd95, 62;
	shl.b64 	%rd60, %rd96, 2;
	or.b64  	%rd98, %rd60, %rd59;
	shl.b64 	%rd97, %rd95, 2;
	shr.u64 	%rd61, %rd96, 61;
	cvt.u32.u64	%r30, %rd61;
	and.b32  	%r31, %r30, 1;
	add.s32 	%r32, %r31, %r29;
	neg.s32 	%r33, %r32;
	setp.ne.s32	%p5, %r40, 0;
	selp.b32	%r34, %r33, %r32, %p5;
	st.local.u32 	[%rd57], %r34;
	setp.eq.s32	%p6, %r31, 0;
	@%p6 bra 	BB92_8;

	mov.u64 	%rd65, 0;
	// inline asm
	{
	.reg .u32 r0, r1, r2, r3, a0, a1, a2, a3, b0, b1, b2, b3;
	mov.b64         {a0,a1}, %rd65;
	mov.b64         {a2,a3}, %rd65;
	mov.b64         {b0,b1}, %rd97;
	mov.b64         {b2,b3}, %rd98;
	sub.cc.u32      r0, a0, b0; 
	subc.cc.u32     r1, a1, b1; 
	subc.cc.u32     r2, a2, b2; 
	subc.u32        r3, a3, b3; 
	mov.b64         %rd97, {r0,r1};
	mov.b64         %rd98, {r2,r3};
	}
	// inline asm
	xor.b32  	%r40, %r40, -2147483648;

BB92_8:
	clz.b64 	%r41, %rd98;
	setp.eq.s32	%p7, %r41, 0;
	@%p7 bra 	BB92_10;

	shl.b64 	%rd68, %rd98, %r41;
	mov.u32 	%r35, 64;
	sub.s32 	%r36, %r35, %r41;
	shr.u64 	%rd69, %rd97, %r36;
	or.b64  	%rd98, %rd69, %rd68;

BB92_10:
	mov.u64 	%rd73, -3958705157555305931;
	// inline asm
	{
	.reg .u32 r0, r1, r2, r3, alo, ahi, blo, bhi;
	mov.b64         {alo,ahi}, %rd98;   
	mov.b64         {blo,bhi}, %rd73;   
	mul.lo.u32      r0, alo, blo;    
	mul.hi.u32      r1, alo, blo;    
	mad.lo.cc.u32   r1, alo, bhi, r1;
	madc.hi.u32     r2, alo, bhi,  0;
	mad.lo.cc.u32   r1, ahi, blo, r1;
	madc.hi.cc.u32  r2, ahi, blo, r2;
	madc.hi.u32     r3, ahi, bhi,  0;
	mad.lo.cc.u32   r2, ahi, bhi, r2;
	addc.u32        r3, r3,  0;      
	mov.b64         %rd70, {r0,r1};     
	mov.b64         %rd99, {r2,r3};     
	}
	// inline asm
	setp.lt.s64	%p8, %rd99, 1;
	@%p8 bra 	BB92_12;

	// inline asm
	{
	.reg .u32 r0, r1, r2, r3, a0, a1, a2, a3, b0, b1, b2, b3;
	mov.b64         {a0,a1}, %rd70;
	mov.b64         {a2,a3}, %rd99;
	mov.b64         {b0,b1}, %rd70;
	mov.b64         {b2,b3}, %rd99;
	add.cc.u32      r0, a0, b0; 
	addc.cc.u32     r1, a1, b1; 
	addc.cc.u32     r2, a2, b2; 
	addc.u32        r3, a3, b3; 
	mov.b64         %rd74, {r0,r1};
	mov.b64         %rd99, {r2,r3};
	}
	// inline asm
	add.s32 	%r41, %r41, 1;

BB92_12:
	cvt.u64.u32	%rd80, %r40;
	shl.b64 	%rd81, %rd80, 32;
	mov.u32 	%r37, 1022;
	sub.s32 	%r38, %r37, %r41;
	cvt.u64.u32	%rd82, %r38;
	shl.b64 	%rd83, %rd82, 52;
	add.s64 	%rd84, %rd99, 1;
	shr.u64 	%rd85, %rd84, 10;
	add.s64 	%rd86, %rd85, 1;
	shr.u64 	%rd87, %rd86, 1;
	add.s64 	%rd88, %rd87, %rd83;
	or.b64  	%rd89, %rd88, %rd81;
	mov.b64 	 %fd4, %rd89;

BB92_13:
	st.param.f64	[func_retval0+0], %fd4;
	ret;
}

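	// __internal_accurate_pow: pow(a, b) as exp(b * log(a)) carried in
	// double-double precision. Denormal inputs are prescaled by 2^54
	// before the exponent/mantissa split; log(a) around 1 uses a
	// series with an error-compensated product, and the exp step keeps
	// a correction term (fd5) that BB93_8 folds back in. Results near
	// overflow/underflow (|b*log a| >= ~708.4) are rebuilt by
	// splitting the scale exponent across two factors.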
.func  (.param .b64 func_retval0) __internal_accurate_pow(
	.param .b64 __internal_accurate_pow_param_0,
	.param .b64 __internal_accurate_pow_param_1
)
{
	.reg .pred 	%p<10>;
	.reg .f32 	%f<3>;
	.reg .b32 	%r<52>;
	.reg .f64 	%fd<134>;


	ld.param.f64 	%fd12, [__internal_accurate_pow_param_0];
	ld.param.f64 	%fd13, [__internal_accurate_pow_param_1];
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r49}, %fd12;
	}
	{
	.reg .b32 %temp; 
	mov.b64 	{%r48, %temp}, %fd12;
	}
	shr.u32 	%r50, %r49, 20;
	setp.ne.s32	%p1, %r50, 0;
	@%p1 bra 	BB93_2;

	mul.f64 	%fd14, %fd12, 0d4350000000000000;
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r49}, %fd14;
	}
	{
	.reg .b32 %temp; 
	mov.b64 	{%r48, %temp}, %fd14;
	}
	shr.u32 	%r16, %r49, 20;
	add.s32 	%r50, %r16, -54;

BB93_2:
	add.s32 	%r51, %r50, -1023;
	and.b32  	%r17, %r49, -2146435073;
	or.b32  	%r18, %r17, 1072693248;
	mov.b64 	%fd132, {%r48, %r18};
	setp.lt.u32	%p2, %r18, 1073127583;
	@%p2 bra 	BB93_4;

	{
	.reg .b32 %temp; 
	mov.b64 	{%r19, %temp}, %fd132;
	}
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r20}, %fd132;
	}
	add.s32 	%r21, %r20, -1048576;
	mov.b64 	%fd132, {%r19, %r21};
	add.s32 	%r51, %r50, -1022;

BB93_4:
	add.f64 	%fd16, %fd132, 0d3FF0000000000000;
	// inline asm
	rcp.approx.ftz.f64 %fd15,%fd16;
	// inline asm
	neg.f64 	%fd17, %fd16;
	mov.f64 	%fd18, 0d3FF0000000000000;
	fma.rn.f64 	%fd19, %fd17, %fd15, %fd18;
	fma.rn.f64 	%fd20, %fd19, %fd19, %fd19;
	fma.rn.f64 	%fd21, %fd20, %fd15, %fd15;
	add.f64 	%fd22, %fd132, 0dBFF0000000000000;
	mul.f64 	%fd23, %fd22, %fd21;
	fma.rn.f64 	%fd24, %fd22, %fd21, %fd23;
	mul.f64 	%fd25, %fd24, %fd24;
	mov.f64 	%fd26, 0d3ED0F5D241AD3B5A;
	mov.f64 	%fd27, 0d3EB0F5FF7D2CAFE2;
	fma.rn.f64 	%fd28, %fd27, %fd25, %fd26;
	mov.f64 	%fd29, 0d3EF3B20A75488A3F;
	fma.rn.f64 	%fd30, %fd28, %fd25, %fd29;
	mov.f64 	%fd31, 0d3F1745CDE4FAECD5;
	fma.rn.f64 	%fd32, %fd30, %fd25, %fd31;
	mov.f64 	%fd33, 0d3F3C71C7258A578B;
	fma.rn.f64 	%fd34, %fd32, %fd25, %fd33;
	mov.f64 	%fd35, 0d3F6249249242B910;
	fma.rn.f64 	%fd36, %fd34, %fd25, %fd35;
	mov.f64 	%fd37, 0d3F89999999999DFB;
	fma.rn.f64 	%fd38, %fd36, %fd25, %fd37;
	sub.f64 	%fd39, %fd22, %fd24;
	add.f64 	%fd40, %fd39, %fd39;
	neg.f64 	%fd41, %fd24;
	fma.rn.f64 	%fd42, %fd41, %fd22, %fd40;
	mul.f64 	%fd43, %fd21, %fd42;
	fma.rn.f64 	%fd44, %fd25, %fd38, 0d3FB5555555555555;
	mov.f64 	%fd45, 0d3FB5555555555555;
	sub.f64 	%fd46, %fd45, %fd44;
	fma.rn.f64 	%fd47, %fd25, %fd38, %fd46;
	add.f64 	%fd48, %fd47, 0d0000000000000000;
	add.f64 	%fd49, %fd48, 0dBC46A4CB00B9E7B0;
	add.f64 	%fd50, %fd44, %fd49;
	sub.f64 	%fd51, %fd44, %fd50;
	add.f64 	%fd52, %fd49, %fd51;
	mul.rn.f64 	%fd53, %fd24, %fd24;
	neg.f64 	%fd54, %fd53;
	fma.rn.f64 	%fd55, %fd24, %fd24, %fd54;
	{
	.reg .b32 %temp; 
	mov.b64 	{%r22, %temp}, %fd43;
	}
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r23}, %fd43;
	}
	add.s32 	%r24, %r23, 1048576;
	mov.b64 	%fd56, {%r22, %r24};
	fma.rn.f64 	%fd57, %fd24, %fd56, %fd55;
	mul.rn.f64 	%fd58, %fd53, %fd24;
	neg.f64 	%fd59, %fd58;
	fma.rn.f64 	%fd60, %fd53, %fd24, %fd59;
	fma.rn.f64 	%fd61, %fd53, %fd43, %fd60;
	fma.rn.f64 	%fd62, %fd57, %fd24, %fd61;
	mul.rn.f64 	%fd63, %fd50, %fd58;
	neg.f64 	%fd64, %fd63;
	fma.rn.f64 	%fd65, %fd50, %fd58, %fd64;
	fma.rn.f64 	%fd66, %fd50, %fd62, %fd65;
	fma.rn.f64 	%fd67, %fd52, %fd58, %fd66;
	add.f64 	%fd68, %fd63, %fd67;
	sub.f64 	%fd69, %fd63, %fd68;
	add.f64 	%fd70, %fd67, %fd69;
	add.f64 	%fd71, %fd24, %fd68;
	sub.f64 	%fd72, %fd24, %fd71;
	add.f64 	%fd73, %fd68, %fd72;
	add.f64 	%fd74, %fd70, %fd73;
	add.f64 	%fd75, %fd43, %fd74;
	add.f64 	%fd76, %fd71, %fd75;
	sub.f64 	%fd77, %fd71, %fd76;
	add.f64 	%fd78, %fd75, %fd77;
	xor.b32  	%r25, %r51, -2147483648;
	mov.u32 	%r26, 1127219200;
	mov.b64 	%fd79, {%r25, %r26};
	mov.u32 	%r27, -2147483648;
	mov.b64 	%fd80, {%r27, %r26};
	sub.f64 	%fd81, %fd79, %fd80;
	mov.f64 	%fd82, 0d3FE62E42FEFA39EF;
	fma.rn.f64 	%fd83, %fd81, %fd82, %fd76;
	neg.f64 	%fd84, %fd81;
	fma.rn.f64 	%fd85, %fd84, %fd82, %fd83;
	sub.f64 	%fd86, %fd85, %fd76;
	sub.f64 	%fd87, %fd78, %fd86;
	mov.f64 	%fd88, 0d3C7ABC9E3B39803F;
	fma.rn.f64 	%fd89, %fd81, %fd88, %fd87;
	add.f64 	%fd90, %fd83, %fd89;
	sub.f64 	%fd91, %fd83, %fd90;
	add.f64 	%fd92, %fd89, %fd91;
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r28}, %fd13;
	}
	add.s32 	%r29, %r28, %r28;
	setp.gt.u32	%p3, %r29, -33554433;
	and.b32  	%r30, %r28, -15728641;
	selp.b32	%r31, %r30, %r28, %p3;
	{
	.reg .b32 %temp; 
	mov.b64 	{%r32, %temp}, %fd13;
	}
	mov.b64 	%fd93, {%r32, %r31};
	mul.rn.f64 	%fd94, %fd90, %fd93;
	neg.f64 	%fd95, %fd94;
	fma.rn.f64 	%fd96, %fd90, %fd93, %fd95;
	fma.rn.f64 	%fd97, %fd92, %fd93, %fd96;
	add.f64 	%fd4, %fd94, %fd97;
	sub.f64 	%fd98, %fd94, %fd4;
	add.f64 	%fd5, %fd97, %fd98;
	mov.f64 	%fd99, 0d4338000000000000;
	mov.f64 	%fd100, 0d3FF71547652B82FE;
	fma.rn.f64 	%fd101, %fd4, %fd100, %fd99;
	{
	.reg .b32 %temp; 
	mov.b64 	{%r13, %temp}, %fd101;
	}
	mov.f64 	%fd102, 0dC338000000000000;
	add.rn.f64 	%fd103, %fd101, %fd102;
	mov.f64 	%fd104, 0dBFE62E42FEFA39EF;
	fma.rn.f64 	%fd105, %fd103, %fd104, %fd4;
	mov.f64 	%fd106, 0dBC7ABC9E3B39803F;
	fma.rn.f64 	%fd107, %fd103, %fd106, %fd105;
	mov.f64 	%fd108, 0d3E928AF3FCA213EA;
	mov.f64 	%fd109, 0d3E5ADE1569CE2BDF;
	fma.rn.f64 	%fd110, %fd109, %fd107, %fd108;
	mov.f64 	%fd111, 0d3EC71DEE62401315;
	fma.rn.f64 	%fd112, %fd110, %fd107, %fd111;
	mov.f64 	%fd113, 0d3EFA01997C89EB71;
	fma.rn.f64 	%fd114, %fd112, %fd107, %fd113;
	mov.f64 	%fd115, 0d3F2A01A014761F65;
	fma.rn.f64 	%fd116, %fd114, %fd107, %fd115;
	mov.f64 	%fd117, 0d3F56C16C1852B7AF;
	fma.rn.f64 	%fd118, %fd116, %fd107, %fd117;
	mov.f64 	%fd119, 0d3F81111111122322;
	fma.rn.f64 	%fd120, %fd118, %fd107, %fd119;
	mov.f64 	%fd121, 0d3FA55555555502A1;
	fma.rn.f64 	%fd122, %fd120, %fd107, %fd121;
	mov.f64 	%fd123, 0d3FC5555555555511;
	fma.rn.f64 	%fd124, %fd122, %fd107, %fd123;
	mov.f64 	%fd125, 0d3FE000000000000B;
	fma.rn.f64 	%fd126, %fd124, %fd107, %fd125;
	fma.rn.f64 	%fd127, %fd126, %fd107, %fd18;
	fma.rn.f64 	%fd128, %fd127, %fd107, %fd18;
	{
	.reg .b32 %temp; 
	mov.b64 	{%r14, %temp}, %fd128;
	}
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r15}, %fd128;
	}
	shl.b32 	%r33, %r13, 20;
	add.s32 	%r34, %r15, %r33;
	mov.b64 	%fd133, {%r14, %r34};
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r35}, %fd4;
	}
	mov.b32 	 %f2, %r35;
	abs.f32 	%f1, %f2;
	setp.lt.f32	%p4, %f1, 0f4086232B;
	@%p4 bra 	BB93_7;

	setp.lt.f64	%p5, %fd4, 0d0000000000000000;
	add.f64 	%fd129, %fd4, 0d7FF0000000000000;
	selp.f64	%fd133, 0d0000000000000000, %fd129, %p5;
	setp.geu.f32	%p6, %f1, 0f40874800;
	@%p6 bra 	BB93_7;

	shr.u32 	%r36, %r13, 31;
	add.s32 	%r37, %r13, %r36;
	shr.s32 	%r38, %r37, 1;
	shl.b32 	%r39, %r38, 20;
	add.s32 	%r40, %r39, %r15;
	mov.b64 	%fd130, {%r14, %r40};
	sub.s32 	%r41, %r13, %r38;
	shl.b32 	%r42, %r41, 20;
	add.s32 	%r43, %r42, 1072693248;
	mov.u32 	%r44, 0;
	mov.b64 	%fd131, {%r44, %r43};
	mul.f64 	%fd133, %fd130, %fd131;

BB93_7:
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r45}, %fd133;
	}
	and.b32  	%r46, %r45, 2147483647;
	setp.ne.s32	%p7, %r46, 2146435072;
	{
	.reg .b32 %temp; 
	mov.b64 	{%r47, %temp}, %fd133;
	}
	setp.ne.s32	%p8, %r47, 0;
	or.pred  	%p9, %p8, %p7;
	@!%p9 bra 	BB93_9;
	bra.uni 	BB93_8;

BB93_8:
	fma.rn.f64 	%fd133, %fd133, %fd5, %fd133;

BB93_9:
	st.param.f64	[func_retval0+0], %fd133;
	ret;
}
