All downloads are free. Search and download functionality uses the official Maven repository.

gust.linalg.cuda.enforceLUFloat.ptx Maven / Gradle / Ivy

The newest version!
//
// Generated by NVIDIA NVVM Compiler
// Compiler built on Thu Sep  5 10:08:11 2013 (1378400891)
// Cuda compilation tools, release 5.5, V5.5.0
//

.version 3.2
.target sm_30
.address_size 64

	.file	1 "/Users/dlwh/src/gust/src/main/resources/gust/linalg/cuda/enforceLUFloat.cu", 1413672495, 1095

//
// enforceLU
//
// Parameters:
//   enforceLU_param_0 (.u64) : address of the buffer to write; converted from
//                              a generic address to a global-space address.
//   enforceLU_param_1 (.u32) : multiplier applied to %ctaid.x when forming the
//                              element index.
//
// Per-thread behaviour (signed comparison of %tid.x against %ctaid.x):
//   tid.x >  ctaid.x : return without touching memory
//   tid.x == ctaid.x : store 1.0f
//   tid.x <  ctaid.x : store 0.0f
// The store targets the 4-byte element at index (ctaid.x * param_1 + tid.x).
//
// NOTE(review): the element index is formed with mad.lo.s32, so only the low
// 32 bits of the product-sum are kept before it is widened to a byte offset.
// No bound on tid.x / ctaid.x is checked here beyond the comparison above.
//
.visible .entry enforceLU(
	.param .u64 enforceLU_param_0,
	.param .u32 enforceLU_param_1
)
{
	.reg .pred 	%skip, %same;
	.reg .f32 	%fill;
	.reg .s32 	%blk, %thr, %pitch, %elem;
	.reg .s64 	%buf_generic, %buf, %byte_off, %dst;

	// Fetch the kernel arguments and move the pointer into the global window.
	ld.param.u64 	%buf_generic, [enforceLU_param_0];
	ld.param.u32 	%pitch, [enforceLU_param_1];
	cvta.to.global.u64 	%buf, %buf_generic;

	.loc 1 8 1
	mov.u32 	%blk, %ctaid.x;
	.loc 1 7 1
	mov.u32 	%thr, %tid.x;

	// Guard: threads whose tid.x exceeds ctaid.x have nothing to write.
	.loc 1 9 1
	setp.gt.s32 	%skip, %thr, %blk;
	@%skip ret;

	// Value to store: 0f3F800000 (1.0f) when tid.x == ctaid.x, else 0.0f.
	.loc 1 10 1
	setp.eq.s32 	%same, %thr, %blk;
	selp.f32 	%fill, 0f3F800000, 0f00000000, %same;

	// dst = buf + 4 * (ctaid.x * param_1 + tid.x)
	mad.lo.s32 	%elem, %blk, %pitch, %thr;
	mul.wide.s32 	%byte_off, %elem, 4;
	add.s64 	%dst, %buf, %byte_off;
	.loc 1 10 1
	st.global.f32 	[%dst], %fill;

	.loc 1 11 2
	ret;
}

//
// zerosU
//
// Parameters:
//   zerosU_param_0 (.u32) : exclusive upper limit for the thread's x index
//   zerosU_param_1 (.u32) : exclusive upper limit for the thread's y index
//   zerosU_param_2 (.u64) : address of the buffer; converted from a generic
//                           address to a global-space address
//   zerosU_param_3 (.u32) : multiplier applied to the y index when forming the
//                           element index
//   zerosU_param_4 (.u32) : when non-zero, elements with x == y are cleared too
//
// Per-thread behaviour, with
//   gx = ntid.x * ctaid.x + tid.x
//   gy = ntid.y * ctaid.y + tid.y
// (all comparisons signed):
//   - if gx >= param_0 or gy >= param_1 the thread returns immediately;
//   - otherwise the 4-byte element at index (gy * param_3 + gx) is overwritten
//     with a 32-bit zero word when gx < gy, or when gx == gy and param_4 != 0;
//   - in every other case nothing is written.
//
// NOTE(review): the element index is formed with mad.lo.s32, so only the low
// 32 bits of the product-sum are kept before it is widened to a byte offset.
//
.visible .entry zerosU(
	.param .u32 zerosU_param_0,
	.param .u32 zerosU_param_1,
	.param .u64 zerosU_param_2,
	.param .u32 zerosU_param_3,
	.param .u32 zerosU_param_4
)
{
	.reg .pred 	%x_ok, %y_ok, %inside;
	.reg .pred 	%x_lt_y, %x_eq_y, %flag_set, %diag_hit;
	.reg .s32 	%x_lim, %y_lim, %pitch, %flag;
	.reg .s32 	%bdim, %bidx, %tidx, %gx, %gy, %elem, %zero;
	.reg .s64 	%buf_generic, %buf, %byte_off, %dst;

	// Fetch the kernel arguments and move the pointer into the global window.
	ld.param.u32 	%x_lim, [zerosU_param_0];
	ld.param.u32 	%y_lim, [zerosU_param_1];
	ld.param.u64 	%buf_generic, [zerosU_param_2];
	ld.param.u32 	%pitch, [zerosU_param_3];
	ld.param.u32 	%flag, [zerosU_param_4];
	cvta.to.global.u64 	%buf, %buf_generic;

	// gx = ntid.x * ctaid.x + tid.x
	.loc 1 21 1
	mov.u32 	%bdim, %ntid.x;
	mov.u32 	%bidx, %ctaid.x;
	mov.u32 	%tidx, %tid.x;
	mad.lo.s32 	%gx, %bdim, %bidx, %tidx;

	// gy = ntid.y * ctaid.y + tid.y   (scratch registers are reused)
	.loc 1 22 1
	mov.u32 	%bdim, %ntid.y;
	mov.u32 	%bidx, %ctaid.y;
	mov.u32 	%tidx, %tid.y;
	mad.lo.s32 	%gy, %bdim, %bidx, %tidx;

	// Guard: continue only when gx < param_0 AND gy < param_1.
	.loc 1 24 1
	setp.lt.s32 	%x_ok, %gx, %x_lim;
	setp.lt.s32 	%y_ok, %gy, %y_lim;
	and.pred 	%inside, %x_ok, %y_ok;
	@!%inside ret;

	// dst = buf + 4 * (gy * param_3 + gx)
	.loc 1 27 1
	mad.lo.s32 	%elem, %gy, %pitch, %gx;
	mul.wide.s32 	%byte_off, %elem, 4;
	add.s64 	%dst, %buf, %byte_off;
	mov.u32 	%zero, 0;

	// The two store conditions below are mutually exclusive (gx < gy versus
	// gx == gy), so at most one of the predicated stores executes.
	.loc 1 26 1
	setp.lt.s32 	%x_lt_y, %gx, %gy;
	.loc 1 28 1
	setp.eq.s32 	%x_eq_y, %gx, %gy;
	setp.ne.s32 	%flag_set, %flag, 0;
	and.pred 	%diag_hit, %x_eq_y, %flag_set;

	.loc 1 27 1
	@%x_lt_y st.global.u32 	[%dst], %zero;
	.loc 1 29 1
	@%diag_hit st.global.u32 	[%dst], %zero;

	.loc 1 30 2
	ret;
}

//
// zerosL
//
// Parameters:
//   zerosL_param_0 (.u32) : exclusive upper limit for the thread's x index
//   zerosL_param_1 (.u32) : exclusive upper limit for the thread's y index
//   zerosL_param_2 (.u64) : address of the buffer; converted from a generic
//                           address to a global-space address
//   zerosL_param_3 (.u32) : multiplier applied to the y index when forming the
//                           element index
//   zerosL_param_4 (.u32) : when non-zero, elements with x == y are cleared too
//
// Per-thread behaviour, with
//   gx = ntid.x * ctaid.x + tid.x
//   gy = ntid.y * ctaid.y + tid.y
// (all comparisons signed):
//   - if gx >= param_0 or gy >= param_1 the thread returns immediately;
//   - otherwise the 4-byte element at index (gy * param_3 + gx) is overwritten
//     with a 32-bit zero word when gx > gy, or when gx == gy and param_4 != 0;
//   - in every other case nothing is written.
//
// NOTE(review): the element index is formed with mad.lo.s32, so only the low
// 32 bits of the product-sum are kept before it is widened to a byte offset.
//
.visible .entry zerosL(
	.param .u32 zerosL_param_0,
	.param .u32 zerosL_param_1,
	.param .u64 zerosL_param_2,
	.param .u32 zerosL_param_3,
	.param .u32 zerosL_param_4
)
{
	.reg .pred 	%x_ok, %y_ok, %inside;
	.reg .pred 	%x_gt_y, %x_eq_y, %flag_set, %diag_hit;
	.reg .s32 	%x_lim, %y_lim, %pitch, %flag;
	.reg .s32 	%bdim, %bidx, %tidx, %gx, %gy, %elem, %zero;
	.reg .s64 	%buf_generic, %buf, %byte_off, %dst;

	// Fetch the kernel arguments and move the pointer into the global window.
	ld.param.u32 	%x_lim, [zerosL_param_0];
	ld.param.u32 	%y_lim, [zerosL_param_1];
	ld.param.u64 	%buf_generic, [zerosL_param_2];
	ld.param.u32 	%pitch, [zerosL_param_3];
	ld.param.u32 	%flag, [zerosL_param_4];
	cvta.to.global.u64 	%buf, %buf_generic;

	// gx = ntid.x * ctaid.x + tid.x
	.loc 1 39 1
	mov.u32 	%bdim, %ntid.x;
	mov.u32 	%bidx, %ctaid.x;
	mov.u32 	%tidx, %tid.x;
	mad.lo.s32 	%gx, %bdim, %bidx, %tidx;

	// gy = ntid.y * ctaid.y + tid.y   (scratch registers are reused)
	.loc 1 40 1
	mov.u32 	%bdim, %ntid.y;
	mov.u32 	%bidx, %ctaid.y;
	mov.u32 	%tidx, %tid.y;
	mad.lo.s32 	%gy, %bdim, %bidx, %tidx;

	// Guard: continue only when gx < param_0 AND gy < param_1.
	.loc 1 42 1
	setp.lt.s32 	%x_ok, %gx, %x_lim;
	setp.lt.s32 	%y_ok, %gy, %y_lim;
	and.pred 	%inside, %x_ok, %y_ok;
	@!%inside ret;

	// dst = buf + 4 * (gy * param_3 + gx)
	.loc 1 45 1
	mad.lo.s32 	%elem, %gy, %pitch, %gx;
	mul.wide.s32 	%byte_off, %elem, 4;
	add.s64 	%dst, %buf, %byte_off;
	mov.u32 	%zero, 0;

	// The two store conditions below are mutually exclusive (gx > gy versus
	// gx == gy), so at most one of the predicated stores executes.
	.loc 1 44 1
	setp.gt.s32 	%x_gt_y, %gx, %gy;
	.loc 1 46 1
	setp.eq.s32 	%x_eq_y, %gx, %gy;
	setp.ne.s32 	%flag_set, %flag, 0;
	and.pred 	%diag_hit, %x_eq_y, %flag_set;

	.loc 1 45 1
	@%x_gt_y st.global.u32 	[%dst], %zero;
	.loc 1 47 1
	@%diag_hit st.global.u32 	[%dst], %zero;

	.loc 1 48 2
	ret;
}






© 2015 - 2025 Weber Informatics LLC | Privacy Policy