All downloads are free. Search and download functionality uses the official Maven repository.

gust.linalg.cuda.reduceDouble.ptx Maven / Gradle / Ivy

The newest version!
//
// Generated by NVIDIA NVVM Compiler
// Compiler built on Thu Sep  5 10:08:11 2013 (1378400891)
// Cuda compilation tools, release 5.5, V5.5.0
//

// PTX ISA version this module is written against.
.version 3.2
// Target architecture for code generation.
.target sm_30
// Addresses are 64 bits wide (matches the .u64 pointer parameters below).
.address_size 64

// Debug source file #1; the ".loc 1 <line> <col>" directives in the kernel
// refer to line/column positions in this file.
	.file	1 "/Users/dlwh/src/gust/src/main/resources/gust/linalg/cuda/reduceDouble.cu", 1413672495, 1961
// Shared-memory byte array, 8-byte aligned, declared without a size here
// (extern, unsized []). The kernel below addresses it as an array of f64
// indexed by %tid.x.
.extern .shared .align 8 .b8 sdata[];

//-----------------------------------------------------------------------
// reduce: per-block PRODUCT reduction over an array of f64.
//
// Parameters:
//   reduce_param_0 (.u64) : address of the input f64 array; converted with
//                           cvta.to.global and read with ld.global.f64.
//   reduce_param_1 (.u64) : address of the output f64 array; one value is
//                           written at index %ctaid.x by thread 0.
//   reduce_param_2 (.u32) : element count n; all index comparisons against
//                           it are unsigned.
//
// Outline:
//   1. Each thread starts with 1.0 and multiplies in in[i] and, if in range,
//      in[i + ntid.x], for i = ctaid.x*2*ntid.x + tid.x, stepping i by
//      2*ntid.x*nctaid.x while i < n.
//   2. The per-thread product is stored to sdata[tid.x]; bar.sync.
//   3. Barrier-separated halving steps at offsets 256, 128, 64 elements,
//      each enabled only when ntid.x is at least twice the offset.
//   4. Threads with tid.x < 32 run steps at offsets 32, 16, 8, 4, 2, 1 using
//      volatile shared loads/stores and no barrier.
//   5. Thread 0 copies sdata[0] to out[ctaid.x].
//
// Register roles (as assigned below):
//   %r5 = n, %r6 = ctaid.x, %r8 = ntid.x, %r9 = tid.x, %r30 = running index i,
//   %rd1 = address of sdata[tid.x].
//
// NOTE(review): the largest halving step uses offset 256, so with
//   ntid.x > 512 the entries sdata[512..] are never multiplied into sdata[0].
//   Presumably callers launch with at most 512 threads per block - confirm.
// NOTE(review): the stages test ntid.x against powers of two only; behaviour
//   for non-power-of-two block sizes should be confirmed against the caller.
//-----------------------------------------------------------------------
.visible .entry reduce(
	.param .u64 reduce_param_0,
	.param .u64 reduce_param_1,
	.param .u32 reduce_param_2
)
{
	.reg .pred 	%p<18>;
	.reg .s32 	%r<31>;
	.reg .s64 	%rd<15>;
	.reg .f64 	%fd<70>;


	// Load the three kernel parameters: input address, output address, n.
	ld.param.u64 	%rd2, [reduce_param_0];
	ld.param.u64 	%rd3, [reduce_param_1];
	ld.param.u32 	%r5, [reduce_param_2];
	.loc 1 14 1
	// %r30 = i = (ctaid.x * 2) * ntid.x + tid.x
	mov.u32 	%r6, %ctaid.x;
	shl.b32 	%r7, %r6, 1;
	mov.u32 	%r8, %ntid.x;
	.loc 1 13 1
	mov.u32 	%r9, %tid.x;
	.loc 1 14 1
	mad.lo.s32 	%r30, %r7, %r8, %r9;
	.loc 1 19 1
	// Enter the accumulation loop only if i < n (unsigned compare).
	setp.lt.u32	%p1, %r30, %r5;
	@%p1 bra 	BB0_2;

	// No element for this thread: its partial product is the identity 1.0
	// (0d3FF0000000000000 is the f64 bit pattern of 1.0).
	mov.f64 	%fd67, 0d3FF0000000000000;
	bra.uni 	BB0_6;

BB0_2:
	// Loop pre-header: accumulator starts at 1.0.
	mov.f64 	%fd68, 0d3FF0000000000000;

BB0_3:
	.loc 1 21 1
	// Loop body: acc *= in[i]. Byte offset is i * 8 (one f64 per element).
	mov.f64 	%fd1, %fd68;
	cvta.to.global.u64 	%rd4, %rd2;
	.loc 1 21 1
	mul.wide.u32 	%rd5, %r30, 8;
	add.s64 	%rd6, %rd4, %rd5;
	ld.global.f64 	%fd27, [%rd6];
	mul.f64 	%fd69, %fd1, %fd27;
	.loc 1 23 1
	// Second element at i + ntid.x; skipped when that index is >= n.
	add.s32 	%r3, %r8, %r30;
	setp.ge.u32	%p2, %r3, %r5;
	@%p2 bra 	BB0_5;

	.loc 1 24 1
	// acc *= in[i + ntid.x]
	mul.wide.u32 	%rd8, %r3, 8;
	add.s64 	%rd9, %rd4, %rd8;
	ld.global.f64 	%fd28, [%rd9];
	mul.f64 	%fd69, %fd69, %fd28;

BB0_5:
	.loc 1 25 1
	mov.f64 	%fd68, %fd69;
	.loc 1 15 1
	// Advance: i += (ntid.x * 2) * nctaid.x, then repeat while i < n.
	shl.b32 	%r12, %r8, 1;
	mov.u32 	%r13, %nctaid.x;
	.loc 1 25 1
	mad.lo.s32 	%r30, %r12, %r13, %r30;
	.loc 1 19 1
	setp.lt.u32	%p3, %r30, %r5;
	mov.f64 	%fd67, %fd68;
	@%p3 bra 	BB0_3;

BB0_6:
	.loc 1 29 1
	// %rd1 = &sdata[tid.x] (tid.x * 8 bytes from the base of sdata);
	// publish this thread's partial product there.
	mov.f64 	%fd66, %fd67;
	mul.wide.u32 	%rd10, %r9, 8;
	mov.u64 	%rd11, sdata;
	add.s64 	%rd1, %rd11, %rd10;
	st.shared.f64 	[%rd1], %fd66;
	.loc 1 30 1
	// Barrier before any thread reads another thread's sdata slot.
	bar.sync 	0;
	.loc 1 34 1
	// Stage "256": skipped entirely when ntid.x < 512.
	setp.lt.u32	%p4, %r8, 512;
	mov.f64 	%fd65, %fd66;
	@%p4 bra 	BB0_10;

	.loc 1 34 1
	// Only threads with tid.x <= 255 do the multiply.
	setp.gt.u32	%p5, %r9, 255;
	@%p5 bra 	BB0_9;

	.loc 1 34 1
	// sdata[tid] *= sdata[tid + 256]   (2048 bytes = 256 f64)
	ld.shared.f64 	%fd29, [%rd1+2048];
	mul.f64 	%fd66, %fd66, %fd29;
	st.shared.f64 	[%rd1], %fd66;

BB0_9:
	.loc 1 34 1
	// Reached by every thread that entered this stage, including those that
	// skipped the multiply.
	mov.f64 	%fd65, %fd66;
	bar.sync 	0;

BB0_10:
	.loc 1 35 1
	// Stage "128": skipped entirely when ntid.x < 256.
	mov.f64 	%fd64, %fd65;
	setp.lt.u32	%p6, %r8, 256;
	mov.f64 	%fd63, %fd64;
	@%p6 bra 	BB0_14;

	.loc 1 35 1
	// Only threads with tid.x <= 127 do the multiply.
	setp.gt.u32	%p7, %r9, 127;
	@%p7 bra 	BB0_13;

	.loc 1 35 1
	// sdata[tid] *= sdata[tid + 128]   (1024 bytes = 128 f64)
	ld.shared.f64 	%fd30, [%rd1+1024];
	mul.f64 	%fd64, %fd64, %fd30;
	st.shared.f64 	[%rd1], %fd64;

BB0_13:
	.loc 1 35 1
	mov.f64 	%fd63, %fd64;
	bar.sync 	0;

BB0_14:
	.loc 1 36 1
	// Stage "64": skipped entirely when ntid.x < 128.
	mov.f64 	%fd62, %fd63;
	setp.lt.u32	%p8, %r8, 128;
	mov.f64 	%fd61, %fd62;
	@%p8 bra 	BB0_18;

	.loc 1 36 1
	// Only threads with tid.x <= 63 do the multiply.
	setp.gt.u32	%p9, %r9, 63;
	@%p9 bra 	BB0_17;

	.loc 1 36 1
	// sdata[tid] *= sdata[tid + 64]   (512 bytes = 64 f64)
	ld.shared.f64 	%fd31, [%rd1+512];
	mul.f64 	%fd62, %fd62, %fd31;
	st.shared.f64 	[%rd1], %fd62;

BB0_17:
	.loc 1 36 1
	mov.f64 	%fd61, %fd62;
	bar.sync 	0;

BB0_18:
	.loc 1 38 1
	// Final stage: only threads with tid.x <= 31 continue; the rest jump to
	// the write-out check. From here on there is no bar.sync between steps
	// and all shared accesses are volatile.
	// NOTE(review): presumably this relies on these 32 threads executing in
	//   lockstep - that assumption is not visible in this code; confirm.
	// NOTE(review): every thread with tid.x < 32 performs each enabled step,
	//   so e.g. with ntid.x == 32 the "16" step reads sdata[tid + 16] up to
	//   index 47, which no thread of this block has written. Confirm the
	//   launcher sizes sdata to cover these reads.
	mov.f64 	%fd60, %fd61;
	setp.gt.u32	%p10, %r9, 31;
	@%p10 bra 	BB0_31;

	.loc 1 41 1
	// Step "32": only when ntid.x >= 64.
	setp.lt.u32	%p11, %r8, 64;
	@%p11 bra 	BB0_21;

	.loc 1 41 1
	// sdata[tid] *= sdata[tid + 32]   (256 bytes = 32 f64)
	ld.volatile.shared.f64 	%fd32, [%rd1+256];
	mul.f64 	%fd60, %fd60, %fd32;
	st.volatile.shared.f64 	[%rd1], %fd60;

BB0_21:
	.loc 1 42 1
	// Step "16": only when ntid.x >= 32.
	mov.f64 	%fd59, %fd60;
	setp.lt.u32	%p12, %r8, 32;
	@%p12 bra 	BB0_23;

	.loc 1 42 1
	// sdata[tid] *= sdata[tid + 16]   (128 bytes = 16 f64)
	ld.volatile.shared.f64 	%fd33, [%rd1+128];
	mul.f64 	%fd59, %fd59, %fd33;
	st.volatile.shared.f64 	[%rd1], %fd59;

BB0_23:
	.loc 1 43 1
	// Step "8": only when ntid.x >= 16.
	mov.f64 	%fd58, %fd59;
	setp.lt.u32	%p13, %r8, 16;
	@%p13 bra 	BB0_25;

	.loc 1 43 1
	// sdata[tid] *= sdata[tid + 8]   (64 bytes = 8 f64)
	ld.volatile.shared.f64 	%fd34, [%rd1+64];
	mul.f64 	%fd58, %fd58, %fd34;
	st.volatile.shared.f64 	[%rd1], %fd58;

BB0_25:
	.loc 1 44 1
	// Step "4": only when ntid.x >= 8.
	mov.f64 	%fd57, %fd58;
	setp.lt.u32	%p14, %r8, 8;
	@%p14 bra 	BB0_27;

	.loc 1 44 1
	// sdata[tid] *= sdata[tid + 4]   (32 bytes = 4 f64)
	ld.volatile.shared.f64 	%fd35, [%rd1+32];
	mul.f64 	%fd57, %fd57, %fd35;
	st.volatile.shared.f64 	[%rd1], %fd57;

BB0_27:
	.loc 1 45 1
	// Step "2": only when ntid.x >= 4.
	mov.f64 	%fd56, %fd57;
	setp.lt.u32	%p15, %r8, 4;
	@%p15 bra 	BB0_29;

	.loc 1 45 1
	// sdata[tid] *= sdata[tid + 2]   (16 bytes = 2 f64)
	ld.volatile.shared.f64 	%fd36, [%rd1+16];
	mul.f64 	%fd56, %fd56, %fd36;
	st.volatile.shared.f64 	[%rd1], %fd56;

BB0_29:
	.loc 1 46 1
	// Step "1": only when ntid.x >= 2.
	setp.lt.u32	%p16, %r8, 2;
	@%p16 bra 	BB0_31;

	.loc 1 46 1
	// sdata[tid] *= sdata[tid + 1]   (8 bytes = 1 f64)
	ld.volatile.shared.f64 	%fd37, [%rd1+8];
	mul.f64 	%fd38, %fd56, %fd37;
	st.volatile.shared.f64 	[%rd1], %fd38;

BB0_31:
	.loc 1 50 1
	// Only thread 0 of the block writes the result.
	setp.ne.s32	%p17, %r9, 0;
	@%p17 bra 	BB0_33;

	cvta.to.global.u64 	%rd12, %rd3;
	.loc 1 51 1
	// out[ctaid.x] = sdata[0]   (byte offset ctaid.x * 8)
	mul.wide.u32 	%rd13, %r6, 8;
	add.s64 	%rd14, %rd12, %rd13;
	ld.shared.f64 	%fd39, [sdata];
	st.global.f64 	[%rd14], %fd39;

BB0_33:
	.loc 1 52 2
	ret;
}






© 2015 - 2025 Weber Informatics LLC | Privacy Policy