All Downloads are FREE. Search and download functionalities are using the official Maven repository.

gust.linalg.cuda.reduceFloat.ptx Maven / Gradle / Ivy

The newest version!
//
// Generated by NVIDIA NVVM Compiler
// Compiler built on Thu Sep  5 10:08:11 2013 (1378400891)
// Cuda compilation tools, release 5.5, V5.5.0
//

// Module directives: PTX ISA version 3.2, target architecture sm_30, and
// 64-bit addresses.
.version 3.2
.target sm_30
.address_size 64

	// Debug file table: index 1 is the file that the `.loc 1 ...` directives
	// in the kernel below refer to.
	.file	1 "/Users/dlwh/src/gust/src/main/resources/gust/linalg/cuda/reduceFloat.cu", 1413672495, 1957
// Shared-memory byte array, 4-byte aligned, declared extern with no size in
// this file. The kernel below addresses it as f32 slots (4 bytes each).
// NOTE(review): presumably the size is supplied by the launcher as dynamic
// shared memory -- confirm it covers every slot the kernel touches.
.extern .shared .align 4 .b8 sdata[];

//-----------------------------------------------------------------------
// reduce -- per-block product of an f32 array.
//
// Every thread starts from 1.0 and multiplies in the input elements it
// visits; the per-thread products are then combined through the shared
// array sdata, and the thread with %tid.x == 0 stores sdata[0] to
// output[%ctaid.x].
//
// Params:
//   reduce_param_0  u64  address of the f32 input  (read via ld.global)
//   reduce_param_1  u64  address of the f32 output (one st.global per block,
//                        at element index %ctaid.x)
//   reduce_param_2  u32  element count n; indices are compared unsigned
//
// Long-lived registers:
//   %r5  = n            %r6  = %tid.x      %r7 = %ctaid.x
//   %r9  = %ntid.x      %r30 = current input index
//   %rd1 = address of this thread's slot, sdata + 4 * %tid.x
//
// The `.loc 1 <line> <col>` directives map instructions to file index 1.
//
// NOTE(review): the combine steps pair slot tid with tid + 256, 128, ...,
// 1 based on %ntid.x thresholds, which looks like it expects %ntid.x to be
// a power of two -- confirm against the launcher.
//-----------------------------------------------------------------------
.visible .entry reduce(
	.param .u64 reduce_param_0,
	.param .u64 reduce_param_1,
	.param .u32 reduce_param_2
)
{
	.reg .pred 	%p<18>;
	.reg .f32 	%f<70>;
	.reg .s32 	%r<31>;
	.reg .s64 	%rd<15>;


	// Fetch the three kernel arguments: input address, output address, n.
	ld.param.u64 	%rd2, [reduce_param_0];
	ld.param.u64 	%rd3, [reduce_param_1];
	ld.param.u32 	%r5, [reduce_param_2];
	.loc 1 13 1
	mov.u32 	%r6, %tid.x;
	.loc 1 14 1
	// %r30 = 2 * %ctaid.x * %ntid.x + %tid.x : first input index for this thread.
	mov.u32 	%r7, %ctaid.x;
	shl.b32 	%r8, %r7, 1;
	mov.u32 	%r9, %ntid.x;
	mad.lo.s32 	%r30, %r8, %r9, %r6;
	.loc 1 19 1
	// Enter the accumulation loop only if the first index is below n.
	setp.lt.u32	%p1, %r30, %r5;
	@%p1 bra 	BB0_2;

	// No element for this thread: its product is 1.0 (0f3F800000 is 1.0f).
	mov.f32 	%f67, 0f3F800000;
	bra.uni 	BB0_6;

BB0_2:
	// Loop preheader: running product starts at 1.0.
	mov.f32 	%f68, 0f3F800000;

BB0_3:
	// Loop body. %f68 carries the running product into each iteration.
	.loc 1 21 1
	mov.f32 	%f1, %f68;
	cvta.to.global.u64 	%rd4, %rd2;
	.loc 1 21 1
	// %f69 = product * input[%r30]   (byte offset = index * 4, widened to 64 bits)
	mul.wide.u32 	%rd5, %r30, 4;
	add.s64 	%rd6, %rd4, %rd5;
	ld.global.f32 	%f27, [%rd6];
	mul.f32 	%f69, %f1, %f27;
	.loc 1 23 1
	// Second element of the iteration sits %ntid.x further on; skip it when
	// that index is not below n.
	add.s32 	%r3, %r9, %r30;
	setp.ge.u32	%p2, %r3, %r5;
	@%p2 bra 	BB0_5;

	.loc 1 24 1
	// %f69 *= input[%r30 + %ntid.x]
	mul.wide.u32 	%rd8, %r3, 4;
	add.s64 	%rd9, %rd4, %rd8;
	ld.global.f32 	%f28, [%rd9];
	mul.f32 	%f69, %f69, %f28;

BB0_5:
	.loc 1 25 1
	mov.f32 	%f68, %f69;
	.loc 1 15 1
	// Advance the index by 2 * %ntid.x * %nctaid.x and loop while it is
	// still below n. The index arithmetic is 32-bit.
	shl.b32 	%r12, %r9, 1;
	mov.u32 	%r13, %nctaid.x;
	.loc 1 25 1
	mad.lo.s32 	%r30, %r12, %r13, %r30;
	.loc 1 19 1
	setp.lt.u32	%p3, %r30, %r5;
	mov.f32 	%f67, %f68;
	@%p3 bra 	BB0_3;

BB0_6:
	.loc 1 29 1
	// Publish this thread's product: sdata[%tid.x] = %f66, with
	// %rd1 = sdata + 4 * %tid.x kept for all later slot accesses.
	mov.f32 	%f66, %f67;
	mul.wide.u32 	%rd10, %r6, 4;
	mov.u64 	%rd11, sdata;
	add.s64 	%rd1, %rd11, %rd10;
	st.shared.f32 	[%rd1], %f66;
	.loc 1 30 1
	// Barrier 0: all per-thread products are stored before any are combined.
	bar.sync 	0;
	.loc 1 34 1
	// Step for %ntid.x >= 512: threads 0..255 fold in slot tid + 256
	// (+1024 bytes). The %ntid.x test is the same for every thread of the
	// block, so the bar.sync at BB0_9 is reached by all of them or none.
	setp.lt.u32	%p4, %r9, 512;
	mov.f32 	%f65, %f66;
	@%p4 bra 	BB0_10;

	.loc 1 34 1
	setp.gt.u32	%p5, %r6, 255;
	@%p5 bra 	BB0_9;

	.loc 1 34 1
	ld.shared.f32 	%f29, [%rd1+1024];
	mul.f32 	%f66, %f66, %f29;
	st.shared.f32 	[%rd1], %f66;

BB0_9:
	.loc 1 34 1
	mov.f32 	%f65, %f66;
	bar.sync 	0;

BB0_10:
	.loc 1 35 1
	// Step for %ntid.x >= 256: threads 0..127 fold in slot tid + 128 (+512 bytes).
	mov.f32 	%f64, %f65;
	setp.lt.u32	%p6, %r9, 256;
	mov.f32 	%f63, %f64;
	@%p6 bra 	BB0_14;

	.loc 1 35 1
	setp.gt.u32	%p7, %r6, 127;
	@%p7 bra 	BB0_13;

	.loc 1 35 1
	ld.shared.f32 	%f30, [%rd1+512];
	mul.f32 	%f64, %f64, %f30;
	st.shared.f32 	[%rd1], %f64;

BB0_13:
	.loc 1 35 1
	mov.f32 	%f63, %f64;
	bar.sync 	0;

BB0_14:
	.loc 1 36 1
	// Step for %ntid.x >= 128: threads 0..63 fold in slot tid + 64 (+256 bytes).
	mov.f32 	%f62, %f63;
	setp.lt.u32	%p8, %r9, 128;
	mov.f32 	%f61, %f62;
	@%p8 bra 	BB0_18;

	.loc 1 36 1
	setp.gt.u32	%p9, %r6, 63;
	@%p9 bra 	BB0_17;

	.loc 1 36 1
	ld.shared.f32 	%f31, [%rd1+256];
	mul.f32 	%f62, %f62, %f31;
	st.shared.f32 	[%rd1], %f62;

BB0_17:
	.loc 1 36 1
	mov.f32 	%f61, %f62;
	bar.sync 	0;

BB0_18:
	.loc 1 38 1
	// Final stage: only threads with %tid.x <= 31 continue. The steps below
	// use ld.volatile/st.volatile on sdata and issue no bar.sync between them.
	// NOTE(review): presumably this relies on those 32 threads running in
	// lockstep as one warp -- confirm that holds on every architecture this
	// PTX is JIT-compiled for.
	// NOTE(review): every thread in this stage runs every enabled step, so
	// e.g. tid 31 reads slot 47 at the +64 step even when %ntid.x is 32 --
	// verify the launcher allocates enough shared memory for that.
	mov.f32 	%f60, %f61;
	setp.gt.u32	%p10, %r6, 31;
	@%p10 bra 	BB0_31;

	.loc 1 41 1
	// %ntid.x >= 64: fold in slot tid + 32 (+128 bytes).
	setp.lt.u32	%p11, %r9, 64;
	@%p11 bra 	BB0_21;

	.loc 1 41 1
	ld.volatile.shared.f32 	%f32, [%rd1+128];
	mul.f32 	%f60, %f60, %f32;
	st.volatile.shared.f32 	[%rd1], %f60;

BB0_21:
	.loc 1 42 1
	// %ntid.x >= 32: fold in slot tid + 16 (+64 bytes).
	mov.f32 	%f59, %f60;
	setp.lt.u32	%p12, %r9, 32;
	@%p12 bra 	BB0_23;

	.loc 1 42 1
	ld.volatile.shared.f32 	%f33, [%rd1+64];
	mul.f32 	%f59, %f59, %f33;
	st.volatile.shared.f32 	[%rd1], %f59;

BB0_23:
	.loc 1 43 1
	// %ntid.x >= 16: fold in slot tid + 8 (+32 bytes).
	mov.f32 	%f58, %f59;
	setp.lt.u32	%p13, %r9, 16;
	@%p13 bra 	BB0_25;

	.loc 1 43 1
	ld.volatile.shared.f32 	%f34, [%rd1+32];
	mul.f32 	%f58, %f58, %f34;
	st.volatile.shared.f32 	[%rd1], %f58;

BB0_25:
	.loc 1 44 1
	// %ntid.x >= 8: fold in slot tid + 4 (+16 bytes).
	mov.f32 	%f57, %f58;
	setp.lt.u32	%p14, %r9, 8;
	@%p14 bra 	BB0_27;

	.loc 1 44 1
	ld.volatile.shared.f32 	%f35, [%rd1+16];
	mul.f32 	%f57, %f57, %f35;
	st.volatile.shared.f32 	[%rd1], %f57;

BB0_27:
	.loc 1 45 1
	// %ntid.x >= 4: fold in slot tid + 2 (+8 bytes).
	mov.f32 	%f56, %f57;
	setp.lt.u32	%p15, %r9, 4;
	@%p15 bra 	BB0_29;

	.loc 1 45 1
	ld.volatile.shared.f32 	%f36, [%rd1+8];
	mul.f32 	%f56, %f56, %f36;
	st.volatile.shared.f32 	[%rd1], %f56;

BB0_29:
	.loc 1 46 1
	// %ntid.x >= 2: fold in slot tid + 1 (+4 bytes). Last step, so the
	// product is only stored, not carried in a register any further.
	setp.lt.u32	%p16, %r9, 2;
	@%p16 bra 	BB0_31;

	.loc 1 46 1
	ld.volatile.shared.f32 	%f37, [%rd1+4];
	mul.f32 	%f38, %f56, %f37;
	st.volatile.shared.f32 	[%rd1], %f38;

BB0_31:
	.loc 1 50 1
	// Only the thread with %tid.x == 0 writes the block's result.
	setp.ne.s32	%p17, %r6, 0;
	@%p17 bra 	BB0_33;

	.loc 1 51 1
	// output[%ctaid.x] = sdata[0]
	ld.shared.f32 	%f39, [sdata];
	cvta.to.global.u64 	%rd12, %rd3;
	.loc 1 51 1
	mul.wide.u32 	%rd13, %r7, 4;
	add.s64 	%rd14, %rd12, %rd13;
	st.global.f32 	[%rd14], %f39;

BB0_33:
	.loc 1 52 2
	ret;
}






© 2015 - 2025 Weber Informatics LLC | Privacy Policy