
// gust.linalg.cuda.reduceFloat.ptx Maven / Gradle / Ivy
// The newest version!
//
// Generated by NVIDIA NVVM Compiler
// Compiler built on Thu Sep 5 10:08:11 2013 (1378400891)
// Cuda compilation tools, release 5.5, V5.5.0
//
// Module-level directives: PTX ISA version, target architecture, and pointer width.
.version 3.2
.target sm_30
.address_size 64
// Debug info: file index 1 is the CUDA source that the `.loc 1 ...` directives refer to.
.file 1 "/Users/dlwh/src/gust/src/main/resources/gust/linalg/cuda/reduceFloat.cu", 1413672495, 1957
// Unsized, 4-byte-aligned byte array in the shared state space, declared extern:
// no size is given here, so the amount available must be supplied from outside
// this module (presumably as dynamic shared memory at launch -- confirm with the host code).
.extern .shared .align 4 .b8 sdata[];
// reduce: product reduction over f32 values.
// Each thread multiplies together the input elements it visits, the block then
// combines the per-thread products through the shared array `sdata`, and
// thread 0 stores the block's result at index %ctaid.x of the output.
//
// Parameters (roles as established by their uses below):
//   reduce_param_0 (.u64) - address read as f32 elements (input)
//   reduce_param_1 (.u64) - address written with one f32 at index %ctaid.x (output)
//   reduce_param_2 (.u32) - element count n; all index checks against it are unsigned
//
// `.loc 1 N 1` directives are debug line markers (source file 1, line N).
.visible .entry reduce(
.param .u64 reduce_param_0,
.param .u64 reduce_param_1,
.param .u32 reduce_param_2
)
{
// Virtual register declarations.
.reg .pred %p<18>;
.reg .f32 %f<70>;
.reg .s32 %r<31>;
.reg .s64 %rd<15>;
// %rd2 = input address, %rd3 = output address, %r5 = n.
ld.param.u64 %rd2, [reduce_param_0];
ld.param.u64 %rd3, [reduce_param_1];
ld.param.u32 %r5, [reduce_param_2];
.loc 1 13 1
// %r6 = tid (thread index within the block).
mov.u32 %r6, %tid.x;
.loc 1 14 1
// %r30 = i = 2 * %ctaid.x * %ntid.x + tid  (%r7 = block index, %r9 = block size).
mov.u32 %r7, %ctaid.x;
shl.b32 %r8, %r7, 1;
mov.u32 %r9, %ntid.x;
mad.lo.s32 %r30, %r8, %r9, %r6;
.loc 1 19 1
// If i >= n this thread visits no elements: its product is the identity 1.0f
// (0f3F800000) and it goes straight to the shared-memory phase.
setp.lt.u32 %p1, %r30, %r5;
@%p1 bra BB0_2;
mov.f32 %f67, 0f3F800000;
bra.uni BB0_6;
BB0_2:
// Running product starts at 1.0f.
mov.f32 %f68, 0f3F800000;
BB0_3:
// Loop body: multiply in input[i], and input[i + %ntid.x] when that index is < n.
.loc 1 21 1
mov.f32 %f1, %f68;
cvta.to.global.u64 %rd4, %rd2;
.loc 1 21 1
// %rd6 = input + 4 * i; product *= input[i].
mul.wide.u32 %rd5, %r30, 4;
add.s64 %rd6, %rd4, %rd5;
ld.global.f32 %f27, [%rd6];
mul.f32 %f69, %f1, %f27;
.loc 1 23 1
// %r3 = i + %ntid.x; skip the second load when it is out of range.
add.s32 %r3, %r9, %r30;
setp.ge.u32 %p2, %r3, %r5;
@%p2 bra BB0_5;
.loc 1 24 1
mul.wide.u32 %rd8, %r3, 4;
add.s64 %rd9, %rd4, %rd8;
ld.global.f32 %f28, [%rd9];
mul.f32 %f69, %f69, %f28;
BB0_5:
.loc 1 25 1
mov.f32 %f68, %f69;
.loc 1 15 1
// Stride = 2 * %ntid.x * %nctaid.x.
shl.b32 %r12, %r9, 1;
mov.u32 %r13, %nctaid.x;
.loc 1 25 1
// i += stride; repeat while i < n.
mad.lo.s32 %r30, %r12, %r13, %r30;
.loc 1 19 1
setp.lt.u32 %p3, %r30, %r5;
mov.f32 %f67, %f68;
@%p3 bra BB0_3;
BB0_6:
.loc 1 29 1
// %rd1 = address of sdata[tid] (f32 elements, byte offset 4 * tid);
// publish this thread's product there.
mov.f32 %f66, %f67;
mul.wide.u32 %rd10, %r6, 4;
mov.u64 %rd11, sdata;
add.s64 %rd1, %rd11, %rd10;
st.shared.f32 [%rd1], %f66;
.loc 1 30 1
// Barrier before any thread reads another thread's slot.
bar.sync 0;
.loc 1 34 1
// Step 256: only when %ntid.x >= 512. Threads with tid <= 255 fold in
// sdata[tid + 256] (byte offset +1024). The running value is also kept in a register.
setp.lt.u32 %p4, %r9, 512;
mov.f32 %f65, %f66;
@%p4 bra BB0_10;
.loc 1 34 1
setp.gt.u32 %p5, %r6, 255;
@%p5 bra BB0_9;
.loc 1 34 1
ld.shared.f32 %f29, [%rd1+1024];
mul.f32 %f66, %f66, %f29;
st.shared.f32 [%rd1], %f66;
BB0_9:
.loc 1 34 1
mov.f32 %f65, %f66;
// Reached by every thread of the block when %ntid.x >= 512 (the skip above depends
// only on %ntid.x, not on tid).
bar.sync 0;
BB0_10:
.loc 1 35 1
// Step 128: only when %ntid.x >= 256. Threads with tid <= 127 fold in
// sdata[tid + 128] (byte offset +512).
mov.f32 %f64, %f65;
setp.lt.u32 %p6, %r9, 256;
mov.f32 %f63, %f64;
@%p6 bra BB0_14;
.loc 1 35 1
setp.gt.u32 %p7, %r6, 127;
@%p7 bra BB0_13;
.loc 1 35 1
ld.shared.f32 %f30, [%rd1+512];
mul.f32 %f64, %f64, %f30;
st.shared.f32 [%rd1], %f64;
BB0_13:
.loc 1 35 1
mov.f32 %f63, %f64;
bar.sync 0;
BB0_14:
.loc 1 36 1
// Step 64: only when %ntid.x >= 128. Threads with tid <= 63 fold in
// sdata[tid + 64] (byte offset +256).
mov.f32 %f62, %f63;
setp.lt.u32 %p8, %r9, 128;
mov.f32 %f61, %f62;
@%p8 bra BB0_18;
.loc 1 36 1
setp.gt.u32 %p9, %r6, 63;
@%p9 bra BB0_17;
.loc 1 36 1
ld.shared.f32 %f31, [%rd1+256];
mul.f32 %f62, %f62, %f31;
st.shared.f32 [%rd1], %f62;
BB0_17:
.loc 1 36 1
mov.f32 %f61, %f62;
bar.sync 0;
BB0_18:
.loc 1 38 1
// Final phase: only threads with tid <= 31 take part; the rest jump to the exit check.
// From here on there are no bar.sync instructions between steps; the shared accesses
// are ld.volatile/st.volatile instead.
// NOTE(review): this presumably relies on these 32 threads executing in lockstep
// (one warp on the sm_30 target) -- confirm before retargeting to newer architectures.
// NOTE(review): each step below is guarded only by %ntid.x, not by tid, so for
// %ntid.x <= 32 some threads read sdata slots at indices >= %ntid.x (e.g. tid + 16 with
// %ntid.x == 32). Verify the launch provides enough shared memory for those reads.
mov.f32 %f60, %f61;
setp.gt.u32 %p10, %r6, 31;
@%p10 bra BB0_31;
.loc 1 41 1
// Step 32 (when %ntid.x >= 64): fold in sdata[tid + 32] (byte offset +128).
setp.lt.u32 %p11, %r9, 64;
@%p11 bra BB0_21;
.loc 1 41 1
ld.volatile.shared.f32 %f32, [%rd1+128];
mul.f32 %f60, %f60, %f32;
st.volatile.shared.f32 [%rd1], %f60;
BB0_21:
.loc 1 42 1
// Step 16 (when %ntid.x >= 32): fold in sdata[tid + 16] (byte offset +64).
mov.f32 %f59, %f60;
setp.lt.u32 %p12, %r9, 32;
@%p12 bra BB0_23;
.loc 1 42 1
ld.volatile.shared.f32 %f33, [%rd1+64];
mul.f32 %f59, %f59, %f33;
st.volatile.shared.f32 [%rd1], %f59;
BB0_23:
.loc 1 43 1
// Step 8 (when %ntid.x >= 16): fold in sdata[tid + 8] (byte offset +32).
mov.f32 %f58, %f59;
setp.lt.u32 %p13, %r9, 16;
@%p13 bra BB0_25;
.loc 1 43 1
ld.volatile.shared.f32 %f34, [%rd1+32];
mul.f32 %f58, %f58, %f34;
st.volatile.shared.f32 [%rd1], %f58;
BB0_25:
.loc 1 44 1
// Step 4 (when %ntid.x >= 8): fold in sdata[tid + 4] (byte offset +16).
mov.f32 %f57, %f58;
setp.lt.u32 %p14, %r9, 8;
@%p14 bra BB0_27;
.loc 1 44 1
ld.volatile.shared.f32 %f35, [%rd1+16];
mul.f32 %f57, %f57, %f35;
st.volatile.shared.f32 [%rd1], %f57;
BB0_27:
.loc 1 45 1
// Step 2 (when %ntid.x >= 4): fold in sdata[tid + 2] (byte offset +8).
mov.f32 %f56, %f57;
setp.lt.u32 %p15, %r9, 4;
@%p15 bra BB0_29;
.loc 1 45 1
ld.volatile.shared.f32 %f36, [%rd1+8];
mul.f32 %f56, %f56, %f36;
st.volatile.shared.f32 [%rd1], %f56;
BB0_29:
.loc 1 46 1
// Step 1 (when %ntid.x >= 2): fold in sdata[tid + 1] (byte offset +4).
setp.lt.u32 %p16, %r9, 2;
@%p16 bra BB0_31;
.loc 1 46 1
ld.volatile.shared.f32 %f37, [%rd1+4];
mul.f32 %f38, %f56, %f37;
st.volatile.shared.f32 [%rd1], %f38;
BB0_31:
.loc 1 50 1
// Only thread 0 writes the block result.
setp.ne.s32 %p17, %r6, 0;
@%p17 bra BB0_33;
.loc 1 51 1
// output[%ctaid.x] = sdata[0].
ld.shared.f32 %f39, [sdata];
cvta.to.global.u64 %rd12, %rd3;
.loc 1 51 1
mul.wide.u32 %rd13, %r7, 4;
add.s64 %rd14, %rd12, %rd13;
st.global.f32 [%rd14], %f39;
BB0_33:
.loc 1 52 2
ret;
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy