
// gust.linalg.cuda.reduceDouble.ptx Maven / Gradle / Ivy
// The newest version!
//
// Generated by NVIDIA NVVM Compiler
// Compiler built on Thu Sep 5 10:08:11 2013 (1378400891)
// Cuda compilation tools, release 5.5, V5.5.0
//
// PTX ISA version this module is written against.
.version 3.2
// Target architecture for this module.
.target sm_30
// 64-bit addressing; matches the .u64 pointer parameters and .s64 address arithmetic in the kernel.
.address_size 64
// Debug source-file table entry 1; the ".loc 1 <line> <col>" directives in the kernel refer to it.
.file 1 "/Users/dlwh/src/gust/src/main/resources/gust/linalg/cuda/reduceDouble.cu", 1413672495, 1961
// Shared-memory byte array, 8-byte aligned, declared without a size. The kernel
// addresses it as one f64 slot per thread (sdata + %tid.x * 8).
// NOTE(review): the size is not fixed here - presumably supplied by the host at launch; confirm.
.extern .shared .align 8 .b8 sdata[];
// reduce: multiplies together f64 values read from the array at reduce_param_0
// and writes one partial product per thread block to the array at reduce_param_1.
//
// Parameters:
//   reduce_param_0 (.u64) - address of the input f64 array (converted with
//                           cvta.to.global and read with ld.global.f64)
//   reduce_param_1 (.u64) - address of the output f64 array; the thread with
//                           %tid.x == 0 stores to element [%ctaid.x]
//   reduce_param_2 (.u32) - exclusive upper bound for input indices (all index
//                           checks are unsigned compares against this value)
//
// Main registers:
//   %r5  = reduce_param_2        %r6 = %ctaid.x
//   %r8  = %ntid.x               %r9 = %tid.x
//   %r30 = current input index   %rd1 = address of this thread's slot in sdata
//   %fd56..%fd69 = successive copies of this thread's running product
//
// Outline:
//   1. Each thread starts from 1.0 and multiplies in input[i] and, when in
//      range, input[i + ntid.x], advancing i by 2 * ntid.x * nctaid.x.
//   2. The running product is stored to sdata[tid.x], followed by bar.sync.
//   3. Pairwise combine steps at slot offsets 256, 128 and 64, each followed
//      by bar.sync, then steps at offsets 32..1 using volatile shared accesses
//      with no barrier, executed only by threads with tid.x <= 31.
//   4. Thread 0 copies sdata[0] to output[ctaid.x].
//
// NOTE(review): the combine steps are gated only on the thresholds 512, 256,
//   ..., 2, and the largest offset is 256 slots. For a block size that is not
//   a power of two, or is larger than 512, some slots are never read by any
//   step - the kernel appears to assume ntid.x is a power of two <= 512.
// NOTE(review): for ntid.x < 64 the threads with tid.x >= ntid.x / 2 read
//   slots at index >= ntid.x in the barrier-free steps (e.g. ntid.x == 32
//   reads up to slot 47). Confirm the launch allocates enough shared memory.
// NOTE(review): the barrier-free steps presumably rely on the 32 lowest
//   threads executing in lockstep; confirm this holds on the deployment target.
.visible .entry reduce(
.param .u64 reduce_param_0,
.param .u64 reduce_param_1,
.param .u32 reduce_param_2
)
{
.reg .pred %p<18>;
.reg .s32 %r<31>;
.reg .s64 %rd<15>;
.reg .f64 %fd<70>;
// Load the three kernel parameters: input address, output address, index bound.
ld.param.u64 %rd2, [reduce_param_0];
ld.param.u64 %rd3, [reduce_param_1];
ld.param.u32 %r5, [reduce_param_2];
.loc 1 14 1
mov.u32 %r6, %ctaid.x;
shl.b32 %r7, %r6, 1;
mov.u32 %r8, %ntid.x;
.loc 1 13 1
mov.u32 %r9, %tid.x;
.loc 1 14 1
// %r30 = ctaid.x * 2 * ntid.x + tid.x : first input index for this thread.
mad.lo.s32 %r30, %r7, %r8, %r9;
.loc 1 19 1
// Enter the accumulation loop only if the first index is below the bound.
setp.lt.u32 %p1, %r30, %r5;
@%p1 bra BB0_2;
// No element for this thread: its product stays 1.0 (0d3FF0000000000000).
mov.f64 %fd67, 0d3FF0000000000000;
bra.uni BB0_6;
BB0_2:
// Running product starts at 1.0.
mov.f64 %fd68, 0d3FF0000000000000;
BB0_3:
// Loop head: %fd68 carries the running product, %r30 the current index.
.loc 1 21 1
mov.f64 %fd1, %fd68;
cvta.to.global.u64 %rd4, %rd2;
.loc 1 21 1
// product *= input[%r30]   (8 bytes per element)
mul.wide.u32 %rd5, %r30, 8;
add.s64 %rd6, %rd4, %rd5;
ld.global.f64 %fd27, [%rd6];
mul.f64 %fd69, %fd1, %fd27;
.loc 1 23 1
// Second element for this iteration sits ntid.x further on; skip it if out of range.
add.s32 %r3, %r8, %r30;
setp.ge.u32 %p2, %r3, %r5;
@%p2 bra BB0_5;
.loc 1 24 1
// product *= input[%r30 + ntid.x]
mul.wide.u32 %rd8, %r3, 8;
add.s64 %rd9, %rd4, %rd8;
ld.global.f64 %fd28, [%rd9];
mul.f64 %fd69, %fd69, %fd28;
BB0_5:
.loc 1 25 1
mov.f64 %fd68, %fd69;
.loc 1 15 1
shl.b32 %r12, %r8, 1;
mov.u32 %r13, %nctaid.x;
.loc 1 25 1
// Advance the index by 2 * ntid.x * nctaid.x.
mad.lo.s32 %r30, %r12, %r13, %r30;
.loc 1 19 1
// Repeat while the new index is still below the bound.
setp.lt.u32 %p3, %r30, %r5;
mov.f64 %fd67, %fd68;
@%p3 bra BB0_3;
BB0_6:
.loc 1 29 1
// %rd1 = sdata + tid.x * 8; publish this thread's product in its shared slot.
mov.f64 %fd66, %fd67;
mul.wide.u32 %rd10, %r9, 8;
mov.u64 %rd11, sdata;
add.s64 %rd1, %rd11, %rd10;
st.shared.f64 [%rd1], %fd66;
.loc 1 30 1
// Barrier so all slots are written before any thread reads another's slot.
bar.sync 0;
.loc 1 34 1
// Step for ntid.x >= 512: threads 0..255 multiply in the slot 256 entries ahead (2048 bytes).
setp.lt.u32 %p4, %r8, 512;
mov.f64 %fd65, %fd66;
@%p4 bra BB0_10;
.loc 1 34 1
setp.gt.u32 %p5, %r9, 255;
@%p5 bra BB0_9;
.loc 1 34 1
ld.shared.f64 %fd29, [%rd1+2048];
mul.f64 %fd66, %fd66, %fd29;
st.shared.f64 [%rd1], %fd66;
BB0_9:
.loc 1 34 1
mov.f64 %fd65, %fd66;
// Reached by every thread that did not skip this step, including those with tid.x > 255.
bar.sync 0;
BB0_10:
.loc 1 35 1
// Step for ntid.x >= 256: threads 0..127 multiply in the slot 128 entries ahead (1024 bytes).
mov.f64 %fd64, %fd65;
setp.lt.u32 %p6, %r8, 256;
mov.f64 %fd63, %fd64;
@%p6 bra BB0_14;
.loc 1 35 1
setp.gt.u32 %p7, %r9, 127;
@%p7 bra BB0_13;
.loc 1 35 1
ld.shared.f64 %fd30, [%rd1+1024];
mul.f64 %fd64, %fd64, %fd30;
st.shared.f64 [%rd1], %fd64;
BB0_13:
.loc 1 35 1
mov.f64 %fd63, %fd64;
bar.sync 0;
BB0_14:
.loc 1 36 1
// Step for ntid.x >= 128: threads 0..63 multiply in the slot 64 entries ahead (512 bytes).
mov.f64 %fd62, %fd63;
setp.lt.u32 %p8, %r8, 128;
mov.f64 %fd61, %fd62;
@%p8 bra BB0_18;
.loc 1 36 1
setp.gt.u32 %p9, %r9, 63;
@%p9 bra BB0_17;
.loc 1 36 1
ld.shared.f64 %fd31, [%rd1+512];
mul.f64 %fd62, %fd62, %fd31;
st.shared.f64 [%rd1], %fd62;
BB0_17:
.loc 1 36 1
mov.f64 %fd61, %fd62;
bar.sync 0;
BB0_18:
.loc 1 38 1
// Only threads with tid.x <= 31 take part in the remaining steps; the rest jump to the final store check.
mov.f64 %fd60, %fd61;
setp.gt.u32 %p10, %r9, 31;
@%p10 bra BB0_31;
.loc 1 41 1
// From here on the steps use ld/st.volatile.shared and have no bar.sync between them.
// ntid.x >= 64: multiply in the slot 32 entries ahead (256 bytes).
setp.lt.u32 %p11, %r8, 64;
@%p11 bra BB0_21;
.loc 1 41 1
ld.volatile.shared.f64 %fd32, [%rd1+256];
mul.f64 %fd60, %fd60, %fd32;
st.volatile.shared.f64 [%rd1], %fd60;
BB0_21:
.loc 1 42 1
// ntid.x >= 32: multiply in the slot 16 entries ahead (128 bytes).
mov.f64 %fd59, %fd60;
setp.lt.u32 %p12, %r8, 32;
@%p12 bra BB0_23;
.loc 1 42 1
ld.volatile.shared.f64 %fd33, [%rd1+128];
mul.f64 %fd59, %fd59, %fd33;
st.volatile.shared.f64 [%rd1], %fd59;
BB0_23:
.loc 1 43 1
// ntid.x >= 16: multiply in the slot 8 entries ahead (64 bytes).
mov.f64 %fd58, %fd59;
setp.lt.u32 %p13, %r8, 16;
@%p13 bra BB0_25;
.loc 1 43 1
ld.volatile.shared.f64 %fd34, [%rd1+64];
mul.f64 %fd58, %fd58, %fd34;
st.volatile.shared.f64 [%rd1], %fd58;
BB0_25:
.loc 1 44 1
// ntid.x >= 8: multiply in the slot 4 entries ahead (32 bytes).
mov.f64 %fd57, %fd58;
setp.lt.u32 %p14, %r8, 8;
@%p14 bra BB0_27;
.loc 1 44 1
ld.volatile.shared.f64 %fd35, [%rd1+32];
mul.f64 %fd57, %fd57, %fd35;
st.volatile.shared.f64 [%rd1], %fd57;
BB0_27:
.loc 1 45 1
// ntid.x >= 4: multiply in the slot 2 entries ahead (16 bytes).
mov.f64 %fd56, %fd57;
setp.lt.u32 %p15, %r8, 4;
@%p15 bra BB0_29;
.loc 1 45 1
ld.volatile.shared.f64 %fd36, [%rd1+16];
mul.f64 %fd56, %fd56, %fd36;
st.volatile.shared.f64 [%rd1], %fd56;
BB0_29:
.loc 1 46 1
// ntid.x >= 2: multiply in the next slot (8 bytes ahead).
setp.lt.u32 %p16, %r8, 2;
@%p16 bra BB0_31;
.loc 1 46 1
ld.volatile.shared.f64 %fd37, [%rd1+8];
mul.f64 %fd38, %fd56, %fd37;
st.volatile.shared.f64 [%rd1], %fd38;
BB0_31:
.loc 1 50 1
// Only the thread with tid.x == 0 writes the block's result.
setp.ne.s32 %p17, %r9, 0;
@%p17 bra BB0_33;
cvta.to.global.u64 %rd12, %rd3;
.loc 1 51 1
// output[ctaid.x] = sdata[0]
mul.wide.u32 %rd13, %r6, 8;
add.s64 %rd14, %rd12, %rd13;
ld.shared.f64 %fd39, [sdata];
st.global.f64 [%rd14], %fd39;
BB0_33:
.loc 1 52 2
ret;
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy