http://git-wip-us.apache.org/repos/asf/systemml/blob/3ca91e68/src/main/cpp/kernels/SystemML.ptx ---------------------------------------------------------------------- diff --git a/src/main/cpp/kernels/SystemML.ptx b/src/main/cpp/kernels/SystemML.ptx index d382fc5..c990f27 100644 --- a/src/main/cpp/kernels/SystemML.ptx +++ b/src/main/cpp/kernels/SystemML.ptx @@ -1,8 +1,8 @@ // // Generated by NVIDIA NVVM Compiler // -// Compiler Build ID: CL-21554848 -// Cuda compilation tools, release 8.0, V8.0.61 +// Compiler Build ID: CL-21124049 +// Cuda compilation tools, release 8.0, V8.0.44 // Based on LLVM 3.4svn // @@ -1277,129 +1277,106 @@ BB21_2: .param .u32 matrix_matrix_cellwise_op_d_param_7 ) { - .reg .pred %p<73>; - .reg .b32 %r<66>; - .reg .f64 %fd<56>; + .reg .pred %p<77>; + .reg .b32 %r<56>; + .reg .f64 %fd<55>; .reg .b64 %rd<19>; ld.param.u64 %rd2, [matrix_matrix_cellwise_op_d_param_0]; ld.param.u64 %rd3, [matrix_matrix_cellwise_op_d_param_1]; ld.param.u64 %rd4, [matrix_matrix_cellwise_op_d_param_2]; - ld.param.u32 %r14, [matrix_matrix_cellwise_op_d_param_3]; - ld.param.u32 %r10, [matrix_matrix_cellwise_op_d_param_4]; - ld.param.u32 %r11, [matrix_matrix_cellwise_op_d_param_5]; - ld.param.u32 %r12, [matrix_matrix_cellwise_op_d_param_6]; - ld.param.u32 %r13, [matrix_matrix_cellwise_op_d_param_7]; - mov.u32 %r15, %ntid.x; - mov.u32 %r16, %ctaid.x; - mov.u32 %r17, %tid.x; - mad.lo.s32 %r18, %r15, %r16, %r17; - div.s32 %r1, %r18, %r10; - rem.s32 %r2, %r18, %r10; - setp.lt.s32 %p2, %r1, %r14; - setp.gt.s32 %p3, %r10, -1; + ld.param.u32 %r10, [matrix_matrix_cellwise_op_d_param_3]; + ld.param.u32 %r6, [matrix_matrix_cellwise_op_d_param_4]; + ld.param.u32 %r7, [matrix_matrix_cellwise_op_d_param_5]; + ld.param.u32 %r8, [matrix_matrix_cellwise_op_d_param_6]; + ld.param.u32 %r9, [matrix_matrix_cellwise_op_d_param_7]; + mov.u32 %r11, %ctaid.x; + mov.u32 %r12, %ntid.x; + mov.u32 %r13, %tid.x; + mad.lo.s32 %r1, %r12, %r11, %r13; + div.s32 %r2, %r1, %r6; + setp.lt.s32 %p2, %r2, %r10; + setp.gt.s32 %p3, %r6, -1; and.pred %p4, %p2, %p3; - @!%p4 bra BB22_77; + @!%p4 bra BB22_65; bra.uni BB22_1; BB22_1: - mad.lo.s32 %r3, %r1, %r10, %r2; - setp.eq.s32 %p5, %r11, 1; - mov.u32 %r64, %r1; - @%p5 bra BB22_5; - - setp.ne.s32 %p6, %r11, 2; - mov.u32 %r65, %r3; - @%p6 bra BB22_4; - - mov.u32 %r65, %r2; - -BB22_4: - mov.u32 %r59, %r65; - mov.u32 %r4, %r59; - mov.u32 %r64, %r4; - -BB22_5: - mov.u32 %r5, %r64; - setp.eq.s32 %p7, %r12, 1; - mov.u32 %r62, %r1; - @%p7 bra BB22_9; - - setp.ne.s32 %p8, %r12, 2; - mov.u32 %r63, %r3; - @%p8 bra BB22_8; - - mov.u32 %r63, %r2; - -BB22_8: - mov.u32 %r62, %r63; - -BB22_9: - cvta.to.global.u64 %rd5, %rd3; - cvta.to.global.u64 %rd6, %rd2; - mul.wide.s32 %rd7, %r5, 8; - add.s64 %rd8, %rd6, %rd7; - ld.global.f64 %fd1, [%rd8]; - mul.wide.s32 %rd9, %r62, 8; - add.s64 %rd10, %rd5, %rd9; + rem.s32 %r14, %r1, %r6; + cvta.to.global.u64 %rd5, %rd2; + mad.lo.s32 %r3, %r2, %r6, %r14; + setp.eq.s32 %p5, %r7, 2; + selp.b32 %r15, %r14, %r3, %p5; + setp.eq.s32 %p6, %r7, 1; + selp.b32 %r16, %r2, %r15, %p6; + setp.eq.s32 %p7, %r8, 2; + selp.b32 %r17, %r14, %r3, %p7; + setp.eq.s32 %p8, %r8, 1; + selp.b32 %r18, %r2, %r17, %p8; + mul.wide.s32 %rd6, %r16, 8; + add.s64 %rd7, %rd5, %rd6; + ld.global.f64 %fd1, [%rd7]; + cvta.to.global.u64 %rd8, %rd3; + mul.wide.s32 %rd9, %r18, 8; + add.s64 %rd10, %rd8, %rd9; ld.global.f64 %fd2, [%rd10]; - mov.f64 %fd55, 0d7FEFFFFFFFFFFFFF; - setp.gt.s32 %p9, %r13, 8; - @%p9 bra BB22_26; + mov.f64 %fd54, 0d7FEFFFFFFFFFFFFF; + setp.gt.s32 %p9, %r9, 8; + @%p9 bra BB22_18; - setp.gt.s32 %p23, %r13, 3; - @%p23 bra BB22_18; + setp.gt.s32 %p23, %r9, 3; + @%p23 bra BB22_10; - setp.gt.s32 %p30, %r13, 1; - @%p30 bra BB22_15; + setp.gt.s32 %p30, %r9, 1; + @%p30 bra BB22_7; - setp.eq.s32 %p33, %r13, 0; - @%p33 bra BB22_75; - bra.uni BB22_13; + setp.eq.s32 %p33, %r9, 0; + @%p33 bra BB22_63; + bra.uni BB22_5; -BB22_75: - add.f64 %fd55, %fd1, %fd2; - bra.uni BB22_76; +BB22_63: + add.f64 %fd54, %fd1, %fd2; + bra.uni BB22_64; -BB22_26: - setp.gt.s32 %p10, %r13, 13; - @%p10 bra BB22_35; +BB22_18: + setp.gt.s32 %p10, %r9, 13; + @%p10 bra BB22_27; - setp.gt.s32 %p17, %r13, 10; - @%p17 bra BB22_31; + setp.gt.s32 %p17, %r9, 10; + @%p17 bra BB22_23; - setp.eq.s32 %p21, %r13, 9; - @%p21 bra BB22_55; - bra.uni BB22_29; + setp.eq.s32 %p21, %r9, 9; + @%p21 bra BB22_45; + bra.uni BB22_21; -BB22_55: - setp.eq.f64 %p48, %fd1, %fd2; - selp.f64 %fd55, 0d3FF0000000000000, 0d0000000000000000, %p48; - bra.uni BB22_76; +BB22_45: + setp.eq.f64 %p50, %fd1, %fd2; + selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p50; + bra.uni BB22_64; -BB22_18: - setp.gt.s32 %p24, %r13, 5; - @%p24 bra BB22_22; +BB22_10: + setp.gt.s32 %p24, %r9, 5; + @%p24 bra BB22_14; - setp.eq.s32 %p28, %r13, 4; - @%p28 bra BB22_58; - bra.uni BB22_20; + setp.eq.s32 %p28, %r9, 4; + @%p28 bra BB22_48; + bra.uni BB22_12; -BB22_58: +BB22_48: { .reg .b32 %temp; - mov.b64 {%temp, %r8}, %fd1; + mov.b64 {%temp, %r4}, %fd1; } { .reg .b32 %temp; - mov.b64 {%temp, %r9}, %fd2; + mov.b64 {%temp, %r5}, %fd2; } - bfe.u32 %r31, %r9, 20, 11; + bfe.u32 %r31, %r5, 20, 11; add.s32 %r32, %r31, -1012; mov.b64 %rd15, %fd2; shl.b64 %rd1, %rd15, %r32; - setp.eq.s64 %p53, %rd1, -9223372036854775808; + setp.eq.s64 %p55, %rd1, -9223372036854775808; abs.f64 %fd19, %fd1; // Callseq Start 0 { @@ -1416,342 +1393,340 @@ BB22_58: param0, param1 ); - ld.param.f64 %fd54, [retval0+0]; + ld.param.f64 %fd53, [retval0+0]; //{ }// Callseq End 0 - setp.lt.s32 %p54, %r8, 0; - and.pred %p1, %p54, %p53; - @!%p1 bra BB22_60; - bra.uni BB22_59; + setp.lt.s32 %p56, %r4, 0; + and.pred %p1, %p56, %p55; + @!%p1 bra BB22_50; + bra.uni BB22_49; -BB22_59: +BB22_49: { .reg .b32 %temp; - mov.b64 {%temp, %r33}, %fd54; + mov.b64 {%temp, %r33}, %fd53; } xor.b32 %r34, %r33, -2147483648; { .reg .b32 %temp; - mov.b64 {%r35, %temp}, %fd54; + mov.b64 {%r35, %temp}, %fd53; } - mov.b64 %fd54, {%r35, %r34}; + mov.b64 %fd53, {%r35, %r34}; -BB22_60: - mov.f64 %fd53, %fd54; - setp.eq.f64 %p55, %fd1, 0d0000000000000000; - @%p55 bra BB22_63; - bra.uni BB22_61; +BB22_50: + mov.f64 %fd52, %fd53; + setp.eq.f64 %p57, %fd1, 0d0000000000000000; + @%p57 bra BB22_53; + bra.uni BB22_51; -BB22_63: - selp.b32 %r36, %r8, 0, %p53; +BB22_53: + selp.b32 %r36, %r4, 0, %p55; or.b32 %r37, %r36, 2146435072; - setp.lt.s32 %p59, %r9, 0; - selp.b32 %r38, %r37, %r36, %p59; + setp.lt.s32 %p61, %r5, 0; + selp.b32 %r38, %r37, %r36, %p61; mov.u32 %r39, 0; - mov.b64 %fd53, {%r39, %r38}; - bra.uni BB22_64; + mov.b64 %fd52, {%r39, %r38}; + bra.uni BB22_54; -BB22_35: - setp.gt.s32 %p11, %r13, 15; - @%p11 bra BB22_39; +BB22_27: + setp.gt.s32 %p11, %r9, 15; + @%p11 bra BB22_31; - setp.eq.s32 %p15, %r13, 14; - @%p15 bra BB22_52; - bra.uni BB22_37; + setp.eq.s32 %p15, %r9, 14; + @%p15 bra BB22_42; + bra.uni BB22_29; -BB22_52: +BB22_42: cvt.rni.s64.f64 %rd11, %fd1; cvt.rni.s64.f64 %rd12, %fd2; cvt.u32.u64 %r25, %rd11; cvt.u32.u64 %r26, %rd12; or.b32 %r27, %r26, %r25; - setp.eq.s32 %p45, %r27, 0; - selp.f64 %fd55, 0d0000000000000000, 0d3FF0000000000000, %p45; - bra.uni BB22_76; + setp.eq.s32 %p47, %r27, 0; + selp.f64 %fd54, 0d0000000000000000, 0d3FF0000000000000, %p47; + bra.uni BB22_64; -BB22_15: - setp.eq.s32 %p31, %r13, 2; - @%p31 bra BB22_74; - bra.uni BB22_16; +BB22_7: + setp.eq.s32 %p31, %r9, 2; + @%p31 bra BB22_62; + bra.uni BB22_8; -BB22_74: - mul.f64 %fd55, %fd1, %fd2; - bra.uni BB22_76; +BB22_62: + mul.f64 %fd54, %fd1, %fd2; + bra.uni BB22_64; -BB22_31: - setp.eq.s32 %p18, %r13, 11; - @%p18 bra BB22_54; +BB22_23: + setp.eq.s32 %p18, %r9, 11; + @%p18 bra BB22_44; - setp.eq.s32 %p19, %r13, 12; - @%p19 bra BB22_53; - bra.uni BB22_33; + setp.eq.s32 %p19, %r9, 12; + @%p19 bra BB22_43; + bra.uni BB22_25; -BB22_53: - max.f64 %fd55, %fd1, %fd2; - bra.uni BB22_76; +BB22_43: + max.f64 %fd54, %fd1, %fd2; + bra.uni BB22_64; -BB22_22: - setp.eq.s32 %p25, %r13, 6; - @%p25 bra BB22_57; +BB22_14: + setp.eq.s32 %p25, %r9, 6; + @%p25 bra BB22_47; - setp.eq.s32 %p26, %r13, 7; - @%p26 bra BB22_56; - bra.uni BB22_24; + setp.eq.s32 %p26, %r9, 7; + @%p26 bra BB22_46; + bra.uni BB22_16; -BB22_56: - setp.gt.f64 %p50, %fd1, %fd2; - selp.f64 %fd55, 0d3FF0000000000000, 0d0000000000000000, %p50; - bra.uni BB22_76; +BB22_46: + setp.gt.f64 %p52, %fd1, %fd2; + selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p52; + bra.uni BB22_64; -BB22_39: - setp.eq.s32 %p12, %r13, 16; - @%p12 bra BB22_51; +BB22_31: + setp.eq.s32 %p12, %r9, 16; + @%p12 bra BB22_41; - setp.eq.s32 %p13, %r13, 17; - @%p13 bra BB22_46; - bra.uni BB22_41; + setp.eq.s32 %p13, %r9, 17; + @%p13 bra BB22_37; + bra.uni BB22_33; -BB22_46: - setp.eq.f64 %p38, %fd2, 0d0000000000000000; - setp.eq.f64 %p39, %fd2, 0d8000000000000000; - or.pred %p40, %p38, %p39; - mov.f64 %fd55, 0d7FF8000000000000; - @%p40 bra BB22_76; +BB22_37: + setp.eq.f64 %p39, %fd2, 0d0000000000000000; + setp.eq.f64 %p40, %fd2, 0d8000000000000000; + or.pred %p41, %p39, %p40; + mov.f64 %fd54, 0d7FF8000000000000; + @%p41 bra BB22_64; - div.rn.f64 %fd55, %fd1, %fd2; - abs.f64 %fd39, %fd55; - setp.gtu.f64 %p41, %fd39, 0d7FF0000000000000; - @%p41 bra BB22_76; + div.rn.f64 %fd54, %fd1, %fd2; + abs.f64 %fd39, %fd54; + setp.gtu.f64 %p42, %fd39, 0d7FF0000000000000; + @%p42 bra BB22_64; { .reg .b32 %temp; - mov.b64 {%temp, %r22}, %fd55; + mov.b64 {%r22, %temp}, %fd54; } - and.b32 %r23, %r22, 2147483647; - setp.ne.s32 %p42, %r23, 2146435072; - @%p42 bra BB22_50; - { .reg .b32 %temp; - mov.b64 {%r24, %temp}, %fd55; + mov.b64 {%temp, %r23}, %fd54; } - setp.eq.s32 %p43, %r24, 0; - @%p43 bra BB22_76; - -BB22_50: - cvt.rmi.f64.f64 %fd40, %fd55; + and.b32 %r24, %r23, 2147483647; + setp.ne.s32 %p43, %r24, 2146435072; + setp.ne.s32 %p44, %r22, 0; + or.pred %p45, %p43, %p44; + @!%p45 bra BB22_64; + bra.uni BB22_40; + +BB22_40: + cvt.rmi.f64.f64 %fd40, %fd54; mul.f64 %fd41, %fd2, %fd40; - sub.f64 %fd55, %fd1, %fd41; - bra.uni BB22_76; + sub.f64 %fd54, %fd1, %fd41; + bra.uni BB22_64; -BB22_13: - setp.eq.s32 %p34, %r13, 1; - @%p34 bra BB22_14; - bra.uni BB22_76; +BB22_5: + setp.eq.s32 %p34, %r9, 1; + @%p34 bra BB22_6; + bra.uni BB22_64; -BB22_14: - sub.f64 %fd55, %fd1, %fd2; - bra.uni BB22_76; +BB22_6: + sub.f64 %fd54, %fd1, %fd2; + bra.uni BB22_64; -BB22_29: - setp.eq.s32 %p22, %r13, 10; - @%p22 bra BB22_30; - bra.uni BB22_76; +BB22_21: + setp.eq.s32 %p22, %r9, 10; + @%p22 bra BB22_22; + bra.uni BB22_64; -BB22_30: - setp.neu.f64 %p47, %fd1, %fd2; - selp.f64 %fd55, 0d3FF0000000000000, 0d0000000000000000, %p47; - bra.uni BB22_76; +BB22_22: + setp.neu.f64 %p49, %fd1, %fd2; + selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p49; + bra.uni BB22_64; -BB22_20: - setp.eq.s32 %p29, %r13, 5; - @%p29 bra BB22_21; - bra.uni BB22_76; +BB22_12: + setp.eq.s32 %p29, %r9, 5; + @%p29 bra BB22_13; + bra.uni BB22_64; -BB22_21: - setp.lt.f64 %p52, %fd1, %fd2; - selp.f64 %fd55, 0d3FF0000000000000, 0d0000000000000000, %p52; - bra.uni BB22_76; +BB22_13: + setp.lt.f64 %p54, %fd1, %fd2; + selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p54; + bra.uni BB22_64; -BB22_37: - setp.eq.s32 %p16, %r13, 15; - @%p16 bra BB22_38; - bra.uni BB22_76; +BB22_29: + setp.eq.s32 %p16, %r9, 15; + @%p16 bra BB22_30; + bra.uni BB22_64; -BB22_38: +BB22_30: mul.f64 %fd43, %fd1, %fd2; mov.f64 %fd44, 0d3FF0000000000000; - sub.f64 %fd55, %fd44, %fd43; - bra.uni BB22_76; + sub.f64 %fd54, %fd44, %fd43; + bra.uni BB22_64; -BB22_16: - setp.eq.s32 %p32, %r13, 3; - @%p32 bra BB22_17; - bra.uni BB22_76; +BB22_8: + setp.eq.s32 %p32, %r9, 3; + @%p32 bra BB22_9; + bra.uni BB22_64; -BB22_17: - div.rn.f64 %fd55, %fd1, %fd2; - bra.uni BB22_76; +BB22_9: + div.rn.f64 %fd54, %fd1, %fd2; + bra.uni BB22_64; -BB22_54: - min.f64 %fd55, %fd1, %fd2; - bra.uni BB22_76; +BB22_44: + min.f64 %fd54, %fd1, %fd2; + bra.uni BB22_64; -BB22_33: - setp.eq.s32 %p20, %r13, 13; - @%p20 bra BB22_34; - bra.uni BB22_76; +BB22_25: + setp.eq.s32 %p20, %r9, 13; + @%p20 bra BB22_26; + bra.uni BB22_64; -BB22_34: +BB22_26: cvt.rni.s64.f64 %rd13, %fd1; cvt.rni.s64.f64 %rd14, %fd2; cvt.u32.u64 %r28, %rd13; cvt.u32.u64 %r29, %rd14; and.b32 %r30, %r29, %r28; - setp.eq.s32 %p46, %r30, 0; - selp.f64 %fd55, 0d0000000000000000, 0d3FF0000000000000, %p46; - bra.uni BB22_76; + setp.eq.s32 %p48, %r30, 0; + selp.f64 %fd54, 0d0000000000000000, 0d3FF0000000000000, %p48; + bra.uni BB22_64; -BB22_57: - setp.gtu.f64 %p51, %fd1, %fd2; - selp.f64 %fd55, 0d0000000000000000, 0d3FF0000000000000, %p51; - bra.uni BB22_76; +BB22_47: + setp.le.f64 %p53, %fd1, %fd2; + selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p53; + bra.uni BB22_64; -BB22_24: - setp.eq.s32 %p27, %r13, 8; - @%p27 bra BB22_25; - bra.uni BB22_76; +BB22_16: + setp.eq.s32 %p27, %r9, 8; + @%p27 bra BB22_17; + bra.uni BB22_64; -BB22_25: - setp.ltu.f64 %p49, %fd1, %fd2; - selp.f64 %fd55, 0d0000000000000000, 0d3FF0000000000000, %p49; - bra.uni BB22_76; +BB22_17: + setp.ge.f64 %p51, %fd1, %fd2; + selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p51; + bra.uni BB22_64; -BB22_51: - setp.neu.f64 %p44, %fd1, 0d0000000000000000; +BB22_41: + setp.neu.f64 %p46, %fd1, 0d0000000000000000; sub.f64 %fd42, %fd1, %fd2; - selp.f64 %fd55, %fd42, 0d0000000000000000, %p44; - bra.uni BB22_76; + selp.f64 %fd54, %fd42, 0d0000000000000000, %p46; + bra.uni BB22_64; -BB22_41: - setp.ne.s32 %p14, %r13, 18; - @%p14 bra BB22_76; +BB22_33: + setp.ne.s32 %p14, %r9, 18; + @%p14 bra BB22_64; - div.rn.f64 %fd55, %fd1, %fd2; - abs.f64 %fd37, %fd55; + div.rn.f64 %fd54, %fd1, %fd2; + abs.f64 %fd37, %fd54; setp.gtu.f64 %p35, %fd37, 0d7FF0000000000000; - @%p35 bra BB22_76; + @%p35 bra BB22_64; { .reg .b32 %temp; - mov.b64 {%temp, %r19}, %fd55; + mov.b64 {%r19, %temp}, %fd54; } - and.b32 %r20, %r19, 2147483647; - setp.ne.s32 %p36, %r20, 2146435072; - @%p36 bra BB22_45; - { .reg .b32 %temp; - mov.b64 {%r21, %temp}, %fd55; + mov.b64 {%temp, %r20}, %fd54; } - setp.eq.s32 %p37, %r21, 0; - @%p37 bra BB22_76; + and.b32 %r21, %r20, 2147483647; + setp.ne.s32 %p36, %r21, 2146435072; + setp.ne.s32 %p37, %r19, 0; + or.pred %p38, %p36, %p37; + @!%p38 bra BB22_64; + bra.uni BB22_36; -BB22_45: - cvt.rmi.f64.f64 %fd55, %fd55; - bra.uni BB22_76; +BB22_36: + cvt.rmi.f64.f64 %fd54, %fd54; + bra.uni BB22_64; -BB22_61: - setp.gt.s32 %p56, %r8, -1; - @%p56 bra BB22_64; +BB22_51: + setp.gt.s32 %p58, %r4, -1; + @%p58 bra BB22_54; cvt.rzi.f64.f64 %fd45, %fd2; - setp.neu.f64 %p57, %fd45, %fd2; - selp.f64 %fd53, 0dFFF8000000000000, %fd53, %p57; + setp.neu.f64 %p59, %fd45, %fd2; + selp.f64 %fd52, 0dFFF8000000000000, %fd52, %p59; -BB22_64: - mov.f64 %fd25, %fd53; +BB22_54: + mov.f64 %fd25, %fd52; add.f64 %fd26, %fd1, %fd2; { .reg .b32 %temp; mov.b64 {%temp, %r40}, %fd26; } and.b32 %r41, %r40, 2146435072; - setp.ne.s32 %p60, %r41, 2146435072; - mov.f64 %fd52, %fd25; - @%p60 bra BB22_73; + setp.ne.s32 %p62, %r41, 2146435072; + mov.f64 %fd51, %fd25; + @%p62 bra BB22_61; - setp.gtu.f64 %p61, %fd19, 0d7FF0000000000000; - mov.f64 %fd52, %fd26; - @%p61 bra BB22_73; - - abs.f64 %fd46, %fd2; - setp.gtu.f64 %p62, %fd46, 0d7FF0000000000000; + setp.gtu.f64 %p63, %fd19, 0d7FF0000000000000; mov.f64 %fd51, %fd26; - mov.f64 %fd52, %fd51; - @%p62 bra BB22_73; + @%p63 bra BB22_61; - and.b32 %r42, %r9, 2147483647; - setp.ne.s32 %p63, %r42, 2146435072; - @%p63 bra BB22_69; + abs.f64 %fd46, %fd2; + setp.gtu.f64 %p64, %fd46, 0d7FF0000000000000; + mov.f64 %fd50, %fd26; + mov.f64 %fd51, %fd50; + @%p64 bra BB22_61; { .reg .b32 %temp; - mov.b64 {%r43, %temp}, %fd2; + mov.b64 {%r42, %temp}, %fd2; } - setp.eq.s32 %p64, %r43, 0; - @%p64 bra BB22_72; + and.b32 %r43, %r5, 2147483647; + setp.eq.s32 %p65, %r43, 2146435072; + setp.eq.s32 %p66, %r42, 0; + and.pred %p67, %p65, %p66; + @%p67 bra BB22_60; + bra.uni BB22_58; -BB22_69: - and.b32 %r44, %r8, 2147483647; - setp.ne.s32 %p65, %r44, 2146435072; - mov.f64 %fd49, %fd25; - mov.f64 %fd52, %fd49; - @%p65 bra BB22_73; +BB22_60: + setp.gt.f64 %p71, %fd19, 0d3FF0000000000000; + selp.b32 %r51, 2146435072, 0, %p71; + xor.b32 %r52, %r51, 2146435072; + setp.lt.s32 %p72, %r5, 0; + selp.b32 %r53, %r52, %r51, %p72; + setp.eq.f64 %p73, %fd1, 0dBFF0000000000000; + selp.b32 %r54, 1072693248, %r53, %p73; + mov.u32 %r55, 0; + mov.b64 %fd51, {%r55, %r54}; + bra.uni BB22_61; +BB22_58: { .reg .b32 %temp; - mov.b64 {%r45, %temp}, %fd1; + mov.b64 {%r44, %temp}, %fd1; } - setp.ne.s32 %p66, %r45, 0; - mov.f64 %fd52, %fd25; - @%p66 bra BB22_73; + and.b32 %r45, %r4, 2147483647; + setp.eq.s32 %p68, %r45, 2146435072; + setp.eq.s32 %p69, %r44, 0; + and.pred %p70, %p68, %p69; + mov.f64 %fd51, %fd25; + @!%p70 bra BB22_61; + bra.uni BB22_59; - shr.s32 %r46, %r9, 31; +BB22_59: + shr.s32 %r46, %r5, 31; and.b32 %r47, %r46, -2146435072; - add.s32 %r48, %r47, 2146435072; - or.b32 %r49, %r48, -2147483648; - selp.b32 %r50, %r49, %r48, %p1; - mov.u32 %r51, 0; - mov.b64 %fd52, {%r51, %r50}; - bra.uni BB22_73; - -BB22_72: - setp.gt.f64 %p67, %fd19, 0d3FF0000000000000; - selp.b32 %r52, 2146435072, 0, %p67; - xor.b32 %r53, %r52, 2146435072; - setp.lt.s32 %p68, %r9, 0; - selp.b32 %r54, %r53, %r52, %p68; - setp.eq.f64 %p69, %fd1, 0dBFF0000000000000; - selp.b32 %r55, 1072693248, %r54, %p69; - mov.u32 %r56, 0; - mov.b64 %fd52, {%r56, %r55}; - -BB22_73: - setp.eq.f64 %p70, %fd2, 0d0000000000000000; - setp.eq.f64 %p71, %fd1, 0d3FF0000000000000; - or.pred %p72, %p71, %p70; - selp.f64 %fd55, 0d3FF0000000000000, %fd52, %p72; + selp.b32 %r48, -1048576, 2146435072, %p1; + add.s32 %r49, %r48, %r47; + mov.u32 %r50, 0; + mov.b64 %fd51, {%r50, %r49}; -BB22_76: +BB22_61: + setp.eq.f64 %p74, %fd2, 0d0000000000000000; + setp.eq.f64 %p75, %fd1, 0d3FF0000000000000; + or.pred %p76, %p75, %p74; + selp.f64 %fd54, 0d3FF0000000000000, %fd51, %p76; + +BB22_64: cvta.to.global.u64 %rd16, %rd4; mul.wide.s32 %rd17, %r3, 8; add.s64 %rd18, %rd16, %rd17; - st.global.f64 [%rd18], %fd55; + st.global.f64 [%rd18], %fd54; bar.sync 0; -BB22_77: +BB22_65: ret; } @@ -1769,114 +1744,91 @@ BB22_77: { .reg .pred %p<76>; .reg .f32 %f<134>; - .reg .b32 %r<51>; + .reg .b32 %r<42>; .reg .b64 %rd<17>; ld.param.u64 %rd1, [matrix_matrix_cellwise_op_f_param_0]; ld.param.u64 %rd2, [matrix_matrix_cellwise_op_f_param_1]; ld.param.u64 %rd3, [matrix_matrix_cellwise_op_f_param_2]; - ld.param.u32 %r12, [matrix_matrix_cellwise_op_f_param_3]; - ld.param.u32 %r8, [matrix_matrix_cellwise_op_f_param_4]; - ld.param.u32 %r9, [matrix_matrix_cellwise_op_f_param_5]; - ld.param.u32 %r10, [matrix_matrix_cellwise_op_f_param_6]; - ld.param.u32 %r11, [matrix_matrix_cellwise_op_f_param_7]; - mov.u32 %r13, %ntid.x; - mov.u32 %r14, %ctaid.x; - mov.u32 %r15, %tid.x; - mad.lo.s32 %r16, %r13, %r14, %r15; - div.s32 %r1, %r16, %r8; - rem.s32 %r2, %r16, %r8; - setp.lt.s32 %p2, %r1, %r12; - setp.gt.s32 %p3, %r8, -1; + ld.param.u32 %r8, [matrix_matrix_cellwise_op_f_param_3]; + ld.param.u32 %r4, [matrix_matrix_cellwise_op_f_param_4]; + ld.param.u32 %r5, [matrix_matrix_cellwise_op_f_param_5]; + ld.param.u32 %r6, [matrix_matrix_cellwise_op_f_param_6]; + ld.param.u32 %r7, [matrix_matrix_cellwise_op_f_param_7]; + mov.u32 %r9, %ntid.x; + mov.u32 %r10, %ctaid.x; + mov.u32 %r11, %tid.x; + mad.lo.s32 %r1, %r9, %r10, %r11; + div.s32 %r2, %r1, %r4; + setp.lt.s32 %p2, %r2, %r8; + setp.gt.s32 %p3, %r4, -1; and.pred %p4, %p2, %p3; - @!%p4 bra BB23_71; + @!%p4 bra BB23_63; bra.uni BB23_1; BB23_1: - mad.lo.s32 %r3, %r1, %r8, %r2; - setp.eq.s32 %p5, %r9, 1; - mov.u32 %r49, %r1; - @%p5 bra BB23_5; - - setp.ne.s32 %p6, %r9, 2; - mov.u32 %r50, %r3; - @%p6 bra BB23_4; - - mov.u32 %r50, %r2; - -BB23_4: - mov.u32 %r44, %r50; - mov.u32 %r4, %r44; - mov.u32 %r49, %r4; - -BB23_5: - mov.u32 %r5, %r49; - setp.eq.s32 %p7, %r10, 1; - mov.u32 %r47, %r1; - @%p7 bra BB23_9; - - setp.ne.s32 %p8, %r10, 2; - mov.u32 %r48, %r3; - @%p8 bra BB23_8; - - mov.u32 %r48, %r2; - -BB23_8: - mov.u32 %r47, %r48; - -BB23_9: - cvta.to.global.u64 %rd4, %rd2; - cvta.to.global.u64 %rd5, %rd1; - mul.wide.s32 %rd6, %r5, 4; - add.s64 %rd7, %rd5, %rd6; - ld.global.f32 %f1, [%rd7]; - mul.wide.s32 %rd8, %r47, 4; - add.s64 %rd9, %rd4, %rd8; + rem.s32 %r12, %r1, %r4; + cvta.to.global.u64 %rd4, %rd1; + mad.lo.s32 %r3, %r2, %r4, %r12; + setp.eq.s32 %p5, %r5, 2; + selp.b32 %r13, %r12, %r3, %p5; + setp.eq.s32 %p6, %r5, 1; + selp.b32 %r14, %r2, %r13, %p6; + setp.eq.s32 %p7, %r6, 2; + selp.b32 %r15, %r12, %r3, %p7; + setp.eq.s32 %p8, %r6, 1; + selp.b32 %r16, %r2, %r15, %p8; + mul.wide.s32 %rd5, %r14, 4; + add.s64 %rd6, %rd4, %rd5; + ld.global.f32 %f1, [%rd6]; + cvta.to.global.u64 %rd7, %rd2; + mul.wide.s32 %rd8, %r16, 4; + add.s64 %rd9, %rd7, %rd8; ld.global.f32 %f2, [%rd9]; mov.f32 %f133, 0f7F7FFFFF; - setp.gt.s32 %p9, %r11, 8; - @%p9 bra BB23_26; + setp.gt.s32 %p9, %r7, 8; + @%p9 bra BB23_18; - setp.gt.s32 %p23, %r11, 3; - @%p23 bra BB23_18; + setp.gt.s32 %p23, %r7, 3; + @%p23 bra BB23_10; - setp.gt.s32 %p30, %r11, 1; - @%p30 bra BB23_15; + setp.gt.s32 %p30, %r7, 1; + @%p30 bra BB23_7; - setp.eq.s32 %p33, %r11, 0; - @%p33 bra BB23_69; - bra.uni BB23_13; + setp.eq.s32 %p33, %r7, 0; + @%p33 bra BB23_61; + bra.uni BB23_5; -BB23_69: +BB23_61: add.f32 %f133, %f1, %f2; - bra.uni BB23_70; + bra.uni BB23_62; -BB23_26: - setp.gt.s32 %p10, %r11, 13; - @%p10 bra BB23_35; +BB23_18: + setp.gt.s32 %p10, %r7, 13; + @%p10 bra BB23_27; - setp.gt.s32 %p17, %r11, 10; - @%p17 bra BB23_31; + setp.gt.s32 %p17, %r7, 10; + @%p17 bra BB23_23; - setp.eq.s32 %p21, %r11, 9; - @%p21 bra BB23_51; - bra.uni BB23_29; + setp.eq.s32 %p21, %r7, 9; + @%p21 bra BB23_43; + bra.uni BB23_21; -BB23_51: +BB23_43: setp.eq.f32 %p44, %f1, %f2; selp.f32 %f133, 0f3F800000, 0f00000000, %p44; - bra.uni BB23_70; + bra.uni BB23_62; -BB23_18: - setp.gt.s32 %p24, %r11, 5; - @%p24 bra BB23_22; +BB23_10: + setp.gt.s32 %p24, %r7, 5; + @%p24 bra BB23_14; - setp.eq.s32 %p28, %r11, 4; - @%p28 bra BB23_54; - bra.uni BB23_20; + setp.eq.s32 %p28, %r7, 4; + @%p28 bra BB23_46; + bra.uni BB23_12; -BB23_54: +BB23_46: mul.f32 %f53, %f2, 0f3F000000; cvt.rzi.f32.f32 %f54, %f53; fma.rn.f32 %f55, %f54, 0fC0000000, %f2; @@ -1976,11 +1928,11 @@ BB23_54: setp.gt.f32 %p54, %f115, 0f42D20000; selp.f32 %f131, 0f7F800000, %f125, %p54; setp.eq.f32 %p55, %f131, 0f7F800000; - @%p55 bra BB23_56; + @%p55 bra BB23_48; fma.rn.f32 %f131, %f131, %f22, %f131; -BB23_56: +BB23_48: setp.lt.f32 %p56, %f1, 0f00000000; setp.eq.f32 %p57, %f19, 0f3F800000; and.pred %p1, %p56, %p57; @@ -1989,10 +1941,10 @@ BB23_56: mov.b32 %f126, %r30; selp.f32 %f132, %f126, %f131, %p1; setp.eq.f32 %p58, %f1, 0f00000000; - @%p58 bra BB23_59; - bra.uni BB23_57; + @%p58 bra BB23_51; + bra.uni BB23_49; -BB23_59: +BB23_51: add.f32 %f128, %f1, %f1; mov.b32 %r31, %f128; selp.b32 %r32, %r31, 0, %p57; @@ -2000,17 +1952,17 @@ BB23_59: setp.lt.f32 %p62, %f2, 0f00000000; selp.b32 %r34, %r33, %r32, %p62; mov.b32 %f132, %r34; - bra.uni BB23_60; + bra.uni BB23_52; -BB23_35: - setp.gt.s32 %p11, %r11, 15; - @%p11 bra BB23_39; +BB23_27: + setp.gt.s32 %p11, %r7, 15; + @%p11 bra BB23_31; - setp.eq.s32 %p15, %r11, 14; - @%p15 bra BB23_48; - bra.uni BB23_37; + setp.eq.s32 %p15, %r7, 14; + @%p15 bra BB23_40; + bra.uni BB23_29; -BB23_48: +BB23_40: cvt.rni.s64.f32 %rd10, %f1; cvt.rni.s64.f32 %rd11, %f2; cvt.u32.u64 %r17, %rd10; @@ -2018,126 +1970,126 @@ BB23_48: or.b32 %r19, %r18, %r17; setp.eq.s32 %p41, %r19, 0; selp.f32 %f133, 0f00000000, 0f3F800000, %p41; - bra.uni BB23_70; + bra.uni BB23_62; -BB23_15: - setp.eq.s32 %p31, %r11, 2; - @%p31 bra BB23_68; - bra.uni BB23_16; +BB23_7: + setp.eq.s32 %p31, %r7, 2; + @%p31 bra BB23_60; + bra.uni BB23_8; -BB23_68: +BB23_60: mul.f32 %f133, %f1, %f2; - bra.uni BB23_70; + bra.uni BB23_62; -BB23_31: - setp.eq.s32 %p18, %r11, 11; - @%p18 bra BB23_50; +BB23_23: + setp.eq.s32 %p18, %r7, 11; + @%p18 bra BB23_42; - setp.eq.s32 %p19, %r11, 12; - @%p19 bra BB23_49; - bra.uni BB23_33; + setp.eq.s32 %p19, %r7, 12; + @%p19 bra BB23_41; + bra.uni BB23_25; -BB23_49: +BB23_41: max.f32 %f133, %f1, %f2; - bra.uni BB23_70; + bra.uni BB23_62; -BB23_22: - setp.eq.s32 %p25, %r11, 6; - @%p25 bra BB23_53; +BB23_14: + setp.eq.s32 %p25, %r7, 6; + @%p25 bra BB23_45; - setp.eq.s32 %p26, %r11, 7; - @%p26 bra BB23_52; - bra.uni BB23_24; + setp.eq.s32 %p26, %r7, 7; + @%p26 bra BB23_44; + bra.uni BB23_16; -BB23_52: +BB23_44: setp.gt.f32 %p46, %f1, %f2; selp.f32 %f133, 0f3F800000, 0f00000000, %p46; - bra.uni BB23_70; + bra.uni BB23_62; -BB23_39: - setp.eq.s32 %p12, %r11, 16; - @%p12 bra BB23_47; +BB23_31: + setp.eq.s32 %p12, %r7, 16; + @%p12 bra BB23_39; - setp.eq.s32 %p13, %r11, 17; - @%p13 bra BB23_44; - bra.uni BB23_41; + setp.eq.s32 %p13, %r7, 17; + @%p13 bra BB23_36; + bra.uni BB23_33; -BB23_44: +BB23_36: setp.eq.f32 %p36, %f2, 0f00000000; setp.eq.f32 %p37, %f2, 0f80000000; or.pred %p38, %p36, %p37; mov.f32 %f133, 0f7FC00000; - @%p38 bra BB23_70; + @%p38 bra BB23_62; div.rn.f32 %f133, %f1, %f2; abs.f32 %f43, %f133; setp.geu.f32 %p39, %f43, 0f7F800000; - @%p39 bra BB23_70; + @%p39 bra BB23_62; cvt.rmi.f32.f32 %f44, %f133; mul.f32 %f45, %f2, %f44; sub.f32 %f133, %f1, %f45; - bra.uni BB23_70; + bra.uni BB23_62; -BB23_13: - setp.eq.s32 %p34, %r11, 1; - @%p34 bra BB23_14; - bra.uni BB23_70; +BB23_5: + setp.eq.s32 %p34, %r7, 1; + @%p34 bra BB23_6; + bra.uni BB23_62; -BB23_14: +BB23_6: sub.f32 %f133, %f1, %f2; - bra.uni BB23_70; + bra.uni BB23_62; -BB23_29: - setp.eq.s32 %p22, %r11, 10; - @%p22 bra BB23_30; - bra.uni BB23_70; +BB23_21: + setp.eq.s32 %p22, %r7, 10; + @%p22 bra BB23_22; + bra.uni BB23_62; -BB23_30: +BB23_22: setp.neu.f32 %p43, %f1, %f2; selp.f32 %f133, 0f3F800000, 0f00000000, %p43; - bra.uni BB23_70; + bra.uni BB23_62; -BB23_20: - setp.eq.s32 %p29, %r11, 5; - @%p29 bra BB23_21; - bra.uni BB23_70; +BB23_12: + setp.eq.s32 %p29, %r7, 5; + @%p29 bra BB23_13; + bra.uni BB23_62; -BB23_21: +BB23_13: setp.lt.f32 %p48, %f1, %f2; selp.f32 %f133, 0f3F800000, 0f00000000, %p48; - bra.uni BB23_70; + bra.uni BB23_62; -BB23_37: - setp.eq.s32 %p16, %r11, 15; - @%p16 bra BB23_38; - bra.uni BB23_70; +BB23_29: + setp.eq.s32 %p16, %r7, 15; + @%p16 bra BB23_30; + bra.uni BB23_62; -BB23_38: +BB23_30: mul.f32 %f47, %f1, %f2; mov.f32 %f48, 0f3F800000; sub.f32 %f133, %f48, %f47; - bra.uni BB23_70; + bra.uni BB23_62; -BB23_16: - setp.eq.s32 %p32, %r11, 3; - @%p32 bra BB23_17; - bra.uni BB23_70; +BB23_8: + setp.eq.s32 %p32, %r7, 3; + @%p32 bra BB23_9; + bra.uni BB23_62; -BB23_17: +BB23_9: div.rn.f32 %f133, %f1, %f2; - bra.uni BB23_70; + bra.uni BB23_62; -BB23_50: +BB23_42: min.f32 %f133, %f1, %f2; - bra.uni BB23_70; + bra.uni BB23_62; -BB23_33: - setp.eq.s32 %p20, %r11, 13; - @%p20 bra BB23_34; - bra.uni BB23_70; +BB23_25: + setp.eq.s32 %p20, %r7, 13; + @%p20 bra BB23_26; + bra.uni BB23_62; -BB23_34: +BB23_26: cvt.rni.s64.f32 %rd12, %f1; cvt.rni.s64.f32 %rd13, %f2; cvt.u32.u64 %r20, %rd12; @@ -2145,71 +2097,71 @@ BB23_34: and.b32 %r22, %r21, %r20; setp.eq.s32 %p42, %r22, 0; selp.f32 %f133, 0f00000000, 0f3F800000, %p42; - bra.uni BB23_70; + bra.uni BB23_62; -BB23_53: - setp.gtu.f32 %p47, %f1, %f2; - selp.f32 %f133, 0f00000000, 0f3F800000, %p47; - bra.uni BB23_70; +BB23_45: + setp.le.f32 %p47, %f1, %f2; + selp.f32 %f133, 0f3F800000, 0f00000000, %p47; + bra.uni BB23_62; -BB23_24: - setp.eq.s32 %p27, %r11, 8; - @%p27 bra BB23_25; - bra.uni BB23_70; +BB23_16: + setp.eq.s32 %p27, %r7, 8; + @%p27 bra BB23_17; + bra.uni BB23_62; -BB23_25: - setp.ltu.f32 %p45, %f1, %f2; - selp.f32 %f133, 0f00000000, 0f3F800000, %p45; - bra.uni BB23_70; +BB23_17: + setp.ge.f32 %p45, %f1, %f2; + selp.f32 %f133, 0f3F800000, 0f00000000, %p45; + bra.uni BB23_62; -BB23_47: +BB23_39: setp.neu.f32 %p40, %f1, 0f00000000; sub.f32 %f46, %f1, %f2; selp.f32 %f133, %f46, 0f00000000, %p40; - bra.uni BB23_70; + bra.uni BB23_62; -BB23_41: - setp.ne.s32 %p14, %r11, 18; - @%p14 bra BB23_70; +BB23_33: + setp.ne.s32 %p14, %r7, 18; + @%p14 bra BB23_62; div.rn.f32 %f133, %f1, %f2; abs.f32 %f41, %f133; setp.geu.f32 %p35, %f41, 0f7F800000; - @%p35 bra BB23_70; + @%p35 bra BB23_62; cvt.rmi.f32.f32 %f133, %f133; - bra.uni BB23_70; + bra.uni BB23_62; -BB23_57: +BB23_49: setp.geu.f32 %p59, %f1, 0f00000000; - @%p59 bra BB23_60; + @%p59 bra BB23_52; cvt.rzi.f32.f32 %f127, %f2; setp.neu.f32 %p60, %f127, %f2; selp.f32 %f132, 0f7FFFFFFF, %f132, %p60; -BB23_60: +BB23_52: add.f32 %f129, %f20, %f21; mov.b32 %r35, %f129; setp.lt.s32 %p63, %r35, 2139095040; - @%p63 bra BB23_67; + @%p63 bra BB23_59; setp.gtu.f32 %p64, %f20, 0f7F800000; setp.gtu.f32 %p65, %f21, 0f7F800000; or.pred %p66, %p64, %p65; - @%p66 bra BB23_66; - bra.uni BB23_62; + @%p66 bra BB23_58; + bra.uni BB23_54; -BB23_66: +BB23_58: add.f32 %f132, %f1, %f2; - bra.uni BB23_67; + bra.uni BB23_59; -BB23_62: +BB23_54: setp.eq.f32 %p67, %f21, 0f7F800000; - @%p67 bra BB23_65; - bra.uni BB23_63; + @%p67 bra BB23_57; + bra.uni BB23_55; -BB23_65: +BB23_57: setp.gt.f32 %p70, %f20, 0f3F800000; selp.b32 %r39, 2139095040, 0, %p70; xor.b32 %r40, %r39, 2139095040; @@ -2218,32 +2170,32 @@ BB23_65: mov.b32 %f130, %r41; setp.eq.f32 %p72, %f1, 0fBF800000; selp.f32 %f132, 0f3F800000, %f130, %p72; - bra.uni BB23_67; + bra.uni BB23_59; -BB23_63: +BB23_55: setp.neu.f32 %p68, %f20, 0f7F800000; - @%p68 bra BB23_67; + @%p68 bra BB23_59; - setp.ltu.f32 %p69, %f2, 0f00000000; - selp.b32 %r36, 0, 2139095040, %p69; + setp.ge.f32 %p69, %f2, 0f00000000; + selp.b32 %r36, 2139095040, 0, %p69; or.b32 %r37, %r36, -2147483648; selp.b32 %r38, %r37, %r36, %p1; mov.b32 %f132, %r38; -BB23_67: +BB23_59: setp.eq.f32 %p73, %f2, 0f00000000; setp.eq.f32 %p74, %f1, 0f3F800000; or.pred %p75, %p74, %p73; selp.f32 %f133, 0f3F800000, %f132, %p75; -BB23_70: +BB23_62: cvta.to.global.u64 %rd14, %rd3; mul.wide.s32 %rd15, %r3, 4; add.s64 %rd16, %rd14, %rd15; st.global.f32 [%rd16], %f133; bar.sync 0; -BB23_71: +BB23_63: ret; } @@ -2257,9 +2209,9 @@ BB23_71: .param .u32 matrix_scalar_op_d_param_5 ) { - .reg .pred %p<133>; - .reg .b32 %r<88>; - .reg .f64 %fd<109>; + .reg .pred %p<141>; + .reg .b32 %r<86>; + .reg .f64 %fd<107>; .reg .b64 %rd<20>; @@ -2274,7 +2226,7 @@ BB23_71: mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; setp.ge.s32 %p3, %r1, %r8; - @%p3 bra BB24_138; + @%p3 bra BB24_130; cvta.to.global.u64 %rd6, %rd5; cvta.to.global.u64 %rd7, %rd4; @@ -2283,9 +2235,9 @@ BB23_71: ld.global.f64 %fd1, [%rd9]; add.s64 %rd1, %rd6, %rd8; setp.eq.s32 %p4, %r7, 0; - @%p4 bra BB24_70; + @%p4 bra BB24_66; - mov.f64 %fd99, 0d7FEFFFFFFFFFFFFF; + mov.f64 %fd98, 0d7FEFFFFFFFFFFFFF; setp.gt.s32 %p5, %r6, 8; @%p5 bra BB24_19; @@ -2296,31 +2248,31 @@ BB23_71: @%p26 bra BB24_8; setp.eq.s32 %p29, %r6, 0; - @%p29 bra BB24_68; + @%p29 bra BB24_64; bra.uni BB24_6; -BB24_68: - add.f64 %fd99, %fd1, %fd68; - bra.uni BB24_69; +BB24_64: + add.f64 %fd98, %fd1, %fd68; + bra.uni BB24_65; -BB24_70: - mov.f64 %fd108, 0d7FEFFFFFFFFFFFFF; - setp.gt.s32 %p69, %r6, 8; - @%p69 bra BB24_87; +BB24_66: + mov.f64 %fd106, 0d7FEFFFFFFFFFFFFF; + setp.gt.s32 %p73, %r6, 8; + @%p73 bra BB24_83; - setp.gt.s32 %p83, %r6, 3; - @%p83 bra BB24_79; + setp.gt.s32 %p87, %r6, 3; + @%p87 bra BB24_75; - setp.gt.s32 %p90, %r6, 1; - @%p90 bra BB24_76; + setp.gt.s32 %p94, %r6, 1; + @%p94 bra BB24_72; - setp.eq.s32 %p93, %r6, 0; - @%p93 bra BB24_136; - bra.uni BB24_74; + setp.eq.s32 %p97, %r6, 0; + @%p97 bra BB24_128; + bra.uni BB24_70; -BB24_136: - add.f64 %fd108, %fd1, %fd68; - bra.uni BB24_137; +BB24_128: + add.f64 %fd106, %fd1, %fd68; + bra.uni BB24_129; BB24_19: setp.gt.s32 %p6, %r6, 13; @@ -2330,39 +2282,39 @@ BB24_19: @%p13 bra BB24_24; setp.eq.s32 %p17, %r6, 9; - @%p17 bra BB24_48; + @%p17 bra BB24_46; bra.uni BB24_22; -BB24_48: - setp.eq.f64 %p44, %fd1, %fd68; - selp.f64 %fd99, 0d3FF0000000000000, 0d0000000000000000, %p44; - bra.uni BB24_69; +BB24_46: + setp.eq.f64 %p46, %fd1, %fd68; + selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p46; + bra.uni BB24_65; -BB24_87: - setp.gt.s32 %p70, %r6, 13; - @%p70 bra BB24_96; +BB24_83: + setp.gt.s32 %p74, %r6, 13; + @%p74 bra BB24_92; - setp.gt.s32 %p77, %r6, 10; - @%p77 bra BB24_92; + setp.gt.s32 %p81, %r6, 10; + @%p81 bra BB24_88; - setp.eq.s32 %p81, %r6, 9; - @%p81 bra BB24_116; - bra.uni BB24_90; + setp.eq.s32 %p85, %r6, 9; + @%p85 bra BB24_110; + bra.uni BB24_86; -BB24_116: - setp.eq.f64 %p108, %fd1, %fd68; - selp.f64 %fd108, 0d3FF0000000000000, 0d0000000000000000, %p108; - bra.uni BB24_137; +BB24_110: + setp.eq.f64 %p114, %fd1, %fd68; + selp.f64 %fd106, 0d3FF0000000000000, 0d0000000000000000, %p114; + bra.uni BB24_129; BB24_11: setp.gt.s32 %p20, %r6, 5; @%p20 bra BB24_15; setp.eq.s32 %p24, %r6, 4; - @%p24 bra BB24_51; + @%p24 bra BB24_49; bra.uni BB24_13; -BB24_51: +BB24_49: { .reg .b32 %temp; mov.b64 {%temp, %r2}, %fd68; @@ -2375,7 +2327,7 @@ BB24_51: add.s32 %r25, %r24, -1012; mov.b64 %rd14, %fd1; shl.b64 %rd2, %rd14, %r25; - setp.eq.s64 %p49, %rd2, -9223372036854775808; + setp.eq.s64 %p51, %rd2, -9223372036854775808; abs.f64 %fd18, %fd68; // Callseq Start 1 { @@ -2392,69 +2344,69 @@ BB24_51: param0, param1 ); - ld.param.f64 %fd98, [retval0+0]; + ld.param.f64 %fd97, [retval0+0]; //{ }// Callseq End 1 - setp.lt.s32 %p50, %r2, 0; - and.pred %p1, %p50, %p49; - @!%p1 bra BB24_53; - bra.uni BB24_52; + setp.lt.s32 %p52, %r2, 0; + and.pred %p1, %p52, %p51; + @!%p1 bra BB24_51; + bra.uni BB24_50; -BB24_52: +BB24_50: { .reg .b32 %temp; - mov.b64 {%temp, %r26}, %fd98; + mov.b64 {%temp, %r26}, %fd97; } xor.b32 %r27, %r26, -2147483648; { .reg .b32 %temp; - mov.b64 {%r28, %temp}, %fd98; + mov.b64 {%r28, %temp}, %fd97; } - mov.b64 %fd98, {%r28, %r27}; + mov.b64 %fd97, {%r28, %r27}; -BB24_53: - mov.f64 %fd97, %fd98; - setp.eq.f64 %p51, %fd68, 0d0000000000000000; - @%p51 bra BB24_56; - bra.uni BB24_54; +BB24_51: + mov.f64 %fd96, %fd97; + setp.eq.f64 %p53, %fd68, 0d0000000000000000; + @%p53 bra BB24_54; + bra.uni BB24_52; -BB24_56: - selp.b32 %r29, %r2, 0, %p49; +BB24_54: + selp.b32 %r29, %r2, 0, %p51; or.b32 %r30, %r29, 2146435072; - setp.lt.s32 %p55, %r3, 0; - selp.b32 %r31, %r30, %r29, %p55; + setp.lt.s32 %p57, %r3, 0; + selp.b32 %r31, %r30, %r29, %p57; mov.u32 %r32, 0; - mov.b64 %fd97, {%r32, %r31}; - bra.uni BB24_57; + mov.b64 %fd96, {%r32, %r31}; + bra.uni BB24_55; BB24_28: setp.gt.s32 %p7, %r6, 15; @%p7 bra BB24_32; setp.eq.s32 %p11, %r6, 14; - @%p11 bra BB24_45; + @%p11 bra BB24_43; bra.uni BB24_30; -BB24_45: +BB24_43: cvt.rni.s64.f64 %rd10, %fd68; cvt.rni.s64.f64 %rd11, %fd1; cvt.u32.u64 %r18, %rd10; cvt.u32.u64 %r19, %rd11; or.b32 %r20, %r19, %r18; - setp.eq.s32 %p41, %r20, 0; - selp.f64 %fd99, 0d0000000000000000, 0d3FF0000000000000, %p41; - bra.uni BB24_69; + setp.eq.s32 %p43, %r20, 0; + selp.f64 %fd98, 0d0000000000000000, 0d3FF0000000000000, %p43; + bra.uni BB24_65; -BB24_79: - setp.gt.s32 %p84, %r6, 5; - @%p84 bra BB24_83; +BB24_75: + setp.gt.s32 %p88, %r6, 5; + @%p88 bra BB24_79; - setp.eq.s32 %p88, %r6, 4; - @%p88 bra BB24_119; - bra.uni BB24_81; + setp.eq.s32 %p92, %r6, 4; + @%p92 bra BB24_113; + bra.uni BB24_77; -BB24_119: +BB24_113: { .reg .b32 %temp; mov.b64 {%temp, %r4}, %fd1; @@ -2463,11 +2415,11 @@ BB24_119: .reg .b32 %temp; mov.b64 {%temp, %r5}, %fd68; } - bfe.u32 %r62, %r5, 20, 11; - add.s32 %r63, %r62, -1012; + bfe.u32 %r61, %r5, 20, 11; + add.s32 %r62, %r61, -1012; mov.b64 %rd19, %fd68; - shl.b64 %rd3, %rd19, %r63; - setp.eq.s64 %p113, %rd3, -9223372036854775808; + shl.b64 %rd3, %rd19, %r62; + setp.eq.s64 %p119, %rd3, -9223372036854775808; abs.f64 %fd51, %fd1; // Callseq Start 2 { @@ -2484,267 +2436,267 @@ BB24_119: param0, param1 ); - ld.param.f64 %fd107, [retval0+0]; + ld.param.f64 %fd105, [retval0+0]; //{ }// Callseq End 2 - setp.lt.s32 %p114, %r4, 0; - and.pred %p2, %p114, %p113; - @!%p2 bra BB24_121; - bra.uni BB24_120; + setp.lt.s32 %p120, %r4, 0; + and.pred %p2, %p120, %p119; + @!%p2 bra BB24_115; + bra.uni BB24_114; -BB24_120: +BB24_114: { .reg .b32 %temp; - mov.b64 {%temp, %r64}, %fd107; + mov.b64 {%temp, %r63}, %fd105; } - xor.b32 %r65, %r64, -2147483648; + xor.b32 %r64, %r63, -2147483648; { .reg .b32 %temp; - mov.b64 {%r66, %temp}, %fd107; + mov.b64 {%r65, %temp}, %fd105; } - mov.b64 %fd107, {%r66, %r65}; + mov.b64 %fd105, {%r65, %r64}; -BB24_121: - mov.f64 %fd106, %fd107; - setp.eq.f64 %p115, %fd1, 0d0000000000000000; - @%p115 bra BB24_124; - bra.uni BB24_122; +BB24_115: + mov.f64 %fd104, %fd105; + setp.eq.f64 %p121, %fd1, 0d0000000000000000; + @%p121 bra BB24_118; + bra.uni BB24_116; -BB24_124: - selp.b32 %r67, %r4, 0, %p113; - or.b32 %r68, %r67, 2146435072; - setp.lt.s32 %p119, %r5, 0; - selp.b32 %r69, %r68, %r67, %p119; - mov.u32 %r70, 0; - mov.b64 %fd106, {%r70, %r69}; - bra.uni BB24_125; +BB24_118: + selp.b32 %r66, %r4, 0, %p119; + or.b32 %r67, %r66, 2146435072; + setp.lt.s32 %p125, %r5, 0; + selp.b32 %r68, %r67, %r66, %p125; + mov.u32 %r69, 0; + mov.b64 %fd104, {%r69, %r68}; + bra.uni BB24_119; -BB24_96: - setp.gt.s32 %p71, %r6, 15; - @%p71 bra BB24_100; +BB24_92: + setp.gt.s32 %p75, %r6, 15; + @%p75 bra BB24_96; - setp.eq.s32 %p75, %r6, 14; - @%p75 bra BB24_113; - bra.uni BB24_98; + setp.eq.s32 %p79, %r6, 14; + @%p79 bra BB24_107; + bra.uni BB24_94; -BB24_113: +BB24_107: cvt.rni.s64.f64 %rd15, %fd1; cvt.rni.s64.f64 %rd16, %fd68; - cvt.u32.u64 %r56, %rd15; - cvt.u32.u64 %r57, %rd16; - or.b32 %r58, %r57, %r56; - setp.eq.s32 %p105, %r58, 0; - selp.f64 %fd108, 0d0000000000000000, 0d3FF0000000000000, %p105; - bra.uni BB24_137; + cvt.u32.u64 %r55, %rd15; + cvt.u32.u64 %r56, %rd16; + or.b32 %r57, %r56, %r55; + setp.eq.s32 %p111, %r57, 0; + selp.f64 %fd106, 0d0000000000000000, 0d3FF0000000000000, %p111; + bra.uni BB24_129; BB24_8: setp.eq.s32 %p27, %r6, 2; - @%p27 bra BB24_67; + @%p27 bra BB24_63; bra.uni BB24_9; -BB24_67: - mul.f64 %fd99, %fd1, %fd68; - bra.uni BB24_69; +BB24_63: + mul.f64 %fd98, %fd1, %fd68; + bra.uni BB24_65; BB24_24: setp.eq.s32 %p14, %r6, 11; - @%p14 bra BB24_47; + @%p14 bra BB24_45; setp.eq.s32 %p15, %r6, 12; - @%p15 bra BB24_46; + @%p15 bra BB24_44; bra.uni BB24_26; -BB24_46: - max.f64 %fd99, %fd68, %fd1; - bra.uni BB24_69; +BB24_44: + max.f64 %fd98, %fd68, %fd1; + bra.uni BB24_65; BB24_15: setp.eq.s32 %p21, %r6, 6; - @%p21 bra BB24_50; + @%p21 bra BB24_48; setp.eq.s32 %p22, %r6, 7; - @%p22 bra BB24_49; + @%p22 bra BB24_47; bra.uni BB24_17; -BB24_49: - setp.lt.f64 %p46, %fd1, %fd68; - selp.f64 %fd99, 0d3FF0000000000000, 0d0000000000000000, %p46; - bra.uni BB24_69; +BB24_47: + setp.lt.f64 %p48, %fd1, %fd68; + selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p48; + bra.uni BB24_65; BB24_32: setp.eq.s32 %p8, %r6, 16; - @%p8 bra BB24_44; + @%p8 bra BB24_42; setp.eq.s32 %p9, %r6, 17; - @%p9 bra BB24_39; + @%p9 bra BB24_38; bra.uni BB24_34; -BB24_39: - setp.eq.f64 %p34, %fd1, 0d0000000000000000; - setp.eq.f64 %p35, %fd1, 0d8000000000000000; - or.pred %p36, %p34, %p35; - mov.f64 %fd99, 0d7FF8000000000000; - @%p36 bra BB24_69; +BB24_38: + setp.eq.f64 %p35, %fd1, 0d0000000000000000; + setp.eq.f64 %p36, %fd1, 0d8000000000000000; + or.pred %p37, %p35, %p36; + mov.f64 %fd98, 0d7FF8000000000000; + @%p37 bra BB24_65; - div.rn.f64 %fd99, %fd68, %fd1; - abs.f64 %fd72, %fd99; - setp.gtu.f64 %p37, %fd72, 0d7FF0000000000000; - @%p37 bra BB24_69; + div.rn.f64 %fd98, %fd68, %fd1; + abs.f64 %fd72, %fd98; + setp.gtu.f64 %p38, %fd72, 0d7FF0000000000000; + @%p38 bra BB24_65; { .reg .b32 %temp; - mov.b64 {%temp, %r15}, %fd99; + mov.b64 {%r15, %temp}, %fd98; } - and.b32 %r16, %r15, 2147483647; - setp.ne.s32 %p38, %r16, 2146435072; - @%p38 bra BB24_43; - { .reg .b32 %temp; - mov.b64 {%r17, %temp}, %fd99; + mov.b64 {%temp, %r16}, %fd98; } - setp.eq.s32 %p39, %r17, 0; - @%p39 bra BB24_69; - -BB24_43: - cvt.rmi.f64.f64 %fd73, %fd99; + and.b32 %r17, %r16, 2147483647; + setp.ne.s32 %p39, %r17, 2146435072; + setp.ne.s32 %p40, %r15, 0; + or.pred %p41, %p39, %p40; + @!%p41 bra BB24_65; + bra.uni BB24_41; + +BB24_41: + cvt.rmi.f64.f64 %fd73, %fd98; mul.f64 %fd74, %fd1, %fd73; - sub.f64 %fd99, %fd68, %fd74; - bra.uni BB24_69; + sub.f64 %fd98, %fd68, %fd74; + bra.uni BB24_65; -BB24_76: - setp.eq.s32 %p91, %r6, 2; - @%p91 bra BB24_135; - bra.uni BB24_77; +BB24_72: + setp.eq.s32 %p95, %r6, 2; + @%p95 bra BB24_127; + bra.uni BB24_73; -BB24_135: - mul.f64 %fd108, %fd1, %fd68; - bra.uni BB24_137; +BB24_127: + mul.f64 %fd106, %fd1, %fd68; + bra.uni BB24_129; -BB24_92: - setp.eq.s32 %p78, %r6, 11; - @%p78 bra BB24_115; +BB24_88: + setp.eq.s32 %p82, %r6, 11; + @%p82 bra BB24_109; - setp.eq.s32 %p79, %r6, 12; - @%p79 bra BB24_114; - bra.uni BB24_94; + setp.eq.s32 %p83, %r6, 12; + @%p83 bra BB24_108; + bra.uni BB24_90; -BB24_114: - max.f64 %fd108, %fd1, %fd68; - bra.uni BB24_137; +BB24_108: + max.f64 %fd106, %fd1, %fd68; + bra.uni BB24_129; -BB24_83: - setp.eq.s32 %p85, %r6, 6; - @%p85 bra BB24_118; +BB24_79: + setp.eq.s32 %p89, %r6, 6; + @%p89 bra BB24_112; - setp.eq.s32 %p86, %r6, 7; - @%p86 bra BB24_117; - bra.uni BB24_85; + setp.eq.s32 %p90, %r6, 7; + @%p90 bra BB24_111; + bra.uni BB24_81; -BB24_117: - setp.gt.f64 %p110, %fd1, %fd68; - selp.f64 %fd108, 0d3FF0000000000000, 0d0000000000000000, %p110; - bra.uni BB24_137; +BB24_111: + setp.gt.f64 %p116, %fd1, %fd68; + selp.f64 %fd106, 0d3FF0000000000000, 0d0000000000000000, %p116; + bra.uni BB24_129; -BB24_100: - setp.eq.s32 %p72, %r6, 16; - @%p72 bra BB24_112; +BB24_96: + setp.eq.s32 %p76, %r6, 16; + @%p76 bra BB24_106; - setp.eq.s32 %p73, %r6, 17; - @%p73 bra BB24_107; - bra.uni BB24_102; + setp.eq.s32 %p77, %r6, 17; + @%p77 bra BB24_102; + bra.uni BB24_98; -BB24_107: - setp.eq.f64 %p98, %fd68, 0d0000000000000000; - setp.eq.f64 %p99, %fd68, 0d8000000000000000; - or.pred %p100, %p98, %p99; - mov.f64 %fd108, 0d7FF8000000000000; - @%p100 bra BB24_137; +BB24_102: + setp.eq.f64 %p103, %fd68, 0d0000000000000000; + setp.eq.f64 %p104, %fd68, 0d8000000000000000; + or.pred %p105, %p103, %p104; + mov.f64 %fd106, 0d7FF8000000000000; + @%p105 bra BB24_129; - div.rn.f64 %fd108, %fd1, %fd68; - abs.f64 %fd83, %fd108; - setp.gtu.f64 %p101, %fd83, 0d7FF0000000000000; - @%p101 bra BB24_137; + div.rn.f64 %fd106, %fd1, %fd68; + abs.f64 %fd83, %fd106; + setp.gtu.f64 %p106, %fd83, 0d7FF0000000000000; + @%p106 bra BB24_129; { .reg .b32 %temp; - mov.b64 {%temp, %r53}, %fd108; + mov.b64 {%r52, %temp}, %fd106; } - and.b32 %r54, %r53, 2147483647; - setp.ne.s32 %p102, %r54, 2146435072; - @%p102 bra BB24_111; - { .reg .b32 %temp; - mov.b64 {%r55, %temp}, %fd108; + mov.b64 {%temp, %r53}, %fd106; } - setp.eq.s32 %p103, %r55, 0; - @%p103 bra BB24_137; - -BB24_111: - cvt.rmi.f64.f64 %fd84, %fd108; + and.b32 %r54, %r53, 2147483647; + setp.ne.s32 %p107, %r54, 2146435072; + setp.ne.s32 %p108, %r52, 0; + or.pred %p109, %p107, %p108; + @!%p109 bra BB24_129; + bra.uni BB24_105; + +BB24_105: + cvt.rmi.f64.f64 %fd84, %fd106; mul.f64 %fd85, %fd84, %fd68; - sub.f64 %fd108, %fd1, %fd85; - bra.uni BB24_137; + sub.f64 %fd106, %fd1, %fd85; + bra.uni BB24_129; BB24_6: setp.eq.s32 %p30, %r6, 1; @%p30 bra BB24_7; - bra.uni BB24_69; + bra.uni BB24_65; BB24_7: - sub.f64 %fd99, %fd68, %fd1; - bra.uni BB24_69; + sub.f64 %fd98, %fd68, %fd1; + bra.uni BB24_65; BB24_22: setp.eq.s32 %p18, %r6, 10; @%p18 bra BB24_23; - bra.uni BB24_69; + bra.uni BB24_65; BB24_23: - setp.neu.f64 %p43, %fd1, %fd68; - selp.f64 %fd99, 0d3FF0000000000000, 0d0000000000000000, %p43; - bra.uni BB24_69; + setp.neu.f64 %p45, %fd1, %fd68; + selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p45; + bra.uni BB24_65; BB24_13: setp.eq.s32 %p25, %r6, 5; @%p25 bra BB24_14; - bra.uni BB24_69; + bra.uni BB24_65; BB24_14: - setp.gt.f64 %p48, %fd1, %fd68; - selp.f64 %fd99, 0d3FF0000000000000, 0d0000000000000000, %p48; - bra.uni BB24_69; + setp.gt.f64 %p50, %fd1, %fd68; + selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p50; + bra.uni BB24_65; BB24_30: setp.eq.s32 %p12, %r6, 15; @%p12 bra BB24_31; - bra.uni BB24_69; + bra.uni BB24_65; BB24_31: mul.f64 %fd76, %fd1, %fd68; mov.f64 %fd77, 0d3FF0000000000000; - sub.f64 %fd99, %fd77, %fd76; - bra.uni BB24_69; + sub.f64 %fd98, %fd77, %fd76; + bra.uni BB24_65; BB24_9: setp.eq.s32 %p28, %r6, 3; @%p28 bra BB24_10; - bra.uni BB24_69; + bra.uni BB24_65; BB24_10: - div.rn.f64 %fd99, %fd68, %fd1; - bra.uni BB24_69; + div.rn.f64 %fd98, %fd68, %fd1; + bra.uni BB24_65; -BB24_47: - min.f64 %fd99, %fd68, %fd1; - bra.uni BB24_69; +BB24_45: + min.f64 %fd98, %fd68, %fd1; + bra.uni BB24_65; BB24_26: setp.eq.s32 %p16, %r6, 13; @%p16 bra BB24_27; - bra.uni BB24_69; + bra.uni BB24_65; BB24_27: cvt.rni.s64.f64 %rd12, %fd68; @@ -2752,348 +2704,344 @@ BB24_27: cvt.u32.u64 %r21, %rd12; cvt.u32.u64 %r22, %rd13; and.b32 %r23, %r22, %r21; - setp.eq.s32 %p42, %r23, 0; - selp.f64 %fd99, 0d0000000000000000, 0d3FF0000000000000, %p42; - bra.uni BB24_69; + setp.eq.s32 %p44, %r23, 0; + selp.f64 %fd98, 0d0000000000000000, 0d3FF0000000000000, %p44; + bra.uni BB24_65; -BB24_50: - setp.ltu.f64 %p47, %fd1, %fd68; - selp.f64 %fd99, 0d0000000000000000, 0d3FF0000000000000, %p47; - bra.uni BB24_69; +BB24_48: + setp.ge.f64 %p49, %fd1, %fd68; + selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p49; + bra.uni BB24_65; BB24_17: setp.eq.s32 %p23, %r6, 8; @%p23 bra BB24_18; - bra.uni BB24_69; + bra.uni BB24_65; BB24_18: - setp.gtu.f64 %p45, %fd1, %fd68; - selp.f64 %fd99, 0d0000000000000000, 0d3FF0000000000000, %p45; - bra.uni BB24_69; + setp.le.f64 %p47, %fd1, %fd68; + selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p47; + bra.uni BB24_65; -BB24_44: - setp.neu.f64 %p40, %fd68, 0d0000000000000000; +BB24_42: + setp.neu.f64 %p42, %fd68, 0d0000000000000000; sub.f64 %fd75, %fd68, %fd1; - selp.f64 %fd99, %fd75, 0d0000000000000000, %p40; - bra.uni BB24_69; + selp.f64 %fd98, %fd75, 0d0000000000000000, %p42; + bra.uni BB24_65; BB24_34: setp.ne.s32 %p10, %r6, 18; - @%p10 bra BB24_69; + @%p10 bra BB24_65; - div.rn.f64 %fd99, %fd68, %fd1; - abs.f64 %fd70, %fd99; + div.rn.f64 %fd98, %fd68, %fd1; + abs.f64 %fd70, %fd98; setp.gtu.f64 %p31, %fd70, 0d7FF0000000000000; - @%p31 bra BB24_69; + @%p31 bra BB24_65; { .reg .b32 %temp; - mov.b64 {%temp, %r12}, %fd99; + mov.b64 {%r12, %temp}, %fd98; } - and.b32 %r13, %r12, 2147483647; - setp.ne.s32 %p32, %r13, 2146435072; - @%p32 bra BB24_38; - { .reg .b32 %temp; - mov.b64 {%r14, %temp}, %fd99; + mov.b64 {%temp, %r13}, %fd98; } - setp.eq.s32 %p33, %r14, 0; - @%p33 bra BB24_69; + and.b32 %r14, %r13, 2147483647; + setp.ne.s32 %p32, %r14, 2146435072; + setp.ne.s32 %p33, %r12, 0; + or.pred %p34, %p32, %p33; + @!%p34 bra BB24_65; + bra.uni BB24_37; -BB24_38: - cvt.rmi.f64.f64 %fd99, %fd99; - bra.uni BB24_69; +BB24_37: + cvt.rmi.f64.f64 %fd98, %fd98; + bra.uni BB24_65; -BB24_74: - setp.eq.s32 %p94, %r6, 1; - @%p94 bra BB24_75; - bra.uni BB24_137; +BB24_70: + setp.eq.s32 %p98, %r6, 1; + @%p98 bra BB24_71; + bra.uni BB24_129; -BB24_75: - sub.f64 %fd108, %fd1, %fd68; - bra.uni BB24_137; +BB24_71: + sub.f64 %fd106, %fd1, %fd68; + bra.uni BB24_129; -BB24_90: - setp.eq.s32 %p82, %r6, 10; - @%p82 bra BB24_91; - bra.uni BB24_137; +BB24_86: + setp.eq.s32 %p86, %r6, 10; + @%p86 bra BB24_87; + bra.uni BB24_129; -BB24_91: - setp.neu.f64 %p107, %fd1, %fd68; - selp.f64 %fd108, 0d3FF0000000000000, 0d0000000000000000, %p107; - bra.uni BB24_137; +BB24_87: + setp.neu.f64 %p113, %fd1, %fd68; + selp.f64 %fd106, 0d3FF0000000000000, 0d0000000000000000, %p113; + bra.uni BB24_129; -BB24_81: - setp.eq.s32 %p89, %r6, 5; - @%p89 bra BB24_82; - bra.uni BB24_137; +BB24_77: + setp.eq.s32 %p93, %r6, 5; + @%p93 bra BB24_78; + bra.uni BB24_129; -BB24_82: - setp.lt.f64 %p112, %fd1, %fd68; - selp.f64 %fd108, 0d3FF0000000000000, 0d0000000000000000, %p112; - bra.uni BB24_137; +BB24_78: + setp.lt.f64 %p118, %fd1, %fd68; + selp.f64 %fd106, 0d3FF0000000000000, 0d0000000000000000, %p118; + bra.uni BB24_129; -BB24_98: - setp.eq.s32 %p76, %r6, 15; - @%p76 bra BB24_99; - bra.uni BB24_137; +BB24_94: + setp.eq.s32 %p80, %r6, 15; + @%p80 bra BB24_95; + bra.uni BB24_129; -BB24_99: +BB24_95: mul.f64 %fd87, %fd1, %fd68; mov.f64 %fd88, 0d3FF0000000000000; - sub.f64 %fd108, %fd88, %fd87; - bra.uni BB24_137; + sub.f64 %fd106, %fd88, %fd87; + bra.uni BB24_129; -BB24_77: - setp.eq.s32 %p92, %r6, 3; - @%p92 bra BB24_78; - bra.uni BB24_137; +BB24_73: + setp.eq.s32 %p96, %r6, 3; + @%p96 bra BB24_74; + bra.uni BB24_129; -BB24_78: - div.rn.f64 %fd108, %fd1, %fd68; - bra.uni BB24_137; +BB24_74: + div.rn.f64 %fd106, %fd1, %fd68; + bra.uni BB24_129; -BB24_115: - min.f64 %fd108, %fd1, %fd68; - bra.uni BB24_137; +BB24_109: + min.f64 %fd106, %fd1, %fd68; + bra.uni BB24_129; -BB24_94: - setp.eq.s32 %p80, %r6, 13; - @%p80 bra BB24_95; - bra.uni BB24_137; +BB24_90: + setp.eq.s32 %p84, %r6, 13; + @%p84 bra BB24_91; + bra.uni BB24_129; -BB24_95: +BB24_91: cvt.rni.s64.f64 %rd17, %fd1; cvt.rni.s64.f64 %rd18, %fd68; - cvt.u32.u64 %r59, %rd17; - cvt.u32.u64 %r60, %rd18; - and.b32 %r61, %r60, %r59; - setp.eq.s32 %p106, %r61, 0; - selp.f64 %fd108, 0d0000000000000000, 0d3FF0000000000000, %p106; - bra.uni BB24_137; + cvt.u32.u64 %r58, %rd17; + cvt.u32.u64 %r59, %rd18; + and.b32 %r60, %r59, %r58; + setp.eq.s32 %p112, %r60, 0; + selp.f64 %fd106, 0d0000000000000000, 0d3FF0000000000000, %p112; + bra.uni BB24_129; -BB24_118: - setp.gtu.f64 %p111, %fd1, %fd68; - selp.f64 %fd108, 0d0000000000000000, 0d3FF0000000000000, %p111; - bra.uni BB24_137; +BB24_112: + setp.le.f64 %p117, %fd1, %fd68; + selp.f64 %fd106, 0d3FF0000000000000, 0d0000000000000000, %p117; + bra.uni BB24_129; -BB24_85: - setp.eq.s32 %p87, %r6, 8; - @%p87 bra BB24_86; - bra.uni BB24_137; +BB24_81: + setp.eq.s32 %p91, %r6, 8; + @%p91 bra BB24_82; + bra.uni BB24_129; -BB24_86: - setp.ltu.f64 %p109, %fd1, %fd68; - selp.f64 %fd108, 0d0000000000000000, 0d3FF0000000000000, %p109; - bra.uni BB24_137; +BB24_82: + setp.ge.f64 %p115, %fd1, %fd68; + selp.f64 %fd106, 0d3FF0000000000000, 0d0000000000000000, %p115; + bra.uni BB24_129; -BB24_112: - setp.neu.f64 %p104, %fd1, 0d0000000000000000; +BB24_106: + setp.neu.f64 %p110, %fd1, 0d0000000000000000; sub.f64 %fd86, %fd1, %fd68; - selp.f64 %fd108, %fd86, 0d0000000000000000, %p104; - bra.uni BB24_137; + selp.f64 %fd106, %fd86, 0d0000000000000000, %p110; + bra.uni BB24_129; -BB24_102: - setp.ne.s32 %p74, %r6, 18; - @%p74 bra BB24_137; +BB24_98: + setp.ne.s32 %p78, %r6, 18; + @%p78 bra BB24_129; - div.rn.f64 %fd108, %fd1, %fd68; - abs.f64 %fd81, %fd108; - setp.gtu.f64 %p95, %fd81, 0d7FF0000000000000; - @%p95 bra BB24_137; + div.rn.f64 %fd106, %fd1, %fd68; + abs.f64 %fd81, %fd106; + setp.gtu.f64 %p99, %fd81, 0d7FF0000000000000; + @%p99 bra BB24_129; { .reg .b32 %temp; - mov.b64 {%temp, %r50}, %fd108; + mov.b64 {%r49, %temp}, %fd106; } - and.b32 %r51, %r50, 2147483647; - setp.ne.s32 %p96, %r51, 2146435072; - @%p96 bra BB24_106; - { .reg .b32 %temp; - mov.b64 {%r52, %temp}, %fd108; + mov.b64 {%temp, %r50}, %fd106; } - setp.eq.s32 %p97, %r52, 0; - @%p97 bra BB24_137; + and.b32 %r51, %r50, 2147483647; + setp.ne.s32 %p100, %r51, 2146435072; + setp.ne.s32 %p101, %r49, 0; + or.pred %p102, %p100, %p101; + @!%p102 bra BB24_129; + bra.uni BB24_101; -BB24_106: - cvt.rmi.f64.f64 %fd108, %fd108; - bra.uni BB24_137; +BB24_101: + cvt.rmi.f64.f64 %fd106, %fd106; + bra.uni BB24_129; -BB24_54: - setp.gt.s32 %p52, %r2, -1; - @%p52 bra BB24_57; +BB24_52: + setp.gt.s32 %p54, %r2, -1; + @%p54 bra BB24_55; cvt.rzi.f64.f64 %fd78, %fd1; - setp.neu.f64 %p53, %fd78, %fd1; - selp.f64 %fd97, 0dFFF8000000000000, %fd97, %p53; + setp.neu.f64 %p55, %fd78, %fd1; + selp.f64 %fd96, 0dFFF8000000000000, %fd96, %p55; -BB24_57: - mov.f64 %fd24, %fd97; +BB24_55: + mov.f64 %fd24, %fd96; add.f64 %fd25, %fd1, %fd68; { .reg .b32 %temp; mov.b64 {%temp, %r33}, %fd25; } and.b32 %r34, %r33, 2146435072; - setp.ne.s32 %p56, %r34, 2146435072; - mov.f64 %fd96, %fd24; - @%p56 bra BB24_66; - - setp.gtu.f64 %p57, %fd18, 0d7FF0000000000000; - mov.f64 %fd96, %fd25; - @%p57 bra BB24_66; + setp.ne.s32 %p58, %r34, 2146435072; + mov.f64 %fd95, %fd24; + @%p58 bra BB24_62; - abs.f64 %fd79, %fd1; - setp.gtu.f64 %p58, %fd79, 0d7FF0000000000000; + setp.gtu.f64 %p59, %fd18, 0d7FF0000000000000; mov.f64 %fd95, %fd25; - mov.f64 %fd96, %fd95; - @%p58 bra BB24_66; - - and.b32 %r35, %r3, 2147483647; - setp.ne.s32 %p59, %r35, 2146435072; @%p59 bra BB24_62; - { - .reg .b32 %temp; - mov.b64 {%r36, %temp}, %fd1; - } - setp.eq.s32 %p60, %r36, 0; - @%p60 bra BB24_65; - -BB24_62: - and.b32 %r37, %r2, 2147483647; - setp.ne.s32 %p61, %r37, 2146435072; - mov.f64 %fd93, %fd24; - mov.f64 %fd96, %fd93; - @%p61 bra BB24_66; + abs.f64 %fd79, %fd1; + setp.gtu.f64 %p60, %fd79, 0d7FF0000000000000; + mov.f64 %fd94, %fd25; + mov.f64 %fd95, %fd94; + @%p60 bra BB24_62; { .reg .b32 %temp; - mov.b64 {%r38, %temp}, %fd68; + mov.b64 {%r35, %temp}, %fd1; } - setp.ne.s32 %p62, %r38, 0; - mov.f64 %fd96, %fd24; - @%p62 bra BB24_66; + and.b32 %r36, %r3, 2147483647; + setp.eq.s32 %p61, %r36, 2146435072; + setp.eq.s32 %p62, %r35, 0; + and.pred %p63, %p61, %p62; + @%p63 bra BB24_61; + bra.uni BB24_59; + +BB24_61: + setp.gt.f64 %p67, %fd18, 0d3FF0000000000000; + selp.b32 %r44, 2146435072, 0, %p67; + xor.b32 %r45, %r44, 2146435072; + setp.lt.s32 %p68, %r3, 0; + selp.b32 %r46, %r45, %r44, %p68; + setp.eq.f64 %p69, %fd68, 0dBFF0000000000000; + selp.b32 %r47, 1072693248, %r46, %p69; + mov.u32 %r48, 0; + mov.b64 %fd95, {%r48, %r47}; + bra.uni BB24_62; - shr.s32 %r39, %r3, 31; - and.b32 %r40, %r39, -2146435072; - add.s32 %r41, %r40, 2146435072; - or.b32 %r42, %r41, -2147483648; - selp.b32 %r43, %r42, %r41, %p1; - mov.u32 %r44, 0; - mov.b64 %fd96, {%r44, %r43}; - bra.uni BB24_66; - -BB24_122: - setp.gt.s32 %p116, %r4, -1; - @%p116 bra BB24_125; +BB24_116: + setp.gt.s32 %p122, %r4, -1; + @%p122 bra BB24_119; cvt.rzi.f64.f64 %fd89, %fd68; - setp.neu.f64 %p117, %fd89, %fd68; - selp.f64 %fd106, 0dFFF8000000000000, %fd106, %p117; + setp.neu.f64 %p123, %fd89, %fd68; + selp.f64 %fd104, 0dFFF8000000000000, %fd104, %p123; -BB24_125: - mov.f64 %fd57, %fd106; +BB24_119: + mov.f64 %fd57, %fd104; add.f64 %fd58, %fd1, %fd68; { .reg .b32 %temp; - mov.b64 {%temp, %r71}, %fd58; + mov.b64 {%temp, %r70}, %fd58; } - and.b32 %r72, %r71, 2146435072; - setp.ne.s32 %p120, %r72, 2146435072; - mov.f64 %fd105, %fd57; - @%p120 bra BB24_134; + and.b32 %r71, %r70, 2146435072; + setp.ne.s32 %p126, %r71, 2146435072; + mov.f64 %fd103, %fd57; + @%p126 bra BB24_126; - setp.gtu.f64 %p121, %fd51, 0d7FF0000000000000; - mov.f64 %fd105, %fd58; - @%p121 bra BB24_134; + setp.gtu.f64 %p127, %fd51, 0d7FF0000000000000; + mov.f64 %fd103, %fd58; + @%p127 bra BB24_126; abs.f64 %fd90, %fd68; - setp.gtu.f64 %p122, %fd90, 0d7FF0000000000000; - mov.f64 %fd104, %fd58; - mov.f64 %fd105, %fd104; - @%p122 bra BB24_134; - - and.b32 %r73, %r5, 2147483647; - setp.ne.s32 %p123, %r73, 2146435072; - @%p123 bra BB24_130; + setp.gtu.f64 %p128, %fd90, 0d7FF0000000000000; + mov.f64 %fd102, %fd58; + mov.f64 %fd103, %fd102; + @%p128 bra BB24_126; { .reg .b32 %temp; - mov.b64 {%r74, %temp}, %fd68; + mov.b64 {%r72, %temp}, %fd68; } - setp.eq.s32 %p124, %r74, 0; - @%p124 bra BB24_133; - -BB24_130: - and.b32 %r75, %r4, 2147483647; - setp.ne.s32 %p125, %r75, 2146435072; - mov.f64 %fd102, %fd57; - mov.f64 %fd105, %fd102; - @%p125 bra BB24_134; + and.b32 %r73, %r5, 2147483647; + setp.eq.s32 %p129, %r73, 2146435072; + setp.eq.s32 %p130, %r72, 0; + and.pred %p131, %p129, %p130; + @%p131 bra BB24_125; + bra.uni BB24_123; +BB24_125: + setp.gt.f64 %p135, %fd51, 0d3FF0000000000000; + selp.b32 %r81, 2146435072, 0, %p135; + xor.b32 %r82, %r81, 2146435072; + setp.lt.s32 %p136, %r5, 0; + selp.b32 %r83, %r82, %r81, %p136; + setp.eq.f64 %p137, %fd1, 0dBFF0000000000000; + selp.b32 %r84, 1072693248, %r83, %p137; + mov.u32 %r85, 0; + mov.b64 %fd103, {%r85, %r84}; + bra.uni BB24_126; + +BB24_59: { .reg .b32 %temp; - mov.b64 {%r76, %temp}, %fd1; + mov.b64 {%r37, %temp}, %fd68; } - setp.ne.s32 %p126, %r76, 0; - mov.f64 %fd105, %fd57; - @%p126 bra BB24_134; + and.b32 %r38, %r2, 2147483647; + setp.eq.s32 %p64, %r38, 2146435072; + setp.eq.s32 %p65, %r37, 0; + and.pred %p66, %p64, %p65; + mov.f64 %fd95, %fd24; + @!%p66 bra BB24_62; + bra.uni BB24_60; + +BB24_60: + shr.s32 %r39, %r3, 31; + and.b32 %r40, %r39, -2146435072; + selp.b32 %r41, -1048576, 2146435072, %p1; + add.s32 %r42, %r41, %r40; + mov.u32 %r43, 0; + mov.b64 %fd95, {%r43, %r42}; - shr.s32 %r77, %r5, 31; - and.b32 %r78, %r77, -2146435072; - add.s32 %r79, %r78, 2146435072; - or.b32 %r80, %r79, -2147483648; - selp.b32 %r81, %r80, %r79, %p2; - mov.u32 %r82, 0; - mov.b64 %fd105, {%r82, %r81}; - bra.uni BB24_134; +BB24_62: + setp.eq.f64 %p70, %fd1, 0d0000000000000000; + setp.eq.f64 %p71, %fd68, 0d3FF0000000000000; + or.pred %p72, %p71, %p70; + selp.f64 %fd98, 0d3FF0000000000000, %fd95, %p72; BB24_65: - setp.gt.f64 %p63, %fd18, 0d3FF0000000000000; - selp.b32 %r45, 2146435072, 0, %p63; - xor.b32 %r46, %r45, 2146435072; - setp.lt.s32 %p64, %r3, 0; - selp.b32 %r47, %r46, %r45, %p64; - setp.eq.f64 %p65, %fd68, 0dBFF0000000000000; - selp.b32 %r48, 1072693248, %r47, %p65; - mov.u32 %r49, 0; - mov.b64 %fd96, {%r49, %r48}; - -BB24_66: - setp.eq.f64 %p66, %fd1, 0d0000000000000000; - setp.eq.f64 %p67, %fd68, 0d3FF0000000000000; - or.pred %p68, %p67, %p66; - selp.f64 %fd99, 0d3FF0000000000000, %fd96, %p68; - -BB24_69: - st.global.f64 [%rd1], %fd99; - bra.uni BB24_138; - -BB24_133: - setp.gt.f64 %p127, %fd51, 0d3FF0000000000000; - selp.b32 %r83, 2146435072, 0, %p127; - xor.b32 %r84, %r83, 2146435072; - setp.lt.s32 %p128, %r5, 0; - selp.b32 %r85, %r84, %r83, %p128; - setp.eq.f64 %p129, %fd1, 0dBFF0000000000000; - selp.b32 %r86, 1072693248, %r85, %p129; - mov.u32 %r87, 0; - mov.b64 %fd105, {%r87, %r86}; + st.global.f64 [%rd1], %fd98; + bra.uni BB24_130; -BB24_134: - setp.eq.f64 %p130, %fd68, 0d0000000000000000; - setp.eq.f64 %p131, %fd1, 0d3FF0000000000000; - or.pred %p132, %p131, %p130; - selp.f64 %fd108, 0d3FF0000000000000, %fd105, %p132; +BB24_123: + { + .reg .b32 %temp; + mov.b64 {%r74, %temp}, %fd1; + } + and.b32 %r75, %r4, 2147483647; + setp.eq.s32 %p132, %r75, 2146435072; + setp.eq.s32 %p133, %r74, 0; + and.pred %p134, %p132, %p133; + mov.f64 %fd103, %fd57; + @!%p134 bra BB24_126; + bra.uni BB24_124; -BB24_137: - st.global.f64 [%rd1], %fd108; +BB24_124: + shr.s32 %r76, %r5, 31; + and.b32 %r77, %r76, -2146435072; + selp.b32 %r78, -1048576, 2146435072, %p2; + add.s32 %r79, %r78, %r77; + mov.u32 %r80, 0; + mov.b64 %fd103, {%r80, %r79}; + +BB24_126: + setp.eq.f64 %p138, %fd68, 0d0000000000000000; + setp.eq.f64 %p139, %fd1, 0d3FF0000000000000; + or.pred %p140, %p139, %p138; + selp.f64 %fd106, 0d3FF0000000000000, %fd103, %p140; + +BB24_129: + st.global.f64 [%rd1], %fd106; -BB24_138: +BB24_130: bar.sync 0; ret; } @@ -3698,8 +3646,8 @@ BB25_27: bra.uni BB25_63; BB25_46: - setp.gtu.f32 %p43, %f1, %f2; - selp.f32 %f261, 0f00000000, 0f3F800000, %p43; + setp.le.f32 %p43, %f1, %f2; + selp.f32 %f261, 0f3F800000, 0f00000000, %p43; bra.uni BB25_63; BB25_17: @@ -3708,8 +3656,8 @@ BB25_17: bra.uni BB25_63; BB25_18: - setp.ltu.f32 %p41, %f1, %f2; - selp.f32 %f261, 0f00000000, 0f3F800000, %p41; + setp.ge.f32 %p41, %f1, %f2; + selp.f32 %f261, 0f3F800000, 0f00000000, %p41; bra.uni BB25_63; BB25_40: @@ -3799,8 +3747,8 @@ BB25_89: bra.uni BB25_125; BB25_108: - setp.gtu.f32 %p110, %f2, %f1; - selp.f32 %f264, 0f00000000, 0f3F800000, %p110; + setp.le.f32 %p110, %f2, %f1; + selp.f32 %f264, 0f3F800000, 0f00000000, %p110; bra.uni BB25_125; BB25_79: @@ -3809,8 +3757,8 @@ BB25_79: bra.uni BB25_125; BB25_80: - setp.ltu.f32 %p108, %f2, %f1; - selp.f32 %f264, 0f00000000, 0f3F800000, %p108; + setp.ge.f32 %p108, %f2, %f1; + selp.f32 %f264, 0f3F800000, 0f00000000, %p108; bra.uni BB25_125; BB25_102: @@ -3915,8 +3863,8 @@ BB25_56: setp.neu.f32 %p64, %f20, 0f7F800000; @%p64 bra BB25_60; - setp.ltu.f32 %p65, %f2, 0f00000000; - selp.b32 %r27, 0, 2139095040, %p65; + setp.ge.f32 %p65, %f2, 0f00000000; + selp.b32 %r27, 2139095040, 0, %p65; or.b32 %r28, %r27, -2147483648; selp.b32 %r29, %r28, %r27, %p1; mov.b32 %f260, %r29; @@ -3935,8 +3883,8 @@ BB25_118: setp.neu.f32 %p131, %f57, 0f7F800000; @%p131 bra BB25_122; - setp.ltu.f32 %p132, %f1, 0f00000000; - selp.b32 %r52, 0, 2139095040, %p132; + setp.ge.f32 %p132, %f1, 0f00000000; + selp.b32 %r52, 2139095040, 0, %p132; or.b32 %r53, %r52, -2147483648; selp.b32 %r54, %r53, %r52, %p2; mov.b32 %f263, %r54; @@ -8171,15 +8119,15 @@ BB60_4: mov.u64 %rd24, 0; @%p5 bra BB61_6; - setp.eq.s32 %p6, %r3, 189; + setp.ne.s32 %p6, %r3, 189; mov.u32 %r14, 64; sub.s32 %r15, %r14, %r4; shl.b64 %rd14, %rd25, %r15; cvt.u64.u32 %rd15, %r4; - selp.b64 %rd16, 0, %rd15, %p6; + selp.b64 %rd16, %rd15, 0, %p6; cvt.u32.u64 %r16, %rd16; shr.u64 %rd24, %rd25, %r16; - selp.b64 %rd25, 0, %rd14, %p6; + selp.b64 %rd25, %rd14, 0, %p6; BB61_6: shr.u64 %rd17, %rd25, 63; @@ -8455,16 +8403,16 @@ BB64_9: setp.lt.f32 %p2, %f5, 0f00800000; mul.f32 %f6, %f5, 0f4B000000; selp.f32 %f1, %f6, %f5, %p2; - selp.f32 %f7, 0fC1B80000, 0f00000000, %p2; mov.b32 %r6, %f1; add.s32 %r7, %r6, -1059760811; and.b32 %r8, %r7, -8388608; sub.s32 %r9, %r6, %r8; - mov.b32 %f8, %r9; - cvt.rn.f32.s32 %f9, %r8; + mov.b32 %f7, %r9; + cvt.rn.f32.s32 %f8, %r8; + selp.f32 %f9, 0fC1B80000, 0f00000000, %p2; mov.f32 %f10, 0f34000000; - fma.rn.f32 %f11, %f9, %f10, %f7; - add.f32 %f12, %f8, 0fBF800000; + fma.rn.f32 %f11, %f8, %f10, %f9; + add.f32 %f12, %f7, 0fBF800000; mov.f32 %f13, 0f3E1039F6; mov.f32 %f14, 0fBE055027; fma.rn.f32 %f15, %f14, %f12, %f13; @@ -8658,7 +8606,7 @@ BB69_2: .local .align 4 .b8 __local_depot70[4]; .reg .b64 %SP; .reg .b64 %SPL; - .reg .pred %p<7>; + .reg .pred %p<9>; .reg .b32 %r<18>; .reg .f64 %fd<41>; .reg .b64 %rd<17>; @@ -8676,7 +8624,7 @@ BB69_2: mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.u32 %p1, %r1, %r5; - @%p1 bra BB70_11; + @%p1 bra BB70_10; cvta.to.global.u64 %rd6, %rd3; cvt.s64.s32 %rd2, %r1; @@ -8685,23 +8633,24 @@ BB69_2: ld.global.f64 %fd38, [%rd8]; { .reg .b32 %temp; - mov.b64 {%temp, %r9}, %fd38; + mov.b64 {%r9, %temp}, %fd38; } - and.b32 %r10, %r9, 2147483647; - setp.ne.s32 %p2, %r10, 2146435072; - @%p2 bra BB70_4; - { .reg .b32 %temp; - mov.b64 {%r11, %temp}, %fd38; + mov.b64 {%temp, %r10}, %fd38; } - setp.ne.s32 %p3, %r11, 0; - @%p3 bra BB70_4; + and.b32 %r11, %r10, 2147483647; + setp.eq.s32 %p2, %r11, 2146435072; + setp.eq.s32 %p3, %r9, 0; + and.pred %p4, %p2, %p3; + @!%p4 bra BB70_3; + bra.uni BB70_2; +BB70_2: mov.f64 %fd14, 0d0000000000000000; mul.rn.f64 %fd38, %fd38, %fd14; -BB70_4: +BB70_3: mul.f64 %fd15, %fd38, 0d3FE45F306DC9C883; cvt.rni.s32.f64 %r17, %fd15; st.local.u32 [%rd1], %r17; @@ -8718,8 +8667,8 @@ BB70_4: mov.b64 {%temp, %r12}, %fd38; } and.b32 %r13, %r12, 2145386496; - setp.lt.u32 %p4, %r13, 1105199104; - @%p4 bra BB70_6; + setp.lt.u32 %p5, %r13, 1105199104; + @%p5 bra BB70_5; // Callseq Start 3 { @@ -8742,11 +8691,11 @@ BB70_4: }// Callseq End 3 ld.local.u32 %r17, [%rd1]; -BB70_6: +BB70_5: and.b32 %r14, %r17, 1; shl.b32 %r15, %r14, 3; - setp.eq.s32 %p5, %r14, 0; - selp.f64 %fd23, 0d3DE5DB65F9785EBA, 0dBDA8FF8320FD8164, %p5; + setp.eq.b32 %p6, %r14, 1; + selp.f64 %fd23, 0dBDA8FF8320FD8164, 0d3DE5DB65F9785EBA, %p6; mul.wide.u32 %rd10, %r15, 8; mov.u64 %rd11, __cudart_sin_cos_coeffs; add.s64 %rd12, %rd10, %rd11; @@ -8764,27 +8713,28 @@ BB70_6: ld.const.f64 %fd34, [%rd12+48]; fma.rn.f64 %fd8, %fd33, %fd7, %fd34; fma.rn.f64 %fd40, %fd8, %fd39, %fd39; - @%p5 bra BB70_8; + setp.eq.s32 %p7, %r14, 0; + @%p7 bra BB70_7; mov.f64 %fd35, 0d3FF0000000000000; fma.rn.f64 %fd40, %fd8, %fd7, %fd35; -BB70_8: +BB70_7: and.b32 %r16, %r17, 2; - setp.eq.s32 %p6, %r16, 0; - @%p6 bra BB70_10; + setp.eq.s32 %p8, %r16, 0; + @%p8 bra BB70_9; mov.f64 %fd36, 0d0000000000000000; mov.f64 %fd37, 0dBFF0000000000000; fma.rn.f64 %fd40, %fd40, %fd37, %fd36; -BB70_10: +BB70_9: cvta.to.global.u64 %rd13, %rd4; shl.b64 %rd14, %rd2, 3; add.s64 %rd15, %rd13, %rd14; st.global.f64 [%rd15], %fd40; -BB70_11: +BB70_10: ret; } @@ -8800,7 +8750,7 @@ BB70_11: .reg .b64 %SPL; .reg .pred %p<15>; .reg .f32 %f<48>; - .reg .b32 %r<97>; + .reg .b32 %r<95>; .reg .b64 %rd<22>; @@ -8808,13 +8758,13 @@ BB70_11: cvta.local.u64 %SP, %rd21; ld.param.u64 %rd8, [matrix_sin_f_param_0]; ld.param.u64 %rd9, [matrix_sin_f_param_1]; - ld.param.u32 %r37, [matrix_sin_f_param_2]; - mov.u32 %r38, %ntid.x; - mov.u32 %r39, %ctaid.x; - mov.u32 %r40, %tid.x; - mad.lo.s32 %r1, %r38, %r39, %r40; - setp.ge.u32 %p1, %r1, %r37; - @%p1 bra BB71_24; + ld.param.u32 %r30, [matrix_sin_f_param_2]; + mov.u32 %r31, %ntid.x; + mov.u32 %r32, %ctaid.x; + mov.u32 %r33, %tid.x; + mad.lo.s32 %r1, %r31, %r32, %r33; + setp.ge.u32 %p1, %r1, %r30; + @%p1 bra BB71_22; cvta.to.global.u64 %rd10, %rd8; cvt.s64.s32 %rd1, %r1; @@ -8832,8 +8782,8 @@ BB70_11: BB71_3: mul.f32 %f21, %f43, 0f3F22F983; - cvt.rni.s32.f32 %r96, %f21; - cvt.rn.f32.s32 %f22, %r96; + cvt.rni.s32.f32 %r94, %f21; + cvt.rn.f32.s32 %f22, %r94; neg.f32 %f23, %f22; mov.f32 %f24, 0f3FC90FDA; fma.rn.f32 %f25, %f23, %f24, %f43; @@ -8843,12 +8793,12 @@ BB71_3: fma.rn.f32 %f44, %f23, %f28, %f27; abs.f32 %f29, %f43; setp.leu.f32 %p3, %f29, 0f47CE4780; - @%p3 bra BB71_13; + @%p3 bra BB71_11; mov.b32 %r3, %f43; shr.u32 %r4, %r3, 23; - shl.b32 %r43, %r3, 8; - or.b32 %r5, %r43, -2147483648; + shl.b32 %r36, %r3, 8; + or.b32 %r5, %r36, -2147483648; mov.u32 %r88, 0; mov.u64 %rd19, __cudart_i2opi_f; mov.u32 %r87, -6; @@ -8857,14 +8807,14 @@ BB71_3: BB71_5: .pragma "nounroll"; mov.u64 %rd4, %rd20; - ld.const.u32 %r46, [%rd19]; + ld.const.u32 %r39, [%rd19]; // inline asm { - mad.lo.cc.u32 %r44, %r46, %r5, %r88; - madc.hi.u32 %r88, %r46, %r5, 0; + mad.lo.cc.u32 %r37, %r39, %r5, %r88; + madc.hi.u32 %r88, %r39, %r5, 0; } // inline asm - st.local.u32 [%rd4], %r44; + st.local.u32 [%rd4], %r37; add.s64 %rd5, %rd4, 4; add.s64 %rd19, %rd19, 4; add.s32 %r87, %r87, 1; @@ -8872,14 +8822,14 @@ BB71_5: mov.u64 %rd20, %rd5; @%p4 bra BB71_5; - and.b32 %r49, %r4, 255; - add.s32 %r50, %r49, -128; - shr.u32 %r51, %r50, 5; + and.b32 %r42, %r4, 255; + add.s32 %r43, %r42, -128; + shr.u32 %r44, %r43, 5; and.b32 %r10, %r3, -2147483648; st.local.u32 [%rd2+24], %r88; - mov.u32 %r52, 6; - sub.s32 %r53, %r52, %r51; - mul.wide.s32 %rd15, %r53, 4; + mov.u32 %r45, 6; + sub.s32 %r46, %r45, %r44; + mul.wide.s32 %rd15, %r46, 4; add.s64 %rd7, %rd2, %rd15; ld.local.u32 %r89, [%rd7]; ld.local.u32 %r90, [%rd7+-4]; @@ -8887,67 +8837,65 @@ BB71_5: setp.eq.s32 %p5, %r13, 0; @%p5 bra BB71_8; - mov.u32 %r54, 32; - sub.s32 %r55, %r54, %r13; - shr.u32 %r56, %r90, %r55; - shl.b32 %r57, %r89, %r13; - add.s32 %r89, %r56, %r57; - ld.local.u32 %r58, [%rd7+-8]; - shr.u32 %r59, %r58, %r55; - shl.b32 %r60, %r90, %r13; - add.s32 %r90, %r59, %r60; + mov.u32 %r47, 32; + sub.s32 %r48, %r47, %r13; + shr.u32 %r49, %r90, %r48; + shl.b32 %r50, %r89, %r13; + add.s32 %r89, %r49, %r50; + ld.local.u32 %r51, [%rd7+-8]; + shr.u32 %r52, %r51, %r48; + shl.b32 %r53, %r90, %r13; + add.s32 %r90, %r52, %r53; BB71_8: - shr.u32 %r61, %r90, 30; - shl.b32 %r62, %r89, 2; - add.s32 %r91, %r61, %r62; + shr.u32 %r54, %r90, 30; + shl.b32 %r55, %r89, 2; + add.s32 %r91, %r54, %r55; shl.b32 %r19, %r90, 2; - shr.u32 %r63, %r91, 31; - shr.u32 %r64, %r89, 30; - add.s32 %r20, %r63, %r64; - setp.eq.s32 %p6, %r63, 0; + shr.u32 %r56, %r91, 31; + shr.u32 %r57, %r89, 30; + add.s32 %r20, %r56, %r57; + setp.eq.s32 %p6, %r56, 0; mov.u32 %r92, %r10; mov.u32 %r93, %r19; @%p6 bra BB71_10; - not.b32 %r65, %r91; + not.b32 %r58, %r91; neg.s32 %r21, %r19; setp.eq.s32 %p7, %r19, 0; - selp.u32 %r66, 1, 0, %p7; - add.s32 %r91, %r66, %r65; + selp.u32 %r59, 1, 0, %p7; + add.s32 %r91, %r59, %r58; xor.b32 %r23, %r10, -2147483648; mov.u32 %r92, %r23; mov.u32 %r93, %r21; BB71_10: mov.u32 %r25, %r92; - neg.s32 %r67, %r20; - setp.eq.s32 %p8, %r10, 0; - selp.b32 %r96, %r20, %r67, %p8; - clz.b32 %r95, %r91; - setp.eq.s32 %p9, %r95, 0; - shl.b32 %r68, %r91, %r95; - mov.u32 %r69, 32; - sub.s32 %r70, %r69, %r95; - shr.u32 %r71, %r93, %r70; - add.s32 %r72, %r71, %r68; - selp.b32 %r29, %r91, %r72, %p9; - mov.u32 %r73, -921707870; - mul.hi.u32 %r94, %r29, %r73; - setp.lt.s32 %p10, %r94, 1; - @%p10 bra BB71_12; - - mul.lo.s32 %r74, %r29, -921707870; - shr.u32 %r75, %r74, 31; - shl.b32 %r76, %r94, 1; - add.s32 %r94, %r75, %r76; - add.s32 %r95, %r95, 1; - -BB71_12: - mov.u32 %r77, 126; - sub.s32 %r78, %r77, %r95; + neg.s32 %r60, %r20; + setp.ne.s32 %p8, %r10, 0; + selp.b32 %r94, %r60, %r20, %p8; + clz.b32 %r61, %r91; + setp.ne.s32 %p9, %r61, 0; + shl.b32 %r62, %r91, %r61; + mov.u32 %r63, 32; + sub.s32 %r64, %r63, %r61; + shr.u32 %r65, %r93, %r64; + add.s32 %r66, %r65, %r62; + selp.b32 %r67, %r66, %r91, %p9; + mul.lo.s32 %r68, %r67, -921707870; + mov.u32 %r69, -921707870; + mul.hi.u32 %r70, %r67, %r69; + setp.gt.s32 %p10, %r70, 0; + shl.b32 %r71, %r70, 1; + shr.u32 %r72, %r68, 31; + add.s32 %r73, %r72, %r71; + selp.b32 %r74, %r73, %r70, %p10; + selp.b32 %r75, -1, 0, %p10; + mov.u32 %r76, 126; + sub.s32 %r77, %r76, %r61; + add.s32 %r78, %r77, %r75; shl.b32 %r79, %r78, 23; - add.s32 %r80, %r94, 1; + add.s32 %r80, %r74, 1; shr.u32 %r81, %r80, 7; add.s32 %r82, %r81, 1; shr.u32 %r83, %r82, 1; @@ -8955,60 +8903,60 @@ BB71_12: or.b32 %r85, %r84, %r25; mov.b32 %f44, %r85; -BB71_13: +BB71_11: mul.rn.f32 %f7, %f44, %f44; - and.b32 %r36, %r96, 1; - setp.eq.s32 %p11, %r36, 0; - @%p11 bra BB71_15; + and.b32 %r29, %r94, 1; + setp.eq.s32 %p11, %r29, 0; + @%p11 bra BB71_13; mov.f32 %f30, 0fBAB6061A; mov.f32 %f31, 0f37CCF5CE; fma.rn.f32 %f45, %f31, %f7, %f30; - bra.uni BB71_16; + bra.uni BB71_14; -BB71_15: +BB71_13: mov.f32 %f32, 0f3C08839E; mov.f32 %f33, 0fB94CA1F9; fma.rn.f32 %f45, %f33, %f7, %f32; -BB71_16: - @%p11 bra BB71_18; +BB71_14: + @%p11 bra BB71_16; mov.f32 %f34, 0f3D2AAAA5; fma.rn.f32 %f35, %f45, %f7, %f34; mov.f32 %f36, 0fBF000000; fma.rn.f32 %f46, %f35, %f7, %f36; - bra.uni BB71_19; + bra.uni BB71_17; -BB71_18: +BB71_16: mov.f32 %f37, 0fBE2AAAA3; fma.rn.f32 %f38, %f45, %f7, %f37; mov.f32 %f39, 0f00000000; fma.rn.f32 %f46, %f38, %f7, %f39; -BB71_19: +BB71_17: fma.rn.f32 %f47, %f46, %f44, %f44; - @%p11 bra BB71_21; + @%p11 bra BB71_19; mov.f32 %f40, 0f3F800000; fma.rn.f32 %f47, %f46, %f7, %f40; -BB71_21: - and.b32 %r86, %r96, 2; +BB71_19: + and.b32 %r86, %r94, 2; setp.eq.s32 %p14, %r86, 0; - @%p14 bra BB71_23; + @%p14 bra BB71_21; mov.f32 %f41, 0f00000000; mov.f32 %f42, 0fBF800000; fma.rn.f32 %f47, %f47, %f42, %f41; -BB71_23: +BB71_21: cvta.to.global.u64 %rd16, %rd9; shl.b64 %rd17, %rd1, 2; add.s64 %rd18, %rd16, %rd17; st.global.f32 [%rd18], %f47; -BB71_24: +BB71_22: ret; } @@ -9139,8 +9087,8 @@ BB72_2: fma.rn.f64 %fd48, %fd47, %fd45, %fd46; div.rn.f64 %fd49, %fd45, %fd48; add.f64 %fd50, %fd49, %fd45; - setp.ltu.f64 %p6, %fd1, 0d408633CE8FB9F87E; - selp.f64 %fd67, %fd50, 0d7FF0000000000000, %p6; + setp.ge.f64 %p6, %fd1, 0d408633CE8FB9F87E; + selp.f64 %fd67, 0d7FF0000000000000, %fd50, %p6; BB72_4: cvta.to.global.u64 %rd7, %rd3; @@ -9229,8 +9177,8 @@ BB73_2: mov.f32 %f20, 0f40000000; fma.rn.f32 %f21, %f20, %f16, %f19; mov.b32 %r6, %f21; - setp.ltu.f32 %p3, %f2, 0f42B40000; - selp.b32 %r7, %r6, 2139095040, %p3; + setp.ge.f32 %p3, %f2, 0f42B40000; + selp.b32 %r7, 2139095040, %r6, %p3; mov.b32 %r8, %f1; and.b32 %r9, %r8, -2147483648; or.b32 %r10, %r7, %r9; @@ -9256,7 +9204,7 @@ BB73_5: .local .align 4 .b8 __local_depot74[4]; .reg .b64 %SP; .reg .b64 %SPL; - .reg .pred %p<7>; + .reg .pred %p<9>; .reg .b32 %r<19>; .reg .f64 %fd<41>; .reg .b64 %rd<17>; @@ -9274,7 +9222,7 @@ BB73_5: mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; setp.ge.u32 %p1, %r1, %r6; - @%p1 bra BB74_11; + @%p1 bra BB74_10; cvta.to.global.u64 %rd6, %rd3; cvt.s64.s32 %rd2, %r1; @@ -9283,23 +9231,24 @@ BB73_5: ld.global.f64 %fd38, [%rd8]; { .reg .b32 %temp; - mov.b64 {%temp, %r10}, %fd38; + mov.b64 {%r10, %temp}, %fd38; } - and.b32 %r11, %r10, 2147483647; - setp.ne.s32 %p2, %r11, 2146435072; - @%p2 bra BB74_4; - { .reg .b32 %temp; - mov.b64 {%r12, %temp}, %fd38; + mov.b64 {%temp, %r11}, %fd38; } - setp.ne.s32 %p3, %r12, 0; - @%p3 bra BB74_4; + and.b32 %r12, %r11, 2147483647; + setp.eq.s32 %p2, %r12, 2146435072; + setp.eq.s32 %p3, %r10, 0; + and.pred %p4, %p2, %p3; + @!%p4 bra BB74_3; + bra.uni BB74_2; +BB74_2: mov.f64 %fd14, 0d0000000000000000; mul.rn.f64 %fd38, %fd38, %fd14; -BB74_4: +BB74_3: mul.f64 %fd15, %fd38, 0d3FE45F306DC9C883; cvt.rni.s32.f64 %r18, %fd15; st.local.u32 [%rd1], %r18; @@ -9316,8 +9265,8 @@ BB74_4: mov.b64 {%temp, %r13}, %fd38; } and.b32 %r14, %r13, 2145386496; - setp.lt.u32 %p4, %r14, 1105199104; - @%p4 bra BB74_6; + setp.lt.u32 %p5, %r14, 1105199104; + @%p5 bra BB74_5; // Callseq Start 4 { @@ -9340,12 +9289,12 @@ BB74_4: }// Callseq End 4 ld.local.u32 %r18, [%rd1]; -BB74_6: +BB74_5: add.s32 %r5, %r18, 1; and.b32 %r15, %r5, 1; shl.b32 %r16, %r15, 3; - setp.eq.s32 %p5, %r15, 0; - selp.f64 %fd23, 0d3DE5DB65F9785EBA, 0dBDA8FF8320FD8164, %p5; + setp.eq.b32 %p6, %r15, 1; + selp.f64 %fd23, 0dBDA8FF8320FD8164, 0d3DE5DB65F9785EBA, %p6; mul.wide.u32 %rd10, %r16, 8; mov.u64 %rd11, __cudart_sin_cos_coeffs; add.s64 %rd12, %rd10, %rd11; @@ -9363,27 +9312,28 @@ BB74_6: ld.const.f64 %fd34, [%rd12+48]; fma.rn.f64 %fd8, %fd33, %fd7, %fd34; fma.rn.f64 %fd40, %fd8, %fd39, %fd39; - @%p5 bra BB74_8; + setp.eq.s32 %p7, %r15, 0; + @%p7 bra BB74_7; mov.f64 %fd35, 0d3FF0000000000000; fma.rn.f64 %fd40, %fd8, %fd7, %fd35; -BB74_8: +BB74_7: and.b32 %r17, %r5, 2; - setp.eq.s32 %p6, %r17, 0; - @%p6 bra BB74_10; + setp.eq.s32 %p8, %r17, 0; + @%p8 bra BB74_9; mov.f64 %fd36, 0d0000000000000000; mov.f64 %fd37, 0dBFF0000000000000; fma.rn.f64 %fd40, %fd40, %fd37, %fd36; -BB74_10: +BB74_9: cvta.to.global.u64 %rd13, %rd4; shl.b64 %rd14, %rd2, 3; add.s64 %rd15, %rd13, %rd14; st.global.f64 [%rd15], %fd40; -BB74_11: +BB74_10: ret; } @@ -9399,7 +9349,7 @@ BB74_11: .reg .b64 %SPL; .reg .pred %p<15>; .reg .f32 %f<48>; - .reg .b32 %r<98>; + .reg .b32 %r<96>; .reg .b64 %rd<22>; @@ -9407,13 +9357,13 @@ BB74_11: cvta.local.u64 %SP, %rd21; ld.param.u64 %rd8, [matrix_cos_f_param_0]; ld.param.u64 %rd9, [matrix_cos_f_param_1]; - ld.param.u32 %r38, [matrix_cos_f_param_2]; - mov.u32 %r39, %ntid.x; - mov.u32 %r40, %ctaid.x; - mov.u32 %r41, %tid.x; - mad.lo.s32 %r1, %r39, %r40, %r41; - setp.ge.u32 %p1, %r1, %r38; - @%p1 bra BB75_24; + ld.param.u32 %r31, [matrix_cos_f_param_2]; + mov.u32 %r32, %ntid.x; + mov.u32 %r33, %ctaid.x; + mov.u32 %r34, %tid.x; + mad.lo.s32 %r1, %r32, %r33, %r34; + setp.ge.u32 %p1, %r1, %r31; + @%p1 bra BB75_22; cvta.to.global.u64 %rd10, %rd8; cvt.s64.s32 %rd1, %r1; @@ -9431,8 +9381,8 @@ BB74_11: BB75_3: mul.f32 %f21, %f43, 0f3F22F983; - cvt.rni.s32.f32 %r97, %f21; - cvt.rn.f32.s32 %f22, %r97; + cvt.rni.s32.f32 %r95, %f21; + cvt.rn.f32.s32 %f22, %r95; neg.f32 %f23, %f22; mov.f32 %f24, 0f3FC90FDA; fma.rn.f32 %f25, %f23, %f24, %f43; @@ -9442,12 +9392,12 @@ BB75_3: fma.rn.f32 %f44, %f23, %f28, %f27; abs.f32 %f29, %f43; setp.leu.f32 %p3, %f29, 0f47CE4780; - @%p3 bra BB75_13; + @%p3 bra BB75_11; mov.b32 %r3, %f43; shr.u32 %r4, %r3, 23; - shl.b32 %r44, %r3, 8; - or.b32 %r5, %r44, -2147483648; + shl.b32 %r37, %r3, 8; + or.b32 %r5, %r37, -2147483648; mov.u32 %r89, 0; mov.u64 %rd19, __cudart_i2opi_f; mov.u32 %r88, -6; @@ -9456,14 +9406,14 @@ BB75_3: BB75_5: .pragma "nounroll"; mov.u64 %rd4, %rd20; - ld.const.u32 %r47, [%rd19]; + ld.const.u32 %r40, [%rd19]; // inline asm { - mad.lo.cc.u32 %r45, %r47, %r5, %r89; - madc.hi.u32 %r89, %r47, %r5, 0; + mad.lo.cc.u32 %r38, %r40, %r5, %r89; + madc.hi.u32 %r89, %r40, %r5, 0; } // inline asm - st.local.u32 [%rd4], %r45; + st.local.u32 [%rd4], %r38; add.s64 %rd5, %rd4, 4; add.s64 %rd19, %rd19, 4; add.s32 %r88, %r88, 1; @@ -9471,14 +9421,14 @@ BB75_5: mov.u64 %rd20, %rd5; @%p4 bra BB75_5; - and.b32 %r50, %r4, 255; - add.s32 %r51, %r50, -128; - shr.u32 %r52, %r51, 5; + and.b32 %r43, %r4, 255; + add.s32 %r44, %r43, -128; + shr.u32 %r45, %r44, 5; and.b32 %r10, %r3, -2147483648; st.local.u32 [%rd2+24], %r89; - mov.u32 %r53, 6; - sub.s32 %r54, %r53, %r52; - mul.wide.s32 %rd15, %r54, 4; + mov.u32 %r46, 6; + sub.s32 %r47, %r46, %r45; + mul.wide.s32 %rd15, %r47, 4; add.s64 %rd7, %rd2, %rd15; ld.local.u32 %r90, [%rd7]; ld.local.u32 %r91, [%rd7+-4]; @@ -9486,67 +9436,65 @@ BB75_5: setp.eq.s32 %p5, %r13, 0; @%p5 bra BB75_8; - mov.u32 %r55, 32; - sub.s32 %r56, %r55, %r13; - shr.u32 %r57, %r91, %r56; - shl.b32 %r58, %r90, %r13; - add.s32 %r90, %r57, %r58; - ld.local.u32 %r59, [%rd7+-8]; - shr.u32 %r60, %r59, %r56; - shl.b32 %r61, %r91, %r13; - add.s32 %r91, %r60, %r61; + mov.u32 %r48, 32; + sub.s32 %r49, %r48, %r13; + shr.u32 %r50, %r91, %r49; + shl.b32 %r51, %r90, %r13; + add.s32 %r90, %r50, %r51; + ld.local.u32 %r52, [%rd7+-8]; + shr.u32 %r53, %r52, %r49; + shl.b32 %r54, %r91, %r13; + add.s32 %r91, %r53, %r54; BB75_8: - shr.u32 %r62, %r91, 30; - shl.b32 %r63, %r90, 2; - add.s32 %r92, %r62, %r63; + shr.u32 %r55, %r91, 30; + shl.b32 %r56, %r90, 2; + add.s32 %r92, %r55, %r56; shl.b32 %r19, %r91, 2; - shr.u32 %r64, %r92, 31; - shr.u32 %r65, %r90, 30; - add.s32 %r20, %r64, %r65; - setp.eq.s32 %p6, %r64, 0; + shr.u32 %r57, %r92, 31; + shr.u32 %r58, %r90, 30; + add.s32 %r20, %r57, %r58; + setp.eq.s32 %p6, %r57, 0; mov.u32 %r93, %r10; mov.u32 %r94, %r19; @%p6 bra BB75_10; - not.b32 %r66, %r92; + not.b32 %r59, %r92; neg.s32 %r21, %r19; setp.eq.s32 %p7, %r19, 0; - selp.u32 %r67, 1, 0, %p7; - add.s32 %r92, %r67, %r66; + selp.u32 %r60, 1, 0, %p7; + add.s32 %r92, %r60, %r59; xor.b32 %r23, %r10, -2147483648; mov.u32 %r93, %r23; mov.u32 %r94, %r21; BB75_10: mov.u32 %r25, %r93; - neg.s32 %r68, %r20; - setp.eq.s32 %p8, %r10, 0; - selp.b32 %r97, %r20, %r68, %p8; - clz.b32 %r96, %r92; - setp.eq.s32 %p9, %r96, 0; - shl.b32 %r69, %r92, %r96; - mov.u32 %r70, 32; - sub.s32 %r71, %r70, %r96; - shr.u32 %r72, %r94, %r71; - add.s32 %r73, %r72, %r69; - selp.b32 %r29, %r92, %r73, %p9; - mov.u32 %r74, -921707870; - mul.hi.u32 %r95, %r29, %r74; - setp.lt.s32 %p10, %r95, 1; - @%p10 bra BB75_12; - - mul.lo.s32 %r75, %r29, -921707870; - shr.u32 %r76, %r75, 31; - shl.b32 %r77, %r95, 1; - add.s32 %r95, %r76, %r77; - add.s32 %r96, %r96, 1; - -BB75_12: - mov.u32 %r78, 126; - sub.s32 %r79, %r78, %r96; + neg.s32 %r61, %r20; + setp.ne.s32 %p8, %r10, 0; + selp.b32 %r95, %r61, %r20, %p8; + clz.b32 %r62, %r92; + setp.ne.s32 %p9, %r62, 0; + shl.b32 %r63, %r92, %r62; + mov.u32 %r64, 32; + sub.s32 %r65, %r64, %r62; + shr.u32 %r66, %r94, %r65; + add.s32 %r67, %r66, %r63; + selp.b32 %r68, %r67, %r92, %p9; + mul.lo.s32 %r69, %r68, -921707870; + mov.u32 %r70, -921707870; + mul.hi.u32 %r71, %r68, %r70; + setp.gt.s32 %p10, %r71, 0; + shl.b32 %r72, %r71, 1; + shr.u32 %r73, %r69, 31; + add.s32 %r74, %r73, %r72; + selp.b32 %r75, %r74, %r71, %p10; + selp.b32 %r76, -1, 0, %p10; + mov.u32 %r77, 126; + sub.s32 %r78, %r77, %r62; + add.s32 %r79, %r78, %r76; shl.b32 %r80, %r79, 23; - add.s32 %r81, %r95, 1; + add.s32 %r81, %r75, 1; shr.u32 %r82, %r81, 7; add.s32 %r83, %r82, 1; shr.u32 %r84, %r83, 1; @@ -9554,61 +9502,61 @@ BB75_12: or.b32 %r86, %r85, %r25; mov.b32 %f44, %r86; -BB75_13: +BB75_11: mul.rn.f32 %f7, %f44, %f44; - add.s32 %r36, %r97, 1; - and.b32 %r37, %r36, 1; - setp.eq.s32 %p11, %r37, 0; - @%p11 bra BB75_15; + add.s32 %r29, %r95, 1; + and.b32 %r30, %r29, 1; + setp.eq.s32 %p11, %r30, 0; + @%p11 bra BB75_13; mov.f32 %f30, 0fBAB6061A; mov.f32 %f31, 0f37CCF5CE; fma.rn.f32 %f45, %f31, %f7, %f30; - bra.uni BB75_16; + bra.uni BB75_14; -BB75_15: +BB75_13: mov.f32 %f32, 0f3C08839E; mov.f32 %f33, 0fB94CA1F9; fma.rn.f32 %f45, %f33, %f7, %f32; -BB75_16: - @%p11 bra BB75_18; +BB75_14: + @%p11 bra BB75_16; mov.f32 %f34, 0f3D2AAAA5; fma.rn.f32 %f35, %f45, %f7, %f34; mov.f32 %f36, 0fBF000000; fma.rn.f32 %f46, %f35, %f7, %f36; - bra.uni BB75_19; + bra.uni BB75_17; -BB75_18: +BB75_16: mov.f32 %f37, 0fBE2AAAA3; fma.rn.f32 %f38, %f45, %f7, %f37; mov.f32 %f39, 0f00000000; fma.rn.f32 %f46, %f38, %f7, %f39; -BB75_19: +BB75_17: fma.rn.f32 %f47, %f46, %f44, %f44; - @%p11 bra BB75_21; + @%p11 bra BB75_19; mov.f32 %f40, 0f3F800000; fma.rn.f32 %f47, %f46, %f7, %f40; -BB75_21: - and.b32 %r87, %r36, 2; +BB75_19: + and.b32 %r87, %r29, 2; setp.eq.s32 %p14, %r87, 0; - @%p14 bra BB75_23; + @%p14 bra BB75_21; mov.f32 %f41, 0f00000000; mov.f32 %f42, 0fBF800000; fma.rn.f32 %f47, %f47, %f42, %f41; -BB75_23: +BB75_21: cvta.to.global.u64 %rd16, %rd9; shl.b64 %rd17, %rd1, 2; add.s64 %rd18, %rd16, %rd17; st.global.f32 [%rd18], %f47; -BB75_24: +BB75_22: ret; } @@ -9718,8 +9666,8 @@ BB76_3: bra.uni BB76_4; BB76_2: - setp.gtu.f64 %p3, %fd1, 0d7FF0000000000000; - selp.f64 %fd45, %fd1, 0d7FF0000000000000, %p3; + setp.le.f64 %p3, %fd1, 0d7FF0000000000000; + selp.f64 %fd45, 0d7FF0000000000000, %fd1, %p3; BB76_4: cvta.to.global.u64 %rd7, %rd3; @@ -9777,8 +9725,8 @@ BB76_5: div.approx.f32 %f15, %f14, %f13; mov.f32 %f16, 0f40000000; fma.rn.f32 %f17, %f16, %f13, %f15; - setp.ltu.f32 %p2, %f4, 0f42B40000; - selp.f32 %f18, %f17, 0f7F800000, %p2; + setp.ge.f32 %p2, %f4, 0f42B40000; + selp.f32 %f18, 0f7F800000, %f17, %p2; cvta.to.global.u64 %rd6, %rd2; add.s64 %rd7, %rd6, %rd4; st.global.f32 [%rd7], %f18; @@ -9797,7 +9745,7 @@ BB77_2: .local .align 4 .b8 __local_depot78[4]; .reg .b64 %SP; .reg .b64 %SPL; - .reg .pred %p<6>; + .reg .pred %p<7>; .reg .b32 %r<16>; .reg .f64 %fd<66>; .reg .b64 %rd<14>; @@ -9815,7 +9763,7 @@ BB77_2: mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.u32 %p1, %r1, %r5; - @%p1 bra BB78_9; + @%p1 bra BB78_8; cvta.to.global.u64 %rd6, %rd3; cvt.s64.s32 %rd2, %r1; @@ -9824,23 +9772,24 @@ BB77_2: ld.global.f64 %fd63, [%rd8]; { .reg .b32 %temp; - mov.b64 {%temp, %r9}, %fd63; + mov.b64 {%r9, %temp}, %fd63; } - and.b32 %r10, %r9, 2147483647; - setp.ne.s32 %p2, %r10, 2146435072; - @%p2 bra BB78_4; - { .reg .b32 %temp; - mov.b64 {%r11, %temp}, %fd63; + mov.b64 {%temp, %r10}, %fd63; } - setp.ne.s32 %p3, %r11, 0; - @%p3 bra BB78_4; + and.b32 %r11, %r10, 2147483647; + setp.eq.s32 %p2, %r11, 2146435072; + setp.eq.s32 %p3, %r9, 0; + and.pred %p4, %p2, %p3; + @!%p4 bra BB78_3; + bra.uni BB78_2; +BB78_2: mov.f64 %fd11, 0d0000000000000000; mul.rn.f64 %fd63, %fd63, %fd11; -BB78_4: +BB78_3: mul.f64 %fd12, %fd63, 0d3FE45F306DC9C883; cvt.rni.s32.f64 %r15, %fd12; st.local.u32 [%rd1], %r15; @@ -9857,8 +9806,8 @@ BB78_4: mov.b64 {%temp, %r12}, %fd63; } and.b32 %r13, %r12, 2145386496; - setp.lt.u32 %p4, %r13, 1105199104; - @%p4 bra BB78_6; + setp.lt.u32 %p5, %r13, 1105199104; + @%p5 bra BB78_5; // Callseq Start 5 { @@ -9881,7 +9830,7 @@ BB78_4: }// Callseq End 5 ld.local.u32 %r15, [%rd1]; -BB78_6: +BB78_5: mul.f64 %fd20, %fd64, %fd64; mov.f64 %fd21, 0dBEF9757C5B27EBB1; mov.f64 %fd22, 0d3EE48DAC2799BCB9; @@ -9915,11 +9864,11 @@ BB78_6: mul.f64 %fd7, %fd20, %fd49; fma.rn.f64 %fd65, %fd7, %fd64, %fd64; and.b32 %r14, %r15, 1; - setp.eq.b32 %p5, %r14, 1; - @!%p5 bra BB78_8; - bra.uni BB78_7; + setp.eq.b32 %p6, %r14, 1; + @!%p6 bra BB78_7; + bra.uni BB78_6; -BB78_7: +BB78_6: sub.f64 %fd52, %fd65, %fd64; neg.f64 %fd53, %fd52; fma.rn.f64 %fd54, %fd7, %fd64, %fd53; @@ -9936,13 +9885,13 @@ BB78_7: fma.rn.f64 %fd62, %fd60, %fd54, %fd61; fma.rn.f64 %fd65, %fd62, %fd60, %fd60; -BB78_8: +BB78_7: cvta.to.global.u64 %rd10, %rd4; shl.b64 %rd11, %rd2, 3; add.s64 %rd12, %rd10, %rd11; st.global.f64 [%rd12], %fd65; -BB78_9: +BB78_8: ret; } @@ -9958,7 +9907,7 @@ BB78_9: .reg .b64 %SPL; .reg .pred %p<12>; .reg .f32 %f<33>; - .reg .b32 %r<96>; + .reg .b32 %r<94>; .reg .b64 %rd<22>; @@ -9966,13 +9915,13 @@ BB78_9: cvta.local.u64 %SP, %rd21; ld.param.u64 %rd8, [matrix_tan_f_param_0]; ld.param.u64 %rd9, [matrix_tan_f_param_1]; - ld.param.u32 %r36, [matrix_tan_f_param_2]; - mov.u32 %r37, %ntid.x; - mov.u32 %r38, %ctaid.x; - mov.u32 %r39, %tid.x; - mad.lo.s32 %r1, %r37, %r38, %r39; - setp.ge.u32 %p1, %r1, %r36; - @%p1 bra BB79_16; + ld.param.u32 %r29, [matrix_tan_f_param_2]; + mov.u32 %r30, %ntid.x; + mov.u32 %r31, %ctaid.x; + mov.u32 %r32, %tid.x; + mad.lo.s32 %r1, %r30, %r31, %r32; + setp.ge.u32 %p1, %r1, %r29; + @%p1 bra BB79_14; cvta.to.global.u64 %rd10, %rd8; cvt.s64.s32 %rd1, %r1; @@ -9990,8 +9939,8 @@ BB78_9: BB79_3: mul.f32 %f12, %f30, 0f3F22F983; - cvt.rni.s32.f32 %r95, %f12; - cvt.rn.f32.s32 %f13, %r95; + cvt.rni.s32.f32 %r93, %f12; + cvt.rn.f32.s32 %f13, %r93; neg.f32 %f14, %f13; mov.f32 %f15, 0f3FC90FDA; fma.rn.f32 %f16, %f14, %f15, %f30; @@ -10001,12 +9950,12 @@ BB79_3: fma.rn.f32 %f31, %f14, %f19, %f18; abs.f32 %f20, %f30; setp.leu.f32 %p3, %f20, 0f47CE4780; - @%p3 bra BB79_13; + @%p3 bra BB79_11; mov.b32 %r3, %f30; shr.u32 %r4, %r3, 23; - shl.b32 %r42, %r3, 8; - or.b32 %r5, %r42, -2147483648; + shl.b32 %r35, %r3, 8; + or.b32 %r5, %r35, -2147483648; mov.u32 %r87, 0; mov.u64 %rd19, __cudart_i2opi_f; mov.u32 %r86, -6; @@ -10015,14 +9964,14 @@ BB79_3: BB79_5: .pragma "nounroll"; mov.u64 %rd4, %rd20; - ld.const.u32 %r45, [%rd19]; + ld.const.u32 %r38, [%rd19]; // inline asm { - mad.lo.cc.u32 %r43, %r45, %r5, %r87; - madc.hi.u32 %r87, %r45, %r5, 0; + mad.lo.cc.u32 %r36, %r38, %r5, %r87; + madc.hi.u32 %r87, %r38, %r5, 0; } // inline asm - st.local.u32 [%rd4], %r43; + st.local.u32 [%rd4], %r36; add.s64 %rd5, %rd4, 4; add.s64 %rd19, %rd19, 4; add.s32 %r86, %r86, 1; @@ -10030,14 +9979,14 @@ BB79_5: mov.u64 %rd20, %rd5; @%p4 bra BB79_5; - and.b32 %r48, %r4, 255; - add.s32 %r49, %r48, -128; - shr.u32 %r50, %r49, 5; + and.b32 %r41, %r4, 255; + add.s32 %r42, %r41, -128; + shr.u32 %r43, %r42, 5; and.b32 %r10, %r3, -2147483648; st.local.u32 [%rd2+24], %r87; - mov.u32 %r51, 6; - sub.s32 %r52, %r51, %r50; - mul.wide.s32 %rd15, %r52, 4; + mov.u32 %r44, 6; + sub.s32 %r45, %r44, %r43; + mul.wide.s32 %rd15, %r45, 4; add.s64 %rd7, %rd2, %rd15; ld.local.u32 %r88, [%rd7]; ld.local.u32 %r89, [%rd7+-4]; @@ -10045,67 +9994,65 @@ BB79_5: setp.eq.s32 %p5, %r13, 0; @%p5 bra BB79_8; - mov.u32 %r53, 32; - sub.s32 %r54, %r53, %r13; - shr.u32 %r55, %r89, %r54; - shl.b32 %r56, %r88, %r13; - add.s32 %r88, %r55, %r56; - ld.local.u32 %r57, [%rd7+-8]; - shr.u32 %r58, %r57, %r54; - shl.b32 %r59, %r89, %r13; - add.s32 %r89, %r58, %r59; + mov.u32 %r46, 32; + sub.s32 %r47, %r46, %r13; + shr.u32 %r48, %r89, %r47; + shl.b32
<TRUNCATED>