[SYSTEMML-1038] Implemented the uark+ op for CUDA. Closes #319.
Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/3caae271 Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/3caae271 Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/3caae271 Branch: refs/heads/master Commit: 3caae2718359b2004ba7acabe35386f5c5417fb3 Parents: 154f077 Author: Nakul Jindal <naku...@gmail.com> Authored: Sun Dec 18 11:08:57 2016 -0800 Committer: Niketan Pansare <npan...@us.ibm.com> Committed: Sun Dec 18 11:08:56 2016 -0800 ---------------------------------------------------------------------- src/main/cpp/kernels/SystemML.cu | 83 +- src/main/cpp/kernels/SystemML.ptx | 1302 ++++++++---------- .../java/org/apache/sysml/hops/AggUnaryOp.java | 5 + .../instructions/GPUInstructionParser.java | 6 + .../gpu/AggregateBinaryGPUInstruction.java | 16 +- .../instructions/gpu/GPUInstruction.java | 2 +- .../context/AggregateUnaryGPUInstruction.java | 85 ++ .../gpu/context/ExecutionConfig.java | 17 +- .../runtime/matrix/data/LibMatrixCUDA.java | 714 +++++++--- 9 files changed, 1274 insertions(+), 956 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/3caae271/src/main/cpp/kernels/SystemML.cu ---------------------------------------------------------------------- diff --git a/src/main/cpp/kernels/SystemML.cu b/src/main/cpp/kernels/SystemML.cu index 0c78045..5e5fd5e 100644 --- a/src/main/cpp/kernels/SystemML.cu +++ b/src/main/cpp/kernels/SystemML.cu @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -16,13 +16,14 @@ * specific language governing permissions and limitations * under the License. */ - + /********************************** -When updating a kernel or adding a new one, +When updating a kernel or adding a new one, please compile the ptx file and commit it: -nvcc -ptx SystemML.cu +nvcc -ptx SystemML.cu ***********************************/ + // dim => rlen (Assumption: rlen == clen) // N = length of dense array extern "C" @@ -45,8 +46,8 @@ __device__ double getBoolean(int val) { return 1.0; } -// op = {0=plus, 1=minus, 2=multiply, 3=divide, 4=power, -// 5=less, 6=lessequal, 7=greater, 8=greaterequal, 9=equal, 10=notequal, +// op = {0=plus, 1=minus, 2=multiply, 3=divide, 4=power, +// 5=less, 6=lessequal, 7=greater, 8=greaterequal, 9=equal, 10=notequal, // 11=min, 12=max, 13=and, 14=or, 15=log} extern "C" __device__ double binaryOp(double x, double y, int op) { @@ -61,8 +62,8 @@ __device__ double binaryOp(double x, double y, int op) { return x / y; else if(op == 4) return pow(x, y); - // 5=less, 6=lessequal, 7=greater, 8=greaterequal, 9=equal, 10=notequal, - else if(op == 5) + // 5=less, 6=lessequal, 7=greater, 8=greaterequal, 9=equal, 10=notequal, + else if(op == 5) return getBoolean(x < y); else if(op == 6) return getBoolean(x <= y); @@ -91,7 +92,7 @@ __global__ void dense_matrix_set(double* A, double scalar, int rlen, int clen) int index = ix * clen + iy; if(index < rlen*clen) { A[index] = scalar; - } + } } extern "C" @@ -125,17 +126,17 @@ __global__ void compareAndSet(double* A, double* ret, int rlen, int clen, doubl ret[index] = ifEqualsVal; else if(A[index] < compareVal) ret[index] = ifLessThanVal; - else + else ret[index] = ifGreaterThanVal; } } extern "C" -__global__ 
void binCellOp(double* A, double* B, double* C, +__global__ void binCellOp(double* A, double* B, double* C, int maxRlen, int maxClen, int vectorAStatus, int vectorBStatus, int op) { int ix = blockIdx.x * blockDim.x + threadIdx.x; int iy = blockIdx.y * blockDim.y + threadIdx.y; - + if(ix < maxRlen && iy < maxClen) { int outIndex = ix * maxClen + iy; int aIndex = outIndex; @@ -180,3 +181,59 @@ __global__ void fill(double* A, double scalar, int lenA) { A[index] = scalar; } } + + + +extern "C" +__global__ void reduce(double *g_idata, double *g_odata, unsigned int n) +{ + extern __shared__ double sdata[]; + + // perform first level of reduction, + // reading from global memory, writing to shared memory + unsigned int tid = threadIdx.x; + unsigned int i = blockIdx.x*blockDim.x*2 + threadIdx.x; + unsigned int gridSize = blockDim.x*2*gridDim.x; + + double mySum = 0; + + // we reduce multiple elements per thread. The number is determined by the + // number of active thread blocks (via gridDim). More blocks will result + // in a larger gridSize and therefore fewer elements per thread + while (i < n) + { + mySum += g_idata[i]; + // ensure we don't read out of bounds + if (i + blockDim.x < n) + mySum += g_idata[i+blockDim.x]; + i += gridSize; + } + + // each thread puts its local sum into shared memory + sdata[tid] = mySum; + __syncthreads(); + + + // do reduction in shared mem + if (blockDim.x >= 512) { if (tid < 256) { sdata[tid] = mySum = mySum + sdata[tid + 256]; } __syncthreads(); } + if (blockDim.x >= 256) { if (tid < 128) { sdata[tid] = mySum = mySum + sdata[tid + 128]; } __syncthreads(); } + if (blockDim.x >= 128) { if (tid < 64) { sdata[tid] = mySum = mySum + sdata[tid + 64]; } __syncthreads(); } + + if (tid < 32) + { + // now that we are using warp-synchronous programming (below) + // we need to declare our shared memory volatile so that the compiler + // doesn't reorder stores to it and induce incorrect behavior. 
+ volatile double* smem = sdata; + if (blockDim.x >= 64) { smem[tid] = mySum = mySum + smem[tid + 32]; } + if (blockDim.x >= 32) { smem[tid] = mySum = mySum + smem[tid + 16]; } + if (blockDim.x >= 16) { smem[tid] = mySum = mySum + smem[tid + 8]; } + if (blockDim.x >= 8) { smem[tid] = mySum = mySum + smem[tid + 4]; } + if (blockDim.x >= 4) { smem[tid] = mySum = mySum + smem[tid + 2]; } + if (blockDim.x >= 2) { smem[tid] = mySum = mySum + smem[tid + 1]; } + } + + // write result for this block to global mem + if (tid == 0) + g_odata[blockIdx.x] = sdata[0]; +} http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/3caae271/src/main/cpp/kernels/SystemML.ptx ---------------------------------------------------------------------- diff --git a/src/main/cpp/kernels/SystemML.ptx b/src/main/cpp/kernels/SystemML.ptx index b21e18c..ea27ac0 100644 --- a/src/main/cpp/kernels/SystemML.ptx +++ b/src/main/cpp/kernels/SystemML.ptx @@ -1,323 +1,24 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
// // Generated by NVIDIA NVVM Compiler // -// Compiler Build ID: CL-19805474 -// Cuda compilation tools, release 7.5, V7.5.16 +// Compiler Build ID: CL-21124049 +// Cuda compilation tools, release 8.0, V8.0.44 // Based on LLVM 3.4svn // -.version 4.3 +.version 5.0 .target sm_20 .address_size 64 - // .globl getBoolean + // .globl copyUpperToLowerTriangleDense .func (.param .b64 func_retval0) __internal_accurate_pow ( .param .b64 __internal_accurate_pow_param_0, .param .b64 __internal_accurate_pow_param_1 ) ; +.extern .shared .align 8 .b8 sdata[]; -.visible .func (.param .b64 func_retval0) getBoolean( - .param .b32 getBoolean_param_0 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<2>; - .reg .f64 %fd<2>; - - - ld.param.u32 %r1, [getBoolean_param_0]; - setp.eq.s32 %p1, %r1, 0; - selp.f64 %fd1, 0d0000000000000000, 0d3FF0000000000000, %p1; - st.param.f64 [func_retval0+0], %fd1; - ret; -} - - // .globl binaryOp -.visible .func (.param .b64 func_retval0) binaryOp( - .param .b64 binaryOp_param_0, - .param .b64 binaryOp_param_1, - .param .b32 binaryOp_param_2 -) -{ - .reg .pred %p<39>; - .reg .b32 %r<26>; - .reg .f64 %fd<39>; - .reg .b64 %rd<3>; - - - ld.param.f64 %fd27, [binaryOp_param_0]; - ld.param.f64 %fd28, [binaryOp_param_1]; - ld.param.u32 %r3, [binaryOp_param_2]; - setp.eq.s32 %p2, %r3, 0; - @%p2 bra BB1_38; - - setp.eq.s32 %p3, %r3, 1; - @%p3 bra BB1_37; - bra.uni BB1_2; - -BB1_37: - sub.f64 %fd38, %fd27, %fd28; - bra.uni BB1_39; - -BB1_38: - add.f64 %fd38, %fd27, %fd28; - bra.uni BB1_39; - -BB1_2: - setp.eq.s32 %p4, %r3, 2; - @%p4 bra BB1_36; - bra.uni BB1_3; - -BB1_36: - mul.f64 %fd38, %fd27, %fd28; - bra.uni BB1_39; - -BB1_3: - setp.eq.s32 %p5, %r3, 3; - @%p5 bra BB1_35; - bra.uni BB1_4; - -BB1_35: - div.rn.f64 %fd38, %fd27, %fd28; - bra.uni BB1_39; - -BB1_4: - setp.eq.s32 %p6, %r3, 4; - @%p6 bra BB1_21; - bra.uni BB1_5; - -BB1_21: - { - .reg .b32 %temp; - mov.b64 {%temp, %r1}, %fd27; - } - { - .reg .b32 %temp; - mov.b64 {%temp, %r2}, %fd28; - } - bfe.u32 %r4, %r2, 
20, 11; - add.s32 %r5, %r4, -1012; - mov.b64 %rd2, %fd28; - shl.b64 %rd1, %rd2, %r5; - setp.eq.s64 %p21, %rd1, -9223372036854775808; - abs.f64 %fd9, %fd27; - // Callseq Start 0 - { - .reg .b32 temp_param_reg; - // <end>} - .param .b64 param0; - st.param.f64 [param0+0], %fd9; - .param .b64 param1; - st.param.f64 [param1+0], %fd28; - .param .b64 retval0; - call.uni (retval0), - __internal_accurate_pow, - ( - param0, - param1 - ); - ld.param.f64 %fd37, [retval0+0]; - - //{ - }// Callseq End 0 - setp.lt.s32 %p22, %r1, 0; - and.pred %p1, %p22, %p21; - @!%p1 bra BB1_23; - bra.uni BB1_22; - -BB1_22: - { - .reg .b32 %temp; - mov.b64 {%temp, %r6}, %fd37; - } - xor.b32 %r7, %r6, -2147483648; - { - .reg .b32 %temp; - mov.b64 {%r8, %temp}, %fd37; - } - mov.b64 %fd37, {%r8, %r7}; - -BB1_23: - mov.f64 %fd36, %fd37; - setp.eq.f64 %p23, %fd27, 0d0000000000000000; - @%p23 bra BB1_26; - bra.uni BB1_24; - -BB1_26: - selp.b32 %r9, %r1, 0, %p21; - or.b32 %r10, %r9, 2146435072; - setp.lt.s32 %p27, %r2, 0; - selp.b32 %r11, %r10, %r9, %p27; - mov.u32 %r12, 0; - mov.b64 %fd36, {%r12, %r11}; - bra.uni BB1_27; - -BB1_5: - setp.eq.s32 %p7, %r3, 5; - @%p7 bra BB1_20; - bra.uni BB1_6; - -BB1_20: - setp.lt.f64 %p20, %fd27, %fd28; - selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p20; - bra.uni BB1_39; - -BB1_6: - setp.eq.s32 %p8, %r3, 6; - @%p8 bra BB1_19; - bra.uni BB1_7; - -BB1_19: - setp.le.f64 %p19, %fd27, %fd28; - selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p19; - bra.uni BB1_39; - -BB1_24: - setp.gt.s32 %p24, %r1, -1; - @%p24 bra BB1_27; - - cvt.rzi.f64.f64 %fd30, %fd28; - setp.neu.f64 %p25, %fd30, %fd28; - selp.f64 %fd36, 0dFFF8000000000000, %fd36, %p25; - -BB1_27: - mov.f64 %fd15, %fd36; - add.f64 %fd16, %fd27, %fd28; - { - .reg .b32 %temp; - mov.b64 {%temp, %r13}, %fd16; - } - and.b32 %r14, %r13, 2146435072; - setp.ne.s32 %p28, %r14, 2146435072; - mov.f64 %fd35, %fd15; - @%p28 bra BB1_34; - - setp.gtu.f64 %p29, %fd9, 0d7FF0000000000000; - mov.f64 %fd35, %fd16; - 
@%p29 bra BB1_34; - - abs.f64 %fd17, %fd28; - setp.gtu.f64 %p30, %fd17, 0d7FF0000000000000; - mov.f64 %fd34, %fd16; - mov.f64 %fd35, %fd34; - @%p30 bra BB1_34; - - setp.eq.f64 %p31, %fd17, 0d7FF0000000000000; - @%p31 bra BB1_33; - bra.uni BB1_31; - -BB1_33: - setp.gt.f64 %p33, %fd9, 0d3FF0000000000000; - selp.b32 %r21, 2146435072, 0, %p33; - xor.b32 %r22, %r21, 2146435072; - setp.lt.s32 %p34, %r2, 0; - selp.b32 %r23, %r22, %r21, %p34; - setp.eq.f64 %p35, %fd27, 0dBFF0000000000000; - selp.b32 %r24, 1072693248, %r23, %p35; - mov.u32 %r25, 0; - mov.b64 %fd35, {%r25, %r24}; - bra.uni BB1_34; - -BB1_7: - setp.eq.s32 %p9, %r3, 7; - @%p9 bra BB1_18; - bra.uni BB1_8; - -BB1_18: - setp.gt.f64 %p18, %fd27, %fd28; - selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p18; - bra.uni BB1_39; - -BB1_8: - setp.eq.s32 %p10, %r3, 8; - @%p10 bra BB1_17; - bra.uni BB1_9; - -BB1_17: - setp.ge.f64 %p17, %fd27, %fd28; - selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p17; - bra.uni BB1_39; - -BB1_9: - setp.eq.s32 %p11, %r3, 9; - @%p11 bra BB1_16; - bra.uni BB1_10; - -BB1_16: - setp.eq.f64 %p16, %fd27, %fd28; - selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p16; - bra.uni BB1_39; - -BB1_31: - setp.neu.f64 %p32, %fd9, 0d7FF0000000000000; - mov.f64 %fd35, %fd15; - @%p32 bra BB1_34; - - shr.s32 %r15, %r2, 31; - and.b32 %r16, %r15, -2146435072; - add.s32 %r17, %r16, 2146435072; - or.b32 %r18, %r17, -2147483648; - selp.b32 %r19, %r18, %r17, %p1; - mov.u32 %r20, 0; - mov.b64 %fd35, {%r20, %r19}; - -BB1_34: - setp.eq.f64 %p36, %fd28, 0d0000000000000000; - setp.eq.f64 %p37, %fd27, 0d3FF0000000000000; - or.pred %p38, %p37, %p36; - selp.f64 %fd38, 0d3FF0000000000000, %fd35, %p38; - -BB1_39: - st.param.f64 [func_retval0+0], %fd38; - ret; - -BB1_10: - setp.eq.s32 %p12, %r3, 10; - @%p12 bra BB1_15; - bra.uni BB1_11; - -BB1_15: - setp.neu.f64 %p15, %fd27, %fd28; - selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p15; - bra.uni BB1_39; - -BB1_11: - setp.eq.s32 %p13, %r3, 
11; - @%p13 bra BB1_14; - bra.uni BB1_12; - -BB1_14: - min.f64 %fd38, %fd27, %fd28; - bra.uni BB1_39; - -BB1_12: - mov.f64 %fd38, 0dC08F380000000000; - setp.ne.s32 %p14, %r3, 12; - @%p14 bra BB1_39; - - max.f64 %fd38, %fd27, %fd28; - bra.uni BB1_39; -} - - // .globl copyUpperToLowerTriangleDense .visible .entry copyUpperToLowerTriangleDense( .param .u64 copyUpperToLowerTriangleDense_param_0, .param .u32 copyUpperToLowerTriangleDense_param_1, @@ -345,10 +46,10 @@ BB1_12: setp.gt.s32 %p1, %r2, %r1; setp.lt.s32 %p2, %r3, %r5; and.pred %p3, %p1, %p2; - @!%p3 bra BB2_2; - bra.uni BB2_1; + @!%p3 bra BB0_2; + bra.uni BB0_1; -BB2_1: +BB0_1: cvta.to.global.u64 %rd2, %rd1; mad.lo.s32 %r12, %r1, %r4, %r2; mul.wide.s32 %rd3, %r12, 8; @@ -358,7 +59,7 @@ BB2_1: add.s64 %rd6, %rd2, %rd5; st.global.f64 [%rd6], %fd1; -BB2_2: +BB0_2: ret; } @@ -391,14 +92,14 @@ BB2_2: mad.lo.s32 %r1, %r8, %r9, %r11; mul.lo.s32 %r12, %r3, %r2; setp.ge.s32 %p1, %r1, %r12; - @%p1 bra BB3_2; + @%p1 bra BB1_2; cvta.to.global.u64 %rd2, %rd1; mul.wide.s32 %rd3, %r1, 8; add.s64 %rd4, %rd2, %rd3; st.global.f64 [%rd4], %fd1; -BB3_2: +BB1_2: ret; } @@ -432,10 +133,10 @@ BB3_2: setp.lt.s32 %p1, %r7, %r2; setp.lt.s32 %p2, %r11, %r3; and.pred %p3, %p1, %p2; - @!%p3 bra BB4_2; - bra.uni BB4_1; + @!%p3 bra BB2_2; + bra.uni BB2_1; -BB4_1: +BB2_1: cvta.to.global.u64 %rd3, %rd1; mul.wide.s32 %rd4, %r1, 8; add.s64 %rd5, %rd3, %rd4; @@ -444,7 +145,7 @@ BB4_1: add.s64 %rd7, %rd6, %rd4; st.global.f64 [%rd7], %fd1; -BB4_2: +BB2_2: ret; } @@ -477,10 +178,10 @@ BB4_2: setp.lt.s32 %p1, %r1, %r4; setp.lt.s32 %p2, %r2, %r3; and.pred %p3, %p1, %p2; - @!%p3 bra BB5_2; - bra.uni BB5_1; + @!%p3 bra BB3_2; + bra.uni BB3_1; -BB5_1: +BB3_1: cvta.to.global.u64 %rd3, %rd1; mad.lo.s32 %r11, %r1, %r3, %r2; mul.wide.s32 %rd4, %r11, 8; @@ -492,7 +193,7 @@ BB5_1: add.s64 %rd7, %rd6, %rd4; st.global.f64 [%rd7], %fd3; -BB5_2: +BB3_2: ret; } @@ -536,10 +237,10 @@ BB5_2: setp.lt.s32 %p1, %r7, %r2; setp.lt.s32 %p2, %r11, %r3; and.pred %p3, %p1, 
%p2; - @!%p3 bra BB6_6; - bra.uni BB6_1; + @!%p3 bra BB4_6; + bra.uni BB4_1; -BB6_1: +BB4_1: cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 8; add.s64 %rd6, %rd4, %rd5; @@ -549,26 +250,26 @@ BB6_1: setp.lt.f64 %p4, %fd8, %fd3; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd1, %rd7, %rd5; - @%p4 bra BB6_5; - bra.uni BB6_2; + @%p4 bra BB4_5; + bra.uni BB4_2; -BB6_5: +BB4_5: st.global.f64 [%rd1], %fd4; - bra.uni BB6_6; + bra.uni BB4_6; -BB6_2: +BB4_2: setp.lt.f64 %p5, %fd1, %fd2; - @%p5 bra BB6_4; - bra.uni BB6_3; + @%p5 bra BB4_4; + bra.uni BB4_3; -BB6_4: +BB4_4: st.global.f64 [%rd1], %fd5; - bra.uni BB6_6; + bra.uni BB4_6; -BB6_3: +BB4_3: st.global.f64 [%rd1], %fd6; -BB6_6: +BB4_6: ret; } @@ -585,7 +286,7 @@ BB6_6: ) { .reg .pred %p<50>; - .reg .b32 %r<52>; + .reg .b32 %r<51>; .reg .f64 %fd<39>; .reg .b64 %rd<15>; @@ -609,93 +310,93 @@ BB6_6: setp.lt.s32 %p2, %r1, %r14; setp.lt.s32 %p3, %r2, %r10; and.pred %p4, %p2, %p3; - @!%p4 bra BB7_53; - bra.uni BB7_1; + @!%p4 bra BB5_53; + bra.uni BB5_1; -BB7_1: +BB5_1: mad.lo.s32 %r3, %r1, %r10, %r2; setp.eq.s32 %p5, %r11, 1; - mov.u32 %r50, %r1; - @%p5 bra BB7_5; + mov.u32 %r49, %r1; + @%p5 bra BB5_5; setp.ne.s32 %p6, %r11, 2; - mov.u32 %r51, %r3; - @%p6 bra BB7_4; + mov.u32 %r50, %r3; + @%p6 bra BB5_4; - mov.u32 %r51, %r2; + mov.u32 %r50, %r2; -BB7_4: - mov.u32 %r45, %r51; - mov.u32 %r4, %r45; - mov.u32 %r50, %r4; +BB5_4: + mov.u32 %r44, %r50; + mov.u32 %r4, %r44; + mov.u32 %r49, %r4; -BB7_5: - mov.u32 %r5, %r50; +BB5_5: + mov.u32 %r5, %r49; setp.eq.s32 %p7, %r12, 1; - mov.u32 %r48, %r1; - @%p7 bra BB7_9; + mov.u32 %r47, %r1; + @%p7 bra BB5_9; setp.ne.s32 %p8, %r12, 2; - mov.u32 %r49, %r3; - @%p8 bra BB7_8; + mov.u32 %r48, %r3; + @%p8 bra BB5_8; - mov.u32 %r49, %r2; + mov.u32 %r48, %r2; -BB7_8: - mov.u32 %r48, %r49; +BB5_8: + mov.u32 %r47, %r48; -BB7_9: +BB5_9: cvta.to.global.u64 %rd5, %rd3; cvta.to.global.u64 %rd6, %rd2; mul.wide.s32 %rd7, %r5, 8; add.s64 %rd8, %rd6, %rd7; ld.global.f64 %fd1, [%rd8]; - mul.wide.s32 
%rd9, %r48, 8; + mul.wide.s32 %rd9, %r47, 8; add.s64 %rd10, %rd5, %rd9; ld.global.f64 %fd2, [%rd10]; mov.f64 %fd38, 0dC08F380000000000; setp.gt.s32 %p9, %r13, 5; - @%p9 bra BB7_19; + @%p9 bra BB5_19; setp.gt.s32 %p19, %r13, 2; - @%p19 bra BB7_15; + @%p19 bra BB5_15; setp.eq.s32 %p23, %r13, 0; - @%p23 bra BB7_51; + @%p23 bra BB5_51; setp.eq.s32 %p24, %r13, 1; - @%p24 bra BB7_50; - bra.uni BB7_13; + @%p24 bra BB5_50; + bra.uni BB5_13; -BB7_50: +BB5_50: sub.f64 %fd38, %fd1, %fd2; - bra.uni BB7_52; + bra.uni BB5_52; -BB7_19: +BB5_19: setp.gt.s32 %p10, %r13, 8; - @%p10 bra BB7_24; + @%p10 bra BB5_24; setp.eq.s32 %p16, %r13, 6; - @%p16 bra BB7_34; + @%p16 bra BB5_34; setp.eq.s32 %p17, %r13, 7; - @%p17 bra BB7_33; - bra.uni BB7_22; + @%p17 bra BB5_33; + bra.uni BB5_22; -BB7_33: +BB5_33: setp.gt.f64 %p29, %fd1, %fd2; selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p29; - bra.uni BB7_52; + bra.uni BB5_52; -BB7_15: +BB5_15: setp.eq.s32 %p20, %r13, 3; - @%p20 bra BB7_49; + @%p20 bra BB5_49; setp.eq.s32 %p21, %r13, 4; - @%p21 bra BB7_35; - bra.uni BB7_17; + @%p21 bra BB5_35; + bra.uni BB5_17; -BB7_35: +BB5_35: { .reg .b32 %temp; mov.b64 {%temp, %r8}, %fd1; @@ -710,7 +411,7 @@ BB7_35: shl.b64 %rd1, %rd11, %r22; setp.eq.s64 %p32, %rd1, -9223372036854775808; abs.f64 %fd11, %fd1; - // Callseq Start 1 + // Callseq Start 0 { .reg .b32 temp_param_reg; // <end>} @@ -728,13 +429,13 @@ BB7_35: ld.param.f64 %fd37, [retval0+0]; //{ - }// Callseq End 1 + }// Callseq End 0 setp.lt.s32 %p33, %r8, 0; and.pred %p1, %p33, %p32; - @!%p1 bra BB7_37; - bra.uni BB7_36; + @!%p1 bra BB5_37; + bra.uni BB5_36; -BB7_36: +BB5_36: { .reg .b32 %temp; mov.b64 {%temp, %r23}, %fd37; @@ -746,111 +447,111 @@ BB7_36: } mov.b64 %fd37, {%r25, %r24}; -BB7_37: +BB5_37: mov.f64 %fd36, %fd37; setp.eq.f64 %p34, %fd1, 0d0000000000000000; - @%p34 bra BB7_40; - bra.uni BB7_38; + @%p34 bra BB5_40; + bra.uni BB5_38; -BB7_40: +BB5_40: selp.b32 %r26, %r8, 0, %p32; or.b32 %r27, %r26, 2146435072; setp.lt.s32 %p38, %r9, 
0; selp.b32 %r28, %r27, %r26, %p38; mov.u32 %r29, 0; mov.b64 %fd36, {%r29, %r28}; - bra.uni BB7_41; + bra.uni BB5_41; -BB7_24: +BB5_24: setp.gt.s32 %p11, %r13, 10; - @%p11 bra BB7_28; + @%p11 bra BB5_28; setp.eq.s32 %p14, %r13, 9; - @%p14 bra BB7_32; - bra.uni BB7_26; + @%p14 bra BB5_32; + bra.uni BB5_26; -BB7_32: +BB5_32: setp.eq.f64 %p27, %fd1, %fd2; selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p27; - bra.uni BB7_52; + bra.uni BB5_52; -BB7_28: +BB5_28: setp.eq.s32 %p12, %r13, 11; - @%p12 bra BB7_31; - bra.uni BB7_29; + @%p12 bra BB5_31; + bra.uni BB5_29; -BB7_31: +BB5_31: min.f64 %fd38, %fd1, %fd2; - bra.uni BB7_52; + bra.uni BB5_52; -BB7_51: +BB5_51: add.f64 %fd38, %fd1, %fd2; - bra.uni BB7_52; + bra.uni BB5_52; -BB7_13: +BB5_13: setp.eq.s32 %p25, %r13, 2; - @%p25 bra BB7_14; - bra.uni BB7_52; + @%p25 bra BB5_14; + bra.uni BB5_52; -BB7_14: +BB5_14: mul.f64 %fd38, %fd1, %fd2; - bra.uni BB7_52; + bra.uni BB5_52; -BB7_34: +BB5_34: setp.le.f64 %p30, %fd1, %fd2; selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p30; - bra.uni BB7_52; + bra.uni BB5_52; -BB7_22: +BB5_22: setp.eq.s32 %p18, %r13, 8; - @%p18 bra BB7_23; - bra.uni BB7_52; + @%p18 bra BB5_23; + bra.uni BB5_52; -BB7_23: +BB5_23: setp.ge.f64 %p28, %fd1, %fd2; selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p28; - bra.uni BB7_52; + bra.uni BB5_52; -BB7_49: +BB5_49: div.rn.f64 %fd38, %fd1, %fd2; - bra.uni BB7_52; + bra.uni BB5_52; -BB7_17: +BB5_17: setp.eq.s32 %p22, %r13, 5; - @%p22 bra BB7_18; - bra.uni BB7_52; + @%p22 bra BB5_18; + bra.uni BB5_52; -BB7_18: +BB5_18: setp.lt.f64 %p31, %fd1, %fd2; selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p31; - bra.uni BB7_52; + bra.uni BB5_52; -BB7_26: +BB5_26: setp.eq.s32 %p15, %r13, 10; - @%p15 bra BB7_27; - bra.uni BB7_52; + @%p15 bra BB5_27; + bra.uni BB5_52; -BB7_27: +BB5_27: setp.neu.f64 %p26, %fd1, %fd2; selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p26; - bra.uni BB7_52; + bra.uni BB5_52; -BB7_29: +BB5_29: 
setp.ne.s32 %p13, %r13, 12; - @%p13 bra BB7_52; + @%p13 bra BB5_52; max.f64 %fd38, %fd1, %fd2; - bra.uni BB7_52; + bra.uni BB5_52; -BB7_38: +BB5_38: setp.gt.s32 %p35, %r8, -1; - @%p35 bra BB7_41; + @%p35 bra BB5_41; cvt.rzi.f64.f64 %fd30, %fd2; setp.neu.f64 %p36, %fd30, %fd2; selp.f64 %fd36, 0dFFF8000000000000, %fd36, %p36; -BB7_41: +BB5_41: mov.f64 %fd17, %fd36; add.f64 %fd18, %fd1, %fd2; { @@ -860,60 +561,59 @@ BB7_41: and.b32 %r31, %r30, 2146435072; setp.ne.s32 %p39, %r31, 2146435072; mov.f64 %fd35, %fd17; - @%p39 bra BB7_48; + @%p39 bra BB5_48; setp.gtu.f64 %p40, %fd11, 0d7FF0000000000000; mov.f64 %fd35, %fd18; - @%p40 bra BB7_48; + @%p40 bra BB5_48; abs.f64 %fd19, %fd2; setp.gtu.f64 %p41, %fd19, 0d7FF0000000000000; mov.f64 %fd34, %fd18; mov.f64 %fd35, %fd34; - @%p41 bra BB7_48; + @%p41 bra BB5_48; setp.eq.f64 %p42, %fd19, 0d7FF0000000000000; - @%p42 bra BB7_47; - bra.uni BB7_45; + @%p42 bra BB5_47; + bra.uni BB5_45; -BB7_47: +BB5_47: setp.gt.f64 %p44, %fd11, 0d3FF0000000000000; - selp.b32 %r38, 2146435072, 0, %p44; - xor.b32 %r39, %r38, 2146435072; + selp.b32 %r37, 2146435072, 0, %p44; + xor.b32 %r38, %r37, 2146435072; setp.lt.s32 %p45, %r9, 0; - selp.b32 %r40, %r39, %r38, %p45; + selp.b32 %r39, %r38, %r37, %p45; setp.eq.f64 %p46, %fd1, 0dBFF0000000000000; - selp.b32 %r41, 1072693248, %r40, %p46; - mov.u32 %r42, 0; - mov.b64 %fd35, {%r42, %r41}; - bra.uni BB7_48; + selp.b32 %r40, 1072693248, %r39, %p46; + mov.u32 %r41, 0; + mov.b64 %fd35, {%r41, %r40}; + bra.uni BB5_48; -BB7_45: +BB5_45: setp.neu.f64 %p43, %fd11, 0d7FF0000000000000; mov.f64 %fd35, %fd17; - @%p43 bra BB7_48; + @%p43 bra BB5_48; shr.s32 %r32, %r9, 31; and.b32 %r33, %r32, -2146435072; - add.s32 %r34, %r33, 2146435072; - or.b32 %r35, %r34, -2147483648; - selp.b32 %r36, %r35, %r34, %p1; - mov.u32 %r37, 0; - mov.b64 %fd35, {%r37, %r36}; + selp.b32 %r34, -1048576, 2146435072, %p1; + add.s32 %r35, %r34, %r33; + mov.u32 %r36, 0; + mov.b64 %fd35, {%r36, %r35}; -BB7_48: +BB5_48: setp.eq.f64 %p47, %fd2, 
0d0000000000000000; setp.eq.f64 %p48, %fd1, 0d3FF0000000000000; or.pred %p49, %p48, %p47; selp.f64 %fd38, 0d3FF0000000000000, %fd35, %p49; -BB7_52: +BB5_52: cvta.to.global.u64 %rd12, %rd4; mul.wide.s32 %rd13, %r3, 8; add.s64 %rd14, %rd12, %rd13; st.global.f64 [%rd14], %fd38; -BB7_53: +BB5_53: ret; } @@ -929,7 +629,7 @@ BB7_53: ) { .reg .pred %p<85>; - .reg .b32 %r<63>; + .reg .b32 %r<61>; .reg .f64 %fd<75>; .reg .b64 %rd<12>; @@ -952,7 +652,7 @@ BB7_53: mad.lo.s32 %r1, %r14, %r15, %r17; mul.lo.s32 %r18, %r9, %r8; setp.ge.s32 %p3, %r1, %r18; - @%p3 bra BB8_88; + @%p3 bra BB6_88; cvta.to.global.u64 %rd6, %rd5; cvta.to.global.u64 %rd7, %rd4; @@ -961,178 +661,178 @@ BB7_53: ld.global.f64 %fd1, [%rd9]; add.s64 %rd1, %rd6, %rd8; setp.eq.s32 %p4, %r7, 0; - @%p4 bra BB8_45; + @%p4 bra BB6_45; setp.eq.s32 %p5, %r6, 0; - @%p5 bra BB8_43; + @%p5 bra BB6_43; mov.f64 %fd66, 0dC08F380000000000; setp.gt.s32 %p6, %r6, 6; - @%p6 bra BB8_13; + @%p6 bra BB6_13; setp.gt.s32 %p14, %r6, 3; - @%p14 bra BB8_9; + @%p14 bra BB6_9; setp.eq.s32 %p18, %r6, 1; - @%p18 bra BB8_42; + @%p18 bra BB6_42; setp.eq.s32 %p19, %r6, 2; - @%p19 bra BB8_41; - bra.uni BB8_7; + @%p19 bra BB6_41; + bra.uni BB6_7; -BB8_41: +BB6_41: mul.f64 %fd66, %fd1, %fd54; - bra.uni BB8_44; + bra.uni BB6_44; -BB8_45: +BB6_45: setp.eq.s32 %p45, %r6, 0; - @%p45 bra BB8_86; + @%p45 bra BB6_86; mov.f64 %fd74, 0dC08F380000000000; setp.gt.s32 %p46, %r6, 6; - @%p46 bra BB8_56; + @%p46 bra BB6_56; setp.gt.s32 %p54, %r6, 3; - @%p54 bra BB8_52; + @%p54 bra BB6_52; setp.eq.s32 %p58, %r6, 1; - @%p58 bra BB8_85; + @%p58 bra BB6_85; setp.eq.s32 %p59, %r6, 2; - @%p59 bra BB8_84; - bra.uni BB8_50; + @%p59 bra BB6_84; + bra.uni BB6_50; -BB8_84: +BB6_84: mul.f64 %fd74, %fd1, %fd54; - bra.uni BB8_87; + bra.uni BB6_87; -BB8_43: +BB6_43: add.f64 %fd66, %fd1, %fd54; -BB8_44: +BB6_44: st.global.f64 [%rd1], %fd66; - bra.uni BB8_88; + bra.uni BB6_88; -BB8_13: +BB6_13: setp.gt.s32 %p7, %r6, 9; - @%p7 bra BB8_18; + @%p7 bra BB6_18; setp.eq.s32 %p11, 
%r6, 7; - @%p11 bra BB8_25; + @%p11 bra BB6_25; setp.eq.s32 %p12, %r6, 8; - @%p12 bra BB8_24; - bra.uni BB8_16; + @%p12 bra BB6_24; + bra.uni BB6_16; -BB8_24: +BB6_24: setp.le.f64 %p23, %fd1, %fd54; selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p23; - bra.uni BB8_44; + bra.uni BB6_44; -BB8_86: +BB6_86: add.f64 %fd74, %fd1, %fd54; -BB8_87: +BB6_87: st.global.f64 [%rd1], %fd74; -BB8_88: +BB6_88: ret; -BB8_56: +BB6_56: setp.gt.s32 %p47, %r6, 9; - @%p47 bra BB8_61; + @%p47 bra BB6_61; setp.eq.s32 %p51, %r6, 7; - @%p51 bra BB8_68; + @%p51 bra BB6_68; setp.eq.s32 %p52, %r6, 8; - @%p52 bra BB8_67; - bra.uni BB8_59; + @%p52 bra BB6_67; + bra.uni BB6_59; -BB8_67: +BB6_67: setp.ge.f64 %p63, %fd1, %fd54; selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p63; - bra.uni BB8_87; + bra.uni BB6_87; -BB8_9: +BB6_9: setp.eq.s32 %p15, %r6, 4; - @%p15 bra BB8_27; + @%p15 bra BB6_27; setp.eq.s32 %p16, %r6, 5; - @%p16 bra BB8_26; - bra.uni BB8_11; + @%p16 bra BB6_26; + bra.uni BB6_11; -BB8_26: +BB6_26: setp.gt.f64 %p26, %fd1, %fd54; selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p26; - bra.uni BB8_44; + bra.uni BB6_44; -BB8_18: +BB6_18: setp.eq.s32 %p8, %r6, 10; - @%p8 bra BB8_23; + @%p8 bra BB6_23; setp.eq.s32 %p9, %r6, 11; - @%p9 bra BB8_22; - bra.uni BB8_20; + @%p9 bra BB6_22; + bra.uni BB6_20; -BB8_22: +BB6_22: min.f64 %fd66, %fd54, %fd1; - bra.uni BB8_44; + bra.uni BB6_44; -BB8_52: +BB6_52: setp.eq.s32 %p55, %r6, 4; - @%p55 bra BB8_70; + @%p55 bra BB6_70; setp.eq.s32 %p56, %r6, 5; - @%p56 bra BB8_69; - bra.uni BB8_54; + @%p56 bra BB6_69; + bra.uni BB6_54; -BB8_69: +BB6_69: setp.lt.f64 %p66, %fd1, %fd54; selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p66; - bra.uni BB8_87; + bra.uni BB6_87; -BB8_61: +BB6_61: setp.eq.s32 %p48, %r6, 10; - @%p48 bra BB8_66; + @%p48 bra BB6_66; setp.eq.s32 %p49, %r6, 11; - @%p49 bra BB8_65; - bra.uni BB8_63; + @%p49 bra BB6_65; + bra.uni BB6_63; -BB8_65: +BB6_65: min.f64 %fd74, %fd1, %fd54; - bra.uni BB8_87; + 
bra.uni BB6_87; -BB8_42: +BB6_42: sub.f64 %fd66, %fd54, %fd1; - bra.uni BB8_44; + bra.uni BB6_44; -BB8_7: +BB6_7: setp.eq.s32 %p20, %r6, 3; - @%p20 bra BB8_8; - bra.uni BB8_44; + @%p20 bra BB6_8; + bra.uni BB6_44; -BB8_8: +BB6_8: div.rn.f64 %fd66, %fd54, %fd1; - bra.uni BB8_44; + bra.uni BB6_44; -BB8_25: +BB6_25: setp.lt.f64 %p24, %fd1, %fd54; selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p24; - bra.uni BB8_44; + bra.uni BB6_44; -BB8_16: +BB6_16: setp.eq.s32 %p13, %r6, 9; - @%p13 bra BB8_17; - bra.uni BB8_44; + @%p13 bra BB6_17; + bra.uni BB6_44; -BB8_17: +BB6_17: setp.eq.f64 %p22, %fd1, %fd54; selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p22; - bra.uni BB8_44; + bra.uni BB6_44; -BB8_27: +BB6_27: { .reg .b32 %temp; mov.b64 {%temp, %r2}, %fd54; @@ -1147,7 +847,7 @@ BB8_27: shl.b64 %rd2, %rd10, %r20; setp.eq.s64 %p27, %rd2, -9223372036854775808; abs.f64 %fd10, %fd54; - // Callseq Start 2 + // Callseq Start 1 { .reg .b32 temp_param_reg; // <end>} @@ -1165,13 +865,13 @@ BB8_27: ld.param.f64 %fd65, [retval0+0]; //{ - }// Callseq End 2 + }// Callseq End 1 setp.lt.s32 %p28, %r2, 0; and.pred %p1, %p28, %p27; - @!%p1 bra BB8_29; - bra.uni BB8_28; + @!%p1 bra BB6_29; + bra.uni BB6_28; -BB8_28: +BB6_28: { .reg .b32 %temp; mov.b64 {%temp, %r21}, %fd65; @@ -1183,72 +883,72 @@ BB8_28: } mov.b64 %fd65, {%r23, %r22}; -BB8_29: +BB6_29: mov.f64 %fd64, %fd65; setp.eq.f64 %p29, %fd54, 0d0000000000000000; - @%p29 bra BB8_32; - bra.uni BB8_30; + @%p29 bra BB6_32; + bra.uni BB6_30; -BB8_32: +BB6_32: selp.b32 %r24, %r2, 0, %p27; or.b32 %r25, %r24, 2146435072; setp.lt.s32 %p33, %r3, 0; selp.b32 %r26, %r25, %r24, %p33; mov.u32 %r27, 0; mov.b64 %fd64, {%r27, %r26}; - bra.uni BB8_33; + bra.uni BB6_33; -BB8_11: +BB6_11: setp.eq.s32 %p17, %r6, 6; - @%p17 bra BB8_12; - bra.uni BB8_44; + @%p17 bra BB6_12; + bra.uni BB6_44; -BB8_12: +BB6_12: setp.ge.f64 %p25, %fd1, %fd54; selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p25; - bra.uni BB8_44; + bra.uni BB6_44; 
-BB8_23: +BB6_23: setp.neu.f64 %p21, %fd1, %fd54; selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p21; - bra.uni BB8_44; + bra.uni BB6_44; -BB8_20: +BB6_20: setp.ne.s32 %p10, %r6, 12; - @%p10 bra BB8_44; + @%p10 bra BB6_44; max.f64 %fd66, %fd54, %fd1; - bra.uni BB8_44; + bra.uni BB6_44; -BB8_85: +BB6_85: sub.f64 %fd74, %fd1, %fd54; - bra.uni BB8_87; + bra.uni BB6_87; -BB8_50: +BB6_50: setp.eq.s32 %p60, %r6, 3; - @%p60 bra BB8_51; - bra.uni BB8_87; + @%p60 bra BB6_51; + bra.uni BB6_87; -BB8_51: +BB6_51: div.rn.f64 %fd74, %fd1, %fd54; - bra.uni BB8_87; + bra.uni BB6_87; -BB8_68: +BB6_68: setp.gt.f64 %p64, %fd1, %fd54; selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p64; - bra.uni BB8_87; + bra.uni BB6_87; -BB8_59: +BB6_59: setp.eq.s32 %p53, %r6, 9; - @%p53 bra BB8_60; - bra.uni BB8_87; + @%p53 bra BB6_60; + bra.uni BB6_87; -BB8_60: +BB6_60: setp.eq.f64 %p62, %fd1, %fd54; selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p62; - bra.uni BB8_87; + bra.uni BB6_87; -BB8_70: +BB6_70: { .reg .b32 %temp; mov.b64 {%temp, %r4}, %fd1; @@ -1257,13 +957,13 @@ BB8_70: .reg .b32 %temp; mov.b64 {%temp, %r5}, %fd54; } - bfe.u32 %r41, %r5, 20, 11; - add.s32 %r42, %r41, -1012; + bfe.u32 %r40, %r5, 20, 11; + add.s32 %r41, %r40, -1012; mov.b64 %rd11, %fd54; - shl.b64 %rd3, %rd11, %r42; + shl.b64 %rd3, %rd11, %r41; setp.eq.s64 %p67, %rd3, -9223372036854775808; abs.f64 %fd36, %fd1; - // Callseq Start 3 + // Callseq Start 2 { .reg .b32 temp_param_reg; // <end>} @@ -1281,70 +981,70 @@ BB8_70: ld.param.f64 %fd73, [retval0+0]; //{ - }// Callseq End 3 + }// Callseq End 2 setp.lt.s32 %p68, %r4, 0; and.pred %p2, %p68, %p67; - @!%p2 bra BB8_72; - bra.uni BB8_71; + @!%p2 bra BB6_72; + bra.uni BB6_71; -BB8_71: +BB6_71: { .reg .b32 %temp; - mov.b64 {%temp, %r43}, %fd73; + mov.b64 {%temp, %r42}, %fd73; } - xor.b32 %r44, %r43, -2147483648; + xor.b32 %r43, %r42, -2147483648; { .reg .b32 %temp; - mov.b64 {%r45, %temp}, %fd73; + mov.b64 {%r44, %temp}, %fd73; } - mov.b64 %fd73, 
{%r45, %r44}; + mov.b64 %fd73, {%r44, %r43}; -BB8_72: +BB6_72: mov.f64 %fd72, %fd73; setp.eq.f64 %p69, %fd1, 0d0000000000000000; - @%p69 bra BB8_75; - bra.uni BB8_73; + @%p69 bra BB6_75; + bra.uni BB6_73; -BB8_75: - selp.b32 %r46, %r4, 0, %p67; - or.b32 %r47, %r46, 2146435072; +BB6_75: + selp.b32 %r45, %r4, 0, %p67; + or.b32 %r46, %r45, 2146435072; setp.lt.s32 %p73, %r5, 0; - selp.b32 %r48, %r47, %r46, %p73; - mov.u32 %r49, 0; - mov.b64 %fd72, {%r49, %r48}; - bra.uni BB8_76; + selp.b32 %r47, %r46, %r45, %p73; + mov.u32 %r48, 0; + mov.b64 %fd72, {%r48, %r47}; + bra.uni BB6_76; -BB8_54: +BB6_54: setp.eq.s32 %p57, %r6, 6; - @%p57 bra BB8_55; - bra.uni BB8_87; + @%p57 bra BB6_55; + bra.uni BB6_87; -BB8_55: +BB6_55: setp.le.f64 %p65, %fd1, %fd54; selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p65; - bra.uni BB8_87; + bra.uni BB6_87; -BB8_66: +BB6_66: setp.neu.f64 %p61, %fd1, %fd54; selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p61; - bra.uni BB8_87; + bra.uni BB6_87; -BB8_63: +BB6_63: setp.ne.s32 %p50, %r6, 12; - @%p50 bra BB8_87; + @%p50 bra BB6_87; max.f64 %fd74, %fd1, %fd54; - bra.uni BB8_87; + bra.uni BB6_87; -BB8_30: +BB6_30: setp.gt.s32 %p30, %r2, -1; - @%p30 bra BB8_33; + @%p30 bra BB6_33; cvt.rzi.f64.f64 %fd56, %fd1; setp.neu.f64 %p31, %fd56, %fd1; selp.f64 %fd64, 0dFFF8000000000000, %fd64, %p31; -BB8_33: +BB6_33: mov.f64 %fd16, %fd64; add.f64 %fd17, %fd1, %fd54; { @@ -1354,119 +1054,117 @@ BB8_33: and.b32 %r29, %r28, 2146435072; setp.ne.s32 %p34, %r29, 2146435072; mov.f64 %fd63, %fd16; - @%p34 bra BB8_40; + @%p34 bra BB6_40; setp.gtu.f64 %p35, %fd10, 0d7FF0000000000000; mov.f64 %fd63, %fd17; - @%p35 bra BB8_40; + @%p35 bra BB6_40; abs.f64 %fd18, %fd1; setp.gtu.f64 %p36, %fd18, 0d7FF0000000000000; mov.f64 %fd62, %fd17; mov.f64 %fd63, %fd62; - @%p36 bra BB8_40; + @%p36 bra BB6_40; setp.eq.f64 %p37, %fd18, 0d7FF0000000000000; - @%p37 bra BB8_39; - bra.uni BB8_37; + @%p37 bra BB6_39; + bra.uni BB6_37; -BB8_39: +BB6_39: setp.gt.f64 %p39, %fd10, 
0d3FF0000000000000; - selp.b32 %r36, 2146435072, 0, %p39; - xor.b32 %r37, %r36, 2146435072; + selp.b32 %r35, 2146435072, 0, %p39; + xor.b32 %r36, %r35, 2146435072; setp.lt.s32 %p40, %r3, 0; - selp.b32 %r38, %r37, %r36, %p40; + selp.b32 %r37, %r36, %r35, %p40; setp.eq.f64 %p41, %fd54, 0dBFF0000000000000; - selp.b32 %r39, 1072693248, %r38, %p41; - mov.u32 %r40, 0; - mov.b64 %fd63, {%r40, %r39}; - bra.uni BB8_40; + selp.b32 %r38, 1072693248, %r37, %p41; + mov.u32 %r39, 0; + mov.b64 %fd63, {%r39, %r38}; + bra.uni BB6_40; -BB8_73: +BB6_73: setp.gt.s32 %p70, %r4, -1; - @%p70 bra BB8_76; + @%p70 bra BB6_76; cvt.rzi.f64.f64 %fd58, %fd54; setp.neu.f64 %p71, %fd58, %fd54; selp.f64 %fd72, 0dFFF8000000000000, %fd72, %p71; -BB8_76: +BB6_76: mov.f64 %fd42, %fd72; add.f64 %fd43, %fd1, %fd54; { .reg .b32 %temp; - mov.b64 {%temp, %r50}, %fd43; + mov.b64 {%temp, %r49}, %fd43; } - and.b32 %r51, %r50, 2146435072; - setp.ne.s32 %p74, %r51, 2146435072; + and.b32 %r50, %r49, 2146435072; + setp.ne.s32 %p74, %r50, 2146435072; mov.f64 %fd71, %fd42; - @%p74 bra BB8_83; + @%p74 bra BB6_83; setp.gtu.f64 %p75, %fd36, 0d7FF0000000000000; mov.f64 %fd71, %fd43; - @%p75 bra BB8_83; + @%p75 bra BB6_83; abs.f64 %fd44, %fd54; setp.gtu.f64 %p76, %fd44, 0d7FF0000000000000; mov.f64 %fd70, %fd43; mov.f64 %fd71, %fd70; - @%p76 bra BB8_83; + @%p76 bra BB6_83; setp.eq.f64 %p77, %fd44, 0d7FF0000000000000; - @%p77 bra BB8_82; - bra.uni BB8_80; + @%p77 bra BB6_82; + bra.uni BB6_80; -BB8_82: +BB6_82: setp.gt.f64 %p79, %fd36, 0d3FF0000000000000; - selp.b32 %r58, 2146435072, 0, %p79; - xor.b32 %r59, %r58, 2146435072; + selp.b32 %r56, 2146435072, 0, %p79; + xor.b32 %r57, %r56, 2146435072; setp.lt.s32 %p80, %r5, 0; - selp.b32 %r60, %r59, %r58, %p80; + selp.b32 %r58, %r57, %r56, %p80; setp.eq.f64 %p81, %fd1, 0dBFF0000000000000; - selp.b32 %r61, 1072693248, %r60, %p81; - mov.u32 %r62, 0; - mov.b64 %fd71, {%r62, %r61}; - bra.uni BB8_83; + selp.b32 %r59, 1072693248, %r58, %p81; + mov.u32 %r60, 0; + mov.b64 %fd71, {%r60, 
%r59}; + bra.uni BB6_83; -BB8_37: +BB6_37: setp.neu.f64 %p38, %fd10, 0d7FF0000000000000; mov.f64 %fd63, %fd16; - @%p38 bra BB8_40; + @%p38 bra BB6_40; shr.s32 %r30, %r3, 31; and.b32 %r31, %r30, -2146435072; - add.s32 %r32, %r31, 2146435072; - or.b32 %r33, %r32, -2147483648; - selp.b32 %r34, %r33, %r32, %p1; - mov.u32 %r35, 0; - mov.b64 %fd63, {%r35, %r34}; + selp.b32 %r32, -1048576, 2146435072, %p1; + add.s32 %r33, %r32, %r31; + mov.u32 %r34, 0; + mov.b64 %fd63, {%r34, %r33}; -BB8_40: +BB6_40: setp.eq.f64 %p42, %fd1, 0d0000000000000000; setp.eq.f64 %p43, %fd54, 0d3FF0000000000000; or.pred %p44, %p43, %p42; selp.f64 %fd66, 0d3FF0000000000000, %fd63, %p44; - bra.uni BB8_44; + bra.uni BB6_44; -BB8_80: +BB6_80: setp.neu.f64 %p78, %fd36, 0d7FF0000000000000; mov.f64 %fd71, %fd42; - @%p78 bra BB8_83; + @%p78 bra BB6_83; - shr.s32 %r52, %r5, 31; - and.b32 %r53, %r52, -2146435072; - add.s32 %r54, %r53, 2146435072; - or.b32 %r55, %r54, -2147483648; - selp.b32 %r56, %r55, %r54, %p2; - mov.u32 %r57, 0; - mov.b64 %fd71, {%r57, %r56}; + shr.s32 %r51, %r5, 31; + and.b32 %r52, %r51, -2146435072; + selp.b32 %r53, -1048576, 2146435072, %p2; + add.s32 %r54, %r53, %r52; + mov.u32 %r55, 0; + mov.b64 %fd71, {%r55, %r54}; -BB8_83: +BB6_83: setp.eq.f64 %p82, %fd54, 0d0000000000000000; setp.eq.f64 %p83, %fd1, 0d3FF0000000000000; or.pred %p84, %p83, %p82; selp.f64 %fd74, 0d3FF0000000000000, %fd71, %p84; - bra.uni BB8_87; + bra.uni BB6_87; } // .globl fill @@ -1490,14 +1188,191 @@ BB8_83: mov.u32 %r5, %tid.x; mad.lo.s32 %r1, %r4, %r3, %r5; setp.ge.s32 %p1, %r1, %r2; - @%p1 bra BB9_2; + @%p1 bra BB7_2; cvta.to.global.u64 %rd2, %rd1; mul.wide.s32 %rd3, %r1, 8; add.s64 %rd4, %rd2, %rd3; st.global.f64 [%rd4], %fd1; -BB9_2: +BB7_2: + ret; +} + + // .globl reduce +.visible .entry reduce( + .param .u64 reduce_param_0, + .param .u64 reduce_param_1, + .param .u32 reduce_param_2 +) +{ + .reg .pred %p<18>; + .reg .b32 %r<31>; + .reg .f64 %fd<70>; + .reg .b64 %rd<15>; + + + ld.param.u64 %rd2, 
[reduce_param_0]; + ld.param.u64 %rd3, [reduce_param_1]; + ld.param.u32 %r5, [reduce_param_2]; + mov.u32 %r6, %tid.x; + mov.u32 %r7, %ctaid.x; + shl.b32 %r8, %r7, 1; + mov.u32 %r9, %ntid.x; + mad.lo.s32 %r30, %r8, %r9, %r6; + mov.f64 %fd67, 0d0000000000000000; + mov.f64 %fd68, %fd67; + setp.ge.u32 %p1, %r30, %r5; + @%p1 bra BB8_4; + +BB8_1: + mov.f64 %fd1, %fd68; + cvta.to.global.u64 %rd4, %rd2; + mul.wide.u32 %rd5, %r30, 8; + add.s64 %rd6, %rd4, %rd5; + ld.global.f64 %fd27, [%rd6]; + add.f64 %fd69, %fd1, %fd27; + add.s32 %r3, %r30, %r9; + setp.ge.u32 %p2, %r3, %r5; + @%p2 bra BB8_3; + + mul.wide.u32 %rd8, %r3, 8; + add.s64 %rd9, %rd4, %rd8; + ld.global.f64 %fd28, [%rd9]; + add.f64 %fd69, %fd69, %fd28; + +BB8_3: + mov.f64 %fd68, %fd69; + shl.b32 %r12, %r9, 1; + mov.u32 %r13, %nctaid.x; + mad.lo.s32 %r30, %r12, %r13, %r30; + setp.lt.u32 %p3, %r30, %r5; + mov.f64 %fd67, %fd68; + @%p3 bra BB8_1; + +BB8_4: + mov.f64 %fd65, %fd67; + mul.wide.u32 %rd10, %r6, 8; + mov.u64 %rd11, sdata; + add.s64 %rd1, %rd11, %rd10; + st.shared.f64 [%rd1], %fd65; + bar.sync 0; + setp.lt.u32 %p4, %r9, 512; + @%p4 bra BB8_8; + + setp.gt.u32 %p5, %r6, 255; + mov.f64 %fd66, %fd65; + @%p5 bra BB8_7; + + ld.shared.f64 %fd29, [%rd1+2048]; + add.f64 %fd66, %fd65, %fd29; + st.shared.f64 [%rd1], %fd66; + +BB8_7: + mov.f64 %fd65, %fd66; + bar.sync 0; + +BB8_8: + mov.f64 %fd63, %fd65; + setp.lt.u32 %p6, %r9, 256; + @%p6 bra BB8_12; + + setp.gt.u32 %p7, %r6, 127; + mov.f64 %fd64, %fd63; + @%p7 bra BB8_11; + + ld.shared.f64 %fd30, [%rd1+1024]; + add.f64 %fd64, %fd63, %fd30; + st.shared.f64 [%rd1], %fd64; + +BB8_11: + mov.f64 %fd63, %fd64; + bar.sync 0; + +BB8_12: + mov.f64 %fd61, %fd63; + setp.lt.u32 %p8, %r9, 128; + @%p8 bra BB8_16; + + setp.gt.u32 %p9, %r6, 63; + mov.f64 %fd62, %fd61; + @%p9 bra BB8_15; + + ld.shared.f64 %fd31, [%rd1+512]; + add.f64 %fd62, %fd61, %fd31; + st.shared.f64 [%rd1], %fd62; + +BB8_15: + mov.f64 %fd61, %fd62; + bar.sync 0; + +BB8_16: + mov.f64 %fd60, %fd61; + setp.gt.u32 
%p10, %r6, 31; + @%p10 bra BB8_29; + + setp.lt.u32 %p11, %r9, 64; + @%p11 bra BB8_19; + + ld.volatile.shared.f64 %fd32, [%rd1+256]; + add.f64 %fd60, %fd60, %fd32; + st.volatile.shared.f64 [%rd1], %fd60; + +BB8_19: + mov.f64 %fd59, %fd60; + setp.lt.u32 %p12, %r9, 32; + @%p12 bra BB8_21; + + ld.volatile.shared.f64 %fd33, [%rd1+128]; + add.f64 %fd59, %fd59, %fd33; + st.volatile.shared.f64 [%rd1], %fd59; + +BB8_21: + mov.f64 %fd58, %fd59; + setp.lt.u32 %p13, %r9, 16; + @%p13 bra BB8_23; + + ld.volatile.shared.f64 %fd34, [%rd1+64]; + add.f64 %fd58, %fd58, %fd34; + st.volatile.shared.f64 [%rd1], %fd58; + +BB8_23: + mov.f64 %fd57, %fd58; + setp.lt.u32 %p14, %r9, 8; + @%p14 bra BB8_25; + + ld.volatile.shared.f64 %fd35, [%rd1+32]; + add.f64 %fd57, %fd57, %fd35; + st.volatile.shared.f64 [%rd1], %fd57; + +BB8_25: + mov.f64 %fd56, %fd57; + setp.lt.u32 %p15, %r9, 4; + @%p15 bra BB8_27; + + ld.volatile.shared.f64 %fd36, [%rd1+16]; + add.f64 %fd56, %fd56, %fd36; + st.volatile.shared.f64 [%rd1], %fd56; + +BB8_27: + setp.lt.u32 %p16, %r9, 2; + @%p16 bra BB8_29; + + ld.volatile.shared.f64 %fd37, [%rd1+8]; + add.f64 %fd38, %fd56, %fd37; + st.volatile.shared.f64 [%rd1], %fd38; + +BB8_29: + setp.ne.s32 %p17, %r6, 0; + @%p17 bra BB8_31; + + ld.shared.f64 %fd39, [sdata]; + cvta.to.global.u64 %rd12, %rd3; + mul.wide.u32 %rd13, %r7, 8; + add.s64 %rd14, %rd12, %rd13; + st.global.f64 [%rd14], %fd39; + +BB8_31: ret; } @@ -1509,7 +1384,7 @@ BB9_2: .reg .pred %p<8>; .reg .f32 %f<3>; .reg .b32 %r<49>; - .reg .f64 %fd<136>; + .reg .f64 %fd<135>; ld.param.f64 %fd12, [__internal_accurate_pow_param_0]; @@ -1524,7 +1399,7 @@ BB9_2: } shr.u32 %r47, %r46, 20; setp.ne.s32 %p1, %r47, 0; - @%p1 bra BB10_2; + @%p1 bra BB9_2; mul.f64 %fd14, %fd12, 0d4350000000000000; { @@ -1538,28 +1413,28 @@ BB9_2: shr.u32 %r16, %r46, 20; add.s32 %r47, %r16, -54; -BB10_2: +BB9_2: add.s32 %r48, %r47, -1023; and.b32 %r17, %r46, -2146435073; or.b32 %r18, %r17, 1072693248; - mov.b64 %fd134, {%r45, %r18}; + mov.b64 %fd133, 
{%r45, %r18}; setp.lt.u32 %p2, %r18, 1073127583; - @%p2 bra BB10_4; + @%p2 bra BB9_4; { .reg .b32 %temp; - mov.b64 {%r19, %temp}, %fd134; + mov.b64 {%r19, %temp}, %fd133; } { .reg .b32 %temp; - mov.b64 {%temp, %r20}, %fd134; + mov.b64 {%temp, %r20}, %fd133; } add.s32 %r21, %r20, -1048576; - mov.b64 %fd134, {%r19, %r21}; + mov.b64 %fd133, {%r19, %r21}; add.s32 %r48, %r47, -1022; -BB10_4: - add.f64 %fd16, %fd134, 0d3FF0000000000000; +BB9_4: + add.f64 %fd16, %fd133, 0d3FF0000000000000; // inline asm rcp.approx.ftz.f64 %fd15,%fd16; // inline asm @@ -1568,7 +1443,7 @@ BB10_4: fma.rn.f64 %fd19, %fd17, %fd15, %fd18; fma.rn.f64 %fd20, %fd19, %fd19, %fd19; fma.rn.f64 %fd21, %fd20, %fd15, %fd15; - add.f64 %fd22, %fd134, 0dBFF0000000000000; + add.f64 %fd22, %fd133, 0dBFF0000000000000; mul.f64 %fd23, %fd22, %fd21; fma.rn.f64 %fd24, %fd22, %fd21, %fd23; mul.f64 %fd25, %fd24, %fd24; @@ -1671,52 +1546,51 @@ BB10_4: add.f64 %fd4, %fd94, %fd97; sub.f64 %fd98, %fd94, %fd4; add.f64 %fd5, %fd97, %fd98; - mov.f64 %fd99, 0d3FF71547652B82FE; - mul.rn.f64 %fd100, %fd4, %fd99; - mov.f64 %fd101, 0d4338000000000000; - add.rn.f64 %fd102, %fd100, %fd101; + mov.f64 %fd99, 0d4338000000000000; + mov.f64 %fd100, 0d3FF71547652B82FE; + fma.rn.f64 %fd101, %fd4, %fd100, %fd99; { .reg .b32 %temp; - mov.b64 {%r13, %temp}, %fd102; + mov.b64 {%r13, %temp}, %fd101; } - mov.f64 %fd103, 0dC338000000000000; - add.rn.f64 %fd104, %fd102, %fd103; - mov.f64 %fd105, 0dBFE62E42FEFA39EF; - fma.rn.f64 %fd106, %fd104, %fd105, %fd4; - mov.f64 %fd107, 0dBC7ABC9E3B39803F; - fma.rn.f64 %fd108, %fd104, %fd107, %fd106; - mov.f64 %fd109, 0d3E928AF3FCA213EA; - mov.f64 %fd110, 0d3E5ADE1569CE2BDF; - fma.rn.f64 %fd111, %fd110, %fd108, %fd109; - mov.f64 %fd112, 0d3EC71DEE62401315; - fma.rn.f64 %fd113, %fd111, %fd108, %fd112; - mov.f64 %fd114, 0d3EFA01997C89EB71; - fma.rn.f64 %fd115, %fd113, %fd108, %fd114; - mov.f64 %fd116, 0d3F2A01A014761F65; - fma.rn.f64 %fd117, %fd115, %fd108, %fd116; - mov.f64 %fd118, 0d3F56C16C1852B7AF; - 
fma.rn.f64 %fd119, %fd117, %fd108, %fd118; - mov.f64 %fd120, 0d3F81111111122322; - fma.rn.f64 %fd121, %fd119, %fd108, %fd120; - mov.f64 %fd122, 0d3FA55555555502A1; - fma.rn.f64 %fd123, %fd121, %fd108, %fd122; - mov.f64 %fd124, 0d3FC5555555555511; - fma.rn.f64 %fd125, %fd123, %fd108, %fd124; - mov.f64 %fd126, 0d3FE000000000000B; - fma.rn.f64 %fd127, %fd125, %fd108, %fd126; - fma.rn.f64 %fd128, %fd127, %fd108, %fd18; - fma.rn.f64 %fd129, %fd128, %fd108, %fd18; + mov.f64 %fd102, 0dC338000000000000; + add.rn.f64 %fd103, %fd101, %fd102; + mov.f64 %fd104, 0dBFE62E42FEFA39EF; + fma.rn.f64 %fd105, %fd103, %fd104, %fd4; + mov.f64 %fd106, 0dBC7ABC9E3B39803F; + fma.rn.f64 %fd107, %fd103, %fd106, %fd105; + mov.f64 %fd108, 0d3E928AF3FCA213EA; + mov.f64 %fd109, 0d3E5ADE1569CE2BDF; + fma.rn.f64 %fd110, %fd109, %fd107, %fd108; + mov.f64 %fd111, 0d3EC71DEE62401315; + fma.rn.f64 %fd112, %fd110, %fd107, %fd111; + mov.f64 %fd113, 0d3EFA01997C89EB71; + fma.rn.f64 %fd114, %fd112, %fd107, %fd113; + mov.f64 %fd115, 0d3F2A01A014761F65; + fma.rn.f64 %fd116, %fd114, %fd107, %fd115; + mov.f64 %fd117, 0d3F56C16C1852B7AF; + fma.rn.f64 %fd118, %fd116, %fd107, %fd117; + mov.f64 %fd119, 0d3F81111111122322; + fma.rn.f64 %fd120, %fd118, %fd107, %fd119; + mov.f64 %fd121, 0d3FA55555555502A1; + fma.rn.f64 %fd122, %fd120, %fd107, %fd121; + mov.f64 %fd123, 0d3FC5555555555511; + fma.rn.f64 %fd124, %fd122, %fd107, %fd123; + mov.f64 %fd125, 0d3FE000000000000B; + fma.rn.f64 %fd126, %fd124, %fd107, %fd125; + fma.rn.f64 %fd127, %fd126, %fd107, %fd18; + fma.rn.f64 %fd128, %fd127, %fd107, %fd18; { .reg .b32 %temp; - mov.b64 {%r14, %temp}, %fd129; + mov.b64 {%r14, %temp}, %fd128; } { .reg .b32 %temp; - mov.b64 {%temp, %r15}, %fd129; + mov.b64 {%temp, %r15}, %fd128; } shl.b32 %r33, %r13, 20; add.s32 %r34, %r15, %r33; - mov.b64 %fd135, {%r14, %r34}; + mov.b64 %fd134, {%r14, %r34}; { .reg .b32 %temp; mov.b64 {%temp, %r35}, %fd4; @@ -1724,36 +1598,36 @@ BB10_4: mov.b32 %f2, %r35; abs.f32 %f1, %f2; setp.lt.f32 %p4, 
%f1, 0f4086232B; - @%p4 bra BB10_7; + @%p4 bra BB9_7; setp.lt.f64 %p5, %fd4, 0d0000000000000000; - add.f64 %fd130, %fd4, 0d7FF0000000000000; - selp.f64 %fd135, 0d0000000000000000, %fd130, %p5; + add.f64 %fd129, %fd4, 0d7FF0000000000000; + selp.f64 %fd134, 0d0000000000000000, %fd129, %p5; setp.geu.f32 %p6, %f1, 0f40874800; - @%p6 bra BB10_7; + @%p6 bra BB9_7; shr.u32 %r36, %r13, 31; add.s32 %r37, %r13, %r36; shr.s32 %r38, %r37, 1; shl.b32 %r39, %r38, 20; add.s32 %r40, %r39, %r15; - mov.b64 %fd131, {%r14, %r40}; + mov.b64 %fd130, {%r14, %r40}; sub.s32 %r41, %r13, %r38; shl.b32 %r42, %r41, 20; add.s32 %r43, %r42, 1072693248; mov.u32 %r44, 0; - mov.b64 %fd132, {%r44, %r43}; - mul.f64 %fd135, %fd131, %fd132; + mov.b64 %fd131, {%r44, %r43}; + mul.f64 %fd134, %fd130, %fd131; -BB10_7: - abs.f64 %fd133, %fd135; - setp.eq.f64 %p7, %fd133, 0d7FF0000000000000; - @%p7 bra BB10_9; +BB9_7: + abs.f64 %fd132, %fd134; + setp.eq.f64 %p7, %fd132, 0d7FF0000000000000; + @%p7 bra BB9_9; - fma.rn.f64 %fd135, %fd135, %fd5, %fd135; + fma.rn.f64 %fd134, %fd134, %fd5, %fd134; -BB10_9: - st.param.f64 [func_retval0+0], %fd135; +BB9_9: + st.param.f64 [func_retval0+0], %fd134; ret; } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/3caae271/src/main/java/org/apache/sysml/hops/AggUnaryOp.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/AggUnaryOp.java b/src/main/java/org/apache/sysml/hops/AggUnaryOp.java index 11dc3ce..5d795f1 100644 --- a/src/main/java/org/apache/sysml/hops/AggUnaryOp.java +++ b/src/main/java/org/apache/sysml/hops/AggUnaryOp.java @@ -19,6 +19,7 @@ package org.apache.sysml.hops; +import org.apache.sysml.api.DMLScript; import org.apache.sysml.conf.ConfigurationManager; import org.apache.sysml.hops.AggBinaryOp.SparkAggType; import org.apache.sysml.hops.Hop.MultiThreadedHop; @@ -143,6 +144,10 @@ public class AggUnaryOp extends Hop implements MultiThreadedHop } else { //general case int k = 
OptimizerUtils.getConstrainedNumThreads(_maxNumThreads); + if(DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR || getMemEstimate() < OptimizerUtils.GPU_MEMORY_BUDGET) && (_op == AggOp.SUM)) { + et = ExecType.GPU; + k = 1; + } agg1 = new PartialAggregate(input.constructLops(), HopsAgg2Lops.get(_op), HopsDirection2Lops.get(_direction), getDataType(),getValueType(), et, k); } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/3caae271/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java index 9c8be5d..f988e5f 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java @@ -29,6 +29,7 @@ import org.apache.sysml.runtime.instructions.gpu.GPUInstruction; import org.apache.sysml.runtime.instructions.gpu.GPUInstruction.GPUINSTRUCTION_TYPE; import org.apache.sysml.runtime.instructions.gpu.MMTSJGPUInstruction; import org.apache.sysml.runtime.instructions.gpu.ReorgGPUInstruction; +import org.apache.sysml.runtime.instructions.gpu.context.AggregateUnaryGPUInstruction; public class GPUInstructionParser extends InstructionParser { @@ -62,6 +63,8 @@ public class GPUInstructionParser extends InstructionParser String2GPUInstructionType.put( "sel+" , GPUINSTRUCTION_TYPE.BuiltinUnary); + + String2GPUInstructionType.put( "uak+" , GPUINSTRUCTION_TYPE.AggregateUnary); } public static GPUInstruction parseSingleInstruction (String str ) @@ -88,6 +91,9 @@ public class GPUInstructionParser extends InstructionParser throw new DMLRuntimeException("The instruction is not GPU-enabled:" + str); switch(gputype) { + case AggregateUnary: + return AggregateUnaryGPUInstruction.parseInstruction(str); + case 
AggregateBinary: return AggregateBinaryGPUInstruction.parseInstruction(str); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/3caae271/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java index b9cbfab..7219c6c 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java @@ -88,14 +88,14 @@ public class AggregateBinaryGPUInstruction extends GPUInstruction //get inputs MatrixObject m1 = ec.getMatrixInputForGPUInstruction(_input1.getName()); - MatrixObject m2 = ec.getMatrixInputForGPUInstruction(_input2.getName()); - - //compute matrix multiplication - int rlen = (int) (_isLeftTransposed ? m1.getNumColumns() : m1.getNumRows()); - int clen = (int) (_isRightTransposed ? m2.getNumRows() : m2.getNumColumns()); - - ec.setMetaData(_output.getName(), rlen, clen); - LibMatrixCUDA.matmult(ec, m1, m2, _output.getName(), _isLeftTransposed, _isRightTransposed); + MatrixObject m2 = ec.getMatrixInputForGPUInstruction(_input2.getName()); + + //compute matrix multiplication + int rlen = (int) (_isLeftTransposed ? m1.getNumColumns() : m1.getNumRows()); + int clen = (int) (_isRightTransposed ? 
m2.getNumRows() : m2.getNumColumns()); + + ec.setMetaData(_output.getName(), rlen, clen); + LibMatrixCUDA.matmult(ec, m1, m2, _output.getName(), _isLeftTransposed, _isRightTransposed); //release inputs/outputs ec.releaseMatrixInputForGPUInstruction(_input1.getName()); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/3caae271/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java index 27a12fd..aca197e 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java @@ -28,7 +28,7 @@ import org.apache.sysml.runtime.matrix.operators.Operator; public abstract class GPUInstruction extends Instruction { - public enum GPUINSTRUCTION_TYPE { AggregateBinary, Convolution, MMTSJ, Reorg, ArithmeticBinary, BuiltinUnary }; + public enum GPUINSTRUCTION_TYPE { AggregateUnary, AggregateBinary, Convolution, MMTSJ, Reorg, ArithmeticBinary, BuiltinUnary }; protected GPUINSTRUCTION_TYPE _gputype; protected Operator _optr; http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/3caae271/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/AggregateUnaryGPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/AggregateUnaryGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/AggregateUnaryGPUInstruction.java new file mode 100644 index 0000000..04221f6 --- /dev/null +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/AggregateUnaryGPUInstruction.java @@ -0,0 +1,85 @@ +package org.apache.sysml.runtime.instructions.gpu.context; + +import 
org.apache.sysml.runtime.DMLRuntimeException; +import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; +import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; +import org.apache.sysml.runtime.functionobjects.IndexFunction; +import org.apache.sysml.runtime.functionobjects.ReduceAll; +import org.apache.sysml.runtime.functionobjects.ReduceCol; +import org.apache.sysml.runtime.functionobjects.ReduceRow; +import org.apache.sysml.runtime.instructions.InstructionUtils; +import org.apache.sysml.runtime.instructions.cp.CPOperand; +import org.apache.sysml.runtime.instructions.gpu.GPUInstruction; +import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA; +import org.apache.sysml.runtime.matrix.operators.*; +import org.apache.sysml.utils.Statistics; + +/** + * Implements aggregate unary instructions for CUDA + */ +public class AggregateUnaryGPUInstruction extends GPUInstruction { + private CPOperand _input1 = null; + private CPOperand _output = null; + + public AggregateUnaryGPUInstruction(Operator op, CPOperand in1, CPOperand out, + String opcode, String istr) + { + super(op, opcode, istr); + _gputype = GPUINSTRUCTION_TYPE.AggregateUnary; + _input1 = in1; + _output = out; + } + + public static AggregateUnaryGPUInstruction parseInstruction(String str ) + throws DMLRuntimeException + { + String[] parts = InstructionUtils.getInstructionPartsWithValueType(str); + String opcode = parts[0]; + CPOperand in1 = new CPOperand(parts[1]); + CPOperand out = new CPOperand(parts[2]); + + // This follows logic similar to AggregateUnaryCPInstruction. 
+ // nrow, ncol & length should either read or refresh metadata + Operator aggop = null; + if(opcode.equalsIgnoreCase("nrow") || opcode.equalsIgnoreCase("ncol") || opcode.equalsIgnoreCase("length")) { + throw new DMLRuntimeException("nrow, ncol & length should not be compiled as GPU instructions!"); + } else { + aggop = InstructionUtils.parseBasicAggregateUnaryOperator(opcode); + } + return new AggregateUnaryGPUInstruction(aggop, in1, out, opcode, str); + } + + @Override + public void processInstruction(ExecutionContext ec) + throws DMLRuntimeException + { + Statistics.incrementNoOfExecutedGPUInst(); + + String opcode = getOpcode(); + + // nrow, ncol & length should either read or refresh metadata + if(opcode.equalsIgnoreCase("nrow") || opcode.equalsIgnoreCase("ncol") || opcode.equalsIgnoreCase("length")) { + throw new DMLRuntimeException("nrow, ncol & length should not be compiled as GPU instructions!"); + } + + //get inputs + MatrixObject in1 = ec.getMatrixInputForGPUInstruction(_input1.getName()); + + int rlen = (int)in1.getNumRows(); + int clen = (int)in1.getNumColumns(); + + LibMatrixCUDA.unaryAggregate(ec, in1, _output.getName(), (AggregateUnaryOperator)_optr); + + //release inputs/outputs + ec.releaseMatrixInputForGPUInstruction(_input1.getName()); + + // If the unary aggregate is a row reduction or a column reduction, it results in a vector + // which needs to be released. 
Otherwise a scalar is produced and it is copied back to the host + // and set in the execution context by invoking the setScalarOutput + IndexFunction indexFunction = ((AggregateUnaryOperator) _optr).indexFn; + if (indexFunction instanceof ReduceRow || indexFunction instanceof ReduceCol) { + ec.releaseMatrixOutputForGPUInstruction(_output.getName()); + } + } + +} http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/3caae271/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java index c04e8a4..ae41bc3 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java @@ -39,9 +39,24 @@ public class ExecutionConfig { public int blockDimZ = 1; public int sharedMemBytes = 0; public CUstream stream = null; + + private static HashMap<Integer, Integer> maxBlockDimForDevice = new HashMap<Integer, Integer>(); - + + /** + * Convenience constructor for setting the number of blocks, number of threads and the + * shared memory size + * @param gridDimX + * @param blockDimX + * @param sharedMemBytes + */ + public ExecutionConfig(int gridDimX, int blockDimX, int sharedMemBytes) { + this.gridDimX = gridDimX; + this.blockDimX = blockDimX; + this.sharedMemBytes = sharedMemBytes; + } + /** * Use this for simple vector operations and use following in the kernel * <code>