[SYSTEMML-445] Support non-CuDNN GPU operator for LSTM forward and backward
- Added corresponding GPU tests that compare the result of CuDNN operator with the newly added operator. Also, the results are compared with DML-bodied LSTM implementation in the nn layer. - The LSTM forward operator support sparse weights. - Sparse support for LSTM backward is disabled in the initial implementation. - Unnecessary intermediates are removed from lstm.dml - Extended LibMatrixCuMatMult to support arbitrary alpha and beta during matrix multiplication. Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/bd34292d Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/bd34292d Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/bd34292d Branch: refs/heads/master Commit: bd34292d4e521ffaa5118f89ab9350ffe4e89af0 Parents: ef842da Author: Niketan Pansare <npan...@us.ibm.com> Authored: Sat Oct 20 11:03:53 2018 -0700 Committer: Niketan Pansare <npan...@us.ibm.com> Committed: Sat Oct 20 11:08:11 2018 -0700 ---------------------------------------------------------------------- scripts/nn/layers/lstm.dml | 1 - src/main/cpp/kernels/SystemML.cu | 315 +++ src/main/cpp/kernels/SystemML.ptx | 2074 +++++++++++++++++- .../instructions/gpu/DnnGPUInstruction.java | 232 +- .../gpu/context/GPUMemoryManager.java | 4 + .../runtime/matrix/data/LibMatrixCUDA.java | 21 +- .../runtime/matrix/data/LibMatrixCuDNN.java | 236 +- .../runtime/matrix/data/LibMatrixCuMatMult.java | 34 +- .../org/apache/sysml/test/gpu/LstmTest.java | 318 +++ 9 files changed, 3130 insertions(+), 105 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/bd34292d/scripts/nn/layers/lstm.dml ---------------------------------------------------------------------- diff --git a/scripts/nn/layers/lstm.dml b/scripts/nn/layers/lstm.dml index 44942d2..0b0016b 100644 --- a/scripts/nn/layers/lstm.dml +++ b/scripts/nn/layers/lstm.dml @@ -182,7 +182,6 @@ backward = function(matrix[double] dout, matrix[double] dc, for (iter in 1:T) { # each timestep in reverse order X_t = X[,(t-1)*D+1:t*D] # shape (N, D) dout_t = dout[,(t-1)*M+1:t*M] # shape (N, M) - out_t = matrix(cache_out[t,], rows=N, cols=M) # shape (N, M) ct = matrix(cache_c[t,], rows=N, cols=M) # shape (N, M) if (t == 1) { out_prev = out0 # shape (N, M) http://git-wip-us.apache.org/repos/asf/systemml/blob/bd34292d/src/main/cpp/kernels/SystemML.cu ---------------------------------------------------------------------- diff --git a/src/main/cpp/kernels/SystemML.cu b/src/main/cpp/kernels/SystemML.cu index 26d7f43..ab5f326 100644 --- a/src/main/cpp/kernels/SystemML.cu +++ b/src/main/cpp/kernels/SystemML.cu @@ -2406,3 +2406,318 @@ extern "C" __global__ void backward_dgamma_tmp_f(double *ema_mean, double *dout, int N, int C, int HW, int CHW, int NCHW) { backward_dgamma_tmp(ema_mean, dout, X, ema_var, ret, N, C, HW, CHW, NCHW); } + + +// Performs the operation: +// X_t = X[,(t-1)*D+1:t*D] # shape (N, D) +// ret = cbind(X_t, out_prev) # shape (N, D+M) +// size => N*(D+M) +template <typename T> +__device__ void prepareInputNNLstm(T *X, T* out_prev, T *ret, int t, int M, int D, int TD, int DPlusM, unsigned int size) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < size) { + int n = index / DPlusM; + int iy = index % DPlusM; + if(iy < D) { + ret[index] = X[n*TD + t*D + iy]; + } + else { + ret[index] = out_prev[n*M + (iy-D)]; + } + } +} + +extern "C" __global__ void prepareInputNNLstm_d(double *X, double* out_prev, double *ret, int t, int M, int D, int TD, int DPlusM, unsigned int size) { + prepareInputNNLstm(X, out_prev, ret, t, M, D, TD, DPlusM, size); +} + +extern "C" __global__ void prepareInputNNLstm_f(float *X, float* out_prev, float *ret, int t, int M, int D, int TD, int DPlusM, unsigned int size) { + prepareInputNNLstm(X, out_prev, ret, t, M, D, TD, DPlusM, size); +} + + +// Performs the operations: +// ifog = ifog + b +// ifog[,1:3*M] = sigmoid::forward(ifog[,1:3*M]) # i,f,o gates squashed with sigmoid +// ifog[,3*M+1:4*M] = tanh::forward(ifog[,3*M+1:4*M]) # g gate squashed with tanh +template <typename T> +__device__ void squashIFOG(T *ifog, T *b, int M, unsigned int size) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < size) { + int M4 = M*4; + int n = index / M4; + int iy = index % M4; + T ifogVal = ifog[index] + b[iy]; + if(iy < M*3) { + ifogVal = 0.5 * tanh(0.5 * ifogVal) + 0.5; // sigmoid + } + else { + ifogVal = tanh(ifogVal); + } + ifog[index] = ifogVal; + } +} + +extern "C" __global__ void squashIFOG_d(double *ifog, double *b, int M, unsigned int size) { + squashIFOG(ifog, b, M, size); +} + +extern "C" __global__ void squashIFOG_f(float *ifog, float *b, int M, unsigned int size) { + squashIFOG(ifog, b, M, size); +} + +// c = ifog[,M+1:2*M]*c_prev + ifog[,1:M]*ifog[,3*M+1:4*M] +// out_t = ifog[,2*M+1:3*M] * tanh::forward(c) +// if (return_sequences) { +// out[,(t-1)*M+1:t*M] = out_t +// } +// else { +// out = out_t +// } +// out_prev = out_t +// c_prev = c +// cache_out[t,] = matrix(out_t, rows=1, cols=N*M) +// cache_c[t,] = matrix(c, rows=1, cols=N*M) +template <typename T> +__device__ void postProcessNNLstmForward(T *ifog, + T *c, T* out_prev, T* c_prev, + T *out, T *cache_out, T *cache_c, + int return_sequences, int t, int T1, int M, + unsigned int NM) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < NM) { + int M4 = M*4; + int n = index / M; + int m = index % M; + int m4 = m*4; + T iGate = ifog[n*M4 + m]; // ifog[,1:M] + T fGate = ifog[n*M4 + M + m]; // ifog[,M+1:2*M] + T oGate = ifog[n*M4 + M*2 + m]; // ifog[,2*M+1:3*M] + T gGate = ifog[n*M4 + M*3 + m]; // ifog[,3*M+1:4*M] + T cVal = fGate*c_prev[index] + iGate*gGate; + T out_tVal = oGate*tanh(cVal); + int outIndex = return_sequences == 0 ? index : (n*T1*M + t*M + m); + int cacheIndex = t*NM + index; + + c[index] = cVal; + out_prev[index] = out_tVal; + c_prev[index] = cVal; + cache_out[cacheIndex] = out_tVal; + cache_c[cacheIndex] = cVal; + out[outIndex] = out_tVal; + } +} + +extern "C" __global__ void postProcessNNLstmForward_d(double *ifog, + double *c, double *out_prev, double *c_prev, + double *out, double *cache_out, double *cache_c, + int return_sequences, int t, int T1, int M, + unsigned int NM) { + postProcessNNLstmForward(ifog, c, out_prev, c_prev, out, cache_out, cache_c, return_sequences, t, T1, M, NM); +} + +extern "C" __global__ void postProcessNNLstmForward_f(float *ifog, + float *c, float *out_prev, float *c_prev, + float *out, float *cache_out, float *cache_c, + int return_sequences, int t, int T1, int M, + unsigned int NM) { + postProcessNNLstmForward(ifog, c, out_prev, c_prev, out, cache_out, cache_c, return_sequences, t, T1, M, NM); +} + + +// c = ifog[,M+1:2*M]*c_prev + ifog[,1:M]*ifog[,3*M+1:4*M] +// out_t = ifog[,2*M+1:3*M] * tanh::forward(c) +// if (return_sequences) { +// out[,(t-1)*M+1:t*M] = out_t +// } +// else { +// out = out_t +// } +// out_prev = out_t +// c_prev = c +template <typename T> +__device__ void postProcessNNLstmForwardSkipCache(T *ifog, + T *c, T* out_prev, T* c_prev, + T *out, + int return_sequences, int t, int T1, int M, + unsigned int NM) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < NM) { + int M4 = M*4; + int n = index / M; + int m = index % M; + int m4 = m*4; + T iGate = ifog[n*M4 + m]; // ifog[,1:M] + T fGate = ifog[n*M4 + M + m]; // ifog[,M+1:2*M] + T oGate = ifog[n*M4 + M*2 + m]; // ifog[,2*M+1:3*M] + T gGate = ifog[n*M4 + M*3 + m]; // ifog[,3*M+1:4*M] + T cVal = fGate*c_prev[index] + iGate*gGate; + T out_tVal = oGate*tanh(cVal); + int outIndex = return_sequences == 0 ? index : (n*T1*M + t*M + m); + int cacheIndex = t*NM + index; + + c[index] = cVal; + out_prev[index] = out_tVal; + c_prev[index] = cVal; + out[outIndex] = out_tVal; + } +} + +extern "C" __global__ void postProcessNNLstmForwardSkipCache_d(double *ifog, + double *c, double *out_prev, double *c_prev, + double *out, + int return_sequences, int t, int T1, int M, + unsigned int NM) { + postProcessNNLstmForwardSkipCache(ifog, c, out_prev, c_prev, out, return_sequences, t, T1, M, NM); +} + +extern "C" __global__ void postProcessNNLstmForwardSkipCache_f(float *ifog, + float *c, float *out_prev, float *c_prev, + float *out, + int return_sequences, int t, int T1, int M, + unsigned int NM) { + postProcessNNLstmForwardSkipCache(ifog, c, out_prev, c_prev, out, return_sequences, t, T1, M, NM); +} + +template <typename T> +__device__ void initializeDoutWhenReturnSeq(T *dout, T *dout_t, int t, int M, int TM, unsigned int NM) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < NM) { + int n = index / M; + int m = index % M; + dout_t[index] = dout[n*TM + t*M + m]; + } +} + +extern "C" __global__ void initializeDoutWhenReturnSeq_d(double *dout, double *dout_t, int t, int M, int TM, unsigned int NM) { + initializeDoutWhenReturnSeq(dout, dout_t, t, M, TM, NM); +} + +extern "C" __global__ void initializeDoutWhenReturnSeq_f(float *dout, float *dout_t, int t, int M, int TM, unsigned int NM) { + initializeDoutWhenReturnSeq(dout, dout_t, t, M, TM, NM); +} + + +// Performs the operation +// i = ifog[,1:M] # input gate, shape (N, M) +// f = ifog[,M+1:2*M] # forget gate, shape (N, M) +// o = ifog[,2*M+1:3*M] # output gate, shape (N, M) +// g = ifog[,3*M+1:4*M] # g gate, shape (N, M) +// dct = dct + o*tanh::backward(dout_t, ct) # shape (N, M) +// do = tanh::forward(ct) * dout_t # output gate, shape (N, M) +// df = c_prev * dct # forget gate, shape (N, M) +// dc_prev = f * dct # shape (N, M) +// di = g * dct # input gate, shape (N, M) +// dg = i * dct # g gate, shape (N, M) +// di_raw = i * (1-i) * di +// df_raw = f * (1-f) * df +// do_raw = o * (1-o) * do +// dg_raw = (1-g^2) * dg +// difog_raw = cbind(di_raw, df_raw, do_raw, dg_raw) # shape (N, 4M) +template <typename T> +__device__ void computeDifog_raw(T *ifog, T *ct, T *dout_t, T *cache_c, T *c0, + T *difog_raw, T *dct, T *dc0, // output + int return_sequences, int t, int T1, int M, unsigned int NM) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < NM) { + int M4 = M*4; + int n = index / M; + int m = index % M; + + T dout_tVal = dout_t[index]; + + T i = ifog[n*M4 + m]; + T f = ifog[n*M4 + M + m]; + T o = ifog[n*M4 + M*2 + m]; + T g = ifog[n*M4 + M*3 + m]; + + T ctVal = ct[index]; + + // if (t == 1) + // c_prev = c0 # shape (N, M) + // else + // c_prev = matrix(cache_c[t-1,], rows=N, cols=M) # shape (N, M) + T c_prevVal = (t==0) ? c0[index] : cache_c[(t-1)*NM + index]; + + // dct = dct + o*tanh::backward(dout_t, ct) + T tmp = tanh(ctVal); + T dctVal = dct[index] + o*((1-tmp*tmp) * dout_tVal); + + T dc_prevVal = f * dctVal; + + T do1 = tanh(ctVal) * dout_tVal; + T df = c_prevVal * dctVal; + T di = g * dctVal; + T dg = i * dctVal; + + if (t == 0) { + dc0[index] = dc_prevVal; + dct[index] = dctVal; + } + else { + dct[index] = dc_prevVal; + } + difog_raw[n*M4 + m] = i * (1-i) * di; // di_raw + difog_raw[n*M4 + M + m] = f * (1-f) * df; // df_raw + difog_raw[n*M4 + M*2 + m] = o * (1-o) * do1; // do_raw + difog_raw[n*M4 + M*3 + m] = (1-g*g) * dg; // dg_raw + } +} + +extern "C" __global__ void computeDifog_raw_d(double *ifog, double *ct, double *dout_t, double *cache_c, double *c0, + double *difog_raw, double *dct, double *dc0, // output + int return_sequences, int t, int T1, int M, unsigned int NM) { + computeDifog_raw(ifog, ct, dout_t, cache_c, c0, + difog_raw, dct, dc0, // output + return_sequences, t, T1, M, NM); +} + +extern "C" __global__ void computeDifog_raw_f(float *ifog, float *ct, float *dout_t, float *cache_c, float *c0, + float *difog_raw, float *dct, float *dc0, // output + int return_sequences, int t, int T1, int M, unsigned int NM) { + computeDifog_raw(ifog, ct, dout_t, cache_c, c0, + difog_raw, dct, dc0, // output + return_sequences, t, T1, M, NM); +} + +template <typename T> +__device__ void postProcessNNLstmBackward(T *dinput, T *dout0, T* dout, T * dout_t, T *dX, int return_sequences, int t, int N, int D, int M, + int ND, int NM, int TD, int TM, int DPlusM, unsigned int size) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < ND) { + int n = index / D; + int d = index % D; + // dX[,(t-1)*D+1:t*D] = dinput[,1:D] // dinput is of shape (N, D+M) + dX[n*TD + t*D + d] = dinput[n*DPlusM + d]; + } + if (index < NM) { + int n = index / M; + int m = index % M; + // dout_prev = dinput[,D+1:D+M] + T dout_prev = dinput[n*DPlusM + D + m]; + if(t == 0) { + // dout0 = dout_prev + dout0[index] = dout_prev; + } + else if(return_sequences != 0) { + // dout_t = dout[,(t-2)*M+1:(t-1)*M] + dout_prev + dout_t[index] = dout[n*TM + (t-1)*M + m] + dout_prev; + } + else { + // dout_t = dout_prev + dout_t[index] = dout_prev; + } + } +} + +extern "C" __global__ void postProcessNNLstmBackward_d(double *dinput, double *dout0, double *dout, double *dout_t, double *dX, int return_sequences, int t, int N, int D, int M, + int ND, int NM, int TD, int TM, int DPlusM, unsigned int size) { + postProcessNNLstmBackward(dinput, dout0, dout, dout_t, dX, return_sequences, t, N, D, M, + ND, NM, TD, TM, DPlusM, size); +} + +extern "C" __global__ void postProcessNNLstmBackward_f(float *dinput, float *dout0, float *dout, float *dout_t, float *dX, int return_sequences, int t, int N, int D, int M, + int ND, int NM, int TD, int TM, int DPlusM, unsigned int size) { + postProcessNNLstmBackward(dinput, dout0, dout, dout_t, dX, return_sequences, t, N, D, M, + ND, NM, TD, TM, DPlusM, size); +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/systemml/blob/bd34292d/src/main/cpp/kernels/SystemML.ptx ---------------------------------------------------------------------- diff --git a/src/main/cpp/kernels/SystemML.ptx b/src/main/cpp/kernels/SystemML.ptx index 3043373..bf40fb9 100644 --- a/src/main/cpp/kernels/SystemML.ptx +++ b/src/main/cpp/kernels/SystemML.ptx @@ -15218,12 +15218,2032 @@ BB125_2: ret; } + // .globl prepareInputNNLstm_d +.visible .entry prepareInputNNLstm_d( + .param .u64 prepareInputNNLstm_d_param_0, + .param .u64 prepareInputNNLstm_d_param_1, + .param .u64 prepareInputNNLstm_d_param_2, + .param .u32 prepareInputNNLstm_d_param_3, + .param .u32 prepareInputNNLstm_d_param_4, + .param .u32 prepareInputNNLstm_d_param_5, + .param .u32 prepareInputNNLstm_d_param_6, + .param .u32 prepareInputNNLstm_d_param_7, + .param .u32 prepareInputNNLstm_d_param_8 +) +{ + .reg .pred %p<3>; + .reg .b32 %r<18>; + .reg .f64 %fd<3>; + .reg .b64 %rd<13>; + + + ld.param.u64 %rd2, [prepareInputNNLstm_d_param_0]; + ld.param.u64 %rd3, [prepareInputNNLstm_d_param_1]; + ld.param.u64 %rd4, [prepareInputNNLstm_d_param_2]; + ld.param.u32 %r4, [prepareInputNNLstm_d_param_3]; + ld.param.u32 %r5, [prepareInputNNLstm_d_param_4]; + ld.param.u32 %r6, [prepareInputNNLstm_d_param_5]; + ld.param.u32 %r7, [prepareInputNNLstm_d_param_6]; + ld.param.u32 %r8, [prepareInputNNLstm_d_param_7]; + ld.param.u32 %r9, [prepareInputNNLstm_d_param_8]; + mov.u32 %r10, %ntid.x; + mov.u32 %r11, %ctaid.x; + mov.u32 %r12, %tid.x; + mad.lo.s32 %r1, %r10, %r11, %r12; + setp.ge.u32 %p1, %r1, %r9; + @%p1 bra BB126_4; + + cvta.to.global.u64 %rd5, %rd4; + div.s32 %r2, %r1, %r8; + rem.s32 %r3, %r1, %r8; + setp.lt.s32 %p2, %r3, %r6; + mul.wide.s32 %rd6, %r1, 8; + add.s64 %rd1, %rd5, %rd6; + @%p2 bra BB126_3; + bra.uni BB126_2; + +BB126_3: + cvta.to.global.u64 %rd10, %rd2; + mul.lo.s32 %r15, %r6, %r4; + mad.lo.s32 %r16, %r2, %r7, %r15; + add.s32 %r17, %r16, %r3; + mul.wide.s32 %rd11, %r17, 8; + add.s64 %rd12, %rd10, %rd11; + ld.global.f64 %fd2, [%rd12]; + st.global.f64 [%rd1], %fd2; + bra.uni BB126_4; + +BB126_2: + cvta.to.global.u64 %rd7, %rd3; + sub.s32 %r13, %r3, %r6; + mad.lo.s32 %r14, %r2, %r5, %r13; + mul.wide.s32 %rd8, %r14, 8; + add.s64 %rd9, %rd7, %rd8; + ld.global.f64 %fd1, [%rd9]; + st.global.f64 [%rd1], %fd1; + +BB126_4: + ret; +} + + // .globl prepareInputNNLstm_f +.visible .entry prepareInputNNLstm_f( + .param .u64 prepareInputNNLstm_f_param_0, + .param .u64 prepareInputNNLstm_f_param_1, + .param .u64 prepareInputNNLstm_f_param_2, + .param .u32 prepareInputNNLstm_f_param_3, + .param .u32 prepareInputNNLstm_f_param_4, + .param .u32 prepareInputNNLstm_f_param_5, + .param .u32 prepareInputNNLstm_f_param_6, + .param .u32 prepareInputNNLstm_f_param_7, + .param .u32 prepareInputNNLstm_f_param_8 +) +{ + .reg .pred %p<3>; + .reg .f32 %f<3>; + .reg .b32 %r<18>; + .reg .b64 %rd<13>; + + + ld.param.u64 %rd2, [prepareInputNNLstm_f_param_0]; + ld.param.u64 %rd3, [prepareInputNNLstm_f_param_1]; + ld.param.u64 %rd4, [prepareInputNNLstm_f_param_2]; + ld.param.u32 %r4, [prepareInputNNLstm_f_param_3]; + ld.param.u32 %r5, [prepareInputNNLstm_f_param_4]; + ld.param.u32 %r6, [prepareInputNNLstm_f_param_5]; + ld.param.u32 %r7, [prepareInputNNLstm_f_param_6]; + ld.param.u32 %r8, [prepareInputNNLstm_f_param_7]; + ld.param.u32 %r9, [prepareInputNNLstm_f_param_8]; + mov.u32 %r10, %ntid.x; + mov.u32 %r11, %ctaid.x; + mov.u32 %r12, %tid.x; + mad.lo.s32 %r1, %r10, %r11, %r12; + setp.ge.u32 %p1, %r1, %r9; + @%p1 bra BB127_4; + + cvta.to.global.u64 %rd5, %rd4; + div.s32 %r2, %r1, %r8; + rem.s32 %r3, %r1, %r8; + setp.lt.s32 %p2, %r3, %r6; + mul.wide.s32 %rd6, %r1, 4; + add.s64 %rd1, %rd5, %rd6; + @%p2 bra BB127_3; + bra.uni BB127_2; + +BB127_3: + cvta.to.global.u64 %rd10, %rd2; + mul.lo.s32 %r15, %r6, %r4; + mad.lo.s32 %r16, %r2, %r7, %r15; + add.s32 %r17, %r16, %r3; + mul.wide.s32 %rd11, %r17, 4; + add.s64 %rd12, %rd10, %rd11; + ld.global.f32 %f2, [%rd12]; + st.global.f32 [%rd1], %f2; + bra.uni BB127_4; + +BB127_2: + cvta.to.global.u64 %rd7, %rd3; + sub.s32 %r13, %r3, %r6; + mad.lo.s32 %r14, %r2, %r5, %r13; + mul.wide.s32 %rd8, %r14, 4; + add.s64 %rd9, %rd7, %rd8; + ld.global.f32 %f1, [%rd9]; + st.global.f32 [%rd1], %f1; + +BB127_4: + ret; +} + + // .globl squashIFOG_d +.visible .entry squashIFOG_d( + .param .u64 squashIFOG_d_param_0, + .param .u64 squashIFOG_d_param_1, + .param .u32 squashIFOG_d_param_2, + .param .u32 squashIFOG_d_param_3 +) +{ + .reg .pred %p<7>; + .reg .b32 %r<32>; + .reg .f64 %fd<152>; + .reg .b64 %rd<9>; + + + ld.param.u64 %rd2, [squashIFOG_d_param_0]; + ld.param.u64 %rd3, [squashIFOG_d_param_1]; + ld.param.u32 %r6, [squashIFOG_d_param_2]; + ld.param.u32 %r7, [squashIFOG_d_param_3]; + mov.u32 %r8, %ctaid.x; + mov.u32 %r9, %ntid.x; + mov.u32 %r10, %tid.x; + mad.lo.s32 %r1, %r9, %r8, %r10; + setp.ge.u32 %p1, %r1, %r7; + @%p1 bra BB128_11; + + cvta.to.global.u64 %rd4, %rd2; + shl.b32 %r11, %r6, 2; + rem.s32 %r12, %r1, %r11; + mul.wide.s32 %rd5, %r1, 8; + add.s64 %rd1, %rd4, %rd5; + cvta.to.global.u64 %rd6, %rd3; + mul.wide.s32 %rd7, %r12, 8; + add.s64 %rd8, %rd6, %rd7; + ld.global.f64 %fd14, [%rd8]; + ld.global.f64 %fd15, [%rd1]; + add.f64 %fd1, %fd15, %fd14; + mul.lo.s32 %r13, %r6, 3; + setp.lt.s32 %p2, %r12, %r13; + @%p2 bra BB128_6; + bra.uni BB128_2; + +BB128_6: + mul.f64 %fd7, %fd1, 0d3FE0000000000000; + { + .reg .b32 %temp; + mov.b64 {%temp, %r4}, %fd7; + } + and.b32 %r5, %r4, 2147483647; + { + .reg .b32 %temp; + mov.b64 {%r23, %temp}, %fd7; + } + mov.b64 %fd8, {%r23, %r5}; + setp.ltu.f64 %p5, %fd8, 0d3FE1C7A398201CD6; + @%p5 bra BB128_8; + bra.uni BB128_7; + +BB128_8: + mul.f64 %fd127, %fd7, %fd7; + mov.f64 %fd128, 0dBF2B9093D89F0E23; + mov.f64 %fd129, 0d3F0ABFFC9B5786C4; + fma.rn.f64 %fd130, %fd129, %fd127, %fd128; + mov.f64 %fd131, 0d3F42FA2744C30B61; + fma.rn.f64 %fd132, %fd130, %fd127, %fd131; + mov.f64 %fd133, 0dBF57CF3B9C1E491D; + fma.rn.f64 %fd134, %fd132, %fd127, %fd133; + mov.f64 %fd135, 0d3F6D6C61D450119A; + fma.rn.f64 %fd136, %fd134, %fd127, %fd135; + mov.f64 %fd137, 0dBF8226DDD44294F5; + fma.rn.f64 %fd138, %fd136, %fd127, %fd137; + mov.f64 %fd139, 0d3F9664F45C2B04A6; + fma.rn.f64 %fd140, %fd138, %fd127, %fd139; + mov.f64 %fd141, 0dBFABA1BA1AD70754; + fma.rn.f64 %fd142, %fd140, %fd127, %fd141; + mov.f64 %fd143, 0d3FC111111110295E; + fma.rn.f64 %fd144, %fd142, %fd127, %fd143; + mov.f64 %fd145, 0dBFD555555555549F; + fma.rn.f64 %fd146, %fd144, %fd127, %fd145; + mul.f64 %fd147, %fd127, %fd146; + fma.rn.f64 %fd150, %fd147, %fd7, %fd7; + bra.uni BB128_9; + +BB128_2: + { + .reg .b32 %temp; + mov.b64 {%temp, %r2}, %fd1; + } + and.b32 %r3, %r2, 2147483647; + { + .reg .b32 %temp; + mov.b64 {%r14, %temp}, %fd1; + } + mov.b64 %fd2, {%r14, %r3}; + setp.ltu.f64 %p3, %fd2, 0d3FE1C7A398201CD6; + @%p3 bra BB128_4; + bra.uni BB128_3; + +BB128_4: + mul.f64 %fd61, %fd1, %fd1; + mov.f64 %fd62, 0dBF2B9093D89F0E23; + mov.f64 %fd63, 0d3F0ABFFC9B5786C4; + fma.rn.f64 %fd64, %fd63, %fd61, %fd62; + mov.f64 %fd65, 0d3F42FA2744C30B61; + fma.rn.f64 %fd66, %fd64, %fd61, %fd65; + mov.f64 %fd67, 0dBF57CF3B9C1E491D; + fma.rn.f64 %fd68, %fd66, %fd61, %fd67; + mov.f64 %fd69, 0d3F6D6C61D450119A; + fma.rn.f64 %fd70, %fd68, %fd61, %fd69; + mov.f64 %fd71, 0dBF8226DDD44294F5; + fma.rn.f64 %fd72, %fd70, %fd61, %fd71; + mov.f64 %fd73, 0d3F9664F45C2B04A6; + fma.rn.f64 %fd74, %fd72, %fd61, %fd73; + mov.f64 %fd75, 0dBFABA1BA1AD70754; + fma.rn.f64 %fd76, %fd74, %fd61, %fd75; + mov.f64 %fd77, 0d3FC111111110295E; + fma.rn.f64 %fd78, %fd76, %fd61, %fd77; + mov.f64 %fd79, 0dBFD555555555549F; + fma.rn.f64 %fd80, %fd78, %fd61, %fd79; + mul.f64 %fd81, %fd61, %fd80; + fma.rn.f64 %fd149, %fd81, %fd1, %fd1; + bra.uni BB128_5; + +BB128_7: + add.f64 %fd82, %fd8, %fd8; + mov.f64 %fd83, 0d4338000000000000; + mov.f64 %fd84, 0d3FF71547652B82FE; + fma.rn.f64 %fd85, %fd82, %fd84, %fd83; + { + .reg .b32 %temp; + mov.b64 {%r24, %temp}, %fd85; + } + mov.f64 %fd86, 0dC338000000000000; + add.rn.f64 %fd87, %fd85, %fd86; + mov.f64 %fd88, 0dBFE62E42FEFA39EF; + fma.rn.f64 %fd89, %fd87, %fd88, %fd82; + mov.f64 %fd90, 0dBC7ABC9E3B39803F; + fma.rn.f64 %fd91, %fd87, %fd90, %fd89; + mov.f64 %fd92, 0d3E5AF86D8EBD13CD; + mov.f64 %fd93, 0d3E21F4076ACD15B6; + fma.rn.f64 %fd94, %fd93, %fd91, %fd92; + mov.f64 %fd95, 0d3E927E5092BA033D; + fma.rn.f64 %fd96, %fd94, %fd91, %fd95; + mov.f64 %fd97, 0d3EC71DDE6C5F9DA1; + fma.rn.f64 %fd98, %fd96, %fd91, %fd97; + mov.f64 %fd99, 0d3EFA01A018D034E6; + fma.rn.f64 %fd100, %fd98, %fd91, %fd99; + mov.f64 %fd101, 0d3F2A01A01B3B6940; + fma.rn.f64 %fd102, %fd100, %fd91, %fd101; + mov.f64 %fd103, 0d3F56C16C16C1B5DD; + fma.rn.f64 %fd104, %fd102, %fd91, %fd103; + mov.f64 %fd105, 0d3F8111111110F74D; + fma.rn.f64 %fd106, %fd104, %fd91, %fd105; + mov.f64 %fd107, 0d3FA555555555554D; + fma.rn.f64 %fd108, %fd106, %fd91, %fd107; + mov.f64 %fd109, 0d3FC5555555555557; + fma.rn.f64 %fd110, %fd108, %fd91, %fd109; + mov.f64 %fd111, 0d3FE0000000000000; + fma.rn.f64 %fd112, %fd110, %fd91, %fd111; + mul.f64 %fd113, %fd91, %fd112; + fma.rn.f64 %fd114, %fd113, %fd91, %fd91; + shl.b32 %r25, %r24, 20; + add.s32 %r26, %r25, 1072693248; + mov.u32 %r27, 0; + mov.b64 %fd115, {%r27, %r26}; + fma.rn.f64 %fd116, %fd114, %fd115, %fd115; + add.f64 %fd117, %fd116, 0d3FF0000000000000; + rcp.approx.ftz.f64 %fd118, %fd117; + neg.f64 %fd119, %fd117; + mov.f64 %fd120, 0d3FF0000000000000; + fma.rn.f64 %fd121, %fd119, %fd118, %fd120; + fma.rn.f64 %fd122, %fd121, %fd121, %fd121; + fma.rn.f64 %fd123, %fd122, %fd118, %fd118; + neg.f64 %fd124, %fd123; + mov.f64 %fd125, 0d4000000000000000; + fma.rn.f64 %fd126, %fd125, %fd124, %fd120; + setp.gt.u32 %p6, %r5, 1077936127; + selp.f64 %fd150, 0d3FF0000000000000, %fd126, %p6; + +BB128_9: + and.b32 %r28, %r4, -2147483648; + { + .reg .b32 %temp; + mov.b64 {%temp, %r29}, %fd150; + } + or.b32 %r30, %r29, %r28; + { + .reg .b32 %temp; + mov.b64 {%r31, %temp}, %fd150; + } + mov.b64 %fd148, {%r31, %r30}; + fma.rn.f64 %fd151, %fd148, 0d3FE0000000000000, 0d3FE0000000000000; + bra.uni BB128_10; + +BB128_3: + add.f64 %fd16, %fd2, %fd2; + mov.f64 %fd17, 0d4338000000000000; + mov.f64 %fd18, 0d3FF71547652B82FE; + fma.rn.f64 %fd19, %fd16, %fd18, %fd17; + { + .reg .b32 %temp; + mov.b64 {%r15, %temp}, %fd19; + } + mov.f64 %fd20, 0dC338000000000000; + add.rn.f64 %fd21, %fd19, %fd20; + mov.f64 %fd22, 0dBFE62E42FEFA39EF; + fma.rn.f64 %fd23, %fd21, %fd22, %fd16; + mov.f64 %fd24, 0dBC7ABC9E3B39803F; + fma.rn.f64 %fd25, %fd21, %fd24, %fd23; + mov.f64 %fd26, 0d3E5AF86D8EBD13CD; + mov.f64 %fd27, 0d3E21F4076ACD15B6; + fma.rn.f64 %fd28, %fd27, %fd25, %fd26; + mov.f64 %fd29, 0d3E927E5092BA033D; + fma.rn.f64 %fd30, %fd28, %fd25, %fd29; + mov.f64 %fd31, 0d3EC71DDE6C5F9DA1; + fma.rn.f64 %fd32, %fd30, %fd25, %fd31; + mov.f64 %fd33, 0d3EFA01A018D034E6; + fma.rn.f64 %fd34, %fd32, %fd25, %fd33; + mov.f64 %fd35, 0d3F2A01A01B3B6940; + fma.rn.f64 %fd36, %fd34, %fd25, %fd35; + mov.f64 %fd37, 0d3F56C16C16C1B5DD; + fma.rn.f64 %fd38, %fd36, %fd25, %fd37; + mov.f64 %fd39, 0d3F8111111110F74D; + fma.rn.f64 %fd40, %fd38, %fd25, %fd39; + mov.f64 %fd41, 0d3FA555555555554D; + fma.rn.f64 %fd42, %fd40, %fd25, %fd41; + mov.f64 %fd43, 0d3FC5555555555557; + fma.rn.f64 %fd44, %fd42, %fd25, %fd43; + mov.f64 %fd45, 0d3FE0000000000000; + fma.rn.f64 %fd46, %fd44, %fd25, %fd45; + mul.f64 %fd47, %fd25, %fd46; + fma.rn.f64 %fd48, %fd47, %fd25, %fd25; + shl.b32 %r16, %r15, 20; + add.s32 %r17, %r16, 1072693248; + mov.u32 %r18, 0; + mov.b64 %fd49, {%r18, %r17}; + fma.rn.f64 %fd50, %fd48, %fd49, %fd49; + add.f64 %fd51, %fd50, 0d3FF0000000000000; + rcp.approx.ftz.f64 %fd52, %fd51; + neg.f64 %fd53, %fd51; + mov.f64 %fd54, 0d3FF0000000000000; + fma.rn.f64 %fd55, %fd53, %fd52, %fd54; + fma.rn.f64 %fd56, %fd55, %fd55, %fd55; + fma.rn.f64 %fd57, %fd56, %fd52, %fd52; + neg.f64 %fd58, %fd57; + mov.f64 %fd59, 0d4000000000000000; + fma.rn.f64 %fd60, %fd59, %fd58, %fd54; + setp.gt.u32 %p4, %r3, 1077936127; + selp.f64 %fd149, 0d3FF0000000000000, %fd60, %p4; + +BB128_5: + and.b32 %r19, %r2, -2147483648; + { + .reg .b32 %temp; + mov.b64 {%temp, %r20}, %fd149; + } + or.b32 %r21, %r20, %r19; + { + .reg .b32 %temp; + mov.b64 {%r22, %temp}, %fd149; + } + mov.b64 %fd151, {%r22, %r21}; + +BB128_10: + st.global.f64 [%rd1], %fd151; + +BB128_11: + ret; +} + + // .globl squashIFOG_f +.visible .entry squashIFOG_f( + .param .u64 squashIFOG_f_param_0, + .param .u64 squashIFOG_f_param_1, + .param .u32 squashIFOG_f_param_2, + .param .u32 squashIFOG_f_param_3 +) +{ + .reg .pred %p<8>; + .reg .f32 %f<36>; + .reg .b32 %r<26>; + .reg .f64 %fd<76>; + .reg .b64 %rd<9>; + + + ld.param.u64 %rd2, [squashIFOG_f_param_0]; + ld.param.u64 %rd3, [squashIFOG_f_param_1]; + ld.param.u32 %r4, [squashIFOG_f_param_2]; + ld.param.u32 %r5, [squashIFOG_f_param_3]; + mov.u32 %r6, %ctaid.x; + mov.u32 %r7, %ntid.x; + mov.u32 %r8, %tid.x; + mad.lo.s32 %r1, %r7, %r6, %r8; + setp.ge.u32 %p1, %r1, %r5; + @%p1 bra BB129_10; + + cvta.to.global.u64 %rd4, %rd2; + shl.b32 %r9, %r4, 2; + rem.s32 %r10, %r1, %r9; + mul.wide.s32 %rd5, %r1, 4; + add.s64 %rd1, %rd4, %rd5; + cvta.to.global.u64 %rd6, %rd3; + mul.wide.s32 %rd7, %r10, 4; + add.s64 %rd8, %rd6, %rd7; + ld.global.f32 %f7, [%rd8]; + ld.global.f32 %f8, [%rd1]; + add.f32 %f1, %f8, %f7; + mul.lo.s32 %r11, %r4, 3; + setp.lt.s32 %p2, %r10, %r11; + @%p2 bra BB129_5; + bra.uni BB129_2; + +BB129_5: + cvt.f64.f32 %fd6, %f1; + mul.f64 %fd1, %fd6, 0d3FE0000000000000; + { + .reg .b32 %temp; + mov.b64 {%temp, %r2}, %fd1; + } + and.b32 %r3, %r2, 2147483647; + { + .reg .b32 %temp; + mov.b64 {%r17, %temp}, %fd1; + } + mov.b64 %fd2, {%r17, %r3}; + setp.ltu.f64 %p6, %fd2, 0d3FE1C7A398201CD6; + @%p6 bra BB129_7; + bra.uni BB129_6; + +BB129_7: + mul.f64 %fd52, %fd1, %fd1; + mov.f64 %fd53, 0dBF2B9093D89F0E23; + mov.f64 %fd54, 0d3F0ABFFC9B5786C4; + fma.rn.f64 %fd55, %fd54, %fd52, %fd53; + mov.f64 %fd56, 0d3F42FA2744C30B61; + fma.rn.f64 %fd57, %fd55, %fd52, %fd56; + mov.f64 %fd58, 0dBF57CF3B9C1E491D; + fma.rn.f64 %fd59, %fd57, %fd52, %fd58; + mov.f64 %fd60, 0d3F6D6C61D450119A; + fma.rn.f64 %fd61, %fd59, %fd52, %fd60; + mov.f64 %fd62, 0dBF8226DDD44294F5; + fma.rn.f64 %fd63, %fd61, %fd52, %fd62; + mov.f64 %fd64, 0d3F9664F45C2B04A6; + fma.rn.f64 %fd65, %fd63, %fd52, %fd64; + mov.f64 %fd66, 0dBFABA1BA1AD70754; + fma.rn.f64 %fd67, %fd65, %fd52, %fd66; + mov.f64 %fd68, 0d3FC111111110295E; + fma.rn.f64 %fd69, %fd67, %fd52, %fd68; + mov.f64 %fd70, 0dBFD555555555549F; + fma.rn.f64 %fd71, %fd69, %fd52, %fd70; + mul.f64 %fd72, %fd52, %fd71; + fma.rn.f64 %fd75, %fd72, %fd1, %fd1; + bra.uni BB129_8; + +BB129_2: + abs.f32 %f2, %f1; + setp.ltu.f32 %p3, %f2, 0f3F0CCCCD; + @%p3 bra BB129_4; + bra.uni BB129_3; + +BB129_4: + mul.f32 %f24, %f1, %f1; + mov.f32 %f25, 0fBD57BE66; + mov.f32 %f26, 0f3C86A81B; + fma.rn.f32 %f27, %f26, %f24, %f25; + mov.f32 %f28, 0f3E08677B; + fma.rn.f32 %f29, %f27, %f24, %f28; + mov.f32 %f30, 0fBEAAAA29; + fma.rn.f32 %f31, %f29, %f24, %f30; + mul.f32 %f32, %f24, %f31; + fma.rn.f32 %f33, %f32, %f1, %f1; + add.f32 %f34, %f1, %f1; + setp.eq.f32 %p5, %f1, 0f00000000; + selp.f32 %f35, %f34, %f33, %p5; + bra.uni BB129_9; + +BB129_6: + add.f64 %fd7, %fd2, %fd2; + mov.f64 %fd8, 0d4338000000000000; + mov.f64 %fd9, 0d3FF71547652B82FE; + fma.rn.f64 %fd10, %fd7, %fd9, %fd8; + { + .reg .b32 %temp; + mov.b64 {%r18, %temp}, %fd10; + } + mov.f64 %fd11, 0dC338000000000000; + add.rn.f64 %fd12, %fd10, %fd11; + mov.f64 %fd13, 0dBFE62E42FEFA39EF; + fma.rn.f64 %fd14, %fd12, %fd13, %fd7; + mov.f64 %fd15, 0dBC7ABC9E3B39803F; + fma.rn.f64 %fd16, %fd12, %fd15, %fd14; + mov.f64 %fd17, 0d3E5AF86D8EBD13CD; + mov.f64 %fd18, 0d3E21F4076ACD15B6; + fma.rn.f64 %fd19, %fd18, %fd16, %fd17; + mov.f64 %fd20, 0d3E927E5092BA033D; + fma.rn.f64 %fd21, %fd19, %fd16, %fd20; + mov.f64 %fd22, 0d3EC71DDE6C5F9DA1; + fma.rn.f64 %fd23, %fd21, %fd16, %fd22; + mov.f64 %fd24, 0d3EFA01A018D034E6; + fma.rn.f64 %fd25, %fd23, %fd16, %fd24; + mov.f64 %fd26, 0d3F2A01A01B3B6940; + fma.rn.f64 %fd27, %fd25, %fd16, %fd26; + mov.f64 %fd28, 0d3F56C16C16C1B5DD; + fma.rn.f64 %fd29, %fd27, %fd16, %fd28; + mov.f64 %fd30, 0d3F8111111110F74D; + fma.rn.f64 %fd31, %fd29, %fd16, %fd30; + mov.f64 %fd32, 0d3FA555555555554D; + fma.rn.f64 %fd33, %fd31, %fd16, %fd32; + mov.f64 %fd34, 0d3FC5555555555557; + fma.rn.f64 %fd35, %fd33, %fd16, %fd34; + mov.f64 %fd36, 0d3FE0000000000000; + fma.rn.f64 %fd37, %fd35, %fd16, %fd36; + mul.f64 %fd38, %fd16, %fd37; + fma.rn.f64 %fd39, %fd38, %fd16, %fd16; + shl.b32 %r19, %r18, 20; + add.s32 %r20, %r19, 1072693248; + mov.u32 %r21, 0; + mov.b64 %fd40, {%r21, %r20}; + fma.rn.f64 %fd41, %fd39, %fd40, %fd40; + add.f64 %fd42, %fd41, 0d3FF0000000000000; + rcp.approx.ftz.f64 %fd43, %fd42; + neg.f64 %fd44, %fd42; + mov.f64 %fd45, 0d3FF0000000000000; + fma.rn.f64 %fd46, %fd44, %fd43, %fd45; + fma.rn.f64 %fd47, %fd46, %fd46, %fd46; + fma.rn.f64 %fd48, %fd47, %fd43, %fd43; + neg.f64 %fd49, %fd48; + mov.f64 %fd50, 0d4000000000000000; + fma.rn.f64 %fd51, %fd50, %fd49, %fd45; + setp.gt.u32 %p7, %r3, 1077936127; + selp.f64 %fd75, 0d3FF0000000000000, %fd51, %p7; + +BB129_8: + and.b32 %r22, %r2, -2147483648; + { + .reg .b32 %temp; + mov.b64 {%temp, %r23}, %fd75; + } + or.b32 %r24, %r23, %r22; + { + .reg .b32 %temp; + mov.b64 {%r25, %temp}, %fd75; + } + mov.b64 %fd73, {%r25, %r24}; + fma.rn.f64 %fd74, %fd73, 0d3FE0000000000000, 0d3FE0000000000000; + cvt.rn.f32.f64 %f35, %fd74; + bra.uni BB129_9; + +BB129_3: + add.f32 %f11, %f2, %f2; + mul.f32 %f12, %f11, 0f3FB8AA3B; + cvt.rzi.f32.f32 %f13, %f12; + mov.f32 %f14, 0fBF317200; + fma.rn.f32 %f15, %f13, %f14, %f11; + mov.f32 %f16, 0fB5BFBE8E; + fma.rn.f32 %f17, %f13, %f16, %f15; + mul.f32 %f18, %f17, 0f3FB8AA3B; + ex2.approx.ftz.f32 %f19, %f18; + ex2.approx.f32 %f20, %f13; + mov.f32 %f21, 0f3F800000; + fma.rn.f32 %f10, %f19, %f20, %f21; + // inline asm + rcp.approx.ftz.f32 %f9,%f10; + // inline asm + mov.f32 %f22, 0fC0000000; + fma.rn.f32 %f23, %f9, %f22, %f21; + mov.b32 %r12, %f23; + setp.ltu.f32 %p4, %f2, 0f42B00000; + selp.b32 %r13, %r12, 1065353216, %p4; + mov.b32 %r14, %f1; + and.b32 %r15, %r14, -2147483648; + or.b32 %r16, %r13, %r15; + mov.b32 %f35, %r16; + +BB129_9: + st.global.f32 [%rd1], %f35; + +BB129_10: + ret; +} + + // .globl postProcessNNLstmForward_d +.visible .entry postProcessNNLstmForward_d( + .param .u64 postProcessNNLstmForward_d_param_0, + .param .u64 postProcessNNLstmForward_d_param_1, + .param .u64 postProcessNNLstmForward_d_param_2, + .param .u64 postProcessNNLstmForward_d_param_3, + .param .u64 postProcessNNLstmForward_d_param_4, + .param .u64 postProcessNNLstmForward_d_param_5, + .param .u64 postProcessNNLstmForward_d_param_6, + .param .u32 postProcessNNLstmForward_d_param_7, + .param .u32 postProcessNNLstmForward_d_param_8, + .param .u32 postProcessNNLstmForward_d_param_9, + .param .u32 postProcessNNLstmForward_d_param_10, + .param .u32 postProcessNNLstmForward_d_param_11 +) +{ + .reg .pred %p<5>; + .reg .b32 %r<41>; + .reg .f64 %fd<81>; + .reg .b64 %rd<34>; + + + ld.param.u64 %rd2, [postProcessNNLstmForward_d_param_0]; + ld.param.u64 %rd5, [postProcessNNLstmForward_d_param_3]; + ld.param.u64 %rd6, [postProcessNNLstmForward_d_param_4]; + ld.param.u64 %rd8, [postProcessNNLstmForward_d_param_6]; + ld.param.u32 %r8, [postProcessNNLstmForward_d_param_10]; + ld.param.u32 %r9, [postProcessNNLstmForward_d_param_11]; + mov.u32 %r10, %ntid.x; + mov.u32 %r11, %ctaid.x; + mov.u32 %r12, %tid.x; + mad.lo.s32 %r1, %r10, %r11, %r12; + setp.ge.u32 %p1, %r1, %r9; + @%p1 bra BB130_5; + + cvta.to.global.u64 %rd9, %rd2; + cvta.to.global.u64 %rd10, %rd5; + div.s32 %r2, %r1, %r8; + shl.b32 %r13, %r8, 2; + rem.s32 %r3, %r1, %r8; + mad.lo.s32 %r14, %r2, %r13, %r3; + mul.wide.s32 %rd11, %r14, 8; + add.s64 %rd12, %rd9, %rd11; + shl.b32 %r15, %r8, 3; + cvt.s64.s32 %rd13, %r15; + add.s64 %rd14, %rd12, %rd13; + add.s64 %rd15, %rd14, %rd13; + ld.global.f64 %fd1, [%rd15]; + add.s64 %rd16, %rd15, %rd13; + mul.wide.s32 %rd17, %r1, 8; + add.s64 %rd1, %rd10, %rd17; + ld.global.f64 %fd7, [%rd1]; + ld.global.f64 %fd8, [%rd14]; + mul.f64 %fd9, %fd8, %fd7; + ld.global.f64 %fd10, [%rd16]; + ld.global.f64 %fd11, [%rd12]; + fma.rn.f64 %fd2, %fd11, %fd10, %fd9; + { + .reg .b32 %temp; + mov.b64 {%temp, %r4}, %fd2; + } + and.b32 %r16, %r4, 2147483647; + { + .reg .b32 %temp; + mov.b64 {%r17, %temp}, %fd2; + } + mov.b64 %fd3, {%r17, %r16}; + setp.ltu.f64 %p2, %fd3, 0d3FE1C7A398201CD6; + @%p2 bra BB130_3; + bra.uni BB130_2; + +BB130_3: + mul.f64 %fd57, %fd2, %fd2; + mov.f64 %fd58, 0dBF2B9093D89F0E23; + mov.f64 %fd59, 0d3F0ABFFC9B5786C4; + fma.rn.f64 %fd60, %fd59, %fd57, %fd58; + mov.f64 %fd61, 0d3F42FA2744C30B61; + fma.rn.f64 %fd62, %fd60, %fd57, %fd61; + mov.f64 %fd63, 0dBF57CF3B9C1E491D; + fma.rn.f64 %fd64, %fd62, %fd57, %fd63; + mov.f64 %fd65, 0d3F6D6C61D450119A; + fma.rn.f64 %fd66, %fd64, %fd57, %fd65; + mov.f64 %fd67, 0dBF8226DDD44294F5; + fma.rn.f64 %fd68, %fd66, %fd57, %fd67; + mov.f64 %fd69, 0d3F9664F45C2B04A6; + fma.rn.f64 %fd70, %fd68, %fd57, %fd69; + mov.f64 %fd71, 0dBFABA1BA1AD70754; + fma.rn.f64 %fd72, %fd70, %fd57, %fd71; + mov.f64 %fd73, 0d3FC111111110295E; + fma.rn.f64 %fd74, %fd72, %fd57, %fd73; + mov.f64 %fd75, 0dBFD555555555549F; + fma.rn.f64 %fd76, %fd74, %fd57, %fd75; + mul.f64 %fd77, %fd57, %fd76; + fma.rn.f64 %fd80, %fd77, %fd2, %fd2; + bra.uni BB130_4; + +BB130_2: + add.f64 %fd12, %fd3, %fd3; + mov.f64 %fd13, 0d4338000000000000; + mov.f64 %fd14, 0d3FF71547652B82FE; + fma.rn.f64 %fd15, %fd12, %fd14, %fd13; + { + .reg .b32 %temp; + mov.b64 {%r18, %temp}, %fd15; + } + mov.f64 %fd16, 0dC338000000000000; + add.rn.f64 %fd17, %fd15, %fd16; + mov.f64 %fd18, 0dBFE62E42FEFA39EF; + fma.rn.f64 %fd19, %fd17, %fd18, %fd12; + mov.f64 %fd20, 0dBC7ABC9E3B39803F; + fma.rn.f64 %fd21, %fd17, %fd20, %fd19; + mov.f64 %fd22, 0d3E5AF86D8EBD13CD; + mov.f64 %fd23, 0d3E21F4076ACD15B6; + fma.rn.f64 %fd24, %fd23, %fd21, %fd22; + mov.f64 %fd25, 0d3E927E5092BA033D; + fma.rn.f64 %fd26, %fd24, %fd21, %fd25; + mov.f64 %fd27, 0d3EC71DDE6C5F9DA1; + fma.rn.f64 %fd28, %fd26, %fd21, %fd27; + mov.f64 %fd29, 0d3EFA01A018D034E6; + fma.rn.f64 %fd30, %fd28, %fd21, %fd29; + mov.f64 %fd31, 0d3F2A01A01B3B6940; + fma.rn.f64 %fd32, %fd30, %fd21, %fd31; + mov.f64 %fd33, 0d3F56C16C16C1B5DD; + fma.rn.f64 %fd34, %fd32, %fd21, %fd33; + mov.f64 %fd35, 0d3F8111111110F74D; + fma.rn.f64 %fd36, %fd34, %fd21, %fd35; + mov.f64 %fd37, 0d3FA555555555554D; + fma.rn.f64 %fd38, %fd36, %fd21, %fd37; + mov.f64 %fd39, 0d3FC5555555555557; + fma.rn.f64 %fd40, %fd38, %fd21, %fd39; + mov.f64 %fd41, 0d3FE0000000000000; + fma.rn.f64 %fd42, %fd40, %fd21, %fd41; + mul.f64 %fd43, %fd21, %fd42; + fma.rn.f64 %fd44, %fd43, %fd21, %fd21; + shl.b32 %r19, %r18, 20; + add.s32 %r20, %r19, 1072693248; + mov.u32 %r21, 0; + mov.b64 %fd45, {%r21, %r20}; + fma.rn.f64 %fd46, %fd44, %fd45, %fd45; + add.f64 %fd47, %fd46, 0d3FF0000000000000; + rcp.approx.ftz.f64 %fd48, %fd47; + neg.f64 %fd49, %fd47; + mov.f64 %fd50, 0d3FF0000000000000; + fma.rn.f64 %fd51, %fd49, %fd48, %fd50; + fma.rn.f64 %fd52, %fd51, %fd51, %fd51; + fma.rn.f64 %fd53, %fd52, %fd48, %fd48; + neg.f64 %fd54, %fd53; + mov.f64 %fd55, 0d4000000000000000; + fma.rn.f64 %fd56, %fd55, %fd54, %fd50; + setp.gt.u32 %p3, %r16, 1077936127; + selp.f64 %fd80, 0d3FF0000000000000, %fd56, %p3; + +BB130_4: + ld.param.u64 %rd33, [postProcessNNLstmForward_d_param_5]; + ld.param.u64 %rd32, [postProcessNNLstmForward_d_param_2]; + ld.param.u64 %rd31, [postProcessNNLstmForward_d_param_1]; + ld.param.u32 %r40, [postProcessNNLstmForward_d_param_11]; + ld.param.u32 %r39, [postProcessNNLstmForward_d_param_7]; + ld.param.u32 %r38, [postProcessNNLstmForward_d_param_8]; + ld.param.u32 %r37, [postProcessNNLstmForward_d_param_9]; + { + .reg .b32 %temp; + mov.b64 {%temp, %r36}, %fd2; + } + ld.param.u32 %r35, [postProcessNNLstmForward_d_param_10]; + and.b32 %r23, %r36, -2147483648; + { + .reg .b32 %temp; + mov.b64 {%temp, %r24}, %fd80; + } + or.b32 %r25, %r24, %r23; + { + .reg .b32 %temp; + mov.b64 {%r26, %temp}, %fd80; + } + mov.b64 %fd78, {%r26, %r25}; + mul.f64 %fd79, %fd1, %fd78; + mad.lo.s32 %r27, %r2, %r37, %r38; + mad.lo.s32 %r28, %r27, %r35, %r3; + setp.eq.s32 %p4, %r39, 0; + selp.b32 %r33, %r1, %r28, %p4; + mad.lo.s32 %r34, %r40, %r38, %r1; + cvta.to.global.u64 %rd18, %rd31; + add.s64 %rd20, %rd18, %rd17; + st.global.f64 [%rd20], %fd2; + cvta.to.global.u64 %rd21, %rd32; + add.s64 %rd22, %rd21, %rd17; + st.global.f64 [%rd22], %fd79; + st.global.f64 [%rd1], %fd2; + cvta.to.global.u64 %rd23, %rd33; + mul.wide.s32 %rd24, %r34, 8; + add.s64 %rd25, %rd23, %rd24; + st.global.f64 [%rd25], %fd79; + cvta.to.global.u64 %rd26, %rd8; + add.s64 %rd27, %rd26, %rd24; + st.global.f64 [%rd27], %fd2; + cvta.to.global.u64 %rd28, %rd6; + mul.wide.s32 %rd29, %r33, 8; + add.s64 %rd30, %rd28, %rd29; + st.global.f64 [%rd30], %fd79; + +BB130_5: + ret; +} + + // .globl postProcessNNLstmForward_f +.visible .entry postProcessNNLstmForward_f( + .param .u64 postProcessNNLstmForward_f_param_0, + .param .u64 postProcessNNLstmForward_f_param_1, + .param .u64 postProcessNNLstmForward_f_param_2, + .param .u64 postProcessNNLstmForward_f_param_3, + .param .u64 postProcessNNLstmForward_f_param_4, + .param .u64 postProcessNNLstmForward_f_param_5, + .param .u64 postProcessNNLstmForward_f_param_6, + .param .u32 postProcessNNLstmForward_f_param_7, + .param .u32 postProcessNNLstmForward_f_param_8, + .param .u32 postProcessNNLstmForward_f_param_9, + .param .u32 postProcessNNLstmForward_f_param_10, + .param .u32 postProcessNNLstmForward_f_param_11 +) +{ + .reg .pred %p<6>; + .reg .f32 %f<40>; + .reg .b32 %r<23>; + .reg .b64 %rd<31>; + + + ld.param.u64 %rd2, [postProcessNNLstmForward_f_param_0]; + ld.param.u64 %rd3, [postProcessNNLstmForward_f_param_1]; + ld.param.u64 %rd4, [postProcessNNLstmForward_f_param_2]; + ld.param.u64 %rd5, [postProcessNNLstmForward_f_param_3]; + ld.param.u64 %rd6, [postProcessNNLstmForward_f_param_4]; + ld.param.u64 %rd7, [postProcessNNLstmForward_f_param_5]; + ld.param.u64 %rd8, [postProcessNNLstmForward_f_param_6]; + ld.param.u32 %r4, [postProcessNNLstmForward_f_param_7]; + ld.param.u32 %r5, [postProcessNNLstmForward_f_param_8]; + ld.param.u32 %r6, [postProcessNNLstmForward_f_param_9]; + ld.param.u32 %r7, [postProcessNNLstmForward_f_param_10]; + ld.param.u32 %r8, [postProcessNNLstmForward_f_param_11]; + mov.u32 %r9, %ntid.x; + mov.u32 %r10, %ctaid.x; + mov.u32 %r11, %tid.x; + mad.lo.s32 %r1, %r9, %r10, %r11; + setp.ge.u32 %p1, %r1, %r8; + @%p1 bra BB131_5; + + cvta.to.global.u64 %rd9, %rd2; + cvta.to.global.u64 %rd10, %rd5; + div.s32 %r2, %r1, %r7; + rem.s32 %r3, %r1, %r7; + shl.b32 %r12, %r7, 2; + mad.lo.s32 %r13, %r2, %r12, %r3; + mul.wide.s32 %rd11, %r13, 4; + add.s64 %rd12, %rd9, %rd11; + cvt.s64.s32 %rd13, %r12; + add.s64 %rd14, %rd12, %rd13; + add.s64 %rd15, %rd14, %rd13; + ld.global.f32 %f1, [%rd15]; + add.s64 %rd16, %rd15, %rd13; + mul.wide.s32 %rd17, %r1, 4; + add.s64 %rd1, %rd10, %rd17; + ld.global.f32 %f7, [%rd1]; + ld.global.f32 %f8, [%rd14]; + mul.f32 %f9, %f8, %f7; + ld.global.f32 %f10, [%rd16]; + ld.global.f32 %f11, [%rd12]; + fma.rn.f32 %f2, %f11, %f10, %f9; + abs.f32 %f3, %f2; + setp.ltu.f32 %p2, %f3, 0f3F0CCCCD; + @%p2 bra BB131_3; + bra.uni BB131_2; + +BB131_3: + mul.f32 %f27, %f2, %f2; + mov.f32 %f28, 0fBD57BE66; + mov.f32 %f29, 0f3C86A81B; + fma.rn.f32 %f30, %f29, %f27, %f28; + mov.f32 %f31, 0f3E08677B; + fma.rn.f32 %f32, %f30, %f27, %f31; + mov.f32 %f33, 0fBEAAAA29; + fma.rn.f32 %f34, %f32, %f27, %f33; + mul.f32 %f35, %f27, %f34; + fma.rn.f32 %f36, %f35, %f2, %f2; + add.f32 %f37, %f2, %f2; + setp.eq.f32 %p4, %f2, 0f00000000; + selp.f32 %f39, %f37, %f36, %p4; + bra.uni BB131_4; + +BB131_2: + add.f32 %f14, %f3, %f3; + mul.f32 %f15, %f14, 0f3FB8AA3B; + cvt.rzi.f32.f32 %f16, %f15; + mov.f32 %f17, 0fBF317200; + fma.rn.f32 %f18, %f16, %f17, %f14; + mov.f32 %f19, 0fB5BFBE8E; + fma.rn.f32 %f20, %f16, %f19, %f18; + mul.f32 %f21, %f20, 0f3FB8AA3B; + ex2.approx.ftz.f32 %f22, %f21; + ex2.approx.f32 %f23, %f16; + mov.f32 %f24, 0f3F800000; + fma.rn.f32 %f13, %f22, %f23, %f24; + // inline asm + rcp.approx.ftz.f32 %f12,%f13; + // inline asm + mov.f32 %f25, 0fC0000000; + fma.rn.f32 %f26, %f12, %f25, %f24; + mov.b32 %r14, %f26; + setp.ltu.f32 %p3, %f3, 0f42B00000; + selp.b32 %r15, %r14, 1065353216, %p3; + mov.b32 %r16, %f2; + and.b32 %r17, %r16, -2147483648; + or.b32 %r18, %r15, %r17; + mov.b32 %f39, %r18; + +BB131_4: + mad.lo.s32 %r19, %r2, %r6, %r5; + mad.lo.s32 %r20, %r19, %r7, %r3; + setp.eq.s32 %p5, %r4, 0; + selp.b32 %r21, %r1, %r20, %p5; + mad.lo.s32 %r22, %r8, %r5, %r1; + cvta.to.global.u64 %rd18, %rd3; + add.s64 %rd20, %rd18, %rd17; + st.global.f32 [%rd20], %f2; + cvta.to.global.u64 %rd21, %rd4; + add.s64 %rd22, %rd21, %rd17; + mul.f32 %f38, %f1, %f39; + st.global.f32 [%rd22], %f38; + st.global.f32 [%rd1], %f2; + cvta.to.global.u64 %rd23, %rd7; + mul.wide.s32 %rd24, %r22, 4; + add.s64 %rd25, %rd23, %rd24; + st.global.f32 [%rd25], %f38; + cvta.to.global.u64 %rd26, %rd8; + add.s64 %rd27, %rd26, %rd24; + st.global.f32 [%rd27], %f2; + cvta.to.global.u64 %rd28, %rd6; + mul.wide.s32 %rd29, %r21, 4; + add.s64 %rd30, %rd28, %rd29; + st.global.f32 [%rd30], %f38; + +BB131_5: + ret; +} + + // .globl postProcessNNLstmForwardSkipCache_d +.visible .entry postProcessNNLstmForwardSkipCache_d( + .param .u64 postProcessNNLstmForwardSkipCache_d_param_0, + .param .u64 postProcessNNLstmForwardSkipCache_d_param_1, + .param .u64 postProcessNNLstmForwardSkipCache_d_param_2, + .param .u64 postProcessNNLstmForwardSkipCache_d_param_3, + .param .u64 postProcessNNLstmForwardSkipCache_d_param_4, + .param .u32 postProcessNNLstmForwardSkipCache_d_param_5, + .param .u32 postProcessNNLstmForwardSkipCache_d_param_6, + .param .u32 postProcessNNLstmForwardSkipCache_d_param_7, + .param .u32 postProcessNNLstmForwardSkipCache_d_param_8, + .param .u32 postProcessNNLstmForwardSkipCache_d_param_9 +) +{ + .reg .pred %p<5>; + .reg .b32 %r<35>; + .reg .f64 %fd<81>; + .reg .b64 %rd<25>; + + + ld.param.u64 %rd2, [postProcessNNLstmForwardSkipCache_d_param_0]; + ld.param.u64 %rd4, [postProcessNNLstmForwardSkipCache_d_param_2]; + ld.param.u64 %rd5, [postProcessNNLstmForwardSkipCache_d_param_3]; + ld.param.u64 %rd6, [postProcessNNLstmForwardSkipCache_d_param_4]; + ld.param.u32 %r8, [postProcessNNLstmForwardSkipCache_d_param_8]; + ld.param.u32 %r9, [postProcessNNLstmForwardSkipCache_d_param_9]; + mov.u32 %r10, %ntid.x; + mov.u32 %r11, %ctaid.x; + mov.u32 %r12, %tid.x; + mad.lo.s32 %r1, %r10, %r11, %r12; + setp.ge.u32 %p1, %r1, %r9; + @%p1 bra BB132_5; + + cvta.to.global.u64 %rd7, %rd2; + cvta.to.global.u64 %rd8, %rd5; + div.s32 %r2, %r1, %r8; + shl.b32 %r13, %r8, 2; + rem.s32 %r3, %r1, %r8; + mad.lo.s32 %r14, %r2, %r13, %r3; + mul.wide.s32 %rd9, %r14, 8; + add.s64 %rd10, %rd7, %rd9; + shl.b32 %r15, %r8, 3; + cvt.s64.s32 %rd11, %r15; + add.s64 %rd12, %rd10, %rd11; + add.s64 %rd13, %rd12, %rd11; + ld.global.f64 %fd1, [%rd13]; + add.s64 %rd14, %rd13, %rd11; + mul.wide.s32 %rd15, %r1, 8; + add.s64 %rd1, %rd8, %rd15; + ld.global.f64 %fd7, [%rd1]; + ld.global.f64 %fd8, [%rd12]; + mul.f64 %fd9, %fd8, %fd7; + ld.global.f64 %fd10, [%rd14]; + ld.global.f64 %fd11, [%rd10]; + fma.rn.f64 %fd2, %fd11, %fd10, %fd9; + { + .reg .b32 %temp; + mov.b64 {%temp, %r4}, %fd2; + } + and.b32 %r16, %r4, 2147483647; + { + .reg .b32 %temp; + mov.b64 {%r17, %temp}, %fd2; + } + mov.b64 %fd3, {%r17, %r16}; + setp.ltu.f64 %p2, %fd3, 0d3FE1C7A398201CD6; + @%p2 bra BB132_3; + bra.uni BB132_2; + +BB132_3: + mul.f64 %fd57, %fd2, %fd2; + mov.f64 %fd58, 0dBF2B9093D89F0E23; + mov.f64 %fd59, 0d3F0ABFFC9B5786C4; + fma.rn.f64 %fd60, %fd59, %fd57, %fd58; + mov.f64 %fd61, 0d3F42FA2744C30B61; + fma.rn.f64 %fd62, %fd60, %fd57, %fd61; + mov.f64 %fd63, 0dBF57CF3B9C1E491D; + fma.rn.f64 %fd64, %fd62, %fd57, %fd63; + mov.f64 %fd65, 0d3F6D6C61D450119A; + fma.rn.f64 %fd66, %fd64, %fd57, %fd65; + mov.f64 %fd67, 0dBF8226DDD44294F5; + fma.rn.f64 %fd68, %fd66, %fd57, %fd67; + mov.f64 %fd69, 0d3F9664F45C2B04A6; + fma.rn.f64 %fd70, %fd68, %fd57, %fd69; + mov.f64 %fd71, 0dBFABA1BA1AD70754; + fma.rn.f64 %fd72, %fd70, %fd57, %fd71; + mov.f64 %fd73, 0d3FC111111110295E; + fma.rn.f64 %fd74, %fd72, %fd57, %fd73; + mov.f64 %fd75, 0dBFD555555555549F; + fma.rn.f64 %fd76, %fd74, %fd57, %fd75; + mul.f64 %fd77, %fd57, %fd76; + fma.rn.f64 %fd80, %fd77, %fd2, %fd2; + bra.uni BB132_4; + +BB132_2: + add.f64 %fd12, %fd3, %fd3; + mov.f64 %fd13, 0d4338000000000000; + mov.f64 %fd14, 0d3FF71547652B82FE; + fma.rn.f64 %fd15, %fd12, %fd14, %fd13; + { + .reg .b32 %temp; + mov.b64 {%r18, %temp}, %fd15; + } + mov.f64 %fd16, 0dC338000000000000; + add.rn.f64 %fd17, %fd15, %fd16; + mov.f64 %fd18, 0dBFE62E42FEFA39EF; + fma.rn.f64 %fd19, %fd17, %fd18, %fd12; + mov.f64 %fd20, 0dBC7ABC9E3B39803F; + fma.rn.f64 %fd21, %fd17, %fd20, %fd19; + mov.f64 %fd22, 0d3E5AF86D8EBD13CD; + mov.f64 %fd23, 0d3E21F4076ACD15B6; + fma.rn.f64 %fd24, %fd23, %fd21, %fd22; + mov.f64 %fd25, 0d3E927E5092BA033D; + fma.rn.f64 %fd26, %fd24, %fd21, %fd25; + mov.f64 %fd27, 0d3EC71DDE6C5F9DA1; + fma.rn.f64 %fd28, %fd26, %fd21, %fd27; + mov.f64 %fd29, 0d3EFA01A018D034E6; + fma.rn.f64 %fd30, %fd28, %fd21, %fd29; + mov.f64 %fd31, 0d3F2A01A01B3B6940; + fma.rn.f64 %fd32, %fd30, %fd21, %fd31; + mov.f64 %fd33, 0d3F56C16C16C1B5DD; + fma.rn.f64 %fd34, %fd32, %fd21, %fd33; + mov.f64 %fd35, 0d3F8111111110F74D; + fma.rn.f64 %fd36, %fd34, %fd21, %fd35; + mov.f64 %fd37, 0d3FA555555555554D; + fma.rn.f64 %fd38, %fd36, %fd21, %fd37; + mov.f64 %fd39, 0d3FC5555555555557; + fma.rn.f64 %fd40, %fd38, %fd21, %fd39; + mov.f64 %fd41, 0d3FE0000000000000; + fma.rn.f64 %fd42, %fd40, %fd21, %fd41; + mul.f64 %fd43, %fd21, %fd42; + fma.rn.f64 %fd44, %fd43, %fd21, %fd21; + shl.b32 %r19, %r18, 20; + add.s32 %r20, %r19, 1072693248; + mov.u32 %r21, 0; + mov.b64 %fd45, {%r21, %r20}; + fma.rn.f64 %fd46, %fd44, %fd45, %fd45; + add.f64 %fd47, %fd46, 0d3FF0000000000000; + rcp.approx.ftz.f64 %fd48, %fd47; + neg.f64 %fd49, %fd47; + mov.f64 %fd50, 0d3FF0000000000000; + fma.rn.f64 %fd51, %fd49, %fd48, %fd50; + fma.rn.f64 %fd52, %fd51, %fd51, %fd51; + fma.rn.f64 %fd53, %fd52, %fd48, %fd48; + neg.f64 %fd54, %fd53; + mov.f64 %fd55, 0d4000000000000000; + fma.rn.f64 %fd56, %fd55, %fd54, %fd50; + setp.gt.u32 %p3, %r16, 1077936127; + selp.f64 %fd80, 0d3FF0000000000000, %fd56, %p3; + +BB132_4: + ld.param.u64 %rd24, [postProcessNNLstmForwardSkipCache_d_param_1]; + ld.param.u32 %r34, [postProcessNNLstmForwardSkipCache_d_param_5]; + ld.param.u32 %r33, [postProcessNNLstmForwardSkipCache_d_param_6]; + ld.param.u32 %r32, [postProcessNNLstmForwardSkipCache_d_param_7]; + { + .reg .b32 %temp; + mov.b64 {%temp, %r31}, %fd2; + } + ld.param.u32 %r30, [postProcessNNLstmForwardSkipCache_d_param_8]; + and.b32 %r23, %r31, -2147483648; + { + .reg .b32 %temp; + mov.b64 {%temp, %r24}, %fd80; + } + or.b32 %r25, %r24, %r23; + { + .reg .b32 %temp; + mov.b64 {%r26, %temp}, %fd80; + } + mov.b64 %fd78, {%r26, %r25}; + mul.f64 %fd79, %fd1, %fd78; + mad.lo.s32 %r27, %r2, %r32, %r33; + mad.lo.s32 %r28, %r27, %r30, %r3; + setp.eq.s32 %p4, %r34, 0; + selp.b32 %r29, %r1, %r28, %p4; + cvta.to.global.u64 %rd16, %rd24; + add.s64 %rd18, %rd16, %rd15; + st.global.f64 [%rd18], %fd2; + cvta.to.global.u64 %rd19, %rd4; + add.s64 %rd20, %rd19, %rd15; + st.global.f64 [%rd20], %fd79; + st.global.f64 [%rd1], %fd2; + cvta.to.global.u64 %rd21, %rd6; + mul.wide.s32 %rd22, %r29, 8; + add.s64 %rd23, %rd21, %rd22; + st.global.f64 [%rd23], %fd79; + +BB132_5: + ret; +} + + // .globl postProcessNNLstmForwardSkipCache_f +.visible .entry postProcessNNLstmForwardSkipCache_f( + .param .u64 postProcessNNLstmForwardSkipCache_f_param_0, + .param .u64 postProcessNNLstmForwardSkipCache_f_param_1, + .param .u64 postProcessNNLstmForwardSkipCache_f_param_2, + .param .u64 postProcessNNLstmForwardSkipCache_f_param_3, + .param .u64 postProcessNNLstmForwardSkipCache_f_param_4, + .param .u32 postProcessNNLstmForwardSkipCache_f_param_5, + .param .u32 postProcessNNLstmForwardSkipCache_f_param_6, + .param .u32 postProcessNNLstmForwardSkipCache_f_param_7, + .param .u32 postProcessNNLstmForwardSkipCache_f_param_8, + .param .u32 postProcessNNLstmForwardSkipCache_f_param_9 +) +{ + .reg .pred %p<6>; + .reg .f32 %f<40>; + .reg .b32 %r<22>; + .reg .b64 %rd<24>; + + + ld.param.u64 %rd2, [postProcessNNLstmForwardSkipCache_f_param_0]; + ld.param.u64 %rd3, [postProcessNNLstmForwardSkipCache_f_param_1]; + ld.param.u64 %rd4, [postProcessNNLstmForwardSkipCache_f_param_2]; + ld.param.u64 %rd5, [postProcessNNLstmForwardSkipCache_f_param_3]; + ld.param.u64 %rd6, [postProcessNNLstmForwardSkipCache_f_param_4]; + ld.param.u32 %r4, [postProcessNNLstmForwardSkipCache_f_param_5]; + ld.param.u32 %r5, [postProcessNNLstmForwardSkipCache_f_param_6]; + ld.param.u32 %r6, [postProcessNNLstmForwardSkipCache_f_param_7]; + ld.param.u32 %r7, [postProcessNNLstmForwardSkipCache_f_param_8]; + ld.param.u32 %r8, [postProcessNNLstmForwardSkipCache_f_param_9]; + mov.u32 %r9, %ntid.x; + mov.u32 %r10, %ctaid.x; + mov.u32 %r11, %tid.x; + mad.lo.s32 %r1, %r9, %r10, %r11; + setp.ge.u32 %p1, %r1, %r8; + @%p1 bra BB133_5; + + cvta.to.global.u64 %rd7, %rd2; + cvta.to.global.u64 %rd8, %rd5; + div.s32 %r2, %r1, %r7; + rem.s32 %r3, %r1, %r7; + shl.b32 %r12, %r7, 2; + mad.lo.s32 %r13, %r2, %r12, %r3; + mul.wide.s32 %rd9, %r13, 4; + add.s64 %rd10, %rd7, %rd9; + cvt.s64.s32 %rd11, %r12; + add.s64 %rd12, %rd10, %rd11; + add.s64 %rd13, %rd12, %rd11; + ld.global.f32 %f1, [%rd13]; + add.s64 %rd14, %rd13, %rd11; + mul.wide.s32 %rd15, %r1, 4; + add.s64 %rd1, %rd8, %rd15; + ld.global.f32 %f7, [%rd1]; + ld.global.f32 %f8, [%rd12]; + mul.f32 %f9, %f8, %f7; + ld.global.f32 %f10, [%rd14]; + ld.global.f32 %f11, [%rd10]; + fma.rn.f32 %f2, %f11, %f10, %f9; + abs.f32 %f3, %f2; + setp.ltu.f32 %p2, %f3, 0f3F0CCCCD; + @%p2 bra BB133_3; + bra.uni BB133_2; + +BB133_3: + mul.f32 %f27, %f2, %f2; + mov.f32 %f28, 0fBD57BE66; + mov.f32 %f29, 0f3C86A81B; + fma.rn.f32 %f30, %f29, %f27, %f28; + mov.f32 %f31, 0f3E08677B; + fma.rn.f32 %f32, %f30, %f27, %f31; + mov.f32 %f33, 0fBEAAAA29; + fma.rn.f32 %f34, %f32, %f27, %f33; + mul.f32 %f35, %f27, %f34; + fma.rn.f32 %f36, %f35, %f2, %f2; + add.f32 %f37, %f2, %f2; + setp.eq.f32 %p4, %f2, 0f00000000; + selp.f32 %f39, %f37, %f36, %p4; + bra.uni BB133_4; + +BB133_2: + add.f32 %f14, %f3, %f3; + mul.f32 %f15, %f14, 0f3FB8AA3B; + cvt.rzi.f32.f32 %f16, %f15; + mov.f32 %f17, 0fBF317200; + fma.rn.f32 %f18, %f16, %f17, %f14; + mov.f32 %f19, 0fB5BFBE8E; + fma.rn.f32 %f20, %f16, %f19, %f18; + mul.f32 %f21, %f20, 0f3FB8AA3B; + ex2.approx.ftz.f32 %f22, %f21; + ex2.approx.f32 %f23, %f16; + mov.f32 %f24, 0f3F800000; + fma.rn.f32 %f13, %f22, %f23, %f24; + // inline asm + rcp.approx.ftz.f32 %f12,%f13; + // inline asm + mov.f32 %f25, 0fC0000000; + fma.rn.f32 %f26, %f12, %f25, %f24; + mov.b32 %r14, %f26; + setp.ltu.f32 %p3, %f3, 0f42B00000; + selp.b32 %r15, %r14, 1065353216, %p3; + mov.b32 %r16, %f2; + and.b32 %r17, %r16, -2147483648; + or.b32 %r18, %r15, %r17; + mov.b32 %f39, %r18; + +BB133_4: + cvta.to.global.u64 %rd16, %rd4; + cvta.to.global.u64 %rd17, %rd6; + mad.lo.s32 %r19, %r2, %r6, %r5; + mad.lo.s32 %r20, %r19, %r7, %r3; + setp.eq.s32 %p5, %r4, 0; + selp.b32 %r21, %r1, %r20, %p5; + cvta.to.global.u64 %rd18, %rd3; + add.s64 %rd20, %rd18, %rd15; + st.global.f32 [%rd20], %f2; + add.s64 %rd21, %rd16, %rd15; + mul.f32 %f38, %f1, %f39; + st.global.f32 [%rd21], %f38; + st.global.f32 [%rd1], %f2; + mul.wide.s32 %rd22, %r21, 4; + add.s64 %rd23, %rd17, %rd22; + st.global.f32 [%rd23], %f38; + +BB133_5: + ret; +} + + // .globl initializeDoutWhenReturnSeq_d +.visible .entry initializeDoutWhenReturnSeq_d( + .param .u64 initializeDoutWhenReturnSeq_d_param_0, + .param .u64 initializeDoutWhenReturnSeq_d_param_1, + .param .u32 initializeDoutWhenReturnSeq_d_param_2, + .param .u32 initializeDoutWhenReturnSeq_d_param_3, + .param .u32 initializeDoutWhenReturnSeq_d_param_4, + .param .u32 initializeDoutWhenReturnSeq_d_param_5 +) +{ + .reg .pred %p<2>; + .reg .b32 %r<14>; + .reg .f64 %fd<2>; + .reg .b64 %rd<9>; + + + ld.param.u64 %rd1, [initializeDoutWhenReturnSeq_d_param_0]; + ld.param.u64 %rd2, [initializeDoutWhenReturnSeq_d_param_1]; + ld.param.u32 %r2, [initializeDoutWhenReturnSeq_d_param_2]; + ld.param.u32 %r3, [initializeDoutWhenReturnSeq_d_param_3]; + ld.param.u32 %r4, [initializeDoutWhenReturnSeq_d_param_4]; + ld.param.u32 %r5, [initializeDoutWhenReturnSeq_d_param_5]; + mov.u32 %r6, %ctaid.x; + mov.u32 %r7, %ntid.x; + mov.u32 %r8, %tid.x; + mad.lo.s32 %r1, %r7, %r6, %r8; + setp.ge.u32 %p1, %r1, %r5; + @%p1 bra BB134_2; + + cvta.to.global.u64 %rd3, %rd1; + div.s32 %r9, %r1, %r3; + mul.lo.s32 %r10, %r3, %r2; + mad.lo.s32 %r11, %r9, %r4, %r10; + rem.s32 %r12, %r1, %r3; + add.s32 %r13, %r11, %r12; + mul.wide.s32 %rd4, %r13, 8; + add.s64 %rd5, %rd3, %rd4; + ld.global.f64 %fd1, [%rd5]; + cvta.to.global.u64 %rd6, %rd2; + mul.wide.s32 %rd7, %r1, 8; + add.s64 %rd8, %rd6, %rd7; + st.global.f64 [%rd8], %fd1; + +BB134_2: + ret; +} + + // .globl initializeDoutWhenReturnSeq_f +.visible .entry initializeDoutWhenReturnSeq_f( + .param .u64 initializeDoutWhenReturnSeq_f_param_0, + .param .u64 initializeDoutWhenReturnSeq_f_param_1, + .param .u32 initializeDoutWhenReturnSeq_f_param_2, + .param .u32 initializeDoutWhenReturnSeq_f_param_3, + .param .u32 initializeDoutWhenReturnSeq_f_param_4, + .param .u32 initializeDoutWhenReturnSeq_f_param_5 +) +{ + .reg .pred %p<2>; + .reg .f32 %f<2>; + .reg .b32 %r<14>; + .reg .b64 %rd<9>; + + + ld.param.u64 %rd1, [initializeDoutWhenReturnSeq_f_param_0]; + ld.param.u64 %rd2, [initializeDoutWhenReturnSeq_f_param_1]; + ld.param.u32 %r2, [initializeDoutWhenReturnSeq_f_param_2]; + ld.param.u32 %r3, [initializeDoutWhenReturnSeq_f_param_3]; + ld.param.u32 %r4, [initializeDoutWhenReturnSeq_f_param_4]; + ld.param.u32 %r5, [initializeDoutWhenReturnSeq_f_param_5]; + mov.u32 %r6, %ctaid.x; + mov.u32 %r7, %ntid.x; + mov.u32 %r8, %tid.x; + mad.lo.s32 %r1, %r7, %r6, %r8; + setp.ge.u32 %p1, %r1, %r5; + @%p1 bra BB135_2; + + cvta.to.global.u64 %rd3, %rd1; + div.s32 %r9, %r1, %r3; + mul.lo.s32 %r10, %r3, %r2; + mad.lo.s32 %r11, %r9, %r4, %r10; + rem.s32 %r12, %r1, %r3; + add.s32 %r13, %r11, %r12; + mul.wide.s32 %rd4, %r13, 4; + add.s64 %rd5, %rd3, %rd4; + ld.global.f32 %f1, [%rd5]; + cvta.to.global.u64 %rd6, %rd2; + mul.wide.s32 %rd7, %r1, 4; + add.s64 %rd8, %rd6, %rd7; + st.global.f32 [%rd8], %f1; + +BB135_2: + ret; +} + + // .globl computeDifog_raw_d +.visible .entry computeDifog_raw_d( + .param .u64 computeDifog_raw_d_param_0, + .param .u64 computeDifog_raw_d_param_1, + .param .u64 computeDifog_raw_d_param_2, + .param .u64 computeDifog_raw_d_param_3, + .param .u64 computeDifog_raw_d_param_4, + .param .u64 computeDifog_raw_d_param_5, + .param .u64 computeDifog_raw_d_param_6, + .param .u64 computeDifog_raw_d_param_7, + .param .u32 computeDifog_raw_d_param_8, + .param .u32 computeDifog_raw_d_param_9, + .param .u32 computeDifog_raw_d_param_10, + .param .u32 computeDifog_raw_d_param_11, + .param .u32 computeDifog_raw_d_param_12 +) +{ + .reg .pred %p<6>; + .reg .b32 %r<49>; + .reg .f64 %fd<106>; + .reg .b64 %rd<51>; + + + ld.param.u64 %rd6, [computeDifog_raw_d_param_0]; + ld.param.u64 %rd7, [computeDifog_raw_d_param_1]; + ld.param.u64 %rd8, [computeDifog_raw_d_param_2]; + ld.param.u64 %rd9, [computeDifog_raw_d_param_3]; + ld.param.u64 %rd10, [computeDifog_raw_d_param_4]; + ld.param.u32 %r3, [computeDifog_raw_d_param_9]; + ld.param.u32 %r4, [computeDifog_raw_d_param_11]; + ld.param.u32 %r5, [computeDifog_raw_d_param_12]; + mov.u32 %r6, %ntid.x; + mov.u32 %r7, %ctaid.x; + mov.u32 %r8, %tid.x; + mad.lo.s32 %r1, %r6, %r7, %r8; + setp.ge.u32 %p1, %r1, %r5; + @%p1 bra BB136_10; + + cvta.to.global.u64 %rd14, %rd6; + cvta.to.global.u64 %rd15, %rd8; + cvt.s64.s32 %rd1, %r1; + mul.wide.s32 %rd16, %r1, 8; + add.s64 %rd17, %rd15, %rd16; + ld.global.f64 %fd1, [%rd17]; + div.s32 %r9, %r1, %r4; + shl.b32 %r10, %r4, 2; + rem.s32 %r11, %r1, %r4; + mad.lo.s32 %r12, %r9, %r10, %r11; + mul.wide.s32 %rd18, %r12, 8; + add.s64 %rd19, %rd14, %rd18; + ld.global.f64 %fd2, [%rd19]; + mul.wide.s32 %rd20, %r4, 8; + add.s64 %rd21, %rd19, %rd20; + ld.global.f64 %fd3, [%rd21]; + add.s64 %rd22, %rd21, %rd20; + ld.global.f64 %fd4, [%rd22]; + add.s64 %rd23, %rd22, %rd20; + ld.global.f64 %fd5, [%rd23]; + cvta.to.global.u64 %rd24, %rd7; + add.s64 %rd25, %rd24, %rd16; + ld.global.f64 %fd6, [%rd25]; + setp.eq.s32 %p2, %r3, 0; + @%p2 bra BB136_3; + + cvta.to.global.u64 %rd26, %rd9; + add.s32 %r13, %r3, -1; + mad.lo.s32 %r14, %r13, %r5, %r1; + mul.wide.u32 %rd27, %r14, 8; + add.s64 %rd50, %rd26, %rd27; + bra.uni BB136_4; + +BB136_3: + cvta.to.global.u64 %rd28, %rd10; + shl.b64 %rd29, %rd1, 3; + add.s64 %rd50, %rd28, %rd29; + +BB136_4: + ld.global.f64 %fd7, [%rd50]; + { + .reg .b32 %temp; + mov.b64 {%temp, %r2}, %fd6; + } + and.b32 %r15, %r2, 2147483647; + { + .reg .b32 %temp; + mov.b64 {%r16, %temp}, %fd6; + } + mov.b64 %fd8, {%r16, %r15}; + setp.ltu.f64 %p3, %fd8, 0d3FE1C7A398201CD6; + @%p3 bra BB136_6; + bra.uni BB136_5; + +BB136_6: + mul.f64 %fd61, %fd6, %fd6; + mov.f64 %fd62, 0dBF2B9093D89F0E23; + mov.f64 %fd63, 0d3F0ABFFC9B5786C4; + fma.rn.f64 %fd64, %fd63, %fd61, %fd62; + mov.f64 %fd65, 0d3F42FA2744C30B61; + fma.rn.f64 %fd66, %fd64, %fd61, %fd65; + mov.f64 %fd67, 0dBF57CF3B9C1E491D; + fma.rn.f64 %fd68, %fd66, %fd61, %fd67; + mov.f64 %fd69, 0d3F6D6C61D450119A; + fma.rn.f64 %fd70, %fd68, %fd61, %fd69; + mov.f64 %fd71, 0dBF8226DDD44294F5; + fma.rn.f64 %fd72, %fd70, %fd61, %fd71; + mov.f64 %fd73, 0d3F9664F45C2B04A6; + fma.rn.f64 %fd74, %fd72, %fd61, %fd73; + mov.f64 %fd75, 0dBFABA1BA1AD70754; + fma.rn.f64 %fd76, %fd74, %fd61, %fd75; + mov.f64 %fd77, 0d3FC111111110295E; + fma.rn.f64 %fd78, %fd76, %fd61, %fd77; + mov.f64 %fd79, 0dBFD555555555549F; + fma.rn.f64 %fd80, %fd78, %fd61, %fd79; + mul.f64 %fd81, %fd61, %fd80; + fma.rn.f64 %fd104, %fd81, %fd6, %fd6; + bra.uni BB136_7; + +BB136_5: + add.f64 %fd16, %fd8, %fd8; + mov.f64 %fd17, 0d4338000000000000; + mov.f64 %fd18, 0d3FF71547652B82FE; + fma.rn.f64 %fd19, %fd16, %fd18, %fd17; + { + .reg .b32 %temp; + mov.b64 {%r17, %temp}, %fd19; + } + mov.f64 %fd20, 0dC338000000000000; + add.rn.f64 %fd21, %fd19, %fd20; + mov.f64 %fd22, 0dBFE62E42FEFA39EF; + fma.rn.f64 %fd23, %fd21, %fd22, %fd16; + mov.f64 %fd24, 0dBC7ABC9E3B39803F; + fma.rn.f64 %fd25, %fd21, %fd24, %fd23; + mov.f64 %fd26, 0d3E5AF86D8EBD13CD; + mov.f64 %fd27, 0d3E21F4076ACD15B6; + fma.rn.f64 %fd28, %fd27, %fd25, %fd26; + mov.f64 %fd29, 0d3E927E5092BA033D; + fma.rn.f64 %fd30, %fd28, %fd25, %fd29; + mov.f64 %fd31, 0d3EC71DDE6C5F9DA1; + fma.rn.f64 %fd32, %fd30, %fd25, %fd31; + mov.f64 %fd33, 0d3EFA01A018D034E6; + fma.rn.f64 %fd34, %fd32, %fd25, %fd33; + mov.f64 %fd35, 0d3F2A01A01B3B6940; + fma.rn.f64 %fd36, %fd34, %fd25, %fd35; + mov.f64 %fd37, 0d3F56C16C16C1B5DD; + fma.rn.f64 %fd38, %fd36, %fd25, %fd37; + mov.f64 %fd39, 0d3F8111111110F74D; + fma.rn.f64 %fd40, %fd38, %fd25, %fd39; + mov.f64 %fd41, 0d3FA555555555554D; + fma.rn.f64 %fd42, %fd40, %fd25, %fd41; + mov.f64 %fd43, 0d3FC5555555555557; + fma.rn.f64 %fd44, %fd42, %fd25, %fd43; + mov.f64 %fd45, 0d3FE0000000000000; + fma.rn.f64 %fd46, %fd44, %fd25, %fd45; + mul.f64 %fd47, %fd25, %fd46; + fma.rn.f64 %fd48, %fd47, %fd25, %fd25; + shl.b32 %r18, %r17, 20; + add.s32 %r19, %r18, 1072693248; + mov.u32 %r20, 0; + mov.b64 %fd49, {%r20, %r19}; + fma.rn.f64 %fd50, %fd48, %fd49, %fd49; + add.f64 %fd51, %fd50, 0d3FF0000000000000; + rcp.approx.ftz.f64 %fd52, %fd51; + neg.f64 %fd53, %fd51; + mov.f64 %fd54, 0d3FF0000000000000; + fma.rn.f64 %fd55, %fd53, %fd52, %fd54; + fma.rn.f64 %fd56, %fd55, %fd55, %fd55; + fma.rn.f64 %fd57, %fd56, %fd52, %fd52; + neg.f64 %fd58, %fd57; + mov.f64 %fd59, 0d4000000000000000; + fma.rn.f64 %fd60, %fd59, %fd58, %fd54; + setp.gt.u32 %p4, %r15, 1077936127; + selp.f64 %fd104, 0d3FF0000000000000, %fd60, %p4; + +BB136_7: + ld.param.u64 %rd49, [computeDifog_raw_d_param_6]; + { + .reg .b32 %temp; + mov.b64 {%temp, %r39}, %fd6; + } + ld.param.u32 %r38, [computeDifog_raw_d_param_9]; + and.b32 %r22, %r39, -2147483648; + { + .reg .b32 %temp; + mov.b64 {%temp, %r23}, %fd104; + } + or.b32 %r24, %r23, %r22; + { + .reg .b32 %temp; + mov.b64 {%r25, %temp}, %fd104; + } + mov.b64 %fd82, {%r25, %r24}; + cvta.to.global.u64 %rd30, %rd49; + add.s64 %rd32, %rd30, %rd16; + mul.f64 %fd83, %fd82, %fd82; + mov.f64 %fd84, 0d3FF0000000000000; + sub.f64 %fd85, %fd84, %fd83; + mul.f64 %fd86, %fd1, %fd85; + ld.global.f64 %fd87, [%rd32]; + fma.rn.f64 %fd12, %fd4, %fd86, %fd87; + mul.f64 %fd105, %fd3, %fd12; + mul.f64 %fd14, %fd1, %fd82; + setp.ne.s32 %p5, %r38, 0; + @%p5 bra BB136_9; + + ld.param.u64 %rd48, [computeDifog_raw_d_param_7]; + cvta.to.global.u64 %rd33, %rd48; + add.s64 %rd35, %rd33, %rd16; + st.global.f64 [%rd35], %fd105; + mov.f64 %fd105, %fd12; + +BB136_9: + mov.u32 %r48, %tid.x; + mov.u32 %r47, %ctaid.x; + mov.u32 %r46, %ntid.x; + ld.param.u32 %r45, [computeDifog_raw_d_param_11]; + mad.lo.s32 %r44, %r46, %r47, %r48; + rem.s32 %r43, %r44, %r45; + shl.b32 %r42, %r45, 2; + div.s32 %r41, %r44, %r45; + mad.lo.s32 %r40, %r41, %r42, %r43; + cvt.s64.s32 %rd47, %r40; + ld.param.u64 %rd46, [computeDifog_raw_d_param_5]; + mul.f64 %fd88, %fd7, %fd12; + mul.f64 %fd89, %fd5, %fd12; + mul.f64 %fd90, %fd2, %fd12; + st.global.f64 [%rd32], %fd105; + sub.f64 %fd92, %fd84, %fd2; + mul.f64 %fd93, %fd2, %fd92; + mul.f64 %fd94, %fd93, %fd89; + cvta.to.global.u64 %rd39, %rd46; + shl.b64 %rd40, %rd47, 3; + add.s64 %rd41, %rd39, %rd40; + st.global.f64 [%rd41], %fd94; + sub.f64 %fd95, %fd84, %fd3; + mul.f64 %fd96, %fd3, %fd95; + mul.f64 %fd97, %fd96, %fd88; + add.s64 %rd43, %rd41, %rd20; + st.global.f64 [%rd43], %fd97; + sub.f64 %fd98, %fd84, %fd4; + mul.f64 %fd99, %fd4, %fd98; + mul.f64 %fd100, %fd99, %fd14; + add.s64 %rd44, %rd43, %rd20; + st.global.f64 [%rd44], %fd100; + mul.f64 %fd101, %fd5, %fd5; + sub.f64 %fd102, %fd84, %fd101; + mul.f64 %fd103, %fd102, %fd90; + add.s64 %rd45, %rd44, %rd20; + st.global.f64 [%rd45], %fd103; + +BB136_10: + ret; +} + + // .globl computeDifog_raw_f +.visible .entry computeDifog_raw_f( + .param .u64 computeDifog_raw_f_param_0, + .param .u64 computeDifog_raw_f_param_1, + .param .u64 computeDifog_raw_f_param_2, + .param .u64 computeDifog_raw_f_param_3, + .param .u64 computeDifog_raw_f_param_4, + .param .u64 computeDifog_raw_f_param_5, + .param .u64 computeDifog_raw_f_param_6, + .param .u64 computeDifog_raw_f_param_7, + .param .u32 computeDifog_raw_f_param_8, + .param .u32 computeDifog_raw_f_param_9, + .param .u32 computeDifog_raw_f_param_10, + .param .u32 computeDifog_raw_f_param_11, + .param .u32 computeDifog_raw_f_param_12 +) +{ + .reg .pred %p<7>; + .reg .f32 %f<65>; + .reg .b32 %r<35>; + .reg .b64 %rd<46>; + + + ld.param.u64 %rd5, [computeDifog_raw_f_param_0]; + ld.param.u64 %rd6, [computeDifog_raw_f_param_1]; + ld.param.u64 %rd7, [computeDifog_raw_f_param_2]; + ld.param.u64 %rd8, [computeDifog_raw_f_param_3]; + ld.param.u64 %rd9, [computeDifog_raw_f_param_4]; + ld.param.u64 %rd10, [computeDifog_raw_f_param_5]; + ld.param.u64 %rd11, [computeDifog_raw_f_param_6]; + ld.param.u64 %rd12, [computeDifog_raw_f_param_7]; + ld.param.u32 %r2, [computeDifog_raw_f_param_9]; + ld.param.u32 %r3, [computeDifog_raw_f_param_11]; + ld.param.u32 %r4, [computeDifog_raw_f_param_12]; + mov.u32 %r5, %ntid.x; + mov.u32 %r6, %ctaid.x; + mov.u32 %r7, %tid.x; + mad.lo.s32 %r1, %r5, %r6, %r7; + setp.ge.u32 %p1, %r1, %r4; + @%p1 bra BB137_10; + + cvta.to.global.u64 %rd13, %rd5; + cvta.to.global.u64 %rd14, %rd7; + mul.wide.s32 %rd15, %r1, 4; + add.s64 %rd16, %rd14, %rd15; + ld.global.f32 %f1, [%rd16]; + div.s32 %r8, %r1, %r3; + shl.b32 %r9, %r3, 2; + rem.s32 %r10, %r1, %r3; + mad.lo.s32 %r11, %r8, %r9, %r10; + cvt.s64.s32 %rd1, %r11; + mul.wide.s32 %rd17, %r11, 4; + add.s64 %rd18, %rd13, %rd17; + ld.global.f32 %f2, [%rd18]; + mul.wide.s32 %rd19, %r3, 4; + add.s64 %rd20, %rd18, %rd19; + ld.global.f32 %f3, [%rd20]; + add.s64 %rd21, %rd20, %rd19; + ld.global.f32 %f4, [%rd21]; + add.s64 %rd22, %rd21, %rd19; + ld.global.f32 %f5, [%rd22]; + cvta.to.global.u64 %rd23, %rd6; + add.s64 %rd24, %rd23, %rd15; + ld.global.f32 %f6, [%rd24]; + setp.eq.s32 %p2, %r2, 0; + @%p2 bra BB137_3; + + cvta.to.global.u64 %rd25, %rd8; + add.s32 %r12, %r2, -1; + mad.lo.s32 %r13, %r12, %r4, %r1; + mul.wide.u32 %rd26, %r13, 4; + add.s64 %rd45, %rd25, %rd26; + bra.uni BB137_4; + +BB137_3: + cvta.to.global.u64 %rd27, %rd9; + add.s64 %rd45, %rd27, %rd15; + +BB137_4: + ld.global.f32 %f7, [%rd45]; + abs.f32 %f8, %f6; + setp.ltu.f32 %p3, %f8, 0f3F0CCCCD; + @%p3 bra BB137_6; + bra.uni BB137_5; + +BB137_6: + mul.f32 %f31, %f6, %f6; + mov.f32 %f32, 0fBD57BE66; + mov.f32 %f33, 0f3C86A81B; + fma.rn.f32 %f34, %f33, %f31, %f32; + mov.f32 %f35, 0f3E08677B; + fma.rn.f32 %f36, %f34, %f31, %f35; + mov.f32 %f37, 0fBEAAAA29; + fma.rn.f32 %f38, %f36, %f31, %f37; + mul.f32 %f39, %f31, %f38; + fma.rn.f32 %f40, %f39, %f6, %f6; + add.f32 %f41, %f6, %f6; + setp.eq.f32 %p5, %f6, 0f00000000; + selp.f32 %f63, %f41, %f40, %p5; + bra.uni BB137_7; + +BB137_5: + add.f32 %f18, %f8, %f8; + mul.f32 %f19, %f18, 0f3FB8AA3B; + cvt.rzi.f32.f32 %f20, %f19; + mov.f32 %f21, 0fBF317200; + fma.rn.f32 %f22, %f20, %f21, %f18; + mov.f32 %f23, 0fB5BFBE8E; + fma.rn.f32 %f24, %f20, %f23, %f22; + mul.f32 %f25, %f24, 0f3FB8AA3B; + ex2.approx.ftz.f32 %f26, %f25; + ex2.approx.f32 %f27, %f20; + mov.f32 %f28, 0f3F800000; + fma.rn.f32 %f17, %f26, %f27, %f28; + // inline asm + rcp.approx.ftz.f32 %f16,%f17; + // inline asm + mov.f32 %f29, 0fC0000000; + fma.rn.f32 %f30, %f16, %f29, %f28; + mov.b32 %r18, %f30; + setp.ltu.f32 %p4, %f8, 0f42B00000; + selp.b32 %r19, %r18, 1065353216, %p4; + mov.b32 %r20, %f6; + and.b32 %r21, %r20, -2147483648; + or.b32 %r22, %r19, %r21; + mov.b32 %f63, %r22; + +BB137_7: + cvta.to.global.u64 %rd29, %rd11; + add.s64 %rd31, %rd29, %rd15; + mul.f32 %f42, %f63, %f63; + mov.f32 %f43, 0f3F800000; + sub.f32 %f44, %f43, %f42; + mul.f32 %f45, %f1, %f44; + ld.global.f32 %f46, [%rd31]; + fma.rn.f32 %f12, %f4, %f45, %f46; + mul.f32 %f64, %f3, %f12; + mul.f32 %f14, %f1, %f63; + setp.ne.s32 %p6, %r2, 0; + @%p6 bra BB137_9; + + cvta.to.global.u64 %rd32, %rd12; + add.s64 %rd34, %rd32, %rd15; + st.global.f32 [%rd34], %f64; + mov.f32 %f64, %f12; + +BB137_9: + mul.f32 %f47, %f7, %f12; + mul.f32 %f48, %f5, %f12; + mul.f32 %f49, %f2, %f12; + st.global.f32 [%rd31], %f64; + sub.f32 %f51, %f43, %f2; + mul.f32 %f52, %f2, %f51; + mul.f32 %f53, %f52, %f48; + cvta.to.global.u64 %rd38, %rd10; + shl.b64 %rd39, %rd1, 2; + add.s64 %rd40, %rd38, %rd39; + st.global.f32 [%rd40], %f53; + sub.f32 %f54, %f43, %f3; + mul.f32 %f55, %f3, %f54; + mul.f32 %f56, %f55, %f47; + add.s64 %rd42, %rd40, %rd19; + st.global.f32 [%rd42], %f56; + sub.f32 %f57, %f43, %f4; + mul.f32 %f58, %f4, %f57; + mul.f32 %f59, %f58, %f14; + add.s64 %rd43, %rd42, %rd19; + st.global.f32 [%rd43], %f59; + mul.f32 %f60, %f5, %f5; + sub.f32 %f61, %f43, %f60; + mul.f32 %f62, %f61, %f49; + add.s64 %rd44, %rd43, %rd19; + st.global.f32 [%rd44], %f62; + +BB137_10: + ret; +} + + // .globl postProcessNNLstmBackward_d +.visible .entry postProcessNNLstmBackward_d( + .param .u64 postProcessNNLstmBackward_d_param_0, + .param .u64 postProcessNNLstmBackward_d_param_1, + .param .u64 postProcessNNLstmBackward_d_param_2, + .param .u64 postProcessNNLstmBackward_d_param_3, + .param .u64 postProcessNNLstmBackward_d_param_4, + .param .u32 postProcessNNLstmBackward_d_param_5, + .param .u32 postProcessNNLstmBackward_d_param_6, + .param .u32 postProcessNNLstmBackward_d_param_7, + .param .u32 postProcessNNLstmBackward_d_param_8, + .param .u32 postProcessNNLstmBackward_d_param_9, + .param .u32 postProcessNNLstmBackward_d_param_10, + .param .u32 postProcessNNLstmBackward_d_param_11, + .param .u32 postProcessNNLstmBackward_d_param_12, + .param .u32 postProcessNNLstmBackward_d_param_13, + .param .u32 postProcessNNLstmBackward_d_param_14, + .param .u32 postProcessNNLstmBackward_d_param_15 +) +{ + .reg .pred %p<5>; + .reg .b32 %r<28>; + .reg .f64 %fd<5>; + .reg .b64 %rd<23>; + + + ld.param.u64 %rd7, [postProcessNNLstmBackward_d_param_0]; + ld.param.u64 %rd3, [postProcessNNLstmBackward_d_param_1]; + ld.param.u64 %rd4, [postProcessNNLstmBackward_d_param_2]; + ld.param.u64 %rd5, [postProcessNNLstmBackward_d_param_3]; + ld.param.u64 %rd6, [postProcessNNLstmBackward_d_param_4]; + ld.param.u32 %r4, [postProcessNNLstmBackward_d_param_5]; + ld.param.u32 %r5, [postProcessNNLstmBackward_d_param_6]; + ld.param.u32 %r6, [postProcessNNLstmBackward_d_param_8]; + ld.param.u32 %r7, [postProcessNNLstmBackward_d_param_9]; + ld.param.u32 %r12, [postProcessNNLstmBackward_d_param_10]; + ld.param.u32 %r8, [postProcessNNLstmBackward_d_param_11]; + ld.param.u32 %r9, [postProcessNNLstmBackward_d_param_12]; + ld.param.u32 %r10, [postProcessNNLstmBackward_d_param_13]; + ld.param.u32 %r11, [postProcessNNLstmBackward_d_param_14]; + cvta.to.global.u64 %rd1, %rd7; + mov.u32 %r13, %ntid.x; + mov.u32 %r14, %ctaid.x; + mov.u32 %r15, %tid.x; + mad.lo.s32 %r1, %r13, %r14, %r15; + setp.ge.s32 %p1, %r1, %r12; + @%p1 bra BB138_2; + + cvta.to.global.u64 %rd8, %rd6; + div.s32 %r16, %r1, %r6; + rem.s32 %r17, %r1, %r6; + mad.lo.s32 %r18, %r16, %r11, %r17; + mul.wide.s32 %rd9, %r18, 8; + add.s64 %rd10, %rd1, %rd9; + ld.global.f64 %fd2, [%rd10]; + mul.lo.s32 %r19, %r6, %r5; + mad.lo.s32 %r20, %r16, %r9, %r19; + add.s32 %r21, %r20, %r17; + mul.wide.s32 %rd11, %r21, 8; + add.s64 %rd12, %rd8, %rd11; + st.global.f64 [%rd12], %fd2; + +BB138_2: + setp.ge.s32 %p2, %r1, %r8; + @%p2 bra BB138_8; + + div.s32 %r2, %r1, %r7; + mad.lo.s32 %r22, %r2, %r11, %r6; + rem.s32 %r3, %r1, %r7; + add.s32 %r23, %r22, %r3; + mul.wide.s32 %rd13, %r23, 8; + add.s64 %rd14, %rd1, %rd13; + ld.global.f64 %fd1, [%rd14]; + setp.eq.s32 %p3, %r5, 0; + @%p3 bra BB138_7; + + cvta.to.global.u64 %rd15, %rd5; + setp.eq.s32 %p4, %r4, 0; + mul.wide.s32 %rd16, %r1, 8; + add.s64 %rd2, %rd15, %rd16; + @%p4 bra BB138_6; + + cvta.to.global.u64 %rd17, %rd4; + add.s32 %r24, %r5, -1; + mul.lo.s32 %r25, %r24, %r7; + mad.lo.s32 %r26, %r2, %r10, %r25; + add.s32 %r27, %r26, %r3; + mul.wide.s32 %rd18, %r27, 8; + add.s64 %rd19, %rd17, %rd18; + ld.global.f64 %fd3, [%rd19]; + add.f64 %fd4, %fd1, %fd3; + st.global.f64 [%rd2], %fd4; + bra.uni BB138_8; + +BB138_7: + cvta.to.global.u64 %rd20, %rd3; + mul.wide.s32 %rd21, %r1, 8; + add.s64 %rd22, %rd20, %rd21; + st.global.f64 [%rd22], %fd1; + bra.uni BB138_8; + +BB138_6: + st.global.f64 [%rd2], %fd1; + +BB138_8: + ret; +} + + // .globl postProcessNNLstmBackward_f +.visible .entry postProcessNNLstmBackward_f( + .param .u64 postProcessNNLstmBackward_f_param_0, + .param .u64 postProcessNNLstmBackward_f_param_1, + .param .u64 postProcessNNLstmBackward_f_param_2, + .param .u64 postProcessNNLstmBackward_f_param_3, + .param .u64 postProcessNNLstmBackward_f_param_4, + .param .u32 postProcessNNLstmBackward_f_param_5, + .param .u32 postProcessNNLstmBackward_f_param_6, + .param .u32 postProcessNNLstmBackward_f_param_7, + .param .u32 postProcessNNLstmBackward_f_param_8, + .param .u32 postProcessNNLstmBackward_f_param_9, + .param .u32 postProcessNNLstmBackward_f_param_10, + .param .u32 postProcessNNLstmBackward_f_param_11, + .param .u32 postProcessNNLstmBackward_f_param_12, + .param .u32 postProcessNNLstmBackward_f_param_13, + .param .u32 postProcessNNLstmBackward_f_param_14, + .param .u32 postProcessNNLstmBackward_f_param_15 +) +{ + .reg .pred %p<5>; + .reg .f32 %f<5>; + .reg .b32 %r<28>; + .reg .b64 %rd<23>; + + + ld.param.u64 %rd7, [postProcessNNLstmBackward_f_param_0]; + ld.param.u64 %rd3, [postProcessNNLstmBackward_f_param_1]; + ld.param.u64 %rd4, [postProcessNNLstmBackward_f_param_2]; + ld.param.u64 %rd5, [postProcessNNLstmBackward_f_param_3]; + ld.param.u64 %rd6, [postProcessNNLstmBackward_f_param_4]; + ld.param.u32 %r4, [postProcessNNLstmBackward_f_param_5]; + ld.param.u32 %r5, [postProcessNNLstmBackward_f_param_6]; + ld.param.u32 %r6, [postProcessNNLstmBackward_f_param_8]; + ld.param.u32 %r7, [postProcessNNLstmBackward_f_param_9]; + ld.param.u32 %r12, [postProcessNNLstmBackward_f_param_10]; + ld.param.u32 %r8, [postProcessNNLstmBackward_f_param_11]; + ld.param.u32 %r9, [postProcessNNLstmBackward_f_param_12]; + ld.param.u32 %r10, [postProcessNNLstmBackward_f_param_13]; + ld.param.u32 %r11, [postProcessNNLstmBackward_f_param_14]; + cvta.to.global.u64 %rd1, %rd7; + mov.u32 %r13, %ntid.x; + mov.u32 %r14, %ctaid.x; + mov.u32 %r15, %tid.x; + mad.lo.s32 %r1, %r13, %r14, %r15; + setp.ge.s32 %p1, %r1, %r12; + @%p1 bra BB139_2; + + cvta.to.global.u64 %rd8, %rd6; + div.s32 %r16, %r1, %r6; + rem.s32 %r17, %r1, %r6; + mad.lo.s32 %r18, %r16, %r11, %r17; + mul.wide.s32 %rd9, %r18, 4; + add.s64 %rd10, %rd1, %rd9; + ld.global.f32 %f2, [%rd10]; + mul.lo.s32 %r19, %r6, %r5; + mad.lo.s32 %r20, %r16, %r9, %r19; + add.s32 %r21, %r20, %r17; + mul.wide.s32 %rd11, %r21, 4; + add.s64 %rd12, %rd8, %rd11; + st.global.f32 [%rd12], %f2; + +BB139_2: + setp.ge.s32 %p2, %r1, %r8; + @%p2 bra BB139_8; + + div.s32 %r2, %r1, %r7; + mad.lo.s32 %r22, %r2, %r11, %r6; + rem.s32 %r3, %r1, %r7; + add.s32 %r23, %r22, %r3; + mul.wide.s32 %rd13, %r23, 4; + add.s64 %rd14, %rd1, %rd13; + ld.global.f32 %f1, [%rd14]; + setp.eq.s32 %p3, %r5, 0; + @%p3 bra BB139_7; + + cvta.to.global.u64 %rd15, %rd5; + setp.eq.s32 %p4, %r4, 0; + mul.wide.s32 %rd16, %r1, 4; + add.s64 %rd2, %rd15, %rd16; + @%p4 bra BB139_6; + + cvta.to.global.u64 %rd17, %rd4; + add.s32 %r24, %r5, -1; + mul.lo.s32 %r25, %r24, %r7; + mad.lo.s32 %r26, %r2, %r10, %r25; + add.s32 %r27, %r26, %r3; + mul.wide.s32 %rd18, %r27, 4; + add.s64 %rd19, %rd17, %rd18; + ld.global.f32 %f3, [%rd19]; + add.f32 %f4, %f1, %f3; + st.global.f32 [%rd2], %f4; + bra.uni BB139_8; + +BB139_7: + cvta.to.global.u64 %rd20, %rd3; + mul.wide.s32 %rd21, %r1, 4; + add.s64 %rd22, %rd20, %rd21; + st.global.f32 [%rd22], %f1; + bra.uni BB139_8; + +BB139_6: + st.global.f32 [%rd2], %f1; + +BB139_8: + ret; +} + .func (.param .b64 func_retval0) __internal_trig_reduction_slowpathd( .param .b64 __internal_trig_reduction_slowpathd_param_0, .param .b64 __internal_trig_reduction_slowpathd_param_1 ) { - .local .align 8 .b8 __local_depot126[40]; + .local .align 8 .b8 __local_depot140[40]; .reg .b64 %SP; .reg .b64 %SPL; .reg .pred %p<9>; @@ -15232,7 +17252,7 @@ BB125_2: .reg .b64 %rd<102>; - mov.u64 %rd101, __local_depot126; + mov.u64 %rd101, __local_depot140; cvta.local.u64 %SP, %rd101; ld.param.f64 %fd4, [__internal_trig_reduction_slowpathd_param_0]; ld.param.u64 %rd37, [__internal_trig_reduction_slowpathd_param_1]; @@ -15246,7 +17266,7 @@ BB125_2: shr.u32 %r3, %r1, 20; bfe.u32 %r4, %r1, 20, 11; setp.eq.s32 %p1, %r4, 2047; - @%p1 bra BB126_13; + @%p1 bra BB140_13; add.s32 %r15, %r4, -1024; shr.u32 %r16, %r15, 6; @@ -15259,7 +17279,7 @@ BB125_2: mov.u64 %rd94, 0; setp.ge.s32 %p2, %r5, %r6; mov.u64 %rd93, %rd1; - @%p2 bra BB126_4; + @%p2 bra BB140_4; mov.b64 %rd41, %fd4; shl.b64 %rd42, %rd41, 11; @@ -15276,7 +17296,7 @@ BB125_2: mov.u64 %rd91, %rd1; mov.u32 %r39, %r5; -BB126_3: +BB140_3: .pragma "nounroll"; ld.const.u64 %rd47, [%rd89]; // inline asm @@ -15306,15 +17326,15 @@ BB126_3: add.s64 %rd93, %rd93, 8; add.s64 %rd89, %rd89, 8; setp.lt.s32 %p3, %r39, %r6; - @%p3 bra BB126_3; + @%p3 bra BB140_3; -BB126_4: +BB140_4: st.local.u64 [%rd93], %rd94; ld.local.u64 %rd95, [%rd1+16]; ld.local.u64 %rd96, [%rd1+24]; and.b32 %r9, %r3, 63; setp.eq.s32 %p4, %r9, 0; - @%p4 bra BB126_6; + @%p4 bra BB140_6; mov.u32 %r27, 64; sub.s32 %r28, %r27, %r9; @@ -15326,7 +17346,7 @@ BB126_4: shr.u64 %rd55, %rd54, %r28; or.b64 %rd95, %rd55, %rd53; -BB126_6: +BB140_6: cvta.to.local.u64 %rd56, %rd37; shr.u64 %rd57, %rd96, 62; cvt.u32.u64 %r29, %rd57; @@ -15343,7 +17363,7 @@ BB126_6: selp.b32 %r34, %r32, %r33, %p5; st.local.u32 [%rd56], %r34; setp.eq.s32 %p6, %r31, 0; - @%p6 bra BB126_8; + @%p6 bra BB140_8; mov.u64 %rd64, 0; // inline asm @@ -15363,10 +17383,10 @@ BB126_6: // inline asm xor.b32 %r40, %r40, -2147483648; -BB126_8: +BB140_8: clz.b64 %r41, %rd98; setp.eq.s32 %p7, %r41, 0; - @%p7 bra BB126_10; + @%p7 bra BB140_10; shl.b64 %rd67, %rd98, %r41; mov.u32 %r35, 64; @@ -15374,7 +17394,7 @@ BB126_8: shr.u64 %rd68, %rd97, %r36; or.b64 %rd98, %rd68, %rd67; -BB126_10: +BB140_10: mov.u64 %rd72, -3958705157555305931; // inline asm { @@ -15395,7 +17415,7 @@ BB126_10: } // inline asm setp.lt.s64 %p8, %rd100, 1; - @%p8 bra BB126_12; + @%p8 bra BB140_12; // inline asm { @@ -15414,7 +17434,7 @@ BB126_10: // inline asm add.s32 %r41, %r41, 1; -BB126_12: +BB140_12: cvt.u64.u32 %rd79, %r40; shl.b64 %rd80, %rd79, 32; mov.u32 %r37, 1022; @@ -15429,7 +17449,7 @@ BB126_12: or.b64 %rd88, %rd87, %rd80; mov.b64 %fd4, %rd88; -BB126_13: +BB140_13: st.param.f64 [func_retval0+0], %fd4; ret; } @@ -15457,7 +17477,7 @@ BB126_13: } shr.u32 %r51, %r50, 20; setp.ne.s32 %p1, %r51, 0; - @%p1 bra BB127_2; + @%p1 bra BB141_2; mul.f64 %fd14, %fd12, 0d4350000000000000; { @@ -15471,13 +17491,13 @@ BB126_13: shr.u32 %r16, %r50, 20; add.s32 %r51, %r16, -54; -BB127_2: +BB141_2: add.s32 %r52, %r51, -1023; and.b32 %r17, %r50, -2146435073; or.b32 %r18, %r17, 1072693248; mov.b64 %fd135, {%r49, %r18}; setp.lt.u32 %p2, %r18, 1073127583; - @%p2 bra BB127_4; + @%p2 bra BB141_4; { .reg .b32 %temp; @@ -15491,7 +17511,7 @@ BB127_2: mov.b64 %fd135, {%r19, %r21}; add.s32 %r52, %r51, -1022; -BB127_4: +BB141_4: add.f64 %fd15, %fd135, 0d3FF0000000000000; rcp.approx.ftz.f64 %fd16, %fd15; neg.f64 %fd17, %fd15; @@ -15654,13 +17674,13 @@ BB127_4: mov.b32 %f2, %r35; abs.f32 %f1, %f2; setp.lt.f32 %p4, %f1, 0f4086232B; - @%p4 bra BB127_7; + @%p4 bra BB141_7; setp.lt.f64 %p5, %fd4, 0d0000000000000000; add.f64 %fd129, %fd4, 0d7FF0000000000000; selp.f64 %fd136, 0d0000000000000000, %fd129, %p5; setp.geu.f32 %p6, %f1, 0f40874800; - @%p6 bra BB127_7; + @%p6 bra BB141_7; mov.f64 %fd134, 0d4338000000000000; mov.f64 %fd133, 0d3FF71547652B82FE; @@ -15682,26 +17702,26 @@ BB127_4: mov.b64 %fd131, {%r44, %r43}; mul.f64 %fd136, %fd130, %fd131; -BB127_7: +BB141_7: { .reg .b32 %temp; mov.b64 {%temp, %r45}, %fd136; } and.b32 %r46, %r45, 2147483647; setp.ne.s32 %p7, %r46, 2146435072; - @%p7 bra BB127_9; + @%p7 bra BB141_9; { .reg .b32 %temp; mov.b64 {%r47, %temp}, %fd136; } setp.eq.s32 %p8, %r47, 0; - @%p8 bra BB127_10; + @%p8 bra BB141_10; -BB127_9: +BB141_9: fma.rn.f64 %fd136, %fd136, %fd5, %fd136; -BB127_10: +BB141_10: st.param.f64 [func_retval0+0], %fd136; ret; }