[SYSTEMML-1923] Support sinh, cosh and tanh as built-in functions - Added sinh, cosh and tanh built-in functions to the CP, Spark and GPU backends. - Added these functions to the Python DSL. - Also updated the DML language reference and Python reference documentation.
Closes #668. Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/50a895f8 Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/50a895f8 Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/50a895f8 Branch: refs/heads/master Commit: 50a895f86167d5766b249a3db89f9c2b06f7a0d0 Parents: aa15197 Author: Niketan Pansare <npan...@us.ibm.com> Authored: Thu Sep 21 10:06:04 2017 -0700 Committer: Niketan Pansare <npan...@us.ibm.com> Committed: Thu Sep 21 10:07:15 2017 -0700 ---------------------------------------------------------------------- docs/dml-language-reference.md | 2 +- docs/python-reference.md | 2 +- src/main/cpp/kernels/SystemML.cu | 44 +- src/main/cpp/kernels/SystemML.ptx | 2070 +++++++++++------- src/main/java/org/apache/sysml/hops/Hop.java | 11 +- .../java/org/apache/sysml/hops/UnaryOp.java | 11 +- .../sysml/hops/codegen/cplan/CNodeUnary.java | 1 + .../codegen/opt/PlanSelectionFuseCostBased.java | 3 + .../opt/PlanSelectionFuseCostBasedV2.java | 3 + src/main/java/org/apache/sysml/lops/Unary.java | 8 +- .../java/org/apache/sysml/lops/UnaryCP.java | 11 +- .../sysml/parser/BuiltinFunctionExpression.java | 12 + .../org/apache/sysml/parser/DMLTranslator.java | 12 + .../org/apache/sysml/parser/Expression.java | 3 + .../sysml/runtime/functionobjects/Builtin.java | 24 +- .../instructions/CPInstructionParser.java | 3 + .../instructions/GPUInstructionParser.java | 3 + .../instructions/SPInstructionParser.java | 3 + .../instructions/gpu/GPUInstruction.java | 3 + .../gpu/MatrixBuiltinGPUInstruction.java | 6 + .../runtime/matrix/data/LibMatrixCUDA.java | 51 + .../runtime/matrix/operators/UnaryOperator.java | 2 + src/main/python/systemml/defmatrix.py | 9 + .../org/apache/sysml/test/gpu/UnaryOpTests.java | 15 + 24 files changed, 1474 insertions(+), 838 deletions(-) ---------------------------------------------------------------------- 
http://git-wip-us.apache.org/repos/asf/systemml/blob/50a895f8/docs/dml-language-reference.md ---------------------------------------------------------------------- diff --git a/docs/dml-language-reference.md b/docs/dml-language-reference.md index bd66a42..d8ca07f 100644 --- a/docs/dml-language-reference.md +++ b/docs/dml-language-reference.md @@ -822,7 +822,7 @@ is same as Function | Description | Parameters | Example -------- | ----------- | ---------- | ------- exp(), log(), abs(), sqrt(), round(), floor(), ceil() | Apply mathematical function on input (cell wise if input is matrix) | Input: (<matrix>), or (<scalar>) <br/> Output: <matrix>, or <scalar> | sqrt(X) <br/> log(X,y) <br/> round(X) <br/> floor(X) <br/> ceil(X) -sin(), cos(), tan(), asin(), acos(), atan() | Apply trigonometric function on input (cell wise if input is matrix) | Input: (<matrix>), or (<scalar>) <br/> Output: <matrix>, or <scalar> | sin(X) +sin(), cos(), tan(), sinh(), cosh(), tanh(), asin(), acos(), atan() | Apply trigonometric function on input (cell wise if input is matrix) | Input: (<matrix>), or (<scalar>) <br/> Output: <matrix>, or <scalar> | sin(X) sign() | Returns a matrix representing the signs of the input matrix elements, where 1 represents positive, 0 represents zero, and -1 represents negative | Input : (A <matrix>) <br/> Output : <matrix> | <span style="white-space: nowrap;">A = matrix("-5 0 3 -3",</span> rows=2, cols=2) <br/>signA = sign(A)<br/>Matrix signA: [[-1, 0], [1, -1]] http://git-wip-us.apache.org/repos/asf/systemml/blob/50a895f8/docs/python-reference.md ---------------------------------------------------------------------- diff --git a/docs/python-reference.md b/docs/python-reference.md index 119c1d0..4fd78fe 100644 --- a/docs/python-reference.md +++ b/docs/python-reference.md @@ -191,7 +191,7 @@ In addition to the above mentioned operators, following functions are supported. | argmax(self, axis=None) | Returns the indices of the maximum values along an axis. 
| axis : int, optional (only axis=1, i.e. rowIndexMax is supported in this version) | | cumsum(self, axis=None) | Returns the indices of the maximum values along an axis. | axis : int, optional (only axis=0, i.e. cumsum along the rows is supported in this version) | -- Global statistical built-In functions: exp, log, abs, sqrt, round, floor, ceil, sin, cos, tan, asin, acos, atan, sign, solve +- Global statistical built-In functions: exp, log, abs, sqrt, round, floor, ceil, sin, cos, tan, sinh, cosh, tanh, asin, acos, atan, sign, solve | | Description | Parameters | |------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| http://git-wip-us.apache.org/repos/asf/systemml/blob/50a895f8/src/main/cpp/kernels/SystemML.cu ---------------------------------------------------------------------- diff --git a/src/main/cpp/kernels/SystemML.cu b/src/main/cpp/kernels/SystemML.cu index 3e1a13a..c243564 100644 --- a/src/main/cpp/kernels/SystemML.cu +++ b/src/main/cpp/kernels/SystemML.cu @@ -990,6 +990,20 @@ __global__ void matrix_sin(double *A, double *C, unsigned int size) { } /** + * Do an sinh over all the elements of a matrix + * @param A the input matrix (of length = size) + * @param C the pre-allocated output matrix (of length = size) + * @param siz the length of the input and output matrices + */ +extern "C" +__global__ void matrix_sinh(double *A, double *C, unsigned int size) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < size){ + C[index] = sinh(A[index]); + } +} + +/** * Do an cos over all the elements of a matrix * @param A the input matrix (of length = size) * @param C the pre-allocated output matrix (of length = size) @@ 
-1004,6 +1018,20 @@ __global__ void matrix_cos(double *A, double *C, unsigned int size) { } /** + * Do an cosh over all the elements of a matrix + * @param A the input matrix (of length = size) + * @param C the pre-allocated output matrix (of length = size) + * @param siz the length of the input and output matrices + */ +extern "C" +__global__ void matrix_cosh(double *A, double *C, unsigned int size) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < size){ + C[index] = cosh(A[index]); + } +} + +/** * Do an tan over all the elements of a matrix * @param A the input matrix (of length = size) * @param C the pre-allocated output matrix (of length = size) @@ -1018,6 +1046,20 @@ __global__ void matrix_tan(double *A, double *C, unsigned int size) { } /** + * Do an tanh over all the elements of a matrix + * @param A the input matrix (of length = size) + * @param C the pre-allocated output matrix (of length = size) + * @param siz the length of the input and output matrices + */ +extern "C" +__global__ void matrix_tanh(double *A, double *C, unsigned int size) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < size){ + C[index] = tanh(A[index]); + } +} + +/** * Do an asin over all the elements of a matrix * @param A the input matrix (of length = size) * @param C the pre-allocated output matrix (of length = size) @@ -1076,4 +1118,4 @@ __global__ void matrix_sign(double *A, double *C, unsigned int size) { C[index] = copysign(1.0, A[index]); } } -} \ No newline at end of file +} http://git-wip-us.apache.org/repos/asf/systemml/blob/50a895f8/src/main/cpp/kernels/SystemML.ptx ---------------------------------------------------------------------- diff --git a/src/main/cpp/kernels/SystemML.ptx b/src/main/cpp/kernels/SystemML.ptx index b4a6559..73b057e 100644 --- a/src/main/cpp/kernels/SystemML.ptx +++ b/src/main/cpp/kernels/SystemML.ptx @@ -1,8 +1,8 @@ // // Generated by NVIDIA NVVM Compiler // -// Compiler Build ID: CL-21554848 -// Cuda 
compilation tools, release 8.0, V8.0.61 +// Compiler Build ID: CL-21124049 +// Cuda compilation tools, release 8.0, V8.0.44 // Based on LLVM 3.4svn // @@ -689,9 +689,9 @@ BB10_6: .param .u32 matrix_matrix_cellwise_op_param_7 ) { - .reg .pred %p<73>; - .reg .b32 %r<66>; - .reg .f64 %fd<56>; + .reg .pred %p<77>; + .reg .b32 %r<65>; + .reg .f64 %fd<55>; .reg .b64 %rd<19>; @@ -712,40 +712,40 @@ BB10_6: setp.lt.s32 %p2, %r1, %r14; setp.gt.s32 %p3, %r10, -1; and.pred %p4, %p2, %p3; - @!%p4 bra BB11_77; + @!%p4 bra BB11_73; bra.uni BB11_1; BB11_1: mad.lo.s32 %r3, %r1, %r10, %r2; setp.eq.s32 %p5, %r11, 1; - mov.u32 %r64, %r1; + mov.u32 %r63, %r1; @%p5 bra BB11_5; setp.ne.s32 %p6, %r11, 2; - mov.u32 %r65, %r3; + mov.u32 %r64, %r3; @%p6 bra BB11_4; - mov.u32 %r65, %r2; + mov.u32 %r64, %r2; BB11_4: - mov.u32 %r59, %r65; - mov.u32 %r4, %r59; - mov.u32 %r64, %r4; + mov.u32 %r58, %r64; + mov.u32 %r4, %r58; + mov.u32 %r63, %r4; BB11_5: - mov.u32 %r5, %r64; + mov.u32 %r5, %r63; setp.eq.s32 %p7, %r12, 1; - mov.u32 %r62, %r1; + mov.u32 %r61, %r1; @%p7 bra BB11_9; setp.ne.s32 %p8, %r12, 2; - mov.u32 %r63, %r3; + mov.u32 %r62, %r3; @%p8 bra BB11_8; - mov.u32 %r63, %r2; + mov.u32 %r62, %r2; BB11_8: - mov.u32 %r62, %r63; + mov.u32 %r61, %r62; BB11_9: cvta.to.global.u64 %rd5, %rd3; @@ -753,10 +753,10 @@ BB11_9: mul.wide.s32 %rd7, %r5, 8; add.s64 %rd8, %rd6, %rd7; ld.global.f64 %fd1, [%rd8]; - mul.wide.s32 %rd9, %r62, 8; + mul.wide.s32 %rd9, %r61, 8; add.s64 %rd10, %rd5, %rd9; ld.global.f64 %fd2, [%rd10]; - mov.f64 %fd55, 0d7FEFFFFFFFFFFFFF; + mov.f64 %fd54, 0d7FEFFFFFFFFFFFFF; setp.gt.s32 %p9, %r13, 8; @%p9 bra BB11_26; @@ -767,12 +767,12 @@ BB11_9: @%p30 bra BB11_15; setp.eq.s32 %p33, %r13, 0; - @%p33 bra BB11_75; + @%p33 bra BB11_71; bra.uni BB11_13; -BB11_75: - add.f64 %fd55, %fd1, %fd2; - bra.uni BB11_76; +BB11_71: + add.f64 %fd54, %fd1, %fd2; + bra.uni BB11_72; BB11_26: setp.gt.s32 %p10, %r13, 13; @@ -782,23 +782,23 @@ BB11_26: @%p17 bra BB11_31; setp.eq.s32 %p21, %r13, 9; - @%p21 
bra BB11_55; + @%p21 bra BB11_53; bra.uni BB11_29; -BB11_55: - setp.eq.f64 %p48, %fd1, %fd2; - selp.f64 %fd55, 0d3FF0000000000000, 0d0000000000000000, %p48; - bra.uni BB11_76; +BB11_53: + setp.eq.f64 %p50, %fd1, %fd2; + selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p50; + bra.uni BB11_72; BB11_18: setp.gt.s32 %p24, %r13, 5; @%p24 bra BB11_22; setp.eq.s32 %p28, %r13, 4; - @%p28 bra BB11_58; + @%p28 bra BB11_56; bra.uni BB11_20; -BB11_58: +BB11_56: { .reg .b32 %temp; mov.b64 {%temp, %r8}, %fd1; @@ -811,7 +811,7 @@ BB11_58: add.s32 %r32, %r31, -1012; mov.b64 %rd15, %fd2; shl.b64 %rd1, %rd15, %r32; - setp.eq.s64 %p53, %rd1, -9223372036854775808; + setp.eq.s64 %p55, %rd1, -9223372036854775808; abs.f64 %fd19, %fd1; // Callseq Start 0 { @@ -828,192 +828,192 @@ BB11_58: param0, param1 ); - ld.param.f64 %fd54, [retval0+0]; + ld.param.f64 %fd53, [retval0+0]; //{ }// Callseq End 0 - setp.lt.s32 %p54, %r8, 0; - and.pred %p1, %p54, %p53; - @!%p1 bra BB11_60; - bra.uni BB11_59; + setp.lt.s32 %p56, %r8, 0; + and.pred %p1, %p56, %p55; + @!%p1 bra BB11_58; + bra.uni BB11_57; -BB11_59: +BB11_57: { .reg .b32 %temp; - mov.b64 {%temp, %r33}, %fd54; + mov.b64 {%temp, %r33}, %fd53; } xor.b32 %r34, %r33, -2147483648; { .reg .b32 %temp; - mov.b64 {%r35, %temp}, %fd54; + mov.b64 {%r35, %temp}, %fd53; } - mov.b64 %fd54, {%r35, %r34}; + mov.b64 %fd53, {%r35, %r34}; -BB11_60: - mov.f64 %fd53, %fd54; - setp.eq.f64 %p55, %fd1, 0d0000000000000000; - @%p55 bra BB11_63; - bra.uni BB11_61; +BB11_58: + mov.f64 %fd52, %fd53; + setp.eq.f64 %p57, %fd1, 0d0000000000000000; + @%p57 bra BB11_61; + bra.uni BB11_59; -BB11_63: - selp.b32 %r36, %r8, 0, %p53; +BB11_61: + selp.b32 %r36, %r8, 0, %p55; or.b32 %r37, %r36, 2146435072; - setp.lt.s32 %p59, %r9, 0; - selp.b32 %r38, %r37, %r36, %p59; + setp.lt.s32 %p61, %r9, 0; + selp.b32 %r38, %r37, %r36, %p61; mov.u32 %r39, 0; - mov.b64 %fd53, {%r39, %r38}; - bra.uni BB11_64; + mov.b64 %fd52, {%r39, %r38}; + bra.uni BB11_62; BB11_35: setp.gt.s32 %p11, %r13, 
15; @%p11 bra BB11_39; setp.eq.s32 %p15, %r13, 14; - @%p15 bra BB11_52; + @%p15 bra BB11_50; bra.uni BB11_37; -BB11_52: +BB11_50: cvt.rni.s64.f64 %rd11, %fd1; cvt.rni.s64.f64 %rd12, %fd2; cvt.u32.u64 %r25, %rd11; cvt.u32.u64 %r26, %rd12; or.b32 %r27, %r26, %r25; - setp.eq.s32 %p45, %r27, 0; - selp.f64 %fd55, 0d0000000000000000, 0d3FF0000000000000, %p45; - bra.uni BB11_76; + setp.eq.s32 %p47, %r27, 0; + selp.f64 %fd54, 0d0000000000000000, 0d3FF0000000000000, %p47; + bra.uni BB11_72; BB11_15: setp.eq.s32 %p31, %r13, 2; - @%p31 bra BB11_74; + @%p31 bra BB11_70; bra.uni BB11_16; -BB11_74: - mul.f64 %fd55, %fd1, %fd2; - bra.uni BB11_76; +BB11_70: + mul.f64 %fd54, %fd1, %fd2; + bra.uni BB11_72; BB11_31: setp.eq.s32 %p18, %r13, 11; - @%p18 bra BB11_54; + @%p18 bra BB11_52; setp.eq.s32 %p19, %r13, 12; - @%p19 bra BB11_53; + @%p19 bra BB11_51; bra.uni BB11_33; -BB11_53: - max.f64 %fd55, %fd1, %fd2; - bra.uni BB11_76; +BB11_51: + max.f64 %fd54, %fd1, %fd2; + bra.uni BB11_72; BB11_22: setp.eq.s32 %p25, %r13, 6; - @%p25 bra BB11_57; + @%p25 bra BB11_55; setp.eq.s32 %p26, %r13, 7; - @%p26 bra BB11_56; + @%p26 bra BB11_54; bra.uni BB11_24; -BB11_56: - setp.gt.f64 %p50, %fd1, %fd2; - selp.f64 %fd55, 0d3FF0000000000000, 0d0000000000000000, %p50; - bra.uni BB11_76; +BB11_54: + setp.gt.f64 %p52, %fd1, %fd2; + selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p52; + bra.uni BB11_72; BB11_39: setp.eq.s32 %p12, %r13, 16; - @%p12 bra BB11_51; + @%p12 bra BB11_49; setp.eq.s32 %p13, %r13, 17; - @%p13 bra BB11_46; + @%p13 bra BB11_45; bra.uni BB11_41; -BB11_46: - setp.eq.f64 %p38, %fd2, 0d0000000000000000; - setp.eq.f64 %p39, %fd2, 0d8000000000000000; - or.pred %p40, %p38, %p39; - mov.f64 %fd55, 0d7FF8000000000000; - @%p40 bra BB11_76; +BB11_45: + setp.eq.f64 %p39, %fd2, 0d0000000000000000; + setp.eq.f64 %p40, %fd2, 0d8000000000000000; + or.pred %p41, %p39, %p40; + mov.f64 %fd54, 0d7FF8000000000000; + @%p41 bra BB11_72; - div.rn.f64 %fd55, %fd1, %fd2; - abs.f64 %fd39, %fd55; - 
setp.gtu.f64 %p41, %fd39, 0d7FF0000000000000; - @%p41 bra BB11_76; + div.rn.f64 %fd54, %fd1, %fd2; + abs.f64 %fd39, %fd54; + setp.gtu.f64 %p42, %fd39, 0d7FF0000000000000; + @%p42 bra BB11_72; { .reg .b32 %temp; - mov.b64 {%temp, %r22}, %fd55; + mov.b64 {%r22, %temp}, %fd54; } - and.b32 %r23, %r22, 2147483647; - setp.ne.s32 %p42, %r23, 2146435072; - @%p42 bra BB11_50; - { .reg .b32 %temp; - mov.b64 {%r24, %temp}, %fd55; + mov.b64 {%temp, %r23}, %fd54; } - setp.eq.s32 %p43, %r24, 0; - @%p43 bra BB11_76; - -BB11_50: - cvt.rmi.f64.f64 %fd40, %fd55; + and.b32 %r24, %r23, 2147483647; + setp.ne.s32 %p43, %r24, 2146435072; + setp.ne.s32 %p44, %r22, 0; + or.pred %p45, %p43, %p44; + @!%p45 bra BB11_72; + bra.uni BB11_48; + +BB11_48: + cvt.rmi.f64.f64 %fd40, %fd54; mul.f64 %fd41, %fd2, %fd40; - sub.f64 %fd55, %fd1, %fd41; - bra.uni BB11_76; + sub.f64 %fd54, %fd1, %fd41; + bra.uni BB11_72; BB11_13: setp.eq.s32 %p34, %r13, 1; @%p34 bra BB11_14; - bra.uni BB11_76; + bra.uni BB11_72; BB11_14: - sub.f64 %fd55, %fd1, %fd2; - bra.uni BB11_76; + sub.f64 %fd54, %fd1, %fd2; + bra.uni BB11_72; BB11_29: setp.eq.s32 %p22, %r13, 10; @%p22 bra BB11_30; - bra.uni BB11_76; + bra.uni BB11_72; BB11_30: - setp.neu.f64 %p47, %fd1, %fd2; - selp.f64 %fd55, 0d3FF0000000000000, 0d0000000000000000, %p47; - bra.uni BB11_76; + setp.neu.f64 %p49, %fd1, %fd2; + selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p49; + bra.uni BB11_72; BB11_20: setp.eq.s32 %p29, %r13, 5; @%p29 bra BB11_21; - bra.uni BB11_76; + bra.uni BB11_72; BB11_21: - setp.lt.f64 %p52, %fd1, %fd2; - selp.f64 %fd55, 0d3FF0000000000000, 0d0000000000000000, %p52; - bra.uni BB11_76; + setp.lt.f64 %p54, %fd1, %fd2; + selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p54; + bra.uni BB11_72; BB11_37: setp.eq.s32 %p16, %r13, 15; @%p16 bra BB11_38; - bra.uni BB11_76; + bra.uni BB11_72; BB11_38: mul.f64 %fd43, %fd1, %fd2; mov.f64 %fd44, 0d3FF0000000000000; - sub.f64 %fd55, %fd44, %fd43; - bra.uni BB11_76; + sub.f64 %fd54, %fd44, 
%fd43; + bra.uni BB11_72; BB11_16: setp.eq.s32 %p32, %r13, 3; @%p32 bra BB11_17; - bra.uni BB11_76; + bra.uni BB11_72; BB11_17: - div.rn.f64 %fd55, %fd1, %fd2; - bra.uni BB11_76; + div.rn.f64 %fd54, %fd1, %fd2; + bra.uni BB11_72; -BB11_54: - min.f64 %fd55, %fd1, %fd2; - bra.uni BB11_76; +BB11_52: + min.f64 %fd54, %fd1, %fd2; + bra.uni BB11_72; BB11_33: setp.eq.s32 %p20, %r13, 13; @%p20 bra BB11_34; - bra.uni BB11_76; + bra.uni BB11_72; BB11_34: cvt.rni.s64.f64 %rd13, %fd1; @@ -1021,149 +1021,147 @@ BB11_34: cvt.u32.u64 %r28, %rd13; cvt.u32.u64 %r29, %rd14; and.b32 %r30, %r29, %r28; - setp.eq.s32 %p46, %r30, 0; - selp.f64 %fd55, 0d0000000000000000, 0d3FF0000000000000, %p46; - bra.uni BB11_76; + setp.eq.s32 %p48, %r30, 0; + selp.f64 %fd54, 0d0000000000000000, 0d3FF0000000000000, %p48; + bra.uni BB11_72; -BB11_57: - setp.le.f64 %p51, %fd1, %fd2; - selp.f64 %fd55, 0d3FF0000000000000, 0d0000000000000000, %p51; - bra.uni BB11_76; +BB11_55: + setp.le.f64 %p53, %fd1, %fd2; + selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p53; + bra.uni BB11_72; BB11_24: setp.eq.s32 %p27, %r13, 8; @%p27 bra BB11_25; - bra.uni BB11_76; + bra.uni BB11_72; BB11_25: - setp.ge.f64 %p49, %fd1, %fd2; - selp.f64 %fd55, 0d3FF0000000000000, 0d0000000000000000, %p49; - bra.uni BB11_76; + setp.ge.f64 %p51, %fd1, %fd2; + selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p51; + bra.uni BB11_72; -BB11_51: - setp.neu.f64 %p44, %fd1, 0d0000000000000000; +BB11_49: + setp.neu.f64 %p46, %fd1, 0d0000000000000000; sub.f64 %fd42, %fd1, %fd2; - selp.f64 %fd55, %fd42, 0d0000000000000000, %p44; - bra.uni BB11_76; + selp.f64 %fd54, %fd42, 0d0000000000000000, %p46; + bra.uni BB11_72; BB11_41: setp.ne.s32 %p14, %r13, 18; - @%p14 bra BB11_76; + @%p14 bra BB11_72; - div.rn.f64 %fd55, %fd1, %fd2; - abs.f64 %fd37, %fd55; + div.rn.f64 %fd54, %fd1, %fd2; + abs.f64 %fd37, %fd54; setp.gtu.f64 %p35, %fd37, 0d7FF0000000000000; - @%p35 bra BB11_76; + @%p35 bra BB11_72; { .reg .b32 %temp; - mov.b64 {%temp, %r19}, 
%fd55; + mov.b64 {%r19, %temp}, %fd54; } - and.b32 %r20, %r19, 2147483647; - setp.ne.s32 %p36, %r20, 2146435072; - @%p36 bra BB11_45; - { .reg .b32 %temp; - mov.b64 {%r21, %temp}, %fd55; + mov.b64 {%temp, %r20}, %fd54; } - setp.eq.s32 %p37, %r21, 0; - @%p37 bra BB11_76; + and.b32 %r21, %r20, 2147483647; + setp.ne.s32 %p36, %r21, 2146435072; + setp.ne.s32 %p37, %r19, 0; + or.pred %p38, %p36, %p37; + @!%p38 bra BB11_72; + bra.uni BB11_44; -BB11_45: - cvt.rmi.f64.f64 %fd55, %fd55; - bra.uni BB11_76; +BB11_44: + cvt.rmi.f64.f64 %fd54, %fd54; + bra.uni BB11_72; -BB11_61: - setp.gt.s32 %p56, %r8, -1; - @%p56 bra BB11_64; +BB11_59: + setp.gt.s32 %p58, %r8, -1; + @%p58 bra BB11_62; cvt.rzi.f64.f64 %fd45, %fd2; - setp.neu.f64 %p57, %fd45, %fd2; - selp.f64 %fd53, 0dFFF8000000000000, %fd53, %p57; + setp.neu.f64 %p59, %fd45, %fd2; + selp.f64 %fd52, 0dFFF8000000000000, %fd52, %p59; -BB11_64: - mov.f64 %fd25, %fd53; +BB11_62: + mov.f64 %fd25, %fd52; add.f64 %fd26, %fd1, %fd2; { .reg .b32 %temp; mov.b64 {%temp, %r40}, %fd26; } and.b32 %r41, %r40, 2146435072; - setp.ne.s32 %p60, %r41, 2146435072; - mov.f64 %fd52, %fd25; - @%p60 bra BB11_73; + setp.ne.s32 %p62, %r41, 2146435072; + mov.f64 %fd51, %fd25; + @%p62 bra BB11_69; - setp.gtu.f64 %p61, %fd19, 0d7FF0000000000000; - mov.f64 %fd52, %fd26; - @%p61 bra BB11_73; - - abs.f64 %fd46, %fd2; - setp.gtu.f64 %p62, %fd46, 0d7FF0000000000000; + setp.gtu.f64 %p63, %fd19, 0d7FF0000000000000; mov.f64 %fd51, %fd26; - mov.f64 %fd52, %fd51; - @%p62 bra BB11_73; - - and.b32 %r42, %r9, 2147483647; - setp.ne.s32 %p63, %r42, 2146435072; @%p63 bra BB11_69; + abs.f64 %fd46, %fd2; + setp.gtu.f64 %p64, %fd46, 0d7FF0000000000000; + mov.f64 %fd50, %fd26; + mov.f64 %fd51, %fd50; + @%p64 bra BB11_69; + { .reg .b32 %temp; - mov.b64 {%r43, %temp}, %fd2; + mov.b64 {%r42, %temp}, %fd2; } - setp.eq.s32 %p64, %r43, 0; - @%p64 bra BB11_72; - -BB11_69: - and.b32 %r44, %r8, 2147483647; - setp.ne.s32 %p65, %r44, 2146435072; - mov.f64 %fd49, %fd25; - mov.f64 %fd52, 
%fd49; - @%p65 bra BB11_73; - + and.b32 %r43, %r9, 2147483647; + setp.eq.s32 %p65, %r43, 2146435072; + setp.eq.s32 %p66, %r42, 0; + and.pred %p67, %p65, %p66; + @%p67 bra BB11_68; + bra.uni BB11_66; + +BB11_68: + setp.gt.f64 %p71, %fd19, 0d3FF0000000000000; + selp.b32 %r51, 2146435072, 0, %p71; + xor.b32 %r52, %r51, 2146435072; + setp.lt.s32 %p72, %r9, 0; + selp.b32 %r53, %r52, %r51, %p72; + setp.eq.f64 %p73, %fd1, 0dBFF0000000000000; + selp.b32 %r54, 1072693248, %r53, %p73; + mov.u32 %r55, 0; + mov.b64 %fd51, {%r55, %r54}; + bra.uni BB11_69; + +BB11_66: { .reg .b32 %temp; - mov.b64 {%r45, %temp}, %fd1; + mov.b64 {%r44, %temp}, %fd1; } - setp.ne.s32 %p66, %r45, 0; - mov.f64 %fd52, %fd25; - @%p66 bra BB11_73; - + and.b32 %r45, %r8, 2147483647; + setp.eq.s32 %p68, %r45, 2146435072; + setp.eq.s32 %p69, %r44, 0; + and.pred %p70, %p68, %p69; + mov.f64 %fd51, %fd25; + @!%p70 bra BB11_69; + bra.uni BB11_67; + +BB11_67: shr.s32 %r46, %r9, 31; and.b32 %r47, %r46, -2146435072; - add.s32 %r48, %r47, 2146435072; - or.b32 %r49, %r48, -2147483648; - selp.b32 %r50, %r49, %r48, %p1; - mov.u32 %r51, 0; - mov.b64 %fd52, {%r51, %r50}; - bra.uni BB11_73; - -BB11_72: - setp.gt.f64 %p67, %fd19, 0d3FF0000000000000; - selp.b32 %r52, 2146435072, 0, %p67; - xor.b32 %r53, %r52, 2146435072; - setp.lt.s32 %p68, %r9, 0; - selp.b32 %r54, %r53, %r52, %p68; - setp.eq.f64 %p69, %fd1, 0dBFF0000000000000; - selp.b32 %r55, 1072693248, %r54, %p69; - mov.u32 %r56, 0; - mov.b64 %fd52, {%r56, %r55}; + selp.b32 %r48, -1048576, 2146435072, %p1; + add.s32 %r49, %r48, %r47; + mov.u32 %r50, 0; + mov.b64 %fd51, {%r50, %r49}; -BB11_73: - setp.eq.f64 %p70, %fd2, 0d0000000000000000; - setp.eq.f64 %p71, %fd1, 0d3FF0000000000000; - or.pred %p72, %p71, %p70; - selp.f64 %fd55, 0d3FF0000000000000, %fd52, %p72; +BB11_69: + setp.eq.f64 %p74, %fd2, 0d0000000000000000; + setp.eq.f64 %p75, %fd1, 0d3FF0000000000000; + or.pred %p76, %p75, %p74; + selp.f64 %fd54, 0d3FF0000000000000, %fd51, %p76; -BB11_76: +BB11_72: 
cvta.to.global.u64 %rd16, %rd4; mul.wide.s32 %rd17, %r3, 8; add.s64 %rd18, %rd16, %rd17; - st.global.f64 [%rd18], %fd55; + st.global.f64 [%rd18], %fd54; bar.sync 0; -BB11_77: +BB11_73: ret; } @@ -1177,9 +1175,9 @@ BB11_77: .param .u32 matrix_scalar_op_param_5 ) { - .reg .pred %p<133>; - .reg .b32 %r<88>; - .reg .f64 %fd<109>; + .reg .pred %p<141>; + .reg .b32 %r<86>; + .reg .f64 %fd<107>; .reg .b64 %rd<20>; @@ -1194,7 +1192,7 @@ BB11_77: mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; setp.ge.s32 %p3, %r1, %r8; - @%p3 bra BB12_138; + @%p3 bra BB12_130; cvta.to.global.u64 %rd6, %rd5; cvta.to.global.u64 %rd7, %rd4; @@ -1203,9 +1201,9 @@ BB11_77: ld.global.f64 %fd1, [%rd9]; add.s64 %rd1, %rd6, %rd8; setp.eq.s32 %p4, %r7, 0; - @%p4 bra BB12_70; + @%p4 bra BB12_66; - mov.f64 %fd99, 0d7FEFFFFFFFFFFFFF; + mov.f64 %fd98, 0d7FEFFFFFFFFFFFFF; setp.gt.s32 %p5, %r6, 8; @%p5 bra BB12_19; @@ -1216,31 +1214,31 @@ BB11_77: @%p26 bra BB12_8; setp.eq.s32 %p29, %r6, 0; - @%p29 bra BB12_68; + @%p29 bra BB12_64; bra.uni BB12_6; -BB12_68: - add.f64 %fd99, %fd1, %fd68; - bra.uni BB12_69; +BB12_64: + add.f64 %fd98, %fd1, %fd68; + bra.uni BB12_65; -BB12_70: - mov.f64 %fd108, 0d7FEFFFFFFFFFFFFF; - setp.gt.s32 %p69, %r6, 8; - @%p69 bra BB12_87; +BB12_66: + mov.f64 %fd106, 0d7FEFFFFFFFFFFFFF; + setp.gt.s32 %p73, %r6, 8; + @%p73 bra BB12_83; - setp.gt.s32 %p83, %r6, 3; - @%p83 bra BB12_79; + setp.gt.s32 %p87, %r6, 3; + @%p87 bra BB12_75; - setp.gt.s32 %p90, %r6, 1; - @%p90 bra BB12_76; + setp.gt.s32 %p94, %r6, 1; + @%p94 bra BB12_72; - setp.eq.s32 %p93, %r6, 0; - @%p93 bra BB12_136; - bra.uni BB12_74; + setp.eq.s32 %p97, %r6, 0; + @%p97 bra BB12_128; + bra.uni BB12_70; -BB12_136: - add.f64 %fd108, %fd1, %fd68; - bra.uni BB12_137; +BB12_128: + add.f64 %fd106, %fd1, %fd68; + bra.uni BB12_129; BB12_19: setp.gt.s32 %p6, %r6, 13; @@ -1250,39 +1248,39 @@ BB12_19: @%p13 bra BB12_24; setp.eq.s32 %p17, %r6, 9; - @%p17 bra BB12_48; + @%p17 bra BB12_46; bra.uni BB12_22; -BB12_48: - setp.eq.f64 
%p44, %fd1, %fd68; - selp.f64 %fd99, 0d3FF0000000000000, 0d0000000000000000, %p44; - bra.uni BB12_69; +BB12_46: + setp.eq.f64 %p46, %fd1, %fd68; + selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p46; + bra.uni BB12_65; -BB12_87: - setp.gt.s32 %p70, %r6, 13; - @%p70 bra BB12_96; +BB12_83: + setp.gt.s32 %p74, %r6, 13; + @%p74 bra BB12_92; - setp.gt.s32 %p77, %r6, 10; - @%p77 bra BB12_92; + setp.gt.s32 %p81, %r6, 10; + @%p81 bra BB12_88; - setp.eq.s32 %p81, %r6, 9; - @%p81 bra BB12_116; - bra.uni BB12_90; + setp.eq.s32 %p85, %r6, 9; + @%p85 bra BB12_110; + bra.uni BB12_86; -BB12_116: - setp.eq.f64 %p108, %fd1, %fd68; - selp.f64 %fd108, 0d3FF0000000000000, 0d0000000000000000, %p108; - bra.uni BB12_137; +BB12_110: + setp.eq.f64 %p114, %fd1, %fd68; + selp.f64 %fd106, 0d3FF0000000000000, 0d0000000000000000, %p114; + bra.uni BB12_129; BB12_11: setp.gt.s32 %p20, %r6, 5; @%p20 bra BB12_15; setp.eq.s32 %p24, %r6, 4; - @%p24 bra BB12_51; + @%p24 bra BB12_49; bra.uni BB12_13; -BB12_51: +BB12_49: { .reg .b32 %temp; mov.b64 {%temp, %r2}, %fd68; @@ -1295,7 +1293,7 @@ BB12_51: add.s32 %r25, %r24, -1012; mov.b64 %rd14, %fd1; shl.b64 %rd2, %rd14, %r25; - setp.eq.s64 %p49, %rd2, -9223372036854775808; + setp.eq.s64 %p51, %rd2, -9223372036854775808; abs.f64 %fd18, %fd68; // Callseq Start 1 { @@ -1312,69 +1310,69 @@ BB12_51: param0, param1 ); - ld.param.f64 %fd98, [retval0+0]; + ld.param.f64 %fd97, [retval0+0]; //{ }// Callseq End 1 - setp.lt.s32 %p50, %r2, 0; - and.pred %p1, %p50, %p49; - @!%p1 bra BB12_53; - bra.uni BB12_52; + setp.lt.s32 %p52, %r2, 0; + and.pred %p1, %p52, %p51; + @!%p1 bra BB12_51; + bra.uni BB12_50; -BB12_52: +BB12_50: { .reg .b32 %temp; - mov.b64 {%temp, %r26}, %fd98; + mov.b64 {%temp, %r26}, %fd97; } xor.b32 %r27, %r26, -2147483648; { .reg .b32 %temp; - mov.b64 {%r28, %temp}, %fd98; + mov.b64 {%r28, %temp}, %fd97; } - mov.b64 %fd98, {%r28, %r27}; + mov.b64 %fd97, {%r28, %r27}; -BB12_53: - mov.f64 %fd97, %fd98; - setp.eq.f64 %p51, %fd68, 
0d0000000000000000; - @%p51 bra BB12_56; - bra.uni BB12_54; +BB12_51: + mov.f64 %fd96, %fd97; + setp.eq.f64 %p53, %fd68, 0d0000000000000000; + @%p53 bra BB12_54; + bra.uni BB12_52; -BB12_56: - selp.b32 %r29, %r2, 0, %p49; +BB12_54: + selp.b32 %r29, %r2, 0, %p51; or.b32 %r30, %r29, 2146435072; - setp.lt.s32 %p55, %r3, 0; - selp.b32 %r31, %r30, %r29, %p55; + setp.lt.s32 %p57, %r3, 0; + selp.b32 %r31, %r30, %r29, %p57; mov.u32 %r32, 0; - mov.b64 %fd97, {%r32, %r31}; - bra.uni BB12_57; + mov.b64 %fd96, {%r32, %r31}; + bra.uni BB12_55; BB12_28: setp.gt.s32 %p7, %r6, 15; @%p7 bra BB12_32; setp.eq.s32 %p11, %r6, 14; - @%p11 bra BB12_45; + @%p11 bra BB12_43; bra.uni BB12_30; -BB12_45: +BB12_43: cvt.rni.s64.f64 %rd10, %fd68; cvt.rni.s64.f64 %rd11, %fd1; cvt.u32.u64 %r18, %rd10; cvt.u32.u64 %r19, %rd11; or.b32 %r20, %r19, %r18; - setp.eq.s32 %p41, %r20, 0; - selp.f64 %fd99, 0d0000000000000000, 0d3FF0000000000000, %p41; - bra.uni BB12_69; + setp.eq.s32 %p43, %r20, 0; + selp.f64 %fd98, 0d0000000000000000, 0d3FF0000000000000, %p43; + bra.uni BB12_65; -BB12_79: - setp.gt.s32 %p84, %r6, 5; - @%p84 bra BB12_83; +BB12_75: + setp.gt.s32 %p88, %r6, 5; + @%p88 bra BB12_79; - setp.eq.s32 %p88, %r6, 4; - @%p88 bra BB12_119; - bra.uni BB12_81; + setp.eq.s32 %p92, %r6, 4; + @%p92 bra BB12_113; + bra.uni BB12_77; -BB12_119: +BB12_113: { .reg .b32 %temp; mov.b64 {%temp, %r4}, %fd1; @@ -1383,11 +1381,11 @@ BB12_119: .reg .b32 %temp; mov.b64 {%temp, %r5}, %fd68; } - bfe.u32 %r62, %r5, 20, 11; - add.s32 %r63, %r62, -1012; + bfe.u32 %r61, %r5, 20, 11; + add.s32 %r62, %r61, -1012; mov.b64 %rd19, %fd68; - shl.b64 %rd3, %rd19, %r63; - setp.eq.s64 %p113, %rd3, -9223372036854775808; + shl.b64 %rd3, %rd19, %r62; + setp.eq.s64 %p119, %rd3, -9223372036854775808; abs.f64 %fd51, %fd1; // Callseq Start 2 { @@ -1404,267 +1402,267 @@ BB12_119: param0, param1 ); - ld.param.f64 %fd107, [retval0+0]; + ld.param.f64 %fd105, [retval0+0]; //{ }// Callseq End 2 - setp.lt.s32 %p114, %r4, 0; - and.pred %p2, %p114, 
%p113; - @!%p2 bra BB12_121; - bra.uni BB12_120; + setp.lt.s32 %p120, %r4, 0; + and.pred %p2, %p120, %p119; + @!%p2 bra BB12_115; + bra.uni BB12_114; -BB12_120: +BB12_114: { .reg .b32 %temp; - mov.b64 {%temp, %r64}, %fd107; + mov.b64 {%temp, %r63}, %fd105; } - xor.b32 %r65, %r64, -2147483648; + xor.b32 %r64, %r63, -2147483648; { .reg .b32 %temp; - mov.b64 {%r66, %temp}, %fd107; + mov.b64 {%r65, %temp}, %fd105; } - mov.b64 %fd107, {%r66, %r65}; + mov.b64 %fd105, {%r65, %r64}; -BB12_121: - mov.f64 %fd106, %fd107; - setp.eq.f64 %p115, %fd1, 0d0000000000000000; - @%p115 bra BB12_124; - bra.uni BB12_122; +BB12_115: + mov.f64 %fd104, %fd105; + setp.eq.f64 %p121, %fd1, 0d0000000000000000; + @%p121 bra BB12_118; + bra.uni BB12_116; -BB12_124: - selp.b32 %r67, %r4, 0, %p113; - or.b32 %r68, %r67, 2146435072; - setp.lt.s32 %p119, %r5, 0; - selp.b32 %r69, %r68, %r67, %p119; - mov.u32 %r70, 0; - mov.b64 %fd106, {%r70, %r69}; - bra.uni BB12_125; +BB12_118: + selp.b32 %r66, %r4, 0, %p119; + or.b32 %r67, %r66, 2146435072; + setp.lt.s32 %p125, %r5, 0; + selp.b32 %r68, %r67, %r66, %p125; + mov.u32 %r69, 0; + mov.b64 %fd104, {%r69, %r68}; + bra.uni BB12_119; -BB12_96: - setp.gt.s32 %p71, %r6, 15; - @%p71 bra BB12_100; +BB12_92: + setp.gt.s32 %p75, %r6, 15; + @%p75 bra BB12_96; - setp.eq.s32 %p75, %r6, 14; - @%p75 bra BB12_113; - bra.uni BB12_98; + setp.eq.s32 %p79, %r6, 14; + @%p79 bra BB12_107; + bra.uni BB12_94; -BB12_113: +BB12_107: cvt.rni.s64.f64 %rd15, %fd1; cvt.rni.s64.f64 %rd16, %fd68; - cvt.u32.u64 %r56, %rd15; - cvt.u32.u64 %r57, %rd16; - or.b32 %r58, %r57, %r56; - setp.eq.s32 %p105, %r58, 0; - selp.f64 %fd108, 0d0000000000000000, 0d3FF0000000000000, %p105; - bra.uni BB12_137; + cvt.u32.u64 %r55, %rd15; + cvt.u32.u64 %r56, %rd16; + or.b32 %r57, %r56, %r55; + setp.eq.s32 %p111, %r57, 0; + selp.f64 %fd106, 0d0000000000000000, 0d3FF0000000000000, %p111; + bra.uni BB12_129; BB12_8: setp.eq.s32 %p27, %r6, 2; - @%p27 bra BB12_67; + @%p27 bra BB12_63; bra.uni BB12_9; -BB12_67: - 
mul.f64 %fd99, %fd1, %fd68; - bra.uni BB12_69; +BB12_63: + mul.f64 %fd98, %fd1, %fd68; + bra.uni BB12_65; BB12_24: setp.eq.s32 %p14, %r6, 11; - @%p14 bra BB12_47; + @%p14 bra BB12_45; setp.eq.s32 %p15, %r6, 12; - @%p15 bra BB12_46; + @%p15 bra BB12_44; bra.uni BB12_26; -BB12_46: - max.f64 %fd99, %fd68, %fd1; - bra.uni BB12_69; +BB12_44: + max.f64 %fd98, %fd68, %fd1; + bra.uni BB12_65; BB12_15: setp.eq.s32 %p21, %r6, 6; - @%p21 bra BB12_50; + @%p21 bra BB12_48; setp.eq.s32 %p22, %r6, 7; - @%p22 bra BB12_49; + @%p22 bra BB12_47; bra.uni BB12_17; -BB12_49: - setp.lt.f64 %p46, %fd1, %fd68; - selp.f64 %fd99, 0d3FF0000000000000, 0d0000000000000000, %p46; - bra.uni BB12_69; +BB12_47: + setp.lt.f64 %p48, %fd1, %fd68; + selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p48; + bra.uni BB12_65; BB12_32: setp.eq.s32 %p8, %r6, 16; - @%p8 bra BB12_44; + @%p8 bra BB12_42; setp.eq.s32 %p9, %r6, 17; - @%p9 bra BB12_39; + @%p9 bra BB12_38; bra.uni BB12_34; -BB12_39: - setp.eq.f64 %p34, %fd1, 0d0000000000000000; - setp.eq.f64 %p35, %fd1, 0d8000000000000000; - or.pred %p36, %p34, %p35; - mov.f64 %fd99, 0d7FF8000000000000; - @%p36 bra BB12_69; +BB12_38: + setp.eq.f64 %p35, %fd1, 0d0000000000000000; + setp.eq.f64 %p36, %fd1, 0d8000000000000000; + or.pred %p37, %p35, %p36; + mov.f64 %fd98, 0d7FF8000000000000; + @%p37 bra BB12_65; - div.rn.f64 %fd99, %fd68, %fd1; - abs.f64 %fd72, %fd99; - setp.gtu.f64 %p37, %fd72, 0d7FF0000000000000; - @%p37 bra BB12_69; + div.rn.f64 %fd98, %fd68, %fd1; + abs.f64 %fd72, %fd98; + setp.gtu.f64 %p38, %fd72, 0d7FF0000000000000; + @%p38 bra BB12_65; { .reg .b32 %temp; - mov.b64 {%temp, %r15}, %fd99; + mov.b64 {%r15, %temp}, %fd98; } - and.b32 %r16, %r15, 2147483647; - setp.ne.s32 %p38, %r16, 2146435072; - @%p38 bra BB12_43; - { .reg .b32 %temp; - mov.b64 {%r17, %temp}, %fd99; + mov.b64 {%temp, %r16}, %fd98; } - setp.eq.s32 %p39, %r17, 0; - @%p39 bra BB12_69; - -BB12_43: - cvt.rmi.f64.f64 %fd73, %fd99; + and.b32 %r17, %r16, 2147483647; + setp.ne.s32 
%p39, %r17, 2146435072; + setp.ne.s32 %p40, %r15, 0; + or.pred %p41, %p39, %p40; + @!%p41 bra BB12_65; + bra.uni BB12_41; + +BB12_41: + cvt.rmi.f64.f64 %fd73, %fd98; mul.f64 %fd74, %fd1, %fd73; - sub.f64 %fd99, %fd68, %fd74; - bra.uni BB12_69; + sub.f64 %fd98, %fd68, %fd74; + bra.uni BB12_65; -BB12_76: - setp.eq.s32 %p91, %r6, 2; - @%p91 bra BB12_135; - bra.uni BB12_77; +BB12_72: + setp.eq.s32 %p95, %r6, 2; + @%p95 bra BB12_127; + bra.uni BB12_73; -BB12_135: - mul.f64 %fd108, %fd1, %fd68; - bra.uni BB12_137; +BB12_127: + mul.f64 %fd106, %fd1, %fd68; + bra.uni BB12_129; -BB12_92: - setp.eq.s32 %p78, %r6, 11; - @%p78 bra BB12_115; +BB12_88: + setp.eq.s32 %p82, %r6, 11; + @%p82 bra BB12_109; - setp.eq.s32 %p79, %r6, 12; - @%p79 bra BB12_114; - bra.uni BB12_94; + setp.eq.s32 %p83, %r6, 12; + @%p83 bra BB12_108; + bra.uni BB12_90; -BB12_114: - max.f64 %fd108, %fd1, %fd68; - bra.uni BB12_137; +BB12_108: + max.f64 %fd106, %fd1, %fd68; + bra.uni BB12_129; -BB12_83: - setp.eq.s32 %p85, %r6, 6; - @%p85 bra BB12_118; +BB12_79: + setp.eq.s32 %p89, %r6, 6; + @%p89 bra BB12_112; - setp.eq.s32 %p86, %r6, 7; - @%p86 bra BB12_117; - bra.uni BB12_85; + setp.eq.s32 %p90, %r6, 7; + @%p90 bra BB12_111; + bra.uni BB12_81; -BB12_117: - setp.gt.f64 %p110, %fd1, %fd68; - selp.f64 %fd108, 0d3FF0000000000000, 0d0000000000000000, %p110; - bra.uni BB12_137; +BB12_111: + setp.gt.f64 %p116, %fd1, %fd68; + selp.f64 %fd106, 0d3FF0000000000000, 0d0000000000000000, %p116; + bra.uni BB12_129; -BB12_100: - setp.eq.s32 %p72, %r6, 16; - @%p72 bra BB12_112; +BB12_96: + setp.eq.s32 %p76, %r6, 16; + @%p76 bra BB12_106; - setp.eq.s32 %p73, %r6, 17; - @%p73 bra BB12_107; - bra.uni BB12_102; + setp.eq.s32 %p77, %r6, 17; + @%p77 bra BB12_102; + bra.uni BB12_98; -BB12_107: - setp.eq.f64 %p98, %fd68, 0d0000000000000000; - setp.eq.f64 %p99, %fd68, 0d8000000000000000; - or.pred %p100, %p98, %p99; - mov.f64 %fd108, 0d7FF8000000000000; - @%p100 bra BB12_137; +BB12_102: + setp.eq.f64 %p103, %fd68, 0d0000000000000000; 
+ setp.eq.f64 %p104, %fd68, 0d8000000000000000; + or.pred %p105, %p103, %p104; + mov.f64 %fd106, 0d7FF8000000000000; + @%p105 bra BB12_129; - div.rn.f64 %fd108, %fd1, %fd68; - abs.f64 %fd83, %fd108; - setp.gtu.f64 %p101, %fd83, 0d7FF0000000000000; - @%p101 bra BB12_137; + div.rn.f64 %fd106, %fd1, %fd68; + abs.f64 %fd83, %fd106; + setp.gtu.f64 %p106, %fd83, 0d7FF0000000000000; + @%p106 bra BB12_129; { .reg .b32 %temp; - mov.b64 {%temp, %r53}, %fd108; + mov.b64 {%r52, %temp}, %fd106; } - and.b32 %r54, %r53, 2147483647; - setp.ne.s32 %p102, %r54, 2146435072; - @%p102 bra BB12_111; - { .reg .b32 %temp; - mov.b64 {%r55, %temp}, %fd108; + mov.b64 {%temp, %r53}, %fd106; } - setp.eq.s32 %p103, %r55, 0; - @%p103 bra BB12_137; - -BB12_111: - cvt.rmi.f64.f64 %fd84, %fd108; + and.b32 %r54, %r53, 2147483647; + setp.ne.s32 %p107, %r54, 2146435072; + setp.ne.s32 %p108, %r52, 0; + or.pred %p109, %p107, %p108; + @!%p109 bra BB12_129; + bra.uni BB12_105; + +BB12_105: + cvt.rmi.f64.f64 %fd84, %fd106; mul.f64 %fd85, %fd84, %fd68; - sub.f64 %fd108, %fd1, %fd85; - bra.uni BB12_137; + sub.f64 %fd106, %fd1, %fd85; + bra.uni BB12_129; BB12_6: setp.eq.s32 %p30, %r6, 1; @%p30 bra BB12_7; - bra.uni BB12_69; + bra.uni BB12_65; BB12_7: - sub.f64 %fd99, %fd68, %fd1; - bra.uni BB12_69; + sub.f64 %fd98, %fd68, %fd1; + bra.uni BB12_65; BB12_22: setp.eq.s32 %p18, %r6, 10; @%p18 bra BB12_23; - bra.uni BB12_69; + bra.uni BB12_65; BB12_23: - setp.neu.f64 %p43, %fd1, %fd68; - selp.f64 %fd99, 0d3FF0000000000000, 0d0000000000000000, %p43; - bra.uni BB12_69; + setp.neu.f64 %p45, %fd1, %fd68; + selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p45; + bra.uni BB12_65; BB12_13: setp.eq.s32 %p25, %r6, 5; @%p25 bra BB12_14; - bra.uni BB12_69; + bra.uni BB12_65; BB12_14: - setp.gt.f64 %p48, %fd1, %fd68; - selp.f64 %fd99, 0d3FF0000000000000, 0d0000000000000000, %p48; - bra.uni BB12_69; + setp.gt.f64 %p50, %fd1, %fd68; + selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p50; + bra.uni BB12_65; 
BB12_30: setp.eq.s32 %p12, %r6, 15; @%p12 bra BB12_31; - bra.uni BB12_69; + bra.uni BB12_65; BB12_31: mul.f64 %fd76, %fd1, %fd68; mov.f64 %fd77, 0d3FF0000000000000; - sub.f64 %fd99, %fd77, %fd76; - bra.uni BB12_69; + sub.f64 %fd98, %fd77, %fd76; + bra.uni BB12_65; BB12_9: setp.eq.s32 %p28, %r6, 3; @%p28 bra BB12_10; - bra.uni BB12_69; + bra.uni BB12_65; BB12_10: - div.rn.f64 %fd99, %fd68, %fd1; - bra.uni BB12_69; + div.rn.f64 %fd98, %fd68, %fd1; + bra.uni BB12_65; -BB12_47: - min.f64 %fd99, %fd68, %fd1; - bra.uni BB12_69; +BB12_45: + min.f64 %fd98, %fd68, %fd1; + bra.uni BB12_65; BB12_26: setp.eq.s32 %p16, %r6, 13; @%p16 bra BB12_27; - bra.uni BB12_69; + bra.uni BB12_65; BB12_27: cvt.rni.s64.f64 %rd12, %fd68; @@ -1672,348 +1670,344 @@ BB12_27: cvt.u32.u64 %r21, %rd12; cvt.u32.u64 %r22, %rd13; and.b32 %r23, %r22, %r21; - setp.eq.s32 %p42, %r23, 0; - selp.f64 %fd99, 0d0000000000000000, 0d3FF0000000000000, %p42; - bra.uni BB12_69; + setp.eq.s32 %p44, %r23, 0; + selp.f64 %fd98, 0d0000000000000000, 0d3FF0000000000000, %p44; + bra.uni BB12_65; -BB12_50: - setp.ge.f64 %p47, %fd1, %fd68; - selp.f64 %fd99, 0d3FF0000000000000, 0d0000000000000000, %p47; - bra.uni BB12_69; +BB12_48: + setp.ge.f64 %p49, %fd1, %fd68; + selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p49; + bra.uni BB12_65; BB12_17: setp.eq.s32 %p23, %r6, 8; @%p23 bra BB12_18; - bra.uni BB12_69; + bra.uni BB12_65; BB12_18: - setp.le.f64 %p45, %fd1, %fd68; - selp.f64 %fd99, 0d3FF0000000000000, 0d0000000000000000, %p45; - bra.uni BB12_69; + setp.le.f64 %p47, %fd1, %fd68; + selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p47; + bra.uni BB12_65; -BB12_44: - setp.neu.f64 %p40, %fd68, 0d0000000000000000; +BB12_42: + setp.neu.f64 %p42, %fd68, 0d0000000000000000; sub.f64 %fd75, %fd68, %fd1; - selp.f64 %fd99, %fd75, 0d0000000000000000, %p40; - bra.uni BB12_69; + selp.f64 %fd98, %fd75, 0d0000000000000000, %p42; + bra.uni BB12_65; BB12_34: setp.ne.s32 %p10, %r6, 18; - @%p10 bra BB12_69; + @%p10 bra 
BB12_65; - div.rn.f64 %fd99, %fd68, %fd1; - abs.f64 %fd70, %fd99; + div.rn.f64 %fd98, %fd68, %fd1; + abs.f64 %fd70, %fd98; setp.gtu.f64 %p31, %fd70, 0d7FF0000000000000; - @%p31 bra BB12_69; + @%p31 bra BB12_65; { .reg .b32 %temp; - mov.b64 {%temp, %r12}, %fd99; + mov.b64 {%r12, %temp}, %fd98; } - and.b32 %r13, %r12, 2147483647; - setp.ne.s32 %p32, %r13, 2146435072; - @%p32 bra BB12_38; - { .reg .b32 %temp; - mov.b64 {%r14, %temp}, %fd99; + mov.b64 {%temp, %r13}, %fd98; } - setp.eq.s32 %p33, %r14, 0; - @%p33 bra BB12_69; + and.b32 %r14, %r13, 2147483647; + setp.ne.s32 %p32, %r14, 2146435072; + setp.ne.s32 %p33, %r12, 0; + or.pred %p34, %p32, %p33; + @!%p34 bra BB12_65; + bra.uni BB12_37; -BB12_38: - cvt.rmi.f64.f64 %fd99, %fd99; - bra.uni BB12_69; +BB12_37: + cvt.rmi.f64.f64 %fd98, %fd98; + bra.uni BB12_65; -BB12_74: - setp.eq.s32 %p94, %r6, 1; - @%p94 bra BB12_75; - bra.uni BB12_137; +BB12_70: + setp.eq.s32 %p98, %r6, 1; + @%p98 bra BB12_71; + bra.uni BB12_129; -BB12_75: - sub.f64 %fd108, %fd1, %fd68; - bra.uni BB12_137; +BB12_71: + sub.f64 %fd106, %fd1, %fd68; + bra.uni BB12_129; -BB12_90: - setp.eq.s32 %p82, %r6, 10; - @%p82 bra BB12_91; - bra.uni BB12_137; +BB12_86: + setp.eq.s32 %p86, %r6, 10; + @%p86 bra BB12_87; + bra.uni BB12_129; -BB12_91: - setp.neu.f64 %p107, %fd1, %fd68; - selp.f64 %fd108, 0d3FF0000000000000, 0d0000000000000000, %p107; - bra.uni BB12_137; +BB12_87: + setp.neu.f64 %p113, %fd1, %fd68; + selp.f64 %fd106, 0d3FF0000000000000, 0d0000000000000000, %p113; + bra.uni BB12_129; -BB12_81: - setp.eq.s32 %p89, %r6, 5; - @%p89 bra BB12_82; - bra.uni BB12_137; +BB12_77: + setp.eq.s32 %p93, %r6, 5; + @%p93 bra BB12_78; + bra.uni BB12_129; -BB12_82: - setp.lt.f64 %p112, %fd1, %fd68; - selp.f64 %fd108, 0d3FF0000000000000, 0d0000000000000000, %p112; - bra.uni BB12_137; +BB12_78: + setp.lt.f64 %p118, %fd1, %fd68; + selp.f64 %fd106, 0d3FF0000000000000, 0d0000000000000000, %p118; + bra.uni BB12_129; -BB12_98: - setp.eq.s32 %p76, %r6, 15; - @%p76 bra BB12_99; - 
bra.uni BB12_137; +BB12_94: + setp.eq.s32 %p80, %r6, 15; + @%p80 bra BB12_95; + bra.uni BB12_129; -BB12_99: +BB12_95: mul.f64 %fd87, %fd1, %fd68; mov.f64 %fd88, 0d3FF0000000000000; - sub.f64 %fd108, %fd88, %fd87; - bra.uni BB12_137; + sub.f64 %fd106, %fd88, %fd87; + bra.uni BB12_129; -BB12_77: - setp.eq.s32 %p92, %r6, 3; - @%p92 bra BB12_78; - bra.uni BB12_137; +BB12_73: + setp.eq.s32 %p96, %r6, 3; + @%p96 bra BB12_74; + bra.uni BB12_129; -BB12_78: - div.rn.f64 %fd108, %fd1, %fd68; - bra.uni BB12_137; +BB12_74: + div.rn.f64 %fd106, %fd1, %fd68; + bra.uni BB12_129; -BB12_115: - min.f64 %fd108, %fd1, %fd68; - bra.uni BB12_137; +BB12_109: + min.f64 %fd106, %fd1, %fd68; + bra.uni BB12_129; -BB12_94: - setp.eq.s32 %p80, %r6, 13; - @%p80 bra BB12_95; - bra.uni BB12_137; +BB12_90: + setp.eq.s32 %p84, %r6, 13; + @%p84 bra BB12_91; + bra.uni BB12_129; -BB12_95: +BB12_91: cvt.rni.s64.f64 %rd17, %fd1; cvt.rni.s64.f64 %rd18, %fd68; - cvt.u32.u64 %r59, %rd17; - cvt.u32.u64 %r60, %rd18; - and.b32 %r61, %r60, %r59; - setp.eq.s32 %p106, %r61, 0; - selp.f64 %fd108, 0d0000000000000000, 0d3FF0000000000000, %p106; - bra.uni BB12_137; + cvt.u32.u64 %r58, %rd17; + cvt.u32.u64 %r59, %rd18; + and.b32 %r60, %r59, %r58; + setp.eq.s32 %p112, %r60, 0; + selp.f64 %fd106, 0d0000000000000000, 0d3FF0000000000000, %p112; + bra.uni BB12_129; -BB12_118: - setp.le.f64 %p111, %fd1, %fd68; - selp.f64 %fd108, 0d3FF0000000000000, 0d0000000000000000, %p111; - bra.uni BB12_137; +BB12_112: + setp.le.f64 %p117, %fd1, %fd68; + selp.f64 %fd106, 0d3FF0000000000000, 0d0000000000000000, %p117; + bra.uni BB12_129; -BB12_85: - setp.eq.s32 %p87, %r6, 8; - @%p87 bra BB12_86; - bra.uni BB12_137; +BB12_81: + setp.eq.s32 %p91, %r6, 8; + @%p91 bra BB12_82; + bra.uni BB12_129; -BB12_86: - setp.ge.f64 %p109, %fd1, %fd68; - selp.f64 %fd108, 0d3FF0000000000000, 0d0000000000000000, %p109; - bra.uni BB12_137; +BB12_82: + setp.ge.f64 %p115, %fd1, %fd68; + selp.f64 %fd106, 0d3FF0000000000000, 0d0000000000000000, %p115; + bra.uni 
BB12_129; -BB12_112: - setp.neu.f64 %p104, %fd1, 0d0000000000000000; +BB12_106: + setp.neu.f64 %p110, %fd1, 0d0000000000000000; sub.f64 %fd86, %fd1, %fd68; - selp.f64 %fd108, %fd86, 0d0000000000000000, %p104; - bra.uni BB12_137; + selp.f64 %fd106, %fd86, 0d0000000000000000, %p110; + bra.uni BB12_129; -BB12_102: - setp.ne.s32 %p74, %r6, 18; - @%p74 bra BB12_137; +BB12_98: + setp.ne.s32 %p78, %r6, 18; + @%p78 bra BB12_129; - div.rn.f64 %fd108, %fd1, %fd68; - abs.f64 %fd81, %fd108; - setp.gtu.f64 %p95, %fd81, 0d7FF0000000000000; - @%p95 bra BB12_137; + div.rn.f64 %fd106, %fd1, %fd68; + abs.f64 %fd81, %fd106; + setp.gtu.f64 %p99, %fd81, 0d7FF0000000000000; + @%p99 bra BB12_129; { .reg .b32 %temp; - mov.b64 {%temp, %r50}, %fd108; + mov.b64 {%r49, %temp}, %fd106; } - and.b32 %r51, %r50, 2147483647; - setp.ne.s32 %p96, %r51, 2146435072; - @%p96 bra BB12_106; - { .reg .b32 %temp; - mov.b64 {%r52, %temp}, %fd108; + mov.b64 {%temp, %r50}, %fd106; } - setp.eq.s32 %p97, %r52, 0; - @%p97 bra BB12_137; + and.b32 %r51, %r50, 2147483647; + setp.ne.s32 %p100, %r51, 2146435072; + setp.ne.s32 %p101, %r49, 0; + or.pred %p102, %p100, %p101; + @!%p102 bra BB12_129; + bra.uni BB12_101; -BB12_106: - cvt.rmi.f64.f64 %fd108, %fd108; - bra.uni BB12_137; +BB12_101: + cvt.rmi.f64.f64 %fd106, %fd106; + bra.uni BB12_129; -BB12_54: - setp.gt.s32 %p52, %r2, -1; - @%p52 bra BB12_57; +BB12_52: + setp.gt.s32 %p54, %r2, -1; + @%p54 bra BB12_55; cvt.rzi.f64.f64 %fd78, %fd1; - setp.neu.f64 %p53, %fd78, %fd1; - selp.f64 %fd97, 0dFFF8000000000000, %fd97, %p53; + setp.neu.f64 %p55, %fd78, %fd1; + selp.f64 %fd96, 0dFFF8000000000000, %fd96, %p55; -BB12_57: - mov.f64 %fd24, %fd97; +BB12_55: + mov.f64 %fd24, %fd96; add.f64 %fd25, %fd1, %fd68; { .reg .b32 %temp; mov.b64 {%temp, %r33}, %fd25; } and.b32 %r34, %r33, 2146435072; - setp.ne.s32 %p56, %r34, 2146435072; - mov.f64 %fd96, %fd24; - @%p56 bra BB12_66; + setp.ne.s32 %p58, %r34, 2146435072; + mov.f64 %fd95, %fd24; + @%p58 bra BB12_62; - setp.gtu.f64 %p57, 
%fd18, 0d7FF0000000000000; - mov.f64 %fd96, %fd25; - @%p57 bra BB12_66; - - abs.f64 %fd79, %fd1; - setp.gtu.f64 %p58, %fd79, 0d7FF0000000000000; + setp.gtu.f64 %p59, %fd18, 0d7FF0000000000000; mov.f64 %fd95, %fd25; - mov.f64 %fd96, %fd95; - @%p58 bra BB12_66; - - and.b32 %r35, %r3, 2147483647; - setp.ne.s32 %p59, %r35, 2146435072; @%p59 bra BB12_62; - { - .reg .b32 %temp; - mov.b64 {%r36, %temp}, %fd1; - } - setp.eq.s32 %p60, %r36, 0; - @%p60 bra BB12_65; - -BB12_62: - and.b32 %r37, %r2, 2147483647; - setp.ne.s32 %p61, %r37, 2146435072; - mov.f64 %fd93, %fd24; - mov.f64 %fd96, %fd93; - @%p61 bra BB12_66; + abs.f64 %fd79, %fd1; + setp.gtu.f64 %p60, %fd79, 0d7FF0000000000000; + mov.f64 %fd94, %fd25; + mov.f64 %fd95, %fd94; + @%p60 bra BB12_62; { .reg .b32 %temp; - mov.b64 {%r38, %temp}, %fd68; + mov.b64 {%r35, %temp}, %fd1; } - setp.ne.s32 %p62, %r38, 0; - mov.f64 %fd96, %fd24; - @%p62 bra BB12_66; - - shr.s32 %r39, %r3, 31; - and.b32 %r40, %r39, -2146435072; - add.s32 %r41, %r40, 2146435072; - or.b32 %r42, %r41, -2147483648; - selp.b32 %r43, %r42, %r41, %p1; - mov.u32 %r44, 0; - mov.b64 %fd96, {%r44, %r43}; - bra.uni BB12_66; + and.b32 %r36, %r3, 2147483647; + setp.eq.s32 %p61, %r36, 2146435072; + setp.eq.s32 %p62, %r35, 0; + and.pred %p63, %p61, %p62; + @%p63 bra BB12_61; + bra.uni BB12_59; + +BB12_61: + setp.gt.f64 %p67, %fd18, 0d3FF0000000000000; + selp.b32 %r44, 2146435072, 0, %p67; + xor.b32 %r45, %r44, 2146435072; + setp.lt.s32 %p68, %r3, 0; + selp.b32 %r46, %r45, %r44, %p68; + setp.eq.f64 %p69, %fd68, 0dBFF0000000000000; + selp.b32 %r47, 1072693248, %r46, %p69; + mov.u32 %r48, 0; + mov.b64 %fd95, {%r48, %r47}; + bra.uni BB12_62; -BB12_122: - setp.gt.s32 %p116, %r4, -1; - @%p116 bra BB12_125; +BB12_116: + setp.gt.s32 %p122, %r4, -1; + @%p122 bra BB12_119; cvt.rzi.f64.f64 %fd89, %fd68; - setp.neu.f64 %p117, %fd89, %fd68; - selp.f64 %fd106, 0dFFF8000000000000, %fd106, %p117; + setp.neu.f64 %p123, %fd89, %fd68; + selp.f64 %fd104, 0dFFF8000000000000, %fd104, 
%p123; -BB12_125: - mov.f64 %fd57, %fd106; +BB12_119: + mov.f64 %fd57, %fd104; add.f64 %fd58, %fd1, %fd68; { .reg .b32 %temp; - mov.b64 {%temp, %r71}, %fd58; + mov.b64 {%temp, %r70}, %fd58; } - and.b32 %r72, %r71, 2146435072; - setp.ne.s32 %p120, %r72, 2146435072; - mov.f64 %fd105, %fd57; - @%p120 bra BB12_134; + and.b32 %r71, %r70, 2146435072; + setp.ne.s32 %p126, %r71, 2146435072; + mov.f64 %fd103, %fd57; + @%p126 bra BB12_126; - setp.gtu.f64 %p121, %fd51, 0d7FF0000000000000; - mov.f64 %fd105, %fd58; - @%p121 bra BB12_134; + setp.gtu.f64 %p127, %fd51, 0d7FF0000000000000; + mov.f64 %fd103, %fd58; + @%p127 bra BB12_126; abs.f64 %fd90, %fd68; - setp.gtu.f64 %p122, %fd90, 0d7FF0000000000000; - mov.f64 %fd104, %fd58; - mov.f64 %fd105, %fd104; - @%p122 bra BB12_134; + setp.gtu.f64 %p128, %fd90, 0d7FF0000000000000; + mov.f64 %fd102, %fd58; + mov.f64 %fd103, %fd102; + @%p128 bra BB12_126; + { + .reg .b32 %temp; + mov.b64 {%r72, %temp}, %fd68; + } and.b32 %r73, %r5, 2147483647; - setp.ne.s32 %p123, %r73, 2146435072; - @%p123 bra BB12_130; + setp.eq.s32 %p129, %r73, 2146435072; + setp.eq.s32 %p130, %r72, 0; + and.pred %p131, %p129, %p130; + @%p131 bra BB12_125; + bra.uni BB12_123; +BB12_125: + setp.gt.f64 %p135, %fd51, 0d3FF0000000000000; + selp.b32 %r81, 2146435072, 0, %p135; + xor.b32 %r82, %r81, 2146435072; + setp.lt.s32 %p136, %r5, 0; + selp.b32 %r83, %r82, %r81, %p136; + setp.eq.f64 %p137, %fd1, 0dBFF0000000000000; + selp.b32 %r84, 1072693248, %r83, %p137; + mov.u32 %r85, 0; + mov.b64 %fd103, {%r85, %r84}; + bra.uni BB12_126; + +BB12_59: { .reg .b32 %temp; - mov.b64 {%r74, %temp}, %fd68; + mov.b64 {%r37, %temp}, %fd68; } - setp.eq.s32 %p124, %r74, 0; - @%p124 bra BB12_133; + and.b32 %r38, %r2, 2147483647; + setp.eq.s32 %p64, %r38, 2146435072; + setp.eq.s32 %p65, %r37, 0; + and.pred %p66, %p64, %p65; + mov.f64 %fd95, %fd24; + @!%p66 bra BB12_62; + bra.uni BB12_60; + +BB12_60: + shr.s32 %r39, %r3, 31; + and.b32 %r40, %r39, -2146435072; + selp.b32 %r41, -1048576, 
2146435072, %p1; + add.s32 %r42, %r41, %r40; + mov.u32 %r43, 0; + mov.b64 %fd95, {%r43, %r42}; -BB12_130: - and.b32 %r75, %r4, 2147483647; - setp.ne.s32 %p125, %r75, 2146435072; - mov.f64 %fd102, %fd57; - mov.f64 %fd105, %fd102; - @%p125 bra BB12_134; +BB12_62: + setp.eq.f64 %p70, %fd1, 0d0000000000000000; + setp.eq.f64 %p71, %fd68, 0d3FF0000000000000; + or.pred %p72, %p71, %p70; + selp.f64 %fd98, 0d3FF0000000000000, %fd95, %p72; +BB12_65: + st.global.f64 [%rd1], %fd98; + bra.uni BB12_130; + +BB12_123: { .reg .b32 %temp; - mov.b64 {%r76, %temp}, %fd1; + mov.b64 {%r74, %temp}, %fd1; } - setp.ne.s32 %p126, %r76, 0; - mov.f64 %fd105, %fd57; - @%p126 bra BB12_134; - - shr.s32 %r77, %r5, 31; - and.b32 %r78, %r77, -2146435072; - add.s32 %r79, %r78, 2146435072; - or.b32 %r80, %r79, -2147483648; - selp.b32 %r81, %r80, %r79, %p2; - mov.u32 %r82, 0; - mov.b64 %fd105, {%r82, %r81}; - bra.uni BB12_134; + and.b32 %r75, %r4, 2147483647; + setp.eq.s32 %p132, %r75, 2146435072; + setp.eq.s32 %p133, %r74, 0; + and.pred %p134, %p132, %p133; + mov.f64 %fd103, %fd57; + @!%p134 bra BB12_126; + bra.uni BB12_124; -BB12_65: - setp.gt.f64 %p63, %fd18, 0d3FF0000000000000; - selp.b32 %r45, 2146435072, 0, %p63; - xor.b32 %r46, %r45, 2146435072; - setp.lt.s32 %p64, %r3, 0; - selp.b32 %r47, %r46, %r45, %p64; - setp.eq.f64 %p65, %fd68, 0dBFF0000000000000; - selp.b32 %r48, 1072693248, %r47, %p65; - mov.u32 %r49, 0; - mov.b64 %fd96, {%r49, %r48}; +BB12_124: + shr.s32 %r76, %r5, 31; + and.b32 %r77, %r76, -2146435072; + selp.b32 %r78, -1048576, 2146435072, %p2; + add.s32 %r79, %r78, %r77; + mov.u32 %r80, 0; + mov.b64 %fd103, {%r80, %r79}; + +BB12_126: + setp.eq.f64 %p138, %fd68, 0d0000000000000000; + setp.eq.f64 %p139, %fd1, 0d3FF0000000000000; + or.pred %p140, %p139, %p138; + selp.f64 %fd106, 0d3FF0000000000000, %fd103, %p140; + +BB12_129: + st.global.f64 [%rd1], %fd106; -BB12_66: - setp.eq.f64 %p66, %fd1, 0d0000000000000000; - setp.eq.f64 %p67, %fd68, 0d3FF0000000000000; - or.pred %p68, %p67, %p66; 
- selp.f64 %fd99, 0d3FF0000000000000, %fd96, %p68; - -BB12_69: - st.global.f64 [%rd1], %fd99; - bra.uni BB12_138; - -BB12_133: - setp.gt.f64 %p127, %fd51, 0d3FF0000000000000; - selp.b32 %r83, 2146435072, 0, %p127; - xor.b32 %r84, %r83, 2146435072; - setp.lt.s32 %p128, %r5, 0; - selp.b32 %r85, %r84, %r83, %p128; - setp.eq.f64 %p129, %fd1, 0dBFF0000000000000; - selp.b32 %r86, 1072693248, %r85, %p129; - mov.u32 %r87, 0; - mov.b64 %fd105, {%r87, %r86}; - -BB12_134: - setp.eq.f64 %p130, %fd68, 0d0000000000000000; - setp.eq.f64 %p131, %fd1, 0d3FF0000000000000; - or.pred %p132, %p131, %p130; - selp.f64 %fd108, 0d3FF0000000000000, %fd105, %p132; - -BB12_137: - st.global.f64 [%rd1], %fd108; - -BB12_138: +BB12_130: bar.sync 0; ret; } @@ -4431,7 +4425,7 @@ BB34_2: .local .align 4 .b8 __local_depot35[4]; .reg .b64 %SP; .reg .b64 %SPL; - .reg .pred %p<7>; + .reg .pred %p<9>; .reg .b32 %r<18>; .reg .f64 %fd<41>; .reg .b64 %rd<17>; @@ -4449,7 +4443,7 @@ BB34_2: mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.u32 %p1, %r1, %r5; - @%p1 bra BB35_11; + @%p1 bra BB35_10; cvta.to.global.u64 %rd6, %rd3; cvt.s64.s32 %rd2, %r1; @@ -4458,23 +4452,24 @@ BB34_2: ld.global.f64 %fd38, [%rd8]; { .reg .b32 %temp; - mov.b64 {%temp, %r9}, %fd38; + mov.b64 {%r9, %temp}, %fd38; } - and.b32 %r10, %r9, 2147483647; - setp.ne.s32 %p2, %r10, 2146435072; - @%p2 bra BB35_4; - { .reg .b32 %temp; - mov.b64 {%r11, %temp}, %fd38; + mov.b64 {%temp, %r10}, %fd38; } - setp.ne.s32 %p3, %r11, 0; - @%p3 bra BB35_4; + and.b32 %r11, %r10, 2147483647; + setp.eq.s32 %p2, %r11, 2146435072; + setp.eq.s32 %p3, %r9, 0; + and.pred %p4, %p2, %p3; + @!%p4 bra BB35_3; + bra.uni BB35_2; +BB35_2: mov.f64 %fd14, 0d0000000000000000; mul.rn.f64 %fd38, %fd38, %fd14; -BB35_4: +BB35_3: mul.f64 %fd15, %fd38, 0d3FE45F306DC9C883; cvt.rni.s32.f64 %r17, %fd15; st.local.u32 [%rd1], %r17; @@ -4491,8 +4486,8 @@ BB35_4: mov.b64 {%temp, %r12}, %fd38; } and.b32 %r13, %r12, 2145386496; - setp.lt.u32 %p4, %r13, 1105199104; - @%p4 bra 
BB35_6; + setp.lt.u32 %p5, %r13, 1105199104; + @%p5 bra BB35_5; // Callseq Start 3 { @@ -4515,11 +4510,11 @@ BB35_4: }// Callseq End 3 ld.local.u32 %r17, [%rd1]; -BB35_6: +BB35_5: and.b32 %r14, %r17, 1; shl.b32 %r15, %r14, 3; - setp.eq.s32 %p5, %r14, 0; - selp.f64 %fd23, 0d3DE5DB65F9785EBA, 0dBDA8FF8320FD8164, %p5; + setp.eq.b32 %p6, %r14, 1; + selp.f64 %fd23, 0dBDA8FF8320FD8164, 0d3DE5DB65F9785EBA, %p6; mul.wide.u32 %rd10, %r15, 8; mov.u64 %rd11, __cudart_sin_cos_coeffs; add.s64 %rd12, %rd10, %rd11; @@ -4537,27 +4532,179 @@ BB35_6: ld.const.f64 %fd34, [%rd12+48]; fma.rn.f64 %fd8, %fd33, %fd7, %fd34; fma.rn.f64 %fd40, %fd8, %fd39, %fd39; - @%p5 bra BB35_8; + setp.eq.s32 %p7, %r14, 0; + @%p7 bra BB35_7; mov.f64 %fd35, 0d3FF0000000000000; fma.rn.f64 %fd40, %fd8, %fd7, %fd35; -BB35_8: +BB35_7: and.b32 %r16, %r17, 2; - setp.eq.s32 %p6, %r16, 0; - @%p6 bra BB35_10; + setp.eq.s32 %p8, %r16, 0; + @%p8 bra BB35_9; mov.f64 %fd36, 0d0000000000000000; mov.f64 %fd37, 0dBFF0000000000000; fma.rn.f64 %fd40, %fd40, %fd37, %fd36; -BB35_10: +BB35_9: cvta.to.global.u64 %rd13, %rd4; shl.b64 %rd14, %rd2, 3; add.s64 %rd15, %rd13, %rd14; st.global.f64 [%rd15], %fd40; -BB35_11: +BB35_10: + ret; +} + + // .globl matrix_sinh +.visible .entry matrix_sinh( + .param .u64 matrix_sinh_param_0, + .param .u64 matrix_sinh_param_1, + .param .u32 matrix_sinh_param_2 +) +{ + .reg .pred %p<7>; + .reg .b32 %r<24>; + .reg .f64 %fd<68>; + .reg .b64 %rd<10>; + + + ld.param.u64 %rd2, [matrix_sinh_param_0]; + ld.param.u64 %rd3, [matrix_sinh_param_1]; + ld.param.u32 %r3, [matrix_sinh_param_2]; + mov.u32 %r4, %ctaid.x; + mov.u32 %r5, %ntid.x; + mov.u32 %r6, %tid.x; + mad.lo.s32 %r1, %r5, %r4, %r6; + setp.ge.u32 %p1, %r1, %r3; + @%p1 bra BB36_5; + + cvta.to.global.u64 %rd4, %rd2; + cvt.s64.s32 %rd1, %r1; + mul.wide.s32 %rd5, %r1, 8; + add.s64 %rd6, %rd4, %rd5; + ld.global.f64 %fd5, [%rd6]; + { + .reg .b32 %temp; + mov.b64 {%temp, %r2}, %fd5; + } + and.b32 %r7, %r2, 2147483647; + { + .reg .b32 %temp; + mov.b64 
{%r8, %temp}, %fd5; + } + mov.b64 %fd1, {%r8, %r7}; + setp.lt.u32 %p2, %r7, 1072693248; + @%p2 bra BB36_3; + bra.uni BB36_2; + +BB36_3: + mul.f64 %fd51, %fd1, %fd1; + mov.f64 %fd52, 0d3DE611A561D87DEF; + mov.f64 %fd53, 0d3D6B4C75AB274C53; + fma.rn.f64 %fd54, %fd53, %fd51, %fd52; + mov.f64 %fd55, 0d3E5AE64671B18F5C; + fma.rn.f64 %fd56, %fd54, %fd51, %fd55; + mov.f64 %fd57, 0d3EC71DE3A465B1E4; + fma.rn.f64 %fd58, %fd56, %fd51, %fd57; + mov.f64 %fd59, 0d3F2A01A01A02899D; + fma.rn.f64 %fd60, %fd58, %fd51, %fd59; + mov.f64 %fd61, 0d3F811111111110A6; + fma.rn.f64 %fd62, %fd60, %fd51, %fd61; + mov.f64 %fd63, 0d3FC5555555555556; + fma.rn.f64 %fd64, %fd62, %fd51, %fd63; + mul.f64 %fd65, %fd51, %fd64; + fma.rn.f64 %fd67, %fd65, %fd1, %fd1; + bra.uni BB36_4; + +BB36_2: + { + .reg .b32 %temp; + mov.b64 {%temp, %r9}, %fd1; + } + mov.f64 %fd6, 0d4338000000000000; + mov.f64 %fd7, 0d3FF71547652B82FE; + fma.rn.f64 %fd8, %fd1, %fd7, %fd6; + { + .reg .b32 %temp; + mov.b64 {%r10, %temp}, %fd8; + } + add.s32 %r11, %r10, -1; + mov.f64 %fd9, 0dC338000000000000; + add.rn.f64 %fd10, %fd8, %fd9; + mov.f64 %fd11, 0dBFE62E42FEFA39EF; + fma.rn.f64 %fd12, %fd10, %fd11, %fd1; + mov.f64 %fd13, 0dBC7ABC9E3B39803F; + fma.rn.f64 %fd14, %fd10, %fd13, %fd12; + add.s32 %r12, %r9, %r9; + setp.lt.u32 %p3, %r12, 2142496327; + selp.b32 %r13, 0, %r11, %p3; + selp.f64 %fd15, %fd1, %fd14, %p3; + mov.f64 %fd16, 0d3E5AF86D8EBD13CD; + mov.f64 %fd17, 0d3E21F4076ACD15B6; + fma.rn.f64 %fd18, %fd17, %fd15, %fd16; + mov.f64 %fd19, 0d3E927E5092BA033D; + fma.rn.f64 %fd20, %fd18, %fd15, %fd19; + mov.f64 %fd21, 0d3EC71DDE6C5F9DA1; + fma.rn.f64 %fd22, %fd20, %fd15, %fd21; + mov.f64 %fd23, 0d3EFA01A018D034E6; + fma.rn.f64 %fd24, %fd22, %fd15, %fd23; + mov.f64 %fd25, 0d3F2A01A01B3B6940; + fma.rn.f64 %fd26, %fd24, %fd15, %fd25; + mov.f64 %fd27, 0d3F56C16C16C1B5DD; + fma.rn.f64 %fd28, %fd26, %fd15, %fd27; + mov.f64 %fd29, 0d3F8111111110F74D; + fma.rn.f64 %fd30, %fd28, %fd15, %fd29; + mov.f64 %fd31, 0d3FA555555555554D; + 
fma.rn.f64 %fd32, %fd30, %fd15, %fd31; + mov.f64 %fd33, 0d3FC5555555555557; + fma.rn.f64 %fd34, %fd32, %fd15, %fd33; + mov.f64 %fd35, 0d3FE0000000000000; + fma.rn.f64 %fd36, %fd34, %fd15, %fd35; + mul.f64 %fd37, %fd15, %fd36; + fma.rn.f64 %fd38, %fd37, %fd15, %fd15; + setp.eq.s32 %p4, %r13, 1024; + selp.b32 %r14, -1, 0, %p4; + add.s32 %r15, %r14, %r13; + shl.b32 %r16, %r15, 20; + add.s32 %r17, %r16, 1072693248; + mov.u32 %r18, 0; + mov.b64 %fd39, {%r18, %r17}; + mov.u32 %r19, 1071644672; + mov.b64 %fd40, {%r18, %r19}; + sub.f64 %fd41, %fd39, %fd40; + fma.rn.f64 %fd42, %fd38, %fd39, %fd41; + add.f64 %fd43, %fd42, %fd42; + selp.f64 %fd44, %fd43, %fd42, %p4; + setp.eq.s32 %p5, %r12, 0; + selp.f64 %fd45, %fd15, %fd44, %p5; + mov.f64 %fd46, 0d3FF0000000000000; + mov.f64 %fd47, 0d4000000000000000; + fma.rn.f64 %fd48, %fd47, %fd45, %fd46; + div.rn.f64 %fd49, %fd45, %fd48; + add.f64 %fd50, %fd49, %fd45; + setp.ge.f64 %p6, %fd1, 0d408633CE8FB9F87E; + selp.f64 %fd67, 0d7FF0000000000000, %fd50, %p6; + +BB36_4: + cvta.to.global.u64 %rd7, %rd3; + and.b32 %r20, %r2, -2147483648; + { + .reg .b32 %temp; + mov.b64 {%temp, %r21}, %fd67; + } + or.b32 %r22, %r21, %r20; + { + .reg .b32 %temp; + mov.b64 {%r23, %temp}, %fd67; + } + mov.b64 %fd66, {%r23, %r22}; + shl.b64 %rd8, %rd1, 3; + add.s64 %rd9, %rd7, %rd8; + st.global.f64 [%rd9], %fd66; + +BB36_5: ret; } @@ -4568,16 +4715,16 @@ BB35_11: .param .u32 matrix_cos_param_2 ) { - .local .align 4 .b8 __local_depot36[4]; + .local .align 4 .b8 __local_depot37[4]; .reg .b64 %SP; .reg .b64 %SPL; - .reg .pred %p<7>; + .reg .pred %p<9>; .reg .b32 %r<19>; .reg .f64 %fd<41>; .reg .b64 %rd<17>; - mov.u64 %rd16, __local_depot36; + mov.u64 %rd16, __local_depot37; cvta.local.u64 %SP, %rd16; ld.param.u64 %rd3, [matrix_cos_param_0]; ld.param.u64 %rd4, [matrix_cos_param_1]; @@ -4589,7 +4736,7 @@ BB35_11: mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; setp.ge.u32 %p1, %r1, %r6; - @%p1 bra BB36_11; + @%p1 bra BB37_10; cvta.to.global.u64 %rd6, %rd3; 
cvt.s64.s32 %rd2, %r1; @@ -4598,23 +4745,24 @@ BB35_11: ld.global.f64 %fd38, [%rd8]; { .reg .b32 %temp; - mov.b64 {%temp, %r10}, %fd38; + mov.b64 {%r10, %temp}, %fd38; } - and.b32 %r11, %r10, 2147483647; - setp.ne.s32 %p2, %r11, 2146435072; - @%p2 bra BB36_4; - { .reg .b32 %temp; - mov.b64 {%r12, %temp}, %fd38; + mov.b64 {%temp, %r11}, %fd38; } - setp.ne.s32 %p3, %r12, 0; - @%p3 bra BB36_4; + and.b32 %r12, %r11, 2147483647; + setp.eq.s32 %p2, %r12, 2146435072; + setp.eq.s32 %p3, %r10, 0; + and.pred %p4, %p2, %p3; + @!%p4 bra BB37_3; + bra.uni BB37_2; +BB37_2: mov.f64 %fd14, 0d0000000000000000; mul.rn.f64 %fd38, %fd38, %fd14; -BB36_4: +BB37_3: mul.f64 %fd15, %fd38, 0d3FE45F306DC9C883; cvt.rni.s32.f64 %r18, %fd15; st.local.u32 [%rd1], %r18; @@ -4631,8 +4779,8 @@ BB36_4: mov.b64 {%temp, %r13}, %fd38; } and.b32 %r14, %r13, 2145386496; - setp.lt.u32 %p4, %r14, 1105199104; - @%p4 bra BB36_6; + setp.lt.u32 %p5, %r14, 1105199104; + @%p5 bra BB37_5; // Callseq Start 4 { @@ -4655,12 +4803,12 @@ BB36_4: }// Callseq End 4 ld.local.u32 %r18, [%rd1]; -BB36_6: +BB37_5: add.s32 %r5, %r18, 1; and.b32 %r15, %r5, 1; shl.b32 %r16, %r15, 3; - setp.eq.s32 %p5, %r15, 0; - selp.f64 %fd23, 0d3DE5DB65F9785EBA, 0dBDA8FF8320FD8164, %p5; + setp.eq.b32 %p6, %r15, 1; + selp.f64 %fd23, 0dBDA8FF8320FD8164, 0d3DE5DB65F9785EBA, %p6; mul.wide.u32 %rd10, %r16, 8; mov.u64 %rd11, __cudart_sin_cos_coeffs; add.s64 %rd12, %rd10, %rd11; @@ -4678,27 +4826,148 @@ BB36_6: ld.const.f64 %fd34, [%rd12+48]; fma.rn.f64 %fd8, %fd33, %fd7, %fd34; fma.rn.f64 %fd40, %fd8, %fd39, %fd39; - @%p5 bra BB36_8; + setp.eq.s32 %p7, %r15, 0; + @%p7 bra BB37_7; mov.f64 %fd35, 0d3FF0000000000000; fma.rn.f64 %fd40, %fd8, %fd7, %fd35; -BB36_8: +BB37_7: and.b32 %r17, %r5, 2; - setp.eq.s32 %p6, %r17, 0; - @%p6 bra BB36_10; + setp.eq.s32 %p8, %r17, 0; + @%p8 bra BB37_9; mov.f64 %fd36, 0d0000000000000000; mov.f64 %fd37, 0dBFF0000000000000; fma.rn.f64 %fd40, %fd40, %fd37, %fd36; -BB36_10: +BB37_9: cvta.to.global.u64 %rd13, %rd4; shl.b64 
%rd14, %rd2, 3; add.s64 %rd15, %rd13, %rd14; st.global.f64 [%rd15], %fd40; -BB36_11: +BB37_10: + ret; +} + + // .globl matrix_cosh +.visible .entry matrix_cosh( + .param .u64 matrix_cosh_param_0, + .param .u64 matrix_cosh_param_1, + .param .u32 matrix_cosh_param_2 +) +{ + .reg .pred %p<4>; + .reg .b32 %r<16>; + .reg .f64 %fd<46>; + .reg .b64 %rd<10>; + + + ld.param.u64 %rd2, [matrix_cosh_param_0]; + ld.param.u64 %rd3, [matrix_cosh_param_1]; + ld.param.u32 %r2, [matrix_cosh_param_2]; + mov.u32 %r3, %ctaid.x; + mov.u32 %r4, %ntid.x; + mov.u32 %r5, %tid.x; + mad.lo.s32 %r1, %r4, %r3, %r5; + setp.ge.u32 %p1, %r1, %r2; + @%p1 bra BB38_5; + + cvta.to.global.u64 %rd4, %rd2; + cvt.s64.s32 %rd1, %r1; + mul.wide.s32 %rd5, %r1, 8; + add.s64 %rd6, %rd4, %rd5; + ld.global.f64 %fd1, [%rd6]; + { + .reg .b32 %temp; + mov.b64 {%temp, %r6}, %fd1; + } + and.b32 %r7, %r6, 2147483647; + { + .reg .b32 %temp; + mov.b64 {%r8, %temp}, %fd1; + } + mov.b64 %fd2, {%r8, %r7}; + { + .reg .b32 %temp; + mov.b64 {%temp, %r9}, %fd2; + } + setp.lt.u32 %p2, %r9, 1082536911; + @%p2 bra BB38_3; + bra.uni BB38_2; + +BB38_3: + mov.f64 %fd8, 0d4338000000000000; + mov.f64 %fd9, 0d3FF71547652B82FE; + fma.rn.f64 %fd10, %fd2, %fd9, %fd8; + { + .reg .b32 %temp; + mov.b64 {%r10, %temp}, %fd10; + } + mov.f64 %fd11, 0dC338000000000000; + add.rn.f64 %fd12, %fd10, %fd11; + mov.f64 %fd13, 0dBFE62E42FEFA39EF; + fma.rn.f64 %fd14, %fd12, %fd13, %fd2; + mov.f64 %fd15, 0dBC7ABC9E3B39803F; + fma.rn.f64 %fd16, %fd12, %fd15, %fd14; + mov.f64 %fd17, 0d3E928AF3FCA213EA; + mov.f64 %fd18, 0d3E5ADE1569CE2BDF; + fma.rn.f64 %fd19, %fd18, %fd16, %fd17; + mov.f64 %fd20, 0d3EC71DEE62401315; + fma.rn.f64 %fd21, %fd19, %fd16, %fd20; + mov.f64 %fd22, 0d3EFA01997C89EB71; + fma.rn.f64 %fd23, %fd21, %fd16, %fd22; + mov.f64 %fd24, 0d3F2A01A014761F65; + fma.rn.f64 %fd25, %fd23, %fd16, %fd24; + mov.f64 %fd26, 0d3F56C16C1852B7AF; + fma.rn.f64 %fd27, %fd25, %fd16, %fd26; + mov.f64 %fd28, 0d3F81111111122322; + fma.rn.f64 %fd29, %fd27, %fd16, 
%fd28; + mov.f64 %fd30, 0d3FA55555555502A1; + fma.rn.f64 %fd31, %fd29, %fd16, %fd30; + mov.f64 %fd32, 0d3FC5555555555511; + fma.rn.f64 %fd33, %fd31, %fd16, %fd32; + mov.f64 %fd34, 0d3FE000000000000B; + fma.rn.f64 %fd35, %fd33, %fd16, %fd34; + mov.f64 %fd36, 0d3FF0000000000000; + fma.rn.f64 %fd37, %fd35, %fd16, %fd36; + fma.rn.f64 %fd38, %fd37, %fd16, %fd36; + shl.b32 %r11, %r10, 20; + { + .reg .b32 %temp; + mov.b64 {%r12, %temp}, %fd38; + } + { + .reg .b32 %temp; + mov.b64 {%temp, %r13}, %fd38; + } + add.s32 %r14, %r11, %r13; + add.s32 %r15, %r14, -2097152; + mov.b64 %fd7, {%r12, %r15}; + // inline asm + rcp.approx.ftz.f64 %fd6,%fd7; + // inline asm + neg.f64 %fd39, %fd7; + fma.rn.f64 %fd40, %fd39, %fd6, %fd36; + fma.rn.f64 %fd41, %fd40, %fd40, %fd40; + fma.rn.f64 %fd42, %fd41, %fd6, %fd6; + mov.f64 %fd43, 0d3FB0000000000000; + fma.rn.f64 %fd45, %fd42, %fd43, %fd7; + bra.uni BB38_4; + +BB38_2: + setp.le.f64 %p3, %fd1, 0d7FF0000000000000; + selp.f64 %fd45, 0d7FF0000000000000, %fd1, %p3; + +BB38_4: + cvta.to.global.u64 %rd7, %rd3; + shl.b64 %rd8, %rd1, 3; + add.s64 %rd9, %rd7, %rd8; + add.f64 %fd44, %fd45, %fd45; + st.global.f64 [%rd9], %fd44; + +BB38_5: ret; } @@ -4709,16 +4978,16 @@ BB36_11: .param .u32 matrix_tan_param_2 ) { - .local .align 4 .b8 __local_depot37[4]; + .local .align 4 .b8 __local_depot39[4]; .reg .b64 %SP; .reg .b64 %SPL; - .reg .pred %p<6>; + .reg .pred %p<7>; .reg .b32 %r<16>; .reg .f64 %fd<66>; .reg .b64 %rd<14>; - mov.u64 %rd13, __local_depot37; + mov.u64 %rd13, __local_depot39; cvta.local.u64 %SP, %rd13; ld.param.u64 %rd3, [matrix_tan_param_0]; ld.param.u64 %rd4, [matrix_tan_param_1]; @@ -4730,7 +4999,7 @@ BB36_11: mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.u32 %p1, %r1, %r5; - @%p1 bra BB37_9; + @%p1 bra BB39_8; cvta.to.global.u64 %rd6, %rd3; cvt.s64.s32 %rd2, %r1; @@ -4739,23 +5008,24 @@ BB36_11: ld.global.f64 %fd63, [%rd8]; { .reg .b32 %temp; - mov.b64 {%temp, %r9}, %fd63; + mov.b64 {%r9, %temp}, %fd63; } - and.b32 %r10, 
%r9, 2147483647; - setp.ne.s32 %p2, %r10, 2146435072; - @%p2 bra BB37_4; - { .reg .b32 %temp; - mov.b64 {%r11, %temp}, %fd63; + mov.b64 {%temp, %r10}, %fd63; } - setp.ne.s32 %p3, %r11, 0; - @%p3 bra BB37_4; + and.b32 %r11, %r10, 2147483647; + setp.eq.s32 %p2, %r11, 2146435072; + setp.eq.s32 %p3, %r9, 0; + and.pred %p4, %p2, %p3; + @!%p4 bra BB39_3; + bra.uni BB39_2; +BB39_2: mov.f64 %fd11, 0d0000000000000000; mul.rn.f64 %fd63, %fd63, %fd11; -BB37_4: +BB39_3: mul.f64 %fd12, %fd63, 0d3FE45F306DC9C883; cvt.rni.s32.f64 %r15, %fd12; st.local.u32 [%rd1], %r15; @@ -4772,8 +5042,8 @@ BB37_4: mov.b64 {%temp, %r12}, %fd63; } and.b32 %r13, %r12, 2145386496; - setp.lt.u32 %p4, %r13, 1105199104; - @%p4 bra BB37_6; + setp.lt.u32 %p5, %r13, 1105199104; + @%p5 bra BB39_5; // Callseq Start 5 { @@ -4796,7 +5066,7 @@ BB37_4: }// Callseq End 5 ld.local.u32 %r15, [%rd1]; -BB37_6: +BB39_5: mul.f64 %fd20, %fd64, %fd64; mov.f64 %fd21, 0dBEF9757C5B27EBB1; mov.f64 %fd22, 0d3EE48DAC2799BCB9; @@ -4830,11 +5100,11 @@ BB37_6: mul.f64 %fd7, %fd20, %fd49; fma.rn.f64 %fd65, %fd7, %fd64, %fd64; and.b32 %r14, %r15, 1; - setp.eq.b32 %p5, %r14, 1; - @!%p5 bra BB37_8; - bra.uni BB37_7; + setp.eq.b32 %p6, %r14, 1; + @!%p6 bra BB39_7; + bra.uni BB39_6; -BB37_7: +BB39_6: sub.f64 %fd52, %fd65, %fd64; neg.f64 %fd53, %fd52; fma.rn.f64 %fd54, %fd7, %fd64, %fd53; @@ -4851,13 +5121,159 @@ BB37_7: fma.rn.f64 %fd62, %fd60, %fd54, %fd61; fma.rn.f64 %fd65, %fd62, %fd60, %fd60; -BB37_8: +BB39_7: cvta.to.global.u64 %rd10, %rd4; shl.b64 %rd11, %rd2, 3; add.s64 %rd12, %rd10, %rd11; st.global.f64 [%rd12], %fd65; -BB37_9: +BB39_8: + ret; +} + + // .globl matrix_tanh +.visible .entry matrix_tanh( + .param .u64 matrix_tanh_param_0, + .param .u64 matrix_tanh_param_1, + .param .u32 matrix_tanh_param_2 +) +{ + .reg .pred %p<4>; + .reg .b32 %r<17>; + .reg .f64 %fd<74>; + .reg .b64 %rd<10>; + + + ld.param.u64 %rd2, [matrix_tanh_param_0]; + ld.param.u64 %rd3, [matrix_tanh_param_1]; + ld.param.u32 %r4, [matrix_tanh_param_2]; + 
mov.u32 %r5, %ctaid.x; + mov.u32 %r6, %ntid.x; + mov.u32 %r7, %tid.x; + mad.lo.s32 %r1, %r6, %r5, %r7; + setp.ge.u32 %p1, %r1, %r4; + @%p1 bra BB40_5; + + cvta.to.global.u64 %rd4, %rd2; + cvt.s64.s32 %rd1, %r1; + mul.wide.s32 %rd5, %r1, 8; + add.s64 %rd6, %rd4, %rd5; + ld.global.f64 %fd1, [%rd6]; + { + .reg .b32 %temp; + mov.b64 {%temp, %r2}, %fd1; + } + and.b32 %r3, %r2, 2147483647; + { + .reg .b32 %temp; + mov.b64 {%r8, %temp}, %fd1; + } + mov.b64 %fd2, {%r8, %r3}; + setp.ltu.f64 %p2, %fd2, 0d3FE1C7A398201CD6; + @%p2 bra BB40_3; + bra.uni BB40_2; + +BB40_3: + mul.f64 %fd51, %fd1, %fd1; + mov.f64 %fd52, 0dBF2B9093D89F0E23; + mov.f64 %fd53, 0d3F0ABFFC9B5786C4; + fma.rn.f64 %fd54, %fd53, %fd51, %fd52; + mov.f64 %fd55, 0d3F42FA2744C30B61; + fma.rn.f64 %fd56, %fd54, %fd51, %fd55; + mov.f64 %fd57, 0dBF57CF3B9C1E491D; + fma.rn.f64 %fd58, %fd56, %fd51, %fd57; + mov.f64 %fd59, 0d3F6D6C61D450119A; + fma.rn.f64 %fd60, %fd58, %fd51, %fd59; + mov.f64 %fd61, 0dBF8226DDD44294F5; + fma.rn.f64 %fd62, %fd60, %fd51, %fd61; + mov.f64 %fd63, 0d3F9664F45C2B04A6; + fma.rn.f64 %fd64, %fd62, %fd51, %fd63; + mov.f64 %fd65, 0dBFABA1BA1AD70754; + fma.rn.f64 %fd66, %fd64, %fd51, %fd65; + mov.f64 %fd67, 0d3FC111111110295E; + fma.rn.f64 %fd68, %fd66, %fd51, %fd67; + mov.f64 %fd69, 0dBFD555555555549F; + fma.rn.f64 %fd70, %fd68, %fd51, %fd69; + mul.f64 %fd71, %fd51, %fd70; + fma.rn.f64 %fd73, %fd71, %fd1, %fd1; + bra.uni BB40_4; + +BB40_2: + add.f64 %fd8, %fd2, %fd2; + mov.f64 %fd9, 0d4338000000000000; + mov.f64 %fd10, 0d3FF71547652B82FE; + fma.rn.f64 %fd11, %fd8, %fd10, %fd9; + { + .reg .b32 %temp; + mov.b64 {%r9, %temp}, %fd11; + } + mov.f64 %fd12, 0dC338000000000000; + add.rn.f64 %fd13, %fd11, %fd12; + mov.f64 %fd14, 0dBFE62E42FEFA39EF; + fma.rn.f64 %fd15, %fd13, %fd14, %fd8; + mov.f64 %fd16, 0dBC7ABC9E3B39803F; + fma.rn.f64 %fd17, %fd13, %fd16, %fd15; + mov.f64 %fd18, 0d3E5AF86D8EBD13CD; + mov.f64 %fd19, 0d3E21F4076ACD15B6; + fma.rn.f64 %fd20, %fd19, %fd17, %fd18; + mov.f64 %fd21, 
0d3E927E5092BA033D; + fma.rn.f64 %fd22, %fd20, %fd17, %fd21; + mov.f64 %fd23, 0d3EC71DDE6C5F9DA1; + fma.rn.f64 %fd24, %fd22, %fd17, %fd23; + mov.f64 %fd25, 0d3EFA01A018D034E6; + fma.rn.f64 %fd26, %fd24, %fd17, %fd25; + mov.f64 %fd27, 0d3F2A01A01B3B6940; + fma.rn.f64 %fd28, %fd26, %fd17, %fd27; + mov.f64 %fd29, 0d3F56C16C16C1B5DD; + fma.rn.f64 %fd30, %fd28, %fd17, %fd29; + mov.f64 %fd31, 0d3F8111111110F74D; + fma.rn.f64 %fd32, %fd30, %fd17, %fd31; + mov.f64 %fd33, 0d3FA555555555554D; + fma.rn.f64 %fd34, %fd32, %fd17, %fd33; + mov.f64 %fd35, 0d3FC5555555555557; + fma.rn.f64 %fd36, %fd34, %fd17, %fd35; + mov.f64 %fd37, 0d3FE0000000000000; + fma.rn.f64 %fd38, %fd36, %fd17, %fd37; + mul.f64 %fd39, %fd17, %fd38; + fma.rn.f64 %fd40, %fd39, %fd17, %fd17; + shl.b32 %r10, %r9, 20; + add.s32 %r11, %r10, 1072693248; + mov.u32 %r12, 0; + mov.b64 %fd41, {%r12, %r11}; + fma.rn.f64 %fd42, %fd40, %fd41, %fd41; + add.f64 %fd7, %fd42, 0d3FF0000000000000; + // inline asm + rcp.approx.ftz.f64 %fd6,%fd7; + // inline asm + neg.f64 %fd43, %fd7; + mov.f64 %fd44, 0d3FF0000000000000; + fma.rn.f64 %fd45, %fd43, %fd6, %fd44; + fma.rn.f64 %fd46, %fd45, %fd45, %fd45; + fma.rn.f64 %fd47, %fd46, %fd6, %fd6; + neg.f64 %fd48, %fd47; + mov.f64 %fd49, 0d4000000000000000; + fma.rn.f64 %fd50, %fd49, %fd48, %fd44; + setp.gt.u32 %p3, %r3, 1077936127; + selp.f64 %fd73, 0d3FF0000000000000, %fd50, %p3; + +BB40_4: + cvta.to.global.u64 %rd7, %rd3; + and.b32 %r13, %r2, -2147483648; + { + .reg .b32 %temp; + mov.b64 {%temp, %r14}, %fd73; + } + or.b32 %r15, %r14, %r13; + { + .reg .b32 %temp; + mov.b64 {%r16, %temp}, %fd73; + } + mov.b64 %fd72, {%r16, %r15}; + shl.b64 %rd8, %rd1, 3; + add.s64 %rd9, %rd7, %rd8; + st.global.f64 [%rd9], %fd72; + +BB40_5: ret; } @@ -4883,7 +5299,7 @@ BB37_9: mov.u32 %r6, %tid.x; mad.lo.s32 %r1, %r5, %r4, %r6; setp.ge.u32 %p1, %r1, %r3; - @%p1 bra BB38_5; + @%p1 bra BB41_5; cvta.to.global.u64 %rd4, %rd2; cvt.s64.s32 %rd1, %r1; @@ -4897,10 +5313,10 @@ BB37_9: mov.b32 %f1, %r2; abs.f32 
%f2, %f1; setp.lt.f32 %p2, %f2, 0f3FE26666; - @%p2 bra BB38_3; - bra.uni BB38_2; + @%p2 bra BB41_3; + bra.uni BB41_2; -BB38_3: +BB41_3: mul.f64 %fd55, %fd1, %fd1; mov.f64 %fd56, 0dBFB3823B180754AF; mov.f64 %fd57, 0d3FB0066BDC1895E9; @@ -4929,9 +5345,9 @@ BB38_3: fma.rn.f64 %fd80, %fd78, %fd55, %fd79; mul.f64 %fd81, %fd55, %fd80; fma.rn.f64 %fd82, %fd81, %fd1, %fd1; - bra.uni BB38_4; + bra.uni BB41_4; -BB38_2: +BB41_2: abs.f64 %fd7, %fd1; mov.f64 %fd8, 0d3FE0000000000000; mov.f64 %fd9, 0dBFE0000000000000; @@ -4965,8 +5381,8 @@ BB38_2: } setp.lt.s32 %p3, %r10, 0; selp.f64 %fd21, 0dFFF8000000000000, %fd20, %p3; - setp.equ.f64 %p4, %fd6, 0d0000000000000000; - selp.f64 %fd22, %fd6, %fd21, %p4; + setp.ne.f64 %p4, %fd6, 0d0000000000000000; + selp.f64 %fd22, %fd21, %fd6, %p4; mov.f64 %fd23, 0dBFB3823B180754AF; mov.f64 %fd24, 0d3FB0066BDC1895E9; fma.rn.f64 %fd25, %fd24, %fd6, %fd23; @@ -5011,13 +5427,13 @@ BB38_2: or.b32 %r14, %r12, %r13; mov.b64 %fd82, {%r11, %r14}; -BB38_4: +BB41_4: cvta.to.global.u64 %rd7, %rd3; shl.b64 %rd8, %rd1, 3; add.s64 %rd9, %rd7, %rd8; st.global.f64 [%rd9], %fd82; -BB38_5: +BB41_5: ret; } @@ -5042,7 +5458,7 @@ BB38_5: mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r6, %r5, %r7; setp.ge.u32 %p1, %r1, %r4; - @%p1 bra BB39_14; + @%p1 bra BB42_14; cvta.to.global.u64 %rd4, %rd2; cvt.s64.s32 %rd1, %r1; @@ -5059,10 +5475,10 @@ BB38_5: mov.b64 {%temp, %r8}, %fd1; } setp.lt.s32 %p2, %r8, 1071801958; - @%p2 bra BB39_9; - bra.uni BB39_2; + @%p2 bra BB42_9; + bra.uni BB42_2; -BB39_9: +BB42_9: mul.f64 %fd62, %fd1, %fd1; mov.f64 %fd63, 0dBFB3823B180754AF; mov.f64 %fd64, 0d3FB0066BDC1895E9; @@ -5092,14 +5508,14 @@ BB39_9: mul.f64 %fd88, %fd62, %fd87; fma.rn.f64 %fd10, %fd88, %fd1, %fd1; setp.lt.s32 %p6, %r2, 0; - @%p6 bra BB39_11; + @%p6 bra BB42_11; mov.f64 %fd89, 0dBC91A62633145C07; add.rn.f64 %fd90, %fd10, %fd89; neg.f64 %fd93, %fd90; - bra.uni BB39_12; + bra.uni BB42_12; -BB39_2: +BB42_2: mov.f64 %fd19, 0d3FF0000000000000; sub.f64 %fd2, %fd19, %fd1; { @@ -5135,7 
+5551,7 @@ BB39_2: fma.rn.f64 %fd28, %fd24, %fd25, %fd18; fma.rn.f64 %fd3, %fd28, %fd27, %fd24; setp.lt.s32 %p3, %r3, 1; - @%p3 bra BB39_4; + @%p3 bra BB42_4; { .reg .b32 %temp; @@ -5174,31 +5590,31 @@ BB39_2: fma.rn.f64 %fd54, %fd52, %fd2, %fd53; mul.f64 %fd55, %fd2, %fd54; fma.rn.f64 %fd94, %fd55, %fd29, %fd29; - bra.uni BB39_5; + bra.uni BB42_5; -BB39_11: +BB42_11: mov.f64 %fd91, 0d3C91A62633145C07; add.rn.f64 %fd93, %fd10, %fd91; -BB39_12: +BB42_12: mov.f64 %fd92, 0d3FF921FB54442D18; add.rn.f64 %fd94, %fd92, %fd93; - bra.uni BB39_13; + bra.uni BB42_13; -BB39_4: +BB42_4: mov.f64 %fd56, 0d0000000000000000; mul.rn.f64 %fd94, %fd1, %fd56; -BB39_5: +BB42_5: setp.gt.s32 %p4, %r3, -1; - @%p4 bra BB39_7; + @%p4 bra BB42_7; mov.f64 %fd57, 0d7FF0000000000000; mul.rn.f64 %fd94, %fd94, %fd57; -BB39_7: +BB42_7: setp.gt.s32 %p5, %r2, -1; - @%p5 bra BB39_13; + @%p5 bra BB42_13; mov.f64 %fd58, 0dBCA1A62633145C07; add.rn.f64 %fd59, %fd94, %fd58; @@ -5206,13 +5622,13 @@ BB39_7: mov.f64 %fd61, 0d400921FB54442D18; add.rn.f64 %fd94, %fd61, %fd60; -BB39_13: +BB42_13: cvta.to.global.u64 %rd7, %rd3; shl.b64 %rd8, %rd1, 3; add.s64 %rd9, %rd7, %rd8; st.global.f64 [%rd9], %fd94; -BB39_14: +BB42_14: ret; } @@ -5237,7 +5653,7 @@ BB39_14: mov.u32 %r5, %tid.x; mad.lo.s32 %r1, %r4, %r3, %r5; setp.ge.u32 %p1, %r1, %r2; - @%p1 bra BB40_4; + @%p1 bra BB43_4; cvta.to.global.u64 %rd4, %rd2; cvt.s64.s32 %rd1, %r1; @@ -5247,7 +5663,7 @@ BB39_14: abs.f64 %fd2, %fd1; setp.leu.f64 %p2, %fd2, 0d3FF0000000000000; mov.f64 %fd56, %fd2; - @%p2 bra BB40_3; + @%p2 bra BB43_3; // inline asm rcp.approx.ftz.f64 %fd5,%fd2; @@ -5261,7 +5677,7 @@ BB39_14: selp.f64 %fd3, 0d0000000000000000, %fd11, %p3; mov.f64 %fd56, %fd3; -BB40_3: +BB43_3: mov.f64 %fd4, %fd56; cvta.to.global.u64 %rd7, %rd3; mul.f64 %fd12, %fd4, %fd4; @@ -5327,7 +5743,7 @@ BB40_3: add.s64 %rd9, %rd7, %rd8; st.global.f64 [%rd9], %fd55; -BB40_4: +BB43_4: ret; } @@ -5352,7 +5768,7 @@ BB40_4: mov.u32 %r5, %tid.x; mad.lo.s32 %r1, %r4, %r3, %r5; 
setp.ge.u32 %p1, %r1, %r2; - @%p1 bra BB41_4; + @%p1 bra BB44_4; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 8; @@ -5361,15 +5777,15 @@ BB40_4: setp.eq.f64 %p2, %fd1, 0d0000000000000000; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd1, %rd7, %rd5; - @%p2 bra BB41_3; - bra.uni BB41_2; + @%p2 bra BB44_3; + bra.uni BB44_2; -BB41_3: +BB44_3: mov.u64 %rd8, 0; st.global.u64 [%rd1], %rd8; - bra.uni BB41_4; + bra.uni BB44_4; -BB41_2: +BB44_2: { .reg .b32 %temp; mov.b64 {%temp, %r6}, %fd1; @@ -5389,7 +5805,7 @@ BB41_2: mov.b64 %fd3, {%r11, %r10}; st.global.f64 [%rd1], %fd3; -BB41_4: +BB44_4: ret; } @@ -5398,7 +5814,7 @@ BB41_4: .param .b64 __internal_trig_reduction_slowpathd_param_1 ) { - .local .align 8 .b8 __local_depot42[40]; + .local .align 8 .b8 __local_depot45[40]; .reg .b64 %SP; .reg .b64 %SPL; .reg .pred %p<9>; @@ -5407,7 +5823,7 @@ BB41_4: .reg .b64 %rd<101>; - mov.u64 %rd100, __local_depot42; + mov.u64 %rd100, __local_depot45; cvta.local.u64 %SP, %rd100; ld.param.f64 %fd4, [__internal_trig_reduction_slowpathd_param_0]; ld.param.u64 %rd37, [__internal_trig_reduction_slowpathd_param_1]; @@ -5421,7 +5837,7 @@ BB41_4: shr.u32 %r3, %r1, 20; bfe.u32 %r4, %r1, 20, 11; setp.eq.s32 %p1, %r4, 2047; - @%p1 bra BB42_13; + @%p1 bra BB45_13; add.s32 %r16, %r4, -1024; shr.u32 %r17, %r16, 6; @@ -5434,7 +5850,7 @@ BB41_4: setp.gt.s32 %p2, %r5, %r6; mov.u64 %rd94, 0; mov.u64 %rd93, %rd1; - @%p2 bra BB42_4; + @%p2 bra BB45_4; mov.b64 %rd41, %fd4; shl.b64 %rd42, %rd41, 11; @@ -5453,7 +5869,7 @@ BB41_4: mov.u64 %rd91, %rd1; mov.u32 %r39, %r7; -BB42_3: +BB45_3: .pragma "nounroll"; mov.u32 %r8, %r39; mov.u64 %rd7, %rd91; @@ -5488,15 +5904,15 @@ BB42_3: setp.lt.s32 %p3, %r9, %r6; mov.u64 %rd91, %rd13; mov.u32 %r39, %r9; - @%p3 bra BB42_3; + @%p3 bra BB45_3; -BB42_4: +BB45_4: st.local.u64 [%rd93], %rd94; ld.local.u64 %rd95, [%rd1+16]; ld.local.u64 %rd96, [%rd1+24]; and.b32 %r10, %r3, 63; setp.eq.s32 %p4, %r10, 0; - @%p4 bra BB42_6; + @%p4 bra BB45_6; mov.u32 %r27, 64; sub.s32 %r28, 
%r27, %r10; @@ -5508,7 +5924,7 @@ BB42_4: shr.u64 %rd56, %rd55, %r28; or.b64 %rd95, %rd56, %rd54; -BB42_6: +BB45_6: cvta.to.local.u64 %rd57, %rd37; shr.u64 %rd58, %rd96, 62; cvt.u32.u64 %r29, %rd58; @@ -5521,11 +5937,11 @@ BB42_6: and.b32 %r31, %r30, 1; add.s32 %r32, %r31, %r29; neg.s32 %r33, %r32; - setp.eq.s32 %p5, %r40, 0; - selp.b32 %r34, %r32, %r33, %p5; + setp.ne.s32 %p5, %r40, 0; + selp.b32 %r34, %r33, %r32, %p5; st.local.u32 [%rd57], %r34; setp.eq.s32 %p6, %r31, 0; - @%p6 bra BB42_8; + @%p6 bra BB45_8; mov.u64 %rd65, 0; // inline asm @@ -5545,10 +5961,10 @@ BB42_6: // inline asm xor.b32 %r40, %r40, -2147483648; -BB42_8: +BB45_8: clz.b64 %r41, %rd98; setp.eq.s32 %p7, %r41, 0; - @%p7 bra BB42_10; + @%p7 bra BB45_10; shl.b64 %rd68, %rd98, %r41; mov.u32 %r35, 64; @@ -5556,7 +5972,7 @@ BB42_8: shr.u64 %rd69, %rd97, %r36; or.b64 %rd98, %rd69, %rd68; -BB42_10: +BB45_10: mov.u64 %rd73, -3958705157555305931; // inline asm { @@ -5577,7 +5993,7 @@ BB42_10: } // inline asm setp.lt.s64 %p8, %rd99, 1; - @%p8 bra BB42_12; + @%p8 bra BB45_12; // inline asm { @@ -5596,7 +6012,7 @@ BB42_10: // inline asm add.s32 %r41, %r41, 1; -BB42_12: +BB45_12: cvt.u64.u32 %rd80, %r40; shl.b64 %rd81, %rd80, 32; mov.u32 %r37, 1022; @@ -5611,7 +6027,7 @@ BB42_12: or.b64 %rd89, %rd88, %rd81; mov.b64 %fd4, %rd89; -BB42_13: +BB45_13: st.param.f64 [func_retval0+0], %fd4; ret; } @@ -5621,7 +6037,7 @@ BB42_13: .param .b64 __internal_accurate_pow_param_1 ) { - .reg .pred %p<9>; + .reg .pred %p<10>; .reg .f32 %f<3>; .reg .b32 %r<52>; .reg .f64 %fd<134>; @@ -5639,7 +6055,7 @@ BB42_13: } shr.u32 %r50, %r49, 20; setp.ne.s32 %p1, %r50, 0; - @%p1 bra BB43_2; + @%p1 bra BB46_2; mul.f64 %fd14, %fd12, 0d4350000000000000; { @@ -5653,13 +6069,13 @@ BB42_13: shr.u32 %r16, %r49, 20; add.s32 %r50, %r16, -54; -BB43_2: +BB46_2: add.s32 %r51, %r50, -1023; and.b32 %r17, %r49, -2146435073; or.b32 %r18, %r17, 1072693248; mov.b64 %fd132, {%r48, %r18}; setp.lt.u32 %p2, %r18, 1073127583; - @%p2 bra BB43_4; + @%p2 bra 
BB46_4; { .reg .b32 %temp; @@ -5673,7 +6089,7 @@ BB43_2: mov.b64 %fd132, {%r19, %r21}; add.s32 %r51, %r50, -1022; -BB43_4: +BB46_4: add.f64 %fd16, %fd132, 0d3FF0000000000000; // inline asm rcp.approx.ftz.f64 %fd15,%fd16; @@ -5838,13 +6254,13 @@ BB43_4: mov.b32 %f2, %r35; abs.f32 %f1, %f2; setp.lt.f32 %p4, %f1, 0f4086232B; - @%p4 bra BB43_7; + @%p4 bra BB46_7; setp.lt.f64 %p5, %fd4, 0d0000000000000000; add.f64 %fd129, %fd4, 0d7FF0000000000000; selp.f64 %fd133, 0d0000000000000000, %fd129, %p5; setp.geu.f32 %p6, %f1, 0f40874800; - @%p6 bra BB43_7; + @%p6 bra BB46_7; shr.u32 %r36, %r13, 31; add.s32 %r37, %r13, %r36; @@ -5859,26 +6275,26 @@ BB43_4: mov.b64 %fd131, {%r44, %r43}; mul.f64 %fd133, %fd130, %fd131; -BB43_7: +BB46_7: { .reg .b32 %temp; mov.b64 {%temp, %r45}, %fd133; } and.b32 %r46, %r45, 2147483647; setp.ne.s32 %p7, %r46, 2146435072; - @%p7 bra BB43_9; - { .reg .b32 %temp; mov.b64 {%r47, %temp}, %fd133; } - setp.eq.s32 %p8, %r47, 0; - @%p8 bra BB43_10; + setp.ne.s32 %p8, %r47, 0; + or.pred %p9, %p8, %p7; + @!%p9 bra BB46_9; + bra.uni BB46_8; -BB43_9: +BB46_8: fma.rn.f64 %fd133, %fd133, %fd5, %fd133; -BB43_10: +BB46_9: st.param.f64 [func_retval0+0], %fd133; ret; } http://git-wip-us.apache.org/repos/asf/systemml/blob/50a895f8/src/main/java/org/apache/sysml/hops/Hop.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/Hop.java b/src/main/java/org/apache/sysml/hops/Hop.java index 5ee0b56..f8ee068 100644 --- a/src/main/java/org/apache/sysml/hops/Hop.java +++ b/src/main/java/org/apache/sysml/hops/Hop.java @@ -1051,7 +1051,7 @@ public abstract class Hop implements ParseInfo } public enum OpOp1 { - NOT, ABS, SIN, COS, TAN, ASIN, ACOS, ATAN, SIGN, SQRT, LOG, EXP, + NOT, ABS, SIN, COS, TAN, ASIN, ACOS, ATAN, SINH, COSH, TANH, SIGN, SQRT, LOG, EXP, CAST_AS_SCALAR, CAST_AS_MATRIX, CAST_AS_FRAME, CAST_AS_DOUBLE, CAST_AS_INT, CAST_AS_BOOLEAN, PRINT, EIGEN, NROW, NCOL, LENGTH, ROUND, IQM, STOP, CEIL, 
FLOOR, MEDIAN, INVERSE, CHOLESKY, SVD, @@ -1276,6 +1276,9 @@ public abstract class Hop implements ParseInfo HopsOpOp1LopsU.put(OpOp1.ASIN, org.apache.sysml.lops.Unary.OperationTypes.ASIN); HopsOpOp1LopsU.put(OpOp1.ACOS, org.apache.sysml.lops.Unary.OperationTypes.ACOS); HopsOpOp1LopsU.put(OpOp1.ATAN, org.apache.sysml.lops.Unary.OperationTypes.ATAN); + HopsOpOp1LopsU.put(OpOp1.SINH, org.apache.sysml.lops.Unary.OperationTypes.SINH); + HopsOpOp1LopsU.put(OpOp1.COSH, org.apache.sysml.lops.Unary.OperationTypes.COSH); + HopsOpOp1LopsU.put(OpOp1.TANH, org.apache.sysml.lops.Unary.OperationTypes.TANH); HopsOpOp1LopsU.put(OpOp1.SIGN, org.apache.sysml.lops.Unary.OperationTypes.SIGN); HopsOpOp1LopsU.put(OpOp1.SQRT, org.apache.sysml.lops.Unary.OperationTypes.SQRT); HopsOpOp1LopsU.put(OpOp1.EXP, org.apache.sysml.lops.Unary.OperationTypes.EXP); @@ -1310,6 +1313,9 @@ public abstract class Hop implements ParseInfo HopsOpOp1LopsUS.put(OpOp1.ASIN, org.apache.sysml.lops.UnaryCP.OperationTypes.ASIN); HopsOpOp1LopsUS.put(OpOp1.ACOS, org.apache.sysml.lops.UnaryCP.OperationTypes.ACOS); HopsOpOp1LopsUS.put(OpOp1.ATAN, org.apache.sysml.lops.UnaryCP.OperationTypes.ATAN); + HopsOpOp1LopsUS.put(OpOp1.SINH, org.apache.sysml.lops.UnaryCP.OperationTypes.SINH); + HopsOpOp1LopsUS.put(OpOp1.COSH, org.apache.sysml.lops.UnaryCP.OperationTypes.COSH); + HopsOpOp1LopsUS.put(OpOp1.TANH, org.apache.sysml.lops.UnaryCP.OperationTypes.TANH); HopsOpOp1LopsUS.put(OpOp1.SQRT, org.apache.sysml.lops.UnaryCP.OperationTypes.SQRT); HopsOpOp1LopsUS.put(OpOp1.EXP, org.apache.sysml.lops.UnaryCP.OperationTypes.EXP); HopsOpOp1LopsUS.put(OpOp1.LOG, org.apache.sysml.lops.UnaryCP.OperationTypes.LOG); @@ -1365,6 +1371,9 @@ public abstract class Hop implements ParseInfo HopsOpOp12String.put(OpOp1.ASIN, "asin"); HopsOpOp12String.put(OpOp1.ACOS, "acos"); HopsOpOp12String.put(OpOp1.ATAN, "atan"); + HopsOpOp12String.put(OpOp1.SINH, "sinh"); + HopsOpOp12String.put(OpOp1.COSH, "cosh"); + HopsOpOp12String.put(OpOp1.TANH, "tanh"); 
HopsOpOp12String.put(OpOp1.STOP, "stop"); HopsOpOp12String.put(OpOp1.INVERSE, "inv"); HopsOpOp12String.put(OpOp1.SPROP, "sprop"); http://git-wip-us.apache.org/repos/asf/systemml/blob/50a895f8/src/main/java/org/apache/sysml/hops/UnaryOp.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/UnaryOp.java b/src/main/java/org/apache/sysml/hops/UnaryOp.java index ca1fc84..46cabcc 100644 --- a/src/main/java/org/apache/sysml/hops/UnaryOp.java +++ b/src/main/java/org/apache/sysml/hops/UnaryOp.java @@ -109,7 +109,9 @@ public class UnaryOp extends Hop implements MultiThreadedHop switch(_op) { case SELP:case EXP:case SQRT:case LOG:case ABS: case ROUND:case FLOOR:case CEIL: - case SIN:case COS: case TAN:case ASIN:case ACOS:case ATAN: + case SIN:case COS: case TAN: + case ASIN:case ACOS:case ATAN: + case SINH:case COSH: case TANH: case SIGN: return true; default: @@ -601,6 +603,7 @@ public class UnaryOp extends Hop implements MultiThreadedHop if( mc.dimsKnown() ) { if( _op==OpOp1.ABS || _op==OpOp1.COS || _op==OpOp1.SIN || _op==OpOp1.TAN || _op==OpOp1.ACOS || _op==OpOp1.ASIN || _op==OpOp1.ATAN + || _op==OpOp1.COSH || _op==OpOp1.SINH || _op==OpOp1.TANH || _op==OpOp1.SQRT || _op==OpOp1.ROUND || _op==OpOp1.SPROP || _op==OpOp1.SELP ) //sparsity preserving { @@ -723,8 +726,10 @@ public class UnaryOp extends Hop implements MultiThreadedHop Hop input = getInput().get(0); setDim1( input.getDim1() ); setDim2( input.getDim2() ); - if( _op==OpOp1.ABS || _op==OpOp1.COS || _op==OpOp1.SIN || _op==OpOp1.TAN - || _op==OpOp1.ACOS || _op==OpOp1.ASIN || _op==OpOp1.ATAN + // cosh(0)=cos(0)=1, acos(0)=1.5707963267948966 + if( _op==OpOp1.ABS || _op==OpOp1.SIN || _op==OpOp1.TAN + || _op==OpOp1.SINH || _op==OpOp1.TANH + || _op==OpOp1.ASIN || _op==OpOp1.ATAN || _op==OpOp1.SQRT || _op==OpOp1.ROUND || _op==OpOp1.SPROP ) //sparsity preserving { setNnz( input.getNnz() );