[PATCH] D90173: [PowerPC] Exploit splat instruction xxsplti32dx in Power10

Albion Fung via Phabricator via cfe-commits Mon, 26 Oct 2020 10:50:45 -0700

Conanap created this revision.
Conanap added reviewers: PowerPC, nemanjai, saghir.
Conanap added projects: LLVM, clang, PowerPC.
Herald added a subscriber: kbarton.
Conanap requested review of this revision.


This patch exploits the Power10 instruction xxsplti32dx.

It can be used to materialize any 64-bit scalar/vector splat by using two
instances of the instruction, one for the upper 32 bits and the other for the
lower 32 bits. It should not be used for the cases that can be materialized
with the instruction xxspltidp.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D90173

Files:
  llvm/lib/Target/PowerPC/PPCISelLowering.cpp
  llvm/lib/Target/PowerPC/PPCInstrPrefix.td
  llvm/test/CodeGen/PowerPC/p10-splatImm-CPload-pcrel.ll
  llvm/test/CodeGen/PowerPC/p10-splatImm32.ll

Index: llvm/test/CodeGen/PowerPC/p10-splatImm32.ll
===================================================================
--- llvm/test/CodeGen/PowerPC/p10-splatImm32.ll
+++ llvm/test/CodeGen/PowerPC/p10-splatImm32.ll
@@ -118,3 +118,25 @@
   %vecins1 = shufflevector <4 x i32> <i32 -1414812757, i32 undef, i32 -1414812757, i32 undef>, <4 x i32> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
   ret <4 x i32> %vecins1
 }
+
+define dso_local <2 x double> @test_xxsplti32dx_8() {
+; CHECK-LABEL: test_xxsplti32dx_8
+; CHECK-LE: xxlxor vs34, vs34, vs34
+; CHECK-LE: xxsplti32dx vs34, 1, 1082660167
+; CHECK-BE: xxlxor vs34, vs34, vs34
+; CHECK-BE: xxsplti32dx vs34, 0, 1082660167
+; CHECK: blr
+entry:
+  ret <2 x double> <double 0x40881547AE147AE1, double 0x40881547AE147AE1>
+}
+
+define dso_local <8 x i16> @test_xxsplti32dx_9() {
+; CHECK-LABEL: test_xxsplti32dx_9
+; CHECK-LE: xxlxor vs34, vs34, vs34
+; CHECK-LE: xxsplti32dx vs34, 1, 23855277
+; CHECK-BE: xxlxor vs34, vs34, vs34
+; CHECK-BE: xxsplti32dx vs34, 0, 19070977
+; CHECK: blr
+entry:
+  ret <8 x i16> <i16 291, i16 undef, i16 undef, i16 364, i16 undef, i16 1, i16 173, i16 undef>
+}
Index: llvm/test/CodeGen/PowerPC/p10-splatImm-CPload-pcrel.ll
===================================================================
--- llvm/test/CodeGen/PowerPC/p10-splatImm-CPload-pcrel.ll
+++ llvm/test/CodeGen/PowerPC/p10-splatImm-CPload-pcrel.ll
@@ -1,27 +1,29 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
-; RUN:     -ppc-asm-full-reg-names -mcpu=pwr10 < %s | FileCheck %s
+; RUN:     -ppc-asm-full-reg-names -mcpu=pwr10 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-LE
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
 ; RUN:     -ppc-asm-full-reg-names -mcpu=pwr10 < %s | FileCheck %s \
-; RUN:     --check-prefix=CHECK-NOPCREL
+; RUN:     --check-prefixes=CHECK-NOPCREL-BE,CHECK-NOPCREL
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
 ; RUN:     -mattr=-pcrelative-memops -ppc-asm-full-reg-names -mcpu=pwr10 < %s | \
-; RUN:     FileCheck %s --check-prefix=CHECK-NOPCREL
+; RUN:     FileCheck %s --check-prefixes=CHECK-NOPCREL-LE,CHECK-NOPCREL
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
 ; RUN:     -ppc-asm-full-reg-names -target-abi=elfv2 -mcpu=pwr10 < %s | \
-; RUN:     FileCheck %s
+; RUN:     FileCheck %s --check-prefixes=CHECK,CHECK-BE
 
 define dso_local <2 x double> @testDoubleToDoubleFail() local_unnamed_addr {
 ; CHECK-LABEL: testDoubleToDoubleFail:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    plxv vs34, .LCPI0_0@PCREL(0), 1
+; CHECK-NEXT:    xxlxor vs34, vs34, vs34
+; CHECK-LE-NEXT:    xxsplti32dx vs34, 1, 1081435463
+; CHECK-BE-NEXT: xxsplti32dx vs34, 0, 1081435463
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-NOPCREL-LABEL: testDoubleToDoubleFail:
 ; CHECK-NOPCREL:       # %bb.0: # %entry
-; CHECK-NOPCREL-NEXT:    addis r3, r2, .LCPI0_0@toc@ha
-; CHECK-NOPCREL-NEXT:    addi r3, r3, .LCPI0_0@toc@l
-; CHECK-NOPCREL-NEXT:    lxvx vs34, 0, r3
+; CHECK-NOPCREL-NEXT:    xxlxor vs34, vs34, vs34
+; CHECK-NOPCREL-LE-NEXT: xxsplti32dx vs34, 1, 1081435463
+; CHECK-NOPCREL-BE-NEXT: xxsplti32dx vs34, 0, 1081435463
 ; CHECK-NOPCREL-NEXT:    blr
 
 entry:
@@ -31,14 +33,16 @@
 define dso_local <2 x double> @testFloatDenormToDouble() local_unnamed_addr {
 ; CHECK-LABEL: testFloatDenormToDouble:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    plxv vs34, .LCPI1_0@PCREL(0), 1
+; CHECK-NEXT:    xxlxor vs34, vs34, vs34
+; CHECK-LE-NEXT: xxsplti32dx vs34, 1, 940259579
+; CHECK-BE-NEXT: xxsplti32dx vs34, 0, 940259579
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-NOPCREL-LABEL: testFloatDenormToDouble:
 ; CHECK-NOPCREL:       # %bb.0: # %entry
-; CHECK-NOPCREL-NEXT:    addis r3, r2, .LCPI1_0@toc@ha
-; CHECK-NOPCREL-NEXT:    addi r3, r3, .LCPI1_0@toc@l
-; CHECK-NOPCREL-NEXT:    lxvx vs34, 0, r3
+; CHECK-NOPCREL-NEXT:    xxlxor vs34, vs34, vs34
+; CHECK-NOPCREL-LE-NEXT:    xxsplti32dx vs34, 1, 940259579
+; CHECK-NOPCREL-BE-NEXT:    xxsplti32dx vs34, 0, 940259579
 ; CHECK-NOPCREL-NEXT:    blr
 
 entry:
@@ -48,14 +52,16 @@
 define dso_local <2 x double> @testDoubleToDoubleNaNFail() local_unnamed_addr {
 ; CHECK-LABEL: testDoubleToDoubleNaNFail:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    plxv vs34, .LCPI2_0@PCREL(0), 1
+; CHECK-NEXT:    xxlxor vs34, vs34, vs34
+; CHECK-LE-NEXT:    xxsplti32dx vs34, 1, -1
+; CHECK-BE-NEXT:    xxsplti32dx vs34, 0, -1
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-NOPCREL-LABEL: testDoubleToDoubleNaNFail:
 ; CHECK-NOPCREL:       # %bb.0: # %entry
-; CHECK-NOPCREL-NEXT:    addis r3, r2, .LCPI2_0@toc@ha
-; CHECK-NOPCREL-NEXT:    addi r3, r3, .LCPI2_0@toc@l
-; CHECK-NOPCREL-NEXT:    lxvx vs34, 0, r3
+; CHECK-NOPCREL-NEXT:    xxlxor vs34, vs34, vs34
+; CHECK-NOPCREL-LE-NEXT:    xxsplti32dx vs34, 1, -1
+; CHECK-NOPCREL-BE-NEXT:    xxsplti32dx vs34, 0, -1
 ; CHECK-NOPCREL-NEXT:    blr
 
 entry:
Index: llvm/lib/Target/PowerPC/PPCInstrPrefix.td
===================================================================
--- llvm/lib/Target/PowerPC/PPCInstrPrefix.td
+++ llvm/lib/Target/PowerPC/PPCInstrPrefix.td
@@ -2184,6 +2184,9 @@
 
   def : Pat<(v1i128 (rotl v1i128:$vA, v1i128:$vB)),
             (v1i128 (VRLQ v1i128:$vA, v1i128:$vB))>;
+
+  def : Pat <(v2i64 (PPCxxsplti32dx v2i64:$XT, i32:$XI, i32:$IMM32)),
+             (v2i64 (XXSPLTI32DX v2i64:$XT, i32:$XI, i32:$IMM32))>;
 }
 
 let Predicates = [IsISA3_1, HasVSX] in {
Index: llvm/lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -9186,21 +9186,48 @@
   bool BVNIsConstantSplat =
       BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
                            HasAnyUndefs, 0, !Subtarget.isLittleEndian());
+  bool LE = Subtarget.isLittleEndian();
 
   // If it is a splat of a double, check if we can shrink it to a 32 bit
   // non-denormal float which when converted back to double gives us the same
   // double. This is to exploit the XXSPLTIDP instruction.
-  if (BVNIsConstantSplat && Subtarget.hasPrefixInstrs() &&
-      (SplatBitSize == 64) && (Op->getValueType(0) == MVT::v2f64) &&
-      convertToNonDenormSingle(APSplatBits)) {
-    SDValue SplatNode = DAG.getNode(
-        PPCISD::XXSPLTI_SP_TO_DP, dl, MVT::v2f64,
-        DAG.getTargetConstant(APSplatBits.getZExtValue(), dl, MVT::i32));
-    return DAG.getBitcast(Op.getValueType(), SplatNode);
+  // If we lose precision, we use XXSPLTI32DX.
+  if (BVNIsConstantSplat  && (SplatBitSize == 64) && Subtarget.hasPrefixInstrs()) {
+    if(convertToNonDenormSingle(APSplatBits) &&
+       (Op->getValueType(0) == MVT::v2f64)) {
+      SDValue SplatNode = DAG.getNode(
+          PPCISD::XXSPLTI_SP_TO_DP, dl, MVT::v2f64,
+          DAG.getTargetConstant(APSplatBits.getZExtValue(), dl, MVT::i32));
+      return DAG.getBitcast(Op.getValueType(), SplatNode);
+    } else { // we may lose precision, so we have to use XXSPLTI32DX.
+
+      uint32_t top = (uint32_t) ((APSplatBits.getZExtValue() & 0xFFFFFFFF00000000LL) >> 32);
+      uint32_t bot = (uint32_t) (APSplatBits.getZExtValue() & 0xFFFFFFFF00000000LL);
+      SDValue SplatNode;
+
+      if (!top || !bot) {
+        // if either load is 0, then we should generate XXLXOR to set to 0
+        SplatNode = DAG.getTargetConstant(0, dl, MVT::v2i64);
+      }
+
+      if (bot) {
+        SplatNode = DAG.getNode(
+            PPCISD::XXSPLTI32DX, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64),
+            DAG.getTargetConstant(LE ? 0 : 1, dl, MVT::i32),
+            DAG.getTargetConstant(bot, dl, MVT::i32));
+      }
+      if (top) {
+        SplatNode = DAG.getNode(
+            PPCISD::XXSPLTI32DX, bot ? SplatNode : DAG.getUNDEF(MVT::v2i64), 
+            MVT::v2i64, SplatNode, DAG.getTargetConstant(LE ? 1 : 0, SplatNode, MVT::i32),
+            DAG.getTargetConstant(top, SplatNode, MVT::i32));
+      }
+
+      return DAG.getBitcast(Op.getValueType(), SplatNode);
+    }
   }
 
   if (!BVNIsConstantSplat || SplatBitSize > 32) {
-
     bool IsPermutedLoad = false;
     const SDValue *InputLoad =
         getNormalLoadInput(Op.getOperand(0), IsPermutedLoad);

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D90173: [PowerPC] Exploit splat instruction xxsplti32dx in Power10

Reply via email to