[clang] [llvm] [PowerPC] Implement 32-bit expansion for rldimi (PR #86783)

2024-04-09 Thread Qiu Chaofan via cfe-commits

https://github.com/ecnelises closed 
https://github.com/llvm/llvm-project/pull/86783
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [PowerPC] Implement 32-bit expansion for rldimi (PR #86783)

2024-04-01 Thread Qiu Chaofan via cfe-commits

https://github.com/ecnelises updated 
https://github.com/llvm/llvm-project/pull/86783

>From b886dcf2da25417d9f8cd75ff4aa58686e35139d Mon Sep 17 00:00:00 2001
From: Qiu Chaofan 
Date: Wed, 27 Mar 2024 17:11:04 +0800
Subject: [PATCH 1/4] [PowerPC] Implement 32-bit expansion for rldimi

rldimi is a 64-bit instruction; for backward compatibility, it needs to
be expanded into a series of rlwimi instructions in 32-bit environments.
In the future, we may improve the bit permutation selector and remove
such direct codegen.
---
 clang/lib/Sema/SemaChecking.cpp |   1 -
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 109 --
 llvm/test/CodeGen/PowerPC/rldimi.ll | 366 
 3 files changed, 454 insertions(+), 22 deletions(-)

diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 11401b6f56c0ea..d2cbe5417d682d 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -5236,7 +5236,6 @@ static bool isPPC_64Builtin(unsigned BuiltinID) {
   case PPC::BI__builtin_ppc_fetch_and_andlp:
   case PPC::BI__builtin_ppc_fetch_and_orlp:
   case PPC::BI__builtin_ppc_fetch_and_swaplp:
-  case PPC::BI__builtin_ppc_rldimi:
 return true;
   }
   return false;
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp 
b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 7436b202fba0d9..3281a0dfd08729 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -643,6 +643,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine 
&TM,
   // We want to custom lower some of our intrinsics.
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f64, Custom);
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::ppcf128, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f64, Custom);
@@ -10757,6 +10758,88 @@ static bool getVectorCompareInfo(SDValue Intrin, int 
&CompareOpc,
   return true;
 }
 
+static SDValue getRotateInsert32(SelectionDAG &DAG, SDLoc Loc, SDValue Dst,
+ SDValue Src, unsigned SH, unsigned MB,
+ unsigned ME) {
+  assert(SH < 32 && MB < 32 && ME < 32 &&
+ "Invalid argument for rotate insert!");
+  return SDValue(
+  DAG.getMachineNode(PPC::RLWIMI, Loc, MVT::i32,
+ {Dst, Src, DAG.getTargetConstant(SH, Loc, MVT::i32),
+  DAG.getTargetConstant(MB, Loc, MVT::i32),
+  DAG.getTargetConstant(ME, Loc, MVT::i32)}),
+  0);
+}
+
+static SDValue getRotateInsert64(SelectionDAG &DAG, SDLoc Loc, SDValue Dst,
+ SDValue Src, unsigned SH, unsigned MB,
+ unsigned ME, bool IsPPC64) {
+  assert(SH < 64 && MB < 64 && ME < 64 &&
+ "Invalid argument for rotate insert!");
+  if (IsPPC64) {
+// rldimi requires ME=63-SH, otherwise rotation is needed before rldimi.
+if (ME < 63 - SH) {
+  Src = DAG.getNode(ISD::ROTL, Loc, MVT::i64, Src,
+DAG.getConstant(ME + SH + 1, Loc, MVT::i32));
+} else if (ME > 63 - SH) {
+  Src = DAG.getNode(ISD::ROTL, Loc, MVT::i64, Src,
+DAG.getConstant(ME + SH - 63, Loc, MVT::i32));
+}
+return SDValue(DAG.getMachineNode(
+   PPC::RLDIMI, Loc, MVT::i64,
+   {Dst, Src, DAG.getTargetConstant(63 - ME, Loc, 
MVT::i32),
+DAG.getTargetConstant(MB, Loc, MVT::i32)}),
+   0);
+  }
+
+  // To implement rldimi(Dst, Src) on 32-bit target, four parts are needed. SH
+  // is adjusted to simplify cases. Invalid ranges will be skipped.
+  // - SrcHi inserted into DstHi with [0, 32-SH)
+  // - SrcLo inserted into DstHi with [32-SH, 32)
+  // - SrcHi inserted into DstLo with [32, 64-SH)
+  // - SrcLo inserted into DstLo with [64-SH, 64)
+  auto [SrcLo, SrcHi] = DAG.SplitScalar(Src, Loc, MVT::i32, MVT::i32);
+  auto [DstLo, DstHi] = DAG.SplitScalar(Dst, Loc, MVT::i32, MVT::i32);
+  if (SH >= 32) {
+SH -= 32;
+std::swap(SrcLo, SrcHi);
+  }
+  auto GetSubInsert = [&DAG, &Loc, SH](unsigned Left, unsigned Right,
+   SDValue Src, SDValue Dst, unsigned MB,
+   unsigned ME) {
+if (Left > Right)
+  return Dst;
+
+if (MB <= ME) {
+  if (MB <= Right && ME >= Left)
+return getRotateInsert32(DAG, Loc, Dst, Src, SH,
+ std::max(MB, Left) % 32,
+ std::min(ME, Right) % 32);
+} else {
+  if (MB < Left || ME > Right)
+return getRotateInsert32(DAG, Loc, Dst, Src, SH, Left % 32, Right % 
32);
+
+  if (MB <= Right && ME < Left)
+return getRotateInsert32(DAG, Loc

[clang] [llvm] [PowerPC] Implement 32-bit expansion for rldimi (PR #86783)

2024-04-01 Thread Qiu Chaofan via cfe-commits

https://github.com/ecnelises updated 
https://github.com/llvm/llvm-project/pull/86783

>From b886dcf2da25417d9f8cd75ff4aa58686e35139d Mon Sep 17 00:00:00 2001
From: Qiu Chaofan 
Date: Wed, 27 Mar 2024 17:11:04 +0800
Subject: [PATCH 1/3] [PowerPC] Implement 32-bit expansion for rldimi

rldimi is a 64-bit instruction; for backward compatibility, it needs to
be expanded into a series of rlwimi instructions in 32-bit environments.
In the future, we may improve the bit permutation selector and remove
such direct codegen.
---
 clang/lib/Sema/SemaChecking.cpp |   1 -
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 109 --
 llvm/test/CodeGen/PowerPC/rldimi.ll | 366 
 3 files changed, 454 insertions(+), 22 deletions(-)

diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 11401b6f56c0ea..d2cbe5417d682d 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -5236,7 +5236,6 @@ static bool isPPC_64Builtin(unsigned BuiltinID) {
   case PPC::BI__builtin_ppc_fetch_and_andlp:
   case PPC::BI__builtin_ppc_fetch_and_orlp:
   case PPC::BI__builtin_ppc_fetch_and_swaplp:
-  case PPC::BI__builtin_ppc_rldimi:
 return true;
   }
   return false;
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp 
b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 7436b202fba0d9..3281a0dfd08729 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -643,6 +643,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine 
&TM,
   // We want to custom lower some of our intrinsics.
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f64, Custom);
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::ppcf128, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f64, Custom);
@@ -10757,6 +10758,88 @@ static bool getVectorCompareInfo(SDValue Intrin, int 
&CompareOpc,
   return true;
 }
 
+static SDValue getRotateInsert32(SelectionDAG &DAG, SDLoc Loc, SDValue Dst,
+ SDValue Src, unsigned SH, unsigned MB,
+ unsigned ME) {
+  assert(SH < 32 && MB < 32 && ME < 32 &&
+ "Invalid argument for rotate insert!");
+  return SDValue(
+  DAG.getMachineNode(PPC::RLWIMI, Loc, MVT::i32,
+ {Dst, Src, DAG.getTargetConstant(SH, Loc, MVT::i32),
+  DAG.getTargetConstant(MB, Loc, MVT::i32),
+  DAG.getTargetConstant(ME, Loc, MVT::i32)}),
+  0);
+}
+
+static SDValue getRotateInsert64(SelectionDAG &DAG, SDLoc Loc, SDValue Dst,
+ SDValue Src, unsigned SH, unsigned MB,
+ unsigned ME, bool IsPPC64) {
+  assert(SH < 64 && MB < 64 && ME < 64 &&
+ "Invalid argument for rotate insert!");
+  if (IsPPC64) {
+// rldimi requires ME=63-SH, otherwise rotation is needed before rldimi.
+if (ME < 63 - SH) {
+  Src = DAG.getNode(ISD::ROTL, Loc, MVT::i64, Src,
+DAG.getConstant(ME + SH + 1, Loc, MVT::i32));
+} else if (ME > 63 - SH) {
+  Src = DAG.getNode(ISD::ROTL, Loc, MVT::i64, Src,
+DAG.getConstant(ME + SH - 63, Loc, MVT::i32));
+}
+return SDValue(DAG.getMachineNode(
+   PPC::RLDIMI, Loc, MVT::i64,
+   {Dst, Src, DAG.getTargetConstant(63 - ME, Loc, 
MVT::i32),
+DAG.getTargetConstant(MB, Loc, MVT::i32)}),
+   0);
+  }
+
+  // To implement rldimi(Dst, Src) on 32-bit target, four parts are needed. SH
+  // is adjusted to simplify cases. Invalid ranges will be skipped.
+  // - SrcHi inserted into DstHi with [0, 32-SH)
+  // - SrcLo inserted into DstHi with [32-SH, 32)
+  // - SrcHi inserted into DstLo with [32, 64-SH)
+  // - SrcLo inserted into DstLo with [64-SH, 64)
+  auto [SrcLo, SrcHi] = DAG.SplitScalar(Src, Loc, MVT::i32, MVT::i32);
+  auto [DstLo, DstHi] = DAG.SplitScalar(Dst, Loc, MVT::i32, MVT::i32);
+  if (SH >= 32) {
+SH -= 32;
+std::swap(SrcLo, SrcHi);
+  }
+  auto GetSubInsert = [&DAG, &Loc, SH](unsigned Left, unsigned Right,
+   SDValue Src, SDValue Dst, unsigned MB,
+   unsigned ME) {
+if (Left > Right)
+  return Dst;
+
+if (MB <= ME) {
+  if (MB <= Right && ME >= Left)
+return getRotateInsert32(DAG, Loc, Dst, Src, SH,
+ std::max(MB, Left) % 32,
+ std::min(ME, Right) % 32);
+} else {
+  if (MB < Left || ME > Right)
+return getRotateInsert32(DAG, Loc, Dst, Src, SH, Left % 32, Right % 
32);
+
+  if (MB <= Right && ME < Left)
+return getRotateInsert32(DAG, Loc

[clang] [llvm] [PowerPC] Implement 32-bit expansion for rldimi (PR #86783)

2024-04-01 Thread Qiu Chaofan via cfe-commits

https://github.com/ecnelises edited 
https://github.com/llvm/llvm-project/pull/86783
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [PowerPC] Implement 32-bit expansion for rldimi (PR #86783)

2024-04-01 Thread Qiu Chaofan via cfe-commits

https://github.com/ecnelises updated 
https://github.com/llvm/llvm-project/pull/86783

>From b886dcf2da25417d9f8cd75ff4aa58686e35139d Mon Sep 17 00:00:00 2001
From: Qiu Chaofan 
Date: Wed, 27 Mar 2024 17:11:04 +0800
Subject: [PATCH 1/2] [PowerPC] Implement 32-bit expansion for rldimi

rldimi is a 64-bit instruction; for backward compatibility, it needs to
be expanded into a series of rlwimi instructions in 32-bit environments.
In the future, we may improve the bit permutation selector and remove
such direct codegen.
---
 clang/lib/Sema/SemaChecking.cpp |   1 -
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 109 --
 llvm/test/CodeGen/PowerPC/rldimi.ll | 366 
 3 files changed, 454 insertions(+), 22 deletions(-)

diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 11401b6f56c0ea..d2cbe5417d682d 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -5236,7 +5236,6 @@ static bool isPPC_64Builtin(unsigned BuiltinID) {
   case PPC::BI__builtin_ppc_fetch_and_andlp:
   case PPC::BI__builtin_ppc_fetch_and_orlp:
   case PPC::BI__builtin_ppc_fetch_and_swaplp:
-  case PPC::BI__builtin_ppc_rldimi:
 return true;
   }
   return false;
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp 
b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 7436b202fba0d9..3281a0dfd08729 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -643,6 +643,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine 
&TM,
   // We want to custom lower some of our intrinsics.
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f64, Custom);
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::ppcf128, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f64, Custom);
@@ -10757,6 +10758,88 @@ static bool getVectorCompareInfo(SDValue Intrin, int 
&CompareOpc,
   return true;
 }
 
+static SDValue getRotateInsert32(SelectionDAG &DAG, SDLoc Loc, SDValue Dst,
+ SDValue Src, unsigned SH, unsigned MB,
+ unsigned ME) {
+  assert(SH < 32 && MB < 32 && ME < 32 &&
+ "Invalid argument for rotate insert!");
+  return SDValue(
+  DAG.getMachineNode(PPC::RLWIMI, Loc, MVT::i32,
+ {Dst, Src, DAG.getTargetConstant(SH, Loc, MVT::i32),
+  DAG.getTargetConstant(MB, Loc, MVT::i32),
+  DAG.getTargetConstant(ME, Loc, MVT::i32)}),
+  0);
+}
+
+static SDValue getRotateInsert64(SelectionDAG &DAG, SDLoc Loc, SDValue Dst,
+ SDValue Src, unsigned SH, unsigned MB,
+ unsigned ME, bool IsPPC64) {
+  assert(SH < 64 && MB < 64 && ME < 64 &&
+ "Invalid argument for rotate insert!");
+  if (IsPPC64) {
+// rldimi requires ME=63-SH, otherwise rotation is needed before rldimi.
+if (ME < 63 - SH) {
+  Src = DAG.getNode(ISD::ROTL, Loc, MVT::i64, Src,
+DAG.getConstant(ME + SH + 1, Loc, MVT::i32));
+} else if (ME > 63 - SH) {
+  Src = DAG.getNode(ISD::ROTL, Loc, MVT::i64, Src,
+DAG.getConstant(ME + SH - 63, Loc, MVT::i32));
+}
+return SDValue(DAG.getMachineNode(
+   PPC::RLDIMI, Loc, MVT::i64,
+   {Dst, Src, DAG.getTargetConstant(63 - ME, Loc, 
MVT::i32),
+DAG.getTargetConstant(MB, Loc, MVT::i32)}),
+   0);
+  }
+
+  // To implement rldimi(Dst, Src) on 32-bit target, four parts are needed. SH
+  // is adjusted to simplify cases. Invalid ranges will be skipped.
+  // - SrcHi inserted into DstHi with [0, 32-SH)
+  // - SrcLo inserted into DstHi with [32-SH, 32)
+  // - SrcHi inserted into DstLo with [32, 64-SH)
+  // - SrcLo inserted into DstLo with [64-SH, 64)
+  auto [SrcLo, SrcHi] = DAG.SplitScalar(Src, Loc, MVT::i32, MVT::i32);
+  auto [DstLo, DstHi] = DAG.SplitScalar(Dst, Loc, MVT::i32, MVT::i32);
+  if (SH >= 32) {
+SH -= 32;
+std::swap(SrcLo, SrcHi);
+  }
+  auto GetSubInsert = [&DAG, &Loc, SH](unsigned Left, unsigned Right,
+   SDValue Src, SDValue Dst, unsigned MB,
+   unsigned ME) {
+if (Left > Right)
+  return Dst;
+
+if (MB <= ME) {
+  if (MB <= Right && ME >= Left)
+return getRotateInsert32(DAG, Loc, Dst, Src, SH,
+ std::max(MB, Left) % 32,
+ std::min(ME, Right) % 32);
+} else {
+  if (MB < Left || ME > Right)
+return getRotateInsert32(DAG, Loc, Dst, Src, SH, Left % 32, Right % 
32);
+
+  if (MB <= Right && ME < Left)
+return getRotateInsert32(DAG, Loc

[clang] [llvm] [PowerPC] Implement 32-bit expansion for rldimi (PR #86783)

2024-04-01 Thread Kai Luo via cfe-commits

bzEq wrote:

> due to backward compatibility, it needs to be expanded into series of rlwimi 
> in 32-bit environment

Why must it be a 'series of rlwimi'?

Why don't we just expand it following what the ISA describes and let the
legalizer generate the code sequence in 32-bit mode?
```
n ← sh5 || sh0:4
r ← ROTL64((RS), n)
b ← mb5 || mb0:4
m ← MASK(b, ¬n)
RA ← r&m | (RA) & ¬m
```

https://github.com/llvm/llvm-project/pull/86783
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [PowerPC] Implement 32-bit expansion for rldimi (PR #86783)

2024-04-01 Thread Qiu Chaofan via cfe-commits

https://github.com/ecnelises updated 
https://github.com/llvm/llvm-project/pull/86783

>From b886dcf2da25417d9f8cd75ff4aa58686e35139d Mon Sep 17 00:00:00 2001
From: Qiu Chaofan 
Date: Wed, 27 Mar 2024 17:11:04 +0800
Subject: [PATCH] [PowerPC] Implement 32-bit expansion for rldimi

rldimi is a 64-bit instruction; for backward compatibility, it needs to
be expanded into a series of rlwimi instructions in 32-bit environments.
In the future, we may improve the bit permutation selector and remove
such direct codegen.
---
 clang/lib/Sema/SemaChecking.cpp |   1 -
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 109 --
 llvm/test/CodeGen/PowerPC/rldimi.ll | 366 
 3 files changed, 454 insertions(+), 22 deletions(-)

diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 11401b6f56c0ea..d2cbe5417d682d 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -5236,7 +5236,6 @@ static bool isPPC_64Builtin(unsigned BuiltinID) {
   case PPC::BI__builtin_ppc_fetch_and_andlp:
   case PPC::BI__builtin_ppc_fetch_and_orlp:
   case PPC::BI__builtin_ppc_fetch_and_swaplp:
-  case PPC::BI__builtin_ppc_rldimi:
 return true;
   }
   return false;
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp 
b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 7436b202fba0d9..3281a0dfd08729 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -643,6 +643,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine 
&TM,
   // We want to custom lower some of our intrinsics.
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f64, Custom);
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::ppcf128, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f64, Custom);
@@ -10757,6 +10758,88 @@ static bool getVectorCompareInfo(SDValue Intrin, int 
&CompareOpc,
   return true;
 }
 
+static SDValue getRotateInsert32(SelectionDAG &DAG, SDLoc Loc, SDValue Dst,
+ SDValue Src, unsigned SH, unsigned MB,
+ unsigned ME) {
+  assert(SH < 32 && MB < 32 && ME < 32 &&
+ "Invalid argument for rotate insert!");
+  return SDValue(
+  DAG.getMachineNode(PPC::RLWIMI, Loc, MVT::i32,
+ {Dst, Src, DAG.getTargetConstant(SH, Loc, MVT::i32),
+  DAG.getTargetConstant(MB, Loc, MVT::i32),
+  DAG.getTargetConstant(ME, Loc, MVT::i32)}),
+  0);
+}
+
+static SDValue getRotateInsert64(SelectionDAG &DAG, SDLoc Loc, SDValue Dst,
+ SDValue Src, unsigned SH, unsigned MB,
+ unsigned ME, bool IsPPC64) {
+  assert(SH < 64 && MB < 64 && ME < 64 &&
+ "Invalid argument for rotate insert!");
+  if (IsPPC64) {
+// rldimi requires ME=63-SH, otherwise rotation is needed before rldimi.
+if (ME < 63 - SH) {
+  Src = DAG.getNode(ISD::ROTL, Loc, MVT::i64, Src,
+DAG.getConstant(ME + SH + 1, Loc, MVT::i32));
+} else if (ME > 63 - SH) {
+  Src = DAG.getNode(ISD::ROTL, Loc, MVT::i64, Src,
+DAG.getConstant(ME + SH - 63, Loc, MVT::i32));
+}
+return SDValue(DAG.getMachineNode(
+   PPC::RLDIMI, Loc, MVT::i64,
+   {Dst, Src, DAG.getTargetConstant(63 - ME, Loc, 
MVT::i32),
+DAG.getTargetConstant(MB, Loc, MVT::i32)}),
+   0);
+  }
+
+  // To implement rldimi(Dst, Src) on 32-bit target, four parts are needed. SH
+  // is adjusted to simplify cases. Invalid ranges will be skipped.
+  // - SrcHi inserted into DstHi with [0, 32-SH)
+  // - SrcLo inserted into DstHi with [32-SH, 32)
+  // - SrcHi inserted into DstLo with [32, 64-SH)
+  // - SrcLo inserted into DstLo with [64-SH, 64)
+  auto [SrcLo, SrcHi] = DAG.SplitScalar(Src, Loc, MVT::i32, MVT::i32);
+  auto [DstLo, DstHi] = DAG.SplitScalar(Dst, Loc, MVT::i32, MVT::i32);
+  if (SH >= 32) {
+SH -= 32;
+std::swap(SrcLo, SrcHi);
+  }
+  auto GetSubInsert = [&DAG, &Loc, SH](unsigned Left, unsigned Right,
+   SDValue Src, SDValue Dst, unsigned MB,
+   unsigned ME) {
+if (Left > Right)
+  return Dst;
+
+if (MB <= ME) {
+  if (MB <= Right && ME >= Left)
+return getRotateInsert32(DAG, Loc, Dst, Src, SH,
+ std::max(MB, Left) % 32,
+ std::min(ME, Right) % 32);
+} else {
+  if (MB < Left || ME > Right)
+return getRotateInsert32(DAG, Loc, Dst, Src, SH, Left % 32, Right % 
32);
+
+  if (MB <= Right && ME < Left)
+return getRotateInsert32(DAG, Loc, Ds

[clang] [llvm] [PowerPC] Implement 32-bit expansion for rldimi (PR #86783)

2024-03-27 Thread via cfe-commits

llvmbot wrote:



@llvm/pr-subscribers-clang

@llvm/pr-subscribers-backend-powerpc

Author: Qiu Chaofan (ecnelises)


Changes

rldimi is a 64-bit instruction; for backward compatibility, it needs to be
expanded into a series of rlwimi instructions in 32-bit environments. In the
future, we may improve the bit permutation selector and remove such direct
codegen.

---

Patch is 20.74 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/86783.diff


3 Files Affected:

- (modified) clang/lib/Sema/SemaChecking.cpp (-1) 
- (modified) llvm/lib/Target/PowerPC/PPCISelLowering.cpp (+88-21) 
- (modified) llvm/test/CodeGen/PowerPC/rldimi.ll (+366) 


``diff
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 08449581330934..5e8228ed998978 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -5236,7 +5236,6 @@ static bool isPPC_64Builtin(unsigned BuiltinID) {
   case PPC::BI__builtin_ppc_fetch_and_andlp:
   case PPC::BI__builtin_ppc_fetch_and_orlp:
   case PPC::BI__builtin_ppc_fetch_and_swaplp:
-  case PPC::BI__builtin_ppc_rldimi:
 return true;
   }
   return false;
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp 
b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index cce0efad39c75b..7e42773f3aa1cd 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -643,6 +643,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine 
&TM,
   // We want to custom lower some of our intrinsics.
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f64, Custom);
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::ppcf128, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f64, Custom);
@@ -10748,6 +10749,88 @@ static bool getVectorCompareInfo(SDValue Intrin, int 
&CompareOpc,
   return true;
 }
 
+static SDValue getRotateInsert32(SelectionDAG &DAG, SDLoc Loc, SDValue Dst,
+ SDValue Src, unsigned SH, unsigned MB,
+ unsigned ME) {
+  assert(SH < 32 && MB < 32 && ME < 32 &&
+ "Invalid argument for rotate insert!");
+  return SDValue(
+  DAG.getMachineNode(PPC::RLWIMI, Loc, MVT::i32,
+ {Dst, Src, DAG.getTargetConstant(SH, Loc, MVT::i32),
+  DAG.getTargetConstant(MB, Loc, MVT::i32),
+  DAG.getTargetConstant(ME, Loc, MVT::i32)}),
+  0);
+}
+
+static SDValue getRotateInsert64(SelectionDAG &DAG, SDLoc Loc, SDValue Dst,
+ SDValue Src, unsigned SH, unsigned MB,
+ unsigned ME, bool IsPPC64) {
+  assert(SH < 64 && MB < 64 && ME < 64 &&
+ "Invalid argument for rotate insert!");
+  if (IsPPC64) {
+// rldimi requires ME=63-SH, otherwise rotation is needed before rldimi.
+if (ME < 63 - SH) {
+  Src = DAG.getNode(ISD::ROTL, Loc, MVT::i64, Src,
+DAG.getConstant(ME + SH + 1, Loc, MVT::i32));
+} else if (ME > 63 - SH) {
+  Src = DAG.getNode(ISD::ROTL, Loc, MVT::i64, Src,
+DAG.getConstant(ME + SH - 63, Loc, MVT::i32));
+}
+return SDValue(DAG.getMachineNode(
+   PPC::RLDIMI, Loc, MVT::i64,
+   {Dst, Src, DAG.getTargetConstant(63 - ME, Loc, 
MVT::i32),
+DAG.getTargetConstant(MB, Loc, MVT::i32)}),
+   0);
+  }
+
+  // To implement rldimi(Dst, Src) on 32-bit target, four parts are needed. SH
+  // is adjusted to simplify cases. Invalid ranges will be skipped.
+  // - SrcHi inserted into DstHi with [0, 32-SH)
+  // - SrcLo inserted into DstHi with [32-SH, 32)
+  // - SrcHi inserted into DstLo with [32, 64-SH)
+  // - SrcLo inserted into DstLo with [64-SH, 64)
+  auto [SrcLo, SrcHi] = DAG.SplitScalar(Src, Loc, MVT::i32, MVT::i32);
+  auto [DstLo, DstHi] = DAG.SplitScalar(Dst, Loc, MVT::i32, MVT::i32);
+  if (SH >= 32) {
+SH -= 32;
+std::swap(SrcLo, SrcHi);
+  }
+  auto GetSubInsert = [&DAG, &Loc, SH](unsigned Left, unsigned Right,
+   SDValue Src, SDValue Dst, unsigned MB,
+   unsigned ME) {
+if (Left > Right)
+  return Dst;
+
+if (MB <= ME) {
+  if (MB <= Right && ME >= Left)
+return getRotateInsert32(DAG, Loc, Dst, Src, SH,
+ std::max(MB, Left) % 32,
+ std::min(ME, Right) % 32);
+} else {
+  if (MB < Left || ME > Right)
+return getRotateInsert32(DAG, Loc, Dst, Src, SH, Left % 32, Right % 
32);
+
+  if (MB <= Right && ME < Left)
+return getRotateInsert32(DAG, Loc, Dst, Src, SH, MB % 32, Right % 32);
+
+  if (MB <= Righ

[clang] [llvm] [PowerPC] Implement 32-bit expansion for rldimi (PR #86783)

2024-03-27 Thread Qiu Chaofan via cfe-commits

https://github.com/ecnelises created 
https://github.com/llvm/llvm-project/pull/86783

rldimi is a 64-bit instruction; for backward compatibility, it needs to be
expanded into a series of rlwimi instructions in 32-bit environments. In the
future, we may improve the bit permutation selector and remove such direct
codegen.

>From 3362a81ca64e5dec6e64e4ed544c30078025db15 Mon Sep 17 00:00:00 2001
From: Qiu Chaofan 
Date: Wed, 27 Mar 2024 17:11:04 +0800
Subject: [PATCH] [PowerPC] Implement 32-bit expansion for rldimi

rldimi is a 64-bit instruction; for backward compatibility, it needs to
be expanded into a series of rlwimi instructions in 32-bit environments.
In the future, we may improve the bit permutation selector and remove
such direct codegen.
---
 clang/lib/Sema/SemaChecking.cpp |   1 -
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 109 --
 llvm/test/CodeGen/PowerPC/rldimi.ll | 366 
 3 files changed, 454 insertions(+), 22 deletions(-)

diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 08449581330934..5e8228ed998978 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -5236,7 +5236,6 @@ static bool isPPC_64Builtin(unsigned BuiltinID) {
   case PPC::BI__builtin_ppc_fetch_and_andlp:
   case PPC::BI__builtin_ppc_fetch_and_orlp:
   case PPC::BI__builtin_ppc_fetch_and_swaplp:
-  case PPC::BI__builtin_ppc_rldimi:
 return true;
   }
   return false;
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp 
b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index cce0efad39c75b..7e42773f3aa1cd 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -643,6 +643,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine 
&TM,
   // We want to custom lower some of our intrinsics.
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f64, Custom);
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::ppcf128, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f64, Custom);
@@ -10748,6 +10749,88 @@ static bool getVectorCompareInfo(SDValue Intrin, int 
&CompareOpc,
   return true;
 }
 
+static SDValue getRotateInsert32(SelectionDAG &DAG, SDLoc Loc, SDValue Dst,
+ SDValue Src, unsigned SH, unsigned MB,
+ unsigned ME) {
+  assert(SH < 32 && MB < 32 && ME < 32 &&
+ "Invalid argument for rotate insert!");
+  return SDValue(
+  DAG.getMachineNode(PPC::RLWIMI, Loc, MVT::i32,
+ {Dst, Src, DAG.getTargetConstant(SH, Loc, MVT::i32),
+  DAG.getTargetConstant(MB, Loc, MVT::i32),
+  DAG.getTargetConstant(ME, Loc, MVT::i32)}),
+  0);
+}
+
+static SDValue getRotateInsert64(SelectionDAG &DAG, SDLoc Loc, SDValue Dst,
+ SDValue Src, unsigned SH, unsigned MB,
+ unsigned ME, bool IsPPC64) {
+  assert(SH < 64 && MB < 64 && ME < 64 &&
+ "Invalid argument for rotate insert!");
+  if (IsPPC64) {
+// rldimi requires ME=63-SH, otherwise rotation is needed before rldimi.
+if (ME < 63 - SH) {
+  Src = DAG.getNode(ISD::ROTL, Loc, MVT::i64, Src,
+DAG.getConstant(ME + SH + 1, Loc, MVT::i32));
+} else if (ME > 63 - SH) {
+  Src = DAG.getNode(ISD::ROTL, Loc, MVT::i64, Src,
+DAG.getConstant(ME + SH - 63, Loc, MVT::i32));
+}
+return SDValue(DAG.getMachineNode(
+   PPC::RLDIMI, Loc, MVT::i64,
+   {Dst, Src, DAG.getTargetConstant(63 - ME, Loc, 
MVT::i32),
+DAG.getTargetConstant(MB, Loc, MVT::i32)}),
+   0);
+  }
+
+  // To implement rldimi(Dst, Src) on 32-bit target, four parts are needed. SH
+  // is adjusted to simplify cases. Invalid ranges will be skipped.
+  // - SrcHi inserted into DstHi with [0, 32-SH)
+  // - SrcLo inserted into DstHi with [32-SH, 32)
+  // - SrcHi inserted into DstLo with [32, 64-SH)
+  // - SrcLo inserted into DstLo with [64-SH, 64)
+  auto [SrcLo, SrcHi] = DAG.SplitScalar(Src, Loc, MVT::i32, MVT::i32);
+  auto [DstLo, DstHi] = DAG.SplitScalar(Dst, Loc, MVT::i32, MVT::i32);
+  if (SH >= 32) {
+SH -= 32;
+std::swap(SrcLo, SrcHi);
+  }
+  auto GetSubInsert = [&DAG, &Loc, SH](unsigned Left, unsigned Right,
+   SDValue Src, SDValue Dst, unsigned MB,
+   unsigned ME) {
+if (Left > Right)
+  return Dst;
+
+if (MB <= ME) {
+  if (MB <= Right && ME >= Left)
+return getRotateInsert32(DAG, Loc, Dst, Src, SH,
+ std::max(MB, Left) % 32,
+ std::min(ME, Right) % 32)