================
@@ -5387,6 +5387,192 @@ bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
   return true;
 }
 
+bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
+                                         MachineInstr &MI,
+                                         Intrinsic::ID IID) const {
+
+  MachineIRBuilder &B = Helper.MIRBuilder;
+  MachineRegisterInfo &MRI = *B.getMRI();
+
+  Register DstReg = MI.getOperand(0).getReg();
+  Register Src0 = MI.getOperand(2).getReg();
+
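+  // Emit the 32-bit lane intrinsic, attaching the extra operands that
+  // readlane (lane index) and writelane (lane index plus the old value)
+  // take.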
+  auto createLaneOp = [&](Register Src0, Register Src1,
+                          Register Src2) -> Register {
+    auto LaneOp = B.buildIntrinsic(IID, {S32}).addUse(Src0);
+    switch (IID) {
+    case Intrinsic::amdgcn_readfirstlane:
+      return LaneOp.getReg(0);
+    case Intrinsic::amdgcn_readlane:
+      return LaneOp.addUse(Src1).getReg(0);
+    case Intrinsic::amdgcn_writelane:
+      return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
+    default:
+      llvm_unreachable("unhandled lane op");
+    }
+  };
+
+  Register Src1, Src2;
+  if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane) {
+    Src1 = MI.getOperand(3).getReg();
+    if (IID == Intrinsic::amdgcn_writelane) {
+      Src2 = MI.getOperand(4).getReg();
+    }
+  }
+
+  LLT Ty = MRI.getType(DstReg);
+  unsigned Size = Ty.getSizeInBits();
+
+  if (Size == 32) {
+    // Already legal
+    return true;
+  }
+
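+  // Sub-32-bit case: widen the value (and the writelane data operand) to
+  // 32 bits, perform the lane op there, then truncate/bitcast back to the
+  // original type.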
+  if (Size < 32) {
+    Register Src0Cast = MRI.getType(Src0).isScalar()
+                            ? Src0
+                            : B.buildBitcast(LLT::scalar(Size), Src0).getReg(0);
+    Src0 = B.buildAnyExt(S32, Src0Cast).getReg(0);
+    if (Src2.isValid()) {
+      Register Src2Cast =
+          MRI.getType(Src2).isScalar()
+              ? Src2
+              : B.buildBitcast(LLT::scalar(Size), Src2).getReg(0);
+      Src2 = B.buildAnyExt(LLT::scalar(32), Src2Cast).getReg(0);
+    }
+
+    Register LaneOpDst = createLaneOp(Src0, Src1, Src2);
+    if (Ty.isScalar())
+      B.buildTrunc(DstReg, LaneOpDst);
+    else {
+      auto Trunc = B.buildTrunc(LLT::scalar(Size), LaneOpDst);
+      B.buildBitcast(DstReg, Trunc);
+    }
+
+    MI.eraseFromParent();
+    return true;
+  }
+
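+  // Multiple-of-32-bit case: split the value into 32-bit (or <2 x s16>)
+  // pieces, perform the lane op on each piece, and collect the partial
+  // results.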
+  if ((Size % 32) == 0) {
+    SmallVector<Register, 2> PartialRes;
+    unsigned NumParts = Size / 32;
+    auto IsS16Vec = Ty.isVector() && Ty.getElementType() == S16;
+    MachineInstrBuilder Src0Parts;
+
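+    // Pointer and pointer-vector sources are cast to integers before being
+    // unmerged into 32-bit pieces.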
+    if (Ty.isPointer()) {
+      auto PtrToInt = B.buildPtrToInt(LLT::scalar(Size), Src0);
+      Src0Parts = B.buildUnmerge(S32, PtrToInt);
+    } else if (Ty.isPointerVector()) {
+      LLT IntVecTy = Ty.changeElementType(
+          LLT::scalar(Ty.getElementType().getSizeInBits()));
+      auto PtrToInt = B.buildPtrToInt(IntVecTy, Src0);
+      Src0Parts = B.buildUnmerge(S32, PtrToInt);
+    } else
+      Src0Parts =
+          IsS16Vec ? B.buildUnmerge(V2S16, Src0) : B.buildUnmerge(S32, Src0);
+
+    switch (IID) {
+    case Intrinsic::amdgcn_readlane: {
+      Register Src1 = MI.getOperand(3).getReg();
+      for (unsigned i = 0; i < NumParts; ++i) {
+        Src0 = IsS16Vec ? B.buildBitcast(S32, Src0Parts.getReg(i)).getReg(0)
+                        : Src0Parts.getReg(i);
+        PartialRes.push_back(
+            (B.buildIntrinsic(Intrinsic::amdgcn_readlane, {S32})
+                 .addUse(Src0)
+                 .addUse(Src1))
+                .getReg(0));
+      }
+      break;
+    }
+    case Intrinsic::amdgcn_readfirstlane: {
+      for (unsigned i = 0; i < NumParts; ++i) {
+        Src0 = IsS16Vec ? B.buildBitcast(S32, Src0Parts.getReg(i)).getReg(0)
+                        : Src0Parts.getReg(i);
+        PartialRes.push_back(
+            (B.buildIntrinsic(Intrinsic::amdgcn_readfirstlane, {S32})
+                 .addUse(Src0)
+                 .getReg(0)));
+      }
+
+      break;
+    }
+    case Intrinsic::amdgcn_writelane: {
+      Register Src1 = MI.getOperand(3).getReg();
+      Register Src2 = MI.getOperand(4).getReg();
+      MachineInstrBuilder Src2Parts;
+
+      if (Ty.isPointer()) {
+        auto PtrToInt = B.buildPtrToInt(S64, Src2);
+        Src2Parts = B.buildUnmerge(S32, PtrToInt);
+      } else if (Ty.isPointerVector()) {
+        LLT IntVecTy = Ty.changeElementType(
+            LLT::scalar(Ty.getElementType().getSizeInBits()));
+        auto PtrToInt = B.buildPtrToInt(IntVecTy, Src2);
+        Src2Parts = B.buildUnmerge(S32, PtrToInt);
+      } else
+        Src2Parts =
+            IsS16Vec ? B.buildUnmerge(V2S16, Src2) : B.buildUnmerge(S32, Src2);
----------------
arsenm wrote:

The point of splitting out the pointer-typed tests was to avoid having to fix
the handling of the pointer-typed selection patterns. You still have the casts
inserted here; you should not need any ptrtoint, inttoptr, or bitcasts in any
of these legalizations.

https://github.com/llvm/llvm-project/pull/89217
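
For illustration, here is one possible cast-free shape for the
multiple-of-32 path, reusing the quoted function's locals (B, Src0, Src1,
Src2, DstReg, Size, IID, createLaneOp). This is only a sketch of what the
comment above is asking for, not code from the PR; it assumes the generic
G_UNMERGE_VALUES/G_MERGE_VALUES artifacts may operate directly on the
pointer- and vector-typed values once the selection patterns are taught to
match them:

  // Hypothetical sketch: no ptrtoint/inttoptr/bitcast around the split;
  // pointer handling is assumed to live in the selection patterns instead.
  if ((Size % 32) == 0) {
    unsigned NumParts = Size / 32;
    SmallVector<Register, 4> PartialRes;

    // Split the data operand(s) straight into 32-bit pieces, whatever the
    // original type is.
    auto Src0Parts = B.buildUnmerge(S32, Src0);
    MachineInstrBuilder Src2Parts;
    if (IID == Intrinsic::amdgcn_writelane)
      Src2Parts = B.buildUnmerge(S32, Src2);

    // Run the 32-bit lane op on each piece.
    for (unsigned I = 0; I != NumParts; ++I) {
      Register Piece2 = IID == Intrinsic::amdgcn_writelane
                            ? Src2Parts.getReg(I)
                            : Register();
      PartialRes.push_back(createLaneOp(Src0Parts.getReg(I), Src1, Piece2));
    }

    // Merge the pieces back into the original destination type with no
    // intermediate casts.
    B.buildMergeLikeInstr(DstReg, PartialRes);
    MI.eraseFromParent();
    return true;
  }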