[libcxx] [lldb] [clang] [lld] [libc] [llvm] [mlir] [flang] [openmp] [clang-tools-extra] [SLP]Add support for strided loads. (PR #80310)

2024-02-01 Thread Alexey Bataev via cfe-commits

https://github.com/alexey-bataev updated 
https://github.com/llvm/llvm-project/pull/80310

>From 92950afd39034c0184a3c807f8062e0053eead5c Mon Sep 17 00:00:00 2001
From: Alexey Bataev 
Date: Thu, 1 Feb 2024 17:22:34 +
Subject: [PATCH 1/2] [𝘀𝗽𝗿] initial version
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Created using spr 1.3.5
---
 .../llvm/Analysis/TargetTransformInfo.h   |  34 ++
 .../llvm/Analysis/TargetTransformInfoImpl.h   |  13 +
 llvm/lib/Analysis/TargetTransformInfo.cpp |  14 +
 .../Target/RISCV/RISCVTargetTransformInfo.cpp |  23 +
 .../Target/RISCV/RISCVTargetTransformInfo.h   |  23 +
 .../Transforms/Vectorize/SLPVectorizer.cpp| 397 --
 .../SLPVectorizer/RISCV/complex-loads.ll  | 132 +++---
 .../RISCV/strided-loads-vectorized.ll | 209 +
 .../strided-loads-with-external-use-ptr.ll|   4 +-
 .../SLPVectorizer/RISCV/strided-loads.ll  |  13 +-
 .../X86/gep-nodes-with-non-gep-inst.ll|   2 +-
 .../X86/remark_gather-load-redux-cost.ll  |   2 +-
 12 files changed, 478 insertions(+), 388 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h 
b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 3b615bc700bbb..b0b6dab03fa38 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -781,6 +781,9 @@ class TargetTransformInfo {
   /// Return true if the target supports masked expand load.
   bool isLegalMaskedExpandLoad(Type *DataType) const;
 
+  /// Return true if the target supports strided load.
+  bool isLegalStridedLoad(Type *DataType, Align Alignment) const;
+
   /// Return true if this is an alternating opcode pattern that can be lowered
   /// to a single instruction on the target. In X86 this is for the addsub
   /// instruction which corrsponds to a Shuffle + Fadd + FSub pattern in IR.
@@ -1412,6 +1415,20 @@ class TargetTransformInfo {
   Align Alignment, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
   const Instruction *I = nullptr) const;
 
+  /// \return The cost of strided memory operations.
+  /// \p Opcode - is a type of memory access Load or Store
+  /// \p DataTy - a vector type of the data to be loaded or stored
+  /// \p Ptr - pointer [or vector of pointers] - address[es] in memory
+  /// \p VariableMask - true when the memory access is predicated with a mask
+  ///   that is not a compile-time constant
+  /// \p Alignment - alignment of single element
+  /// \p I - the optional original context instruction, if one exists, e.g. the
+  ///    load/store to transform or the call to the gather/scatter intrinsic
+  InstructionCost getStridedMemoryOpCost(
+  unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
+  Align Alignment, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
+  const Instruction *I = nullptr) const;
+
   /// \return The cost of the interleaved memory operation.
   /// \p Opcode is the memory operation code
   /// \p VecTy is the vector type of the interleaved access.
@@ -1848,6 +1865,7 @@ class TargetTransformInfo::Concept {
Align Alignment) = 0;
   virtual bool isLegalMaskedCompressStore(Type *DataType) = 0;
   virtual bool isLegalMaskedExpandLoad(Type *DataType) = 0;
+  virtual bool isLegalStridedLoad(Type *DataType, Align Alignment) = 0;
   virtual bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0,
unsigned Opcode1,
const SmallBitVector &OpcodeMask) const = 0;
@@ -2023,6 +2041,11 @@ class TargetTransformInfo::Concept {
  bool VariableMask, Align Alignment,
  TTI::TargetCostKind CostKind,
  const Instruction *I = nullptr) = 0;
+  virtual InstructionCost
+  getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr,
+ bool VariableMask, Align Alignment,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr) = 0;
 
   virtual InstructionCost getInterleavedMemoryOpCost(
   unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
@@ -2341,6 +2364,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
   bool isLegalMaskedExpandLoad(Type *DataType) override {
 return Impl.isLegalMaskedExpandLoad(DataType);
   }
+  bool isLegalStridedLoad(Type *DataType, Align Alignment) override {
+return Impl.isLegalStridedLoad(DataType, Alignment);
+  }
   bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1,
const SmallBitVector &OpcodeMask) const override {
 return Impl.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask);
@@ -2671,6 +2697,14 @@ class TargetTransformInfo::Model
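A target opts into this by overriding the two new hooks in its TargetTransformInfo implementation; the real implementation for RISC-V lives in RISCVTargetTransformInfo.h/.cpp in the diffstat above. A hedged sketch of what such an override might look like (the class name MyTTIImpl and the cost numbers are illustrative placeholders, not the code from this patch):

  // Hedged sketch only: 'MyTTIImpl' and its cost model are placeholders,
  // not the RISC-V implementation added by this patch.
  class MyTTIImpl : public BasicTTIImplBase<MyTTIImpl> {
  public:
    bool isLegalStridedLoad(Type *DataType, Align Alignment) {
      // Only claim support for vector types the target can actually lower.
      auto *VTy = dyn_cast<VectorType>(DataType);
      return VTy && isTypeLegal(VTy);
    }

    InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy,
                                           const Value *Ptr, bool VariableMask,
                                           Align Alignment,
                                           TTI::TargetCostKind CostKind,
                                           const Instruction *I = nullptr) {
      // Toy model: one memory op per element, which typically comes out
      // cheaper than the corresponding gather/scatter cost.
      return InstructionCost(cast<FixedVectorType>(DataTy)->getNumElements());
    }
  };

On the SLP side the query would then look roughly like TTI.isLegalStridedLoad(VecTy, CommonAlignment) followed by TTI.getStridedMemoryOpCost(Instruction::Load, VecTy, Ptr0, /*VariableMask=*/false, CommonAlignment, CostKind) when comparing against the gather cost.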

[libcxx] [lldb] [clang] [lld] [libc] [llvm] [mlir] [flang] [openmp] [clang-tools-extra] [SLP]Add support for strided loads. (PR #80310)

2024-02-01 Thread Alexey Bataev via cfe-commits


@@ -3930,30 +4065,68 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
   std::optional<int> Diff =
   getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
   // Check that the sorted loads are consecutive.
-  if (static_cast<unsigned>(*Diff) == VL.size() - 1)
+  if (static_cast<unsigned>(*Diff) == Sz - 1)
 return LoadsState::Vectorize;
   // Simple check if not a strided access - clear order.
-  IsPossibleStrided = *Diff % (VL.size() - 1) == 0;
+  bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
+  // Try to generate strided load node if:
+  // 1. Target with strided load support is detected.
+  // 2. The number of loads is greater than MinProfitableStridedLoads,
+  // or the potential stride <= MaxProfitableLoadStride and the
+  // potential stride is power-of-2 (to avoid perf regressions for the very
+  // small number of loads) and max distance > number of loads, or potential
+  // stride is -1.
+  // 3. The loads are ordered, or number of unordered loads <=
+  // MaxProfitableUnorderedLoads, or loads are in reversed order.
+  // (this check is to avoid extra costs for very expensive shuffles).
+  if (IsPossibleStrided && (((Sz > MinProfitableStridedLoads ||
+                              (static_cast<unsigned>(std::abs(*Diff)) <=
+                                   MaxProfitableLoadStride * Sz &&
+                               isPowerOf2_32(std::abs(*Diff)))) &&
+                             static_cast<unsigned>(std::abs(*Diff)) > Sz) ||
+                            *Diff == -(static_cast<int>(Sz) - 1))) {
+    int Stride = *Diff / static_cast<int>(Sz - 1);

alexey-bataev wrote:

This is stride in "scalar elements", here the size in bytes is not important, 
getPointersDiff() handles pointers with different types (sizes). Here we just 
looking that the pointers have proportional constant distances between them.
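To make that concrete with made-up numbers (not taken from the tests in this patch): eight loads at element offsets 0, 4, 8, ..., 28 from the same base give *Diff == 28 scalar elements, Sz - 1 == 7 divides it, and the candidate stride is 28 / 7 == 4 elements, independent of whether an element is 4 or 8 bytes wide. A standalone sketch of that arithmetic:

  // Standalone illustration of the Diff/Stride arithmetic above; this is not
  // the SLPVectorizer code, just the same computation on example numbers.
  #include <cstdio>

  int main() {
    int Diff = 28;    // Ptr0 -> PtrN distance in scalar elements (getPointersDiff)
    unsigned Sz = 8;  // number of loads in the bundle
    bool IsPossibleStrided = Diff % static_cast<int>(Sz - 1) == 0; // 28 % 7 == 0
    int Stride = Diff / static_cast<int>(Sz - 1);                  // 4 elements per step
    std::printf("possible=%d stride=%d\n", IsPossibleStrided, Stride);
    return 0;
  }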

https://github.com/llvm/llvm-project/pull/80310


[libcxx] [lldb] [clang] [lld] [libc] [llvm] [mlir] [flang] [openmp] [clang-tools-extra] [SLP]Add support for strided loads. (PR #80310)

2024-02-01 Thread Alexey Bataev via cfe-commits


@@ -3930,30 +4065,68 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
   std::optional<int> Diff =
   getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
   // Check that the sorted loads are consecutive.
-  if (static_cast<unsigned>(*Diff) == VL.size() - 1)
+  if (static_cast<unsigned>(*Diff) == Sz - 1)
 return LoadsState::Vectorize;
   // Simple check if not a strided access - clear order.
-  IsPossibleStrided = *Diff % (VL.size() - 1) == 0;
+  bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
+  // Try to generate strided load node if:
+  // 1. Target with strided load support is detected.
+  // 2. The number of loads is greater than MinProfitableStridedLoads,
+  // or the potential stride <= MaxProfitableLoadStride and the
+  // potential stride is power-of-2 (to avoid perf regressions for the very
+  // small number of loads) and max distance > number of loads, or potential
+  // stride is -1.
+  // 3. The loads are ordered, or number of unordered loads <=
+  // MaxProfitableUnorderedLoads, or loads are in reversed order.
+  // (this check is to avoid extra costs for very expensive shuffles).
+  if (IsPossibleStrided && (((Sz > MinProfitableStridedLoads ||
+                              (static_cast<unsigned>(std::abs(*Diff)) <=
+                                   MaxProfitableLoadStride * Sz &&
+                               isPowerOf2_32(std::abs(*Diff)))) &&
+                             static_cast<unsigned>(std::abs(*Diff)) > Sz) ||
+                            *Diff == -(static_cast<int>(Sz) - 1))) {
+    int Stride = *Diff / static_cast<int>(Sz - 1);
+    if (*Diff == Stride * static_cast<int>(Sz - 1)) {
+  if (TTI.isTypeLegal(VecTy) &&

alexey-bataev wrote:

Removed

https://github.com/llvm/llvm-project/pull/80310


[libcxx] [lldb] [clang] [lld] [libc] [llvm] [mlir] [flang] [openmp] [clang-tools-extra] [SLP]Add support for strided loads. (PR #80310)

2024-02-01 Thread Alexey Bataev via cfe-commits


@@ -3878,6 +3883,130 @@ static Align computeCommonAlignment(ArrayRef<Value *> VL) {
   return CommonAlignment;
 }
 
+/// Check if \p Order represents reverse order.
+static bool isReverseOrder(ArrayRef<unsigned> Order) {
+  unsigned Sz = Order.size();
+  return !Order.empty() && all_of(enumerate(Order), [&](const auto &Pair) {
+return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
+  });
+}
+
+/// Checks if the provided list of pointers \p Pointers represents the strided
+/// pointers for type ElemTy. If they are not, std::nullopt is returned.
+/// Otherwise, if \p Inst is not specified, just initialized optional value is
+/// returned to show that the pointers represent strided pointers. If \p Inst
+/// specified, the runtime stride is materialized before the given \p Inst.
+/// \returns std::nullopt if the pointers are not pointers with the runtime
+/// stride, nullptr or actual stride value, otherwise.
+static std::optional<Value *>
+calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
+  const DataLayout &DL, ScalarEvolution &SE,
+  SmallVectorImpl<unsigned> &SortedIndices,
+  Instruction *Inst = nullptr) {
+  SmallVector<const SCEV *> SCEVs;

alexey-bataev wrote:

Constant strides are covered separately; this one checks for non-constant strides. It does not care about the order, it sorts the pointers properly.
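An illustrative source pattern (not one of the tests in this patch) that produces such runtime-strided pointers; the per-pointer SCEVs differ only by multiples of the runtime value s, which is what calculateRtStride is meant to recognize and materialize in bytes before the vectorized load:

  // Illustrative only: four loads whose addresses step by a runtime stride 's'.
  float sum4_strided(const float *p, long s) {
    float a = p[0 * s];
    float b = p[1 * s];
    float c = p[2 * s];
    float d = p[3 * s];
    return a + b + c + d;
  }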

https://github.com/llvm/llvm-project/pull/80310


[libcxx] [lldb] [clang] [lld] [libc] [llvm] [mlir] [flang] [openmp] [clang-tools-extra] [SLP]Add support for strided loads. (PR #80310)

2024-02-01 Thread Alexey Bataev via cfe-commits


@@ -397,27 +241,12 @@ define void @test3([48 x float]* %p, float* noalias %s) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x float], ptr [[P:%.*]], i64 0, i64 0
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[S:%.*]], i64 0
-; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 4
-; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 8
-; CHECK-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 12
-; CHECK-NEXT:    [[ARRAYIDX25:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 16
-; CHECK-NEXT:    [[ARRAYIDX32:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 20
-; CHECK-NEXT:    [[ARRAYIDX39:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 24
-; CHECK-NEXT:    [[ARRAYIDX46:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 28
 ; CHECK-NEXT:    [[ARRAYIDX48:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 23
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ARRAYIDX]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x ptr> [[TMP0]], ptr [[ARRAYIDX4]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x ptr> [[TMP1]], ptr [[ARRAYIDX11]], i32 2
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x ptr> [[TMP2]], ptr [[ARRAYIDX18]], i32 3
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x ptr> [[TMP3]], ptr [[ARRAYIDX25]], i32 4
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x ptr> [[TMP4]], ptr [[ARRAYIDX32]], i32 5
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x ptr> [[TMP5]], ptr [[ARRAYIDX39]], i32 6
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x ptr> [[TMP6]], ptr [[ARRAYIDX46]], i32 7
-; CHECK-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP7]], i32 4, <8 x i1> , <8 x float> poison)
-; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x float>, ptr [[ARRAYIDX48]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <8 x float> [[TMP9]], <8 x float> poison, <8 x i32> 
-; CHECK-NEXT:    [[TMP11:%.*]] = fsub fast <8 x float> [[TMP10]], [[TMP8]]
-; CHECK-NEXT:    store <8 x float> [[TMP11]], ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX]], i64 16, <8 x i1> , i32 8)
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[ARRAYIDX48]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <8 x i32> 

alexey-bataev wrote:

It can; that is planned for the next patch(es). I cannot put all the stuff in a single patch.
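For context on the new CHECK line above: the stride operand of @llvm.experimental.vp.strided.load is in bytes, so the i64 16 corresponds to the original step of 4 floats between consecutive loads. A rough, hedged sketch of how such a call can be emitted from C++ (assumed IRBuilder usage with a pre-existing Builder and BasePtr; not necessarily the exact code in this patch):

  // Hedged sketch: emit a strided VP load via IRBuilder. 'Builder' and
  // 'BasePtr' are assumed to exist in the surrounding code.
  auto *VecTy = FixedVectorType::get(Builder.getFloatTy(), 8);
  Value *StrideBytes = Builder.getInt64(16); // 4 floats * 4 bytes per element
  Value *Mask = Builder.getAllOnesMask(VecTy->getElementCount());
  Value *EVL = Builder.getInt32(8);
  CallInst *StridedLoad = Builder.CreateIntrinsic(
      Intrinsic::experimental_vp_strided_load,
      {VecTy, BasePtr->getType(), StrideBytes->getType()},
      {BasePtr, StrideBytes, Mask, EVL});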

https://github.com/llvm/llvm-project/pull/80310


[libcxx] [lldb] [clang] [lld] [libc] [llvm] [mlir] [flang] [openmp] [clang-tools-extra] [SLP]Add support for strided loads. (PR #80310)

2024-02-01 Thread Alexey Bataev via cfe-commits


@@ -7,7 +7,7 @@ define i32 @test(ptr noalias %p, ptr noalias %addr) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ADDR:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, <8 x ptr> [[TMP1]], <8 x i32> 
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, <8 x ptr> [[TMP1]], <8 x i32> 

alexey-bataev wrote:

Same, TTI for X86 does not support strided loads, so the order is not important

https://github.com/llvm/llvm-project/pull/80310


[libcxx] [lldb] [clang] [lld] [libc] [llvm] [mlir] [flang] [openmp] [clang-tools-extra] [SLP]Add support for strided loads. (PR #80310)

2024-02-01 Thread Alexey Bataev via cfe-commits


@@ -30,7 +30,7 @@ define void @test() {
 ; CHECK-SLP-THRESHOLD:   bb:
 ; CHECK-SLP-THRESHOLD-NEXT:    [[TMP0:%.*]] = insertelement <4 x ptr> poison, ptr [[COND_IN_V]], i32 0
 ; CHECK-SLP-THRESHOLD-NEXT:    [[TMP1:%.*]] = shufflevector <4 x ptr> [[TMP0]], <4 x ptr> poison, <4 x i32> zeroinitializer
-; CHECK-SLP-THRESHOLD-NEXT:    [[TMP2:%.*]] = getelementptr i64, <4 x ptr> [[TMP1]], <4 x i64> 
+; CHECK-SLP-THRESHOLD-NEXT:    [[TMP2:%.*]] = getelementptr i64, <4 x ptr> [[TMP1]], <4 x i64> 

alexey-bataev wrote:

For the X86 target it is currently treated as not supported, so it just produces a masked gather and the order is not important.

https://github.com/llvm/llvm-project/pull/80310


[libcxx] [lldb] [clang] [lld] [libc] [llvm] [mlir] [flang] [openmp] [clang-tools-extra] [SLP]Add support for strided loads. (PR #80310)

2024-02-01 Thread Alexey Bataev via cfe-commits


@@ -17,7 +17,7 @@ define i16 @test() {
 ; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> [[TMP3]], i32 2, <2 x i1> , <2 x i16> poison)
 ; CHECK-NEXT:[[TMP5:%.*]] = extractelement <2 x i16> [[TMP4]], i32 0
 ; CHECK-NEXT:[[TMP6:%.*]] = extractelement <2 x i16> [[TMP4]], i32 1
-; CHECK-NEXT:[[CMP_I178:%.*]] = icmp ult i16 [[TMP6]], [[TMP5]]
+; CHECK-NEXT:[[CMP_I178:%.*]] = icmp ult i16 [[TMP5]], [[TMP6]]
 ; CHECK-NEXT:br label [[WHILE_BODY_I]]
 ;
 entry:

alexey-bataev wrote:

For the SLP vectorizer the operand order here is not important.

https://github.com/llvm/llvm-project/pull/80310


[libcxx] [lldb] [clang] [lld] [libc] [llvm] [mlir] [flang] [openmp] [clang-tools-extra] [SLP]Add support for strided loads. (PR #80310)

2024-02-01 Thread Philip Reames via cfe-commits


@@ -3878,6 +3883,130 @@ static Align computeCommonAlignment(ArrayRef<Value *> VL) {
   return CommonAlignment;
 }
 
+/// Check if \p Order represents reverse order.
+static bool isReverseOrder(ArrayRef<unsigned> Order) {
+  unsigned Sz = Order.size();
+  return !Order.empty() && all_of(enumerate(Order), [&](const auto &Pair) {
+return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
+  });
+}
+
+/// Checks if the provided list of pointers \p Pointers represents the strided
+/// pointers for type ElemTy. If they are not, std::nullopt is returned.
+/// Otherwise, if \p Inst is not specified, just initialized optional value is
+/// returned to show that the pointers represent strided pointers. If \p Inst
+/// specified, the runtime stride is materialized before the given \p Inst.
+/// \returns std::nullopt if the pointers are not pointers with the runtime
+/// stride, nullptr or actual stride value, otherwise.
+static std::optional<Value *>
+calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
+  const DataLayout &DL, ScalarEvolution &SE,
+  SmallVectorImpl<unsigned> &SortedIndices,
+  Instruction *Inst = nullptr) {
+  SmallVector<const SCEV *> SCEVs;

preames wrote:

An alternate approach, which might be simpler and yet cover many of the interesting test cases (rough sketch below):
* Loop over the pointers, check that getPointerBase matches.
* Loop again doing removePointerBase
* This gives a list of offsets from base, bail if any non-constant
* Sort the list of constant offsets
* Check if strided w/shuffle?

If you don't want a shuffle afterwards, you can check the delta without sorting.

This won't cover non-constant strides, but I'm not sure we really care about 
those in practice.
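A rough sketch of that constant-offset-only approach (illustrative and untested, not part of the patch; it assumes ScalarEvolution's getPointerBase/removePointerBase and returns the common stride in bytes, or std::nullopt):

  // Illustrative sketch of the alternative described above; not patch code.
  static std::optional<int64_t>
  getConstantStride(ArrayRef<Value *> PointerOps, ScalarEvolution &SE) {
    if (PointerOps.size() < 2)
      return std::nullopt;
    const SCEV *Base = SE.getPointerBase(SE.getSCEV(PointerOps.front()));
    SmallVector<int64_t> Offsets;
    for (Value *Ptr : PointerOps) {
      const SCEV *S = SE.getSCEV(Ptr);
      if (SE.getPointerBase(S) != Base)
        return std::nullopt;                 // different base pointers: bail
      auto *Off = dyn_cast<SCEVConstant>(SE.removePointerBase(S));
      if (!Off)
        return std::nullopt;                 // non-constant offset: bail
      Offsets.push_back(Off->getAPInt().getSExtValue());
    }
    llvm::sort(Offsets);                     // accept any order (may need a shuffle)
    int64_t Stride = Offsets[1] - Offsets[0];
    for (unsigned I = 2, E = Offsets.size(); I != E; ++I)
      if (Offsets[I] - Offsets[I - 1] != Stride)
        return std::nullopt;                 // not equally spaced
    return Stride;                           // stride in bytes from the base
  }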

https://github.com/llvm/llvm-project/pull/80310
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits