Author: Eli Friedman
Date: 2026-02-04T09:35:39Z
New Revision: a1fd09748aa437cf581e618a1ed582e3c45c6e19

URL: https://github.com/llvm/llvm-project/commit/a1fd09748aa437cf581e618a1ed582e3c45c6e19
DIFF: https://github.com/llvm/llvm-project/commit/a1fd09748aa437cf581e618a1ed582e3c45c6e19.diff

LOG: Revert "[SeparateConstOffsetFromGEP] Decompose constant xor operand if possible (#150438)"

Cherry-pick of #179339 (a2c7c6032f27c4f8d6f7327a7ca15705d3081c3e).

Added:


Modified:
    llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp

Removed:
    llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll


################################################################################
diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index dc47b243625b8..9934caef22a8f 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -295,10 +295,6 @@ class ConstantOffsetExtractor {
   bool CanTraceInto(bool SignExtended, bool ZeroExtended, BinaryOperator *BO,
                     bool NonNegative);
 
-  /// Analyze XOR instruction to extract disjoint constant bits that behave
-  /// like addition operations for improved address mode folding.
-  APInt extractDisjointBitsFromXor(BinaryOperator *XorInst);
-
   /// The path from the constant offset to the old GEP index. e.g., if the GEP
   /// index is "a * b + (c + 5)". After running function find, UserChain[0] will
   /// be the constant 5, UserChain[1] will be the subexpression "c + 5", and
@@ -601,9 +597,6 @@ APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended,
     // Trace into subexpressions for more hoisting opportunities.
     if (CanTraceInto(SignExtended, ZeroExtended, BO, NonNegative))
       ConstantOffset = findInEitherOperand(BO, SignExtended, ZeroExtended);
-    // Handle XOR with disjoint bits that can be treated as addition.
-    else if (BO->getOpcode() == Instruction::Xor)
-      ConstantOffset = extractDisjointBitsFromXor(BO);
   } else if (isa<TruncInst>(V)) {
     ConstantOffset =
         find(U->getOperand(0), SignExtended, ZeroExtended, NonNegative)
@@ -723,20 +716,11 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
   Value *NextInChain = removeConstOffset(ChainIndex - 1);
   Value *TheOther = BO->getOperand(1 - OpNo);
 
+  // If NextInChain is 0 and not the LHS of a sub, we can simplify the
+  // sub-expression to be just TheOther.
   if (ConstantInt *CI = dyn_cast<ConstantInt>(NextInChain)) {
-    if (CI->isZero()) {
-      // Custom XOR handling for disjoint bits - preserves original XOR
-      // with non-disjoint constant bits.
-      // TODO: The design should be updated to support partial constant
-      // extraction.
-      if (BO->getOpcode() == Instruction::Xor)
-        return BO;
-
-      // If NextInChain is 0 and not the LHS of a sub, we can simplify the
-      // sub-expression to be just TheOther.
-      if (!(BO->getOpcode() == Instruction::Sub && OpNo == 0))
-        return TheOther;
-    }
+    if (CI->isZero() && !(BO->getOpcode() == Instruction::Sub && OpNo == 0))
+      return TheOther;
   }
 
   BinaryOperator::BinaryOps NewOp = BO->getOpcode();
@@ -767,67 +751,6 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
   return NewBO;
 }
 
-/// Analyze XOR instruction to extract disjoint constant bits for address
-/// folding
-///
-/// This function identifies bits in an XOR constant operand that are disjoint
-/// from the base operand's known set bits. For these disjoint bits, XOR behaves
-/// identically to addition, allowing us to extract them as constant offsets
-/// that can be folded into addressing modes.
-///
-/// Transformation: `Base ^ Const` becomes `(Base ^ NonDisjointBits) +
-/// DisjointBits` where DisjointBits = Const & KnownZeros(Base)
-///
-/// Example with ptr having known-zero low bit:
-///   Original: `xor %ptr, 3`  ; 3 = 0b11
-///   Analysis: DisjointBits = 3 & KnownZeros(%ptr) = 0b11 & 0b01 = 0b01
-///   Result: `(xor %ptr, 2) + 1` where 1 can be folded into address mode
-///
-/// \param XorInst The XOR binary operator to analyze
-/// \return APInt containing the disjoint bits that can be extracted as offset,
-///         or zero if no disjoint bits exist
-APInt ConstantOffsetExtractor::extractDisjointBitsFromXor(
-    BinaryOperator *XorInst) {
-  assert(XorInst && XorInst->getOpcode() == Instruction::Xor &&
-         "Expected XOR instruction");
-
-  const unsigned BitWidth = XorInst->getType()->getScalarSizeInBits();
-  Value *BaseOperand;
-  ConstantInt *XorConstant;
-
-  // Match pattern: xor BaseOperand, Constant.
-  if (!match(XorInst, m_Xor(m_Value(BaseOperand), m_ConstantInt(XorConstant))))
-    return APInt::getZero(BitWidth);
-
-  // Compute known bits for the base operand.
-  const SimplifyQuery SQ(DL);
-  const KnownBits BaseKnownBits = computeKnownBits(BaseOperand, SQ);
-  const APInt &ConstantValue = XorConstant->getValue();
-
-  // Identify disjoint bits: constant bits that are known zero in base.
-  const APInt DisjointBits = ConstantValue & BaseKnownBits.Zero;
-
-  // Early exit if no disjoint bits found.
-  if (DisjointBits.isZero())
-    return APInt::getZero(BitWidth);
-
-  // Compute the remaining non-disjoint bits that stay in the XOR.
-  const APInt NonDisjointBits = ConstantValue & ~DisjointBits;
-
-  // FIXME: Enhance XOR constant extraction to handle nested binary operations.
-  // Currently we only extract disjoint bits from the immediate XOR constant,
-  // but we could recursively process cases like:
-  //   xor (add %base, C1), C2 -> add %base, (C1 ^ disjoint_bits(C2))
-  // This requires careful analysis to ensure the transformation preserves
-  // semantics, particularly around sign extension and overflow behavior.
-
-  // Add the non-disjoint constant to the user chain for later transformation
-  // This will replace the original constant in the XOR with the new
-  // constant.
-  UserChain.push_back(ConstantInt::get(XorInst->getType(), NonDisjointBits));
-  return DisjointBits;
-}
-
 /// A helper function to check if reassociating through an entry in the user
 /// chain would invalidate the GEP's nuw flag.
 static bool allowsPreservingNUW(const User *U) {

diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll
deleted file mode 100644
index 056f33e5ee367..0000000000000
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll
+++ /dev/null
@@ -1,435 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; Test the xor with constant operand is decomposed in to gep.
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=separate-const-offset-from-gep \
-; RUN:     -S < %s | FileCheck %s
-; Test the gvn pass eliminates the redundant xor instructions from decomposition.
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=separate-const-offset-from-gep,gvn \
-; RUN:     -S < %s | FileCheck --check-prefix=GVN %s
-
-; Check that disjoint constants are properly extracted and folded into GEP
-; addressing modes and GVN to eliminate redundant computations
-define amdgpu_kernel void @test1(i1 %0, ptr addrspace(3) %1) {
-; CHECK-LABEL: define amdgpu_kernel void @test1(
-; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
-; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP2]], 32
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP6]], i32 8192
-; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP2]], 32
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP9]], i32 16384
-; CHECK-NEXT: [[TMP11:%.*]] = xor i32 [[TMP2]], 32
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP11]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP12]], i32 24576
-; CHECK-NEXT: [[TMP14:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
-; CHECK-NEXT: [[TMP15:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
-; CHECK-NEXT: [[TMP16:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP10]], align 16
-; CHECK-NEXT: [[TMP17:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP13]], align 16
-; CHECK-NEXT: [[TMP18:%.*]] = fadd <8 x half> [[TMP14]], [[TMP15]]
-; CHECK-NEXT: [[TMP19:%.*]] = fadd <8 x half> [[TMP16]], [[TMP17]]
-; CHECK-NEXT: [[TMP20:%.*]] = fadd <8 x half> [[TMP18]], [[TMP19]]
-; CHECK-NEXT: store <8 x half> [[TMP20]], ptr addrspace(3) [[TMP1]], align 16
-; CHECK-NEXT: ret void
-;
-; GVN-LABEL: define amdgpu_kernel void @test1(
-; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
-; GVN-NEXT: [[ENTRY:.*:]]
-; GVN-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
-; GVN-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
-; GVN-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
-; GVN-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 8192
-; GVN-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 16384
-; GVN-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 24576
-; GVN-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
-; GVN-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
-; GVN-NEXT: [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
-; GVN-NEXT: [[TMP11:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
-; GVN-NEXT: [[TMP12:%.*]] = fadd <8 x half> [[TMP8]], [[TMP9]]
-; GVN-NEXT: [[TMP13:%.*]] = fadd <8 x half> [[TMP10]], [[TMP11]]
-; GVN-NEXT: [[TMP14:%.*]] = fadd <8 x half> [[TMP12]], [[TMP13]]
-; GVN-NEXT: store <8 x half> [[TMP14]], ptr addrspace(3) [[TMP1]], align 16
-; GVN-NEXT: ret void
-;
-entry:
-  %2 = select i1 %0, i32 0, i32 288
-  %3 = xor i32 %2, 32
-  %4 = xor i32 %2, 4128
-  %5 = xor i32 %2, 8224
-  %6 = xor i32 %2, 12320
-  %7 = getelementptr half, ptr addrspace(3) %1, i32 %3
-  %8 = getelementptr half, ptr addrspace(3) %1, i32 %4
-  %9 = getelementptr half, ptr addrspace(3) %1, i32 %5
-  %10 = getelementptr half, ptr addrspace(3) %1, i32 %6
-  %11 = load <8 x half>, ptr addrspace(3) %7, align 16
-  %12 = load <8 x half>, ptr addrspace(3) %8, align 16
-  %13 = load <8 x half>, ptr addrspace(3) %9, align 16
-  %14 = load <8 x half>, ptr addrspace(3) %10, align 16
-  %15 = fadd <8 x half> %11, %12
-  %16 = fadd <8 x half> %13, %14
-  %17 = fadd <8 x half> %15, %16
-  store <8 x half> %17, ptr addrspace(3) %1, align 16
-  ret void
-}
-
-; Check that disjoint constants are properly extracted and folded into GEP
-; addressing modes and GVN to eliminate redundant computations
-define amdgpu_kernel void @test2(i1 %0, ptr addrspace(3) %1) {
-; CHECK-LABEL: define amdgpu_kernel void @test2(
-; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
-; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
-; CHECK-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], 32
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP5]], i32 24576
-; CHECK-NEXT: [[TMP7:%.*]] = xor i32 [[TMP2]], 32
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP8]], i32 16384
-; CHECK-NEXT: [[TMP10:%.*]] = xor i32 [[TMP2]], 32
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP11]], i32 8192
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
-; CHECK-NEXT: [[TMP14:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
-; CHECK-NEXT: [[TMP15:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP9]], align 16
-; CHECK-NEXT: [[TMP16:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP12]], align 16
-; CHECK-NEXT: [[TMP17:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP13]], align 16
-; CHECK-NEXT: [[TMP18:%.*]] = fadd <8 x half> [[TMP14]], [[TMP15]]
-; CHECK-NEXT: [[TMP19:%.*]] = fadd <8 x half> [[TMP16]], [[TMP17]]
-; CHECK-NEXT: [[TMP20:%.*]] = fadd <8 x half> [[TMP18]], [[TMP19]]
-; CHECK-NEXT: store <8 x half> [[TMP20]], ptr addrspace(3) [[TMP1]], align 16
-; CHECK-NEXT: ret void
-;
-; GVN-LABEL: define amdgpu_kernel void @test2(
-; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
-; GVN-NEXT: [[ENTRY:.*:]]
-; GVN-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
-; GVN-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
-; GVN-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
-; GVN-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 24576
-; GVN-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 16384
-; GVN-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 8192
-; GVN-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
-; GVN-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
-; GVN-NEXT: [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
-; GVN-NEXT: [[TMP11:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
-; GVN-NEXT: [[TMP12:%.*]] = fadd <8 x half> [[TMP8]], [[TMP9]]
-; GVN-NEXT: [[TMP13:%.*]] = fadd <8 x half> [[TMP10]], [[TMP11]]
-; GVN-NEXT: [[TMP14:%.*]] = fadd <8 x half> [[TMP12]], [[TMP13]]
-; GVN-NEXT: store <8 x half> [[TMP14]], ptr addrspace(3) [[TMP1]], align 16
-; GVN-NEXT: ret void
-;
-entry:
-  %2 = select i1 %0, i32 0, i32 288
-  %3 = xor i32 %2, 12320
-  %4 = xor i32 %2, 8224
-  %5 = xor i32 %2, 4128
-  %6 = xor i32 %2, 32
-  %7 = getelementptr half, ptr addrspace(3) %1, i32 %3
-  %8 = getelementptr half, ptr addrspace(3) %1, i32 %4
-  %9 = getelementptr half, ptr addrspace(3) %1, i32 %5
-  %10 = getelementptr half, ptr addrspace(3) %1, i32 %6
-  %11 = load <8 x half>, ptr addrspace(3) %7, align 16
-  %12 = load <8 x half>, ptr addrspace(3) %8, align 16
-  %13 = load <8 x half>, ptr addrspace(3) %9, align 16
-  %14 = load <8 x half>, ptr addrspace(3) %10, align 16
-  %15 = fadd <8 x half> %11, %12
-  %16 = fadd <8 x half> %13, %14
-  %17 = fadd <8 x half> %15, %16
-  store <8 x half> %17, ptr addrspace(3) %1, align 16
-  ret void
-}
-
-; Verify that xor instructions with different non-disjoint constants are optimized
-define amdgpu_kernel void @test3(i1 %0, ptr addrspace(3) %1) {
-; CHECK-LABEL: define amdgpu_kernel void @test3(
-; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
-; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP2]], 288
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP6]], i32 4096
-; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP2]], 32
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP9]], i32 8192
-; CHECK-NEXT: [[TMP11:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
-; CHECK-NEXT: [[TMP12:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
-; CHECK-NEXT: [[TMP13:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP10]], align 16
-; CHECK-NEXT: [[TMP14:%.*]] = fadd <8 x half> [[TMP11]], [[TMP12]]
-; CHECK-NEXT: [[TMP15:%.*]] = fadd <8 x half> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: store <8 x half> [[TMP15]], ptr addrspace(3) [[TMP1]], align 16
-; CHECK-NEXT: ret void
-;
-; GVN-LABEL: define amdgpu_kernel void @test3(
-; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
-; GVN-NEXT: [[ENTRY:.*:]]
-; GVN-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
-; GVN-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
-; GVN-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
-; GVN-NEXT: [[TMP5:%.*]] = xor i32 [[TMP2]], 288
-; GVN-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
-; GVN-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP6]], i32 4096
-; GVN-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 8192
-; GVN-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
-; GVN-NEXT: [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
-; GVN-NEXT: [[TMP11:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP8]], align 16
-; GVN-NEXT: [[TMP12:%.*]] = fadd <8 x half> [[TMP9]], [[TMP10]]
-; GVN-NEXT: [[TMP13:%.*]] = fadd <8 x half> [[TMP11]], [[TMP12]]
-; GVN-NEXT: store <8 x half> [[TMP13]], ptr addrspace(3) [[TMP1]], align 16
-; GVN-NEXT: ret void
-;
-entry:
-  %2 = select i1 %0, i32 0, i32 288
-  %3 = xor i32 %2, 32
-  %4 = xor i32 %2, 2336
-  %5 = xor i32 %2, 4128
-  %6 = getelementptr half, ptr addrspace(3) %1, i32 %3
-  %7 = getelementptr half, ptr addrspace(3) %1, i32 %4
-  %8 = getelementptr half, ptr addrspace(3) %1, i32 %5
-  %9 = load <8 x half>, ptr addrspace(3) %6, align 16
-  %10 = load <8 x half>, ptr addrspace(3) %7, align 16
-  %11 = load <8 x half>, ptr addrspace(3) %8, align 16
-  %12 = fadd <8 x half> %9, %10
-  %13 = fadd <8 x half> %11, %12
-  store <8 x half> %13, ptr addrspace(3) %1, align 16
-  ret void
-}
-
-; Verify that no optimization occurs when disjoint constants are absent
-define amdgpu_kernel void @test4(i1 %0, ptr addrspace(3) %1) {
-; CHECK-LABEL: define amdgpu_kernel void @test4(
-; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
-; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
-; CHECK-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], 288
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
-; CHECK-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
-; CHECK-NEXT: [[TMP9:%.*]] = fadd <8 x half> [[TMP7]], [[TMP8]]
-; CHECK-NEXT: store <8 x half> [[TMP9]], ptr addrspace(3) [[TMP1]], align 16
-; CHECK-NEXT: ret void
-;
-; GVN-LABEL: define amdgpu_kernel void @test4(
-; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
-; GVN-NEXT: [[ENTRY:.*:]]
-; GVN-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
-; GVN-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
-; GVN-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], 288
-; GVN-NEXT: [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
-; GVN-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]]
-; GVN-NEXT: [[TMP7:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
-; GVN-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
-; GVN-NEXT: [[TMP9:%.*]] = fadd <8 x half> [[TMP7]], [[TMP8]]
-; GVN-NEXT: store <8 x half> [[TMP9]], ptr addrspace(3) [[TMP1]], align 16
-; GVN-NEXT: ret void
-;
-entry:
-  %2 = select i1 %0, i32 0, i32 288
-  %3 = xor i32 %2, 32
-  %4 = xor i32 %2, 288
-  %5 = getelementptr half, ptr addrspace(3) %1, i32 %3
-  %6 = getelementptr half, ptr addrspace(3) %1, i32 %4
-  %7 = load <8 x half>, ptr addrspace(3) %5, align 16
-  %8 = load <8 x half>, ptr addrspace(3) %6, align 16
-  %9 = fadd <8 x half> %7, %8
-  store <8 x half> %9, ptr addrspace(3) %1, align 16
-  ret void
-}
-
-
-; Verify that XOR-BinOp-GEP usage chains are properly optimized
-define amdgpu_kernel void @test5(i1 %0, ptr addrspace(3) %1) {
-; CHECK-LABEL: define amdgpu_kernel void @test5(
-; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
-; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP2]], 32
-; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], 256
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP7]], i32 8192
-; CHECK-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
-; CHECK-NEXT: [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP8]], align 16
-; CHECK-NEXT: [[TMP11:%.*]] = fadd <8 x half> [[TMP9]], [[TMP10]]
-; CHECK-NEXT: store <8 x half> [[TMP11]], ptr addrspace(3) [[TMP1]], align 16
-; CHECK-NEXT: ret void
-;
-; GVN-LABEL: define amdgpu_kernel void @test5(
-; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
-; GVN-NEXT: [[ENTRY:.*:]]
-; GVN-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
-; GVN-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
-; GVN-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
-; GVN-NEXT: [[TMP5:%.*]] = add i32 [[TMP3]], 256
-; GVN-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
-; GVN-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP6]], i32 8192
-; GVN-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
-; GVN-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
-; GVN-NEXT: [[TMP10:%.*]] = fadd <8 x half> [[TMP8]], [[TMP9]]
-; GVN-NEXT: store <8 x half> [[TMP10]], ptr addrspace(3) [[TMP1]], align 16
-; GVN-NEXT: ret void
-;
-entry:
-  %2 = select i1 %0, i32 0, i32 288
-  %3 = xor i32 %2, 32
-  %4 = xor i32 %2, 4128
-  %5 = add i32 %4, 256
-  %6 = getelementptr half, ptr addrspace(3) %1, i32 %3
-  %7 = getelementptr half, ptr addrspace(3) %1, i32 %5
-  %8 = load <8 x half>, ptr addrspace(3) %6, align 16
-  %9 = load <8 x half>, ptr addrspace(3) %7, align 16
-  %10 = fadd <8 x half> %8, %9
-  store <8 x half> %10, ptr addrspace(3) %1, align 16
-  ret void
-}
-
-; Verify that BinOp-XOR-GEP usage chains are properly optimized.
-; In the below test, make sure we stop processing the chain at xor
-; and not fold the constant from add instruction in to gep. The
-; constant from add can be folded and the future work will cover
-; these cases.
-define amdgpu_kernel void @test6(i1 %0, ptr addrspace(3) %1) {
-; CHECK-LABEL: define amdgpu_kernel void @test6(
-; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
-; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
-; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP2]], 256
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
-; CHECK-NEXT: [[TMP6:%.*]] = xor i32 [[TMP4]], 32
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP7]], i32 8192
-; CHECK-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
-; CHECK-NEXT: [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP8]], align 16
-; CHECK-NEXT: [[TMP11:%.*]] = fadd <8 x half> [[TMP9]], [[TMP10]]
-; CHECK-NEXT: store <8 x half> [[TMP11]], ptr addrspace(3) [[TMP1]], align 16
-; CHECK-NEXT: ret void
-;
-; GVN-LABEL: define amdgpu_kernel void @test6(
-; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
-; GVN-NEXT: [[ENTRY:.*:]]
-; GVN-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
-; GVN-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
-; GVN-NEXT: [[TMP4:%.*]] = add i32 [[TMP2]], 256
-; GVN-NEXT: [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
-; GVN-NEXT: [[TMP6:%.*]] = xor i32 [[TMP4]], 32
-; GVN-NEXT: [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP6]]
-; GVN-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP7]], i32 8192
-; GVN-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
-; GVN-NEXT: [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP8]], align 16
-; GVN-NEXT: [[TMP11:%.*]] = fadd <8 x half> [[TMP9]], [[TMP10]]
-; GVN-NEXT: store <8 x half> [[TMP11]], ptr addrspace(3) [[TMP1]], align 16
-; GVN-NEXT: ret void
-;
-entry:
-  %2 = select i1 %0, i32 0, i32 288
-  %3 = xor i32 %2, 32
-  %4 = add i32 %2, 256
-  %5 = xor i32 %4, 4128
-  %6 = getelementptr half, ptr addrspace(3) %1, i32 %3
-  %7 = getelementptr half, ptr addrspace(3) %1, i32 %5
-  %8 = load <8 x half>, ptr addrspace(3) %6, align 16
-  %9 = load <8 x half>, ptr addrspace(3) %7, align 16
-  %10 = fadd <8 x half> %8, %9
-  store <8 x half> %10, ptr addrspace(3) %1, align 16
-  ret void
-}
-
-; Verify that BinOp-XOR-GEP usage chains with non disjoint xor works as
-; intended.
-define amdgpu_kernel void @test6a(i1 %0, ptr addrspace(3) %1) {
-; CHECK-LABEL: define amdgpu_kernel void @test6a(
-; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
-; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
-; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP2]], 256
-; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP4]], 288
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
-; CHECK-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
-; CHECK-NEXT: [[TMP10:%.*]] = fadd <8 x half> [[TMP8]], [[TMP9]]
-; CHECK-NEXT: store <8 x half> [[TMP10]], ptr addrspace(3) [[TMP1]], align 16
-; CHECK-NEXT: ret void
-;
-; GVN-LABEL: define amdgpu_kernel void @test6a(
-; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
-; GVN-NEXT: [[ENTRY:.*:]]
-; GVN-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
-; GVN-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
-; GVN-NEXT: [[TMP4:%.*]] = add i32 [[TMP2]], 256
-; GVN-NEXT: [[TMP5:%.*]] = xor i32 [[TMP4]], 288
-; GVN-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
-; GVN-NEXT: [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
-; GVN-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
-; GVN-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
-; GVN-NEXT: [[TMP10:%.*]] = fadd <8 x half> [[TMP8]], [[TMP9]]
-; GVN-NEXT: store <8 x half> [[TMP10]], ptr addrspace(3) [[TMP1]], align 16
-; GVN-NEXT: ret void
-;
-entry:
-  %2 = select i1 %0, i32 0, i32 288
-  %3 = xor i32 %2, 32
-  %4 = add i32 %2, 256
-  %5 = xor i32 %4, 288
-  %6 = getelementptr half, ptr addrspace(3) %1, i32 %3
-  %7 = getelementptr half, ptr addrspace(3) %1, i32 %5
-  %8 = load <8 x half>, ptr addrspace(3) %6, align 16
-  %9 = load <8 x half>, ptr addrspace(3) %7, align 16
-  %10 = fadd <8 x half> %8, %9
-  store <8 x half> %10, ptr addrspace(3) %1, align 16
-  ret void
-}
-
-; Ensure disjoint constants exceeding addressing mode limits (e.g., 32768) are
-; not extracted
-define amdgpu_kernel void @test7(i1 %0, ptr addrspace(3) %1) {
-; CHECK-LABEL: define amdgpu_kernel void @test7(
-; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
-; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
-; CHECK-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], 32800
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
-; CHECK-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
-; CHECK-NEXT: [[TMP9:%.*]] = fadd <8 x half> [[TMP7]], [[TMP8]]
-; CHECK-NEXT: store <8 x half> [[TMP9]], ptr addrspace(3) [[TMP1]], align 16
-; CHECK-NEXT: ret void
-;
-; GVN-LABEL: define amdgpu_kernel void @test7(
-; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
-; GVN-NEXT: [[ENTRY:.*:]]
-; GVN-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
-; GVN-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
-; GVN-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], 32800
-; GVN-NEXT: [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
-; GVN-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]]
-; GVN-NEXT: [[TMP7:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
-; GVN-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
-; GVN-NEXT: [[TMP9:%.*]] = fadd <8 x half> [[TMP7]], [[TMP8]]
-; GVN-NEXT: store <8 x half> [[TMP9]], ptr addrspace(3) [[TMP1]], align 16
-; GVN-NEXT: ret void
-;
-entry:
-  %2 = select i1 %0, i32 0, i32 288
-  %3 = xor i32 %2, 32
-  %4 = xor i32 %2, 32800
-  %5 = getelementptr half, ptr addrspace(3) %1, i32 %3
-  %6 = getelementptr half, ptr addrspace(3) %1, i32 %4
-  %7 = load <8 x half>, ptr addrspace(3) %5, align 16
-  %8 = load <8 x half>, ptr addrspace(3) %6, align 16
-  %9 = fadd <8 x half> %7, %8
-  store <8 x half> %9, ptr addrspace(3) %1, align 16
-  ret void
-}
-

_______________________________________________
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
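As a quick illustration of the behaviour this revert removes, the deleted test1 above boils down to roughly the following minimal IR input (the function and value names here are illustrative, not taken from the commit):

  define amdgpu_kernel void @xor_decompose_example(i1 %cond, ptr addrspace(3) %base) {
  entry:
    ; %off is either 0 or 288 (0b100100000), so bit 12 (4096) is known zero.
    %off = select i1 %cond, i32 0, i32 288
    ; 4128 = 4096 | 32: bit 5 (32) may be set in %off, bit 12 (4096) is disjoint.
    %idx = xor i32 %off, 4128
    %gep = getelementptr half, ptr addrspace(3) %base, i32 %idx
    %val = load <8 x half>, ptr addrspace(3) %gep, align 16
    store <8 x half> %val, ptr addrspace(3) %base, align 16
    ret void
  }

With the reverted patch in place, running the pass as in the deleted RUN line (opt -mtriple=amdgcn-amd-amdhsa -passes=separate-const-offset-from-gep -S) rewrote the index as `xor i32 %off, 32` followed by a trailing `getelementptr i8, ..., i32 8192` (the disjoint 4096 half elements equal 8192 bytes), matching the test1 CHECK lines above; after this revert the xor constant is left intact.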
