https://github.com/krzysz00 created https://github.com/llvm/llvm-project/pull/200937
Push insert_subvector into the containing CONCAT_VECTORS operand when the insertion is wholly contained there. AI note: an LLM generated the code and the test; I've read them. Co-Authored-By: OpenAI Codex <[email protected]> --- <sub>Stack created with <a href="https://github.com/github/gh-stack">GitHub Stacks CLI</a> • <a href="https://gh.io/stacks-feedback">Give Feedback 💬</a></sub> From ef3893c5e1ba73c2828533856292ba7a71ef9a63 Mon Sep 17 00:00:00 2001 From: Krzysztof Drewniak <[email protected]> Date: Sat, 30 May 2026 02:18:03 +0000 Subject: [PATCH] [SelectionDAG] Fold subvector inserts into concat operands Push insert_subvector into the containing CONCAT_VECTORS operand when the insertion is wholly contained there. AI note: an LLM generated the code and the test; I've read them. Co-Authored-By: OpenAI Codex <[email protected]> --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 45 +++++++++--- .../AMDGPU/dagcombine-insert-concat.ll | 72 +++++++++++++++++++ 2 files changed, 107 insertions(+), 10 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/dagcombine-insert-concat.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 58fc5ece9f3d3..2b93d2236df25 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -29534,16 +29534,41 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) { } } - // If the input vector is a concatenation, and the insert replaces - // one of the pieces, we can optimize into a single concat_vectors. 
- if (N0.getOpcode() == ISD::CONCAT_VECTORS && N0.hasOneUse() && - N0.getOperand(0).getValueType() == N1.getValueType() && - N0.getOperand(0).getValueType().isScalableVector() == - N1.getValueType().isScalableVector()) { - unsigned Factor = N1.getValueType().getVectorMinNumElements(); - SmallVector<SDValue, 8> Ops(N0->ops()); - Ops[InsIdx / Factor] = N1; - return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops); + // If the input vector is a concatenation and the insert is wholly contained + // in one of its operands, push the insertion into that operand. + if (N0.getOpcode() == ISD::CONCAT_VECTORS && N0.hasOneUse()) { + EVT ConcatOpVT = N0.getOperand(0).getValueType(); + EVT InsVT = N1.getValueType(); + unsigned Factor = ConcatOpVT.getVectorMinNumElements(); + unsigned ConcatOpIdx = InsIdx / Factor; + unsigned RelativeIdx = InsIdx - ConcatOpIdx * Factor; + if (ConcatOpIdx < N0.getNumOperands()) { + // If the insert replaces a whole concat operand, optimize into a single + // concat_vectors. 
+ if (ConcatOpVT == InsVT && + ConcatOpVT.isScalableVector() == InsVT.isScalableVector() && + RelativeIdx == 0) { + SmallVector<SDValue, 8> Ops(N0->ops()); + Ops[ConcatOpIdx] = N1; + return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops); + } + + if (VT.isFixedLengthVector() && ConcatOpVT.isFixedLengthVector() && + InsVT.isFixedLengthVector() && + ConcatOpVT.getVectorElementType() == InsVT.getVectorElementType() && + hasOperation(ISD::INSERT_SUBVECTOR, ConcatOpVT)) { + unsigned NumConcatOpElts = ConcatOpVT.getVectorNumElements(); + unsigned NumInsElts = InsVT.getVectorNumElements(); + if (RelativeIdx % NumInsElts == 0 && + RelativeIdx + NumInsElts <= NumConcatOpElts) { + SmallVector<SDValue, 8> Ops(N0->ops()); + Ops[ConcatOpIdx] = DAG.getNode( + ISD::INSERT_SUBVECTOR, SDLoc(N), ConcatOpVT, Ops[ConcatOpIdx], + N1, DAG.getVectorIdxConstant(RelativeIdx, SDLoc(N))); + return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops); + } + } + } } // Simplify source operands based on insertion. 
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-insert-concat.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-insert-concat.ll new file mode 100644 index 0000000000000..5d53859b16952 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-insert-concat.ll @@ -0,0 +1,72 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -O2 -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 \ +; RUN: -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck %s \ +; RUN: --check-prefix=COMBINE \ +; RUN: --implicit-check-not=REG_SEQUENCE --implicit-check-not=INSERT_SUBREG +; RUN: llc -O2 -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 \ +; RUN: -verify-machineinstrs -combiner-disabled -stop-after=amdgpu-isel < %s \ +; RUN: | FileCheck %s --check-prefix=NOCOMBINE + +declare <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32>, <2 x i32>, i64 immarg) + +define <8 x i32> @insert_into_concat_operand(<4 x i32> %a, <4 x i32> %b, <2 x i32> %sub) nounwind { + ; COMBINE-LABEL: name: insert_into_concat_operand + ; COMBINE: bb.0 (%ir-block.0): + ; COMBINE-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr8, $vgpr9 + ; COMBINE-NEXT: {{ $}} + ; COMBINE-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; COMBINE-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; COMBINE-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; COMBINE-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; COMBINE-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; COMBINE-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; COMBINE-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; COMBINE-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; COMBINE-NEXT: $vgpr0 = COPY [[COPY7]] + ; COMBINE-NEXT: $vgpr1 = COPY [[COPY6]] + ; COMBINE-NEXT: $vgpr2 = COPY [[COPY5]] + ; COMBINE-NEXT: $vgpr3 = COPY [[COPY4]] + ; COMBINE-NEXT: $vgpr4 = COPY [[COPY3]] + ; COMBINE-NEXT: $vgpr5 = COPY [[COPY2]] + ; COMBINE-NEXT: $vgpr6 = COPY [[COPY1]] + ; COMBINE-NEXT: $vgpr7 
= COPY [[COPY]] + ; COMBINE-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + ; + ; NOCOMBINE-LABEL: name: insert_into_concat_operand + ; NOCOMBINE: bb.0 (%ir-block.0): + ; NOCOMBINE-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9 + ; NOCOMBINE-NEXT: {{ $}} + ; NOCOMBINE-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; NOCOMBINE-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; NOCOMBINE-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr7 + ; NOCOMBINE-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr6 + ; NOCOMBINE-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; NOCOMBINE-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; NOCOMBINE-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; NOCOMBINE-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; NOCOMBINE-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; NOCOMBINE-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; NOCOMBINE-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_256_align2 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3, [[COPY5]], %subreg.sub4, [[COPY4]], %subreg.sub5, [[COPY3]], %subreg.sub6, [[COPY2]], %subreg.sub7 + ; NOCOMBINE-NEXT: [[INSERT_SUBREG:%[0-9]+]]:av_256_align2 = INSERT_SUBREG [[REG_SEQUENCE]], [[COPY1]], %subreg.sub6 + ; NOCOMBINE-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:av_256_align2 = INSERT_SUBREG [[INSERT_SUBREG]], [[COPY]], %subreg.sub7 + ; NOCOMBINE-NEXT: [[COPY10:%[0-9]+]]:av_32 = COPY [[INSERT_SUBREG1]].sub0 + ; NOCOMBINE-NEXT: [[COPY11:%[0-9]+]]:av_32 = COPY [[INSERT_SUBREG1]].sub1 + ; NOCOMBINE-NEXT: [[COPY12:%[0-9]+]]:av_32 = COPY [[INSERT_SUBREG1]].sub2 + ; NOCOMBINE-NEXT: [[COPY13:%[0-9]+]]:av_32 = COPY [[INSERT_SUBREG1]].sub3 + ; NOCOMBINE-NEXT: [[COPY14:%[0-9]+]]:av_32 = COPY [[INSERT_SUBREG1]].sub4 + ; NOCOMBINE-NEXT: [[COPY15:%[0-9]+]]:av_32 = COPY [[INSERT_SUBREG1]].sub5 + ; NOCOMBINE-NEXT: 
[[COPY16:%[0-9]+]]:av_32 = COPY [[INSERT_SUBREG1]].sub6 + ; NOCOMBINE-NEXT: [[COPY17:%[0-9]+]]:av_32 = COPY [[INSERT_SUBREG1]].sub7 + ; NOCOMBINE-NEXT: $vgpr0 = COPY [[COPY10]] + ; NOCOMBINE-NEXT: $vgpr1 = COPY [[COPY11]] + ; NOCOMBINE-NEXT: $vgpr2 = COPY [[COPY12]] + ; NOCOMBINE-NEXT: $vgpr3 = COPY [[COPY13]] + ; NOCOMBINE-NEXT: $vgpr4 = COPY [[COPY14]] + ; NOCOMBINE-NEXT: $vgpr5 = COPY [[COPY15]] + ; NOCOMBINE-NEXT: $vgpr6 = COPY [[COPY16]] + ; NOCOMBINE-NEXT: $vgpr7 = COPY [[COPY17]] + ; NOCOMBINE-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + %wide = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %ins = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> %wide, <2 x i32> %sub, i64 6) + ret <8 x i32> %ins +} _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
