kmclaughlin updated this revision to Diff 252575. kmclaughlin marked an inline comment as done. kmclaughlin added a comment.
Use SmallSetVector for the list of functions gathered by runOnModule to preserve the order of iteration CHANGES SINCE LAST ACTION https://reviews.llvm.org/D76078/new/ https://reviews.llvm.org/D76078 Files: llvm/lib/Target/AArch64/AArch64.h llvm/lib/Target/AArch64/AArch64TargetMachine.cpp llvm/lib/Target/AArch64/CMakeLists.txt llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp llvm/test/CodeGen/AArch64/O3-pipeline.ll llvm/test/CodeGen/AArch64/sve-intrinsic-opts-ptest.ll llvm/test/CodeGen/AArch64/sve-intrinsic-opts-reinterpret.ll
Index: llvm/test/CodeGen/AArch64/sve-intrinsic-opts-reinterpret.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/sve-intrinsic-opts-reinterpret.ll @@ -0,0 +1,203 @@ +; RUN: opt -S -sve-intrinsic-opts -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck --check-prefix OPT %s + +define <vscale x 8 x i1> @reinterpret_test_h(<vscale x 8 x i1> %a) { +; OPT-LABEL: @reinterpret_test_h( +; OPT-NOT: convert +; OPT: ret <vscale x 8 x i1> %a + %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a) + %2 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %1) + ret <vscale x 8 x i1> %2 +} + +; Reinterprets are not redundant because the second reinterpret zeros the +; lanes that don't exist within its input. +define <vscale x 16 x i1> @reinterpret_test_h_rev(<vscale x 16 x i1> %a) { +; OPT-LABEL: @reinterpret_test_h_rev( +; OPT: %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %a) +; OPT-NEXT: %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %1) +; OPT-NEXT: ret <vscale x 16 x i1> %2 + %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %a) + %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %1) + ret <vscale x 16 x i1> %2 +} + +define <vscale x 4 x i1> @reinterpret_test_w(<vscale x 4 x i1> %a) { +; OPT-LABEL: @reinterpret_test_w( +; OPT-NOT: convert +; OPT: ret <vscale x 4 x i1> %a + %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a) + %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1) + ret <vscale x 4 x i1> %2 +} + +; Reinterprets are not redundant because the second reinterpret zeros the +; lanes that don't exist within its input. 
+define <vscale x 16 x i1> @reinterpret_test_w_rev(<vscale x 16 x i1> %a) { +; OPT-LABEL: @reinterpret_test_w_rev( +; OPT: %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %a) +; OPT-NEXT: %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %1) +; OPT-NEXT: ret <vscale x 16 x i1> %2 + %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %a) + %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %1) + ret <vscale x 16 x i1> %2 +} + +define <vscale x 2 x i1> @reinterpret_test_d(<vscale x 2 x i1> %a) { +; OPT-LABEL: @reinterpret_test_d( +; OPT-NOT: convert +; OPT: ret <vscale x 2 x i1> %a + %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a) + %2 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %1) + ret <vscale x 2 x i1> %2 +} + +; Reinterprets are not redundant because the second reinterpret zeros the +; lanes that don't exist within its input. 
+define <vscale x 16 x i1> @reinterpret_test_d_rev(<vscale x 16 x i1> %a) { +; OPT-LABEL: @reinterpret_test_d_rev( +; OPT: %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %a) +; OPT-NEXT: %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %1) +; OPT-NEXT: ret <vscale x 16 x i1> %2 + %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %a) + %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %1) + ret <vscale x 16 x i1> %2 +} + +define <vscale x 2 x i1> @reinterpret_reductions(i32 %cond, <vscale x 2 x i1> %a, <vscale x 2 x i1> %b, <vscale x 2 x i1> %c) { +; OPT-LABEL: reinterpret_reductions +; OPT-NOT: convert +; OPT-NOT: phi <vscale x 16 x i1> +; OPT: phi <vscale x 2 x i1> [ %a, %br_phi_a ], [ %b, %br_phi_b ], [ %c, %br_phi_c ] +; OPT-NOT: convert +; OPT: ret + +entry: + switch i32 %cond, label %br_phi_c [ + i32 43, label %br_phi_a + i32 45, label %br_phi_b + ] + +br_phi_a: + %a1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a) + br label %join + +br_phi_b: + %b1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %b) + br label %join + +br_phi_c: + %c1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %c) + br label %join + +join: + %pg = phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b1, %br_phi_b ], [ %c1, %br_phi_c ] + %pg1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg) + ret <vscale x 2 x i1> %pg1 +} + +; No transform as the reinterprets are converting from different types (nxv2i1 & nxv4i1) +; As the incoming values to the phi must all be the same type, we cannot remove the reinterprets. 
+define <vscale x 2 x i1> @reinterpret_reductions_1(i32 %cond, <vscale x 2 x i1> %a, <vscale x 4 x i1> %b, <vscale x 2 x i1> %c) { +; OPT-LABEL: reinterpret_reductions_1 +; OPT: convert +; OPT: phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b1, %br_phi_b ], [ %c1, %br_phi_c ] +; OPT-NOT: phi <vscale x 2 x i1> +; OPT: tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg) +; OPT: ret + +entry: + switch i32 %cond, label %br_phi_c [ + i32 43, label %br_phi_a + i32 45, label %br_phi_b + ] + +br_phi_a: + %a1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a) + br label %join + +br_phi_b: + %b1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %b) + br label %join + +br_phi_c: + %c1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %c) + br label %join + +join: + %pg = phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b1, %br_phi_b ], [ %c1, %br_phi_c ] + %pg1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg) + ret <vscale x 2 x i1> %pg1 +} + +; No transform. Similar to the test above, but here only two of the arguments need to +; be converted to svbool. 
+define <vscale x 2 x i1> @reinterpret_reductions_2(i32 %cond, <vscale x 2 x i1> %a, <vscale x 16 x i1> %b, <vscale x 2 x i1> %c) { +; OPT-LABEL: reinterpret_reductions_2 +; OPT: convert +; OPT: phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b, %br_phi_b ], [ %c1, %br_phi_c ] +; OPT-NOT: phi <vscale x 2 x i1> +; OPT: tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg) +; OPT: ret + +entry: + switch i32 %cond, label %br_phi_c [ + i32 43, label %br_phi_a + i32 45, label %br_phi_b + ] + +br_phi_a: + %a1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a) + br label %join + +br_phi_b: + br label %join + +br_phi_c: + %c1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %c) + br label %join + +join: + %pg = phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b, %br_phi_b ], [ %c1, %br_phi_c ] + %pg1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg) + ret <vscale x 2 x i1> %pg1 +} + +; Similar to reinterpret_reductions but the reinterprets remain because the +; original phi cannot be removed (i.e. prefer reinterprets over multiple phis). 
+define <vscale x 16 x i1> @reinterpret_reductions3(i32 %cond, <vscale x 2 x i1> %a, <vscale x 2 x i1> %b, <vscale x 2 x i1> %c) { +; OPT-LABEL: reinterpret_reductions3 +; OPT: phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b1, %br_phi_b ], [ %c1, %br_phi_c ] +; OPT-NOT: phi <vscale x 2 x i1> +; OPT: tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg) +; OPT-NEXT: ret <vscale x 16 x i1> %pg + +entry: + switch i32 %cond, label %br_phi_c [ + i32 43, label %br_phi_a + i32 45, label %br_phi_b + ] + +br_phi_a: + %a1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a) + br label %join + +br_phi_b: + %b1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %b) + br label %join + +br_phi_c: + %c1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %c) + br label %join + +join: + %pg = phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b1, %br_phi_b ], [ %c1, %br_phi_c ] + %pg1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg) + ret <vscale x 16 x i1> %pg +} + +declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1>) +declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1>) +declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1>) +declare <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1>) +declare <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1>) +declare <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1>) Index: llvm/test/CodeGen/AArch64/sve-intrinsic-opts-ptest.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/sve-intrinsic-opts-ptest.ll @@ -0,0 +1,67 @@ +; RUN: opt -S -sve-intrinsic-opts 
-mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck --check-prefix OPT %s + +define i1 @ptest_any1(<vscale x 2 x i1> %a) { +; OPT-LABEL: ptest_any1 +; OPT: %mask = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 0) +; OPT-NOT: convert +; OPT-NEXT: %[[OUT:.*]] = call i1 @llvm.aarch64.sve.ptest.any.nxv2i1(<vscale x 2 x i1> %mask, <vscale x 2 x i1> %a) +; OPT-NEXT: ret i1 %[[OUT]] + %mask = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 0) + %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %mask) + %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a) + %out = call i1 @llvm.aarch64.sve.ptest.any.nxv16i1(<vscale x 16 x i1> %1, <vscale x 16 x i1> %2) + ret i1 %out +} + +; No transform because the ptest is using differently sized operands. +define i1 @ptest_any2(<vscale x 4 x i1> %a) { +; OPT-LABEL: ptest_any2 +; OPT: %mask = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) +; OPT-NEXT: %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %mask) +; OPT-NEXT: %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a) +; OPT-NEXT: %out = call i1 @llvm.aarch64.sve.ptest.any.nxv16i1(<vscale x 16 x i1> %1, <vscale x 16 x i1> %2) + %mask = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %mask) + %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a) + %out = call i1 @llvm.aarch64.sve.ptest.any.nxv16i1(<vscale x 16 x i1> %1, <vscale x 16 x i1> %2) + ret i1 %out +} + +define i1 @ptest_first(<vscale x 4 x i1> %a) { +; OPT-LABEL: ptest_first +; OPT: %mask = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 0) +; OPT-NOT: convert +; OPT-NEXT: %[[OUT:.*]] = call i1 
@llvm.aarch64.sve.ptest.first.nxv4i1(<vscale x 4 x i1> %mask, <vscale x 4 x i1> %a) +; OPT-NEXT: ret i1 %[[OUT]] + %mask = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 0) + %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %mask) + %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a) + %out = call i1 @llvm.aarch64.sve.ptest.first.nxv16i1(<vscale x 16 x i1> %1, <vscale x 16 x i1> %2) + ret i1 %out +} + +define i1 @ptest_last(<vscale x 8 x i1> %a) { +; OPT-LABEL: ptest_last +; OPT: %mask = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 0) +; OPT-NOT: convert +; OPT-NEXT: %[[OUT:.*]] = call i1 @llvm.aarch64.sve.ptest.last.nxv8i1(<vscale x 8 x i1> %mask, <vscale x 8 x i1> %a) +; OPT-NEXT: ret i1 %[[OUT]] + %mask = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 0) + %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %mask) + %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a) + %out = call i1 @llvm.aarch64.sve.ptest.last.nxv16i1(<vscale x 16 x i1> %1, <vscale x 16 x i1> %2) + ret i1 %out +} + +declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32) +declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32) +declare <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32) +declare <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32) + +declare i1 @llvm.aarch64.sve.ptest.any.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>) +declare i1 @llvm.aarch64.sve.ptest.first.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>) +declare i1 @llvm.aarch64.sve.ptest.last.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>) + +declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1>) +declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1>) +declare <vscale x 16 x i1> 
@llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1>) Index: llvm/test/CodeGen/AArch64/O3-pipeline.ll =================================================================== --- llvm/test/CodeGen/AArch64/O3-pipeline.ll +++ llvm/test/CodeGen/AArch64/O3-pipeline.ll @@ -17,6 +17,10 @@ ; CHECK-NEXT: Pre-ISel Intrinsic Lowering ; CHECK-NEXT: FunctionPass Manager ; CHECK-NEXT: Expand Atomic instructions +; CHECK-NEXT: SVE intrinsics optimizations +; CHECK-NEXT: FunctionPass Manager +; CHECK-NEXT: Dominator Tree Construction +; CHECK-NEXT: FunctionPass Manager ; CHECK-NEXT: Simplify the CFG ; CHECK-NEXT: Dominator Tree Construction ; CHECK-NEXT: Natural Loop Information Index: llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp =================================================================== --- /dev/null +++ llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp @@ -0,0 +1,277 @@ +//===----- SVEIntrinsicOpts - SVE ACLE Intrinsics Opts --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Performs general IR level optimizations on SVE intrinsics. +// +// The main goal of this pass is to remove unnecessary reinterpret +// intrinsics (llvm.aarch64.sve.convert.[to|from].svbool), e.g: +// +// %1 = @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a) +// %2 = @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1) +// +// This pass also looks for ptest intrinsics & phi instructions where the +// operands are being needlessly converted to and from svbool_t. 
+// +//===----------------------------------------------------------------------===// + +#include "Utils/AArch64BaseInfo.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsAArch64.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; +using namespace llvm::PatternMatch; + +#define DEBUG_TYPE "sve-intrinsic-opts" + +namespace llvm { +void initializeSVEIntrinsicOptsPass(PassRegistry &); +} + +namespace { +struct SVEIntrinsicOpts : public ModulePass { + static char ID; // Pass identification, replacement for typeid + SVEIntrinsicOpts() : ModulePass(ID) { + initializeSVEIntrinsicOptsPass(*PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M) override; + void getAnalysisUsage(AnalysisUsage &AU) const override; + +private: + static IntrinsicInst *isReinterpretFromSVBool(Value *V); + static IntrinsicInst *isReinterpretToSVBool(Value *V); + + static bool optimizeIntrinsic(Instruction *I); + + bool optimizeFunctions(SmallSetVector<Function *, 4> &Functions); + + static bool optimizeConvertFromSVBool(IntrinsicInst *I); + static bool optimizePTest(IntrinsicInst *I); + + static bool processPhiNode(IntrinsicInst *I); +}; +} // end anonymous namespace + +void SVEIntrinsicOpts::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<DominatorTreeWrapperPass>(); + AU.setPreservesCFG(); +} + +char SVEIntrinsicOpts::ID = 0; +static const char *name = "SVE intrinsics optimizations"; +INITIALIZE_PASS_BEGIN(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass); +INITIALIZE_PASS_END(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false) + +namespace llvm { +ModulePass 
*createSVEIntrinsicOptsPass() { return new SVEIntrinsicOpts(); } +} // namespace llvm + +/// Returns V if it's a cast to <n x 16 x i1> (aka svbool_t), nullptr +/// otherwise. +IntrinsicInst *SVEIntrinsicOpts::isReinterpretToSVBool(Value *V) { + IntrinsicInst *I = dyn_cast<IntrinsicInst>(V); + if (!I) + return nullptr; + + if (I->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool) + return nullptr; + + return I; +} + +/// Returns V if it's a cast from <n x 16 x i1> (aka svbool_t), else nullptr. +IntrinsicInst *SVEIntrinsicOpts::isReinterpretFromSVBool(Value *V) { + IntrinsicInst *I = dyn_cast<IntrinsicInst>(V); + if (!I) + return nullptr; + + if (I->getIntrinsicID() != Intrinsic::aarch64_sve_convert_from_svbool) + return nullptr; + + return I; +} + +/// Removes redundant reinterpret casts in the presence of control flow, i.e. +/// when the operand of the reinterpret X is a phi node. +bool SVEIntrinsicOpts::processPhiNode(IntrinsicInst *X) { + + SmallVector<Instruction *, 32> Worklist; + auto RequiredType = X->getType(); + + auto *PN = dyn_cast<PHINode>(X->getArgOperand(0)); + assert(PN && "Expected Phi Node!"); + + // Don't create a new Phi unless we can remove the old one. 
+ if (!PN->hasOneUse()) + return false; + + for (Value *IncValPhi : PN->incoming_values()) { + auto *Reinterpret = isReinterpretToSVBool(IncValPhi); + if (!Reinterpret || + RequiredType != Reinterpret->getArgOperand(0)->getType()) + return false; + } + + // Create the new Phi + LLVMContext &Ctx = PN->getContext(); + IRBuilder<> Builder(Ctx); + Builder.SetInsertPoint(PN); + PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues()); + Worklist.push_back(PN); + + for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) { + auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I)); + NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I)); + // The same reinterpret may feed several incoming edges; queue it only once + // so that it is never erased twice below. + if (!is_contained(Worklist, Reinterpret)) + Worklist.push_back(Reinterpret); + } + + // Cleanup Phi Node and reinterprets + X->replaceAllUsesWith(NPN); + X->eraseFromParent(); + + for (auto &I : Worklist) + if (I->use_empty()) + I->eraseFromParent(); + + return true; +} + +/// Rewrites a ptest whose operands are both convert.to.svbool calls with the +/// same source type so that it tests the unconverted operands directly. +/// Returns true if the ptest was replaced. +bool SVEIntrinsicOpts::optimizePTest(IntrinsicInst *I) { + IntrinsicInst *Op1 = dyn_cast<IntrinsicInst>(I->getArgOperand(0)); + IntrinsicInst *Op2 = dyn_cast<IntrinsicInst>(I->getArgOperand(1)); + + if (Op1 && Op2 && + Op1->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool && + Op2->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool && + Op1->getArgOperand(0)->getType() == Op2->getArgOperand(0)->getType()) { + + Value *Ops[] = {Op1->getArgOperand(0), Op2->getArgOperand(0)}; + Type *Tys[] = {Op1->getArgOperand(0)->getType()}; + Module *M = I->getParent()->getParent()->getParent(); + + auto Fn = Intrinsic::getDeclaration(M, I->getIntrinsicID(), Tys); + auto CI = CallInst::Create(Fn, Ops, I->getName(), I); + + I->replaceAllUsesWith(CI); + I->eraseFromParent(); + if (Op1->use_empty()) + Op1->eraseFromParent(); + // Both operands may be the same instruction; don't touch it once erased. + if (Op1 != Op2 && Op2->use_empty()) + Op2->eraseFromParent(); + + return true; + } + + return false; +} + +bool SVEIntrinsicOpts::optimizeConvertFromSVBool(IntrinsicInst *I) { + assert(isReinterpretFromSVBool(I)); + + // If the reinterpret instruction operand 
is a PHI Node + if (isa<PHINode>(I->getArgOperand(0))) + return processPhiNode(I); + + // If we have a reinterpret intrinsic I of type A which is converting from + // another reinterpret Y of type B, and the source type of Y is A, then we can + // elide away both reinterprets if there are no other users of Y. + auto *Y = isReinterpretToSVBool(I->getArgOperand(0)); + if (!Y) + return false; + + Value *SourceVal = Y->getArgOperand(0); + if (I->getType() != SourceVal->getType()) + return false; + + I->replaceAllUsesWith(SourceVal); + I->eraseFromParent(); + if (Y->use_empty()) + Y->eraseFromParent(); + + return true; +} + +bool SVEIntrinsicOpts::optimizeIntrinsic(Instruction *I) { + IntrinsicInst *IntrI = dyn_cast<IntrinsicInst>(I); + if (!IntrI) + return false; + + switch (IntrI->getIntrinsicID()) { + case Intrinsic::aarch64_sve_convert_from_svbool: + return optimizeConvertFromSVBool(IntrI); + case Intrinsic::aarch64_sve_ptest_any: + case Intrinsic::aarch64_sve_ptest_first: + case Intrinsic::aarch64_sve_ptest_last: + return optimizePTest(IntrI); + default: + return false; + } + + return true; +} + +bool SVEIntrinsicOpts::optimizeFunctions( + SmallSetVector<Function *, 4> &Functions) { + bool Changed = false; + for (auto *F : Functions) { + DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>(*F).getDomTree(); + + // Traverse the DT with an rpo walk so we see defs before uses, allowing + // simplification to be done incrementally. + BasicBlock *Root = DT->getRoot(); + ReversePostOrderTraversal<BasicBlock *> RPOT(Root); + for (auto *BB : RPOT) + for (Instruction &I : make_early_inc_range(*BB)) + Changed |= optimizeIntrinsic(&I); + } + return Changed; +} + +bool SVEIntrinsicOpts::runOnModule(Module &M) { + bool Changed = false; + SmallSetVector<Function *, 4> Functions; + + // Check for SVE intrinsic declarations first so that we only iterate over + // relevant functions. 
Where an appropriate declaration is found, store the + // function(s) where it is used so we can target these only. + for (auto &F : M.getFunctionList()) { + if (!F.isDeclaration()) + continue; + + switch (F.getIntrinsicID()) { + case Intrinsic::aarch64_sve_convert_from_svbool: + case Intrinsic::aarch64_sve_ptest_any: + case Intrinsic::aarch64_sve_ptest_first: + case Intrinsic::aarch64_sve_ptest_last: + // dyn_cast yields null for a non-instruction user, so guard the use. + for (User *U : F.users()) + if (auto *Inst = dyn_cast<Instruction>(U)) + Functions.insert(Inst->getFunction()); + break; + default: + break; + } + } + + if (!Functions.empty()) + Changed |= optimizeFunctions(Functions); + + return Changed; +} Index: llvm/lib/Target/AArch64/CMakeLists.txt =================================================================== --- llvm/lib/Target/AArch64/CMakeLists.txt +++ llvm/lib/Target/AArch64/CMakeLists.txt @@ -64,6 +64,7 @@ AArch64TargetMachine.cpp AArch64TargetObjectFile.cpp AArch64TargetTransformInfo.cpp + SVEIntrinsicOpts.cpp AArch64SIMDInstrOpt.cpp DEPENDS Index: llvm/lib/Target/AArch64/AArch64TargetMachine.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -146,6 +146,11 @@ cl::desc("Enable GlobalISel at or below an opt level (-1 to disable)"), cl::init(0)); +static cl::opt<bool> EnableSVEIntrinsicOpts( + "aarch64-sve-intrinsic-opts", cl::Hidden, + cl::desc("Enable SVE intrinsic opts"), + cl::init(true)); + static cl::opt<bool> EnableFalkorHWPFFix("aarch64-enable-falkor-hwpf-fix", cl::init(true), cl::Hidden); @@ -182,6 +187,7 @@ initializeFalkorHWPFFixPass(*PR); initializeFalkorMarkStridedAccessesLegacyPass(*PR); initializeLDTLSCleanupPass(*PR); + initializeSVEIntrinsicOptsPass(*PR); initializeAArch64SpeculationHardeningPass(*PR); initializeAArch64StackTaggingPass(*PR); initializeAArch64StackTaggingPreRAPass(*PR); @@ -434,6 +440,10 @@ // ourselves. 
addPass(createAtomicExpandPass()); + // Optimize SVE intrinsics (removes redundant svbool reinterprets). + if (EnableSVEIntrinsicOpts && TM->getOptLevel() == CodeGenOpt::Aggressive) + addPass(createSVEIntrinsicOptsPass()); + // Cmpxchg instructions are often used with a subsequent comparison to // determine whether it succeeded. We can exploit existing control-flow in // ldrex/strex loops to simplify this, but it needs tidying up. Index: llvm/lib/Target/AArch64/AArch64.h =================================================================== --- llvm/lib/Target/AArch64/AArch64.h +++ llvm/lib/Target/AArch64/AArch64.h @@ -52,6 +52,7 @@ FunctionPass *createAArch64CleanupLocalDynamicTLSPass(); FunctionPass *createAArch64CollectLOHPass(); +ModulePass *createSVEIntrinsicOptsPass(); InstructionSelector * createAArch64InstructionSelector(const AArch64TargetMachine &, AArch64Subtarget &, AArch64RegisterBankInfo &); @@ -80,6 +81,7 @@ void initializeFalkorHWPFFixPass(PassRegistry&); void initializeFalkorMarkStridedAccessesLegacyPass(PassRegistry&); void initializeLDTLSCleanupPass(PassRegistry&); +void initializeSVEIntrinsicOptsPass(PassRegistry&); void initializeAArch64StackTaggingPass(PassRegistry&); void initializeAArch64StackTaggingPreRAPass(PassRegistry&); } // end namespace llvm
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits