pmatos updated this revision to Diff 526556.
pmatos added a comment.

Implement the simplification when the demanded bits are known; otherwise skip it for rotates.

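For context, a small C sketch of the two patterns the new tests cover (the function names below are mine, not part of the patch):

  // The case from issue 62703 (rotate-i3264.ll): a plain rotate. The frontend
  // emits llvm.fshl with both value operands equal, and the intent is that it
  // survives the middle end as a rotate so the wasm backend can select a
  // single i32.rotl/i64.rotl.
  unsigned rot_plain(unsigned x, unsigned n) {
    return __builtin_rotateleft32(x, n);
  }

  // The case from the new wasm-rotate.c test: a rotate of a masked value,
  // again emitted as llvm.fshl with equal operands. The new gating only lets
  // SimplifyDemandedBits look at the operands of such a rotate when their
  // demanded bits are already known.
  unsigned rot_masked(unsigned x) {
    return __builtin_rotateleft32(x & 0xFF00FF00u, 8);
  }
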
@nikic Things look much better now. Thanks for your help with the changes in 
InstCombine. What do you think?


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D150670/new/

https://reviews.llvm.org/D150670

Files:
  clang/test/CodeGen/WebAssembly/wasm-rotate.c
  llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
  llvm/test/CodeGen/WebAssembly/rotate-i3264.ll
  llvm/test/Transforms/InstCombine/fsh.ll

Index: llvm/test/Transforms/InstCombine/fsh.ll
===================================================================
--- llvm/test/Transforms/InstCombine/fsh.ll
+++ llvm/test/Transforms/InstCombine/fsh.ll
@@ -440,12 +440,10 @@
   ret <2 x i32> %r
 }
 
-; TODO: Don't let SimplifyDemandedBits split up a rotate - keep the same operand.
-
 define i32 @rotl_common_demanded(i32 %a0) {
 ; CHECK-LABEL: @rotl_common_demanded(
 ; CHECK-NEXT:    [[X:%.*]] = xor i32 [[A0:%.*]], 2
-; CHECK-NEXT:    [[R:%.*]] = call i32 @llvm.fshl.i32(i32 [[X]], i32 [[A0]], i32 8)
+; CHECK-NEXT:    [[R:%.*]] = call i32 @llvm.fshl.i32(i32 [[X]], i32 [[X]], i32 8)
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
   %x = xor i32 %a0, 2
@@ -456,7 +454,7 @@
 define i33 @rotr_common_demanded(i33 %a0) {
 ; CHECK-LABEL: @rotr_common_demanded(
 ; CHECK-NEXT:    [[X:%.*]] = xor i33 [[A0:%.*]], 2
-; CHECK-NEXT:    [[R:%.*]] = call i33 @llvm.fshl.i33(i33 [[X]], i33 [[A0]], i33 25)
+; CHECK-NEXT:    [[R:%.*]] = call i33 @llvm.fshl.i33(i33 [[X]], i33 [[X]], i33 25)
 ; CHECK-NEXT:    ret i33 [[R]]
 ;
   %x = xor i33 %a0, 2
Index: llvm/test/CodeGen/WebAssembly/rotate-i3264.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/WebAssembly/rotate-i3264.ll
@@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: sed 's/iX/i32/g' %s | llc --mtriple=wasm32-unknown-unknown | FileCheck --check-prefix=I32 %s
+; RUN: sed 's/iX/i64/g' %s | llc --mtriple=wasm64-unknown-unknown | FileCheck --check-prefix=I64 %s
+
+declare iX @llvm.fshl.iX(iX, iX, iX)
+declare iX @llvm.fshr.iX(iX, iX, iX)
+
+; from https://github.com/llvm/llvm-project/issues/62703
+
+define iX @testLeft(iX noundef %0, iX noundef %1) {
+; I32-LABEL: testLeft:
+; I32:         .functype testLeft (i32, i32) -> (i32)
+; I32-NEXT:  # %bb.0:
+; I32-NEXT:    local.get 0
+; I32-NEXT:    local.get 1
+; I32-NEXT:    i32.rotl
+; I32-NEXT:    # fallthrough-return
+;
+; I64-LABEL: testLeft:
+; I64:         .functype testLeft (i64, i64) -> (i64)
+; I64-NEXT:  # %bb.0:
+; I64-NEXT:    local.get 0
+; I64-NEXT:    local.get 1
+; I64-NEXT:    i64.rotl
+; I64-NEXT:    # fallthrough-return
+  %3 = call iX @llvm.fshl.iX(iX %0, iX %0, iX %1)
+  ret iX %3
+}
+
+define iX @testRight(iX noundef %0, iX noundef %1) {
+; I32-LABEL: testRight:
+; I32:         .functype testRight (i32, i32) -> (i32)
+; I32-NEXT:  # %bb.0:
+; I32-NEXT:    local.get 0
+; I32-NEXT:    local.get 1
+; I32-NEXT:    i32.rotr
+; I32-NEXT:    # fallthrough-return
+;
+; I64-LABEL: testRight:
+; I64:         .functype testRight (i64, i64) -> (i64)
+; I64-NEXT:  # %bb.0:
+; I64-NEXT:    local.get 0
+; I64-NEXT:    local.get 1
+; I64-NEXT:    i64.rotr
+; I64-NEXT:    # fallthrough-return
+  %3 = call iX @llvm.fshr.iX(iX %0, iX %0, iX %1)
+  ret iX %3
+}
Index: llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
===================================================================
--- llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -912,9 +912,24 @@
 
         APInt DemandedMaskLHS(DemandedMask.lshr(ShiftAmt));
         APInt DemandedMaskRHS(DemandedMask.shl(BitWidth - ShiftAmt));
-        if (SimplifyDemandedBits(I, 0, DemandedMaskLHS, LHSKnown, Depth + 1) ||
-            SimplifyDemandedBits(I, 1, DemandedMaskRHS, RHSKnown, Depth + 1))
-          return I;
+        if (I->getOperand(0) != I->getOperand(1)) {
+          if (SimplifyDemandedBits(I, 0, DemandedMaskLHS, LHSKnown, Depth + 1) ||
+              SimplifyDemandedBits(I, 1, DemandedMaskRHS, RHSKnown, Depth + 1))
+            return I;
+        } else { // Operands are equal: this funnel shift is a rotate.
+          LHSKnown = computeKnownBits(I->getOperand(0), Depth + 1, I);
+          RHSKnown = computeKnownBits(I->getOperand(1), Depth + 1, I);
+
+          // Although this is a rotate, there are cases where we still want to
+          // simplify it. If the demanded bits of either operand are already
+          // known, proceed with the simplification; otherwise leave the rotate
+          // alone so that it is not split up into a funnel shift.
+          if ((DemandedMaskLHS.isSubsetOf(LHSKnown.Zero | LHSKnown.One) ||
+               DemandedMaskRHS.isSubsetOf(RHSKnown.Zero | RHSKnown.One)) &&
+              (SimplifyDemandedBits(I, 0, DemandedMaskLHS, LHSKnown, Depth + 1) ||
+               SimplifyDemandedBits(I, 1, DemandedMaskRHS, RHSKnown, Depth + 1)))
+            return I;
+        }
 
         Known.Zero = LHSKnown.Zero.shl(ShiftAmt) |
                      RHSKnown.Zero.lshr(BitWidth - ShiftAmt);
Index: clang/test/CodeGen/WebAssembly/wasm-rotate.c
===================================================================
--- /dev/null
+++ clang/test/CodeGen/WebAssembly/wasm-rotate.c
@@ -0,0 +1,53 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
+// RUN: %clang_cc1 -triple wasm32-unknown-unknown -o - -emit-llvm %s | FileCheck --check-prefix=WEBASSEMBLY32 %s
+// RUN: %clang_cc1 -triple wasm64-unknown-unknown -o - -emit-llvm %s | FileCheck --check-prefix=WEBASSEMBLY64 %s
+
+// WEBASSEMBLY32-LABEL: define i32 @test32
+// WEBASSEMBLY32-SAME: (i32 noundef [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+// WEBASSEMBLY32-NEXT:  entry:
+// WEBASSEMBLY32-NEXT:    [[X_ADDR:%.*]] = alloca i32, align 4
+// WEBASSEMBLY32-NEXT:    store i32 [[X]], ptr [[X_ADDR]], align 4
+// WEBASSEMBLY32-NEXT:    [[TMP0:%.*]] = load i32, ptr [[X_ADDR]], align 4
+// WEBASSEMBLY32-NEXT:    [[AND:%.*]] = and i32 [[TMP0]], -16711936
+// WEBASSEMBLY32-NEXT:    [[TMP1:%.*]] = call i32 @llvm.fshl.i32(i32 [[AND]], i32 [[AND]], i32 8)
+// WEBASSEMBLY32-NEXT:    ret i32 [[TMP1]]
+//
+// WEBASSEMBLY64-LABEL: define i32 @test32
+// WEBASSEMBLY64-SAME: (i32 noundef [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+// WEBASSEMBLY64-NEXT:  entry:
+// WEBASSEMBLY64-NEXT:    [[X_ADDR:%.*]] = alloca i32, align 4
+// WEBASSEMBLY64-NEXT:    store i32 [[X]], ptr [[X_ADDR]], align 4
+// WEBASSEMBLY64-NEXT:    [[TMP0:%.*]] = load i32, ptr [[X_ADDR]], align 4
+// WEBASSEMBLY64-NEXT:    [[AND:%.*]] = and i32 [[TMP0]], -16711936
+// WEBASSEMBLY64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.fshl.i32(i32 [[AND]], i32 [[AND]], i32 8)
+// WEBASSEMBLY64-NEXT:    ret i32 [[TMP1]]
+//
+unsigned int test32(unsigned int x) {
+  return __builtin_rotateleft32((x & 0xFF00FF00), 8);
+}
+
+// WEBASSEMBLY32-LABEL: define i32 @test64
+// WEBASSEMBLY32-SAME: (i32 noundef [[X:%.*]]) #[[ATTR0]] {
+// WEBASSEMBLY32-NEXT:  entry:
+// WEBASSEMBLY32-NEXT:    [[X_ADDR:%.*]] = alloca i32, align 4
+// WEBASSEMBLY32-NEXT:    store i32 [[X]], ptr [[X_ADDR]], align 4
+// WEBASSEMBLY32-NEXT:    [[TMP0:%.*]] = load i32, ptr [[X_ADDR]], align 4
+// WEBASSEMBLY32-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
+// WEBASSEMBLY32-NEXT:    [[AND:%.*]] = and i64 [[CONV]], -71777214294589696
+// WEBASSEMBLY32-NEXT:    [[TMP1:%.*]] = call i64 @llvm.fshl.i64(i64 [[AND]], i64 [[AND]], i64 8)
+// WEBASSEMBLY32-NEXT:    [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// WEBASSEMBLY32-NEXT:    ret i32 [[CONV1]]
+//
+// WEBASSEMBLY64-LABEL: define i64 @test64
+// WEBASSEMBLY64-SAME: (i64 noundef [[X:%.*]]) #[[ATTR0]] {
+// WEBASSEMBLY64-NEXT:  entry:
+// WEBASSEMBLY64-NEXT:    [[X_ADDR:%.*]] = alloca i64, align 8
+// WEBASSEMBLY64-NEXT:    store i64 [[X]], ptr [[X_ADDR]], align 8
+// WEBASSEMBLY64-NEXT:    [[TMP0:%.*]] = load i64, ptr [[X_ADDR]], align 8
+// WEBASSEMBLY64-NEXT:    [[AND:%.*]] = and i64 [[TMP0]], -71777214294589696
+// WEBASSEMBLY64-NEXT:    [[TMP1:%.*]] = call i64 @llvm.fshl.i64(i64 [[AND]], i64 [[AND]], i64 8)
+// WEBASSEMBLY64-NEXT:    ret i64 [[TMP1]]
+//
+unsigned long test64(unsigned long x) {
+  return __builtin_rotateleft64((x & 0xFF00FF00FF00FF00L), 8);
+}