tlively created this revision. tlively added reviewers: aheejin, dschuff. Herald added subscribers: wingo, ecnelises, sunfish, hiraditya, jgravelle-google, sbc100. tlively requested review of this revision. Herald added projects: clang, LLVM. Herald added subscribers: llvm-commits, cfe-commits.
Partially reverts 85157c007903 <https://reviews.llvm.org/rG85157c0079031b51c0446b222894aec4aad71b53>, which had removed these builtins and intrinsics in favor of normal codegen patterns. It turns out, however, that the patterns can end up split across multiple basic blocks; because DAG ISel operates on one basic block at a time, it is then unable to select them to the pmin/pmax instructions. To make sure the SIMD intrinsics generate the correct instructions in these cases, reintroduce the clang builtins and corresponding LLVM intrinsics, while keeping the normal pattern matching in place. Repository: rG LLVM Github Monorepo https://reviews.llvm.org/D108387 Files: clang/include/clang/Basic/BuiltinsWebAssembly.def clang/lib/CodeGen/CGBuiltin.cpp clang/lib/Headers/wasm_simd128.h clang/test/CodeGen/builtins-wasm.c clang/test/Headers/wasm.c llvm/include/llvm/IR/IntrinsicsWebAssembly.td llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll
Index: llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll =================================================================== --- llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll +++ llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll @@ -540,6 +540,26 @@ ret <4 x float> %a } +; CHECK-LABEL: pmin_v4f32: +; CHECK-NEXT: .functype pmin_v4f32 (v128, v128) -> (v128){{$}} +; CHECK-NEXT: f32x4.pmin $push[[R:[0-9]+]]=, $0, $1{{$}} +; CHECK-NEXT: return $pop[[R]]{{$}} +declare <4 x float> @llvm.wasm.pmin.v4f32(<4 x float>, <4 x float>) +define <4 x float> @pmin_v4f32(<4 x float> %a, <4 x float> %b) { + %v = call <4 x float> @llvm.wasm.pmin.v4f32(<4 x float> %a, <4 x float> %b) + ret <4 x float> %v +} + +; CHECK-LABEL: pmax_v4f32: +; CHECK-NEXT: .functype pmax_v4f32 (v128, v128) -> (v128){{$}} +; CHECK-NEXT: f32x4.pmax $push[[R:[0-9]+]]=, $0, $1{{$}} +; CHECK-NEXT: return $pop[[R]]{{$}} +declare <4 x float> @llvm.wasm.pmax.v4f32(<4 x float>, <4 x float>) +define <4 x float> @pmax_v4f32(<4 x float> %a, <4 x float> %b) { + %v = call <4 x float> @llvm.wasm.pmax.v4f32(<4 x float> %a, <4 x float> %b) + ret <4 x float> %v +} + ; CHECK-LABEL: ceil_v4f32: ; CHECK-NEXT: .functype ceil_v4f32 (v128) -> (v128){{$}} ; CHECK-NEXT: f32x4.ceil $push[[R:[0-9]+]]=, $0{{$}} @@ -595,6 +615,26 @@ ret <2 x double> %a } +; CHECK-LABEL: pmin_v2f64: +; CHECK-NEXT: .functype pmin_v2f64 (v128, v128) -> (v128){{$}} +; CHECK-NEXT: f64x2.pmin $push[[R:[0-9]+]]=, $0, $1{{$}} +; CHECK-NEXT: return $pop[[R]]{{$}} +declare <2 x double> @llvm.wasm.pmin.v2f64(<2 x double>, <2 x double>) +define <2 x double> @pmin_v2f64(<2 x double> %a, <2 x double> %b) { + %v = call <2 x double> @llvm.wasm.pmin.v2f64(<2 x double> %a, <2 x double> %b) + ret <2 x double> %v +} + +; CHECK-LABEL: pmax_v2f64: +; CHECK-NEXT: .functype pmax_v2f64 (v128, v128) -> (v128){{$}} +; CHECK-NEXT: f64x2.pmax $push[[R:[0-9]+]]=, $0, $1{{$}} +; CHECK-NEXT: return $pop[[R]]{{$}} +declare <2 x double> @llvm.wasm.pmax.v2f64(<2 x double>, <2 x double>) 
+define <2 x double> @pmax_v2f64(<2 x double> %a, <2 x double> %b) { + %v = call <2 x double> @llvm.wasm.pmax.v2f64(<2 x double> %a, <2 x double> %b) + ret <2 x double> %v +} + ; CHECK-LABEL: ceil_v2f64: ; CHECK-NEXT: .functype ceil_v2f64 (v128) -> (v128){{$}} ; CHECK-NEXT: f64x2.ceil $push[[R:[0-9]+]]=, $0{{$}} Index: llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td =================================================================== --- llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -1165,6 +1165,16 @@ (pmax $lhs, $rhs)>; } +// And match the pmin/pmax LLVM intrinsics as well +def : Pat<(v4f32 (int_wasm_pmin (v4f32 V128:$lhs), (v4f32 V128:$rhs))), + (PMIN_F32x4 V128:$lhs, V128:$rhs)>; +def : Pat<(v4f32 (int_wasm_pmax (v4f32 V128:$lhs), (v4f32 V128:$rhs))), + (PMAX_F32x4 V128:$lhs, V128:$rhs)>; +def : Pat<(v2f64 (int_wasm_pmin (v2f64 V128:$lhs), (v2f64 V128:$rhs))), + (PMIN_F64x2 V128:$lhs, V128:$rhs)>; +def : Pat<(v2f64 (int_wasm_pmax (v2f64 V128:$lhs), (v2f64 V128:$rhs))), + (PMAX_F64x2 V128:$lhs, V128:$rhs)>; + //===----------------------------------------------------------------------===// // Conversions //===----------------------------------------------------------------------===// Index: llvm/include/llvm/IR/IntrinsicsWebAssembly.td =================================================================== --- llvm/include/llvm/IR/IntrinsicsWebAssembly.td +++ llvm/include/llvm/IR/IntrinsicsWebAssembly.td @@ -164,6 +164,15 @@ [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem, IntrSpeculatable]>; +def int_wasm_pmin : + Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem, IntrSpeculatable]>; +def int_wasm_pmax : + Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem, IntrSpeculatable]>; + def int_wasm_extadd_pairwise_signed : Intrinsic<[llvm_anyvector_ty], [LLVMSubdivide2VectorType<0>], Index: clang/test/Headers/wasm.c 
=================================================================== --- clang/test/Headers/wasm.c +++ clang/test/Headers/wasm.c @@ -2191,11 +2191,11 @@ // CHECK-LABEL: @test_f32x4_pmin( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B:%.*]] to <4 x float> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> -// CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt <4 x float> [[TMP0]], [[TMP1]] -// CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[CMP_I]], <4 x i32> [[B]], <4 x i32> [[A]] -// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.wasm.pmin.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) #[[ATTR6]] +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP3]] // v128_t test_f32x4_pmin(v128_t a, v128_t b) { return wasm_f32x4_pmin(a, b); @@ -2205,9 +2205,9 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <4 x float> -// CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt <4 x float> [[TMP0]], [[TMP1]] -// CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[CMP_I]], <4 x i32> [[B]], <4 x i32> [[A]] -// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.wasm.pmax.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) #[[ATTR6]] +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP3]] // v128_t test_f32x4_pmax(v128_t a, v128_t b) { return wasm_f32x4_pmax(a, b); @@ -2364,10 +2364,9 @@ // CHECK-LABEL: @test_f64x2_pmin( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x double> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> -// CHECK-NEXT: [[CMP_I:%.*]] = 
fcmp olt <2 x double> [[TMP0]], [[TMP1]] -// CHECK-NEXT: [[TMP2:%.*]] = select <2 x i1> [[CMP_I]], <2 x double> [[TMP0]], <2 x double> [[TMP1]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.wasm.pmin.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) #[[ATTR6]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -2379,8 +2378,7 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x double> -// CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt <2 x double> [[TMP0]], [[TMP1]] -// CHECK-NEXT: [[TMP2:%.*]] = select <2 x i1> [[CMP_I]], <2 x double> [[TMP1]], <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.wasm.pmax.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) #[[ATTR6]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // Index: clang/test/CodeGen/builtins-wasm.c =================================================================== --- clang/test/CodeGen/builtins-wasm.c +++ clang/test/CodeGen/builtins-wasm.c @@ -506,6 +506,20 @@ // WEBASSEMBLY-NEXT: ret } +f32x4 pmin_f32x4(f32x4 x, f32x4 y) { + return __builtin_wasm_pmin_f32x4(x, y); + // WEBASSEMBLY: call <4 x float> @llvm.wasm.pmin.v4f32( + // WEBASSEMBLY-SAME: <4 x float> %x, <4 x float> %y) + // WEBASSEMBLY-NEXT: ret +} + +f32x4 pmax_f32x4(f32x4 x, f32x4 y) { + return __builtin_wasm_pmax_f32x4(x, y); + // WEBASSEMBLY: call <4 x float> @llvm.wasm.pmax.v4f32( + // WEBASSEMBLY-SAME: <4 x float> %x, <4 x float> %y) + // WEBASSEMBLY-NEXT: ret +} + f64x2 min_f64x2(f64x2 x, f64x2 y) { return __builtin_wasm_min_f64x2(x, y); // WEBASSEMBLY: call <2 x double> @llvm.minimum.v2f64( @@ -520,6 +534,20 @@ // 
WEBASSEMBLY-NEXT: ret } +f64x2 pmin_f64x2(f64x2 x, f64x2 y) { + return __builtin_wasm_pmin_f64x2(x, y); + // WEBASSEMBLY: call <2 x double> @llvm.wasm.pmin.v2f64( + // WEBASSEMBLY-SAME: <2 x double> %x, <2 x double> %y) + // WEBASSEMBLY-NEXT: ret +} + +f64x2 pmax_f64x2(f64x2 x, f64x2 y) { + return __builtin_wasm_pmax_f64x2(x, y); + // WEBASSEMBLY: call <2 x double> @llvm.wasm.pmax.v2f64( + // WEBASSEMBLY-SAME: <2 x double> %x, <2 x double> %y) + // WEBASSEMBLY-NEXT: ret +} + f32x4 ceil_f32x4(f32x4 x) { return __builtin_wasm_ceil_f32x4(x); // WEBASSEMBLY: call <4 x float> @llvm.ceil.v4f32(<4 x float> %x) Index: clang/lib/Headers/wasm_simd128.h =================================================================== --- clang/lib/Headers/wasm_simd128.h +++ clang/lib/Headers/wasm_simd128.h @@ -1150,14 +1150,12 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f32x4_pmin(v128_t __a, v128_t __b) { - __i32x4 __mask = (__i32x4)((__f32x4)__b < (__f32x4)__a); - return (v128_t)((((__i32x4)__b) & __mask) | (((__i32x4)__a) & ~__mask)); + return (v128_t)__builtin_wasm_pmin_f32x4((__f32x4)__a, (__f32x4)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f32x4_pmax(v128_t __a, v128_t __b) { - __i32x4 __mask = (__i32x4)((__f32x4)__a < (__f32x4)__b); - return (v128_t)((((__i32x4)__b) & __mask) | (((__i32x4)__a) & ~__mask)); + return (v128_t)__builtin_wasm_pmax_f32x4((__f32x4)__a, (__f32x4)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f64x2_abs(v128_t __a) { @@ -1220,14 +1218,12 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f64x2_pmin(v128_t __a, v128_t __b) { - __i64x2 __mask = (__i64x2)((__f64x2)__b < (__f64x2)__a); - return (v128_t)((((__i64x2)__b) & __mask) | (((__i64x2)__a) & ~__mask)); + return (v128_t)__builtin_wasm_pmin_f64x2((__f64x2)__a, (__f64x2)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f64x2_pmax(v128_t __a, v128_t __b) { - __i64x2 __mask = (__i64x2)((__f64x2)__a < (__f64x2)__b); - return (v128_t)((((__i64x2)__b) & __mask) | 
(((__i64x2)__a) & ~__mask)); + return (v128_t)__builtin_wasm_pmax_f64x2((__f64x2)__a, (__f64x2)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS Index: clang/lib/CodeGen/CGBuiltin.cpp =================================================================== --- clang/lib/CodeGen/CGBuiltin.cpp +++ clang/lib/CodeGen/CGBuiltin.cpp @@ -17799,6 +17799,22 @@ CGM.getIntrinsic(Intrinsic::maximum, ConvertType(E->getType())); return Builder.CreateCall(Callee, {LHS, RHS}); } + case WebAssembly::BI__builtin_wasm_pmin_f32x4: + case WebAssembly::BI__builtin_wasm_pmin_f64x2: { + Value *LHS = EmitScalarExpr(E->getArg(0)); + Value *RHS = EmitScalarExpr(E->getArg(1)); + Function *Callee = + CGM.getIntrinsic(Intrinsic::wasm_pmin, ConvertType(E->getType())); + return Builder.CreateCall(Callee, {LHS, RHS}); + } + case WebAssembly::BI__builtin_wasm_pmax_f32x4: + case WebAssembly::BI__builtin_wasm_pmax_f64x2: { + Value *LHS = EmitScalarExpr(E->getArg(0)); + Value *RHS = EmitScalarExpr(E->getArg(1)); + Function *Callee = + CGM.getIntrinsic(Intrinsic::wasm_pmax, ConvertType(E->getType())); + return Builder.CreateCall(Callee, {LHS, RHS}); + } case WebAssembly::BI__builtin_wasm_ceil_f32x4: case WebAssembly::BI__builtin_wasm_floor_f32x4: case WebAssembly::BI__builtin_wasm_trunc_f32x4: Index: clang/include/clang/Basic/BuiltinsWebAssembly.def =================================================================== --- clang/include/clang/Basic/BuiltinsWebAssembly.def +++ clang/include/clang/Basic/BuiltinsWebAssembly.def @@ -129,8 +129,12 @@ TARGET_BUILTIN(__builtin_wasm_min_f32x4, "V4fV4fV4f", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_max_f32x4, "V4fV4fV4f", "nc", "simd128") +TARGET_BUILTIN(__builtin_wasm_pmin_f32x4, "V4fV4fV4f", "nc", "simd128") +TARGET_BUILTIN(__builtin_wasm_pmax_f32x4, "V4fV4fV4f", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_min_f64x2, "V2dV2dV2d", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_max_f64x2, "V2dV2dV2d", "nc", "simd128") 
+TARGET_BUILTIN(__builtin_wasm_pmin_f64x2, "V2dV2dV2d", "nc", "simd128") +TARGET_BUILTIN(__builtin_wasm_pmax_f64x2, "V2dV2dV2d", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_ceil_f32x4, "V4fV4f", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_floor_f32x4, "V4fV4f", "nc", "simd128")
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits