Issue |
102911
|
Summary |
Clang 19 generates incorrect code for vec_splat of the lower 16 bits of the vec_sum_u128 result on Z14
|
Labels |
|
Assignees |
|
Reporter |
johnplatts
|
Here is a snippet of C++ code that Clang 19 miscompiles when targeting Z14 with the `--target=s390x-linux-gnu -O2 -march=z14 -mzvector -std=c++17` options:
```
#include <stdint.h>
#pragma push_macro("vector")
#pragma push_macro("pixel")
#pragma push_macro("bool")
#undef vector
#undef pixel
#undef bool
#include <vecintrin.h>
#pragma pop_macro("vector")
#pragma pop_macro("pixel")
#pragma pop_macro("bool")
// 16-byte vector of int16_t lanes (8 lanes), 16-byte aligned and marked
// __may_alias__; used below for the final store through dst.
typedef int16_t AlignedI16LoadStoreVec
__attribute__((__vector_size__(16), __aligned__(16), __may_alias__));
// Declared with C linkage so the function keeps the unmangled symbol name
// ZVectorSumOf8LanesVec.
extern "C" {
void ZVectorSumOf8LanesVec(int16_t* dst);
}
// Reproducer: reduces the eight 16-bit lanes of a constant vector with
// vec_sum2 followed by vec_sum_u128, then broadcasts halfword element 7 of the
// 128-bit result (its low 16 bits, per the report) to all eight lanes and
// stores that vector to dst.
//
// dst: destination for 8 x int16_t; it is passed through
//      __builtin_assume_aligned(dst, 16), so it must be 16-byte aligned.
void ZVectorSumOf8LanesVec(int16_t* dst) {
// Input lanes are distinct powers of two; their total is 255.
constexpr __vector unsigned short kVecToSum =
{ 1, 2, 4, 8, 16, 32, 64, 128 };
// All-zero vector, reinterpreted below as the second operand of both sums.
const __vector signed int zero_vec = vec_splats(0);
// Step 1: vec_sum2 on the halfword input yields two 64-bit partial sums.
const __vector unsigned long long sums_of_4 =
vec_sum2(kVecToSum, reinterpret_cast<__vector unsigned short>(zero_vec));
// Step 2: vec_sum_u128 combines the partial sums into one 128-bit value, which
// is reinterpreted as eight signed halfwords; vec_splat(..., 7) then replicates
// element 7. This is the expression Clang 19 is reported to miscompile.
const __vector signed short sums_of_8 =
vec_splat(reinterpret_cast<__vector signed short>(vec_sum_u128(
sums_of_4,
reinterpret_cast<__vector unsigned long long>(zero_vec))),
7);
// Store all eight lanes via the aligned, may-alias vector type.
*reinterpret_cast<AlignedI16LoadStoreVec*>(__builtin_assume_aligned(dst, 16)) =
reinterpret_cast<AlignedI16LoadStoreVec>(sums_of_8);
}
```
The above code compiles correctly with GCC 12 with the `--target=s390x-linux-gnu -O2 -march=z14 -mzvector -std=c++17` options.
Here is the LLVM IR that is generated when the above C++ snippet is compiled with clang++-19:
```
; ModuleID = 's390x_sum_of_lanes_test_081224_3_sum_routine.cpp'
source_filename = "s390x_sum_of_lanes_test_081224_3_sum_routine.cpp"
target datalayout = "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64"
target triple = "s390x-unknown-linux-gnu"
; Function Attrs: mustprogress nofree nosync nounwind memory(argmem: write, inaccessiblemem: write)
define dso_local void @ZVectorSumOf8LanesVec(ptr noundef %0) local_unnamed_addr #0 {
%2 = tail call noundef <2 x i64> @llvm.s390.vsumgh(<8 x i16> <i16 1, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128>, <8 x i16> zeroinitializer)
%3 = tail call i128 @llvm.s390.vsumqg(<2 x i64> %2, <2 x i64> zeroinitializer)
%4 = trunc i128 %3 to i8
%5 = insertelement <16 x i8> poison, i8 %4, i64 0
%6 = shufflevector <16 x i8> %5, <16 x i8> poison, <16 x i32> zeroinitializer
call void @llvm.assume(i1 true) [ "align"(ptr %0, i64 16) ]
store <16 x i8> %6, ptr %0, align 16, !tbaa !4
ret void
}
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write)
declare void @llvm.assume(i1 noundef) #1
; Function Attrs: nofree nosync nounwind memory(none)
declare <2 x i64> @llvm.s390.vsumgh(<8 x i16>, <8 x i16>) #2
; Function Attrs: nofree nosync nounwind memory(none)
declare i128 @llvm.s390.vsumqg(<2 x i64>, <2 x i64>) #2
attributes #0 = { mustprogress nofree nosync nounwind memory(argmem: write, inaccessiblemem: write) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="z14" "target-features"="+transactional-execution,+vector,+vector-enhancements-1" }
attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }
attributes #2 = { nofree nosync nounwind memory(none) }
!llvm.module.flags = !{!0, !1, !2}
!llvm.ident = !{!3}
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 8, !"PIC Level", i32 2}
!2 = !{i32 7, !"PIE Level", i32 2}
!3 = !{!"Ubuntu clang version 19.1.0 (++20240810103829+866686180a31-1~exp1~20240810103957.18)"}
!4 = !{!5, !5, i64 0}
!5 = !{!"omnipotent char", !6, i64 0}
!6 = !{!"Simple C++ TBAA"}
```
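The generated IR above truncates the 128-bit `vsumqg` result to `i8` and splats that single byte across a `<16 x i8>` vector, so only the low 8 bits of the sum are replicated instead of the low 16 bits (halfword element 7). Here is the expected LLVM IR code that should be generated for the above function: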
```
; ModuleID = 's390x_sum_of_lanes_test_081224_3_sum_routine.cpp'
source_filename = "s390x_sum_of_lanes_test_081224_3_sum_routine.cpp"
target datalayout = "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64"
target triple = "s390x-unknown-linux-gnu"
; Function Attrs: mustprogress nofree nosync nounwind memory(argmem: write, inaccessiblemem: write)
define dso_local void @ZVectorSumOf8LanesVec(ptr noundef %0) local_unnamed_addr #0 {
%2 = tail call noundef <2 x i64> @llvm.s390.vsumgh(<8 x i16> <i16 1, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128>, <8 x i16> zeroinitializer)
%3 = tail call i128 @llvm.s390.vsumqg(<2 x i64> %2, <2 x i64> zeroinitializer)
%4 = bitcast i128 %3 to <8 x i16>
%5 = shufflevector <8 x i16> %4, <8 x i16> %4, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
call void @llvm.assume(i1 true) [ "align"(ptr %0, i64 16) ]
store <8 x i16> %5, ptr %0, align 16, !tbaa !4
ret void
}
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write)
declare void @llvm.assume(i1 noundef) #1
; Function Attrs: nofree nosync nounwind memory(none)
declare <2 x i64> @llvm.s390.vsumgh(<8 x i16>, <8 x i16>) #2
; Function Attrs: nofree nosync nounwind memory(none)
declare i128 @llvm.s390.vsumqg(<2 x i64>, <2 x i64>) #2
attributes #0 = { mustprogress nofree nosync nounwind memory(argmem: write, inaccessiblemem: write) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="z14" "target-features"="+transactional-execution,+vector,+vector-enhancements-1" }
attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }
attributes #2 = { nofree nosync nounwind memory(none) }
!llvm.module.flags = !{!0, !1, !2}
!llvm.ident = !{!3}
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 8, !"PIC Level", i32 2}
!2 = !{i32 7, !"PIE Level", i32 2}
!3 = !{!"Ubuntu clang version 19.1.0 (++20240810103829+866686180a31-1~exp1~20240810103957.18)"}
!4 = !{!5, !5, i64 0}
!5 = !{!"omnipotent char", !6, i64 0}
!6 = !{!"Simple C++ TBAA"}
```
_______________________________________________
llvm-bugs mailing list
llvm-bugs@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs