Issue 102911
Summary Clang 19 generates incorrect code for vec_splat of the lower 16 bits of the vec_sum_u128 result on Z14
Labels
Assignees
Reporter johnplatts
    Clang 19 generates incorrect code for the following C++ snippet when it is compiled for Z14 with the `--target=s390x-linux-gnu -O2 -march=z14 -mzvector -std=c++17` options:
```
#include <stdint.h>

#pragma push_macro("vector")
#pragma push_macro("pixel")
#pragma push_macro("bool")

#undef vector
#undef pixel
#undef bool

#include <vecintrin.h>

#pragma pop_macro("vector")
#pragma pop_macro("pixel")
#pragma pop_macro("bool")

typedef int16_t AlignedI16LoadStoreVec
 __attribute__((__vector_size__(16), __aligned__(16), __may_alias__));

extern "C" {
  void ZVectorSumOf8LanesVec(int16_t* dst);
}

// Reproducer: reduces the eight 16-bit lanes of a constant vector to a single
// sum and stores that sum, replicated into every 16-bit lane, to `dst`.
//
// dst: destination of one 16-byte vector store. It is passed through
//      __builtin_assume_aligned(dst, 16), so the caller must supply a
//      16-byte-aligned pointer.
void ZVectorSumOf8LanesVec(int16_t* dst) {
  // Distinct powers of two; the eight lanes add up to 255.
  constexpr __vector unsigned short kVecToSum =
    { 1, 2, 4, 8, 16, 32, 64, 128 };
  // All-zero vector, reinterpreted below as the second operand of both sums.
  const __vector signed int zero_vec = vec_splats(0);
  // First reduction step: 16-bit lanes -> two 64-bit partial sums.
  // NOTE(review): presumably one partial sum per group of four lanes, as the
  // variable name suggests -- confirm against the z/Vector reference.
  const __vector unsigned long long sums_of_4 =
    vec_sum2(kVecToSum, reinterpret_cast<__vector unsigned short>(zero_vec));
  // Second reduction step: the two 64-bit partial sums -> one 128-bit sum,
  // which is then viewed as eight signed 16-bit lanes with lane 7 splatted
  // to all lanes. Per this report, lane 7 holds the lower 16 bits of the
  // 128-bit result, and this splat is the operation that is miscompiled.
  const __vector signed short sums_of_8 =
 vec_splat(reinterpret_cast<__vector signed short>(vec_sum_u128(
 sums_of_4,
    reinterpret_cast<__vector unsigned long long>(zero_vec))),
    7);

  // Store through the may_alias, 16-byte-aligned vector typedef so the write
  // to int16_t storage is a single aligned vector store.
 *reinterpret_cast<AlignedI16LoadStoreVec*>(__builtin_assume_aligned(dst, 16)) =
 reinterpret_cast<AlignedI16LoadStoreVec>(sums_of_8);
}
```

The above code compiles correctly with GCC 12 with the `--target=s390x-linux-gnu -O2 -march=z14 -mzvector -std=c++17` options.

Here is the LLVM IR that is generated when the above C++ snippet is compiled with clang++-19:
```
; ModuleID = 's390x_sum_of_lanes_test_081224_3_sum_routine.cpp'
source_filename = "s390x_sum_of_lanes_test_081224_3_sum_routine.cpp"
target datalayout = "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64"
target triple = "s390x-unknown-linux-gnu"

; Function Attrs: mustprogress nofree nosync nounwind memory(argmem: write, inaccessiblemem: write)
define dso_local void @ZVectorSumOf8LanesVec(ptr noundef %0) local_unnamed_addr #0 {
  %2 = tail call noundef <2 x i64> @llvm.s390.vsumgh(<8 x i16> <i16 1, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128>, <8 x i16> zeroinitializer)
  %3 = tail call i128 @llvm.s390.vsumqg(<2 x i64> %2, <2 x i64> zeroinitializer)
  %4 = trunc i128 %3 to i8
  %5 = insertelement <16 x i8> poison, i8 %4, i64 0
  %6 = shufflevector <16 x i8> %5, <16 x i8> poison, <16 x i32> zeroinitializer
  call void @llvm.assume(i1 true) [ "align"(ptr %0, i64 16) ]
  store <16 x i8> %6, ptr %0, align 16, !tbaa !4
  ret void
}

; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write)
declare void @llvm.assume(i1 noundef) #1

; Function Attrs: nofree nosync nounwind memory(none)
declare <2 x i64> @llvm.s390.vsumgh(<8 x i16>, <8 x i16>) #2

; Function Attrs: nofree nosync nounwind memory(none)
declare i128 @llvm.s390.vsumqg(<2 x i64>, <2 x i64>) #2

attributes #0 = { mustprogress nofree nosync nounwind memory(argmem: write, inaccessiblemem: write) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="z14" "target-features"="+transactional-execution,+vector,+vector-enhancements-1" }
attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }
attributes #2 = { nofree nosync nounwind memory(none) }

!llvm.module.flags = !{!0, !1, !2}
!llvm.ident = !{!3}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 8, !"PIC Level", i32 2}
!2 = !{i32 7, !"PIE Level", i32 2}
!3 = !{!"Ubuntu clang version 19.1.0 (++20240810103829+866686180a31-1~exp1~20240810103957.18)"}
!4 = !{!5, !5, i64 0}
!5 = !{!"omnipotent char", !6, i64 0}
!6 = !{!"Simple C++ TBAA"}
```

Here is the LLVM IR that is expected to be generated for the above function:
```
; ModuleID = 's390x_sum_of_lanes_test_081224_3_sum_routine.cpp'
source_filename = "s390x_sum_of_lanes_test_081224_3_sum_routine.cpp"
target datalayout = "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64"
target triple = "s390x-unknown-linux-gnu"

; Function Attrs: mustprogress nofree nosync nounwind memory(argmem: write, inaccessiblemem: write)
define dso_local void @ZVectorSumOf8LanesVec(ptr noundef %0) local_unnamed_addr #0 {
  %2 = tail call noundef <2 x i64> @llvm.s390.vsumgh(<8 x i16> <i16 1, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128>, <8 x i16> zeroinitializer)
  %3 = tail call i128 @llvm.s390.vsumqg(<2 x i64> %2, <2 x i64> zeroinitializer)
  %4 = bitcast i128 %3 to <8 x i16>
  %5 = shufflevector <8 x i16> %4, <8 x i16> %4, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  call void @llvm.assume(i1 true) [ "align"(ptr %0, i64 16) ]
  store <8 x i16> %5, ptr %0, align 16, !tbaa !4
  ret void
}

; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write)
declare void @llvm.assume(i1 noundef) #1

; Function Attrs: nofree nosync nounwind memory(none)
declare <2 x i64> @llvm.s390.vsumgh(<8 x i16>, <8 x i16>) #2

; Function Attrs: nofree nosync nounwind memory(none)
declare i128 @llvm.s390.vsumqg(<2 x i64>, <2 x i64>) #2

attributes #0 = { mustprogress nofree nosync nounwind memory(argmem: write, inaccessiblemem: write) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="z14" "target-features"="+transactional-execution,+vector,+vector-enhancements-1" }
attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }
attributes #2 = { nofree nosync nounwind memory(none) }

!llvm.module.flags = !{!0, !1, !2}
!llvm.ident = !{!3}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 8, !"PIC Level", i32 2}
!2 = !{i32 7, !"PIE Level", i32 2}
!3 = !{!"Ubuntu clang version 19.1.0 (++20240810103829+866686180a31-1~exp1~20240810103957.18)"}
!4 = !{!5, !5, i64 0}
!5 = !{!"omnipotent char", !6, i64 0}
!6 = !{!"Simple C++ TBAA"}
```

_______________________________________________
llvm-bugs mailing list
llvm-bugs@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to