| Issue |
174066
|
| Summary |
x86-lower-amx-type pass can lead to "Instruction does not dominate all uses!"
|
| Labels |
new issue
|
| Assignees |
|
| Reporter |
thurstond
|
## Reproducer
```
int typedef perkins __attribute__((__vector_size__(1024)));
short b, c, d;
perkins e, f;
void __attribute__((__target__("amx-int8"))) g(perkins h) {
__builtin_ia32_tdpbssd_internal(b, c, d, e, f, h);
}
```
## Output
https://godbolt.org/z/fGPn6qf8j
`clang -O1` fails at trunk with:
```
Instruction does not dominate all uses!
%4 = load i16, ptr @c, align 2, !dbg !42, !tbaa !40
%2 = sext i16 %4 to i64
in function _Z1gDv256_i
fatal error: error in backend: Broken function found, compilation aborted!
PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace, preprocessed source, and associated run script.
```
and with clang 15.0.0:
```
Instruction does not dominate all uses!
%5 = udiv i16 %4, 4, !dbg !39
%1 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %5, i16 %3, ptr %0, i64 64)
Instruction does not dominate all uses!
%3 = load i16, ptr @c, align 2, !dbg !38, !tbaa !34
%1 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %5, i16 %3, ptr %0, i64 64)
...
```
## Isolating the buggy pass
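I compiled to IR with clang at trunk and then ran `llc -verify-each`. The IR still verifies after `x86-lower-amx-intrinsics`; the verifier first fails after `x86-lower-amx-type`, whose output places `%2 = sext i16 %4 to i64` near the top of the entry block, before the `load` that defines `%4`: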
```
*** IR Dump After Lower AMX intrinsics (x86-lower-amx-intrinsics) ***
; Function Attrs: mustprogress nounwind uwtable
define dso_local void @_Z1gDv256_i(ptr noundef readonly byval(<256 x i32>) align 1024 captures(none) %0) local_unnamed_addr #0 {
entry:
%h = load <256 x i32>, ptr %0, align 1024, !tbaa !9
%1 = load i16, ptr @b, align 2, !tbaa !10
%2 = load i16, ptr @c, align 2, !tbaa !10
%3 = load i16, ptr @d, align 2, !tbaa !10
%4 = load <256 x i32>, ptr @e, align 1024, !tbaa !9
%5 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %4)
%6 = load <256 x i32>, ptr @f, align 1024, !tbaa !9
%7 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %6)
%8 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %h)
%9 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %1, i16 %2, i16 %3, x86_amx %5, x86_amx %7, x86_amx %8)
ret void
}
*** IR Dump After Lower AMX type for load/store (x86-lower-amx-type) ***
; Function Attrs: mustprogress nounwind uwtable
define dso_local void @_Z1gDv256_i(ptr noundef readonly byval(<256 x i32>) align 1024 captures(none) %0) local_unnamed_addr #0 {
entry:
%1 = alloca <256 x i32>, align 64
%2 = sext i16 %4 to i64
%h = load <256 x i32>, ptr %0, align 1024, !tbaa !9
store <256 x i32> %h, ptr %1, align 1024
%3 = load i16, ptr @b, align 2, !tbaa !10
%4 = load i16, ptr @c, align 2, !tbaa !10
%5 = load i16, ptr @d, align 2, !tbaa !10
%6 = udiv i16 %5, 4
%7 = sext i16 %4 to i64
%8 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %3, i16 %4, ptr @e, i64 %7)
%9 = sext i16 %5 to i64
%10 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %3, i16 %5, ptr @f, i64 %9)
%11 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %6, i16 %4, ptr %1, i64 %2)
%12 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %3, i16 %4, i16 %5, x86_amx %8, x86_amx %10, x86_amx %11)
ret void
}
Instruction does not dominate all uses!
%4 = load i16, ptr @c, align 2, !tbaa !10
%2 = sext i16 %4 to i64
in function _Z1gDv256_i
LLVM ERROR: Broken function found, compilation aborted!
```
----
P.S. shameless plug: it was computationally expensive to get the reduced reproducer above, because bugpoint and llvm-reduce are incompatible with AMX (https://github.com/llvm/llvm-project/issues/166653); I ended up using creduce on a 13MB preprocessed C++ file.
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs