Issue 174066
Summary x86-lower-amx-type pass can lead to "Instruction does not dominate all uses!"
Labels new issue
Assignees
Reporter thurstond
## Reproducer

```
int typedef perkins __attribute__((__vector_size__(1024)));
short b, c, d;
perkins e, f;
void __attribute__((__target__("amx-int8"))) g(perkins h) {
 __builtin_ia32_tdpbssd_internal(b, c, d, e, f, h);
}
```

## Output

https://godbolt.org/z/fGPn6qf8j

`clang -O1` fails at trunk with:

```
Instruction does not dominate all uses!
  %4 = load i16, ptr @c, align 2, !dbg !42, !tbaa !40
  %2 = sext i16 %4 to i64
in function _Z1gDv256_i
fatal error: error in backend: Broken function found, compilation aborted!
PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace, preprocessed source, and associated run script.
```
and with clang 15.0.0:
```
Instruction does not dominate all uses!
  %5 = udiv i16 %4, 4, !dbg !39
  %1 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %5, i16 %3, ptr %0, i64 64)
Instruction does not dominate all uses!
  %3 = load i16, ptr @c, align 2, !dbg !38, !tbaa !34
  %1 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %5, i16 %3, ptr %0, i64 64)
...
```

## Isolating the buggy pass

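I compiled to IR with clang at trunk and then ran `llc -verify-each`. The IR is still valid after `x86-lower-amx-intrinsics` and first becomes invalid after `x86-lower-amx-type`, which inserts `%2 = sext i16 %4 to i64` at the top of the entry block, before the load that defines `%4`: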

```
*** IR Dump After Lower AMX intrinsics (x86-lower-amx-intrinsics) ***
; Function Attrs: mustprogress nounwind uwtable
define dso_local void @_Z1gDv256_i(ptr noundef readonly byval(<256 x i32>) align 1024 captures(none) %0) local_unnamed_addr #0 {
entry:
  %h = load <256 x i32>, ptr %0, align 1024, !tbaa !9
  %1 = load i16, ptr @b, align 2, !tbaa !10
  %2 = load i16, ptr @c, align 2, !tbaa !10
  %3 = load i16, ptr @d, align 2, !tbaa !10
  %4 = load <256 x i32>, ptr @e, align 1024, !tbaa !9
  %5 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %4)
  %6 = load <256 x i32>, ptr @f, align 1024, !tbaa !9
  %7 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %6)
  %8 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %h)
  %9 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %1, i16 %2, i16 %3, x86_amx %5, x86_amx %7, x86_amx %8)
  ret void
}
*** IR Dump After Lower AMX type for load/store (x86-lower-amx-type) ***
; Function Attrs: mustprogress nounwind uwtable
define dso_local void @_Z1gDv256_i(ptr noundef readonly byval(<256 x i32>) align 1024 captures(none) %0) local_unnamed_addr #0 {
entry:
  %1 = alloca <256 x i32>, align 64
  %2 = sext i16 %4 to i64
  %h = load <256 x i32>, ptr %0, align 1024, !tbaa !9
  store <256 x i32> %h, ptr %1, align 1024
  %3 = load i16, ptr @b, align 2, !tbaa !10
  %4 = load i16, ptr @c, align 2, !tbaa !10
  %5 = load i16, ptr @d, align 2, !tbaa !10
  %6 = udiv i16 %5, 4
  %7 = sext i16 %4 to i64
  %8 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %3, i16 %4, ptr @e, i64 %7)
  %9 = sext i16 %5 to i64
  %10 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %3, i16 %5, ptr @f, i64 %9)
  %11 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %6, i16 %4, ptr %1, i64 %2)
  %12 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %3, i16 %4, i16 %5, x86_amx %8, x86_amx %10, x86_amx %11)
  ret void
}
Instruction does not dominate all uses!
  %4 = load i16, ptr @c, align 2, !tbaa !10
  %2 = sext i16 %4 to i64
in function _Z1gDv256_i
LLVM ERROR: Broken function found, compilation aborted!
```

----

P.S. shameless plug: it was computationally expensive to get the reduced reproducer above, because bugpoint and llvm-reduce are incompatible with AMX (https://github.com/llvm/llvm-project/issues/166653); I ended up using creduce on a 13MB preprocessed C++ file.

_______________________________________________
llvm-bugs mailing list
llvm-bugs@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to