https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95790

--- Comment #8 from Yichao Yu <yyc1992 at gmail dot com> ---
And the reason I reported this as a mis-optimization, rather than something
completely unsupported, is the behavior of the following code.

```
#include <stdio.h>

// #define disable_opt __attribute__((flatten))
#define disable_opt

disable_opt __attribute__ ((target ("default")))
static unsigned foo(const char *buf, unsigned size) {
  return 1;
}

disable_opt __attribute__ ((target ("avx")))
static unsigned foo(const char *buf, unsigned size) {
  return 2;
}

disable_opt __attribute__ ((target ("avx2")))
static unsigned foo(const char *buf, unsigned size) {
  return 3;
}

__attribute__ ((target ("default")))
unsigned bar() {
  char buf[4096];
  unsigned acc = 0;
  for (int i = 0; i < sizeof(buf); i++) {
    acc += foo(&buf[i], 1);
  }
  return acc;
}

__attribute__ ((target ("avx")))
unsigned bar() {
  char buf[4096];
  unsigned acc = 0;
  for (int i = 0; i < sizeof(buf); i++) {
    acc += foo(&buf[i], 1);
  }
  return acc;
}

int main()
{
    printf("%u\n", bar());
    return 0;
}
```

when compiled with the empty `#define disable_opt`, prints the wrong answer
`8192` on my AVX2 laptop. OTOH, with `#define disable_opt
__attribute__((flatten))` to disable the inlining (exploiting the bug), it
prints the correct result of 12288. Other ways of forcing an independent
dispatch, such as the following one using a volatile slot, also work.

```
#include <stdio.h>

__attribute__ ((target ("default")))
static unsigned _foo(const char *buf, unsigned size) {
  return 1;
}

__attribute__ ((target ("avx")))
static unsigned _foo(const char *buf, unsigned size) {
  return 2;
}

__attribute__ ((target ("avx2")))
static unsigned _foo(const char *buf, unsigned size) {
  return 3;
}

static unsigned (* volatile foo)(const char *buf, unsigned size) = _foo;

__attribute__ ((target ("default")))
unsigned bar() {
  char buf[4096];
  unsigned acc = 0;
  for (int i = 0; i < sizeof(buf); i++) {
    acc += foo(&buf[i], 1);
  }
  return acc;
}

__attribute__ ((target ("avx")))
unsigned bar() {
  char buf[4096];
  unsigned acc = 0;
  for (int i = 0; i < sizeof(buf); i++) {
    acc += foo(&buf[i], 1);
  }
  return acc;
}

int main()
{
    printf("%u\n", bar());
    return 0;
}
```

I think this suggests that the most basic codegen, without optimization, is
clearly working, and that this usage (be it nested multiversioning or not)
isn't something that is simply unsupported. Rather, it is only the
optimization that is wrong.

Reply via email to