[Bug middle-end/113322] New: [14 Regression] internal compiler error: tree check: expected none of vector_type, have vector_type in expand_single_bit_test, at expr.cc:13375

2024-01-10 Thread zhangjungcc at gmail dot com via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113322

Bug ID: 113322
   Summary: [14 Regression] internal compiler error: tree check:
expected none of vector_type, have vector_type in
expand_single_bit_test, at expr.cc:13375
   Product: gcc
   Version: 14.0
Status: UNCONFIRMED
  Severity: normal
  Priority: P3
 Component: middle-end
  Assignee: unassigned at gcc dot gnu.org
  Reporter: zhangjungcc at gmail dot com
  Target Milestone: ---

https://godbolt.org/z/b98WcM9h5

c code:
float a[16];
/* Reproducer for the reported ICE in expand_single_bit_test.
   (0 + 1) % 2 is 1, so each iteration reads a[2*i + 1], i.e. the eight
   odd-indexed elements of a, and aborts when the element differs from
   3 * (2*i + 1) + 2.  The unsimplified index expression is left exactly
   as reported. */
void 
foo ()
{
int i;
for (i = 0; i < 16/2; i++)
 {
 /* Float element compared against an int expression of the same index.  */
 if (a[2*i+((0 +1)%2)] != (3 * (2*i+((0 +1)%2)) + 2))
  __builtin_abort ();
 }
}


during RTL pass: expand
: In function 'void foo()':
:9:9: internal compiler error: tree check: expected none of
vector_type, have vector_type in expand_single_bit_test, at expr.cc:13375
9 |  if (a[2*i+((0 +1)%2)] != (3 * (2*i+((0 +1)%2)) + 2))
  |~^~
0x263e8ac internal_error(char const*, ...)
???:0
0x9640f2 tree_not_check_failed(tree_node const*, char const*, int, char const*,
...)
???:0
0xf59501 expand_expr_real_2(separate_ops*, rtx_def*, machine_mode,
expand_modifier)
???:0
0xf6137f expand_expr_real_1(tree_node*, rtx_def*, machine_mode,
expand_modifier, rtx_def**, bool)
???:0
0xf5982a expand_expr_real_2(separate_ops*, rtx_def*, machine_mode,
expand_modifier)
???:0
0xf6137f expand_expr_real_1(tree_node*, rtx_def*, machine_mode,
expand_modifier, rtx_def**, bool)
???:0
0xf592f5 expand_expr_real_2(separate_ops*, rtx_def*, machine_mode,
expand_modifier)
???:0
0xf6137f expand_expr_real_1(tree_node*, rtx_def*, machine_mode,
expand_modifier, rtx_def**, bool)
???:0
0xf63cf4 expand_operands(tree_node*, tree_node*, rtx_def*, rtx_def**,
rtx_def**, expand_modifier)
???:0
0xf5a01d expand_expr_real_2(separate_ops*, rtx_def*, machine_mode,
expand_modifier)
???:0
Please submit a full bug report, with preprocessed source (by using
-freport-bug).
Please include the complete backtrace with any bug report.
See https://gcc.gnu.org/bugs/ for instructions.
Compiler returned: 1

[Bug middle-end/110015] openjpeg is slower when built with gcc13 compared to clang16

2023-10-31 Thread zhangjungcc at gmail dot com via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110015

jun zhang  changed:

   What|Removed |Added

 CC||zhangjungcc at gmail dot com

--- Comment #2 from jun zhang  ---
  The following loop cannot be vectorized by GCC, but it can be by LLVM, which
gives a 3% improvement.
For more information, please refer to: https://godbolt.org/z/zMbjq41h5

/* Stand-alone versions of the OPJ_* typedefs and macros used by the
   test case below. */
#include <string.h>   /* memcpy */
typedef signed int  OPJ_INT32;
typedef unsigned int OPJ_UINT32;
typedef int OPJ_BOOL;
#define OPJ_TRUE 1
#define OPJ_FALSE 0
typedef char  OPJ_CHAR;
typedef float OPJ_FLOAT32;
typedef double OPJ_FLOAT64;
typedef unsigned char OPJ_BYTE;
#define T1_NMSEDEC_FRACBITS 6
#define OPJ_RESTRICT restrict
#define OPJ_TLS_KEY_T1  0
#include <stddef.h>   /* size_t */
typedef size_t   OPJ_SIZE_T;

/* Encoder-side code-block record.  The layers and passes members are
   commented out in this test case. */
typedef struct opj_tcd_cblk_enc {
    OPJ_BYTE *data;                 /* Data */
    /* opj_tcd_layer_t *layers;        layer information (commented out) */
    /* opj_tcd_pass_t *passes;         information about the passes (commented out) */

    /* Dimensions of the code-block: upper-left corner (x0, y0),
       lower-right corner (x1, y1). */
    OPJ_INT32 x0;
    OPJ_INT32 y0;
    OPJ_INT32 x1;
    OPJ_INT32 y1;

    OPJ_UINT32 numbps;
    OPJ_UINT32 numlenbits;
    OPJ_UINT32 data_size;           /* Size of the allocated data buffer */
    OPJ_UINT32 numpasses;           /* Number of passes already done for the code-block */
    OPJ_UINT32 numpassesinlayers;   /* Number of passes in the layer */
    OPJ_UINT32 totalpasses;         /* Total number of passes */
} opj_tcd_cblk_enc_t;
/* Tier-1 coder state.  The MQC component and the flags array are
   commented out in this test case. */
typedef struct opj_t1 {
    /* MQC component (commented out): opj_mqc_t mqc; */

    OPJ_INT32 *data;

    /* Flags used by decoder and encoder (commented out): opj_flag_t *flags;
     * Laid out such that flags[1+0] is for the state of col=0,row=0..3,
     * flags[1+1] for col=1,row=0..3, flags[1+flags_stride] for
     * col=0,row=4..7, ...
     * This layout avoids too much cache thrashing when processing by
     * 4 vertical samples, as done in the various decoding steps. */

    OPJ_UINT32 w;
    OPJ_UINT32 h;
    OPJ_UINT32 datasize;
    OPJ_UINT32 flagssize;
    OPJ_BOOL encoder;

    /* The 3 variables below are only used by the decoder. */

    /* Set to TRUE in a multithreaded context. */
    OPJ_BOOL mustuse_cblkdatabuffer;
    /* Temporary buffer used to concatenate all chunks of a codeblock. */
    OPJ_BYTE *cblkdatabuffer;
    /* Maximum size available in cblkdatabuffer. */
    OPJ_UINT32 cblkdatabuffersize;
} opj_t1_t;

#define INLINE __inline__
/* Returns the larger of a and b; when they are equal, b is returned. */
static INLINE OPJ_INT32 opj_int_max(OPJ_INT32 a, OPJ_INT32 b)
{
    if (a > b) {
        return a;
    }
    return b;
}
/* Converts a signed value to sign-magnitude representation: non-negative
   values are passed through unchanged; negative values yield their
   magnitude with bit 31 (0x80000000) set.  The argument is evaluated more
   than once, so it must not have side effects. */
#define opj_to_smr(x)   ((x) >= 0 ? (OPJ_UINT32)(x) : ((OPJ_UINT32)(-(x)) | 0x80000000U))
/**
 * Test-case version of the code-block encoder entry point: only the
 * pre-pass over the sample data is present.
 *
 * Walks t1->h rows of t1->w samples starting at t1->data.  Each negative
 * sample is rewritten in place to its opj_to_smr() sign-magnitude form,
 * and the largest magnitude seen is tracked in max.  Afterwards
 * cblk->numbps is set to 6 when max is non-zero and to 0 otherwise.
 *
 * @param t1    coder state; data, w and h are read, and data is modified
 * @param cblk  code-block record; only numbps is written
 *
 * All remaining parameters are unused in this test case.
 *
 * @return always 0.0; the test case computes no distortion value
 */
OPJ_FLOAT64 opj_t1_encode_cblk(opj_t1_t *t1,
                               opj_tcd_cblk_enc_t* cblk,
                               OPJ_UINT32 orient,
                               OPJ_UINT32 compno,
                               OPJ_UINT32 level,
                               OPJ_UINT32 qmfbid,
                               OPJ_FLOAT64 stepsize,
                               OPJ_UINT32 cblksty,
                               OPJ_UINT32 numcomps,
                               const OPJ_FLOAT64 * mct_norms,
                               OPJ_UINT32 mct_numcomps)
{
    OPJ_INT32 max;
    OPJ_UINT32 i, j;
    OPJ_INT32* datap;

    max = 0;
    datap = t1->data;
    /* This is the loop nest the report is about; its shape is kept as is. */
    for (j = 0; j < t1->h; ++j) {
        const OPJ_UINT32 w = t1->w;
        for (i = 0; i < w; ++i, ++datap) {
            OPJ_INT32 tmp = *datap;
            if (tmp < 0) {
                OPJ_UINT32 tmp_unsigned;
                max = opj_int_max(max, -tmp);
                tmp_unsigned = opj_to_smr(tmp);
                /* Store the unsigned bit pattern back through memcpy
                   rather than a pointer cast. */
                memcpy(datap, &tmp_unsigned, sizeof(OPJ_INT32));
            } else {
                max = opj_int_max(max, tmp);
            }
        }
    }
    cblk->numbps = max ? 6 : 0;
    /* The function is declared to return a value, so return one. */
    return 0.0;
}

[Bug target/109812] GraphicsMagick resize is a lot slower in GCC 13.1 vs Clang 16 on Intel Raptor Lake

2023-05-29 Thread zhangjungcc at gmail dot com via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109812

--- Comment #11 from jun zhang  ---
Hello Hubicka and Artem,
I am trying to reproduce this issue on Raptor Lake.
When I build with -fopenmp -O3 -flto, I get the following error,
but building with -fopenmp -O3 (without -flto) succeeds.
Could you help me?

libtool: link: /home/sdp/jun/gcc0/install/bin/gcc -fopenmp -O3 -flto
-march=native -Wall -o utilities/gm utilities/gm.o
-L/home/sdp/jun/omp/Ofast/pts_g_gomp/install/.phoronix-test-suite/installed-tests/pts/graphics-magick-2.1.0/gm_/lib
magick/.libs/libGraphicsMagick.a -lfreetype -ljbig -ltiff -ljpeg
-lXext -lSM -lICE -lX11 -llzma -lbz2 -lz -lzstd -lm -lpthread -fopenmp
/home/sdp/jun/btl0/install/bin/ld: /tmp/ccnX75zI.ltrans0.ltrans.o: in
function `main':
:(.text.startup+0x1): undefined reference to `GMCommand'
collect2: error: ld returned 1 exit status
make[1]: *** [Makefile:6411: utilities/gm] Error 1
make[1]: Leaving directory


hubicka at gcc dot gnu.org  于2023年5月29日周一 02:50写道:
>
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109812
>
> --- Comment #10 from Jan Hubicka  ---
> This is benchmarkeable version of the simplified testcase:
>
> jan@localhost:/tmp> cat t.c
> #define N 10000000
> struct rgb {unsigned char r,g,b;} rgbs[N];
> int *addr;
> struct drgb {double r,g,b;
> #ifdef OPACITY
>  double o;
> #endif
> };
>
> struct drgb sum(double w)
> {
> struct drgb r;
> for (int i = 0; i < N; i++)
> {
>   r.r += rgbs[i].r * w;
>   r.g += rgbs[i].g * w;
>   r.b += rgbs[i].b * w;
> }
> return r;
> }
> jan@localhost:/tmp> cat q.c
> struct drgb {double r,g,b;
> #ifdef OPACITY
>  double o;
> #endif
> };
> struct drgb sum(double w);
> int
> main()
> {
> for (int i = 0; i < 1000; i++)
> sum(i);
> }
>
>
> jan@localhost:/tmp> gcc t.c q.c -march=native -O3 -g ; objdump -d a.out | grep
> vfmadd231pd  ; perf stat ./a.out
>   40119d:   c4 e2 d9 b8 d1  vfmadd231pd %xmm1,%xmm4,%xmm2
>
>  Performance counter stats for './a.out':
>
>  12,148.04 msec task-clock:u #1.000 CPUs
> utilized
>  0  context-switches:u   #0.000 /sec
>  0  cpu-migrations:u #0.000 /sec
>736  page-faults:u#   60.586 /sec
> 50,018,421,148  cycles:u #4.117 GHz
>220,502  stalled-cycles-frontend:u#0.00% frontend
> cycles idle
> 39,950,154,369  stalled-cycles-backend:u #   79.87% backend
> cycles idle
>120,000,191,713  instructions:u   #2.40  insn per
> cycle
>   #0.33  stalled cycles 
> per
> insn
> 10,000,048,918  branches:u   #  823.182 M/sec
>  7,959  branch-misses:u  #0.00% of all
> branches
>
>   12.149466078 seconds time elapsed
>
>   12.149084000 seconds user
>    0.000000000 seconds sys
>
>
> jan@localhost:/tmp> gcc t.c q.c -march=native -O3 -g -DOPACITY ; objdump -d
> a.out | grep vfmadd231pd  ; perf stat ./a.out
>
>  Performance counter stats for './a.out':
>
>  12,141.11 msec task-clock:u #1.000 CPUs
> utilized
>  0  context-switches:u   #0.000 /sec
>  0  cpu-migrations:u #0.000 /sec
>735  page-faults:u#   60.538 /sec
> 50,018,839,129  cycles:u #4.120 GHz
>185,034  stalled-cycles-frontend:u#0.00% frontend
> cycles idle
> 29,963,999,798  stalled-cycles-backend:u #   59.91% backend
> cycles idle
>120,000,191,729  instructions:u   #2.40  insn per
> cycle
>   #0.25  stalled cycles 
> per
> insn
> 10,000,048,913  branches:u   #  823.652 M/sec
>  7,311  branch-misses:u  #0.00% of all
> branches
>
>   12.142252354 seconds time elapsed
>
>   12.138237000 seconds user
>    0.004000000 seconds sys
>
>
> So on zen2 hardware I get same performance on both.  It may be interesting to
> test it on Raptor Lake.
>
> --
> You are receiving this mail because:
> You are on the CC list for the bug.

[Bug libstdc++/109445] r13-6372-g822a11a1e642e0 regression due to noline with -Ofast -march=sapphirerapids -funroll-loops -flto, 541.leela_r performance decrease by 2-3%

2023-04-20 Thread zhangjungcc at gmail dot com via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109445

--- Comment #5 from jun zhang  ---
Created attachment 54890
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=54890&action=edit
set param_inline_unit_growth to 41

Hello, Andrew
  this patch could work!

[Bug libstdc++/109445] r13-6372-g822a11a1e642e0 regression due to noline with -Ofast -march=sapphirerapids -funroll-loops -flto, 541.leela_r performance decrease by 2-3%

2023-04-20 Thread zhangjungcc at gmail dot com via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109445

--- Comment #4 from jun zhang  ---
Created attachment 54889
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=54889&action=edit
leela_r.wpa.085i.inline log

[Bug libstdc++/109445] r13-6372-g822a11a1e642e0 regression due to noline with -Ofast -march=sapphirerapids -funroll-loops -flto, 541.leela_r performance decrease by 2-3%

2023-04-20 Thread zhangjungcc at gmail dot com via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109445

--- Comment #2 from jun zhang  ---
Created attachment 54888
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=54888&action=edit
random unlined

[Bug libstdc++/109445] New: r13-6372-g822a11a1e642e0 regression due to noline with -Ofast -march=sapphirerapids -funroll-loops -flto, 541.leela_r performance decrease by 2-3%

2023-04-06 Thread zhangjungcc at gmail dot com via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109445

Bug ID: 109445
   Summary: r13-6372-g822a11a1e642e0 regression due to noline with
-Ofast -march=sapphirerapids -funroll-loops -flto,
541.leela_r performance decrease by 2-3%
   Product: gcc
   Version: 13.0
Status: UNCONFIRMED
  Severity: normal
  Priority: P3
 Component: libstdc++
  Assignee: unassigned at gcc dot gnu.org
  Reporter: zhangjungcc at gmail dot com
  Target Milestone: ---

r13-6372-g822a11a1e642e0 regression due to noline with -Ofast
-march=sapphirerapids -funroll-loops -flto, 541.leela_r performance decrease by
2-3%

The following is the inline dump; the left column is from before the commit and
the right column is from after the commit.

   [local count: 210861628]:[local count:
210861628]:
  # DEBUG BEGIN_STMT   # DEBUG BEGIN_STMT
  _466 = s_rng;_466 = s_rng;
--
  _607 = _466;<>   _561 = _466;
  _118 = _607; _118 = _561;
--
  _35 = this_72(D)->board.D.5191.m_empty_cnt; =_35 =
this_72(D)->board.D.5191.m_empty_cnt;
  _5 = _35 & 65535;_5 = _35 & 65535;
  # DEBUG this => _118 # DEBUG this => _118
  max_458 = (const uint16) _5; max_458 = (const uint16) _5;
  # DEBUG max => max_458   # DEBUG max => max_458
  # DEBUG BEGIN_STMT   # DEBUG BEGIN_STMT
--
  # DEBUG this => _118<>
  # DEBUG BEGIN_STMT
  # DEBUG mask => 4294967295
  # DEBUG BEGIN_STMT
  # DEBUG BEGIN_STMT
  _467 = _118->s1;
  _468 = _467 << 13;
  _469 = _467 ^ _468;
  b_470 = _469 >> 19;
  # DEBUG b => b_470
  # DEBUG BEGIN_STMT
  _471 = _467 << 12;
  _472 = _471 & 4294959104;
  _473 = b_470 ^ _472;
  _118->s1 = _473;
  # DEBUG BEGIN_STMT
  _474 = _118->s2;
  _475 = _474 << 2;
  _476 = _474 ^ _475;
  b_477 = _476 >> 25;
  # DEBUG b => b_477
  # DEBUG BEGIN_STMT
  _478 = _474 << 4;
  _479 = _478 & 4294967168;
  _480 = b_477 ^ _479;
  _118->s2 = _480;
  # DEBUG BEGIN_STMT
  _481 = _118->s3;
  _482 = _481 << 3;
  _483 = _481 ^ _482;
  b_484 = _483 >> 11;
  # DEBUG b => b_484
  # DEBUG BEGIN_STMT
  _485 = _481 << 17;
  _486 = _485 & 4292870144;
  _487 = b_484 ^ _486;
  _118->s3 = _487;
  # DEBUG BEGIN_STMT
  _488 = _473 ^ _480;
  _489 = _487 ^ _488;
  _611 = _489; _459 = random (_118);
  # DEBUG this => NULL
  # DEBUG b => NULL
  _459 = _611;
--
  _460 = _459 >> 16;  =_460 = _459 >> 16;
  _461 = (unsigned int) max_458;   _461 = (unsigned int)
max_458;
  _462 = _460 * _461;  _462 = _460 * _461;
  _463 = _462 >> 16;   _463 = _462 >> 16;
--
  _612 = _463;<>   _563 = _463;
--
  # DEBUG this => NULL=# DEBUG this => NULL
  # DEBUG max => NULL  # DEBUG max => NULL
--
  _120 = _612;<>   _120 = _563;
--
  vidx_121 = (int) _120;  =vidx_121 = (int) _120;
  # DEBUG vidx => vidx_121 # DEBUG vidx => vidx_121
  # DEBUG BEGIN_STMT   # DEBUG BEGIN_STMT
  _37 = this_72(D)->board.D.5191.m_tomove; _37 =
this_72(D)->board.D.5191.m_tomove;
--
  # DEBUG D#1845 => 1 <>   # DEBUG D#1824 => 1
--
  # DEBUG this => this_72(D)  =# DEBUG this => this_72(D)
  # DEBUG color => _37 # DEBUG color => _37
  # DEBUG vidx => vidx_121 # DEBUG vidx => vidx_121
  # DEBUG allow_sa => 1# DEBUG allow_sa => 1
  # DEBUG BEGIN_STMT   # DEBUG BEGIN_STMT
--