https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102054
--- Comment #2 from Kewen Lin <linkw at gcc dot gnu.org> --- Yet another reduced test case, from 526.blender_r: #include <math.h> typedef struct QMCSampler { struct QMCSampler *next, *prev; int type; int tot; int used; double *samp2d; double offs[1][2]; } QMCSampler; float BLI_thread_frand(int thread); static void halton_sample(double *ht_invprimes, double *ht_nums, double *v) { unsigned int i; for (i = 0; i < 2; i++) { double r = fabs((1.0 - ht_nums[i]) - 1e-10); if (ht_invprimes[i] >= r) { double lasth; double h = ht_invprimes[i]; do { lasth = h; h *= ht_invprimes[i]; } while (h >= r); ht_nums[i] += ((lasth + h) - 1.0); } else ht_nums[i] += ht_invprimes[i]; v[i] = (float)ht_nums[i]; } } void QMC_initPixel(QMCSampler *qsa, int thread) { if (qsa->type == 2) { qsa->offs[thread][0] = 0.5f * BLI_thread_frand(thread); qsa->offs[thread][1] = 0.5f * BLI_thread_frand(thread); } else { double ht_invprimes[2], ht_nums[2]; double r[2]; int i; ht_nums[0] = BLI_thread_frand(thread); ht_nums[1] = BLI_thread_frand(thread); ht_invprimes[0] = 0.5; ht_invprimes[1] = 1.0 / 3.0; for (i = 0; i < qsa->tot; i++) { halton_sample(ht_invprimes, ht_nums, r); qsa->samp2d[2 * i + 0] = r[0]; qsa->samp2d[2 * i + 1] = r[1]; } } } Without loop vectorization, unrestricted PRE makes the loop suitable for cunroll, and the loop was completely unrolled. The affected percentage is also small, about 0.7%.