https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110979
Bug ID: 110979 Summary: Miss-optimization for O2 fully masked loop on floating point reduction. Product: gcc Version: 14.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: target Assignee: unassigned at gcc dot gnu.org Reporter: crazylht at gmail dot com Target Milestone: --- https://godbolt.org/z/YsaesW8zT float foo3 (float* __restrict a, int n) { float sum = 0.0f; for (int i = 0; i != 100; i++) sum += a[i]; return sum; } -O2 -march=znver4 --param vect-partial-vector-usage=2, we get <bb 3> [local count: 66437776]: # sum_13 = PHI <sum_10(3), 0.0(2)> # loop_mask_16 = PHI <_54(3), { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }(2)> # ivtmp.13_12 = PHI <ivtmp.13_15(3), ivtmp.13_1(2)> # ivtmp.16_2 = PHI <ivtmp.16_3(3), 84(2)> # DEBUG i => NULL # DEBUG sum => NULL # DEBUG BEGIN_STMT _4 = (void *) ivtmp.13_12; _11 = &MEM <vector(16) float> [(float *)_4]; vect__4.6_17 = .MASK_LOAD (_11, 32B, loop_mask_16); cond_18 = .VCOND_MASK (loop_mask_16, vect__4.6_17, { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 }); stmp_sum_10.7_19 = BIT_FIELD_REF <cond_18, 32, 0>; stmp_sum_10.7_20 = sum_13 + stmp_sum_10.7_19; stmp_sum_10.7_21 = BIT_FIELD_REF <cond_18, 32, 32>; stmp_sum_10.7_22 = stmp_sum_10.7_20 + stmp_sum_10.7_21; stmp_sum_10.7_23 = BIT_FIELD_REF <cond_18, 32, 64>; stmp_sum_10.7_24 = stmp_sum_10.7_22 + stmp_sum_10.7_23; stmp_sum_10.7_25 = BIT_FIELD_REF <cond_18, 32, 96>; stmp_sum_10.7_26 = stmp_sum_10.7_24 + stmp_sum_10.7_25; stmp_sum_10.7_27 = BIT_FIELD_REF <cond_18, 32, 128>; stmp_sum_10.7_28 = stmp_sum_10.7_26 + stmp_sum_10.7_27; stmp_sum_10.7_29 = BIT_FIELD_REF <cond_18, 32, 160>; stmp_sum_10.7_30 = stmp_sum_10.7_28 + stmp_sum_10.7_29; stmp_sum_10.7_31 = BIT_FIELD_REF <cond_18, 32, 192>; stmp_sum_10.7_32 = stmp_sum_10.7_30 + stmp_sum_10.7_31; stmp_sum_10.7_33 = BIT_FIELD_REF <cond_18, 32, 224>; stmp_sum_10.7_34 = stmp_sum_10.7_32 + stmp_sum_10.7_33; stmp_sum_10.7_35 = BIT_FIELD_REF <cond_18, 32, 256>; 
stmp_sum_10.7_36 = stmp_sum_10.7_34 + stmp_sum_10.7_35; stmp_sum_10.7_37 = BIT_FIELD_REF <cond_18, 32, 288>; stmp_sum_10.7_38 = stmp_sum_10.7_36 + stmp_sum_10.7_37; stmp_sum_10.7_39 = BIT_FIELD_REF <cond_18, 32, 320>; stmp_sum_10.7_40 = stmp_sum_10.7_38 + stmp_sum_10.7_39; stmp_sum_10.7_41 = BIT_FIELD_REF <cond_18, 32, 352>; stmp_sum_10.7_42 = stmp_sum_10.7_40 + stmp_sum_10.7_41; stmp_sum_10.7_43 = BIT_FIELD_REF <cond_18, 32, 384>; stmp_sum_10.7_44 = stmp_sum_10.7_42 + stmp_sum_10.7_43; stmp_sum_10.7_45 = BIT_FIELD_REF <cond_18, 32, 416>; stmp_sum_10.7_46 = stmp_sum_10.7_44 + stmp_sum_10.7_45; stmp_sum_10.7_47 = BIT_FIELD_REF <cond_18, 32, 448>; stmp_sum_10.7_48 = stmp_sum_10.7_46 + stmp_sum_10.7_47; stmp_sum_10.7_49 = BIT_FIELD_REF <cond_18, 32, 480>; sum_10 = stmp_sum_10.7_48 + stmp_sum_10.7_49; # DEBUG sum => sum_10 # DEBUG BEGIN_STMT # DEBUG i => NULL # DEBUG sum => sum_10 # DEBUG BEGIN_STMT _53 = {ivtmp.16_2, ivtmp.16_2, ivtmp.16_2, ivtmp.16_2, ivtmp.16_2, ivtmp.16_2, ivtmp.16_2, ivtmp.16_2, ivtmp.16_2, ivtmp.16_2, ivtmp.16_2, ivtmp.16_2, ivtmp.16_2, ivtmp.16_2, ivtmp.16_2, ivtmp.16_2}; _54 = _53 > { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; ivtmp.13_15 = ivtmp.13_12 + 64; ivtmp.16_3 = ivtmp.16_2 + 240; if (ivtmp.16_3 != 228) Looks like a cost model issue? For aarch64, it looks fine since they have FADDA (Floating-point add strictly-ordered reduction, accumulating in scalar).