https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102495
Bug ID: 102495 Summary: optimize some consecutive byte load pattern to word load Product: gcc Version: 11.2.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: other Assignee: unassigned at gcc dot gnu.org Reporter: mytbk920423 at gmail dot com Target Milestone: ---

I use the following code to get a 32-bit word from a byte array by loading and shifting each byte, but GCC doesn't optimize the code to a single word load when I put the byte load in a loop. Clang trunk can optimize all of the following:

https://gcc.godbolt.org/z/KfWE67K5c

```
#define SHL(a,b) ((uint32_t)(a) << (b))

// both GCC and Clang optimize to *(uint32_t*)(vv)
uint32_t getword_b(const uint8_t *vv)
{
    return SHL(vv[3], 24) | SHL(vv[2], 16) | SHL(vv[1], 8) | SHL(vv[0], 0);
}

// GCC cannot optimize this, Clang can
uint32_t getword_forloop(const uint8_t *vv)
{
    uint32_t res = 0;
    for (size_t i = 0; i < 4; i++) {
        res |= SHL(vv[i], (i * 8));
    }
    return res;
}

// both GCC and Clang optimize to ((uint32_t*)(vec))[word_idx]
uint32_t getword_from_vec(const uint8_t *vec, size_t word_idx)
{
    size_t byte_idx = word_idx * 4;
    const uint8_t *vv = vec + byte_idx;
    return SHL(vv[3], 24) | SHL(vv[2], 16) | SHL(vv[1], 8) | SHL(vv[0], 0);
}

// neither GCC nor Clang 12.0.1 can optimize this, Clang trunk can
uint32_t getword_from_vec_forloop(const uint8_t *vec, size_t word_idx)
{
    size_t byte_idx = word_idx * 4;
    uint32_t res = 0;
    for (size_t i = 0; i < 4; i++) {
        res |= SHL(vec[byte_idx + i], (i * 8));
    }
    return res;
}
```