https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113392
Bug ID: 113392
Summary: Missed fold of loading 8 consecutive bytes leading to a missed byteswap optimization
Product: gcc
Version: unknown
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: llvm at rifkin dot dev
Target Milestone: ---

The simple load function

    uint64_t load64bits(const uint8_t* data) {
        uint8_t d0 = data[0];
        uint8_t d1 = data[1];
        uint8_t d2 = data[2];
        uint8_t d3 = data[3];
        uint8_t d4 = data[4];
        uint8_t d5 = data[5];
        uint8_t d6 = data[6];
        uint8_t d7 = data[7];
        return (uint64_t) d0
             | (uint64_t) d1 << 8
             | (uint64_t) d2 << 16
             | (uint64_t) d3 << 24
             | (uint64_t) d4 << 32
             | (uint64_t) d5 << 40
             | (uint64_t) d6 << 48
             | (uint64_t) d7 << 56;
    }

is correctly optimized to

    load64bits(unsigned char const*):
            mov     rax, QWORD PTR [rdi]
            ret

however, the same load written with an incrementing index

    uint64_t load64bits2(const uint8_t* data, size_t index) {
        uint8_t d0 = data[index++];
        uint8_t d1 = data[index++];
        uint8_t d2 = data[index++];
        uint8_t d3 = data[index++];
        uint8_t d4 = data[index++];
        uint8_t d5 = data[index++];
        uint8_t d6 = data[index++];
        uint8_t d7 = data[index++];
        return (uint64_t) d0
             | (uint64_t) d1 << 8
             | (uint64_t) d2 << 16
             | (uint64_t) d3 << 24
             | (uint64_t) d4 << 32
             | (uint64_t) d5 << 40
             | (uint64_t) d6 << 48
             | (uint64_t) d7 << 56;
    }

compiles to

    load64bits2(unsigned char const*, unsigned long):
            mov     rdx, rsi
            movzx   eax, BYTE PTR [rdi+1+rsi]
            movzx   esi, BYTE PTR [rdi+2+rsi]
            sal     rax, 8
            sal     rsi, 16
            or      rax, rsi
            movzx   esi, BYTE PTR [rdi+rdx]
            or      rax, rsi
            movzx   esi, BYTE PTR [rdi+3+rdx]
            sal     rsi, 24
            or      rax, rsi
            movzx   esi, BYTE PTR [rdi+4+rdx]
            sal     rsi, 32
            or      rax, rsi
            movzx   esi, BYTE PTR [rdi+5+rdx]
            sal     rsi, 40
            or      rax, rsi
            movzx   esi, BYTE PTR [rdi+6+rdx]
            movzx   edx, BYTE PTR [rdi+7+rdx]
            sal     rsi, 48
            sal     rdx, 56
            or      rax, rsi
            or      rax, rdx
            ret

Clang compiles both functions to a single mov.
This impacts other operations as well, such as a simple byteswap:

    uint64_t bswap64(const uint8_t* data, size_t index) {
        uint8_t d0 = data[index++];
        uint8_t d1 = data[index++];
        uint8_t d2 = data[index++];
        uint8_t d3 = data[index++];
        uint8_t d4 = data[index++];
        uint8_t d5 = data[index++];
        uint8_t d6 = data[index++];
        uint8_t d7 = data[index++];
        return (uint64_t) d7
             | (uint64_t) d6 << 8
             | (uint64_t) d5 << 16
             | (uint64_t) d4 << 24
             | (uint64_t) d3 << 32
             | (uint64_t) d2 << 40
             | (uint64_t) d1 << 48
             | (uint64_t) d0 << 56;
    }

which compiles to

    bswap64(unsigned char const*, unsigned long):
            mov     rdx, rsi
            movzx   eax, BYTE PTR [rdi+6+rsi]
            movzx   esi, BYTE PTR [rdi+5+rsi]
            sal     rax, 8
            sal     rsi, 16
            or      rax, rsi
            movzx   esi, BYTE PTR [rdi+7+rdx]
            or      rax, rsi
            movzx   esi, BYTE PTR [rdi+4+rdx]
            sal     rsi, 24
            or      rax, rsi
            movzx   esi, BYTE PTR [rdi+3+rdx]
            sal     rsi, 32
            or      rax, rsi
            movzx   esi, BYTE PTR [rdi+2+rdx]
            sal     rsi, 40
            or      rax, rsi
            movzx   esi, BYTE PTR [rdi+1+rdx]
            movzx   edx, BYTE PTR [rdi+rdx]
            sal     rsi, 48
            sal     rdx, 56
            or      rax, rsi
            or      rax, rdx
            ret

instead of

    bswap64(unsigned char const*, unsigned long):
            movbe   rax, qword ptr [rdi + rsi]
            ret

Compiler Explorer link: https://godbolt.org/z/bjxq1rEYY