Paul Eggert wrote:
> > your simplified code does *NOT* generate the same code. If you came
> > to this impression, you probably must have tested x86_64, x86, arm64 only.
>
> Yes, I did only x86-64 to be honest. Ouch: I am surprised RISC-V is so
> poorly optimized. alpha and sparc64 are of course lower priority, but if
> we need to do RISC-V anyway we might as well do them too.
It's not that RISC-V is "poorly optimized". It's that when the alignment is
unknown, the best load/store instruction sequence is different (longer)
than when the alignment is known. It's like this in RISC processors.
x86_64 and similar have instructions that are fast in the aligned case
and slow in the unaligned case, but still (apparently) fast enough that
the GCC authors decided to use these instructions for the unaligned case
too. CISC.
> I attempted to do this by installing the attached patch. This restores
> the optimization by using 'if' rather than '#if' as typically 'if' is
> better than '#ifdef', when either will do.
Good. I confirm that this patch restores the optimization in the
*_aligned_* functions. See the attached loadstore8-paul2.s output.
> > The casts to uint_fast16_t were there for speed. On some architectures,
> > such as sparc, it is more efficient to work with 32-bit integers than with
> > 16-bit integers, and the definition of uint_fast16_t as uint32_t embodies
> > this knowledge. Removing these casts is also a de-optimization that no one
> > has asked for.
>
> I don't understand the deoptimization there, as on all targets of GNU
> the values are promoted to at least 32 bits even without the casts. But
> it's not a big deal so I put that back in too, in the attached patch. It
> makes the code a bit more symmetric even if it is a bit more casty.
Thanks. Yes, if 'int' and 'unsigned int' are 32-bit, as they usually are
under our assumptions, the "usual promotions" do extra things. But I find
- that it takes extra brain cycles to consider their effects,
- as you say, that it breaks the symmetry to omit the uint_fast16_t
casts when we use the uint_fast32_t and uint_fast64_t casts.
> >> These patches also remove casts that
> >> aren't needed (some of which confused me a bit).
> >
> > The casts to signed intN_t types were there for clarity.
>
> For me they made the code less clear. For example, there's no point to
> casting int_least64_t to uint64_t just before converting to
> uint_least64_t, as uint64_t cannot possibly have fewer bits than
> uint_least64_t. So I was puzzled as to why that cast was there.
It isn't really clear to me for what abstract machine we are writing code.
A few years ago, you taught us to care about machines with CHAR_BIT == 36,
for machines with hidden bits in the integers, for machines where -0 and
+0 were represented differently, and so on. Whereas here, clearly, everyone
is assuming that
uint8_t == 'unsigned char' has 8 bits,
uint16_t == 'unsigned short' has 16 bits,
uint32_t == 'unsigned int' has 32 bits,
uint64_t == 'unsigned long long' has 64 bits,
and two's complement.
I wanted to protect the code against pitfalls with the non-standard machine
types, but apparently did not know which pitfalls are real and relevant
and which aren't.
> Also, this part of stdbit.h is specified without exact-width types. So
> it's disconcerting to see unnecessary use of exact-width types in the
> Gnulib implementation; it's not clear why exact-width types are helpful
> in code where the API stays away from them. (C2y allows the exact-width
> types to not exist at all; of course that's mostly theoretical for Gnulib.)
Yes, this is my confusion: If, say, uint16_t does not exist at all, how
is uint_least16_t represented? And do we have to write
x & 0xFFFFU
instead of
(uint16_t) x
then? And similarly, will a conversion (implicit or cast) from
uint_least16_t to int_least16_t extend the sign bit?
Bruno
#include <stdint.h>
#include <string.h>
#include <byteswap.h>
/* Compiler feature detection for the aligned-access fast path.
   __builtin_assume_aligned is available in clang >= 4, in GCC >= 4.7 when
   compiling C, and in GCC >= 4.9 when compiling C++ (the __GNUC__ +
   (__GNUC_MINOR__ >= N) > 4 trick is true for GCC 5+ or GCC 4.N+).  */
#if (defined __clang__ ? __clang_major__ >= 4 : \
(defined __GNUC__ \
&& (defined __cplusplus \
? __GNUC__ + (__GNUC_MINOR__ >= 9) > 4 \
: __GNUC__ + (__GNUC_MINOR__ >= 7) > 4)))
# define _GL_STDBIT_ASSUME_ALIGNED(ptr, align) \
__builtin_assume_aligned (ptr, align)
# define _GL_STDBIT_OPTIMIZE_VIA_MEMCPY 1
#elif defined _MSC_VER
# define _GL_STDBIT_OPTIMIZE_VIA_MEMCPY 1
#endif
/* Fallbacks: no alignment hint, and byte-at-a-time code instead of
   memcpy + byteswap.  */
#ifndef _GL_STDBIT_ASSUME_ALIGNED
# define _GL_STDBIT_ASSUME_ALIGNED(ptr, align) (ptr)
#endif
#ifndef _GL_STDBIT_OPTIMIZE_VIA_MEMCPY
# define _GL_STDBIT_OPTIMIZE_VIA_MEMCPY 0
#endif
/* Target byte order: 0 = little-endian.  NOTE(review): hard-coded here;
   presumably the real build substitutes the detected byte order — confirm.  */
#define _GL_STDBIT_BIGENDIAN 0
static inline uint_least8_t
stdc_load8_beu8 (const unsigned char ptr[1])
{
  /* A single byte has no byte order; just fetch it.  */
  return *ptr;
}
static inline uint_least16_t
stdc_load8_beu16 (const unsigned char ptr[2])
{
  /* Big-endian: the most significant byte comes first.  */
  uint_fast16_t hi = ptr[0];
  uint_fast16_t lo = ptr[1];
  return (hi << 8) | lo;
}
static inline uint_least32_t
stdc_load8_beu32 (const unsigned char ptr[4])
{
  /* Fold the four bytes together, most significant first.  */
  uint_fast32_t v = 0;
  for (int i = 0; i < 4; i++)
    v = (v << 8) | ptr[i];
  return v;
}
static inline uint_least64_t
stdc_load8_beu64 (const unsigned char ptr[8])
{
  /* Fold the eight bytes together, most significant first.  */
  uint_fast64_t v = 0;
  for (int i = 0; i < 8; i++)
    v = (v << 8) | ptr[i];
  return v;
}
static inline uint_least8_t
stdc_load8_leu8 (const unsigned char ptr[1])
{
  /* A single byte has no byte order; just fetch it.  */
  return *ptr;
}
static inline uint_least16_t
stdc_load8_leu16 (const unsigned char ptr[2])
{
  /* Little-endian: the least significant byte comes first.  */
  uint_fast16_t lo = ptr[0];
  uint_fast16_t hi = ptr[1];
  return lo | (hi << 8);
}
static inline uint_least32_t
stdc_load8_leu32 (const unsigned char ptr[4])
{
  /* Walk the bytes from the most significant end down so a simple
     shift-and-or accumulator can be used.  */
  uint_fast32_t v = 0;
  for (int i = 3; 0 <= i; i--)
    v = (v << 8) | ptr[i];
  return v;
}
static inline uint_least64_t
stdc_load8_leu64 (const unsigned char ptr[8])
{
  /* Walk the bytes from the most significant end down so a simple
     shift-and-or accumulator can be used.  */
  uint_fast64_t v = 0;
  for (int i = 7; 0 <= i; i--)
    v = (v << 8) | ptr[i];
  return v;
}
uint_least8_t
stdc_load8_aligned_beu8 (const unsigned char ptr[1])
{
  /* One byte needs neither alignment nor byte-order handling.  */
  return *ptr;
}
uint_least16_t
stdc_load8_aligned_beu16 (const unsigned char ptr[2])
{
  /* PTR is known to be 2-byte aligned; on capable compilers do one
     aligned load plus a byte swap instead of byte-at-a-time reads.  */
  if (!_GL_STDBIT_OPTIMIZE_VIA_MEMCPY)
    return stdc_load8_beu16 (ptr);

  uint_least16_t v;
  memcpy (&v, _GL_STDBIT_ASSUME_ALIGNED (ptr, 2), 2);
  if (!_GL_STDBIT_BIGENDIAN)
    v = bswap_16 (v);
  return v;
}
uint_least32_t
stdc_load8_aligned_beu32 (const unsigned char ptr[4])
{
  /* PTR is known to be 4-byte aligned; on capable compilers do one
     aligned load plus a byte swap instead of byte-at-a-time reads.  */
  if (!_GL_STDBIT_OPTIMIZE_VIA_MEMCPY)
    return stdc_load8_beu32 (ptr);

  uint_least32_t v;
  memcpy (&v, _GL_STDBIT_ASSUME_ALIGNED (ptr, 4), 4);
  if (!_GL_STDBIT_BIGENDIAN)
    v = bswap_32 (v);
  return v;
}
uint_least64_t
stdc_load8_aligned_beu64 (const unsigned char ptr[8])
{
  /* PTR is known to be 8-byte aligned; on capable compilers do one
     aligned load plus a byte swap instead of byte-at-a-time reads.  */
  if (!_GL_STDBIT_OPTIMIZE_VIA_MEMCPY)
    return stdc_load8_beu64 (ptr);

  uint_least64_t v;
  memcpy (&v, _GL_STDBIT_ASSUME_ALIGNED (ptr, 8), 8);
  if (!_GL_STDBIT_BIGENDIAN)
    v = bswap_64 (v);
  return v;
}
uint_least8_t
stdc_load8_aligned_leu8 (const unsigned char ptr[1])
{
  /* One byte needs neither alignment nor byte-order handling.  */
  return *ptr;
}
uint_least16_t
stdc_load8_aligned_leu16 (const unsigned char ptr[2])
{
  /* PTR is known to be 2-byte aligned; on capable compilers a single
     aligned load suffices (swapping only on big-endian hosts).  */
  if (!_GL_STDBIT_OPTIMIZE_VIA_MEMCPY)
    return stdc_load8_leu16 (ptr);

  uint_least16_t v;
  memcpy (&v, _GL_STDBIT_ASSUME_ALIGNED (ptr, 2), 2);
  if (_GL_STDBIT_BIGENDIAN)
    v = bswap_16 (v);
  return v;
}
uint_least32_t
stdc_load8_aligned_leu32 (const unsigned char ptr[4])
{
  /* PTR is known to be 4-byte aligned; on capable compilers a single
     aligned load suffices (swapping only on big-endian hosts).  */
  if (!_GL_STDBIT_OPTIMIZE_VIA_MEMCPY)
    return stdc_load8_leu32 (ptr);

  uint_least32_t v;
  memcpy (&v, _GL_STDBIT_ASSUME_ALIGNED (ptr, 4), 4);
  if (_GL_STDBIT_BIGENDIAN)
    v = bswap_32 (v);
  return v;
}
uint_least64_t
stdc_load8_aligned_leu64 (const unsigned char ptr[8])
{
  /* PTR is known to be 8-byte aligned; on capable compilers a single
     aligned load suffices (swapping only on big-endian hosts).  */
  if (!_GL_STDBIT_OPTIMIZE_VIA_MEMCPY)
    return stdc_load8_leu64 (ptr);

  uint_least64_t v;
  memcpy (&v, _GL_STDBIT_ASSUME_ALIGNED (ptr, 8), 8);
  if (_GL_STDBIT_BIGENDIAN)
    v = bswap_64 (v);
  return v;
}
static inline void
stdc_store8_beu8 (uint_least8_t value, unsigned char ptr[1])
{
  /* One byte has no byte order; just store it.  */
  *ptr = value;
}
static inline void
stdc_store8_beu16 (uint_least16_t value, unsigned char ptr[2])
{
  /* Big-endian: emit the most significant byte first.  */
  for (int i = 0; i < 2; i++)
    ptr[i] = (value >> (8 * (1 - i))) & 0xFFU;
}
static inline void
stdc_store8_beu32 (uint_least32_t value, unsigned char ptr[4])
{
  /* Big-endian: emit the most significant byte first.  */
  for (int i = 0; i < 4; i++)
    ptr[i] = (value >> (8 * (3 - i))) & 0xFFU;
}
static inline void
stdc_store8_beu64 (uint_least64_t value, unsigned char ptr[8])
{
  /* Big-endian: emit the most significant byte first.  */
  for (int i = 0; i < 8; i++)
    ptr[i] = (value >> (8 * (7 - i))) & 0xFFU;
}
static inline void
stdc_store8_leu8 (uint_least8_t value, unsigned char ptr[1])
{
  /* One byte has no byte order; just store it.  */
  *ptr = value;
}
static inline void
stdc_store8_leu16 (uint_least16_t value, unsigned char ptr[2])
{
  /* Little-endian: peel off the least significant byte each round.  */
  for (int i = 0; i < 2; i++)
    {
      ptr[i] = value & 0xFFU;
      value >>= 8;
    }
}
static inline void
stdc_store8_leu32 (uint_least32_t value, unsigned char ptr[4])
{
  /* Little-endian: peel off the least significant byte each round.  */
  for (int i = 0; i < 4; i++)
    {
      ptr[i] = value & 0xFFU;
      value >>= 8;
    }
}
static inline void
stdc_store8_leu64 (uint_least64_t value, unsigned char ptr[8])
{
  /* Little-endian: peel off the least significant byte each round.  */
  for (int i = 0; i < 8; i++)
    {
      ptr[i] = value & 0xFFU;
      value >>= 8;
    }
}
void
stdc_store8_aligned_beu8 (uint_least8_t value, unsigned char ptr[1])
{
  /* One byte needs neither alignment nor byte-order handling.  */
  *ptr = value;
}
void
stdc_store8_aligned_beu16 (uint_least16_t value, unsigned char ptr[2])
{
  /* PTR is known to be 2-byte aligned; on capable compilers swap once
     and do a single aligned store.  */
  if (!_GL_STDBIT_OPTIMIZE_VIA_MEMCPY)
    {
      stdc_store8_beu16 (value, ptr);
      return;
    }
  if (!_GL_STDBIT_BIGENDIAN)
    value = bswap_16 (value);
  memcpy (_GL_STDBIT_ASSUME_ALIGNED (ptr, 2), &value, 2);
}
void
stdc_store8_aligned_beu32 (uint_least32_t value, unsigned char ptr[4])
{
  /* PTR is known to be 4-byte aligned; on capable compilers swap once
     and do a single aligned store.  */
  if (!_GL_STDBIT_OPTIMIZE_VIA_MEMCPY)
    {
      stdc_store8_beu32 (value, ptr);
      return;
    }
  if (!_GL_STDBIT_BIGENDIAN)
    value = bswap_32 (value);
  memcpy (_GL_STDBIT_ASSUME_ALIGNED (ptr, 4), &value, 4);
}
void
stdc_store8_aligned_beu64 (uint_least64_t value, unsigned char ptr[8])
{
  /* PTR is known to be 8-byte aligned; on capable compilers swap once
     and do a single aligned store.  */
  if (!_GL_STDBIT_OPTIMIZE_VIA_MEMCPY)
    {
      stdc_store8_beu64 (value, ptr);
      return;
    }
  if (!_GL_STDBIT_BIGENDIAN)
    value = bswap_64 (value);
  memcpy (_GL_STDBIT_ASSUME_ALIGNED (ptr, 8), &value, 8);
}
void
stdc_store8_aligned_leu8 (uint_least8_t value, unsigned char ptr[1])
{
  /* One byte needs neither alignment nor byte-order handling.  */
  *ptr = value;
}
void
stdc_store8_aligned_leu16 (uint_least16_t value, unsigned char ptr[2])
{
  /* PTR is known to be 2-byte aligned; on capable compilers one aligned
     store suffices (swapping only on big-endian hosts).  */
  if (!_GL_STDBIT_OPTIMIZE_VIA_MEMCPY)
    {
      stdc_store8_leu16 (value, ptr);
      return;
    }
  if (_GL_STDBIT_BIGENDIAN)
    value = bswap_16 (value);
  memcpy (_GL_STDBIT_ASSUME_ALIGNED (ptr, 2), &value, 2);
}
void
stdc_store8_aligned_leu32 (uint_least32_t value, unsigned char ptr[4])
{
  /* PTR is known to be 4-byte aligned; on capable compilers one aligned
     store suffices (swapping only on big-endian hosts).  */
  if (!_GL_STDBIT_OPTIMIZE_VIA_MEMCPY)
    {
      stdc_store8_leu32 (value, ptr);
      return;
    }
  if (_GL_STDBIT_BIGENDIAN)
    value = bswap_32 (value);
  memcpy (_GL_STDBIT_ASSUME_ALIGNED (ptr, 4), &value, 4);
}
void
stdc_store8_aligned_leu64 (uint_least64_t value, unsigned char ptr[8])
{
  /* PTR is known to be 8-byte aligned; on capable compilers one aligned
     store suffices (swapping only on big-endian hosts).  */
  if (!_GL_STDBIT_OPTIMIZE_VIA_MEMCPY)
    {
      stdc_store8_leu64 (value, ptr);
      return;
    }
  if (_GL_STDBIT_BIGENDIAN)
    value = bswap_64 (value);
  memcpy (_GL_STDBIT_ASSUME_ALIGNED (ptr, 8), &value, 8);
}
.file "loadstore8-paul2.c"
.option pic
.attribute arch,
"rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0"
.attribute unaligned_access, 0
.attribute stack_align, 16
.text
.align 1
.globl stdc_load8_aligned_beu8
.type stdc_load8_aligned_beu8, @function
stdc_load8_aligned_beu8:
.LFB25:
.cfi_startproc
lbu a0,0(a0)
ret
.cfi_endproc
.LFE25:
.size stdc_load8_aligned_beu8, .-stdc_load8_aligned_beu8
.align 1
.globl stdc_load8_aligned_beu16
.type stdc_load8_aligned_beu16, @function
stdc_load8_aligned_beu16:
.LFB26:
.cfi_startproc
lhu a5,0(a0)
lhu a4,0(a0)
slliw a0,a5,8
srliw a5,a4,8
or a0,a0,a5
slli a0,a0,48
srli a0,a0,48
ret
.cfi_endproc
.LFE26:
.size stdc_load8_aligned_beu16, .-stdc_load8_aligned_beu16
.globl __bswapsi2
.align 1
.globl stdc_load8_aligned_beu32
.type stdc_load8_aligned_beu32, @function
stdc_load8_aligned_beu32:
.LFB27:
.cfi_startproc
addi sp,sp,-16
.cfi_def_cfa_offset 16
sd ra,8(sp)
.cfi_offset 1, -8
lw a0,0(a0)
call __bswapsi2@plt
ld ra,8(sp)
.cfi_restore 1
sext.w a0,a0
addi sp,sp,16
.cfi_def_cfa_offset 0
jr ra
.cfi_endproc
.LFE27:
.size stdc_load8_aligned_beu32, .-stdc_load8_aligned_beu32
.globl __bswapdi2
.align 1
.globl stdc_load8_aligned_beu64
.type stdc_load8_aligned_beu64, @function
stdc_load8_aligned_beu64:
.LFB28:
.cfi_startproc
addi sp,sp,-16
.cfi_def_cfa_offset 16
sd ra,8(sp)
.cfi_offset 1, -8
ld a0,0(a0)
call __bswapdi2@plt
ld ra,8(sp)
.cfi_restore 1
addi sp,sp,16
.cfi_def_cfa_offset 0
jr ra
.cfi_endproc
.LFE28:
.size stdc_load8_aligned_beu64, .-stdc_load8_aligned_beu64
.align 1
.globl stdc_load8_aligned_leu8
.type stdc_load8_aligned_leu8, @function
stdc_load8_aligned_leu8:
.LFB50:
.cfi_startproc
lbu a0,0(a0)
ret
.cfi_endproc
.LFE50:
.size stdc_load8_aligned_leu8, .-stdc_load8_aligned_leu8
.align 1
.globl stdc_load8_aligned_leu16
.type stdc_load8_aligned_leu16, @function
stdc_load8_aligned_leu16:
.LFB30:
.cfi_startproc
lhu a0,0(a0)
ret
.cfi_endproc
.LFE30:
.size stdc_load8_aligned_leu16, .-stdc_load8_aligned_leu16
.align 1
.globl stdc_load8_aligned_leu32
.type stdc_load8_aligned_leu32, @function
stdc_load8_aligned_leu32:
.LFB31:
.cfi_startproc
lw a0,0(a0)
ret
.cfi_endproc
.LFE31:
.size stdc_load8_aligned_leu32, .-stdc_load8_aligned_leu32
.align 1
.globl stdc_load8_aligned_leu64
.type stdc_load8_aligned_leu64, @function
stdc_load8_aligned_leu64:
.LFB32:
.cfi_startproc
ld a0,0(a0)
ret
.cfi_endproc
.LFE32:
.size stdc_load8_aligned_leu64, .-stdc_load8_aligned_leu64
.align 1
.globl stdc_store8_aligned_beu8
.type stdc_store8_aligned_beu8, @function
stdc_store8_aligned_beu8:
.LFB41:
.cfi_startproc
sb a0,0(a1)
ret
.cfi_endproc
.LFE41:
.size stdc_store8_aligned_beu8, .-stdc_store8_aligned_beu8
.align 1
.globl stdc_store8_aligned_beu16
.type stdc_store8_aligned_beu16, @function
stdc_store8_aligned_beu16:
.LFB42:
.cfi_startproc
slliw a5,a0,8
srliw a0,a0,8
or a5,a5,a0
sh a5,0(a1)
ret
.cfi_endproc
.LFE42:
.size stdc_store8_aligned_beu16, .-stdc_store8_aligned_beu16
.align 1
.globl stdc_store8_aligned_beu32
.type stdc_store8_aligned_beu32, @function
stdc_store8_aligned_beu32:
.LFB43:
.cfi_startproc
addi sp,sp,-16
.cfi_def_cfa_offset 16
sd s0,0(sp)
sd ra,8(sp)
.cfi_offset 8, -16
.cfi_offset 1, -8
mv s0,a1
call __bswapsi2@plt
sw a0,0(s0)
ld ra,8(sp)
.cfi_restore 1
ld s0,0(sp)
.cfi_restore 8
addi sp,sp,16
.cfi_def_cfa_offset 0
jr ra
.cfi_endproc
.LFE43:
.size stdc_store8_aligned_beu32, .-stdc_store8_aligned_beu32
.align 1
.globl stdc_store8_aligned_beu64
.type stdc_store8_aligned_beu64, @function
stdc_store8_aligned_beu64:
.LFB44:
.cfi_startproc
addi sp,sp,-16
.cfi_def_cfa_offset 16
sd s0,0(sp)
sd ra,8(sp)
.cfi_offset 8, -16
.cfi_offset 1, -8
mv s0,a1
call __bswapdi2@plt
sd a0,0(s0)
ld ra,8(sp)
.cfi_restore 1
ld s0,0(sp)
.cfi_restore 8
addi sp,sp,16
.cfi_def_cfa_offset 0
jr ra
.cfi_endproc
.LFE44:
.size stdc_store8_aligned_beu64, .-stdc_store8_aligned_beu64
.align 1
.globl stdc_store8_aligned_leu8
.type stdc_store8_aligned_leu8, @function
stdc_store8_aligned_leu8:
.LFB52:
.cfi_startproc
sb a0,0(a1)
ret
.cfi_endproc
.LFE52:
.size stdc_store8_aligned_leu8, .-stdc_store8_aligned_leu8
.align 1
.globl stdc_store8_aligned_leu16
.type stdc_store8_aligned_leu16, @function
stdc_store8_aligned_leu16:
.LFB46:
.cfi_startproc
sh a0,0(a1)
ret
.cfi_endproc
.LFE46:
.size stdc_store8_aligned_leu16, .-stdc_store8_aligned_leu16
.align 1
.globl stdc_store8_aligned_leu32
.type stdc_store8_aligned_leu32, @function
stdc_store8_aligned_leu32:
.LFB47:
.cfi_startproc
sw a0,0(a1)
ret
.cfi_endproc
.LFE47:
.size stdc_store8_aligned_leu32, .-stdc_store8_aligned_leu32
.align 1
.globl stdc_store8_aligned_leu64
.type stdc_store8_aligned_leu64, @function
stdc_store8_aligned_leu64:
.LFB48:
.cfi_startproc
sd a0,0(a1)
ret
.cfi_endproc
.LFE48:
.size stdc_store8_aligned_leu64, .-stdc_store8_aligned_leu64
.ident "GCC: (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0"
.section .note.GNU-stack,"",@progbits