Paul Eggert wrote:
> > your simplified code does *NOT* generate the same code. If you came
> > to this impression, you probably must have tested x86_64, x86, arm64 only.
> 
> Yes, I did only x86-64 to be honest. Ouch: I am surprised RISC-V is so 
> poorly optimized. alpha and sparc64 are of course lower priority, but if 
> we need to do RISC-V anyway we might as well do them too.

It's not that RISC-V is "poorly optimized". It's that when the alignment is
unknown, the best load/store instruction sequence is different (longer)
than when the alignment is known. It's like this in RISC processors.

x86_64 and similar have instructions that are fast in the aligned case
and slow in the unaligned case, but still (apparently) fast enough that
the GCC authors decided to use these instructions for the unaligned case
too. CISC.

> I attempted to do this by installing the attached patch. This restores 
> the optimization by using 'if' rather than '#if' as typically 'if' is 
> better than '#ifdef', when either will do.

Good. I confirm that this patch restores the optimization in the
*_aligned_* functions. See the attached loadstore8-paul2.s output.

> > The casts to uint_fast16_t were there for speed. On some architectures,
> > such as sparc, it is more efficient to work with 32-bit integers than with
> > 16-bit integers, and the definition of uint_fast16_t as uint32_t embodies
> > this knowledge. Removing these casts is also a de-optimization that no one
> > has asked for.
> 
> I don't understand the deoptimization there, as on all targets of GNU 
> the values are promoted to at least 32 bits even without the casts. But 
> it's not a big deal so I put that back in too, in the attached patch. It 
> makes the code a bit more symmetric even if it is a bit more casty.

Thanks. Yes, if 'int' and 'unsigned int' are 32-bit, as they usually are
under our assumptions, the "usual promotions" do extra things. But I find
  - that it takes extra brain cycles to consider their effects,
  - as you say, that it breaks the symmetry to omit the uint_fast16_t
    casts when we use the uint_fast32_t and uint_fast64_t casts.

> >> These patches also remove casts that
> >> aren't needed (some of which confused me a bit).
> > 
> > The casts to signed intN_t types were there for clarity.
> 
> For me they made the code less clear. For example, there's no point to 
> casting int_least64_t to uint64_t just before converting to 
> uint_least64_t, as uint64_t cannot possibly have fewer bits than 
> uint_least64_t. So I was puzzled as to why that cast was there.

It isn't really clear to me for what abstract machine we are writing code.
A few years ago, you taught us to care about machines with CHAR_BIT == 36,
for machines with hidden bits in the integers, for machines where -0 and
+0 were represented differently, and so on. Whereas here, clearly, everyone
is assuming that
   uint8_t == 'unsigned char' has 8 bits,
   uint16_t == 'unsigned short' has 16 bits,
   uint32_t == 'unsigned int' has 32 bits,
   uint64_t == 'unsigned long long' has 64 bits,
   and two's complement.
I wanted to protect the code against pitfalls with the non-standard machine
types, but apparently did not know which pitfalls are real and relevant
and which aren't.

> Also, this part of stdbit.h is specified without exact-width types. So 
> it's disconcerting to see unnecessary use of exact-width types in the 
> Gnulib implementation; it's not clear why exact-width types are helpful 
> in code where the API stays away from them. (C2y allows the exact-width 
> types to not exist at all; of course that's mostly theoretical for Gnulib.)

Yes, this is my confusion: If, say, uint16_t does not exist at all, how
is uint_least16_t represented? And do we have to write
  x & 0xFFFFU
instead of
  (uint16_t) x
then? And similarly, will a conversion (implicit or cast) from
uint_least16_t to int_least16_t extend the sign bit?

Bruno
#include <stdint.h>
#include <string.h>
#include <byteswap.h>

/* Compiler feature detection: clang >= 4, GCC >= 4.7 for C (>= 4.9 for C++)
   support __builtin_assume_aligned, which lets the *_aligned_* functions
   use a single memcpy-based load/store instead of per-byte accesses.  */
#if (defined __clang__ ? __clang_major__ >= 4 : \
     (defined __GNUC__ \
      && (defined __cplusplus \
          ? __GNUC__ + (__GNUC_MINOR__ >= 9) > 4 \
          : __GNUC__ + (__GNUC_MINOR__ >= 7) > 4)))
# define _GL_STDBIT_ASSUME_ALIGNED(ptr, align) \
    __builtin_assume_aligned (ptr, align)
# define _GL_STDBIT_OPTIMIZE_VIA_MEMCPY 1
#elif defined _MSC_VER
/* MSVC has no __builtin_assume_aligned equivalent, but the memcpy
   fast path is still taken.  */
# define _GL_STDBIT_OPTIMIZE_VIA_MEMCPY 1
#endif

/* Fallback: no way to assert alignment to the compiler; pass PTR through.  */
#ifndef _GL_STDBIT_ASSUME_ALIGNED
# define _GL_STDBIT_ASSUME_ALIGNED(ptr, align) (ptr)
#endif

/* Fallback: use the per-byte shift/or implementations.  */
#ifndef _GL_STDBIT_OPTIMIZE_VIA_MEMCPY
# define _GL_STDBIT_OPTIMIZE_VIA_MEMCPY 0
#endif

/* Nonzero if the target is big-endian.  Hard-coded to 0 here; presumably
   determined by the build system in the real module -- TODO confirm.  */
#define _GL_STDBIT_BIGENDIAN 0

static inline uint_least8_t
stdc_load8_beu8 (const unsigned char ptr[1])
{
  /* A single byte has no endianness; just fetch it.  */
  return *ptr;
}

static inline uint_least16_t
stdc_load8_beu16 (const unsigned char ptr[2])
{
  /* Big-endian: the first byte is the most significant.  */
  uint_fast16_t hi = ptr[0];
  uint_fast16_t lo = ptr[1];
  return (hi << 8) | lo;
}

static inline uint_least32_t
stdc_load8_beu32 (const unsigned char ptr[4])
{
  /* Big-endian: fold the bytes in order, most significant first.  */
  uint_fast32_t v = 0;
  for (int i = 0; i < 4; i++)
    v = (v << 8) | ptr[i];
  return v;
}

static inline uint_least64_t
stdc_load8_beu64 (const unsigned char ptr[8])
{
  /* Big-endian: fold the bytes in order, most significant first.  */
  uint_fast64_t v = 0;
  for (int i = 0; i < 8; i++)
    v = (v << 8) | ptr[i];
  return v;
}

static inline uint_least8_t
stdc_load8_leu8 (const unsigned char ptr[1])
{
  /* A single byte has no endianness; just fetch it.  */
  return *ptr;
}

static inline uint_least16_t
stdc_load8_leu16 (const unsigned char ptr[2])
{
  /* Little-endian: the first byte is the least significant.  */
  uint_fast16_t lo = ptr[0];
  uint_fast16_t hi = ptr[1];
  return lo | (hi << 8);
}

static inline uint_least32_t
stdc_load8_leu32 (const unsigned char ptr[4])
{
  /* Little-endian: fold the bytes from the last (most significant) down.  */
  uint_fast32_t v = 0;
  for (int i = 3; i >= 0; i--)
    v = (v << 8) | ptr[i];
  return v;
}

static inline uint_least64_t
stdc_load8_leu64 (const unsigned char ptr[8])
{
  /* Little-endian: fold the bytes from the last (most significant) down.  */
  uint_fast64_t v = 0;
  for (int i = 7; i >= 0; i--)
    v = (v << 8) | ptr[i];
  return v;
}

uint_least8_t
stdc_load8_aligned_beu8 (const unsigned char ptr[1])
{
  /* Alignment is irrelevant for a single byte; load it directly.  */
  return ptr[0];
}

uint_least16_t
stdc_load8_aligned_beu16 (const unsigned char ptr[2])
{
  /* PTR is 2-byte aligned; capable compilers turn this into one 16-bit
     load plus a byte swap on little-endian targets.  */
  if (!_GL_STDBIT_OPTIMIZE_VIA_MEMCPY)
    return stdc_load8_beu16 (ptr);

  uint_least16_t v;
  memcpy (&v, _GL_STDBIT_ASSUME_ALIGNED (ptr, 2), 2);
  return _GL_STDBIT_BIGENDIAN ? v : bswap_16 (v);
}

uint_least32_t
stdc_load8_aligned_beu32 (const unsigned char ptr[4])
{
  /* PTR is 4-byte aligned; capable compilers turn this into one 32-bit
     load plus a byte swap on little-endian targets.  */
  if (!_GL_STDBIT_OPTIMIZE_VIA_MEMCPY)
    return stdc_load8_beu32 (ptr);

  uint_least32_t v;
  memcpy (&v, _GL_STDBIT_ASSUME_ALIGNED (ptr, 4), 4);
  return _GL_STDBIT_BIGENDIAN ? v : bswap_32 (v);
}

uint_least64_t
stdc_load8_aligned_beu64 (const unsigned char ptr[8])
{
  /* PTR is 8-byte aligned; capable compilers turn this into one 64-bit
     load plus a byte swap on little-endian targets.  */
  if (!_GL_STDBIT_OPTIMIZE_VIA_MEMCPY)
    return stdc_load8_beu64 (ptr);

  uint_least64_t v;
  memcpy (&v, _GL_STDBIT_ASSUME_ALIGNED (ptr, 8), 8);
  return _GL_STDBIT_BIGENDIAN ? v : bswap_64 (v);
}

uint_least8_t
stdc_load8_aligned_leu8 (const unsigned char ptr[1])
{
  /* Alignment is irrelevant for a single byte; load it directly.  */
  return ptr[0];
}

uint_least16_t
stdc_load8_aligned_leu16 (const unsigned char ptr[2])
{
  /* PTR is 2-byte aligned; capable compilers turn this into one 16-bit
     load, byte-swapped only on big-endian targets.  */
  if (!_GL_STDBIT_OPTIMIZE_VIA_MEMCPY)
    return stdc_load8_leu16 (ptr);

  uint_least16_t v;
  memcpy (&v, _GL_STDBIT_ASSUME_ALIGNED (ptr, 2), 2);
  return _GL_STDBIT_BIGENDIAN ? bswap_16 (v) : v;
}

uint_least32_t
stdc_load8_aligned_leu32 (const unsigned char ptr[4])
{
  /* PTR is 4-byte aligned; capable compilers turn this into one 32-bit
     load, byte-swapped only on big-endian targets.  */
  if (!_GL_STDBIT_OPTIMIZE_VIA_MEMCPY)
    return stdc_load8_leu32 (ptr);

  uint_least32_t v;
  memcpy (&v, _GL_STDBIT_ASSUME_ALIGNED (ptr, 4), 4);
  return _GL_STDBIT_BIGENDIAN ? bswap_32 (v) : v;
}

uint_least64_t
stdc_load8_aligned_leu64 (const unsigned char ptr[8])
{
  /* PTR is 8-byte aligned; capable compilers turn this into one 64-bit
     load, byte-swapped only on big-endian targets.  */
  if (!_GL_STDBIT_OPTIMIZE_VIA_MEMCPY)
    return stdc_load8_leu64 (ptr);

  uint_least64_t v;
  memcpy (&v, _GL_STDBIT_ASSUME_ALIGNED (ptr, 8), 8);
  return _GL_STDBIT_BIGENDIAN ? bswap_64 (v) : v;
}

static inline void
stdc_store8_beu8 (uint_least8_t value, unsigned char ptr[1])
{
  /* A single byte has no endianness; just store it.  */
  *ptr = value;
}

static inline void
stdc_store8_beu16 (uint_least16_t value, unsigned char ptr[2])
{
  /* Big-endian: peel bytes off VALUE low-byte first, filling PTR
     from the end backwards.  */
  for (int i = 1; i >= 0; i--)
    {
      ptr[i] = value & 0xFFU;
      value >>= 8;
    }
}

static inline void
stdc_store8_beu32 (uint_least32_t value, unsigned char ptr[4])
{
  /* Big-endian: peel bytes off VALUE low-byte first, filling PTR
     from the end backwards.  */
  for (int i = 3; i >= 0; i--)
    {
      ptr[i] = value & 0xFFU;
      value >>= 8;
    }
}

static inline void
stdc_store8_beu64 (uint_least64_t value, unsigned char ptr[8])
{
  /* Big-endian: peel bytes off VALUE low-byte first, filling PTR
     from the end backwards.  */
  for (int i = 7; i >= 0; i--)
    {
      ptr[i] = value & 0xFFU;
      value >>= 8;
    }
}

static inline void
stdc_store8_leu8 (uint_least8_t value, unsigned char ptr[1])
{
  /* A single byte has no endianness; just store it.  */
  *ptr = value;
}

static inline void
stdc_store8_leu16 (uint_least16_t value, unsigned char ptr[2])
{
  /* Little-endian: peel bytes off VALUE low-byte first, filling PTR
     from the front.  */
  for (int i = 0; i < 2; i++)
    {
      ptr[i] = value & 0xFFU;
      value >>= 8;
    }
}

static inline void
stdc_store8_leu32 (uint_least32_t value, unsigned char ptr[4])
{
  /* Little-endian: peel bytes off VALUE low-byte first, filling PTR
     from the front.  */
  for (int i = 0; i < 4; i++)
    {
      ptr[i] = value & 0xFFU;
      value >>= 8;
    }
}

static inline void
stdc_store8_leu64 (uint_least64_t value, unsigned char ptr[8])
{
  /* Little-endian: peel bytes off VALUE low-byte first, filling PTR
     from the front.  */
  for (int i = 0; i < 8; i++)
    {
      ptr[i] = value & 0xFFU;
      value >>= 8;
    }
}

void
stdc_store8_aligned_beu8 (uint_least8_t value, unsigned char ptr[1])
{
  /* Alignment is irrelevant for a single byte; store it directly.  */
  ptr[0] = value;
}

void
stdc_store8_aligned_beu16 (uint_least16_t value, unsigned char ptr[2])
{
  /* PTR is 2-byte aligned; capable compilers emit a byte swap (on
     little-endian targets) plus one 16-bit store.  */
  if (!_GL_STDBIT_OPTIMIZE_VIA_MEMCPY)
    {
      stdc_store8_beu16 (value, ptr);
      return;
    }

  uint_least16_t v = _GL_STDBIT_BIGENDIAN ? value : bswap_16 (value);
  memcpy (_GL_STDBIT_ASSUME_ALIGNED (ptr, 2), &v, 2);
}

void
stdc_store8_aligned_beu32 (uint_least32_t value, unsigned char ptr[4])
{
  /* PTR is 4-byte aligned; capable compilers emit a byte swap (on
     little-endian targets) plus one 32-bit store.  */
  if (!_GL_STDBIT_OPTIMIZE_VIA_MEMCPY)
    {
      stdc_store8_beu32 (value, ptr);
      return;
    }

  uint_least32_t v = _GL_STDBIT_BIGENDIAN ? value : bswap_32 (value);
  memcpy (_GL_STDBIT_ASSUME_ALIGNED (ptr, 4), &v, 4);
}

void
stdc_store8_aligned_beu64 (uint_least64_t value, unsigned char ptr[8])
{
  /* PTR is 8-byte aligned; capable compilers emit a byte swap (on
     little-endian targets) plus one 64-bit store.  */
  if (!_GL_STDBIT_OPTIMIZE_VIA_MEMCPY)
    {
      stdc_store8_beu64 (value, ptr);
      return;
    }

  uint_least64_t v = _GL_STDBIT_BIGENDIAN ? value : bswap_64 (value);
  memcpy (_GL_STDBIT_ASSUME_ALIGNED (ptr, 8), &v, 8);
}

void
stdc_store8_aligned_leu8 (uint_least8_t value, unsigned char ptr[1])
{
  /* Alignment is irrelevant for a single byte; store it directly.  */
  ptr[0] = value;
}

void
stdc_store8_aligned_leu16 (uint_least16_t value, unsigned char ptr[2])
{
  /* PTR is 2-byte aligned; capable compilers emit a byte swap (only on
     big-endian targets) plus one 16-bit store.  */
  if (!_GL_STDBIT_OPTIMIZE_VIA_MEMCPY)
    {
      stdc_store8_leu16 (value, ptr);
      return;
    }

  uint_least16_t v = _GL_STDBIT_BIGENDIAN ? bswap_16 (value) : value;
  memcpy (_GL_STDBIT_ASSUME_ALIGNED (ptr, 2), &v, 2);
}

void
stdc_store8_aligned_leu32 (uint_least32_t value, unsigned char ptr[4])
{
  /* PTR is 4-byte aligned; capable compilers emit a byte swap (only on
     big-endian targets) plus one 32-bit store.  */
  if (!_GL_STDBIT_OPTIMIZE_VIA_MEMCPY)
    {
      stdc_store8_leu32 (value, ptr);
      return;
    }

  uint_least32_t v = _GL_STDBIT_BIGENDIAN ? bswap_32 (value) : value;
  memcpy (_GL_STDBIT_ASSUME_ALIGNED (ptr, 4), &v, 4);
}

void
stdc_store8_aligned_leu64 (uint_least64_t value, unsigned char ptr[8])
{
  /* PTR is 8-byte aligned; capable compilers emit a byte swap (only on
     big-endian targets) plus one 64-bit store.  */
  if (!_GL_STDBIT_OPTIMIZE_VIA_MEMCPY)
    {
      stdc_store8_leu64 (value, ptr);
      return;
    }

  uint_least64_t v = _GL_STDBIT_BIGENDIAN ? bswap_64 (value) : value;
  memcpy (_GL_STDBIT_ASSUME_ALIGNED (ptr, 8), &v, 8);
}
        .file   "loadstore8-paul2.c"
        .option pic
        .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0"
        .attribute unaligned_access, 0
        .attribute stack_align, 16
        .text
        .align  1
        .globl  stdc_load8_aligned_beu8
        .type   stdc_load8_aligned_beu8, @function
stdc_load8_aligned_beu8:
.LFB25:
        .cfi_startproc
        lbu     a0,0(a0)
        ret
        .cfi_endproc
.LFE25:
        .size   stdc_load8_aligned_beu8, .-stdc_load8_aligned_beu8
        .align  1
        .globl  stdc_load8_aligned_beu16
        .type   stdc_load8_aligned_beu16, @function
stdc_load8_aligned_beu16:
.LFB26:
        .cfi_startproc
        lhu     a5,0(a0)
        lhu     a4,0(a0)
        slliw   a0,a5,8
        srliw   a5,a4,8
        or      a0,a0,a5
        slli    a0,a0,48
        srli    a0,a0,48
        ret
        .cfi_endproc
.LFE26:
        .size   stdc_load8_aligned_beu16, .-stdc_load8_aligned_beu16
        .globl  __bswapsi2
        .align  1
        .globl  stdc_load8_aligned_beu32
        .type   stdc_load8_aligned_beu32, @function
stdc_load8_aligned_beu32:
.LFB27:
        .cfi_startproc
        addi    sp,sp,-16
        .cfi_def_cfa_offset 16
        sd      ra,8(sp)
        .cfi_offset 1, -8
        lw      a0,0(a0)
        call    __bswapsi2@plt
        ld      ra,8(sp)
        .cfi_restore 1
        sext.w  a0,a0
        addi    sp,sp,16
        .cfi_def_cfa_offset 0
        jr      ra
        .cfi_endproc
.LFE27:
        .size   stdc_load8_aligned_beu32, .-stdc_load8_aligned_beu32
        .globl  __bswapdi2
        .align  1
        .globl  stdc_load8_aligned_beu64
        .type   stdc_load8_aligned_beu64, @function
stdc_load8_aligned_beu64:
.LFB28:
        .cfi_startproc
        addi    sp,sp,-16
        .cfi_def_cfa_offset 16
        sd      ra,8(sp)
        .cfi_offset 1, -8
        ld      a0,0(a0)
        call    __bswapdi2@plt
        ld      ra,8(sp)
        .cfi_restore 1
        addi    sp,sp,16
        .cfi_def_cfa_offset 0
        jr      ra
        .cfi_endproc
.LFE28:
        .size   stdc_load8_aligned_beu64, .-stdc_load8_aligned_beu64
        .align  1
        .globl  stdc_load8_aligned_leu8
        .type   stdc_load8_aligned_leu8, @function
stdc_load8_aligned_leu8:
.LFB50:
        .cfi_startproc
        lbu     a0,0(a0)
        ret
        .cfi_endproc
.LFE50:
        .size   stdc_load8_aligned_leu8, .-stdc_load8_aligned_leu8
        .align  1
        .globl  stdc_load8_aligned_leu16
        .type   stdc_load8_aligned_leu16, @function
stdc_load8_aligned_leu16:
.LFB30:
        .cfi_startproc
        lhu     a0,0(a0)
        ret
        .cfi_endproc
.LFE30:
        .size   stdc_load8_aligned_leu16, .-stdc_load8_aligned_leu16
        .align  1
        .globl  stdc_load8_aligned_leu32
        .type   stdc_load8_aligned_leu32, @function
stdc_load8_aligned_leu32:
.LFB31:
        .cfi_startproc
        lw      a0,0(a0)
        ret
        .cfi_endproc
.LFE31:
        .size   stdc_load8_aligned_leu32, .-stdc_load8_aligned_leu32
        .align  1
        .globl  stdc_load8_aligned_leu64
        .type   stdc_load8_aligned_leu64, @function
stdc_load8_aligned_leu64:
.LFB32:
        .cfi_startproc
        ld      a0,0(a0)
        ret
        .cfi_endproc
.LFE32:
        .size   stdc_load8_aligned_leu64, .-stdc_load8_aligned_leu64
        .align  1
        .globl  stdc_store8_aligned_beu8
        .type   stdc_store8_aligned_beu8, @function
stdc_store8_aligned_beu8:
.LFB41:
        .cfi_startproc
        sb      a0,0(a1)
        ret
        .cfi_endproc
.LFE41:
        .size   stdc_store8_aligned_beu8, .-stdc_store8_aligned_beu8
        .align  1
        .globl  stdc_store8_aligned_beu16
        .type   stdc_store8_aligned_beu16, @function
stdc_store8_aligned_beu16:
.LFB42:
        .cfi_startproc
        slliw   a5,a0,8
        srliw   a0,a0,8
        or      a5,a5,a0
        sh      a5,0(a1)
        ret
        .cfi_endproc
.LFE42:
        .size   stdc_store8_aligned_beu16, .-stdc_store8_aligned_beu16
        .align  1
        .globl  stdc_store8_aligned_beu32
        .type   stdc_store8_aligned_beu32, @function
stdc_store8_aligned_beu32:
.LFB43:
        .cfi_startproc
        addi    sp,sp,-16
        .cfi_def_cfa_offset 16
        sd      s0,0(sp)
        sd      ra,8(sp)
        .cfi_offset 8, -16
        .cfi_offset 1, -8
        mv      s0,a1
        call    __bswapsi2@plt
        sw      a0,0(s0)
        ld      ra,8(sp)
        .cfi_restore 1
        ld      s0,0(sp)
        .cfi_restore 8
        addi    sp,sp,16
        .cfi_def_cfa_offset 0
        jr      ra
        .cfi_endproc
.LFE43:
        .size   stdc_store8_aligned_beu32, .-stdc_store8_aligned_beu32
        .align  1
        .globl  stdc_store8_aligned_beu64
        .type   stdc_store8_aligned_beu64, @function
stdc_store8_aligned_beu64:
.LFB44:
        .cfi_startproc
        addi    sp,sp,-16
        .cfi_def_cfa_offset 16
        sd      s0,0(sp)
        sd      ra,8(sp)
        .cfi_offset 8, -16
        .cfi_offset 1, -8
        mv      s0,a1
        call    __bswapdi2@plt
        sd      a0,0(s0)
        ld      ra,8(sp)
        .cfi_restore 1
        ld      s0,0(sp)
        .cfi_restore 8
        addi    sp,sp,16
        .cfi_def_cfa_offset 0
        jr      ra
        .cfi_endproc
.LFE44:
        .size   stdc_store8_aligned_beu64, .-stdc_store8_aligned_beu64
        .align  1
        .globl  stdc_store8_aligned_leu8
        .type   stdc_store8_aligned_leu8, @function
stdc_store8_aligned_leu8:
.LFB52:
        .cfi_startproc
        sb      a0,0(a1)
        ret
        .cfi_endproc
.LFE52:
        .size   stdc_store8_aligned_leu8, .-stdc_store8_aligned_leu8
        .align  1
        .globl  stdc_store8_aligned_leu16
        .type   stdc_store8_aligned_leu16, @function
stdc_store8_aligned_leu16:
.LFB46:
        .cfi_startproc
        sh      a0,0(a1)
        ret
        .cfi_endproc
.LFE46:
        .size   stdc_store8_aligned_leu16, .-stdc_store8_aligned_leu16
        .align  1
        .globl  stdc_store8_aligned_leu32
        .type   stdc_store8_aligned_leu32, @function
stdc_store8_aligned_leu32:
.LFB47:
        .cfi_startproc
        sw      a0,0(a1)
        ret
        .cfi_endproc
.LFE47:
        .size   stdc_store8_aligned_leu32, .-stdc_store8_aligned_leu32
        .align  1
        .globl  stdc_store8_aligned_leu64
        .type   stdc_store8_aligned_leu64, @function
stdc_store8_aligned_leu64:
.LFB48:
        .cfi_startproc
        sd      a0,0(a1)
        ret
        .cfi_endproc
.LFE48:
        .size   stdc_store8_aligned_leu64, .-stdc_store8_aligned_leu64
        .ident  "GCC: (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0"
        .section        .note.GNU-stack,"",@progbits

Reply via email to