On 1/10/26 00:23, Richard Henderson wrote:
I'm not fond of the pointer arithmetic or the code structure.

Perhaps better as

     switch (mop & (MO_BSWAP | MO_SIZE)) {
     case MO_LEUW:
         return lduw_le_p(ptr);
     case MO_BEUW:
         return lduw_be_p(ptr);
     ...
     default:
         g_assert_not_reached();
     }

which would hopefully compile to host endian-swapping load insns like

.L1:
        mov     (ptr), %eax
        ret
.L2:
        movbe   (ptr), %eax
        ret

It might do so only for 32-bit loads, because movbe also bundles a free 32->64-bit zero extension, but not for the smaller sizes. Speaking of which, I think ldm_p also needs to handle MO_SIGN? It can all be done in one go with

/*
 * Load memop_size(mop) bytes from ptr and return them widened to 64 bits.
 *
 * MO_BSWAP in mop byte-swaps the loaded value; MO_SIGN sign-extends it from
 * its access width.  Without MO_SIGN the result is zero-extended.
 */
static inline uint64_t ldm_p(const void *ptr, MemOp mop)
{
    const unsigned size = memop_size(mop);
    /*
     * Start from zero: only 'size' bytes are copied in below, and the plain
     * path (no swap, no sign) returns val without shifting the remaining
     * bytes out, so they must not be left indeterminate.
     */
    uint64_t val = 0;
    uint8_t *pval = (uint8_t *)&val;
    unsigned shift;

    /*
     * Validate before any pointer arithmetic that depends on size.
     * NOTE(review): this also rejects an 8-byte access; confirm intended.
     */
    assert(size < 8);
    shift = 64 - 8 * size;

    /* Aim at the least significant 'size' bytes of val. */
    if (HOST_BIG_ENDIAN) {
        pval += sizeof(val) - size;
    }
    __builtin_memcpy(pval, ptr, size);

    if (mop & MO_BSWAP) {
        /* The 64-bit swap leaves the loaded bytes at the top of val. */
        val = __builtin_bswap64(val);
    } else if (mop & MO_SIGN) {
        /* Move the loaded bytes to the top so their sign bit is bit 63. */
        val <<= shift;
    } else {
        return val;
    }

    /* Bring the value back down, extending it as requested. */
    if (mop & MO_SIGN) {
        /* Relies on >> of a negative int64_t being an arithmetic shift. */
        return ((int64_t) val) >> shift;
    } else {
        return val >> shift;
    }
}

/*
 * Store the low memop_size(mop) bytes of val to ptr.
 *
 * With MO_BSWAP set in mop the value is byte-swapped first.  Other bits of
 * mop outside the size field are not examined here.
 */
static inline void stm_p(void *ptr, uint64_t val, MemOp mop)
{
    const unsigned size = memop_size(mop);
    const uint8_t *src;

    assert(size < 8);

    if (mop & MO_BSWAP) {
        /* Swap all 64 bits, then drop the swapped bytes back to the bottom. */
        val = __builtin_bswap64(val);
        val >>= 64 - 8 * size;
    }

    /* Point at the least significant 'size' bytes of val. */
    src = (const uint8_t *)&val;
    if (HOST_BIG_ENDIAN) {
        src += sizeof(val) - size;
    }

    __builtin_memcpy(ptr, src, size);
}

When inlining ldm_p, GCC is able to generate movzx/movsx instructions but doesn't recognize bswap64 + right shift as a smaller-width bswap + zero extension; clang does. Neither is able to generate movbe instructions, though.

I attach a standalone file I played with.

Paolo
#include <stdint.h>
#include <assert.h>

/*
 * Memory operation descriptor: a size field plus modifier flags that are
 * OR-ed together.
 */
typedef enum MemOp {
    /* Size field values; memop_size() turns them into 1 << value bytes. */
    MO_16    = 1,
    MO_32    = 2,
    /* Mask selecting the size field. */
    MO_SIZE  = 0x07,
    /* Sign-extend the loaded value instead of zero-extending it. */
    MO_SIGN  = 1 << 3,
    /* Byte-swap the value on load/store. */
    MO_BSWAP = 1 << 4,
} MemOp;

/*
 * Externally defined hook, declared as never returning.
 * NOTE(review): no function in this standalone file calls it; presumably kept
 * for a switch-based variant with an unreachable default case -- confirm.
 */
extern void __attribute__((noreturn)) g_assert_not_reached(void);

/*
 * Return the access size in bytes encoded in x: the MO_SIZE field is used
 * as a power-of-two exponent.
 */
static inline int memop_size(MemOp x)
{
    const int log2_bytes = x & MO_SIZE;

    return 1 << log2_bytes;
}

/*
 * Load memop_size(mop) bytes from ptr and return them widened to 64 bits.
 *
 * MO_BSWAP in mop byte-swaps the loaded value; MO_SIGN sign-extends it from
 * its access width.  Without MO_SIGN the result is zero-extended.
 *
 * The bytes are copied to the start of the 64-bit temporary, so this
 * standalone version is only correct where the lowest-addressed bytes are
 * the least significant ones (little-endian hosts).
 */
static inline uint64_t ldm_p(const void *ptr, MemOp mop)
{
    const unsigned size = memop_size(mop);
    /*
     * Start from zero: only 'size' bytes are copied in below, and the plain
     * path (no swap, no sign) returns val without shifting the remaining
     * bytes out, so they must not be left indeterminate.
     */
    uint64_t val = 0;
    unsigned shift;

    /* NOTE(review): this also rejects an 8-byte access; confirm intended. */
    assert(size < 8);
    shift = 64 - 8 * size;

    __builtin_memcpy(&val, ptr, size);

    if (mop & MO_BSWAP) {
        /* The 64-bit swap leaves the loaded bytes at the top of val. */
        val = __builtin_bswap64(val);
    } else if (mop & MO_SIGN) {
        /* Move the loaded bytes to the top so their sign bit is bit 63. */
        val <<= shift;
    } else {
        return val;
    }

    /* Bring the value back down, extending it as requested. */
    if (mop & MO_SIGN) {
        /* Relies on >> of a negative int64_t being an arithmetic shift. */
        return ((int64_t) val) >> shift;
    } else {
        return val >> shift;
    }
}

/*
 * Store memop_size(mop) bytes of val to ptr, taken from the start of val's
 * in-memory representation.
 *
 * With MO_BSWAP set in mop the value is byte-swapped first.  Other bits of
 * mop outside the size field are not examined here.
 */
static inline void stm_p(void *ptr, uint64_t val, MemOp mop)
{
    const unsigned size = memop_size(mop);

    assert(size < 8);

    if (mop & MO_BSWAP) {
        /* Swap all 64 bits, then drop the swapped bytes back to the bottom. */
        val = __builtin_bswap64(val);
        val >>= 64 - 8 * size;
    }

    __builtin_memcpy(ptr, &val, size);
}

/*
 * Load 16 bits from ptr with MO_BSWAP and without MO_SIGN: the value is
 * byte-swapped relative to host order and zero-extended.  The 64-bit
 * result of ldm_p() is narrowed to uint32_t.
 */
uint32_t lduw_be_p(const void *ptr)
{
    const uint64_t loaded = ldm_p(ptr, MO_16 | MO_BSWAP);

    return (uint32_t)loaded;
}

/*
 * Load 16 bits from ptr with MO_SIGN and without MO_BSWAP: the value is
 * read in host byte order and sign-extended.  The 64-bit result of ldm_p()
 * is narrowed to uint32_t.
 */
uint32_t ldsw_le_p(const void *ptr)
{
    const uint64_t loaded = ldm_p(ptr, MO_16 | MO_SIGN);

    return (uint32_t)loaded;
}

/*
 * Load 32 bits from ptr with both MO_BSWAP and MO_SIGN: the value is
 * byte-swapped relative to host order and sign-extended.  The 64-bit
 * result of ldm_p() is narrowed to uint32_t.
 */
uint32_t ldsl_be_p(const void *ptr)
{
    const uint64_t loaded = ldm_p(ptr, MO_32 | MO_BSWAP | MO_SIGN);

    return (uint32_t)loaded;
}

/*
 * Store the 32-bit val to ptr with MO_BSWAP, i.e. byte-swapped relative to
 * host order.
 *
 * MO_SIGN is not passed: stm_p() never looks at it, and signedness has no
 * meaning for a store.
 */
void stl_be_p(void *ptr, uint32_t val)
{
    stm_p(ptr, val, MO_32 | MO_BSWAP);
}

Reply via email to