On Mon, 15 Feb 2021, grischka wrote:
> Why not just provide functions as a library and wrap them into macros from stdatomic.h? No or almost no changes to tcc's generator would be needed.

I've already posted a WIP version of an atomics support library for amd64; it's attached again.

(It needs some scaffolding in the header, __builtin_constant_p/_Static_assert/sizeof shenanigans, but nothing too bad.)
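
Roughly, the sizeof part of that dispatch could look like this; the names and prototypes below are illustrative rather than the final header's, the memory-order argument is omitted for brevity, and __builtin_constant_p would come in when picking among the relaxed/release/seq_cst store entries:

/* Sized entry points as provided by the attached .S file;
   these prototypes are a sketch, not the final interface. */
unsigned char      __atomic_load_n8 (const volatile void *);
unsigned short     __atomic_load_n16(const volatile void *);
unsigned int       __atomic_load_n32(const volatile void *);
unsigned long long __atomic_load_n64(const volatile void *);

/* Dispatch on object size; a _Static_assert(sizeof(*(obj)) <= 8, ...)
   wrapper would reject unsupported sizes. */
#define atomic_load(obj)                                       \
    ((__typeof__(*(obj)))(                                     \
        sizeof(*(obj)) == 1 ? __atomic_load_n8(obj)  :         \
        sizeof(*(obj)) == 2 ? __atomic_load_n16(obj) :         \
        sizeof(*(obj)) == 4 ? __atomic_load_n32(obj) :         \
                              __atomic_load_n64(obj)))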

Implementing these as out-of-line functions does cost some performance, but maybe that's not a big problem, since the atomic operation itself already incurs a decent hit?

 -E
/* ---------------------------------------------- */
/* atomic86_64.S */

#ifdef __leading_underscore
# define _(s) _##s
#else
# define _(s) s
#endif

#ifdef _WIN32
# define P1 %rcx

# define P2 %rdx
# define P2_8  %dl
# define P2_16 %dx
# define P2_32 %edx
# define P2_64 %rdx

# define P3 %r8
# define P3_8 %r8b
# define P3_16 %r8w
# define P3_32 %r8d
# define P3_64 %r8

# define P4 %r9
#else
# define P1 %rdi

# define P2 %rsi
# define P2_8  %sil
# define P2_16 %si
# define P2_32 %esi
# define P2_64 %rsi

# define P3 %rdx
# define P3_8 %dl
# define P3_16 %dx
# define P3_32 %edx
# define P3_64 %rdx

# define P4 %rcx
#endif

#define RAX8 %al
#define RAX16 %ax
#define RAX32 %eax
#define RAX64 %rax

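// an aligned mov from memory is a single atomic access on x86, and
// plain loads are already acquire under the TSO memory model, so one
// implementation can serve every memory order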
#define mkloader(size) \
_(__atomic_load_n ## size): \
    mov (P1), RAX ## size; \
    ret; \
_(__atomic_load ## size): \
    mov (P1), RAX ## size; \
    mov RAX ## size, (P2); \
    ret

mkloader(8)
mkloader(16)
mkloader(32)
mkloader(64)

#undef mkloader


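// plain mov stores are already release under x86-TSO, so the relaxed
// and release entries share code; seq_cst instead goes through xchg
// for its implicit full barrier; the non-_n entries load the value
// through the P2 pointer and fall through to the _n entries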
#define mkstorer(size) \
_(__atomic_store_relaxed ## size): \
_(__atomic_store_release ## size): \
    mov (P2), P2_ ## size; \
_(__atomic_store_relaxed_n ## size): \
_(__atomic_store_release_n ## size): \
    mov P2_ ## size, (P1); \
    ret; \
\
_(__atomic_store_seq_cst ## size): \
    mov (P2), P2_ ## size; \
_(__atomic_store_seq_cst_n ## size): \
    xchg P2_ ## size, (P1); \
    ret;

mkstorer(8)
mkstorer(16)
mkstorer(32)
mkstorer(64)

#undef mkstorer

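// xchg with a memory operand is implicitly locked, so no lock prefix
// is needed here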
#define mkexchanger(size) \
_(__atomic_exchange_n ## size): \
    mov P2_ ## size, RAX ## size; \
    xchg (P1), RAX ## size; \
    ret; \
_(__atomic_exchange ## size): \
    mov (P2), RAX ## size; \
    xchg (P1), RAX ## size; \
    mov RAX ## size, (P3); \
    ret;

mkexchanger(8)
mkexchanger(16)
mkexchanger(32)
mkexchanger(64)

#undef mkexchanger

// nskip: how many bytes the jz has to skip (two or three, depending
// on the encoded length of the mov it jumps over); tcc's assembler
// doesn't know how to produce the compact rel8 encoding for small
// local jumps, hence the hand-assembled .byte sequences
#define mkcmpxchg(size, nskip) \
_(__atomic_compare_exchange ## size): \
    mov (P3), P3_ ## size; \
_(__atomic_compare_exchange_n ## size): \
    mov (P2), RAX ## size; \
    lock cmpxchg P3_ ## size, (P1); \
    .byte 0x74, nskip; /*jz skip*/ \
    mov RAX ## size, (P2); /* on failure, write the current value back to *expected */ \
/*skip:*/ \
    setz %al; \
    ret;

mkcmpxchg(8, 2)
mkcmpxchg(16, 3)
mkcmpxchg(32, 2)
mkcmpxchg(64, 3)

#undef mkcmpxchg

#define arithmetic_op(name, op, size) \
name: \
    lock op P2_ ## size, (P1); \
    ret;

#define arithmetic_op_fetch(name, op, size, off) \
name: \
    mov (P1), RAX ## size; \
    mov RAX ## size, P3_ ## size; \
    op P2_ ## size, P3_ ## size; \
    lock cmpxchg P3_ ## size, (P1); \
    .byte 0x75, off; /*jnz retry*/ \
    mov P3_ ## size, RAX ## size; \
    ret;

#define arithmetic_op_all(op, size, fetch_off) \
arithmetic_op(_(__atomic_ ## op ## size), op, size) \
arithmetic_op_fetch(_(__atomic_ ## op ## _fetch ## size), op, size, fetch_off)

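// caution: these hand-counted jump offsets were worked out for the
// SysV parameter registers; the Win64 ones (%r8 for P3) encode with
// an extra REX prefix, so the counts would likely need adjusting there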
#define arithmetic_ops(op) \
arithmetic_op_all(op,  8, -11) \
arithmetic_op_all(op, 16, -13) \
arithmetic_op_all(op, 32, -10) \
arithmetic_op_all(op, 64, -13)

arithmetic_ops(and)
arithmetic_ops(xor)
arithmetic_ops(or)

arithmetic_op(_(__atomic_sub8), sub, 8)
arithmetic_op(_(__atomic_sub16), sub, 16)
arithmetic_op(_(__atomic_sub32), sub, 32)
arithmetic_op(_(__atomic_sub64), sub, 64)
arithmetic_op(_(__atomic_add8), add, 8)
arithmetic_op(_(__atomic_add16), add, 16)
arithmetic_op(_(__atomic_add32), add, 32)
arithmetic_op(_(__atomic_add64), add, 64)
#undef arithmetic_ops
#undef arithmetic_op_all
#undef arithmetic_op_fetch
#undef arithmetic_op

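// sub_fetch(p, v) is implemented as add_fetch(p, -v): negate v, let
// xadd return the old value, then re-add the negated v to recover
// the new value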
#define arithmetic_add(size) \
_(__atomic_sub_fetch ## size): \
    neg P2_ ## size; \
_(__atomic_add_fetch ## size): \
    mov P2_ ## size, RAX ## size; \
    lock xadd RAX ## size, (P1); \
    add P2_ ## size, RAX ## size; \
    ret; \
_(__atomic_fetch_add ## size): \
    mov P2_ ## size, RAX ## size; \
    lock xadd RAX ## size, (P1); \
    ret
arithmetic_add(8)
arithmetic_add(16)
arithmetic_add(32)
arithmetic_add(64)
#undef arithmetic_add

#define arithmetic_sub(size) \
_(__atomic_fetch_sub ## size): \
    mov P2_ ## size, RAX ## size; \
    neg RAX ## size; \
    lock xadd RAX ## size, (P1); \
    ret
arithmetic_sub(8)
arithmetic_sub(16)
arithmetic_sub(32)
arithmetic_sub(64)
#undef arithmetic_sub




/* ---------------------------------------------- */
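
For reference, this is the contract the compare-exchange stubs implement, as a C caller would see it; the prototype below is inferred from the register use above and is only a sketch:

/* Hypothetical prototype: P1 = object, P2 = pointer to the expected
   value (refreshed on failure), P3 = desired value; the setz %al
   result reports success. */
extern _Bool __atomic_compare_exchange_n32(volatile void *ptr,
                                           void *expected,
                                           unsigned desired);

/* Classic CAS retry loop: atomically increment, return the old value. */
unsigned fetch_inc(volatile unsigned *counter)
{
    unsigned seen = *counter;
    while (!__atomic_compare_exchange_n32(counter, &seen, seen + 1))
        ;  /* on failure, seen now holds the value that beat us */
    return seen;
}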