Hi.

Em seg., 9 de mar. de 2026 às 14:02, Bryan Green <[email protected]>
escreveu:

> I performed a micro-benchmark on my dual epyc (zen 2) server and version 1
> wins for small values of n.
>
> 20 runs:
>
> n       version       min  median    mean     max  stddev  noise%
> -----------------------------------------------------------------------
> n=1     version1     2.440   2.440   2.450   2.550   0.024    4.5%
> n=1     version2     4.260   4.280   4.277   4.290   0.007    0.7%
>
> n=2     version1     2.740   2.750   2.757   2.880   0.029    5.1%
> n=2     version2     3.970   3.980   3.980   4.020   0.010    1.3%
>
> n=4     version1     4.580   4.595   4.649   4.910   0.094    7.2%
> n=4     version2     5.780   5.815   5.809   5.820   0.013    0.7%
>
> But, micro-benchmarks always make me nervous, so I looked at the actual
> instruction cost for my
> platform given the version 1 and version 2 code.
>
> If we count cpu cycles using the AMD Zen 2 instruction latency/throughput
> tables:  version 1 (loop body)
> has a critical path of ~5-6 cycles per iteration.  version 2 (loop body)
> has ~3-4 cycles per iteration.
>
> The problem for version 2 is that the call to memcpy is ~24-30 cycles due
> to the stub + function call + return
> and branch predictor pressure on first call.  This probably results in
> ~2.5 ns per iteration cost for version 2.
>
> So, no I wouldn't call it an optimization.  But, it will be interesting to
> hear other opinions on this.
>
I ran some quick and dirty tests with the two versions:
gcc 15.2.0
gcc -O2 memcpy1.c -o memcpy1

The first test was with 10000000 keys and 10000000 loops:
version1: one memcpy call
done in 1873 nanoseconds

version2: inlined memcpy
did not finish

The second test was with 4 keys and 10000000 loops:
version1: one memcpy call
version2: inlined memcpy call

version1: done in 1519 nanoseconds
version2: done in 104981851 nanoseconds
(1.44692e-05 times faster)

version1: done in 1979 nanoseconds
version2: done in 110568901 nanoseconds
(1.78983e-05 times faster)

version1: done in 1814 nanoseconds
version2: done in 108555484 nanoseconds
(1.67103e-05 times faster)

version1: done in 1631 nanoseconds
version2: done in 109867919 nanoseconds
(1.48451e-05 times faster)

version1: done in 1269 nanoseconds
version2: done in 111639106 nanoseconds
(1.1367e-05 times faster)

Unless I'm doing something wrong, the single memcpy call wins!
memcpy1.c attached.

best regards,
Ranier Vilela
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <time.h>
#include <immintrin.h>


/*
 * Closer approximation of ScanKeyData - has function pointer and Datum.
 *
 * This is a stand-in used only by this benchmark (presumably modelled on
 * PostgreSQL's ScanKeyData; the real definition is not reproduced here).
 * What matters for the test is that the struct mixes int fields, a function
 * pointer and a pointer-sized integer.
 */
/* Pointer to a function taking no arguments and returning nothing. */
typedef void (*RegProcedure)(void);
/* Pointer-sized unsigned integer. */
typedef uintptr_t Datum;

typedef struct ScanKeyData
{
    int         sk_flags;       /* not touched by the copy routines in this file */
    int         sk_attno;       /* set to the 1-based array position by version1/version2 */
    RegProcedure sk_func;       /* function pointer member; never called in this file */
    Datum       sk_argument;    /* pointer-sized payload; copied verbatim */
} ScanKeyData;

/*
 * version1 - duplicate a key array with one bulk memcpy.
 *
 * Allocates room for n ScanKeyData entries, copies all n entries from key
 * with a single memcpy call, then overwrites sk_attno of every copy with its
 * 1-based position in the array.
 *
 * Returns the newly allocated array, which the caller must release with
 * free(), or NULL if n is negative or the allocation fails.
 */
const ScanKeyData * version1(int n, const ScanKeyData  * key)
{
    ScanKeyData *idxkey;
    size_t       nbytes;

    /* A negative count would be converted to a huge size_t below. */
    if (n < 0)
    {
        return NULL;
    }

    nbytes = (size_t) n * sizeof(ScanKeyData);
    idxkey = malloc(nbytes);
    if (idxkey == NULL)
    {
        return NULL;
    }

    /*
     * Copy the array contents.  The arguments must be the arrays themselves
     * (idxkey, key) and not the addresses of the pointer variables
     * (&idxkey, &key): the latter would write nbytes over a single
     * pointer-sized local and never touch the allocated buffer.
     */
    memcpy(idxkey, key, nbytes);
    for (int i = 0; i < n; i++)
    {
        idxkey[i].sk_attno = i + 1;
    }

    return idxkey;
}

/*
 * version2 - duplicate a key array with one memcpy per element.
 *
 * Allocates room for n ScanKeyData entries and, for each position, copies
 * the entry from key with its own memcpy call before overwriting sk_attno
 * with the 1-based position in the array.
 *
 * Returns the newly allocated array, which the caller must release with
 * free(), or NULL if n is negative or the allocation fails.
 */
const ScanKeyData * version2(int n, const ScanKeyData *key)
{
    ScanKeyData *idxkey;

    /* A negative count would be converted to a huge size_t below. */
    if (n < 0)
    {
        return NULL;
    }

    idxkey = malloc((size_t) n * sizeof(ScanKeyData));
    if (idxkey == NULL)
    {
        return NULL;
    }

    for (int i = 0; i < n; i++)
    {
        /* Deliberately one memcpy per key: this is the variant under test. */
        memcpy(&idxkey[i], &key[i], sizeof(ScanKeyData));
        idxkey[i].sk_attno = i + 1;
    }

    return idxkey;
}




#define NANOSEC_PER_SEC 1000000000

// Returns the difference t1 - t2 in nanoseconds (negative when t1 is
// earlier than t2).
//
// Both operands are widened to int64_t before any arithmetic, so the
// seconds-to-nanoseconds multiplication cannot overflow on platforms where
// time_t or long is only 32 bits wide.
int64_t
get_clock_diff(struct timespec *t1, struct timespec *t2)
{
	int64_t sec_diff = (int64_t) t1->tv_sec - (int64_t) t2->tv_sec;
	int64_t nsec_diff = (int64_t) t1->tv_nsec - (int64_t) t2->tv_nsec;

	return sec_diff * (int64_t) NANOSEC_PER_SEC + nsec_diff;
}




//#define NKEYS 10000000 version2 does not finish
#define NKEYS 4
#define LOOPS 10000000

void test1(int n)
{
	ScanKeyData *keys;
	ScanKeyData *idx;

        keys = (ScanKeyData *) malloc(NKEYS * sizeof(ScanKeyData));
	memset(keys, 0, NKEYS * sizeof(ScanKeyData));

	for(int i = 0; i < n; i++)
	{
		idx = version1(NKEYS, keys);
		free(idx);
	}
	free(keys);
}

void test2(int n)
{
	ScanKeyData *keys;
	ScanKeyData *idx;

        keys = (ScanKeyData *) malloc(NKEYS * sizeof(ScanKeyData));
	memset(keys, 0, NKEYS * sizeof(ScanKeyData));

	for(int i = 0; i < n; i++)
	{
		idx = version2(NKEYS, keys);
		free(idx);
	}
	free(keys);
}


/*
 * Benchmark driver: times test1 and test2 over LOOPS iterations each using
 * the process CPU-time clock, prints both durations in nanoseconds, then
 * prints the ratio version1_time / version2_time (a value below 1 means
 * version1 took less time).
 */
int main(void)
{
	struct timespec start,end;
	int64_t version1_time, version2_time;

	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
	test1(LOOPS);
	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
	version1_time = get_clock_diff(&end, &start);
	/* int64_t need not be long long, so cast to match the %lld conversion. */
	printf("version1: done in %lld nanoseconds\n", (long long) version1_time);

	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
	test2(LOOPS);
	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
	version2_time = get_clock_diff(&end, &start);
	printf("version2: done in %lld nanoseconds\n", (long long) version2_time);

	printf("(%g times faster)\n", (double) version1_time / (double) version2_time);

	return 0;
}

Reply via email to