Hi,
On Tue, Nov 12, 2024 at 03:56:13PM +0900, Michael Paquier wrote:
> On Tue, Nov 12, 2024 at 06:09:04AM +0000, Bertrand Drouvot wrote:
> > I think that the 64b len check done in v11 is mandatory for safety reasons.
> >
> > The loop above reads 64 bytes at once, so would read beyond the memory area
> > bounds
> > if len < 64: That could cause crash or read invalid data.
>
> Sorry, I was not following your argument. You're right that we need
> something else here. However..
>
> + /*
> + * For len < 64, compare byte per byte to ensure we'll not read beyond
> the
> + * memory area.
> + */
> + if (len < sizeof(size_t) * 8)
> + {
> + while (p < end)
> + {
> + if (*p++ != 0)
> + return false;
> + }
> + return true;
> + }
> +
> + /* Compare bytes until the pointer "p" is aligned */
> + while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
> + {
> + if (p == end)
> + return true;
> +
> + if (*p++ != 0)
> + return false;
> + }
> +
>
> Still, this is not optimal, based on what's been discussed upthread.
> The byte-per-byte check is more expensive than the size_t check,
I think that depends on the memory area size. If the size is small enough, then
the byte-per-byte comparison can be good enough.
For example, with the allzeros_small.c attached:
== with BLCKSZ 32
$ /usr/local/gcc-14.1.0/bin/gcc-14.1.0 -march=native -O2 allzeros_small.c -o
allzeros_small ; ./allzeros_small
byte per byte: done in 22528 nanoseconds
size_t: done in 6949 nanoseconds (3.24191 times faster than byte per byte)
SIMD v10: done in 7562 nanoseconds (2.97911 times faster than byte per byte)
SIMD v11: done in 22096 nanoseconds (1.01955 times faster than byte per byte)
== with BLCKSZ 63
$ /usr/local/gcc-14.1.0/bin/gcc-14.1.0 -march=native -O2 allzeros_small.c -o
allzeros_small ; ./allzeros_small
byte per byte: done in 29246 nanoseconds
size_t: done in 10555 nanoseconds (2.77082 times faster than byte per byte)
SIMD v10: done in 11220 nanoseconds (2.6066 times faster than byte per byte)
SIMD v11: done in 29126 nanoseconds (1.00412 times faster than byte per byte)
Obviously v11 takes about the same time as "byte per byte", but we can see that the
size_t or v10 improvement is not that large for small sizes.
While for larger size:
== with BLCKSZ 256
$ /usr/local/gcc-14.1.0/bin/gcc-14.1.0 -march=native -O2 allzeros_small.c -o
allzeros_small ; ./allzeros_small
byte per byte: done in 102703 nanoseconds
size_t: done in 15381 nanoseconds (6.67726 times faster than byte per byte)
SIMD v10: done in 7241 nanoseconds (14.1835 times faster than byte per byte)
SIMD v11: done in 7899 nanoseconds (13.002 times faster than byte per byte)
== with BLCKSZ 8192
$ /usr/local/gcc-14.1.0/bin/gcc-14.1.0 -march=native -O2 allzeros_small.c -o
allzeros_small ; ./allzeros_small
byte per byte: done in 2993458 nanoseconds
size_t: done in 436650 nanoseconds (6.85551 times faster than byte per byte)
SIMD v10: done in 136413 nanoseconds (21.9441 times faster than byte per byte)
SIMD v11: done in 155474 nanoseconds (19.2538 times faster than byte per byte)
That's a substantial improvement.
> shouldn't you make sure that you stack some size_t checks if dealing
> with something smaller than 64 bytes?
Based on the above, I have the feeling that doing a byte-per-byte comparison for
small sizes only (< 64 bytes) is good enough. I'm not sure that adding extra
complexity for small sizes is worth it.
Regards,
--
Bertrand Drouvot
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com
#include <immintrin.h>
#include <inttypes.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#define BLCKSZ 32
#define LOOPS 1000
/*
 * allzeros_byte_per_byte
 *
 * Return true iff the first "len" bytes at "ptr" are all zero, scanning one
 * byte at a time and bailing out at the first non-zero byte.
 */
static inline bool
allzeros_byte_per_byte(const void *ptr, size_t len)
{
	const unsigned char *buf = (const unsigned char *) ptr;

	for (size_t i = 0; i < len; i++)
	{
		if (buf[i] != 0)
			return false;
	}
	return true;
}
/*
 * allzeros_size_t
 *
 * Return true iff the first "len" bytes at "ptr" are all zero.
 *
 * Compares byte by byte until "p" reaches size_t alignment, then whole
 * size_t words at a time, then byte by byte for the unaligned tail.
 */
static inline bool
allzeros_size_t(const void *ptr, size_t len)
{
	const unsigned char *p = (const unsigned char *) ptr;
	const unsigned char *end = &p[len];
	/* "end" rounded down to the last size_t boundary within the area */
	const unsigned char *aligned_end = (const unsigned char *)
		((uintptr_t) end & (~(sizeof(size_t) - 1)));

	/* Compare bytes until the pointer "p" is aligned */
	while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
	{
		if (p == end)
			return true;
		if (*p++ != 0)
			return false;
	}

	/*
	 * Compare remaining size_t-aligned chunks.
	 *
	 * aligned_end can't be > end because it is "end" rounded down, and both
	 * "p" and aligned_end are size_t-aligned here, so each word read at
	 * p < aligned_end ends at or before "end" — no risk of reading beyond
	 * the memory area.  (NOTE(review): the size_t load through a cast
	 * pointer assumes the buffer may be accessed as size_t, as is the case
	 * for the size_t array used by main() — confirm for other callers.)
	 */
	for (; p < aligned_end; p += sizeof(size_t))
	{
		if (*(size_t *) p != 0)
			return false;
	}

	/* Compare remaining bytes until the end */
	while (p < end)
	{
		if (*p++ != 0)
			return false;
	}
	return true;
}
/*
 * pg_memory_is_all_zeros_v10
 *
 * Return true iff the first "len" bytes at "ptr" are all zero.
 *
 * Compares byte by byte until "p" is size_t-aligned, then 8 size_t words at
 * a time (letting the compiler vectorize), then word by word, then byte by
 * byte for the tail.
 */
bool
pg_memory_is_all_zeros_v10(const void *ptr, size_t len)
{
	const unsigned char *p = (const unsigned char *) ptr;
	const unsigned char *end = &p[len];
	/* "end" rounded down to the last size_t boundary within the area */
	const unsigned char *aligned_end = (const unsigned char *)
		((uintptr_t) end & (~(sizeof(size_t) - 1)));

	/* Compare bytes until the pointer "p" is aligned */
	while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
	{
		if (p == end)
			return true;
		if (*p++ != 0)
			return false;
	}

	/*
	 * Compare 8 * sizeof(size_t) chunks at once.
	 *
	 * For performance reasons, we manually unroll this loop and purposefully
	 * use bitwise-ORs to combine each comparison.  This prevents boolean
	 * short-circuiting and lets the compiler know that it's safe to access
	 * all 8 elements regardless of the result of the other comparisons.
	 * This seems to be enough to coax a few compilers into using SIMD
	 * instructions.
	 *
	 * Guard on the remaining distance instead of computing "aligned_end -
	 * 7 * sizeof(size_t)": this function (unlike v11) has no len < 64
	 * check, so for a small area that expression could form a pointer
	 * before the start of the buffer, which is undefined behavior.  The
	 * distance test is equivalent for the values that matter (both pointers
	 * are size_t-aligned here, so a distance > 56 is a distance >= 64) and
	 * keeps every 64-byte read fully inside the memory area.
	 */
	for (; aligned_end - p >= (ptrdiff_t) (sizeof(size_t) * 8);
		 p += sizeof(size_t) * 8)
	{
		if ((((size_t *) p)[0] != 0) | (((size_t *) p)[1] != 0) |
			(((size_t *) p)[2] != 0) | (((size_t *) p)[3] != 0) |
			(((size_t *) p)[4] != 0) | (((size_t *) p)[5] != 0) |
			(((size_t *) p)[6] != 0) | (((size_t *) p)[7] != 0))
			return false;
	}

	/*
	 * Compare remaining size_t-aligned chunks.
	 *
	 * aligned_end can't be > end because it is "end" rounded down, so these
	 * word reads stay within the memory area.
	 */
	for (; p < aligned_end; p += sizeof(size_t))
	{
		if (*(size_t *) p != 0)
			return false;
	}

	/* Compare remaining bytes until the end */
	while (p < end)
	{
		if (*p++ != 0)
			return false;
	}
	return true;
}
/*
 * pg_memory_is_all_zeros_v11
 *
 * Return true iff the first "len" bytes at "ptr" are all zero.
 *
 * Small areas (len < 64) are scanned byte by byte; larger areas are aligned
 * and then scanned 8 size_t words at a time.
 */
bool
pg_memory_is_all_zeros_v11(const void *ptr, size_t len)
{
	const unsigned char *p = (const unsigned char *) ptr;
	const unsigned char *end = &p[len];
	/* "end" rounded down to the last size_t boundary within the area */
	const unsigned char *aligned_end = (const unsigned char *)
		((uintptr_t) end & (~(sizeof(size_t) - 1)));

	/*
	 * For len < 64, compare byte per byte to ensure we'll not read beyond
	 * the memory area.
	 */
	if (len < sizeof(size_t) * 8)
	{
		while (p < end)
		{
			if (*p++ != 0)
				return false;
		}
		return true;
	}

	/* Compare bytes until the pointer "p" is aligned */
	while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
	{
		if (p == end)
			return true;
		if (*p++ != 0)
			return false;
	}

	/*
	 * Compare 8 * sizeof(size_t) chunks at once.
	 *
	 * For performance reasons, we manually unroll this loop and purposefully
	 * use bitwise-ORs to combine each comparison.  This prevents boolean
	 * short-circuiting and lets the compiler know that it's safe to access
	 * all 8 elements regardless of the result of the other comparisons.
	 * This seems to be enough to coax a few compilers into using SIMD
	 * instructions.
	 *
	 * There is no risk of reading beyond the memory area thanks to the
	 * len < 64 check done above: len >= 64 here, so "aligned_end -
	 * 7 * sizeof(size_t)" still points inside the area and each iteration's
	 * 64-byte read ends at or before aligned_end.
	 */
	for (; p < aligned_end - (sizeof(size_t) * 7); p += sizeof(size_t) * 8)
	{
		if ((((size_t *) p)[0] != 0) | (((size_t *) p)[1] != 0) |
			(((size_t *) p)[2] != 0) | (((size_t *) p)[3] != 0) |
			(((size_t *) p)[4] != 0) | (((size_t *) p)[5] != 0) |
			(((size_t *) p)[6] != 0) | (((size_t *) p)[7] != 0))
			return false;
	}

	/*
	 * Compare remaining size_t-aligned chunks.
	 *
	 * aligned_end can't be > end because it is "end" rounded down, and the
	 * len < 64 check above already handled len < 8.  So, no risk of reading
	 * beyond the memory area.
	 */
	for (; p < aligned_end; p += sizeof(size_t))
	{
		if (*(size_t *) p != 0)
			return false;
	}

	/* Compare remaining bytes until the end */
	while (p < end)
	{
		if (*p++ != 0)
			return false;
	}
	return true;
}
#define NANOSEC_PER_SEC 1000000000

/* Return the elapsed time (*t1 - *t2) expressed in nanoseconds. */
int64_t
get_clock_diff(struct timespec *t1, struct timespec *t2)
{
	int64_t elapsed_ns = (t1->tv_nsec - t2->tv_nsec);

	elapsed_ns += (t1->tv_sec - t2->tv_sec) * NANOSEC_PER_SEC;
	return elapsed_ns;
}
/*
 * Micro-benchmark driver: time LOOPS calls of each all-zeros checker over a
 * BLCKSZ-byte area and report each variant's speedup relative to the
 * byte-per-byte baseline.
 */
int main()
{
	/*
	 * NOTE(review): this allocates BLCKSZ size_t elements (8 * BLCKSZ bytes
	 * on LP64) while only the first BLCKSZ bytes are checked; declaring the
	 * buffer as size_t also guarantees size_t alignment for the word-wise
	 * variants.  Confirm the extra footprint is intentional.
	 */
	size_t pagebytes[BLCKSZ] = {0};
	/* volatile so the optimizer can't discard the benchmarked calls */
	volatile bool result;
	struct timespec start, end;
	int64_t byte_time, size_t_time;

	/* Baseline: byte-per-byte scan. */
	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
	for (int i = 0; i < LOOPS; i++)
	{
		result = allzeros_byte_per_byte(pagebytes, BLCKSZ);
	}
	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
	byte_time = get_clock_diff(&end, &start);
	/*
	 * Use PRId64 for int64_t: the original "%ld" is mismatched (undefined
	 * behavior) on platforms where long is 32-bit.  Output is unchanged on
	 * LP64 systems.
	 */
	printf("byte per byte: done in %" PRId64 " nanoseconds\n", byte_time);

	/* size_t-word-at-a-time variant. */
	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
	for (int i = 0; i < LOOPS; i++)
	{
		result = allzeros_size_t(pagebytes, BLCKSZ);
	}
	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
	size_t_time = get_clock_diff(&end, &start);
	printf("size_t: done in %" PRId64 " nanoseconds (%g times faster than byte per byte)\n", size_t_time, (double) byte_time / size_t_time);

	/* Unrolled 64-bytes-at-a-time variant (v10, no small-length check). */
	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
	for (int i = 0; i < LOOPS; i++)
	{
		result = pg_memory_is_all_zeros_v10(pagebytes, BLCKSZ);
	}
	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
	size_t_time = get_clock_diff(&end, &start);
	printf("SIMD v10: done in %" PRId64 " nanoseconds (%g times faster than byte per byte)\n", size_t_time, (double) byte_time / size_t_time);

	/* Unrolled variant with a byte-wise fallback for len < 64 (v11). */
	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
	for (int i = 0; i < LOOPS; i++)
	{
		result = pg_memory_is_all_zeros_v11(pagebytes, BLCKSZ);
	}
	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
	size_t_time = get_clock_diff(&end, &start);
	printf("SIMD v11: done in %" PRId64 " nanoseconds (%g times faster than byte per byte)\n", size_t_time, (double) byte_time / size_t_time);

	return 0;
}