Thanks for your suggestion. Attached are the modified patch and two test files.

-----Original Message-----
From: Michael Paquier <mich...@paquier.xyz>
Sent: Friday, October 20, 2023 4:19 PM
To: Xiang Gao <xiang....@arm.com>
Cc: pgsql-hackers@lists.postgresql.org
Subject: Re: CRC32C Parallel Computation Optimization on ARM

On Fri, Oct 20, 2023 at 07:08:58AM +0000, Xiang Gao wrote:
> This patch uses a parallel computing optimization algorithm to improve
> crc32c computing performance on ARM. The algorithm comes from Intel
> whitepaper:
> crc-iscsi-polynomial-crc32-instruction-paper. Input data is divided
> into three equal-sized blocks. Three parallel blocks (crc0, crc1,
> crc2) for 1024 Bytes. One Block: 42(BLK_LENGTH) * 8(step length:
> crc32c_u64) bytes
>
> Crc32c unit test:
> https://gist.github.com/gaoxyt/138fd53ca1eead8102eeb9204067f7e4
> Crc32c benchmark:
> https://gist.github.com/gaoxyt/4506c10fc06b3501445e32c4257113e9
> It gets ~2x speedup compared to linear Arm crc32c instructions.

Interesting.  Could you attach to this thread the test files you used and the
results obtained please?  If this data gets deleted from github, then it would
not be possible to refer back to what you did or to the related benchmark results.

Note that your patch is forgetting about meson; it just patches ./configure.
--
Michael
IMPORTANT NOTICE: The contents of this email and any attachments are 
confidential and may also be privileged. If you are not the intended recipient, 
please notify the sender immediately and do not disclose the contents to any 
other person, use it for any purpose, or store or copy the information in any 
medium. Thank you.

Attachment: 0002-crc32c-parallel-computation-optimization-on-arm.patch
Description: 0002-crc32c-parallel-computation-optimization-on-arm.patch

/*********************************************************************
* Compile postgres first with each crc32c implementation (with and without
* arm vmull_p64).
* Some elog-related code in pg_crc32c_armv8_choose.c has to be commented out
* so that this test compiles correctly and simply.
* $ gcc   -I ../postgres/_install/include -I 
../postgres/_install/include/server main.c \
* -L ../postgres/build/src/port -l pgport_srv -O2  -o main

* this test was run on Neoverse-N1
* $ ./main.no_vmull
* data size is 512 bytes, and compute crc cost 139 us totally, 0.135742 us per 
loop
* data size is 4096 bytes, and compute crc cost 1061 us totally, 1.036133 us 
per loop

* $ ./main.use_vmull
* data size is 512 bytes, and compute crc cost 101 us totally, 0.098633 us per 
loop
* data size is 4096 bytes, and compute crc cost 540 us totally, 0.527344 us per 
loop

* We can see that computing crc32c without vmull_p64 costs about twice as
* much as computing it with vmull_p64 when the data size is large, and that
* the cost is almost the same when the data size is small.
*********************************************************************/

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <sys/time.h>
#include <memory.h>

#include "c.h"
#include "port/pg_crc32c.h"

/*
 * Return the current wall-clock time, as reported by gettimeofday(),
 * expressed in microseconds.
 *
 * tv_sec is widened to uint64_t before it is scaled, so the multiplication
 * is carried out in 64 bits even where time_t is a 32-bit type.
 */
uint64_t
GetTickCount(void)
{
        struct timeval tv;

        gettimeofday(&tv, NULL);
        return (uint64_t) tv.tv_sec * 1000000u + (uint64_t) tv.tv_usec;
}

/*
 * Benchmark entry point.
 *
 * For each buffer size in test_size: fill a buffer with rand() output (seeded
 * with srand(0) so every run sees the same bytes), fold the buffer into one
 * running CRC kLoop times, and print the total and per-iteration elapsed
 * time in microseconds.
 *
 * Returns 0 on success, EXIT_FAILURE if a buffer cannot be allocated.
 */
int
main(void)
{
#define CASE_CNT 2
        uint32_t        test_size[CASE_CNT] = {512, 1024 * 4};

        for (int case_cnt = 0; case_cnt < CASE_CNT; case_cnt++)
        {
                static const uint32_t kLoop = 1024;
                uint32_t        size = test_size[case_cnt];
                uint8_t    *buf = malloc(size * sizeof(uint8_t));
                uint32_t        crc = 0;
                uint64_t        start;
                uint64_t        stop;

                if (buf == NULL)
                {
                        fprintf(stderr, "could not allocate %u bytes\n",
                                        (unsigned int) size);
                        return EXIT_FAILURE;
                }

                srand(0);
                for (uint32_t i = 0; i < size; i++)
                {
                        buf[i] = (uint8_t) (rand() % 256u);
                }

                /* Only the CRC computation itself is inside the timed region. */
                start = GetTickCount();
                INIT_CRC32C(crc);
                for (uint32_t i = 0; i < kLoop; i++)
                {
                        COMP_CRC32C(crc, buf, size);
                }
                FIN_CRC32C(crc);
                stop = GetTickCount();

                /*
                 * Cast to the types named by the conversion specifiers so the
                 * call is well-defined regardless of how uint32_t/uint64_t are
                 * defined on the platform.
                 */
                printf("data size is %u bytes, and compute crc cost %llu us totally, %f us per loop\n",
                           (unsigned int) size,
                           (unsigned long long) (stop - start),
                           (double) (stop - start) / kLoop);

                free(buf);
        }
#undef CASE_CNT
        return 0;
}

/*******************************************************************************
* We use libcheck(https://github.com/libcheck/check) as unit testing framework.

* Compile postgres first with each crc32c implementation (with and without the
* arm crc32c and vmull intrinsics). Some elog-related code in
* pg_crc32c_armv8_choose.c has to be commented out so that this compiles
* correctly and simply.
* $ gcc -I ../postgres/_install/include -I ../postgres/_install/include/server \
  crc32c_unittest.c  -L ../postgres/build/src/port -l pgport_srv  -L 
/usr/local/lib \
  -lcheck  -o crc32c_unittest

* this test was run on Neoverse-N1
* $ ./crc32c_unittest 
* Running suite(s): CRC32C
* 100%: Checks: 3, Failures: 0, Errors: 0
*******************************************************************************/
#include <stdlib.h>
#include <check.h>

#include "c.h"
#include "port/pg_crc32c.h"

/*
 * Checksum a single zero-valued 32-bit word and compare the result with the
 * expected CRC value.
 *
 * Both the accumulator and the input are fixed-width unsigned types: the
 * expected value is for exactly four zero bytes, and the CRC is compared as
 * an unsigned quantity.
 */
START_TEST (test_crc32c_0)
{
    uint32_t crc = 0;
    uint32_t data = 0;

    INIT_CRC32C(crc);
    COMP_CRC32C(crc, &data, sizeof(data));
    FIN_CRC32C(crc);
    ck_assert_uint_eq(crc, 0x48674bc7);
}
END_TEST

/*
 * Checksum a 512-byte all-zero buffer and compare the result with the
 * expected CRC value.
 */
START_TEST (test_crc32c_small_size)
{
    uint32_t crc = 0;
    size_t size = 512;
    /* calloc hands back zero-filled memory, which is the input this test needs. */
    uint8_t *buf = calloc(size, sizeof(*buf));

    /* Fail the test cleanly instead of dereferencing NULL on allocation failure. */
    ck_assert_msg(buf != NULL, "could not allocate test buffer");

    INIT_CRC32C(crc);
    COMP_CRC32C(crc, buf, size * sizeof(*buf));
    FIN_CRC32C(crc);
    ck_assert_uint_eq(crc, 0x30fcedc0);

    free(buf);
}
END_TEST

/*
 * Checksum a 4096-byte buffer in which every byte is 0xFF and compare the
 * result with the expected CRC value.
 */
START_TEST (test_crc32c_large_size)
{
    uint32_t crc = 0;
    size_t size = 4096;
    uint8_t *buf = malloc(size * sizeof(*buf));

    /* Fail the test cleanly instead of dereferencing NULL on allocation failure. */
    ck_assert_msg(buf != NULL, "could not allocate test buffer");

    /*
     * Store 0xFF directly rather than OR-ing it into the freshly allocated
     * (uninitialized) bytes, so no uninitialized memory is ever read.
     */
    for (size_t i = 0; i < size; i++) {
        buf[i] = 0xFF;
    }

    INIT_CRC32C(crc);
    COMP_CRC32C(crc, buf, size * sizeof(*buf));
    FIN_CRC32C(crc);
    ck_assert_uint_eq(crc, 0x25c1fe13);

    free(buf);
}
END_TEST


/*
 * Build the "CRC32C" suite: a single "Core" test case holding the three
 * CRC32C tests defined in this file. Returns the newly created suite.
 */
Suite * crc32c_suite(void)
{
    Suite *suite = suite_create("CRC32C");
    TCase *core_case = tcase_create("Core");

    /* Register the tests, then attach the populated case to the suite. */
    tcase_add_test(core_case, test_crc32c_0);
    tcase_add_test(core_case, test_crc32c_small_size);
    tcase_add_test(core_case, test_crc32c_large_size);
    suite_add_tcase(suite, core_case);

    return suite;
}

/*
 * Run the CRC32C suite with CK_NORMAL output and report the outcome through
 * the exit status: EXIT_SUCCESS when no test failed, EXIT_FAILURE otherwise.
 */
int main()
{
    SRunner *runner = srunner_create(crc32c_suite());
    int failures;

    srunner_run_all(runner, CK_NORMAL);

    /* Read the failure count before the runner is released. */
    failures = srunner_ntests_failed(runner);
    srunner_free(runner);

    if (failures != 0) {
        return EXIT_FAILURE;
    }
    return EXIT_SUCCESS;
}

Reply via email to