From: Mattias Rönnblom <[email protected]> Add functional, performance, and profiling test suites for the fastmem library.
Signed-off-by: Mattias Rönnblom <[email protected]> --- app/test/meson.build | 3 + app/test/test_fastmem.c | 1682 +++++++++++++++++++++++++++++++ app/test/test_fastmem_perf.c | 997 ++++++++++++++++++ app/test/test_fastmem_profile.c | 157 +++ 4 files changed, 2839 insertions(+) create mode 100644 app/test/test_fastmem.c create mode 100644 app/test/test_fastmem_perf.c create mode 100644 app/test/test_fastmem_profile.c diff --git a/app/test/meson.build b/app/test/meson.build index 7d458f9c07..d11c63be6f 100644 --- a/app/test/meson.build +++ b/app/test/meson.build @@ -82,6 +82,9 @@ source_file_deps = { 'test_event_vector_adapter.c': ['eventdev', 'bus_vdev'], 'test_eventdev.c': ['eventdev', 'bus_vdev'], 'test_external_mem.c': [], + 'test_fastmem.c': ['fastmem'], + 'test_fastmem_perf.c': ['fastmem', 'mempool'], + 'test_fastmem_profile.c': ['fastmem'], 'test_fbarray.c': [], 'test_fib.c': ['net', 'fib'], 'test_fib6.c': ['rib', 'fib'], diff --git a/app/test/test_fastmem.c b/app/test/test_fastmem.c new file mode 100644 index 0000000000..c79ea95481 --- /dev/null +++ b/app/test/test_fastmem.c @@ -0,0 +1,1682 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2026 Ericsson AB + */ + +#include <errno.h> +#include <inttypes.h> +#include <stdalign.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include <rte_common.h> +#include <rte_errno.h> +#include <rte_lcore.h> +#include <rte_memory.h> +#include <rte_memzone.h> +#include <rte_thread.h> + +#include <rte_fastmem.h> + +#include "test.h" + +#define FASTMEM_MEMZONE_SIZE (128U << 20) + +/* + * Count memzones whose names begin with the fastmem prefix. + * Used to verify that rte_fastmem_reserve() really did reserve + * backing memzones. 
+ */ +static int fastmem_memzone_count; + +static void +count_fastmem_memzones_walk(const struct rte_memzone *mz, void *arg) +{ + RTE_SET_USED(arg); + + if (strncmp(mz->name, "fastmem_", strlen("fastmem_")) == 0) + fastmem_memzone_count++; +} + +static unsigned int +count_fastmem_memzones(void) +{ + fastmem_memzone_count = 0; + rte_memzone_walk(count_fastmem_memzones_walk, NULL); + return fastmem_memzone_count; +} + +static int +test_init_deinit(void) +{ + int rc; + + rc = rte_fastmem_init(); + TEST_ASSERT_EQUAL(rc, 0, "rte_fastmem_init() failed: %d", rc); + + rte_fastmem_deinit(); + + /* A subsequent init/deinit cycle must succeed. */ + rc = rte_fastmem_init(); + TEST_ASSERT_EQUAL(rc, 0, "second rte_fastmem_init() failed: %d", rc); + + rte_fastmem_deinit(); + + return TEST_SUCCESS; +} + +static int +test_init_is_not_idempotent(void) +{ + int rc; + + rc = rte_fastmem_init(); + TEST_ASSERT_EQUAL(rc, 0, "rte_fastmem_init() failed: %d", rc); + + rc = rte_fastmem_init(); + TEST_ASSERT_EQUAL(rc, -EBUSY, + "expected -EBUSY on re-init, got %d", rc); + + rte_fastmem_deinit(); + + return TEST_SUCCESS; +} + +static int +test_deinit_without_init(void) +{ + /* Must be a no-op, not a crash. */ + rte_fastmem_deinit(); + + return TEST_SUCCESS; +} + +static int +test_max_size(void) +{ + size_t max; + + max = rte_fastmem_max_size(); + TEST_ASSERT(max >= (1U << 20), + "max_size=%zu below required 1 MiB minimum", max); + + return TEST_SUCCESS; +} + +static int +test_reserve_small(void) +{ + int socket_id; + unsigned int before, after; + int rc; + + socket_id = rte_socket_id_by_idx(0); + TEST_ASSERT(socket_id >= 0, "no available sockets"); + + before = count_fastmem_memzones(); + + /* + * A small reserve request (1 byte) must result in exactly + * one memzone reservation: the internal rounding is to + * memzone granularity. 
+ */ + rc = rte_fastmem_reserve(1, socket_id); + TEST_ASSERT_EQUAL(rc, 0, "rte_fastmem_reserve() failed: %d", rc); + + after = count_fastmem_memzones(); + TEST_ASSERT_EQUAL(after - before, 1, + "expected 1 new memzone, got %u", after - before); + + rte_fastmem_deinit(); + + /* After deinit the memzones must be released. */ + TEST_ASSERT_EQUAL(count_fastmem_memzones(), 0, + "%u fastmem memzones leaked after deinit", + count_fastmem_memzones()); + + return TEST_SUCCESS; +} + +static int +test_reserve_multiple_memzones(void) +{ + int socket_id; + unsigned int before, after; + size_t reserve_size; + int rc; + + socket_id = rte_socket_id_by_idx(0); + TEST_ASSERT(socket_id >= 0, "no available sockets"); + + before = count_fastmem_memzones(); + + /* + * Request just over one memzone's worth; this must force + * a second memzone to be reserved. + */ + reserve_size = FASTMEM_MEMZONE_SIZE + 1; + rc = rte_fastmem_reserve(reserve_size, socket_id); + TEST_ASSERT_EQUAL(rc, 0, "rte_fastmem_reserve(%zu) failed: %d", + reserve_size, rc); + + after = count_fastmem_memzones(); + TEST_ASSERT_EQUAL(after - before, 2, + "expected 2 new memzones for %zu-byte reserve, got %u", + reserve_size, after - before); + + + return TEST_SUCCESS; +} + +static int +test_reserve_cumulative(void) +{ + int socket_id; + unsigned int after_first, after_second; + int rc; + + socket_id = rte_socket_id_by_idx(0); + TEST_ASSERT(socket_id >= 0, "no available sockets"); + + rc = rte_fastmem_reserve(FASTMEM_MEMZONE_SIZE, socket_id); + TEST_ASSERT_EQUAL(rc, 0, "first reserve failed: %d", rc); + + after_first = count_fastmem_memzones(); + + /* + * A second call requesting the same amount that's already + * reserved must not trigger any new memzone reservation. 
+ */ + rc = rte_fastmem_reserve(FASTMEM_MEMZONE_SIZE, socket_id); + TEST_ASSERT_EQUAL(rc, 0, "second reserve failed: %d", rc); + + after_second = count_fastmem_memzones(); + TEST_ASSERT_EQUAL(after_first, after_second, + "reserve of already-reserved amount added memzones (%u -> %u)", + after_first, after_second); + + + return TEST_SUCCESS; +} + +static int +test_reserve_invalid_socket(void) +{ + int rc; + + rc = rte_fastmem_reserve(1, RTE_MAX_NUMA_NODES); + TEST_ASSERT_EQUAL(rc, -EINVAL, + "expected -EINVAL for out-of-range socket, got %d", rc); + + rc = rte_fastmem_reserve(1, -2); + TEST_ASSERT_EQUAL(rc, -EINVAL, + "expected -EINVAL for negative socket, got %d", rc); + + + return TEST_SUCCESS; +} + +static int +test_reserve_without_init(void) +{ + int rc; + + rc = rte_fastmem_reserve(1, SOCKET_ID_ANY); + TEST_ASSERT(rc < 0, + "expected failure without init, got %d", rc); + + return TEST_SUCCESS; +} + +static int +test_reserve_any_socket(void) +{ + unsigned int before, after; + int rc; + + before = count_fastmem_memzones(); + + /* + * SOCKET_ID_ANY should succeed on any system with at least + * one configured socket. The allocator picks the caller's + * socket first and falls back to other sockets if needed. + */ + rc = rte_fastmem_reserve(1, SOCKET_ID_ANY); + TEST_ASSERT_EQUAL(rc, 0, + "rte_fastmem_reserve(SOCKET_ID_ANY) failed: %d", rc); + + after = count_fastmem_memzones(); + TEST_ASSERT_EQUAL(after - before, 1, + "expected 1 new memzone, got %u", after - before); + + + return TEST_SUCCESS; +} + +/* + * Stage 2 tests: allocation and free. 
+ */ + +static int +test_alloc_too_big(void) +{ + void *p; + int rc; + + rte_errno = 0; + p = rte_fastmem_alloc(rte_fastmem_max_size() + 1, 0, 0); + TEST_ASSERT_NULL(p, "alloc above max_size returned non-NULL"); + TEST_ASSERT_EQUAL(rte_errno, E2BIG, + "expected rte_errno=E2BIG, got %d", rte_errno); + + + return TEST_SUCCESS; +} + +static int +test_alloc_invalid_align(void) +{ + void *p; + int rc; + + rte_errno = 0; + p = rte_fastmem_alloc(16, 3, 0); /* 3 is not a power of 2 */ + TEST_ASSERT_NULL(p, "alloc with align=3 returned non-NULL"); + TEST_ASSERT_EQUAL(rte_errno, EINVAL, + "expected rte_errno=EINVAL, got %d", rte_errno); + + + return TEST_SUCCESS; +} + +static int +test_alloc_free_small(void) +{ + void *p; + int rc; + + p = rte_fastmem_alloc(8, 0, 0); + TEST_ASSERT_NOT_NULL(p, "alloc(8) failed: rte_errno=%d", rte_errno); + + /* Writing into the object must not crash. */ + memset(p, 0xa5, 8); + + rte_fastmem_free(p); + + + return TEST_SUCCESS; +} + +static int +test_alloc_free_various_sizes(void) +{ + static const size_t sizes[] = { + 1, 8, 16, 17, 63, 64, 128, 1024, 4096, + 64 * 1024, 256 * 1024, 1024 * 1024, + }; + void *ptrs[RTE_DIM(sizes)]; + unsigned int i; + int rc; + + for (i = 0; i < RTE_DIM(sizes); i++) { + ptrs[i] = rte_fastmem_alloc(sizes[i], 0, 0); + TEST_ASSERT_NOT_NULL(ptrs[i], + "alloc(%zu) failed: rte_errno=%d", + sizes[i], rte_errno); + memset(ptrs[i], 0x5a, sizes[i]); + } + + for (i = 0; i < RTE_DIM(sizes); i++) + rte_fastmem_free(ptrs[i]); + + + return TEST_SUCCESS; +} + +static int +test_alloc_alignment(void) +{ + static const size_t aligns[] = { + 8, 16, 64, 256, 4096, 65536, + }; + unsigned int i; + int rc; + + for (i = 0; i < RTE_DIM(aligns); i++) { + void *p = rte_fastmem_alloc(1, aligns[i], 0); + + TEST_ASSERT_NOT_NULL(p, + "alloc(1, align=%zu) failed: rte_errno=%d", + aligns[i], rte_errno); + TEST_ASSERT((uintptr_t)p % aligns[i] == 0, + "pointer %p not aligned on %zu", + p, aligns[i]); + rte_fastmem_free(p); + } + + /* Default 
(align=0) gives at least RTE_CACHE_LINE_SIZE. */ + { + void *p = rte_fastmem_alloc(1, 0, 0); + + TEST_ASSERT_NOT_NULL(p, + "alloc(1, align=0) failed: rte_errno=%d", rte_errno); + TEST_ASSERT((uintptr_t)p % RTE_CACHE_LINE_SIZE == 0, + "default-align pointer %p not cache-line aligned", + p); + rte_fastmem_free(p); + } + + + return TEST_SUCCESS; +} + +static int +test_alloc_zero_flag(void) +{ + uint8_t *p; + unsigned int i; + int rc; + bool all_zero = true; + + /* + * Dirty a slab first by allocating without F_ZERO, writing + * a non-zero pattern, and freeing. A subsequent F_ZERO + * allocation on the same slab must return zeroed memory. + */ + p = rte_fastmem_alloc(128, 0, 0); + TEST_ASSERT_NOT_NULL(p, "priming alloc failed"); + memset(p, 0xff, 128); + rte_fastmem_free(p); + + p = rte_fastmem_alloc(128, 0, RTE_FASTMEM_F_ZERO); + TEST_ASSERT_NOT_NULL(p, "F_ZERO alloc failed"); + for (i = 0; i < 128; i++) { + if (p[i] != 0) { + all_zero = false; + break; + } + } + TEST_ASSERT(all_zero, "F_ZERO returned non-zero byte at offset %u", i); + + rte_fastmem_free(p); + + + return TEST_SUCCESS; +} + +static int +test_alloc_reuse(void) +{ + void *first, *second; + int rc; + + first = rte_fastmem_alloc(64, 0, 0); + TEST_ASSERT_NOT_NULL(first, "first alloc failed"); + rte_fastmem_free(first); + + second = rte_fastmem_alloc(64, 0, 0); + TEST_ASSERT_NOT_NULL(second, "second alloc failed"); + + /* + * The slab's free list is LIFO, so the most recently freed + * object is at the head of the list. A subsequent alloc in + * the same class returns it. + */ + TEST_ASSERT_EQUAL(first, second, + "free + alloc did not reuse: first=%p second=%p", + first, second); + + rte_fastmem_free(second); + + + return TEST_SUCCESS; +} + +static int +test_alloc_many_in_class(void) +{ + /* + * Allocate more objects in one class than fit in a single + * slab, forcing the bin to pull a second block. This + * exercises the partial->full transition and the cross-slab + * allocation path. 
+ */ + enum { CLASS_SIZE = 8, COUNT = 300000 }; + void **ptrs; + unsigned int i; + int rc; + + ptrs = calloc(COUNT, sizeof(*ptrs)); + TEST_ASSERT_NOT_NULL(ptrs, "calloc for test ptrs failed"); + + for (i = 0; i < COUNT; i++) { + ptrs[i] = rte_fastmem_alloc(CLASS_SIZE, 0, 0); + TEST_ASSERT_NOT_NULL(ptrs[i], + "alloc[%u] failed: rte_errno=%d", + i, rte_errno); + } + + for (i = 0; i < COUNT; i++) + rte_fastmem_free(ptrs[i]); + + free(ptrs); + + return TEST_SUCCESS; +} + +static int +test_alloc_socket(void) +{ + void *p; + int socket_id; + int rc; + + socket_id = rte_socket_id_by_idx(0); + TEST_ASSERT(socket_id >= 0, "no available sockets"); + + p = rte_fastmem_alloc_socket(64, 0, 0, socket_id); + TEST_ASSERT_NOT_NULL(p, + "alloc_socket(%d) failed: rte_errno=%d", + socket_id, rte_errno); + + rte_fastmem_free(p); + + + return TEST_SUCCESS; +} + +static int +test_alloc_block_repurposing(void) +{ + void *small, *large; + int rc; + + /* + * Allocate and free a small object, forcing a block to be + * assigned to the small class and then returned to the + * free-block pool. A subsequent allocation in a different + * class must be able to reuse that block. + */ + small = rte_fastmem_alloc(8, 0, 0); + TEST_ASSERT_NOT_NULL(small, "small alloc failed"); + rte_fastmem_free(small); + + large = rte_fastmem_alloc(256 * 1024, 0, 0); + TEST_ASSERT_NOT_NULL(large, "large alloc failed"); + rte_fastmem_free(large); + + + return TEST_SUCCESS; +} + +static int +test_alloc_block_repurposing_no_growth(void) +{ + struct rte_fastmem_stats stats; + void *small, *large; + uint64_t after_small; + int rc; + + /* + * Stronger version of test_alloc_block_repurposing: assert + * that the cross-class allocation does not grow the + * backing memory (bytes_backing stays flat). Because the + * free-block pool is shared across size classes — not + * partitioned per class — the block freed from the small + * class must serve the large allocation without triggering + * a new memzone reservation. 
+ */ + rc = rte_fastmem_stats(&stats); + TEST_ASSERT_EQUAL(rc, 0, "rte_fastmem_stats() failed: %d", rc); + TEST_ASSERT_EQUAL(stats.bytes_backing, (uint64_t)0, + "unexpected pre-alloc bytes_backing: %" PRIu64, + stats.bytes_backing); + + small = rte_fastmem_alloc(8, 0, 0); + TEST_ASSERT_NOT_NULL(small, "small alloc failed"); + + rc = rte_fastmem_stats(&stats); + TEST_ASSERT_EQUAL(rc, 0, "rte_fastmem_stats() failed: %d", rc); + TEST_ASSERT(stats.bytes_backing > 0, + "bytes_backing did not grow on first alloc"); + after_small = stats.bytes_backing; + + rte_fastmem_free(small); + rte_fastmem_cache_flush(); + + large = rte_fastmem_alloc(256 * 1024, 0, 0); + TEST_ASSERT_NOT_NULL(large, + "large alloc failed: rte_errno=%d", rte_errno); + + rc = rte_fastmem_stats(&stats); + TEST_ASSERT_EQUAL(rc, 0, "rte_fastmem_stats() failed: %d", rc); + TEST_ASSERT_EQUAL(stats.bytes_backing, after_small, + "cross-class alloc grew backing memory from %" PRIu64 + " to %" PRIu64, + after_small, stats.bytes_backing); + + rte_fastmem_free(large); + + return TEST_SUCCESS; +} + +static int +test_free_null(void) +{ + /* Must be a no-op, not a crash. */ + rte_fastmem_free(NULL); + + + return TEST_SUCCESS; +} + +static int +test_alloc_content_integrity(void) +{ + /* + * Allocate a batch of objects, fill each with a distinct + * byte pattern, then verify none of the patterns overlap. + * This catches header overwrites (slab header corrupted by + * object access) and slot-overlap bugs (two pointers pointing + * at overlapping slots). 
+ */ + enum { N = 256, SIZE = 128 }; + uint8_t *ptrs[N]; + unsigned int i, j; + int rc; + + for (i = 0; i < N; i++) { + ptrs[i] = rte_fastmem_alloc(SIZE, 0, 0); + TEST_ASSERT_NOT_NULL(ptrs[i], "alloc[%u] failed", i); + memset(ptrs[i], (int)i, SIZE); + } + + for (i = 0; i < N; i++) + for (j = 0; j < SIZE; j++) + TEST_ASSERT_EQUAL(ptrs[i][j], (uint8_t)i, + "corruption at ptrs[%u][%u]: got 0x%x, want 0x%x", + i, j, ptrs[i][j], (uint8_t)i); + + for (i = 0; i < N; i++) + rte_fastmem_free(ptrs[i]); + + + return TEST_SUCCESS; +} + +static int +test_alloc_align_too_big(void) +{ + void *p; + int rc; + + /* + * A small size with an alignment larger than the maximum + * size class cannot be served. The class selected must be + * large enough for the alignment, but no such class exists. + */ + rte_errno = 0; + p = rte_fastmem_alloc(1, rte_fastmem_max_size() * 2, 0); + TEST_ASSERT_NULL(p, + "alloc with align>max_size returned non-NULL"); + TEST_ASSERT_EQUAL(rte_errno, E2BIG, + "expected rte_errno=E2BIG, got %d", rte_errno); + + + return TEST_SUCCESS; +} + +static int +test_alloc_align_one(void) +{ + void *p; + int rc; + + /* align=1 is a valid power of 2 and must be accepted. */ + p = rte_fastmem_alloc(8, 1, 0); + TEST_ASSERT_NOT_NULL(p, "alloc(8, 1) failed: rte_errno=%d", + rte_errno); + rte_fastmem_free(p); + + + return TEST_SUCCESS; +} + +static int +test_alloc_socket_numa_placement(void) +{ + void *p; + int socket_id; + struct rte_memseg *ms; + int rc; + + socket_id = rte_socket_id_by_idx(0); + TEST_ASSERT(socket_id >= 0, "no available sockets"); + + p = rte_fastmem_alloc_socket(64, 0, 0, socket_id); + TEST_ASSERT_NOT_NULL(p, + "alloc_socket(%d) failed: rte_errno=%d", + socket_id, rte_errno); + + /* + * Walk the memory to find the memseg for this pointer and + * verify its socket. Skip the check if lookup fails (e.g., + * --no-huge mode may not populate memsegs for fastmem's + * allocations in a way that rte_mem_virt2memseg can find). 
+ */ + ms = rte_mem_virt2memseg(p, NULL); + if (ms != NULL) { + TEST_ASSERT_EQUAL(ms->socket_id, socket_id, + "alloc on socket %d landed on socket %d", + socket_id, ms->socket_id); + } + + rte_fastmem_free(p); + + + return TEST_SUCCESS; +} + +/* + * Stage 3 tests: per-lcore caches. + */ + +static int +test_cache_flush(void) +{ + void *p; + int rc; + + /* + * Alloc and free one object, leaving it in the cache. Then + * flush and verify that a subsequent alloc may or may not + * return the same pointer (not asserting same/different — + * just checking that flush does not crash and a follow-up + * alloc still works). + */ + p = rte_fastmem_alloc(64, 0, 0); + TEST_ASSERT_NOT_NULL(p, "first alloc failed"); + rte_fastmem_free(p); + + rte_fastmem_cache_flush(); + + /* Flush again — must be idempotent. */ + rte_fastmem_cache_flush(); + + p = rte_fastmem_alloc(64, 0, 0); + TEST_ASSERT_NOT_NULL(p, "post-flush alloc failed"); + rte_fastmem_free(p); + + + return TEST_SUCCESS; +} + +static int +test_cache_flush_without_init(void) +{ + /* Must be a no-op, not a crash. */ + rte_fastmem_cache_flush(); + + return TEST_SUCCESS; +} + +static int +test_cache_exceeds_capacity(void) +{ + /* + * Free more objects at a single size class than the cache + * capacity (64 for classes <= 4 KiB). This forces the + * cache-drain slow path and verifies no corruption. + */ + enum { COUNT = 200, SIZE = 64 }; + void *ptrs[COUNT]; + unsigned int i; + int rc; + + for (i = 0; i < COUNT; i++) { + ptrs[i] = rte_fastmem_alloc(SIZE, 0, 0); + TEST_ASSERT_NOT_NULL(ptrs[i], + "alloc[%u] failed: rte_errno=%d", i, rte_errno); + } + + for (i = 0; i < COUNT; i++) + rte_fastmem_free(ptrs[i]); + + /* Re-alloc the same count should still work. 
*/ + for (i = 0; i < COUNT; i++) { + ptrs[i] = rte_fastmem_alloc(SIZE, 0, 0); + TEST_ASSERT_NOT_NULL(ptrs[i], + "re-alloc[%u] failed: rte_errno=%d", i, rte_errno); + } + + for (i = 0; i < COUNT; i++) + rte_fastmem_free(ptrs[i]); + + + return TEST_SUCCESS; +} + +struct non_eal_args { + int ok; + char pad[64]; +}; + +static uint32_t +non_eal_thread_main(void *arg) +{ + struct non_eal_args *args = arg; + uint8_t *p; + + p = rte_fastmem_alloc(128, 0, 0); + if (p == NULL) + return 1; + + memset(p, 0x7e, 128); + + rte_fastmem_free(p); + + args->ok = 1; + return 0; +} + +static int +test_non_eal_thread(void) +{ + rte_thread_t thread_id; + struct non_eal_args args = { 0 }; + int rc; + + rc = rte_thread_create(&thread_id, NULL, non_eal_thread_main, &args); + TEST_ASSERT_EQUAL(rc, 0, "rte_thread_create() failed: %d", rc); + + rc = rte_thread_join(thread_id, NULL); + TEST_ASSERT_EQUAL(rc, 0, "rte_thread_join() failed: %d", rc); + + TEST_ASSERT_EQUAL(args.ok, 1, + "non-EAL thread did not complete alloc/free successfully"); + + + return TEST_SUCCESS; +} + +static int +test_cache_flush_returns_memory(void) +{ + /* + * When an entire slab's worth of objects is freed, the + * slab's block is returned to the free-block pool and can + * be reassigned to another size class. Verify the cache + * does not permanently hold objects that prevent this. + * + * Allocate enough objects in one class to force multiple + * slabs, free them all, then flush the cache. After the + * flush, all cached objects are drained to their bins and + * empty slabs are returned to the block pool. 
+ */ + enum { N = 200, SIZE = 64 }; + void *ptrs[N]; + unsigned int i; + int rc; + + for (i = 0; i < N; i++) { + ptrs[i] = rte_fastmem_alloc(SIZE, 0, 0); + TEST_ASSERT_NOT_NULL(ptrs[i], "alloc[%u] failed", i); + } + for (i = 0; i < N; i++) + rte_fastmem_free(ptrs[i]); + + rte_fastmem_cache_flush(); + + /* + * An allocation in a completely different class should + * succeed now, having access to any blocks freed by the + * flush. + */ + { + void *other = rte_fastmem_alloc(65536, 0, 0); + + TEST_ASSERT_NOT_NULL(other, + "post-flush cross-class alloc failed"); + rte_fastmem_free(other); + } + + + return TEST_SUCCESS; +} + +static int +test_alloc_bulk_basic(void) +{ + enum { N = 32 }; + void *ptrs[N]; + int rc; + + rc = rte_fastmem_alloc_bulk(ptrs, N, 64, 0, 0); + TEST_ASSERT_EQUAL(rc, 0, "alloc_bulk failed: %d", rc); + + /* Verify all pointers are non-NULL and distinct. */ + for (unsigned int i = 0; i < N; i++) { + TEST_ASSERT_NOT_NULL(ptrs[i], "ptrs[%u] is NULL", i); + for (unsigned int j = 0; j < i; j++) + TEST_ASSERT(ptrs[i] != ptrs[j], + "ptrs[%u] == ptrs[%u]", i, j); + } + + rte_fastmem_free_bulk(ptrs, N); + + return TEST_SUCCESS; +} + +static int +test_alloc_bulk_zero_flag(void) +{ + enum { N = 8, SIZE = 128 }; + void *ptrs[N]; + int rc; + + rc = rte_fastmem_alloc_bulk(ptrs, N, SIZE, 0, RTE_FASTMEM_F_ZERO); + TEST_ASSERT_EQUAL(rc, 0, "alloc_bulk failed: %d", rc); + + for (unsigned int i = 0; i < N; i++) { + uint8_t *p = ptrs[i]; + + for (unsigned int b = 0; b < SIZE; b++) + TEST_ASSERT_EQUAL(p[b], 0, + "ptrs[%u][%u] != 0", i, b); + } + + rte_fastmem_free_bulk(ptrs, N); + + return TEST_SUCCESS; +} + +static int +test_alloc_bulk_exceeds_cache(void) +{ + /* Allocate more than cache capacity (64) in one bulk call. 
*/ + enum { N = 128 }; + void *ptrs[N]; + int rc; + + rc = rte_fastmem_alloc_bulk(ptrs, N, 64, 0, 0); + TEST_ASSERT_EQUAL(rc, 0, "alloc_bulk(%u) failed: %d", N, rc); + + rte_fastmem_free_bulk(ptrs, N); + + return TEST_SUCCESS; +} + +static int +test_alloc_bulk_socket(void) +{ + enum { N = 16 }; + void *ptrs[N]; + int socket_id; + int rc; + + socket_id = rte_socket_id_by_idx(0); + TEST_ASSERT(socket_id >= 0, "no sockets"); + + rc = rte_fastmem_alloc_bulk_socket(ptrs, N, 64, 0, 0, socket_id); + TEST_ASSERT_EQUAL(rc, 0, "alloc_bulk_socket failed: %d", rc); + + rte_fastmem_free_bulk(ptrs, N); + + /* SOCKET_ID_ANY */ + rc = rte_fastmem_alloc_bulk_socket(ptrs, N, 64, 0, 0, SOCKET_ID_ANY); + TEST_ASSERT_EQUAL(rc, 0, "alloc_bulk_socket(ANY) failed: %d", rc); + + rte_fastmem_free_bulk(ptrs, N); + + return TEST_SUCCESS; +} + +static int +test_free_bulk(void) +{ + enum { N = 64 }; + void *ptrs[N]; + int rc; + + /* Allocate individually, free in bulk. */ + for (unsigned int i = 0; i < N; i++) { + ptrs[i] = rte_fastmem_alloc(64, 0, 0); + TEST_ASSERT_NOT_NULL(ptrs[i], "alloc[%u] failed", i); + } + + rte_fastmem_free_bulk(ptrs, N); + + /* Verify memory is reusable. 
*/ + for (unsigned int i = 0; i < N; i++) { + ptrs[i] = rte_fastmem_alloc(64, 0, 0); + TEST_ASSERT_NOT_NULL(ptrs[i], "re-alloc[%u] failed", i); + } + + rte_fastmem_free_bulk(ptrs, N); + + return TEST_SUCCESS; +} + +static int +test_classes(void) +{ + size_t sizes[32]; + unsigned int n; + + n = rte_fastmem_classes(NULL); + TEST_ASSERT_EQUAL(n, 18u, "expected 18 classes, got %u", n); + + n = rte_fastmem_classes(sizes); + TEST_ASSERT_EQUAL(n, 18u, "expected 18 classes, got %u", n); + TEST_ASSERT_EQUAL(sizes[0], (size_t)8, "class 0 != 8"); + TEST_ASSERT_EQUAL(sizes[n - 1], (size_t)(1 << 20), + "last class != 1 MiB"); + + for (unsigned int i = 0; i < n; i++) { + TEST_ASSERT(sizes[i] != 0 && (sizes[i] & (sizes[i] - 1)) == 0, + "class %u size %zu not power of 2", i, sizes[i]); + if (i > 0) + TEST_ASSERT(sizes[i] > sizes[i - 1], + "classes not ascending at %u", i); + } + + return TEST_SUCCESS; +} + +static int +test_stats_class(void) +{ + enum { N = 10 }; + struct rte_fastmem_class_stats cs; + void *ptrs[N]; + int rc; + + for (unsigned int i = 0; i < N; i++) { + ptrs[i] = rte_fastmem_alloc(64, 0, 0); + TEST_ASSERT_NOT_NULL(ptrs[i], "alloc[%u] failed", i); + } + + rc = rte_fastmem_stats_class(64, &cs); + TEST_ASSERT_EQUAL(rc, 0, "stats_class failed: %d", rc); + TEST_ASSERT_EQUAL(cs.class_size, (size_t)64, "wrong class_size"); + TEST_ASSERT(cs.alloc_cache_hits + cs.alloc_cache_misses == N, + "alloc count != N: hits=%" PRIu64 " misses=%" PRIu64, + cs.alloc_cache_hits, cs.alloc_cache_misses); + TEST_ASSERT_EQUAL(cs.in_use, (uint64_t)N, "in_use != N"); + + for (unsigned int i = 0; i < N; i++) + rte_fastmem_free(ptrs[i]); + + rc = rte_fastmem_stats_class(64, &cs); + TEST_ASSERT_EQUAL(rc, 0, "stats_class after free failed: %d", rc); + TEST_ASSERT_EQUAL(cs.in_use, (uint64_t)0, "in_use != 0 after free"); + + /* Invalid class size. 
*/ + rc = rte_fastmem_stats_class(13, &cs); + TEST_ASSERT_EQUAL(rc, -EINVAL, "expected -EINVAL for bad size"); + + return TEST_SUCCESS; +} + +static int +test_stats_lcore(void) +{ + struct rte_fastmem_lcore_stats ls; + void *ptr; + int rc; + + ptr = rte_fastmem_alloc(128, 0, 0); + TEST_ASSERT_NOT_NULL(ptr, "alloc failed"); + + rc = rte_fastmem_stats_lcore(rte_lcore_id(), &ls); + TEST_ASSERT_EQUAL(rc, 0, "stats_lcore failed: %d", rc); + TEST_ASSERT(ls.alloc_cache_hits + ls.alloc_cache_misses > 0, + "no alloc activity on this lcore"); + + rte_fastmem_free(ptr); + + rc = rte_fastmem_stats_lcore(rte_lcore_id(), &ls); + TEST_ASSERT_EQUAL(rc, 0, "stats_lcore after free failed: %d", rc); + TEST_ASSERT(ls.free_cache_hits + ls.free_cache_misses > 0, + "no free activity on this lcore"); + + /* Invalid lcore. */ + rc = rte_fastmem_stats_lcore(RTE_MAX_LCORE, &ls); + TEST_ASSERT_EQUAL(rc, -EINVAL, "expected -EINVAL for bad lcore"); + + return TEST_SUCCESS; +} + +static int +test_stats_lcore_class(void) +{ + struct rte_fastmem_lcore_class_stats lcs; + void *ptr; + int rc; + + ptr = rte_fastmem_alloc(256, 0, 0); + TEST_ASSERT_NOT_NULL(ptr, "alloc failed"); + + rc = rte_fastmem_stats_lcore_class(rte_lcore_id(), 256, &lcs); + TEST_ASSERT_EQUAL(rc, 0, "stats_lcore_class failed: %d", rc); + TEST_ASSERT_EQUAL(lcs.class_size, (size_t)256, "wrong class_size"); + TEST_ASSERT(lcs.alloc_cache_hits + lcs.alloc_cache_misses > 0, + "no alloc activity"); + + rte_fastmem_free(ptr); + return TEST_SUCCESS; +} + +static int +test_stats_reset(void) +{ + struct rte_fastmem_stats gs; + void *ptr; + int rc; + + ptr = rte_fastmem_alloc(64, 0, 0); + TEST_ASSERT_NOT_NULL(ptr, "alloc failed"); + rte_fastmem_free(ptr); + + rte_fastmem_stats_reset(); + + rc = rte_fastmem_stats(&gs); + TEST_ASSERT_EQUAL(rc, 0, "stats failed: %d", rc); + TEST_ASSERT_EQUAL(gs.alloc_total, (uint64_t)0, + "alloc_total not zero after reset"); + TEST_ASSERT_EQUAL(gs.free_total, (uint64_t)0, + "free_total not zero after reset"); + 
+ return TEST_SUCCESS; +} + + +#define MIXED_LONG_LIVED_COUNT 25 +#define MIXED_SHORT_LIVED_ITERS 1000 +#define MIXED_MIN_LCORES 3 + +static const size_t mixed_long_sizes[] = { 64, 256, 4096 }; +static const size_t mixed_short_sizes[] = { 8, 16, 32, 64, 128, 256, 512, 1024 }; + +struct mixed_worker_args { + uint32_t seed; + int result; +}; + +static uint32_t +xorshift32(uint32_t *state) +{ + uint32_t x = *state; + + x ^= x << 13; + x ^= x >> 17; + x ^= x << 5; + *state = x; + return x; +} + +static int +mixed_worker(void *arg) +{ + struct mixed_worker_args *args = arg; + uint32_t seed = args->seed; + void *long_lived[MIXED_LONG_LIVED_COUNT]; + size_t long_sizes[MIXED_LONG_LIVED_COUNT]; + unsigned int i; + + /* Allocate long-lived objects of mixed sizes. */ + for (i = 0; i < MIXED_LONG_LIVED_COUNT; i++) { + long_sizes[i] = mixed_long_sizes[i % RTE_DIM(mixed_long_sizes)]; + long_lived[i] = rte_fastmem_alloc(long_sizes[i], 0, 0); + if (long_lived[i] == NULL) { + args->result = TEST_FAILED; + return -1; + } + memset(long_lived[i], (int)(i + 1), long_sizes[i]); + } + + /* Rapidly cycle short-lived objects. */ + for (i = 0; i < MIXED_SHORT_LIVED_ITERS; i++) { + size_t sz = mixed_short_sizes[xorshift32(&seed) % + RTE_DIM(mixed_short_sizes)]; + uint8_t pattern = (uint8_t)(i & 0xff); + uint8_t *p; + + p = rte_fastmem_alloc(sz, 0, 0); + if (p == NULL) { + args->result = TEST_FAILED; + return -1; + } + memset(p, pattern, sz); + + /* Verify before freeing. */ + for (size_t j = 0; j < sz; j++) { + if (p[j] != pattern) { + args->result = TEST_FAILED; + return -1; + } + } + rte_fastmem_free(p); + } + + /* Verify long-lived objects are still intact. 
*/ + for (i = 0; i < MIXED_LONG_LIVED_COUNT; i++) { + uint8_t *bytes = long_lived[i]; + uint8_t expected = (uint8_t)(i + 1); + + for (size_t j = 0; j < long_sizes[i]; j++) { + if (bytes[j] != expected) { + args->result = TEST_FAILED; + return -1; + } + } + rte_fastmem_free(long_lived[i]); + } + + args->result = TEST_SUCCESS; + return 0; +} + +static int +test_mixed_lifetimes_multi_lcore(void) +{ + struct mixed_worker_args args[RTE_MAX_LCORE]; + unsigned int lcore_id; + unsigned int count = 0; + struct rte_fastmem_stats stats; + int rc; + + RTE_LCORE_FOREACH_WORKER(lcore_id) + count++; + + if (count < MIXED_MIN_LCORES) { + printf("Not enough worker lcores (%u < %u), skipping\n", + count, MIXED_MIN_LCORES); + return TEST_SKIPPED; + } + + /* Launch workers with distinct seeds. */ + uint32_t seed = 0xdeadbeef; + + RTE_LCORE_FOREACH_WORKER(lcore_id) { + args[lcore_id].seed = seed; + args[lcore_id].result = TEST_FAILED; + seed += 0x12345678; + rte_eal_remote_launch(mixed_worker, &args[lcore_id], lcore_id); + } + + rte_eal_mp_wait_lcore(); + + /* Check all workers succeeded. */ + RTE_LCORE_FOREACH_WORKER(lcore_id) { + TEST_ASSERT_EQUAL(args[lcore_id].result, TEST_SUCCESS, + "worker on lcore %u failed", lcore_id); + } + + /* Verify no memory leak. */ + rc = rte_fastmem_stats(&stats); + TEST_ASSERT_EQUAL(rc, 0, "stats failed: %d", rc); + TEST_ASSERT_EQUAL(stats.bytes_in_use, (uint64_t)0, + "bytes_in_use not zero after test: %" PRIu64, + stats.bytes_in_use); + + + return TEST_SUCCESS; +} + + +/* + * Memory limit tests. + * + * FASTMEM_MEMZONE_SIZE is 128 MiB. We use a limit of 128 MiB + * (one memzone) for most tests, and large objects (256 KiB) to + * exhaust slabs quickly. 
+ */
+
+#define LIMIT_ONE_MZ ((size_t)128 << 20)
+#define LIMIT_OBJ_SIZE ((size_t)256 * 1024)
+
+/*
+ * A limit set for all sockets must be readable back, a reservation
+ * of exactly the limit must succeed, and a following reservation of
+ * more than the limit must be refused.
+ */
+static int
+test_memory_limit_basic(void)
+{
+	size_t got;
+	int socket_id;
+	int rc;
+
+	/* Query a socket that exists rather than assuming socket id 0. */
+	socket_id = rte_socket_id_by_idx(0);
+	TEST_ASSERT(socket_id >= 0, "no available sockets");
+
+	rc = rte_fastmem_set_limit(SOCKET_ID_ANY, LIMIT_ONE_MZ);
+	TEST_ASSERT_EQUAL(rc, 0, "set_memory_limit failed: %d", rc);
+
+	got = rte_fastmem_get_limit(socket_id);
+	TEST_ASSERT_EQUAL(got, LIMIT_ONE_MZ,
+		"get_memory_limit mismatch: %zu", got);
+
+	rc = rte_fastmem_reserve(LIMIT_ONE_MZ, SOCKET_ID_ANY);
+	TEST_ASSERT_EQUAL(rc, 0, "first reserve failed: %d", rc);
+
+	rc = rte_fastmem_reserve(LIMIT_ONE_MZ + 1, SOCKET_ID_ANY);
+	TEST_ASSERT(rc < 0, "second reserve should have failed");
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Allocate LIMIT_OBJ_SIZE objects until the allocator refuses,
+ * check that the refusal is reported as ENOMEM, and verify that
+ * freeing one object makes one allocation possible again.
+ */
+static int
+test_memory_limit_alloc_exhaustion(void)
+{
+	void *ptrs[1024];
+	const unsigned int max_ptrs = RTE_DIM(ptrs);
+	unsigned int count;
+	unsigned int i;
+	void *p;
+	int rc;
+
+	rc = rte_fastmem_set_limit(SOCKET_ID_ANY, LIMIT_ONE_MZ);
+	TEST_ASSERT_EQUAL(rc, 0, "set_memory_limit failed: %d", rc);
+
+	for (count = 0; count < max_ptrs; count++) {
+		ptrs[count] = rte_fastmem_alloc(LIMIT_OBJ_SIZE, 0, 0);
+		if (ptrs[count] == NULL)
+			break;
+	}
+
+	TEST_ASSERT(count > 0, "should have allocated at least one");
+	TEST_ASSERT(count < max_ptrs, "should have hit the limit");
+	TEST_ASSERT_EQUAL(rte_errno, ENOMEM, "expected ENOMEM, got %d", rte_errno);
+
+	rte_fastmem_free(ptrs[count - 1]);
+	p = rte_fastmem_alloc(LIMIT_OBJ_SIZE, 0, 0);
+	TEST_ASSERT_NOT_NULL(p, "alloc after free should succeed");
+	rte_fastmem_free(p);
+
+	/* ptrs[count - 1] was already released above. */
+	for (i = 0; i < count - 1; i++)
+		rte_fastmem_free(ptrs[i]);
+
+	return TEST_SUCCESS;
+}
+
+/* With a limit of zero, both reserve and alloc are expected to fail. */
+static int
+test_memory_limit_zero_blocks_growth(void)
+{
+	void *p;
+	int rc;
+
+	rte_fastmem_set_limit(SOCKET_ID_ANY, 0);
+
+	rc = rte_fastmem_reserve(1, SOCKET_ID_ANY);
+	TEST_ASSERT(rc < 0, "reserve with limit=0 should fail");
+
+	p = rte_fastmem_alloc(64, 0, 0);
+	TEST_ASSERT_NULL(p, "alloc with limit=0 should fail");
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Lowering the limit below what is already reserved must keep the
+ * existing backing usable while refusing further growth.
+ */
+static int
+test_memory_limit_below_current(void)
+{
+	void *p;
+	int rc;
+
+	rc = rte_fastmem_reserve(LIMIT_ONE_MZ, SOCKET_ID_ANY);
+	TEST_ASSERT_EQUAL(rc, 0, "reserve failed: %d", rc);
+
+	rte_fastmem_set_limit(SOCKET_ID_ANY, 1);
+
+	p = rte_fastmem_alloc(64, 0, 0);
+	TEST_ASSERT_NOT_NULL(p, "alloc from existing backing should work");
+	rte_fastmem_free(p);
+
+	rc = rte_fastmem_reserve(LIMIT_ONE_MZ * 2, SOCKET_ID_ANY);
+	TEST_ASSERT(rc < 0, "growth beyond limit should fail");
+
+	return TEST_SUCCESS;
+}
+
+/* A limit set with SOCKET_ID_ANY must be reported for every socket. */
+static int
+test_memory_limit_socket_id_any(void)
+{
+	unsigned int i;
+
+	rte_fastmem_set_limit(SOCKET_ID_ANY, 42);
+
+	for (i = 0; i < rte_socket_count(); i++) {
+		const int sid = rte_socket_id_by_idx(i);
+		const size_t lim = rte_fastmem_get_limit(sid);
+
+		TEST_ASSERT_EQUAL(lim, (size_t)42,
+			"socket %d limit mismatch: %zu", sid, lim);
+	}
+
+	return TEST_SUCCESS;
+}
+
+/* Raising the limit from 0 to SIZE_MAX must allow reserving again. */
+static int
+test_memory_limit_unlimited(void)
+{
+	int rc;
+
+	rte_fastmem_set_limit(SOCKET_ID_ANY, 0);
+	rte_fastmem_set_limit(SOCKET_ID_ANY, SIZE_MAX);
+
+	rc = rte_fastmem_reserve(LIMIT_ONE_MZ, SOCKET_ID_ANY);
+	TEST_ASSERT_EQUAL(rc, 0, "reserve after reset failed: %d", rc);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Fill a set of small objects with a per-object byte pattern, drive
+ * the allocator into allocation failure with large objects, and
+ * verify that the small objects still hold their patterns.
+ */
+static int
+test_memory_limit_alloc_integrity_under_oom(void)
+{
+	uint8_t *ptrs[128];
+	void *extra[1024];
+	const unsigned int n = RTE_DIM(ptrs);
+	const unsigned int extra_max = RTE_DIM(extra);
+	const size_t obj_size = 1024;
+	unsigned int n_extra;
+	unsigned int i;
+	size_t j;
+	int rc;
+
+	rc = rte_fastmem_set_limit(SOCKET_ID_ANY, LIMIT_ONE_MZ);
+	TEST_ASSERT_EQUAL(rc, 0, "set_memory_limit failed: %d", rc);
+
+	for (i = 0; i < n; i++) {
+		ptrs[i] = rte_fastmem_alloc(obj_size, 0, 0);
+		TEST_ASSERT_NOT_NULL(ptrs[i], "alloc[%u] failed", i);
+		memset(ptrs[i], (int)(i & 0xff), obj_size);
+	}
+
+	/* Exhaust remaining backing with large objects. */
+	for (n_extra = 0; n_extra < extra_max; n_extra++) {
+		extra[n_extra] = rte_fastmem_alloc(LIMIT_OBJ_SIZE, 0, 0);
+		if (extra[n_extra] == NULL)
+			break;
+	}
+
+	/* Verify original objects are intact. */
+	for (i = 0; i < n; i++) {
+		const uint8_t expected = (uint8_t)(i & 0xff);
+
+		for (j = 0; j < obj_size; j++)
+			TEST_ASSERT_EQUAL(ptrs[i][j], expected,
+				"corruption at [%u][%zu]", i, j);
+	}
+
+	for (i = 0; i < n; i++)
+		rte_fastmem_free(ptrs[i]);
+	for (i = 0; i < n_extra; i++)
+		rte_fastmem_free(extra[i]);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Drain the allocator, hand back a few objects, and check that a
+ * bulk request for more objects than were handed back fails.
+ */
+static int
+test_memory_limit_bulk_alloc_oom(void)
+{
+	void *ptrs[64];
+	void *drain[512];
+	const unsigned int bulk_n = RTE_DIM(ptrs);
+	const unsigned int drain_max = RTE_DIM(drain);
+	unsigned int drained;
+	unsigned int freed;
+	unsigned int i;
+	int rc;
+
+	rc = rte_fastmem_set_limit(SOCKET_ID_ANY, LIMIT_ONE_MZ);
+	TEST_ASSERT_EQUAL(rc, 0, "set_memory_limit failed: %d", rc);
+
+	for (drained = 0; drained < drain_max; drained++) {
+		drain[drained] = rte_fastmem_alloc(LIMIT_OBJ_SIZE, 0, 0);
+		if (drain[drained] == NULL)
+			break;
+	}
+
+	/* Free a few — enough for some but not bulk_n objects. */
+	freed = RTE_MIN(drained, 4u);
+	for (i = 0; i < freed; i++)
+		rte_fastmem_free(drain[--drained]);
+
+	rc = rte_fastmem_alloc_bulk(ptrs, bulk_n, LIMIT_OBJ_SIZE, 0, 0);
+
+	/* Release everything before asserting so a failure leaks nothing. */
+	if (rc >= 0)
+		rte_fastmem_free_bulk(ptrs, bulk_n);
+	for (i = 0; i < drained; i++)
+		rte_fastmem_free(drain[i]);
+
+	TEST_ASSERT(rc < 0, "bulk alloc should fail");
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * After hitting the limit, freeing half of the objects must allow
+ * the same number of objects to be allocated again.
+ */
+static int
+test_memory_limit_recovery_after_free(void)
+{
+	void *ptrs[512];
+	const unsigned int max_ptrs = RTE_DIM(ptrs);
+	unsigned int count;
+	unsigned int half;
+	unsigned int i;
+	int rc;
+
+	rc = rte_fastmem_set_limit(SOCKET_ID_ANY, LIMIT_ONE_MZ);
+	TEST_ASSERT_EQUAL(rc, 0, "set_memory_limit failed: %d", rc);
+
+	for (count = 0; count < max_ptrs; count++) {
+		ptrs[count] = rte_fastmem_alloc(LIMIT_OBJ_SIZE, 0, 0);
+		if (ptrs[count] == NULL)
+			break;
+	}
+	TEST_ASSERT(count > 0 && count < max_ptrs,
+		"expected partial fill, got %u", count);
+
+	half = count / 2;
+	for (i = 0; i < half; i++)
+		rte_fastmem_free(ptrs[i]);
+
+	for (i = 0; i < half; i++) {
+		ptrs[i] = rte_fastmem_alloc(LIMIT_OBJ_SIZE, 0, 0);
+		TEST_ASSERT_NOT_NULL(ptrs[i], "recovery alloc[%u] failed", i);
+	}
+
+	for (i = 0; i < count; i++)
+		rte_fastmem_free(ptrs[i]);
+
+	return TEST_SUCCESS;
+}
+
+/* Per-worker in/out record for limit_worker(). */
+struct limit_worker_args {
+	unsigned int alloc_count;	/* out: objects the worker obtained */
+	int result;			/* out: TEST_SUCCESS or TEST_FAILED */
+};
+
+/*
+ * Worker body: allocate and pattern-fill up to 128 large objects,
+ * then verify the pattern and free every object. All objects are
+ * freed even when a mismatch is found, so a failing worker does not
+ * leave memory behind.
+ */
+static int
+limit_worker(void *arg)
+{
+	struct limit_worker_args *args = arg;
+	void *ptrs[128];
+	const unsigned int max_ptrs = RTE_DIM(ptrs);
+	int result = TEST_SUCCESS;
+	unsigned int i;
+
+	args->alloc_count = 0;
+
+	for (i = 0; i < max_ptrs; i++) {
+		ptrs[i] = rte_fastmem_alloc(LIMIT_OBJ_SIZE, 0, 0);
+		if (ptrs[i] == NULL)
+			break;
+		memset(ptrs[i], 0xab, LIMIT_OBJ_SIZE);
+		args->alloc_count++;
+	}
+
+	for (i = 0; i < args->alloc_count; i++) {
+		const uint8_t *bytes = ptrs[i];
+		size_t k;
+
+		for (k = 0; k < LIMIT_OBJ_SIZE; k++) {
+			if (bytes[k] != 0xab) {
+				result = TEST_FAILED;
+				break;
+			}
+		}
+		rte_fastmem_free(ptrs[i]);
+	}
+
+	args->result = result;
+	return result == TEST_SUCCESS ? 0 : -1;
+}
+
+/*
+ * Run limit_worker() on every worker lcore at once under a memory
+ * limit and check that all workers succeed and that no bytes remain
+ * in use afterwards. Skipped with fewer than two worker lcores.
+ */
+static int
+test_memory_limit_multi_lcore_oom(void)
+{
+	struct limit_worker_args args[RTE_MAX_LCORE];
+	struct rte_fastmem_stats stats;
+	unsigned int lcore_id;
+	unsigned int worker_count = 0;
+	int rc;
+
+	RTE_LCORE_FOREACH_WORKER(lcore_id)
+		worker_count++;
+
+	if (worker_count < 2) {
+		printf("Not enough workers (%u < 2), skipping\n", worker_count);
+		return TEST_SKIPPED;
+	}
+
+	rc = rte_fastmem_set_limit(SOCKET_ID_ANY, LIMIT_ONE_MZ);
+	TEST_ASSERT_EQUAL(rc, 0, "set_memory_limit failed: %d", rc);
+
+	RTE_LCORE_FOREACH_WORKER(lcore_id) {
+		/* Pre-set to failure so a worker that never ran is noticed. */
+		args[lcore_id].result = TEST_FAILED;
+		rte_eal_remote_launch(limit_worker, &args[lcore_id], lcore_id);
+	}
+
+	rte_eal_mp_wait_lcore();
+
+	RTE_LCORE_FOREACH_WORKER(lcore_id) {
+		TEST_ASSERT_EQUAL(args[lcore_id].result, TEST_SUCCESS,
+			"worker on lcore %u failed", lcore_id);
+	}
+
+	rte_fastmem_stats(&stats);
+	TEST_ASSERT_EQUAL(stats.bytes_in_use, (uint64_t)0,
+		"bytes_in_use not zero: %" PRIu64, stats.bytes_in_use);
+
+	return TEST_SUCCESS;
+}
+
+/* Per-test-case setup: bring up the allocator. */
+static int
+fastmem_setup(void)
+{
+	return rte_fastmem_init();
+}
+
+/* Per-test-case teardown: shut the allocator down again. */
+static void
+fastmem_teardown(void)
+{
+	rte_fastmem_deinit();
+}
+
+static struct unit_test_suite fastmem_lifecycle_testsuite = {
+	.suite_name = "fastmem lifecycle tests",
+	.setup =
NULL,
+	.teardown = NULL,
+	.unit_test_cases = {
+		TEST_CASE(test_init_deinit),
+		TEST_CASE(test_init_is_not_idempotent),
+		TEST_CASE(test_deinit_without_init),
+		TEST_CASE(test_max_size),
+		TEST_CASE(test_reserve_without_init),
+		TEST_CASE(test_cache_flush_without_init),
+		TEST_CASE(test_classes),
+		TEST_CASES_END()
+	}
+};
+
+/*
+ * Functional cases. Each one is registered with fastmem_setup() and
+ * fastmem_teardown() as its per-case setup and teardown hooks.
+ */
+static struct unit_test_suite fastmem_functional_testsuite = {
+	.suite_name = "fastmem functional tests",
+	.setup = NULL,
+	.teardown = NULL,
+	.unit_test_cases = {
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_reserve_small),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_reserve_multiple_memzones),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_reserve_cumulative),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_reserve_invalid_socket),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_reserve_any_socket),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_alloc_too_big),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_alloc_invalid_align),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_alloc_free_small),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_alloc_free_various_sizes),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_alloc_alignment),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_alloc_zero_flag),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_alloc_reuse),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_alloc_many_in_class),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_alloc_socket),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_alloc_block_repurposing),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_alloc_block_repurposing_no_growth),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_free_null),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_alloc_content_integrity),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_alloc_align_too_big),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_alloc_align_one),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_alloc_socket_numa_placement),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_cache_flush),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_cache_exceeds_capacity),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_non_eal_thread),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_cache_flush_returns_memory),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_alloc_bulk_basic),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_alloc_bulk_zero_flag),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_alloc_bulk_exceeds_cache),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_alloc_bulk_socket),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_free_bulk),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_stats_class),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_stats_lcore),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_stats_lcore_class),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_stats_reset),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_mixed_lifetimes_multi_lcore),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_memory_limit_basic),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_memory_limit_alloc_exhaustion),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_memory_limit_zero_blocks_growth),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_memory_limit_below_current),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_memory_limit_socket_id_any),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_memory_limit_unlimited),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_memory_limit_alloc_integrity_under_oom),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_memory_limit_bulk_alloc_oom),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_memory_limit_recovery_after_free),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_memory_limit_multi_lcore_oom),
+		TEST_CASES_END()
+	}
+};
+
+/*
+ * Test entry point: run the lifecycle suite and, only if it returns
+ * zero, the functional suite. Returns the first non-zero suite
+ * result, otherwise the result of the functional suite.
+ */
+static int
+test_fastmem(void)
+{
+	int rc;
+
+	rc = unit_test_suite_runner(&fastmem_lifecycle_testsuite);
+	if (rc != 0)
+		return rc;
+
+	return unit_test_suite_runner(&fastmem_functional_testsuite);
+}
+
+REGISTER_FAST_TEST(fastmem_autotest, NOHUGE_OK, ASAN_OK, test_fastmem);
diff --git a/app/test/test_fastmem_perf.c b/app/test/test_fastmem_perf.c
new file mode 100644
index 0000000000..9200847847
--- /dev/null
+++ b/app/test/test_fastmem_perf.c
@@ -0,0 +1,997 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2026 Ericsson AB
+ */
+
+#include <inttypes.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <rte_common.h>
+#include <rte_cycles.h>
+#include <rte_launch.h>
+#include <rte_lcore.h>
+#include <rte_malloc.h>
+#include <rte_mempool.h>
+#include <rte_stdatomic.h>
+
+#include <rte_fastmem.h>
+
+#include "test.h"
+
+/* All report output goes through printf(). */
+#define TEST_LOG(...) printf(__VA_ARGS__)
+
+/* Object sizes, in bytes, used as the columns of the per-size tables. */
+static const size_t SIZES[] = { 8, 64, 256, 1024, 4096 };
+#define N_SIZES RTE_DIM(SIZES)
+
+/* Number of ops for warmup and measurement. */
+#define WARMUP_OPS 20000u
+#define MEASURE_OPS 2000000u
+
+/* Buffer for scenarios that allocate N then free N. */
+#define BATCH_N 256
+
+/*
+ * Allocator vtable: a thin adapter exposing alloc / free /
+ * per-allocator setup/teardown. Each scenario calls these
+ * indirectly so the same timing loop serves all allocators.
+ */
+struct allocator {
+	const char *name;
+	int (*setup)(size_t size, unsigned int n_max);
+	void (*teardown)(void);
+	void *(*alloc)(void);
+	void (*free_obj)(void *ptr);
+	int (*alloc_bulk)(void **ptrs, unsigned int n);
+	void (*free_bulk)(void **ptrs, unsigned int n);
+};
+
+/* Fastmem adapter -------------------------------------------------- */
+
+/* Object size used by the fastmem adapter; set by fastmem_setup(). */
+static size_t fastmem_size;
+
+static int
+fastmem_setup(size_t size, unsigned int n_max __rte_unused)
+{
+	fastmem_size = size;
+	return 0;
+}
+
+static void
+fastmem_teardown(void)
+{
+	rte_fastmem_cache_flush();
+}
+
+static void * __rte_noinline
+fastmem_alloc(void)
+{
+	return rte_fastmem_alloc(fastmem_size, 0, 0);
+}
+
+static void __rte_noinline
+fastmem_free(void *ptr)
+{
+	rte_fastmem_free(ptr);
+}
+
+/* Mempool adapter -------------------------------------------------- */
+
+/* Pool used by the mempool adapter; created per run by mempool_setup(). */
+static struct rte_mempool *mempool_pool;
+
+static int
+mempool_setup(size_t size, unsigned int n_max)
+{
+	char name[RTE_MEMPOOL_NAMESIZE];
+	unsigned int cache_size;
+
+	/*
+	 * Pool size must accommodate the full batch burst plus
+	 * per-lcore cache capacity. The per-lcore cache is set to
+	 * the largest size mempool accepts
+	 * (RTE_MEMPOOL_CACHE_MAX_SIZE), not to a library default.
+	 */
+	cache_size = RTE_MEMPOOL_CACHE_MAX_SIZE;
+
+	snprintf(name, sizeof(name), "fmperf_mp_%zu", size);
+	mempool_pool = rte_mempool_create(name, n_max + cache_size * 2,
+			size, cache_size, 0, NULL, NULL, NULL, NULL,
+			SOCKET_ID_ANY, 0);
+	if (mempool_pool == NULL) {
+		TEST_LOG("mempool_create(%zu) failed\n", size);
+		return -1;
+	}
+
+	return 0;
+}
+
+static void
+mempool_teardown(void)
+{
+	rte_mempool_free(mempool_pool);
+	mempool_pool = NULL;
+}
+
+static void * __rte_noinline
+mempool_alloc_one(void)
+{
+	void *obj = NULL;
+
+	if (rte_mempool_get(mempool_pool, &obj) < 0)
+		return NULL;
+	return obj;
+}
+
+static void __rte_noinline
+mempool_free_one(void *ptr)
+{
+	rte_mempool_put(mempool_pool, ptr);
+}
+
+/* rte_malloc adapter ----------------------------------------------- */
+
+/* Object size used by the rte_malloc adapter; set by malloc_setup(). */
+static size_t malloc_size;
+
+static int
+malloc_setup(size_t size, unsigned int n_max __rte_unused)
+{
+	malloc_size = size;
+	return 0;
+}
+
+static void
+malloc_teardown(void)
+{
+}
+
+static void * __rte_noinline
+malloc_alloc(void)
+{
+	return rte_malloc(NULL, malloc_size, 0);
+}
+
+static void __rte_noinline
+malloc_free(void *ptr)
+{
+	rte_free(ptr);
+}
+
+/* libc (glibc) malloc adapter -------------------------------------- */
+
+/* Rounded-up object size used by the libc adapter; set by libc_setup(). */
+static size_t libc_size;
+
+static int
+libc_setup(size_t size, unsigned int n_max __rte_unused)
+{
+	/*
+	 * Round up to cache-line alignment to match the other
+	 * allocators' default alignment guarantees and keep the
+	 * comparison honest. aligned_alloc() requires size to be
+	 * a multiple of the alignment.
+	 */
+	libc_size = RTE_ALIGN_CEIL(size, RTE_CACHE_LINE_SIZE);
+	return 0;
+}
+
+static void
+libc_teardown(void)
+{
+}
+
+static void * __rte_noinline
+libc_alloc(void)
+{
+	return aligned_alloc(RTE_CACHE_LINE_SIZE, libc_size);
+}
+
+static void __rte_noinline
+libc_free(void *ptr)
+{
+	free(ptr);
+}
+
+/* Bulk adapters ---------------------------------------------------- */
+
+static int __rte_noinline
+fastmem_alloc_bulk(void **ptrs, unsigned int n)
+{
+	return rte_fastmem_alloc_bulk(ptrs, n, fastmem_size, 0, 0);
+}
+
+static void __rte_noinline
+fastmem_free_bulk(void **ptrs, unsigned int n)
+{
+	rte_fastmem_free_bulk(ptrs, n);
+}
+
+static int __rte_noinline
+mempool_alloc_bulk(void **ptrs, unsigned int n)
+{
+	return rte_mempool_get_bulk(mempool_pool, ptrs, n);
+}
+
+static void __rte_noinline
+mempool_free_bulk(void **ptrs, unsigned int n)
+{
+	rte_mempool_put_bulk(mempool_pool, ptrs, n);
+}
+
+/*
+ * Emulate a bulk allocation for allocators without one by calling
+ * alloc_fn n times. The operation is all-or-nothing: if any single
+ * allocation fails, the objects obtained so far are handed back
+ * through free_fn and -1 is returned. Returns 0 on success.
+ */
+static int __rte_noinline
+generic_alloc_bulk(void **ptrs, unsigned int n, void *(*alloc_fn)(void),
+		void (*free_fn)(void *))
+{
+	unsigned int i;
+
+	for (i = 0; i < n; i++) {
+		ptrs[i] = alloc_fn();
+		if (ptrs[i] == NULL) {
+			/* Roll back so the caller has nothing to clean up. */
+			while (i > 0)
+				free_fn(ptrs[--i]);
+			return -1;
+		}
+	}
+	return 0;
+}
+
+static int __rte_noinline
+malloc_alloc_bulk(void **ptrs, unsigned int n)
+{
+	return generic_alloc_bulk(ptrs, n, malloc_alloc, malloc_free);
+}
+
+static void __rte_noinline
+malloc_free_bulk(void **ptrs, unsigned int n)
+{
+	unsigned int i;
+
+	for (i = 0; i < n; i++)
+		malloc_free(ptrs[i]);
+}
+
+static int __rte_noinline
+libc_alloc_bulk(void **ptrs, unsigned int n)
+{
+	return generic_alloc_bulk(ptrs, n, libc_alloc, libc_free);
+}
+
+static void __rte_noinline
+libc_free_bulk(void **ptrs, unsigned int n)
+{
+	unsigned int i;
+
+	for (i = 0; i < n; i++)
+		libc_free(ptrs[i]);
+}
+
+/* Adapter table ---------------------------------------------------- */
+
+static const struct allocator allocators[] = {
+	{ "fastmem", fastmem_setup, fastmem_teardown, fastmem_alloc, fastmem_free, fastmem_alloc_bulk, fastmem_free_bulk },
+	{ "mempool", mempool_setup, mempool_teardown, mempool_alloc_one, mempool_free_one, mempool_alloc_bulk, mempool_free_bulk },
+	{ "rte_malloc", malloc_setup, malloc_teardown, malloc_alloc, malloc_free, malloc_alloc_bulk, malloc_free_bulk },
+	{ "libc", libc_setup, libc_teardown, libc_alloc, libc_free, libc_alloc_bulk, libc_free_bulk },
+};
+#define N_ALLOCATORS RTE_DIM(allocators)
+
+/*
+ * Scenario 1: tight alloc+free loop. A single object is cycled
+ * repeatedly. The LIFO path keeps the same pointer hot, giving
+ * a best-case measurement.
+ *
+ * Returns the mean TSC cycles per alloc+free pair, or -1.0 if the
+ * allocator could not be set up or an allocation failed.
+ */
+static double
+run_tight(const struct allocator *alloc, size_t size)
+{
+	void *p;
+	uint64_t tsc;
+	unsigned int i;
+
+	if (alloc->setup(size, 1) < 0)
+		return -1.0;
+
+	/* Warmup. */
+	for (i = 0; i < WARMUP_OPS; i++) {
+		p = alloc->alloc();
+		if (p == NULL)
+			goto err;
+		alloc->free_obj(p);
+	}
+
+	tsc = rte_rdtsc_precise();
+	for (i = 0; i < MEASURE_OPS; i++) {
+		p = alloc->alloc();
+		if (p == NULL)
+			goto err;
+		alloc->free_obj(p);
+	}
+	tsc = rte_rdtsc_precise() - tsc;
+
+	alloc->teardown();
+
+	return (double)tsc / MEASURE_OPS;
+err:
+	alloc->teardown();
+	return -1.0;
+}
+
+/*
+ * Scenario 2: allocate N, free N (FIFO free order). Exercises
+ * cache refill and drain paths when N exceeds cache capacity.
+ *
+ * On success *cycles_alloc and *cycles_free receive the mean TSC
+ * cycles per alloc and per free; on any failure both are -1.0.
+ */
+static void
+run_batch(const struct allocator *alloc, size_t size,
+		double *cycles_alloc, double *cycles_free)
+{
+	void *ptrs[BATCH_N];
+	uint64_t tsc_alloc = 0, tsc_free = 0;
+	unsigned int iter, i;
+	unsigned int iters;
+
+	*cycles_alloc = -1.0;
+	*cycles_free = -1.0;
+
+	if (alloc->setup(size, BATCH_N) < 0)
+		return;
+
+	/* Pick iteration count so total ops ~= MEASURE_OPS. */
+	iters = MEASURE_OPS / BATCH_N;
+
+	/* Warmup. */
+	for (iter = 0; iter < WARMUP_OPS / BATCH_N; iter++) {
+		for (i = 0; i < BATCH_N; i++) {
+			ptrs[i] = alloc->alloc();
+			if (ptrs[i] == NULL)
+				goto err;
+		}
+		for (i = 0; i < BATCH_N; i++)
+			alloc->free_obj(ptrs[i]);
+	}
+
+	for (iter = 0; iter < iters; iter++) {
+		uint64_t t0;
+
+		t0 = rte_rdtsc_precise();
+		for (i = 0; i < BATCH_N; i++) {
+			ptrs[i] = alloc->alloc();
+			if (ptrs[i] == NULL)
+				goto err;
+		}
+		tsc_alloc += rte_rdtsc_precise() - t0;
+
+		t0 = rte_rdtsc_precise();
+		for (i = 0; i < BATCH_N; i++)
+			alloc->free_obj(ptrs[i]);
+		tsc_free += rte_rdtsc_precise() - t0;
+	}
+
+	alloc->teardown();
+
+	*cycles_alloc = (double)tsc_alloc / (iters * BATCH_N);
+	*cycles_free = (double)tsc_free / (iters * BATCH_N);
+	return;
+err:
+	/* ptrs[0..i-1] were obtained before the failing allocation. */
+	while (i > 0)
+		alloc->free_obj(ptrs[--i]);
+	alloc->teardown();
+}
+
+/*
+ * Scenario 3: allocate N, free N in reverse order.
+ *
+ * Same outputs as run_batch().
+ */
+static void
+run_batch_reverse(const struct allocator *alloc, size_t size,
+		double *cycles_alloc, double *cycles_free)
+{
+	void *ptrs[BATCH_N];
+	uint64_t tsc_alloc = 0, tsc_free = 0;
+	unsigned int iter, i;
+	unsigned int iters;
+
+	*cycles_alloc = -1.0;
+	*cycles_free = -1.0;
+
+	if (alloc->setup(size, BATCH_N) < 0)
+		return;
+
+	iters = MEASURE_OPS / BATCH_N;
+
+	for (iter = 0; iter < WARMUP_OPS / BATCH_N; iter++) {
+		for (i = 0; i < BATCH_N; i++) {
+			ptrs[i] = alloc->alloc();
+			if (ptrs[i] == NULL)
+				goto err;
+		}
+		for (i = BATCH_N; i > 0; i--)
+			alloc->free_obj(ptrs[i - 1]);
+	}
+
+	for (iter = 0; iter < iters; iter++) {
+		uint64_t t0;
+
+		t0 = rte_rdtsc_precise();
+		for (i = 0; i < BATCH_N; i++) {
+			ptrs[i] = alloc->alloc();
+			if (ptrs[i] == NULL)
+				goto err;
+		}
+		tsc_alloc += rte_rdtsc_precise() - t0;
+
+		t0 = rte_rdtsc_precise();
+		for (i = BATCH_N; i > 0; i--)
+			alloc->free_obj(ptrs[i - 1]);
+		tsc_free += rte_rdtsc_precise() - t0;
+	}
+
+	alloc->teardown();
+
+	*cycles_alloc = (double)tsc_alloc / (iters * BATCH_N);
+	*cycles_free = (double)tsc_free / (iters * BATCH_N);
+	return;
+err:
+	/* ptrs[0..i-1] were obtained before the failing allocation. */
+	while (i > 0)
+		alloc->free_obj(ptrs[--i]);
+	alloc->teardown();
+}
+
+/*
+ * Scenario 4: 
multi-lcore alloc/work/free with a dummy-work
+ * baseline. Each worker runs a tight alloc → touch → free loop
+ * on its own lcore. A second run with the same dummy work but
+ * no allocator traffic establishes a baseline; the per-op
+ * allocator cost is reported as (alloc_run - baseline_run).
+ *
+ * Fixed size class and a fixed amount of dummy work per op —
+ * this scenario sweeps lcore count rather than size.
+ */
+#define MULTI_SIZE 256u
+#define MULTI_WORK_BYTES 64u
+#define MULTI_WORK_PASSES 8u /* RMW passes over the work region. */
+#define MULTI_OPS 200000u
+#define MULTI_WARMUP 2000u
+#define MAX_MULTI_LCORES 32u
+
+/*
+ * Per-worker volatile sink. Each worker writes to its own
+ * slot, preventing dead-code elimination of touch_buffer() and
+ * avoiding cross-lcore cache-line sharing on the hot path.
+ * Padded to cache-line stride to prevent false sharing between
+ * neighboring workers' slots.
+ *
+ * The alignment attribute sits between 'struct' and the tag, the
+ * placement DPDK uses so the macro also works with compilers that
+ * do not accept it after the closing brace.
+ */
+struct __rte_cache_aligned worker_sink {
+	volatile uint64_t value;
+};
+
+static struct worker_sink worker_sinks[RTE_MAX_LCORE];
+
+/*
+ * Out-of-line dummy workload: run MULTI_WORK_PASSES
+ * read-modify-write passes over the first 'bytes' of the
+ * buffer. Each pass reads what the previous pass wrote, so the
+ * compiler cannot unroll or parallelize across passes — the
+ * work scales linearly with MULTI_WORK_PASSES. Returns an
+ * accumulator so the caller can feed it into a volatile sink;
+ * without that, the compiler could elide the whole function.
+ *
+ * __rte_noinline so it looks identical to the compiler in both
+ * the baseline (pre-allocated scratch buffer) and alloc-path
+ * runs, making the cycle-delta subtraction valid.
+ *
+ * The purpose of this being tunably expensive is to keep
+ * worker-per-iteration cost high relative to the allocator's
+ * critical section, so that even serialized allocators like
+ * rte_malloc spend most of their time outside the lock and the
+ * measured per-op allocator cost reflects its own work rather
+ * than its contention queue.
+ */
+static uint64_t __rte_noinline
+touch_buffer(void *buf, size_t bytes)
+{
+	uint64_t *p = buf;
+	size_t n = bytes / sizeof(uint64_t);
+	uint64_t acc = 0;
+	unsigned int pass;
+	size_t i;
+
+	/* Prime the buffer with a known pattern. */
+	for (i = 0; i < n; i++)
+		p[i] = i * 0x9E3779B97F4A7C15ULL;
+
+	/*
+	 * Dependent RMW passes: each pass reads p[i] written by
+	 * the previous pass, mixes the pass index in, and writes
+	 * back. The XOR into acc keeps the chain live.
+	 */
+	for (pass = 0; pass < MULTI_WORK_PASSES; pass++) {
+		for (i = 0; i < n; i++) {
+			uint64_t v = p[i];
+
+			v = v * 0xC2B2AE3D27D4EB4FULL + pass;
+			v ^= v >> 33;
+			p[i] = v;
+			acc ^= v;
+		}
+	}
+
+	return acc;
+}
+
+struct worker_args {
+	const struct allocator *alloc;
+	void *scratch; /* baseline only; NULL => alloc path */
+	unsigned int iters;
+	unsigned int warmup;
+	unsigned int bulk_n; /* 0 = single-object, >0 = bulk */
+	RTE_ATOMIC(bool) start_flag; /* barrier at worker entry */
+	uint64_t cycles; /* out */
+	unsigned int ops; /* out */
+	int err; /* out */
+};
+
+/*
+ * Single-object worker. With wa->scratch set it only runs the dummy
+ * work on that buffer (baseline); otherwise every iteration
+ * allocates, touches and frees one object. Reports elapsed cycles
+ * and completed iterations through wa.
+ */
+static int
+worker_run(void *arg)
+{
+	struct worker_args *wa = arg;
+	unsigned int lcore = rte_lcore_id();
+	uint64_t acc = 0;
+	uint64_t t0;
+	unsigned int i;
+
+	wa->err = 0;
+	wa->ops = 0;
+	wa->cycles = 0;
+
+	/* Wait for start flag (spin-barrier set by main). */
+	while (!rte_atomic_load_explicit(&wa->start_flag,
+			rte_memory_order_acquire))
+		rte_pause();
+
+	/* Warmup. */
+	for (i = 0; i < wa->warmup; i++) {
+		void *p;
+
+		if (wa->scratch != NULL)
+			p = wa->scratch;
+		else {
+			p = wa->alloc->alloc();
+			if (p == NULL) {
+				wa->err = -1;
+				return -1;
+			}
+		}
+		acc ^= touch_buffer(p, MULTI_WORK_BYTES);
+		if (wa->scratch == NULL)
+			wa->alloc->free_obj(p);
+	}
+
+	/* Measured loop. */
+	t0 = rte_rdtsc_precise();
+	for (i = 0; i < wa->iters; i++) {
+		void *p;
+
+		if (wa->scratch != NULL)
+			p = wa->scratch;
+		else {
+			p = wa->alloc->alloc();
+			if (p == NULL) {
+				wa->err = -1;
+				break;
+			}
+		}
+		acc ^= touch_buffer(p, MULTI_WORK_BYTES);
+		if (wa->scratch == NULL)
+			wa->alloc->free_obj(p);
+	}
+	wa->cycles = rte_rdtsc_precise() - t0;
+	wa->ops = i;
+
+	/* Publish accumulator to defeat dead-code elimination. */
+	worker_sinks[lcore].value ^= acc;
+
+	return 0;
+}
+
+/*
+ * Bulk worker: every iteration bulk-allocates wa->bulk_n objects,
+ * touches each of them and bulk-frees them. wa->ops counts objects,
+ * not iterations. A bulk_n that does not fit the local pointer
+ * array is rejected as an error.
+ */
+static int
+worker_run_bulk(void *arg)
+{
+	struct worker_args *wa = arg;
+	unsigned int lcore = rte_lcore_id();
+	void *ptrs[BATCH_N];
+	uint64_t acc = 0;
+	uint64_t t0;
+	unsigned int i, j;
+	unsigned int bulk_n = wa->bulk_n;
+
+	wa->err = 0;
+	wa->ops = 0;
+	wa->cycles = 0;
+
+	/* ptrs[] holds at most BATCH_N objects. */
+	if (bulk_n == 0 || bulk_n > RTE_DIM(ptrs)) {
+		wa->err = -1;
+		return -1;
+	}
+
+	while (!rte_atomic_load_explicit(&wa->start_flag,
+			rte_memory_order_acquire))
+		rte_pause();
+
+	/* Warmup. */
+	for (i = 0; i < wa->warmup; i++) {
+		if (wa->alloc->alloc_bulk(ptrs, bulk_n) < 0) {
+			wa->err = -1;
+			return -1;
+		}
+		for (j = 0; j < bulk_n; j++)
+			acc ^= touch_buffer(ptrs[j], MULTI_WORK_BYTES);
+		wa->alloc->free_bulk(ptrs, bulk_n);
+	}
+
+	t0 = rte_rdtsc_precise();
+	for (i = 0; i < wa->iters; i++) {
+		if (wa->alloc->alloc_bulk(ptrs, bulk_n) < 0) {
+			wa->err = -1;
+			break;
+		}
+		for (j = 0; j < bulk_n; j++)
+			acc ^= touch_buffer(ptrs[j], MULTI_WORK_BYTES);
+		wa->alloc->free_bulk(ptrs, bulk_n);
+	}
+	wa->cycles = rte_rdtsc_precise() - t0;
+	wa->ops = i * bulk_n;
+
+	worker_sinks[lcore].value ^= acc;
+
+	return 0;
+}
+
+/*
+ * Launch workers on the first 'n_workers' worker lcores, run
+ * either the baseline (scratch != NULL) or the alloc path
+ * (scratch == NULL), and return the mean per-op cycle cost
+ * averaged across participating workers.
+ *
+ * Returns -1.0 on any worker error, when fewer than n_workers
+ * worker lcores exist, or when n_workers is zero or larger than
+ * MAX_MULTI_LCORES.
+ */
+static double
+run_multi_workers(const struct allocator *alloc, unsigned int n_workers,
+		void *const *scratches, unsigned int bulk_n)
+{
+	struct worker_args wargs[RTE_MAX_LCORE];
+	unsigned int worker_lcores[MAX_MULTI_LCORES];
+	double sum_cycles_per_op = 0.0;
+	unsigned int n = 0;
+	unsigned int lcore_id;
+	unsigned int i;
+	lcore_function_t *fn = bulk_n > 0 ? worker_run_bulk : worker_run;
+
+	/* worker_lcores[] has MAX_MULTI_LCORES slots; zero would divide by 0. */
+	if (n_workers == 0 || n_workers > MAX_MULTI_LCORES)
+		return -1.0;
+
+	/* Collect the first n_workers worker lcores. */
+	RTE_LCORE_FOREACH_WORKER(lcore_id) {
+		if (n >= n_workers)
+			break;
+		worker_lcores[n++] = lcore_id;
+	}
+	if (n < n_workers)
+		return -1.0;
+
+	/* Prepare per-worker args. */
+	for (i = 0; i < n_workers; i++) {
+		struct worker_args *wa = &wargs[worker_lcores[i]];
+
+		wa->alloc = alloc;
+		wa->scratch = scratches != NULL ? scratches[i] : NULL;
+		wa->iters = MULTI_OPS;
+		wa->warmup = MULTI_WARMUP;
+		wa->bulk_n = bulk_n;
+		rte_atomic_store_explicit(&wa->start_flag, false,
+				rte_memory_order_relaxed);
+	}
+
+	/* Launch workers. They spin on start_flag until released. */
+	for (i = 0; i < n_workers; i++)
+		rte_eal_remote_launch(fn, &wargs[worker_lcores[i]],
+				worker_lcores[i]);
+
+	/* Release all workers roughly simultaneously. */
+	for (i = 0; i < n_workers; i++)
+		rte_atomic_store_explicit(
+				&wargs[worker_lcores[i]].start_flag, true,
+				rte_memory_order_release);
+
+	/* Wait for completion. */
+	for (i = 0; i < n_workers; i++)
+		rte_eal_wait_lcore(worker_lcores[i]);
+
+	/* Aggregate: mean cycles per op across workers. */
+	for (i = 0; i < n_workers; i++) {
+		const struct worker_args *wa = &wargs[worker_lcores[i]];
+
+		if (wa->err != 0 || wa->ops == 0)
+			return -1.0;
+		sum_cycles_per_op += (double)wa->cycles / (double)wa->ops;
+	}
+
+	return sum_cycles_per_op / n_workers;
+}
+
+/*
+ * One sub-run of Scenario 4: given an allocator and a worker
+ * count, return (baseline, alloc_path) mean cycles per op.
+ * Both outputs stay at -1.0 when n_workers is zero or exceeds
+ * MAX_MULTI_LCORES, or when setup or a scratch allocation fails.
+ */
+static void
+run_multi_lcore(const struct allocator *alloc, unsigned int n_workers,
+		unsigned int bulk_n, double *baseline, double *alloc_path)
+{
+	void *scratches[MAX_MULTI_LCORES] = {0};
+	unsigned int n_alloced = 0;
+	unsigned int i;
+
+	*baseline = -1.0;
+	*alloc_path = -1.0;
+
+	/* scratches[] has MAX_MULTI_LCORES slots. */
+	if (n_workers == 0 || n_workers > MAX_MULTI_LCORES)
+		return;
+
+	if (alloc->setup(MULTI_SIZE, n_workers * 64) < 0)
+		return;
+
+	/* Baseline: pre-allocate one scratch per worker. */
+	for (i = 0; i < n_workers; i++) {
+		scratches[i] = alloc->alloc();
+		if (scratches[i] == NULL)
+			goto err;
+		n_alloced++;
+	}
+
+	*baseline = run_multi_workers(alloc, n_workers, scratches, 0);
+
+	for (i = 0; i < n_alloced; i++)
+		alloc->free_obj(scratches[i]);
+	n_alloced = 0;
+
+	/* Alloc path: workers alloc+free each iter. 
*/
+	*alloc_path = run_multi_workers(alloc, n_workers, NULL, bulk_n);
+
+	alloc->teardown();
+	return;
+err:
+	for (i = 0; i < n_alloced; i++)
+		alloc->free_obj(scratches[i]);
+	alloc->teardown();
+}
+
+/* Reporting -------------------------------------------------------- */
+
+/* Print a table title and the per-size column headings. */
+static void
+print_header(const char *title)
+{
+	size_t i;
+
+	TEST_LOG("\n=== %s ===\n", title);
+	TEST_LOG("%-12s", "allocator");
+	for (i = 0; i < N_SIZES; i++)
+		TEST_LOG(" %10zu B", SIZES[i]);
+	TEST_LOG("\n");
+}
+
+/* Print one allocator's per-size row; negative values print as "--". */
+static void
+print_row(const char *name, const double *values)
+{
+	size_t i;
+
+	TEST_LOG("%-12s", name);
+	for (i = 0; i < N_SIZES; i++) {
+		if (values[i] < 0)
+			TEST_LOG(" %12s", "--");
+		else
+			TEST_LOG(" %12.1f", values[i]);
+	}
+	TEST_LOG("\n");
+}
+
+/* Print a table title and the per-lcore-count column headings. */
+static void
+print_multi_header(const char *title, const unsigned int *lcore_counts,
+		unsigned int n_counts)
+{
+	unsigned int i;
+
+	TEST_LOG("\n=== %s ===\n", title);
+	TEST_LOG("%-12s", "allocator");
+	for (i = 0; i < n_counts; i++)
+		TEST_LOG(" %8u lcore%c", lcore_counts[i],
+			lcore_counts[i] == 1 ? ' ' : 's');
+	TEST_LOG("\n");
+}
+
+/* Print one allocator's per-lcore-count row; negative values print as "--". */
+static void
+print_multi_row(const char *name, const double *values, unsigned int n_counts)
+{
+	unsigned int i;
+
+	TEST_LOG("%-12s", name);
+	for (i = 0; i < n_counts; i++) {
+		if (values[i] < 0)
+			TEST_LOG(" %14s", "--");
+		else
+			TEST_LOG(" %14.1f", values[i]);
+	}
+	TEST_LOG("\n");
+}
+
+/* Driver ----------------------------------------------------------- */
+
+/* Maximum number of lcore-count columns in the multi-lcore tables. */
+#define MAX_LCORE_COLUMNS 8
+
+/* Signature shared by run_batch() and run_batch_reverse(). */
+typedef void (*batch_runner_t)(const struct allocator *alloc, size_t size,
+		double *cycles_alloc, double *cycles_free);
+
+/*
+ * Run one batch scenario once per allocator and size, then print
+ * the alloc-cost table followed by the free-cost table. Both tables
+ * come from the same measurement run.
+ */
+static void
+report_batch_scenario(const char *alloc_title, const char *free_title,
+		batch_runner_t run)
+{
+	double vals_alloc[N_ALLOCATORS][N_SIZES];
+	double vals_free[N_ALLOCATORS][N_SIZES];
+	size_t a, i;
+
+	for (a = 0; a < N_ALLOCATORS; a++)
+		for (i = 0; i < N_SIZES; i++)
+			run(&allocators[a], SIZES[i],
+				&vals_alloc[a][i], &vals_free[a][i]);
+
+	print_header(alloc_title);
+	for (a = 0; a < N_ALLOCATORS; a++)
+		print_row(allocators[a].name, vals_alloc[a]);
+
+	print_header(free_title);
+	for (a = 0; a < N_ALLOCATORS; a++)
+		print_row(allocators[a].name, vals_free[a]);
+}
+
+/*
+ * Fill lcore_counts[] with the worker counts to sweep: 1, 2, 4, ...
+ * up to the number of worker lcores (capped at MAX_MULTI_LCORES),
+ * with that maximum appended as a last column when it is not a
+ * power of two and room remains. Returns the number of entries,
+ * which is zero when there are no worker lcores.
+ */
+static unsigned int
+build_lcore_sweep(unsigned int *lcore_counts, unsigned int max_counts)
+{
+	unsigned int lcores = rte_lcore_count();
+	unsigned int max_workers = lcores > 0 ? lcores - 1 : 0;
+	unsigned int n_counts = 0;
+	unsigned int w;
+
+	if (max_workers > MAX_MULTI_LCORES)
+		max_workers = MAX_MULTI_LCORES;
+
+	for (w = 1; w <= max_workers && n_counts < max_counts; w *= 2)
+		lcore_counts[n_counts++] = w;
+
+	if (n_counts > 0 && n_counts < max_counts &&
+			lcore_counts[n_counts - 1] != max_workers)
+		lcore_counts[n_counts++] = max_workers;
+
+	return n_counts;
+}
+
+/*
+ * Run and report one multi-lcore scenario. 'id' and 'label' only
+ * affect the printed text; bulk_n == 0 selects the single-object
+ * workers and bulk_n > 0 the bulk workers. The table shows the
+ * alloc-path cost minus the baseline cost per allocator and lcore
+ * count, or "--" where either run failed.
+ */
+static void
+run_multi_scenario(unsigned int id, const char *label, unsigned int bulk_n)
+{
+	unsigned int lcore_counts[MAX_LCORE_COLUMNS];
+	double base_vals[N_ALLOCATORS][MAX_LCORE_COLUMNS];
+	double delta_vals[N_ALLOCATORS][MAX_LCORE_COLUMNS];
+	char title[128];
+	unsigned int n_counts;
+	unsigned int c;
+	size_t a;
+
+	n_counts = build_lcore_sweep(lcore_counts, RTE_DIM(lcore_counts));
+	if (n_counts == 0) {
+		TEST_LOG("\nScenario %u (%s) skipped: no worker lcores available.\n",
+			id, label);
+		return;
+	}
+
+	if (bulk_n > 0)
+		TEST_LOG("\nScenario %u parameters: size=%u B, bulk=%u\n",
+			id, MULTI_SIZE, bulk_n);
+	else
+		TEST_LOG("\nScenario %u parameters: size=%u B\n",
+			id, MULTI_SIZE);
+
+	for (a = 0; a < N_ALLOCATORS; a++) {
+		for (c = 0; c < n_counts; c++) {
+			double alloc_val;
+
+			run_multi_lcore(&allocators[a], lcore_counts[c],
+				bulk_n, &base_vals[a][c], &alloc_val);
+			if (base_vals[a][c] < 0 || alloc_val < 0)
+				delta_vals[a][c] = -1.0;
+			else
+				delta_vals[a][c] = alloc_val - base_vals[a][c];
+		}
+	}
+
+	TEST_LOG("Baseline (domain logic only): %.1f cycles/op\n",
+		base_vals[0][0]);
+
+	snprintf(title, sizeof(title),
+		"Scenario %u: %s — allocator overhead (cycles/op)", id, label);
+	print_multi_header(title, lcore_counts, n_counts);
+	for (a = 0; a < N_ALLOCATORS; a++)
+		print_multi_row(allocators[a].name, delta_vals[a], n_counts);
+}
+
+/*
+ * Performance test entry point: initialise fastmem, reserve backing
+ * memory, run scenarios 1-5 for every allocator and print the
+ * result tables. Returns 0, or -1 if fastmem could not be
+ * initialised or the reservation failed.
+ */
+static int
+test_fastmem_perf(void)
+{
+	size_t i;
+	size_t a;
+	int rc;
+
+	rc = rte_fastmem_init();
+	if (rc < 0) {
+		TEST_LOG("rte_fastmem_init() failed: %d\n", rc);
+		return -1;
+	}
+
+	rc = rte_fastmem_reserve(128 * 1024 * 1024, SOCKET_ID_ANY);
+	if (rc < 0) {
+		TEST_LOG("rte_fastmem_reserve() failed: %d\n", rc);
+		rte_fastmem_deinit();
+		return -1;
+	}
+
+	TEST_LOG("\nfastmem performance — single-lcore, fixed-size\n");
+	TEST_LOG("All numbers are TSC cycles.\n");
+
+	/* Scenario 1: tight alloc+free. */
+	print_header("Scenario 1: Single-object hot path — cycles per (alloc + free)");
+	for (a = 0; a < N_ALLOCATORS; a++) {
+		double vals[N_SIZES];
+
+		for (i = 0; i < N_SIZES; i++)
+			vals[i] = run_tight(&allocators[a], SIZES[i]);
+		print_row(allocators[a].name, vals);
+	}
+
+	/* Scenario 2: batched, FIFO free. */
+	report_batch_scenario(
+		"Scenario 2: Batch alloc, FIFO free — cycles per alloc",
+		"Scenario 2: Batch alloc, FIFO free — cycles per free",
+		run_batch);
+
+	/* Scenario 3: batched, reverse free. */
+	report_batch_scenario(
+		"Scenario 3: Batch alloc, LIFO free — cycles per alloc",
+		"Scenario 3: Batch alloc, LIFO free — cycles per free",
+		run_batch_reverse);
+
+	/* Scenario 4: multi-lcore alloc/work/free with baseline. */
+	run_multi_scenario(4, "Multi-lcore contention", 0);
+
+	/* Scenario 5: multi-lcore bulk alloc/work/free. */
+	run_multi_scenario(5, "Multi-lcore bulk contention", 8);
+
+	TEST_LOG("\n");
+	rte_fastmem_deinit();
+	return 0;
+}
+
+REGISTER_PERF_TEST(fastmem_perf_autotest, test_fastmem_perf);
diff --git a/app/test/test_fastmem_profile.c b/app/test/test_fastmem_profile.c
new file mode 100644
index 0000000000..9a5dc94018
--- /dev/null
+++ b/app/test/test_fastmem_profile.c
@@ -0,0 +1,157 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2026 Ericsson AB
+ */
+
+/*
+ * A minimal fastmem 
workload intended for use with perf record /
+ * perf report. Each registered test runs one alloc/free workload for
+ * a fixed duration, so that a sampling profiler can attribute cycles
+ * to individual functions and instructions within the fastmem hot
+ * path.
+ *
+ * Two tests are registered, one per workload:
+ *   fastmem_profile_cache_hit_autotest
+ *   fastmem_profile_cache_miss_autotest
+ *
+ * Usage:
+ *   perf record -g -- dpdk-test --no-huge --no-pci -m 8192 \
+ *       -l 0 <<< fastmem_profile_cache_hit_autotest
+ *   perf report
+ */
+
+#include <inttypes.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <rte_common.h>
+#include <rte_cycles.h>
+#include <rte_lcore.h>
+#include <rte_memory.h>
+
+#include <rte_fastmem.h>
+
+#include "test.h"
+
+/*
+ * Duration of each sub-test, expressed in TSC cycles: three seconds'
+ * worth of ticks at whatever frequency rte_get_tsc_hz() reports.
+ */
+#define PROFILE_DURATION_CYCLES (3ULL * rte_get_tsc_hz())
+
+/* Allocation size used by the profiling workloads. */
+#define PROFILE_SIZE 256u
+
+/*
+ * Sub-test 1: allocate a single object and free it straight away,
+ * over and over, until the deadline has passed. The workload is
+ * intended to exercise only the per-lcore cache (no bin interaction
+ * after warmup).
+ *
+ * Prints the number of completed alloc/free pairs. Returns 0 on
+ * success, or -1 as soon as an allocation fails.
+ */
+static int
+profile_cache_hit(void)
+{
+	const uint64_t end_tsc = rte_rdtsc() + PROFILE_DURATION_CYCLES;
+	uint64_t n_ops;
+
+	for (n_ops = 0; rte_rdtsc() < end_tsc; n_ops++) {
+		void *obj = rte_fastmem_alloc(PROFILE_SIZE, 0, 0);
+
+		if (obj == NULL)
+			return -1;
+
+		rte_fastmem_free(obj);
+	}
+
+	printf(" cache_hit: %" PRIu64 " ops\n", n_ops);
+
+	return 0;
+}
+
+/*
+ * Sub-test 2: alloc N then free N, where N is meant to exceed the
+ * per-lcore cache capacity. The intent is to force repeated cache
+ * refills and drains, exercising the bin lock and slab free-list
+ * traversal.
+ *
+ * Prints the number of completed alloc/free pairs and returns 0 once
+ * the deadline has passed. If an allocation fails, the objects already
+ * obtained in the current batch are handed back to the allocator and
+ * -1 is returned.
+ */
+#define PROFILE_BATCH 256u
+
+/* Amount of memory passed to rte_fastmem_reserve() before a workload. */
+#define PROFILE_RESERVE_SIZE (128U << 20)
+
+/*
+ * Duration shown in the banner, in seconds. Must be kept in step with
+ * the multiplier used by PROFILE_DURATION_CYCLES.
+ */
+#define PROFILE_BANNER_SECONDS 3u
+
+static int
+profile_cache_miss(void)
+{
+	void *ptrs[PROFILE_BATCH];
+	uint64_t deadline;
+	uint64_t ops = 0;
+	unsigned int i;
+
+	deadline = rte_rdtsc() + PROFILE_DURATION_CYCLES;
+
+	while (rte_rdtsc() < deadline) {
+		for (i = 0; i < PROFILE_BATCH; i++) {
+			ptrs[i] = rte_fastmem_alloc(PROFILE_SIZE, 0, 0);
+			if (ptrs[i] == NULL)
+				goto err_release;
+		}
+		for (i = 0; i < PROFILE_BATCH; i++)
+			rte_fastmem_free(ptrs[i]);
+		ops += PROFILE_BATCH;
+	}
+
+	printf(" cache_miss: %" PRIu64 " ops\n", ops);
+	return 0;
+
+err_release:
+	/*
+	 * ptrs[0] .. ptrs[i - 1] still hold live objects. Release them
+	 * here rather than relying on the caller's rte_fastmem_deinit()
+	 * to cope with outstanding allocations.
+	 */
+	while (i > 0)
+		rte_fastmem_free(ptrs[--i]);
+
+	printf(" cache_miss: allocation failed after %" PRIu64 " ops\n", ops);
+	return -1;
+}
+
+/*
+ * Common driver for the profiling tests: initialize fastmem, reserve
+ * PROFILE_RESERVE_SIZE of backing memory, print a banner naming the
+ * workload, run it, and deinitialize fastmem again.
+ *
+ * label is the workload name used in the banner; workload is the
+ * sub-test to run and must return a negative value on failure.
+ *
+ * Returns 0 on success and -1 if initialization, the reservation or
+ * the workload itself fails.
+ */
+static int
+run_profile_workload(const char *label, int (*workload)(void))
+{
+	int rc;
+
+	rc = rte_fastmem_init();
+	if (rc < 0) {
+		printf("rte_fastmem_init() failed: %d\n", rc);
+		return -1;
+	}
+
+	rc = rte_fastmem_reserve(PROFILE_RESERVE_SIZE, SOCKET_ID_ANY);
+	if (rc < 0) {
+		printf("rte_fastmem_reserve() failed: %d\n", rc);
+		goto out;
+	}
+
+	printf("fastmem profile: %s workload (size=%u, ~%u s)\n",
+	       label, PROFILE_SIZE, PROFILE_BANNER_SECONDS);
+
+	rc = workload();
+
+out:
+	/* Single exit path once init has succeeded: always deinit. */
+	rte_fastmem_deinit();
+	return rc < 0 ? -1 : 0;
+}
+
+/* Test entry point for the cache-hit workload (profile_cache_hit()). */
+static int
+test_fastmem_profile_cache_hit(void)
+{
+	return run_profile_workload("cache-hit", profile_cache_hit);
+}
+
+/* Test entry point for the cache-miss workload (profile_cache_miss()). */
+static int
+test_fastmem_profile_cache_miss(void)
+{
+	return run_profile_workload("cache-miss", profile_cache_miss);
+}
+
+REGISTER_PERF_TEST(fastmem_profile_cache_hit_autotest,
+		test_fastmem_profile_cache_hit);
+REGISTER_PERF_TEST(fastmem_profile_cache_miss_autotest,
+		test_fastmem_profile_cache_miss);
-- 
2.43.0

