Hello Samuel. While writing stress tests to put the node cache under heavy background sync pressure, I noticed that diskfs_node_iterate() allocates an array sized to the entire cache while holding the global read lock. This patch replaces that potentially large allocation with an intrusive doubly-linked list, so iteration needs only constant memory and avoids the malloc entirely, which makes sync latency under heavy inode loads more predictable.
Kind regards, Milos
From 10882fb74eb8021b11a54885a2e4a8998b37d40f Mon Sep 17 00:00:00 2001 From: Milos Nikic <[email protected]> Date: Fri, 27 Mar 2026 23:12:22 -0700 Subject: [PATCH] libdiskfs: Eliminate malloc from node-cache.c iteration. This patch changes how diskfs_node_iterate() iterates the cache content. At the cost of adding two pointers to the node structure and one lock to the cache, we are able to avoid malloc-ing and copying the whole content of the node-cache in order to do said iteration. Nodes contained in the cache now form a doubly linked list, and iterating them needs only constant memory (vs linear memory previously). Performance improvement becomes more visible the larger the cache is, and it can even become measurable on very large node-cache size. --- libdiskfs/diskfs.h | 4 ++ libdiskfs/node-cache.c | 115 ++++++++++++++++++++++++++--------------- 2 files changed, 76 insertions(+), 43 deletions(-) diff --git a/libdiskfs/diskfs.h b/libdiskfs/diskfs.h index 9a5d5d9d..d95e1b80 100644 --- a/libdiskfs/diskfs.h +++ b/libdiskfs/diskfs.h @@ -125,6 +125,10 @@ struct node loff_t allocsize; ino64_t cache_id; + + /* The Intrusive List Pointers */ + struct node *cache_next; + struct node *cache_prev; }; struct diskfs_control diff --git a/libdiskfs/node-cache.c b/libdiskfs/node-cache.c index 5f6c3e36..27b38123 100644 --- a/libdiskfs/node-cache.c +++ b/libdiskfs/node-cache.c @@ -19,7 +19,6 @@ #include <hurd/ihash.h> -#include "priv.h" #include "diskfs.h" /* The node cache is implemented using a hash table. Access to the @@ -60,6 +59,28 @@ static struct hurd_ihash nodecache = HURD_IHASH_INITIALIZER_GKI (offsetof (struct node, slot), NULL, NULL, hash, compare); static pthread_rwlock_t nodecache_lock = PTHREAD_RWLOCK_INITIALIZER; +static pthread_mutex_t active_nodes_lock = PTHREAD_MUTEX_INITIALIZER; +static struct node *active_nodes_head = NULL; +static struct node *active_nodes_tail = NULL; + +/* Unlinks a node from the active nodes doubly-linked list. 
+ The caller MUST hold active_nodes_lock before calling this. */ +static void +unlink_active_node (struct node *np) +{ + if (np->cache_prev) + np->cache_prev->cache_next = np->cache_next; + if (np->cache_next) + np->cache_next->cache_prev = np->cache_prev; + + if (active_nodes_head == np) + active_nodes_head = np->cache_next; + if (active_nodes_tail == np) + active_nodes_tail = np->cache_prev; + + np->cache_prev = NULL; + np->cache_next = NULL; +} /* Fetch inode INUM, set *NPP to the node structure; gain one user reference and lock the node. */ @@ -109,6 +130,15 @@ diskfs_cached_lookup_context (ino_t inum, struct node **npp, assert_perror_backtrace (err); diskfs_nref_light (np); pthread_rwlock_unlock (&nodecache_lock); + pthread_mutex_lock (&active_nodes_lock); + np->cache_next = active_nodes_head; + np->cache_prev = NULL; + if (active_nodes_head) + active_nodes_head->cache_prev = np; + else + active_nodes_tail = np; + active_nodes_head = np; + pthread_mutex_unlock (&active_nodes_lock); /* Get the contents of NP off disk. */ err = diskfs_user_read_node (np, ctx); @@ -118,6 +148,10 @@ diskfs_cached_lookup_context (ino_t inum, struct node **npp, hurd_ihash_remove (&nodecache, (hurd_ihash_key_t) &np->cache_id); pthread_rwlock_unlock (&nodecache_lock); + pthread_mutex_lock (&active_nodes_lock); + unlink_active_node (np); + pthread_mutex_unlock (&active_nodes_lock); + /* Don't delete from disk. 
*/ np->dn_stat.st_nlink = 1; np->allocsize = 0; @@ -180,6 +214,9 @@ diskfs_try_dropping_softrefs (struct node *np) hurd_ihash_locp_remove (&nodecache, np->slot); np->slot = NULL; + pthread_mutex_lock (&active_nodes_lock); + unlink_active_node (np); + pthread_mutex_unlock (&active_nodes_lock); /* Flush node if needed, before forgetting it */ diskfs_node_update (np, diskfs_synchronous); @@ -197,54 +234,46 @@ error_t __attribute__ ((weak)) diskfs_node_iterate (error_t (*fun)(struct node *)) { error_t err = 0; - size_t num_nodes; - struct node *node, **node_list, **p; - - pthread_rwlock_rdlock (&nodecache_lock); - - /* We must copy everything from the hash table into another data structure - to avoid running into any problems with the hash-table being modified - during processing (normally we delegate access to hash-table with - nodecache_lock, but we can't hold this while locking the - individual node locks). */ - /* XXX: Can we? */ - num_nodes = nodecache.nr_items; - - /* TODO This method doesn't scale beyond a few dozen nodes and should be - replaced. */ - node_list = malloc (num_nodes * sizeof (struct node *)); - if (node_list == NULL) - { - pthread_rwlock_unlock (&nodecache_lock); - return ENOMEM; - } + struct node *current, *next_node; - p = node_list; - HURD_IHASH_ITERATE (&nodecache, i) - { - *p++ = node = i; + pthread_mutex_lock (&active_nodes_lock); + current = active_nodes_tail; - /* We acquire a hard reference for node, but without using - diskfs_nref. We do this so that diskfs_new_hardrefs will not - get called. 
*/ - refcounts_ref (&node->refcounts, NULL); - } - pthread_rwlock_unlock (&nodecache_lock); + /* Bootstrap the loop by grabbing a ref to the very first node */ + if (current) + refcounts_ref (&current->refcounts, NULL); - p = node_list; - while (num_nodes-- > 0) + while (current != NULL) { - node = *p++; - if (!err) - { - pthread_mutex_lock (&node->lock); - err = (*fun)(node); - pthread_mutex_unlock (&node->lock); - } - diskfs_nrele (node); + /* Figure out who is next, and grab a ref so it doesn't + * disappear while we are processing 'current' */ + next_node = current->cache_prev; + if (next_node) + refcounts_ref (&next_node->refcounts, NULL); + + pthread_mutex_unlock (&active_nodes_lock); + + pthread_mutex_lock (&current->lock); + err = (*fun)(current); + pthread_mutex_unlock (&current->lock); + + /* We are done with 'current', drop the ref we grabbed */ + diskfs_nrele (current); + if (err) + { + /* We don't need to traverse the rest of the list! + Just drop the next_node ref if we grabbed it, and return. */ + if (next_node) + diskfs_nrele (next_node); + return err; + } + + /* Re-acquire the global lock to loop around */ + pthread_mutex_lock (&active_nodes_lock); + current = next_node; } - free (node_list); + pthread_mutex_unlock (&active_nodes_lock); return err; } -- 2.53.0
