On 1/7/26 7:14 AM, Leon Hwang wrote:
When updating an existing element in lru_[percpu_,]hash maps, the current
implementation always calls prealloc_lru_pop() to get a new node before
checking if the key already exists. If the map is full, this triggers
LRU eviction and removes an existing element, even though the update
operation only needs to modify the value of an existing key in-place.
This is problematic because:
1. Users may unexpectedly lose entries when doing simple value updates
2. The eviction overhead is unnecessary for existing key updates
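To make the described behaviour concrete, a rough syscall-side sketch (the map
size, keys, and flags below are made up for illustration and are not taken from
the patch or its selftest; exact eviction behaviour also depends on the LRU
internals and the number of CPUs):

#include <bpf/bpf.h>
#include <stdio.h>

int main(void)
{
	__u32 key;
	__u64 val = 1;
	int fd;

	/* Tiny LRU hash map, deliberately small enough to fill up. */
	fd = bpf_map_create(BPF_MAP_TYPE_LRU_HASH, "lru_demo",
			    sizeof(key), sizeof(val), 8, NULL);
	if (fd < 0)
		return 1;

	/* Fill the map to capacity. */
	for (key = 0; key < 8; key++)
		bpf_map_update_elem(fd, &key, &val, BPF_NOEXIST);

	/* Only update the value of an already-existing key. */
	key = 0;
	val = 2;
	bpf_map_update_elem(fd, &key, &val, BPF_EXIST);

	/* Without the fix, the update above may still have evicted one of
	 * the other keys to make room for the preallocated node.
	 */
	for (key = 1; key < 8; key++) {
		if (bpf_map_lookup_elem(fd, &key, &val))
			printf("key %u was evicted\n", key);
	}
	return 0;
}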
This is not the common LRU map use case. A bpf prog usually does a
lookup first, finds the entry, and then updates the value in place
directly in the bpf prog itself. Only if the lookup fails does it insert
a _new_ element.
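For reference, a minimal sketch of that lookup-then-update pattern on the prog
side (the map definition, section name, and counter logic here are invented for
illustration):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_LRU_HASH);
	__uint(max_entries, 1024);
	__type(key, u32);
	__type(value, u64);
} counters SEC(".maps");

SEC("tracepoint/syscalls/sys_enter_write")
int count_writes(void *ctx)
{
	u32 key = bpf_get_current_pid_tgid() >> 32;
	u64 *val, init = 1;

	val = bpf_map_lookup_elem(&counters, &key);
	if (val) {
		/* Hit: update the value in place; no map update call and
		 * no eviction involved.
		 */
		__sync_fetch_and_add(val, 1);
		return 0;
	}
	/* Miss: insert a _new_ element; this is the only place where
	 * eviction may kick in when the map is full.
	 */
	bpf_map_update_elem(&counters, &key, &init, BPF_NOEXIST);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";

With this pattern, bpf_map_update_elem() is only reached on a miss, so eviction
only happens when a genuinely new element has to be inserted.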
When the map is full, eviction should actually be triggered regardless.
An LRU map that is too small to fit the working set is asking for
trouble.
For updates from the syscall side, if the use case is always updating an
existing element, a regular hashmap should be used instead.
Fix this by first checking if the key exists before allocating a new
node. If the key is found, update the value using the extra lru node
without triggering any eviction.
This will instead add overhead for the common use case described above.
The patch is mostly for getting a selftest case to work in a small LRU
map. I don't think it is worth the added complexity either.
Patches 2 and 3 look ok, but they also only make marginal improvements
to the existing code.
pw-bot: cr
+static int htab_lru_map_update_elem_in_place(struct bpf_htab *htab, void *key, void *value,
+					     u64 map_flags, struct bucket *b,
+					     struct hlist_nulls_head *head, u32 hash,
+					     bool percpu, bool onallcpus)
+{
+	struct htab_elem *l_new, *l_old, *l_free;
+	struct bpf_map *map = &htab->map;
+	u32 key_size = map->key_size;
+	struct bpf_lru_node *node;
+	unsigned long flags;
+	void *l_val;
+	int ret;
+
+	node = bpf_lru_pop_extra(&htab->lru);
+	if (!node)
+		return -ENOENT;
+
+	l_new = container_of(node, struct htab_elem, lru_node);
+	l_new->hash = hash;
+	memcpy(l_new->key, key, key_size);
+	if (!percpu) {
+		l_val = htab_elem_value(l_new, key_size);
+		copy_map_value(map, l_val, value);
+		bpf_obj_free_fields(map->record, l_val);
+	}
+
+	ret = htab_lock_bucket(b, &flags);
+	if (ret)
+		goto err_lock_bucket;
+
+	l_old = lookup_elem_raw(head, hash, key, key_size);
+
+	ret = check_flags(htab, l_old, map_flags);
+	if (ret)
+		goto err;
+
+	if (l_old) {
+		bpf_lru_node_set_ref(&l_new->lru_node);
+		if (percpu) {
+			/* per-cpu hash map can update value in-place.
+			 * Keep the same logic in __htab_lru_percpu_map_update_elem().
+			 */
+			pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
+					value, onallcpus, map_flags);
+			l_free = l_new;
+		} else {
+			hlist_nulls_add_head_rcu(&l_new->hash_node, head);
+			hlist_nulls_del_rcu(&l_old->hash_node);
+			l_free = l_old;
+		}
+	} else {
+		ret = -ENOENT;
+	}
+
+err:
+	htab_unlock_bucket(b, flags);
+
+err_lock_bucket:
+	if (ret) {
+		bpf_lru_push_free(&htab->lru, node);
+	} else {
+		if (l_old && !percpu)
+			bpf_obj_free_fields(map->record, htab_elem_value(l_old, key_size));
Does htab_lru_map_update_elem() have an existing bug that is missing the
bpf_obj_free_fields() on l_old?
+		bpf_lru_push_free(&htab->lru, &l_free->lru_node);
+	}
+
+	return ret;
+}
+
 static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
 				     u64 map_flags)
 {
@@ -1215,6 +1286,11 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value
 	b = __select_bucket(htab, hash);
 	head = &b->head;
+	ret = htab_lru_map_update_elem_in_place(htab, key, value, map_flags, b, head, hash, false,
+						false);
+	if (!ret)
+		return 0;
+
 	/* For LRU, we need to alloc before taking bucket's
 	 * spinlock because getting free nodes from LRU may need
 	 * to remove older elements from htab and this removal
@@ -1354,6 +1430,11 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
 	b = __select_bucket(htab, hash);
 	head = &b->head;
+	ret = htab_lru_map_update_elem_in_place(htab, key, value, map_flags, b, head, hash, true,
+						onallcpus);
+	if (!ret)
+		return 0;
+
 	/* For LRU, we need to alloc before taking bucket's
 	 * spinlock because LRU's elem alloc may need
 	 * to remove older elem from htab and this removal