On Wed, Feb 18, 2026 at 5:22 AM T.J. Mercier <[email protected]> wrote:
>
> Currently some kernfs files (e.g. cgroup.events, memory.events) support
> inotify watches for IN_MODIFY, but unlike with regular filesystems, they
> do not receive IN_DELETE_SELF or IN_IGNORED events when they are
> removed.
>
> This creates a problem for processes monitoring cgroups. For example, a
> service monitoring memory.events for memory.high breaches needs to know
> when a cgroup is removed to clean up its state. Where it's known that a
> cgroup is removed when all processes die, without IN_DELETE_SELF the
> service must resort to inefficient workarounds such as:
> 1.  Periodically scanning procfs to detect process death (wastes CPU and
>     is susceptible to PID reuse).
> 2.  Placing an additional IN_DELETE watch on the parent directory
>     (wastes resources managing double the watches).

This sentence is a red flag for me.
"wastes resources"? What resources are you talking about?
A single inotify watch? That's nothing.
This is not a valid argument IMO.
I fail to see how managing N watches is different from managing 2N watches.
I have no objection to your patch, but we need to keep our arguments honest.

> 3.  Holding a pidfd for every monitored cgroup (can exhaust file
>     descriptors).
>
> This patch enables kernfs to send IN_DELETE_SELF and IN_IGNORED events.
> This allows applications to rely on a single existing watch on the file
> of interest (e.g. memory.events) to receive notifications for both
> modifications and the eventual removal of the file, as well as automatic
> watch descriptor cleanup, simplifying userspace logic and improving
> resource efficiency.
>
> Implementation details:
> The kernfs notification worker is updated to handle file deletion.
> The optimized single call for MODIFY events to both the parent and the
> file is retained, however because CREATE (parent) events remain
> unsupported for kernfs files, support for DELETE (parent) events is not

Either drop this story about DELETE or expand it.
inotify does not generate a DELETE event when watching a file,
because DELETE is an event notifying a change of a directory.
If you had kept your DELETE implementation, it would have
broken this rule.

> added here to retain symmetry. Only support for DELETE_SELF events is
> added.
>
> Signed-off-by: T.J. Mercier <[email protected]>
> Acked-by: Tejun Heo <[email protected]>
> ---
>  fs/kernfs/dir.c             | 21 +++++++++++++++++
>  fs/kernfs/file.c            | 45 ++++++++++++++++++++-----------------
>  fs/kernfs/kernfs-internal.h |  3 +++
>  3 files changed, 48 insertions(+), 21 deletions(-)
>
> diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
> index 29baeeb97871..e5bda829fcb8 100644
> --- a/fs/kernfs/dir.c
> +++ b/fs/kernfs/dir.c
> @@ -9,6 +9,7 @@
>
>  #include <linux/sched.h>
>  #include <linux/fs.h>
> +#include <linux/fsnotify_backend.h>
>  #include <linux/namei.h>
>  #include <linux/idr.h>
>  #include <linux/slab.h>
> @@ -1471,6 +1472,23 @@ void kernfs_show(struct kernfs_node *kn, bool show)
>         up_write(&root->kernfs_rwsem);
>  }
>
> +static void kernfs_notify_file_deleted(struct kernfs_node *kn)
> +{
> +       static DECLARE_WORK(kernfs_notify_deleted_work,
> +                           kernfs_notify_workfn);
> +
> +       guard(spinlock_irqsave)(&kernfs_notify_lock);
> +       /* may overwite already pending FS_MODIFY events */

Typo: "overwite" should be "overwrite".

> +       kn->attr.notify_event = FS_DELETE;

This should be FS_DELETE_SELF, not FS_DELETE.

> +
> +       if (!kn->attr.notify_next) {
> +               kernfs_get(kn);
> +               kn->attr.notify_next = kernfs_notify_list;
> +               kernfs_notify_list = kn;
> +               schedule_work(&kernfs_notify_deleted_work);
> +       }
> +}
> +
>  static void __kernfs_remove(struct kernfs_node *kn)
>  {
>         struct kernfs_node *pos, *parent;
> @@ -1520,6 +1538,9 @@ static void __kernfs_remove(struct kernfs_node *kn)
>                         struct kernfs_iattrs *ps_iattr =
>                                 parent ? parent->iattr : NULL;
>
> +                       if (kernfs_type(pos) == KERNFS_FILE)
> +                               kernfs_notify_file_deleted(pos);
> +

Why are we not notifying a deleted directory?
If users expect DELETE_SELF on a watched cgroup file
they would definitely expect DELETE_SELF on a watched cgroup dir
when the cgroup is destroyed.

I claim that *this* should be the standard way to monitor
destroyed cgroups.

>                         /* update timestamps on the parent */
>                         down_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
>
> diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
> index e978284ff983..4be9bbe29378 100644
> --- a/fs/kernfs/file.c
> +++ b/fs/kernfs/file.c
> @@ -37,8 +37,8 @@ struct kernfs_open_node {
>   */
>  #define KERNFS_NOTIFY_EOL                      ((void *)&kernfs_notify_list)
>
> -static DEFINE_SPINLOCK(kernfs_notify_lock);
> -static struct kernfs_node *kernfs_notify_list = KERNFS_NOTIFY_EOL;
> +DEFINE_SPINLOCK(kernfs_notify_lock);
> +struct kernfs_node *kernfs_notify_list = KERNFS_NOTIFY_EOL;
>
>  static inline struct mutex *kernfs_open_file_mutex_ptr(struct kernfs_node 
> *kn)
>  {
> @@ -909,7 +909,7 @@ static loff_t kernfs_fop_llseek(struct file *file, loff_t 
> offset, int whence)
>         return ret;
>  }
>
> -static void kernfs_notify_workfn(struct work_struct *work)
> +void kernfs_notify_workfn(struct work_struct *work)
>  {
>         struct kernfs_node *kn;
>         struct kernfs_super_info *info;
> @@ -935,11 +935,7 @@ static void kernfs_notify_workfn(struct work_struct 
> *work)
>         down_read(&root->kernfs_supers_rwsem);
>         down_read(&root->kernfs_rwsem);
>         list_for_each_entry(info, &kernfs_root(kn)->supers, node) {
> -               struct kernfs_node *parent;
> -               struct inode *p_inode = NULL;
> -               const char *kn_name;
>                 struct inode *inode;
> -               struct qstr name;
>
>                 /*
>                  * We want fsnotify_modify() on @kn but as the
> @@ -951,24 +947,31 @@ static void kernfs_notify_workfn(struct work_struct 
> *work)
>                 if (!inode)
>                         continue;
>
> -               kn_name = kernfs_rcu_name(kn);
> -               name = QSTR(kn_name);
> -               parent = kernfs_get_parent(kn);
> -               if (parent) {
> -                       p_inode = ilookup(info->sb, kernfs_ino(parent));
> -                       if (p_inode) {
> -                               fsnotify(notify_event | FS_EVENT_ON_CHILD,
> -                                        inode, FSNOTIFY_EVENT_INODE,
> -                                        p_inode, &name, inode, 0);
> -                               iput(p_inode);
> +               if (notify_event == FS_DELETE) {
This should compare against FS_DELETE_SELF, not FS_DELETE.

> +                       fsnotify_inoderemove(inode);
                            iput(inode);
                            continue;
                    }

Handling the delete case with an early iput() and continue avoids all the
churn and unneeded extra indentation that follows.

Thanks,
Amir.

Reply via email to