On Wed, 7 Jan 2026 09:33:27 -0600 John Groves <[email protected]> wrote:
> Memory errors are at least somewhat more likely on disaggregated memory
> than on-board memory. This commit registers to be notified by fsdev_dax
> in the event that a memory failure is detected.
>
> When a file access resolves to a daxdev with memory errors, it will fail
> with an appropriate error.
>
> If a daxdev failed fs_dax_get(), we set dd->dax_err. If a daxdev called
> our notify_failure(), set dd->error. When any of the above happens, set
> (file)->error and stop allowing access.
>
> In general, the recovery from memory errors is to unmount the file
> system and re-initialize the memory, but there may be usable degraded
> modes of operation - particularly in the future when famfs supports
> file systems backed by more than one daxdev. In those cases,
> accessing data that is on a working daxdev can still work.
>
> For now, return errors for any file that has encountered a memory or dax
> error.
>
> Signed-off-by: John Groves <[email protected]>
> ---
>  fs/fuse/famfs.c       | 115 +++++++++++++++++++++++++++++++++++++++---
>  fs/fuse/famfs_kfmap.h |   3 +-
>  2 files changed, 109 insertions(+), 9 deletions(-)
>
> diff --git a/fs/fuse/famfs.c b/fs/fuse/famfs.c
> index c02b14789c6e..4eb87c5c628e 100644
> --- a/fs/fuse/famfs.c
> +++ b/fs/fuse/famfs.c
> @@ -254,6 +288,38 @@ famfs_update_daxdev_table(
>  	return 0;
>  }
>
> +static void
> +famfs_set_daxdev_err(
> +	struct fuse_conn *fc,
> +	struct dax_device *dax_devp)
> +{
> +	int i;
> +
> +	/* Gotta search the list by dax_devp;
> +	 * read lock because we're not adding or removing daxdev entries
> +	 */
> +	down_read(&fc->famfs_devlist_sem);

Use a guard()

> +	for (i = 0; i < fc->dax_devlist->nslots; i++) {
> +		if (fc->dax_devlist->devlist[i].valid) {
> +			struct famfs_daxdev *dd = &fc->dax_devlist->devlist[i];
> +
> +			if (dd->devp != dax_devp)
> +				continue;
> +
> +			dd->error = true;
> +			up_read(&fc->famfs_devlist_sem);
> +
> +			pr_err("%s: memory error on daxdev %s (%d)\n",
> +			       __func__, dd->name, i);
> +			goto done;
> +		}
> +	}
> +	up_read(&fc->famfs_devlist_sem);
> +	pr_err("%s: memory err on unrecognized daxdev\n", __func__);
> +
> +done:

If this isn't getting more interesting, just return above.

> +}
> +
>  /***************************************************************************/
>
>  void
> @@ -611,6 +677,26 @@ famfs_file_init_dax(
>
>  static ssize_t famfs_file_bad(struct inode *inode);
>
> +static int famfs_dax_err(struct famfs_daxdev *dd)

I'd introduce this earlier in the series to reduce need to refactor below.

> +{
> +	if (!dd->valid) {
> +		pr_err("%s: daxdev=%s invalid\n",
> +		       __func__, dd->name);
> +		return -EIO;
> +	}
> +	if (dd->dax_err) {
> +		pr_err("%s: daxdev=%s dax_err\n",
> +		       __func__, dd->name);
> +		return -EIO;
> +	}
> +	if (dd->error) {
> +		pr_err("%s: daxdev=%s memory error\n",
> +		       __func__, dd->name);
> +		return -EHWPOISON;
> +	}
> +	return 0;
> +}

...

> @@ -966,7 +1064,8 @@ famfs_file_bad(struct inode *inode)
>  		return -EIO;
>  	}
>  	if (meta->error) {
> -		pr_debug("%s: previously detected metadata errors\n", __func__);
> +		pr_debug("%s: previously detected metadata errors\n",
> +			 __func__);

Spurious change.

>  		return -EIO;
>  	}
