Re: [PATCH v12 2/2] mm, pmem, xfs: Introduce MF_MEM_REMOVE for unbind

2023-07-31 Thread Darrick J. Wong
On Mon, Jul 31, 2023 at 05:36:36PM +0800, Shiyang Ruan wrote:
> 
> 
> 在 2023/7/29 23:15, Darrick J. Wong 写道:
> > On Thu, Jun 29, 2023 at 04:16:51PM +0800, Shiyang Ruan wrote:
> > > This patch is inspired by Dan's "mm, dax, pmem: Introduce
> > > dev_pagemap_failure()"[1].  With the help of dax_holder and
> > > ->notify_failure() mechanism, the pmem driver is able to ask filesystem
> > > on it to unmap all files in use, and notify processes who are using
> > > those files.
> > > 
> > > Call trace:
> > > trigger unbind
> > >   -> unbind_store()
> > >-> ... (skip)
> > > -> devres_release_all()
> > >  -> kill_dax()
> > >   -> dax_holder_notify_failure(dax_dev, 0, U64_MAX, MF_MEM_PRE_REMOVE)
> > >-> xfs_dax_notify_failure()
> > >`-> freeze_super() // freeze (kernel call)
> > >`-> do xfs rmap
> > >` -> mf_dax_kill_procs()
> > >`  -> collect_procs_fsdax()// all associated processes
> > >`  -> unmap_and_kill()
> > >` -> invalidate_inode_pages2_range() // drop file's cache
> > >`-> thaw_super()   // thaw (both kernel & user call)
> > > 
> > > Introduce MF_MEM_PRE_REMOVE to let filesystem know this is a remove
> > > event.  Use the exclusive freeze/thaw[2] to lock the filesystem to prevent
> > > new dax mapping from being created.  Do not shutdown filesystem directly
> > > if configuration is not supported, or if failure range includes metadata
> > > area.  Make sure all files and processes(not only the current progress)
> > > are handled correctly.  Also drop the cache of associated files before
> > > pmem is removed.
> > > 
> > > [1]: 
> > > https://lore.kernel.org/linux-mm/161604050314.1463742.14151665140035795571.st...@dwillia2-desk3.amr.corp.intel.com/
> > > [2]: 
> > > https://lore.kernel.org/linux-xfs/168688010689.860947.1788875898367401950.stgit@frogsfrogsfrogs/
> > > 
> > > Signed-off-by: Shiyang Ruan 
> > > ---
> > >   drivers/dax/super.c |  3 +-
> > >   fs/xfs/xfs_notify_failure.c | 86 ++---
> > >   include/linux/mm.h  |  1 +
> > >   mm/memory-failure.c | 17 ++--
> > >   4 files changed, 96 insertions(+), 11 deletions(-)
> > > 
> > > diff --git a/drivers/dax/super.c b/drivers/dax/super.c
> > > index c4c4728a36e4..2e1a35e82fce 100644
> > > --- a/drivers/dax/super.c
> > > +++ b/drivers/dax/super.c
> > > @@ -323,7 +323,8 @@ void kill_dax(struct dax_device *dax_dev)
> > >   return;
> > >   if (dax_dev->holder_data != NULL)
> > > - dax_holder_notify_failure(dax_dev, 0, U64_MAX, 0);
> > > + dax_holder_notify_failure(dax_dev, 0, U64_MAX,
> > > + MF_MEM_PRE_REMOVE);
> > >   clear_bit(DAXDEV_ALIVE, _dev->flags);
> > >   synchronize_srcu(_srcu);
> > > diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
> > > index 4a9bbd3fe120..f6ec56b76db6 100644
> > > --- a/fs/xfs/xfs_notify_failure.c
> > > +++ b/fs/xfs/xfs_notify_failure.c
> > > @@ -22,6 +22,7 @@
> > >   #include 
> > >   #include 
> > > +#include 
> > >   struct xfs_failure_info {
> > >   xfs_agblock_t   startblock;
> > > @@ -73,10 +74,16 @@ xfs_dax_failure_fn(
> > >   struct xfs_mount*mp = cur->bc_mp;
> > >   struct xfs_inode*ip;
> > >   struct xfs_failure_info *notify = data;
> > > + struct address_space*mapping;
> > > + pgoff_t pgoff;
> > > + unsigned long   pgcnt;
> > >   int error = 0;
> > >   if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
> > >   (rec->rm_flags & (XFS_RMAP_ATTR_FORK | 
> > > XFS_RMAP_BMBT_BLOCK))) {
> > > + /* Continue the query because this isn't a failure. */
> > > + if (notify->mf_flags & MF_MEM_PRE_REMOVE)
> > > + return 0;
> > >   notify->want_shutdown = true;
> > >   return 0;
> > >   }
> > > @@ -92,14 +99,55 @@ xfs_dax_failure_fn(
> > >   return 0;
> > >   }
> > > - error = mf_dax_kill_procs(VFS_I(ip)->i_mapping,
> > > -   xfs_failure_pgoff(mp, rec, notify),
> > > -   xfs_failure_pgcnt(mp, rec, notify),
> > > -   notify->mf_flags);
> > > + mapping = VFS_I(ip)->i_mapping;
> > > + pgoff = xfs_failure_pgoff(mp, rec, notify);
> > > + pgcnt = xfs_failure_pgcnt(mp, rec, notify);
> > > +
> > > + /* Continue the rmap query if the inode isn't a dax file. */
> > > + if (dax_mapping(mapping))
> > > + error = mf_dax_kill_procs(mapping, pgoff, pgcnt,
> > > +   notify->mf_flags);
> > > +
> > > + /* Invalidate the cache in dax pages. */
> > > + if (notify->mf_flags & MF_MEM_PRE_REMOVE)
> > > + invalidate_inode_pages2_range(mapping, pgoff,
> > > +   

Re: [PATCH ndctl v2] ndctl/cxl/test: Add CXL event test

2023-07-31 Thread Verma, Vishal L
On Mon, 2023-07-31 at 16:53 -0700, Ira Weiny wrote:
> Previously CXL event testing was run by hand.  This reduces testing

Reduces or increases / improves? Or did you mean running by hand
reduced coverage.

Maybe this can read "Improve testing coverage and address a lack of
automated regression testing by adding a unit test for this"

(no need to respin, I can fixup when applying, just making sure I'm not
misinterpreting what you meant to say).

> coverage including a lack of regression testing.
> 
> Add a CXL event test as part of the meson test infrastructure.  Passing
> is predicated on receiving the appropriate number of errors in each log.
> Individual event values are not checked.
> 
> Signed-off-by: Ira Weiny 
> ---
> Changes in v2:
> [djiang] run shellcheck and fix as needed 
>     
> [vishal] quote variables  
>     
> [vishal] skip test if event_trigger is not available  
>     
> [vishal] remove dead code 
>     
> [vishal] explicitly use the first memdev returned from cxl-cli
>     
> [vishal] store trace output in a variable 
>     
> [vishal] simplify grep statement looking for results  
>     
> [vishal] use variables for expected values
>     
> - Link to v1: 
> https://lore.kernel.org/r/20230726-cxl-event-v1-1-1cf8cb02b...@intel.com
> ---
>  test/cxl-events.sh | 76 
> ++
>  test/meson.build   |  2 ++
>  2 files changed, 78 insertions(+)
> 
Thanks for the rev, everything else looks good.


[PATCH ndctl v2] ndctl/cxl/test: Add CXL event test

2023-07-31 Thread Ira Weiny
Previously CXL event testing was run by hand.  This reduces testing
coverage including a lack of regression testing.

Add a CXL event test as part of the meson test infrastructure.  Passing
is predicated on receiving the appropriate number of errors in each log.
Individual event values are not checked.

Signed-off-by: Ira Weiny 
---
Changes in v2:
[djiang] run shellcheck and fix as needed   
  
[vishal] quote variables
  
[vishal] skip test if event_trigger is not available
  
[vishal] remove dead code   
  
[vishal] explicitly use the first memdev returned from cxl-cli  
  
[vishal] store trace output in a variable   
  
[vishal] simplify grep statement looking for results
  
[vishal] use variables for expected values  
  
- Link to v1: 
https://lore.kernel.org/r/20230726-cxl-event-v1-1-1cf8cb02b...@intel.com
---
 test/cxl-events.sh | 76 ++
 test/meson.build   |  2 ++
 2 files changed, 78 insertions(+)

diff --git a/test/cxl-events.sh b/test/cxl-events.sh
new file mode 100644
index ..33b68daa6ade
--- /dev/null
+++ b/test/cxl-events.sh
@@ -0,0 +1,76 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2023 Intel Corporation. All rights reserved.
+
+. "$(dirname "$0")/common"
+
+# Results expected
+num_overflow_expected=1
+num_fatal_expected=2
+num_failure_expected=16
+num_info_expected=3
+
+set -ex
+
+trap 'err $LINENO' ERR
+
+check_prereq "jq"
+
+modprobe -r cxl_test
+modprobe cxl_test
+
+dev_path="/sys/bus/platform/devices"
+
+test_cxl_events()
+{
+   memdev="$1"
+
+   if [ ! -f "${dev_path}/${memdev}/event_trigger" ]; then
+   echo "TEST: Kernel does not support test event trigger"
+   exit 77
+   fi
+
+   echo "TEST: triggering $memdev"
+   echo 1 > "${dev_path}/${memdev}/event_trigger"
+}
+
+readarray -t memdevs < <("$CXL" list -b cxl_test -Mi | jq -r '.[].host')
+
+echo "TEST: Prep event trace"
+echo "" > /sys/kernel/tracing/trace
+echo 1 > /sys/kernel/tracing/events/cxl/enable
+echo 1 > /sys/kernel/tracing/tracing_on
+
+test_cxl_events "${memdevs[0]}"
+
+echo 0 > /sys/kernel/tracing/tracing_on
+
+echo "TEST: Events seen"
+trace_out=$(cat /sys/kernel/tracing/trace)
+
+num_overflow=$(grep -c "cxl_overflow" <<< "${trace_out}")
+num_fatal=$(grep -c "log=Fatal" <<< "${trace_out}")
+num_failure=$(grep -c "log=Failure" <<< "${trace_out}")
+num_info=$(grep -c "log=Informational" <<< "${trace_out}")
+echo " LOG (Expected) : (Found)"
+echo " overflow  ($num_overflow_expected) : $num_overflow"
+echo " Fatal ($num_fatal_expected) : $num_fatal"
+echo " Failure   ($num_failure_expected) : $num_failure"
+echo " Informational ($num_info_expected) : $num_info"
+
+if [ "$num_overflow" -ne $num_overflow_expected ]; then
+   err "$LINENO"
+fi
+if [ "$num_fatal" -ne $num_fatal_expected ]; then
+   err "$LINENO"
+fi
+if [ "$num_failure" -ne $num_failure_expected ]; then
+   err "$LINENO"
+fi
+if [ "$num_info" -ne $num_info_expected ]; then
+   err "$LINENO"
+fi
+
+check_dmesg "$LINENO"
+
+modprobe -r cxl_test
diff --git a/test/meson.build b/test/meson.build
index a956885f6df6..a33255bde1a8 100644
--- a/test/meson.build
+++ b/test/meson.build
@@ -155,6 +155,7 @@ cxl_sysfs = find_program('cxl-region-sysfs.sh')
 cxl_labels = find_program('cxl-labels.sh')
 cxl_create_region = find_program('cxl-create-region.sh')
 cxl_xor_region = find_program('cxl-xor-region.sh')
+cxl_events = find_program('cxl-events.sh')
 
 tests = [
   [ 'libndctl',   libndctl,  'ndctl' ],
@@ -183,6 +184,7 @@ tests = [
   [ 'cxl-labels.sh',  cxl_labels,'cxl'   ],
   [ 'cxl-create-region.sh',   cxl_create_region,  'cxl'   ],
   [ 'cxl-xor-region.sh',  cxl_xor_region, 'cxl'   ],
+  [ 'cxl-events.sh',  cxl_events, 'cxl'   ],
 ]
 
 if get_option('destructive').enabled()

---
base-commit: 2fd570a0ed788b1bd0971dfdb1466a5dbcb79775
change-id: 20230726-cxl-event-dc00a2f94b60

Best regards,
-- 
Ira Weiny 




[PATCH V2 1/1] pmem: set QUEUE_FLAG_NOWAIT

2023-07-31 Thread Chaitanya Kulkarni
Set the QUEUE_FLAG_NOWAIT. Following are the performance numbers with
io_uring fio engine for random read, note that device has been populated
fully with randwrite workload before taking these numbers :-

linux-block (pmem-nowait-on) # grep IOPS  pmem*fio | column -t
pmem-nowait-off-1.fio:  read:  IOPS=3683k,  BW=14.0GiB/s
pmem-nowait-off-2.fio:  read:  IOPS=3819k,  BW=14.6GiB/s
pmem-nowait-off-3.fio:  read:  IOPS=3999k,  BW=15.3GiB/s

pmem-nowait-on-1.fio:   read:  IOPS=5837k,  BW=22.3GiB/s
pmem-nowait-on-2.fio:   read:  IOPS=5936k,  BW=22.6GiB/s
pmem-nowait-on-3.fio:   read:  IOPS=5945k,  BW=22.7GiB/s

linux-block (pmem-nowait-on) # grep cpu  pmem*fio | column -t
pmem-nowait-off-1.fio:  cpu  :  usr=7.09%,   sys=29.65%,  ctx=198742065
pmem-nowait-off-2.fio:  cpu  :  usr=6.89%,   sys=30.56%,  ctx=205817652
pmem-nowait-off-3.fio:  cpu  :  usr=6.86%,   sys=30.94%,  ctx=222627094

pmem-nowait-on-1.fio:   cpu  :  usr=10.58%,  sys=88.44%,  ctx=27181   
pmem-nowait-on-2.fio:   cpu  :  usr=10.50%,  sys=87.75%,  ctx=25746   
pmem-nowait-on-3.fio:   cpu  :  usr=10.67%,  sys=88.60%,  ctx=28261   

linux-block (pmem-nowait-on) # grep slat  pmem*fio | column -t
pmem-nowait-off-1.fio:  slat  (nsec):  min=432,   max=50847k,  avg=9324.69
pmem-nowait-off-2.fio:  slat  (nsec):  min=441,   max=52557k,  avg=9132.45
pmem-nowait-off-3.fio:  slat  (nsec):  min=430,   max=36113k,  avg=9132.63

pmem-nowait-on-1.fio:   slat  (nsec):  min=1393,  max=68090k,  avg=7615.31
pmem-nowait-on-2.fio:   slat  (nsec):  min=1222,  max=44137k,  avg=7493.77
pmem-nowait-on-3.fio:   slat  (nsec):  min=1493,  max=40100k,  avg=7486.36

Signed-off-by: Chaitanya Kulkarni 
---
 drivers/nvdimm/pmem.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 46e094e56159..ddd485c377eb 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -543,6 +543,7 @@ static int pmem_attach_disk(struct device *dev,
blk_queue_max_hw_sectors(q, UINT_MAX);
blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, q);
+   blk_queue_flag_set(QUEUE_FLAG_NOWAIT, q);
if (pmem->pfn_flags & PFN_MAP)
blk_queue_flag_set(QUEUE_FLAG_DAX, q);
 
-- 
2.40.0




[PATCH V2 0/1] pmem: set QUEUE_FLAG_NOWAIT

2023-07-31 Thread Chaitanya Kulkarni
Hi,

Set the QUEUE_FLAG_NOWAIT. Following are the performance numbers with
io_uring fio engine for random read, note that device has been populated
fully with randwrite workload before taking these numbers :-

linux-block (pmem-nowait-on) # grep IOPS  pmem*fio | column -t
pmem-nowait-off-1.fio:  read:  IOPS=3683k,  BW=14.0GiB/s
pmem-nowait-off-2.fio:  read:  IOPS=3819k,  BW=14.6GiB/s
pmem-nowait-off-3.fio:  read:  IOPS=3999k,  BW=15.3GiB/s

pmem-nowait-on-1.fio:   read:  IOPS=5837k,  BW=22.3GiB/s
pmem-nowait-on-2.fio:   read:  IOPS=5936k,  BW=22.6GiB/s
pmem-nowait-on-3.fio:   read:  IOPS=5945k,  BW=22.7GiB/s

linux-block (pmem-nowait-on) # grep cpu  pmem*fio | column -t
pmem-nowait-off-1.fio:  cpu  :  usr=7.09%,   sys=29.65%,  ctx=198742065
pmem-nowait-off-2.fio:  cpu  :  usr=6.89%,   sys=30.56%,  ctx=205817652
pmem-nowait-off-3.fio:  cpu  :  usr=6.86%,   sys=30.94%,  ctx=222627094

pmem-nowait-on-1.fio:   cpu  :  usr=10.58%,  sys=88.44%,  ctx=27181   
pmem-nowait-on-2.fio:   cpu  :  usr=10.50%,  sys=87.75%,  ctx=25746   
pmem-nowait-on-3.fio:   cpu  :  usr=10.67%,  sys=88.60%,  ctx=28261   

linux-block (pmem-nowait-on) # grep slat  pmem*fio | column -t
pmem-nowait-off-1.fio:  slat  (nsec):  min=432,   max=50847k,  avg=9324.69
pmem-nowait-off-2.fio:  slat  (nsec):  min=441,   max=52557k,  avg=9132.45
pmem-nowait-off-3.fio:  slat  (nsec):  min=430,   max=36113k,  avg=9132.63

pmem-nowait-on-1.fio:   slat  (nsec):  min=1393,  max=68090k,  avg=7615.31
pmem-nowait-on-2.fio:   slat  (nsec):  min=1222,  max=44137k,  avg=7493.77
pmem-nowait-on-3.fio:   slat  (nsec):  min=1493,  max=40100k,  avg=7486.36

Please let me know if further testing is needed I've ran fio verification
job in order to make verify these changes.

-ck

V2:-

Unconditionally set the QUEUE_FLAG_NOWAIT in pmem_attach_disk() along
with the other queue flags.

Chaitanya Kulkarni (1):
  pmem: set QUEUE_FLAG_NOWAIT

 drivers/nvdimm/pmem.c | 1 +
 1 file changed, 1 insertion(+)

linux-block (pmem-nowait-on) # ./test-pmem.sh 
++ unload_mod
++ rmmod nd_pmem
++ rmmod nd_btt
++ git checkout for-next
Switched to branch 'for-next'
Your branch is ahead of 'origin/for-next' by 155 commits.
  (use "git push" to publish your local commits)
++ git log -1
commit e50c5e801b5a9e1797eb5a157eac1b5e50084486 (HEAD -> for-next)
Merge: e6dfe861227b e98acd815ebf
Author: Chaitanya Kulkarni 
Date:   Mon Jul 31 14:48:39 2023 -0700

Merge branch 'for-next' of git://git.kernel.dk/linux-block into for-next
++ makej M=drivers/nvdimm
  CC [M]  drivers/nvdimm/pmem.o
  LD [M]  drivers/nvdimm/nd_pmem.o
  MODPOST drivers/nvdimm/Module.symvers
  LD [M]  drivers/nvdimm/nd_pmem.ko
++ load_mod
++ insmod drivers/nvdimm/nd_btt.ko
++ insmod drivers/nvdimm/nd_pmem.ko
++ sleep 1
++ test_pmem nowait-off
++ sleep 1
++ fio fio/verify.fio --ioengine=io_uring --size=896M --filename=/dev/pmem0
write-and-verify: (g=0): rw=randwrite, bs=(R) 4096B-4096B, (W) 4096B-4096B, (T) 
4096B-4096B, ioengine=io_uring, iodepth=16
fio-3.34
Starting 1 process
Jobs: 1 (f=1)
write-and-verify: (groupid=0, jobs=1): err= 0: pid=4662: Mon Jul 31 15:24:07 
2023
  read: IOPS=265k, BW=1036MiB/s (1087MB/s)(566MiB/546msec)
slat (nsec): min=501, max=31820, avg=2733.20, stdev=1179.52
clat (nsec): min=17022, max=96063, avg=56544.09, stdev=5848.22
 lat (usec): min=20, max=101, avg=59.28, stdev= 6.06
clat percentiles (nsec):
 |  1.00th=[44288],  5.00th=[46848], 10.00th=[48896], 20.00th=[51456],
 | 30.00th=[53504], 40.00th=[2], 50.00th=[56576], 60.00th=[58112],
 | 70.00th=[59648], 80.00th=[61184], 90.00th=[63744], 95.00th=[66048],
 | 99.00th=[72192], 99.50th=[74240], 99.90th=[80384], 99.95th=[82432],
 | 99.99th=[88576]
  write: IOPS=209k, BW=818MiB/s (857MB/s)(896MiB/1096msec); 0 zone resets
slat (nsec): min=1352, max=113484, avg=4293.77, stdev=1425.01
clat (usec): min=25, max=285, avg=71.81, stdev= 9.33
 lat (usec): min=31, max=288, avg=76.10, stdev= 9.56
clat percentiles (usec):
 |  1.00th=[   45],  5.00th=[   61], 10.00th=[   65], 20.00th=[   68],
 | 30.00th=[   70], 40.00th=[   71], 50.00th=[   72], 60.00th=[   73],
 | 70.00th=[   75], 80.00th=[   77], 90.00th=[   80], 95.00th=[   84],
 | 99.00th=[  102], 99.50th=[  113], 99.90th=[  169], 99.95th=[  180],
 | 99.99th=[  219]
   bw (  KiB/s): min=152408, max=857568, per=73.07%, avg=611669.33, 
stdev=398064.54, samples=3
   iops: min=38102, max=214392, avg=152917.33, stdev=99516.13, samples=3
  lat (usec)   : 20=0.01%, 50=6.40%, 100=92.93%, 250=0.67%, 500=0.01%
  cpu  : usr=35.49%, sys=63.60%, ctx=2561, majf=0, minf=3973
  IO depths: 1=0.1%, 2=0.1%, 4=0.1%, 8=0.1%, 16=100.0%, 32=0.0%, >=64=0.0%
 submit: 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
 complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.1%, 32=0.0%, 64=0.0%, >=64=0.0%
 issued rwts: total=144875,229376,0,0 short=0,0,0,0 dropped=0,0,0,0
 latency   : target=0, window=0, percentile=100.00%, 

Re: [PATCH ndctl] ndctl/cxl/test: Add CXL event test

2023-07-31 Thread Ira Weiny
Verma, Vishal L wrote:
> On Thu, 2023-07-27 at 14:21 -0700, Ira Weiny wrote:
> > Previously CXL event testing was run by hand.  This reduces testing
> > coverage including a lack of regression testing.
> > 
> > Add a CXL test as part of the meson test infrastructure.  Passing is
> > predicated on receiving the appropriate number of errors in each log.
> > Individual event values are not checked.
> > 
> > Signed-off-by: Ira Weiny 
> > ---
> >  test/cxl-events.sh | 68 
> > ++
> >  test/meson.build   |  2 ++
> >  2 files changed, 70 insertions(+)
> 
> Hi Ira,
> 
> Thanks for adding this test. Just a few minor comments below, otherwise
> looks good.

Thanks!

> 
> > 
> > diff --git a/test/cxl-events.sh b/test/cxl-events.sh
> > new file mode 100644
> > index ..f51046ec39ad
> > --- /dev/null
> > +++ b/test/cxl-events.sh
> > @@ -0,0 +1,68 @@
> > +#!/bin/bash
> > +# SPDX-License-Identifier: GPL-2.0
> > +# Copyright (C) 2023 Intel Corporation. All rights reserved.
> > +
> > +. $(dirname $0)/common
> > +
> > +set -ex
> > +
> > +trap 'err $LINENO' ERR
> > +
> > +check_prereq "jq"
> > +
> > +modprobe -r cxl_test
> > +modprobe cxl_test
> > +
> > +dev_path="/sys/bus/platform/devices"
> > +
> > +test_cxl_events()
> > +{
> > +   memdev="$1"
> > +
> > +   echo "TEST: triggering $memdev"
> > +   echo 1 > $dev_path/$memdev/event_trigger
> 
> Quote the "$variables" here. We don't expect spaces in the path in this
> case, so it will still work, but it is good practice to always quote
> everything.

Done.

> 
> We might also need a test to see if this file exists first. For kernels
> that don't have this support, we probably want to print a message and
> skip the test (return '77').

Good idea.

> 
> > +}
> > +
> > +readarray -t memdevs < <("$CXL" list -b cxl_test -Mi | jq -r '.[].host')
> > +
> > +echo "TEST: Prep event trace"
> > +echo "" > /sys/kernel/tracing/trace
> > +echo 1 > /sys/kernel/tracing/events/cxl/enable
> > +echo 1 > /sys/kernel/tracing/tracing_on
> > +
> > +# Only need to test 1 device
> > +#for memdev in ${memdevs[@]}; do
> > +#done
> 
> Probably just remove the commented out loop, if we need to test more
> than one memdev in the future, it is easy enough to add back then.

Done.

> 
> > +
> > +test_cxl_events "$memdevs"
> 
> Shouldn't use "$memdevs" here since it is an array. If you want to pass
> in just the first memdev, use "${memdevs[0]}"

Ah yea caught my hack ;-)  done.

> 
> > +
> > +echo 0 > /sys/kernel/tracing/tracing_on
> > +
> > +echo "TEST: Events seen"
> > +cat /sys/kernel/tracing/trace
> > +num_overflow=$(grep "cxl_overflow" /sys/kernel/tracing/trace | wc -l)
> > +num_fatal=$(grep "log=Fatal" /sys/kernel/tracing/trace | wc -l)
> > +num_failure=$(grep "log=Failure" /sys/kernel/tracing/trace | wc -l)
> > +num_info=$(grep "log=Informational" /sys/kernel/tracing/trace | wc -l)
> 
> Minor nit, but you can 'grep -c' instead of 'grep | wc -l'

Ok Done.

> 
> Also could put /sys/kernel/tracing/trace into a variable just for
> readability since it is used many times.

Done.

> 
> > +echo " LOG (Expected) : (Found)"
> > +echo " overflow  ( 1) : $num_overflow"
> > +echo " Fatal ( 2) : $num_fatal"
> > +echo " Failure   (16) : $num_failure"
> > +echo " Informational ( 3) : $num_info"
> > +
> > +if [ "$num_overflow" -ne 1 ]; then
> > +   err "$LINENO"
> > +fi
> > +if [ "$num_fatal" -ne 2 ]; then
> > +   err "$LINENO"
> > +fi
> > +if [ "$num_failure" -ne 16 ]; then
> > +   err "$LINENO"
> > +fi
> > +if [ "$num_info" -ne 3 ]; then
> > +   err "$LINENO"
> > +fi
> 
> Perhaps define variables for each of the expected nums, that way there
> is only one spot to change in case the numbers change in the future.

Good idea, done.

V2 on it's way soon, thanks for looking,
Ira

> 
> > +
> > +check_dmesg "$LINENO"
> > +
> > +modprobe -r cxl_test
> > diff --git a/test/meson.build b/test/meson.build
> > index a956885f6df6..a33255bde1a8 100644
> > --- a/test/meson.build
> > +++ b/test/meson.build
> > @@ -155,6 +155,7 @@ cxl_sysfs = find_program('cxl-region-sysfs.sh')
> >  cxl_labels = find_program('cxl-labels.sh')
> >  cxl_create_region = find_program('cxl-create-region.sh')
> >  cxl_xor_region = find_program('cxl-xor-region.sh')
> > +cxl_events = find_program('cxl-events.sh')
> >  
> >  tests = [
> >    [ 'libndctl',   libndctl,  'ndctl' ],
> > @@ -183,6 +184,7 @@ tests = [
> >    [ 'cxl-labels.sh',  cxl_labels,    'cxl'   ],
> >    [ 'cxl-create-region.sh',   cxl_create_region,  'cxl'   ],
> >    [ 'cxl-xor-region.sh',  cxl_xor_region, 'cxl'   ],
> > +  [ 'cxl-events.sh',  cxl_events, 'cxl'   ],
> >  ]
> >  
> >  if get_option('destructive').enabled()
> > 
> > ---
> > base-commit: 2fd570a0ed788b1bd0971dfdb1466a5dbcb79775
> > change-id: 20230726-cxl-event-dc00a2f94b60
> > 
> > Best regards,
> 





Re: [PATCH ndctl] cxl/memdev: initialize 'rc' in action_update_fw()

2023-07-31 Thread Verma, Vishal L
On Mon, 2023-07-31 at 14:18 -0600, Vishal Verma wrote:
> Static analysis complains that in some cases, an uninitialized 'rc' can
> get returned from action_update_fw(). Since this can only happen in a
> 'no-op' case, initialize rc to 0.
> 
> Signed-off-by: Vishal Verma 

This should've included a

  Fixes: 64ad46b4a147 ("cxl: add an update-firmware command")

tag. I'll add it when applying.

> ---
>  cxl/memdev.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/cxl/memdev.c b/cxl/memdev.c
> index 1ad871a..f6a2d3f 100644
> --- a/cxl/memdev.c
> +++ b/cxl/memdev.c
> @@ -679,7 +679,7 @@ static int action_update_fw(struct cxl_memdev *memdev,
> const char *devname = cxl_memdev_get_devname(memdev);
> struct json_object *jmemdev;
> unsigned long flags;
> -   int rc;
> +   int rc = 0;
>  
> if (param.cancel)
> return cxl_memdev_cancel_fw_update(memdev);
> 
> ---
> base-commit: 32cec0c5cfe669940107ce030beeb1e02e5a767b
> change-id: 20230731-coverity-fix-edc28fd6e0fe
> 
> Best regards,



Re: [PATCH ndctl] cxl/memdev: initialize 'rc' in action_update_fw()

2023-07-31 Thread Dave Jiang




On 7/31/23 13:18, Vishal Verma wrote:

Static analysis complains that in some cases, an uninitialized 'rc' can
get returned from action_update_fw(). Since this can only happen in a
'no-op' case, initialize rc to 0.

Signed-off-by: Vishal Verma 


Reviewed-by: Dave Jiang 

---
  cxl/memdev.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cxl/memdev.c b/cxl/memdev.c
index 1ad871a..f6a2d3f 100644
--- a/cxl/memdev.c
+++ b/cxl/memdev.c
@@ -679,7 +679,7 @@ static int action_update_fw(struct cxl_memdev *memdev,
const char *devname = cxl_memdev_get_devname(memdev);
struct json_object *jmemdev;
unsigned long flags;
-   int rc;
+   int rc = 0;
  
  	if (param.cancel)

return cxl_memdev_cancel_fw_update(memdev);

---
base-commit: 32cec0c5cfe669940107ce030beeb1e02e5a767b
change-id: 20230731-coverity-fix-edc28fd6e0fe

Best regards,




[PATCH ndctl] cxl/memdev: initialize 'rc' in action_update_fw()

2023-07-31 Thread Vishal Verma
Static analysis complains that in some cases, an uninitialized 'rc' can
get returned from action_update_fw(). Since this can only happen in a
'no-op' case, initialize rc to 0.

Signed-off-by: Vishal Verma 
---
 cxl/memdev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cxl/memdev.c b/cxl/memdev.c
index 1ad871a..f6a2d3f 100644
--- a/cxl/memdev.c
+++ b/cxl/memdev.c
@@ -679,7 +679,7 @@ static int action_update_fw(struct cxl_memdev *memdev,
const char *devname = cxl_memdev_get_devname(memdev);
struct json_object *jmemdev;
unsigned long flags;
-   int rc;
+   int rc = 0;
 
if (param.cancel)
return cxl_memdev_cancel_fw_update(memdev);

---
base-commit: 32cec0c5cfe669940107ce030beeb1e02e5a767b
change-id: 20230731-coverity-fix-edc28fd6e0fe

Best regards,
-- 
Vishal Verma 




Re: [PATCH ndctl] ndctl/cxl/test: Add CXL event test

2023-07-31 Thread Verma, Vishal L
On Thu, 2023-07-27 at 14:21 -0700, Ira Weiny wrote:
> Previously CXL event testing was run by hand.  This reduces testing
> coverage including a lack of regression testing.
> 
> Add a CXL test as part of the meson test infrastructure.  Passing is
> predicated on receiving the appropriate number of errors in each log.
> Individual event values are not checked.
> 
> Signed-off-by: Ira Weiny 
> ---
>  test/cxl-events.sh | 68 
> ++
>  test/meson.build   |  2 ++
>  2 files changed, 70 insertions(+)

Hi Ira,

Thanks for adding this test. Just a few minor comments below, otherwise
looks good.

> 
> diff --git a/test/cxl-events.sh b/test/cxl-events.sh
> new file mode 100644
> index ..f51046ec39ad
> --- /dev/null
> +++ b/test/cxl-events.sh
> @@ -0,0 +1,68 @@
> +#!/bin/bash
> +# SPDX-License-Identifier: GPL-2.0
> +# Copyright (C) 2023 Intel Corporation. All rights reserved.
> +
> +. $(dirname $0)/common
> +
> +set -ex
> +
> +trap 'err $LINENO' ERR
> +
> +check_prereq "jq"
> +
> +modprobe -r cxl_test
> +modprobe cxl_test
> +
> +dev_path="/sys/bus/platform/devices"
> +
> +test_cxl_events()
> +{
> +   memdev="$1"
> +
> +   echo "TEST: triggering $memdev"
> +   echo 1 > $dev_path/$memdev/event_trigger

Quote the "$variables" here. We don't expect spaces in the path in this
case, so it will still work, but it is good practice to always quote
everything.

We might also need a test to see if this file exists first. For kernels
that don't have this support, we probably want to print a message and
skip the test (return '77').

> +}
> +
> +readarray -t memdevs < <("$CXL" list -b cxl_test -Mi | jq -r '.[].host')
> +
> +echo "TEST: Prep event trace"
> +echo "" > /sys/kernel/tracing/trace
> +echo 1 > /sys/kernel/tracing/events/cxl/enable
> +echo 1 > /sys/kernel/tracing/tracing_on
> +
> +# Only need to test 1 device
> +#for memdev in ${memdevs[@]}; do
> +#done

Probably just remove the commented out loop, if we need to test more
than one memdev in the future, it is easy enough to add back then.

> +
> +test_cxl_events "$memdevs"

Shouldn't use "$memdevs" here since it is an array. If you want to pass
in just the first memdev, use "${memdevs[0]}"

> +
> +echo 0 > /sys/kernel/tracing/tracing_on
> +
> +echo "TEST: Events seen"
> +cat /sys/kernel/tracing/trace
> +num_overflow=$(grep "cxl_overflow" /sys/kernel/tracing/trace | wc -l)
> +num_fatal=$(grep "log=Fatal" /sys/kernel/tracing/trace | wc -l)
> +num_failure=$(grep "log=Failure" /sys/kernel/tracing/trace | wc -l)
> +num_info=$(grep "log=Informational" /sys/kernel/tracing/trace | wc -l)

Minor nit, but you can 'grep -c' instead of 'grep | wc -l'

Also could put /sys/kernel/tracing/trace into a variable just for
readability since it is used many times.

> +echo " LOG (Expected) : (Found)"
> +echo " overflow  ( 1) : $num_overflow"
> +echo " Fatal ( 2) : $num_fatal"
> +echo " Failure   (16) : $num_failure"
> +echo " Informational ( 3) : $num_info"
> +
> +if [ "$num_overflow" -ne 1 ]; then
> +   err "$LINENO"
> +fi
> +if [ "$num_fatal" -ne 2 ]; then
> +   err "$LINENO"
> +fi
> +if [ "$num_failure" -ne 16 ]; then
> +   err "$LINENO"
> +fi
> +if [ "$num_info" -ne 3 ]; then
> +   err "$LINENO"
> +fi

Perhaps define variables for each of the expected nums, that way there
is only one spot to change in case the numbers change in the future.

> +
> +check_dmesg "$LINENO"
> +
> +modprobe -r cxl_test
> diff --git a/test/meson.build b/test/meson.build
> index a956885f6df6..a33255bde1a8 100644
> --- a/test/meson.build
> +++ b/test/meson.build
> @@ -155,6 +155,7 @@ cxl_sysfs = find_program('cxl-region-sysfs.sh')
>  cxl_labels = find_program('cxl-labels.sh')
>  cxl_create_region = find_program('cxl-create-region.sh')
>  cxl_xor_region = find_program('cxl-xor-region.sh')
> +cxl_events = find_program('cxl-events.sh')
>  
>  tests = [
>    [ 'libndctl',   libndctl,  'ndctl' ],
> @@ -183,6 +184,7 @@ tests = [
>    [ 'cxl-labels.sh',  cxl_labels,    'cxl'   ],
>    [ 'cxl-create-region.sh',   cxl_create_region,  'cxl'   ],
>    [ 'cxl-xor-region.sh',  cxl_xor_region, 'cxl'   ],
> +  [ 'cxl-events.sh',  cxl_events, 'cxl'   ],
>  ]
>  
>  if get_option('destructive').enabled()
> 
> ---
> base-commit: 2fd570a0ed788b1bd0971dfdb1466a5dbcb79775
> change-id: 20230726-cxl-event-dc00a2f94b60
> 
> Best regards,



Re: [ndctl PATCH RESEND 2/2] libcxl: Fix accessors for temperature field to support negative value

2023-07-31 Thread Verma, Vishal L
On Mon, 2023-07-31 at 12:17 +0900, Jehoon Park wrote:
> On Mon, Jul 24, 2023 at 09:08:21PM +, Verma, Vishal L wrote:
> > On Mon, 2023-07-17 at 15:29 +0900, Jehoon Park wrote:
> > > Add a new macro function to retrieve a signed value such as a temperature.
> > > Replace indistinguishable error numbers with debug message.
> > > 
> > > Signed-off-by: Jehoon Park 
> > > ---
> > >  cxl/lib/libcxl.c | 36 ++--
> > >  1 file changed, 26 insertions(+), 10 deletions(-)
> > > 

<..>

> > > 
> > >  CXL_EXPORT int cxl_cmd_health_info_get_temperature(struct cxl_cmd *cmd)
> > >  {
> > > int rc = health_info_get_temperature_raw(cmd);
> > > +   struct cxl_ctx *ctx = cxl_memdev_get_ctx(cmd->memdev);
> > >  
> > > -   if (rc < 0)
> > > -   return rc;
> > > +   if (rc == 0x)
> > > +   dbg(ctx, "%s: Invalid command status\n",
> > > +   cxl_memdev_get_devname(cmd->memdev));
> > > if (rc == CXL_CMD_HEALTH_INFO_TEMPERATURE_NOT_IMPL)
> > > -   return -EOPNOTSUPP;
> > > +   dbg(ctx, "%s: Device Temperature not implemented\n",
> > > +   cxl_memdev_get_devname(cmd->memdev));
> > 
> > Hi Jehoon,
> > 
> > libcxl tends to just return errno codes for simple accessors liek this,
> > and leave it up to the caller to print additional information about why
> > the call might have failed. Even though these are dbg() messages, I'd
> > prefer leaving them out of this patch, and if there is a call site
> > where this fails and there isn't an adequate error message printed as
> > to why, then add these prints there.
> > 
> > Rest of the conversion to s16 looks good.
> > 
> 
> Hi, Vishal.
> 
> Thank you for comment. I agree with the behavior of libcxl accessors as you
> explained. FYI, the reason I replaced errno codes with dbg messages is that
> those accessors are retreiving signed values. I thought returning errno codes
> is not distinguishable from retrieved values when they are negative.
> However, it looks like an overkill because a memory device works below-zero
> temperature would not make sense in real world.
> 
> I'll send revised patch soon after reverting to errno codes and fixing
> related codes in cxl/json.c.
> 
Good point on the negative temperatures - this means we can't use the
negative = error convention, but in this case what you can do is return
something like INT_MAX to indicate an error, and set errno in the
library to whatever error we want to indicate. And adjust all the
callers to check for errors in this way.


[PATCH -next] libnvdimm: remove kernel-doc warnings:

2023-07-31 Thread Zhu Wang
Remove kernel-doc warnings:

drivers/nvdimm/badrange.c:271: warning: Function parameter or member
'nd_region' not described in 'nvdimm_badblocks_populate'
drivers/nvdimm/badrange.c:271: warning: Function parameter or member
'range' not described in 'nvdimm_badblocks_populate'
drivers/nvdimm/badrange.c:271: warning: Excess function parameter 'region'
description in 'nvdimm_badblocks_populate'
drivers/nvdimm/badrange.c:271: warning: Excess function parameter 'res'
description in 'nvdimm_badblocks_populate'

Signed-off-by: Zhu Wang 
---
 drivers/nvdimm/badrange.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/nvdimm/badrange.c b/drivers/nvdimm/badrange.c
index aaf6e215a8c6..a002ea6fdd84 100644
--- a/drivers/nvdimm/badrange.c
+++ b/drivers/nvdimm/badrange.c
@@ -257,9 +257,9 @@ static void badblocks_populate(struct badrange *badrange,
 
 /**
  * nvdimm_badblocks_populate() - Convert a list of badranges to badblocks
- * @region: parent region of the range to interrogate
+ * @nd_region: parent region of the range to interrogate
  * @bb: badblocks instance to populate
- * @res: resource range to consider
+ * @range: resource range to consider
  *
  * The badrange list generated during bus initialization may contain
  * multiple, possibly overlapping physical address ranges.  Compare each
-- 
2.17.1




Re: [PATCH v12 2/2] mm, pmem, xfs: Introduce MF_MEM_REMOVE for unbind

2023-07-31 Thread Shiyang Ruan




在 2023/7/29 23:15, Darrick J. Wong 写道:

On Thu, Jun 29, 2023 at 04:16:51PM +0800, Shiyang Ruan wrote:

This patch is inspired by Dan's "mm, dax, pmem: Introduce
dev_pagemap_failure()"[1].  With the help of dax_holder and
->notify_failure() mechanism, the pmem driver is able to ask filesystem
on it to unmap all files in use, and notify processes who are using
those files.

Call trace:
trigger unbind
  -> unbind_store()
   -> ... (skip)
-> devres_release_all()
 -> kill_dax()
  -> dax_holder_notify_failure(dax_dev, 0, U64_MAX, MF_MEM_PRE_REMOVE)
   -> xfs_dax_notify_failure()
   `-> freeze_super() // freeze (kernel call)
   `-> do xfs rmap
   ` -> mf_dax_kill_procs()
   `  -> collect_procs_fsdax()// all associated processes
   `  -> unmap_and_kill()
   ` -> invalidate_inode_pages2_range() // drop file's cache
   `-> thaw_super()   // thaw (both kernel & user call)

Introduce MF_MEM_PRE_REMOVE to let filesystem know this is a remove
event.  Use the exclusive freeze/thaw[2] to lock the filesystem to prevent
new dax mapping from being created.  Do not shutdown filesystem directly
if configuration is not supported, or if failure range includes metadata
area.  Make sure all files and processes(not only the current progress)
are handled correctly.  Also drop the cache of associated files before
pmem is removed.

[1]: 
https://lore.kernel.org/linux-mm/161604050314.1463742.14151665140035795571.st...@dwillia2-desk3.amr.corp.intel.com/
[2]: 
https://lore.kernel.org/linux-xfs/168688010689.860947.1788875898367401950.stgit@frogsfrogsfrogs/

Signed-off-by: Shiyang Ruan 
---
  drivers/dax/super.c |  3 +-
  fs/xfs/xfs_notify_failure.c | 86 ++---
  include/linux/mm.h  |  1 +
  mm/memory-failure.c | 17 ++--
  4 files changed, 96 insertions(+), 11 deletions(-)

diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index c4c4728a36e4..2e1a35e82fce 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -323,7 +323,8 @@ void kill_dax(struct dax_device *dax_dev)
return;
  
  	if (dax_dev->holder_data != NULL)

-   dax_holder_notify_failure(dax_dev, 0, U64_MAX, 0);
+   dax_holder_notify_failure(dax_dev, 0, U64_MAX,
+   MF_MEM_PRE_REMOVE);
  
  	clear_bit(DAXDEV_ALIVE, _dev->flags);

synchronize_srcu(_srcu);
diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
index 4a9bbd3fe120..f6ec56b76db6 100644
--- a/fs/xfs/xfs_notify_failure.c
+++ b/fs/xfs/xfs_notify_failure.c
@@ -22,6 +22,7 @@
  
  #include 

  #include 
+#include 
  
  struct xfs_failure_info {

xfs_agblock_t   startblock;
@@ -73,10 +74,16 @@ xfs_dax_failure_fn(
struct xfs_mount*mp = cur->bc_mp;
struct xfs_inode*ip;
struct xfs_failure_info *notify = data;
+   struct address_space*mapping;
+   pgoff_t pgoff;
+   unsigned long   pgcnt;
int error = 0;
  
  	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||

(rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) {
+   /* Continue the query because this isn't a failure. */
+   if (notify->mf_flags & MF_MEM_PRE_REMOVE)
+   return 0;
notify->want_shutdown = true;
return 0;
}
@@ -92,14 +99,55 @@ xfs_dax_failure_fn(
return 0;
}
  
-	error = mf_dax_kill_procs(VFS_I(ip)->i_mapping,

- xfs_failure_pgoff(mp, rec, notify),
- xfs_failure_pgcnt(mp, rec, notify),
- notify->mf_flags);
+   mapping = VFS_I(ip)->i_mapping;
+   pgoff = xfs_failure_pgoff(mp, rec, notify);
+   pgcnt = xfs_failure_pgcnt(mp, rec, notify);
+
+   /* Continue the rmap query if the inode isn't a dax file. */
+   if (dax_mapping(mapping))
+   error = mf_dax_kill_procs(mapping, pgoff, pgcnt,
+ notify->mf_flags);
+
+   /* Invalidate the cache in dax pages. */
+   if (notify->mf_flags & MF_MEM_PRE_REMOVE)
+   invalidate_inode_pages2_range(mapping, pgoff,
+ pgoff + pgcnt - 1);
+
xfs_irele(ip);
return error;
  }
  
+static void

+xfs_dax_notify_failure_freeze(
+   struct xfs_mount*mp)
+{
+   struct super_block  *sb = mp->m_super;


Nit: extra space right^ here.


+
+   /* Wait until no one is holding the FREEZE_HOLDER_KERNEL. */
+   while (freeze_super(sb, FREEZE_HOLDER_KERNEL) != 0) {
+   // Shall we just wait, or print warning then return -EBUSY?


Hm.  PRE_REMOVE gets called before the pmem gets unplugged, right?  So
we'll send a second notification after