[RFC PATCH 6/6] lightnvm: pblk: Integrate RAIL

2018-09-16 Thread Heiner Litz
Integrate Redundant Array of Independent Luns (RAIL) into lightnvm. RAIL
enforces low tail read latency by guaranteeing that reads are never
serialized behind writes or erases to the same LUN. Whenever a LUN is
serving a high-latency operation, reads are served by recomputing the
original data from redundant parity information.
RAIL trades capacity (redundancy) for read latency; the redundancy can,
however, also be leveraged for fault tolerance.
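
As a minimal sketch of the reconstruction idea (not code from this patch;
STRIDE_WIDTH and SECTOR_SIZE are illustrative constants): with RAID-5 style
striping, the sector on a busy LUN equals the XOR of the corresponding
sectors on the other LUNs of its stride, including the parity sector.

    #include <stddef.h>
    #include <stdint.h>

    #define STRIDE_WIDTH 4      /* LUNs per stride, one of them parity */
    #define SECTOR_SIZE  4096

    /*
     * Rebuild the sector held by the busy LUN. peers[] holds the
     * STRIDE_WIDTH - 1 corresponding sectors from the remaining LUNs of
     * the stride (the surviving data sectors plus the parity sector).
     */
    static void rail_reconstruct(uint8_t *dst,
                                 const uint8_t *peers[STRIDE_WIDTH - 1])
    {
            size_t i, p;

            for (i = 0; i < SECTOR_SIZE; i++) {
                    uint8_t v = 0;

                    for (p = 0; p < STRIDE_WIDTH - 1; p++)
                            v ^= peers[p][i];
                    dst[i] = v;     /* XOR of peers == missing sector */
            }
    }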

On FIO, with the kyber scheduler set to a target read latency of 500us,
RAIL reduces tail latency percentiles (us) as follows:

             Avg     90%     99%   99.9%  99.95%  99.99%
   pblk       90    1000    2200    3000    6000
   RAIL       85     100     250     400     500

Signed-off-by: Heiner Litz 
---
 drivers/lightnvm/Kconfig  | 10 ++
 drivers/lightnvm/Makefile |  1 +
 drivers/lightnvm/pblk-core.c  | 36 ++-
 drivers/lightnvm/pblk-init.c  | 17 +
 drivers/lightnvm/pblk-rail.c  |  1 +
 drivers/lightnvm/pblk-rb.c|  6 ++
 drivers/lightnvm/pblk-read.c  |  9 +
 drivers/lightnvm/pblk-write.c |  9 +
 drivers/lightnvm/pblk.h   |  5 +
 9 files changed, 93 insertions(+), 1 deletion(-)

diff --git a/drivers/lightnvm/Kconfig b/drivers/lightnvm/Kconfig
index a872cd720967..165d5a29acc3 100644
--- a/drivers/lightnvm/Kconfig
+++ b/drivers/lightnvm/Kconfig
@@ -35,6 +35,16 @@ config NVM_PBLK_DEBUG
  vocal error messages, and extra tracking fields in the pblk sysfs
  entries.
 
+config NVM_PBLK_RAIL
+   bool "Pblk RAIL Support"
+   default n
+   help
+ Enables RAIL for pblk. RAIL enforces tail read latency guarantees by
+ eliminating reads being serialized behind writes to the same LUN.
+ RAIL partitions LUNs into strides and enforces that only one LUN per
+ stride is written at a time. Reads can bypass busy LUNs by recomputing
+ the requested data using parity redundancy.
+
 endif # NVM_PBLK_DEBUG
 
 endif # NVM
diff --git a/drivers/lightnvm/Makefile b/drivers/lightnvm/Makefile
index 97d9d7c71550..92f4376428cc 100644
--- a/drivers/lightnvm/Makefile
+++ b/drivers/lightnvm/Makefile
@@ -5,6 +5,7 @@
 
 obj-$(CONFIG_NVM)  := core.o
 obj-$(CONFIG_NVM_PBLK) += pblk.o
+obj-$(CONFIG_NVM_PBLK_RAIL)+= pblk-rail.o
 pblk-y := pblk-init.o pblk-core.o pblk-rb.o \
   pblk-write.o pblk-cache.o pblk-read.o \
   pblk-gc.o pblk-recovery.o pblk-map.o \
diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
index a31bf359f905..ca74d7763fa9 100644
--- a/drivers/lightnvm/pblk-core.c
+++ b/drivers/lightnvm/pblk-core.c
@@ -113,6 +113,12 @@ static void pblk_end_io_erase(struct nvm_rq *rqd)
 {
struct pblk *pblk = rqd->private;
 
+#ifdef CONFIG_NVM_PBLK_RAIL
+   struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd);
+
+   pblk_up_chunk(pblk, ppa_list[0]);
+#endif
+
__pblk_end_io_erase(pblk, rqd);
mempool_free(rqd, &pblk->e_rq_pool);
 }
@@ -940,7 +946,11 @@ static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa)
/* The write thread schedules erases so that it minimizes disturbances
 * with writes. Thus, there is no need to take the LUN semaphore.
 */
+#ifdef CONFIG_NVM_PBLK_RAIL
+   ret = pblk_submit_io_sync_sem(pblk, &rqd);
+#else
ret = pblk_submit_io_sync(pblk, &rqd);
+#endif
rqd.private = pblk;
__pblk_end_io_erase(pblk, &rqd);
 
@@ -1754,7 +1764,11 @@ int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa)
/* The write thread schedules erases so that it minimizes disturbances
 * with writes. Thus, there is no need to take the LUN semaphore.
 */
+#ifdef CONFIG_NVM_PBLK_RAIL
+   err = pblk_submit_io_sem(pblk, rqd);
+#else
err = pblk_submit_io(pblk, rqd);
+#endif
if (err) {
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
@@ -1909,6 +1923,10 @@ void pblk_line_close_ws(struct work_struct *work)
if (w_err_gc->has_write_err)
pblk_save_lba_list(pblk, line);
 
+#ifdef CONFIG_NVM_PBLK_RAIL
+   pblk_rail_line_close(pblk, line);
+#endif
+
pblk_line_close(pblk, line);
mempool_free(line_ws, &pblk->gen_ws_pool);
 }
@@ -1938,8 +1956,12 @@ static void __pblk_down_chunk(struct pblk *pblk, int pos)
 * Only send one inflight I/O per LUN. Since we map at a page
 * granularity, all ppas in the I/O will map to the same LUN
 */
-
+#ifdef CONFIG_NVM_PBLK_RAIL
+   (void)rlun;
+   ret = pblk_rail_down_stride(pblk, pos, msecs_to_jiffies(3));
+#else
ret = down_timeout(&rlun->wr_sem, msecs_to_jiffies(3));
+#endif
if (ret == -ETIME || ret == -EINTR)
pblk_err(pblk, "taking lun semaphore timed out: err %d\n",
-ret);
@@ -1978,7 +2000,13 @@ void 

[RFC PATCH 4/6] lightnvm: pblk: Add pblk_submit_io_sem

2018-09-16 Thread Heiner Litz
In preparation for supporting RAIL, add a new API, pblk_submit_io_sem, which
takes the LUN semaphore before submitting the asynchronous request.

Signed-off-by: Heiner Litz 
---
 drivers/lightnvm/pblk-core.c | 11 +++
 drivers/lightnvm/pblk.h  |  1 +
 2 files changed, 12 insertions(+)

diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
index 2e40666fdf80..a31bf359f905 100644
--- a/drivers/lightnvm/pblk-core.c
+++ b/drivers/lightnvm/pblk-core.c
@@ -492,6 +492,17 @@ int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd)
return nvm_submit_io(dev, rqd);
 }
 
+int pblk_submit_io_sem(struct pblk *pblk, struct nvm_rq *rqd)
+{
+   struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd);
+   int ret;
+
+   pblk_down_chunk(pblk, ppa_list[0]);
+   ret = pblk_submit_io(pblk, rqd);
+
+   return ret;
+}
+
 void pblk_check_chunk_state_update(struct pblk *pblk, struct nvm_rq *rqd)
 {
struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd);
diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
index 64d9c206ec52..bd88784e51d9 100644
--- a/drivers/lightnvm/pblk.h
+++ b/drivers/lightnvm/pblk.h
@@ -797,6 +797,7 @@ struct nvm_chk_meta *pblk_chunk_get_off(struct pblk *pblk,
 void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd);
 void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd);
 int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd);
+int pblk_submit_io_sem(struct pblk *pblk, struct nvm_rq *rqd);
 int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd);
 int pblk_submit_io_sync_sem(struct pblk *pblk, struct nvm_rq *rqd);
 int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line);
-- 
2.17.1



[RFC PATCH 5/6] lightnvm: pblk: Add RAIL interface

2018-09-16 Thread Heiner Litz
In preparation for supporting RAIL, add the RAIL API.

Signed-off-by: Heiner Litz 
---
 drivers/lightnvm/pblk-rail.c | 808 +++
 drivers/lightnvm/pblk.h  |  63 ++-
 2 files changed, 870 insertions(+), 1 deletion(-)
 create mode 100644 drivers/lightnvm/pblk-rail.c

diff --git a/drivers/lightnvm/pblk-rail.c b/drivers/lightnvm/pblk-rail.c
new file mode 100644
index ..a48ed31a0ba9
--- /dev/null
+++ b/drivers/lightnvm/pblk-rail.c
@@ -0,0 +1,808 @@
+/*
+ * Copyright (C) 2018 Heiner Litz
+ * Initial release: Heiner Litz 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * pblk-rail.c - pblk's RAIL path
+ */
+
+#include "pblk.h"
+
+#define PBLK_RAIL_EMPTY ~0x0
+#define PBLK_RAIL_PARITY_WRITE 0x8000
+
+/* RAIL auxiliary functions */
+static unsigned int pblk_rail_nr_parity_luns(struct pblk *pblk)
+{
+   struct pblk_line_meta *lm = &pblk->lm;
+
+   return lm->blk_per_line / PBLK_RAIL_STRIDE_WIDTH;
+}
+
+static unsigned int pblk_rail_nr_data_luns(struct pblk *pblk)
+{
+   struct pblk_line_meta *lm = &pblk->lm;
+
+   return lm->blk_per_line - pblk_rail_nr_parity_luns(pblk);
+}
+
+static unsigned int pblk_rail_sec_per_stripe(struct pblk *pblk)
+{
+   struct pblk_line_meta *lm = &pblk->lm;
+
+   return lm->blk_per_line * pblk->min_write_pgs;
+}
+
+static unsigned int pblk_rail_psec_per_stripe(struct pblk *pblk)
+{
+   return pblk_rail_nr_parity_luns(pblk) * pblk->min_write_pgs;
+}
+
+static unsigned int pblk_rail_dsec_per_stripe(struct pblk *pblk)
+{
+   return pblk_rail_sec_per_stripe(pblk) - pblk_rail_psec_per_stripe(pblk);
+}
+
+static unsigned int pblk_rail_wrap_lun(struct pblk *pblk, unsigned int lun)
+{
+   struct pblk_line_meta *lm = &pblk->lm;
+
+   return (lun & (lm->blk_per_line - 1));
+}
+
+bool pblk_rail_meta_distance(struct pblk_line *data_line)
+{
+   return (data_line->meta_distance % PBLK_RAIL_STRIDE_WIDTH) == 0;
+}
+
+/* Notify readers that LUN is serving high latency operation */
+static void pblk_rail_notify_reader_down(struct pblk *pblk, int lun)
+{
+   WARN_ON(test_and_set_bit(lun, pblk->rail.busy_bitmap));
+   /* Make sure that busy bit is seen by reader before proceeding */
+   smp_mb__after_atomic();
+}
+
+static void pblk_rail_notify_reader_up(struct pblk *pblk, int lun)
+{
+   /* Make sure that write is completed before releasing busy bit */
+   smp_mb__before_atomic();
+   WARN_ON(!test_and_clear_bit(lun, pblk->rail.busy_bitmap));
+}
+
+int pblk_rail_lun_busy(struct pblk *pblk, struct ppa_addr ppa)
+{
+   struct nvm_tgt_dev *dev = pblk->dev;
+   struct nvm_geo *geo = &dev->geo;
+   int lun_pos = pblk_ppa_to_pos(geo, ppa);
+
+   return test_bit(lun_pos, pblk->rail.busy_bitmap);
+}
+
+/* Enforces one writer per stride */
+int pblk_rail_down_stride(struct pblk *pblk, int lun_pos, int timeout)
+{
+   struct pblk_lun *rlun;
+   int strides = pblk_rail_nr_parity_luns(pblk);
+   int stride = lun_pos % strides;
+   int ret;
+
+   rlun = &pblk->luns[stride];
+   ret = down_timeout(&rlun->wr_sem, timeout);
+   pblk_rail_notify_reader_down(pblk, lun_pos);
+
+   return ret;
+}
+
+void pblk_rail_up_stride(struct pblk *pblk, int lun_pos)
+{
+   struct pblk_lun *rlun;
+   int strides = pblk_rail_nr_parity_luns(pblk);
+   int stride = lun_pos % strides;
+
+   pblk_rail_notify_reader_up(pblk, lun_pos);
+   rlun = &pblk->luns[stride];
+   up(&rlun->wr_sem);
+}
+
+/* Determine whether a sector holds data, meta or is bad */
+bool pblk_rail_valid_sector(struct pblk *pblk, struct pblk_line *line, int pos)
+{
+   struct pblk_line_meta *lm = &pblk->lm;
+   struct nvm_tgt_dev *dev = pblk->dev;
+   struct nvm_geo *geo = &dev->geo;
+   struct ppa_addr ppa;
+   int lun;
+
+   if (pos >= line->smeta_ssec && pos < (line->smeta_ssec + lm->smeta_sec))
+   return false;
+
+   if (pos >= line->emeta_ssec &&
+   pos < (line->emeta_ssec + lm->emeta_sec[0]))
+   return false;
+
+   ppa = addr_to_gen_ppa(pblk, pos, line->id);
+   lun = pblk_ppa_to_pos(geo, ppa);
+
+   return !test_bit(lun, line->blk_bitmap);
+}
+
+/* Delay rb overwrite until whole stride has been written */
+int pblk_rail_rb_delay(struct pblk_rb *rb)
+{
+   struct pblk *pblk = container_of(rb, struct pblk, rwb);
+
+   return pblk_rail_sec_per_stripe(pblk);
+}
+
+static unsigned int pblk_rail_sec_to_stride(struct pblk *pblk, unsigned int sec)
+{
+   unsigned int sec_in_stripe = sec % pblk_rail_sec_per_stripe(pblk);
+   int page = sec_in_stripe / 

[RFC PATCH 1/6] lightnvm: pblk: refactor read and write APIs

2018-09-16 Thread Heiner Litz
In preparation for supporting RAIL, expose read and write APIs so their
functionality can be leveraged by RAIL.

Signed-off-by: Heiner Litz 
---
 drivers/lightnvm/pblk-read.c  | 8 +++-
 drivers/lightnvm/pblk-write.c | 4 ++--
 drivers/lightnvm/pblk.h   | 7 +++
 3 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c
index 6d13763f2f6a..67d44caefff4 100644
--- a/drivers/lightnvm/pblk-read.c
+++ b/drivers/lightnvm/pblk-read.c
@@ -170,8 +170,7 @@ static void pblk_end_user_read(struct bio *bio)
bio_endio(bio);
 }
 
-static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd,
-  bool put_line)
+void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd, bool put_line)
 {
struct nvm_tgt_dev *dev = pblk->dev;
struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd);
@@ -285,10 +284,9 @@ static void pblk_end_partial_read(struct nvm_rq *rqd)
__pblk_end_io_read(pblk, rqd, false);
 }
 
-static int pblk_setup_partial_read(struct pblk *pblk, struct nvm_rq *rqd,
+int pblk_setup_partial_read(struct pblk *pblk, struct nvm_rq *rqd,
unsigned int bio_init_idx,
-   unsigned long *read_bitmap,
-   int nr_holes)
+   unsigned long *read_bitmap, int nr_holes)
 {
struct pblk_sec_meta *meta_list = rqd->meta_list;
struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd);
diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c
index 9554febee480..1ce03d7c873b 100644
--- a/drivers/lightnvm/pblk-write.c
+++ b/drivers/lightnvm/pblk-write.c
@@ -217,7 +217,7 @@ static void pblk_submit_rec(struct work_struct *work)
 }
 
 
-static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd)
+void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd)
 {
struct pblk_rec_ctx *recovery;
 
@@ -500,7 +500,7 @@ static struct pblk_line *pblk_should_submit_meta_io(struct pblk *pblk,
return meta_line;
 }
 
-static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd)
+int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd)
 {
struct ppa_addr erase_ppa;
struct pblk_line *meta_line;
diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
index 3596043332f2..eab50df70ae6 100644
--- a/drivers/lightnvm/pblk.h
+++ b/drivers/lightnvm/pblk.h
@@ -861,6 +861,8 @@ void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas,
 int pblk_write_to_cache(struct pblk *pblk, struct bio *bio,
unsigned long flags);
 int pblk_write_gc_to_cache(struct pblk *pblk, struct pblk_gc_rq *gc_rq);
+void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd);
+int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd);
 
 /*
  * pblk map
@@ -886,6 +888,11 @@ void pblk_write_kick(struct pblk *pblk);
 extern struct bio_set pblk_bio_set;
 int pblk_submit_read(struct pblk *pblk, struct bio *bio);
 int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq);
+void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd, bool put_line);
+int pblk_setup_partial_read(struct pblk *pblk, struct nvm_rq *rqd,
+   unsigned int bio_init_idx,
+   unsigned long *read_bitmap, int nr_holes);
+
 /*
  * pblk recovery
  */
-- 
2.17.1



[RFC PATCH 0/6] lightnvm: pblk: Introduce RAIL to enforce low tail read latency

2018-09-16 Thread Heiner Litz
Hi All,
this patchset introduces RAIL, a mechanism to enforce low tail read latency for
lightnvm OCSSD devices. RAIL leverages redundancy to guarantee that reads are
always served from LUNs that are not serving a high-latency operation such as a
write or erase. This prevents reads from becoming serialized behind these
operations, reducing tail latency by ~10x. In particular, in the absence of ECC
read errors, it provides 99.99th percentile read latencies below 500us. RAIL
introduces a capacity overhead (7%-25%) due to RAID-5-like striping (which also
provides fault tolerance) and reduces the maximum write bandwidth to 110K IOPS
on a CNEX SSD.
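
The capacity overhead follows from the stride layout introduced in patch 5/6,
where one LUN per stride holds parity:

    overhead = parity LUNs / total LUNs = 1 / PBLK_RAIL_STRIDE_WIDTH

For example, a stride width of 4 dedicates one of every four LUNs to parity
(25% overhead); wider strides shrink the overhead proportionally.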

This patchset is based on pblk/core and requires two additional patches from Javier
to be applicable (let me know if you want me to rebase):

The 1st patch exposes some existing APIs so they can be used by RAIL
The 2nd patch introduces a configurable sector mapping function
The 3rd patch refactors the write path so the end_io_fn can be specified when
setting up the request
The 4th patch adds a new submit io function that acquires the write semaphore
The 5th patch introduces the RAIL feature and its API
The 6th patch integrates RAIL into pblk's read and write path




[RFC PATCH 2/6] lightnvm: pblk: Add configurable mapping function

2018-09-16 Thread Heiner Litz
In preparation for supporting RAIL, introduce a new function pointer so that
different mapping functions can be used to determine sector placement.
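
As a sketch of how the hook can be used (pblk_rail_map_page_data is a
hypothetical name, not part of this patch), a RAIL-enabled build could install
its own mapper at init time and keep the default mapping otherwise:

    /* Minimal sketch: selecting the sector-mapping function during init. */
    static void pblk_select_map_page(struct pblk *pblk)
    {
    #ifdef CONFIG_NVM_PBLK_RAIL
            /* hypothetical RAIL mapper that also reserves parity sectors */
            pblk->map_page = pblk_rail_map_page_data;
    #else
            pblk->map_page = pblk_map_page_data;
    #endif
    }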

Signed-off-by: Heiner Litz 
---
 drivers/lightnvm/pblk-init.c |  2 ++
 drivers/lightnvm/pblk-map.c  | 18 +-
 drivers/lightnvm/pblk.h  | 13 +
 3 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
index fb66bc84d5ca..2b9c6ebd9fac 100644
--- a/drivers/lightnvm/pblk-init.c
+++ b/drivers/lightnvm/pblk-init.c
@@ -411,6 +411,8 @@ static int pblk_core_init(struct pblk *pblk)
pblk->pad_rst_wa = 0;
pblk->gc_rst_wa = 0;
 
+   pblk->map_page = pblk_map_page_data;
+
atomic64_set(&pblk->nr_flush, 0);
pblk->nr_flush_rst = 0;
 
diff --git a/drivers/lightnvm/pblk-map.c b/drivers/lightnvm/pblk-map.c
index ff677ca6e4e1..9490601de3a5 100644
--- a/drivers/lightnvm/pblk-map.c
+++ b/drivers/lightnvm/pblk-map.c
@@ -18,11 +18,11 @@
 
 #include "pblk.h"
 
-static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
- struct ppa_addr *ppa_list,
- unsigned long *lun_bitmap,
- struct pblk_sec_meta *meta_list,
- unsigned int valid_secs)
+int pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
+  struct ppa_addr *ppa_list,
+  unsigned long *lun_bitmap,
+  struct pblk_sec_meta *meta_list,
+  unsigned int valid_secs)
 {
struct pblk_line *line = pblk_line_get_data(pblk);
struct pblk_emeta *emeta;
@@ -95,8 +95,8 @@ void pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry,
 
for (i = off; i < rqd->nr_ppas; i += min) {
map_secs = (i + min > valid_secs) ? (valid_secs % min) : min;
-   if (pblk_map_page_data(pblk, sentry + i, &ppa_list[i],
-   lun_bitmap, &meta_list[i], map_secs)) {
+   if (pblk->map_page(pblk, sentry + i, &ppa_list[i], lun_bitmap,
+  &meta_list[i], map_secs)) {
bio_put(rqd->bio);
pblk_free_rqd(pblk, rqd, PBLK_WRITE);
pblk_pipeline_stop(pblk);
@@ -121,8 +121,8 @@ void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
 
for (i = 0; i < rqd->nr_ppas; i += min) {
map_secs = (i + min > valid_secs) ? (valid_secs % min) : min;
-   if (pblk_map_page_data(pblk, sentry + i, &ppa_list[i],
-   lun_bitmap, &meta_list[i], map_secs)) {
+   if (pblk->map_page(pblk, sentry + i, &ppa_list[i], lun_bitmap,
+  &meta_list[i], map_secs)) {
bio_put(rqd->bio);
pblk_free_rqd(pblk, rqd, PBLK_WRITE);
pblk_pipeline_stop(pblk);
diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
index eab50df70ae6..87dc24772dad 100644
--- a/drivers/lightnvm/pblk.h
+++ b/drivers/lightnvm/pblk.h
@@ -604,6 +604,12 @@ struct pblk_addrf {
int sec_ws_stripe;
 };
 
+typedef int (pblk_map_page_fn)(struct pblk *pblk, unsigned int sentry,
+  struct ppa_addr *ppa_list,
+  unsigned long *lun_bitmap,
+  struct pblk_sec_meta *meta_list,
+  unsigned int valid_secs);
+
 struct pblk {
struct nvm_tgt_dev *dev;
struct gendisk *disk;
@@ -709,6 +715,8 @@ struct pblk {
struct timer_list wtimer;
 
struct pblk_gc gc;
+
+   pblk_map_page_fn *map_page;
 };
 
 struct pblk_line_ws {
@@ -873,6 +881,11 @@ void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
 void pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry,
 unsigned long *lun_bitmap, unsigned int valid_secs,
 unsigned int off);
+int pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
+  struct ppa_addr *ppa_list,
+  unsigned long *lun_bitmap,
+  struct pblk_sec_meta *meta_list,
+  unsigned int valid_secs);
 
 /*
  * pblk write thread
-- 
2.17.1



[RFC PATCH 3/6] lightnvm: pblk: Refactor end_io function in pblk_submit_io_set

2018-09-16 Thread Heiner Litz
In preparation for supporting RAIL, refactor pblk_submit_io_set in the write
path so that the end_io function can be specified when setting up the
request.
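
For illustration (pblk_rail_end_io_parity is a hypothetical name, not part of
this patch), once the completion handler is a parameter, the same submission
path can serve both user writes and, for example, RAIL parity writes:

    /* Minimal sketch: two callers passing different completion handlers. */
    static int submit_user_write(struct pblk *pblk, struct nvm_rq *rqd)
    {
            return pblk_submit_io_set(pblk, rqd, pblk_end_io_write);
    }

    static int submit_rail_parity_write(struct pblk *pblk, struct nvm_rq *rqd)
    {
            return pblk_submit_io_set(pblk, rqd, pblk_rail_end_io_parity);
    }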

Signed-off-by: Heiner Litz 
---
 drivers/lightnvm/pblk-write.c | 11 ++-
 drivers/lightnvm/pblk.h   |  3 ++-
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c
index 1ce03d7c873b..6eba38b83acd 100644
--- a/drivers/lightnvm/pblk-write.c
+++ b/drivers/lightnvm/pblk-write.c
@@ -309,7 +309,7 @@ static int pblk_alloc_w_rq(struct pblk *pblk, struct nvm_rq *rqd,
 }
 
 static int pblk_setup_w_rq(struct pblk *pblk, struct nvm_rq *rqd,
-  struct ppa_addr *erase_ppa)
+  struct ppa_addr *erase_ppa, nvm_end_io_fn(*end_io))
 {
struct pblk_line_meta *lm = &pblk->lm;
struct pblk_line *e_line = pblk_line_get_erase(pblk);
@@ -325,7 +325,7 @@ static int pblk_setup_w_rq(struct pblk *pblk, struct nvm_rq *rqd,
return -ENOMEM;
c_ctx->lun_bitmap = lun_bitmap;
 
-   ret = pblk_alloc_w_rq(pblk, rqd, nr_secs, pblk_end_io_write);
+   ret = pblk_alloc_w_rq(pblk, rqd, nr_secs, end_io);
if (ret) {
kfree(lun_bitmap);
return ret;
@@ -500,7 +500,8 @@ static struct pblk_line *pblk_should_submit_meta_io(struct pblk *pblk,
return meta_line;
 }
 
-int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd)
+int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd,
+  nvm_end_io_fn(*end_io))
 {
struct ppa_addr erase_ppa;
struct pblk_line *meta_line;
@@ -509,7 +510,7 @@ int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd)
pblk_ppa_set_empty(&erase_ppa);
 
/* Assign lbas to ppas and populate request structure */
-   err = pblk_setup_w_rq(pblk, rqd, &erase_ppa);
+   err = pblk_setup_w_rq(pblk, rqd, &erase_ppa, end_io);
if (err) {
pblk_err(pblk, "could not setup write request: %d\n", err);
return NVM_IO_ERR;
@@ -631,7 +632,7 @@ static int pblk_submit_write(struct pblk *pblk)
goto fail_put_bio;
}
 
-   if (pblk_submit_io_set(pblk, rqd))
+   if (pblk_submit_io_set(pblk, rqd, pblk_end_io_write))
goto fail_free_bio;
 
 #ifdef CONFIG_NVM_PBLK_DEBUG
diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
index 87dc24772dad..64d9c206ec52 100644
--- a/drivers/lightnvm/pblk.h
+++ b/drivers/lightnvm/pblk.h
@@ -870,7 +870,8 @@ int pblk_write_to_cache(struct pblk *pblk, struct bio *bio,
unsigned long flags);
 int pblk_write_gc_to_cache(struct pblk *pblk, struct pblk_gc_rq *gc_rq);
 void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd);
-int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd);
+int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd,
+  nvm_end_io_fn(*end_io));
 
 /*
  * pblk map
-- 
2.17.1



Re: [PATCH V3 00/17] SCSI: introduce per-host admin queue & enable runtime PM

2018-09-16 Thread jianchao.wang
Hi Ming

Thanks for your kind response.

On 09/16/2018 09:09 PM, Ming Lei wrote:
> On Fri, Sep 14, 2018 at 03:27:44PM +0800, jianchao.wang wrote:
>> Hi Ming
>>
>> On 09/13/2018 08:15 PM, Ming Lei wrote:
>>> This patchset introduces per-host admin request queue for submitting
>>> admin request only, and uses this approach to implement both SCSI
>>> quiesce and runtime PM in one very simply way. Also runtime PM deadlock
>>> can be avoided in case that request pool is used up, such as when too
>>> many IO requests are allocated before resuming device
>>
>> To be honest, after look in to the SCSI part of this patchset, I really
>> suspect whether it is worth to introduce this per scsi-host adminq.
>> Too much complexity is introduced into SCSI. Compared with this, the current
> 
> Can you comment on the individual patches about which part is complicated?
> I will explain them to you one by one.

I have read through the scsi part of your patch many times. :)

After this patchset, we can control the IO request_queues separately, which is
more convenient.

But what we pay for it is complexity _spread_ over the scsi-layer code.
In particular, we have to handle that a request to a scsi device may come from
its own IO request_queue or from the per-host adminq, on both the submit and
the complete side.

We have handled those cases in the patchset, but it is tedious, and it is also
risky in the currently stable scsi mid-layer code.

I really think we should keep the complexity confined to a very small area.

> 
>> preempt-only feature look much simpler.
>>
>> If you just don't like the preempt-only checking in the hot path, we may 
>> introduce
>> a new interface similar with blk_queue_enter for the non hot path.
>>
>> With your patch percpu-refcount: relax limit on percpu_ref_reinit()
> 
> Seems this way may be one improvement on Bart's V6.
> 
>>
>> (totally no test)
>> diff --git a/block/blk-core.c b/block/blk-core.c
>> index 4dbc93f..dd7f007 100644
>> --- a/block/blk-core.c
>> +++ b/block/blk-core.c
>> @@ -427,14 +427,24 @@ EXPORT_SYMBOL(blk_sync_queue);
>>   * Returns the previous value of the PREEMPT_ONLY flag - 0 if the flag was 
>> not
>>   * set and 1 if the flag was already set.
>>   */
>> -int blk_set_preempt_only(struct request_queue *q)
>> +int blk_set_preempt_only(struct request_queue *q, bool drain)
>>  {
>> -return blk_queue_flag_test_and_set(QUEUE_FLAG_PREEMPT_ONLY, q);
>> +if (blk_queue_flag_test_and_set(QUEUE_FLAG_PREEMPT_ONLY, q))
>> +return 1;
>> +percpu_ref_kill(&q->q_usage_counter);
>> +synchronize_sched();
>> +
>> +if (drain)
>> +wait_event(q->mq_freeze_wq,
>> +percpu_ref_is_zero(&q->q_usage_counter));
>> +
>> +return 0;
>>  }
>>  EXPORT_SYMBOL_GPL(blk_set_preempt_only);
> 
> It is easy to cause a race between the normal freeze interface and the
> one above. That may be a new complexity of the preempt-only approach,
> because there can be more than one path to kill/reinit .q_usage_counter.

Yes, indeed.

> 
>>  
>>  void blk_clear_preempt_only(struct request_queue *q)
>>  {
>> +percpu_ref_reinit(&q->q_usage_counter);
>>  blk_queue_flag_clear(QUEUE_FLAG_PREEMPT_ONLY, q);
>>  wake_up_all(&q->mq_freeze_wq);
>>  }
>> @@ -913,31 +923,13 @@ EXPORT_SYMBOL(blk_alloc_queue);
>>  /**
>>   * blk_queue_enter() - try to increase q->q_usage_counter
>>   * @q: request queue pointer
>> - * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PREEMPT
>> + * @flags: BLK_MQ_REQ_NOWAIT
>>   */
>>  int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
>>  {
>> -const bool preempt = flags & BLK_MQ_REQ_PREEMPT;
>> -
>>  while (true) {
>> -bool success = false;
>> -
>> -rcu_read_lock();
>> -if (percpu_ref_tryget_live(&q->q_usage_counter)) {
>> -/*
>> - * The code that sets the PREEMPT_ONLY flag is
>> - * responsible for ensuring that that flag is globally
>> - * visible before the queue is unfrozen.
>> - */
>> -if (preempt || !blk_queue_preempt_only(q)) {
>> -success = true;
>> -} else {
>> -percpu_ref_put(&q->q_usage_counter);
>> -}
>> -}
>> -rcu_read_unlock();
>>  
>> -if (success)
>> +if (percpu_ref_tryget_live(&q->q_usage_counter))
>>  return 0;
>>  
>>  if (flags & BLK_MQ_REQ_NOWAIT)
>> @@ -954,7 +946,44 @@ int blk_queue_enter(struct request_queue *q, 
>> blk_mq_req_flags_t flags)
>>  
>>  wait_event(q->mq_freeze_wq,
>> (atomic_read(&q->mq_freeze_depth) == 0 &&
>> -(preempt || !blk_queue_preempt_only(q))) ||
>> +   !blk_queue_preempt_only(q)) ||
>> +   blk_queue_dying(q));
>> +if (blk_queue_dying(q))
>> +return -ENODEV;
>> +}
>> +}
> 
> The big question is how you will implement runtime suspend with this
> approach? New IO has to terminate the runtime suspend.

Some checking 

Re: [PATCH V3 00/17] SCSI: introduce per-host admin queue & enable runtime PM

2018-09-16 Thread Ming Lei
On Fri, Sep 14, 2018 at 03:27:44PM +0800, jianchao.wang wrote:
> Hi Ming
> 
> On 09/13/2018 08:15 PM, Ming Lei wrote:
> > This patchset introduces per-host admin request queue for submitting
> > admin request only, and uses this approach to implement both SCSI
> > quiesce and runtime PM in one very simply way. Also runtime PM deadlock
> > can be avoided in case that request pool is used up, such as when too
> > many IO requests are allocated before resuming device
> 
> To be honest, after look in to the SCSI part of this patchset, I really
> suspect whether it is worth to introduce this per scsi-host adminq.
> Too much complexity is introduced into SCSI. Compared with this, the current

Can you comment on the individual patches about which part is complicated?
I will explain them to you one by one.

> preempt-only feature look much simpler.
> 
> If you just don't like the preempt-only checking in the hot path, we may 
> introduce
> a new interface similar with blk_queue_enter for the non hot path.
> 
> With your patch percpu-refcount: relax limit on percpu_ref_reinit()

Seems this way may be one improvement on Bart's V6.

> 
> (totally no test)
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 4dbc93f..dd7f007 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -427,14 +427,24 @@ EXPORT_SYMBOL(blk_sync_queue);
>   * Returns the previous value of the PREEMPT_ONLY flag - 0 if the flag was 
> not
>   * set and 1 if the flag was already set.
>   */
> -int blk_set_preempt_only(struct request_queue *q)
> +int blk_set_preempt_only(struct request_queue *q, bool drain)
>  {
> -return blk_queue_flag_test_and_set(QUEUE_FLAG_PREEMPT_ONLY, q);
> +if (blk_queue_flag_test_and_set(QUEUE_FLAG_PREEMPT_ONLY, q))
> +return 1;
> +percpu_ref_kill(&q->q_usage_counter);
> +synchronize_sched();
> +
> +if (drain)
> +wait_event(q->mq_freeze_wq,
> +percpu_ref_is_zero(&q->q_usage_counter));
> +
> +return 0;
>  }
>  EXPORT_SYMBOL_GPL(blk_set_preempt_only);

It is easy to cause a race between the normal freeze interface and the
one above. That may be a new complexity of the preempt-only approach,
because there can be more than one path to kill/reinit .q_usage_counter.

>  
>  void blk_clear_preempt_only(struct request_queue *q)
>  {
> +percpu_ref_reinit(&q->q_usage_counter);
>  blk_queue_flag_clear(QUEUE_FLAG_PREEMPT_ONLY, q);
>  wake_up_all(&q->mq_freeze_wq);
>  }
> @@ -913,31 +923,13 @@ EXPORT_SYMBOL(blk_alloc_queue);
>  /**
>   * blk_queue_enter() - try to increase q->q_usage_counter
>   * @q: request queue pointer
> - * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PREEMPT
> + * @flags: BLK_MQ_REQ_NOWAIT
>   */
>  int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
>  {
> -const bool preempt = flags & BLK_MQ_REQ_PREEMPT;
> -
>  while (true) {
> -bool success = false;
> -
> -rcu_read_lock();
> -if (percpu_ref_tryget_live(&q->q_usage_counter)) {
> -/*
> - * The code that sets the PREEMPT_ONLY flag is
> - * responsible for ensuring that that flag is globally
> - * visible before the queue is unfrozen.
> - */
> -if (preempt || !blk_queue_preempt_only(q)) {
> -success = true;
> -} else {
> -percpu_ref_put(&q->q_usage_counter);
> -}
> -}
> -rcu_read_unlock();
>  
> -if (success)
> +if (percpu_ref_tryget_live(&q->q_usage_counter))
>  return 0;
>  
>  if (flags & BLK_MQ_REQ_NOWAIT)
> @@ -954,7 +946,44 @@ int blk_queue_enter(struct request_queue *q, 
> blk_mq_req_flags_t flags)
>  
>  wait_event(q->mq_freeze_wq,
> (atomic_read(&q->mq_freeze_depth) == 0 &&
> -(preempt || !blk_queue_preempt_only(q))) ||
> +   !blk_queue_preempt_only(q)) ||
> +   blk_queue_dying(q));
> +if (blk_queue_dying(q))
> +return -ENODEV;
> +}
> +}

The big question is how you will implement runtime suspend with this
approach? New IO has to terminate the runtime suspend.

> +
> +/*
> + * When set PREEMPT_ONLY mode, q->q_usage_counter is killed.
> + * If only PREEMPT_ONLY mode, go on.
> + * If queue frozen, wait.
> + */
> +int blk_queue_preempt_enter(struct request_queue *q,
> +blk_mq_req_flags_t flags)
> +{
> +while (true) {
> +
> +if (percpu_ref_tryget_live(&q->q_usage_counter))
> +return 0;
> +
> +smp_rmb();
> +
> +/*
> + * If queue is only in PREEMPT_ONLY mode, get the ref
> + * directly. The one who set PREEMPT_ONLY mode is responsible
> + * to wake up the waiters on mq_freeze_wq.
> + */
> +if (!atomic_read(&q->mq_freeze_depth) &&
> +blk_queue_preempt_only(q)) {
> +percpu_ref_get_many(&q->q_usage_counter, 1);
> +return 0;
> +}

This way will delay runtime pm or system suspend