For the normal (make_request) I/O path, writes are always synchronously
flushed through to media.  However, when DAX is in use it is possible
that userspace leaves dirty data in the CPU cache.  Ideally, userspace
uses cache-writeback and persistent-commit instructions directly to
flush its writes to media.  If userspace instead relies on
fsync()/msync() for its consistency guarantees, the driver needs to
flush the CPU cache manually.
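
As a rough illustration (not code from this patch), userspace that
manages persistence itself might write back its own cache lines along
these lines; this assumes a toolchain that provides the clwb intrinsic,
and the helper name and line size are illustrative:

    /* userspace sketch: write back a DAX mapping line by line */
    #include <immintrin.h>
    #include <stddef.h>
    #include <stdint.h>

    #define CACHELINE 64    /* typical x86 line size, illustrative */

    static void dax_persist(const void *addr, size_t len)
    {
            uintptr_t p = (uintptr_t)addr & ~(uintptr_t)(CACHELINE - 1);
            uintptr_t end = (uintptr_t)addr + len;

            for (; p < end; p += CACHELINE)
                    _mm_clwb((void *)p);    /* write back, do not invalidate */
            _mm_sfence();                   /* order the write-backs */
            /* a persistent-commit step (e.g. pcommit on the CPUs this
             * patch targets) would follow here */
    }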

Ideally, an architecture would provide a single instruction to write
back all dirty lines in the cache.  In the absence of that, the driver
resorts to flushing the range line by line.
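
For reference, the line-by-line fallback boils down to something like
the following (a simplified sketch of what clflush_cache_range()
already does today, not code added by this patch):

    /* sketch: write back a range one cache line at a time */
    #include <linux/types.h>
    #include <asm/barrier.h>
    #include <asm/processor.h>
    #include <asm/special_insns.h>

    static void wb_cache_range(void *addr, size_t size)
    {
            unsigned long clsize = boot_cpu_data.x86_clflush_size;
            void *end = addr + size;
            void *p = (void *)((unsigned long)addr & ~(clsize - 1));

            mb();                   /* order against prior stores */
            for (; p < end; p += clsize)
                    clflushopt(p);  /* clwb would avoid the invalidate */
            mb();                   /* wait for the flushes to complete */
    }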

Introduce mmio_wb_range() as the non-invalidating version of
mmio_flush_range() and arrange for a small number of flusher threads to
parallelize the work.
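
At the call site the method selection reduces to roughly the following
(a condensed restatement of the dispatch added to pmem_flush() below;
flush_chunk() itself is only an illustration, not a helper in this
patch):

    /* prefer the non-invalidating write-back when the arch has one */
    static void flush_chunk(struct device *dev, void *addr,
                    unsigned long len)
    {
    #if defined(mmio_wb_range)
            mmio_wb_range(addr, len);       /* write back, keep lines cached */
    #elif defined(mmio_flush_range)
            mmio_flush_range(addr, len);    /* flush and invalidate */
    #else
            dev_err_once(dev, "no cache flush method available\n");
    #endif
    }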

The flush is a nop until a userspace mapping (a BLKDAX_F_DIRTY request)
arrives, and we reduce the amount of work per flush by tracking the
active dax extents.  Finer-grained 'dax_active' tracking and clearing
of mapped extents will be the subject of future experiments.  For now
this enables moderately cheap fsync()/msync() without per-filesystem or
mm enabling.
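
To make the extent granularity concrete (illustrative numbers, not
taken from the patch): for a 256 GiB pmem device with
DAX_EXTENT_SHIFT == 8 the device is carved into 256 extents, so

    size_shift       = ilog2(256ULL << 30)           = 38
    dax_extent_shift = size_shift - DAX_EXTENT_SHIFT = 30
    dax_extent_size  = 1ULL << dax_extent_shift      = 1 GiB per bit

i.e. each set bit in 'dax_active' marks a 1 GiB slice of the device as
needing write-back on the next REQ_FLUSH.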

Signed-off-by: Dan Williams <dan.j.willi...@intel.com>
---
 arch/x86/include/asm/cacheflush.h |    4 +
 block/blk-core.c                  |    1 +
 block/blk.h                       |   11 ---
 drivers/nvdimm/pmem.c             |  139 +++++++++++++++++++++++++++++++++++++
 include/linux/blkdev.h            |   11 +++
 5 files changed, 154 insertions(+), 12 deletions(-)

diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index e63aa38e85fb..3eafa8088489 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -89,6 +89,10 @@ int set_pages_rw(struct page *page, int numpages);
 
 void clflush_cache_range(void *addr, unsigned int size);
 
+#ifdef CONFIG_ARCH_HAS_PMEM_API
+#define mmio_wb_range(addr, size) __arch_wb_cache_pmem(addr, size)
+#endif
+
 #define mmio_flush_range(addr, size) clflush_cache_range(addr, size)
 
 #ifdef CONFIG_DEBUG_RODATA
diff --git a/block/blk-core.c b/block/blk-core.c
index 5159946a2b41..43e402f9c06e 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -661,6 +661,7 @@ void blk_queue_exit(struct request_queue *q)
 {
        percpu_ref_put(&q->q_usage_counter);
 }
+EXPORT_SYMBOL(blk_queue_exit);
 
 static void blk_queue_usage_counter_release(struct percpu_ref *ref)
 {
diff --git a/block/blk.h b/block/blk.h
index dc7d9411fa45..a83f14f07921 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -74,17 +74,6 @@ bool __blk_end_bidi_request(struct request *rq, int error,
                            unsigned int nr_bytes, unsigned int bidi_bytes);
 void blk_freeze_queue(struct request_queue *q);
 
-static inline void blk_queue_enter_live(struct request_queue *q)
-{
-       /*
-        * Given that running in generic_make_request() context
-        * guarantees that a live reference against q_usage_counter has
-        * been established, further references under that same context
-        * need not check that the queue has been frozen (marked dead).
-        */
-       percpu_ref_get(&q->q_usage_counter);
-}
-
 #ifdef CONFIG_BLK_DEV_INTEGRITY
 void blk_flush_integrity(void);
 #else
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 3d83f3079602..6f39d0017399 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -33,6 +33,9 @@
 
 static ASYNC_DOMAIN_EXCLUSIVE(async_pmem);
 
+#define NUM_FLUSH_THREADS 4
+#define DAX_EXTENT_SHIFT 8
+#define NUM_DAX_EXTENTS (1ULL << DAX_EXTENT_SHIFT)
 struct pmem_device {
        struct request_queue    *pmem_queue;
        struct gendisk          *pmem_disk;
@@ -45,6 +48,10 @@ struct pmem_device {
        unsigned long           pfn_flags;
        void __pmem             *virt_addr;
        size_t                  size;
+       unsigned long           size_shift;
+       struct bio              *flush_bio;
+       spinlock_t              lock;
+       DECLARE_BITMAP(dax_active, NUM_DAX_EXTENTS);
 };
 
 static int pmem_major;
@@ -68,6 +75,105 @@ static void pmem_do_bvec(struct pmem_device *pmem, struct page *page,
        kunmap_atomic(mem);
 }
 
+struct pmem_flush_ctx {
+       struct pmem_device *pmem;
+       struct block_device *bdev;
+       int id;
+};
+
+static resource_size_t dax_extent_shift(struct pmem_device *pmem)
+{
+       return pmem->size_shift - DAX_EXTENT_SHIFT;
+}
+
+static resource_size_t dax_extent_size(struct pmem_device *pmem)
+{
+       return 1ULL << dax_extent_shift(pmem);
+}
+
+static void pmem_flush(void *data, async_cookie_t cookie)
+{
+       unsigned int i;
+       resource_size_t offset;
+       struct pmem_flush_ctx *ctx = data;
+       struct pmem_device *pmem = ctx->pmem;
+       struct device *dev = part_to_dev(ctx->bdev->bd_part);
+       unsigned long extent = dax_extent_size(pmem) / NUM_FLUSH_THREADS;
+
+       for_each_set_bit(i, pmem->dax_active, NUM_DAX_EXTENTS) {
+               unsigned long flush_len;
+               void *addr;
+
+               offset = dax_extent_size(pmem) * i + extent * ctx->id;
+               if (offset > pmem->size)
+                       break;
+               flush_len = min_t(resource_size_t, extent, pmem->size - offset);
+               addr = (void __force *) pmem->virt_addr + offset;
+               dev_dbg(dev, "%s: %p %#lx\n", __func__, addr, flush_len);
+               while (flush_len) {
+                       unsigned long len = min_t(unsigned long, flush_len, SZ_1M);
+
+#if defined(mmio_wb_range)
+                       mmio_wb_range(addr, len);
+#elif defined(mmio_flush_range)
+                       mmio_flush_range(addr, len);
+#else
+                       dev_err_once(dev, "%s: failed, no flush method\n",
+                                       __func__);
+                       return;
+#endif
+                       flush_len -= len;
+                       addr += len;
+                       cond_resched();
+               }
+       }
+}
+
+static void __pmem_flush_request(void *data, async_cookie_t cookie)
+{
+       struct pmem_flush_ctx ctx[NUM_FLUSH_THREADS];
+       struct pmem_device *pmem = data;
+       struct bio *bio;
+       int i;
+
+       spin_lock(&pmem->lock);
+       bio = pmem->flush_bio;
+       pmem->flush_bio = bio->bi_next;
+       bio->bi_next = NULL;
+       spin_unlock(&pmem->lock);
+
+       for (i = 0; i < NUM_FLUSH_THREADS; i++) {
+               ctx[i].bdev = bio->bi_bdev;
+               ctx[i].pmem = pmem;
+               ctx[i].id = i;
+               cookie = async_schedule_domain(pmem_flush, &ctx[i], &async_pmem);
+       }
+       async_synchronize_cookie_domain(cookie, &async_pmem);
+       wmb_pmem();
+       bio_endio(bio);
+       blk_queue_exit(pmem->pmem_queue);
+}
+
+static void pmem_flush_request(struct pmem_device *pmem, struct bio *bio)
+{
+       int do_flush = 1;
+
+       spin_lock(&pmem->lock);
+       if (bitmap_weight(pmem->dax_active, NUM_DAX_EXTENTS) == 0) {
+               do_flush = 0;
+       } else {
+               bio->bi_next = pmem->flush_bio;
+               pmem->flush_bio = bio;
+       }
+       spin_unlock(&pmem->lock);
+
+       if (do_flush) {
+               blk_queue_enter_live(pmem->pmem_queue);
+               async_schedule(__pmem_flush_request, pmem);
+       } else
+               bio_endio(bio);
+}
+
 static void pmem_make_request(struct request_queue *q, struct bio *bio)
 {
        bool do_acct;
@@ -87,7 +193,11 @@ static void pmem_make_request(struct request_queue *q, struct bio *bio)
        if (bio_data_dir(bio))
                wmb_pmem();
 
-       bio_endio(bio);
+       /* we're always durable unless/until dax is activated */
+       if (bio->bi_rw & REQ_FLUSH)
+               pmem_flush_request(pmem, bio);
+       else
+               bio_endio(bio);
 }
 
 static int pmem_rw_page(struct block_device *bdev, sector_t sector,
@@ -112,6 +222,27 @@ static long pmem_direct_access(struct block_device *bdev,
        dax->addr = pmem->virt_addr + offset;
        dax->pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);
 
+       if (dax->flags & BLKDAX_F_DIRTY) {
+               unsigned long start = offset >> dax_extent_shift(pmem);
+               unsigned long len;
+               size_t size;
+
+               size = min_t(size_t, pmem->size - offset, dax->size);
+               size = ALIGN(size, dax_extent_size(pmem));
+               len = max_t(unsigned long, 1, size >> dax_extent_shift(pmem));
+
+               /*
+                * Any flush initiated after the lock is dropped observes new
+                * dirty state
+                */
+               spin_lock(&pmem->lock);
+               bitmap_set(pmem->dax_active, start, len);
+               spin_unlock(&pmem->lock);
+
+               dev_dbg(part_to_dev(bdev->bd_part), "dax active %lx +%lx\n",
+                               start, len);
+       }
+
        return pmem->size - offset;
 }
 
@@ -132,8 +263,12 @@ static struct pmem_device *pmem_alloc(struct device *dev,
        if (!pmem)
                return ERR_PTR(-ENOMEM);
 
+       spin_lock_init(&pmem->lock);
        pmem->phys_addr = res->start;
        pmem->size = resource_size(res);
+       pmem->size_shift = ilog2(pmem->size);
+       if (1ULL << pmem->size_shift < pmem->size)
+               pmem->size_shift++;
        if (!arch_has_wmb_pmem())
                dev_warn(dev, "unable to guarantee persistence of writes\n");
 
@@ -217,6 +352,8 @@ static int pmem_attach_disk(struct device *dev,
        blk_queue_max_hw_sectors(pmem->pmem_queue, UINT_MAX);
        blk_queue_bounce_limit(pmem->pmem_queue, BLK_BOUNCE_ANY);
        queue_flag_set_unlocked(QUEUE_FLAG_NONROT, pmem->pmem_queue);
+       /* every write via pmem_make_request has FUA semantics by default */
+       blk_queue_flush(pmem->pmem_queue, REQ_FLUSH | REQ_FUA);
 
        disk = alloc_disk_node(0, nid);
        if (!disk) {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 663e9974820f..de8a3d58f071 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
 extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t,
 extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t,
                         struct scsi_ioctl_command __user *);
 
+static inline void blk_queue_enter_live(struct request_queue *q)
+{
+       /*
+        * Given that running in generic_make_request() context
+        * guarantees that a live reference against q_usage_counter has
+        * been established, further references under that same context
+        * need not check that the queue has been frozen (marked dead).
+        */
+       percpu_ref_get(&q->q_usage_counter);
+}
+
 extern int blk_queue_enter(struct request_queue *q, gfp_t gfp);
 extern void blk_queue_exit(struct request_queue *q);
 extern void blk_start_queue(struct request_queue *q);
