Re: [PATCH v2] kyber: introduce kyber_depth_updated()

2021-02-22 Thread Omar Sandoval
On Fri, Feb 05, 2021 at 01:13:10AM -0800, Yang Yang wrote:
> A hang occurs when the user changes the scheduler queue depth by writing to
> the 'nr_requests' sysfs file of that device.
> 
> The details of the environment that we found the problem are as follows:
>   an eMMC block device
>   total driver tags: 16
>   default queue_depth: 32
>   kqd->async_depth initialized in kyber_init_sched() with queue_depth=32
> 
> Then we change queue_depth to 256 by writing to the 'nr_requests' sysfs
> file, but kqd->async_depth is not updated after queue_depth changes.
> Now the value of async_depth is too small for queue_depth=256, which may
> cause a hang.
> 
> This patch introduces kyber_depth_updated(), so that kyber can update
> async depth when queue depth changes.
> 
> Signed-off-by: Yang Yang 

I wasn't able to reproduce the hang, but this looks correct, and it
passed my tests.

Reviewed-by: Omar Sandoval 

> ---
> v2:
> - Change the commit message
> - Change from sbitmap::depth to 2^sbitmap::shift
> ---
>  block/kyber-iosched.c | 29 +
>  1 file changed, 13 insertions(+), 16 deletions(-)
> 
> diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
> index dc89199bc8c6..17215b6bf482 100644
> --- a/block/kyber-iosched.c
> +++ b/block/kyber-iosched.c
> @@ -353,19 +353,9 @@ static void kyber_timer_fn(struct timer_list *t)
>   }
>  }
>  
> -static unsigned int kyber_sched_tags_shift(struct request_queue *q)
> -{
> - /*
> -  * All of the hardware queues have the same depth, so we can just grab
> -  * the shift of the first one.
> -  */
> - return q->queue_hw_ctx[0]->sched_tags->bitmap_tags->sb.shift;
> -}
> -
>  static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue 
> *q)
>  {
>   struct kyber_queue_data *kqd;
> - unsigned int shift;
>   int ret = -ENOMEM;
>   int i;
>  
> @@ -400,9 +390,6 @@ static struct kyber_queue_data 
> *kyber_queue_data_alloc(struct request_queue *q)
>   kqd->latency_targets[i] = kyber_latency_targets[i];
>   }
>  
> - shift = kyber_sched_tags_shift(q);
> - kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
> -
>   return kqd;
>  
>  err_buckets:
> @@ -458,9 +445,19 @@ static void kyber_ctx_queue_init(struct kyber_ctx_queue 
> *kcq)
>   INIT_LIST_HEAD(&kcq->rq_list[i]);
>  }
>  
> -static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
> +static void kyber_depth_updated(struct blk_mq_hw_ctx *hctx)
>  {
>   struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
> + struct blk_mq_tags *tags = hctx->sched_tags;
> + unsigned int shift = tags->bitmap_tags->sb.shift;
> +
> + kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
> +
> + sbitmap_queue_min_shallow_depth(tags->bitmap_tags, kqd->async_depth);
> +}
> +
> +static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
> +{
>   struct kyber_hctx_data *khd;
>   int i;
>  
> @@ -502,8 +499,7 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, 
> unsigned int hctx_idx)
>   khd->batching = 0;
>  
>   hctx->sched_data = khd;
> - sbitmap_queue_min_shallow_depth(hctx->sched_tags->bitmap_tags,
> - kqd->async_depth);
> + kyber_depth_updated(hctx);
>  
>   return 0;
>  
> @@ -1022,6 +1018,7 @@ static struct elevator_type kyber_sched = {
>   .completed_request = kyber_completed_request,
>   .dispatch_request = kyber_dispatch_request,
>   .has_work = kyber_has_work,
> + .depth_updated = kyber_depth_updated,
>   },
>  #ifdef CONFIG_BLK_DEBUG_FS
>   .queue_debugfs_attrs = kyber_queue_debugfs_attrs,
> -- 
> 2.17.1
> 


Re: [PATCH blktests v3 00/11] NVMe Target Passthru Block Tests

2020-10-22 Thread Omar Sandoval
On Thu, Oct 22, 2020 at 12:45:25PM -0600, Logan Gunthorpe wrote:
> 
> On 2020-10-08 10:40 a.m., Logan Gunthorpe wrote:
> > Hi,
> > 
> > This series adds blktests for the nvmet passthru feature that was merged
> > for 5.9. It's been reconciled with Sagi's blktest series that Omar
> > recently merged.
> 
> Bump. This has been around for a while now. Omar, can you please
> consider picking this up?

There were a couple of shellcheck errors:

tests/nvme/rc:77:8: warning: Declare and assign separately to avoid masking 
return values. [SC2155]
tests/nvme/rc:278:7: note: Useless echo? Instead of 'echo $(cmd)', just use 
'cmd'. [SC2005]

I fixed those up and applied, thanks.


Re: [PATCH] kyber: Fix crash in kyber_finish_request()

2020-09-08 Thread Omar Sandoval
On Mon, Sep 07, 2020 at 10:41:16AM -0600, Jens Axboe wrote:
> CC Omar
> 
> On 9/7/20 1:43 AM, Yang Yang wrote:
> > The kernel crashes when requeueing a flush request.
> > It can be reproduced as below:
> > 
> > [2.517297] Unable to handle kernel paging request at virtual address 
> > ffd8071c0b00
> > ...
> > [2.517468] pc : clear_bit+0x18/0x2c
> > [2.517502] lr : sbitmap_queue_clear+0x40/0x228
> > [2.517503] sp : ff800832bc60 pstate : 00c00145
> > ...
> > [2.517599] Process ksoftirqd/5 (pid: 51, stack limit = 
> > 0xff8008328000)
> > [2.517602] Call trace:
> > [2.517606]  clear_bit+0x18/0x2c
> > [2.517619]  kyber_finish_request+0x74/0x80
> > [2.517627]  blk_mq_requeue_request+0x3c/0xc0
> > [2.517637]  __scsi_queue_insert+0x11c/0x148
> > [2.517640]  scsi_softirq_done+0x114/0x130
> > [2.517643]  blk_done_softirq+0x7c/0xb0
> > [2.517651]  __do_softirq+0x208/0x3bc
> > [2.517657]  run_ksoftirqd+0x34/0x60
> > [2.517663]  smpboot_thread_fn+0x1c4/0x2c0
> > [2.517667]  kthread+0x110/0x120
> > [2.517669]  ret_from_fork+0x10/0x18
> > 
> > Signed-off-by: Yang Yang 
> > ---
> >  block/kyber-iosched.c | 3 +++
> >  1 file changed, 3 insertions(+)
> > 
> > diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
> > index a38c5ab103d1..af73afe7a05c 100644
> > --- a/block/kyber-iosched.c
> > +++ b/block/kyber-iosched.c
> > @@ -611,6 +611,9 @@ static void kyber_finish_request(struct request *rq)
> >  {
> > struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;
> >  
> > +   if (unlikely(!(rq->rq_flags & RQF_ELVPRIV)))
> > +   return;
> > +
> > rq_clear_domain_token(kqd, rq);
> >  }
> >  
> > 

It looks like BFQ also has this check. Wouldn't it make more sense to
check it in blk-mq, like we do for .finish_request() in
blk_mq_free_request()? Something along these lines:

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index c34b090178a9..fa98470df3f0 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -5895,18 +5895,6 @@ static void bfq_finish_requeue_request(struct request 
*rq)
struct bfq_queue *bfqq = RQ_BFQQ(rq);
struct bfq_data *bfqd;
 
-   /*
-* Requeue and finish hooks are invoked in blk-mq without
-* checking whether the involved request is actually still
-* referenced in the scheduler. To handle this fact, the
-* following two checks make this function exit in case of
-* spurious invocations, for which there is nothing to do.
-*
-* First, check whether rq has nothing to do with an elevator.
-*/
-   if (unlikely(!(rq->rq_flags & RQF_ELVPRIV)))
-   return;
-
/*
 * rq either is not associated with any icq, or is an already
 * requeued request that has not (yet) been re-inserted into
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 126021fc3a11..e81ca1bf6e10 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -66,7 +66,7 @@ static inline void blk_mq_sched_requeue_request(struct 
request *rq)
struct request_queue *q = rq->q;
struct elevator_queue *e = q->elevator;
 
-   if (e && e->type->ops.requeue_request)
+   if ((rq->rq_flags & RQF_ELVPRIV) && e && e->type->ops.requeue_request)
e->type->ops.requeue_request(rq);
 }
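
(For reference, the guard this mirrors in blk_mq_free_request() looks roughly
like the sketch below; I'm paraphrasing from memory rather than quoting the
source, so treat it as illustrative.)

	if (rq->rq_flags & RQF_ELVPRIV) {
		if (e && e->type->ops.finish_request)
			e->type->ops.finish_request(rq);
	}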
 


Re: [PATCH] sbitmap: trivial - update comment for sbitmap_deferred_clear_bit

2019-03-21 Thread Omar Sandoval
On Sat, Mar 16, 2019 at 04:24:37PM +0800, Shenghui Wang wrote:
> "sbitmap_batch_clear" should be "sbitmap_deferred_clear"

Acked-by: Omar Sandoval 

> Signed-off-by: Shenghui Wang 
> ---
>  include/linux/sbitmap.h | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
> index 14d558146aea..20f3e3f029b9 100644
> --- a/include/linux/sbitmap.h
> +++ b/include/linux/sbitmap.h
> @@ -330,7 +330,7 @@ static inline void sbitmap_clear_bit(struct sbitmap *sb, 
> unsigned int bitnr)
>  /*
>   * This one is special, since it doesn't actually clear the bit, rather it
>   * sets the corresponding bit in the ->cleared mask instead. Paired with
> - * the caller doing sbitmap_batch_clear() if a given index is full, which
> + * the caller doing sbitmap_deferred_clear() if a given index is full, which
>   * will clear the previously freed entries in the corresponding ->word.
>   */
>  static inline void sbitmap_deferred_clear_bit(struct sbitmap *sb, unsigned 
> int bitnr)
> -- 
> 2.20.1
> 
> 
> 


Re: [PATCH 01/11] btrfs: add macros for compression type and level

2019-01-29 Thread Omar Sandoval
On Mon, Jan 28, 2019 at 04:24:27PM -0500, Dennis Zhou wrote:
> It is very easy to miss places that rely on a certain bitshifting for
> deciphering the type_level overloading. Make macros handle this instead.
> 
> Signed-off-by: Dennis Zhou 
> ---
>  fs/btrfs/compression.c | 2 +-
>  fs/btrfs/compression.h | 3 +++
>  fs/btrfs/zlib.c| 2 +-
>  3 files changed, 5 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
> index 548057630b69..586f95ac0aea 100644
> --- a/fs/btrfs/compression.c
> +++ b/fs/btrfs/compression.c
> @@ -1036,9 +1036,9 @@ int btrfs_compress_pages(unsigned int type_level, 
> struct address_space *mapping,
>unsigned long *total_in,
>unsigned long *total_out)
>  {
> + int type = BTRFS_COMPRESS_TYPE(type_level);
>   struct list_head *workspace;
>   int ret;
> - int type = type_level & 0xF;
>  
>   workspace = find_workspace(type);
>  
> diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
> index ddda9b80bf20..69a9197dadc3 100644
> --- a/fs/btrfs/compression.h
> +++ b/fs/btrfs/compression.h
> @@ -25,6 +25,9 @@
>  
>  #define BTRFS_ZLIB_DEFAULT_LEVEL 3
>  
> +#define BTRFS_COMPRESS_TYPE(type_level)  (type_level & 0xF)
> +#define BTRFS_COMPRESS_LEVEL(type_level) ((type_level & 0xF0) >> 4)

Nit: these can be inline functions rather than macros.
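
Something like this sketch (naming and types are illustrative, not part of the
patch):

static inline unsigned int btrfs_compress_type(unsigned int type_level)
{
	return type_level & 0xF;
}

static inline unsigned int btrfs_compress_level(unsigned int type_level)
{
	return (type_level & 0xF0) >> 4;
}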

>  struct compressed_bio {
>   /* number of bios pending for this compressed extent */
>   refcount_t pending_bios;
> diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
> index 970ff3e35bb3..1480b3eee306 100644
> --- a/fs/btrfs/zlib.c
> +++ b/fs/btrfs/zlib.c
> @@ -393,7 +393,7 @@ static int zlib_decompress(struct list_head *ws, unsigned 
> char *data_in,
>  static void zlib_set_level(struct list_head *ws, unsigned int type)
>  {
>   struct workspace *workspace = list_entry(ws, struct workspace, list);
> - unsigned level = (type & 0xF0) >> 4;
> + unsigned int level = BTRFS_COMPRESS_LEVEL(type);
>  
>   if (level > 9)
>   level = 9;
> -- 
> 2.17.1
> 


Re: [PATCH] block: aoe: no need to check return value of debugfs_create functions

2019-01-22 Thread Omar Sandoval
On Tue, Jan 22, 2019 at 04:21:04PM +0100, Greg Kroah-Hartman wrote:
> When calling debugfs functions, there is no need to ever check the
> return value.  The function can work or not, but the code logic should
> never do something different based on this.
> 
> Cc: "Ed L. Cashin" 
> Cc: Jens Axboe 
> Cc: linux-bl...@vger.kernel.org
> Signed-off-by: Greg Kroah-Hartman 
> ---
>  drivers/block/aoe/aoeblk.c | 13 +
>  1 file changed, 1 insertion(+), 12 deletions(-)
> 
> diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
> index e2c6aae2d636..b602646bfa04 100644
> --- a/drivers/block/aoe/aoeblk.c
> +++ b/drivers/block/aoe/aoeblk.c
> @@ -207,14 +207,7 @@ aoedisk_add_debugfs(struct aoedev *d)
>   else
>   p++;
>   BUG_ON(*p == '\0');
> - entry = debugfs_create_file(p, 0444, aoe_debugfs_dir, d,
> - &aoe_debugfs_fops);
> - if (IS_ERR_OR_NULL(entry)) {
> - pr_info("aoe: cannot create debugfs file for %s\n",
> - d->gd->disk_name);
> - return;
> - }
> - BUG_ON(d->debugfs);
> + debugfs_create_file(p, 0444, aoe_debugfs_dir, d, &aoe_debugfs_fops);
>   d->debugfs = entry;

Now entry is uninitialized here when we assign it to d->debugfs.
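
Presumably the assignment should just take the return value directly,
something along these lines (untested sketch):

	d->debugfs = debugfs_create_file(p, 0444, aoe_debugfs_dir, d,
					 &aoe_debugfs_fops);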

>  }
>  void
> @@ -472,10 +465,6 @@ aoeblk_init(void)
>   if (buf_pool_cache == NULL)
>   return -ENOMEM;
>   aoe_debugfs_dir = debugfs_create_dir("aoe", NULL);
> - if (IS_ERR_OR_NULL(aoe_debugfs_dir)) {
> - pr_info("aoe: cannot create debugfs directory\n");
> - aoe_debugfs_dir = NULL;
> - }
>   return 0;
>  }
>  
> -- 
> 2.20.1
> 


Re: [PATCH] blk_types.h: Use REQ_OP_WRITE in op_is_write

2019-01-11 Thread Omar Sandoval
On Sat, Dec 22, 2018 at 08:03:54AM -0200, Marcos Paulo de Souza wrote:
> Instead of just using plain '1', as it improves readability.
> 
> Signed-off-by: Marcos Paulo de Souza 
> ---
>  include/linux/blk_types.h | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
> index 1dcf652ba0aa..905c666a0101 100644
> --- a/include/linux/blk_types.h
> +++ b/include/linux/blk_types.h
> @@ -377,7 +377,7 @@ static inline void bio_set_op_attrs(struct bio *bio, 
> unsigned op,
>  
>  static inline bool op_is_write(unsigned int op)
>  {
> - return (op & 1);
> + return (op & REQ_OP_WRITE);
>  }
>  
>  /*

Nak. Conceptually, an operation is a write if the least significant bit
is set. The 1 here doesn't mean REQ_OP_WRITE, it means the least
significant bit.
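
For example (REQ_OP_* values quoted from memory, so treat them as
illustrative): REQ_OP_READ = 0, REQ_OP_WRITE = 1, REQ_OP_FLUSH = 2,
REQ_OP_DISCARD = 3. Discard modifies the device and has the low bit set even
though it isn't REQ_OP_WRITE, which is exactly what the check relies on:

#include <stdbool.h>

/* Illustrative only: "write-ness" lives in bit 0 of the op value. */
enum example_req_op {
	EXAMPLE_OP_READ    = 0,	/* even -> not a write */
	EXAMPLE_OP_WRITE   = 1,	/* odd  -> write */
	EXAMPLE_OP_FLUSH   = 2,	/* even -> not a write */
	EXAMPLE_OP_DISCARD = 3,	/* odd  -> write (modifies the device) */
};

static inline bool example_op_is_write(unsigned int op)
{
	return op & 1;	/* least significant bit, not EXAMPLE_OP_WRITE */
}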


Re: [PATCH v3 01/10] sched: Provide sparsemask, a reduced contention bitmap

2018-12-06 Thread Omar Sandoval
On Thu, Dec 06, 2018 at 11:07:46AM -0500, Steven Sistare wrote:
> On 11/27/2018 8:19 PM, Omar Sandoval wrote:
> > On Tue, Nov 27, 2018 at 10:16:56AM -0500, Steven Sistare wrote:
> >> On 11/9/2018 7:50 AM, Steve Sistare wrote:
> >>> From: Steve Sistare 
> >>>
> >>> Provide struct sparsemask and functions to manipulate it.  A sparsemask is
> >>> a sparse bitmap.  It reduces cache contention vs the usual bitmap when 
> >>> many
> >>> threads concurrently set, clear, and visit elements, by reducing the 
> >>> number
> >>> of significant bits per cacheline.  For each 64 byte chunk of the mask,
> >>> only the first K bits of the first word are used, and the remaining bits
> >>> are ignored, where K is a creation time parameter.  Thus a sparsemask that
> >>> can represent a set of N elements is approximately (N/K * 64) bytes in
> >>> size.
> >>>
> >>> Signed-off-by: Steve Sistare 
> >>> ---
> >>>  include/linux/sparsemask.h | 260 
> >>> +
> >>>  lib/Makefile   |   2 +-
> >>>  lib/sparsemask.c   | 142 +
> >>>  3 files changed, 403 insertions(+), 1 deletion(-)
> >>>  create mode 100644 include/linux/sparsemask.h
> >>>  create mode 100644 lib/sparsemask.c
> >>
> >> Hi Peter and Ingo,
> >>   I need your opinion: would you prefer that I keep the new sparsemask 
> >> type, 
> >> or fold it into the existing sbitmap type?  There is some overlap between 
> >> the 
> >> two, but mostly in trivial one line functions. The main differences are:
> > 
> > Adding Jens and myself.
> > 
> >>   * sparsemask defines iterators that allow an inline loop body, like 
> >> cpumask,
> >>   whereas the sbitmap iterator forces us to define a callback function for
> >>   the body, which is awkward.
> >>
> >>   * sparsemask is slightly more efficient.  The struct and variable length
> >>   bitmap are allocated contiguously,
> > 
> > That just means you have the pointer indirection elsewhere :) The users
> > of sbitmap embed it in whatever structure they have.
>  
> Yes, the sparsemask can be embedded in one place, but in my use case I also 
> cache
> pointers to the mask from elsewhere, and those sites incur the cost of 2 
> indirections
> to perform bitmap operations.
> 
> >>   and sbitmap uses an extra field "depth"
> >>   per bitmap cacheline.
> > 
> > The depth field is memory which would otherwise be unused, and it's only
> > used for sbitmap_get(), so it doesn't have any cost if you're using it
> > like a cpumask.
> > 
> >>   * The order of arguments is different for the sparsemask accessors and
> >>   sbitmap accessors.  sparsemask mimics cpumask which is used extensively
> >>   in the sched code.
> >>
> >>   * Much of the sbitmap code supports queueing, sleeping, and waking on bit
> >>   allocation, which is N/A for scheduler load balancing.  However, we
> >>   can call the basic functions which do not use queueing.
> >>
> >> I could add the sparsemask iterators to sbitmap (90 lines), and define
> >> a thin layer to change the argument order to mimic cpumask, but that
> >> essentially recreates sparsemask.
> > 
> > We only use sbitmap_for_each_set() in a few places. Maybe a for_each()
> > style macro would be cleaner for those users, too, in which case I
> > wouldn't be opposed to changing it. The cpumask argument order thing is
> > a bit annoying, though.
> > 
> >> Also, pushing sparsemask into sbitmap would limit our freedom to evolve the
> >> type to meet the future needs of sched, as sbitmap has its own maintainer,
> >> and is used by drivers, so changes to its API and ABI will be frowned upon.
> > 
> > It's a generic data structure, so of course Jens and I have no problem
> > with changing it to meet more needs :) Personally, I'd prefer to only
> > have one datastructure for this, but I suppose it depends on whether
> > Peter and Ingo think the argument order is important enough.
> 
> The argument order is a minor thing, not a blocker to adoption, but 
> efficiency 
> is important in the core scheduler code.  I actually did the work to write a
> for_each macro with inline body to sbitmap, and converted my patches to use 
> sbitmap.
> But then I noticed your very recent patch adding the cleared word to each 
> c


Re: [PATCH,1/2] genhd: avoid overflow of sectors in disk_stats

2018-11-30 Thread Omar Sandoval
On Fri, Nov 30, 2018 at 04:32:40AM -0500, Huijin Park wrote:
> From: "huijin.park" 
> 
> This patch changes the 'sectors' type to a u64.
> On a 32-bit system, 'sectors' can accumulate up to about 2TiB.
> If a 32-bit system does more than 2TiB of I/O while running,
> the 'sectors' will overflow.
> As a result, part_stat_read(sectors), the diskstats in proc and
> the (lifetime|session)_write_kbytes in sysfs return invalid statistics.

What about parsers which expect it to be an unsigned long? E.g., iostat:
https://github.com/sysstat/sysstat/blob/v12.1.1/rd_stats.c#L736

At least with glibc, scanf seems to truncate sanely, but this appears to
be undefined.
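
A minimal userspace sketch of the concern (my own illustration, not from the
patch): once the kernel prints a 64-bit counter, a 32-bit iostat-style parser
scanning it with %lu may be handed a value that does not fit in unsigned long,
and out-of-range conversions in *scanf are undefined.

#include <stdio.h>

int main(void)
{
	unsigned long sectors;

	/* A diskstats-style sectors field larger than ULONG_MAX on 32-bit. */
	if (sscanf("5000000000", "%lu", &sectors) == 1)
		printf("parsed sectors = %lu\n", sectors);
	return 0;
}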

> Signed-off-by: huijin.park 
> ---
>  block/genhd.c |6 +++---
>  include/linux/genhd.h |2 +-
>  2 files changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/block/genhd.c b/block/genhd.c
> index 0145bcb..7518dcd 100644
> --- a/block/genhd.c
> +++ b/block/genhd.c
> @@ -1343,10 +1343,10 @@ static int diskstats_show(struct seq_file *seqf, void 
> *v)
>   part_stat_unlock();
>   part_in_flight(gp->queue, hd, inflight);
>   seq_printf(seqf, "%4d %7d %s "
> -"%lu %lu %lu %u "
> -"%lu %lu %lu %u "
> +"%lu %lu %llu %u "
> +"%lu %lu %llu %u "
>  "%u %u %u "
> -"%lu %lu %lu %u\n",
> +"%lu %lu %llu %u\n",
>  MAJOR(part_devt(hd)), MINOR(part_devt(hd)),
>  disk_name(gp, hd->partno, buf),
>  part_stat_read(hd, ios[STAT_READ]),
> diff --git a/include/linux/genhd.h b/include/linux/genhd.h
> index 0c5ee17..5bf86f9 100644
> --- a/include/linux/genhd.h
> +++ b/include/linux/genhd.h
> @@ -84,7 +84,7 @@ struct partition {
>  
>  struct disk_stats {
>   u64 nsecs[NR_STAT_GROUPS];
> - unsigned long sectors[NR_STAT_GROUPS];
> + u64 sectors[NR_STAT_GROUPS];
>   unsigned long ios[NR_STAT_GROUPS];
>   unsigned long merges[NR_STAT_GROUPS];
>   unsigned long io_ticks;
> -- 
> 1.7.9.5
> 


Re: [PATCH v3 01/10] sched: Provide sparsemask, a reduced contention bitmap

2018-11-27 Thread Omar Sandoval
On Tue, Nov 27, 2018 at 10:16:56AM -0500, Steven Sistare wrote:
> On 11/9/2018 7:50 AM, Steve Sistare wrote:
> > From: Steve Sistare 
> > 
> > Provide struct sparsemask and functions to manipulate it.  A sparsemask is
> > a sparse bitmap.  It reduces cache contention vs the usual bitmap when many
> > threads concurrently set, clear, and visit elements, by reducing the number
> > of significant bits per cacheline.  For each 64 byte chunk of the mask,
> > only the first K bits of the first word are used, and the remaining bits
> > are ignored, where K is a creation time parameter.  Thus a sparsemask that
> > can represent a set of N elements is approximately (N/K * 64) bytes in
> > size.
> > 
> > Signed-off-by: Steve Sistare 
> > ---
> >  include/linux/sparsemask.h | 260 
> > +
> >  lib/Makefile   |   2 +-
> >  lib/sparsemask.c   | 142 +
> >  3 files changed, 403 insertions(+), 1 deletion(-)
> >  create mode 100644 include/linux/sparsemask.h
> >  create mode 100644 lib/sparsemask.c
> 
> Hi Peter and Ingo,
>   I need your opinion: would you prefer that I keep the new sparsemask type, 
> or fold it into the existing sbitmap type?  There is some overlap between the 
> two, but mostly in trivial one line functions. The main differences are:

Adding Jens and myself.

>   * sparsemask defines iterators that allow an inline loop body, like cpumask,
>   whereas the sbitmap iterator forces us to define a callback function for
>   the body, which is awkward.
>
>   * sparsemask is slightly more efficient.  The struct and variable length
>   bitmap are allocated contiguously,

That just means you have the pointer indirection elsewhere :) The users
of sbitmap embed it in whatever structure they have.

>   and sbitmap uses an extra field "depth"
>   per bitmap cacheline.

The depth field is memory which would otherwise be unused, and it's only
used for sbitmap_get(), so it doesn't have any cost if you're using it
like a cpumask.

>   * The order of arguments is different for the sparsemask accessors and
>   sbitmap accessors.  sparsemask mimics cpumask which is used extensively
>   in the sched code.
> 
>   * Much of the sbitmap code supports queueing, sleeping, and waking on bit
>   allocation, which is N/A for scheduler load balancing.  However, we
>   can call the basic functions which do not use queueing.
> 
> I could add the sparsemask iterators to sbitmap (90 lines), and define
> a thin layer to change the argument order to mimic cpumask, but that
> essentially recreates sparsemask.

We only use sbitmap_for_each_set() in a few places. Maybe a for_each()
style macro would be cleaner for those users, too, in which case I
wouldn't be opposed to changing it. The cpumask argument order thing is
a bit annoying, though.
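
To make the comparison concrete, a for_each()-style iterator generally looks
roughly like this single-word sketch (illustrative only, not the sbitmap API):

#include <stdio.h>

/* Visit each set bit with an inline loop body, cpumask-style, instead of
 * passing a callback the way sbitmap_for_each_set() does today. */
#define for_each_set_bit_in_word(bit, word)				\
	for (unsigned long __w = (word);				\
	     __w && ((bit) = __builtin_ctzl(__w), 1);			\
	     __w &= __w - 1)

int main(void)
{
	unsigned int bit;

	for_each_set_bit_in_word(bit, 0x2dUL)	/* bits 0, 2, 3, 5 */
		printf("bit %u is set\n", bit);
	return 0;
}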

> Also, pushing sparsemask into sbitmap would limit our freedom to evolve the
> type to meet the future needs of sched, as sbitmap has its own maintainer,
> and is used by drivers, so changes to its API and ABI will be frowned upon.

It's a generic data structure, so of course Jens and I have no problem
with changing it to meet more needs :) Personally, I'd prefer to only
have one datastructure for this, but I suppose it depends on whether
Peter and Ingo think the argument order is important enough.

> FWIW, here is the amount of code involved:
> 
> include/linux/sbitmap.h
>   250 lines basic operations
>   284 lines for queueing
>   ---
>   534 lines total
> 
> lib/sbitmap.c
>   201 lines basic operations
>   380 lines for queueing
>   ---
>   581 lines total
> 
> include/linux/sparsemask.h
>   260 lines total
>   https://lkml.org/lkml/2018/11/9/1176
> 
> lib/sparsemask.c
>   142 lines total
>   https://lkml.org/lkml/2018/11/9/1176
> 
> - Steve


Re: linux-next: manual merge of the akpm-current tree with the btrfs-kdave tree

2018-10-05 Thread Omar Sandoval
On Fri, Oct 05, 2018 at 03:47:21PM +1000, Stephen Rothwell wrote:
> Hi all,
> 
> Today's linux-next merge of the akpm-current tree got a conflict in:
> 
>   include/linux/swap.h
> 
> between commit:
> 
>   0f83d16b8f1f ("mm: split SWP_FILE into SWP_ACTIVATED and SWP_FS")
> 
> from the btrfs-kdave tree and commit:
> 
>   26833300651e ("mm, swap: fix race between swapoff and some swap operations")
> 
> from the akpm-current tree.
> 
> I fixed it up (see below) and can carry the fix as necessary. This
> is now fixed as far as linux-next is concerned, but any non trivial
> conflicts should be mentioned to your upstream maintainer when your tree
> is submitted for merging.  You may also want to consider cooperating
> with the maintainer of the conflicting tree to minimise any particularly
> complex conflicts.
> 
> -- 
> Cheers,
> Stephen Rothwell

Thanks, Stephen, that looks good.

This reminds me that Dave suggested we could route the two mm patches
in my series through Andrew's mm tree for 4.20 and get the Btrfs bits in
for 4.21. Would that make things harder for linux-next? If not, I can
resend "mm: split SWP_FILE into SWP_ACTIVATED and SWP_FS" and "mm:
export add_swap_extent()" rebased on mmotm.


Re: linux-next: build warning after merge of the block tree

2018-09-28 Thread Omar Sandoval
On Fri, Sep 28, 2018 at 11:11:24AM +1000, Stephen Rothwell wrote:
> Hi Jens,
> 
> After merging the block tree, today's linux-next build (arm
> multi_v7_defconfig) produced this warning:
> 
> block/kyber-iosched.c:84:22: warning: integer overflow in expression of type 
> 'long int' results in '705032704' [-Woverflow]
>   [KYBER_DISCARD] = 5 * NSEC_PER_SEC,
>   ^
> 
> Introduced by commit
> 
>   6e25cb01ea20 ("kyber: implement improved heuristics")

Ugh, thanks for the report. This should fix it:

diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index 2b62e362fb36..eccac01a10b6 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -79,9 +79,9 @@ static const unsigned int kyber_depth[] = {
  * Default latency targets for each scheduling domain.
  */
 static const u64 kyber_latency_targets[] = {
-   [KYBER_READ] = 2 * NSEC_PER_MSEC,
-   [KYBER_WRITE] = 10 * NSEC_PER_MSEC,
-   [KYBER_DISCARD] = 5 * NSEC_PER_SEC,
+   [KYBER_READ] = 2ULL * NSEC_PER_MSEC,
+   [KYBER_WRITE] = 10ULL * NSEC_PER_MSEC,
+   [KYBER_DISCARD] = 5ULL * NSEC_PER_SEC,
 };
 
 /*

Jens, do you mind folding that in, or should I send it separately?
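
For the record, the arithmetic behind the warning (my own check, assuming
NSEC_PER_SEC is a plain long on 32-bit arm): 5 * 1000000000 = 5000000000,
which wraps modulo 2^32 to 705032704, the value gcc printed. Forcing one
operand to u64 with the ULL suffix keeps the multiplication 64-bit:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t wanted  = 5ULL * 1000000000ULL;	/* 5 s in ns */
	uint32_t wrapped = (uint32_t)wanted;		/* what 32-bit math yields */

	printf("wanted=%llu wrapped=%u\n",
	       (unsigned long long)wanted, wrapped);	/* 5000000000 705032704 */
	return 0;
}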


Re: [PATCH] btrfs: list usage cleanup

2018-09-27 Thread Omar Sandoval
On Wed, Sep 26, 2018 at 04:35:45PM +0800, zhong jiang wrote:
> Trivial cleanup: list_move_tail() does the same thing that
> list_del() + list_add_tail() do, so just replace them.
> 
> Signed-off-by: zhong jiang 
> ---
>  fs/btrfs/send.c | 3 +--
>  1 file changed, 1 insertion(+), 2 deletions(-)
> 
> diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
> index 094cc144..d87f416 100644
> --- a/fs/btrfs/send.c
> +++ b/fs/btrfs/send.c
> @@ -2075,8 +2075,7 @@ static struct name_cache_entry 
> *name_cache_search(struct send_ctx *sctx,
>   */
>  static void name_cache_used(struct send_ctx *sctx, struct name_cache_entry 
> *nce)
>  {
> - list_del(&nce->list);
> - list_add_tail(&nce->list, &sctx->name_cache_list);
> + list_move_tail(&nce->list, &sctx->name_cache_list);
>  }

At that point do we even need such a trivial helper, considering that
this is only called in one place?


Re: [PATCH v3] proc/kcore: fix invalid memory access in multi-page read optimization

2018-09-04 Thread Omar Sandoval
On Wed, Sep 05, 2018 at 12:38:22AM +0200, Dominique Martinet wrote:
> The 'm' kcore_list item could point to kclist_head, and it is incorrect to
> look at m->addr / m->size in this case.
> There is no choice but to run through the list of entries for every address
> if we did not find any entry in the previous iteration
> 
> Reset 'm' to NULL in that case at Omar Sandoval's suggestion.
> 
> Fixes: bf991c2231117 ("proc/kcore: optimize multiple page reads")

Reviewed-by: Omar Sandoval 

Thanks again for catching this!

> Signed-off-by: Dominique Martinet 
> ---
> 
> Sorry, resent v2 because From didn't match sob tag
> 
>  fs/proc/kcore.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
> index ad72261ee3fe..578926032880 100644
> --- a/fs/proc/kcore.c
> +++ b/fs/proc/kcore.c
> @@ -464,6 +464,7 @@ read_kcore(struct file *file, char __user *buffer, size_t 
> buflen, loff_t *fpos)
>   ret = -EFAULT;
>   goto out;
>   }
> + m = NULL;
>   } else if (m->type == KCORE_VMALLOC) {
>   vread(buf, (char *)start, tsz);
>   /* we have to zero-fill user buffer even if no read */
> -- 
> 2.17.1
> 


Re: [PATCH] proc/kcore: fix invalid memory access in multi-page read optimization

2018-09-04 Thread Omar Sandoval
On Wed, Aug 29, 2018 at 06:04:07AM +0200, Dominique Martinet wrote:
> The 'm' kcore_list item can point to kclist_head, and it is incorrect to
> look at m->addr / m->size in this case.
> There is no choice but to run through the list of entries for every address
> if we did not find any entry in the previous iteration
> 
> Fixes: bf991c2231117 ("proc/kcore: optimize multiple page reads")
> Signed-off-by: Dominique Martinet 
> ---
> 
> I guess now I'm looking at bf991c2231117 again that it would be slightly
> more efficient to remove the !m check and initialize m to point to
> kclist_head like this:
>  m = list_entry(&kclist_head, struct kcore_list, list);
> but it feels a bit forced to me; deferring the choice to others.

Good catch! Sorry I missed this last week, Google decided this was spam
for some reason. How about fixing it like this? One less conditional in
the common case, no hacky list_entry :)

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index ad72261ee3fe..578926032880 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -464,6 +464,7 @@ read_kcore(struct file *file, char __user *buffer, size_t 
buflen, loff_t *fpos)
ret = -EFAULT;
goto out;
}
+   m = NULL;
} else if (m->type == KCORE_VMALLOC) {
vread(buf, (char *)start, tsz);
/* we have to zero-fill user buffer even if no read */

>  fs/proc/kcore.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
> index ad72261ee3fe..50036f6e1f52 100644
> --- a/fs/proc/kcore.c
> +++ b/fs/proc/kcore.c
> @@ -451,7 +451,8 @@ read_kcore(struct file *file, char __user *buffer, size_t 
> buflen, loff_t *fpos)
>* If this is the first iteration or the address is not within
>* the previous entry, search for a matching entry.
>*/
> - if (!m || start < m->addr || start >= m->addr + m->size) {
> + if (!m || >list == _head || start < m->addr ||
> + if (!m || &m->list == &kclist_head || start < m->addr ||
> + start >= m->addr + m->size) {
>   list_for_each_entry(m, &kclist_head, list) {
>   start < m->addr + m->size)
> -- 
> 2.17.1
> 


[PATCH v4 1/9] proc/kcore: don't grab lock for kclist_add()

2018-07-25 Thread Omar Sandoval
From: Omar Sandoval 

kclist_add() is only called at init time, so there's no point in
grabbing any locks. We're also going to replace the rwlock with a rwsem,
which we don't want to try grabbing during early boot.

While we're here, mark kclist_add() with __init so that we'll get a
warning if it's called from non-init code.

Reviewed-by: Andrew Morton 
Signed-off-by: Omar Sandoval 
---
 fs/proc/kcore.c   | 7 +++
 include/linux/kcore.h | 2 +-
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 66c373230e60..b0b9a76f28d6 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -62,16 +62,15 @@ static LIST_HEAD(kclist_head);
 static DEFINE_RWLOCK(kclist_lock);
 static int kcore_need_update = 1;
 
-void
-kclist_add(struct kcore_list *new, void *addr, size_t size, int type)
+/* This doesn't grab kclist_lock, so it should only be used at init time. */
+void __init kclist_add(struct kcore_list *new, void *addr, size_t size,
+  int type)
 {
new->addr = (unsigned long)addr;
new->size = size;
new->type = type;
 
-   write_lock(&kclist_lock);
list_add_tail(&new->list, &kclist_head);
-   write_unlock(&kclist_lock);
 }
 
 static size_t get_kcore_size(int *nphdr, size_t *elf_buflen)
diff --git a/include/linux/kcore.h b/include/linux/kcore.h
index 8de55e4b5ee9..c20f296438fb 100644
--- a/include/linux/kcore.h
+++ b/include/linux/kcore.h
@@ -35,7 +35,7 @@ struct vmcoredd_node {
 };
 
 #ifdef CONFIG_PROC_KCORE
-extern void kclist_add(struct kcore_list *, void *, size_t, int type);
+void __init kclist_add(struct kcore_list *, void *, size_t, int type);
 #else
 static inline
 void kclist_add(struct kcore_list *new, void *addr, size_t size, int type)
-- 
2.18.0



[PATCH v4 3/9] proc/kcore: replace kclist_lock rwlock with rwsem

2018-07-25 Thread Omar Sandoval
From: Omar Sandoval 

Now we only need kclist_lock from user context and at fs init time, and
the following changes need to sleep while holding the kclist_lock.

Signed-off-by: Omar Sandoval 
---
 fs/proc/kcore.c | 20 ++--
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index e83f15a4f66d..ae43a97d511d 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -59,7 +59,7 @@ struct memelfnote
 };
 
 static LIST_HEAD(kclist_head);
-static DEFINE_RWLOCK(kclist_lock);
+static DECLARE_RWSEM(kclist_lock);
 static int kcore_need_update = 1;
 
 /* This doesn't grab kclist_lock, so it should only be used at init time. */
@@ -117,7 +117,7 @@ static void __kcore_update_ram(struct list_head *list)
struct kcore_list *tmp, *pos;
LIST_HEAD(garbage);
 
-   write_lock(&kclist_lock);
+   down_write(&kclist_lock);
if (xchg(&kcore_need_update, 0)) {
list_for_each_entry_safe(pos, tmp, &kclist_head, list) {
if (pos->type == KCORE_RAM
@@ -128,7 +128,7 @@ static void __kcore_update_ram(struct list_head *list)
} else
list_splice(list, &garbage);
proc_root_kcore->size = get_kcore_size(&nphdr, &size);
-   write_unlock(&kclist_lock);
+   up_write(&kclist_lock);
 
free_kclist_ents(&garbage);
 }
@@ -451,11 +451,11 @@ read_kcore(struct file *file, char __user *buffer, size_t 
buflen, loff_t *fpos)
int nphdr;
unsigned long start;
 
-   read_lock(&kclist_lock);
+   down_read(&kclist_lock);
size = get_kcore_size(&nphdr, &elf_buflen);
 
if (buflen == 0 || *fpos >= size) {
-   read_unlock(&kclist_lock);
+   up_read(&kclist_lock);
return 0;
}
 
@@ -472,11 +472,11 @@ read_kcore(struct file *file, char __user *buffer, size_t 
buflen, loff_t *fpos)
tsz = buflen;
elf_buf = kzalloc(elf_buflen, GFP_ATOMIC);
if (!elf_buf) {
-   read_unlock(&kclist_lock);
+   up_read(&kclist_lock);
return -ENOMEM;
}
elf_kcore_store_hdr(elf_buf, nphdr, elf_buflen);
-   read_unlock(&kclist_lock);
+   up_read(&kclist_lock);
if (copy_to_user(buffer, elf_buf + *fpos, tsz)) {
kfree(elf_buf);
return -EFAULT;
@@ -491,7 +491,7 @@ read_kcore(struct file *file, char __user *buffer, size_t 
buflen, loff_t *fpos)
if (buflen == 0)
return acc;
} else
-   read_unlock(&kclist_lock);
+   up_read(&kclist_lock);
 
/*
 * Check to see if our file offset matches with any of
@@ -504,12 +504,12 @@ read_kcore(struct file *file, char __user *buffer, size_t 
buflen, loff_t *fpos)
while (buflen) {
struct kcore_list *m;
 
-   read_lock(&kclist_lock);
+   down_read(&kclist_lock);
list_for_each_entry(m, &kclist_head, list) {
if (start >= m->addr && start < (m->addr+m->size))
break;
}
-   read_unlock(&kclist_lock);
+   up_read(&kclist_lock);
 
if (&m->list == &kclist_head) {
if (clear_user(buffer, tsz))
-- 
2.18.0



[PATCH v4 5/9] proc/kcore: hold lock during read

2018-07-25 Thread Omar Sandoval
From: Omar Sandoval 

Now that we're using an rwsem, we can hold it during the entirety of
read_kcore() and have a common return path. This is preparation for the
next change.

Signed-off-by: Omar Sandoval 
---
 fs/proc/kcore.c | 70 -
 1 file changed, 40 insertions(+), 30 deletions(-)

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 95aa988c5b5d..dc34642bbdb7 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -440,19 +440,18 @@ static ssize_t
 read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 {
char *buf = file->private_data;
-   ssize_t acc = 0;
size_t size, tsz;
size_t elf_buflen;
int nphdr;
unsigned long start;
+   size_t orig_buflen = buflen;
+   int ret = 0;
 
down_read(&kclist_lock);
size = get_kcore_size(&nphdr, &elf_buflen);
 
-   if (buflen == 0 || *fpos >= size) {
-   up_read(&kclist_lock);
-   return 0;
-   }
+   if (buflen == 0 || *fpos >= size)
+   goto out;
 
/* trim buflen to not go beyond EOF */
if (buflen > size - *fpos)
@@ -465,28 +464,26 @@ read_kcore(struct file *file, char __user *buffer, size_t 
buflen, loff_t *fpos)
tsz = elf_buflen - *fpos;
if (buflen < tsz)
tsz = buflen;
-   elf_buf = kzalloc(elf_buflen, GFP_ATOMIC);
+   elf_buf = kzalloc(elf_buflen, GFP_KERNEL);
if (!elf_buf) {
-   up_read(&kclist_lock);
-   return -ENOMEM;
+   ret = -ENOMEM;
+   goto out;
}
elf_kcore_store_hdr(elf_buf, nphdr, elf_buflen);
-   up_read(&kclist_lock);
if (copy_to_user(buffer, elf_buf + *fpos, tsz)) {
kfree(elf_buf);
-   return -EFAULT;
+   ret = -EFAULT;
+   goto out;
}
kfree(elf_buf);
buflen -= tsz;
*fpos += tsz;
buffer += tsz;
-   acc += tsz;
 
/* leave now if filled buffer already */
if (buflen == 0)
-   return acc;
-   } else
-   up_read(&kclist_lock);
+   goto out;
+   }
 
/*
 * Check to see if our file offset matches with any of
@@ -499,25 +496,29 @@ read_kcore(struct file *file, char __user *buffer, size_t 
buflen, loff_t *fpos)
while (buflen) {
struct kcore_list *m;
 
-   down_read(&kclist_lock);
list_for_each_entry(m, &kclist_head, list) {
if (start >= m->addr && start < (m->addr+m->size))
break;
}
-   up_read(&kclist_lock);

if (&m->list == &kclist_head) {
-   if (clear_user(buffer, tsz))
-   return -EFAULT;
+   if (clear_user(buffer, tsz)) {
+   ret = -EFAULT;
+   goto out;
+   }
} else if (m->type == KCORE_VMALLOC) {
vread(buf, (char *)start, tsz);
/* we have to zero-fill user buffer even if no read */
-   if (copy_to_user(buffer, buf, tsz))
-   return -EFAULT;
+   if (copy_to_user(buffer, buf, tsz)) {
+   ret = -EFAULT;
+   goto out;
+   }
} else if (m->type == KCORE_USER) {
/* User page is handled prior to normal kernel page: */
-   if (copy_to_user(buffer, (char *)start, tsz))
-   return -EFAULT;
+   if (copy_to_user(buffer, (char *)start, tsz)) {
+   ret = -EFAULT;
+   goto out;
+   }
} else {
if (kern_addr_valid(start)) {
/*
@@ -525,26 +526,35 @@ read_kcore(struct file *file, char __user *buffer, size_t 
buflen, loff_t *fpos)
 * hardened user copy kernel text checks.
 */
if (probe_kernel_read(buf, (void *) start, 
tsz)) {
-   if (clear_user(buffer, tsz))
-   return -EFAULT;
+   if (clear_user(buffer, tsz)) {
+   ret = -EFAULT;
+   goto out;
+   }
} else {
-
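
Holding the rwsem for the whole of read_kcore() is what makes the single exit
path possible: every failure now jumps to one label that drops the lock, and
the final return value is derived from how much of the buffer was consumed,
which is what the new orig_buflen tracks. A compilable userspace sketch of that
structure, with made-up stand-ins (a pthread rwlock and a trivial "read") in
place of the kernel objects:

/* single_exit.c - a sketch of the "one out label" structure, not kernel
 * code: the lock is taken once, every failure funnels through out, and
 * the return value is ret on error or the number of bytes produced.
 */
#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>

static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;

static ssize_t read_thing(char *dst, size_t buflen)
{
        size_t orig_buflen = buflen;
        int ret = 0;

        pthread_rwlock_rdlock(&lock);

        if (buflen == 0)
                goto out;               /* early exit, still one unlock */

        if (buflen > 64) {              /* stand-in for any failure */
                ret = -EINVAL;
                goto out;
        }

        memset(dst, 'x', buflen);       /* "copy" the data out */
        buflen = 0;
out:
        pthread_rwlock_unlock(&lock);   /* the single unlock site */
        return ret ? ret : (ssize_t)(orig_buflen - buflen);
}

int main(void)
{
        char buf[64];

        printf("%zd\n", read_thing(buf, sizeof(buf)));   /* 64 */
        printf("%zd\n", read_thing(buf, 100));           /* -22 (EINVAL) */
        return 0;
}
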

[PATCH v4 7/9] proc/kcore: optimize multiple page reads

2018-07-25 Thread Omar Sandoval
From: Omar Sandoval 

The current code does a full search of the segment list every time for
every page. This is wasteful, since it's almost certain that the next
page will be in the same segment. Instead, check if the previous segment
covers the current page before doing the list search.

Signed-off-by: Omar Sandoval 
---
 fs/proc/kcore.c | 14 +++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 808ef9afd084..758c14e46a44 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -428,10 +428,18 @@ read_kcore(struct file *file, char __user *buffer, size_t 
buflen, loff_t *fpos)
if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen)
tsz = buflen;
 
+   m = NULL;
while (buflen) {
-   list_for_each_entry(m, &kclist_head, list) {
-   if (start >= m->addr && start < (m->addr+m->size))
-   break;
+   /*
+* If this is the first iteration or the address is not within
+* the previous entry, search for a matching entry.
+*/
+   if (!m || start < m->addr || start >= m->addr + m->size) {
+   list_for_each_entry(m, &kclist_head, list) {
+   if (start >= m->addr &&
+   start < m->addr + m->size)
+   break;
+   }
}
 
if (&m->list == &kclist_head) {
-- 
2.18.0
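
This is straightforward lookup caching: consecutive pages of a read almost
always fall in the same kcore_list entry, so checking the previous match first
turns the per-page list walk into a constant-time hit in the common case. A
self-contained sketch of the same shape, with illustrative segment values
rather than anything from the kernel:

/* seg_cache.c - remember the last matching segment and only fall back to
 * the linear search when the next address falls outside it.
 */
#include <stddef.h>
#include <stdio.h>

struct seg {
        unsigned long addr;
        unsigned long size;
};

static const struct seg segs[] = {
        { 0x1000, 0x4000 },
        { 0x8000, 0x2000 },
        { 0xc000, 0x1000 },
};

static const struct seg *find_seg(unsigned long addr, const struct seg *hint)
{
        /* Fast path: the previous hit still covers this address. */
        if (hint && addr >= hint->addr && addr < hint->addr + hint->size)
                return hint;

        /* Slow path: the linear search done for every page before this patch. */
        for (size_t i = 0; i < sizeof(segs) / sizeof(segs[0]); i++) {
                if (addr >= segs[i].addr && addr < segs[i].addr + segs[i].size)
                        return &segs[i];
        }
        return NULL;
}

int main(void)
{
        const struct seg *m = NULL;

        /* Consecutive pages usually hit the cached segment. */
        for (unsigned long addr = 0x1000; addr <= 0x8000; addr += 0x1000) {
                m = find_seg(addr, m);
                printf("0x%lx -> %s\n", addr, m ? "found" : "hole");
        }
        return 0;
}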



[PATCH v4 2/9] proc/kcore: don't grab lock for memory hotplug notifier

2018-07-25 Thread Omar Sandoval
From: Omar Sandoval 

The memory hotplug notifier kcore_callback() only needs kclist_lock to
prevent races with __kcore_update_ram(), but we can easily eliminate
that race by using an atomic xchg() in __kcore_update_ram(). This is
preparation for converting kclist_lock to an rwsem.

Reviewed-by: Andrew Morton 
Signed-off-by: Omar Sandoval 
---
 fs/proc/kcore.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index b0b9a76f28d6..e83f15a4f66d 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -118,7 +118,7 @@ static void __kcore_update_ram(struct list_head *list)
LIST_HEAD(garbage);
 
write_lock(&kclist_lock);
-   if (kcore_need_update) {
+   if (xchg(&kcore_need_update, 0)) {
list_for_each_entry_safe(pos, tmp, &kclist_head, list) {
if (pos->type == KCORE_RAM
|| pos->type == KCORE_VMEMMAP)
@@ -127,7 +127,6 @@ static void __kcore_update_ram(struct list_head *list)
list_splice_tail(list, &kclist_head);
} else
list_splice(list, &garbage);
-   kcore_need_update = 0;
proc_root_kcore->size = get_kcore_size(&nphdr, &size);
write_unlock(&kclist_lock);
 
@@ -593,9 +592,8 @@ static int __meminit kcore_callback(struct notifier_block 
*self,
switch (action) {
case MEM_ONLINE:
case MEM_OFFLINE:
-   write_lock(&kclist_lock);
kcore_need_update = 1;
-   write_unlock(&kclist_lock);
+   break;
}
return NOTIFY_OK;
 }
-- 
2.18.0
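
The xchg() turns kcore_need_update into a test-and-clear flag: the hotplug
notifier only ever stores 1, and whichever opener atomically swaps the flag to
0 owns the rebuild, so the notifier no longer needs the list lock. A compilable
C11 sketch of the idiom, with userspace atomics standing in for the kernel's
xchg():

/* need_update.c - illustrative only; the function names in the comments
 * refer to the kernel code above, the code itself is a userspace analogy.
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int need_update = 1;

static void hotplug_event(void)                 /* like kcore_callback() */
{
        atomic_store(&need_update, 1);          /* set only, no lock */
}

static void update_if_needed(const char *who)   /* like kcore_update_ram() */
{
        if (!atomic_exchange(&need_update, 0)) {
                printf("%s: nothing to do\n", who);
                return;
        }
        printf("%s: rebuilding the list\n", who);
}

int main(void)
{
        update_if_needed("open A");   /* wins the flag set at init */
        update_if_needed("open B");   /* sees it already cleared */
        hotplug_event();
        update_if_needed("open C");   /* picks up the new event */
        return 0;
}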



[PATCH v4 6/9] proc/kcore: clean up ELF header generation

2018-07-25 Thread Omar Sandoval
From: Omar Sandoval 

Currently, the ELF file header, program headers, and note segment are
allocated all at once, in some icky code dating back to 2.3. Programs
tend to read the file header, then the program headers, then the note
segment, all separately, so this is a waste of effort. It's cleaner and
more efficient to handle the three separately.

Signed-off-by: Omar Sandoval 
---
 fs/proc/kcore.c | 350 +++-
 1 file changed, 141 insertions(+), 209 deletions(-)

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index dc34642bbdb7..808ef9afd084 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -49,15 +49,6 @@ static struct proc_dir_entry *proc_root_kcore;
 #definekc_offset_to_vaddr(o) ((o) + PAGE_OFFSET)
 #endif
 
-/* An ELF note in memory */
-struct memelfnote
-{
-   const char *name;
-   int type;
-   unsigned int datasz;
-   void *data;
-};
-
 static LIST_HEAD(kclist_head);
 static DECLARE_RWSEM(kclist_lock);
 static int kcore_need_update = 1;
@@ -73,7 +64,8 @@ void __init kclist_add(struct kcore_list *new, void *addr, 
size_t size,
list_add_tail(&new->list, &kclist_head);
 }
 
-static size_t get_kcore_size(int *nphdr, size_t *elf_buflen)
+static size_t get_kcore_size(int *nphdr, size_t *phdrs_len, size_t *notes_len,
+size_t *data_offset)
 {
size_t try, size;
struct kcore_list *m;
@@ -87,15 +79,15 @@ static size_t get_kcore_size(int *nphdr, size_t *elf_buflen)
size = try;
*nphdr = *nphdr + 1;
}
-   *elf_buflen =   sizeof(struct elfhdr) + 
-   (*nphdr + 2)*sizeof(struct elf_phdr) + 
-   3 * ((sizeof(struct elf_note)) +
-roundup(sizeof(CORE_STR), 4)) +
-   roundup(sizeof(struct elf_prstatus), 4) +
-   roundup(sizeof(struct elf_prpsinfo), 4) +
-   roundup(arch_task_struct_size, 4);
-   *elf_buflen = PAGE_ALIGN(*elf_buflen);
-   return size + *elf_buflen;
+
+   *phdrs_len = *nphdr * sizeof(struct elf_phdr);
+   *notes_len = (3 * (sizeof(struct elf_note) + ALIGN(sizeof(CORE_STR), 
4)) +
+ ALIGN(sizeof(struct elf_prstatus), 4) +
+ ALIGN(sizeof(struct elf_prpsinfo), 4) +
+ ALIGN(arch_task_struct_size, 4));
+   *data_offset = PAGE_ALIGN(sizeof(struct elfhdr) + *phdrs_len +
+ *notes_len);
+   return *data_offset + size;
 }
 
 #ifdef CONFIG_HIGHMEM
@@ -241,7 +233,7 @@ static int kcore_update_ram(void)
LIST_HEAD(list);
LIST_HEAD(garbage);
int nphdr;
-   size_t size;
+   size_t phdrs_len, notes_len, data_offset;
struct kcore_list *tmp, *pos;
int ret = 0;
 
@@ -263,7 +255,8 @@ static int kcore_update_ram(void)
}
list_splice_tail(, _head);
 
-   proc_root_kcore->size = get_kcore_size(&nphdr, &size);
+   proc_root_kcore->size = get_kcore_size(&nphdr, &phdrs_len, &notes_len,
+  &data_offset);
 
 out:
up_write(&kclist_lock);
@@ -274,228 +267,168 @@ static int kcore_update_ram(void)
return ret;
 }
 
-/*/
-/*
- * determine size of ELF note
- */
-static int notesize(struct memelfnote *en)
+static void append_kcore_note(char *notes, size_t *i, const char *name,
+ unsigned int type, const void *desc,
+ size_t descsz)
 {
-   int sz;
-
-   sz = sizeof(struct elf_note);
-   sz += roundup((strlen(en->name) + 1), 4);
-   sz += roundup(en->datasz, 4);
-
-   return sz;
-} /* end notesize() */
-
-/*/
-/*
- * store a note in the header buffer
- */
-static char *storenote(struct memelfnote *men, char *bufp)
-{
-   struct elf_note en;
-
-#define DUMP_WRITE(addr,nr) do { memcpy(bufp,addr,nr); bufp += nr; } while(0)
-
-   en.n_namesz = strlen(men->name) + 1;
-   en.n_descsz = men->datasz;
-   en.n_type = men->type;
-
-   DUMP_WRITE(&en, sizeof(en));
-   DUMP_WRITE(men->name, en.n_namesz);
-
-   /* XXX - cast from long long to long to avoid need for libgcc.a */
-   bufp = (char*) roundup((unsigned long)bufp,4);
-   DUMP_WRITE(men->data, men->datasz);
-   bufp = (char*) roundup((unsigned long)bufp,4);
-
-#undef DUMP_WRITE
-
-   return bufp;
-} /* end storenote() */
-
-/*
- * store an ELF coredump header in the supplied buffer
- * nphdr is the number of elf_phdr to insert
- */
-static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
-{
-   struct elf_prstatus prstatus;   /* NT_PRSTATUS */
-   struct elf_prpsinfo prpsinfo;   /* NT_PRPSINFO */
-   struct elf_phdr *nhdr, *phdr;
-   struct elfhdr *elf;
-   struct memelfnote n
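
The replacement code builds the notes with a small append helper instead of the
old memelfnote/storenote machinery. The layout being produced is the standard
ELF note format: a note header, the NUL-terminated name padded to a 4-byte
boundary, then the descriptor padded the same way, which is what the
ALIGN(..., 4) terms in the new get_kcore_size() are budgeting for. A compilable
sketch of that packing, assuming 64-bit note headers and using illustrative
values; it is not the kernel's append_kcore_note(), which advances a
caller-provided offset instead of returning one:

/* note_pack.c - pack one ELF note: header, padded name, padded desc. */
#include <elf.h>
#include <stdio.h>
#include <string.h>

static size_t append_note(char *buf, size_t i, const char *name,
                          unsigned int type, const void *desc, size_t descsz)
{
        Elf64_Nhdr nhdr = {
                .n_namesz = (Elf64_Word)(strlen(name) + 1),
                .n_descsz = (Elf64_Word)descsz,
                .n_type   = type,
        };

        memcpy(buf + i, &nhdr, sizeof(nhdr));
        i += sizeof(nhdr);
        memcpy(buf + i, name, nhdr.n_namesz);
        i = (i + nhdr.n_namesz + 3) & ~(size_t)3;   /* pad name to 4 bytes */
        memcpy(buf + i, desc, descsz);
        i = (i + descsz + 3) & ~(size_t)3;          /* pad desc to 4 bytes */
        return i;
}

int main(void)
{
        char buf[512] = {0};
        char prstatus[336] = {0};   /* stand-in for struct elf_prstatus */
        size_t len = append_note(buf, 0, "CORE", NT_PRSTATUS,
                                 prstatus, sizeof(prstatus));

        printf("NT_PRSTATUS note occupies %zu bytes\n", len);
        return 0;
}
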

[PATCH v4 9/9] proc/kcore: add vmcoreinfo note to /proc/kcore

2018-07-25 Thread Omar Sandoval
From: Omar Sandoval 

The vmcoreinfo information is useful for runtime debugging tools, not
just for crash dumps. A lot of this information can be determined by
other means, but this is much more convenient, and it only adds a page
at most to the file.

Signed-off-by: Omar Sandoval 
---
 fs/proc/Kconfig|  1 +
 fs/proc/kcore.c| 18 --
 include/linux/crash_core.h |  2 ++
 kernel/crash_core.c|  4 ++--
 4 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 0eaeb41453f5..817c02b13b1d 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -31,6 +31,7 @@ config PROC_FS
 config PROC_KCORE
bool "/proc/kcore support" if !ARM
depends on PROC_FS && MMU
+   select CRASH_CORE
help
  Provides a virtual ELF core file of the live kernel.  This can
  be read with gdb and other ELF tools.  No modifications can be
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 758c14e46a44..80464432dfe6 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -10,6 +10,7 @@
  * Safe accesses to vmalloc/direct-mapped discontiguous areas, Kanoj 
Sarcar 
  */
 
+#include <linux/crash_core.h>
 #include 
 #include 
 #include 
@@ -81,10 +82,13 @@ static size_t get_kcore_size(int *nphdr, size_t *phdrs_len, 
size_t *notes_len,
}
 
*phdrs_len = *nphdr * sizeof(struct elf_phdr);
-   *notes_len = (3 * (sizeof(struct elf_note) + ALIGN(sizeof(CORE_STR), 
4)) +
+   *notes_len = (4 * sizeof(struct elf_note) +
+ 3 * ALIGN(sizeof(CORE_STR), 4) +
+ VMCOREINFO_NOTE_NAME_BYTES +
  ALIGN(sizeof(struct elf_prstatus), 4) +
  ALIGN(sizeof(struct elf_prpsinfo), 4) +
- ALIGN(arch_task_struct_size, 4));
+ ALIGN(arch_task_struct_size, 4) +
+ ALIGN(vmcoreinfo_size, 4));
*data_offset = PAGE_ALIGN(sizeof(struct elfhdr) + *phdrs_len +
  *notes_len);
return *data_offset + size;
@@ -406,6 +410,16 @@ read_kcore(struct file *file, char __user *buffer, size_t 
buflen, loff_t *fpos)
  sizeof(prpsinfo));
append_kcore_note(notes, &i, CORE_STR, NT_TASKSTRUCT, current,
  arch_task_struct_size);
+   /*
+* vmcoreinfo_size is mostly constant after init time, but it
+* can be changed by crash_save_vmcoreinfo(). Racing here with a
+* panic on another CPU before the machine goes down is insanely
+* unlikely, but it's better to not leave potential buffer
+* overflows lying around, regardless.
+*/
+   append_kcore_note(notes, &i, VMCOREINFO_NOTE_NAME, 0,
+ vmcoreinfo_data,
+ min(vmcoreinfo_size, notes_len - i));
 
tsz = min_t(size_t, buflen, notes_offset + notes_len - *fpos);
if (copy_to_user(buffer, notes + *fpos - notes_offset, tsz)) {
diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index b511f6d24b42..525510a9f965 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -60,6 +60,8 @@ phys_addr_t paddr_vmcoreinfo_note(void);
 #define VMCOREINFO_CONFIG(name) \
vmcoreinfo_append_str("CONFIG_%s=y\n", #name)
 
+extern unsigned char *vmcoreinfo_data;
+extern size_t vmcoreinfo_size;
 extern u32 *vmcoreinfo_note;
 
 Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index e7b4025c7b24..a683bfb0530f 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -14,8 +14,8 @@
 #include 
 
 /* vmcoreinfo stuff */
-static unsigned char *vmcoreinfo_data;
-static size_t vmcoreinfo_size;
+unsigned char *vmcoreinfo_data;
+size_t vmcoreinfo_size;
 u32 *vmcoreinfo_note;
 
 /* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */
-- 
2.18.0
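
With this change the PT_NOTE segment of /proc/kcore carries a note named
VMCOREINFO with the same text payload that crash dumps get, so tools can read
symbol addresses, struct offsets, and the like from the running kernel;
readelf -n /proc/kcore should list it. A minimal reader, as a sketch: it
assumes a 64-bit ELF /proc/kcore, does only minimal error handling, and needs
whatever privilege the system requires to open the file.

/* kcore-vmcoreinfo.c - dump the VMCOREINFO note from /proc/kcore. */
#include <elf.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        Elf64_Ehdr ehdr;
        Elf64_Phdr *phdrs;
        int fd = open("/proc/kcore", O_RDONLY);

        if (fd < 0) {
                perror("open /proc/kcore");
                return 1;
        }
        if (pread(fd, &ehdr, sizeof(ehdr), 0) != (ssize_t)sizeof(ehdr) ||
            memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0)
                return 1;

        phdrs = calloc(ehdr.e_phnum, sizeof(*phdrs));
        if (!phdrs || pread(fd, phdrs, ehdr.e_phnum * sizeof(*phdrs),
                            ehdr.e_phoff) < 0)
                return 1;

        for (int i = 0; i < ehdr.e_phnum; i++) {
                char *notes;
                size_t off = 0;

                if (phdrs[i].p_type != PT_NOTE)
                        continue;
                notes = malloc(phdrs[i].p_filesz);
                if (!notes || pread(fd, notes, phdrs[i].p_filesz,
                                    phdrs[i].p_offset) < 0)
                        return 1;
                /* Each note: header, padded name, padded descriptor. */
                while (off + sizeof(Elf64_Nhdr) <= phdrs[i].p_filesz) {
                        Elf64_Nhdr *n = (Elf64_Nhdr *)(notes + off);
                        char *name = notes + off + sizeof(*n);
                        char *desc = name + ((n->n_namesz + 3) & ~3U);

                        if (n->n_namesz == sizeof("VMCOREINFO") &&
                            memcmp(name, "VMCOREINFO", n->n_namesz) == 0)
                                fwrite(desc, 1, n->n_descsz, stdout);
                        off = (size_t)(desc - notes) +
                              ((n->n_descsz + 3) & ~3U);
                }
                free(notes);
        }
        return 0;
}

Built with a plain cc invocation and run with enough privilege to open
/proc/kcore, this prints the same key=value text the kernel packs via
vmcoreinfo_append_str().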



[PATCH v4 0/9] /proc/kcore improvements

2018-07-25 Thread Omar Sandoval
From: Omar Sandoval 

Hi,

This series makes a few improvements to /proc/kcore. It fixes a couple
of small issues in v3 but is otherwise the same. Patches 1, 2, and 3 are
prep patches. Patch 4 is a fix/cleanup. Patch 5 is another prep patch.
Patches 6 and 7 are optimizations to ->read(). Patch 8 makes it possible
to enable CRASH_CORE on any architecture, which is needed for patch 9.
Patch 9 adds vmcoreinfo to /proc/kcore.

Based on v4.18-rc6 + James' patch in the mm tree, and tested with crash
and readelf.

Thanks!

Changes from v3:

- Fixes a mixed up up_write() instead of up_read() in patch 5 reported
  by Tetsuo Handa
- Added patch 8 to fix a build failure reported by Stephen Rothwell

Changes from v2:

- Add __init to kclist_add() as per Andrew
- Got rid of conversion of kcore_need_update from int to atomic_t and
  just used xchg() instead of atomic_cmpxchg() (split out into a new
  patch instead of combining it with the rwlock -> rwsem conversion)
- Add comment about the increase in file size to patch 8

Changes from v1:

- Rebased onto v4.18-rc4 + James' patch
  (https://patchwork.kernel.org/patch/10519739/) in the mm tree
- Fix spurious sparse warning (see the report and response in
  https://patchwork.kernel.org/patch/10512431/)

Omar Sandoval (9):
  proc/kcore: don't grab lock for kclist_add()
  proc/kcore: don't grab lock for memory hotplug notifier
  proc/kcore: replace kclist_lock rwlock with rwsem
  proc/kcore: fix memory hotplug vs multiple opens race
  proc/kcore: hold lock during read
  proc/kcore: clean up ELF header generation
  proc/kcore: optimize multiple page reads
  crash_core: use VMCOREINFO_SYMBOL_ARRAY() for swapper_pg_dir
  proc/kcore: add vmcoreinfo note to /proc/kcore

 fs/proc/Kconfig|   1 +
 fs/proc/kcore.c| 534 +
 include/linux/crash_core.h |   2 +
 include/linux/kcore.h  |   2 +-
 kernel/crash_core.c|   6 +-
 5 files changed, 252 insertions(+), 293 deletions(-)

-- 
2.18.0



[PATCH v4 8/9] crash_core: use VMCOREINFO_SYMBOL_ARRAY() for swapper_pg_dir

2018-07-25 Thread Omar Sandoval
From: Omar Sandoval 

This is preparation for allowing CRASH_CORE to be enabled for any
architecture.

swapper_pg_dir is always either an array or a macro expanding to NULL.
In the latter case, VMCOREINFO_SYMBOL() won't work, as it tries to take
the address of the given symbol:

#define VMCOREINFO_SYMBOL(name) \
vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned 
long))

Instead, use VMCOREINFO_SYMBOL_ARRAY(), which uses the value:

#define VMCOREINFO_SYMBOL_ARRAY(name) \
vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned 
long)name)

This is the same thing for the array case but isn't an error for the
macro case.

Signed-off-by: Omar Sandoval 
---
 kernel/crash_core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index b66aced5e8c2..e7b4025c7b24 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -401,7 +401,7 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_SYMBOL(init_uts_ns);
VMCOREINFO_SYMBOL(node_online_map);
 #ifdef CONFIG_MMU
-   VMCOREINFO_SYMBOL(swapper_pg_dir);
+   VMCOREINFO_SYMBOL_ARRAY(swapper_pg_dir);
 #endif
VMCOREINFO_SYMBOL(_stext);
VMCOREINFO_SYMBOL(vmap_area_list);
-- 
2.18.0
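
The distinction the commit message describes is a plain C rule: an array name
is an lvalue whose address can be taken (the VMCOREINFO_SYMBOL case), and it
also decays to a pointer when cast (the VMCOREINFO_SYMBOL_ARRAY case), but a
macro expanding to NULL is not an lvalue, so taking its address is exactly the
compile error quoted in the linux-next report. A tiny standalone illustration
with hypothetical names mirroring the two configurations:

/* symbol_vs_array.c - why &name breaks for the "macro is NULL" case. */
#include <stddef.h>
#include <stdio.h>

typedef unsigned long pgd_t;

pgd_t real_pg_dir[4];                 /* the "array" configuration */
#define fake_pg_dir ((pgd_t *)NULL)   /* the "macro expands to NULL" configuration */

int main(void)
{
        /* VMCOREINFO_SYMBOL() style: takes the address of the symbol. */
        printf("SYMBOL-style: %lx\n", (unsigned long)&real_pg_dir);
        /* (unsigned long)&fake_pg_dir would not compile:
         * "lvalue required as unary '&' operand" */

        /* VMCOREINFO_SYMBOL_ARRAY() style: uses the value directly. */
        printf("ARRAY-style:  %lx\n", (unsigned long)real_pg_dir);
        printf("ARRAY-style:  %lx\n", (unsigned long)fake_pg_dir);
        return 0;
}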



[PATCH v4 4/9] proc/kcore: fix memory hotplug vs multiple opens race

2018-07-25 Thread Omar Sandoval
From: Omar Sandoval 

There's a theoretical race condition that will cause /proc/kcore to miss
a memory hotplug event:

CPU0  CPU1
// hotplug event 1
kcore_need_update = 1

open_kcore()  open_kcore()
kcore_update_ram()kcore_update_ram()
// Walk RAM   // Walk RAM
__kcore_update_ram()  __kcore_update_ram()
kcore_need_update = 0

// hotplug event 2
kcore_need_update = 1
  kcore_need_update = 0

Note that CPU1 set up the RAM kcore entries with the state after hotplug
event 1 but cleared the flag for hotplug event 2. The RAM entries will
therefore be stale until there is another hotplug event.

This is an extremely unlikely sequence of events, but the fix makes the
synchronization saner, anyways: we serialize the entire update sequence,
which means that whoever clears the flag will always succeed in
replacing the kcore list.

Signed-off-by: Omar Sandoval 
---
 fs/proc/kcore.c | 93 +++--
 1 file changed, 44 insertions(+), 49 deletions(-)

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index ae43a97d511d..95aa988c5b5d 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -98,53 +98,15 @@ static size_t get_kcore_size(int *nphdr, size_t *elf_buflen)
return size + *elf_buflen;
 }
 
-static void free_kclist_ents(struct list_head *head)
-{
-   struct kcore_list *tmp, *pos;
-
-   list_for_each_entry_safe(pos, tmp, head, list) {
-   list_del(&pos->list);
-   kfree(pos);
-   }
-}
-/*
- * Replace all KCORE_RAM/KCORE_VMEMMAP information with passed list.
- */
-static void __kcore_update_ram(struct list_head *list)
-{
-   int nphdr;
-   size_t size;
-   struct kcore_list *tmp, *pos;
-   LIST_HEAD(garbage);
-
-   down_write(&kclist_lock);
-   if (xchg(&kcore_need_update, 0)) {
-   list_for_each_entry_safe(pos, tmp, &kclist_head, list) {
-   if (pos->type == KCORE_RAM
-   || pos->type == KCORE_VMEMMAP)
-   list_move(&pos->list, &garbage);
-   }
-   list_splice_tail(list, &kclist_head);
-   } else
-   list_splice(list, &garbage);
-   proc_root_kcore->size = get_kcore_size(&nphdr, &size);
-   up_write(&kclist_lock);
-
-   free_kclist_ents(&garbage);
-}
-
-
 #ifdef CONFIG_HIGHMEM
 /*
  * If no highmem, we can assume [0...max_low_pfn) continuous range of memory
  * because memory hole is not as big as !HIGHMEM case.
  * (HIGHMEM is special because part of memory is _invisible_ from the kernel.)
  */
-static int kcore_update_ram(void)
+static int kcore_ram_list(struct list_head *head)
 {
-   LIST_HEAD(head);
struct kcore_list *ent;
-   int ret = 0;
 
ent = kmalloc(sizeof(*ent), GFP_KERNEL);
if (!ent)
@@ -152,9 +114,8 @@ static int kcore_update_ram(void)
ent->addr = (unsigned long)__va(0);
ent->size = max_low_pfn << PAGE_SHIFT;
ent->type = KCORE_RAM;
-   list_add(&ent->list, &head);
-   __kcore_update_ram(&head);
-   return ret;
+   list_add(&ent->list, head);
+   return 0;
 }
 
 #else /* !CONFIG_HIGHMEM */
@@ -253,11 +214,10 @@ kclist_add_private(unsigned long pfn, unsigned long 
nr_pages, void *arg)
return 1;
 }
 
-static int kcore_update_ram(void)
+static int kcore_ram_list(struct list_head *list)
 {
int nid, ret;
unsigned long end_pfn;
-   LIST_HEAD(head);
 
/* Not inializedupdate now */
/* find out "max pfn" */
@@ -269,15 +229,50 @@ static int kcore_update_ram(void)
end_pfn = node_end;
}
/* scan 0 to max_pfn */
-   ret = walk_system_ram_range(0, end_pfn, &head, kclist_add_private);
-   if (ret) {
-   free_kclist_ents(&head);
+   ret = walk_system_ram_range(0, end_pfn, list, kclist_add_private);
+   if (ret)
return -ENOMEM;
+   return 0;
+}
+#endif /* CONFIG_HIGHMEM */
+
+static int kcore_update_ram(void)
+{
+   LIST_HEAD(list);
+   LIST_HEAD(garbage);
+   int nphdr;
+   size_t size;
+   struct kcore_list *tmp, *pos;
+   int ret = 0;
+
+   down_write(&kclist_lock);
+   if (!xchg(&kcore_need_update, 0))
+   goto out;
+
+   ret = kcore_ram_list(&list);
+   if (ret) {
+   /* Couldn't get the RAM list, try again next time. */
+   WRITE_ONCE(kcore_need_update, 1);
+   list_splice_tail(&list, &garbage);
+   goto out;
+   }
+
+   list_for_each_entry_safe(pos, tmp, &kclist_head, list) {
+   if (pos->type == KCORE_RAM || pos->type == KCORE_VMEMMAP)
+   list_move(&pos->list, &garbage);
+   }
+   list_splice_tail(&list, &kclist_head);
+
+   proc_root_kcore->size = get_kcore_size(&nphdr, &size);
+
+out:
+   up_write(&kclist_lock);
+   list_for_each_entry_safe(pos, tmp, &garbage, list) {

Re: [PATCH v3 5/8] proc/kcore: hold lock during read

2018-07-25 Thread Omar Sandoval
On Wed, Jul 25, 2018 at 12:11:26AM +0900, Tetsuo Handa wrote:
> On 2018/07/19 7:58, Omar Sandoval wrote:
> > From: Omar Sandoval 
> > 
> > Now that we're using an rwsem, we can hold it during the entirety of
> > read_kcore() and have a common return path. This is preparation for the
> > next change.
> > 
> > Signed-off-by: Omar Sandoval 
> > ---
> >  fs/proc/kcore.c | 70 -
> >  1 file changed, 40 insertions(+), 30 deletions(-)
> > 
> > diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
> > index 95aa988c5b5d..e317ac890871 100644
> > --- a/fs/proc/kcore.c
> > +++ b/fs/proc/kcore.c
> > @@ -440,19 +440,18 @@ static ssize_t
> >  read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t 
> > *fpos)
> >  {
> > char *buf = file->private_data;
> > -   ssize_t acc = 0;
> > size_t size, tsz;
> > size_t elf_buflen;
> > int nphdr;
> > unsigned long start;
> > +   size_t orig_buflen = buflen;
> > +   int ret = 0;
> >  
> > down_read(&kclist_lock);
> 
> (...snipped...)
> 
> > +out:
> > +   up_write(&kclist_lock);
> 
> Oops. This needs to be up_read().

Yeah, thanks, I'll fix this and rerun my tests with lockdep.


Re: linux-next: build failure after merge of the akpm-current tree

2018-07-25 Thread Omar Sandoval
On Mon, Jul 23, 2018 at 07:42:31PM +1000, Stephen Rothwell wrote:
> Hi Andrew,
> 
> After merging the akpm tree, today's linux-next build (sparc (32 bit)
> defconfig) failed like this:
> 
> In file included from kernel/crash_core.c:9:0:
> kernel/crash_core.c: In function 'crash_save_vmcoreinfo_init':
> include/linux/crash_core.h:44:66: error: lvalue required as unary '&' operand
>   vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)&name)
>   ^
> kernel/crash_core.c:404:2: note: in expansion of macro 'VMCOREINFO_SYMBOL'
>   VMCOREINFO_SYMBOL(swapper_pg_dir);
>   ^
> 
> Caused by commit
> 
>   e527c514996a ("proc/kcore: add vmcoreinfo note to /proc/kcore")
> 
> It seems that sparc does not declare swapper_pg_dir for 32 but builds,
> they just define it to be NULL.  As do some others:
> 
> $ git grep -w 'define.*swapper_pg_dir'
> arch/arm/include/asm/pgtable-nommu.h:#define swapper_pg_dir ((pgd_t *) 0)
> arch/c6x/include/asm/pgtable.h:#define swapper_pg_dir ((pgd_t *) 0)
> arch/h8300/include/asm/pgtable.h:#define swapper_pg_dir ((pgd_t *) 0)
> arch/m68k/include/asm/pgtable_no.h:#define swapper_pg_dir ((pgd_t *) 0)
> arch/microblaze/include/asm/pgtable.h:#define swapper_pg_dir ((pgd_t *) NULL)
> arch/sparc/include/asm/pgtable_32.h:#define swapper_pg_dir NULL
> arch/xtensa/include/asm/pgtable.h:# define swapper_pg_dir NULL

Mm, I wrongly assumed that this would only be the case for !MMU. Looks
like it's always either NULL or an array, so VMCOREINFO_SYMBOL_ARRAY
should work here. I'll send a new version.


[PATCH v3 1/8] proc/kcore: don't grab lock for kclist_add()

2018-07-18 Thread Omar Sandoval
From: Omar Sandoval 

kclist_add() is only called at init time, so there's no point in
grabbing any locks. We're also going to replace the rwlock with a rwsem,
which we don't want to try grabbing during early boot.

While we're here, mark kclist_add() with __init so that we'll get a
warning if it's called from non-init code.

Signed-off-by: Omar Sandoval 
---
 fs/proc/kcore.c   | 7 +++
 include/linux/kcore.h | 2 +-
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 66c373230e60..b0b9a76f28d6 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -62,16 +62,15 @@ static LIST_HEAD(kclist_head);
 static DEFINE_RWLOCK(kclist_lock);
 static int kcore_need_update = 1;
 
-void
-kclist_add(struct kcore_list *new, void *addr, size_t size, int type)
+/* This doesn't grab kclist_lock, so it should only be used at init time. */
+void __init kclist_add(struct kcore_list *new, void *addr, size_t size,
+  int type)
 {
new->addr = (unsigned long)addr;
new->size = size;
new->type = type;
 
-   write_lock(&kclist_lock);
list_add_tail(&new->list, &kclist_head);
-   write_unlock(&kclist_lock);
 }
 
 static size_t get_kcore_size(int *nphdr, size_t *elf_buflen)
diff --git a/include/linux/kcore.h b/include/linux/kcore.h
index 8de55e4b5ee9..c20f296438fb 100644
--- a/include/linux/kcore.h
+++ b/include/linux/kcore.h
@@ -35,7 +35,7 @@ struct vmcoredd_node {
 };
 
 #ifdef CONFIG_PROC_KCORE
-extern void kclist_add(struct kcore_list *, void *, size_t, int type);
+void __init kclist_add(struct kcore_list *, void *, size_t, int type);
 #else
 static inline
 void kclist_add(struct kcore_list *new, void *addr, size_t size, int type)
-- 
2.18.0



[PATCH v3 0/8] /proc/kcore improvements

2018-07-18 Thread Omar Sandoval
From: Omar Sandoval 

Hi,

This series makes a few improvements to /proc/kcore. Patches 1, 2, and 3
are prep patches. Patch 4 is a fix/cleanup. Patch 5 is another prep
patch. Patches 6 and 7 are optimizations to ->read(). Patch 8 adds
vmcoreinfo to /proc/kcore.

Again, based on v4.18-rc4 + James' patch in the mm tree, and tested with
crash and readelf.

Thanks!

Changes from v2:

- Add __init to kclist_add() as per Andrew
- Got rid of conversion of kcore_need_update from int to atomic_t and
  just used xchg() instead of atomic_cmpxchg() (split out into a new
  patch instead of combining it with the rwlock -> rwsem conversion)
- Add comment about the increase in file size to patch 8

Changes from v1:

- Rebased onto v4.18-rc4 + James' patch
  (https://patchwork.kernel.org/patch/10519739/) in the mm tree
- Fix spurious sparse warning (see the report and response in
  https://patchwork.kernel.org/patch/10512431/)

Omar Sandoval (8):
  proc/kcore: don't grab lock for kclist_add()
  proc/kcore: don't grab lock for memory hotplug notifier
  proc/kcore: replace kclist_lock rwlock with rwsem
  proc/kcore: fix memory hotplug vs multiple opens race
  proc/kcore: hold lock during read
  proc/kcore: clean up ELF header generation
  proc/kcore: optimize multiple page reads
  proc/kcore: add vmcoreinfo note to /proc/kcore

 fs/proc/Kconfig|   1 +
 fs/proc/kcore.c| 534 +
 include/linux/crash_core.h |   2 +
 include/linux/kcore.h  |   2 +-
 kernel/crash_core.c|   4 +-
 5 files changed, 251 insertions(+), 292 deletions(-)

-- 
2.18.0



[PATCH v3 4/8] proc/kcore: fix memory hotplug vs multiple opens race

2018-07-18 Thread Omar Sandoval
From: Omar Sandoval 

There's a theoretical race condition that will cause /proc/kcore to miss
a memory hotplug event:

CPU0  CPU1
// hotplug event 1
kcore_need_update = 1

open_kcore()  open_kcore()
kcore_update_ram()kcore_update_ram()
// Walk RAM   // Walk RAM
__kcore_update_ram()  __kcore_update_ram()
kcore_need_update = 0

// hotplug event 2
kcore_need_update = 1
  kcore_need_update = 0

Note that CPU1 set up the RAM kcore entries with the state after hotplug
event 1 but cleared the flag for hotplug event 2. The RAM entries will
therefore be stale until there is another hotplug event.

This is an extremely unlikely sequence of events, but the fix makes the
synchronization saner, anyways: we serialize the entire update sequence,
which means that whoever clears the flag will always succeed in
replacing the kcore list.

Signed-off-by: Omar Sandoval 
---
 fs/proc/kcore.c | 93 +++--
 1 file changed, 44 insertions(+), 49 deletions(-)

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index ae43a97d511d..95aa988c5b5d 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -98,53 +98,15 @@ static size_t get_kcore_size(int *nphdr, size_t *elf_buflen)
return size + *elf_buflen;
 }
 
-static void free_kclist_ents(struct list_head *head)
-{
-   struct kcore_list *tmp, *pos;
-
-   list_for_each_entry_safe(pos, tmp, head, list) {
-   list_del(&pos->list);
-   kfree(pos);
-   }
-}
-/*
- * Replace all KCORE_RAM/KCORE_VMEMMAP information with passed list.
- */
-static void __kcore_update_ram(struct list_head *list)
-{
-   int nphdr;
-   size_t size;
-   struct kcore_list *tmp, *pos;
-   LIST_HEAD(garbage);
-
-   down_write(&kclist_lock);
-   if (xchg(&kcore_need_update, 0)) {
-   list_for_each_entry_safe(pos, tmp, &kclist_head, list) {
-   if (pos->type == KCORE_RAM
-   || pos->type == KCORE_VMEMMAP)
-   list_move(&pos->list, &garbage);
-   }
-   list_splice_tail(list, &kclist_head);
-   } else
-   list_splice(list, &garbage);
-   proc_root_kcore->size = get_kcore_size(&nphdr, &size);
-   up_write(&kclist_lock);
-
-   free_kclist_ents(&garbage);
-}
-
-
 #ifdef CONFIG_HIGHMEM
 /*
  * If no highmem, we can assume [0...max_low_pfn) continuous range of memory
  * because memory hole is not as big as !HIGHMEM case.
  * (HIGHMEM is special because part of memory is _invisible_ from the kernel.)
  */
-static int kcore_update_ram(void)
+static int kcore_ram_list(struct list_head *head)
 {
-   LIST_HEAD(head);
struct kcore_list *ent;
-   int ret = 0;
 
ent = kmalloc(sizeof(*ent), GFP_KERNEL);
if (!ent)
@@ -152,9 +114,8 @@ static int kcore_update_ram(void)
ent->addr = (unsigned long)__va(0);
ent->size = max_low_pfn << PAGE_SHIFT;
ent->type = KCORE_RAM;
-   list_add(&ent->list, &head);
-   __kcore_update_ram(&head);
-   return ret;
+   list_add(&ent->list, head);
+   return 0;
 }
 
 #else /* !CONFIG_HIGHMEM */
@@ -253,11 +214,10 @@ kclist_add_private(unsigned long pfn, unsigned long 
nr_pages, void *arg)
return 1;
 }
 
-static int kcore_update_ram(void)
+static int kcore_ram_list(struct list_head *list)
 {
int nid, ret;
unsigned long end_pfn;
-   LIST_HEAD(head);
 
/* Not inialized....update now */
/* find out "max pfn" */
@@ -269,15 +229,50 @@ static int kcore_update_ram(void)
end_pfn = node_end;
}
/* scan 0 to max_pfn */
-   ret = walk_system_ram_range(0, end_pfn, &head, kclist_add_private);
-   if (ret) {
-   free_kclist_ents(&head);
+   ret = walk_system_ram_range(0, end_pfn, list, kclist_add_private);
+   if (ret)
return -ENOMEM;
+   return 0;
+}
+#endif /* CONFIG_HIGHMEM */
+
+static int kcore_update_ram(void)
+{
+   LIST_HEAD(list);
+   LIST_HEAD(garbage);
+   int nphdr;
+   size_t size;
+   struct kcore_list *tmp, *pos;
+   int ret = 0;
+
+   down_write(&kclist_lock);
+   if (!xchg(&kcore_need_update, 0))
+   goto out;
+
+   ret = kcore_ram_list(&list);
+   if (ret) {
+   /* Couldn't get the RAM list, try again next time. */
+   WRITE_ONCE(kcore_need_update, 1);
+   list_splice_tail(&list, &garbage);
+   goto out;
+   }
+
+   list_for_each_entry_safe(pos, tmp, &kclist_head, list) {
+   if (pos->type == KCORE_RAM || pos->type == KCORE_VMEMMAP)
+   list_move(&pos->list, &garbage);
+   }
+   list_splice_tail(&list, &kclist_head);
+
+   proc_root_kcore->size = get_kcore_size(&nphdr, &size);
+
+out:
+   up_write(&kclist_lock);
+   list_for_each_entry_safe(pos, tmp, &garbage, list) {

[PATCH v3 6/8] proc/kcore: clean up ELF header generation

2018-07-18 Thread Omar Sandoval
From: Omar Sandoval 

Currently, the ELF file header, program headers, and note segment are
allocated all at once, in some icky code dating back to 2.3. Programs
tend to read the file header, then the program headers, then the note
segment, all separately, so this is a waste of effort. It's cleaner and
more efficient to handle the three separately.
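
For reference, a rough userspace sketch of the file layout the rework moves
to; the real computation lives in get_kcore_size(), and the constants and
struct names below are illustrative only.

#include <stddef.h>

#define EX_PAGE_SIZE 4096UL
#define EX_PAGE_ALIGN(x) (((x) + EX_PAGE_SIZE - 1) & ~(EX_PAGE_SIZE - 1))

struct kcore_offsets {
	size_t phdrs_offset;	/* program headers directly follow the ELF header */
	size_t notes_offset;	/* note segment follows the program headers */
	size_t data_offset;	/* memory contents start at the next page boundary */
};

static struct kcore_offsets compute_offsets(size_t ehdr_len, size_t phdrs_len,
					    size_t notes_len)
{
	struct kcore_offsets off;

	off.phdrs_offset = ehdr_len;
	off.notes_offset = off.phdrs_offset + phdrs_len;
	off.data_offset = EX_PAGE_ALIGN(off.notes_offset + notes_len);
	return off;
}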

Signed-off-by: Omar Sandoval 
---
 fs/proc/kcore.c | 350 +++-
 1 file changed, 141 insertions(+), 209 deletions(-)

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index e317ac890871..e2ca58d49938 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -49,15 +49,6 @@ static struct proc_dir_entry *proc_root_kcore;
#define kc_offset_to_vaddr(o) ((o) + PAGE_OFFSET)
 #endif
 
-/* An ELF note in memory */
-struct memelfnote
-{
-   const char *name;
-   int type;
-   unsigned int datasz;
-   void *data;
-};
-
 static LIST_HEAD(kclist_head);
 static DECLARE_RWSEM(kclist_lock);
 static int kcore_need_update = 1;
@@ -73,7 +64,8 @@ void __init kclist_add(struct kcore_list *new, void *addr, 
size_t size,
list_add_tail(&new->list, &kclist_head);
 }
 
-static size_t get_kcore_size(int *nphdr, size_t *elf_buflen)
+static size_t get_kcore_size(int *nphdr, size_t *phdrs_len, size_t *notes_len,
+size_t *data_offset)
 {
size_t try, size;
struct kcore_list *m;
@@ -87,15 +79,15 @@ static size_t get_kcore_size(int *nphdr, size_t *elf_buflen)
size = try;
*nphdr = *nphdr + 1;
}
-   *elf_buflen =   sizeof(struct elfhdr) + 
-   (*nphdr + 2)*sizeof(struct elf_phdr) + 
-   3 * ((sizeof(struct elf_note)) +
-roundup(sizeof(CORE_STR), 4)) +
-   roundup(sizeof(struct elf_prstatus), 4) +
-   roundup(sizeof(struct elf_prpsinfo), 4) +
-   roundup(arch_task_struct_size, 4);
-   *elf_buflen = PAGE_ALIGN(*elf_buflen);
-   return size + *elf_buflen;
+
+   *phdrs_len = *nphdr * sizeof(struct elf_phdr);
+   *notes_len = (3 * (sizeof(struct elf_note) + ALIGN(sizeof(CORE_STR), 
4)) +
+ ALIGN(sizeof(struct elf_prstatus), 4) +
+ ALIGN(sizeof(struct elf_prpsinfo), 4) +
+ ALIGN(arch_task_struct_size, 4));
+   *data_offset = PAGE_ALIGN(sizeof(struct elfhdr) + *phdrs_len +
+ *notes_len);
+   return *data_offset + size;
 }
 
 #ifdef CONFIG_HIGHMEM
@@ -241,7 +233,7 @@ static int kcore_update_ram(void)
LIST_HEAD(list);
LIST_HEAD(garbage);
int nphdr;
-   size_t size;
+   size_t phdrs_len, notes_len, data_offset;
struct kcore_list *tmp, *pos;
int ret = 0;
 
@@ -263,7 +255,8 @@ static int kcore_update_ram(void)
}
list_splice_tail(&list, &kclist_head);
 
-   proc_root_kcore->size = get_kcore_size(&nphdr, &size);
+   proc_root_kcore->size = get_kcore_size(&nphdr, &phdrs_len, &notes_len,
+  &data_offset);
 
 out:
up_write(&kclist_lock);
@@ -274,228 +267,168 @@ static int kcore_update_ram(void)
return ret;
 }
 
-/*/
-/*
- * determine size of ELF note
- */
-static int notesize(struct memelfnote *en)
+static void append_kcore_note(char *notes, size_t *i, const char *name,
+ unsigned int type, const void *desc,
+ size_t descsz)
 {
-   int sz;
-
-   sz = sizeof(struct elf_note);
-   sz += roundup((strlen(en->name) + 1), 4);
-   sz += roundup(en->datasz, 4);
-
-   return sz;
-} /* end notesize() */
-
-/*/
-/*
- * store a note in the header buffer
- */
-static char *storenote(struct memelfnote *men, char *bufp)
-{
-   struct elf_note en;
-
-#define DUMP_WRITE(addr,nr) do { memcpy(bufp,addr,nr); bufp += nr; } while(0)
-
-   en.n_namesz = strlen(men->name) + 1;
-   en.n_descsz = men->datasz;
-   en.n_type = men->type;
-
-   DUMP_WRITE(&en, sizeof(en));
-   DUMP_WRITE(men->name, en.n_namesz);
-
-   /* XXX - cast from long long to long to avoid need for libgcc.a */
-   bufp = (char*) roundup((unsigned long)bufp,4);
-   DUMP_WRITE(men->data, men->datasz);
-   bufp = (char*) roundup((unsigned long)bufp,4);
-
-#undef DUMP_WRITE
-
-   return bufp;
-} /* end storenote() */
-
-/*
- * store an ELF coredump header in the supplied buffer
- * nphdr is the number of elf_phdr to insert
- */
-static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
-{
-   struct elf_prstatus prstatus;   /* NT_PRSTATUS */
-   struct elf_prpsinfo prpsinfo;   /* NT_PRPSINFO */
-   struct elf_phdr *nhdr, *phdr;
-   struct elfhdr *elf;
-   struct memelfnote n

[PATCH v3 5/8] proc/kcore: hold lock during read

2018-07-18 Thread Omar Sandoval
From: Omar Sandoval 

Now that we're using an rwsem, we can hold it during the entirety of
read_kcore() and have a common return path. This is preparation for the
next change.
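
Roughly, the resulting shape is the usual single-exit pattern; a simplified
userspace sketch (illustrative names, not the kernel code):

#include <pthread.h>

static pthread_rwlock_t list_lock = PTHREAD_RWLOCK_INITIALIZER;

static long read_with_common_exit(char *dst, unsigned long len)
{
	long ret = 0;
	unsigned long copied = 0;

	pthread_rwlock_rdlock(&list_lock);
	while (len) {
		/* Copy one chunk; bail out through the common exit on error. */
		if (!dst) {
			ret = -1;	/* stands in for -EFAULT and friends */
			goto out;
		}
		copied++;
		len--;
	}
out:
	pthread_rwlock_unlock(&list_lock);
	return ret ? ret : (long)copied;
}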

Signed-off-by: Omar Sandoval 
---
 fs/proc/kcore.c | 70 -
 1 file changed, 40 insertions(+), 30 deletions(-)

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 95aa988c5b5d..e317ac890871 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -440,19 +440,18 @@ static ssize_t
 read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 {
char *buf = file->private_data;
-   ssize_t acc = 0;
size_t size, tsz;
size_t elf_buflen;
int nphdr;
unsigned long start;
+   size_t orig_buflen = buflen;
+   int ret = 0;
 
down_read(&kclist_lock);
size = get_kcore_size(&nphdr, &elf_buflen);
 
-   if (buflen == 0 || *fpos >= size) {
-   up_read(&kclist_lock);
-   return 0;
-   }
+   if (buflen == 0 || *fpos >= size)
+   goto out;
 
/* trim buflen to not go beyond EOF */
if (buflen > size - *fpos)
@@ -465,28 +464,26 @@ read_kcore(struct file *file, char __user *buffer, size_t 
buflen, loff_t *fpos)
tsz = elf_buflen - *fpos;
if (buflen < tsz)
tsz = buflen;
-   elf_buf = kzalloc(elf_buflen, GFP_ATOMIC);
+   elf_buf = kzalloc(elf_buflen, GFP_KERNEL);
if (!elf_buf) {
-   up_read(&kclist_lock);
-   return -ENOMEM;
+   ret = -ENOMEM;
+   goto out;
}
elf_kcore_store_hdr(elf_buf, nphdr, elf_buflen);
-   up_read(&kclist_lock);
if (copy_to_user(buffer, elf_buf + *fpos, tsz)) {
kfree(elf_buf);
-   return -EFAULT;
+   ret = -EFAULT;
+   goto out;
}
kfree(elf_buf);
buflen -= tsz;
*fpos += tsz;
buffer += tsz;
-   acc += tsz;
 
/* leave now if filled buffer already */
if (buflen == 0)
-   return acc;
-   } else
-   up_read(&kclist_lock);
+   goto out;
+   }
 
/*
 * Check to see if our file offset matches with any of
@@ -499,25 +496,29 @@ read_kcore(struct file *file, char __user *buffer, size_t 
buflen, loff_t *fpos)
while (buflen) {
struct kcore_list *m;
 
-   down_read(&kclist_lock);
list_for_each_entry(m, &kclist_head, list) {
if (start >= m->addr && start < (m->addr+m->size))
break;
}
-   up_read(&kclist_lock);

if (&m->list == &kclist_head) {
-   if (clear_user(buffer, tsz))
-   return -EFAULT;
+   if (clear_user(buffer, tsz)) {
+   ret = -EFAULT;
+   goto out;
+   }
} else if (m->type == KCORE_VMALLOC) {
vread(buf, (char *)start, tsz);
/* we have to zero-fill user buffer even if no read */
-   if (copy_to_user(buffer, buf, tsz))
-   return -EFAULT;
+   if (copy_to_user(buffer, buf, tsz)) {
+   ret = -EFAULT;
+   goto out;
+   }
} else if (m->type == KCORE_USER) {
/* User page is handled prior to normal kernel page: */
-   if (copy_to_user(buffer, (char *)start, tsz))
-   return -EFAULT;
+   if (copy_to_user(buffer, (char *)start, tsz)) {
+   ret = -EFAULT;
+   goto out;
+   }
} else {
if (kern_addr_valid(start)) {
/*
@@ -525,26 +526,35 @@ read_kcore(struct file *file, char __user *buffer, size_t 
buflen, loff_t *fpos)
 * hardened user copy kernel text checks.
 */
if (probe_kernel_read(buf, (void *) start, 
tsz)) {
-   if (clear_user(buffer, tsz))
-   return -EFAULT;
+   if (clear_user(buffer, tsz)) {
+   ret = -EFAULT;
+   goto out;
+   }
} else {
-

[PATCH v3 7/8] proc/kcore: optimize multiple page reads

2018-07-18 Thread Omar Sandoval
From: Omar Sandoval 

The current code does a full search of the segment list every time for
every page. This is wasteful, since it's almost certain that the next
page will be in the same segment. Instead, check if the previous segment
covers the current page before doing the list search.
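
A simplified userspace sketch of that lookup strategy (the array and struct
below stand in for the kcore list; names are illustrative):

#include <stddef.h>

struct seg {
	unsigned long addr;
	unsigned long size;
};

static const struct seg *find_seg(const struct seg *segs, size_t n,
				  const struct seg *prev, unsigned long addr)
{
	size_t i;

	/* Fast path: most consecutive reads stay within the previous segment. */
	if (prev && addr >= prev->addr && addr < prev->addr + prev->size)
		return prev;

	for (i = 0; i < n; i++) {
		if (addr >= segs[i].addr && addr < segs[i].addr + segs[i].size)
			return &segs[i];
	}
	return NULL;
}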

Signed-off-by: Omar Sandoval 
---
 fs/proc/kcore.c | 14 +++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index e2ca58d49938..25fefdd05ee5 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -428,10 +428,18 @@ read_kcore(struct file *file, char __user *buffer, size_t 
buflen, loff_t *fpos)
if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen)
tsz = buflen;
 
+   m = NULL;
while (buflen) {
-   list_for_each_entry(m, &kclist_head, list) {
-   if (start >= m->addr && start < (m->addr+m->size))
-   break;
+   /*
+* If this is the first iteration or the address is not within
+* the previous entry, search for a matching entry.
+*/
+   if (!m || start < m->addr || start >= m->addr + m->size) {
+   list_for_each_entry(m, &kclist_head, list) {
+   if (start >= m->addr &&
+   start < m->addr + m->size)
+   break;
+   }
}
 
if (&m->list == &kclist_head) {
-- 
2.18.0



[PATCH v3 3/8] proc/kcore: replace kclist_lock rwlock with rwsem

2018-07-18 Thread Omar Sandoval
From: Omar Sandoval 

Now we only need kclist_lock from user context and at fs init time, and
the following changes need to sleep while holding the kclist_lock.

Signed-off-by: Omar Sandoval 
---
 fs/proc/kcore.c | 20 ++--
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index e83f15a4f66d..ae43a97d511d 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -59,7 +59,7 @@ struct memelfnote
 };
 
 static LIST_HEAD(kclist_head);
-static DEFINE_RWLOCK(kclist_lock);
+static DECLARE_RWSEM(kclist_lock);
 static int kcore_need_update = 1;
 
 /* This doesn't grab kclist_lock, so it should only be used at init time. */
@@ -117,7 +117,7 @@ static void __kcore_update_ram(struct list_head *list)
struct kcore_list *tmp, *pos;
LIST_HEAD(garbage);
 
-   write_lock(&kclist_lock);
+   down_write(&kclist_lock);
if (xchg(&kcore_need_update, 0)) {
list_for_each_entry_safe(pos, tmp, &kclist_head, list) {
if (pos->type == KCORE_RAM
@@ -128,7 +128,7 @@ static void __kcore_update_ram(struct list_head *list)
} else
list_splice(list, &garbage);
proc_root_kcore->size = get_kcore_size(&nphdr, &size);
-   write_unlock(&kclist_lock);
+   up_write(&kclist_lock);

free_kclist_ents(&garbage);
 }
@@ -451,11 +451,11 @@ read_kcore(struct file *file, char __user *buffer, size_t 
buflen, loff_t *fpos)
int nphdr;
unsigned long start;
 
-   read_lock(&kclist_lock);
+   down_read(&kclist_lock);
size = get_kcore_size(&nphdr, &elf_buflen);

if (buflen == 0 || *fpos >= size) {
-   read_unlock(&kclist_lock);
+   up_read(&kclist_lock);
return 0;
}
 
@@ -472,11 +472,11 @@ read_kcore(struct file *file, char __user *buffer, size_t 
buflen, loff_t *fpos)
tsz = buflen;
elf_buf = kzalloc(elf_buflen, GFP_ATOMIC);
if (!elf_buf) {
-   read_unlock(&kclist_lock);
+   up_read(&kclist_lock);
return -ENOMEM;
}
elf_kcore_store_hdr(elf_buf, nphdr, elf_buflen);
-   read_unlock(&kclist_lock);
+   up_read(&kclist_lock);
if (copy_to_user(buffer, elf_buf + *fpos, tsz)) {
kfree(elf_buf);
return -EFAULT;
@@ -491,7 +491,7 @@ read_kcore(struct file *file, char __user *buffer, size_t 
buflen, loff_t *fpos)
if (buflen == 0)
return acc;
} else
-   read_unlock(&kclist_lock);
+   up_read(&kclist_lock);
 
/*
 * Check to see if our file offset matches with any of
@@ -504,12 +504,12 @@ read_kcore(struct file *file, char __user *buffer, size_t 
buflen, loff_t *fpos)
while (buflen) {
struct kcore_list *m;
 
-   read_lock(&kclist_lock);
+   down_read(&kclist_lock);
list_for_each_entry(m, &kclist_head, list) {
if (start >= m->addr && start < (m->addr+m->size))
break;
}
-   read_unlock(&kclist_lock);
+   up_read(&kclist_lock);

if (&m->list == &kclist_head) {
if (clear_user(buffer, tsz))
-- 
2.18.0



[PATCH v3 2/8] proc/kcore: don't grab lock for memory hotplug notifier

2018-07-18 Thread Omar Sandoval
From: Omar Sandoval 

The memory hotplug notifier kcore_callback() only needs kclist_lock to
prevent races with __kcore_update_ram(), but we can easily eliminate
that race by using an atomic xchg() in __kcore_update_ram(). This is
preparation for converting kclist_lock to an rwsem.
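
For illustration, a minimal userspace sketch of the flag handoff (all names
are made up): the notifier side only ever sets the flag, and the updater
claims it with an atomic exchange, so the notifier needs no lock.

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int need_update_ex = 1;

/* Notifier side: a single atomic store, no lock required. */
static void on_hotplug_event(void)
{
	atomic_store(&need_update_ex, 1);
}

/* Updater side: exactly one caller observes the 1 -> 0 transition. */
static bool claim_update(void)
{
	return atomic_exchange(&need_update_ex, 0) != 0;
}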

Signed-off-by: Omar Sandoval 
---
 fs/proc/kcore.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index b0b9a76f28d6..e83f15a4f66d 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -118,7 +118,7 @@ static void __kcore_update_ram(struct list_head *list)
LIST_HEAD(garbage);
 
write_lock(&kclist_lock);
-   if (kcore_need_update) {
+   if (xchg(&kcore_need_update, 0)) {
list_for_each_entry_safe(pos, tmp, &kclist_head, list) {
if (pos->type == KCORE_RAM
|| pos->type == KCORE_VMEMMAP)
@@ -127,7 +127,6 @@ static void __kcore_update_ram(struct list_head *list)
list_splice_tail(list, &kclist_head);
} else
list_splice(list, &garbage);
-   kcore_need_update = 0;
proc_root_kcore->size = get_kcore_size(&nphdr, &size);
write_unlock(&kclist_lock);
 
@@ -593,9 +592,8 @@ static int __meminit kcore_callback(struct notifier_block 
*self,
switch (action) {
case MEM_ONLINE:
case MEM_OFFLINE:
-   write_lock(&kclist_lock);
kcore_need_update = 1;
-   write_unlock(&kclist_lock);
+   break;
}
return NOTIFY_OK;
 }
-- 
2.18.0



[PATCH v3 8/8] proc/kcore: add vmcoreinfo note to /proc/kcore

2018-07-18 Thread Omar Sandoval
From: Omar Sandoval 

The vmcoreinfo information is useful for runtime debugging tools, not
just for crash dumps. A lot of this information can be determined by
other means, but this is much more convenient, and it only adds a page
at most to the file.
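
For reference, a simplified userspace sketch of how one ELF note record is
laid out and appended, mirroring in spirit what the series' append_kcore_note()
helper does in the kernel; the function name and buffer handling below are
illustrative only.

#include <elf.h>
#include <string.h>

/* Append one note record; the name and descriptor are 4-byte padded. */
static size_t append_note_ex(char *buf, size_t off, const char *name,
			     unsigned int type, const void *desc, size_t descsz)
{
	Elf64_Nhdr nhdr = {
		.n_namesz = strlen(name) + 1,
		.n_descsz = descsz,
		.n_type = type,
	};

	memcpy(buf + off, &nhdr, sizeof(nhdr));
	off += sizeof(nhdr);
	memcpy(buf + off, name, nhdr.n_namesz);
	off = (off + nhdr.n_namesz + 3) & ~(size_t)3;
	memcpy(buf + off, desc, descsz);
	off = (off + descsz + 3) & ~(size_t)3;
	return off;
}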

Signed-off-by: Omar Sandoval 
---
 fs/proc/Kconfig|  1 +
 fs/proc/kcore.c| 18 --
 include/linux/crash_core.h |  2 ++
 kernel/crash_core.c|  4 ++--
 4 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 0eaeb41453f5..817c02b13b1d 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -31,6 +31,7 @@ config PROC_FS
 config PROC_KCORE
bool "/proc/kcore support" if !ARM
depends on PROC_FS && MMU
+   select CRASH_CORE
help
  Provides a virtual ELF core file of the live kernel.  This can
  be read with gdb and other ELF tools.  No modifications can be
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 25fefdd05ee5..ab7c1a1dad50 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -10,6 +10,7 @@
  * Safe accesses to vmalloc/direct-mapped discontiguous areas, Kanoj 
Sarcar 
  */
 
+#include <linux/crash_core.h>
 #include 
 #include 
 #include 
@@ -81,10 +82,13 @@ static size_t get_kcore_size(int *nphdr, size_t *phdrs_len, 
size_t *notes_len,
}
 
*phdrs_len = *nphdr * sizeof(struct elf_phdr);
-   *notes_len = (3 * (sizeof(struct elf_note) + ALIGN(sizeof(CORE_STR), 
4)) +
+   *notes_len = (4 * sizeof(struct elf_note) +
+ 3 * ALIGN(sizeof(CORE_STR), 4) +
+ VMCOREINFO_NOTE_NAME_BYTES +
  ALIGN(sizeof(struct elf_prstatus), 4) +
  ALIGN(sizeof(struct elf_prpsinfo), 4) +
- ALIGN(arch_task_struct_size, 4));
+ ALIGN(arch_task_struct_size, 4) +
+ ALIGN(vmcoreinfo_size, 4));
*data_offset = PAGE_ALIGN(sizeof(struct elfhdr) + *phdrs_len +
  *notes_len);
return *data_offset + size;
@@ -406,6 +410,16 @@ read_kcore(struct file *file, char __user *buffer, size_t 
buflen, loff_t *fpos)
  sizeof(prpsinfo));
append_kcore_note(notes, &i, CORE_STR, NT_TASKSTRUCT, current,
  arch_task_struct_size);
+   /*
+* vmcoreinfo_size is mostly constant after init time, but it
+* can be changed by crash_save_vmcoreinfo(). Racing here with a
+* panic on another CPU before the machine goes down is insanely
+* unlikely, but it's better to not leave potential buffer
+* overflows lying around, regardless.
+*/
+   append_kcore_note(notes, &i, VMCOREINFO_NOTE_NAME, 0,
+ vmcoreinfo_data,
+ min(vmcoreinfo_size, notes_len - i));
 
tsz = min_t(size_t, buflen, notes_offset + notes_len - *fpos);
if (copy_to_user(buffer, notes + *fpos - notes_offset, tsz)) {
diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index b511f6d24b42..525510a9f965 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -60,6 +60,8 @@ phys_addr_t paddr_vmcoreinfo_note(void);
 #define VMCOREINFO_CONFIG(name) \
vmcoreinfo_append_str("CONFIG_%s=y\n", #name)
 
+extern unsigned char *vmcoreinfo_data;
+extern size_t vmcoreinfo_size;
 extern u32 *vmcoreinfo_note;
 
 Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index b66aced5e8c2..d02c58b94460 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -14,8 +14,8 @@
 #include 
 
 /* vmcoreinfo stuff */
-static unsigned char *vmcoreinfo_data;
-static size_t vmcoreinfo_size;
+unsigned char *vmcoreinfo_data;
+size_t vmcoreinfo_size;
 u32 *vmcoreinfo_note;
 
 /* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */
-- 
2.18.0



Re: [PATCH v2 2/7] proc/kcore: replace kclist_lock rwlock with rwsem

2018-07-17 Thread Omar Sandoval
On Tue, Jul 17, 2018 at 08:27:53PM -0700, Andrew Morton wrote:
> On Tue, 17 Jul 2018 20:24:05 -0700 Omar Sandoval  wrote:
> 
> > > > --- a/fs/proc/kcore.c
> > > > +++ b/fs/proc/kcore.c
> > > > @@ -59,8 +59,8 @@ struct memelfnote
> > > >  };
> > > >  
> > > >  static LIST_HEAD(kclist_head);
> > > > -static DEFINE_RWLOCK(kclist_lock);
> > > > -static int kcore_need_update = 1;
> > > > +static DECLARE_RWSEM(kclist_lock);
> > > > +static atomic_t kcore_need_update = ATOMIC_INIT(1);
> > > 
> > > It's unclear why kcore_need_update was changed to atomic_t - it's still
> > > updated under kclist_lock?
> > 
> > Not in the hotplug notifier (kcore_callback()) anymore, so I need the
> > atomic_cmpxchg() in __kcore_update_ram().
> 
> Well that's just
> 
>   kcore_need_update = 1;
> 
> and turning that into an atomic_set doesn't change anything(?).
> 
> It's not a harmful change of course, but a bit ... odd.

The change from read, ..., write to cmpxchg in __kcore_update_ram() is
the important part, not the change from = to atomic_set(). I needed to
change that because now kcore_need_update could potentially be set again
by the hotplug notifier while we're in __kcore_update_ram(). But I'll
just put all of this in the commit message in v3 :)

> > That could use a mention in the commit message.
> 
> That never hurts ;)


Re: [PATCH v2 7/7] proc/kcore: add vmcoreinfo note to /proc/kcore

2018-07-17 Thread Omar Sandoval
On Tue, Jul 17, 2018 at 07:44:21PM -0700, Andrew Morton wrote:
> On Thu, 12 Jul 2018 17:09:39 -0700 Omar Sandoval  wrote:
> 
> > From: Omar Sandoval 
> > 
> > The vmcoreinfo information is useful for runtime debugging tools, not
> > just for crash dumps. A lot of this information can be determined by
> > other means, but this is much more convenient.
> > 
> 
> What effect does this have upon the overall size of the dump file?

About 2 kB on my machine, and no more than one page + the few bytes of
extra headers.


Re: [PATCH v2 2/7] proc/kcore: replace kclist_lock rwlock with rwsem

2018-07-17 Thread Omar Sandoval
On Tue, Jul 17, 2018 at 07:38:41PM -0700, Andrew Morton wrote:
> On Thu, 12 Jul 2018 17:09:34 -0700 Omar Sandoval  wrote:
> 
> > From: Omar Sandoval 
> > 
> > Now we only need kclist_lock from user context and at fs init time, and
> > the following changes need to sleep while holding the kclist_lock.
> > 
> > ...
> >
> > --- a/fs/proc/kcore.c
> > +++ b/fs/proc/kcore.c
> > @@ -59,8 +59,8 @@ struct memelfnote
> >  };
> >  
> >  static LIST_HEAD(kclist_head);
> > -static DEFINE_RWLOCK(kclist_lock);
> > -static int kcore_need_update = 1;
> > +static DECLARE_RWSEM(kclist_lock);
> > +static atomic_t kcore_need_update = ATOMIC_INIT(1);
> 
> It's unclear why kcore_need_update was changed to atomic_t - it's still
> updated under kclist_lock?

Not in the hotplug notifier (kcore_callback()) anymore, so I need the
atomic_cmpxchg() in __kcore_update_ram(). That could use a mention in
the commit message.

> Maybe it's for a later patch.
> 
> 


Re: [PATCH v2 1/7] proc/kcore: don't grab lock for kclist_add()

2018-07-17 Thread Omar Sandoval
On Tue, Jul 17, 2018 at 07:35:04PM -0700, Andrew Morton wrote:
> On Thu, 12 Jul 2018 17:09:33 -0700 Omar Sandoval  wrote:
> 
> > From: Omar Sandoval 
> > 
> > kclist_add() is only called at init time, so there's no point in
> > grabbing any locks. We're also going to replace the rwlock with a rwsem,
> > which we don't want to try grabbing during early boot.
> > 
> > ...
> >
> > --- a/fs/proc/kcore.c
> > +++ b/fs/proc/kcore.c
> > @@ -62,6 +62,7 @@ static LIST_HEAD(kclist_head);
> >  static DEFINE_RWLOCK(kclist_lock);
> >  static int kcore_need_update = 1;
> >  
> > +/* This doesn't grab kclist_lock, so it should only be used at init time. 
> > */
> >  void
> >  kclist_add(struct kcore_list *new, void *addr, size_t size, int type)
> >  {
> > @@ -69,9 +70,7 @@ kclist_add(struct kcore_list *new, void *addr, size_t 
> > size, int type)
> > new->size = size;
> > new->type = type;
> >  
> > > -   write_lock(&kclist_lock);
> > > list_add_tail(&new->list, &kclist_head);
> > > -   write_unlock(&kclist_lock);
> >  }
> 
> So we can mark kclist_add() as __init, yes?

Yes, thanks, I'll add that in v2.

> That way we save a scrap of ram and if someone starts calling
> kclist_add() from non-__init code we get a build-time warning.
> 
> --- a/fs/proc/kcore.c~proc-kcore-dont-grab-lock-for-kclist_add-fix
> +++ a/fs/proc/kcore.c
> @@ -63,7 +63,7 @@ static DEFINE_RWLOCK(kclist_lock);
>  static int kcore_need_update = 1;
>  
>  /* This doesn't grab kclist_lock, so it should only be used at init time. */
> -void
> +void __init
>  kclist_add(struct kcore_list *new, void *addr, size_t size, int type)
>  {
>   new->addr = (unsigned long)addr;
> --- a/include/linux/kcore.h~proc-kcore-dont-grab-lock-for-kclist_add-fix
> +++ a/include/linux/kcore.h
> @@ -35,7 +35,7 @@ struct vmcoredd_node {
>  };
>  
>  #ifdef CONFIG_PROC_KCORE
> -extern void kclist_add(struct kcore_list *, void *, size_t, int type);
> +extern void __init kclist_add(struct kcore_list *, void *, size_t, int type);
>  #else
>  static inline
>  void kclist_add(struct kcore_list *new, void *addr, size_t size, int type)
> 
> 


[PATCH v2 2/7] proc/kcore: replace kclist_lock rwlock with rwsem

2018-07-12 Thread Omar Sandoval
From: Omar Sandoval 

Now we only need kclist_lock from user context and at fs init time, and
the following changes need to sleep while holding the kclist_lock.

Signed-off-by: Omar Sandoval 
---
 fs/proc/kcore.c | 32 +++-
 1 file changed, 15 insertions(+), 17 deletions(-)

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index ddeeb3a5a015..def92fccb167 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -59,8 +59,8 @@ struct memelfnote
 };
 
 static LIST_HEAD(kclist_head);
-static DEFINE_RWLOCK(kclist_lock);
-static int kcore_need_update = 1;
+static DECLARE_RWSEM(kclist_lock);
+static atomic_t kcore_need_update = ATOMIC_INIT(1);
 
 /* This doesn't grab kclist_lock, so it should only be used at init time. */
 void
@@ -117,8 +117,8 @@ static void __kcore_update_ram(struct list_head *list)
struct kcore_list *tmp, *pos;
LIST_HEAD(garbage);
 
-   write_lock(&kclist_lock);
-   if (kcore_need_update) {
+   down_write(&kclist_lock);
+   if (atomic_cmpxchg(&kcore_need_update, 1, 0)) {
list_for_each_entry_safe(pos, tmp, &kclist_head, list) {
if (pos->type == KCORE_RAM
|| pos->type == KCORE_VMEMMAP)
@@ -127,9 +127,8 @@ static void __kcore_update_ram(struct list_head *list)
list_splice_tail(list, &kclist_head);
} else
list_splice(list, &garbage);
-   kcore_need_update = 0;
proc_root_kcore->size = get_kcore_size(&nphdr, &size);
-   write_unlock(&kclist_lock);
+   up_write(&kclist_lock);

free_kclist_ents(&garbage);
 }
@@ -452,11 +451,11 @@ read_kcore(struct file *file, char __user *buffer, size_t 
buflen, loff_t *fpos)
int nphdr;
unsigned long start;
 
-   read_lock(&kclist_lock);
+   down_read(&kclist_lock);
size = get_kcore_size(&nphdr, &elf_buflen);

if (buflen == 0 || *fpos >= size) {
-   read_unlock(&kclist_lock);
+   up_read(&kclist_lock);
return 0;
}
 
@@ -473,11 +472,11 @@ read_kcore(struct file *file, char __user *buffer, size_t 
buflen, loff_t *fpos)
tsz = buflen;
elf_buf = kzalloc(elf_buflen, GFP_ATOMIC);
if (!elf_buf) {
-   read_unlock(&kclist_lock);
+   up_read(&kclist_lock);
return -ENOMEM;
}
elf_kcore_store_hdr(elf_buf, nphdr, elf_buflen);
-   read_unlock(&kclist_lock);
+   up_read(&kclist_lock);
if (copy_to_user(buffer, elf_buf + *fpos, tsz)) {
kfree(elf_buf);
return -EFAULT;
@@ -492,7 +491,7 @@ read_kcore(struct file *file, char __user *buffer, size_t 
buflen, loff_t *fpos)
if (buflen == 0)
return acc;
} else
-   read_unlock(&kclist_lock);
+   up_read(&kclist_lock);
 
/*
 * Check to see if our file offset matches with any of
@@ -505,12 +504,12 @@ read_kcore(struct file *file, char __user *buffer, size_t 
buflen, loff_t *fpos)
while (buflen) {
struct kcore_list *m;
 
-   read_lock(&kclist_lock);
+   down_read(&kclist_lock);
list_for_each_entry(m, &kclist_head, list) {
if (start >= m->addr && start < (m->addr+m->size))
break;
}
-   read_unlock(&kclist_lock);
+   up_read(&kclist_lock);

if (&m->list == &kclist_head) {
if (clear_user(buffer, tsz))
@@ -563,7 +562,7 @@ static int open_kcore(struct inode *inode, struct file 
*filp)
if (!filp->private_data)
return -ENOMEM;
 
-   if (kcore_need_update)
+   if (atomic_read(&kcore_need_update))
kcore_update_ram();
if (i_size_read(inode) != proc_root_kcore->size) {
inode_lock(inode);
@@ -593,9 +592,8 @@ static int __meminit kcore_callback(struct notifier_block 
*self,
switch (action) {
case MEM_ONLINE:
case MEM_OFFLINE:
-   write_lock(&kclist_lock);
kcore_need_update = 1;
-   write_unlock(&kclist_lock);
+   atomic_set(&kcore_need_update, 1);
+   break;
}
return NOTIFY_OK;
 }
-- 
2.18.0



[PATCH v2 5/7] proc/kcore: clean up ELF header generation

2018-07-12 Thread Omar Sandoval
From: Omar Sandoval 

Currently, the ELF file header, program headers, and note segment are
allocated all at once, in some icky code dating back to 2.3. Programs
tend to read the file header, then the program headers, then the note
segment, all separately, so this is a waste of effort. It's cleaner and
more efficient to handle the three separately.

Signed-off-by: Omar Sandoval 
---
 fs/proc/kcore.c | 350 +++-
 1 file changed, 141 insertions(+), 209 deletions(-)

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index f1ae848c7bcc..a7e730b40154 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -49,15 +49,6 @@ static struct proc_dir_entry *proc_root_kcore;
 #definekc_offset_to_vaddr(o) ((o) + PAGE_OFFSET)
 #endif
 
-/* An ELF note in memory */
-struct memelfnote
-{
-   const char *name;
-   int type;
-   unsigned int datasz;
-   void *data;
-};
-
 static LIST_HEAD(kclist_head);
 static DECLARE_RWSEM(kclist_lock);
 static atomic_t kcore_need_update = ATOMIC_INIT(1);
@@ -73,7 +64,8 @@ kclist_add(struct kcore_list *new, void *addr, size_t size, 
int type)
list_add_tail(&new->list, &kclist_head);
 }
 
-static size_t get_kcore_size(int *nphdr, size_t *elf_buflen)
+static size_t get_kcore_size(int *nphdr, size_t *phdrs_len, size_t *notes_len,
+size_t *data_offset)
 {
size_t try, size;
struct kcore_list *m;
@@ -87,15 +79,15 @@ static size_t get_kcore_size(int *nphdr, size_t *elf_buflen)
size = try;
*nphdr = *nphdr + 1;
}
-   *elf_buflen =   sizeof(struct elfhdr) + 
-   (*nphdr + 2)*sizeof(struct elf_phdr) + 
-   3 * ((sizeof(struct elf_note)) +
-roundup(sizeof(CORE_STR), 4)) +
-   roundup(sizeof(struct elf_prstatus), 4) +
-   roundup(sizeof(struct elf_prpsinfo), 4) +
-   roundup(arch_task_struct_size, 4);
-   *elf_buflen = PAGE_ALIGN(*elf_buflen);
-   return size + *elf_buflen;
+
+   *phdrs_len = *nphdr * sizeof(struct elf_phdr);
+   *notes_len = (3 * (sizeof(struct elf_note) + ALIGN(sizeof(CORE_STR), 
4)) +
+ ALIGN(sizeof(struct elf_prstatus), 4) +
+ ALIGN(sizeof(struct elf_prpsinfo), 4) +
+ ALIGN(arch_task_struct_size, 4));
+   *data_offset = PAGE_ALIGN(sizeof(struct elfhdr) + *phdrs_len +
+ *notes_len);
+   return *data_offset + size;
 }
 
 #ifdef CONFIG_HIGHMEM
@@ -241,7 +233,7 @@ static int kcore_update_ram(void)
LIST_HEAD(list);
LIST_HEAD(garbage);
int nphdr;
-   size_t size;
+   size_t phdrs_len, notes_len, data_offset;
struct kcore_list *tmp, *pos;
int ret = 0;
 
@@ -263,7 +255,8 @@ static int kcore_update_ram(void)
}
list_splice_tail(&list, &kclist_head);
 
-   proc_root_kcore->size = get_kcore_size(&nphdr, &size);
+   proc_root_kcore->size = get_kcore_size(&nphdr, &phdrs_len, &notes_len,
+  &data_offset);
 
 out:
up_write(&kclist_lock);
@@ -274,228 +267,168 @@ static int kcore_update_ram(void)
return ret;
 }
 
-/*/
-/*
- * determine size of ELF note
- */
-static int notesize(struct memelfnote *en)
+static void append_kcore_note(char *notes, size_t *i, const char *name,
+ unsigned int type, const void *desc,
+ size_t descsz)
 {
-   int sz;
-
-   sz = sizeof(struct elf_note);
-   sz += roundup((strlen(en->name) + 1), 4);
-   sz += roundup(en->datasz, 4);
-
-   return sz;
-} /* end notesize() */
-
-/*/
-/*
- * store a note in the header buffer
- */
-static char *storenote(struct memelfnote *men, char *bufp)
-{
-   struct elf_note en;
-
-#define DUMP_WRITE(addr,nr) do { memcpy(bufp,addr,nr); bufp += nr; } while(0)
-
-   en.n_namesz = strlen(men->name) + 1;
-   en.n_descsz = men->datasz;
-   en.n_type = men->type;
-
-   DUMP_WRITE(&en, sizeof(en));
-   DUMP_WRITE(men->name, en.n_namesz);
-
-   /* XXX - cast from long long to long to avoid need for libgcc.a */
-   bufp = (char*) roundup((unsigned long)bufp,4);
-   DUMP_WRITE(men->data, men->datasz);
-   bufp = (char*) roundup((unsigned long)bufp,4);
-
-#undef DUMP_WRITE
-
-   return bufp;
-} /* end storenote() */
-
-/*
- * store an ELF coredump header in the supplied buffer
- * nphdr is the number of elf_phdr to insert
- */
-static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
-{
-   struct elf_prstatus prstatus;   /* NT_PRSTATUS */
-   struct elf_prpsinfo prpsinfo;   /* NT_PRPSINFO */
-   struct elf_phdr *nhdr, *phdr;
-   struct elfhdr *elf;

[PATCH v2 2/7] proc/kcore: replace kclist_lock rwlock with rwsem

2018-07-12 Thread Omar Sandoval
From: Omar Sandoval 

Now we only need kclist_lock from user context and at fs init time, and
the following changes need to sleep while holding the kclist_lock.

Signed-off-by: Omar Sandoval 
---
 fs/proc/kcore.c | 32 +++-
 1 file changed, 15 insertions(+), 17 deletions(-)

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index ddeeb3a5a015..def92fccb167 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -59,8 +59,8 @@ struct memelfnote
 };
 
 static LIST_HEAD(kclist_head);
-static DEFINE_RWLOCK(kclist_lock);
-static int kcore_need_update = 1;
+static DECLARE_RWSEM(kclist_lock);
+static atomic_t kcore_need_update = ATOMIC_INIT(1);
 
 /* This doesn't grab kclist_lock, so it should only be used at init time. */
 void
@@ -117,8 +117,8 @@ static void __kcore_update_ram(struct list_head *list)
struct kcore_list *tmp, *pos;
LIST_HEAD(garbage);
 
-   write_lock(&kclist_lock);
-   if (kcore_need_update) {
+   down_write(&kclist_lock);
+   if (atomic_cmpxchg(&kcore_need_update, 1, 0)) {
list_for_each_entry_safe(pos, tmp, &kclist_head, list) {
if (pos->type == KCORE_RAM
|| pos->type == KCORE_VMEMMAP)
@@ -127,9 +127,8 @@ static void __kcore_update_ram(struct list_head *list)
list_splice_tail(list, &kclist_head);
} else
list_splice(list, &garbage);
-   kcore_need_update = 0;
proc_root_kcore->size = get_kcore_size(&nphdr, &size);
-   write_unlock(&kclist_lock);
+   up_write(&kclist_lock);
 
free_kclist_ents(&garbage);
 }
@@ -452,11 +451,11 @@ read_kcore(struct file *file, char __user *buffer, size_t 
buflen, loff_t *fpos)
int nphdr;
unsigned long start;
 
-   read_lock(_lock);
+   down_read(_lock);
size = get_kcore_size(, _buflen);
 
if (buflen == 0 || *fpos >= size) {
-   read_unlock(_lock);
+   up_read(_lock);
return 0;
}
 
@@ -473,11 +472,11 @@ read_kcore(struct file *file, char __user *buffer, size_t 
buflen, loff_t *fpos)
tsz = buflen;
elf_buf = kzalloc(elf_buflen, GFP_ATOMIC);
if (!elf_buf) {
-   read_unlock(_lock);
+   up_read(_lock);
return -ENOMEM;
}
elf_kcore_store_hdr(elf_buf, nphdr, elf_buflen);
-   read_unlock(_lock);
+   up_read(_lock);
if (copy_to_user(buffer, elf_buf + *fpos, tsz)) {
kfree(elf_buf);
return -EFAULT;
@@ -492,7 +491,7 @@ read_kcore(struct file *file, char __user *buffer, size_t 
buflen, loff_t *fpos)
if (buflen == 0)
return acc;
} else
-   read_unlock(_lock);
+   up_read(_lock);
 
/*
 * Check to see if our file offset matches with any of
@@ -505,12 +504,12 @@ read_kcore(struct file *file, char __user *buffer, size_t 
buflen, loff_t *fpos)
while (buflen) {
struct kcore_list *m;
 
-   read_lock(_lock);
+   down_read(_lock);
list_for_each_entry(m, _head, list) {
if (start >= m->addr && start < (m->addr+m->size))
break;
}
-   read_unlock(_lock);
+   up_read(_lock);
 
if (>list == _head) {
if (clear_user(buffer, tsz))
@@ -563,7 +562,7 @@ static int open_kcore(struct inode *inode, struct file 
*filp)
if (!filp->private_data)
return -ENOMEM;
 
-   if (kcore_need_update)
+   if (atomic_read(_need_update))
kcore_update_ram();
if (i_size_read(inode) != proc_root_kcore->size) {
inode_lock(inode);
@@ -593,9 +592,8 @@ static int __meminit kcore_callback(struct notifier_block 
*self,
switch (action) {
case MEM_ONLINE:
case MEM_OFFLINE:
-   write_lock(_lock);
-   kcore_need_update = 1;
-   write_unlock(_lock);
+   atomic_set(_need_update, 1);
+   break;
}
return NOTIFY_OK;
 }
-- 
2.18.0



[PATCH v2 4/7] proc/kcore: hold lock during read

2018-07-12 Thread Omar Sandoval
From: Omar Sandoval 

Now that we're using an rwsem, we can hold it during the entirety of
read_kcore() and have a common return path. This is preparation for the
next change.
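
The resulting shape of read_kcore(), as a minimal sketch (illustrative only; the helper below is not the real function): take the semaphore once, let every exit path funnel through a single label, and release it exactly once.

#include <linux/rwsem.h>

static DECLARE_RWSEM(example_lock);	/* stand-in for kclist_lock */

static ssize_t example_read(size_t buflen)
{
	size_t orig_buflen = buflen;
	int ret = 0;

	down_read(&example_lock);

	if (buflen == 0)
		goto out;	/* the lock is still dropped below */

	/*
	 * Copy the ELF header, notes, and memory segments here, decrementing
	 * buflen as data is produced; any failure sets ret and jumps to out
	 * instead of returning with the semaphore held.
	 */

out:
	up_read(&example_lock);
	if (ret)
		return ret;
	return orig_buflen - buflen;
}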

Signed-off-by: Omar Sandoval 
---
 fs/proc/kcore.c | 70 -
 1 file changed, 40 insertions(+), 30 deletions(-)

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 33667db6e370..f1ae848c7bcc 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -440,19 +440,18 @@ static ssize_t
 read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 {
char *buf = file->private_data;
-   ssize_t acc = 0;
size_t size, tsz;
size_t elf_buflen;
int nphdr;
unsigned long start;
+   size_t orig_buflen = buflen;
+   int ret = 0;
 
down_read(_lock);
size = get_kcore_size(, _buflen);
 
-   if (buflen == 0 || *fpos >= size) {
-   up_read(_lock);
-   return 0;
-   }
+   if (buflen == 0 || *fpos >= size)
+   goto out;
 
/* trim buflen to not go beyond EOF */
if (buflen > size - *fpos)
@@ -465,28 +464,26 @@ read_kcore(struct file *file, char __user *buffer, size_t 
buflen, loff_t *fpos)
tsz = elf_buflen - *fpos;
if (buflen < tsz)
tsz = buflen;
-   elf_buf = kzalloc(elf_buflen, GFP_ATOMIC);
+   elf_buf = kzalloc(elf_buflen, GFP_KERNEL);
if (!elf_buf) {
-   up_read(_lock);
-   return -ENOMEM;
+   ret = -ENOMEM;
+   goto out;
}
elf_kcore_store_hdr(elf_buf, nphdr, elf_buflen);
-   up_read(_lock);
if (copy_to_user(buffer, elf_buf + *fpos, tsz)) {
kfree(elf_buf);
-   return -EFAULT;
+   ret = -EFAULT;
+   goto out;
}
kfree(elf_buf);
buflen -= tsz;
*fpos += tsz;
buffer += tsz;
-   acc += tsz;
 
/* leave now if filled buffer already */
if (buflen == 0)
-   return acc;
-   } else
-   up_read(_lock);
+   goto out;
+   }
 
/*
 * Check to see if our file offset matches with any of
@@ -499,25 +496,29 @@ read_kcore(struct file *file, char __user *buffer, size_t 
buflen, loff_t *fpos)
while (buflen) {
struct kcore_list *m;
 
-   down_read(_lock);
list_for_each_entry(m, _head, list) {
if (start >= m->addr && start < (m->addr+m->size))
break;
}
-   up_read(_lock);
 
if (>list == _head) {
-   if (clear_user(buffer, tsz))
-   return -EFAULT;
+   if (clear_user(buffer, tsz)) {
+   ret = -EFAULT;
+   goto out;
+   }
} else if (m->type == KCORE_VMALLOC) {
vread(buf, (char *)start, tsz);
/* we have to zero-fill user buffer even if no read */
-   if (copy_to_user(buffer, buf, tsz))
-   return -EFAULT;
+   if (copy_to_user(buffer, buf, tsz)) {
+   ret = -EFAULT;
+   goto out;
+   }
} else if (m->type == KCORE_USER) {
/* User page is handled prior to normal kernel page: */
-   if (copy_to_user(buffer, (char *)start, tsz))
-   return -EFAULT;
+   if (copy_to_user(buffer, (char *)start, tsz)) {
+   ret = -EFAULT;
+   goto out;
+   }
} else {
if (kern_addr_valid(start)) {
/*
@@ -525,26 +526,35 @@ read_kcore(struct file *file, char __user *buffer, size_t 
buflen, loff_t *fpos)
 * hardened user copy kernel text checks.
 */
if (probe_kernel_read(buf, (void *) start, 
tsz)) {
-   if (clear_user(buffer, tsz))
-   return -EFAULT;
+   if (clear_user(buffer, tsz)) {
+   ret = -EFAULT;
+   goto out;
+   }
} else {
-

[PATCH v2 1/7] proc/kcore: don't grab lock for kclist_add()

2018-07-12 Thread Omar Sandoval
From: Omar Sandoval 

kclist_add() is only called at init time, so there's no point in
grabbing any locks. We're also going to replace the rwlock with a rwsem,
which we don't want to try grabbing during early boot.

Signed-off-by: Omar Sandoval 
---
 fs/proc/kcore.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 66c373230e60..ddeeb3a5a015 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -62,6 +62,7 @@ static LIST_HEAD(kclist_head);
 static DEFINE_RWLOCK(kclist_lock);
 static int kcore_need_update = 1;
 
+/* This doesn't grab kclist_lock, so it should only be used at init time. */
 void
 kclist_add(struct kcore_list *new, void *addr, size_t size, int type)
 {
@@ -69,9 +70,7 @@ kclist_add(struct kcore_list *new, void *addr, size_t size, int type)
new->size = size;
new->type = type;
 
-   write_lock(&kclist_lock);
list_add_tail(&new->list, &kclist_head);
-   write_unlock(&kclist_lock);
 }
 
 static size_t get_kcore_size(int *nphdr, size_t *elf_buflen)
-- 
2.18.0



[PATCH v2 6/7] proc/kcore: optimize multiple page reads

2018-07-12 Thread Omar Sandoval
From: Omar Sandoval 

The current code does a full search of the segment list every time for
every page. This is wasteful, since it's almost certain that the next
page will be in the same segment. Instead, check if the previous segment
covers the current page before doing the list search.
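
A standalone sketch of the caching idea (illustrative only; it uses a plain array where the kernel walks kclist_head):

#include <stddef.h>

struct seg { unsigned long addr, size; };

/* Find the segment containing 'start', reusing the previous hit when possible. */
static struct seg *find_seg(struct seg *segs, size_t n, struct seg *prev,
			    unsigned long start)
{
	size_t i;

	/* Common case: the next page lives in the same segment as the last one. */
	if (prev && start >= prev->addr && start < prev->addr + prev->size)
		return prev;

	for (i = 0; i < n; i++) {
		if (start >= segs[i].addr && start < segs[i].addr + segs[i].size)
			return &segs[i];
	}
	return NULL;
}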

Signed-off-by: Omar Sandoval 
---
 fs/proc/kcore.c | 14 +++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index a7e730b40154..d1b875afc359 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -428,10 +428,18 @@ read_kcore(struct file *file, char __user *buffer, size_t 
buflen, loff_t *fpos)
if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen)
tsz = buflen;
 
+   m = NULL;
while (buflen) {
-   list_for_each_entry(m, &kclist_head, list) {
-   if (start >= m->addr && start < (m->addr+m->size))
-   break;
+   /*
+* If this is the first iteration or the address is not within
+* the previous entry, search for a matching entry.
+*/
+   if (!m || start < m->addr || start >= m->addr + m->size) {
+   list_for_each_entry(m, &kclist_head, list) {
+   if (start >= m->addr &&
+   start < m->addr + m->size)
+   break;
+   }
}
 
if (&m->list == &kclist_head) {
-- 
2.18.0



[PATCH v2 7/7] proc/kcore: add vmcoreinfo note to /proc/kcore

2018-07-12 Thread Omar Sandoval
From: Omar Sandoval 

The vmcoreinfo information is useful for runtime debugging tools, not
just for crash dumps. A lot of this information can be determined by
other means, but this is much more convenient.
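
For sizing the new note, each entry in the ELF note segment is a fixed 12-byte header followed by the name and the descriptor, each padded to 4 bytes; that is what the notes_len arithmetic in the diff accounts for. A minimal sketch of the per-note size (illustrative only; "VMCOREINFO" and 2048 are example inputs, not the kernel's actual values):

#include <stdio.h>
#include <string.h>

#define NOTE_ALIGN(x) (((x) + 3UL) & ~3UL)

/* Bytes one note occupies: n_namesz/n_descsz/n_type header + padded name + padded desc. */
static unsigned long note_size(const char *name, unsigned long descsz)
{
	return 3 * 4 + NOTE_ALIGN(strlen(name) + 1) + NOTE_ALIGN(descsz);
}

int main(void)
{
	printf("%lu\n", note_size("VMCOREINFO", 2048));
	return 0;
}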

Signed-off-by: Omar Sandoval 
---
 fs/proc/Kconfig|  1 +
 fs/proc/kcore.c| 18 --
 include/linux/crash_core.h |  2 ++
 kernel/crash_core.c|  4 ++--
 4 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 0eaeb41453f5..817c02b13b1d 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -31,6 +31,7 @@ config PROC_FS
 config PROC_KCORE
bool "/proc/kcore support" if !ARM
depends on PROC_FS && MMU
+   select CRASH_CORE
help
  Provides a virtual ELF core file of the live kernel.  This can
  be read with gdb and other ELF tools.  No modifications can be
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index d1b875afc359..bef78923b387 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -10,6 +10,7 @@
  * Safe accesses to vmalloc/direct-mapped discontiguous areas, Kanoj 
Sarcar 
  */
 
+#include 
 #include 
 #include 
 #include 
@@ -81,10 +82,13 @@ static size_t get_kcore_size(int *nphdr, size_t *phdrs_len, 
size_t *notes_len,
}
 
*phdrs_len = *nphdr * sizeof(struct elf_phdr);
-   *notes_len = (3 * (sizeof(struct elf_note) + ALIGN(sizeof(CORE_STR), 4)) +
+   *notes_len = (4 * sizeof(struct elf_note) +
+ 3 * ALIGN(sizeof(CORE_STR), 4) +
+ VMCOREINFO_NOTE_NAME_BYTES +
  ALIGN(sizeof(struct elf_prstatus), 4) +
  ALIGN(sizeof(struct elf_prpsinfo), 4) +
- ALIGN(arch_task_struct_size, 4));
+ ALIGN(arch_task_struct_size, 4) +
+ ALIGN(vmcoreinfo_size, 4));
*data_offset = PAGE_ALIGN(sizeof(struct elfhdr) + *phdrs_len +
  *notes_len);
return *data_offset + size;
@@ -406,6 +410,16 @@ read_kcore(struct file *file, char __user *buffer, size_t 
buflen, loff_t *fpos)
  sizeof(prpsinfo));
append_kcore_note(notes, &i, CORE_STR, NT_TASKSTRUCT, current,
  arch_task_struct_size);
+   /*
+* vmcoreinfo_size is mostly constant after init time, but it
+* can be changed by crash_save_vmcoreinfo(). Racing here with a
+* panic on another CPU before the machine goes down is insanely
+* unlikely, but it's better to not leave potential buffer
+* overflows lying around, regardless.
+*/
+   append_kcore_note(notes, &i, VMCOREINFO_NOTE_NAME, 0,
+ vmcoreinfo_data,
+ min(vmcoreinfo_size, notes_len - i));
 
tsz = min_t(size_t, buflen, notes_offset + notes_len - *fpos);
if (copy_to_user(buffer, notes + *fpos - notes_offset, tsz)) {
diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index b511f6d24b42..525510a9f965 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -60,6 +60,8 @@ phys_addr_t paddr_vmcoreinfo_note(void);
 #define VMCOREINFO_CONFIG(name) \
vmcoreinfo_append_str("CONFIG_%s=y\n", #name)
 
+extern unsigned char *vmcoreinfo_data;
+extern size_t vmcoreinfo_size;
 extern u32 *vmcoreinfo_note;
 
 Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index b66aced5e8c2..d02c58b94460 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -14,8 +14,8 @@
 #include 
 
 /* vmcoreinfo stuff */
-static unsigned char *vmcoreinfo_data;
-static size_t vmcoreinfo_size;
+unsigned char *vmcoreinfo_data;
+size_t vmcoreinfo_size;
 u32 *vmcoreinfo_note;
 
 /* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */
-- 
2.18.0



[PATCH v2 3/7] proc/kcore: fix memory hotplug vs multiple opens race

2018-07-12 Thread Omar Sandoval
From: Omar Sandoval 

There's a theoretical race condition that will cause /proc/kcore to miss
a memory hotplug event:

CPU0  CPU1
// hotplug event 1
kcore_need_update = 1

open_kcore()  open_kcore()
kcore_update_ram()kcore_update_ram()
// Walk RAM   // Walk RAM
__kcore_update_ram()  __kcore_update_ram()
kcore_need_update = 0

// hotplug event 2
kcore_need_update = 1
  kcore_need_update = 0

Note that CPU1 set up the RAM kcore entries with the state after hotplug
event 1 but cleared the flag for hotplug event 2. The RAM entries will
therefore be stale until there is another hotplug event.

This is an extremely unlikely sequence of events, but the fix makes the
synchronization saner anyway: we serialize the entire update sequence,
which means that whoever clears the flag will always succeed in
replacing the kcore list.
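
Roughly, the fix folds the flag test into the write-locked section (minimal kernel-style sketch, illustrative only; example_lock, example_need_update and rebuild_list() are made-up stand-ins for kclist_lock, kcore_need_update and the RAM-list rebuild):

#include <linux/atomic.h>
#include <linux/rwsem.h>

static DECLARE_RWSEM(example_lock);
static atomic_t example_need_update = ATOMIC_INIT(1);

static int rebuild_list(void) { return 0; }	/* placeholder for the real work */

static int example_update(void)
{
	int ret = 0;

	down_write(&example_lock);
	if (!atomic_cmpxchg(&example_need_update, 1, 0)) {
		/* Someone else already rebuilt the list under this rwsem. */
		goto out;
	}

	ret = rebuild_list();
	if (ret)
		atomic_set(&example_need_update, 1);	/* retry on the next open */
out:
	up_write(&example_lock);
	return ret;
}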

Signed-off-by: Omar Sandoval 
---
 fs/proc/kcore.c | 93 +++--
 1 file changed, 44 insertions(+), 49 deletions(-)

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index def92fccb167..33667db6e370 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -98,53 +98,15 @@ static size_t get_kcore_size(int *nphdr, size_t *elf_buflen)
return size + *elf_buflen;
 }
 
-static void free_kclist_ents(struct list_head *head)
-{
-   struct kcore_list *tmp, *pos;
-
-   list_for_each_entry_safe(pos, tmp, head, list) {
-   list_del(>list);
-   kfree(pos);
-   }
-}
-/*
- * Replace all KCORE_RAM/KCORE_VMEMMAP information with passed list.
- */
-static void __kcore_update_ram(struct list_head *list)
-{
-   int nphdr;
-   size_t size;
-   struct kcore_list *tmp, *pos;
-   LIST_HEAD(garbage);
-
-   down_write(_lock);
-   if (atomic_cmpxchg(_need_update, 1, 0)) {
-   list_for_each_entry_safe(pos, tmp, _head, list) {
-   if (pos->type == KCORE_RAM
-   || pos->type == KCORE_VMEMMAP)
-   list_move(>list, );
-   }
-   list_splice_tail(list, _head);
-   } else
-   list_splice(list, );
-   proc_root_kcore->size = get_kcore_size(, );
-   up_write(_lock);
-
-   free_kclist_ents();
-}
-
-
 #ifdef CONFIG_HIGHMEM
 /*
  * If no highmem, we can assume [0...max_low_pfn) continuous range of memory
  * because memory hole is not as big as !HIGHMEM case.
  * (HIGHMEM is special because part of memory is _invisible_ from the kernel.)
  */
-static int kcore_update_ram(void)
+static int kcore_ram_list(struct list_head *head)
 {
-   LIST_HEAD(head);
struct kcore_list *ent;
-   int ret = 0;
 
ent = kmalloc(sizeof(*ent), GFP_KERNEL);
if (!ent)
@@ -152,9 +114,8 @@ static int kcore_update_ram(void)
ent->addr = (unsigned long)__va(0);
ent->size = max_low_pfn << PAGE_SHIFT;
ent->type = KCORE_RAM;
-   list_add(>list, );
-   __kcore_update_ram();
-   return ret;
+   list_add(>list, head);
+   return 0;
 }
 
 #else /* !CONFIG_HIGHMEM */
@@ -253,11 +214,10 @@ kclist_add_private(unsigned long pfn, unsigned long 
nr_pages, void *arg)
return 1;
 }
 
-static int kcore_update_ram(void)
+static int kcore_ram_list(struct list_head *list)
 {
int nid, ret;
unsigned long end_pfn;
-   LIST_HEAD(head);
 
/* Not inializedupdate now */
/* find out "max pfn" */
@@ -269,15 +229,50 @@ static int kcore_update_ram(void)
end_pfn = node_end;
}
/* scan 0 to max_pfn */
-   ret = walk_system_ram_range(0, end_pfn, , kclist_add_private);
-   if (ret) {
-   free_kclist_ents();
+   ret = walk_system_ram_range(0, end_pfn, list, kclist_add_private);
+   if (ret)
return -ENOMEM;
+   return 0;
+}
+#endif /* CONFIG_HIGHMEM */
+
+static int kcore_update_ram(void)
+{
+   LIST_HEAD(list);
+   LIST_HEAD(garbage);
+   int nphdr;
+   size_t size;
+   struct kcore_list *tmp, *pos;
+   int ret = 0;
+
+   down_write(&kclist_lock);
+   if (!atomic_cmpxchg(&kcore_need_update, 1, 0))
+   goto out;
+
+   ret = kcore_ram_list(&list);
+   if (ret) {
+   /* Couldn't get the RAM list, try again next time. */
+   atomic_set(&kcore_need_update, 1);
+   list_splice_tail(&list, &garbage);
+   goto out;
+   }
+
+   list_for_each_entry_safe(pos, tmp, &kclist_head, list) {
+   if (pos->type == KCORE_RAM || pos->type == KCORE_VMEMMAP)
+   list_move(&pos->list, &garbage);
+   }
+   list_splice_tail(&list, &kclist_head);
+
+   proc_root_kcore->size = get_kcore_size(&nphdr, &size);
+
+out:
+   up_write(&kclist_lock);
+   list_for_each_ent

[PATCH v2 0/7] /proc/kcore improvements

2018-07-12 Thread Omar Sandoval
From: Omar Sandoval 

Hi,

This series makes a few improvements to /proc/kcore. Patches 1 and 2 are
prep patches. Patch 3 is a fix/cleanup. Patch 4 is another prep patch.
Patches 5 and 6 are optimizations to ->read(). Patch 7 adds vmcoreinfo
to /proc/kcore (apparently I'm not the only one who wants this, see
https://www.spinics.net/lists/arm-kernel/msg665103.html).

I tested that the crash utility still works with this applied, and
readelf is happy with it, as well.

Andrew, since this didn't get any traction on the fsdevel side, and
you're already carrying James' patch, could you take this through -mm?

Thanks!

Changes from v1:

- Rebased onto v4.18-rc4 + James' patch
  (https://patchwork.kernel.org/patch/10519739/) in the mm tree
- Fix spurious sparse warning (see the report and response in
  https://patchwork.kernel.org/patch/10512431/)

Omar Sandoval (7):
  proc/kcore: don't grab lock for kclist_add()
  proc/kcore: replace kclist_lock rwlock with rwsem
  proc/kcore: fix memory hotplug vs multiple opens race
  proc/kcore: hold lock during read
  proc/kcore: clean up ELF header generation
  proc/kcore: optimize multiple page reads
  proc/kcore: add vmcoreinfo note to /proc/kcore

 fs/proc/Kconfig|   1 +
 fs/proc/kcore.c| 536 +
 include/linux/crash_core.h |   2 +
 kernel/crash_core.c|   4 +-
 4 files changed, 251 insertions(+), 292 deletions(-)

-- 
2.18.0



Re: [PATCH 5/7] proc/kcore: clean up ELF header generation

2018-07-07 Thread Omar Sandoval
On Sat, Jul 07, 2018 at 06:05:17PM +0800, kbuild test robot wrote:
> Hi Omar,
> 
> Thank you for the patch! Perhaps something to improve:
> 
> [auto build test WARNING on linus/master]
> [also build test WARNING on v4.18-rc3 next-20180706]
> [if your patch is applied to the wrong git tree, please drop us a note to 
> help improve the system]
> 
> url:
> https://github.com/0day-ci/linux/commits/Omar-Sandoval/proc-kcore-improvements/20180707-052548
> reproduce:
> # apt-get install sparse
> make ARCH=x86_64 allmodconfig
> make C=1 CF=-D__CHECK_ENDIAN__
> 
> 
> sparse warnings: (new ones prefixed by >>)
> 
>include/linux/nodemask.h:265:16: sparse: expression using sizeof(void)
>include/linux/nodemask.h:271:16: sparse: expression using sizeof(void)
> >> fs//proc/kcore.c:328:23: sparse: expression using sizeof(void)
> >> fs//proc/kcore.c:328:23: sparse: expression using sizeof(void)

Not sure why this confuses sparse. Maybe because it's #define elfhdr
elf64_hdr.

>fs//proc/kcore.c:368:23: sparse: expression using sizeof(void)
>fs//proc/kcore.c:368:23: sparse: expression using sizeof(void)
> >> fs//proc/kcore.c:384:49: sparse: missing braces around initializer

This is

>  > 384   struct elf_prstatus prstatus = {0};

GCC doesn't complain, but I guess I can change it to "= {};", which
isn't strict C89 but both GCC and sparse are happy with.
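
A standalone illustration of that warning (the struct below is made up; the mechanism is that the first member is itself an aggregate, so "{0}" initializes it without inner braces):

struct inner { int a, b; };
struct outer {
	struct inner first;	/* aggregate first member */
	int flag;
};

struct outer warns  = { 0 };		/* sparse: missing braces around initializer (newer GCC accepts it) */
struct outer quiet  = { };		/* empty initializer (GNU extension / C23): no warning */
struct outer braced = { { 0, 0 }, 0 };	/* fully braced: no warning either */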


[PATCH 2/7] proc/kcore: replace kclist_lock rwlock with rwsem

2018-07-06 Thread Omar Sandoval
From: Omar Sandoval 

Now we only need kclist_lock from user context and at fs init time, and
the following changes need to sleep while holding the kclist_lock.

Signed-off-by: Omar Sandoval 
---
 fs/proc/kcore.c | 32 +++-
 1 file changed, 15 insertions(+), 17 deletions(-)

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index afd1ff8c2d3f..eb1be07bdb3d 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -59,8 +59,8 @@ struct memelfnote
 };
 
 static LIST_HEAD(kclist_head);
-static DEFINE_RWLOCK(kclist_lock);
-static int kcore_need_update = 1;
+static DECLARE_RWSEM(kclist_lock);
+static atomic_t kcore_need_update = ATOMIC_INIT(1);
 
 /* This doesn't grab kclist_lock, so it should only be used at init time. */
 void
@@ -117,8 +117,8 @@ static void __kcore_update_ram(struct list_head *list)
struct kcore_list *tmp, *pos;
LIST_HEAD(garbage);
 
-   write_lock(_lock);
-   if (kcore_need_update) {
+   down_write(_lock);
+   if (atomic_cmpxchg(_need_update, 1, 0)) {
list_for_each_entry_safe(pos, tmp, _head, list) {
if (pos->type == KCORE_RAM
|| pos->type == KCORE_VMEMMAP)
@@ -127,9 +127,8 @@ static void __kcore_update_ram(struct list_head *list)
list_splice_tail(list, _head);
} else
list_splice(list, );
-   kcore_need_update = 0;
proc_root_kcore->size = get_kcore_size(, );
-   write_unlock(_lock);
+   up_write(_lock);
 
free_kclist_ents();
 }
@@ -450,11 +449,11 @@ read_kcore(struct file *file, char __user *buffer, size_t 
buflen, loff_t *fpos)
int nphdr;
unsigned long start;
 
-   read_lock(_lock);
+   down_read(_lock);
size = get_kcore_size(, _buflen);
 
if (buflen == 0 || *fpos >= size) {
-   read_unlock(_lock);
+   up_read(_lock);
return 0;
}
 
@@ -471,11 +470,11 @@ read_kcore(struct file *file, char __user *buffer, size_t 
buflen, loff_t *fpos)
tsz = buflen;
elf_buf = kzalloc(elf_buflen, GFP_ATOMIC);
if (!elf_buf) {
-   read_unlock(_lock);
+   up_read(_lock);
return -ENOMEM;
}
elf_kcore_store_hdr(elf_buf, nphdr, elf_buflen);
-   read_unlock(_lock);
+   up_read(_lock);
if (copy_to_user(buffer, elf_buf + *fpos, tsz)) {
kfree(elf_buf);
return -EFAULT;
@@ -490,7 +489,7 @@ read_kcore(struct file *file, char __user *buffer, size_t 
buflen, loff_t *fpos)
if (buflen == 0)
return acc;
} else
-   read_unlock(_lock);
+   up_read(_lock);
 
/*
 * Check to see if our file offset matches with any of
@@ -503,12 +502,12 @@ read_kcore(struct file *file, char __user *buffer, size_t 
buflen, loff_t *fpos)
while (buflen) {
struct kcore_list *m;
 
-   read_lock(_lock);
+   down_read(_lock);
list_for_each_entry(m, _head, list) {
if (start >= m->addr && start < (m->addr+m->size))
break;
}
-   read_unlock(_lock);
+   up_read(_lock);
 
if (>list == _head) {
if (clear_user(buffer, tsz))
@@ -561,7 +560,7 @@ static int open_kcore(struct inode *inode, struct file 
*filp)
if (!filp->private_data)
return -ENOMEM;
 
-   if (kcore_need_update)
+   if (atomic_read(_need_update))
kcore_update_ram();
if (i_size_read(inode) != proc_root_kcore->size) {
inode_lock(inode);
@@ -591,9 +590,8 @@ static int __meminit kcore_callback(struct notifier_block 
*self,
switch (action) {
case MEM_ONLINE:
case MEM_OFFLINE:
-   write_lock(_lock);
-   kcore_need_update = 1;
-   write_unlock(_lock);
+   atomic_set(_need_update, 1);
+   break;
}
return NOTIFY_OK;
 }
-- 
2.18.0



[PATCH 1/7] proc/kcore: don't grab lock for kclist_add()

2018-07-06 Thread Omar Sandoval
From: Omar Sandoval 

kclist_add() is only called at init time, so there's no point in
grabbing any locks. We're also going to replace the rwlock with a rwsem,
which we don't want to try grabbing during early boot.

Signed-off-by: Omar Sandoval 
---
 fs/proc/kcore.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index e64ecb9f2720..afd1ff8c2d3f 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -62,6 +62,7 @@ static LIST_HEAD(kclist_head);
 static DEFINE_RWLOCK(kclist_lock);
 static int kcore_need_update = 1;
 
+/* This doesn't grab kclist_lock, so it should only be used at init time. */
 void
 kclist_add(struct kcore_list *new, void *addr, size_t size, int type)
 {
@@ -69,9 +70,7 @@ kclist_add(struct kcore_list *new, void *addr, size_t size, 
int type)
new->size = size;
new->type = type;
 
-   write_lock(_lock);
list_add_tail(>list, _head);
-   write_unlock(_lock);
 }
 
 static size_t get_kcore_size(int *nphdr, size_t *elf_buflen)
-- 
2.18.0



[PATCH 0/7] /proc/kcore improvements

2018-07-06 Thread Omar Sandoval
From: Omar Sandoval 

Hi,

This series makes a few improvements to /proc/kcore. Patches 1 and 2 are
prep patches. Patch 3 is a fix/cleanup. Patch 4 is another prep patch.
Patches 5 and 6 are optimizations to ->read(). Patch 7 adds vmcoreinfo
to /proc/kcore. This series is based on v4.18-rc3. Please consider it
for v4.19.

Thanks!

Omar Sandoval (7):
  proc/kcore: don't grab lock for kclist_add()
  proc/kcore: replace kclist_lock rwlock with rwsem
  proc/kcore: fix memory hotplug vs multiple opens race
  proc/kcore: hold lock during read
  proc/kcore: clean up ELF header generation
  proc/kcore: optimize multiple page reads
  proc/kcore: add vmcoreinfo note to /proc/kcore

 fs/proc/Kconfig|   1 +
 fs/proc/kcore.c| 524 +
 include/linux/crash_core.h |   2 +
 kernel/crash_core.c|   4 +-
 4 files changed, 241 insertions(+), 290 deletions(-)

-- 
2.18.0



[PATCH 3/7] proc/kcore: fix memory hotplug vs multiple opens race

2018-07-06 Thread Omar Sandoval
From: Omar Sandoval 

There's a theoretical race condition that will cause /proc/kcore to miss
a memory hotplug event:

CPU0  CPU1
// hotplug event 1
kcore_need_update = 1

open_kcore()  open_kcore()
kcore_update_ram()kcore_update_ram()
// Walk RAM   // Walk RAM
__kcore_update_ram()  __kcore_update_ram()
kcore_need_update = 0

// hotplug event 2
kcore_need_update = 1
  kcore_need_update = 0

Note that CPU1 set up the RAM kcore entries with the state after hotplug
event 1 but cleared the flag for hotplug event 2. The RAM entries will
therefore be stale until there is another hotplug event.

This is an extremely unlikely sequence of events, but the fix makes the
synchronization saner anyway: we serialize the entire update sequence,
which means that whoever clears the flag will always succeed in
replacing the kcore list.

Signed-off-by: Omar Sandoval 
---
 fs/proc/kcore.c | 93 +++--
 1 file changed, 44 insertions(+), 49 deletions(-)

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index eb1be07bdb3d..f335400300d3 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -98,53 +98,15 @@ static size_t get_kcore_size(int *nphdr, size_t *elf_buflen)
return size + *elf_buflen;
 }
 
-static void free_kclist_ents(struct list_head *head)
-{
-   struct kcore_list *tmp, *pos;
-
-   list_for_each_entry_safe(pos, tmp, head, list) {
-   list_del(>list);
-   kfree(pos);
-   }
-}
-/*
- * Replace all KCORE_RAM/KCORE_VMEMMAP information with passed list.
- */
-static void __kcore_update_ram(struct list_head *list)
-{
-   int nphdr;
-   size_t size;
-   struct kcore_list *tmp, *pos;
-   LIST_HEAD(garbage);
-
-   down_write(_lock);
-   if (atomic_cmpxchg(_need_update, 1, 0)) {
-   list_for_each_entry_safe(pos, tmp, _head, list) {
-   if (pos->type == KCORE_RAM
-   || pos->type == KCORE_VMEMMAP)
-   list_move(>list, );
-   }
-   list_splice_tail(list, _head);
-   } else
-   list_splice(list, );
-   proc_root_kcore->size = get_kcore_size(, );
-   up_write(_lock);
-
-   free_kclist_ents();
-}
-
-
 #ifdef CONFIG_HIGHMEM
 /*
  * If no highmem, we can assume [0...max_low_pfn) continuous range of memory
  * because memory hole is not as big as !HIGHMEM case.
  * (HIGHMEM is special because part of memory is _invisible_ from the kernel.)
  */
-static int kcore_update_ram(void)
+static int kcore_ram_list(struct list_head *head)
 {
-   LIST_HEAD(head);
struct kcore_list *ent;
-   int ret = 0;
 
ent = kmalloc(sizeof(*ent), GFP_KERNEL);
if (!ent)
@@ -152,9 +114,8 @@ static int kcore_update_ram(void)
ent->addr = (unsigned long)__va(0);
ent->size = max_low_pfn << PAGE_SHIFT;
ent->type = KCORE_RAM;
-   list_add(>list, );
-   __kcore_update_ram();
-   return ret;
+   list_add(>list, head);
+   return 0;
 }
 
 #else /* !CONFIG_HIGHMEM */
@@ -253,11 +214,10 @@ kclist_add_private(unsigned long pfn, unsigned long 
nr_pages, void *arg)
return 1;
 }
 
-static int kcore_update_ram(void)
+static int kcore_ram_list(struct list_head *list)
 {
int nid, ret;
unsigned long end_pfn;
-   LIST_HEAD(head);
 
/* Not inializedupdate now */
/* find out "max pfn" */
@@ -269,15 +229,50 @@ static int kcore_update_ram(void)
end_pfn = node_end;
}
/* scan 0 to max_pfn */
-   ret = walk_system_ram_range(0, end_pfn, , kclist_add_private);
-   if (ret) {
-   free_kclist_ents();
+   ret = walk_system_ram_range(0, end_pfn, list, kclist_add_private);
+   if (ret)
return -ENOMEM;
+   return 0;
+}
+#endif /* CONFIG_HIGHMEM */
+
+static int kcore_update_ram(void)
+{
+   LIST_HEAD(list);
+   LIST_HEAD(garbage);
+   int nphdr;
+   size_t size;
+   struct kcore_list *tmp, *pos;
+   int ret = 0;
+
+   down_write(_lock);
+   if (!atomic_cmpxchg(_need_update, 1, 0))
+   goto out;
+
+   ret = kcore_ram_list();
+   if (ret) {
+   /* Couldn't get the RAM list, try again next time. */
+   atomic_set(_need_update, 1);
+   list_splice_tail(, );
+   goto out;
+   }
+
+   list_for_each_entry_safe(pos, tmp, _head, list) {
+   if (pos->type == KCORE_RAM || pos->type == KCORE_VMEMMAP)
+   list_move(>list, );
+   }
+   list_splice_tail(, _head);
+
+   proc_root_kcore->size = get_kcore_size(, );
+
+out:
+   up_write(_lock);
+   list_for_each_ent

[PATCH 7/7] proc/kcore: add vmcoreinfo note to /proc/kcore

2018-07-06 Thread Omar Sandoval
From: Omar Sandoval 

The vmcoreinfo information is useful for runtime debugging tools, not
just for crash dumps. A lot of this information can be determined by
other means, but this is much more convenient.

Signed-off-by: Omar Sandoval 
---
 fs/proc/Kconfig|  1 +
 fs/proc/kcore.c| 10 --
 include/linux/crash_core.h |  2 ++
 kernel/crash_core.c|  4 ++--
 4 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 0eaeb41453f5..817c02b13b1d 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -31,6 +31,7 @@ config PROC_FS
 config PROC_KCORE
bool "/proc/kcore support" if !ARM
depends on PROC_FS && MMU
+   select CRASH_CORE
help
  Provides a virtual ELF core file of the live kernel.  This can
  be read with gdb and other ELF tools.  No modifications can be
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index f1e21579cd22..49537c4d78ab 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -10,6 +10,7 @@
  * Safe accesses to vmalloc/direct-mapped discontiguous areas, Kanoj 
Sarcar 
  */
 
+#include 
 #include 
 #include 
 #include 
@@ -81,10 +82,13 @@ static size_t get_kcore_size(int *nphdr, size_t *phdrs_len, 
size_t *notes_len,
}
 
*phdrs_len = *nphdr * sizeof(struct elf_phdr);
-   *notes_len = (3 * (sizeof(struct elf_note) + ALIGN(sizeof(CORE_STR), 4)) +
+   *notes_len = (4 * sizeof(struct elf_note) +
+ 3 * ALIGN(sizeof(CORE_STR), 4) +
+ VMCOREINFO_NOTE_NAME_BYTES +
  ALIGN(sizeof(struct elf_prstatus), 4) +
  ALIGN(sizeof(struct elf_prpsinfo), 4) +
- ALIGN(arch_task_struct_size, 4));
+ ALIGN(arch_task_struct_size, 4) +
+ ALIGN(vmcoreinfo_size, 4));
*data_offset = PAGE_ALIGN(sizeof(struct elfhdr) + *phdrs_len +
  *notes_len);
return *data_offset + size;
@@ -404,6 +408,8 @@ read_kcore(struct file *file, char __user *buffer, size_t 
buflen, loff_t *fpos)
  sizeof(prpsinfo));
append_kcore_note(notes, , CORE_STR, NT_TASKSTRUCT, current,
  arch_task_struct_size);
+   append_kcore_note(notes, , VMCOREINFO_NOTE_NAME, 0,
+ vmcoreinfo_data, vmcoreinfo_size);
 
tsz = min_t(size_t, buflen, notes_offset + notes_len - *fpos);
if (copy_to_user(buffer, notes + *fpos - notes_offset, tsz)) {
diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index b511f6d24b42..525510a9f965 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -60,6 +60,8 @@ phys_addr_t paddr_vmcoreinfo_note(void);
 #define VMCOREINFO_CONFIG(name) \
vmcoreinfo_append_str("CONFIG_%s=y\n", #name)
 
+extern unsigned char *vmcoreinfo_data;
+extern size_t vmcoreinfo_size;
 extern u32 *vmcoreinfo_note;
 
 Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index b66aced5e8c2..d02c58b94460 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -14,8 +14,8 @@
 #include 
 
 /* vmcoreinfo stuff */
-static unsigned char *vmcoreinfo_data;
-static size_t vmcoreinfo_size;
+unsigned char *vmcoreinfo_data;
+size_t vmcoreinfo_size;
 u32 *vmcoreinfo_note;
 
 /* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */
-- 
2.18.0



[PATCH 4/7] proc/kcore: hold lock during read

2018-07-06 Thread Omar Sandoval
From: Omar Sandoval 

Now that we're using an rwsem, we can hold it during the entirety of
read_kcore() and have a common return path. This is preparation for the
next change.
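
As a minimal sketch of the resulting shape (illustrative names, not the patch
itself): take the rwsem once on entry, send every exit through a single label,
and unlock in exactly one place.

#include <linux/rwsem.h>
#include <linux/uaccess.h>

/* Illustrative only: the single-exit pattern read_kcore() is moved to. */
static ssize_t read_one_region(struct rw_semaphore *sem,
			       char __user *ubuf, const void *src, size_t len)
{
	ssize_t ret = 0;

	down_read(sem);		/* held for the whole operation */
	if (copy_to_user(ubuf, src, len)) {
		ret = -EFAULT;
		goto out;	/* every error path lands here */
	}
	ret = len;
out:
	up_read(sem);		/* exactly one unlock site */
	return ret;
}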

Signed-off-by: Omar Sandoval 
---
 fs/proc/kcore.c | 70 -
 1 file changed, 40 insertions(+), 30 deletions(-)

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index f335400300d3..b7ff2e2ec350 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -438,19 +438,18 @@ static ssize_t
 read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 {
char *buf = file->private_data;
-   ssize_t acc = 0;
size_t size, tsz;
size_t elf_buflen;
int nphdr;
unsigned long start;
+   size_t orig_buflen = buflen;
+   int ret = 0;
 
	down_read(&kclist_lock);
	size = get_kcore_size(&nphdr, &elf_buflen);
 
-   if (buflen == 0 || *fpos >= size) {
-   up_read(&kclist_lock);
-   return 0;
-   }
+   if (buflen == 0 || *fpos >= size)
+   goto out;
 
/* trim buflen to not go beyond EOF */
if (buflen > size - *fpos)
@@ -463,28 +462,26 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
tsz = elf_buflen - *fpos;
if (buflen < tsz)
tsz = buflen;
-   elf_buf = kzalloc(elf_buflen, GFP_ATOMIC);
+   elf_buf = kzalloc(elf_buflen, GFP_KERNEL);
if (!elf_buf) {
-   up_read(&kclist_lock);
-   return -ENOMEM;
+   ret = -ENOMEM;
+   goto out;
}
elf_kcore_store_hdr(elf_buf, nphdr, elf_buflen);
-   up_read(&kclist_lock);
if (copy_to_user(buffer, elf_buf + *fpos, tsz)) {
kfree(elf_buf);
-   return -EFAULT;
+   ret = -EFAULT;
+   goto out;
}
kfree(elf_buf);
buflen -= tsz;
*fpos += tsz;
buffer += tsz;
-   acc += tsz;
 
/* leave now if filled buffer already */
if (buflen == 0)
-   return acc;
-   } else
-   up_read(&kclist_lock);
+   goto out;
+   }
 
/*
 * Check to see if our file offset matches with any of
@@ -497,25 +494,29 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
while (buflen) {
struct kcore_list *m;
 
-   down_read(&kclist_lock);
	list_for_each_entry(m, &kclist_head, list) {
if (start >= m->addr && start < (m->addr+m->size))
break;
}
-   up_read(&kclist_lock);
 
	if (&m->list == &kclist_head) {
-   if (clear_user(buffer, tsz))
-   return -EFAULT;
+   if (clear_user(buffer, tsz)) {
+   ret = -EFAULT;
+   goto out;
+   }
} else if (m->type == KCORE_VMALLOC) {
vread(buf, (char *)start, tsz);
/* we have to zero-fill user buffer even if no read */
-   if (copy_to_user(buffer, buf, tsz))
-   return -EFAULT;
+   if (copy_to_user(buffer, buf, tsz)) {
+   ret = -EFAULT;
+   goto out;
+   }
} else if (m->type == KCORE_USER) {
/* User page is handled prior to normal kernel page: */
-   if (copy_to_user(buffer, (char *)start, tsz))
-   return -EFAULT;
+   if (copy_to_user(buffer, (char *)start, tsz)) {
+   ret = -EFAULT;
+   goto out;
+   }
} else {
if (kern_addr_valid(start)) {
/*
@@ -523,26 +524,35 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 * hardened user copy kernel text checks.
 */
	if (probe_kernel_read(buf, (void *) start, tsz)) {
-   if (clear_user(buffer, tsz))
-   return -EFAULT;
+   if (clear_user(buffer, tsz)) {
+   ret = -EFAULT;
+   goto out;
+   }
} else {
-
