date:20150427

[PATCH 03/10] block: allow __blk_queue_bounce() to handle bios larger than BIO_MAX_PAGES

2015-04-27 Thread Ming Lin

From: Kent Overstreet 

Allow __blk_queue_bounce() to handle bios with more than BIO_MAX_PAGES
segments. Doing that, it becomes possible to simplify the block layer
in the kernel.

The issue is that any code that clones the bio and must clone the biovec
(i.e. it can't use bio_clone_fast()) won't be able to allocate a bio with
more than BIO_MAX_PAGES - bio_alloc_bioset() always fails in that case.

Fortunately, it's easy to make __blk_queue_bounce() just process part of
the bio if necessary, using bi_remaining to count the splits and punting
the rest back to generic_make_request().

Cc: Christoph Hellwig 
Cc: Jens Axboe 
Signed-off-by: Kent Overstreet 
[dpark: add more description in commit message]
Signed-off-by: Dongsu Park 
Signed-off-by: Ming Lin 
---
 block/bounce.c | 60 ++
 1 file changed, 52 insertions(+), 8 deletions(-)

diff --git a/block/bounce.c b/block/bounce.c
index ab21ba2..689ea89 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -196,6 +196,43 @@ static int must_snapshot_stable_pages(struct request_queue 
*q, struct bio *bio)
 }
 #endif /* CONFIG_NEED_BOUNCE_POOL */
 
+static struct bio *bio_clone_segments(struct bio *bio_src, gfp_t gfp_mask,
+ struct bio_set *bs, unsigned nsegs)
+{
+   struct bvec_iter iter;
+   struct bio_vec bv;
+   struct bio *bio;
+
+   bio = bio_alloc_bioset(gfp_mask, nsegs, bs);
+   if (!bio)
+   return NULL;
+
+   bio->bi_bdev= bio_src->bi_bdev;
+   bio->bi_rw  = bio_src->bi_rw;
+   bio->bi_iter.bi_sector  = bio_src->bi_iter.bi_sector;
+
+   bio_for_each_segment(bv, bio_src, iter) {
+   bio->bi_io_vec[bio->bi_vcnt++] = bv;
+   bio->bi_iter.bi_size += bv.bv_len;
+   if (!--nsegs)
+   break;
+   }
+
+   if (bio_integrity(bio_src)) {
+   int ret;
+
+   ret = bio_integrity_clone(bio, bio_src, gfp_mask);
+   if (ret < 0) {
+   bio_put(bio);
+   return NULL;
+   }
+   }
+
+   bio_src->bi_iter = iter;
+
+   return bio;
+}
+
 static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
   mempool_t *pool, int force)
 {
@@ -203,17 +240,24 @@ static void __blk_queue_bounce(struct request_queue *q, 
struct bio **bio_orig,
int rw = bio_data_dir(*bio_orig);
struct bio_vec *to, from;
struct bvec_iter iter;
-   unsigned i;
+   int i, nsegs = 0, bounce = force;
 
-   if (force)
-   goto bounce;
-   bio_for_each_segment(from, *bio_orig, iter)
+   bio_for_each_segment(from, *bio_orig, iter) {
+   nsegs++;
if (page_to_pfn(from.bv_page) > queue_bounce_pfn(q))
-   goto bounce;
+   bounce = 1;
+   }
+
+   if (!bounce)
+   return;
 
-   return;
-bounce:
-   bio = bio_clone_bioset(*bio_orig, GFP_NOIO, fs_bio_set);
+   bio = bio_clone_segments(*bio_orig, GFP_NOIO, fs_bio_set,
+min(nsegs, BIO_MAX_PAGES));
+
+   if ((*bio_orig)->bi_iter.bi_size) {
+   atomic_inc(&(*bio_orig)->bi_remaining);
+   generic_make_request(*bio_orig);
+   }
 
bio_for_each_segment_all(to, bio, i) {
struct page *page = to->bv_page;
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 08/10] fs: use helper bio_add_page() instead of open coding on bi_io_vec

2015-04-27 Thread Ming Lin

From: Kent Overstreet 

Call pre-defined helper bio_add_page() instead of open coding for
iterating through bi_io_vec[]. Doing that, it's possible to make some
parts in filesystems and mm/page_io.c simpler than before.

Acked-by: Dave Kleikamp 
Cc: Christoph Hellwig 
Cc: Al Viro 
Cc: linux-fsde...@vger.kernel.org
Signed-off-by: Kent Overstreet 
[dpark: add more description in commit message]
Signed-off-by: Dongsu Park 
Signed-off-by: Ming Lin 
---
 fs/buffer.c |  7 ++-
 fs/jfs/jfs_logmgr.c | 14 --
 mm/page_io.c|  8 +++-
 3 files changed, 9 insertions(+), 20 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index c7a5602..d9f00b6 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3022,12 +3022,9 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned 
long bio_flags)
 
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio->bi_bdev = bh->b_bdev;
-   bio->bi_io_vec[0].bv_page = bh->b_page;
-   bio->bi_io_vec[0].bv_len = bh->b_size;
-   bio->bi_io_vec[0].bv_offset = bh_offset(bh);
 
-   bio->bi_vcnt = 1;
-   bio->bi_iter.bi_size = bh->b_size;
+   bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
+   BUG_ON(bio->bi_iter.bi_size != bh->b_size);
 
bio->bi_end_io = end_bio_bh_io_sync;
bio->bi_private = bh;
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index bc462dc..46fae06 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1999,12 +1999,9 @@ static int lbmRead(struct jfs_log * log, int pn, struct 
lbuf ** bpp)
 
bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9);
bio->bi_bdev = log->bdev;
-   bio->bi_io_vec[0].bv_page = bp->l_page;
-   bio->bi_io_vec[0].bv_len = LOGPSIZE;
-   bio->bi_io_vec[0].bv_offset = bp->l_offset;
 
-   bio->bi_vcnt = 1;
-   bio->bi_iter.bi_size = LOGPSIZE;
+   bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset);
+   BUG_ON(bio->bi_iter.bi_size != LOGPSIZE);
 
bio->bi_end_io = lbmIODone;
bio->bi_private = bp;
@@ -2145,12 +2142,9 @@ static void lbmStartIO(struct lbuf * bp)
bio = bio_alloc(GFP_NOFS, 1);
bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9);
bio->bi_bdev = log->bdev;
-   bio->bi_io_vec[0].bv_page = bp->l_page;
-   bio->bi_io_vec[0].bv_len = LOGPSIZE;
-   bio->bi_io_vec[0].bv_offset = bp->l_offset;
 
-   bio->bi_vcnt = 1;
-   bio->bi_iter.bi_size = LOGPSIZE;
+   bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset);
+   BUG_ON(bio->bi_iter.bi_size != LOGPSIZE);
 
bio->bi_end_io = lbmIODone;
bio->bi_private = bp;
diff --git a/mm/page_io.c b/mm/page_io.c
index 6424869..9fb8a0d 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -33,12 +33,10 @@ static struct bio *get_swap_bio(gfp_t gfp_flags,
if (bio) {
bio->bi_iter.bi_sector = map_swap_page(page, &bio->bi_bdev);
bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9;
-   bio->bi_io_vec[0].bv_page = page;
-   bio->bi_io_vec[0].bv_len = PAGE_SIZE;
-   bio->bi_io_vec[0].bv_offset = 0;
-   bio->bi_vcnt = 1;
-   bio->bi_iter.bi_size = PAGE_SIZE;
bio->bi_end_io = end_io;
+
+   bio_add_page(bio, page, PAGE_SIZE, 0);
+   BUG_ON(bio->bi_iter.bi_size != PAGE_SIZE);
}
return bio;
 }
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 02/10] block: simplify bio_add_page()

2015-04-27 Thread Ming Lin

From: Kent Overstreet 

Since generic_make_request() can now handle arbitrary size bios, all we
have to do is make sure the bvec array doesn't overflow.
__bio_add_page() doesn't need to call ->merge_bvec_fn() any more, so
we can get rid of unnecessary code paths.

Note that removing the call to ->merge_bvec_fn() is fine for
bio_add_pc_page(), as SCSI devices usually don't even need that.
The few exceptional cases like pscsi or osd are not affected either.

Cc: Christoph Hellwig 
Cc: Jens Axboe 
Signed-off-by: Kent Overstreet 
[dpark: rebase and resolve merge conflicts, change a couple of comments,
 make bio_add_page() warn once upon a cloned bio.]
Signed-off-by: Dongsu Park 
Signed-off-by: Ming Lin 
---
 block/bio.c | 135 +---
 1 file changed, 55 insertions(+), 80 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index f66a4ea..ae31cdb 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -699,9 +699,23 @@ int bio_get_nr_vecs(struct block_device *bdev)
 }
 EXPORT_SYMBOL(bio_get_nr_vecs);
 
-static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
- *page, unsigned int len, unsigned int offset,
- unsigned int max_sectors)
+/**
+ * bio_add_pc_page -   attempt to add page to bio
+ * @q: the target queue
+ * @bio: destination bio
+ * @page: page to add
+ * @len: vec entry length
+ * @offset: vec entry offset
+ *
+ * Attempt to add a page to the bio_vec maplist. This can fail for a
+ * number of reasons, such as the bio being full or target block device
+ * limitations. The target block device must allow bio's up to PAGE_SIZE,
+ * so it is always possible to add a single page to an empty bio.
+ *
+ * This should only be used by REQ_PC bios.
+ */
+int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page
+   *page, unsigned int len, unsigned int offset)
 {
int retried_segments = 0;
struct bio_vec *bvec;
@@ -712,7 +726,7 @@ static int __bio_add_page(struct request_queue *q, struct 
bio *bio, struct page
if (unlikely(bio_flagged(bio, BIO_CLONED)))
return 0;
 
-   if (((bio->bi_iter.bi_size + len) >> 9) > max_sectors)
+   if (((bio->bi_iter.bi_size + len) >> 9) > queue_max_hw_sectors(q))
return 0;
 
/*
@@ -725,28 +739,7 @@ static int __bio_add_page(struct request_queue *q, struct 
bio *bio, struct page
 
if (page == prev->bv_page &&
offset == prev->bv_offset + prev->bv_len) {
-   unsigned int prev_bv_len = prev->bv_len;
prev->bv_len += len;
-
-   if (q->merge_bvec_fn) {
-   struct bvec_merge_data bvm = {
-   /* prev_bvec is already charged in
-  bi_size, discharge it in order to
-  simulate merging updated prev_bvec
-  as new bvec. */
-   .bi_bdev = bio->bi_bdev,
-   .bi_sector = bio->bi_iter.bi_sector,
-   .bi_size = bio->bi_iter.bi_size -
-   prev_bv_len,
-   .bi_rw = bio->bi_rw,
-   };
-
-   if (q->merge_bvec_fn(q, &bvm, prev) < 
prev->bv_len) {
-   prev->bv_len -= len;
-   return 0;
-   }
-   }
-
bio->bi_iter.bi_size += len;
goto done;
}
@@ -789,27 +782,6 @@ static int __bio_add_page(struct request_queue *q, struct 
bio *bio, struct page
blk_recount_segments(q, bio);
}
 
-   /*
-* if queue has other restrictions (eg varying max sector size
-* depending on offset), it can specify a merge_bvec_fn in the
-* queue to get further control
-*/
-   if (q->merge_bvec_fn) {
-   struct bvec_merge_data bvm = {
-   .bi_bdev = bio->bi_bdev,
-   .bi_sector = bio->bi_iter.bi_sector,
-   .bi_size = bio->bi_iter.bi_size - len,
-   .bi_rw = bio->bi_rw,
-   };
-
-   /*
-* merge_bvec_fn() returns number of bytes it can accept
-* at this offset
-*/
-   if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len)
-   goto failed;
-   }
-
/* If we may be able to merge these biovecs, force a recount */
if (bio->bi_vcnt > 1 && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
bio->bi_flags &= ~(1 << BIO_SEG_VALID

[PATCH 04/10] bcache: clean up hacks around bio_split_pool

2015-04-27 Thread Ming Lin

From: Kent Overstreet 

There have been workarounds, found only in bcache, for the bio split pool
as well as for submitting bios. Since generic_make_request() is able to
handle arbitrarily sized bios, it's now possible to delete those hacks.

Cc: linux-bca...@vger.kernel.org
Signed-off-by: Kent Overstreet 
[dpark: add more description in commit message]
Signed-off-by: Dongsu Park 
Signed-off-by: Ming Lin 
---
 drivers/md/bcache/bcache.h|  18 
 drivers/md/bcache/io.c| 100 +-
 drivers/md/bcache/journal.c   |   4 +-
 drivers/md/bcache/request.c   |  16 +++
 drivers/md/bcache/super.c |  32 +-
 drivers/md/bcache/util.h  |   5 ++-
 drivers/md/bcache/writeback.c |   4 +-
 7 files changed, 18 insertions(+), 161 deletions(-)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 04f7bc2..6b420a5 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -243,19 +243,6 @@ struct keybuf {
DECLARE_ARRAY_ALLOCATOR(struct keybuf_key, freelist, KEYBUF_NR);
 };
 
-struct bio_split_pool {
-   struct bio_set  *bio_split;
-   mempool_t   *bio_split_hook;
-};
-
-struct bio_split_hook {
-   struct closure  cl;
-   struct bio_split_pool   *p;
-   struct bio  *bio;
-   bio_end_io_t*bi_end_io;
-   void*bi_private;
-};
-
 struct bcache_device {
struct closure  cl;
 
@@ -288,8 +275,6 @@ struct bcache_device {
int (*cache_miss)(struct btree *, struct search *,
  struct bio *, unsigned);
int (*ioctl) (struct bcache_device *, fmode_t, unsigned, unsigned long);
-
-   struct bio_split_pool   bio_split_hook;
 };
 
 struct io {
@@ -454,8 +439,6 @@ struct cache {
atomic_long_t   meta_sectors_written;
atomic_long_t   btree_sectors_written;
atomic_long_t   sectors_written;
-
-   struct bio_split_pool   bio_split_hook;
 };
 
 struct gc_stat {
@@ -873,7 +856,6 @@ void bch_bbio_endio(struct cache_set *, struct bio *, int, 
const char *);
 void bch_bbio_free(struct bio *, struct cache_set *);
 struct bio *bch_bbio_alloc(struct cache_set *);
 
-void bch_generic_make_request(struct bio *, struct bio_split_pool *);
 void __bch_submit_bbio(struct bio *, struct cache_set *);
 void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, 
unsigned);
 
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index fa028fa..86a0bb8 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -11,104 +11,6 @@
 
 #include 
 
-static unsigned bch_bio_max_sectors(struct bio *bio)
-{
-   struct request_queue *q = bdev_get_queue(bio->bi_bdev);
-   struct bio_vec bv;
-   struct bvec_iter iter;
-   unsigned ret = 0, seg = 0;
-
-   if (bio->bi_rw & REQ_DISCARD)
-   return min(bio_sectors(bio), q->limits.max_discard_sectors);
-
-   bio_for_each_segment(bv, bio, iter) {
-   struct bvec_merge_data bvm = {
-   .bi_bdev= bio->bi_bdev,
-   .bi_sector  = bio->bi_iter.bi_sector,
-   .bi_size= ret << 9,
-   .bi_rw  = bio->bi_rw,
-   };
-
-   if (seg == min_t(unsigned, BIO_MAX_PAGES,
-queue_max_segments(q)))
-   break;
-
-   if (q->merge_bvec_fn &&
-   q->merge_bvec_fn(q, &bvm, &bv) < (int) bv.bv_len)
-   break;
-
-   seg++;
-   ret += bv.bv_len >> 9;
-   }
-
-   ret = min(ret, queue_max_sectors(q));
-
-   WARN_ON(!ret);
-   ret = max_t(int, ret, bio_iovec(bio).bv_len >> 9);
-
-   return ret;
-}
-
-static void bch_bio_submit_split_done(struct closure *cl)
-{
-   struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);
-
-   s->bio->bi_end_io = s->bi_end_io;
-   s->bio->bi_private = s->bi_private;
-   bio_endio_nodec(s->bio, 0);
-
-   closure_debug_destroy(&s->cl);
-   mempool_free(s, s->p->bio_split_hook);
-}
-
-static void bch_bio_submit_split_endio(struct bio *bio, int error)
-{
-   struct closure *cl = bio->bi_private;
-   struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);
-
-   if (error)
-   clear_bit(BIO_UPTODATE, &s->bio->bi_flags);
-
-   bio_put(bio);
-   closure_put(cl);
-}
-
-void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p)
-{
-   struct bio_split_hook *s;
-   struct bio *n;
-
-   if (!bio_has_data(bio) && !(bio->bi_rw & REQ_DISCARD))
-   goto submit;
-
-   if (bio_sectors(bio) <= bch_bio_max_sectors(bio))
-   goto submit;
-
-   s = mempool_alloc(p->bio_split_hook, GFP_NOIO);
-   closure_init(&s->cl, NULL);
-
-   s->bio  = bio;
-

[PATCH 09/10] md/raid10: make sync_request_write() call bio_copy_data()

2015-04-27 Thread Ming Lin

From: Kent Overstreet 

Refactor sync_request_write() of md/raid10 to use bio_copy_data()
instead of open coding bio_vec iterations.

Cc: Christoph Hellwig 
Cc: Neil Brown 
Cc: linux-r...@vger.kernel.org
Reviewed-by: Christoph Hellwig 
Acked-by: NeilBrown 
Signed-off-by: Kent Overstreet 
[dpark: add more description in commit message]
Signed-off-by: Dongsu Park 
Signed-off-by: Ming Lin 
---
 drivers/md/raid10.c | 18 --
 1 file changed, 4 insertions(+), 14 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index a46c402..6ea6f5f 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1989,17 +1989,10 @@ static void sync_request_write(struct mddev *mddev, 
struct r10bio *r10_bio)
tbio->bi_rw = WRITE;
tbio->bi_private = r10_bio;
tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
-
-   for (j=0; j < vcnt ; j++) {
-   tbio->bi_io_vec[j].bv_offset = 0;
-   tbio->bi_io_vec[j].bv_len = PAGE_SIZE;
-
-   memcpy(page_address(tbio->bi_io_vec[j].bv_page),
-  page_address(fbio->bi_io_vec[j].bv_page),
-  PAGE_SIZE);
-   }
tbio->bi_end_io = end_sync_write;
 
+   bio_copy_data(tbio, fbio);
+
d = r10_bio->devs[i].devnum;
atomic_inc(&conf->mirrors[d].rdev->nr_pending);
atomic_inc(&r10_bio->remaining);
@@ -2014,17 +2007,14 @@ static void sync_request_write(struct mddev *mddev, 
struct r10bio *r10_bio)
 * that are active
 */
for (i = 0; i < conf->copies; i++) {
-   int j, d;
+   int d;
 
tbio = r10_bio->devs[i].repl_bio;
if (!tbio || !tbio->bi_end_io)
continue;
if (r10_bio->devs[i].bio->bi_end_io != end_sync_write
&& r10_bio->devs[i].bio != fbio)
-   for (j = 0; j < vcnt; j++)
-   memcpy(page_address(tbio->bi_io_vec[j].bv_page),
-  page_address(fbio->bi_io_vec[j].bv_page),
-  PAGE_SIZE);
+   bio_copy_data(tbio, fbio);
d = r10_bio->devs[i].devnum;
atomic_inc(&r10_bio->remaining);
md_sync_acct(conf->mirrors[d].replacement->bdev,
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 06/10] md/raid5: get rid of bio_fits_rdev()

2015-04-27 Thread Ming Lin

From: Kent Overstreet 

Remove bio_fits_rdev() completely, because ->merge_bvec_fn() is now
gone. There's no point in calling bio_fits_rdev() only to ensure
aligned reads from rdev.

Cc: Neil Brown 
Cc: linux-r...@vger.kernel.org
Signed-off-by: Kent Overstreet 
[dpark: add more description in commit message]
Signed-off-by: Dongsu Park 
Signed-off-by: Ming Lin 
---
 drivers/md/raid5.c | 23 +--
 1 file changed, 1 insertion(+), 22 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 77dfd72..7f4a717 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4738,25 +4738,6 @@ static void raid5_align_endio(struct bio *bi, int error)
add_bio_to_retry(raid_bi, conf);
 }
 
-static int bio_fits_rdev(struct bio *bi)
-{
-   struct request_queue *q = bdev_get_queue(bi->bi_bdev);
-
-   if (bio_sectors(bi) > queue_max_sectors(q))
-   return 0;
-   blk_recount_segments(q, bi);
-   if (bi->bi_phys_segments > queue_max_segments(q))
-   return 0;
-
-   if (q->merge_bvec_fn)
-   /* it's too hard to apply the merge_bvec_fn at this stage,
-* just just give up
-*/
-   return 0;
-
-   return 1;
-}
-
 static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
 {
struct r5conf *conf = mddev->private;
@@ -4810,11 +4791,9 @@ static int chunk_aligned_read(struct mddev *mddev, 
struct bio * raid_bio)
align_bi->bi_bdev =  rdev->bdev;
__clear_bit(BIO_SEG_VALID, &align_bi->bi_flags);
 
-   if (!bio_fits_rdev(align_bi) ||
-   is_badblock(rdev, align_bi->bi_iter.bi_sector,
+   if (is_badblock(rdev, align_bi->bi_iter.bi_sector,
bio_sectors(align_bi),
&first_bad, &bad_sectors)) {
-   /* too big in some way, or has a known bad block */
bio_put(align_bi);
rdev_dec_pending(rdev, mddev);
return 0;
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 01/10] block: make generic_make_request handle arbitrarily sized bios

2015-04-27 Thread Ming Lin

From: Kent Overstreet 

The way the block layer is currently written, it goes to great lengths
to avoid having to split bios; upper layer code (such as bio_add_page())
checks what the underlying device can handle and tries to always create
bios that don't need to be split.

But this approach becomes unwieldy and eventually breaks down with
stacked devices and devices with dynamic limits, and it adds a lot of
complexity. If the block layer could split bios as needed, we could
eliminate a lot of complexity elsewhere - particularly in stacked
drivers. Code that creates bios can then create whatever size bios are
convenient, and more importantly stacked drivers don't have to deal with
both their own bio size limitations and the limitations of the
(potentially multiple) devices underneath them.  In the future this will
let us delete merge_bvec_fn and a bunch of other code.

We do this by adding calls to blk_queue_split() to the various
make_request functions that need it - a few can already handle arbitrary
size bios. Note that we add the call _after_ any call to
blk_queue_bounce(); this means that blk_queue_split() and
blk_recalc_rq_segments() don't need to be concerned with bouncing
affecting segment merging.

Some make_request_fn() callbacks were simple enough to audit and verify
they don't need blk_queue_split() calls. The skipped ones are:

 * nfhd_make_request (arch/m68k/emu/nfblock.c)
 * axon_ram_make_request (arch/powerpc/sysdev/axonram.c)
 * simdisk_make_request (arch/xtensa/platforms/iss/simdisk.c)
 * brd_make_request (ramdisk - drivers/block/brd.c)
 * mtip_submit_request (drivers/block/mtip32xx/mtip32xx.c)
 * loop_make_request
 * null_queue_bio
 * bcache's make_request fns

Some others are almost certainly safe to remove now, but will be left
for future patches.

Cc: Jens Axboe 
Cc: Christoph Hellwig 
Cc: Al Viro 
Cc: Ming Lei 
Cc: Neil Brown 
Cc: Alasdair Kergon 
Cc: Mike Snitzer 
Cc: dm-de...@redhat.com
Cc: Lars Ellenberg 
Cc: drbd-u...@lists.linbit.com
Cc: Jiri Kosina 
Cc: Geoff Levand 
Cc: Jim Paris 
Cc: Joshua Morris 
Cc: Philip Kelleher 
Cc: Minchan Kim 
Cc: Nitin Gupta 
Cc: Oleg Drokin 
Cc: Andreas Dilger 
Signed-off-by: Kent Overstreet 
[dpark: skip more mq-based drivers, resolve merge conflicts, etc.]
Signed-off-by: Dongsu Park 
Signed-off-by: Ming Lin 
---
 block/blk-core.c|  19 ++--
 block/blk-merge.c   | 151 ++--
 block/blk-mq.c  |   2 +
 drivers/block/drbd/drbd_req.c   |   2 +
 drivers/block/pktcdvd.c |   6 +-
 drivers/block/ps3vram.c |   2 +
 drivers/block/rsxx/dev.c|   2 +
 drivers/block/umem.c|   2 +
 drivers/block/zram/zram_drv.c   |   2 +
 drivers/md/dm.c |   2 +
 drivers/md/md.c |   2 +
 drivers/s390/block/dcssblk.c|   2 +
 drivers/s390/block/xpram.c  |   2 +
 drivers/staging/lustre/lustre/llite/lloop.c |   2 +
 include/linux/blkdev.h  |   3 +
 15 files changed, 179 insertions(+), 22 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index fd154b9..909f317 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -617,6 +617,10 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, 
int node_id)
if (q->id < 0)
goto fail_q;
 
+   q->bio_split = bioset_create(4, 0);
+   if (!q->bio_split)
+   goto fail_id;
+
q->backing_dev_info.ra_pages =
(VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
q->backing_dev_info.state = 0;
@@ -626,7 +630,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, 
int node_id)
 
err = bdi_init(&q->backing_dev_info);
if (err)
-   goto fail_id;
+   goto fail_split;
 
setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
laptop_mode_timer_fn, (unsigned long) q);
@@ -668,6 +672,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, 
int node_id)
 
 fail_bdi:
bdi_destroy(&q->backing_dev_info);
+fail_split:
+   bioset_free(q->bio_split);
 fail_id:
ida_simple_remove(&blk_queue_ida, q->id);
 fail_q:
@@ -1584,6 +1590,8 @@ void blk_queue_bio(struct request_queue *q, struct bio 
*bio)
struct request *req;
unsigned int request_count = 0;
 
+   blk_queue_split(q, &bio, q->bio_split);
+
/*
 * low level driver can indicate that it wants pages above a
 * certain limit bounced to low memory (ie for highmem, or even
@@ -1807,15 +1815,6 @@ generic_make_request_checks(struct bio *bio)
goto end_io;
}
 
-   if (likely(bio_is_rw(bio) &&
-  nr_sectors > queue_max_hw_sectors(q))) {
-   printk(KERN_ERR "bio too big device %s (%u > %u)\n",
-  bdevname(bio-

[PATCH 07/10] block: kill merge_bvec_fn() completely

2015-04-27 Thread Ming Lin

From: Kent Overstreet 

As generic_make_request() is now able to handle arbitrarily sized bios,
it's no longer necessary for each individual block driver to define its
own ->merge_bvec_fn() callback. Remove every invocation completely.

Cc: Jens Axboe 
Cc: Lars Ellenberg 
Cc: drbd-u...@lists.linbit.com
Cc: Jiri Kosina 
Cc: Yehuda Sadeh 
Cc: Sage Weil 
Cc: Alex Elder 
Cc: ceph-de...@vger.kernel.org
Cc: Alasdair Kergon 
Cc: Mike Snitzer 
Cc: dm-de...@redhat.com
Cc: Neil Brown 
Cc: linux-r...@vger.kernel.org
Cc: Christoph Hellwig 
Cc: "Martin K. Petersen" 
Signed-off-by: Kent Overstreet 
[dpark: also remove ->merge_bvec_fn() in dm-thin as well as
 dm-era-target, and resolve merge conflicts]
Signed-off-by: Dongsu Park 
Signed-off-by: Ming Lin 
---
 block/blk-merge.c  |  17 +-
 block/blk-settings.c   |  22 
 drivers/block/drbd/drbd_int.h  |   1 -
 drivers/block/drbd/drbd_main.c |   1 -
 drivers/block/drbd/drbd_req.c  |  35 
 drivers/block/pktcdvd.c|  21 ---
 drivers/block/rbd.c|  47 
 drivers/md/dm-cache-target.c   |  21 ---
 drivers/md/dm-crypt.c  |  16 --
 drivers/md/dm-era-target.c |  15 -
 drivers/md/dm-flakey.c |  16 --
 drivers/md/dm-linear.c |  16 --
 drivers/md/dm-log-writes.c |  16 --
 drivers/md/dm-snap.c   |  15 -
 drivers/md/dm-stripe.c |  21 ---
 drivers/md/dm-table.c  |   8 ---
 drivers/md/dm-thin.c   |  31 ---
 drivers/md/dm-verity.c |  16 --
 drivers/md/dm.c| 120 +---
 drivers/md/dm.h|   2 -
 drivers/md/linear.c|  43 ---
 drivers/md/md.c|  26 -
 drivers/md/md.h|  12 
 drivers/md/multipath.c |  21 ---
 drivers/md/raid0.c |  56 ---
 drivers/md/raid0.h |   2 -
 drivers/md/raid1.c |  58 +---
 drivers/md/raid10.c| 121 +
 drivers/md/raid5.c |  32 ---
 include/linux/blkdev.h |  10 
 include/linux/device-mapper.h  |   4 --
 31 files changed, 9 insertions(+), 833 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 322f495..9d565a0 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -69,24 +69,13 @@ static struct bio *blk_bio_segment_split(struct 
request_queue *q,
struct bio *split;
struct bio_vec bv = { 0 }, bvprv = { 0 };
struct bvec_iter iter;
-   unsigned seg_size = 0, nsegs = 0;
+   unsigned seg_size = 0, nsegs = 0, sectors = 0;
int prev = 0;
 
-   struct bvec_merge_data bvm = {
-   .bi_bdev= bio->bi_bdev,
-   .bi_sector  = bio->bi_iter.bi_sector,
-   .bi_size= 0,
-   .bi_rw  = bio->bi_rw,
-   };
-
bio_for_each_segment(bv, bio, iter) {
-   if (q->merge_bvec_fn &&
-   q->merge_bvec_fn(q, &bvm, &bv) < (int) bv.bv_len)
-   goto split;
-
-   bvm.bi_size += bv.bv_len;
+   sectors += bv.bv_len >> 9;
 
-   if (bvm.bi_size >> 9 > queue_max_sectors(q))
+   if (sectors > queue_max_sectors(q))
goto split;
 
if (prev && blk_queue_cluster(q)) {
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 12600bf..e90d477 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -53,28 +53,6 @@ void blk_queue_unprep_rq(struct request_queue *q, 
unprep_rq_fn *ufn)
 }
 EXPORT_SYMBOL(blk_queue_unprep_rq);
 
-/**
- * blk_queue_merge_bvec - set a merge_bvec function for queue
- * @q: queue
- * @mbfn:  merge_bvec_fn
- *
- * Usually queues have static limitations on the max sectors or segments that
- * we can put in a request. Stacking drivers may have some settings that
- * are dynamic, and thus we have to query the queue whether it is ok to
- * add a new bio_vec to a bio at a given offset or not. If the block device
- * has such limitations, it needs to register a merge_bvec_fn to control
- * the size of bio's sent to it. Note that a block device *must* allow a
- * single page to be added to an empty bio. The block device driver may want
- * to use the bio_split() function to deal with these bio's. By default
- * no merge_bvec_fn is defined for a queue, and only the fixed limits are
- * honored.
- */
-void blk_queue_merge_bvec(struct request_queue *q, merge_bvec_fn *mbfn)
-{
-   q->merge_bvec_fn = mbfn;
-}
-EXPORT_SYMBOL(blk_queue_merge_bvec);
-
 void blk_queue_softirq_done(struct request_queue *q, softirq_done_fn *fn)
 {
q->softirq_done_fn = fn;
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index b905e98..63ce2b0 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd

[RFC PATCH v2 00/10] simplify block layer based on immutable biovecs

2015-04-27 Thread Ming Lin

Dongsu sent v1 of this patchset.
https://lkml.org/lkml/2014/12/22/128

This is the second attempt at simplifying block layer based on immutable
biovecs. Immutable biovecs, implemented by Kent Overstreet, have been
available in mainline since v3.14. Its original goal was actually making
generic_make_request() accept arbitrarily sized bios, and pushing the
splitting down to the drivers or wherever it's required. See also
discussions in the past, [1] [2] [3].

This will bring not only performance improvements, but also a great amount
of reduction in code complexity all over the block layer. Performance gain
is possible due to the fact that bio_add_page() does not have to check
unnecessary conditions such as queue limits or if biovecs are mergeable.
Those will be delegated to the driver level. Kent already said that he
actually benchmarked the impact of this with fio on a micron p320h, which
definitely showed a positive impact.

Moreover, this patchset also allows a lot of code to be deleted, mainly
because of removal of merge_bvec_fn() callbacks. We have been aware that
it has always been a delicate issue for stacking block drivers (e.g. md
and bcache) to handle merging bio consistently. This simplification will
help every individual block driver avoid having such an issue.

Patches are against 4.1-rc1. These are also available in my git repo at:
 
  
https://git.kernel.org/cgit/linux/kernel/git/mlin/linux.git/log/?h=block-generic-req

This patchset is a prerequisite of other consecutive patchsets, e.g.
multipage biovecs, rewriting plugging, or rewriting direct-IO, which are
excluded this time. That means, this patchset should not bring any
regression to end-users.

I did some basic tests. I'll run more tests to get some performance numbers.

Comments are welcome.
Ming

[1] https://lkml.org/lkml/2014/11/23/263
[2] https://lkml.org/lkml/2013/11/25/732
[3] https://lkml.org/lkml/2014/2/26/618

Dongsu Park (1):
  Documentation: update notes in biovecs about arbitrarily sized bios

Kent Overstreet (9):
  block: make generic_make_request handle arbitrarily sized bios
  block: simplify bio_add_page()
  block: allow __blk_queue_bounce() to handle bios larger than BIO_MAX_PAGES
  bcache: clean up hacks around bio_split_pool
  btrfs: remove bio splitting and merge_bvec_fn() calls
  md/raid5: get rid of bio_fits_rdev()
  block: kill merge_bvec_fn() completely
  fs: use helper bio_add_page() instead of open coding on bi_io_vec
  md/raid10: make sync_request_write() call bio_copy_data()

 Documentation/block/biovecs.txt |  10 +-
 block/bio.c | 135 +++
 block/blk-core.c|  19 ++--
 block/blk-merge.c   | 140 ++--
 block/blk-mq.c  |   2 +
 block/blk-settings.c|  22 -
 block/bounce.c  |  60 ++--
 drivers/block/drbd/drbd_int.h   |   1 -
 drivers/block/drbd/drbd_main.c  |   1 -
 drivers/block/drbd/drbd_req.c   |  37 +---
 drivers/block/pktcdvd.c |  27 +-
 drivers/block/ps3vram.c |   2 +
 drivers/block/rbd.c |  47 --
 drivers/block/rsxx/dev.c|   2 +
 drivers/block/umem.c|   2 +
 drivers/block/zram/zram_drv.c   |   2 +
 drivers/md/bcache/bcache.h  |  18 
 drivers/md/bcache/io.c  | 100 +---
 drivers/md/bcache/journal.c |   4 +-
 drivers/md/bcache/request.c |  16 ++--
 drivers/md/bcache/super.c   |  32 +--
 drivers/md/bcache/util.h|   5 +-
 drivers/md/bcache/writeback.c   |   4 +-
 drivers/md/dm-cache-target.c|  21 -
 drivers/md/dm-crypt.c   |  16 
 drivers/md/dm-era-target.c  |  15 ---
 drivers/md/dm-flakey.c  |  16 
 drivers/md/dm-linear.c  |  16 
 drivers/md/dm-log-writes.c  |  16 
 drivers/md/dm-snap.c|  15 ---
 drivers/md/dm-stripe.c  |  21 -
 drivers/md/dm-table.c   |   8 --
 drivers/md/dm-thin.c|  31 --
 drivers/md/dm-verity.c  |  16 
 drivers/md/dm.c | 122 +---
 drivers/md/dm.h |   2 -
 drivers/md/linear.c |  43 -
 drivers/md/md.c |  28 +-
 drivers/md/md.h |  12 ---
 drivers/md/multipath.c  |  21 -
 drivers/md/raid0.c  |  56 ---
 drivers/md/raid0.h  |   2 -
 drivers/

[PATCH 05/10] btrfs: remove bio splitting and merge_bvec_fn() calls

2015-04-27 Thread Ming Lin

From: Kent Overstreet 

Btrfs has been doing bio splitting from btrfs_map_bio(), by checking
device limits as well as calling ->merge_bvec_fn() etc. That is not
necessary any more, because generic_make_request() is now able to
handle arbitrarily sized bios. So clean up unnecessary code paths.

Cc: Chris Mason 
Cc: Josef Bacik 
Cc: linux-bt...@vger.kernel.org
Signed-off-by: Kent Overstreet 
Signed-off-by: Chris Mason 
[dpark: add more description in commit message]
Signed-off-by: Dongsu Park 
Signed-off-by: Ming Lin 
---
 fs/btrfs/volumes.c | 72 --
 1 file changed, 72 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8bcd2a0..de84f18 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5708,34 +5708,6 @@ static noinline void btrfs_schedule_bio(struct 
btrfs_root *root,
 &device->work);
 }
 
-static int bio_size_ok(struct block_device *bdev, struct bio *bio,
-  sector_t sector)
-{
-   struct bio_vec *prev;
-   struct request_queue *q = bdev_get_queue(bdev);
-   unsigned int max_sectors = queue_max_sectors(q);
-   struct bvec_merge_data bvm = {
-   .bi_bdev = bdev,
-   .bi_sector = sector,
-   .bi_rw = bio->bi_rw,
-   };
-
-   if (WARN_ON(bio->bi_vcnt == 0))
-   return 1;
-
-   prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
-   if (bio_sectors(bio) > max_sectors)
-   return 0;
-
-   if (!q->merge_bvec_fn)
-   return 1;
-
-   bvm.bi_size = bio->bi_iter.bi_size - prev->bv_len;
-   if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len)
-   return 0;
-   return 1;
-}
-
 static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
  struct bio *bio, u64 physical, int dev_nr,
  int rw, int async)
@@ -5769,38 +5741,6 @@ static void submit_stripe_bio(struct btrfs_root *root, 
struct btrfs_bio *bbio,
btrfsic_submit_bio(rw, bio);
 }
 
-static int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
- struct bio *first_bio, struct btrfs_device *dev,
- int dev_nr, int rw, int async)
-{
-   struct bio_vec *bvec = first_bio->bi_io_vec;
-   struct bio *bio;
-   int nr_vecs = bio_get_nr_vecs(dev->bdev);
-   u64 physical = bbio->stripes[dev_nr].physical;
-
-again:
-   bio = btrfs_bio_alloc(dev->bdev, physical >> 9, nr_vecs, GFP_NOFS);
-   if (!bio)
-   return -ENOMEM;
-
-   while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) {
-   if (bio_add_page(bio, bvec->bv_page, bvec->bv_len,
-bvec->bv_offset) < bvec->bv_len) {
-   u64 len = bio->bi_iter.bi_size;
-
-   atomic_inc(&bbio->stripes_pending);
-   submit_stripe_bio(root, bbio, bio, physical, dev_nr,
- rw, async);
-   physical += len;
-   goto again;
-   }
-   bvec++;
-   }
-
-   submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async);
-   return 0;
-}
-
 static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
 {
atomic_inc(&bbio->error);
@@ -5875,18 +5815,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, 
struct bio *bio,
continue;
}
 
-   /*
-* Check and see if we're ok with this bio based on it's size
-* and offset with the given device.
-*/
-   if (!bio_size_ok(dev->bdev, first_bio,
-bbio->stripes[dev_nr].physical >> 9)) {
-   ret = breakup_stripe_bio(root, bbio, first_bio, dev,
-dev_nr, rw, async_submit);
-   BUG_ON(ret);
-   continue;
-   }
-
if (dev_nr < total_devs - 1) {
bio = btrfs_bio_clone(first_bio, GFP_NOFS);
BUG_ON(!bio); /* -ENOMEM */
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 10/10] Documentation: update notes in biovecs about arbitrarily sized bios

2015-04-27 Thread Ming Lin

From: Dongsu Park 

Update block/biovecs.txt so that it includes a note on what kind of
effects arbitrarily sized bios would bring to the block layer.
Also fix a trivial typo, bio_iter_iovec.

Cc: Christoph Hellwig 
Cc: Kent Overstreet 
Cc: Jonathan Corbet 
Cc: linux-...@vger.kernel.org
Signed-off-by: Dongsu Park 
Signed-off-by: Ming Lin 
---
 Documentation/block/biovecs.txt | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/Documentation/block/biovecs.txt b/Documentation/block/biovecs.txt
index 74a32ad..2568958 100644
--- a/Documentation/block/biovecs.txt
+++ b/Documentation/block/biovecs.txt
@@ -24,7 +24,7 @@ particular, presenting the illusion of partially completed 
biovecs so that
 normal code doesn't have to deal with bi_bvec_done.
 
  * Driver code should no longer refer to biovecs directly; we now have
-   bio_iovec() and bio_iovec_iter() macros that return literal struct biovecs,
+   bio_iovec() and bio_iter_iovec() macros that return literal struct biovecs,
constructed from the raw biovecs but taking into account bi_bvec_done and
bi_size.
 
@@ -109,3 +109,11 @@ Other implications:
over all the biovecs in the new bio - which is silly as it's not needed.
 
So, don't use bi_vcnt anymore.
+
+ * The current interface allows the block layer to split bios as needed, so we
+   could eliminate a lot of complexity particularly in stacked drivers. Code
+   that creates bios can then create whatever size bios are convenient, and
+   more importantly stacked drivers don't have to deal with both their own bio
+   size limitations and the limitations of the underlying devices. Thus
+   there's no need to define ->merge_bvec_fn() callbacks for individual block
+   drivers.
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v2 2/2] cpufreq: powernv: Register for OCC related opal_message notification

2015-04-27 Thread Viresh Kumar

On 28 April 2015 at 11:53, Shilpasri G Bhat
 wrote:

> Changes from v1:
> - Add macros to define OCC_RESET, OCC_LOAD and OCC_THROTTLE
> - Define a structure to store chip id, chip mask which has bits set
>   for cpus present in the chip, throttled state and a work_struct.
> - Modify powernv_cpufreq_throttle_check() to be called via smp_call()

Why ? I might have missed it but there should be some reasoning behind
what you are changing.

> - On Pmax throttling/unthrottling update 'chip.throttled' and not the
>   global 'throttled' as Pmax capping is local to the chip.
> - Remove the condition which checks if local pstate is less than Pmin
>   while checking for Psafe frequency. When OCC becomes active after
>   reset we update 'throttled' to false and when the cpufreq governor
>   initiates a pstate change, the local pstate will be in Psafe and we
>   will be reporting a false positive when we are not throttled.
> - Schedule a kworker on receiving throttling/unthrottling OCC message
>   for that chip and schedule on all chips after receiving active.
> - After an OCC reset all the cpus will be in Psafe frequency. So call
>   target() and restore the frequency to policy->cur after OCC_ACTIVE
>   and Pmax unthrottling
> - Taken care of Viresh and Preeti's comments.

That's a lot. I am not an expert here and so really can't comment on
the internals of ppc. But is this patch solving a single problem? I don't
know, I somehow got the impression that it can be split into multiple
(smaller & review-able) patches. Only if it makes sense. Your call.

> diff --git a/drivers/cpufreq/powernv-cpufreq.c 
> b/drivers/cpufreq/powernv-cpufreq.c

> +void powernv_cpufreq_work_fn(struct work_struct *work)
> +{
> +   struct chip *c = container_of(work, struct chip, throttle);
> +   unsigned int cpu;
> +
> +   smp_call_function_any(&c->mask,
> + powernv_cpufreq_throttle_check, NULL, 0);
> +
> +   for_each_cpu(cpu, &c->mask) {

for_each_online_cpu ?

> +   int index;
> +   struct cpufreq_frequency_table *freq_table;
> +   struct cpufreq_policy cpu_policy;

Name it policy.

> +
> +   if (!cpu_online(cpu))
> +   continue;

And you can kill this..

> +   cpufreq_get_policy(&cpu_policy, cpu);
> +   freq_table = cpufreq_frequency_get_table(cpu_policy.cpu);

Just do, policy->freq_table.


> +static int powernv_cpufreq_occ_msg(struct notifier_block *nb,
> +  unsigned long msg_type, void *msg)
> +{

> +   if (reason && reason <= 5)
> +   pr_info("OCC: Chip %d Pmax reduced due to %s\n",
> +   (int)chip_id, throttle_reason[reason]);
> +   else
> +   pr_info("OCC: Chip %d %s\n", (int)chip_id,
> +   throttle_reason[reason]);

Blank line here. They are better for readability after blocks and loops.

> +   for (i = 0; i < nr_chips; i++)
> +   if (chips[i].id == (int)chip_id)

Why isn't .id 64 bit ?

> +   schedule_work(&chips[i].throttle);
> +   }
> +   return 0;
> +}
> +
> +static struct notifier_block powernv_cpufreq_opal_nb = {
> +   .notifier_call  = powernv_cpufreq_occ_msg,
> +   .next   = NULL,
> +   .priority   = 0,
> +};
> +
>  static void powernv_cpufreq_stop_cpu(struct cpufreq_policy *policy)
>  {
> struct powernv_smp_call_data freq_data;
> @@ -414,6 +530,35 @@ static struct cpufreq_driver powernv_cpufreq_driver = {
> .attr   = powernv_cpu_freq_attr,
>  };
>
> +static int init_chip_info(void)
> +{
> +   int chip[256], i = 0, cpu;
> +   int prev_chip_id = INT_MAX;
> +
> +   for_each_possible_cpu(cpu) {
> +   int c = cpu_to_chip_id(cpu);

Does 'c' refer to id here ? Name it so then.

> +
> +   if (prev_chip_id != c) {
> +   prev_chip_id = c;
> +   chip[nr_chips++] = c;
> +   }
> +   }
> +
> +   chips = kmalloc_array(nr_chips, sizeof(struct chip), GFP_KERNEL);
> +

A blank line isn't preferred much here :). Sorry about these blank lines.

> +   if (!chips)
> +   return -ENOMEM;
> +
> +   for (i = 0; i < nr_chips; i++) {
> +   chips[i].id = chip[i];
> +   cpumask_copy(&chips[i].mask, cpumask_of_node(chip[i]));
> +   chips[i].throttled = false;
> +   INIT_WORK(&chips[i].throttle, powernv_cpufreq_work_fn);
> +   }
> +
> +   return 0;
> +}
> +
>  static int __init powernv_cpufreq_init(void)
>  {
> int rc = 0;
> @@ -429,7 +574,13 @@ static int __init powernv_cpufreq_init(void)
> return rc;
> }
>
> +   /* Populate chip info */
> +   rc = init_chip_info();
> +   if (rc)
> +   return rc;
> +
> register_reboot_notifier(&

[PATCH net-next,v3,1/1] hv_netvsc: introduce netif-msg into netvsc module

2015-04-27 Thread sixiao

From: Simon Xiao 

1. Introduce netif-msg to netvsc to control debug logging output
and keep msg_enable in netvsc_device_context so that it is
kept persistently.
2. Only call dump_rndis_message() when NETIF_MSG_RX_ERR or above
is specified in netvsc module debug param.
In non-debug mode, in the current code, dump_rndis_message() will not
dump anything, but it still initializes some local variables and
processes the switch logic, which is unnecessary, especially in
high network throughput situations.

Signed-off-by: Simon Xiao 
Reviewed-by: K. Y. Srinivasan 
Reviewed-by: Haiyang Zhang 
---
 drivers/net/hyperv/hyperv_net.h   | 12 
 drivers/net/hyperv/netvsc.c   |  3 +++
 drivers/net/hyperv/netvsc_drv.c   | 20 ++--
 drivers/net/hyperv/rndis_filter.c |  3 ++-
 4 files changed, 31 insertions(+), 7 deletions(-)

diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index a10b316..e55c8f4 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -612,6 +612,15 @@ struct multi_send_data {
u32 count; /* counter of batched packets */
 };
 
+/* The context of the netvsc device  */
+struct net_device_context {
+   /* point back to our device context */
+   struct hv_device *device_ctx;
+   struct delayed_work dwork;
+   struct work_struct work;
+   u32 msg_enable; /* debug level */
+};
+
 /* Per netvsc device */
 struct netvsc_device {
struct hv_device *dev;
@@ -667,6 +676,9 @@ struct netvsc_device {
struct multi_send_data msd[NR_CPUS];
u32 max_pkt; /* max number of pkt in one send, e.g. 8 */
u32 pkt_align; /* alignment bytes, e.g. 8 */
+
+   /* The net device context */
+   struct net_device_context *nd_ctx;
 };
 
 /* NdisInitialize message */
diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 2e8ad06..c651d4d 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -1197,6 +1197,9 @@ int netvsc_device_add(struct hv_device *device, void 
*additional_info)
 */
ndev = net_device->ndev;
 
+   /* Add netvsc_device context to netvsc_device */
+   net_device->nd_ctx = netdev_priv(ndev);
+
/* Initialize the NetVSC channel extension */
init_completion(&net_device->channel_init_wait);
 
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index a3a9d38..66c4b0c 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -40,18 +40,21 @@
 
 #include "hyperv_net.h"
 
-struct net_device_context {
-   /* point back to our device context */
-   struct hv_device *device_ctx;
-   struct delayed_work dwork;
-   struct work_struct work;
-};
 
 #define RING_SIZE_MIN 64
 static int ring_size = 128;
 module_param(ring_size, int, S_IRUGO);
 MODULE_PARM_DESC(ring_size, "Ring buffer size (# of pages)");
 
+static const u32 default_msg = NETIF_MSG_DRV | NETIF_MSG_PROBE |
+   NETIF_MSG_LINK | NETIF_MSG_IFUP |
+   NETIF_MSG_IFDOWN | NETIF_MSG_RX_ERR |
+   NETIF_MSG_TX_ERR;
+
+static int debug = -1;
+module_param(debug, int, S_IRUGO);
+MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)");
+
 static void do_set_multicast(struct work_struct *w)
 {
struct net_device_context *ndevctx =
@@ -888,6 +891,11 @@ static int netvsc_probe(struct hv_device *dev,
 
net_device_ctx = netdev_priv(net);
net_device_ctx->device_ctx = dev;
+   net_device_ctx->msg_enable = netif_msg_init(debug, default_msg);
+   if (netif_msg_probe(net_device_ctx))
+   netdev_dbg(net, "netvsc msg_enable: %d\n",
+  net_device_ctx->msg_enable);
+
hv_set_drvdata(dev, net);
INIT_DELAYED_WORK(&net_device_ctx->dwork, netvsc_link_change);
INIT_WORK(&net_device_ctx->work, do_set_multicast);
diff --git a/drivers/net/hyperv/rndis_filter.c 
b/drivers/net/hyperv/rndis_filter.c
index 0d92efe..9118cea 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -429,7 +429,8 @@ int rndis_filter_receive(struct hv_device *dev,
 
rndis_msg = pkt->data;
 
-   dump_rndis_message(dev, rndis_msg);
+   if (netif_msg_rx_err(net_dev->nd_ctx))
+   dump_rndis_message(dev, rndis_msg);
 
switch (rndis_msg->ndis_msg_type) {
case RNDIS_MSG_PACKET:
-- 
1.8.5.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: Build regressions/improvements in v4.1-rc1

2015-04-27 Thread Rusty Russell

Geert Uytterhoeven  writes:
> On Mon, Apr 27, 2015 at 11:51 AM, Geert Uytterhoeven
>  wrote:
>> Below is the list of build error/warning regressions/improvements in
>> v4.1-rc1[1] compared to v4.0[2].
>>
>> Summarized:
>>   - build errors: +34/-11
>>   - build warnings: +135/-163
>>
>> As I haven't mastered kup yet, there's no verbose summary at
>> http://www.kernel.org/pub/linux/kernel/people/geert/linux-log/v4.1-rc1.summary.gz
>>
>> Happy fixing! ;-)
>>
>> Thanks to the linux-next team for providing the build service.
>>
>> [1] http://kisskb.ellerman.id.au/kisskb/head/8779/ (254 out of 257 configs)
>> [2] http://kisskb.ellerman.id.au/kisskb/head/8710/ (254 out of 257 configs)
>>
>>
>> *** ERRORS ***
>>
>> 34 regressions:
>
> The quiet days are over...
>
>>   + /home/kisskb/slave/src/arch/mips/cavium-octeon/smp.c: error: passing 
>> argument 2 of 'cpumask_clear_cpu' discards 'volatile' qualifier from pointer 
>> target type [-Werror]:  => 242:2
>>   + /home/kisskb/slave/src/arch/mips/kernel/process.c: error: passing 
>> argument 2 of 'cpumask_test_cpu' discards 'volatile' qualifier from pointer 
>> target type [-Werror]:  => 52:2
>>   + /home/kisskb/slave/src/arch/mips/kernel/smp.c: error: passing argument 2 
>> of 'cpumask_set_cpu' discards 'volatile' qualifier from pointer target type 
>> [-Werror]:  => 149:2, 211:2
>>   + /home/kisskb/slave/src/arch/mips/kernel/smp.c: error: passing argument 2 
>> of 'cpumask_test_cpu' discards 'volatile' qualifier from pointer target type 
>> [-Werror]:  => 221:2
>
> mips/bigsur_defconfig
> mips/malta_defconfig
> mips/cavium_octeon_defconfig
> mips/ip27_defconfig

Already fixed in other thread...

> and related warnings due to lack of -Werror on
> ia64-defconfig

That fix is fairly obvious, I'll post separately.

> tilegx_defconfig

Can't see that one with a simple grep: can you post warning?

> m32r/m32700ut.smp_defconfig

Will post fix for this too.

> cpumask also gives fishy warnings:
>
> lib/cpumask.c:167:25: warning: the address of 'cpu_all_bits' will
> always evaluate as 'true' [-Waddress]
>
> on sparc (e.g. sparc64/sparc64-allmodconfig) and powerpc (e.g.
> powerpc/ppc64_defconfig), which seem to have been reported 6 months
> ago...

Hmm, this is cpumask_of_node?  That's... Oh my, that requires
a separate post.

> Can we throw some bitcoins at the cpumasks? ;-)

I think I should be throwing bitcoins at you, instead!

Thanks,
Rusty.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v2 2/2] cpufreq: powernv: Register for OCC related opal_message notification

2015-04-27 Thread Shilpasri G Bhat

OCC is an On-Chip-Controller which takes care of power and thermal
safety of the chip. During runtime due to power failure or
overtemperature the OCC may throttle the frequencies of the CPUs to
remain within the power budget.

We want the cpufreq driver to be aware of such situations to be able
to report it to the user. We register to opal_message_notifier to
receive OCC messages from opal.

powernv_cpufreq_throttle_check() reports any frequency throttling and
this patch will report the reason or event that caused throttling. We
can be throttled if OCC is reset or OCC limits Pmax due to power or
thermal reasons. We are also notified of unthrottling after an OCC
reset or if OCC restores Pmax on the chip.

Signed-off-by: Shilpasri G Bhat 
CC: "Rafael J. Wysocki" 
CC: Viresh Kumar 
CC: Preeti U Murthy 
CC: linux...@vger.kernel.org
---
Changes from v1:
- Add macros to define OCC_RESET, OCC_LOAD and OCC_THROTTLE
- Define a structure to store chip id, chip mask which has bits set
  for cpus present in the chip, throttled state and a work_struct.
- Modify powernv_cpufreq_throttle_check() to be called via smp_call()
- On Pmax throttling/unthrottling update 'chip.throttled' and not the
  global 'throttled' as Pmax capping is local to the chip.
- Remove the condition which checks if local pstate is less than Pmin
  while checking for Psafe frequency. When OCC becomes active after
  reset we update 'throttled' to false and when the cpufreq governor
  initiates a pstate change, the local pstate will be in Psafe and we
  will be reporting a false positive when we are not throttled.
- Schedule a kworker on receiving throttling/unthrottling OCC message
  for that chip and schedule on all chips after receiving active.
- After an OCC reset all the cpus will be in Psafe frequency. So call
  target() and restore the frequency to policy->cur after OCC_ACTIVE
  and Pmax unthrottling
- Taken care of Viresh and Preeti's comments.

 drivers/cpufreq/powernv-cpufreq.c | 181 ++
 1 file changed, 166 insertions(+), 15 deletions(-)

diff --git a/drivers/cpufreq/powernv-cpufreq.c 
b/drivers/cpufreq/powernv-cpufreq.c
index ebef0d8..b356c9d 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -27,20 +27,33 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
 #include 
 #include  /* Required for cpu_sibling_mask() in UP configs */
+#include 
 
 #define POWERNV_MAX_PSTATES256
 #define PMSR_PSAFE_ENABLE  (1UL << 30)
 #define PMSR_SPR_EM_DISABLE(1UL << 31)
 #define PMSR_MAX(x)((x >> 32) & 0xFF)
-#define PMSR_LP(x) ((x >> 48) & 0xFF)
+#define OCC_RESET  0
+#define OCC_LOAD   1
+#define OCC_THROTTLE   2
 
 static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1];
-static bool rebooting, throttled;
+static bool rebooting, throttled, occ_reset;
+
+static struct chip {
+   int id;
+   bool throttled;
+   cpumask_t mask;
+   struct work_struct throttle;
+} *chips;
+
+static int nr_chips;
 
 /*
  * Note: The set of pstates consists of contiguous integers, the
@@ -298,28 +311,33 @@ static inline unsigned int get_nominal_index(void)
return powernv_pstate_info.max - powernv_pstate_info.nominal;
 }
 
-static void powernv_cpufreq_throttle_check(unsigned int cpu)
+static void powernv_cpufreq_throttle_check(void *data)
 {
+   unsigned int cpu = smp_processor_id();
unsigned long pmsr;
-   int pmsr_pmax, pmsr_lp;
+   int pmsr_pmax, i;
 
pmsr = get_pmspr(SPRN_PMSR);
 
+   for (i = 0; i < nr_chips; i++)
+   if (chips[i].id == cpu_to_chip_id(cpu))
+   break;
+
/* Check for Pmax Capping */
pmsr_pmax = (s8)PMSR_MAX(pmsr);
if (pmsr_pmax != powernv_pstate_info.max) {
-   throttled = true;
-   pr_info("CPU %d Pmax is reduced to %d\n", cpu, pmsr_pmax);
-   pr_info("Max allowed Pstate is capped\n");
+   if (chips[i].throttled)
+   goto next;
+   chips[i].throttled = true;
+   pr_info("CPU %d on chip %d Pmax is reduced to %d\n", cpu,
+   chips[i].id, pmsr_pmax);
+   } else {
+   chips[i].throttled = false;
}
 
-   /*
-* Check for Psafe by reading LocalPstate
-* or check if Psafe_mode_active is set in PMSR.
-*/
-   pmsr_lp = (s8)PMSR_LP(pmsr);
-   if ((pmsr_lp < powernv_pstate_info.min) ||
-   (pmsr & PMSR_PSAFE_ENABLE)) {
+   /* Check if Psafe_mode_active is set in PMSR. */
+next:
+   if (pmsr & PMSR_PSAFE_ENABLE) {
throttled = true;
pr_info("Pstate set to safe frequency\n");
}
@@ -350,7 +368,7 @@ static int powernv_cpufreq_target_index(struct 
cpufreq_policy *policy,
return 0;
 
if (!throttled)
-   powernv

[PATCH v2 1/2] powerpc/powernv: Add definition of OPAL_MSG_OCC message type

2015-04-27 Thread Shilpasri G Bhat

Add OPAL_MSG_OCC message definition to opal_message_type to receive
OCC events like reset, load and throttled. Host performance can be
affected when OCC is reset or OCC throttles the max Pstate.
We can register to opal_message_notifier to receive OPAL_MSG_OCC type
of message and report it to the userspace so as to keep the user
informed about the reason for a performance drop in workloads.

The reset and load OCC events are notified to kernel when FSP sends
OCC_RESET and OCC_LOAD commands.  Both reset and load messages are
sent to kernel on successful completion of reset and load operation
respectively.

The throttle OCC event indicates that the Pmax of the chip is reduced.
The chip_id and throttle reason for reducing Pmax is also queued along
with the message.

Additional opal message type OPAL_MSG_PRD is added to maintain
compatibility between opal and kernel definition of opal_message_type.

Signed-off-by: Shilpasri G Bhat 
Reviewed-by: Preeti U Murthy 
---
Changes from v1:
- Update the commit changelog

 arch/powerpc/include/asm/opal-api.h | 8 
 1 file changed, 8 insertions(+)

diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index 0321a90..50053b7 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -352,6 +352,14 @@ enum opal_msg_type {
OPAL_MSG_SHUTDOWN,  /* params[0] = 1 reboot, 0 shutdown */
OPAL_MSG_HMI_EVT,
OPAL_MSG_DPO,
+   OPAL_MSG_PRD,
+   OPAL_MSG_OCC,   /*
+* params[0] = 0 reset,
+* 1 load,
+* 2 throttle
+* params[1] = chip_id
+* params[2] = throttle_status
+*/
OPAL_MSG_TYPE_MAX,
 };
 
-- 
1.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] perf/x86/intel/uncore: fix IMC missing box initialization

2015-04-27 Thread Stephane Eranian

On Sun, Apr 26, 2015 at 8:43 PM, Liang, Kan  wrote:
>
>>
>> > This leads me to believe that this patch:
>> >
>> > commit c05199e5a57a579fea1e8fa65e2b511ceb524ffc
>> > Author: Kan Liang 
>> > Date:   Tue Jan 20 04:54:25 2015 +
>> >
>> > perf/x86/intel/uncore: Move uncore_box_init() out of driver
>> initialization
>> >
>> > If I revert it, I bet things will work again.
>>
>> Yes the initialization needs to be moved out of the IPI context.
>>
>
> Maybe we can move them to event init, which is not in IPI context.
>
> What do you think of this patch?
>
> ---
>
> From 8a61c48144921e9d1c841656829c3bae9bfb4408 Mon Sep 17 00:00:00 2001
> From: Kan Liang 
> Date: Sun, 26 Apr 2015 16:24:59 -0400
> Subject: [PATCH 1/1] perf/x86/intel/uncore: move uncore_box_init to uncore
>  event init
>
> commit c05199e5a57a("perf/x86/intel/uncore: Move uncore_box_init() out
> of driver initialization") moves uncore_box_init into uncore_enable_box
> to prevent potential boot failures. However, uncore_enable_box is not
> called on some client platforms (SNB/IVB/HSW) for counting IMC event.
> When it is not called, the box is not initialized, which hard locks the
> system.
>
> Additionally, uncore_enable_box along with the initialization code in it
> is always called in uncore event start functions, which are in IPI
> context. But the initialization code should not be in IPI context. This
> is because, for example, the IMC box initialization codes for client
> platforms include ioremap, which is not allowed to be called in IPI
> context.
>
> This patch moves uncore_box_init out of IPI context, to uncore event
> init. The box is initialized only when it has not yet been initialized.
>
> Signed-off-by: Kan Liang 

Ok this works for me now. Thanks.

Tested-by: Stephane Eranian 

> ---
>  arch/x86/kernel/cpu/perf_event_intel_uncore.c | 4 
>  arch/x86/kernel/cpu/perf_event_intel_uncore.h | 2 --
>  arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c | 3 +++
>  3 files changed, 7 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c 
> b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
> index c635b8b..cbc1a93 100644
> --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
> @@ -623,6 +623,10 @@ static int uncore_pmu_event_init(struct perf_event 
> *event)
> box = uncore_pmu_to_box(pmu, event->cpu);
> if (!box || box->cpu < 0)
> return -EINVAL;
> +
> +   /* Init box if it's not initialized yet */
> +   uncore_box_init(box);
> +
> event->cpu = box->cpu;
>
> event->hw.idx = -1;
> diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h 
> b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
> index 6c8c1e7..1fb2905 100644
> --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
> +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
> @@ -273,8 +273,6 @@ static inline void uncore_disable_box(struct 
> intel_uncore_box *box)
>
>  static inline void uncore_enable_box(struct intel_uncore_box *box)
>  {
> -   uncore_box_init(box);
> -
> if (box->pmu->type->ops->enable_box)
> box->pmu->type->ops->enable_box(box);
>  }
> diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c 
> b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
> index 4562e9e..ead70a6 100644
> --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
> @@ -279,6 +279,9 @@ static int snb_uncore_imc_event_init(struct perf_event 
> *event)
> if (!box || box->cpu < 0)
> return -EINVAL;
>
> +   /* Init box if it's not initialized yet */
> +   uncore_box_init(box);
> +
> event->cpu = box->cpu;
>
> event->hw.idx = -1;
>
> Thanks,
> Kan
>
>> -Andi
>>
>>
>> --
>> a...@linux.intel.com -- Speaking for myself only
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v2 0/2] powernv: cpufreq: Report frequency throttle by OCC

2015-04-27 Thread Shilpasri G Bhat

This patchset intends to add frequency throttle reporting mechanism
to powernv-cpufreq driver when OCC throttles the frequency. OCC is an
On-Chip-Controller which takes care of the power and thermal safety of
the chip. The CPU frequency can be throttled during an OCC reset or
when OCC tries to limit the max allowed frequency. The patchset will
report such conditions so as to keep the user informed about reason
for the drop in performance of workloads when frequency is throttled.

Shilpasri G Bhat (2):
  powerpc/powernv: Add definition of OPAL_MSG_OCC message type
  cpufreq: powernv: Register for OCC related opal_message notification

 arch/powerpc/include/asm/opal-api.h |   8 ++
 drivers/cpufreq/powernv-cpufreq.c   | 181 +---
 2 files changed, 174 insertions(+), 15 deletions(-)

-- 
1.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

RE: [PATCH v6 01/26] IB/Verbs: Implement new callback query_transport()

2015-04-27 Thread Hefty, Sean

> > Keep in mind that this enum was Liran's response to Michael's original
> > patch.  In the enum in Michael's patch, there was both USNIC and
> > USNIC_UDP.
> 
> Right! That's why I'm confused. Seems wrong to drop it, right?

I think the original USNIC protocol is layered directly over Ethernet.  The 
protocol basically stole an Ethertype (the one used for IBoE/RoCE) and 
implemented a proprietary protocol instead.  I have no idea how you resolve 
that, but I also don't think it's used anymore.  USNIC_UDP is just UDP.

> Well, if RoCEv2 uses the same protocol enum, that may introduce new
> confusion, for example there will be some new CM handling for UDP encap,
> source port selection, and of course vlan/tag assignment, etc. But if
> there is support under way, and everyone is clear, then, ok.

RoCEv2/IBoUDP shares the same port space as UDP.  It has similar issues to 
iWarp in sharing state with the main network stack.  I'm not aware of any 
proposal for resolving that.  Does it require using a separate IP address?  
Does it use a port mapper function?  Does netdev care for UDP?  I'm not sure 
what USNIC does for this either, but a common solution between USNIC and IBoUDP 
seems reasonable.


N�r��yb�X��ǧv�^�)޺{.n�+{zX����ܨ}���Ơz�&j:+v���zZ+��+zf���h���~i���z��w���?�&�)ߢf��^jǫy�m��@A�a���
0��h���i

Re: [PATCH 0/6] ARM: berlin: relicense the device trees under GPLv2/X11

2015-04-27 Thread Jisheng Zhang

Dear Antoine,

On Sun, 26 Apr 2015 07:03:27 -0700
Antoine Tenart  wrote:

> Hi,
> 
> The GPLv2 license makes it impractical for other software components
> licensed under another license to use our device trees. To fix this,
> and make our device tree usable by other software components, relicense
> them under a GPLv2/X11 dual-license.
> 
> In order to get this accepted, we *need* all contributors to the Berlin
> device tree files to ack-by the patches applying on a file they
> contributed on. A list of needed ack-by by device tree is shown below:

For all of them:

Acked-by: Jisheng Zhang 

> 
>  * berlin2q.dtsi:
> Antoine Tenart 
> Jisheng Zhang 
>   Alexandre Belloni 
> Sebastian Hesselbarth 
> 
>  * berlin2q-marvell-dmp.dtsi:
> Antoine Tenart 
> Jisheng Zhang 
> 
>  * berlin2.dtsi:
> Sebastian Hesselbarth 
> Antoine Tenart 
> Jisheng Zhang 
> 
>  * berlin2-sony-nsz-gs7.dts:
> Sebastian Hesselbarth 
> 
>  * berlin2cd.dtsi:
> Sebastian Hesselbarth 
> Antoine Tenart 
> Jisheng Zhang 
> 
>  * berlin2cd-google-chromecast.dts:
> Sebastian Hesselbarth 
> 
> Thanks!
> 
> Antoine
> 
> Antoine Tenart (6):
>   ARM: dts: berlin: relicense the berlin2q dtsi under GPLv2/X11
>   ARM: dts: berlin: relicense the BG2Q Marvell DMP dts under GPLv2/X11
>   ARM: dts: berlin: relicense the berlin2 dtsi under GPLv2/X11
>   ARM: dts: berlin: relicense the BG2 Sony NSZ-GS7 dts under GPLv2/X11
>   ARM: dts: berlin: relicense the berlin2cd dtsi under GPLv2/X11
>   ARM: dts: berlin: relicense the BG2CD Google Chromecast dts under
> GPLv2/X11
> 
>  arch/arm/boot/dts/berlin2-sony-nsz-gs7.dts| 34 
> +--
>  arch/arm/boot/dts/berlin2.dtsi| 34 
> +--
>  arch/arm/boot/dts/berlin2cd-google-chromecast.dts | 34 
> +--
>  arch/arm/boot/dts/berlin2cd.dtsi  | 34 
> +--
>  arch/arm/boot/dts/berlin2q-marvell-dmp.dts| 34 
> +--
>  arch/arm/boot/dts/berlin2q.dtsi   | 34 
> +--
>  6 files changed, 186 insertions(+), 18 deletions(-)
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 2/2] cpufreq: powernv: Register for OCC related opal_message notification

2015-04-27 Thread Shilpasri G Bhat

Hi Preeti,

On 04/23/2015 05:28 PM, Preeti U Murthy wrote:
> Hi Shilpa,
> 
> On 04/22/2015 10:34 PM, Shilpasri G Bhat wrote:
>> OCC is an On-Chip-Controller which takes care of power and thermal
>> safety of the chip. During runtime due to power failure or
>> overtemperature the OCC may throttle the frequencies of the CPUs to
>> remain within the power budget.
>>
>> We want the cpufreq driver to be aware of such situations to be able
>> to report it to the user. We register to opal_message_notifier to
>> receive OCC messages from opal.
>>
>> powernv_cpufreq_throttle_check() reports any frequency throttling and
>> this patch will report the reason or event that caused throttling. We
>> can be throttled if OCC is reset or OCC limits Pmax due to power or
>> thermal reasons. We are also notified of unthrottling after an OCC
>> reset or if OCC restores Pmax on the chip.
>>
>> Signed-off-by: Shilpasri G Bhat 
>> CC: "Rafael J. Wysocki" 
>> CC: Viresh Kumar 
>> CC: linux...@vger.kernel.org
>> ---
>>  drivers/cpufreq/powernv-cpufreq.c | 70 
>> ++-
>>  1 file changed, 69 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/cpufreq/powernv-cpufreq.c 
>> b/drivers/cpufreq/powernv-cpufreq.c
>> index ebef0d8..5718765 100644
>> --- a/drivers/cpufreq/powernv-cpufreq.c
>> +++ b/drivers/cpufreq/powernv-cpufreq.c
>> @@ -32,6 +32,7 @@
>>  #include 
>>  #include 
>>  #include  /* Required for cpu_sibling_mask() in UP configs */
>> +#include 
>>
>>  #define POWERNV_MAX_PSTATES 256
>>  #define PMSR_PSAFE_ENABLE   (1UL << 30)
>> @@ -40,7 +41,7 @@
>>  #define PMSR_LP(x)  ((x >> 48) & 0xFF)
>>
>>  static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1];
>> -static bool rebooting, throttled;
>> +static bool rebooting, throttled, occ_reset;
>>
>>  /*
>>   * Note: The set of pstates consists of contiguous integers, the
>> @@ -395,6 +396,72 @@ static struct notifier_block powernv_cpufreq_reboot_nb 
>> = {
>>  .notifier_call = powernv_cpufreq_reboot_notifier,
>>  };
>>
>> +static char throttle_reason[6][50] = {  "No throttling",
>> +"Power Cap",
>> +"Processor Over Temperature",
>> +"Power Supply Failure",
>> +"OverCurrent",
>> +"OCC Reset"
>> + };
>> +
>> +static int powernv_cpufreq_occ_msg(struct notifier_block *nb,
>> +unsigned long msg_type, void *msg)
>> +{
>> +struct opal_msg *occ_msg = msg;
>> +uint64_t token;
>> +uint64_t chip_id, reason;
>> +
>> +if (msg_type != OPAL_MSG_OCC)
>> +return 0;
>> +token = be64_to_cpu(occ_msg->params[0]);
>> +switch (token) {
>> +case 0:
>> +occ_reset = true;
>> +/*
>> + * powernv_cpufreq_throttle_check() is called in
>> + * target() callback which can detect the throttle state
>> + * for governors like ondemand.
>> + * But static governors will not call target() often thus
>> + * report throttling here.
>> + */
>> +if (!throttled) {
>> +throttled = true;
>> +pr_crit("CPU Frequency is throttled\n");
>> +}
>> +pr_info("OCC in Reset\n");
>> +break;
>> +case 1:
>> +pr_info("OCC is Loaded\n");
>> +break;
>> +case 2:
> 
> You may want to replace the numbers with macros. Like
> OCC_RESET,OCC_LOAD, OCC_THROTTLE for better readability.

Okay will do.

> 
>> +chip_id = be64_to_cpu(occ_msg->params[1]);
>> +reason = be64_to_cpu(occ_msg->params[2]);
>> +if (occ_reset) {
>> +occ_reset = false;
>> +throttled = false;
>> +pr_info("OCC is Active\n");
>> +/* Sanity check for static governors */
>> +powernv_cpufreq_throttle_check(smp_processor_id());
>> +} else if (reason) {
>> +throttled = true;
>> +pr_info("Pmax reduced due to %s on chip %x\n",
>> +throttle_reason[reason], (int)chip_id);
>> +} else {
>> +throttled = false;
>> +pr_info("%s on chip %x\n",
>> +throttle_reason[reason], (int)chip_id);
> 
> Don't you need a powernv_cpufreq_throttle_check() here?  Or is it ok to
> rely on the OCC notification for unthrottle ?

Yes we need to check. Fixing this in v2.

Thanks and Regards,
Shilpa

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH] coresight: fix typo in of_coresight.c

2015-04-27 Thread Pankaj Dubey

Fix an obvious typo in of_coresight.c:
%s/non-configuable/non-configurable

Signed-off-by: Pankaj Dubey 
---
 drivers/hwtracing/coresight/of_coresight.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/hwtracing/coresight/of_coresight.c b/drivers/hwtracing/coresight/of_coresight.c
index 35e51ce..b097361 100644
--- a/drivers/hwtracing/coresight/of_coresight.c
+++ b/drivers/hwtracing/coresight/of_coresight.c
@@ -37,7 +37,7 @@ of_coresight_get_endpoint_device(struct device_node *endpoint)
struct device *dev = NULL;
 
/*
-* If we have a non-configuable replicator, it will be found on the
+* If we have a non-configurable replicator, it will be found on the
 * platform bus.
 */
dev = bus_find_device(&platform_bus_type, NULL,
-- 
2.2.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 2/2] cpufreq: powernv: Register for OCC related opal_message notification

2015-04-27 Thread Shilpasri G Bhat

Hi Viresh,

On 04/27/2015 10:02 AM, Viresh Kumar wrote:
> On 22 April 2015 at 22:34, Shilpasri G Bhat
>  wrote:
>> diff --git a/drivers/cpufreq/powernv-cpufreq.c 
>> b/drivers/cpufreq/powernv-cpufreq.c
> 
>> +static char throttle_reason[6][50] = { "No throttling",
> 
> Don't need to mention 6 here.
> 
> And the max length you need right now is 27, so maybe s/50/30 ?
> 
> Also, start 'No Throttling' in a new line, like below.

Will do.
> 
>> +   "Power Cap",
>> +   "Processor Over Temperature",
>> +   "Power Supply Failure",
>> +   "OverCurrent",
> 
> s/OverCurrent/Over Current/ ?

Okay.
> 
>> +   "OCC Reset"
>> +};
>> +
>> +static int powernv_cpufreq_occ_msg(struct notifier_block *nb,
>> +   unsigned long msg_type, void *msg)
>> +{
>> +   struct opal_msg *occ_msg = msg;
>> +   uint64_t token;
>> +   uint64_t chip_id, reason;
>> +
>> +   if (msg_type != OPAL_MSG_OCC)
>> +   return 0;
> 
> Blank line here.

Okay
> 
>> +   token = be64_to_cpu(occ_msg->params[0]);
> 
> Here as well..
> 
>> +   switch (token) {
>> +   case 0:
>> +   occ_reset = true;
>> +   /*
>> +* powernv_cpufreq_throttle_check() is called in
>> +* target() callback which can detect the throttle state
>> +* for governors like ondemand.
>> +* But static governors will not call target() often thus
>> +* report throttling here.
>> +*/
> 
> Now, do I understand correctly that this notifier will be called as
> soon as we switch throttling state ?
> 
> If yes, then do we still need the throttle_check() routine you added
> earlier ? Maybe not.

We cannot remove throttle_check() routine for the following reasons:
1) To report old firmware bugs which do not restore frequency control to host
after an OCC reset.
2) In BMC based boxes if OCC crashes currently firmware will not send 'reset'
and 'load' messages, in such cases throttle_check() will be sufficient to
monitor a throttled state caused by 'reset'.
3) Throttle reporting in old firmwares which do not have this notification.

> 
>> +   if (!throttled) {
>> +   throttled = true;
>> +   pr_crit("CPU Frequency is throttled\n");
>> +   }
>> +   pr_info("OCC in Reset\n");
>> +   break;
>> +   case 1:
>> +   pr_info("OCC is Loaded\n");
>> +   break;
>> +   case 2:
>> +   chip_id = be64_to_cpu(occ_msg->params[1]);
>> +   reason = be64_to_cpu(occ_msg->params[2]);
> 
> Blank line here.

Okay
> 
>> +   if (occ_reset) {
>> +   occ_reset = false;
>> +   throttled = false;
>> +   pr_info("OCC is Active\n");
>> +   /* Sanity check for static governors */
>> +   powernv_cpufreq_throttle_check(smp_processor_id());
>> +   } else if (reason) {
>> +   throttled = true;
>> +   pr_info("Pmax reduced due to %s on chip %x\n",
>> +   throttle_reason[reason], 
>> (int)chip_id);
>> +   } else {
>> +   throttled = false;
>> +   pr_info("%s on chip %x\n",
>> +   throttle_reason[reason], 
>> (int)chip_id);
>> +   }
> 
> Run checkpatch with --strict option, and you will see some warnings.

Okay will do.

Thanks and Regards,
Shilpa

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: revert "fs/befs/linuxvfs.c: replace strncpy by strlcpy"

2015-04-27 Thread Fabian Frederick



> On 28 April 2015 at 05:48 Al Viro  wrote:
>
>
> commit 39d7a29f867bd5a4a551fad6bb3812ceddb0bce1
> Author: Fabian Frederick 
> Date:   Fri Jun 6 14:36:15 2014 -0700
>
>     fs/befs/linuxvfs.c: replace strncpy by strlcpy
>     
>     strncpy + end of string assignment replaced by strlcpy
>
> replaces perfectly safe code with undefined behaviour.  All in the name
> of "security hardening", presumably.  Folks, seeing the words "designed to be
> safer, more consistent, and less error prone replacement" in a manpage does
> *NOT* mean "OK, quit reading it - no need to go further, not even to the end
> of the paragraph".  Because in the end of that paragraph it says "This means
> that for strlcpy() src must be NUL-terminated".  And sure enough, our
> implementation relies on that - it starts with strlen().
>
> strncpy() is guaranteed not to look further than size.  strlcpy() is *NOT*.
> strncpy() on unvalidated source is safe, provided that you sanitize the copy;
> strlcpy() on anything like that is an invitation for nasal daemons.
>
> Sure, we can (and probably should) make strlcpy(dst, src, n) never access
> memory beyond src + n - 1, but this kind of cargo-culting is a Bad Thing(tm);
> mindless "security improvements" without so much as bothering to RTFM are
> asking for trouble.  And in userland code anything like that _can't_ be
> papered over afterwards - not unless you can patch every libc implementation
> out there.
>
> This particular code is completely pointless - if anything, it should've been
> memcpy() + nd_terminate_link()...
>
> Al, very unhappy about the prospect of looking through ~2000 calls of
> strlcpy()
> we have in the tree...

Sorry Al, I thought it was more secure.
I guess the 2 following patches should be reversed as well :

6cb103b6f45a
"fs/befs/btree.c: replace strncpy by strlcpy + coding style fixing"

69201bb11327
"fs/ocfs2/super.c: use OCFS2_MAX_VOL_LABEL_LEN and strlcpy"

Regards,
Fabian
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v2] spi: omap2-mcspi: Add support for GPIO chipselects

2015-04-27 Thread Martin Sperl


> On 28.04.2015, at 03:21, Michael Welling  wrote:
> If I were to attempt to convert the driver to use the core chipselect support,
> how would I go about doing it?
> 
> Is there another driver that I can use for reference?
You may look into this patch: e34ff011c70e5f4ef219141711142d5111ae6ebb
for the spi-bcm2835 driver, which did the conversion to the new transfer_one
interface (and framework based GPIO chipselects).

For most parts all you have to do is take the contents of the loop over all the
spi_transfers inside your master->transfer_one_message method and create a new
method with it and assign it to master->transfer_one.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v2 6/8] clk: sirf: Constify parent names in clock init data

2015-04-27 Thread Krzysztof Kozlowski

The array of parent names can be made an array of const pointers to
const strings.

Signed-off-by: Krzysztof Kozlowski 
---
 drivers/clk/sirf/clk-common.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/clk/sirf/clk-common.c b/drivers/clk/sirf/clk-common.c
index 37af51c5f213..e9cf5730effe 100644
--- a/drivers/clk/sirf/clk-common.c
+++ b/drivers/clk/sirf/clk-common.c
@@ -188,7 +188,7 @@ static struct clk_ops std_pll_ops = {
.set_rate = pll_clk_set_rate,
 };
 
-static const char *pll_clk_parents[] = {
+static const char * const pll_clk_parents[] = {
"osc",
 };
 
@@ -284,7 +284,7 @@ static struct clk_hw usb_pll_clk_hw = {
  * clock domains - cpu, mem, sys/io, dsp, gfx
  */
 
-static const char *dmn_clk_parents[] = {
+static const char * const dmn_clk_parents[] = {
"rtc",
"osc",
"pll1",
@@ -673,7 +673,7 @@ static void std_clk_disable(struct clk_hw *hw)
clkc_writel(val, reg);
 }
 
-static const char *std_clk_io_parents[] = {
+static const char * const std_clk_io_parents[] = {
"io",
 };
 
@@ -949,7 +949,7 @@ static struct clk_std clk_pulse = {
},
 };
 
-static const char *std_clk_dsp_parents[] = {
+static const char * const std_clk_dsp_parents[] = {
"dsp",
 };
 
@@ -981,7 +981,7 @@ static struct clk_std clk_mf = {
},
 };
 
-static const char *std_clk_sys_parents[] = {
+static const char * const std_clk_sys_parents[] = {
"sys",
 };
 
@@ -999,7 +999,7 @@ static struct clk_std clk_security = {
},
 };
 
-static const char *std_clk_usb_parents[] = {
+static const char * const std_clk_usb_parents[] = {
"usb_pll",
 };
 
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v2 4/8] clk: tegra: Fix duplicate const for parent names

2015-04-27 Thread Krzysztof Kozlowski

Replace the duplicated const keyword for 'emc_parent_clk_names' with a
proper array of const pointers to const strings.

Signed-off-by: Krzysztof Kozlowski 
---
 drivers/clk/tegra/clk-emc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/clk/tegra/clk-emc.c b/drivers/clk/tegra/clk-emc.c
index 8757feda4a11..59697c61f2f1 100644
--- a/drivers/clk/tegra/clk-emc.c
+++ b/drivers/clk/tegra/clk-emc.c
@@ -45,7 +45,7 @@
 #define CLK_SOURCE_EMC_EMC_2X_CLK_SRC(x) (((x) & CLK_SOURCE_EMC_EMC_2X_CLK_SRC_MASK) << \
  CLK_SOURCE_EMC_EMC_2X_CLK_SRC_SHIFT)
 
-static const char const *emc_parent_clk_names[] = {
+static const char * const emc_parent_clk_names[] = {
"pll_m", "pll_c", "pll_p", "clk_m", "pll_m_ud",
"pll_c2", "pll_c3", "pll_c_ud"
 };
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v2 5/8] clk: cdce706: Constify parent names in clock init data

2015-04-27 Thread Krzysztof Kozlowski

The array of parent names can be made an array of const pointers to
const strings.

Signed-off-by: Krzysztof Kozlowski 
---
 drivers/clk/clk-cdce706.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/clk/clk-cdce706.c b/drivers/clk/clk-cdce706.c
index b8e4f8a822e9..8a2dfd0012f3 100644
--- a/drivers/clk/clk-cdce706.c
+++ b/drivers/clk/clk-cdce706.c
@@ -94,7 +94,7 @@ static const char * const cdce706_source_name[] = {
"clk_in0", "clk_in1",
 };
 
-static const char *cdce706_clkin_name[] = {
+static const char * const cdce706_clkin_name[] = {
"clk_in",
 };
 
@@ -102,7 +102,7 @@ static const char * const cdce706_pll_name[] = {
"pll1", "pll2", "pll3",
 };
 
-static const char *cdce706_divider_parent_name[] = {
+static const char * const cdce706_divider_parent_name[] = {
"clk_in", "pll1", "pll2", "pll2", "pll3",
 };
 
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v2 8/8] MIPS: Alchemy: Remove unneeded cast removing const

2015-04-27 Thread Krzysztof Kozlowski

Parent names in clock init data are now an array of const pointers to
const strings, so the cast is not needed.

Signed-off-by: Krzysztof Kozlowski 
---
 arch/mips/alchemy/common/clock.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/mips/alchemy/common/clock.c b/arch/mips/alchemy/common/clock.c
index 6a98d2cb402c..6e46abe0dac6 100644
--- a/arch/mips/alchemy/common/clock.c
+++ b/arch/mips/alchemy/common/clock.c
@@ -752,12 +752,12 @@ static int __init alchemy_clk_init_fgens(int ctype)
switch (ctype) {
case ALCHEMY_CPU_AU1000...ALCHEMY_CPU_AU1200:
id.ops = &alchemy_clkops_fgenv1;
-   id.parent_names = (const char **)alchemy_clk_fgv1_parents;
+   id.parent_names = alchemy_clk_fgv1_parents;
id.num_parents = 2;
break;
case ALCHEMY_CPU_AU1300:
id.ops = &alchemy_clkops_fgenv2;
-   id.parent_names = (const char **)alchemy_clk_fgv2_parents;
+   id.parent_names = alchemy_clk_fgv2_parents;
id.num_parents = 3;
break;
default:
@@ -961,7 +961,7 @@ static int __init alchemy_clk_setup_imux(int ctype)
struct clk *c;
 
id.ops = &alchemy_clkops_csrc;
-   id.parent_names = (const char **)alchemy_clk_csrc_parents;
+   id.parent_names = alchemy_clk_csrc_parents;
id.num_parents = 7;
id.flags = CLK_SET_RATE_PARENT | CLK_GET_RATE_NOCACHE;
 
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v2 7/8] clk: ls1x: Fix duplicate const for parent names

2015-04-27 Thread Krzysztof Kozlowski

Replace the duplicated const keyword with a proper array of const
pointers to const strings.

Signed-off-by: Krzysztof Kozlowski 
---
 drivers/clk/clk-ls1x.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/clk/clk-ls1x.c b/drivers/clk/clk-ls1x.c
index ca80103ac188..d4c61985f448 100644
--- a/drivers/clk/clk-ls1x.c
+++ b/drivers/clk/clk-ls1x.c
@@ -80,9 +80,9 @@ static struct clk *__init clk_register_pll(struct device *dev,
return clk;
 }
 
-static const char const *cpu_parents[] = { "cpu_clk_div", "osc_33m_clk", };
-static const char const *ahb_parents[] = { "ahb_clk_div", "osc_33m_clk", };
-static const char const *dc_parents[] = { "dc_clk_div", "osc_33m_clk", };
+static const char * const cpu_parents[] = { "cpu_clk_div", "osc_33m_clk", };
+static const char * const ahb_parents[] = { "ahb_clk_div", "osc_33m_clk", };
+static const char * const dc_parents[] = { "dc_clk_div", "osc_33m_clk", };
 
 void __init ls1x_clk_init(void)
 {
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v2 3/8] clk: tegra: Fix inconsistent indenting

2015-04-27 Thread Krzysztof Kozlowski

Fix the indentation: spaces were used instead of tabs, and the number
of spaces was wrong as well.

Signed-off-by: Krzysztof Kozlowski 
---
 drivers/clk/tegra/clk-emc.c | 16 
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/clk/tegra/clk-emc.c b/drivers/clk/tegra/clk-emc.c
index 615da43a508d..8757feda4a11 100644
--- a/drivers/clk/tegra/clk-emc.c
+++ b/drivers/clk/tegra/clk-emc.c
@@ -130,11 +130,11 @@ static long emc_determine_rate(struct clk_hw *hw, unsigned long rate,
 
tegra = container_of(hw, struct tegra_clk_emc, hw);
 
- for (i = 0; i < tegra->num_timings; i++) {
-if (tegra->timings[i].ram_code != ram_code)
-continue;
+   for (i = 0; i < tegra->num_timings; i++) {
+   if (tegra->timings[i].ram_code != ram_code)
+   continue;
 
-timing = tegra->timings + i;
+   timing = tegra->timings + i;
 
if (timing->rate > max_rate) {
i = min(i, 1);
@@ -145,11 +145,11 @@ static long emc_determine_rate(struct clk_hw *hw, unsigned long rate,
continue;
 
if (timing->rate >= rate)
-return timing->rate;
-}
+   return timing->rate;
+   }
 
-if (timing)
-return timing->rate;
+   if (timing)
+   return timing->rate;
 
return __clk_get_rate(hw->clk);
 }
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v2 1/8] clk: rockchip: Staticize file-scope declarations

2015-04-27 Thread Krzysztof Kozlowski

Add missing static to local (file-scope only) symbols.

Signed-off-by: Krzysztof Kozlowski 
---
 drivers/clk/rockchip/clk-rk3188.c | 2 +-
 drivers/clk/rockchip/clk-rk3288.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/clk/rockchip/clk-rk3188.c b/drivers/clk/rockchip/clk-rk3188.c
index 556ce041d371..e4f9d472f1ff 100644
--- a/drivers/clk/rockchip/clk-rk3188.c
+++ b/drivers/clk/rockchip/clk-rk3188.c
@@ -26,7 +26,7 @@ enum rk3188_plls {
apll, cpll, dpll, gpll,
 };
 
-struct rockchip_pll_rate_table rk3188_pll_rates[] = {
+static struct rockchip_pll_rate_table rk3188_pll_rates[] = {
RK3066_PLL_RATE(220800, 1, 92, 1),
RK3066_PLL_RATE(218400, 1, 91, 1),
RK3066_PLL_RATE(216000, 1, 90, 1),
diff --git a/drivers/clk/rockchip/clk-rk3288.c b/drivers/clk/rockchip/clk-rk3288.c
index d17eb4528a28..4f817ed9e6ee 100644
--- a/drivers/clk/rockchip/clk-rk3288.c
+++ b/drivers/clk/rockchip/clk-rk3288.c
@@ -27,7 +27,7 @@ enum rk3288_plls {
apll, dpll, cpll, gpll, npll,
 };
 
-struct rockchip_pll_rate_table rk3288_pll_rates[] = {
+static struct rockchip_pll_rate_table rk3288_pll_rates[] = {
RK3066_PLL_RATE(220800, 1, 92, 1),
RK3066_PLL_RATE(218400, 1, 91, 1),
RK3066_PLL_RATE(216000, 1, 90, 1),
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v2 2/8] clk: exynos: Staticize file-scope declarations

2015-04-27 Thread Krzysztof Kozlowski

Add missing static to local (file-scope only) symbols.

Signed-off-by: Krzysztof Kozlowski 
---
 drivers/clk/samsung/clk-exynos5260.c | 74 ++--
 drivers/clk/samsung/clk-exynos5420.c | 10 ++---
 2 files changed, 42 insertions(+), 42 deletions(-)

diff --git a/drivers/clk/samsung/clk-exynos5260.c b/drivers/clk/samsung/clk-exynos5260.c
index e2e5193d1049..df1d83c45554 100644
--- a/drivers/clk/samsung/clk-exynos5260.c
+++ b/drivers/clk/samsung/clk-exynos5260.c
@@ -94,7 +94,7 @@ PNAME(mout_aud_pll_user_p) = {"fin_pll", "fout_aud_pll"};
 PNAME(mout_sclk_aud_i2s_p) = {"mout_aud_pll_user", "ioclk_i2s_cdclk"};
 PNAME(mout_sclk_aud_pcm_p) = {"mout_aud_pll_user", "ioclk_pcm_extclk"};
 
-struct samsung_mux_clock aud_mux_clks[] __initdata = {
+static struct samsung_mux_clock aud_mux_clks[] __initdata = {
MUX(AUD_MOUT_AUD_PLL_USER, "mout_aud_pll_user", mout_aud_pll_user_p,
MUX_SEL_AUD, 0, 1),
MUX(AUD_MOUT_SCLK_AUD_I2S, "mout_sclk_aud_i2s", mout_sclk_aud_i2s_p,
@@ -103,7 +103,7 @@ struct samsung_mux_clock aud_mux_clks[] __initdata = {
MUX_SEL_AUD, 8, 1),
 };
 
-struct samsung_div_clock aud_div_clks[] __initdata = {
+static struct samsung_div_clock aud_div_clks[] __initdata = {
DIV(AUD_DOUT_ACLK_AUD_131, "dout_aclk_aud_131", "mout_aud_pll_user",
DIV_AUD0, 0, 4),
 
@@ -115,7 +115,7 @@ struct samsung_div_clock aud_div_clks[] __initdata = {
DIV_AUD1, 12, 4),
 };
 
-struct samsung_gate_clock aud_gate_clks[] __initdata = {
+static struct samsung_gate_clock aud_gate_clks[] __initdata = {
GATE(AUD_SCLK_I2S, "sclk_aud_i2s", "dout_sclk_aud_i2s",
EN_SCLK_AUD, 0, CLK_SET_RATE_PARENT, 0),
GATE(AUD_SCLK_PCM, "sclk_aud_pcm", "dout_sclk_aud_pcm",
@@ -203,7 +203,7 @@ PNAME(mout_phyclk_mipi_dphy_4lmrxclk_esc0_user_p) = {"fin_pll",
 PNAME(mout_sclk_hdmi_spdif_p) = {"fin_pll", "ioclk_spdif_extclk",
"dout_aclk_peri_aud", "phyclk_hdmi_phy_ref_cko"};
 
-struct samsung_mux_clock disp_mux_clks[] __initdata = {
+static struct samsung_mux_clock disp_mux_clks[] __initdata = {
MUX(DISP_MOUT_ACLK_DISP_333_USER, "mout_aclk_disp_333_user",
mout_aclk_disp_333_user_p,
MUX_SEL_DISP0, 0, 1),
@@ -272,7 +272,7 @@ struct samsung_mux_clock disp_mux_clks[] __initdata = {
MUX_SEL_DISP4, 4, 2),
 };
 
-struct samsung_div_clock disp_div_clks[] __initdata = {
+static struct samsung_div_clock disp_div_clks[] __initdata = {
DIV(DISP_DOUT_PCLK_DISP_111, "dout_pclk_disp_111",
"mout_aclk_disp_222_user",
DIV_DISP, 8, 4),
@@ -285,7 +285,7 @@ struct samsung_div_clock disp_div_clks[] __initdata = {
DIV_DISP, 16, 4),
 };
 
-struct samsung_gate_clock disp_gate_clks[] __initdata = {
+static struct samsung_gate_clock disp_gate_clks[] __initdata = {
GATE(DISP_MOUT_HDMI_PHY_PIXEL_USER, "sclk_hdmi_link_i_pixel",
"mout_phyclk_hdmi_phy_pixel_clko_user",
EN_SCLK_DISP0, 26, CLK_SET_RATE_PARENT, 0),
@@ -363,13 +363,13 @@ static unsigned long egl_clk_regs[] __initdata = {
 PNAME(mout_egl_b_p) = {"mout_egl_pll", "dout_bus_pll"};
 PNAME(mout_egl_pll_p) = {"fin_pll", "fout_egl_pll"};
 
-struct samsung_mux_clock egl_mux_clks[] __initdata = {
+static struct samsung_mux_clock egl_mux_clks[] __initdata = {
MUX(EGL_MOUT_EGL_PLL, "mout_egl_pll", mout_egl_pll_p,
MUX_SEL_EGL, 4, 1),
MUX(EGL_MOUT_EGL_B, "mout_egl_b", mout_egl_b_p, MUX_SEL_EGL, 16, 1),
 };
 
-struct samsung_div_clock egl_div_clks[] __initdata = {
+static struct samsung_div_clock egl_div_clks[] __initdata = {
DIV(EGL_DOUT_EGL1, "dout_egl1", "mout_egl_b", DIV_EGL, 0, 3),
DIV(EGL_DOUT_EGL2, "dout_egl2", "dout_egl1", DIV_EGL, 4, 3),
DIV(EGL_DOUT_ACLK_EGL, "dout_aclk_egl", "dout_egl2", DIV_EGL, 8, 3),
@@ -433,7 +433,7 @@ PNAME(mout_phyclk_usbdrd30_pipe_pclk_user_p) = {"fin_pll",
 PNAME(mout_phyclk_usbdrd30_phyclock_user_p) = {"fin_pll",
"phyclk_usbdrd30_udrd30_phyclock"};
 
-struct samsung_mux_clock fsys_mux_clks[] __initdata = {
+static struct samsung_mux_clock fsys_mux_clks[] __initdata = {
MUX(FSYS_MOUT_PHYCLK_USBDRD30_PHYCLOCK_USER,
"mout_phyclk_usbdrd30_phyclock_user",
mout_phyclk_usbdrd30_phyclock_user_p,
@@ -456,7 +456,7 @@ struct samsung_mux_clock fsys_mux_clks[] __initdata = {
MUX_SEL_FSYS1, 16, 1),
 };
 
-struct samsung_gate_clock fsys_gate_clks[] __initdata = {
+static struct samsung_gate_clock fsys_gate_clks[] __initdata = {
GATE(FSYS_PHYCLK_USBHOST20, "phyclk_usbhost20_phyclock",
"mout_phyclk_usbdrd30_phyclock_user",
EN_SCLK_FSYS, 1, 0, 0),
@@ -537,18 +537,18 @@ static unsigned lo

[PATCH v2 0/8] clk: Minor cleanups

2015-04-27 Thread Krzysztof Kozlowski

Hi,


Changes since v1

1. Rebase on next-20150427 and Sascha Hauer's:
   clk: make strings in parent name arrays const [1]
2. Add patch "clk: tegra: Fix inconsistent indenting".


Description and dependencies

Small cleanups for different clock drivers.

The first three patches are independent.

Rest of the patches (these related to constifying parent names,
including the change for MIPS) depend on the "clk: make strings in
parent name arrays const" from Sascha Hauer [1].


Tested on Arndale Octa (Exynos5420) and Trats2 (Exynos4412). Other
drivers (and MIPS related) only compile tested plus some static
checkers.


[1] http://www.spinics.net/lists/arm-kernel/msg413763.html

Best regards,
Krzysztof

Krzysztof Kozlowski (8):
  clk: rockchip: Staticize file-scope declarations
  clk: exynos: Staticize file-scope declarations
  clk: tegra: Fix inconsistent indenting
  clk: tegra: Fix duplicate const for parent names
  clk: cdce706: Constify parent names in clock init data
  clk: sirf: Constify parent names in clock init data
  clk: ls1x: Fix duplicate const for parent names
  MIPS: Alchemy: Remove unneeded cast removing const

 arch/mips/alchemy/common/clock.c |  6 +--
 drivers/clk/clk-cdce706.c|  4 +-
 drivers/clk/clk-ls1x.c   |  6 +--
 drivers/clk/rockchip/clk-rk3188.c|  2 +-
 drivers/clk/rockchip/clk-rk3288.c|  2 +-
 drivers/clk/samsung/clk-exynos5260.c | 74 ++--
 drivers/clk/samsung/clk-exynos5420.c | 10 ++---
 drivers/clk/sirf/clk-common.c| 12 +++---
 drivers/clk/tegra/clk-emc.c  | 18 -
 9 files changed, 67 insertions(+), 67 deletions(-)

-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 3/3 V8] workqueue: Allow modifying low level unbound workqueue cpumask

2015-04-27 Thread Mike Galbraith

On Mon, 2015-04-27 at 23:44 -0400, Tejun Heo wrote:

> > So, we need an API to modify the wq_unbound_cpumask, and I provided
> > this public function.  Otherwise, the other code can't modify it.
> 
> I see.  I don't have too strong an opinion; however, changing the mask
> is a fairly heavy operation.  Are there specific reasons why we don't
> want to follow the nohz config right away?

Isolation is not only applicable to nohz_full.  Many loads are
unsuitable for nohz_full, yet require maximum isolation.

ATM, nohz_full is not dynamic, but hopefully one day will be.  In the
here and now, we can isolate cores from the scheduler on the fly via
cpusets, a prime API user candidate.

-Mike

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

RE: [PATCH v2 0/6] Drivers: hv: vmbus: fair round robin algorithm for vmbus_get_outgoing_channel()

2015-04-27 Thread Dexuan Cui

> -Original Message-
> From: Vitaly Kuznetsov [mailto:vkuzn...@redhat.com]
> Sent: Tuesday, April 28, 2015 1:04
> To: KY Srinivasan
> Cc: Haiyang Zhang; de...@linuxdriverproject.org; linux-
> ker...@vger.kernel.org; Dexuan Cui
> Subject: [PATCH v2 0/6] Drivers: hv: vmbus: fair round robin algorithm for
> vmbus_get_outgoing_channel()
> 
> Changes in v2:
> - Address Dexuan's review comments:
>   PATCH 3/6: s,channel,primary_channel;
>   PATCH 4/6: add a forward declaration instead of moving code around;
>   PATCH 6/6: fix an off-by-one
> - Change the algorithm in PATCH 6/6:
>   Instead of a simple round robin we first try to find a (sub)channel with
>   the current_cpu == target_cpu and we fallback to a round robin when we
> fail
>   to find one.
> 
> K. Y., Dexuan, can you please give it a spin in various testing environments
> you have? Thanks!
> 
> Original description:
> 
> This series is a continuation of the "Drivers: hv: vmbus: Use a round-robin
> algorithm for picking the outgoing channel" work. It is supposed to bring
> two
> significant changes:
> 1) Subchannels for a channel are distributed evenly across all vcpus we have.
>Currently we try to distribute all channels (including subchannels) across
>all vcpus, this approach doesn't guarantee that the particular channel's
>subchannels will be distributed in the same way as we process all offer
>requests in some random order. (Patch 05)
> 2) Channel picking based on the current vcpu is dropped from
>vmbus_get_outgoing_channel() in favor of a fair round robin. (Patch 06)
> (this
>is not true anymore, see 'Changes').
> 
> Patches 01 - 04 are cleanup/refactoring.
> 
> Vitaly Kuznetsov (6):
>   Drivers: hv: vmbus: unify calls to percpu_channel_enq()
>   Drivers: hv: vmbus: briefly comment num_sc and next_oc
>   Drivers: hv: vmbus: decrease num_sc on subchannel removal
>   Drivers: hv: vmbus: move init_vp_index() call to vmbus_process_offer()
>   Drivers: hv: vmbus: distribute subchannels among all vcpus
>   Drivers: hv: vmbus: improve selection of an outgoing channel
> 
>  drivers/hv/channel_mgmt.c | 127 ++-
> ---
>  include/linux/hyperv.h|  12 +++--
>  2 files changed, 80 insertions(+), 59 deletions(-)
> 
> --

Patch 1, 2 and 3 are good to me.

We'll have to test 4~6 for performance change.

Thanks,
-- Dexuan
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH] ASoC: fix simple_return.cocci warnings

2015-04-27 Thread kbuild test robot

sound/soc/codecs/adau1977.c:496:5-8: WARNING: end returns can be simpified

 Simplify a trivial if-return sequence.  Possibly combine with a
 preceding function call.
Generated by: scripts/coccinelle/misc/simple_return.cocci

CC: Lars-Peter Clausen 
Signed-off-by: Fengguang Wu 
---

 adau1977.c |5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

--- a/sound/soc/codecs/adau1977.c
+++ b/sound/soc/codecs/adau1977.c
@@ -493,10 +493,7 @@ static int adau1977_set_bias_level(struc
break;
}
 
-   if (ret)
-   return ret;
-
-   return 0;
+   return ret;
 }
 
 static int adau1977_set_tdm_slot(struct snd_soc_dai *dai, unsigned int tx_mask,
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH] ASoC: fix simple_return.cocci warnings

2015-04-27 Thread kbuild test robot

sound/soc/codecs/ssm2518.c:521:5-8: WARNING: end returns can be simpified

 Simplify a trivial if-return sequence.  Possibly combine with a
 preceding function call.
Generated by: scripts/coccinelle/misc/simple_return.cocci

CC: Lars-Peter Clausen 
Signed-off-by: Fengguang Wu 
---

 ssm2518.c |5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

--- a/sound/soc/codecs/ssm2518.c
+++ b/sound/soc/codecs/ssm2518.c
@@ -518,10 +518,7 @@ static int ssm2518_set_bias_level(struct
break;
}
 
-   if (ret)
-   return ret;
-
-   return 0;
+   return ret;
 }
 
 static int ssm2518_set_tdm_slot(struct snd_soc_dai *dai, unsigned int tx_mask,
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 3/3 V8] workqueue: Allow modifying low level unbound workqueue cpumask

2015-04-27 Thread Tejun Heo

Hello,

On Tue, Apr 28, 2015 at 10:24:31AM +0800, Lai Jiangshan wrote:
> >> Wouldn't this make a lot more sense above when copying @attrs into
> >> @new_attrs?  The comment there even says "make a copy of @attrs and
> >> sanitize it".  Copy to @new_attrs, mask with wq_unbound_cpumask and
> >> fall back to wq_unbound_cpumask if empty.
> 
> We need to save the user original configured attrs.
> When any time wq_unbound_cpumask is changed, we should use
> the user original configured attrs (cpumask) to re-calculate
> the pwqs and avoid losing any information.

Sure, we can do that for new_attrs and then mask tmp_attrs further w/
wq_unbound_cpumask, no?

Thanks.

-- 
tejun
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

revert "fs/befs/linuxvfs.c: replace strncpy by strlcpy"

2015-04-27 Thread Al Viro

commit 39d7a29f867bd5a4a551fad6bb3812ceddb0bce1
Author: Fabian Frederick 
Date:   Fri Jun 6 14:36:15 2014 -0700

fs/befs/linuxvfs.c: replace strncpy by strlcpy

strncpy + end of string assignment replaced by strlcpy

replaces perfectly safe code with undefined behaviour.  All in the name
of "security hardening", presumably.  Folks, seeing the words "designed to be
safer, more consistent, and less error prone replacement" in a manpage does
*NOT* mean "OK, quit reading it - no need to go further, not even to the end
of the paragraph".  Because in the end of that paragraph it says "This means
that for strlcpy() src must be NUL-terminated".  And sure enough, our
implementation relies on that - it starts with strlen().

strncpy() is guaranteed not to look further than size.  strlcpy() is *NOT*.
strncpy() on unvalidated source is safe, provided that you sanitize the copy;
strlcpy() on anything like that is an invitation for nasal daemons.

Sure, we can (and probably should) make strlcpy(dst, src, n) never access
memory beyond src + n - 1, but this kind of cargo-culting is a Bad Thing(tm);
mindless "security improvements" without so much as bothering to RTFM are
asking for trouble.  And in userland code anything like that _can't_ be
papered over afterwards - not unless you can patch every libc implementation
out there.

This particular code is completely pointless - if anything, it should've been
memcpy() + nd_terminate_link()...

Al, very unhappy about the prospect of looking through ~2000 calls of strlcpy()
we have in the tree...
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 3/3 V8] workqueue: Allow modifying low level unbound workqueue cpumask

2015-04-27 Thread Tejun Heo

Hello,

On Tue, Apr 28, 2015 at 09:44:44AM +0800, Lai Jiangshan wrote:
> >> +int workqueue_set_unbound_cpumask(cpumask_var_t cpumask);
> > 
> > Why is this a public function?
> 
> 
> In V4 patchset, Kevin Hilman had requested the wq_unbound_cpumask
> to be "cpumask_complement(wq_unbound_cpumask, tick_nohz_full_mask);"
> 
> I replied against it and I suggested that wq_unbound_cpumask can be
> re-set after workqueue initialized it.
> 
> And Frederic Weisbecker seemed on my side:
> """
> If it should be the default on NO_HZ_FULL, maybe we should do this from the
> tick nohz code. Some late or fs initcall that will do the workqueue affinity,
> timer affinity, etc...
> """
> 
> So, we need an API to modify the wq_unbound_cpumask, and I provided
> this public function.  Otherwise, the other code can't modify it.

I see.  I don't have too strong an opinion; however, changing the mask
is a fairly heavy operation.  Are there specific reasons why we don't
want to follow the nohz config right away?  Also, even if we do it
this way, the function doesn't need to be EXPORT_SYMBOL_GPL()'d,
right?

> > Is the following list_del() necessary?  The list is never used again,
> > right?
> 
> It isn't necessary. It was added in V7. I thought it could make
> the code more normal.

The problem with doing unnecessary stuff is that it's bound to be
inconsistent and makes the reader wonder whether something else which
requires such extra operation is going on when there's none.  It tends
to mislead more than anything else.

Thanks.

-- 
tejun
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 0/4] extcon: Modify the name of unused external connector

2015-04-27 Thread Krzysztof Kozlowski


On 27.04.2015 21:31, Chanwoo Choi wrote:

This patchset alters the unused names of the external connectors (jig/dock/MHL) as
follows. The names of the jig cable and dock device include non-standard H/W
information. On the user-space side, this information is not necessary. The extcon
core will support another method to pass the specific H/W information to
kernel device drivers and frameworks. For example, the extcon core plans to add
a notifier chain for USB ID/VBUS pin state. If extcon consumers (kernel device
drivers and frameworks) use the notifier chain for USB ID/VBUS, they can get the
state of both JIG and USB when a JIG-USB-ON cable is attached.

And the last patch removes the unused 'num_cables' field in extcon-adc-jack.c.

1. jig cable name
- JIG-USB-ON   -->|
- JIG-USB-OFF  -->|
- JIG-UART-ON  -->|
- JIG-UART-OFF -->|--> JIG

2. dock device name
- Dock-Smart   -->|
- Dock-Desk-->|
- Dock-Audio   -->|
- Dock-Card-->|--> DOCK

3. MHL-TA cable name
- MHL-TA -> TA


Hi,

That makes sense, but doesn't such a change break the interface with
user-space? User-space may expect Dock-xxx for Dock.


Best regards,
Krzysztof

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 1/3] mtd: nand: Add on-die ECC support

2015-04-27 Thread punnaiah choudary kalluri

On Tue, Apr 28, 2015 at 8:52 AM, Brian Norris
 wrote:
> On Tue, Apr 28, 2015 at 08:18:12AM +0530, punnaiah choudary kalluri wrote:
>> On Tue, Apr 28, 2015 at 4:53 AM, Brian Norris
>>  wrote:
>> > On Tue, Apr 28, 2015 at 12:19:16AM +0200, Richard Weinberger wrote:
>> >> Oh, I thought every driver has to implement that function. ;-\
>> >
>> > Nope.
>> >
>> >> But you're right there is a corner case.
>> >
>> > And it's not the only one! Right now, there's no guarantee even that
>> > read_buf() returns raw data, unmodified by the SoC's controller. Plenty
>> > of drivers actually have HW-enabled ECC turned on by default, and so
>> > they override the chip->ecc.read_page() (and sometimes
>> > chip->ecc.read_page_raw() functions, if we're lucky) with something
>> > that pokes the appropriate hardware instead. I expect anything
>> > comprehensive here is probably going to have to utilize
>> > chip->ecc.read_page_raw(), at least if it's provided by the hardware
>> > driver.
>>
>> Yes, overriding the chip->ecc.read_page_raw would solve this.
>
> I'm actually suggesting that (in this patch set, for on-die ECC
> support), maybe we *shouldn't* override chip->ecc.read_page_raw() and
> leave that to be defined by the driver, and then on-die ECC support
> should be added in a way that just calls chip->ecc.read_page_raw(). This
> should work for any driver that already properly supports the raw
> callbacks.

Ok. Understood.

>
>> Agree that
>> read_buf need not be returning raw data always including my new driver for
>> arasan nand flash controller.
>
> I agree with that. At the moment, chip->read_buf() really has very
> driver-specific meaning. Not sure if that's really a good thing, but
> it's the way things are...
>
>> http://lkml.iu.edu/hypermail/linux/kernel/1504.2/00313.html
>
> In the half a minute I just spent looking at this (I may review it
> properly later), I noted a few things:
>
> 1. you don't implement ecc.read_page_raw(); this means we'll probably
> have trouble supporting on-die ECC with your driver, among other things

On-die ECC is optional as long as the controller has better ECC coverage.
The Arasan controller supports up to 24-bit ECC, so there is no point in using
on-die ECC; the driver will always use HW ECC, even for devices with on-die ECC.
This version of the driver will not support ecc.read_page_raw, but
I will add it if the need arises in the future.


>
> 2. your patch is all white-space mangled. Please use your favorite
> search engine to figure out how to get that right. git-send-email is
> your friend.

Oh, sorry. It looks like that was a web link issue. Here is the new one.
 https://lkml.org/lkml/2015/4/16/311

I would also appreciate your time in reviewing this driver.

Thanks,
Punnaiah

>
> Thanks,
> Brian
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 07/23] gpio: sysfs: rename gpiochip registration functions

2015-04-27 Thread Alexandre Courbot

On Mon, Apr 27, 2015 at 6:05 PM, Johan Hovold  wrote:
> On Mon, Apr 27, 2015 at 05:50:54PM +0900, Alexandre Courbot wrote:
>> On Mon, Apr 27, 2015 at 5:27 PM, Johan Hovold  wrote:
>> > On Mon, Apr 27, 2015 at 12:54:36PM +0900, Alexandre Courbot wrote:
>> >> On Wed, Apr 22, 2015 at 12:42 AM, Johan Hovold  wrote:
>> >> > Rename the gpio-chip export/unexport functions to the more descriptive
>> >> > names gpiochip_register and gpiochip_unregister.
>> >>
>> >> Since these functions are related to sysfs, wouldn't
>> >> gpiochip_sysfs_export (or gpiochip_sysfs_register, although the former
>> >> sounds better to me) be even more descriptive?
>> >
>> > I'm trying to get rid of the made up notion of "exporting" things. What
>> > we are doing is to register devices with driver core, and that involves
>> > a representation in sysfs.
>> >
>> > Eventually, a gpio chip should always be registered with driver core and
>> > this is not directly related to the (by then hopefully legacy)
>> > sysfs-interface.
>>
>> I understand and agree, but even after your patch series, registration
>> of a gpio chip with the driver core is still dependent on the
>> CONFIG_GPIO_SYSFS option. So maybe you could push the logic further
>> and either always register GPIO chips (effectively moving the call to
>> device_create into gpiolib.c) and only keep the legacy bits in
>> gpiolib-sysfs.c?
>
> That is the plan yes, but there's only so much I can do in one series.
> ;) The current crazy sysfs API also prevents the decoupling of the sysfs
> interface from chip device registration.

Sounds good then. This patch series is great anyway, so if Linus has
nothing against it I hope we can merge it quickly.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v9 13/17] h8300: configs

2015-04-27 Thread Guenter Roeck

On Mon, Apr 27, 2015 at 02:35:20PM +0900, Yoshinori Sato wrote:
> h8300h-sim_defconfig: H8/300H simulator config.
> h8s-sim_defconfig:H8S simulator config.
> edosk2674_defconfig:  EDOSK2674R evaluation board config.

Did that get lost ?

Guenter

> 
> Signed-off-by: Yoshinori Sato 
> ---
>  arch/h8300/configs/h8300h-sim_defconfig | 53 
> +
>  arch/h8300/configs/h8s-sim_defconfig| 53 
> +
>  2 files changed, 106 insertions(+)
>  create mode 100644 arch/h8300/configs/h8300h-sim_defconfig
>  create mode 100644 arch/h8300/configs/h8s-sim_defconfig
> 
> diff --git a/arch/h8300/configs/h8300h-sim_defconfig 
> b/arch/h8300/configs/h8300h-sim_defconfig
> new file mode 100644
> index 000..bad1a1e
> --- /dev/null
> +++ b/arch/h8300/configs/h8300h-sim_defconfig
> @@ -0,0 +1,53 @@
> +# CONFIG_LOCALVERSION_AUTO is not set
> +# CONFIG_USELIB is not set
> +# CONFIG_INIT_FALLBACK is not set
> +CONFIG_CC_OPTIMIZE_FOR_SIZE=y
> +# CONFIG_UID16 is not set
> +# CONFIG_SYSFS_SYSCALL is not set
> +# CONFIG_KALLSYMS is not set
> +# CONFIG_BASE_FULL is not set
> +# CONFIG_FUTEX is not set
> +# CONFIG_EPOLL is not set
> +# CONFIG_SIGNALFD is not set
> +# CONFIG_TIMERFD is not set
> +# CONFIG_EVENTFD is not set
> +# CONFIG_AIO is not set
> +# CONFIG_ADVISE_SYSCALLS is not set
> +CONFIG_EMBEDDED=y
> +# CONFIG_VM_EVENT_COUNTERS is not set
> +# CONFIG_COMPAT_BRK is not set
> +CONFIG_SLOB=y
> +# CONFIG_BLOCK is not set
> +CONFIG_H8300H_SIM=y
> +CONFIG_CPU_CLOCK=2000
> +CONFIG_RAMSIZE=0x20
> +# CONFIG_BINFMT_SCRIPT is not set
> +CONFIG_BINFMT_FLAT=y
> +# CONFIG_COREDUMP is not set
> +# CONFIG_UEVENT_HELPER is not set
> +# CONFIG_STANDALONE is not set
> +# CONFIG_PREVENT_FIRMWARE_BUILD is not set
> +# CONFIG_FW_LOADER is not set
> +# CONFIG_ALLOW_DEV_COREDUMP is not set
> +# CONFIG_INPUT is not set
> +# CONFIG_SERIO is not set
> +# CONFIG_VT is not set
> +# CONFIG_UNIX98_PTYS is not set
> +# CONFIG_LEGACY_PTYS is not set
> +# CONFIG_DEVKMEM is not set
> +CONFIG_SERIAL_SH_SCI=y
> +CONFIG_SERIAL_SH_SCI_CONSOLE=y
> +# CONFIG_HW_RANDOM is not set
> +# CONFIG_HWMON is not set
> +# CONFIG_USB_SUPPORT is not set
> +# CONFIG_IOMMU_SUPPORT is not set
> +# CONFIG_FILE_LOCKING is not set
> +# CONFIG_DNOTIFY is not set
> +# CONFIG_INOTIFY_USER is not set
> +# CONFIG_PROC_FS is not set
> +# CONFIG_SYSFS is not set
> +# CONFIG_MISC_FILESYSTEMS is not set
> +CONFIG_DEBUG_INFO=y
> +# CONFIG_ENABLE_WARN_DEPRECATED is not set
> +# CONFIG_ENABLE_MUST_CHECK is not set
> +# CONFIG_CRC32 is not set
> diff --git a/arch/h8300/configs/h8s-sim_defconfig 
> b/arch/h8300/configs/h8s-sim_defconfig
> new file mode 100644
> index 000..025cdd8
> --- /dev/null
> +++ b/arch/h8300/configs/h8s-sim_defconfig
> @@ -0,0 +1,53 @@
> +# CONFIG_LOCALVERSION_AUTO is not set
> +# CONFIG_USELIB is not set
> +# CONFIG_INIT_FALLBACK is not set
> +CONFIG_CC_OPTIMIZE_FOR_SIZE=y
> +# CONFIG_UID16 is not set
> +# CONFIG_SYSFS_SYSCALL is not set
> +# CONFIG_KALLSYMS is not set
> +# CONFIG_BASE_FULL is not set
> +# CONFIG_FUTEX is not set
> +# CONFIG_EPOLL is not set
> +# CONFIG_SIGNALFD is not set
> +# CONFIG_TIMERFD is not set
> +# CONFIG_EVENTFD is not set
> +# CONFIG_AIO is not set
> +# CONFIG_ADVISE_SYSCALLS is not set
> +CONFIG_EMBEDDED=y
> +# CONFIG_VM_EVENT_COUNTERS is not set
> +# CONFIG_COMPAT_BRK is not set
> +CONFIG_SLOB=y
> +# CONFIG_BLOCK is not set
> +CONFIG_H8S_SIM=y
> +CONFIG_CPU_CLOCK=
> +CONFIG_RAMSIZE=0x20
> +# CONFIG_BINFMT_SCRIPT is not set
> +CONFIG_BINFMT_FLAT=y
> +# CONFIG_COREDUMP is not set
> +# CONFIG_UEVENT_HELPER is not set
> +# CONFIG_STANDALONE is not set
> +# CONFIG_PREVENT_FIRMWARE_BUILD is not set
> +# CONFIG_FW_LOADER is not set
> +# CONFIG_ALLOW_DEV_COREDUMP is not set
> +# CONFIG_INPUT is not set
> +# CONFIG_SERIO is not set
> +# CONFIG_VT is not set
> +# CONFIG_UNIX98_PTYS is not set
> +# CONFIG_LEGACY_PTYS is not set
> +# CONFIG_DEVKMEM is not set
> +CONFIG_SERIAL_SH_SCI=y
> +CONFIG_SERIAL_SH_SCI_CONSOLE=y
> +# CONFIG_HW_RANDOM is not set
> +# CONFIG_HWMON is not set
> +# CONFIG_USB_SUPPORT is not set
> +# CONFIG_IOMMU_SUPPORT is not set
> +# CONFIG_FILE_LOCKING is not set
> +# CONFIG_DNOTIFY is not set
> +# CONFIG_INOTIFY_USER is not set
> +# CONFIG_PROC_FS is not set
> +# CONFIG_SYSFS is not set
> +# CONFIG_MISC_FILESYSTEMS is not set
> +CONFIG_DEBUG_INFO=y
> +# CONFIG_ENABLE_WARN_DEPRECATED is not set
> +# CONFIG_ENABLE_MUST_CHECK is not set
> +# CONFIG_CRC32 is not set
> -- 
> 2.1.4
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
> 
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] mlx4: Fix tx ring affinity_mask creation

2015-04-27 Thread Benjamin Poirier

On 2015/04/13 17:22, Benjamin Poirier wrote:
> On 2015/04/12 10:03, Ido Shamay wrote:
> > Hi Benjamin,
> > 
> > On 4/10/2015 7:27 PM, Benjamin Poirier wrote:
> > >By default, the number of tx queues is limited by the number of online cpus in
> > >mlx4_en_get_profile(). However, this limit no longer holds after the ethtool
> > >.set_channels method has been called. In that situation, the driver may access
> > >invalid bits of certain cpumask variables when queue_index > nr_cpu_ids.
> > 
> > I must say I don't see the above issue with the current code.
> > Whatever is the modified value of priv->num_tx_rings_p_up, it will set XPS
> > only on queues which have
> > been set with CPU affinity mask (no access to invalid bits).
> 
> The problem is not with the call to netif_set_xps_queue() it is with the
> calls to cpu_online() and cpumask_set_cpu().
> 
> For example, if the user calls `ethtool -L ethX tx 32`, queue_index in
> mlx4_en_create_tx_ring() can be up to 255. Depending on CONFIG_NR_CPUS
> and CONFIG_CPUMASK_OFFSTACK this may result in calls to cpu_online() and
> cpumask_set_cpu() with cpu >= nr_cpumask_bits which is an invalid usage
> of the cpumask api. The driver will potentially read or write beyond the
> end of the bitmap. With CONFIG_CPUMASK_OFFSTACK=y and
> CONFIG_DEBUG_PER_CPU_MAPS=y, the aforementioned ethtool call on a system
> with <32 cpus triggers the warning in cpumask_check(). 
> 

Mellanox, can you please
ack the patch as submitted, or
clarify what changes you'd like to see given my reply above, or
submit a fix of your own for this problem?

Thanks,
-Benjamin

> > 
> > > It's true that when priv->num_tx_rings_p_up > nr_cpus, not all queues will
> > be set with XPS.
> > This is because the code tries to preserve 1:1 mapping of queues to cores,
> > to avoid a double mapping
> > of queues to cores.
> > I guess it's ok to break the 1:1 mapping in this condition, but the commit
> > message should say that instead
> > of invalid bits. Please fix me if I'm wrong.
> > 
> > >Signed-off-by: Benjamin Poirier 
> > >---
> > >  drivers/net/ethernet/mellanox/mlx4/en_tx.c | 8 +---
> > >  1 file changed, 5 insertions(+), 3 deletions(-)
> > >
> > >diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c 
> > >b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
> > >index 55f9f5c..8c234ec 100644
> > >--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
> > >+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
> > >@@ -143,8 +143,10 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
> > >   ring->hwtstamp_tx_type = priv->hwtstamp_config.tx_type;
> > >   ring->queue_index = queue_index;
> > >-  if (queue_index < priv->num_tx_rings_p_up && cpu_online(queue_index))
> > >-  cpumask_set_cpu(queue_index, &ring->affinity_mask);
> > >+  if (queue_index < priv->num_tx_rings_p_up)
> > >+  cpumask_set_cpu_local_first(queue_index,
> > >+  priv->mdev->dev->numa_node,
> > >+  &ring->affinity_mask);
> > Moving from cpumask_set_cpu to cpumask_set_cpu_local_first is great, but
> > should come in a different commit, since
> > the behavior of the XPS is changed here (xps_cpus[tx_ring[queue_index]] !=
> > queue_index from now).
> > Commit should state of this behavior change.
> > Thanks a lot Benjamin.
> > >   *pring = ring;
> > >   return 0;
> > >@@ -213,7 +215,7 @@ int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv,
> > >   err = mlx4_qp_to_ready(mdev->dev, &ring->wqres.mtt, &ring->context,
> > >  &ring->qp, &ring->qp_state);
> > >-  if (!user_prio && cpu_online(ring->queue_index))
> > >+  if (!cpumask_empty(&ring->affinity_mask))
> > >   netif_set_xps_queue(priv->dev, &ring->affinity_mask,
> > >   ring->queue_index);
> > 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v2] ipc/mqueue: remove STATE_PENDING

2015-04-27 Thread Davidlohr Bueso

On Fri, 2015-04-24 at 00:18 +0200, Thomas Gleixner wrote:
> Can you please convert that over to Peters lockless wake queues so we
> do not reimplement the same thing open coded here.

So I'd like to include this in my v2 of the wake_q stuff, along with the
futex conversion. What do you guys think of the following?

Thanks,
Davidlohr

8<-
Subject: [PATCH] ipc/mqueue: lockless pipelined wakeups

This patch moves the wake_up_process() invocation so it is not done under
the info->lock by making use of a lockless wake_q. With this change, the
waiter is woken up once it is STATE_READY and it does not need to loop
on SMP if it is still in STATE_PENDING. In the timeout case we still need
to grab the info->lock to verify the state.

This change should also avoid the introduction of preempt_disable() in
-RT which avoids a busy-loop which polls for the STATE_PENDING -> STATE_READY
change if the waiter has a higher priority compared to the waker.

Additionally, this patch micro-optimizes wq_sleep by using the cheaper
cousin of set_current_state(TASK_INTERRUPTIBLE) as we will block no
matter what, thus get rid of the implied barrier. Secondly, and related
to the lockless wakeups, comment the smp_wmb and add barrier pairing on
the reader side.

Based-on-work-from: Sebastian Andrzej Siewior 
Signed-off-by: Davidlohr Bueso 
---
 ipc/mqueue.c | 52 +---
 1 file changed, 33 insertions(+), 19 deletions(-)

diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 3aaea7f..11c7b92 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -47,8 +47,7 @@
 #define RECV   1
 
 #define STATE_NONE 0
-#define STATE_PENDING  1
-#define STATE_READY2
+#define STATE_READY1
 
 struct posix_msg_tree_node {
struct rb_node  rb_node;
@@ -571,15 +570,13 @@ static int wq_sleep(struct mqueue_inode_info *info, int 
sr,
wq_add(info, sr, ewp);
 
for (;;) {
-   set_current_state(TASK_INTERRUPTIBLE);
+   __set_current_state(TASK_INTERRUPTIBLE);
 
spin_unlock(&info->lock);
time = schedule_hrtimeout_range_clock(timeout, 0,
HRTIMER_MODE_ABS, CLOCK_REALTIME);
 
-   while (ewp->state == STATE_PENDING)
-   cpu_relax();
-
+   smp_rmb(); /* pairs with smp_wmb() in pipelined_send/receive */
if (ewp->state == STATE_READY) {
retval = 0;
goto out;
@@ -909,9 +906,14 @@ out_name:
  * bypasses the message array and directly hands the message over to the
  * receiver.
  * The receiver accepts the message and returns without grabbing the queue
- * spinlock. Therefore an intermediate STATE_PENDING state and memory barriers
- * are necessary. The same algorithm is used for sysv semaphores, see
- * ipc/sem.c for more details.
+ * spinlock. The used algorithm is different from sysv semaphores (ipc/sem.c):
+ *
+ * - Set pointer to message.
+ * - Queue the receiver task's for later wakeup (without the info->lock).
+ * - Update its state to STATE_READY. Now the receiver can continue.
+ * - Wake up the process after the lock is dropped. Should the process wake up
+ *   before this wakeup (due to a timeout or a signal) it will either see
+ *   STATE_READY and continue or acquire the lock to check the sate again.
  *
  * The same algorithm is used for senders.
  */
@@ -919,21 +921,29 @@ out_name:
 /* pipelined_send() - send a message directly to the task waiting in
  * sys_mq_timedreceive() (without inserting message into a queue).
  */
-static inline void pipelined_send(struct mqueue_inode_info *info,
+static inline void pipelined_send(struct wake_q_head *wake_q,
+ struct mqueue_inode_info *info,
  struct msg_msg *message,
  struct ext_wait_queue *receiver)
 {
receiver->msg = message;
list_del(&receiver->list);
-   receiver->state = STATE_PENDING;
-   wake_up_process(receiver->task);
-   smp_wmb();
+   wake_q_add(wake_q, receiver->task);
+   /*
+* Ensure that updating receiver->state is the last
+* write operation: As once set, the receiver can continue,
+* and if we don't have the reference count from the wake_q,
+* yet, at that point we can later have a use-after-free
+* condition and bogus wakeup.
+*/
+   smp_wmb(); /* pairs with smp_rmb() in wq_sleep */
receiver->state = STATE_READY;
 }
 
 /* pipelined_receive() - if there is task waiting in sys_mq_timedsend()
  * gets its message and put to the queue (we have one free place for sure). */
-static inline void pipelined_receive(struct mqueue_inode_info *info)
+static inline void pipelined_receive(struct wake_q_head *wake_q,
+struct mqueue_inode_info *info)
 {
struct ext_wa

RE: [PATCH net 1/1] hv_netvsc: Fix a bug in netvsc_start_xmit()

2015-04-27 Thread KY Srinivasan



> -Original Message-
> From: David Miller [mailto:da...@davemloft.net]
> Sent: Monday, April 27, 2015 7:57 PM
> To: KY Srinivasan
> Cc: net...@vger.kernel.org; linux-kernel@vger.kernel.org;
> de...@linuxdriverproject.org; o...@aepfle.de; a...@canonical.com;
> jasow...@redhat.com
> Subject: Re: [PATCH net 1/1] hv_netvsc: Fix a bug in netvsc_start_xmit()
> 
> From: "K. Y. Srinivasan" 
> Date: Mon, 27 Apr 2015 18:14:50 -0700
> 
> > Commit b08cc79155fc26d0d112b1470d1ece5034651a4b eliminated memory
> > allocation in the packet send path. This commit introduced a bug since it
> > did not account for the case if the skb was cloned. Fix this bug by
> > using the pre-reserved head room only if the skb is not cloned.
> >
> > Signed-off-by: K. Y. Srinivasan 
> 
> We have generic infrastructure to do this, please try instead:
> 
>   err = skb_cow_head(skb, pkt_sz);
> 
> this will take care of everything for you and you can get rid
> of all of this dynamic memory allocation etc. in this code
> path.

Thanks David; I will resubmit this patch.

Regards,

K. Y
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 1/3] mtd: nand: Add on-die ECC support

2015-04-27 Thread Brian Norris

On Tue, Apr 28, 2015 at 08:18:12AM +0530, punnaiah choudary kalluri wrote:
> On Tue, Apr 28, 2015 at 4:53 AM, Brian Norris
>  wrote:
> > On Tue, Apr 28, 2015 at 12:19:16AM +0200, Richard Weinberger wrote:
> >> Oh, I thought every driver has to implement that function. ;-\
> >
> > Nope.
> >
> >> But you're right there is a corner case.
> >
> > And it's not the only one! Right now, there's no guarantee even that
> > read_buf() returns raw data, unmodified by the SoC's controller. Plenty
> > of drivers actually have HW-enabled ECC turned on by default, and so
> > they override the chip->ecc.read_page() (and sometimes
> > chip->ecc.read_page_raw() functions, if we're lucky) with something
> > that pokes the appropriate hardware instead. I expect anything
> > comprehensive here is probably going to have to utilize
> > chip->ecc.read_page_raw(), at least if it's provided by the hardware
> > driver.
> 
> Yes, overriding the chip->ecc.read_page_raw would solve this.

I'm actually suggesting that (in this patch set, for on-die ECC
support), maybe we *shouldn't* override chip->ecc.read_page_raw() and
leave that to be defined by the driver, and then on-die ECC support
should be added in a way that just calls chip->ecc.read_page_raw(). This
should work for any driver that already properly supports the raw
callbacks.

> Agree that
> read_buf need not be returning raw data always including my new driver for
> arasan nand flash controller.

I agree with that. At the moment, chip->read_buf() really has very
driver-specific meaning. Not sure if that's really a good thing, but
it's the way things are...

> http://lkml.iu.edu/hypermail/linux/kernel/1504.2/00313.html

In the half a minute I just spent looking at this (I may review it
properly later), I noted a few things:

1. you don't implement ecc.read_page_raw(); this means we'll probably
have trouble supporting on-die ECC with your driver, among other things

2. your patch is all white-space mangled. Please use your favorite
search engine to figure out how to get that right. git-send-email is
your friend.

Thanks,
Brian
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 1/3] mtd: nand: Add on-die ECC support

2015-04-27 Thread punnaiah choudary kalluri

On Wed, Mar 25, 2015 at 7:32 PM, Richard Weinberger  wrote:
> Some Micron NAND chips offer an on-die ECC (AKA internal ECC)
> feature. It is useful when the host-platform does not offer
> multi-bit ECC and software ECC is not feasible.
>
> Based on original work by David Mosberger 
>
> Signed-off-by: Richard Weinberger 
> ---
>  drivers/mtd/nand/nand_base.c   |  66 +-
>  drivers/mtd/nand/nand_ondie.c  | 266 
> +
>  include/linux/mtd/nand.h   |   6 +
>  include/linux/mtd/nand_ondie.h |  40 +++
>  4 files changed, 374 insertions(+), 4 deletions(-)
>  create mode 100644 drivers/mtd/nand/nand_ondie.c
>  create mode 100644 include/linux/mtd/nand_ondie.h
>
> diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
> index df7eb4f..92e7ed7 100644
> --- a/drivers/mtd/nand/nand_base.c
> +++ b/drivers/mtd/nand/nand_base.c
> @@ -43,6 +43,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -79,6 +80,20 @@ static struct nand_ecclayout nand_oob_64 = {
>  .length = 38} }
>  };
>
> +static struct nand_ecclayout nand_oob_64_on_die = {
> +   .eccbytes = 32,
> +   .eccpos = {
> +   8,  9, 10, 11, 12, 13, 14, 15,
> +  24, 25, 26, 27, 28, 29, 30, 31,
> +  40, 41, 42, 43, 44, 45, 46, 47,
> +  56, 57, 58, 59, 60, 61, 62, 63},
> +   .oobfree = {
> +   {.offset =  4,   .length = 4},
> +   {.offset = 20,   .length = 4},
> +   {.offset = 36,   .length = 4},
> +   {.offset = 52,   .length = 4} }
> +};
> +
>  static struct nand_ecclayout nand_oob_128 = {
> .eccbytes = 48,
> .eccpos = {
> @@ -3115,9 +3130,10 @@ static int nand_setup_read_retry_micron(struct 
> mtd_info *mtd, int retry_mode)
>  /*
>   * Configure chip properties from Micron vendor-specific ONFI table
>   */
> -static void nand_onfi_detect_micron(struct nand_chip *chip,
> -   struct nand_onfi_params *p)
> +static void nand_onfi_detect_micron(struct mtd_info *mtd,
> +   struct nand_onfi_params *p)
>  {
> +   struct nand_chip *chip = mtd->priv;
> struct nand_onfi_vendor_micron *micron = (void *)p->vendor;
>
> if (le16_to_cpu(p->vendor_revision) < 1)
> @@ -3125,6 +3141,8 @@ static void nand_onfi_detect_micron(struct nand_chip 
> *chip,
>
> chip->read_retries = micron->read_retry_options;
> chip->setup_read_retry = nand_setup_read_retry_micron;
> +
> +   nand_onfi_detect_on_die_ecc(mtd);
>  }
>
>  /*
> @@ -3226,7 +3244,7 @@ static int nand_flash_detect_onfi(struct mtd_info *mtd, 
> struct nand_chip *chip,
> }
>
> if (p->jedec_id == NAND_MFR_MICRON)
> -   nand_onfi_detect_micron(chip, p);
> +   nand_onfi_detect_micron(mtd, p);
>
> return 1;
>  }
> @@ -4056,6 +4074,40 @@ int nand_scan_tail(struct mtd_info *mtd)
> }
> break;
>
> +   case NAND_ECC_HW_ON_DIE:
> +   if (!mtd_nand_has_on_die()) {
> +   pr_warn("CONFIG_MTD_NAND_ECC_ON_DIE not enabled\n");
> +   BUG();
> +   }

Can't we get rid of this CONFIG option? During the NAND scan phase itself,
the on-die ECC feature status will be known from the ONFI parameter page, based
on the product and vendor ID of the Micron parts. Then let the driver choose which
ECC it wants. For example, legacy controllers (the pl353 smc controller, ...) which
don't have much ECC coverage will use the on-die ECC feature, and controllers that
have better ECC coverage can use controller ECC. So the driver can choose which ECC
it wants to use and pass it to the scan_tail function.
Since on-die ECC has limitations on spare area usage, and its ECC layout
is different from the default Linux kernel layout, people may prefer to use controller
ECC if that controller has better ECC coverage. In any case, I think we can avoid
this config option.


Regards,
Punnaiah
> +   /*
> +* nand_bbt.c by default puts the BBT marker at
> +* offset 8 in OOB, which is used for ECC (see
> +* nand_oob_64_on_die).
> +* Fixed by not using OOB for BBT marker.
> +*/
> +   chip->bbt_options |= NAND_BBT_NO_OOB;
> +   ecc->layout = &nand_oob_64_on_die;
> +   ecc->read_page = nand_read_page_on_die;
> +   ecc->read_subpage = nand_read_subpage_on_die;
> +   ecc->write_page = nand_write_page_raw;
> +   ecc->read_oob = nand_read_oob_std;
> +   ecc->read_page_raw = nand_read_page_raw;
> +   ecc->write_page_raw = nand_write_page_raw;
> +   ecc->write_oob = nand_write_oob_std;
> +   ecc->size = 512;
> +   ecc->bytes = 8;
> +   ecc->strength = 4;
> +   ecc->priv = nand_o

Re: [PATCH v2] net: netcp: remove call to netif_carrier_(on/off) for MAC to Phy interface

2015-04-27 Thread David Miller

From: Murali Karicheri 
Date: Mon, 27 Apr 2015 14:12:43 -0400

> Currently when interface type is MAC to Phy, netif_carrier_(on/off)
> is called which is not needed as Phy lib already updates the carrier
> status to net stack. This is needed only for other interface types
> 
> Signed-off-by: Murali Karicheri 

Applied.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

linux-next: Tree for Apr 28

2015-04-27 Thread Stephen Rothwell

Hi all,

Changes since 20150427:

New tree: dmi

Non-merge commits (relative to Linus' tree): 725
 874 files changed, 61499 insertions(+), 14685 deletions(-)



I have created today's linux-next tree at
git://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
(patches at http://www.kernel.org/pub/linux/kernel/next/ ).  If you
are tracking the linux-next tree using git, you should not use "git pull"
to do so as that will try to merge the new linux-next release with the
old one.  You should use "git fetch" and checkout or reset to the new
master.

You can see which trees have been included by looking in the Next/Trees
file in the source.  There are also quilt-import.log and merge.log files
in the Next directory.  Between each merge, the tree was built with
a ppc64_defconfig for powerpc and an allmodconfig for x86_64 and a
multi_v7_defconfig for arm. After the final fixups (if any), it is also
built with powerpc allnoconfig (32 and 64 bit), ppc44x_defconfig and
allyesconfig (this fails its final link) and i386, sparc, sparc64 and arm
defconfig.

Below is a summary of the state of the merge.

I am currently merging 215 trees (counting Linus' and 30 trees of patches
pending for Linus' tree).

Stats about the size of the tree over time can be seen at
http://neuling.org/linux-next-size.html .

Status of my local build tests will be at
http://kisskb.ellerman.id.au/linux-next .  If maintainers want to give
advice about cross compilers/configs that work, we are always open to add
more builds.

Thanks to Randy Dunlap for doing many randconfig builds.  And to Paul
Gortmaker for triage and bug fixes.

-- 
Cheers,
Stephen Rothwell                    s...@canb.auug.org.au

$ git checkout master
$ git reset --hard stable
Merging origin/master (2decb2682f80 Merge 
git://git.kernel.org/pub/scm/linux/kernel/git/davem/net)
Merging fixes/master (b94d525e58dc Merge 
git://git.kernel.org/pub/scm/linux/kernel/git/davem/net)
Merging kbuild-current/rc-fixes (c517d838eb7d Linux 4.0-rc1)
Merging arc-current/for-curr (e4140819dadc ARC: signal handling robustify)
Merging arm-current/fixes (6c5c2a01fcfd ARM: proc-arm94*.S: fix setup function)
Merging m68k-current/for-linus (b24f670b7f5b m68k/mac: Fix out-of-bounds array 
index in OSS IRQ source initialization)
Merging metag-fixes/fixes (0164a711c97b metag: Fix ioremap_wc/ioremap_cached 
build errors)
Merging mips-fixes/mips-fixes (1795cd9b3a91 Linux 3.16-rc5)
Merging powerpc-merge-mpe/fixes (2e826695d87c powerpc/mm: Fix build error with 
CONFIG_PPC_TRANSACTIONAL_MEM disabled)
Merging powerpc-merge/merge (c517d838eb7d Linux 4.0-rc1)
Merging sparc/master (acc455cffa75 sparc64: Setup sysfs to mark LDOM sockets, 
cores and threads correctly)
Merging net/master (22a8f237c055 bnx2x: really disable TPA if 'disable_tpa' 
option is set)
Merging ipsec/master (092a29a40bab vti6: fix uninit when using x-netns)
Merging sound-current/for-linus (30e5f003ff4b ALSA: hda - Fix missing va_end() 
call in snd_hda_codec_pcm_new())
Merging pci-current/for-linus (b787f68c36d4 Linux 4.1-rc1)
Merging wireless-drivers/master (69628cd0652a Merge tag 
'iwlwifi-for-kalle-2015-03-30' of 
https://git.kernel.org/pub/scm/linux/kernel/git/iwlwifi/iwlwifi-fixes)
Merging driver-core.current/driver-core-linus (b787f68c36d4 Linux 4.1-rc1)
Merging tty.current/tty-linus (b787f68c36d4 Linux 4.1-rc1)
Merging usb.current/usb-linus (b787f68c36d4 Linux 4.1-rc1)
Merging usb-gadget-fixes/fixes (b787f68c36d4 Linux 4.1-rc1)
Merging usb-serial-fixes/usb-linus (39a8804455fb Linux 4.0)
Merging staging.current/staging-linus (b787f68c36d4 Linux 4.1-rc1)
Merging char-misc.current/char-misc-linus (b787f68c36d4 Linux 4.1-rc1)
Merging input-current/for-linus (48853389f206 Merge branch 'next' into 
for-linus)
Merging crypto-current/master (8c98ebd7a6ff crypto: img-hash - 
CRYPTO_DEV_IMGTEC_HASH should depend on HAS_DMA)
Merging ide/master (d681f1166919 ide: remove deprecated use of pci api)
Merging devicetree-current/devicetree/merge (41d9489319f2 drivers/of: Add empty 
ranges quirk for PA-Semi)
Merging rr-fixes/fixes (f47689345931 lguest: update help text.)
Merging vfio-fixes/for-linus (ec76f4007079 vfio-pci: Add missing break to 
enable VFIO_PCI_ERR_IRQ_INDEX)
Merging kselftest-fixes/fixes (67d8712dcc70 selftests: Fix build failures when 
invoked from kselftest target)
Merging drm-intel-fixes/for-linux-next-fixes (39a8804455fb Linux 4.0)
Merging asm-generic/master (643165c8bbc8 Merge tag 'uaccess_for_upstream' of 
git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost into asm-generic)
Merging arc/for-next (b787f68c36d4 Linux 4.1-rc1)
Merging arm/for-next (6b7acae74fc2 Merge branches 'misc' and 'vdso' into 
for-next)
Merging arm-perf/for-next/perf (39a8804455fb Linux 4.0)
Merging arm-soc/for-next (0f0175702e2e arm-soc: document merges)
CONFLICT (content): M

[PATCH v3 2/6] random: Async and sync API for accessing kernel_pool

2015-04-27 Thread Stephan Mueller

The kernel_pool is intended to be the in-kernel equivalent to the
blocking_pool, i.e. requests for random data may be blocked if
insufficient entropy is present.

The added API calls provide a synchronous function call
get_blocking_random_bytes where the caller is blocked.

In addition, an asynchronous API call of get_blocking_random_bytes_cb
is provided which returns immediately to the caller after submitting
the request for random data. The caller-provided buffer that shall be
filled with random data is filled up as available entropy permits. The
caller may provide a callback function that is invoked once the
request is completed.

A third API call, get_blocking_random_bytes_cancel, is provided to
cancel the random number gathering operation.

CC: Andreas Steffen 
CC: Theodore Ts'o 
CC: Sandy Harris 
Signed-off-by: Stephan Mueller 
---
 drivers/char/random.c  | 116 +
 include/linux/random.h |  20 +
 2 files changed, 136 insertions(+)

diff --git a/drivers/char/random.c b/drivers/char/random.c
index 0b139dc..30d39ba 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1804,3 +1804,119 @@ void add_hwgenerator_randomness(const char *buffer, 
size_t count,
credit_entropy_bits(poolp, entropy);
 }
 EXPORT_SYMBOL_GPL(add_hwgenerator_randomness);
+
+static bool get_blocking_random_bytes_term(bool *cancel)
+{
+   if (ENTROPY_BITS(&input_pool) >= random_read_wakeup_bits)
+   return true;
+   return *cancel;
+}
+
+/*
+ * Equivalent function to get_random_bytes with the difference that this
+ * function blocks the request in a similar fashion as random_read(),
+ * implementing a /dev/random device for in-kernel users.
+ *
+ * This function may sleep.
+ *
+ * @buf caller allocated buffer filled with random data
+ * @nbytes requested number of bytes -- buffer should be at least as big
+ * @cancel pointer to variable that can be used to cancel the collection
+ *operation. If this boolean is set to true, the collection operation
+ *is terminated immediately. When it is set to true during the
+ *collection loop, the collection is terminated immediately.
+ *
+ * return: positive value: obtained number of bytes on successful
+ *negative value: error code on error
+ */
+ssize_t get_blocking_random_bytes(void *buf, ssize_t nbytes, bool *cancel)
+{
+   ssize_t ret = 0;
+
+   if (nbytes <= 0)
+   return nbytes;
+   BUG_ON(!buf);
+
+   while (ret < nbytes) {
+   ssize_t round = 0;
+   ssize_t pull = min_t(ssize_t, (nbytes - ret), SEC_XFER_SIZE);
+
+   if (*cancel)
+   return ret;
+   round = extract_entropy(&kernel_pool, (buf + ret), pull, 0, 0);
+   if (0 > round)
+   return round;
+   if (0 == round)
+   wait_event_interruptible(random_kernel_wait,
+   get_blocking_random_bytes_term(cancel));
+   ret += round;
+   }
+   return ret;
+}
+EXPORT_SYMBOL(get_blocking_random_bytes);
+
+/*
+ * Immediate canceling the collection operation for the random_work
+ */
+void get_blocking_random_bytes_cancel(struct random_work *rw)
+{
+   rw->cancel = true;
+   wake_up_interruptible(&random_kernel_wait);
+
+}
+EXPORT_SYMBOL(get_blocking_random_bytes_cancel);
+
+static void get_blocking_random_bytes_work(struct work_struct *work)
+{
+   struct random_work *rw = container_of(work, struct random_work,
+ rw_work);
+   ssize_t ret;
+
+   ret = get_blocking_random_bytes(rw->rw_buf, rw->rw_len, &rw->cancel);
+   if (rw->rw_cb)
+   rw->rw_cb(rw->rw_buf, ret, rw->private);
+}
+
+/*
+ * Asynchronous invocation of the blocking interface. The function
+ * queues the request in either the private work queue supplied with the
+ * wq argument or in the general work queue framework if wq is NULL.
+ * Once the request is completed or upon receiving an error, the callback
+ * function of cb is called, if not NULL, to inform the caller about the
+ * completion of its operation.
+ *
+ * If a caller wants to cancel the work (e.g. in the module_exit function),
+ * simply call
+ * get_blocking_random_bytes_cancel(&my_random_work);
+ * cancel_work_sync(&my_random_work.rw_work);
+ *
+ * @wq pointer to private work queue or NULL - input
+ * @rw handle to the work queue frame - output
+ * @buf allocated buffer where random numbers are to be stored
+ * @nbytes size of buf and implicitly number of bytes requested
+ * @private pointer to data that is not processed by here, but handed to the
+ * callback function to allow the caller to maintain a state
+ * @cb callback function where
+ * * buf holds the pointer to buf will be supplied
+ * * buflen holds the length of the gathered random numbers or error code
+ *   of the generation f

[PATCH v3 3/6] crypto: drbg - prepare for async seeding

2015-04-27 Thread Stephan Mueller

In order to prepare for the addition of the asynchronous seeding call,
the invocation of seeding the DRBG is moved out into a helper function.

In addition, a block of memory is allocated during initialization time
that will be used as a scratchpad for obtaining entropy. That scratchpad
is used for the initial seeding operation as well as by the
asynchronous seeding call. The memory must be zeroized every time the
DRBG seeding call succeeds to avoid entropy data lingering in memory.

CC: Andreas Steffen 
CC: Theodore Ts'o 
CC: Sandy Harris 
Signed-off-by: Stephan Mueller 
---
 crypto/drbg.c | 81 ++-
 include/crypto/drbg.h |  2 ++
 2 files changed, 56 insertions(+), 27 deletions(-)

diff --git a/crypto/drbg.c b/crypto/drbg.c
index 23d444e..36dfece 100644
--- a/crypto/drbg.c
+++ b/crypto/drbg.c
@@ -1041,6 +1041,21 @@ static struct drbg_state_ops drbg_hash_ops = {
  * Functions common for DRBG implementations
  **/
 
+static inline int __drbg_seed(struct drbg_state *drbg, struct list_head *seed,
+ int reseed)
+{
+   int ret = drbg->d_ops->update(drbg, seed, reseed);
+
+   if (ret)
+   return ret;
+
+   drbg->seeded = true;
+   /* 10.1.1.2 / 10.1.1.3 step 5 */
+   drbg->reseed_ctr = 1;
+
+   return ret;
+}
+
 /*
  * Seeding or reseeding of the DRBG
  *
@@ -1056,8 +1071,6 @@ static int drbg_seed(struct drbg_state *drbg, struct 
drbg_string *pers,
 bool reseed)
 {
int ret = 0;
-   unsigned char *entropy = NULL;
-   size_t entropylen = 0;
struct drbg_string data1;
LIST_HEAD(seedlist);
 
@@ -1073,26 +1086,10 @@ static int drbg_seed(struct drbg_state *drbg, struct 
drbg_string *pers,
 drbg->test_data.len);
pr_devel("DRBG: using test entropy\n");
} else {
-   /*
-* Gather entropy equal to the security strength of the DRBG.
-* With a derivation function, a nonce is required in addition
-* to the entropy. A nonce must be at least 1/2 of the security
-* strength of the DRBG in size. Thus, entropy * nonce is 3/2
-* of the strength. The consideration of a nonce is only
-* applicable during initial seeding.
-*/
-   entropylen = drbg_sec_strength(drbg->core->flags);
-   if (!entropylen)
-   return -EFAULT;
-   if (!reseed)
-   entropylen = ((entropylen + 1) / 2) * 3;
pr_devel("DRBG: (re)seeding with %zu bytes of entropy\n",
-entropylen);
-   entropy = kzalloc(entropylen, GFP_KERNEL);
-   if (!entropy)
-   return -ENOMEM;
-   get_random_bytes(entropy, entropylen);
-   drbg_string_fill(&data1, entropy, entropylen);
+drbg->seed_buf_len);
+   get_random_bytes(drbg->seed_buf, drbg->seed_buf_len);
+   drbg_string_fill(&data1, drbg->seed_buf, drbg->seed_buf_len);
}
list_add_tail(&data1.list, &seedlist);
 
@@ -1111,16 +1108,24 @@ static int drbg_seed(struct drbg_state *drbg, struct 
drbg_string *pers,
memset(drbg->C, 0, drbg_statelen(drbg));
}
 
-   ret = drbg->d_ops->update(drbg, &seedlist, reseed);
+   ret = __drbg_seed(drbg, &seedlist, reseed);
+
+   /*
+* Clear the initial entropy buffer as the async call may not overwrite
+* that buffer for quite some time.
+*/
+   memzero_explicit(drbg->seed_buf, drbg->seed_buf_len);
if (ret)
goto out;
-
-   drbg->seeded = true;
-   /* 10.1.1.2 / 10.1.1.3 step 5 */
-   drbg->reseed_ctr = 1;
+   /*
+* For all subsequent seeding calls, we only need the seed buffer
+* equal to the security strength of the DRBG. We undo the calculation
+* in drbg_alloc_state.
+*/
+   if (!reseed)
+   drbg->seed_buf_len = drbg->seed_buf_len / 3 * 2;
 
 out:
-   kzfree(entropy);
return ret;
 }
 
@@ -1143,6 +1148,8 @@ static inline void drbg_dealloc_state(struct drbg_state 
*drbg)
drbg->prev = NULL;
drbg->fips_primed = false;
 #endif
+   kzfree(drbg->seed_buf);
+   drbg->seed_buf = NULL;
 }
 
 /*
@@ -1204,6 +1211,26 @@ static inline int drbg_alloc_state(struct drbg_state 
*drbg)
if (!drbg->scratchpad)
goto err;
}
+
+   /*
+* Gather entropy equal to the security strength of the DRBG.
+* With a derivation function, a nonce is required in addition
+* to the entropy. A nonce must be at least 1/2 of the security
+* strength of the DRBG in size. Thus, entropy * nonce is 3/2
+* of the strength. T

[PATCH v3 0/6] Seeding DRBG with more entropy

2015-04-27 Thread Stephan Mueller

Hi,

as of now, the DRBG is only seeded from get_random_bytes. In various
circumstances, the nonblocking_pool behind get_random_bytes may not be fully
seeded from hardware events at the time the DRBG needs to be seeded.
Based on the discussion in [1], the DRBG seeding is updated such that it
no longer relies completely on get_random_bytes.

The seeding approach can be characterized as follows:

1. pull buffer of size entropy + nonce from get_random_bytes

2. pull another buffer of size entropy + nonce from my Jitter RNG

3. concatenate both buffers

4. seed the DRBG with the concatenated buffer

5. trigger the async invocation of the in-kernel /dev/random with a buffer of
   size entropy

6. return the DRBG instance to the caller without waiting for the completion
   of step 5

7. at some point in time, the in-kernel /dev/random returns with a full buffer
   which is then used to re-seed the DRBG

This way, we will get entropy during the first initialization without
blocking.

The patch set adds an in-kernel /dev/random equivalent that was discussed with
Ted Ts'o last July -- see [2] and [3]. A test module for testing the
asynchronous operation of the in-kernel /dev/random is given with the code
below.

Ted: shall we really create and maintain a new entropy pool (the kernel_pool),
or should the in-kernel /dev/random logic draw directly from the input_pool?
In other words, shall we drop the first patch and update the 2nd patch to
use input_pool? Also, I would not recommend using the blocking_pool as this
would mix kernel and user land operation.

Note: the DRBG and Jitter RNG patches are against the current cryptodev-2.6
tree.

The new Jitter RNG is an RNG that has a large set of tests and was presented on
LKML some time back. After I spoke with mathematicians at NIST, they indicated
that the Jitter RNG approach would be acceptable from their side as a noise
source. Note, I personally think that the Jitter RNG has sufficient entropy in
almost all circumstances (see the massive testing I conducted on all of the
more widely used CPUs as shown in [4]).

Changes v3:
* Patch 01: Correct calculation of entropy count as pointed out by Herbert Xu
* Patch 06: Correct a trivial coding issue in jent_entropy_init for
  checking JENT_EMINVARVAR reported by cppcheck

Changes v2:
* Use Dual BSD/GPL license in MODULE_LICENSE as suggested by
  Paul Bolle 
* Patch 05, drbg_dealloc_state: only deallocate Jitter RNG if one was
  instantiated in the first place. There are two main reasons why the Jitter RNG
  may not be allocated: either it is not available as kernel module/in vmlinuz
  or during init time of the Jitter RNG, the performed testing shows that the
  underlying hardware is not suitable for the Jitter RNG (e.g. has a too coarse
  timer).


[1] http://www.mail-archive.com/linux-crypto@vger.kernel.org/msg13891.html

[2] https://lkml.org/lkml/2014/4/27/174

[3] http://comments.gmane.org/gmane.linux.kernel/1701117

[4] http://www.chronox.de/jent.html


Stephan Mueller (6):
  random: Addition of kernel_pool
  random: Async and sync API for accessing kernel_pool
  crypto: drbg - prepare for async seeding
  crypto: drbg - add async seeding operation
  crypto: drbg - use Jitter RNG to obtain seed
  crypto: add jitterentropy RNG

 crypto/Kconfig |  10 +
 crypto/Makefile|   2 +
 crypto/drbg.c  | 161 +++--
 crypto/jitterentropy.c | 909 +
 crypto/testmgr.c   |   4 +
 drivers/char/random.c  | 168 -
 include/crypto/drbg.h  |   4 +
 include/linux/random.h |  20 ++
 8 files changed, 1238 insertions(+), 40 deletions(-)
 create mode 100644 crypto/jitterentropy.c

---
/*
 * Test module for verifying the correct operation of the
 * in-kernel /dev/random handling
 *
 * Use: compile, load into the kernel and observe dmesg
 *
 * Written by: Stephan Mueller 
 * Copyright (c) 2014
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *notice, and the entire permission notice in its entirety,
 *including the disclaimer of warranties.
 * 2. Redistributions in binary form must reproduce the above copyright
 *notice, this list of conditions and the following disclaimer in the
 *documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote
 *products derived from this software without specific prior
 *written permission.
 *
 * ALTERNATIVELY, this product may be distributed under the terms of
 * the GNU General Public License, in which case the provisions of the GPL are
 * required INSTEAD OF the above restrictions.  (This clause is
 * necessary due to a potential bad interaction between the GPL and
 * the restrictions contained in a BSD-style copyright.)
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS O

Re: [PATCHv5 04/28] mm, thp: adjust conditions when we can reuse the page on WP fault

2015-04-27 Thread Hillf Danton

> 
> With new refcounting we will be able map the same compound page with
> PTEs and PMDs. It requires adjustment to conditions when we can reuse
> the page on write-protection fault.
> 
> For PTE fault we can't reuse the page if it's part of huge page.
> 
> For PMD we can only reuse the page if nobody else maps the huge page or
> any part of it. We can do it by checking page_mapcount() on each sub-page,
> but it's expensive.
> 
> The cheaper way is to check that page_count() is equal to 1: every mapcount
> takes page reference, so this way we can guarantee, that the PMD is the
> only mapping.
> 
> This approach can give false negative if somebody pinned the page, but
> that doesn't affect correctness.
>
Then we have to try more to allocate THP if pinned?
Are we adding new cost?

Hillf

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v3 1/6] random: Addition of kernel_pool

2015-04-27 Thread Stephan Mueller

The kernel pool is intended to serve kernel-internal callers only.
Its purpose and usage are identical to those of the blocking_pool.

As the kernel_pool is not available to user space, user space cannot
directly interfere with the blocking behavior when obtaining
data from the kernel_pool. Thus, if entropy is present in the
kernel_pool, user space can hog /dev/random and yet a kernel-internal
requestor of random numbers, which are generated in the same way as
for the blocking_pool (i.e. with the blocking behavior), will not
be affected until data is needed from the input_pool.

The patch treats the kernel_pool fully equally to the blocking and
nonblocking pool with respect to the initialization and update. As now
there are three output pools, the patch adds a round-robin logic for
processing additional entropy when the input_pool is nearly full.

CC: Andreas Steffen 
CC: Theodore Ts'o 
CC: Sandy Harris 
Signed-off-by: Stephan Mueller 
---
 drivers/char/random.c | 52 ++-
 1 file changed, 39 insertions(+), 13 deletions(-)

diff --git a/drivers/char/random.c b/drivers/char/random.c
index 9cd6968..0b139dc 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -407,6 +407,7 @@ static struct poolinfo {
 static DECLARE_WAIT_QUEUE_HEAD(random_read_wait);
 static DECLARE_WAIT_QUEUE_HEAD(random_write_wait);
 static DECLARE_WAIT_QUEUE_HEAD(urandom_init_wait);
+static DECLARE_WAIT_QUEUE_HEAD(random_kernel_wait);
 static struct fasync_struct *fasync;
 
 /**
@@ -442,6 +443,7 @@ static void push_to_pool(struct work_struct *work);
 static __u32 input_pool_data[INPUT_POOL_WORDS];
 static __u32 blocking_pool_data[OUTPUT_POOL_WORDS];
 static __u32 nonblocking_pool_data[OUTPUT_POOL_WORDS];
+static __u32 kernel_pool_data[OUTPUT_POOL_WORDS];
 
 static struct entropy_store input_pool = {
.poolinfo = &poolinfo_table[0],
@@ -472,6 +474,17 @@ static struct entropy_store nonblocking_pool = {
push_to_pool),
 };
 
+static struct entropy_store kernel_pool = {
+   .poolinfo = &poolinfo_table[1],
+   .name = "kernel",
+   .limit = 1,
+   .pull = &input_pool,
+   .lock = __SPIN_LOCK_UNLOCKED(kernel_pool.lock),
+   .pool = kernel_pool_data,
+   .push_work = __WORK_INITIALIZER(kernel_pool.push_work,
+   push_to_pool),
+};
+
 static __u32 const twist_table[8] = {
0x00000000, 0x3b6e20c8, 0x76dc4190, 0x4db26158,
0xedb88320, 0xd6d6a3e8, 0x9b64c2b0, 0xa00ae278 };
@@ -674,29 +687,41 @@ retry:
 
/* should we wake readers? */
if (entropy_bits >= random_read_wakeup_bits) {
+   wake_up_interruptible(&random_kernel_wait);
wake_up_interruptible(&random_read_wait);
kill_fasync(&fasync, SIGIO, POLL_IN);
}
/* If the input pool is getting full, send some
-* entropy to the two output pools, flipping back and
+* entropy to the output pools, flipping back and
 * forth between them, until the output pools are 75%
 * full.
 */
if (entropy_bits > random_write_wakeup_bits &&
r->initialized &&
r->entropy_total >= 2*random_read_wakeup_bits) {
-   static struct entropy_store *last = &blocking_pool;
-   struct entropy_store *other = &blocking_pool;
-
-   if (last == &blocking_pool)
-   other = &nonblocking_pool;
-   if (other->entropy_count <=
-   3 * other->poolinfo->poolfracbits / 4)
-   last = other;
-   if (last->entropy_count <=
-   3 * last->poolinfo->poolfracbits / 4) {
-   schedule_work(&last->push_work);
-   r->entropy_total = 0;
+#define NUM_OUTPUT_POOLS 3
+   /* as we will recalculate this variable first thing in
+* the loop, it will point to the first output pool
+* after the first recalculation */
+   static int selected_pool = (NUM_OUTPUT_POOLS - 1);
+   int i = 0;
+   struct entropy_store *output_pools[NUM_OUTPUT_POOLS] = {
+   &blocking_pool,
+   &nonblocking_pool,
+   &kernel_pool};
+   /* select the next pool that has less than 75% fill
+* rate */
+   for (i = 0; NUM_OUTPUT_POOLS > i; i++) {
+   struct entropy_store *p = NULL;
+   selected_pool =
+

[PATCH v3 4/6] crypto: drbg - add async seeding operation

2015-04-27 Thread Stephan Mueller

The async seeding operation is triggered during initialization right
after the first non-blocking seeding is completed. As required by the
asynchronous operation of random.c, a callback function is provided that
is triggered by random.c once entropy is available. That callback
function performs the actual seeding of the DRBG.

CC: Andreas Steffen 
CC: Theodore Ts'o 
CC: Sandy Harris 
Signed-off-by: Stephan Mueller 
---
 crypto/drbg.c | 46 ++
 include/crypto/drbg.h |  1 +
 2 files changed, 47 insertions(+)

diff --git a/crypto/drbg.c b/crypto/drbg.c
index 36dfece..13dd626 100644
--- a/crypto/drbg.c
+++ b/crypto/drbg.c
@@ -1056,6 +1056,40 @@ static inline int __drbg_seed(struct drbg_state *drbg, 
struct list_head *seed,
return ret;
 }
 
+/* DRBG callback for obtaining data from the async Linux RNG */
+static void drbg_async_seed_cb(void *buf, ssize_t buflen, void *private)
+{
+   struct drbg_string data;
+   LIST_HEAD(seedlist);
+   struct drbg_state *drbg = (struct drbg_state *)private;
+   int ret = 0;
+
+   if (buflen <= 0 || !buf)
+   return;
+
+   drbg_string_fill(&data, buf, buflen);
+   list_add_tail(&data.list, &seedlist);
+   /* sanity check to verify that there is still a DRBG instance */
+   if (!drbg)
+   return;
+   mutex_lock(&drbg->drbg_mutex);
+   /* sanity check to verify that the DRBG instance is valid */
+   if (!drbg->V) {
+   mutex_unlock(&drbg->drbg_mutex);
+   return;
+   }
+   ret = __drbg_seed(drbg, &seedlist, true);
+   memzero_explicit(buf, buflen);
+   mutex_unlock(&drbg->drbg_mutex);
+}
+
+/* Cancel any outstanding async operation and wait for their completion */
+static inline void drbg_async_work_cancel(struct random_work *work)
+{
+   get_blocking_random_bytes_cancel(work);
+   cancel_work_sync(&work->rw_work);
+}
+
 /*
  * Seeding or reseeding of the DRBG
  *
@@ -1081,6 +1115,11 @@ static int drbg_seed(struct drbg_state *drbg, struct 
drbg_string *pers,
return -EINVAL;
}
 
+   /* cancel any previously invoked seeding */
+   mutex_unlock(&drbg->drbg_mutex);
+   drbg_async_work_cancel(&drbg->seed_work);
+   mutex_lock(&drbg->drbg_mutex);
+
if (list_empty(&drbg->test_data.list)) {
drbg_string_fill(&data1, drbg->test_data.buf,
 drbg->test_data.len);
@@ -1125,6 +1164,12 @@ static int drbg_seed(struct drbg_state *drbg, struct 
drbg_string *pers,
if (!reseed)
drbg->seed_buf_len = drbg->seed_buf_len / 3 * 2;
 
+   /* Invoke asynchronous seeding unless DRBG is in test mode. */
+   if (!list_empty(&drbg->test_data.list))
+   get_blocking_random_bytes_cb(NULL, &drbg->seed_work,
+drbg->seed_buf, drbg->seed_buf_len,
+drbg, drbg_async_seed_cb);
+
 out:
return ret;
 }
@@ -1487,6 +1532,7 @@ unlock:
  */
 static int drbg_uninstantiate(struct drbg_state *drbg)
 {
+   drbg_async_work_cancel(&drbg->seed_work);
if (drbg->d_ops)
drbg->d_ops->crypto_fini(drbg);
drbg_dealloc_state(drbg);
diff --git a/include/crypto/drbg.h b/include/crypto/drbg.h
index b052698..e4980a1 100644
--- a/include/crypto/drbg.h
+++ b/include/crypto/drbg.h
@@ -119,6 +119,7 @@ struct drbg_state {
bool fips_primed;   /* Continuous test primed? */
unsigned char *prev;/* FIPS 140-2 continuous test value */
 #endif
+   struct random_work seed_work;   /* asynchronous seeding support */
u8 *seed_buf;   /* buffer holding the seed */
size_t seed_buf_len;
const struct drbg_state_ops *d_ops;
-- 
2.1.0


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v3 6/6] crypto: add jitterentropy RNG

2015-04-27 Thread Stephan Mueller

The CPU Jitter RNG provides a source of good entropy by
collecting CPU execution time jitter. The entropy in the CPU
execution time jitter is magnified by the CPU Jitter Random
Number Generator. The CPU Jitter Random Number Generator uses
the CPU execution timing jitter to generate a bit stream
which complies with different statistical measurements that
determine the bit stream is random.

The CPU Jitter Random Number Generator delivers entropy which
follows information theoretical requirements. Based on these
studies and the implementation, the caller can assume that
one bit of data extracted from the CPU Jitter Random Number
Generator holds one bit of entropy.

The CPU Jitter Random Number Generator provides a decentralized
source of entropy, i.e. every caller can operate on a private
state of the entropy pool.

The RNG does not have any dependencies on any other service
in the kernel. The RNG only needs a high-resolution time
stamp.

Further design details, the cryptographic assessment and
a large array of test results are documented at
http://www.chronox.de/jent.html.

CC: Andreas Steffen 
CC: Theodore Ts'o 
CC: Sandy Harris 
Signed-off-by: Stephan Mueller 
---
 crypto/Kconfig |  10 +
 crypto/Makefile|   2 +
 crypto/jitterentropy.c | 909 +
 crypto/testmgr.c   |   4 +
 4 files changed, 925 insertions(+)
 create mode 100644 crypto/jitterentropy.c

diff --git a/crypto/Kconfig b/crypto/Kconfig
index 8aaf298..5cf9174 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1479,9 +1479,19 @@ config CRYPTO_DRBG
tristate
default CRYPTO_DRBG_MENU if (CRYPTO_DRBG_HMAC || CRYPTO_DRBG_HASH || 
CRYPTO_DRBG_CTR)
select CRYPTO_RNG
+   select CRYPTO_JITTERENTROPY
 
 endif  # if CRYPTO_DRBG_MENU
 
+config CRYPTO_JITTERENTROPY
+   tristate "Jitterentropy Non-Deterministic Random Number Generator"
+   help
+ The Jitterentropy RNG is a noise that is intended
+ to provide seed to another RNG. The RNG does not
+ perform any cryptographic whitening of the generated
+ random numbers. This Jitterentropy RNG registers with
+ the kernel crypto API and can be used by any caller.
+
 config CRYPTO_USER_API
tristate
 
diff --git a/crypto/Makefile b/crypto/Makefile
index 97b7d3a..2f450ef 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -94,6 +94,8 @@ obj-$(CONFIG_CRYPTO_RNG2) += rng.o
 obj-$(CONFIG_CRYPTO_RNG2) += krng.o
 obj-$(CONFIG_CRYPTO_ANSI_CPRNG) += ansi_cprng.o
 obj-$(CONFIG_CRYPTO_DRBG) += drbg.o
+CFLAGS_jitterentropy.o = -O0
+obj-$(CONFIG_CRYPTO_JITTERENTROPY) += jitterentropy.o
 obj-$(CONFIG_CRYPTO_TEST) += tcrypt.o
 obj-$(CONFIG_CRYPTO_GHASH) += ghash-generic.o
 obj-$(CONFIG_CRYPTO_USER_API) += af_alg.o
diff --git a/crypto/jitterentropy.c b/crypto/jitterentropy.c
new file mode 100644
index 0000000..1ebe58a
--- /dev/null
+++ b/crypto/jitterentropy.c
@@ -0,0 +1,909 @@
+/*
+ * Non-physical true random number generator based on timing jitter.
+ *
+ * Copyright Stephan Mueller , 2014
+ *
+ * Design
+ * ==
+ *
+ * See http://www.chronox.de/jent.html
+ *
+ * License
+ * ===
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *notice, and the entire permission notice in its entirety,
+ *including the disclaimer of warranties.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *notice, this list of conditions and the following disclaimer in the
+ *documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ *products derived from this software without specific prior
+ *written permission.
+ *
+ * ALTERNATIVELY, this product may be distributed under the terms of
+ * the GNU General Public License, in which case the provisions of the GPL2 are
+ * required INSTEAD OF the above restrictions.  (This clause is
+ * necessary due to a potential bad interaction between the GPL and
+ * the restrictions contained in a BSD-style copyright.)
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF
+ * WHICH ARE HEREBY DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+ * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH
+ *

[PATCH v3 5/6] crypto: drbg - use Jitter RNG to obtain seed

2015-04-27 Thread Stephan Mueller

During initialization, the DRBG now tries to allocate a handle of the
Jitter RNG. If such a Jitter RNG is available during seeding, the DRBG
pulls the required entropy/nonce string from get_random_bytes and
concatenates it with a string of equal size from the Jitter RNG. That
combined string is now the seed for the DRBG.

Written differently, the initial seed of the DRBG is now:

get_random_bytes(entropy/nonce) || jitterentropy (entropy/nonce)

If the Jitter RNG is not available, the DRBG only seeds from
get_random_bytes.

CC: Andreas Steffen 
CC: Theodore Ts'o 
CC: Sandy Harris 
Signed-off-by: Stephan Mueller 
---
 crypto/drbg.c | 46 --
 include/crypto/drbg.h |  1 +
 2 files changed, 41 insertions(+), 6 deletions(-)

diff --git a/crypto/drbg.c b/crypto/drbg.c
index 13dd626..fe081e1 100644
--- a/crypto/drbg.c
+++ b/crypto/drbg.c
@@ -1125,10 +1125,25 @@ static int drbg_seed(struct drbg_state *drbg, struct 
drbg_string *pers,
 drbg->test_data.len);
pr_devel("DRBG: using test entropy\n");
} else {
-   pr_devel("DRBG: (re)seeding with %zu bytes of entropy\n",
-drbg->seed_buf_len);
+   /* Get seed from in-kernel /dev/urandom */
get_random_bytes(drbg->seed_buf, drbg->seed_buf_len);
-   drbg_string_fill(&data1, drbg->seed_buf, drbg->seed_buf_len);
+
+   /* Get seed from Jitter RNG */
+   if (!drbg->jent ||
+   crypto_rng_get_bytes(drbg->jent,
+drbg->seed_buf + drbg->seed_buf_len,
+drbg->seed_buf_len)) {
+   pr_info("DRBG: could not obtain random data from Jitter 
RNG\n");
+   drbg_string_fill(&data1, drbg->seed_buf,
+drbg->seed_buf_len);
+   pr_devel("DRBG: (re)seeding with %zu bytes of 
entropy\n",
+drbg->seed_buf_len);
+   } else {
+   drbg_string_fill(&data1, drbg->seed_buf,
+drbg->seed_buf_len * 2);
+   pr_devel("DRBG: (re)seeding with %zu bytes of 
entropy\n",
+drbg->seed_buf_len * 2);
+   }
}
list_add_tail(&data1.list, &seedlist);
 
@@ -1153,7 +1168,7 @@ static int drbg_seed(struct drbg_state *drbg, struct 
drbg_string *pers,
 * Clear the initial entropy buffer as the async call may not overwrite
 * that buffer for quite some time.
 */
-   memzero_explicit(drbg->seed_buf, drbg->seed_buf_len);
+   memzero_explicit(drbg->seed_buf, drbg->seed_buf_len * 2);
if (ret)
goto out;
/*
@@ -1195,6 +1210,10 @@ static inline void drbg_dealloc_state(struct drbg_state 
*drbg)
 #endif
kzfree(drbg->seed_buf);
drbg->seed_buf = NULL;
+   if (drbg->jent) {
+   crypto_free_rng(drbg->jent);
+   drbg->jent = NULL;
+   }
 }
 
 /*
@@ -1270,12 +1289,27 @@ static inline int drbg_alloc_state(struct drbg_state 
*drbg)
ret = -EFAULT;
goto err;
}
-   /* ensure we have sufficient buffer space for initial seed */
+   /*
+* Ensure we have sufficient buffer space for initial seed which
+* consists of the seed from get_random_bytes and the Jitter RNG.
+*/
drbg->seed_buf_len = ((drbg->seed_buf_len + 1) / 2) * 3;
-   drbg->seed_buf = kzalloc(drbg->seed_buf_len, GFP_KERNEL);
+   drbg->seed_buf = kzalloc(drbg->seed_buf_len * 2, GFP_KERNEL);
if (!drbg->seed_buf)
goto err;
 
+   drbg->jent = crypto_alloc_rng("jitterentropy_rng", 0, 0);
+   if(IS_ERR(drbg->jent))
+   {
+   pr_info("DRBG: could not allocate Jitter RNG handle for 
seeding\n");
+   /*
+* As the Jitter RNG is a module that may not be present, we
+* continue with the operation and do not fully tie the DRBG
+* to the Jitter RNG.
+*/
+   drbg->jent = NULL;
+   }
+
return 0;
 
 err:
diff --git a/include/crypto/drbg.h b/include/crypto/drbg.h
index e4980a1..fabf102 100644
--- a/include/crypto/drbg.h
+++ b/include/crypto/drbg.h
@@ -122,6 +122,7 @@ struct drbg_state {
struct random_work seed_work;   /* asynchronous seeding support */
u8 *seed_buf;   /* buffer holding the seed */
size_t seed_buf_len;
+   struct crypto_rng *jent;
const struct drbg_state_ops *d_ops;
const struct drbg_core *core;
struct drbg_string test_data;
-- 
2.1.0


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majo

Re: [PATCH v6 0/6] arm64: Add kernel probes (kprobes) support

2015-04-27 Thread William Cohen


Hi All,

I have been experimenting with the patches for arm64 kprobes support.
On occasion the kernel gets stuck in a loop printing output:

 Unexpected kernel single-step exception at EL1

This message by itself is not that enlightening.  I added the attached
patch to get some additional information about register state when the
warning is printed out.  Below is an example output:


[14613.263536] Unexpected kernel single-step exception at EL1
[14613.269001] kcb->ss_ctx.ss_status = 1
[14613.272643] kcb->ss_ctx.match_addr = fdfffc001250 0xfdfffc001250
[14613.279324] instruction_pointer(regs) = fe093358 el1_da+0x8/0x70
[14613.286003] 
[14613.287487] CPU: 3 PID: 621 Comm: irqbalance Tainted: G   OE   
4.0.0u4+ #6
[14613.295019] Hardware name: AppliedMicro Mustang/Mustang, BIOS 1.1.0-rh-0.15 
Mar 13 2015
[14613.302982] task: fe01d6806780 ti: fe01d68ac000 task.ti: 
fe01d68ac000
[14613.310430] PC is at el1_da+0x8/0x70
[14613.313990] LR is at trampoline_probe_handler+0x188/0x1ec
[14613.319363] pc : [] lr : [] pstate: 
61c5
[14613.326724] sp : fe01d68af640
[14613.330021] x29: fe01d68afbf0 x28: fe01d68ac000 
[14613.335328] x27: fe0939cc x26: febb09d0 
[14613.340634] x25: fe01d68afdb0 x24: 0025 
[14613.345939] x23: 83c5 x22: fdfffc001284 
[14613.351245] x21: fe01d68af760 x20: fe01d7c79a00 
[14613.356552] x19:  x18: 03ffa4b8e600 
[14613.361858] x17: 03ffa5480698 x16: fe1f2afc 
[14613.367164] x15: 0007 x14: 03ffeffa8690 
[14613.372471] x13: 0001 x12: 03ffa4baf200 
[14613.38] x11: fe6bb328 x10: fe6bb32c 
[14613.383084] x9 : fe01d68afd10 x8 : fe01d6806d10 
[14613.388390] x7 : fe01ffd01298 x6 : fe09192c 
[14613.393696] x5 : fec1b398 x4 :  
[14613.399001] x3 : 00200200 x2 : 00100100 
[14613.404306] x1 : 9606 x0 : 0015 
[14613.409610] 
[14613.411094] BUG: failure at 
arch/arm64/kernel/debug-monitors.c:276/single_step_handler()!


The really odd thing is the address of the PC: it is in el1_da, the code
to handle data aborts.  It looks like it is getting the unexpected
single-step exception right after the enable_dbg in el1_da.  I think
what might be happening is:

-an instruction is instrumented with kprobe
-the instruction is copied to a buffer
-a breakpoint replaces the instruction
-the kprobe fires when the breakpoint is encountered
-the instruction in the buffer is set to single step
-a single step of the instruction is attempted
-a data abort exception is raised
-el1_da is called
-el1_da does an enable_dbg to unmask the debug exceptions
-single_step_handler is called
-single_step_handler doesn't find anything to handle that pc
-single_step_handler prints the warning about unexpected el1 single step
-single_step_handler re-enables single stepping
-the single step of the instruction is attempted endlessly

It looks like commit 1059c6bf8534acda249e7e65c81e7696fb074dc1 from Mon
Sep 22   "arm64: debug: don't re-enable debug exceptions on return from el1_dbg"
was trying to address a similar problem for the el1_dbg
function.  Should el1_da and other el1_* functions have the enable_dbg
removed?

If single_step_handler doesn't find a handler, is re-enabling the
single step with set_regs_spsr_ss in single_step_handler the right thing to do?

-Will

diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c
index dae7bb4..ec5a1b2 100644
--- a/arch/arm64/kernel/debug-monitors.c
+++ b/arch/arm64/kernel/debug-monitors.c
@@ -262,6 +262,19 @@ static int single_step_handler(unsigned long addr, unsigned int esr,
 
 		if (!handler_found) {
 			pr_warning("Unexpected kernel single-step exception at EL1\n");
+			{
+		  	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+			pr_warning("kcb->ss_ctx.ss_status = %ld\n",
+   kcb->ss_ctx.ss_status);
+			printk("kcb->ss_ctx.match_addr = %lx ",
+ kcb->ss_ctx.match_addr);
+			print_symbol("%s\n", kcb->ss_ctx.match_addr);
+			printk("instruction_pointer(regs) = %lx ",
+   instruction_pointer(regs));
+			print_symbol("%s\n", instruction_pointer(regs));
+			show_regs(regs);
+			BUG();
+			}
 			/*
 			 * Re-enable stepping since we know that we will be
 			 * returning to regs.

Re: [PATCH net 1/1] hv_netvsc: Fix a bug in netvsc_start_xmit()

2015-04-27 Thread David Miller

From: "K. Y. Srinivasan" 
Date: Mon, 27 Apr 2015 18:14:50 -0700

> Commit commit b08cc79155fc26d0d112b1470d1ece5034651a4b eliminated memory
> allocation in the packet send path. This commit introduced a bug since it
> did not account for the case if the skb was cloned. Fix this bug by
> using the pre-reserved head room only if the skb is not cloned.
> 
> Signed-off-by: K. Y. Srinivasan 

We have generic infrastructure to do this, please try instead:

err = skb_cow_head(skb, pkt_sz);

this will take care of everything for you and you can get rid
of all of this dynamic memory allocation etc. in this code
path.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

RE: [PATCH net 1/1] hv_netvsc: Fix a bug in netvsc_start_xmit()

2015-04-27 Thread Dexuan Cui

> -Original Message-
> From: devel [mailto:driverdev-devel-boun...@linuxdriverproject.org] On
> Behalf Of K. Y. Srinivasan
> Sent: Tuesday, April 28, 2015 9:15
> To: da...@davemloft.net; net...@vger.kernel.org; linux-
> ker...@vger.kernel.org; de...@linuxdriverproject.org; o...@aepfle.de;
> a...@canonical.com; jasow...@redhat.com
> Subject: [PATCH net 1/1] hv_netvsc: Fix a bug in netvsc_start_xmit()
> 
> Commit commit b08cc79155fc26d0d112b1470d1ece5034651a4b
> eliminated memory
> allocation in the packet send path. This commit introduced a bug since it
> did not account for the case if the skb was cloned. Fix this bug by
> using the pre-reserved head room only if the skb is not cloned.
> 
> Signed-off-by: K. Y. Srinivasan 
> ---
>  drivers/net/hyperv/netvsc_drv.c |2 +-
>  1 files changed, 1 insertions(+), 1 deletions(-)
> 
> diff --git a/drivers/net/hyperv/netvsc_drv.c
> b/drivers/net/hyperv/netvsc_drv.c
> index a3a9d38..7eb0251 100644
> --- a/drivers/net/hyperv/netvsc_drv.c
> +++ b/drivers/net/hyperv/netvsc_drv.c
> @@ -421,7 +421,7 @@ check_size:
> 
>   pkt_sz = sizeof(struct hv_netvsc_packet) + RNDIS_AND_PPI_SIZE;
> 
> - if (head_room < pkt_sz) {
> + if (skb->cloned ||  head_room < pkt_sz) {
>   packet = kmalloc(pkt_sz, GFP_ATOMIC);
>   if (!packet) {
>   /* out of memory, drop packet */
> --

Without the patch, the guest can panic due to memory corruption.

I confirm the patch can fix the panic I saw.

Tested-by: Dexuan Cui 

Thanks,
-- Dexuan
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 1/3] mtd: nand: Add on-die ECC support

2015-04-27 Thread punnaiah choudary kalluri

On Tue, Apr 28, 2015 at 4:53 AM, Brian Norris
 wrote:
> On Tue, Apr 28, 2015 at 12:19:16AM +0200, Richard Weinberger wrote:
>> Am 27.04.2015 um 23:35 schrieb Ben Shelton:
>> > I tested this against the latest version of the PL353 NAND driver that 
>> > Punnaiah
>> > has been working to upstream (copying her on this message).  With a few 
>> > changes
>> > to that driver, I got it most of the way through initialization with 
>> > on-die ECC
>> > enabled, but it segfaults here with a null pointer dereference because the
>> > PL353 driver does not implement chip->cmd_ctrl.  Instead, it implements a
>> > custom override of cmd->cmdfunc that does not call cmd_ctrl.  Looking 
>> > through
>> > the other in-tree NAND drivers, it looks like most of them do implement
>> > cmd_ctrl, but quite a few of them do not (e.g. au1550nd, denali, docg4).
>> >
>> > What do you think would be the best way to handle this?  It seems like 
>> > this gap
>> > could be bridged from either side -- either the PL353 driver could 
>> > implement
>> > cmd_ctrl, at least as a stub version that provides the expected behavior in
>> > this case; or the on-die framework could break this out into a callback
>> > function with a default implementation that the driver could override to
>> > perform this behavior in the manner of its choosing.
>>
>> Oh, I thought every driver has to implement that function. ;-\
>
> Nope.
>
>> But you're right there is a corner case.
>
> And it's not the only one! Right now, there's no guarantee even that
> read_buf() returns raw data, unmodified by the SoC's controller. Plenty
> of drivers actually have HW-enabled ECC turned on by default, and so
> they override the chip->ecc.read_page() (and sometimes
> chip->ecc.read_page_raw() functions, if we're lucky) with something
> that pokes the appropriate hardware instead. I expect anything
> comprehensive here is probably going to have to utilize
> chip->ecc.read_page_raw(), at least if it's provided by the hardware
> driver.

Yes, overriding chip->ecc.read_page_raw would solve this. I agree that
read_buf need not always return raw data; that also applies to my new driver
for the Arasan NAND flash controller.

http://lkml.iu.edu/hypermail/linux/kernel/1504.2/00313.html


Regards,
Punnaiah
>
>> What we could do is just using chip->cmdfunc() with a custom NAND command.
>> i.e. chip->cmdfunc(mtd, NAND_CMD_READMODE, -1, -1);
>>
>> Gerhard Sittig tried to introduce such a command some time ago:
>> http://lists.infradead.org/pipermail/linux-mtd/2014-April/053115.html
>
> Yikes! Please no! It's bad enough to have a ton of drivers doing
> switch/case on a bunch of real, somewhat well-known opcodes, but to add
> new fake ones? I'd rather not. We're inflicting ourselves with a
> kernel-internal version of ioctl(). What's the justification, again? I
> don't really remember the context of Gerhard's previous patch.
>
> Brian
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH RFC] net/macb: Fix UDPv4 checksum offload

2015-04-27 Thread David Miller

From: Jaeden Amero 
Date: Mon, 27 Apr 2015 17:43:30 -0500

> If we set the checksum field in the UDP header to 0, the checksum is
> computed correctly.

I think this is completely bogus.

A UDP checksum of zero, means "checksum not computed".  And your
device isn't computing the checksum at all, but rather is leaving it
at zero.

You need to handle this properly by computing the checksum in
software and then setting the TX descriptor bits such that the
chip leaves the checksum field alone.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 2/2] timer: make deferrable cpu unbound timers really not bound to a cpu

2015-04-27 Thread Joonwoo Park

Thomas, I made some cleanups.  I would much appreciate it if you could give me
some feedback on this.

Thanks,
Joonwoo

On 04/27/2015 07:39 PM, Joonwoo Park wrote:
> When a deferrable work (INIT_DEFERRABLE_WORK, etc.) is queued via
> queue_delayed_work() it's probably intended to run the work item on any
> CPU that isn't idle. However, we queue the work to run at a later time
> by starting a deferrable timer that binds to whatever CPU the work is
> queued on which is same with queue_delayed_work_on(smp_processor_id())
> effectively.
> 
> As a result WORK_CPU_UNBOUND work items aren't really cpu unbound now.
> In fact this is perfectly fine with UP kernel and also won't affect much a
> system without dyntick with SMP kernel too as every cpus run timers
> periodically.  But on SMP systems with dyntick current implementation leads
> deferrable timers not very scalable because the timer's base which has
> queued the deferrable timer won't wake up till next non-deferrable timer
> expires even though there are possible other non idle cpus are running
> which are able to run expired deferrable timers.
> 
> The deferrable work is a good example of the current implementation's
> victim like below.
> 
> INIT_DEFERRABLE_WORK(&dwork, fn);
> CPU 0 CPU 1
> queue_delayed_work(wq, &dwork, HZ);
> queue_delayed_work_on(WORK_CPU_UNBOUND);
> ...
>   __mod_timer() -> queues timer to the
>current cpu's timer
>base.
>   ...
> tick_nohz_idle_enter() -> cpu enters idle.
> A second later
> cpu 0 is now in idle. cpu 1 exits idle or wasn't in idle so
>   now it's in active but won't
> cpu 0 won't wake up till next handle cpu unbound deferrable timer
> non-deferrable timer expires. as it's in cpu 0's timer base.
> 
> To make all cpu unbound deferrable timers are scalable, introduce a common
> timer base which is only for cpu unbound deferrable timers to make those
> are indeed cpu unbound so that can be scheduled by tick_do_timer_cpu.
> This common timer fixes scalability issue of delayed work and all other cpu
> unbound deferrable timer using implementations.
> 
> CC: Thomas Gleixner 
> CC: John Stultz 
> CC: Tejun Heo 
> Signed-off-by: Joonwoo Park 
> ---
> Changes in v3:
>  * Make only tick_do_timer_cpu to run deferral timer wheel to reduce cache 
> bouncing.
> 
> Changes in v4:
>  * Kill CONFIG_SMP ifdefry.
>  * Allocate and initialize tvec_base_deferrable at compile time.
>  * Pin pinned deferrable timer. 
>  * s/deferral/deferrable/
> 
>  include/linux/timer.h |  14 ++-
>  kernel/time/timer.c   | 103 
> --
>  2 files changed, 97 insertions(+), 20 deletions(-)
> 
> diff --git a/include/linux/timer.h b/include/linux/timer.h
> index 8c5a197..45847ca 100644
> --- a/include/linux/timer.h
> +++ b/include/linux/timer.h
> @@ -34,6 +34,9 @@ struct timer_list {
>  };
>  
>  extern struct tvec_base boot_tvec_bases;
> +#ifdef CONFIG_SMP
> +extern struct tvec_base tvec_base_deferrable;
> +#endif
>  
>  #ifdef CONFIG_LOCKDEP
>  /*
> @@ -70,12 +73,21 @@ extern struct tvec_base boot_tvec_bases;
>  
>  #define TIMER_FLAG_MASK  0x3LU
>  
> +#ifdef CONFIG_SMP
> +#define __TIMER_BASE(_flags) \
> + ((_flags) & TIMER_DEFERRABLE ? \
> +  (unsigned long)&tvec_base_deferrable + (_flags) : \
> +  (unsigned long)&boot_tvec_bases + (_flags))
> +#else
> +#define __TIMER_BASE(_flags) ((unsigned long)&boot_tvec_bases + (_flags))
> +#endif
> +
>  #define __TIMER_INITIALIZER(_function, _expires, _data, _flags) { \
>   .entry = { .prev = TIMER_ENTRY_STATIC },\
>   .function = (_function),\
>   .expires = (_expires),  \
>   .data = (_data),\
> - .base = (void *)((unsigned long)&boot_tvec_bases + (_flags)), \
> + .base = (void *)(__TIMER_BASE(_flags)), \
>   .slack = -1,\
>   __TIMER_LOCKDEP_MAP_INITIALIZER(\
>   __FILE__ ":" __stringify(__LINE__)) \
> diff --git a/kernel/time/timer.c b/kernel/time/timer.c
> index e5d5733c..133e94a 100644
> --- a/kernel/time/timer.c
> +++ b/kernel/time/timer.c
> @@ -49,6 +49,8 @@
>  #include 
>  #include 
>  
> +#include "tick-internal.h"
> +
>  #define CREATE_TRACE_POINTS
>  #include 
>  
> @@ -103,6 +105,9 @@ struct tvec_base boot_tvec_bases;
>  EXPORT_SYMBOL(boot_tvec_bases);
>  
>  static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
> +#ifdef CONFIG_SMP
> +struct tvec_base tvec_base_deferrable;
> +#endif
>  
>  /* Functions below help us manage 'deferrable' flag */
>  static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
> @@ -662,10 +667,63 @@ static inline void debug_assert_init(str

[PATCH 2/2] timer: make deferrable cpu unbound timers really not bound to a cpu

2015-04-27 Thread Joonwoo Park

When a deferrable work (INIT_DEFERRABLE_WORK, etc.) is queued via
queue_delayed_work() it's probably intended to run the work item on any
CPU that isn't idle. However, we queue the work to run at a later time
by starting a deferrable timer that binds to whatever CPU the work is
queued on which is same with queue_delayed_work_on(smp_processor_id())
effectively.

As a result WORK_CPU_UNBOUND work items aren't really cpu unbound now.
In fact this is perfectly fine with a UP kernel, and it also won't much affect
an SMP system without dyntick, as every cpu runs timers periodically.  But on
SMP systems with dyntick the current implementation makes deferrable timers
scale poorly, because the timer base which has queued the deferrable timer
won't wake up till the next non-deferrable timer expires, even though other
non-idle cpus may be running which are able to run expired deferrable timers.

The deferrable work is a good example of the current implementation's
victim like below.

INIT_DEFERRABLE_WORK(&dwork, fn);
CPU 0 CPU 1
queue_delayed_work(wq, &dwork, HZ);
queue_delayed_work_on(WORK_CPU_UNBOUND);
...
__mod_timer() -> queues timer to the
 current cpu's timer
 base.
...
tick_nohz_idle_enter() -> cpu enters idle.
A second later
cpu 0 is now in idle. cpu 1 exits idle or wasn't in idle so
  now it's in active but won't
cpu 0 won't wake up till next handle cpu unbound deferrable timer
non-deferrable timer expires. as it's in cpu 0's timer base.

To make all cpu unbound deferrable timers scalable, introduce a common
timer base which is only for cpu unbound deferrable timers, so that those
timers are indeed cpu unbound and can be scheduled by tick_do_timer_cpu.
This common timer base fixes the scalability issue of delayed work and of all
other users of cpu unbound deferrable timers.

CC: Thomas Gleixner 
CC: John Stultz 
CC: Tejun Heo 
Signed-off-by: Joonwoo Park 
---
Changes in v3:
 * Make only tick_do_timer_cpu to run deferral timer wheel to reduce cache 
bouncing.

Changes in v4:
 * Kill CONFIG_SMP ifdefry.
 * Allocate and initialize tvec_base_deferrable at compile time.
 * Pin pinned deferrable timer. 
 * s/deferral/deferrable/

 include/linux/timer.h |  14 ++-
 kernel/time/timer.c   | 103 --
 2 files changed, 97 insertions(+), 20 deletions(-)

diff --git a/include/linux/timer.h b/include/linux/timer.h
index 8c5a197..45847ca 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -34,6 +34,9 @@ struct timer_list {
 };
 
 extern struct tvec_base boot_tvec_bases;
+#ifdef CONFIG_SMP
+extern struct tvec_base tvec_base_deferrable;
+#endif
 
 #ifdef CONFIG_LOCKDEP
 /*
@@ -70,12 +73,21 @@ extern struct tvec_base boot_tvec_bases;
 
 #define TIMER_FLAG_MASK0x3LU
 
+#ifdef CONFIG_SMP
+#define __TIMER_BASE(_flags) \
+   ((_flags) & TIMER_DEFERRABLE ? \
+(unsigned long)&tvec_base_deferrable + (_flags) : \
+(unsigned long)&boot_tvec_bases + (_flags))
+#else
+#define __TIMER_BASE(_flags) ((unsigned long)&boot_tvec_bases + (_flags))
+#endif
+
 #define __TIMER_INITIALIZER(_function, _expires, _data, _flags) { \
.entry = { .prev = TIMER_ENTRY_STATIC },\
.function = (_function),\
.expires = (_expires),  \
.data = (_data),\
-   .base = (void *)((unsigned long)&boot_tvec_bases + (_flags)), \
+   .base = (void *)(__TIMER_BASE(_flags)), \
.slack = -1,\
__TIMER_LOCKDEP_MAP_INITIALIZER(\
__FILE__ ":" __stringify(__LINE__)) \
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index e5d5733c..133e94a 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -49,6 +49,8 @@
 #include 
 #include 
 
+#include "tick-internal.h"
+
 #define CREATE_TRACE_POINTS
 #include 
 
@@ -103,6 +105,9 @@ struct tvec_base boot_tvec_bases;
 EXPORT_SYMBOL(boot_tvec_bases);
 
 static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
+#ifdef CONFIG_SMP
+struct tvec_base tvec_base_deferrable;
+#endif
 
 /* Functions below help us manage 'deferrable' flag */
 static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
@@ -662,10 +667,63 @@ static inline void debug_assert_init(struct timer_list 
*timer)
debug_timer_assert_init(timer);
 }
 
+#ifdef CONFIG_SMP
+static inline struct tvec_base *__get_timer_base(unsigned int flags)
+{
+   if (flags & TIMER_DEFERRABLE)
+   return &tvec_base_deferrable;
+   else
+   return raw_cpu_read(tvec_bases);
+}
+
+static inline bool is_deferrable_timer_base(struct tvec

RE: [RFC PATCH 5/5] GHES: Make NMI handler have a single reader

2015-04-27 Thread Zheng, Lv

Hi,

> From: Zheng, Lv
> Sent: Tuesday, April 28, 2015 8:44 AM
> 
> Hi,
> 
> > From: Borislav Petkov [mailto:b...@alien8.de]
> > Sent: Monday, April 27, 2015 4:47 PM
> >
> > On Mon, Apr 27, 2015 at 03:16:00AM +, Zheng, Lv wrote:
> > > > @@ -840,7 +840,9 @@ static int ghes_notify_nmi(unsigned int cmd, struct 
> > > > pt_regs *regs)
> > > > struct ghes *ghes;
> > > > int sev, ret = NMI_DONE;
> > > >
> > > > -   raw_spin_lock(&ghes_nmi_lock);
> > > > +   if (!atomic_add_unless(&ghes_in_nmi, 1, 1))
> > > > +   return ret;
> > > > +
> > >
> > > Just a simple question.
> > > Why not just using cmpxchg here instead of atomic_add_unless so that no 
> > > atomic_dec will be needed.
> >
> > What do you think atomic_add_unless ends up doing:
> >
> > #APP
> > # 177 "./arch/x86/include/asm/atomic.h" 1
> > .pushsection .smp_locks,"a"
> > .balign 4
> > .long 671f - .
> > .popsection
> > 671:
> > lock; cmpxchgl %edx,ghes_in_nmi(%rip)   # D.37056, MEM[(volatile u32 
> > *)&ghes_in_nmi]
> > # 0 "" 2
> > #NO_APP
> >
> > And you need to atomic_dec() so that another reader can enter, i.e. how
> > the exclusion primitive works.
> >
> > Or did you have something else in mind?
> 
> My mistake.
> I mean cmpxchg() and xchg() (or atomic_cmpxchg() and atomic_xchg()) pair 
> here, so nothing can be reduced.

Let me correct myself: it should be atomic_cmpxchg() and atomic_set() here, as
you only need to switch between 0 and 1.
Sorry for the noise.

Thanks and best regards
-Lv

> 
> But IMO, atomic_add_unless() is implemented via cmpxchg on many architectures.
> And it might be better to use it directly here which is a bit faster as you 
> actually only need one value switch here.
> 
> Thanks and best regards
> -Lv
> 
> 
> >
> > --
> > Regards/Gruss,
> > Boris.
> >
> > ECO tip #101: Trim your mails when you reply.
> > --
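To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/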

Re: [RFC PATCH 3/3] iio: derive the mounting matrix from ACPI _PLD objects

2015-04-27 Thread Octavian Purdila

On Tue, Apr 28, 2015 at 12:57 AM, sathyanarayanan kuppuswamy
 wrote:
> Hi
>
> On 04/27/2015 08:54 AM, Octavian Purdila wrote:
>>
>> On Mon, Apr 27, 2015 at 6:42 PM, Kuppuswamy Sathyanarayanan
>>  wrote:
>>>
>>> Since Acpi framework already exports this info to user space, Why not do
>>> this derivation in user space code ? Why do we need new ABI, if the same
>>> can be derived from existing one.
>>>
>> The ABI was added in the previous patch so that we can present the
>> sensor orientation information to userspace even in the case of device
>> tree.
>
> If the main reason for implementing a new ABI is to support DT platforms,
> Why not implement a version of _PLD for device tree ? Don't you think it
> would be much better than adding a new ABI to export redundant information ?
>

IMO the mounting matrix is more consistent with the IIO ABIs. Although
I have no issue with replicating _PLD for device tree if people agree
that it is better.

> Also the location information of the device is not just specific to iio
> drivers. You should consider that we would have similar requirements for
> devices implemented as input or platform drivers.

The upstream standard for those sensors where the orientation matters
(accelerometer, gyro, compass) is IIO.

Granted, there are other device types for which the orientation
information may be useful (e.g. camera). However the actual
interpretation and action to be taken is different for each subsystem
(e.g. in the camera case do the correction via V4L2_CID_HFLIP /
V4L2_CID_VFLIP) so I think it is better to expose it at the subsystem
level in a way consistent with the subsystem's ABIs.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 1/2] timer: avoid unnecessary waking up of nohz CPU

2015-04-27 Thread Joonwoo Park

At present, internal_add_timer() examines flags with 'base' which doesn't
contain flags.  Examine with 'timer->base' to avoid unnecessary waking up
of nohz CPU when timer base has TIMER_DEFERRABLE.

CC: Thomas Gleixner 
CC: John Stultz 
Signed-off-by: Joonwoo Park 
---
 kernel/time/timer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 2ece3aa..e5d5733c 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -434,7 +434,7 @@ static void internal_add_timer(struct tvec_base *base, 
struct timer_list *timer)
 * require special care against races with idle_cpu(), lets deal
 * with that later.
 */
-   if (!tbase_get_deferrable(base) || tick_nohz_full_cpu(base->cpu))
+   if (!tbase_get_deferrable(timer->base) || tick_nohz_full_cpu(base->cpu))
wake_up_nohz_cpu(base->cpu);
 }
 
-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
hosted by The Linux Foundation

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 3/3 V8] workqueue: Allow modifying low level unbound workqueue cpumask

2015-04-27 Thread Lai Jiangshan

On 04/28/2015 09:44 AM, Lai Jiangshan wrote:

>>>  
>>> +   /* save the user configured attrs */
>>> +   cpumask_and(new_attrs->cpumask, attrs->cpumask, cpu_possible_mask);
>>
>> Wouldn't this make a lot more sense above when copying @attrs into
>> @new_attrs?  The comment there even says "make a copy of @attrs and
>> sanitize it".  Copy to @new_attrs, mask with wq_unbound_cpumask and
>> fall back to wq_unbound_cpumask if empty.


We need to save the user's original configured attrs.
Any time wq_unbound_cpumask is changed, we should use
the user's original configured attrs (cpumask) to re-calculate
the pwqs and avoid losing any information.

> 
> It should be:
> 
> + copy_workqueue_attrs(new_attrs, attrs);
> + cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH V6 00/10] namespaces: log namespaces per task

2015-04-27 Thread Eric W. Biederman

Richard Guy Briggs  writes:

> On 15/04/24, Eric W. Biederman wrote:
>> Richard Guy Briggs  writes:
>> > On 15/04/22, Richard Guy Briggs wrote:
>> >> On 15/04/20, Eric W. Biederman wrote:
>> >> > Richard Guy Briggs  writes:
>> >> > 
>> >
>> > Do I even need to report the device number anymore since I am concluding
>> > s_dev is never set (or always zero) in the nsfs filesystem by
>> > mount_pseudo() and isn't even mountable? 
>> 
>> We still need the dev. We do have a device number get_anon_bdev fills it in.
>
> Fine, it has a device number.  There appears to be only one of these
> allocated per kernel.  I can get it from &nsfs->fs_supers (and take the
> first instance given by hlist_for_each_entry and verify there are no
> others).  Why do I need it, again?

Because if we have to preserve the inode number over a migration event I
want to preserve the fact that we are talking about inode numbers from a
superblock with a device number.

Otherwise known as I am allergic to kernel global identifiers, because
they can be major pains.  I don't want to have to go back and implement
a namespace for namespaces.

>> >> They are all covered:
>> >> sys_unshare > unshare_userns > create_user_ns
>> >> sys_unshare > unshare_nsproxy_namespaces > create_new_namespaces > 
>> >> copy_mnt_ns
>> >> sys_unshare > unshare_nsproxy_namespaces > create_new_namespaces > 
>> >> copy_utsname > clone_uts_ns
>> >> sys_unshare > unshare_nsproxy_namespaces > create_new_namespaces > 
>> >> copy_ipcs > get_ipc_ns
>> >> sys_unshare > unshare_nsproxy_namespaces > create_new_namespaces > 
>> >> copy_pid_ns > create_pid_namespace
>> >> sys_unshare > unshare_nsproxy_namespaces > create_new_namespaces > 
>> >> copy_net_ns
>> 
>> Then why the special change to fork?  That was not reflected on
>> the unshare path as far as I could see.
>
> Fork can specify more than one CLONE flag at once, so collecting them
> all in one statement seemed helpful.  setns can only set one at a time.

unshare can also specify more than one CLONE flag at once.

I just pointed that out because that seemed really unsymmetrical.

> Ok, understood, we can't just punt this one to a higher layer...
>
> So this comes back to a question above, which is how do we determine
> which device it is from?  Sounds like we need something added to
> ns_common or one of the 6 namespace types structs.

Or we can just hard code reading it off of the appropriate magic
filesystem.  Probably what we want is a well named helper function that
does the job.

I just care that when we talk about these things we are talking about
inode numbers from a superblock that is associated with a given device
number.  That way I don't have nightmares about dealing with a namespace
for namespaces.

Eric
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v2] irqchip: renesas-intc-irqpin: Improve binding documentation

2015-04-27 Thread Simon Horman

On Mon, Apr 27, 2015 at 04:45:37PM +0200, Geert Uytterhoeven wrote:
> Add missing documentation for required properties:
>   - interrupt-controller,
>   - parent interrupts (one entry per provided interrupt).
> 
> Add missing documentation for optional properties:
>   - functional clock (managed since commit 705bc96c2c15313c ("irqchip:
> renesas-intc-irqpin: Add minimal runtime PM support")),
>   - power-domains.
> 
> Add an example, taken from r8a7740.dtsi.
> 
> Signed-off-by: Geert Uytterhoeven 

Acked-by: Simon Horman 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH V6 00/10] namespaces: log namespaces per task

2015-04-27 Thread Richard Guy Briggs

On 15/04/24, Eric W. Biederman wrote:
> Richard Guy Briggs  writes:
> > On 15/04/22, Richard Guy Briggs wrote:
> >> On 15/04/20, Eric W. Biederman wrote:
> >> > Richard Guy Briggs  writes:
> >> > 
> >> > > The purpose is to track namespace instances in use by logged processes 
> >> > > from the
> >> > > perspective of init_*_ns by logging the namespace IDs (device ID and 
> >> > > namespace
> >> > > inode - offset).
> >> > 
> >> > In broad strokes the user interface appears correct.
> >> > 
> >> > Things that I see that concern me:
> >> > 
> >> > - After Al's most recent changes these inodes no longer live in the proc
> >> >   superblock so the device number reported in these patches is
> >> >   incorrect.
> >> 
> >> Ok, found the patchset you're talking about:
> >>3d3d35b kill proc_ns completely
> >>e149ed2 take the targets of /proc/*/ns/* symlinks to separate fs
> >>f77c801 bury struct proc_ns in fs/proc
> >>33c4294 copy address of proc_ns_ops into ns_common
> >>6344c43 new helpers: ns_alloc_inum/ns_free_inum
> >>6496452 make proc_ns_operations work with struct ns_common * instead of 
> >> void *
> >>3c04118 switch the rest of proc_ns_operations to working with &...->ns
> >>ff24870 netns: switch ->get()/->put()/->install()/->inum() to working 
> >> with &net->ns
> >>58be2825 make mntns ->get()/->put()/->install()/->inum() work with 
> >> &mnt_ns->ns
> >>435d5f4 common object embedded into various struct ns
> >> 
> >> Ok, I've got some minor jigging to do to get inum too...
> >
> > Do I even need to report the device number anymore since I am concluding
> > s_dev is never set (or always zero) in the nsfs filesystem by
> > mount_pseudo() and isn't even mountable? 
> 
> We still need the dev. We do have a device number get_anon_bdev fills it in.

Fine, it has a device number.  There appears to be only one of these
allocated per kernel.  I can get it from &nsfs->fs_supers (and take the
first instance given by hlist_for_each_entry and verify there are no
others).  Why do I need it, again?

> > In fact, I never needed to
> > report the device since proc ida/idr and inodes are kernel-global and
> > namespace-oblivious.
> 
> This is the bit I really want to keep to be forward looking.  If we
> ever need to preserve the inode numbers across a migration we could
> have different super blocks with different inode numbers for the same
> namespace.

I don't quite follow your argument here, but can accept that in the
future we might add other namespace devices.  I wonder if we might do
that augmentation later and leave out the device number for now...

> >> > - I am nervous about audit logs being flooded with users creating lots
> >> >   of namespaces.  But that is more your lookout than mine.
> >> 
> >> There was a thought to create a filter to en/disable this logging...
> >> It is an auxiliary record to syscalls, so they can be ignored by userspace 
> >> tools.
> >> 
> >> > - unshare is not logging when it creates new namespaces.
> >> 
> >> They are all covered:
> >> sys_unshare > unshare_userns > create_user_ns
> >> sys_unshare > unshare_nsproxy_namespaces > create_new_namespaces > 
> >> copy_mnt_ns
> >> sys_unshare > unshare_nsproxy_namespaces > create_new_namespaces > 
> >> copy_utsname > clone_uts_ns
> >> sys_unshare > unshare_nsproxy_namespaces > create_new_namespaces > 
> >> copy_ipcs > get_ipc_ns
> >> sys_unshare > unshare_nsproxy_namespaces > create_new_namespaces > 
> >> copy_pid_ns > create_pid_namespace
> >> sys_unshare > unshare_nsproxy_namespaces > create_new_namespaces > 
> >> copy_net_ns
> 
> Then why the special change to fork?  That was not reflected on
> the unshare path as far as I could see.

Fork can specify more than one CLONE flag at once, so collecting them
all in one statement seemed helpful.  setns can only set one at a time.

> >> > As small numbers are nice and these inodes all live in their own
> >> > superblock now we should be able to remove the games with
> >> > PROC_DYNAMIC_FIRST and just use small numbers for these inodes
> >> > everywhere.
> >> 
> >> That is compelling if I can untangle the proc inode allocation code from 
> >> the
> >> ida/idr.  Should be as easy as defining a new ns_alloc_inum (and 
> >> ns_free_inum)
> >> to use instead of proc_alloc_inum with its own ns_inum_ida and 
> >> ns_inum_lock,
> >> then defining a NS_DYNAMIC_FIRST and defining 
> >> NS_{IPC,UTS,USER,PID}_INIT_INO in
> >> the place of the existing PROC_*_INIT_INO.
> 
> Something like that.  Just a new ida/idr allocator specific to that
> superblock.
> 
> Yeah.  It is somewhere on my todo, but I have been prioritizing getting
> the bugs that look potentially exploitable fixed in the mount
> namespace.  Al made things nice for one case but left a mess for a bunch
> of others.
> 
> >> > I honestly don't know how much we are going to care about namespace ids
> >> > during migration.  So far this is not a problem that has come up.
> >> 
> >> Not for CRIU, but it

Re: [PATCH 3/3 V8] workqueue: Allow modifying low level unbound workqueue cpumask

2015-04-27 Thread Lai Jiangshan

Hello

> 
>> --- a/include/linux/workqueue.h
>> +++ b/include/linux/workqueue.h
>> @@ -424,6 +424,7 @@ struct workqueue_attrs *alloc_workqueue_attrs(gfp_t 
>> gfp_mask);
>>  void free_workqueue_attrs(struct workqueue_attrs *attrs);
>>  int apply_workqueue_attrs(struct workqueue_struct *wq,
>>const struct workqueue_attrs *attrs);
>> +int workqueue_set_unbound_cpumask(cpumask_var_t cpumask);
> 
> Why is this a public function?


In V4 patchset, Kevin Hilman had requested the wq_unbound_cpumask
to be "cpumask_complement(wq_unbound_cpumask, tick_nohz_full_mask);"

I replied against it and I suggested that wq_unbound_cpumask can be
re-set after workqueue initialized it.

And Frederic Weisbecker seemed on my side:
"""
If it should be the default on NO_HZ_FULL, maybe we should do this from the
tick nohz code. Some late or fs initcall that will do the workqueue affinity,
timer affinity, etc...
"""

So, we need an API to modify the wq_unbound_cpumask, and I provided
this public function.  Otherwise, the other code can't modify it.

> 
>> --- a/kernel/workqueue.c
>> +++ b/kernel/workqueue.c
>> @@ -3548,13 +3549,18 @@ apply_wqattrs_prepare(struct workqueue_struct *wq,
>>   * If something goes wrong during CPU up/down, we'll fall back to
>>   * the default pwq covering whole @attrs->cpumask.  Always create
>>   * it even if we don't use it immediately.
>> + *
>> + * If the user configured cpumask doesn't overlap with the
>> + * wq_unbound_cpumask, we fallback to the wq_unbound_cpumask.
>>   */
>> +if (unlikely(cpumask_empty(new_attrs->cpumask)))
>> +cpumask_copy(new_attrs->cpumask, wq_unbound_cpumask);
> 
> Please see below.
> 
>>  ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
>>  if (!ctx->dfl_pwq)
>>  goto out_free;
>>  
>>  for_each_node(node) {
>> -if (wq_calc_node_cpumask(attrs, node, -1, tmp_attrs->cpumask)) {
>> +if (wq_calc_node_cpumask(new_attrs, node, -1, 
>> tmp_attrs->cpumask)) {
>>  ctx->pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
>>  if (!ctx->pwq_tbl[node])
>>  goto out_free;
>> @@ -3564,7 +3570,10 @@ apply_wqattrs_prepare(struct workqueue_struct *wq,
>>  }
>>  }
>>  
>> +/* save the user configured attrs */
>> +cpumask_and(new_attrs->cpumask, attrs->cpumask, cpu_possible_mask);
> 
> Wouldn't this make a lot more sense above when copying @attrs into
> @new_attrs?  The comment there even says "make a copy of @attrs and
> sanitize it".  Copy to @new_attrs, mask with wq_unbound_cpumask and
> fall back to wq_unbound_cpumask if empty.

It should be:

+   copy_workqueue_attrs(new_attrs, attrs);
+   cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);

> 
>> +static int workqueue_apply_unbound_cpumask(void)
>> +{
> ...
>> +list_for_each_entry_safe(ctx, n, &ctxs, list) {
> 
> Is the following list_del() necessary?  The list is never used again,
> right?

It isn't necessary. It was added in V7. I thought it could make
the code more normal.


Thanks
Lai
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v6 01/26] IB/Verbs: Implement new callback query_transport()

2015-04-27 Thread Tom Talpey


On 4/27/2015 6:24 PM, Doug Ledford wrote:

On Mon, 2015-04-27 at 17:53 -0700, Tom Talpey wrote:

On 4/27/2015 5:36 PM, Doug Ledford wrote:

On Mon, 2015-04-27 at 17:16 -0700, Tom Talpey wrote:

On 4/27/2015 2:52 PM, ira.weiny wrote:

On Mon, Apr 27, 2015 at 09:39:05AM +0200, Michael Wang wrote:

On 04/24/2015 05:12 PM, Liran Liss wrote:

[snip]


Like:

enum rdma_protocol {
RDMA_PROTOCOL_IB,
RDMA_PROTOCOL_IBOE,
RDMA_PROTOCOL_IWARP,
RDMA_PROTOCOL_USNIC_UDP
};

So we could use query_protocol() to ask device provide the protocol
type, and there will be no mixing with the legacy transport type
anymore :-)


I'm ok with that.  I like introducing a unique namespace which is clearly
different from the previous "transport" one.


I agree the word "transport" takes things into the weeds.

But on the topic of naming protocols, I've been wondering, is there
some reason that "IBOE" is being used instead of "RoCE"?


Because back in the day, when RoCE was accepted into the kernel, I'm
pretty sure it was prior to the IBTA's final stamp of approval and
before the name was set on RoCE, so IBoE was chosen upstream as the more
"correct" name because it properly denoted what it was deemed to truly
be: IB Verbs over Ethernet.


Well history is all well and good, but it seems weird to not use the
current, standard name in new code. It confuses me, anyway, because
it seems like IBOE could easily mean something else.


Having some of it refer to things as IBOE and some as ROCE would be
similarly confusing, and switching existing IBOE usage to ROCE would
cause pain to people with out of tree drivers (Lustre is the main one I
know of).  There's not a good answer here.  There's only less sucky
ones.


Hrm. Well, avoiding churn is good but legacies can wear ya down.
MHO it is worth doing since these are new enums/new patches.





Also wondering, why add "UDP" to USNIC, is there a different USNIC?


Yes, there are two transports, one a distinct ethertype and one that
encapsulates USNIC in UDP.


But this new enum isn't about transport, it's about protocol. So is
there one USNIC protocol, with a raw layering and a separate one with
UDP? Or is it one USNIC protocol with two different framings? Seems
there should be at least the USNIC protocol, without the _UDP
decoration, and I don't see it in the enum.


Keep in mind that this enum was Liran's response to Michael's original
patch.  In the enum in Michael's patch, there was both USNIC and
USNIC_UDP.


Right! That's why I'm confused. Seems wrong to drop it, right?






Naming multiple layers together seems confusing and maybe in the end
will create more code to deal with the differences. For example, what
token will RoCEv2 take? RoCE_UDP, RoCE_v2 or ... ?


Uncertain as of now.


Ok, but it's imminent, right? What's the preference/guidance?


There is a patchset from Devesh Sharma at Emulex.  It added the RoCEv2
capability.  As I recall, it used a new flag added to the existing port
capabilities bitmask and notably did not modify either the node type or
link layer that are currently used to differentiate between the
different protocols.  That's from memory though, so I could be mistaken.

But that patchset was not written with this patchset in mind, and
merging the two may well change that.  In any case, there is a proposed
spec to follow, so for now that's the preference/guidance (unless this
rework means that we need to depart from the spec on internals for
implementation reasons).


Well, if RoCEv2 uses the same protocol enum, that may introduce new
confusion, for example there will be some new CM handling for UDP encap,
source port selection, and of course vlan/tag assignment, etc. But if
there is support under way, and everyone is clear, then, ok.

Thanks.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 3.12 00/83] 3.12.41-stable review

2015-04-27 Thread Guenter Roeck


On 04/27/2015 02:31 PM, Jiri Slaby wrote:

This is the start of the stable review cycle for the 3.12.41 release.
There are 83 patches in this series, all will be posted as a response
to this one.  If anyone has any issues with these being applied, please
let me know.

Responses should be made by Wed Apr 29 20:39:50 CEST 2015.
Anything received after that time might be too late.


Build results:
total: 125 pass: 124 fail: 1
Failed builds:
arm64:allmodconfig

Qemu test results:
total: 27 pass: 27 fail: 0

Results are as expected.
Details are available at http://server.roeck-us.net:8010/builders.

Guenter

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v6 01/26] IB/Verbs: Implement new callback query_transport()

2015-04-27 Thread Doug Ledford

On Mon, 2015-04-27 at 17:53 -0700, Tom Talpey wrote:
> On 4/27/2015 5:36 PM, Doug Ledford wrote:
> > On Mon, 2015-04-27 at 17:16 -0700, Tom Talpey wrote:
> >> On 4/27/2015 2:52 PM, ira.weiny wrote:
> >>> On Mon, Apr 27, 2015 at 09:39:05AM +0200, Michael Wang wrote:
>  On 04/24/2015 05:12 PM, Liran Liss wrote:
> > [snip]
> 
>  Like:
> 
>  enum rdma_protocol {
>   RDMA_PROTOCOL_IB,
>   RDMA_PROTOCOL_IBOE,
>   RDMA_PROTOCOL_IWARP,
>   RDMA_PROTOCOL_USNIC_UDP
>  };
> 
>  So we could use query_protocol() to ask device provide the protocol
>  type, and there will be no mixing with the legacy transport type
>  anymore :-)
> >>>
> >>> I'm ok with that.  I like introducing a unique namespace which is clearly
> >>> different from the previous "transport" one.
> >>
> >> I agree the word "transport" takes things into the weeds.
> >>
> >> But on the topic of naming protocols, I've been wondering, is there
> >> some reason that "IBOE" is being used instead of "RoCE"?
> >
> > Because back in the day, when RoCE was accepted into the kernel, I'm
> > pretty sure it was prior to the IBTA's final stamp of approval and
> > before the name was set on RoCE, so IBoE was chosen upstream as the more
> > "correct" name because it properly denoted what it was deemed to truly
> > be: IB Verbs over Ethernet.
> 
> Well history is all well and good, but it seems weird to not use the
> current, standard name in new code. It confuses me, anyway, because
> it seems like IBOE could easily mean something else.

Having some of it refer to things as IBOE and some as ROCE would be
similarly confusing, and switching existing IBOE usage to ROCE would
cause pain to people with out of tree drivers (Lustre is the main one I
know of).  There's not a good answer here.  There's only less sucky
ones.

> >> Also wondering, why add "UDP" to USNIC, is there a different USNIC?
> >
> > Yes, there are two transports, one a distinct ethertype and one that
> > encapsulates USNIC in UDP.
> 
> But this new enum isn't about transport, it's about protocol. So is
> there one USNIC protocol, with a raw layering and a separate one with
> UDP? Or is it one USNIC protocol with two different framings? Seems
> there should be at least the USNIC protocol, without the _UDP
> decoration, and I don't see it in the enum.

Keep in mind that this enum was Liran's response to Michael's original
patch.  In the enum in Michael's patch, there was both USNIC and
USNIC_UDP.

> >
> >> Naming multiple layers together seems confusing and maybe in the end
> >> will create more code to deal with the differences. For example, what
> >> token will RoCEv2 take? RoCE_UDP, RoCE_v2 or ... ?
> >
> > Uncertain as of now.
> 
> Ok, but it's imminent, right? What's the preference/guidance?

There is a patchset from Devesh Sharma at Emulex.  It added the RoCEv2
capability.  As I recall, it used a new flag added to the existing port
capabilities bitmask and notably did not modify either the node type or
link layer that are currently used to differentiate between the
different protocols.  That's from memory though, so I could be mistaken.

But that patchset was not written with this patchset in mind, and
merging the two may well change that.  In any case, there is a proposed
spec to follow, so for now that's the preference/guidance (unless this
rework means that we need to depart from the spec on internals for
implementation reasons).

-- 
Doug Ledford 
  GPG KeyID: 0E572FDD

signature.asc
Description: This is a digitally signed message part

Re: [PATCH v2] spi: omap2-mcspi: Add support for GPIO chipselects

2015-04-27 Thread Michael Welling

On Mon, Apr 27, 2015 at 08:55:50PM +0100, Mark Brown wrote:
> On Sun, Apr 26, 2015 at 10:44:30PM -0500, Michael Welling wrote:
> 
> > +   if (gpio_is_valid(spi->cs_gpio)) {
> > +   gpio_set_value(spi->cs_gpio, (cs_active) ?
> > +   !!(spi->mode & SPI_CS_HIGH) :
> > +   !(spi->mode & SPI_CS_HIGH));
> > +   }
> 
> Two problems here.  One is that the above logic statement is just not
> readable (the repetition of checks, the ternary operator, the
> indentation...) and the other is that the core chipselect support
> already handles GPIO chipselects so you should really be converting the
> driver to use that.  At the very least the code needs to be legible
> though.

Before I send another patch how does this look?

if (gpio_is_valid(spi->cs_gpio)) {
if (cs_active)
gpio_set_value(spi->cs_gpio, spi->mode & SPI_CS_HIGH);
else
gpio_set_value(spi->cs_gpio, !(spi->mode & 
SPI_CS_HIGH));
}

If I were to attempt to convert the driver to use the core chipselect support,
how would I go about doing it?

Is there another driver that I can use for reference?
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] ARM: imx: Constify irq_domain_ops

2015-04-27 Thread Shawn Guo

On Mon, Apr 27, 2015 at 09:51:39PM +0900, Krzysztof Kozlowski wrote:
> The irq_domain_ops are not modified by the driver and the irqdomain core
> code accepts pointer to a const data.
> 
> Signed-off-by: Krzysztof Kozlowski 

Applied, thanks.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v6 01/26] IB/Verbs: Implement new callback query_transport()

2015-04-27 Thread Tom Talpey


On 4/27/2015 5:36 PM, Doug Ledford wrote:

On Mon, 2015-04-27 at 17:16 -0700, Tom Talpey wrote:

On 4/27/2015 2:52 PM, ira.weiny wrote:

On Mon, Apr 27, 2015 at 09:39:05AM +0200, Michael Wang wrote:

On 04/24/2015 05:12 PM, Liran Liss wrote:

[snip]


Like:

enum rdma_protocol {
RDMA_PROTOCOL_IB,
RDMA_PROTOCOL_IBOE,
RDMA_PROTOCOL_IWARP,
RDMA_PROTOCOL_USNIC_UDP
};

So we could use query_protocol() to ask device provide the protocol
type, and there will be no mixing with the legacy transport type
anymore :-)


I'm ok with that.  I like introducing a unique namespace which is clearly
different from the previous "transport" one.


I agree the word "transport" takes things into the weeds.

But on the topic of naming protocols, I've been wondering, is there
some reason that "IBOE" is being used instead of "RoCE"?


Because back in the day, when RoCE was accepted into the kernel, I'm
pretty sure it was prior to the IBTA's final stamp of approval and
before the name was set on RoCE, so IBoE was chosen upstream as the more
"correct" name because it properly denoted what it was deemed to truly
be: IB Verbs over Ethernet.


Well history is all well and good, but it seems weird to not use the
current, standard name in new code. It confuses me, anyway, because
it seems like IBOE could easily mean something else.


Also wondering, why add "UDP" to USNIC, is there a different USNIC?


Yes, there are two transports, one a distinct ethertype and one that
encapsulates USNIC in UDP.


But this new enum isn't about transport, it's about protocol. So is
there one USNIC protocol, with a raw layering and a separate one with
UDP? Or is it one USNIC protocol with two different framings? Seems
there should be at least the USNIC protocol, without the _UDP
decoration, and I don't see it in the enum.




Naming multiple layers together seems confusing and maybe in the end
will create more code to deal with the differences. For example, what
token will RoCEv2 take? RoCE_UDP, RoCE_v2 or ... ?


Uncertain as of now.


Ok, but it's imminent, right? What's the preference/guidance?
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 0/7] ARM: shmobile: Add IRQC clock to device tree

2015-04-27 Thread Simon Horman

On Mon, Apr 27, 2015 at 04:15:14PM +0200, Geert Uytterhoeven wrote:
> Hi Simon,
> 
> On Thu, Mar 19, 2015 at 2:43 AM, Simon Horman  wrote:
> > On Wed, Mar 18, 2015 at 07:55:54PM +0100, Geert Uytterhoeven wrote:
> >> This patch series adds the IRQC clock to the device tree on SoCs that
> >> have such a clock (r8a73a4 and r8a779x), and adds minimal runtime PM
> >> support to the renesas-irqc driver, to make sure the clock is enabled
> >> when needed by the external IRQ controller(s).
> >> Before, the clock was assumed enabled by the bootloader or reset state.
> >>
> >> As usual when involving clocks, the DTS changes depend strictly on the
> >> driver changes. Else the clock will be disabled as assumed unused,
> >> breaking the boot.
> >>
> >> This was tested on r8a73a4/ape6evm and r8a7791/koelsch.
> >
> > I will defer the DTS changes until the irqchip changes, which I
> > have reviewed, are accepted.
> 
> The irqchip changes are upstream. Do you want me to resend the DTS
> changes, or will you apply the original patches?

Thanks, I have located the original patches and queued them up for v4.2.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

RE: [RFC PATCH 5/5] GHES: Make NMI handler have a single reader

2015-04-27 Thread Zheng, Lv

Hi,

> From: Borislav Petkov [mailto:b...@alien8.de]
> Sent: Monday, April 27, 2015 4:47 PM
> 
> On Mon, Apr 27, 2015 at 03:16:00AM +, Zheng, Lv wrote:
> > > @@ -840,7 +840,9 @@ static int ghes_notify_nmi(unsigned int cmd, struct 
> > > pt_regs *regs)
> > >   struct ghes *ghes;
> > >   int sev, ret = NMI_DONE;
> > >
> > > - raw_spin_lock(&ghes_nmi_lock);
> > > + if (!atomic_add_unless(&ghes_in_nmi, 1, 1))
> > > + return ret;
> > > +
> >
> > Just a simple question.
> > Why not just using cmpxchg here instead of atomic_add_unless so that no 
> > atomic_dec will be needed.
> 
> What do you think atomic_add_unless ends up doing:
> 
> #APP
> # 177 "./arch/x86/include/asm/atomic.h" 1
>   .pushsection .smp_locks,"a"
> .balign 4
> .long 671f - .
> .popsection
> 671:
>   lock; cmpxchgl %edx,ghes_in_nmi(%rip)   # D.37056, MEM[(volatile u32 
> *)&ghes_in_nmi]
> # 0 "" 2
> #NO_APP
> 
> And you need to atomic_dec() so that another reader can enter, i.e. how
> the exclusion primitive works.
> 
> Or did you have something else in mind?

My mistake.
I mean cmpxchg() and xchg() (or atomic_cmpxchg() and atomic_xchg()) pair here, 
so nothing can be reduced.

But IMO, atomic_add_unless() is implemented via cmpxchg on many architectures.
And it might be better to use it directly here which is a bit faster as you 
actually only need one value switch here.

Thanks and best regards
-Lv


> 
> --
> Regards/Gruss,
> Boris.
> 
> ECO tip #101: Trim your mails when you reply.
> --

BUG: unable to handle kernel paging request at ffffffee

2015-04-27 Thread Fengguang Wu

[   11.352957]  [] SyS_open+0x30/0x50
[   11.352957]  [] syscall_call+0x7/0x7
[   11.352957] Code: 05 80 2f f2 c2 01 83 15 84 2f f2 c2 00 84 c9 89 04 9d 00 
65 5c c2 75 16 a1 20 33 f2 c2 83 05 88 2f f2 c2 01 83 15 8c 2f f2 c2 00 <8b> 50 
04 83 05 90 2f f2 c2 01 8b 9a 04 03 00 00 83 15 94 2f f2
[   11.352957] EIP: [] devpts_new_index+0x54/0x2e0 SS:ESP 
0068:ce293d94
[   11.352957] CR2: ffee
[   11.352957] ---[ end trace fcc960a7acab2024 ]---
[   11.352957] Kernel panic - not syncing: Fatal exception

git bisect start 7445ff424e7ff7121865b523d1588af20a41c207 
39a8804455fb23f09157341d3ba7db6d7ae6ee76 --
git bisect  bad 0ada7e339e8560d78d90ae0b58af8506a3d97c78  # 10:46  0-  
5  Merge 'iwlwifi-next/master' into devel-cairo-smoke-201504200851
git bisect good 7eb17b8fca4c6ee755bc37e41705f7c15f40d73f  # 13:34 20+  
2  0day base guard for 'devel-cairo-smoke-201504200851'
git bisect  bad 5092f1587a049826da2dc128d46d69b0fa74af78  # 14:13  0-  
5  Merge 'pm/bleeding-edge' into devel-cairo-smoke-201504200851
git bisect good 2481bc75283ea10e75d5fb1a8b42af363fc4b45c  # 19:43 20+  
2  Merge tag 'pm+acpi-4.1-rc1' of 
git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm
git bisect good 4e78eb0dbf867ccf206706ff2af34084f71a99bf  # 00:00 20+  
0  Merge tag 'mac80211-next-for-davem-2015-04-10' of 
git://git.kernel.org/pub/scm/linux/kernel/git/jberg/mac80211-next
git bisect good d0a3997c0c3f9351e24029349dee65dd1d9e8d84  # 02:56 20+  
2  Merge tag 'sound-4.1-rc1' of 
git://git.kernel.org/pub/scm/linux/kernel/git/tiwai/sound
git bisect  bad 497a5df7bf6ffd136ae21c49d1a01292930d7ca2  # 03:13  0-  
5  Merge tag 'stable/for-linus-4.1-rc0-tag' of 
git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip
git bisect good ff7a2adac50873aaba71759779505693806adcc1  # 07:30 20+  
0  powerpc: Remove PPC32 code from pseries specific find_and_init_phbs()
git bisect  bad 9f6a240e8b08d3fa711c2b615e7ea901cf59e590  # 07:52  0-  
5  power: wakeup: remove use of seq_printf return value
git bisect good 018e9a49a554d915ba945a5faf34c592d65fe575  # 13:13 20+  
2  mm/compaction.c: fix "suitable_migration_target() unused" warning
git bisect good 946e87981942552e526aca9cb6204f02a6c847cb  # 11:33 20+  
0  paride: fix the "verbose" module param
git bisect  bad d1c1b12137fff14363d0cf45c8b7a9ec5cd4578b  # 12:45  0-  
6  lib/vsprintf.c: another small hack
git bisect  bad 7a54f46b301cfab8a0d7365aa186545f8b98f22e  # 13:14  0-  
5  kernel/reboot.c: add orderly_reboot for graceful reboot
git bisect  bad 96831c0a6738f88f89e7012f4df0a747514af0a0  # 13:41  0-  
5  kernel/resource.c: remove deprecated __check_region() and friends
git bisect  bad 2813893f8b197a14f1e1ddb04d99bce46817c84a  # 14:45  0-  
5  kernel: conditionally support non-root users, groups and capabilities
git bisect good c79574abe2baddf569532e7e430e491dd25c  # 16:30 20+  
2  lib/test-hexdump.c: fix initconst confusion
# first bad commit: [2813893f8b197a14f1e1ddb04d99bce46817c84a] kernel: 
conditionally support non-root users, groups and capabilities
git bisect good c79574abe2baddf569532e7e430e491dd25c  # 17:52 60+  
2  lib/test-hexdump.c: fix initconst confusion
# extra tests with DEBUG_INFO
git bisect good 2813893f8b197a14f1e1ddb04d99bce46817c84a  # 23:49 60+ 
39  kernel: conditionally support non-root users, groups and capabilities
# extra tests on HEAD of linux-devel/devel-cairo-smoke-201504200851
git bisect  bad 7445ff424e7ff7121865b523d1588af20a41c207  # 23:49  0- 
12  0day head guard for 'devel-cairo-smoke-201504200851'
# extra tests on tree/branch linus/master
git bisect  bad b787f68c36d49bb1d9236f403813641efa74a031  # 00:04  0-  
5  Linux 4.1-rc1
# extra tests on tree/branch linus/master
git bisect  bad b787f68c36d49bb1d9236f403813641efa74a031  # 00:04  0-  
5  Linux 4.1-rc1
# extra tests on tree/branch next/master
git bisect  bad b722a93ff1074b1c3f2273c669bd51368aeedf66  # 00:29  0-  
5  Add linux-next specific files for 20150427


This script may reproduce the error.


#!/bin/bash

kernel=$1
initrd=yocto-minimal-i386.cgz

wget --no-clobber 
https://github.com/fengguang/reproduce-kernel-bug/raw/master/initrd/$initrd

kvm=(
qemu-system-x86_64
-enable-kvm
-cpu Haswell,+smep,+smap
-kernel $kernel
-initrd $initrd
-m 256
-smp 1
-device e1000,netdev=net0
-netdev user,id=net0
-boot order=nc
-no-reboot
-watchdog i6300esb
-rtc base=localtime
-serial stdio
-display none
-monitor null 
)

append=(
hung_task_panic=1
earlyprintk=ttyS0,115200
rd.udev.log-priority

Re: [PATCH v6 01/26] IB/Verbs: Implement new callback query_transport()

2015-04-27 Thread Doug Ledford

On Mon, 2015-04-27 at 17:16 -0700, Tom Talpey wrote:
> On 4/27/2015 2:52 PM, ira.weiny wrote:
> > On Mon, Apr 27, 2015 at 09:39:05AM +0200, Michael Wang wrote:
> >>
> >>
> >> On 04/24/2015 05:12 PM, Liran Liss wrote:
>  From: linux-rdma-ow...@vger.kernel.org [mailto:linux-rdma-
> 
> >>> [snip]
>  a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index
>  65994a1..d54f91e 100644
>  --- a/include/rdma/ib_verbs.h
>  +++ b/include/rdma/ib_verbs.h
>  @@ -75,10 +75,13 @@ enum rdma_node_type {  };
> 
>    enum rdma_transport_type {
>  +/* legacy for users */
>   RDMA_TRANSPORT_IB,
>   RDMA_TRANSPORT_IWARP,
>   RDMA_TRANSPORT_USNIC,
>  -RDMA_TRANSPORT_USNIC_UDP
>  +RDMA_TRANSPORT_USNIC_UDP,
>  +/* new transport */
>  +RDMA_TRANSPORT_IBOE,
> >>>
> >>> Remove RDMA_TRANSPORT_IBOE - it is not a transport.
> >>> ROCE uses IBTA transport.
> >>>
> >>> If any code should test for ROCE should invoke a specific helper, e.g., 
> >>> rdma_protocol_iboe().
> >>> This is  what you currently call "rdma_tech_iboe" is patch 02/26.
> >>>
> >>> I think that pretty much everybody agrees that rdma_protocol_*() is a 
> >>> better name than rdma_tech_*(), right?
> >>> So, let's change this.
> >>
> >> Sure, sounds reasonable now, about the IBOE, we still need it to
> >> separate the port support IB/ETH without the check on link-layer,
> >> So what about a new enum on protocol type?
> >>
> >> Like:
> >>
> >> enum rdma_protocol {
> >>RDMA_PROTOCOL_IB,
> >>RDMA_PROTOCOL_IBOE,
> >>RDMA_PROTOCOL_IWARP,
> >>RDMA_PROTOCOL_USNIC_UDP
> >> };
> >>
> >> So we could use query_protocol() to ask device provide the protocol
> >> type, and there will be no mixing with the legacy transport type
> >> anymore :-)
> >
> > I'm ok with that.  I like introducing a unique namespace which is clearly
> > different from the previous "transport" one.
> 
> I agree the word "transport" takes things into the weeds.
> 
> But on the topic of naming protocols, I've been wondering, is there
> some reason that "IBOE" is being used instead of "RoCE"?

Because back in the day, when RoCE was accepted into the kernel, I'm
pretty sure it was prior to the IBTA's final stamp of approval and
before the name was set on RoCE, so IBoE was chosen upstream as the more
"correct" name because it properly denoted what it was deemed to truly
be: IB Verbs over Ethernet.

>  The IBOE
> protocol used to exist and is not the same as the currently
> standardized RoCE, right?

I don't believe so.  To my knowledge, there was never an IBoE except in
linux upstream parlance.

> Also wondering, why add "UDP" to USNIC, is there a different USNIC?

Yes, there are two transports, one a distinct ethertype and one that
encapsulates USNIC in UDP.

> Naming multiple layers together seems confusing and maybe in the end
> will create more code to deal with the differences. For example, what
> token will RoCEv2 take? RoCE_UDP, RoCE_v2 or ... ?

Uncertain as of now.

-- 
Doug Ledford 
  GPG KeyID: 0E572FDD




signature.asc
Description: This is a digitally signed message part

Re: [PATCH 3.12 00/83] 3.12.41-stable review

2015-04-27 Thread Shuah Khan

On 04/27/2015 03:31 PM, Jiri Slaby wrote:
> This is the start of the stable review cycle for the 3.12.41 release.
> There are 83 patches in this series, all will be posted as a response
> to this one.  If anyone has any issues with these being applied, please
> let me know.
> 
> Responses should be made by Wed Apr 29 20:39:50 CEST 2015.
> Anything received after that time might be too late.
> 
> The whole patch series can be found in one patch at:
>   
> http://kernel.org/pub/linux/kernel/people/jirislaby/stable-review/patch-3.12.41-rc1.xz
> and the diffstat can be found below.
> 
> thanks,
> js
> 

Compiled and booted on my test system. No dmesg regressions.

-- Shuah

-- 
Shuah Khan
Sr. Linux Kernel Developer
Open Source Innovation Group
Samsung Research America (Silicon Valley)
shua...@osg.samsung.com | (970) 217-8978
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: Interacting with coherent memory on external devices

2015-04-27 Thread Benjamin Herrenschmidt

On Mon, 2015-04-27 at 11:48 -0500, Christoph Lameter wrote:
> On Mon, 27 Apr 2015, Rik van Riel wrote:
> 
> > Why would we want to avoid the sane approach that makes this thing
> > work with the fewest required changes to core code?
> 
> Because new ZONEs are a pretty invasive change to the memory management and
> because there are  other ways to handle references to device specific
> memory.

ZONEs is just one option we put on the table.

I think we can mostly agree on the fundamentals that a good model of
such a co-processor is a NUMA node, possibly with a higher distance
than other nodes (but even that can be debated).

That gives us a lot of the basics we need such as struct page, ability
to use existing migration infrastructure, and is actually a reasonable
representation at a high level as well.

The question is how do we additionally get the random stuff we don't
care about out of the way. The large distance will not help that much
under memory pressure for example.

Covering the entire device memory with a CMA goes a long way toward that
goal. It will avoid your ordinary kernel allocations.

It also provides just what we need to be able to do large contiguous
"explicit" allocations for use by workloads that don't want the
transparent migration and by the driver for the device which might also
need such special allocations for its own internal management data
structures. 

We still have the risk of pages in the CMA being pinned by something
like gup however, that's where the ZONE idea comes in, to ensure the
various kernel allocators will *never* allocate in that zone unless
explicitly specified, but that could possibly be implemented differently.

Maybe a concept of "exclusive" NUMA node, where allocations never
fall back to that node unless explicitly asked to go there.

Of course that would have an impact on memory pressure calculations,
nothing comes completely for free, but at this stage, this is the goal
of this thread, ie, to swap ideas around and see what's most likely to
work in the long run before we even start implementing something.

Cheers,
Ben.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v6 01/26] IB/Verbs: Implement new callback query_transport()

2015-04-27 Thread Tom Talpey


On 4/27/2015 2:52 PM, ira.weiny wrote:

On Mon, Apr 27, 2015 at 09:39:05AM +0200, Michael Wang wrote:



On 04/24/2015 05:12 PM, Liran Liss wrote:

From: linux-rdma-ow...@vger.kernel.org [mailto:linux-rdma-


[snip]

a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index
65994a1..d54f91e 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -75,10 +75,13 @@ enum rdma_node_type {  };

  enum rdma_transport_type {
+   /* legacy for users */
RDMA_TRANSPORT_IB,
RDMA_TRANSPORT_IWARP,
RDMA_TRANSPORT_USNIC,
-   RDMA_TRANSPORT_USNIC_UDP
+   RDMA_TRANSPORT_USNIC_UDP,
+   /* new transport */
+   RDMA_TRANSPORT_IBOE,


Remove RDMA_TRANSPORT_IBOE - it is not a transport.
ROCE uses IBTA transport.

If any code should test for ROCE should invoke a specific helper, e.g., 
rdma_protocol_iboe().
This is  what you currently call "rdma_tech_iboe" is patch 02/26.

I think that pretty much everybody agrees that rdma_protocol_*() is a better 
name than rdma_tech_*(), right?
So, let's change this.


Sure, sounds reasonable now, about the IBOE, we still need it to
separate the port support IB/ETH without the check on link-layer,
So what about a new enum on protocol type?

Like:

enum rdma_protocol {
RDMA_PROTOCOL_IB,
RDMA_PROTOCOL_IBOE,
RDMA_PROTOCOL_IWARP,
RDMA_PROTOCOL_USNIC_UDP
};

So we could use query_protocol() to ask device provide the protocol
type, and there will be no mixing with the legacy transport type
anymore :-)


I'm ok with that.  I like introducing a unique namespace which is clearly
different from the previous "transport" one.


I agree the word "transport" takes things into the weeds.

But on the topic of naming protocols, I've been wondering, is there
some reason that "IBOE" is being used instead of "RoCE"? The IBOE
protocol used to exist and is not the same as the currently
standardized RoCE, right?

Also wondering, why add "UDP" to USNIC, is there a different USNIC?

Naming multiple layers together seems confusing and maybe in the end
will create more code to deal with the differences. For example, what
token will RoCEv2 take? RoCE_UDP, RoCE_v2 or ... ?
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 2/2] livepatch: x86: make kASLR logic more accurate

2015-04-27 Thread Minfei Huang

On 04/28/15 at 01:29P, Jiri Kosina wrote:
> On Mon, 27 Apr 2015, Minfei Huang wrote:
> 
> > Found that kaslr_enabled only exists for x86. Maybe you can define a 
> > weak function klp_adjustment_function_addr in general. Then each arch 
> > can override the function to implement it specially.
> 
> It might start to make sense once there is at least one additional arch 
> that supports kaslr. Currently, I don't see a benefit.
> 
> Why are you so obstinate about this? I personally don't find that 
> important at all; it's something that can always be sorted out once more 
> archs start supporting kaslr.
> 

Ohhh... Previously, IMO, putting the relevant function address adjustment
into the specific arch code was clearer to review and understand.

Now that I know what you actually want according to the above commit, I am fine
with it.

Thanks
Minfei

> Thanks,
> 
> -- 
> Jiri Kosina
> SUSE Labs
> --
> To unsubscribe from this list: send the line "unsubscribe live-patching" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH net 1/1] hv_netvsc: Fix a bug in netvsc_start_xmit()

2015-04-27 Thread K. Y. Srinivasan

Commit b08cc79155fc26d0d112b1470d1ece5034651a4b eliminated memory
allocation in the packet send path. This commit introduced a bug since it
did not account for the case if the skb was cloned. Fix this bug by
using the pre-reserved head room only if the skb is not cloned.

Signed-off-by: K. Y. Srinivasan 
---
 drivers/net/hyperv/netvsc_drv.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index a3a9d38..7eb0251 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -421,7 +421,7 @@ check_size:
 
pkt_sz = sizeof(struct hv_netvsc_packet) + RNDIS_AND_PPI_SIZE;
 
-   if (head_room < pkt_sz) {
+   if (skb->cloned ||  head_room < pkt_sz) {
packet = kmalloc(pkt_sz, GFP_ATOMIC);
if (!packet) {
/* out of memory, drop packet */
-- 
1.7.4.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH] MIPS: R6: memcpy bugfix - zero length overwrites memory

2015-04-27 Thread Leonid Yegoshin

The MIPS R6 version of memcpy has a bug: when the length to copy is zero
and the addresses are not aligned, it can overwrite the whole memory.

Signed-off-by: Leonid Yegoshin 
---
 arch/mips/lib/memcpy.S |2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/mips/lib/memcpy.S b/arch/mips/lib/memcpy.S
index 9245e1705e69..7e0250f3aec8 100644
--- a/arch/mips/lib/memcpy.S
+++ b/arch/mips/lib/memcpy.S
@@ -514,6 +514,8 @@
 
 #ifdef CONFIG_CPU_MIPSR6
 .Lcopy_unaligned_bytes\@:
+   beqz len, .Ldone\@
+nop
 1:
COPY_BYTE(0)
COPY_BYTE(1)

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCHv5 00/28] THP refcounting redesign

2015-04-27 Thread Kirill A. Shutemov

On Mon, Apr 27, 2015 at 04:03:48PM -0700, Andrew Morton wrote:
> On Fri, 24 Apr 2015 00:03:35 +0300 "Kirill A. Shutemov" 
>  wrote:
> 
> > Hello everybody,
> > 
> > Here's reworked version of my patchset. All known issues were addressed.
> > 
> > The goal of patchset is to make refcounting on THP pages cheaper with
> > simpler semantics and allow the same THP compound page to be mapped with
> > PMD and PTEs. This is required to get reasonable THP-pagecache
> > implementation.
> 
> Are there any measurable performance improvements?

I was focused on stability up to this point. I'll bring some numbers.

> > With the new refcounting design it's much easier to protect against
> > split_huge_page(): simple reference on a page will make you the deal.
> > It makes gup_fast() implementation simpler and doesn't require
> > special-case in futex code to handle tail THP pages.
> > 
> > It should improve THP utilization over the system since splitting THP in
> > one process doesn't necessary lead to splitting the page in all other
> > processes have the page mapped.
> > 
> > The patchset drastically lower complexity of get_page()/put_page()
> > codepaths. I encourage reviewers look on this code before-and-after to
> > justify time budget on reviewing this patchset.
> >
> > ...
> >
> >  59 files changed, 1144 insertions(+), 1509 deletions(-)
> 
> It's huge.  I'm going to need help reviewing this.  Have earlier
> versions been reviewed much?

The most helpful was feedback from Aneesh for v4. Hugh pointed out a few weak
parts. But I can't say that the patchset was reviewed much.

Sasha helped with testing. A few bugs he found were fixed while preparing v5
for posting. One more issue was pointed out after posting the patchset. I am
working on it now.

> Who do you believe are suitable reviewers?

Andrea is an obvious candidate. Hugh looked recently into the same area with
his team pages idea.

In general, I tried to keep people who can be helpful with review or
testing on CC list.

-- 
 Kirill A. Shutemov
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 2/2] livepatch: x86: make kASLR logic more accurate

2015-04-27 Thread Jiri Kosina

On Mon, 27 Apr 2015, Minfei Huang wrote:

> Found that kaslr_enabled only exists for x86. Maybe you can define a 
> weak function klp_adjustment_function_addr in general. Then each arch 
> can override the function to implement it specially.

It might start to make sense once there is at least one additional arch 
that supports kaslr. Currently, I don't see a benefit.

Why are you so obstinate about this? I personally don't find that 
important at all; it's something that can always be sorted out once more 
archs start supporting kaslr.

Thanks,

-- 
Jiri Kosina
SUSE Labs
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 1/3] mtd: nand: Add on-die ECC support

2015-04-27 Thread Brian Norris

On Tue, Apr 28, 2015 at 12:19:16AM +0200, Richard Weinberger wrote:
> Am 27.04.2015 um 23:35 schrieb Ben Shelton:
> > I tested this against the latest version of the PL353 NAND driver that 
> > Punnaiah
> > has been working to upstream (copying her on this message).  With a few 
> > changes
> > to that driver, I got it most of the way through initialization with on-die 
> > ECC
> > enabled, but it segfaults here with a null pointer dereference because the
> > PL353 driver does not implement chip->cmd_ctrl.  Instead, it implements a
> > custom override of cmd->cmdfunc that does not call cmd_ctrl.  Looking 
> > through
> > the other in-tree NAND drivers, it looks like most of them do implement
> > cmd_ctrl, but quite a few of them do not (e.g. au1550nd, denali, docg4).
> > 
> > What do you think would be the best way to handle this?  It seems like this 
> > gap
> > could be bridged from either side -- either the PL353 driver could implement
> > cmd_ctrl, at least as a stub version that provides the expected behavior in
> > this case; or the on-die framework could break this out into a callback
> > function with a default implementation that the driver could override to
> > perform this behavior in the manner of its choosing.
> 
> Oh, I thought every driver has to implement that function. ;-\

Nope.

> But you're right there is a corner case.

And it's not the only one! Right now, there's no guarantee even that
read_buf() returns raw data, unmodified by the SoC's controller. Plenty
of drivers actually have HW-enabled ECC turned on by default, and so
they override the chip->ecc.read_page() (and sometimes
chip->ecc.read_page_raw() functions, if we're lucky) with something
that pokes the appropriate hardware instead. I expect anything
comprehensive here is probably going to have to utilize
chip->ecc.read_page_raw(), at least if it's provided by the hardware
driver.

> What we could do is just using chip->cmdfunc() with a custom NAND command.
> i.e. chip->cmdfunc(mtd, NAND_CMD_READMODE, -1, -1);
> 
> Gerhard Sittig tried to introduce such a command some time ago:
> http://lists.infradead.org/pipermail/linux-mtd/2014-April/053115.html

Yikes! Please no! It's bad enough to have a ton of drivers doing
switch/case on a bunch of real, somewhat well-known opcodes, but to add
new fake ones? I'd rather not. We're inflicting ourselves with a
kernel-internal version of ioctl(). What's the justification, again? I
don't really remember the context of Gerhard's previous patch.

Brian
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 1/3] mtd: nand: Add on-die ECC support

2015-04-27 Thread Brian Norris

On Mon, Apr 27, 2015 at 4:15 PM, Richard Weinberger  wrote:
> Am 28.04.2015 um 01:10 schrieb Brian Norris:
>> On Mon, Apr 27, 2015 at 3:57 PM, Richard Weinberger  wrote:
>>> Am 28.04.2015 um 00:53 schrieb Brian Norris:
 On Tue, Apr 28, 2015 at 12:42:18AM +0200, Richard Weinberger wrote:
> Am 28.04.2015 um 00:36 schrieb Ben Shelton:
 When I build this without CONFIG_MTD_NAND_ECC_ON_DIE enabled, I get the
 following warning here:

 In file included from drivers/mtd/nand/nand_base.c:46:0:
 include/linux/mtd/nand_ondie.h: In function 'nand_read_subpage_on_die':
 include/linux/mtd/nand_ondie.h:28:1: warning: no return statement in 
 function returning non-void [-Wreturn-type]
 include/linux/mtd/nand_ondie.h: In function 'nand_read_page_on_die':
 include/linux/mtd/nand_ondie.h:34:1: warning: no return statement in 
 function returning non-void [-Wreturn-type]

 Perhaps return an error code here, even though you'll never get past 
 the BUG()?
>>>
>>> What gcc is this?
>>> gcc 4.8 here does not warn, I thought it is smart enough that this 
>>> function does never
>>> return. Can it be that your .config has CONFIG_BUG=n?
>>> Anyway, this functions clearly needs a return statement. :)
>>
>> gcc 4.7.2, and you are correct that I had CONFIG_BUG off.  :)
>
> Yeah, just noticed that BUG() with CONFIG_BUG=n does not have
> a nonreturn attribute. So, gcc cannot know...

 But it's an obvious infinite loop... all of my toolchains (4.2, 4.5,
 4.6, 4.8) are able to compile this without complaining (gcc -Wall):

 int test() { do { } while (1); }
>>>
>>> Not here. gcc 4.8 warns on that.
>>> As soon I add __attribute__ ((noreturn)) it does not longer complain.
>>
>> Huh? Maybe I have a crazy modified gcc.
>>
>> $ gcc --version
>> gcc (Ubuntu 4.8.2-19ubuntu1) 4.8.2
>> Copyright (C) 2013 Free Software Foundation, Inc.
>> This is free software; see the source for copying conditions.  There is NO
>> warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
>>
>> $ gcc -Wall -Wextra -c a.c
>> $ cat a.c
>> int test() { do {} while (1); }
>
> Make test static and gcc will warn.

Hmm. That's a strange distinction for gcc to make. Maybe because of
the potential for inlining? Still seems odd.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 1/3] mtd: nand: Add on-die ECC support

2015-04-27 Thread Richard Weinberger

Am 28.04.2015 um 01:10 schrieb Brian Norris:
> On Mon, Apr 27, 2015 at 3:57 PM, Richard Weinberger  wrote:
>> Am 28.04.2015 um 00:53 schrieb Brian Norris:
>>> On Tue, Apr 28, 2015 at 12:42:18AM +0200, Richard Weinberger wrote:
 Am 28.04.2015 um 00:36 schrieb Ben Shelton:
>>> When I build this without CONFIG_MTD_NAND_ECC_ON_DIE enabled, I get the
>>> following warning here:
>>>
>>> In file included from drivers/mtd/nand/nand_base.c:46:0:
>>> include/linux/mtd/nand_ondie.h: In function 'nand_read_subpage_on_die':
>>> include/linux/mtd/nand_ondie.h:28:1: warning: no return statement in 
>>> function returning non-void [-Wreturn-type]
>>> include/linux/mtd/nand_ondie.h: In function 'nand_read_page_on_die':
>>> include/linux/mtd/nand_ondie.h:34:1: warning: no return statement in 
>>> function returning non-void [-Wreturn-type]
>>>
>>> Perhaps return an error code here, even though you'll never get past 
>>> the BUG()?
>>
>> What gcc is this?
>> gcc 4.8 here does not warn, I thought it is smart enough that this 
>> function does never
>> return. Can it be that your .config has CONFIG_BUG=n?
>> Anyway, this functions clearly needs a return statement. :)
>
> gcc 4.7.2, and you are correct that I had CONFIG_BUG off.  :)

 Yeah, just noticed that BUG() with CONFIG_BUG=n does not have
 a nonreturn attribute. So, gcc cannot know...
>>>
>>> But it's an obvious infinite loop... all of my toolchains (4.2, 4.5,
>>> 4.6, 4.8) are able to compile this without complaining (gcc -Wall):
>>>
>>> int test() { do { } while (1); }
>>
>> Not here. gcc 4.8 warns on that.
>> As soon I add __attribute__ ((noreturn)) it does not longer complain.
> 
> Huh? Maybe I have a crazy modified gcc.
> 
> $ gcc --version
> gcc (Ubuntu 4.8.2-19ubuntu1) 4.8.2
> Copyright (C) 2013 Free Software Foundation, Inc.
> This is free software; see the source for copying conditions.  There is NO
> warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
> 
> $ gcc -Wall -Wextra -c a.c
> $ cat a.c
> int test() { do {} while (1); }

Make test static and gcc will warn.

Thanks,
//richard
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 1/3] mtd: nand: Add on-die ECC support

2015-04-27 Thread Brian Norris

On Mon, Apr 27, 2015 at 3:57 PM, Richard Weinberger  wrote:
> Am 28.04.2015 um 00:53 schrieb Brian Norris:
>> On Tue, Apr 28, 2015 at 12:42:18AM +0200, Richard Weinberger wrote:
>>> Am 28.04.2015 um 00:36 schrieb Ben Shelton:
>> When I build this without CONFIG_MTD_NAND_ECC_ON_DIE enabled, I get the
>> following warning here:
>>
>> In file included from drivers/mtd/nand/nand_base.c:46:0:
>> include/linux/mtd/nand_ondie.h: In function 'nand_read_subpage_on_die':
>> include/linux/mtd/nand_ondie.h:28:1: warning: no return statement in 
>> function returning non-void [-Wreturn-type]
>> include/linux/mtd/nand_ondie.h: In function 'nand_read_page_on_die':
>> include/linux/mtd/nand_ondie.h:34:1: warning: no return statement in 
>> function returning non-void [-Wreturn-type]
>>
>> Perhaps return an error code here, even though you'll never get past the 
>> BUG()?
>
> What gcc is this?
> gcc 4.8 here does not warn, I thought it is smart enough that this 
> function does never
> return. Can it be that your .config has CONFIG_BUG=n?
> Anyway, this functions clearly needs a return statement. :)

 gcc 4.7.2, and you are correct that I had CONFIG_BUG off.  :)
>>>
>>> Yeah, just noticed that BUG() with CONFIG_BUG=n does not have
>>> a nonreturn attribute. So, gcc cannot know...
>>
>> But it's an obvious infinite loop... all of my toolchains (4.2, 4.5,
>> 4.6, 4.8) are able to compile this without complaining (gcc -Wall):
>>
>> int test() { do { } while (1); }
>
> Not here. gcc 4.8 warns on that.
> As soon I add __attribute__ ((noreturn)) it does not longer complain.

Huh? Maybe I have a crazy modified gcc.

$ gcc --version
gcc (Ubuntu 4.8.2-19ubuntu1) 4.8.2
Copyright (C) 2013 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

$ gcc -Wall -Wextra -c a.c
$ cat a.c
int test() { do {} while (1); }

But:

$ gcc -Wall -Wextra -c b.c
b.c: In function ‘test’:
b.c:1:1: warning: control reaches end of non-void function [-Wreturn-type]
 int test() { }
 ^
$ cat b.c
int test() { }
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCHv5 00/28] THP refcounting redesign

2015-04-27 Thread Andrew Morton

On Fri, 24 Apr 2015 00:03:35 +0300 "Kirill A. Shutemov" 
 wrote:

> Hello everybody,
> 
> Here's reworked version of my patchset. All known issues were addressed.
> 
> The goal of patchset is to make refcounting on THP pages cheaper with
> simpler semantics and allow the same THP compound page to be mapped with
> PMD and PTEs. This is required to get reasonable THP-pagecache
> implementation.

Are there any measurable performance improvements?

> With the new refcounting design it's much easier to protect against
> split_huge_page(): simple reference on a page will make you the deal.
> It makes gup_fast() implementation simpler and doesn't require
> special-case in futex code to handle tail THP pages.
> 
> It should improve THP utilization over the system since splitting THP in
> one process doesn't necessary lead to splitting the page in all other
> processes have the page mapped.
> 
> The patchset drastically lower complexity of get_page()/put_page()
> codepaths. I encourage reviewers look on this code before-and-after to
> justify time budget on reviewing this patchset.
>
> ...
>
>  59 files changed, 1144 insertions(+), 1509 deletions(-)

It's huge.  I'm going to need help reviewing this.  Have earlier
versions been reviewed much?  Who do you believe are suitable
reviewers?
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: mm: compaction: BUG in isolate_migratepages_block()

2015-04-27 Thread Kirill A. Shutemov

On Mon, Apr 27, 2015 at 06:34:59PM -0400, Sasha Levin wrote:
> Hi all,
> 
> While fuzzing with trinity inside a KVM tools guest running the latest -next
> kernel I've stumbled on the following spew:
> 
> [ 4249.344788] kernel BUG at include/linux/page-flags.h:575!

This should help: https://lkml.org/lkml/2015/4/27/218

-- 
 Kirill A. Shutemov
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 1/3] mtd: nand: Add on-die ECC support

2015-04-27 Thread Richard Weinberger

Am 28.04.2015 um 00:53 schrieb Brian Norris:
> On Tue, Apr 28, 2015 at 12:42:18AM +0200, Richard Weinberger wrote:
>> Am 28.04.2015 um 00:36 schrieb Ben Shelton:
> When I build this without CONFIG_MTD_NAND_ECC_ON_DIE enabled, I get the
> following warning here:
>
> In file included from drivers/mtd/nand/nand_base.c:46:0:
> include/linux/mtd/nand_ondie.h: In function 'nand_read_subpage_on_die':
> include/linux/mtd/nand_ondie.h:28:1: warning: no return statement in 
> function returning non-void [-Wreturn-type]
> include/linux/mtd/nand_ondie.h: In function 'nand_read_page_on_die':
> include/linux/mtd/nand_ondie.h:34:1: warning: no return statement in 
> function returning non-void [-Wreturn-type]
>
> Perhaps return an error code here, even though you'll never get past the 
> BUG()?

 What gcc is this?
 gcc 4.8 here does not warn, I thought it is smart enough that this 
 function does never
 return. Can it be that your .config has CONFIG_BUG=n?
 Anyway, this functions clearly needs a return statement. :)
>>>
>>> gcc 4.7.2, and you are correct that I had CONFIG_BUG off.  :)
>>
>> Yeah, just noticed that BUG() with CONFIG_BUG=n does not have
>> a nonreturn attribute. So, gcc cannot know...
> 
> But it's an obvious infinite loop... all of my toolchains (4.2, 4.5,
> 4.6, 4.8) are able to compile this without complaining (gcc -Wall):
> 
> int test() { do { } while (1); }

Not here. gcc 4.8 warns on that.
As soon as I add __attribute__ ((noreturn)) it no longer complains.

Thanks,
//richard
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

1 2 3 4 5 6 7 8 9 >

1 - 100 of 803 matches

Mail list logo