Re: [PATCH] devfreq: add error check for sscanf in userspace governor

2017-08-07 Thread Pavan Kondeti
Hi Santosh,

On Mon, Aug 7, 2017 at 6:36 PM, Santosh Mardi  wrote:
> The store_freq function of the devfreq userspace governor
> continues executing even if sscanf returns an error,
> which results in setting up a wrong frequency value.
>
> Add proper error check to bail out if any error is returned.
>
> Signed-off-by: Santosh Mardi 
> ---
>  drivers/devfreq/governor_userspace.c | 5 -
>  1 file changed, 4 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/devfreq/governor_userspace.c 
> b/drivers/devfreq/governor_userspace.c
> index 77028c2..1d0c9cc 100644
> --- a/drivers/devfreq/governor_userspace.c
> +++ b/drivers/devfreq/governor_userspace.c
> @@ -53,12 +53,15 @@ static ssize_t store_freq(struct device *dev, struct 
> device_attribute *attr,
> mutex_lock(&devfreq->lock);
> data = devfreq->data;
>
> -   sscanf(buf, "%lu", &wanted);
> +   err = sscanf(buf, "%lu", &wanted);
> +   if (err != 1)
> +   goto out;

You can avoid this goto statement by moving the sscanf check to before
taking the mutex.
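
For instance (an untested sketch based on the function shown in the diff;
returning -EINVAL on a parse failure is my assumption, not part of the
posted patch):

static ssize_t store_freq(struct device *dev, struct device_attribute *attr,
			  const char *buf, size_t count)
{
	struct devfreq *devfreq = to_devfreq(dev);
	struct userspace_data *data;
	unsigned long wanted;
	int err;

	/* Parse before taking the lock: no unlock/goto needed on error. */
	if (sscanf(buf, "%lu", &wanted) != 1)
		return -EINVAL;		/* assumed error code */

	mutex_lock(&devfreq->lock);
	data = devfreq->data;
	data->user_frequency = wanted;
	data->valid = true;
	err = update_devfreq(devfreq);
	if (err == 0)
		err = count;
	mutex_unlock(&devfreq->lock);
	return err;
}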

> data->user_frequency = wanted;
> data->valid = true;
> err = update_devfreq(devfreq);
> if (err == 0)
> err = count;
> +out:
> mutex_unlock(&devfreq->lock);
> return err;
>  }
> --
> 1.9.1
>


-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center, Inc.
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, a
Linux Foundation Collaborative Project


Re: [MD] Crash with 4.12+ kernel and high disk load -- bisected to 4ad23a976413: MD: use per-cpu counter for writes_pending

2017-08-07 Thread David R


Quoting Shaohua Li :


I spent some time checking this one; unfortunately I can't find how that patch
makes RCU stall. The percpu part looks good to me too. Can you double check
that reverting 4ad23a976413aa57 makes the issue go away? When the RCU stall
happens, what is /sys/block/md/md0/array_state? Please also attach
/proc/mdstat. When you say the mdX_raid1 threads are in 'R' state, can you
double check whether /proc/<pid>/stack is always 0xff?

Thanks,
Shaohua


I confess to knowing absolutely nothing about the md code, so please
don't be too hard on me. However:

static bool set_in_sync(struct mddev *mddev)
{
	WARN_ON_ONCE(!spin_is_locked(&mddev->lock));
	if (!mddev->in_sync) {
		mddev->sync_checkers++;
		spin_unlock(&mddev->lock);
		percpu_ref_switch_to_atomic_sync(&mddev->writes_pending);
		spin_lock(&mddev->lock);
		if (!mddev->in_sync &&
		    percpu_ref_is_zero(&mddev->writes_pending)) {
			mddev->in_sync = 1;
			/*
			 * Ensure ->in_sync is visible before we clear
			 * ->sync_checkers.
			 */
			smp_mb();
			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
			sysfs_notify_dirent_safe(mddev->sysfs_state);
		}
		if (--mddev->sync_checkers == 0)
			percpu_ref_switch_to_percpu(&mddev->writes_pending);


The switch_to_percpu() takes place under mddev->lock; however,
switch_to_atomic_sync() does not. A thread can be in the middle of (or
about to execute) switch_to_atomic_sync() at the same time as another
is calling switch_to_percpu(). Surely this can't be correct?


Cheers
David



[PATCH v1 4/6] mm:swap: use on-stack-bio for BDI_CAP_SYNC devices

2017-08-07 Thread Minchan Kim
There is no need to use dynamic bio allocation for BDI_CAP_SYNC
devices. They can live with an on-stack bio without worrying about
waiting for bio allocation from the mempool under heavy memory pressure.

This is much better for swap devices because the bio mempool for swap
IO has been shared with the fs. It means super-fast swap IO like zram
doesn't need to depend on slow eMMC read/write completion.

Signed-off-by: Minchan Kim 
---
 include/linux/swap.h |   3 +-
 mm/page_io.c | 123 +++
 mm/swapfile.c|   3 ++
 3 files changed, 89 insertions(+), 40 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index ae3da979a7b7..6ed9b6423f7d 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -152,8 +152,9 @@ enum {
SWP_AREA_DISCARD = (1 << 8),/* single-time swap area discards */
SWP_PAGE_DISCARD = (1 << 9),/* freed swap page-cluster discards */
SWP_STABLE_WRITES = (1 << 10),  /* no overwrite PG_writeback pages */
+   SWP_SYNC_IO = (1<<11),  /* synchronous IO is efficient */
/* add others here before... */
-   SWP_SCANNING= (1 << 11),/* refcount in scan_swap_map */
+   SWP_SCANNING= (1 << 12),/* refcount in scan_swap_map */
 };
 
 #define SWAP_CLUSTER_MAX 32UL
diff --git a/mm/page_io.c b/mm/page_io.c
index 3502a97f7c48..d794fd810773 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -44,7 +44,7 @@ static struct bio *get_swap_bio(gfp_t gfp_flags,
return bio;
 }
 
-void end_swap_bio_write(struct bio *bio)
+void end_swap_bio_write_simple(struct bio *bio)
 {
struct page *page = bio->bi_io_vec[0].bv_page;
 
@@ -66,6 +66,11 @@ void end_swap_bio_write(struct bio *bio)
ClearPageReclaim(page);
}
end_page_writeback(page);
+}
+
+void end_swap_bio_write(struct bio *bio)
+{
+   end_swap_bio_write_simple(bio);
bio_put(bio);
 }
 
@@ -117,10 +122,9 @@ static void swap_slot_free_notify(struct page *page)
}
 }
 
-static void end_swap_bio_read(struct bio *bio)
+static void end_swap_bio_read_simple(struct bio *bio)
 {
struct page *page = bio->bi_io_vec[0].bv_page;
-   struct task_struct *waiter = bio->bi_private;
 
if (bio->bi_status) {
SetPageError(page);
@@ -136,6 +140,13 @@ static void end_swap_bio_read(struct bio *bio)
swap_slot_free_notify(page);
 out:
unlock_page(page);
+}
+
+static void end_swap_bio_read(struct bio *bio)
+{
+   struct task_struct *waiter = bio->bi_private;
+
+   end_swap_bio_read_simple(bio);
WRITE_ONCE(bio->bi_private, NULL);
bio_put(bio);
wake_up_process(waiter);
@@ -275,7 +286,6 @@ static inline void count_swpout_vm_event(struct page *page)
 
 int __swap_writepage(struct page *page, struct writeback_control *wbc)
 {
-   struct bio *bio;
int ret;
struct swap_info_struct *sis = page_swap_info(page);
 
@@ -328,25 +338,43 @@ int __swap_writepage(struct page *page, struct 
writeback_control *wbc)
}
 
ret = 0;
-   bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write);
-   if (bio == NULL) {
-   set_page_dirty(page);
+   if (!(sis->flags & SWP_SYNC_IO)) {
+   struct bio *bio;
+
+   bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write);
+   if (bio == NULL) {
+   set_page_dirty(page);
+   unlock_page(page);
+   ret = -ENOMEM;
+   goto out;
+   }
+   bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
+   set_page_writeback(page);
unlock_page(page);
-   ret = -ENOMEM;
-   goto out;
+   submit_bio(bio);
+   } else {
+
+   /* on-stack-bio */
+   struct bio sbio;
+   struct bio_vec bvec;
+
+   bio_init(&sbio, &bvec, 1);
+   sbio.bi_bdev = sis->bdev;
+   sbio.bi_iter.bi_sector = swap_page_sector(page);
+   sbio.bi_end_io = end_swap_bio_write_simple;
+   bio_add_page(&sbio, page, PAGE_SIZE, 0);
+   bio_set_op_attrs(&sbio, REQ_OP_WRITE, wbc_to_write_flags(wbc));
+   set_page_writeback(page);
+   unlock_page(page);
+   submit_bio(&sbio);
}
-   bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
count_swpout_vm_event(page);
-   set_page_writeback(page);
-   unlock_page(page);
-   submit_bio(bio);
 out:
return ret;
 }
 
 int swap_readpage(struct page *page, bool do_poll)
 {
-   struct bio *bio;
int ret = 0;
struct swap_info_struct *sis = page_swap_info(page);
blk_qc_t qc;
@@ -383,33 +411,50 @@ int swap_readpage(struct page *page, bool do_poll)
}
 
ret = 0;
-   bio = get_swap_bio(GFP_KERNEL, page, end

[PATCH v1 2/6] fs: use on-stack-bio if backing device has BDI_CAP_SYNC capability

2017-08-07 Thread Minchan Kim
There is no need to use dynamic bio allocation for BDI_CAP_SYNC
devices. They can live with an on-stack bio without worrying about
waiting for bio allocation from the mempool under heavy memory pressure.

Signed-off-by: Minchan Kim 
---
 fs/mpage.c | 43 +++
 1 file changed, 43 insertions(+)

diff --git a/fs/mpage.c b/fs/mpage.c
index 2e4c41ccb5c9..eaeaef27d693 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -31,6 +31,14 @@
 #include 
 #include "internal.h"
 
+static void on_stack_page_end_io(struct bio *bio)
+{
+   struct page *page = bio->bi_io_vec->bv_page;
+
+   page_endio(page, op_is_write(bio_op(bio)),
+   blk_status_to_errno(bio->bi_status));
+}
+
 /*
  * I/O completion handler for multipage BIOs.
  *
@@ -278,6 +286,22 @@ do_mpage_readpage(struct bio *bio, struct page *page, 
unsigned nr_pages,
 alloc_new:
if (bio == NULL) {
if (first_hole == blocks_per_page) {
+   if (bdi_cap_synchronous_io(inode_to_bdi(inode))) {
+   /* on-stack-bio */
+   struct bio sbio;
+   struct bio_vec bvec;
+
+   bio_init(&sbio, &bvec, 1);
+   sbio.bi_bdev = bdev;
+   sbio.bi_iter.bi_sector =
+   blocks[0] << (blkbits - 9);
+   sbio.bi_end_io = on_stack_page_end_io;
+   bio_add_page(&sbio, page, PAGE_SIZE, 0);
+   bio_set_op_attrs(&sbio, REQ_OP_READ, 0);
+   submit_bio(&sbio);
+   goto out;
+   }
+
if (!bdev_read_page(bdev, blocks[0] << (blkbits - 9),
page))
goto out;
@@ -604,6 +628,25 @@ static int __mpage_writepage(struct page *page, struct 
writeback_control *wbc,
 alloc_new:
if (bio == NULL) {
if (first_unmapped == blocks_per_page) {
+   if (bdi_cap_synchronous_io(inode_to_bdi(inode))) {
+   /* on-stack-bio */
+   struct bio sbio;
+   struct bio_vec bvec;
+
+   bio_init(&sbio, &bvec, 1);
+   sbio.bi_bdev = bdev;
+   sbio.bi_iter.bi_sector =
+   blocks[0] << (blkbits - 9);
+   sbio.bi_end_io = on_stack_page_end_io;
+   bio_add_page(&sbio, page, PAGE_SIZE, 0);
+   bio_set_op_attrs(&sbio, REQ_OP_WRITE, op_flags);
+   WARN_ON_ONCE(PageWriteback(page));
+   set_page_writeback(page);
+   unlock_page(page);
+   submit_bio(&sbio);
+   clean_buffers(page, first_unmapped);
+   }
+
if (!bdev_write_page(bdev, blocks[0] << (blkbits - 9),
page, wbc)) {
clean_buffers(page, first_unmapped);
-- 
2.7.4



[PATCH v1 3/6] mm:swap: remove end_swap_bio_write argument

2017-08-07 Thread Minchan Kim
Every caller of __swap_writepage uses end_swap_bio_write as the
end_write_func argument, so the argument is pointless.
Remove it.

Signed-off-by: Minchan Kim 
---
 include/linux/swap.h | 3 +--
 mm/page_io.c | 7 +++
 mm/zswap.c   | 2 +-
 3 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 76f1632eea5a..ae3da979a7b7 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -336,8 +336,7 @@ extern void kswapd_stop(int nid);
 extern int swap_readpage(struct page *page, bool do_poll);
 extern int swap_writepage(struct page *page, struct writeback_control *wbc);
 extern void end_swap_bio_write(struct bio *bio);
-extern int __swap_writepage(struct page *page, struct writeback_control *wbc,
-   bio_end_io_t end_write_func);
+extern int __swap_writepage(struct page *page, struct writeback_control *wbc);
 extern int swap_set_page_dirty(struct page *page);
 
 int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
diff --git a/mm/page_io.c b/mm/page_io.c
index 20139b90125a..3502a97f7c48 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -254,7 +254,7 @@ int swap_writepage(struct page *page, struct 
writeback_control *wbc)
end_page_writeback(page);
goto out;
}
-   ret = __swap_writepage(page, wbc, end_swap_bio_write);
+   ret = __swap_writepage(page, wbc);
 out:
return ret;
 }
@@ -273,8 +273,7 @@ static inline void count_swpout_vm_event(struct page *page)
count_vm_events(PSWPOUT, hpage_nr_pages(page));
 }
 
-int __swap_writepage(struct page *page, struct writeback_control *wbc,
-   bio_end_io_t end_write_func)
+int __swap_writepage(struct page *page, struct writeback_control *wbc)
 {
struct bio *bio;
int ret;
@@ -329,7 +328,7 @@ int __swap_writepage(struct page *page, struct 
writeback_control *wbc,
}
 
ret = 0;
-   bio = get_swap_bio(GFP_NOIO, page, end_write_func);
+   bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write);
if (bio == NULL) {
set_page_dirty(page);
unlock_page(page);
diff --git a/mm/zswap.c b/mm/zswap.c
index d39581a076c3..38db258515b5 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -900,7 +900,7 @@ static int zswap_writeback_entry(struct zpool *pool, 
unsigned long handle)
SetPageReclaim(page);
 
/* start writeback */
-   __swap_writepage(page, &wbc, end_swap_bio_write);
+   __swap_writepage(page, &wbc);
put_page(page);
zswap_written_back_pages++;
 
-- 
2.7.4



[PATCH v1 6/6] fs: remove rw_page

2017-08-07 Thread Minchan Kim
Currently, there is no user of rw_page so remove it.

Signed-off-by: Minchan Kim 
---
 fs/block_dev.c | 76 --
 fs/mpage.c | 12 ++--
 include/linux/blkdev.h |  4 ---
 mm/page_io.c   | 17 ---
 4 files changed, 2 insertions(+), 107 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 9941dc8342df..6fb408041e7d 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -649,82 +649,6 @@ int blkdev_fsync(struct file *filp, loff_t start, loff_t 
end, int datasync)
 }
 EXPORT_SYMBOL(blkdev_fsync);
 
-/**
- * bdev_read_page() - Start reading a page from a block device
- * @bdev: The device to read the page from
- * @sector: The offset on the device to read the page to (need not be aligned)
- * @page: The page to read
- *
- * On entry, the page should be locked.  It will be unlocked when the page
- * has been read.  If the block driver implements rw_page synchronously,
- * that will be true on exit from this function, but it need not be.
- *
- * Errors returned by this function are usually "soft", eg out of memory, or
- * queue full; callers should try a different route to read this page rather
- * than propagate an error back up the stack.
- *
- * Return: negative errno if an error occurs, 0 if submission was successful.
- */
-int bdev_read_page(struct block_device *bdev, sector_t sector,
-   struct page *page)
-{
-   const struct block_device_operations *ops = bdev->bd_disk->fops;
-   int result = -EOPNOTSUPP;
-
-   if (!ops->rw_page || bdev_get_integrity(bdev))
-   return result;
-
-   result = blk_queue_enter(bdev->bd_queue, false);
-   if (result)
-   return result;
-   result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, false);
-   blk_queue_exit(bdev->bd_queue);
-   return result;
-}
-EXPORT_SYMBOL_GPL(bdev_read_page);
-
-/**
- * bdev_write_page() - Start writing a page to a block device
- * @bdev: The device to write the page to
- * @sector: The offset on the device to write the page to (need not be aligned)
- * @page: The page to write
- * @wbc: The writeback_control for the write
- *
- * On entry, the page should be locked and not currently under writeback.
- * On exit, if the write started successfully, the page will be unlocked and
- * under writeback.  If the write failed already (eg the driver failed to
- * queue the page to the device), the page will still be locked.  If the
- * caller is a ->writepage implementation, it will need to unlock the page.
- *
- * Errors returned by this function are usually "soft", eg out of memory, or
- * queue full; callers should try a different route to write this page rather
- * than propagate an error back up the stack.
- *
- * Return: negative errno if an error occurs, 0 if submission was successful.
- */
-int bdev_write_page(struct block_device *bdev, sector_t sector,
-   struct page *page, struct writeback_control *wbc)
-{
-   int result;
-   const struct block_device_operations *ops = bdev->bd_disk->fops;
-
-   if (!ops->rw_page || bdev_get_integrity(bdev))
-   return -EOPNOTSUPP;
-   result = blk_queue_enter(bdev->bd_queue, false);
-   if (result)
-   return result;
-
-   set_page_writeback(page);
-   result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, true);
-   if (result)
-   end_page_writeback(page);
-   else
-   unlock_page(page);
-   blk_queue_exit(bdev->bd_queue);
-   return result;
-}
-EXPORT_SYMBOL_GPL(bdev_write_page);
-
 /*
  * pseudo-fs
  */
diff --git a/fs/mpage.c b/fs/mpage.c
index eaeaef27d693..707d77fe7289 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -301,11 +301,8 @@ do_mpage_readpage(struct bio *bio, struct page *page, 
unsigned nr_pages,
submit_bio(&sbio);
goto out;
}
-
-   if (!bdev_read_page(bdev, blocks[0] << (blkbits - 9),
-   page))
-   goto out;
}
+
bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
min_t(int, nr_pages, BIO_MAX_PAGES), gfp);
if (bio == NULL)
@@ -646,13 +643,8 @@ static int __mpage_writepage(struct page *page, struct 
writeback_control *wbc,
submit_bio(&sbio);
clean_buffers(page, first_unmapped);
}
-
-   if (!bdev_write_page(bdev, blocks[0] << (blkbits - 9),
-   page, wbc)) {
-   clean_buffers(page, first_unmapped);
-   goto out;
-   }
}
+
bio = mpage_alloc(bdev, blocks[0] << (b

[PATCH v1 1/6] bdi: introduce BDI_CAP_SYNC

2017-08-07 Thread Minchan Kim
Following the discussion in [1], we will replace rw_page devices with
on-stack bios. So that such super-fast devices can be detected, this
patch introduces BDI_CAP_SYNC, which means synchronous IO would be more
efficient than asynchronous IO, and applies the flag to brd, zram, btt
and pmem.

[1] lkml.kernel.org/r/<20170728165604.10455-1-ross.zwis...@linux.intel.com>
Signed-off-by: Minchan Kim 
---
 drivers/block/brd.c   | 2 ++
 drivers/block/zram/zram_drv.c | 2 +-
 drivers/nvdimm/btt.c  | 2 ++
 drivers/nvdimm/pmem.c | 2 ++
 include/linux/backing-dev.h   | 7 +++
 5 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 293250582f00..97d4e1679de7 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include 
 #ifdef CONFIG_BLK_DEV_RAM_DAX
 #include 
 #include 
@@ -436,6 +437,7 @@ static struct brd_device *brd_alloc(int i)
disk->flags = GENHD_FL_EXT_DEVT;
sprintf(disk->disk_name, "ram%d", i);
set_capacity(disk, rd_size * 2);
+   disk->queue->backing_dev_info->capabilities |= BDI_CAP_SYNC;
 
 #ifdef CONFIG_BLK_DEV_RAM_DAX
queue_flag_set_unlocked(QUEUE_FLAG_DAX, brd->brd_queue);
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index bbbc2f230b8e..3eda88d0ca95 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -1577,7 +1577,7 @@ static int zram_add(void)
blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX);
 
zram->disk->queue->backing_dev_info->capabilities |=
-   BDI_CAP_STABLE_WRITES;
+   (BDI_CAP_STABLE_WRITES | BDI_CAP_SYNC);
add_disk(zram->disk);
 
ret = sysfs_create_group(&disk_to_dev(zram->disk)->kobj,
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index e10d3300b64c..16f60351e4fd 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "btt.h"
 #include "nd.h"
 
@@ -1273,6 +1274,7 @@ static int btt_blk_init(struct btt *btt)
btt->btt_disk->private_data = btt;
btt->btt_disk->queue = btt->btt_queue;
btt->btt_disk->flags = GENHD_FL_EXT_DEVT;
+   btt->btt_disk->queue->backing_dev_info->capabilities |= BDI_CAP_SYNC;
 
blk_queue_make_request(btt->btt_queue, btt_make_request);
blk_queue_logical_block_size(btt->btt_queue, btt->sector_size);
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index b5f04559a497..e1704099b5cc 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -31,6 +31,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "pmem.h"
 #include "pfn.h"
 #include "nd.h"
@@ -379,6 +380,7 @@ static int pmem_attach_disk(struct device *dev,
disk->fops  = &pmem_fops;
disk->queue = q;
disk->flags = GENHD_FL_EXT_DEVT;
+   disk->queue->backing_dev_info->capabilities |= BDI_CAP_SYNC;
nvdimm_namespace_disk_name(ndns, disk->disk_name);
set_capacity(disk, (pmem->size - pmem->pfn_pad - pmem->data_offset)
/ 512);
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 854e1bdd0b2a..397ee71763d7 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -123,6 +123,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, 
unsigned int max_ratio);
  * BDI_CAP_STRICTLIMIT:Keep number of dirty pages below bdi threshold.
  *
  * BDI_CAP_CGROUP_WRITEBACK: Supports cgroup-aware writeback.
+ * BDI_CAP_SYNC: Device is so fast that asynchronous IO would be inefficient.
  */
 #define BDI_CAP_NO_ACCT_DIRTY  0x0001
 #define BDI_CAP_NO_WRITEBACK   0x0002
@@ -130,6 +131,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, 
unsigned int max_ratio);
 #define BDI_CAP_STABLE_WRITES  0x0008
 #define BDI_CAP_STRICTLIMIT0x0010
 #define BDI_CAP_CGROUP_WRITEBACK 0x0020
+#define BDI_CAP_SYNC   0x0040
 
 #define BDI_CAP_NO_ACCT_AND_WRITEBACK \
(BDI_CAP_NO_WRITEBACK | BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_ACCT_WB)
@@ -177,6 +179,11 @@ long wait_iff_congested(struct pglist_data *pgdat, int 
sync, long timeout);
 int pdflush_proc_obsolete(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
 
+static inline bool bdi_cap_synchronous_io(struct backing_dev_info *bdi)
+{
+   return bdi->capabilities & BDI_CAP_SYNC;
+}
+
 static inline bool bdi_cap_stable_pages_required(struct backing_dev_info *bdi)
 {
return bdi->capabilities & BDI_CAP_STABLE_WRITES;
-- 
2.7.4



[PATCH v1 0/6] Remove rw_page

2017-08-07 Thread Minchan Kim
Recently, there was a discussion about removing rw_page due to its
maintenance burden[1], but the problem was zram, because zram had a clear
win in the benchmark at that time. The reason only zram had a win is the
bio allocation wait time from the mempool under extreme memory pressure.

Christoph Hellwig suggested we can use an on-stack bio for rw_page devices.
This patch series implements that, replaces the rw_page operations with
on-stack bios, and finally removes the rw_page interface completely.
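
For readers skimming the cover letter, the on-stack-bio pattern the series
switches to looks roughly like this (condensed from patch 2/6; the helper
name is illustrative only):

/* Illustrative helper: submit a one-page read without allocating a bio. */
static void on_stack_bio_read(struct block_device *bdev, sector_t sector,
			      struct page *page, bio_end_io_t *end_io)
{
	struct bio sbio;	/* lives on the caller's stack, no mempool wait */
	struct bio_vec bvec;

	bio_init(&sbio, &bvec, 1);
	sbio.bi_bdev = bdev;
	sbio.bi_iter.bi_sector = sector;
	sbio.bi_end_io = end_io;
	bio_add_page(&sbio, page, PAGE_SIZE, 0);
	bio_set_op_attrs(&sbio, REQ_OP_READ, 0);
	submit_bio(&sbio);	/* fine for BDI_CAP_SYNC devices, which complete synchronously */
}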

This patch series is based on linux-next-20170804.

[1] 
http://lkml.kernel.org/r/<20170728165604.10455-1-ross.zwis...@linux.intel.com>

Minchan Kim (6):
  bdi: introduce BDI_CAP_SYNC
  fs: use on-stack-bio if backing device has BDI_CAP_SYNC capability
  mm:swap: remove end_swap_bio_write argument
  mm:swap: use on-stack-bio for BDI_CAP_SYNC devices
  zram: remove zram_rw_page
  fs: remove rw_page

 drivers/block/brd.c   |   2 +
 drivers/block/zram/zram_drv.c |  54 +---
 drivers/nvdimm/btt.c  |   2 +
 drivers/nvdimm/pmem.c |   2 +
 fs/block_dev.c|  76 --
 fs/mpage.c|  45 +++--
 include/linux/backing-dev.h   |   7 ++
 include/linux/blkdev.h|   4 --
 include/linux/swap.h  |   6 +-
 mm/page_io.c  | 145 +-
 mm/swapfile.c |   3 +
 mm/zswap.c|   2 +-
 12 files changed, 147 insertions(+), 201 deletions(-)

-- 
2.7.4



Re: [RESEND PATCH] bcache: Don't reinvent the wheel but use existing llist API

2017-08-07 Thread Coly Li
On 2017/8/8 2:00 PM, Byungchul Park wrote:
> On Tue, Aug 08, 2017 at 01:28:39PM +0800, Coly Li wrote:
> + llist_for_each_entry_safe(cl, t, reverse, list) {

 Just wondering why not using llist_for_each_entry(), or you use the
 _safe version on purpose ?
>>>
>>> If I use llist_for_each_entry(), then it would change the original
>>> behavior. Is it ok?
>>>
>>
>> I feel llist_for_each_entry() keeps the original behavior, and variable
> 
> Ah.. I see. Then.. Can I change it into non-safe version? Is it still ok
> with non-safe one? I will change it at the next spin, if yes.
> 
>> 't' can be removed. Anyway, either llist_for_each_entry() or
>> llist_for_each_entry_safe() works correctly and well here. Any one you
>> use is OK to me, thanks for your informative reply :-)
> 
> I rather appreciate it.
> 

Yes, please. And you have my Acked-by :-)


-- 
Coly Li


[PATCH v1 5/6] zram: remove zram_rw_page

2017-08-07 Thread Minchan Kim
With on-stack bios, the rw_page interface doesn't provide a clear
performance benefit for zram, while it surely has a maintenance burden,
so remove the last user so that rw_page can be removed completely.

Cc: Sergey Senozhatsky 
Signed-off-by: Minchan Kim 
---
 drivers/block/zram/zram_drv.c | 52 ---
 1 file changed, 52 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 3eda88d0ca95..9620163308fa 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -1268,57 +1268,6 @@ static void zram_slot_free_notify(struct block_device 
*bdev,
atomic64_inc(&zram->stats.notify_free);
 }
 
-static int zram_rw_page(struct block_device *bdev, sector_t sector,
-  struct page *page, bool is_write)
-{
-   int offset, ret;
-   u32 index;
-   struct zram *zram;
-   struct bio_vec bv;
-
-   if (PageTransHuge(page))
-   return -ENOTSUPP;
-   zram = bdev->bd_disk->private_data;
-
-   if (!valid_io_request(zram, sector, PAGE_SIZE)) {
-   atomic64_inc(&zram->stats.invalid_io);
-   ret = -EINVAL;
-   goto out;
-   }
-
-   index = sector >> SECTORS_PER_PAGE_SHIFT;
-   offset = (sector & (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT;
-
-   bv.bv_page = page;
-   bv.bv_len = PAGE_SIZE;
-   bv.bv_offset = 0;
-
-   ret = zram_bvec_rw(zram, &bv, index, offset, is_write, NULL);
-out:
-   /*
-* If I/O fails, just return error(ie, non-zero) without
-* calling page_endio.
-* It causes resubmit the I/O with bio request by upper functions
-* of rw_page(e.g., swap_readpage, __swap_writepage) and
-* bio->bi_end_io does things to handle the error
-* (e.g., SetPageError, set_page_dirty and extra works).
-*/
-   if (unlikely(ret < 0))
-   return ret;
-
-   switch (ret) {
-   case 0:
-   page_endio(page, is_write, 0);
-   break;
-   case 1:
-   ret = 0;
-   break;
-   default:
-   WARN_ON(1);
-   }
-   return ret;
-}
-
 static void zram_reset_device(struct zram *zram)
 {
struct zcomp *comp;
@@ -1460,7 +1409,6 @@ static int zram_open(struct block_device *bdev, fmode_t 
mode)
 static const struct block_device_operations zram_devops = {
.open = zram_open,
.swap_slot_free_notify = zram_slot_free_notify,
-   .rw_page = zram_rw_page,
.owner = THIS_MODULE
 };
 
-- 
2.7.4



RE: hotplug support for arch/arc/plat-eznps platform

2017-08-07 Thread Ofer Levi(SW)

> On Monday, August 7, 2017 6:10 PM +, Ofer Levi(SW) wrote:
> 
> On Mon, Aug 07, 2017 at 01:41:38PM +, Ofer Levi(SW) wrote:
> > > You've failed to explain why you think hotplug should be a
> > > performance critical path.
> > 1. hotplug bring up of 4K cpus takes 40 minutes.  Way too much for any
> user.
> > 2. plat-eznps is a network processor, where bring up time is sensitive.
> 
> But who is doing actual hotplug? Why would you ever unplug or plug a CPU in
> a time critical situation?

The idea behind implementing hotplug for this arch is to shorten the time to
traffic processing.
This way, instead of waiting ~5 min for all cpus to boot, the application
running on cpu 0 will loop booting the other cpus and assigning the traffic
processing application to each of them.
Outgoing traffic will build up until all cpus are up and running at the full
traffic rate.
This method allows traffic processing to start after ~20 sec instead of the
5 min.
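
Roughly, the application on cpu 0 would do something like the following
(a hypothetical userspace illustration; only the sysfs 'online' file is the
real kernel interface, the worker start-up is application specific):

#include <stdio.h>

/* Bring one secondary CPU online via the standard sysfs hotplug interface. */
static int online_cpu(int cpu)
{
	char path[64];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/devices/system/cpu/cpu%d/online", cpu);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fputs("1", f);		/* triggers cpu_up() -> sched_cpu_activate() */
	fclose(f);
	return 0;
}

int main(void)
{
	int cpu;

	for (cpu = 1; cpu < 4096; cpu++) {	/* assumed 4K-cpu topology */
		if (online_cpu(cpu))
			continue;
		/* start_traffic_worker(cpu);  hypothetical: pin work to the new cpu */
	}
	return 0;
}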

> 
> > > I'm also not seeing how it would be different from boot; you'd be
> > > looking at a similar cost for SMP bringup.
> > bring up time of 4k cpus during kernel boot takes 4.5 minutes.
> > The function in question is performed only when smp init was performed.
> > If I understand correctly, whatever this function is doing is
> > performed after all cpus were brought up during kernel boot.
> 
> Doesn't make sense. If you look at smp_init() boot brings up the CPUs one at
> a time.
> 
> So how can boot be different than hot-pugging them?

Please have a look at the following code in kernel/sched/core.c,
sched_cpu_activate():

	if (sched_smp_initialized) {
		sched_domains_numa_masks_set(cpu);
		cpuset_cpu_active();
	}

The cpuset_cpu_active() call eventually leads to the function in question,
partition_sched_domains(). When cold-booting cpus, the sched_smp_initialized
flag is false, and therefore partition_sched_domains() is not executed.

This leads me back to my questions


Thanks.


Re: [PATCH v2 0/3] fix xen hvm guest with kaslr enabled

2017-08-07 Thread Juergen Gross
On 28/07/17 12:23, Juergen Gross wrote:
> This patch series fixes a regression introduced in 4.13-rc1: A Xen
> HVM guest with KASLR enabled wouldn't boot any longer due to the usage
> of __va() before kernel_randomize_memory() was called.
> 
> Changes in V2:
> - patch 1: test for x86_hyper being not NULL
> 
> Juergen Gross (3):
>   x86: provide an init_mem_mapping hypervisor hook
>   xen: split up xen_hvm_init_shared_info()
>   xen: fix hvm guest with kaslr enabled
> 
>  arch/x86/include/asm/hypervisor.h | 10 +++
>  arch/x86/mm/init.c|  3 ++
>  arch/x86/xen/enlighten_hvm.c  | 59 
> ---
>  3 files changed, 50 insertions(+), 22 deletions(-)
> 

Could I have some feedback, please?

I'd like to get this regression fixed in 4.13.

In case nobody objects this week I'll just add the patches to the Xen
tree for rc5.


Juergen


Re: [PATCH] i2c: imx: Remove a useless test in 'i2c_imx_init_recovery_info()'

2017-08-07 Thread Christophe JAILLET

Le 07/08/2017 à 09:16, Julia Lawall a écrit :


On Mon, 7 Aug 2017, Uwe Kleine-König wrote:


On Mon, Aug 07, 2017 at 01:49:53AM +0200, Christophe JAILLET wrote:

'devm_pinctrl_get()' never returns NULL, so this test can be simplified.

That's wrong: if CONFIG_PINCTRL is disabled, devm_pinctrl_get returns
NULL. But I think this shouldn't be considered an error, so your change
is right; just the commit log is not.


diff --git a/drivers/i2c/busses/i2c-imx.c b/drivers/i2c/busses/i2c-imx.c
index 54a47b40546f..7e84662fe1c0 100644
--- a/drivers/i2c/busses/i2c-imx.c
+++ b/drivers/i2c/busses/i2c-imx.c
@@ -997,7 +997,7 @@ static int i2c_imx_init_recovery_info(struct imx_i2c_struct 
*i2c_imx,
struct i2c_bus_recovery_info *rinfo = &i2c_imx->rinfo;

i2c_imx->pinctrl = devm_pinctrl_get(&pdev->dev);
-   if (!i2c_imx->pinctrl || IS_ERR(i2c_imx->pinctrl)) {
+   if (IS_ERR(i2c_imx->pinctrl)) {
dev_info(&pdev->dev, "can't get pinctrl, bus recovery not 
supported\n");
return PTR_ERR(i2c_imx->pinctrl);
}

Side note: I'm not sure this construct is valid. IIRC PTR_ERR should
only be called for values x where IS_ERR(x) is true. Here it is at least
surprising that a message hints at a problem but the return code is 0.
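
For illustration, one way to keep the two cases apart would be something
like this (just a sketch, not what the posted patch does):

	i2c_imx->pinctrl = devm_pinctrl_get(&pdev->dev);
	if (IS_ERR(i2c_imx->pinctrl))
		return PTR_ERR(i2c_imx->pinctrl);	/* a real error */
	if (!i2c_imx->pinctrl) {
		/* CONFIG_PINCTRL=n: not an error, recovery is just unsupported */
		dev_info(&pdev->dev,
			 "can't get pinctrl, bus recovery not supported\n");
		return 0;
	}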

@Julia: I'm sure coccinelle can find more of those?!

I only found a few.  Christophe, if you want to fix them up, please go
ahead.


Hi Julia,

I've looked quickly at your output, and can't see what could/should be
done in the spotted cases.
   e100.c: a comment says that 'If it's NULL, then no ucode is required',
so the behavior looks OK to me.
   chcr_algo.c: the function 'create_wr_fn' is passed as a parameter. I have
no way to make sure of its behavior, so the code could be valid. I won't
touch it.
   acl.c: by code inspection, the way the code is written looks valid to me.
We get NULL if a call in 'ocfs2_get_acl_nolock' returns -ENODATA. It is not
that strange to return success in this case.


So, personally, I won't propose anything on these files. Up to anyone to 
dig further than me.


CJ

julia

diff -u -p /var/linuxes/linux-next/drivers/net/ethernet/intel/e100.c
/tmp/nothing/drivers/net/ethernet/intel/e100.c
--- /var/linuxes/linux-next/drivers/net/ethernet/intel/e100.c
+++ /tmp/nothing/drivers/net/ethernet/intel/e100.c
@@ -1370,8 +1370,6 @@ static inline int e100_load_ucode_wait(s

 fw = e100_request_firmware(nic);
 /* If it's NULL, then no ucode is required */
-   if (!fw || IS_ERR(fw))
-   return PTR_ERR(fw);

 if ((err = e100_exec_cb(nic, (void *)fw, e100_setup_ucode)))
 netif_err(nic, probe, nic->netdev,
diff -u -p /var/linuxes/linux-next/drivers/i2c/busses/i2c-imx.c
/tmp/nothing/drivers/i2c/busses/i2c-imx.c
--- /var/linuxes/linux-next/drivers/i2c/busses/i2c-imx.c
+++ /tmp/nothing/drivers/i2c/busses/i2c-imx.c
@@ -997,9 +997,7 @@ static int i2c_imx_init_recovery_info(st
 struct i2c_bus_recovery_info *rinfo = &i2c_imx->rinfo;

 i2c_imx->pinctrl = devm_pinctrl_get(&pdev->dev);
-   if (!i2c_imx->pinctrl || IS_ERR(i2c_imx->pinctrl)) {
 dev_info(&pdev->dev, "can't get pinctrl, bus recovery not
supported\n");
-   return PTR_ERR(i2c_imx->pinctrl);
 }

 i2c_imx->pinctrl_pins_default =
pinctrl_lookup_state(i2c_imx->pinctrl,
diff -u -p /var/linuxes/linux-next/drivers/crypto/chelsio/chcr_algo.c
/tmp/nothing/drivers/crypto/chelsio/chcr_algo.c
--- /var/linuxes/linux-next/drivers/crypto/chelsio/chcr_algo.c
+++ /tmp/nothing/drivers/crypto/chelsio/chcr_algo.c
@@ -3159,8 +3159,6 @@ static int chcr_aead_op(struct aead_requ
 skb = create_wr_fn(req, u_ctx->lldi.rxq_ids[ctx->rx_qidx], size,
op_type);

-   if (IS_ERR(skb) || !skb)
-   return PTR_ERR(skb);

 skb->dev = u_ctx->lldi.ports[0];
 set_wr_txq(skb, CPL_PRIORITY_DATA, ctx->tx_qidx);
diff -u -p /var/linuxes/linux-next/fs/ocfs2/acl.c
/tmp/nothing/fs/ocfs2/acl.c
--- /var/linuxes/linux-next/fs/ocfs2/acl.c
+++ /tmp/nothing/fs/ocfs2/acl.c
@@ -331,8 +331,6 @@ int ocfs2_acl_chmod(struct inode *inode,
 return 0;

 acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, bh);
-   if (IS_ERR(acl) || !acl)
-   return PTR_ERR(acl);
 ret = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
 if (ret)
 return ret;






Best regards
Uwe

--
Pengutronix e.K.   | Uwe Kleine-König|
Industrial Linux Solutions | http://www.pengutronix.de/  |
--
To unsubscribe from this list: send the line "unsubscribe kernel-janitors" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

>





Re: [RFC]Add new mdev interface for QoS

2017-08-07 Thread Kirti Wankhede


On 8/7/2017 1:11 PM, Gao, Ping A wrote:
> 
> On 2017/8/4 5:11, Alex Williamson wrote:
>> On Thu, 3 Aug 2017 20:26:14 +0800
>> "Gao, Ping A"  wrote:
>>
>>> On 2017/8/3 0:58, Alex Williamson wrote:
 On Wed, 2 Aug 2017 21:16:28 +0530
 Kirti Wankhede  wrote:
  
> On 8/2/2017 6:29 PM, Gao, Ping A wrote:  
>> On 2017/8/2 18:19, Kirti Wankhede wrote:
>>> On 8/2/2017 3:56 AM, Alex Williamson wrote:
 On Tue, 1 Aug 2017 13:54:27 +0800
 "Gao, Ping A"  wrote:

> On 2017/7/28 0:00, Gao, Ping A wrote:
>> On 2017/7/27 0:43, Alex Williamson wrote:  
>>> [cc +libvir-list]
>>>
>>> On Wed, 26 Jul 2017 21:16:59 +0800
>>> "Gao, Ping A"  wrote:
>>>  
 The vfio-mdev provide the capability to let different guest share 
 the
 same physical device through mediate sharing, as result it bring a
 requirement about how to control the device sharing, we need a QoS
 related interface for mdev to management virtual device resource.

 E.g. In practical use, vGPUs assigned to different quests almost 
 has
 different performance requirements, some guests may need higher 
 priority
 for real time usage, some other may need more portion of the GPU
 resource to get higher 3D performance, corresponding we can define 
 some
 interfaces like weight/cap for overall budget control, priority for
 single submission control.

 So I suggest to add some common attributes which are vendor 
 agnostic in
 mdev core sysfs for QoS purpose.  
>>> I think what you're asking for is just some standardization of a QoS
>>> attribute_group which a vendor can optionally include within the
>>> existing mdev_parent_ops.mdev_attr_groups.  The mdev core will
>>> transparently enable this, but it really only provides the standard,
>>> all of the support code is left for the vendor.  I'm fine with that,
>>> but of course the trouble with and sort of standardization is 
>>> arriving
>>> at an agreed upon standard.  Are there QoS knobs that are generic
>>> across any mdev device type?  Are there others that are more 
>>> specific
>>> to vGPU?  Are there existing examples of this that we can steal 
>>> their
>>> specification?  
>> Yes, you are right, standardization QoS knobs are exactly what I 
>> wanted.
>> Only when it become a part of the mdev framework and libvirt, then 
>> QoS
>> such critical feature can be leveraged by cloud usage. HW vendor only
>> need to focus on the implementation of the corresponding QoS 
>> algorithm
>> in their back-end driver.
>>
>> Vfio-mdev framework provide the capability to share the device that 
>> lack
>> of HW virtualization support to guests, no matter the device type,
>> mediated sharing actually is a time sharing multiplex method, from 
>> this
>> point of view, QoS can be take as a generic way about how to control 
>> the
>> time assignment for virtual mdev device that occupy HW. As result we 
>> can
>> define QoS knob generic across any device type by this way. Even if 
>> HW
>> has build in with some kind of QoS support, I think it's not a 
>> problem
>> for back-end driver to convert mdev standard QoS definition to their
>> specification to reach the same performance expectation. Seems there 
>> are
>> no examples for us to follow, we need define it from scratch.
>>
>> I proposal universal QoS control interfaces like below:
>>
>> Cap: The cap limits the maximum percentage of time a mdev device can 
>> own
>> physical device. e.g. cap=60, means mdev device cannot take over 60% 
>> of
>> total physical resource.
>>
>> Weight: The weight define proportional control of the mdev device
>> resource between guests, it’s orthogonal with Cap, to target load
>> balancing. E.g. if guest 1 should take double mdev device resource
>> compare with guest 2, need set weight ratio to 2:1.
>>
>> Priority: The guest who has higher priority will get execution first,
>> target to some real time usage and speeding interactive response.
>>
>> Above QoS interfaces cover both overall budget control and single
>> submission control. I will sent out detail design later once get 
>> aligned.  
> Hi Alex,
> Any comments about the interface mentioned above?
 Not really.

 Kirti, are there a

Re: [PATCH 2/2] sched/debug: intruduce task_state_to_char helper function

2017-08-07 Thread kbuild test robot
Hi Xie,

[auto build test ERROR on tip/sched/core]
[also build test ERROR on v4.13-rc4 next-20170807]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]

url:
https://github.com/0day-ci/linux/commits/Xie-XiuQi/sched-debug-show-task-state-on-proc-sched_debug/20170808-135825
config: i386-randconfig-x019-201732 (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
# save the attached .config to linux build tree
make ARCH=i386 

All errors (new ones prefixed by >>):

   kernel/sched/core.c: In function 'sched_show_task':
>> kernel/sched/core.c:5114:64: error: expected ')' before ';' token
 printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p);
   ^
>> kernel/sched/core.c:5133:1: error: expected ';' before '}' token
}
^
   kernel/sched/core.c:5109:6: warning: unused variable 'ppid' 
[-Wunused-variable]
 int ppid;
 ^~~~
   kernel/sched/core.c:5108:16: warning: unused variable 'free' 
[-Wunused-variable]
 unsigned long free = 0;
   ^~~~
   In file included from arch/x86/include/asm/current.h:4:0,
from include/linux/sched.h:11,
from kernel/sched/core.c:8:
   kernel/sched/core.c: At top level:
   include/linux/compiler.h:162:4: warning: '__f' is static but declared in 
inline function 'strcpy' which is not static
   __f = { \
   ^
   include/linux/compiler.h:154:23: note: in expansion of macro '__trace_if'
#define if(cond, ...) __trace_if( (cond , ## __VA_ARGS__) )
  ^~
   include/linux/string.h:390:2: note: in expansion of macro 'if'
 if (p_size == (size_t)-1 && q_size == (size_t)-1)
 ^~
   include/linux/compiler.h:162:4: warning: '__f' is static but declared in 
inline function 'kmemdup' which is not static
   __f = { \
   ^
   include/linux/compiler.h:154:23: note: in expansion of macro '__trace_if'
#define if(cond, ...) __trace_if( (cond , ## __VA_ARGS__) )
  ^~
   include/linux/string.h:380:2: note: in expansion of macro 'if'
 if (p_size < size)
 ^~
   include/linux/compiler.h:162:4: warning: '__f' is static but declared in 
inline function 'kmemdup' which is not static
   __f = { \
   ^
   include/linux/compiler.h:154:23: note: in expansion of macro '__trace_if'
#define if(cond, ...) __trace_if( (cond , ## __VA_ARGS__) )
  ^~
   include/linux/string.h:378:2: note: in expansion of macro 'if'
 if (__builtin_constant_p(size) && p_size < size)
 ^~
   include/linux/compiler.h:162:4: warning: '__f' is static but declared in 
inline function 'memchr_inv' which is not static
   __f = { \
   ^
   include/linux/compiler.h:154:23: note: in expansion of macro '__trace_if'
#define if(cond, ...) __trace_if( (cond , ## __VA_ARGS__) )
  ^~
   include/linux/string.h:369:2: note: in expansion of macro 'if'
 if (p_size < size)
 ^~
   include/linux/compiler.h:162:4: warning: '__f' is static but declared in 
inline function 'memchr_inv' which is not static
   __f = { \
   ^
   include/linux/compiler.h:154:23: note: in expansion of macro '__trace_if'
#define if(cond, ...) __trace_if( (cond , ## __VA_ARGS__) )
  ^~
   include/linux/string.h:367:2: note: in expansion of macro 'if'
 if (__builtin_constant_p(size) && p_size < size)
 ^~
   include/linux/compiler.h:162:4: warning: '__f' is static but declared in 
inline function 'memchr' which is not static
   __f = { \
   ^
   include/linux/compiler.h:154:23: note: in expansion of macro '__trace_if'
#define if(cond, ...) __trace_if( (cond , ## __VA_ARGS__) )
  ^~
   include/linux/string.h:358:2: note: in expansion of macro 'if'
 if (p_size < size)
 ^~
   include/linux/compiler.h:162:4: warning: '__f' is static but declared in 
inline function 'memchr' which is not static
   __f = { \
   ^
   include/linux/compiler.h:154:23: note: in expansion of macro '__trace_if'
#define if(cond, ...) __trace_if( (cond , ## __VA_ARGS__) )
  ^~
   include/linux/string.h:356:2: note: in expansion of macro 'if'
 if (__builtin_constant_p(size) && p_size < size)
 ^~
   include/l

[PATCH] powerpc/32: Fix boot failure on non 6xx platforms

2017-08-07 Thread Christophe Leroy
commit d300627c6a536 ("powerpc/6xx: Handle DABR match before
calling do_page_fault") breaks non 6xx platforms.

[6.029556] Failed to execute /init (error -14)
[6.034623] Starting init: /bin/sh exists but couldn't execute it
(error -14)
[6.041489] Kernel panic - not syncing: No working init found.  Try
passing init= option to kernel. See Linux
Documentation/admin-guide/init.rst for guidance.
[6.055518] CPU: 0 PID: 1 Comm: init Not tainted
4.13.0-rc3-s3k-dev-00143-g7aa62e972a56 #56
[6.063745] Call Trace:
[6.066224] [c60f1ed0] [c001a624] panic+0x108/0x250 (unreliable)
[6.072140] [c60f1f30] [c0002640] rootfs_mount+0x0/0x58
[6.077311] [c60f1f40] [c000cb80] ret_from_kernel_thread+0x5c/0x64
[6.083405] Rebooting in 180 seconds..

This is because in handle_page_fault(), the call to do_page_fault()
has been mistakenly enclosed inside an #ifdef CONFIG_6xx

Fixes: d300627c6a536 ("powerpc/6xx: Handle DABR match before
calling do_page_fault")

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/entry_32.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index dff51ea52e49..b14bf7def2e2 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -586,11 +586,11 @@ ppc_swapcontext:
 handle_page_fault:
stw r4,_DAR(r1)
addir3,r1,STACK_FRAME_OVERHEAD
-   andis.  r0,r5,DSISR_DABRMATCH@h
 #ifdef CONFIG_6xx
+   andis.  r0,r5,DSISR_DABRMATCH@h
bne-handle_dabr_fault
-   bl  do_page_fault
 #endif
+   bl  do_page_fault
cmpwi   r3,0
beq+ret_from_except
SAVE_NVGPRS(r1)
-- 
2.13.3



Re: [PATCH] f2fs: fix some cases with reserved_blocks

2017-08-07 Thread Yunlong Song
Does this mean the reserved_blocks cannot be used by users but can be used
by the filesystem? If it can be used by the filesystem, then this cannot
ensure the flash device really reserves the reserved_blocks space, right?
Is reserved_blocks just for users?

On 2017/8/8 14:08, Chao Yu wrote:

On 2017/8/8 12:12, Yunlong Song wrote:

Signed-off-by: Yunlong Song 
---
  fs/f2fs/recovery.c | 3 ++-
  fs/f2fs/super.c| 9 +
  2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index a3d0261..e288319 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -51,7 +51,8 @@ bool space_for_roll_forward(struct f2fs_sb_info *sbi)
  {
s64 nalloc = percpu_counter_sum_positive(&sbi->alloc_valid_block_count);
  
-	if (sbi->last_valid_block_count + nalloc > sbi->user_block_count)

+   if (sbi->last_valid_block_count + nalloc +
+   sbi->reserved_blocks > sbi->user_block_count)

I think we can treat reserved blocks as over-provision space in f2fs, so it
would be safe to store invalid data (which may become valid during recovery)
there. Anyway, it is OK to keep the old condition check.


return false;
return true;
  }
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 4c1bdcb..c644bf5 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -946,6 +946,7 @@ static int f2fs_statfs(struct dentry *dentry, struct 
kstatfs *buf)
u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
block_t total_count, user_block_count, start_count, ovp_count;
u64 avail_node_count;
+   block_t avail_user_block_count;
  
  	total_count = le64_to_cpu(sbi->raw_super->block_count);

user_block_count = sbi->user_block_count;
@@ -953,16 +954,16 @@ static int f2fs_statfs(struct dentry *dentry, struct 
kstatfs *buf)
ovp_count = SM_I(sbi)->ovp_segments << sbi->log_blocks_per_seg;
buf->f_type = F2FS_SUPER_MAGIC;
buf->f_bsize = sbi->blocksize;
+   avail_user_block_count = user_block_count - sbi->reserved_blocks;
  
  	buf->f_blocks = total_count - start_count;

buf->f_bfree = user_block_count - valid_user_blocks(sbi) + ovp_count;
-   buf->f_bavail = user_block_count - valid_user_blocks(sbi) -
-   sbi->reserved_blocks;
+   buf->f_bavail = avail_user_block_count - valid_user_blocks(sbi);
  
  	avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM;
  
-	if (avail_node_count > user_block_count) {

-   buf->f_files = user_block_count;
+   if (avail_node_count > avail_user_block_count) {

Like the f_blocks calculation, the f_files one doesn't need to consider
reserved_blocks.

Thanks,


+   buf->f_files = avail_user_block_count;
buf->f_ffree = buf->f_bavail;
} else {
buf->f_files = avail_node_count;



.



--
Thanks,
Yunlong Song




Re: [PATCH 00/29] constify scsi pci_device_id.

2017-08-07 Thread Johannes Thumshirn
On Mon, Aug 07, 2017 at 01:28:18PM -0400, Martin K . Petersen wrote:
> However, having a bazillion identical commit messages is also really
> annoying. So for automated changes like this, I'd rather just have a
> single patch.

Thought so ;-)

-- 
Johannes Thumshirn  Storage
jthumsh...@suse.de+49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850


[PATCH v2] usb: quirks: Add no-lpm quirk for Moshi USB to Ethernet Adapter

2017-08-07 Thread Kai-Heng Feng
Moshi USB to Ethernet Adapter internally uses a Genesys Logic hub to
connect to Realtek r8153.

The Realtek r8153 ethernet does not work behind the internal hub; the
no-lpm quirk makes it work.

Since another r8153 dongle at hand does not have the issue, add the quirk
to the hub instead.

Signed-off-by: Kai-Heng Feng 
---
v2: Clarify that the adapter uses a hub internally.

 drivers/usb/core/quirks.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c
index 3116edfcdc18..c96daf34431e 100644
--- a/drivers/usb/core/quirks.c
+++ b/drivers/usb/core/quirks.c
@@ -150,6 +150,9 @@ static const struct usb_device_id usb_quirk_list[] = {
/* appletouch */
{ USB_DEVICE(0x05ac, 0x021a), .driver_info = USB_QUIRK_RESET_RESUME },
 
+   /* Moshi USB to Ethernet Adapter */
+   { USB_DEVICE(0x05e3, 0x0616), .driver_info = USB_QUIRK_NO_LPM },
+
/* Avision AV600U */
{ USB_DEVICE(0x0638, 0x0a13), .driver_info =
  USB_QUIRK_STRING_FETCH_255 },
-- 
2.13.4



Re: [RESEND PATCH] bcache: Don't reinvent the wheel but use existing llist API

2017-08-07 Thread Byungchul Park
On Tue, Aug 08, 2017 at 01:28:39PM +0800, Coly Li wrote:
> >>> + llist_for_each_entry_safe(cl, t, reverse, list) {
> >>
> >> Just wondering why not using llist_for_each_entry(), or you use the
> >> _safe version on purpose ?
> > 
> > If I use llist_for_each_entry(), then it would change the original
> > behavior. Is it ok?
> > 
> 
> I feel llist_for_each_entry() keeps the original behavior, and variable

Ah, I see. Then, can I change it to the non-safe version? Is it still OK
with the non-safe one? If yes, I will change it in the next spin.
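
For reference, a minimal sketch of the difference between the two forms (the
'item' type is hypothetical; the point is that the _safe variant keeps a
lookahead pointer, so the current entry may be freed inside the loop body):

#include <linux/llist.h>
#include <linux/printk.h>
#include <linux/slab.h>

struct item {
	struct llist_node list;
	/* payload ... */
};

/* Safe variant: 'pos' may be freed inside the body, 'n' already holds next. */
static void drain_all(struct llist_node *first)
{
	struct item *pos, *n;

	llist_for_each_entry_safe(pos, n, first, list)
		kfree(pos);
}

/* Plain variant: only valid if the body never frees or re-queues 'pos'. */
static void print_all(struct llist_node *first)
{
	struct item *pos;

	llist_for_each_entry(pos, first, list)
		pr_info("item at %p\n", pos);
}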

> 't' can be removed. Anyway, either llist_for_each_entry() or
> llist_for_each_entry_safe() works correctly and well here. Any one you
> use is OK to me, thanks for your informative reply :-)

I rather appreciate it.

Thank you,
Byungchul


Re: [virtio-dev] Re: [PATCH v13 4/5] mm: support reporting free page blocks

2017-08-07 Thread Wei Wang

On 08/08/2017 02:12 PM, Wei Wang wrote:

On 08/03/2017 05:11 PM, Michal Hocko wrote:

On Thu 03-08-17 14:38:18, Wei Wang wrote:
This is just too ugly and wrong actually. Never provide struct page
pointers outside of the zone->lock. What I've had in mind was to simply
walk free lists of the suitable order and call the callback for each 
one.

Something as simple as

	for (i = 0; i < MAX_NR_ZONES; i++) {
		struct zone *zone = &pgdat->node_zones[i];

		if (!populated_zone(zone))
			continue;


Can we directly use for_each_populated_zone(zone) here?



		spin_lock_irqsave(&zone->lock, flags);
		for (order = min_order; order < MAX_ORDER; ++order) {



This appears to be covered by for_each_migratetype_order(order, mt) 
below.




			struct free_area *free_area = &zone->free_area[order];
			enum migratetype mt;
			struct page *page;

			if (!free_area->nr_pages)
				continue;

			for_each_migratetype_order(order, mt) {
				list_for_each_entry(page,
						&free_area->free_list[mt], lru) {
					pfn = page_to_pfn(page);
					visit(opaque2, pfn, 1 << order);
				}
			}

[...]

	spin_lock_irqsave(&zone->lock, flags);
	list_for_each_entry(page,
			&zone->free_area[order].free_list[mt], lru) {
		pfn = page_to_pfn(page);
		visit(opaque1, pfn, 1 << order);
	}
	spin_unlock_irqrestore(&zone->lock, flags);
	}
}





Best,
Wei


linux-next: Tree for Aug 8

2017-08-07 Thread Stephen Rothwell
Hi all,

Changes since 20170807:

The rdma tree gained a conflict against Linus' tree.

I again reverted a commit from the staging tree that was causing overnight
build failures.

The userns tree gained a conflict against the mips tree.

Non-merge commits (relative to Linus' tree): 4689
 4882 files changed, 180132 insertions(+), 101872 deletions(-)



I have created today's linux-next tree at
git://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
(patches at http://www.kernel.org/pub/linux/kernel/next/ ).  If you
are tracking the linux-next tree using git, you should not use "git pull"
to do so as that will try to merge the new linux-next release with the
old one.  You should use "git fetch" and checkout or reset to the new
master.

You can see which trees have been included by looking in the Next/Trees
file in the source.  There are also quilt-import.log and merge.log
files in the Next directory.  Between each merge, the tree was built
with a ppc64_defconfig for powerpc and an allmodconfig (with
CONFIG_BUILD_DOCSRC=n) for x86_64, a multi_v7_defconfig for arm and a
native build of tools/perf. After the final fixups (if any), I do an
x86_64 modules_install followed by builds for x86_64 allnoconfig,
powerpc allnoconfig (32 and 64 bit), ppc44x_defconfig, allyesconfig
and pseries_le_defconfig and i386, sparc and sparc64 defconfig. And
finally, a simple boot test of the powerpc pseries_le_defconfig kernel
in qemu.

Below is a summary of the state of the merge.

I am currently merging 267 trees (counting Linus' and 41 trees of bug
fix patches pending for the current merge release).

Stats about the size of the tree over time can be seen at
http://neuling.org/linux-next-size.html .

Status of my local build tests will be at
http://kisskb.ellerman.id.au/linux-next .  If maintainers want to give
advice about cross compilers/configs that work, we are always open to add
more builds.

Thanks to Randy Dunlap for doing many randconfig builds.  And to Paul
Gortmaker for triage and bug fixes.

-- 
Cheers,
Stephen Rothwell

$ git checkout master
$ git reset --hard stable
Merging origin/master (aae4e7a8bc44 Linux 4.13-rc4)
Merging fixes/master (b4b8cbf679c4 Cavium CNN55XX: fix broken default Kconfig 
entry)
Merging kbuild-current/fixes (ad8181060788 kconfig: fix sparse warnings in 
nconfig)
Merging arc-current/for-curr (b5ddb6d54729 ARCv2: PAE40: set MSB even if 
!CONFIG_ARC_HAS_PAE40 but PAE exists in SoC)
Merging arm-current/fixes (ce184a0dee92 ARM: 8687/1: signal: Fix unparseable 
iwmmxt_sigframe in uc_regspace[])
Merging m68k-current/for-linus (204a2be30a7a m68k: Remove ptrace_signal_deliver)
Merging metag-fixes/fixes (b884a190afce metag/usercopy: Add missing fixups)
Merging powerpc-fixes/fixes (3db40c312c2c powerpc/64: Fix __check_irq_replay 
missing decrementer interrupt)
Merging sparc/master (0a23ea65ce9f Merge 
git://git.kernel.org/pub/scm/linux/kernel/git/davem/sparc)
Merging fscrypt-current/for-stable (42d97eb0ade3 fscrypt: fix renaming and 
linking special files)
Merging net/master (eb2a6b800c2d qed: Fix a memory allocation failure test in 
'qed_mcp_cmd_init()')
Merging ipsec/master (7bab09631c2a xfrm: policy: check policy direction value)
Merging netfilter/master (9beceb54fa2c netfilter: x_tables: Fix use-after-free 
in ipt_do_table.)
Merging ipvs/master (f7fb77fc1235 netfilter: nft_compat: check extension hook 
mask only if set)
Merging wireless-drivers/master (368bd88ebb64 Merge tag 
'iwlwifi-for-kalle-2017-08-02' of 
git://git.kernel.org/pub/scm/linux/kernel/git/iwlwifi/iwlwifi-fixes)
Merging mac80211/master (d7f13f745036 cfg80211: Validate frequencies nested in 
NL80211_ATTR_SCAN_FREQUENCIES)
Merging sound-current/for-linus (5ef26e966d3f Merge tag 'asoc-fix-v4.13-rc3' of 
git://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound into for-linus)
Merging pci-current/for-linus (8466489ef5ba xhci: Reset Renesas uPD72020x USB 
controller for 32-bit DMA issue)
Merging driver-core.current/driver-core-linus (5771a8c08880 Linux v4.13-rc1)
Merging tty.current/tty-linus (9527b82ae3af Revert "serial: Delete dead code 
for CIR serial ports")
Merging usb.current/usb-linus (8cc34c8d6113 Merge tag 'usb-serial-4.13-rc4' of 
git://git.kernel.org/pub/scm/linux/kernel/git/johan/usb-serial into usb-linus)
Merging usb-gadget-fixes/fixes (b7d44c36a6f6 usb: renesas_usbhs: gadget: fix 
unused-but-set-variable warning)
Merging usb-serial-fixes/usb-linus (fd1b8668af59 USB: serial: option: add 
D-Link DWM-222 device ID)
Merging usb-chipidea-fixes/ci-for-usb-stable (cbb22ebcfb99 usb: chipidea: core: 
check before accessing ci_role in ci_role_show)
Merging phy/fixes (5771a8c08880 Linux v4.13-rc1)
Merging staging.current/staging-linus (cef988642cda staging: comedi: 
comedi_fops: do not call blocking ops when !TASK_RUNNING)
Merging char-misc.current/char-mi

Re: [PATCH -mm] mm: Clear to access sub-page last when clearing huge page

2017-08-07 Thread Huang, Ying
Christopher Lameter  writes:

> On Mon, 7 Aug 2017, Huang, Ying wrote:
>
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -4374,9 +4374,31 @@ void clear_huge_page(struct page *page,
>>  }
>>
>>  might_sleep();
>> -for (i = 0; i < pages_per_huge_page; i++) {
>> +VM_BUG_ON(clamp(addr_hint, addr, addr +
>> +(pages_per_huge_page << PAGE_SHIFT)) != addr_hint);
>> +n = (addr_hint - addr) / PAGE_SIZE;
>> +if (2 * n <= pages_per_huge_page) {
>> +base = 0;
>> +l = n;
>> +for (i = pages_per_huge_page - 1; i >= 2 * n; i--) {
>> +cond_resched();
>> +clear_user_highpage(page + i, addr + i * PAGE_SIZE);
>> +}
>
> I really like the idea behind the patch but this is not clearing from last
> to first byte of the huge page.
>
> What seems to be happening here is clearing from the last page to the
> first page and I would think that within each page the clearing is from
> first byte to last byte. Maybe more gains can be had by really clearing
> from last to first byte of the huge page instead of this jumping over 4k
> addresses?

I changed the code to use clear_page_orig() and made it clear pages from
last to first.  The patch is below.

With that, there are no visible changes in the benchmark result, but the
cache miss rate dropped a little, from 27.64% to 26.70%.  The cache miss
rate is different from before because the clear_page() implementation
used is different.

I think this is because the size of a page is relatively small compared with
the cache size, so the effect is almost invisible.

Best Regards,
Huang, Ying

--->8
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index b4a0d43248cf..01d201afde92 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -42,8 +42,8 @@ void clear_page_erms(void *page);
 static inline void clear_page(void *page)
 {
alternative_call_2(clear_page_orig,
-  clear_page_rep, X86_FEATURE_REP_GOOD,
-  clear_page_erms, X86_FEATURE_ERMS,
+  clear_page_orig, X86_FEATURE_REP_GOOD,
+  clear_page_orig, X86_FEATURE_ERMS,
   "=D" (page),
   "0" (page)
   : "memory", "rax", "rcx");
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index 81b1635d67de..23e6238e625d 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -25,19 +25,20 @@ EXPORT_SYMBOL_GPL(clear_page_rep)
 ENTRY(clear_page_orig)
xorl   %eax,%eax
movl   $4096/64,%ecx
+   addq   $4096-64,%rdi
.p2align 4
 .Lloop:
decl%ecx
 #define PUT(x) movq %rax,x*8(%rdi)
-   movq %rax,(%rdi)
-   PUT(1)
-   PUT(2)
-   PUT(3)
-   PUT(4)
-   PUT(5)
-   PUT(6)
PUT(7)
-   leaq64(%rdi),%rdi
+   PUT(6)
+   PUT(5)
+   PUT(4)
+   PUT(3)
+   PUT(2)
+   PUT(1)
+   movq %rax,(%rdi)
+   leaq-64(%rdi),%rdi
jnz .Lloop
nop
ret
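
A minimal user-space sketch of the ordering being discussed, assuming a
2 MB huge page made of 4 KB sub-pages and a hypothetical
clear_huge_page_towards() helper; it only illustrates why the sub-page
that will be accessed first is cleared last, so it stays cache-hot:

#include <string.h>

#define SUBPAGE_SIZE           4096UL
#define SUBPAGES_PER_HUGE_PAGE 512UL   /* 2 MB huge page, assumed */

/* Clear a huge page so that the sub-page at target_idx is written last. */
static void clear_huge_page_towards(void *huge_page, unsigned long target_idx)
{
	char *base = huge_page;
	unsigned long i;

	/* Clear everything after the target, walking backwards towards it. */
	for (i = SUBPAGES_PER_HUGE_PAGE - 1; i > target_idx; i--)
		memset(base + i * SUBPAGE_SIZE, 0, SUBPAGE_SIZE);

	/* Clear everything before the target, walking forwards towards it. */
	for (i = 0; i < target_idx; i++)
		memset(base + i * SUBPAGE_SIZE, 0, SUBPAGE_SIZE);

	/* The target sub-page is cleared last and is still warm in cache. */
	memset(base + target_idx * SUBPAGE_SIZE, 0, SUBPAGE_SIZE);
}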


RE: [PATCH v10 4/4] irqchip/qeic: remove PPCisms for QEIC

2017-08-07 Thread Qiang Zhao
On Mon 8/7/2017 3:02 PM, Michael Ellerman  wrote:

> -Original Message-
> From: Michael Ellerman [mailto:m...@ellerman.id.au]
> Sent: Monday, August 07, 2017 3:02 PM
> To: Qiang Zhao ; t...@linutronix.de
> Cc: o...@buserror.net; Qiang Zhao ; linuxppc-
> d...@lists.ozlabs.org; Xiaobo Xie ; linux-
> ker...@vger.kernel.org
> Subject: Re: [PATCH v10 4/4] irqchip/qeic: remove PPCisms for QEIC
> 
> Zhao Qiang  writes:
> 
> > QEIC was supported on PowerPC, and dependent on PPC, Now it is
> > supported on other platforms, so remove PPCisms.
> >
> > Signed-off-by: Zhao Qiang 
> > ---
> >  arch/powerpc/platforms/83xx/km83xx.c  |   1 -
> >  arch/powerpc/platforms/83xx/misc.c|   1 -
> >  arch/powerpc/platforms/83xx/mpc832x_mds.c |   1 -
> >  arch/powerpc/platforms/83xx/mpc832x_rdb.c |   1 -
> >  arch/powerpc/platforms/83xx/mpc836x_mds.c |   1 -
> >  arch/powerpc/platforms/83xx/mpc836x_rdk.c |   1 -
> >  arch/powerpc/platforms/85xx/corenet_generic.c |   1 -
> >  arch/powerpc/platforms/85xx/mpc85xx_mds.c |   1 -
> >  arch/powerpc/platforms/85xx/mpc85xx_rdb.c |   1 -
> >  arch/powerpc/platforms/85xx/twr_p102x.c   |   1 -
> >  drivers/irqchip/irq-qeic.c| 188 
> > +++---
> >  include/soc/fsl/qe/qe_ic.h| 132 --
> >  12 files changed, 80 insertions(+), 250 deletions(-)  delete mode
> > 100644 include/soc/fsl/qe/qe_ic.h
> >
> > diff --git a/arch/powerpc/platforms/83xx/km83xx.c
> > b/arch/powerpc/platforms/83xx/km83xx.c
> > index d8642a4..b1cef0a 100644
> > --- a/arch/powerpc/platforms/83xx/km83xx.c
> > +++ b/arch/powerpc/platforms/83xx/km83xx.c
> > @@ -38,7 +38,6 @@
> >  #include 
> >  #include 
> >  #include 
> > -#include 
> 
> You deleted that file in patch 2. So didn't you just break the build for the 
> last two
> commits?

Sorry, I am not sure what you mean. Could you explain?
Thank you!

BR
Qiang Zhao



[RESEND PATCH 0/2] Fix clock name in Aspeed GPIO bindings and driver

2017-08-07 Thread Andrew Jeffery
Resending to fix Ryan's email address. I think I'll step away from computers
now.

Hello,

Joel discovered I brain-farted the referenced clock name in the patches
introducing the debounce capability for the Aspeed driver. The datasheet says
PCLK but I wrote HPLL for whatever reason, so clean up that mistake by simply
removing references to HPLL and putting the onus on the devicetree author to
get it right.

Cheers,

Andrew

Andrew Jeffery (2):
  dt-bindings: gpio: aspeed: Remove reference to clock name
  gpio: aspeed: Remove reference to clock name in debounce warning
message

 Documentation/devicetree/bindings/gpio/gpio-aspeed.txt | 2 +-
 drivers/gpio/gpio-aspeed.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

-- 
2.11.0



[RESEND PATCH 1/2] dt-bindings: gpio: aspeed: Remove reference to clock name

2017-08-07 Thread Andrew Jeffery
HPLL was in fact not the clock we need. Remove the prescription of which
clock to use, to avoid further error. Please refer to your datasheet and
double-check, like I should have.

Signed-off-by: Andrew Jeffery 
---
 Documentation/devicetree/bindings/gpio/gpio-aspeed.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/devicetree/bindings/gpio/gpio-aspeed.txt 
b/Documentation/devicetree/bindings/gpio/gpio-aspeed.txt
index c756afa88cc6..fc6378c778c5 100644
--- a/Documentation/devicetree/bindings/gpio/gpio-aspeed.txt
+++ b/Documentation/devicetree/bindings/gpio/gpio-aspeed.txt
@@ -18,7 +18,7 @@ Required properties:
 Optional properties:
 
 - interrupt-parent  : The parent interrupt controller, optional if 
inherited
-- clocks: A phandle to the HPLL clock node for debounce timings
+- clocks: A phandle to the clock to use for debounce timings
 
 The gpio and interrupt properties are further described in their respective
 bindings documentation:
-- 
2.11.0



[RESEND PATCH 2/2] gpio: aspeed: Remove reference to clock name in debounce warning message

2017-08-07 Thread Andrew Jeffery
HPLL was in fact not the clock we need. Remove the description of which
clock to use, to avoid any further error.

Signed-off-by: Andrew Jeffery 
---
 drivers/gpio/gpio-aspeed.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpio/gpio-aspeed.c b/drivers/gpio/gpio-aspeed.c
index 4ca436e66bdb..bfc53995064a 100644
--- a/drivers/gpio/gpio-aspeed.c
+++ b/drivers/gpio/gpio-aspeed.c
@@ -834,7 +834,7 @@ static int __init aspeed_gpio_probe(struct platform_device 
*pdev)
gpio->clk = of_clk_get(pdev->dev.of_node, 0);
if (IS_ERR(gpio->clk)) {
dev_warn(&pdev->dev,
-   "No HPLL clock phandle provided, debouncing disabled\n");
+   "Failed to get clock from devicetree, debouncing disabled\n");
gpio->clk = NULL;
}
 
-- 
2.11.0



Re: [PATCH 0/6] In-kernel QMI handling

2017-08-07 Thread Marcel Holtmann
Hi Bjorn,

> This series starts by moving the common definitions of the QMUX
> protocol to the
> uapi header, as they are shared with clients - both in kernel and
> userspace.
> 
> This series then introduces in-kernel helper functions for aiding the
> handling
> of QMI encoded messages in the kernel. QMI encoding is a wire-format
> used in
> exchanging messages between the majority of QRTR clients and
> services.
 
 This raises a few red-flags for me.
>>> 
>>> I'm glad it does. In discussions with the responsible team within
>>> Qualcomm I've highlighted a number of concerns about enabling this
>>> support in the kernel. Together we're continuously looking into what
>>> should be pushed out to user space, and trying to not introduce
>>> unnecessary new users.
>>> 
 So far, we've kept almost everything QMI related in userspace and
 handled all QMI control-channel messages from libraries like libqmi or
 uqmi via the cdc-wdm driver and the "rmnet" interface via the qmi_wwan
 driver.  The kernel drivers just serve as the transport.
 
>>> 
>>> The path that was taken to support the MSM-style devices was to
>>> implement net/qrtr, which exposes a socket interface to abstract the
>>> physical transports (QMUX or IPCROUTER in Qualcomm terminology).
>>> 
>>> As I share you view on letting the kernel handle the transportation only
>>> the task of keeping track of registered services (service id -> node and
>>> port mapping) was done in a user space process and so far we've only
>>> ever have to deal with QMI encoded messages in various user space tools.
>> 
>> I think that the transport and multiplexing can be in the kernel as
>> long as it is done as proper subsystem. Similar to Phonet or CAIF.
>> Meaning it should have a well defined socket interface that can be
>> easily used from userspace, but also a clean in-kernel interface
>> handling.
>> 
> 
> In a mobile Qualcomm device there's a few different components involved
> here: message routing, QMUX protocol and QMI-encoding.
> 
> The downstream Qualcomm kernel implements the two first in the
> IPCROUTER, upstream this is split between the kernel net/qrtr and a user
> space service-register implementing the QMUX protocol for knowing where
> services are located.

as long as all of QMUX moves into the kernel and userspace doesn’t need to know 
about QMUX anymore, that would be good. The cross termination of QMUX in kernel 
space and userspace is a really bad idea. It is even worse if userspace has to 
do service registration. That is just a recipe for disaster.

One extra thing to keep in mind is that all the USB dongles should register with
such a new QMI subsystem, and have their network interfaces be proper
children of the QMI node. And please do not forget QMI passthrough via MBIM.
Just saying we move some QMUX code into the kernel is not enough. It really 
needs to be a proper subsystem with a proper hierarchy of the child devices.

> The common encoding of messages passed between endpoints of the message
> routing is QMI, which is made an affair totally that of each client.
> 
>> If Qualcomm is supportive of this effort and is willing to actually
>> assist and/or open some of the specs or interface descriptions, then
>> this is a good thing. Service registration and cleanup is really done
>> best in the kernel. Same applies to multiplexing. Trying to do
>> multiplexing in userspace is always cumbersome and leads to overhead
>> that is of no gain. For example within oFono, we had to force
>> everything to go via oFono since it was the only sane way of handling
>> it. Other approaches were error prone and full of race conditions. You
>> need a central entity that can clean up.
>> 
> 
> The current upstream solution depends on a collaboration between
> net/qrtr and the user space service register for figuring out whom to
> send messages to. After that muxing et al is handled by the socket
> interface and service registry does not need to be involved.
> 
> Qualcomm is very supporting of this solution and we're collaborating on
> transitioning "downstream" to use this implementation.

It would be good if someone looks into oFono and makes sure that it works there 
as well. I would prefer at least some initial patches to proof-point the kernel 
APIs. oFono is a full telephony stack. So if you can make that one work, then 
you are most likely on the right track.

>> For the definition of an UAPI to share some code, I am actually not
>> sure that is such a good idea. For example the QMI code in oFono
>> follows a way simpler approach. And I am not convinced that all the
>> macros are actually beneficial. For example, the whole netlink macros
>> are pretty cumbersome. Adding some Documentation/qmi.txt on how the
>> wire format looks like and what is expected seems to be a way better
>> approach.
>> 
> 
> The socket interface provided by the kernel expects some knowledge of
> the QMUX protoc
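
For readers unfamiliar with it, a minimal user-space sketch of the QRTR
socket interface referred to above, assuming only AF_QIPCRTR and struct
sockaddr_qrtr from <linux/qrtr.h>; the QMUX/name-service lookup needed to
find a real service's node and port is deliberately left out:

#include <stdio.h>
#include <sys/socket.h>
#include <linux/qrtr.h>

#ifndef AF_QIPCRTR
#define AF_QIPCRTR 42          /* assumed value if the libc headers lack it */
#endif

int main(void)
{
	struct sockaddr_qrtr sq;
	socklen_t sl = sizeof(sq);
	int sock = socket(AF_QIPCRTR, SOCK_DGRAM, 0);

	if (sock < 0) {
		perror("socket(AF_QIPCRTR)");
		return 1;
	}

	/* The kernel assigns the local node/port; QMI-encoded messages are
	 * then exchanged with sendto()/recvfrom() against a service address. */
	if (getsockname(sock, (struct sockaddr *)&sq, &sl) == 0)
		printf("local qrtr address: node %u port %u\n",
		       sq.sq_node, sq.sq_port);
	return 0;
}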

Re: [PATCH v2 2/4] usb: common: Move u_serial from gadget/function to usb/common

2017-08-07 Thread Felipe Balbi

Hi,

Lu Baolu  writes:
>> Lu Baolu  writes:
>>> The component u_serial provides a glue layer between TTY layer
>>> and a USB gadget device needed to provide a basic serial port
>>> functionality. Currently, u_serial sits under gadget/function
>>> and depends on CONFIG_USB_GADGET to be compiled and used.
>>>
>>> Most of the serial gadget devices are based on a UDC (USB device
>>> controller) and implemented by making use of the Linux gadget
>>> frameworks. But we are facing other implementions as well. One
>>> example can be found with xHCI debug capability. The xHCI debug
>>> capability implements a serial gadget with hardware and firmware,
>>> and provides an interface similar with xHCI host for submitting
>>> and reaping the transfer requests.
>>>
>>> In order to make better use of u_serial when implementing xHCI
>>> debug capability in xHCI driver, this patch moves u_serial.c
>>> from gadget/function to usb/common, and moves u_serial.h from
>>> gadget/function to include/linux/usb.
>>>
>>> Signed-off-by: Lu Baolu 
>> NAK, u_serial uses the gadget API. It's definitely not COMMON.
>>
>
> Okay. It seems that I can't use u_serial anyway. I will implement
> a new tty glue for my case.

have you looked at drivers/usb/serial/?

-- 
balbi


Re: [PATCH v13 4/5] mm: support reporting free page blocks

2017-08-07 Thread Wei Wang

On 08/03/2017 05:11 PM, Michal Hocko wrote:

On Thu 03-08-17 14:38:18, Wei Wang wrote:
This is just too ugly and wrong actually. Never provide struct page
pointers outside of the zone->lock. What I've had in mind was to simply
walk free lists of the suitable order and call the callback for each one.
Something as simple as

for (i = 0; i < MAX_NR_ZONES; i++) {
struct zone *zone = &pgdat->node_zones[i];

if (!populated_zone(zone))
continue;


Can we directly use for_each_populated_zone(zone) here?



spin_lock_irqsave(&zone->lock, flags);
for (order = min_order; order < MAX_ORDER; ++order) {



This appears to be covered by for_each_migratetype_order(order, mt) below.



struct free_area *free_area = &zone->free_area[order];
enum migratetype mt;
struct page *page;

if (!free_area->nr_pages)
continue;

for_each_migratetype_order(order, mt) {
        list_for_each_entry(page,
                        &free_area->free_list[mt], lru) {
                pfn = page_to_pfn(page);
                visit(opaque2, pfn, 1 << order);
        }
}
spin_unlock_irqrestore(&zone->lock, flags);

spin_lock_irqsave(&zone->lock, flags);
list_for_each_entry(page,
                &zone->free_area[order].free_list[mt], lru) {
        pfn = page_to_pfn(page);
        visit(opaque1, pfn, 1 << order);
}
spin_unlock_irqrestore(&zone->lock, flags);
}
}


Best,
Wei
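
Putting the fragments above together, a rough kernel-context sketch of the
walk under discussion; the visit() callback and its opaque cookie are
placeholder assumptions, and only the pfn/order pair (never a struct page
pointer) is passed out from under zone->lock:

#include <linux/mm.h>
#include <linux/mmzone.h>

static void walk_free_page_blocks(unsigned int min_order,
				  void (*visit)(void *opaque, unsigned long pfn,
						unsigned long nr_pages),
				  void *opaque)
{
	struct zone *zone;
	struct page *page;
	unsigned long flags, pfn;
	unsigned int order;
	int mt;

	for_each_populated_zone(zone) {
		for (order = min_order; order < MAX_ORDER; order++) {
			for (mt = 0; mt < MIGRATE_TYPES; mt++) {
				spin_lock_irqsave(&zone->lock, flags);
				list_for_each_entry(page,
					&zone->free_area[order].free_list[mt],
					lru) {
					pfn = page_to_pfn(page);
					visit(opaque, pfn, 1UL << order);
				}
				spin_unlock_irqrestore(&zone->lock, flags);
			}
		}
	}
}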



Re: [alsa-devel] [PATCH 08/11] ALSA: vsnd: Add timer for period interrupt emulation

2017-08-07 Thread Oleksandr Andrushchenko

On 08/07/2017 06:14 PM, Oleksandr Andrushchenko wrote:


On 08/07/2017 04:55 PM, Clemens Ladisch wrote:

Oleksandr Andrushchenko wrote:

On 08/07/2017 04:11 PM, Clemens Ladisch wrote:

How does that interface work?

For the buffer received in .copy_user/.copy_kernel we send
a request to the backend and get response back (async) when it has 
copied
the bytes into HW/mixer/etc, so the buffer at frontend side can be 
reused.

So if the frontend sends too many (too large) requests, does the
backend wait until there is enough free space in the buffer before
it does the actual copying and then acks?

Well, the frontend should be backend agnostic.
In our implementation the backend is a user-space application which sits
either on top of an ALSA driver or PulseAudio, so it acks correspondingly,
for example when the ALSA driver completes .copy_user and returns
from the kernel.

If yes, then these acks can be used as interrupts.

we can probably teach our backend to track periods elapsed for ALSA,
but not sure if it is possible for PulseAudio - do you know if this is
also doable for pulse?

Let's assume backend blocks until the buffer played/consumed...

   (You still
have to count frames, and call snd_pcm_period_elapsed() exactly
when a period boundary was reached or crossed.)
... and what if the buffer has multiple periods? So that the backend sends
a single response for multiple periods (buffers with a fractional period
number can be handled separately)?
We will have to either send snd_pcm_period_elapsed once (wrong, because
multiple periods were consumed) or multiple times at once with no delay
(wrong, because there will be confusion that multiple periods were not
reported for quite some time and then there is a burst of events).
Either way the behavior will not be the one desired (please correct me
if I am wrong here).


Splitting a large read/write into smaller requests to the backend
would improve the granularity of the known stream position.

The overall latency would be the sum of the sizes of the frontend
and backend buffers.


Why is the protocol designed this way?
We also work on para-virtualizing a display device, and there we tried to
use page flip events from the backend to the frontend, similar to the
period interrupt for audio. When multiple displays (read: multiple audio
streams) were in place we were flooded with system interrupts (which are
period events in our case) and performance dropped significantly. This is
why we switched to interrupt emulation, here via a timer for audio. The
main measures were:
1. Number of events between front and back
2. Latency
With the timer approach we reduce 1) to the minimum, which is a must (no
period interrupts), but 2) is still here.
With emulated period interrupts (protocol events) we have an issue with 1)
and 2) still remains.


BTW, there is one more approach to solve this [1],
but it uses its own Xen sound protocol and heavily relies
on the Linux implementation, which cannot be part of a generic protocol.

So, to me, neither approach solves the problem 100%, so we decided
to stick to timers. Hope this gives more background on why we did things
the way we did.

  Wasn't the goal to expose
some 'real' sound card?


yes, but it can be implemented in different ways, please see above

Regards,
Clemens

Thank you for your interest,
Oleksandr


[1] 
https://github.com/OpenXT/pv-linux-drivers/blob/master/archive/openxt-audio/main.c#L356
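
For illustration, a rough kernel-context sketch (not the vsnd driver
itself; the structure and function names are assumptions) of the
timer-based period interrupt emulation described above:

#include <linux/hrtimer.h>
#include <sound/pcm.h>

struct vsnd_period_timer {
	struct hrtimer timer;
	struct snd_pcm_substream *substream;
	ktime_t period_time;            /* period_size / rate, precomputed */
};

static enum hrtimer_restart vsnd_period_timer_fn(struct hrtimer *t)
{
	struct vsnd_period_timer *vt =
		container_of(t, struct vsnd_period_timer, timer);

	/* Tell ALSA that one more period has (nominally) elapsed. */
	snd_pcm_period_elapsed(vt->substream);

	hrtimer_forward_now(t, vt->period_time);
	return HRTIMER_RESTART;
}

static void vsnd_period_timer_start(struct vsnd_period_timer *vt)
{
	hrtimer_init(&vt->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	vt->timer.function = vsnd_period_timer_fn;
	hrtimer_start(&vt->timer, vt->period_time, HRTIMER_MODE_REL);
}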


[PATCH 0/2] Fix clock name in Aspeed GPIO bindings and driver

2017-08-07 Thread Andrew Jeffery
Hello,

Joel discovered I brain-farted the referenced clock name in the patches
introducing the debounce capability for the Aspeed driver. The datasheet says
PCLK but I wrote HPLL for whatever reason, so clean up that mistake by simply
removing references to HPLL and putting the onus on the devicetree author to
get it right.

Cheers,

Andrew

Andrew Jeffery (2):
  dt-bindings: gpio: aspeed: Remove reference to clock name
  gpio: aspeed: Remove reference to clock name in debounce warning
message

 Documentation/devicetree/bindings/gpio/gpio-aspeed.txt | 2 +-
 drivers/gpio/gpio-aspeed.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

-- 
2.11.0



[PATCH 1/2] dt-bindings: gpio: aspeed: Remove reference to clock name

2017-08-07 Thread Andrew Jeffery
HPLL is in fact not the clock we need. Remove the prescription of which clock
to use, to avoid further error. Please refer to your datasheet and double-check
like I should have.

Signed-off-by: Andrew Jeffery 
---
 Documentation/devicetree/bindings/gpio/gpio-aspeed.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/devicetree/bindings/gpio/gpio-aspeed.txt 
b/Documentation/devicetree/bindings/gpio/gpio-aspeed.txt
index c756afa88cc6..fc6378c778c5 100644
--- a/Documentation/devicetree/bindings/gpio/gpio-aspeed.txt
+++ b/Documentation/devicetree/bindings/gpio/gpio-aspeed.txt
@@ -18,7 +18,7 @@ Required properties:
 Optional properties:
 
 - interrupt-parent  : The parent interrupt controller, optional if 
inherited
-- clocks: A phandle to the HPLL clock node for debounce timings
+- clocks: A phandle to the clock to use for debounce timings
 
 The gpio and interrupt properties are further described in their respective
 bindings documentation:
-- 
2.11.0



Re: [PATCH] f2fs: fix some cases with reserved_blocks

2017-08-07 Thread Chao Yu
On 2017/8/8 12:12, Yunlong Song wrote:
> Signed-off-by: Yunlong Song 
> ---
>  fs/f2fs/recovery.c | 3 ++-
>  fs/f2fs/super.c| 9 +
>  2 files changed, 7 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
> index a3d0261..e288319 100644
> --- a/fs/f2fs/recovery.c
> +++ b/fs/f2fs/recovery.c
> @@ -51,7 +51,8 @@ bool space_for_roll_forward(struct f2fs_sb_info *sbi)
>  {
>   s64 nalloc = percpu_counter_sum_positive(&sbi->alloc_valid_block_count);
>  
> - if (sbi->last_valid_block_count + nalloc > sbi->user_block_count)
> + if (sbi->last_valid_block_count + nalloc +
> + sbi->reserved_blocks > sbi->user_block_count)

I think we can treat reserved blocks as over-provision space in f2fs, so it
would be safe to store invalid data (which may become valid during recovery) there.
Anyway, it is OK to keep the old condition check.

>   return false;
>   return true;
>  }
> diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
> index 4c1bdcb..c644bf5 100644
> --- a/fs/f2fs/super.c
> +++ b/fs/f2fs/super.c
> @@ -946,6 +946,7 @@ static int f2fs_statfs(struct dentry *dentry, struct 
> kstatfs *buf)
>   u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
>   block_t total_count, user_block_count, start_count, ovp_count;
>   u64 avail_node_count;
> + block_t avail_user_block_count;
>  
>   total_count = le64_to_cpu(sbi->raw_super->block_count);
>   user_block_count = sbi->user_block_count;
> @@ -953,16 +954,16 @@ static int f2fs_statfs(struct dentry *dentry, struct 
> kstatfs *buf)
>   ovp_count = SM_I(sbi)->ovp_segments << sbi->log_blocks_per_seg;
>   buf->f_type = F2FS_SUPER_MAGIC;
>   buf->f_bsize = sbi->blocksize;
> + avail_user_block_count = user_block_count - sbi->reserved_blocks;
>  
>   buf->f_blocks = total_count - start_count;
>   buf->f_bfree = user_block_count - valid_user_blocks(sbi) + ovp_count;
> - buf->f_bavail = user_block_count - valid_user_blocks(sbi) -
> - sbi->reserved_blocks;
> + buf->f_bavail = avail_user_block_count - valid_user_blocks(sbi);
>  
>   avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM;
>  
> - if (avail_node_count > user_block_count) {
> - buf->f_files = user_block_count;
> + if (avail_node_count > avail_user_block_count) {

Like the f_blocks calculation, the f_files one doesn't need to consider
reserved_blocks.

Thanks,

> + buf->f_files = avail_user_block_count;
>   buf->f_ffree = buf->f_bavail;
>   } else {
>   buf->f_files = avail_node_count;
> 



Re: [PATCH] userfaultfd: replace ENOSPC with ESRCH in case mm has gone during copy/zeropage

2017-08-07 Thread Mike Rapoport
(adding Michal)

On Mon, Aug 07, 2017 at 04:12:25PM +0300, Mike Rapoport wrote:
> When the process exit races with an outstanding mcopy_atomic, it would be
> better to return an ESRCH error. When such a race occurs the process and its mm
> are going away and returning "no such process" to the uffd monitor seems
> a better fit than ENOSPC.
> 
> Suggested-by: Michal Hocko 
> Cc: Andrea Arcangeli 
> Cc: "Dr. David Alan Gilbert" 
> Cc: Pavel Emelyanov 
> Cc: Mike Kravetz 
> Signed-off-by: Mike Rapoport 
> ---
> The man-pages update is ready and I'll send it out once the patch is
> merged.
> 
>  fs/userfaultfd.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index 06ea26b8c996..b0d5897bc4e6 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -1600,7 +1600,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
>  uffdio_copy.len);
>   mmput(ctx->mm);
>   } else {
> - return -ENOSPC;
> + return -ESRCH;
>   }
>   if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
>   return -EFAULT;
> @@ -1647,7 +1647,7 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx 
> *ctx,
>uffdio_zeropage.range.len);
>   mmput(ctx->mm);
>   } else {
> - return -ENOSPC;
> + return -ESRCH;
>   }
>   if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
>   return -EFAULT;
> -- 
> 2.7.4
> 

-- 
Sincerely yours,
Mike.



[PATCH 2/2] gpio: aspeed: Remove reference to clock name in debounce warning message

2017-08-07 Thread Andrew Jeffery
HPLL is in fact not the clock we need. Remove the description of which clock we
failed to find a phandle to in order to avoid any further error.

Signed-off-by: Andrew Jeffery 
---
 drivers/gpio/gpio-aspeed.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpio/gpio-aspeed.c b/drivers/gpio/gpio-aspeed.c
index 4ca436e66bdb..bfc53995064a 100644
--- a/drivers/gpio/gpio-aspeed.c
+++ b/drivers/gpio/gpio-aspeed.c
@@ -834,7 +834,7 @@ static int __init aspeed_gpio_probe(struct platform_device 
*pdev)
gpio->clk = of_clk_get(pdev->dev.of_node, 0);
if (IS_ERR(gpio->clk)) {
dev_warn(&pdev->dev,
-   "No HPLL clock phandle provided, debouncing disabled\n");
+   "Failed to get clock from devicetree, debouncing disabled\n");
gpio->clk = NULL;
}
 
-- 
2.11.0



[PATCH] gpu: host1x: fix error return code in host1x_probe()

2017-08-07 Thread Gustavo A. R. Silva
platform_get_irq() returns an error code, but the host1x driver
ignores it and always returns -ENXIO. This is not correct and
prevents -EPROBE_DEFER from being propagated properly.

Notice that platform_get_irq() no longer returns 0 on error:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e330b9a6bb35dc7097a4f02cb1ae7b6f96df92af

Print and propagate the return value of platform_get_irq on failure.

This issue was detected with the help of Coccinelle.

Signed-off-by: Gustavo A. R. Silva 
---
 drivers/gpu/host1x/dev.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/host1x/dev.c b/drivers/gpu/host1x/dev.c
index 7782725..7f22c5c 100644
--- a/drivers/gpu/host1x/dev.c
+++ b/drivers/gpu/host1x/dev.c
@@ -134,8 +134,8 @@ static int host1x_probe(struct platform_device *pdev)
 
syncpt_irq = platform_get_irq(pdev, 0);
if (syncpt_irq < 0) {
-   dev_err(&pdev->dev, "failed to get IRQ\n");
-   return -ENXIO;
+   dev_err(&pdev->dev, "failed to get IRQ: %d\n", syncpt_irq);
+   return syncpt_irq;
}
 
host = devm_kzalloc(&pdev->dev, sizeof(*host), GFP_KERNEL);
-- 
2.5.0



Re: linux-next: manual merge of the userns tree with the mips tree

2017-08-07 Thread Ralf Baechle
On Tue, Aug 08, 2017 at 03:10:04PM +1000, Stephen Rothwell wrote:

(Maciej added to cc.)

> Hi Eric,
> 
> Today's linux-next merge of the userns tree got a conflict in:
> 
>   arch/mips/kernel/traps.c
> 
> between commit:
> 
>   260a789828aa ("MIPS: signal: Remove unreachable code from 
> force_fcr31_sig().")
> 
> from the mips tree and commit:
> 
>   ea1b75cf9138 ("signal/mips: Document a conflict with SI_USER with SIGFPE")
> 
> from the userns tree.
> 
> I fixed it up (the former removed the code updated by the latter) and
> can carry the fix as necessary. This is now fixed as far as linux-next
> is concerned, but any non trivial conflicts should be mentioned to your
> upstream maintainer when your tree is submitted for merging.  You may
> also want to consider cooperating with the maintainer of the conflicting
> tree to minimise any particularly complex conflicts.

Eric,

after yesterday's emails on the topic I think commit ea1b75cf9138 ("signal/
mips: Document a conflict with SI_USER with SIGFPE") should be dropped.

  Ralf


[PATCH] mmc: mxcmmc: Handle return value of clk_prepare_enable

2017-08-07 Thread Arvind Yadav
clk_prepare_enable() can fail here and we must check its return value.

Signed-off-by: Arvind Yadav 
---
 drivers/mmc/host/mxcmmc.c | 25 +++--
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/drivers/mmc/host/mxcmmc.c b/drivers/mmc/host/mxcmmc.c
index fb3ca82..c016820 100644
--- a/drivers/mmc/host/mxcmmc.c
+++ b/drivers/mmc/host/mxcmmc.c
@@ -1098,8 +1098,13 @@ static int mxcmci_probe(struct platform_device *pdev)
goto out_free;
}
 
-   clk_prepare_enable(host->clk_per);
-   clk_prepare_enable(host->clk_ipg);
+   ret = clk_prepare_enable(host->clk_per);
+   if (ret)
+   goto out_free;
+
+   ret = clk_prepare_enable(host->clk_ipg);
+   if (ret)
+   goto out_clk_per_put;
 
mxcmci_softreset(host);
 
@@ -1168,8 +1173,9 @@ static int mxcmci_probe(struct platform_device *pdev)
dma_release_channel(host->dma);
 
 out_clk_put:
-   clk_disable_unprepare(host->clk_per);
clk_disable_unprepare(host->clk_ipg);
+out_clk_per_put:
+   clk_disable_unprepare(host->clk_per);
 
 out_free:
mmc_free_host(mmc);
@@ -1212,10 +1218,17 @@ static int __maybe_unused mxcmci_resume(struct device 
*dev)
 {
struct mmc_host *mmc = dev_get_drvdata(dev);
struct mxcmci_host *host = mmc_priv(mmc);
+   int ret;
 
-   clk_prepare_enable(host->clk_per);
-   clk_prepare_enable(host->clk_ipg);
-   return 0;
+   ret = clk_prepare_enable(host->clk_per);
+   if (ret)
+   return ret;
+
+   ret = clk_prepare_enable(host->clk_ipg);
+   if (ret)
+   clk_disable_unprepare(host->clk_per);
+
+   return ret;
 }
 
 static SIMPLE_DEV_PM_OPS(mxcmci_pm_ops, mxcmci_suspend, mxcmci_resume);
-- 
1.9.1



Re: [lkp-robot] [mm] 7674270022: will-it-scale.per_process_ops -19.3% regression

2017-08-07 Thread Nadav Amit
Nadav Amit  wrote:

> Minchan Kim  wrote:
> 
>> Hi,
>> 
>> On Tue, Aug 08, 2017 at 09:19:23AM +0800, kernel test robot wrote:
>>> Greeting,
>>> 
>>> FYI, we noticed a -19.3% regression of will-it-scale.per_process_ops due to 
>>> commit:
>>> 
>>> 
>>> commit: 76742700225cad9df49f05399381ac3f1ec3dc60 ("mm: fix 
>>> MADV_[FREE|DONTNEED] TLB flush miss problem")
>>> url: 
>>> https://github.com/0day-ci/linux/commits/Nadav-Amit/mm-migrate-prevent-racy-access-to-tlb_flush_pending/20170802-205715
>>> 
>>> 
>>> in testcase: will-it-scale
>>> on test machine: 88 threads Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz with 
>>> 64G memory
>>> with following parameters:
>>> 
>>> nr_task: 16
>>> mode: process
>>> test: brk1
>>> cpufreq_governor: performance
>>> 
>>> test-description: Will It Scale takes a testcase and runs it from 1 through 
>>> to n parallel copies to see if the testcase will scale. It builds both a 
>>> process and threads based test in order to see any differences between the 
>>> two.
>>> test-url: https://github.com/antonblanchard/will-it-scale
>> 
>> Thanks for the report.
>> Could you explain what kinds of workload you are testing?
>> 
>> Does it calls frequently madvise(MADV_DONTNEED) in parallel on multiple
>> threads?
> 
> According to the description it is "testcase:brk increase/decrease of one
> page”. According to the mode it spawns multiple processes, not threads.
> 
> Since a single page is unmapped each time, and the iTLB-loads increase
> dramatically, I would suspect that for some reason a full TLB flush is
> caused during do_munmap().
> 
> If I find some free time, I’ll try to profile the workload - but feel free
> to beat me to it.

The root-cause appears to be that tlb_finish_mmu() does not call
dec_tlb_flush_pending() - as it should. Any chance you can take care of it?

Having said that it appears that cpumask_any_but() is really inefficient
since it does not have an optimization for the case in which
small_const_nbits(nbits)==true. When I find some free time, I’ll try to deal
with it.

Thanks,
Nadav

[PATCH] mmc: wmt-sdmmc: Handle return value of clk_prepare_enable

2017-08-07 Thread Arvind Yadav
clk_prepare_enable() can fail here and we must check its return value.

Signed-off-by: Arvind Yadav 
---
 drivers/mmc/host/wmt-sdmmc.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/mmc/host/wmt-sdmmc.c b/drivers/mmc/host/wmt-sdmmc.c
index 21ebba8..e64f930 100644
--- a/drivers/mmc/host/wmt-sdmmc.c
+++ b/drivers/mmc/host/wmt-sdmmc.c
@@ -856,7 +856,9 @@ static int wmt_mci_probe(struct platform_device *pdev)
goto fail5;
}
 
-   clk_prepare_enable(priv->clk_sdmmc);
+   ret = clk_prepare_enable(priv->clk_sdmmc);
+   if (ret)
+   goto fail6;
 
/* configure the controller to a known 'ready' state */
wmt_reset_hardware(mmc);
@@ -866,6 +868,8 @@ static int wmt_mci_probe(struct platform_device *pdev)
dev_info(&pdev->dev, "WMT SDHC Controller initialized\n");
 
return 0;
+fail6:
+   clk_put(priv->clk_sdmmc);
 fail5:
free_irq(dma_irq, priv);
 fail4:
-- 
1.9.1



[PATCH 1/4] usb: mtu3: add generic compatible string

2017-08-07 Thread Chunfeng Yun
The mtu3 driver is a generic driver for MediaTek usb3 DRD IP, add
a generic compatible to avoid confusion when support new SoCs but
use a compatible with specific SoC's name "mt8173".

Signed-off-by: Chunfeng Yun 
---
 drivers/usb/mtu3/mtu3_plat.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/usb/mtu3/mtu3_plat.c b/drivers/usb/mtu3/mtu3_plat.c
index 0d3ebb3..088e3e6 100644
--- a/drivers/usb/mtu3/mtu3_plat.c
+++ b/drivers/usb/mtu3/mtu3_plat.c
@@ -500,6 +500,7 @@ static int __maybe_unused mtu3_resume(struct device *dev)
 
 static const struct of_device_id mtu3_of_match[] = {
{.compatible = "mediatek,mt8173-mtu3",},
+   {.compatible = "mediatek,mtu3",},
{},
 };
 
-- 
1.7.9.5



[PATCH 2/4] usb: xhci-mtk: add generic compatible string

2017-08-07 Thread Chunfeng Yun
The xhci-mtk driver is a generic driver for MediaTek xHCI IP, add
a generic compatible to avoid confusion when support new SoCs but
use a compatible with specific SoC's name "mt8173".

Signed-off-by: Chunfeng Yun 
---
 drivers/usb/host/xhci-mtk.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/usb/host/xhci-mtk.c b/drivers/usb/host/xhci-mtk.c
index 67d5dc7..d2934b9 100644
--- a/drivers/usb/host/xhci-mtk.c
+++ b/drivers/usb/host/xhci-mtk.c
@@ -795,6 +795,7 @@ static int __maybe_unused xhci_mtk_resume(struct device 
*dev)
 #ifdef CONFIG_OF
 static const struct of_device_id mtk_xhci_of_match[] = {
{ .compatible = "mediatek,mt8173-xhci"},
+   { .compatible = "mediatek,xhci-mtk"},
{ },
 };
 MODULE_DEVICE_TABLE(of, mtk_xhci_of_match);
-- 
1.7.9.5



[PATCH 3/4] dt-bindings: mt8173-mtu3: add generic compatible and rename file

2017-08-07 Thread Chunfeng Yun
The mt8173-mtu3.txt actually holds the bindings for all mediatek
SoCs with usb3 DRD IP, so add a generic compatible and change the
name to mtu3.txt.

Signed-off-by: Chunfeng Yun 
---
 .../bindings/usb/{mt8173-mtu3.txt => mtu3.txt} |6 --
 1 file changed, 4 insertions(+), 2 deletions(-)
 rename Documentation/devicetree/bindings/usb/{mt8173-mtu3.txt => mtu3.txt} 
(95%)

diff --git a/Documentation/devicetree/bindings/usb/mt8173-mtu3.txt 
b/Documentation/devicetree/bindings/usb/mtu3.txt
similarity index 95%
rename from Documentation/devicetree/bindings/usb/mt8173-mtu3.txt
rename to Documentation/devicetree/bindings/usb/mtu3.txt
index 1d7c3bc..832741d 100644
--- a/Documentation/devicetree/bindings/usb/mt8173-mtu3.txt
+++ b/Documentation/devicetree/bindings/usb/mtu3.txt
@@ -1,7 +1,9 @@
 The device node for Mediatek USB3.0 DRD controller
 
 Required properties:
- - compatible : should be "mediatek,mt8173-mtu3"
+ - compatible : should be one of
+   "mediatek,mt8173-mtu3" (deprecated, use "mediatek,mtu3" instead),
+   "mediatek,mtu3"
  - reg : specifies physical base address and size of the registers
  - reg-names: should be "mac" for device IP and "ippc" for IP port control
  - interrupts : interrupt used by the device IP
@@ -44,7 +46,7 @@ Optional properties:
 Sub-nodes:
 The xhci should be added as subnode to mtu3 as shown in the following example
 if host mode is enabled. The DT binding details of xhci can be found in:
-Documentation/devicetree/bindings/usb/mt8173-xhci.txt
+Documentation/devicetree/bindings/usb/xhci-mtk.txt
 
 Example:
 ssusb: usb@11271000 {
-- 
1.7.9.5



[PATCH 4/4] dt-bindings: mt8173-xhci: add generic compatible and rename file

2017-08-07 Thread Chunfeng Yun
The mt8173-xhci.txt actually holds the bindings for all mediatek
SoCs with xHCI controller, so add a generic compatible and change
the name to xhci-mtk.txt to reflect that.

Signed-off-by: Chunfeng Yun 
---
 .../bindings/usb/{mt8173-xhci.txt => xhci-mtk.txt} |   10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)
 rename Documentation/devicetree/bindings/usb/{mt8173-xhci.txt => xhci-mtk.txt} 
(92%)

diff --git a/Documentation/devicetree/bindings/usb/mt8173-xhci.txt 
b/Documentation/devicetree/bindings/usb/xhci-mtk.txt
similarity index 92%
rename from Documentation/devicetree/bindings/usb/mt8173-xhci.txt
rename to Documentation/devicetree/bindings/usb/xhci-mtk.txt
index 0acfc8a..1ce77c7 100644
--- a/Documentation/devicetree/bindings/usb/mt8173-xhci.txt
+++ b/Documentation/devicetree/bindings/usb/xhci-mtk.txt
@@ -11,7 +11,9 @@ into two parts.
 
 
 Required properties:
- - compatible : should contain "mediatek,mt8173-xhci"
+ - compatible : should be one of
+   "mediatek,mt8173-xhci" (deprecated, use "mediatek,xhci-mtk" instead),
+   "mediatek,xhci-mtk"
  - reg : specifies physical base address and size of the registers
  - reg-names: should be "mac" for xHCI MAC and "ippc" for IP port control
  - interrupts : interrupt used by the controller
@@ -68,10 +70,12 @@ usb30: usb@1127 {
 
 In the case, xhci is added as subnode to mtu3. An example and the DT binding
 details of mtu3 can be found in:
-Documentation/devicetree/bindings/usb/mt8173-mtu3.txt
+Documentation/devicetree/bindings/usb/mtu3.txt
 
 Required properties:
- - compatible : should contain "mediatek,mt8173-xhci"
+ - compatible : should be one of
+   "mediatek,mt8173-xhci" (deprecated, use "mediatek,xhci-mtk" instead),
+   "mediatek,xhci-mtk"
  - reg : specifies physical base address and size of the registers
  - reg-names: should be "mac" for xHCI MAC
  - interrupts : interrupt used by the host controller
-- 
1.7.9.5



[PATCH] ARM: dts: DRA7: Add pcie1 dt node for EP mode

2017-08-07 Thread Kishon Vijay Abraham I
Add pcie1 dt node in order for the controller to operate in
endpoint mode. However since none of the dra7 based boards have
slots configured to operate in endpoint mode, keep EP mode
disabled.

Signed-off-by: Kishon Vijay Abraham I 
---
 arch/arm/boot/dts/am571x-idk.dts|  9 +
 arch/arm/boot/dts/am572x-idk.dts|  7 ++-
 arch/arm/boot/dts/am57xx-beagle-x15-common.dtsi |  7 ++-
 arch/arm/boot/dts/dra7-evm.dts  |  4 
 arch/arm/boot/dts/dra7.dtsi | 23 ++-
 arch/arm/boot/dts/dra72-evm-common.dtsi |  4 
 6 files changed, 51 insertions(+), 3 deletions(-)

diff --git a/arch/arm/boot/dts/am571x-idk.dts b/arch/arm/boot/dts/am571x-idk.dts
index adc70fb091a2..0c0bb4e93f25 100644
--- a/arch/arm/boot/dts/am571x-idk.dts
+++ b/arch/arm/boot/dts/am571x-idk.dts
@@ -96,3 +96,12 @@
status = "okay";
};
 };
+
+&pcie1_rc {
+   status = "okay";
+   gpios = <&gpio3 23 GPIO_ACTIVE_HIGH>;
+};
+
+&pcie1_ep {
+   gpios = <&gpio3 23 GPIO_ACTIVE_HIGH>;
+};
diff --git a/arch/arm/boot/dts/am572x-idk.dts b/arch/arm/boot/dts/am572x-idk.dts
index 940fcbe5380b..5ff75004afcf 100644
--- a/arch/arm/boot/dts/am572x-idk.dts
+++ b/arch/arm/boot/dts/am572x-idk.dts
@@ -88,7 +88,12 @@
load-gpios = <&gpio3 19 GPIO_ACTIVE_LOW>;
 };
 
-&pcie1 {
+&pcie1_rc {
+   status = "okay";
+   gpios = <&gpio3 23 GPIO_ACTIVE_HIGH>;
+};
+
+&pcie1_ep {
gpios = <&gpio3 23 GPIO_ACTIVE_HIGH>;
 };
 
diff --git a/arch/arm/boot/dts/am57xx-beagle-x15-common.dtsi 
b/arch/arm/boot/dts/am57xx-beagle-x15-common.dtsi
index fdfe5b16b806..d433a50cd18a 100644
--- a/arch/arm/boot/dts/am57xx-beagle-x15-common.dtsi
+++ b/arch/arm/boot/dts/am57xx-beagle-x15-common.dtsi
@@ -570,7 +570,12 @@
};
 };
 
-&pcie1 {
+&pcie1_rc {
+   status = "ok";
+   gpios = <&gpio2 8 GPIO_ACTIVE_LOW>;
+};
+
+&pcie1_ep {
gpios = <&gpio2 8 GPIO_ACTIVE_LOW>;
 };
 
diff --git a/arch/arm/boot/dts/dra7-evm.dts b/arch/arm/boot/dts/dra7-evm.dts
index f47fc4daf062..57bd75909d96 100644
--- a/arch/arm/boot/dts/dra7-evm.dts
+++ b/arch/arm/boot/dts/dra7-evm.dts
@@ -720,3 +720,7 @@
status = "okay";
};
 };
+
+&pcie1_rc {
+   status = "okay";
+};
diff --git a/arch/arm/boot/dts/dra7.dtsi b/arch/arm/boot/dts/dra7.dtsi
index 0f0f6f58bd18..e6f2c6a15dc1 100644
--- a/arch/arm/boot/dts/dra7.dtsi
+++ b/arch/arm/boot/dts/dra7.dtsi
@@ -196,6 +196,7 @@
scm_conf1: scm_conf@1c04 {
compatible = "syscon";
reg = <0x1c04 0x0020>;
+   #syscon-cells = <2>;
};
 
scm_conf_pcie: scm_conf@1c24 {
@@ -287,7 +288,11 @@
#address-cells = <1>;
ranges = <0x5100 0x5100 0x3000
  0x00x2000 0x1000>;
-   pcie1: pcie@5100 {
+   /**
+* To enable PCI endpoint mode, disable the pcie1_rc
+* node and enable pcie1_ep mode.
+*/
+   pcie1_rc: pcie@5100 {
compatible = "ti,dra7-pcie";
reg = <0x5100 0x2000>, <0x51002000 0x14c>, 
<0x1000 0x2000>;
reg-names = "rc_dbics", "ti_conf", "config";
@@ -309,12 +314,28 @@
<0 0 0 2 &pcie1_intc 2>,
<0 0 0 3 &pcie1_intc 3>,
<0 0 0 4 &pcie1_intc 4>;
+   status = "disabled";
pcie1_intc: interrupt-controller {
interrupt-controller;
#address-cells = <0>;
#interrupt-cells = <1>;
};
};
+
+   pcie1_ep: pcie_ep@5100 {
+   compatible = "ti,dra7-pcie-ep";
+   reg = <0x5100 0x28>, <0x51002000 0x14c>, 
<0x51001000 0x28>, <0x1000 0x1000>;
+   reg-names = "ep_dbics", "ti_conf", "ep_dbics2", 
"addr_space";
+   interrupts = <0 232 0x4>;
+   num-lanes = <1>;
+   num-ib-windows = <4>;
+   num-ob-windows = <16>;
+   ti,hwmods = "pcie1";
+   phys = <&pcie1_phy>;
+   phy-names = "pcie-phy0";
+   ti,syscon-unaligned-access = <&scm_conf1 0x14 
2>;
+   st

[PATCH] memory: mtk-smi: Handle return value of clk_prepare_enable

2017-08-07 Thread Arvind Yadav
clk_prepare_enable() can fail here and we must check its return value.

Signed-off-by: Arvind Yadav 
---
 drivers/memory/mtk-smi.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/memory/mtk-smi.c b/drivers/memory/mtk-smi.c
index 4afbc41..edf36f0 100644
--- a/drivers/memory/mtk-smi.c
+++ b/drivers/memory/mtk-smi.c
@@ -321,6 +321,7 @@ static int mtk_smi_common_probe(struct platform_device 
*pdev)
struct resource *res;
const struct of_device_id *of_id;
enum mtk_smi_gen smi_gen;
+   int ret;
 
if (!dev->pm_domain)
return -EPROBE_DEFER;
@@ -359,7 +360,9 @@ static int mtk_smi_common_probe(struct platform_device 
*pdev)
if (IS_ERR(common->clk_async))
return PTR_ERR(common->clk_async);
 
-   clk_prepare_enable(common->clk_async);
+   ret = clk_prepare_enable(common->clk_async);
+   if (ret)
+   return ret;
}
pm_runtime_enable(dev);
platform_set_drvdata(pdev, common);
-- 
1.9.1



Re: [RFC PATCH 0/1] Add hugetlbfs support to memfd_create()

2017-08-07 Thread Michal Hocko
Hi,
I am one foot out of the office and will be offline for two days, so I
didn't get to review the patch yet, but this is useful information about
the usecase that should be in the patch directly for future reference.

On Mon 07-08-17 16:47:51, Mike Kravetz wrote:
> This patch came out of discussions in this e-mail thread [1].
> 
> The Oracle JVM team is developing a new garbage collection model.  This
> new model requires multiple mappings of the same anonymous memory.  One
> straight forward way to accomplish this is with memfd_create.  They can
> use the returned fd to create multiple mappings of the same memory.
> 
> The JVM today has an option to use (static hugetlb) huge pages.  If this
> option is specified, they would like to use the same garbage collection
> model requiring multiple mappings to the same memory.  Using hugetlbfs,
> it is possible to explicitly mount a filesystem and specify file paths
> in order to get an fd that can be used for multiple mappings.  However,
> this introduces additional system admin work and coordination.
> 
> Ideally they would like to get a hugetlbfs fd without requiring explicit
> mounting of a filesystem.   Today, mmap and shmget can make use of
> hugetlbfs without explicitly mounting a filesystem.  The patch adds this
> functionality to hugetlbfs.
> 
> A new flag MFD_HUGETLB is introduced to request a hugetlbfs file.  Like
> other system calls where hugetlb can be requested, the huge page size
> can be encoded in the flags argument if a non-default huge page size
> is desired.  hugetlbfs does not support sealing operations, therefore
> specifying MFD_ALLOW_SEALING with MFD_HUGETLB will result in EINVAL.
> 
> Of course, the memfd_create man page would need updating if this type of
> functionality moves forward.
> 
> [1] https://lkml.org/lkml/2017/7/6/564
> 
> Mike Kravetz (1):
>   mm/shmem: add hugetlbfs support to memfd_create()
> 
>  include/uapi/linux/memfd.h | 24 
>  mm/shmem.c | 37 +++--
>  2 files changed, 55 insertions(+), 6 deletions(-)
> 
> -- 
> 2.7.5

-- 
Michal Hocko
SUSE Labs
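
To make the use case concrete, a hedged user-space sketch of mapping the
same hugetlb-backed memfd twice; MFD_HUGETLB is the flag proposed in this
RFC, so both its availability and its value below are assumptions, and a
2 MB default huge page size is assumed as well:

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syscall.h>

#ifndef MFD_HUGETLB
#define MFD_HUGETLB 0x0004U    /* value taken from the RFC, assumed */
#endif

int main(void)
{
	size_t len = 2UL * 1024 * 1024; /* one default-sized huge page */
	int fd = syscall(__NR_memfd_create, "jvm-heap", MFD_HUGETLB);
	char *a, *b;

	if (fd < 0) {
		perror("memfd_create");
		return 1;
	}
	if (ftruncate(fd, len) < 0) {
		perror("ftruncate");
		return 1;
	}

	/* Two independent mappings of the same backing memory. */
	a = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	b = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (a == MAP_FAILED || b == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	strcpy(a, "written through the first mapping");
	printf("read through the second mapping: %s\n", b);
	return 0;
}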


Re: [PATCH] mm: ratelimit PFNs busy info message

2017-08-07 Thread Michael Ellerman
Andrew Morton  writes:

> On Wed,  2 Aug 2017 13:44:57 -0400 Jonathan Toppins  
> wrote:
>
>> The RDMA subsystem can generate several thousand of these messages per
>> second eventually leading to a kernel crash. Ratelimit these messages
>> to prevent this crash.
>
> Well...  why are all these EBUSY's occurring?  It sounds inefficient (at
> least) but if it is expected, normal and unavoidable then perhaps we
> should just remove that message altogether?

We see them on powerpc sometimes when CMA is unable to make large
allocations for the hash table of a KVM guest.

At least in that context they're not useful, CMA will try the
allocation again, and if it really can't allocate then CMA will print
more useful information itself.

So I'd vote for dropping the message and letting the callers decide what
to do.

cheers


Re: [RESEND PATCH] bcache: Don't reinvent the wheel but use existing llist API

2017-08-07 Thread Coly Li
On 2017/8/8 12:12 PM, Byungchul Park wrote:
> On Mon, Aug 07, 2017 at 06:18:35PM +0800, Coly Li wrote:
>> On 2017/8/7 4:38 PM, Byungchul Park wrote:
>>> Although llist provides proper APIs, they are not used. Make them used.
>>>
>>> Signed-off-by: Byungchul Park 
>>
>> Only have a question about why not using llist_for_each_entry(), it's
> 
> Hello,
> 
> The reason is to keep the original logic unchanged. The logic already
> does as if it's the safe version against removal.
> 
>> still OK with llist_for_each_entry_safe(). The rested part is good to me.
>>
>> Acked-by: Coly Li 
>>
>>> ---
>>>  drivers/md/bcache/closure.c | 17 +++--
>>>  1 file changed, 3 insertions(+), 14 deletions(-)
>>>
>>> diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
>>> index 864e673..1841d03 100644
>>> --- a/drivers/md/bcache/closure.c
>>> +++ b/drivers/md/bcache/closure.c
>>> @@ -64,27 +64,16 @@ void closure_put(struct closure *cl)
>>>  void __closure_wake_up(struct closure_waitlist *wait_list)
>>>  {
>>> struct llist_node *list;
>>> -   struct closure *cl;
>>> +   struct closure *cl, *t;
>>> struct llist_node *reverse = NULL;
>>>  
>>> list = llist_del_all(&wait_list->list);
>>>  
>>> /* We first reverse the list to preserve FIFO ordering and fairness */
>>> -
>>> -   while (list) {
>>> -   struct llist_node *t = list;
>>> -   list = llist_next(list);
>>> -
>>> -   t->next = reverse;
>>> -   reverse = t;
>>> -   }
>>> +   reverse = llist_reverse_order(list);
>>>  
>>> /* Then do the wakeups */
>>> -
>>> -   while (reverse) {
>>> -   cl = container_of(reverse, struct closure, list);
>>> -   reverse = llist_next(reverse);
>>> -
>>> +   llist_for_each_entry_safe(cl, t, reverse, list) {
>>
>> Just wondering why not using llist_for_each_entry(), or you use the
>> _safe version on purpose ?
> 
> If I use llist_for_each_entry(), then it would change the original
> behavior. Is it ok?
> 

I feel llist_for_each_entry() keeps the original behavior, and variable
't' can be removed. Anyway, either llist_for_each_entry() or
llist_for_each_entry_safe() works correctly and well here. Any one you
use is OK to me, thanks for your informative reply :-)



-- 
Coly Li
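
A small kernel-context sketch of the point above, assuming a 'struct item'
whose entries may be freed by the loop body (much like closures being woken
and released): the _safe iterator caches the next pointer before the body
runs, so freeing the current entry is fine.

#include <linux/llist.h>
#include <linux/slab.h>

struct item {
	struct llist_node node;
	/* payload */
};

static void consume_all(struct llist_head *head)
{
	struct llist_node *list = llist_del_all(head);
	struct item *it, *tmp;

	/* Restore FIFO order, then walk safely while entries are released. */
	list = llist_reverse_order(list);
	llist_for_each_entry_safe(it, tmp, list, node) {
		/* The body may free 'it'; 'tmp' already holds the next one. */
		kfree(it);
	}
}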


Re: [PATCH] arm64: correct modules range of kernel virtual memory layout

2017-08-07 Thread Miles Chen
On Tue, 2017-08-08 at 12:44 +0800, Miles Chen wrote:
> On Mon, 2017-08-07 at 15:01 +0100, Will Deacon wrote:
> > On Mon, Aug 07, 2017 at 02:18:00PM +0100, Ard Biesheuvel wrote:
> > > On 7 August 2017 at 14:16, Will Deacon  wrote:
> > > > On Mon, Aug 07, 2017 at 07:04:46PM +0800, Miles Chen wrote:
> > > >> The commit f80fb3a3d508 ("arm64: add support for kernel ASLR")
> > > >> moved module virtual address to
> > > >> [module_alloc_base, module_alloc_base + MODULES_VSIZE).
> > > >>
> > > >> Display module information of the virtual kernel
> > > >> memory layout by using module_alloc_base.
> > > >>
> > > >> testing output:
> > > >> 1) Current implementation:
> > > >> Virtual kernel memory layout:
> > > >>   modules : 0xff80 - 0xff800800   (   128 MB)
> > > >> 2) this patch + KASLR:
> > > >> Virtual kernel memory layout:
> > > >>   modules : 0xff800056 - 0xff800856   (   128 MB)
> > > >> 3) this patch + KASLR and a dummy seed:
> > > >> Virtual kernel memory layout:
> > > >>   modules : 0xffa7df637000 - 0xffa7e7637000   (   128 MB)
> > > >>
> > > >> Signed-off-by: Miles Chen 
> > > >> ---
> > > >>  arch/arm64/mm/init.c | 5 +++--
> > > >>  1 file changed, 3 insertions(+), 2 deletions(-)
> > > >
> > > > Does this mean the modules code in our pt dumper is busted
> > > > (arch/arm64/mm/dump.c)? Also, what about KASAN, which uses these 
> > > > addresses
> > > > too (in kasan_init)? Should we just remove MODULES_VADDR and MODULES_END
> > > > altogether?
> > > >
> > > 
> > > I don't think we need this patch. The 'module' line simply prints the
> > > VA region that is reserved for modules. The fact that we end up
> > > putting them elsewhere when running randomized does not necessarily
> > > mean this line should reflect that.
> > 
> > I was more concerned by other users of MODULES_VADDR tbh, although I see
> > now that we don't randomize the module region if kasan is enabled. Still,
> > the kcore code adds the modules region as a separate area (distinct from
> > vmalloc) if MODULES_VADDR is defined, the page table dumping code uses
> > MODULES_VADDR to identify the module region and I think we'll get false
> > positives from is_vmalloc_or_module_addr, which again uses the static
> > region.
> > 
> > So, given that MODULES_VADDR never points at the module area, can't we get
> > rid of it?
> 
> Agreed. MODULES_VADDR should be phased out. Considering the kernel
> modules live somewhere between [VMALLOC_START, VMALLOC_END) now:
> (arch/arm64/kernel/module.c:module_alloc). I suggest the following
> changes:
> 
> 1. is_vmalloc_or_module_addr() should return is_vmalloc_addr() directly
> 2. arch/arm64/mm/dump.c does not need MODULES_VADDR and MODULES_END.
> 3. kasan uses [module_alloc_base, module_alloc_base + MODULES_VSIZE) to
> get the shadow memory? (the kernel modules still live in this range when
> kasan is enabled)
> 4. remove modules line in kernel memory layout
> (optional, thanks for Ard's feedback)
> 5. remove MODULE_VADDR, MODULES_END definition

I was wrong about this. is_vmalloc_or_module_addr() is defined
in mm/vmalloc.c and it uses MODULES_VADDR and MODULES_END.
Maybe it is better to give MODULES_VADDR and MODULES_END
proper values rather than remove them.

> Miles
> > 
> > Will
> 
> 




[PATCH 2/2] ARM: dts: am572x-idk: Fix GPIO polarity for MMC1 card detect

2017-08-07 Thread Kishon Vijay Abraham I
The GPIO polarity for MMC1 card detect is set to '0' which means
active-high. However the polarity should be active-low. Fix it
here.

Signed-off-by: Kishon Vijay Abraham I 
---
 arch/arm/boot/dts/am572x-idk.dts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm/boot/dts/am572x-idk.dts b/arch/arm/boot/dts/am572x-idk.dts
index 9da6d83ca185..940fcbe5380b 100644
--- a/arch/arm/boot/dts/am572x-idk.dts
+++ b/arch/arm/boot/dts/am572x-idk.dts
@@ -81,7 +81,7 @@
vmmc-supply = <&v3_3d>;
vmmc_aux-supply = <&ldo1_reg>;
bus-width = <4>;
-   cd-gpios = <&gpio6 27 0>; /* gpio 219 */
+   cd-gpios = <&gpio6 27 GPIO_ACTIVE_LOW>; /* gpio 219 */
 };
 
 &sn65hvs882 {
-- 
2.11.0



[PATCH 1/2] ARM: dts: am571x-idk: Fix GPIO polarity for MMC1 card detect

2017-08-07 Thread Kishon Vijay Abraham I
The GPIO polarity for MMC1 card detect is set to '0' which means
active-high. However the polarity should be active-low. Fix it
here.

Signed-off-by: Kishon Vijay Abraham I 
---
 arch/arm/boot/dts/am571x-idk.dts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm/boot/dts/am571x-idk.dts b/arch/arm/boot/dts/am571x-idk.dts
index 7b207835b2d1..adc70fb091a2 100644
--- a/arch/arm/boot/dts/am571x-idk.dts
+++ b/arch/arm/boot/dts/am571x-idk.dts
@@ -68,7 +68,7 @@
status = "okay";
vmmc-supply = <&ldo1_reg>;
bus-width = <4>;
-   cd-gpios = <&gpio6 27 0>; /* gpio 219 */
+   cd-gpios = <&gpio6 27 GPIO_ACTIVE_LOW>; /* gpio 219 */
 };
 
 &omap_dwc3_2 {
-- 
2.11.0



[PATCH] mmc: host: omap_hsmmc: Add CMD23 capability to omap_hsmmc driver

2017-08-07 Thread Kishon Vijay Abraham I
omap_hsmmc driver always relied on CMD12 to stop transmission.
However if CMD12 is not issued at the correct timing, the card will
indicate an out-of-range error. With certain cards in some of the
DRA7 based boards, an -EIO error is observed. By adding the CMD23 capability,
the MMC core will send the MMC_SET_BLOCK_COUNT command before
MMC_READ_MULTIPLE_BLOCK/MMC_WRITE_MULTIPLE_BLOCK commands.

commit a04e6bae9e6f12 ("mmc: core: check also R1 response for
stop commands") exposed this bug in omap_hsmmc driver.

Signed-off-by: Kishon Vijay Abraham I 
---
 drivers/mmc/host/omap_hsmmc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/mmc/host/omap_hsmmc.c b/drivers/mmc/host/omap_hsmmc.c
index 04ff3c97a535..2ab4788d021f 100644
--- a/drivers/mmc/host/omap_hsmmc.c
+++ b/drivers/mmc/host/omap_hsmmc.c
@@ -2086,7 +2086,7 @@ static int omap_hsmmc_probe(struct platform_device *pdev)
mmc->max_seg_size = mmc->max_req_size;
 
mmc->caps |= MMC_CAP_MMC_HIGHSPEED | MMC_CAP_SD_HIGHSPEED |
-MMC_CAP_WAIT_WHILE_BUSY | MMC_CAP_ERASE;
+MMC_CAP_WAIT_WHILE_BUSY | MMC_CAP_ERASE | MMC_CAP_CMD23;
 
mmc->caps |= mmc_pdata(host)->caps;
if (mmc->caps & MMC_CAP_8_BIT_DATA)
-- 
2.11.0



linux-next: manual merge of the userns tree with the mips tree

2017-08-07 Thread Stephen Rothwell
Hi Eric,

Today's linux-next merge of the userns tree got a conflict in:

  arch/mips/kernel/traps.c

between commit:

  260a789828aa ("MIPS: signal: Remove unreachable code from force_fcr31_sig().")

from the mips tree and commit:

  ea1b75cf9138 ("signal/mips: Document a conflict with SI_USER with SIGFPE")

from the userns tree.

I fixed it up (the former removed the code updated by the latter) and
can carry the fix as necessary. This is now fixed as far as linux-next
is concerned, but any non trivial conflicts should be mentioned to your
upstream maintainer when your tree is submitted for merging.  You may
also want to consider cooperating with the maintainer of the conflicting
tree to minimise any particularly complex conflicts.

-- 
Cheers,
Stephen Rothwell


[PATCH v2] scheduler: enhancement to show_state_filter

2017-08-07 Thread Yafang Shao
Sometimes we want to get tasks in TASK_RUNNING specifically,
instead of dumping all tasks.
For example, when the loadavg is high, we want to dump
tasks in TASK_RUNNING and TASK_UNINTERRUPTIBLE, which contribute
to the system load. But mostly there are lots of tasks in a sleep state,
which occupy almost all of the kernel log buffer, or even overflow
it, causing the useful messages to get lost. Although we could
enlarge the kernel log buffer, that's not a good idea.

So I made this change to make show_state_filter more flexible,
so that we can dump the tasks in TASK_RUNNING specifically.

Signed-off-by: Yafang Shao 
---
 drivers/tty/sysrq.c | 2 +-
 include/linux/sched.h   | 1 +
 include/linux/sched/debug.h | 6 --
 kernel/sched/core.c | 7 ---
 4 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
index 3ffc1ce..86db51b 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -291,7 +291,7 @@ static void sysrq_handle_showstate(int key)
 
 static void sysrq_handle_showstate_blocked(int key)
 {
-   show_state_filter(TASK_UNINTERRUPTIBLE);
+   show_state_filter(TASK_UNINTERRUPTIBLE << 1);
 }
 static struct sysrq_key_op sysrq_showstate_blocked_op = {
.handler= sysrq_handle_showstate_blocked,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8337e2d..318f149 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -82,6 +82,7 @@
 #define TASK_NOLOAD1024
 #define TASK_NEW   2048
 #define TASK_STATE_MAX 4096
+#define TASK_ALL_BITS  ((TASK_STATE_MAX << 1) - 1)
 
 #define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPNn"
 
diff --git a/include/linux/sched/debug.h b/include/linux/sched/debug.h
index e0eaee5..c844689 100644
--- a/include/linux/sched/debug.h
+++ b/include/linux/sched/debug.h
@@ -1,6 +1,8 @@
 #ifndef _LINUX_SCHED_DEBUG_H
 #define _LINUX_SCHED_DEBUG_H
 
+#include 
+
 /*
  * Various scheduler/task debugging interfaces:
  */
@@ -10,13 +12,13 @@
 extern void dump_cpu_task(int cpu);
 
 /*
- * Only dump TASK_* tasks. (0 for all tasks)
+ * Only dump TASK_* tasks. (TASK_ALL_BITS for all tasks)
  */
 extern void show_state_filter(unsigned long state_filter);
 
 static inline void show_state(void)
 {
-   show_state_filter(0);
+   show_state_filter(TASK_ALL_BITS);
 }
 
 struct pt_regs;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0869b20..f9b9529 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5161,19 +5161,20 @@ void show_state_filter(unsigned long state_filter)
 */
touch_nmi_watchdog();
touch_all_softlockup_watchdogs();
-   if (!state_filter || (p->state & state_filter))
+   /* in case we want to set TASK_RUNNING specifically */
+   if ((p->state != TASK_RUNNING ? p->state << 1 : 1) & 
state_filter)
sched_show_task(p);
}
 
 #ifdef CONFIG_SCHED_DEBUG
-   if (!state_filter)
+   if (state_filter == TASK_ALL_BITS)
sysrq_sched_debug_show();
 #endif
rcu_read_unlock();
/*
 * Only show locks if all tasks are dumped:
 */
-   if (!state_filter)
+   if (state_filter == TASK_ALL_BITS)
debug_show_all_locks();
 }
 
-- 
1.8.3.1
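
A stand-alone illustration of the bit trick used in the patch above, with
the task state values reduced to the few that matter here (assumed to match
the kernel's definitions): TASK_RUNNING, whose value is 0, gets its own bit
(0x1), and every other state is shifted left by one so no two states share
a filter bit.

#include <stdio.h>

#define TASK_RUNNING            0
#define TASK_INTERRUPTIBLE      1
#define TASK_UNINTERRUPTIBLE    2

static unsigned long state_to_filter_bit(unsigned long state)
{
	return state != TASK_RUNNING ? state << 1 : 1;
}

int main(void)
{
	/* Matches sysrq_handle_showstate_blocked(): TASK_UNINTERRUPTIBLE << 1 */
	unsigned long filter = TASK_UNINTERRUPTIBLE << 1;

	printf("TASK_RUNNING matches: %d\n",
	       !!(state_to_filter_bit(TASK_RUNNING) & filter));         /* 0 */
	printf("TASK_UNINTERRUPTIBLE matches: %d\n",
	       !!(state_to_filter_bit(TASK_UNINTERRUPTIBLE) & filter)); /* 1 */
	return 0;
}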



[PATCH] spi/bcm63xx-hspi: fix error return code in bcm63xx_hsspi_probe()

2017-08-07 Thread Gustavo A. R. Silva
platform_get_irq() returns an error code, but the spi-bcm63xx-hsspi
driver ignores it and always returns -ENXIO. This is not correct and
prevents -EPROBE_DEFER from being propagated properly.

Notice that platform_get_irq() no longer returns 0 on error:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e330b9a6bb35dc7097a4f02cb1ae7b6f96df92af

Print and propagate the return value of platform_get_irq on failure.

This issue was detected with the help of Coccinelle.

Signed-off-by: Gustavo A. R. Silva 
---
 drivers/spi/spi-bcm63xx-hsspi.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/spi/spi-bcm63xx-hsspi.c b/drivers/spi/spi-bcm63xx-hsspi.c
index 475a790..cbcba61 100644
--- a/drivers/spi/spi-bcm63xx-hsspi.c
+++ b/drivers/spi/spi-bcm63xx-hsspi.c
@@ -338,8 +338,8 @@ static int bcm63xx_hsspi_probe(struct platform_device *pdev)
 
irq = platform_get_irq(pdev, 0);
if (irq < 0) {
-   dev_err(dev, "no irq\n");
-   return -ENXIO;
+   dev_err(dev, "no irq: %d\n", irq);
+   return irq;
}
 
res_mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-- 
2.5.0
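
The same probe-time idiom recurs in several patches below (spi-bcm63xx,
dwc3-omap, spi-xlp, rockchip-thermal, imx21-hcd). A minimal sketch of the
pattern, with foo_probe() as a made-up placeholder driver:

#include <linux/platform_device.h>

static int foo_probe(struct platform_device *pdev)
{
        int irq;

        irq = platform_get_irq(pdev, 0);
        if (irq < 0) {
                /* may be -EPROBE_DEFER; propagate it instead of -ENXIO */
                dev_err(&pdev->dev, "no irq: %d\n", irq);
                return irq;
        }

        /* ... map resources, request the irq, register the controller ... */
        return 0;
}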



[PATCH] spi/bcm63xx: fix error return code in bcm63xx_spi_probe()

2017-08-07 Thread Gustavo A. R. Silva
platform_get_irq() returns an error code, but the spi-bcm63xx driver
ignores it and always returns -ENXIO. This is not correct and
prevents -EPROBE_DEFER from being propagated properly.

Notice that platform_get_irq() no longer returns 0 on error:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e330b9a6bb35dc7097a4f02cb1ae7b6f96df92af

Print and propagate the return value of platform_get_irq on failure.

This issue was detected with the help of Coccinelle.

Signed-off-by: Gustavo A. R. Silva 
---
 drivers/spi/spi-bcm63xx.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/spi/spi-bcm63xx.c b/drivers/spi/spi-bcm63xx.c
index 84c7356..bfe5754 100644
--- a/drivers/spi/spi-bcm63xx.c
+++ b/drivers/spi/spi-bcm63xx.c
@@ -530,8 +530,8 @@ static int bcm63xx_spi_probe(struct platform_device *pdev)
 
irq = platform_get_irq(pdev, 0);
if (irq < 0) {
-   dev_err(dev, "no irq\n");
-   return -ENXIO;
+   dev_err(dev, "no irq: %d\n", irq);
+   return irq;
}
 
clk = devm_clk_get(dev, "spi");
-- 
2.5.0



Re: [MD] Crash with 4.12+ kernel and high disk load -- bisected to 4ad23a976413: MD: use per-cpu counter for writes_pending

2017-08-07 Thread Shaohua Li
On Mon, Aug 07, 2017 at 01:20:25PM +0200, Dominik Brodowski wrote:
> Neil, Shaohua,
> 
> following up on David R's bug message: I have observed something similar
> on v4.12.[345] and v4.13-rc4, but not on v4.11. This is a RAID1 (on bare
> metal partitions, /dev/sdaX and /dev/sdbY linked together). In case it
> matters: Further upwards are cryptsetup, a DM volume group, then logical
> volumes, and then filesystems (ext4, but also happened with xfs).
> 
> In a tedious bisect (the bug wasn't as quickly reproducible as I would like,
> but happened when I repeatedly created large lvs and filled them with some
> content, while compiling kernels in parallel), I was able to track this
> down to:
> 
> 
> commit 4ad23a976413aa57fe5ba7a25953dc35ccca5b71
> Author: NeilBrown 
> Date:   Wed Mar 15 14:05:14 2017 +1100
> 
> MD: use per-cpu counter for writes_pending
> 
> The 'writes_pending' counter is used to determine when the
> array is stable so that it can be marked in the superblock
> as "Clean".  Consequently it needs to be updated frequently
> but only checked for zero occasionally.  Recent changes to
> raid5 cause the count to be updated even more often - once
> per 4K rather than once per bio.  This provided
> justification for making the updates more efficient.
> 
> ...
> 
> 
> CC'ing t...@kernel.org, as 4ad23a976413 is the first (and only?) user
> of percpu_ref_switch_to_atomic_sync() introduced in 210f7cdcf088.
> 
> Applying a415c0f10627 on top of 4ad23a976413 does *not* fix the issue, but
> reverting all of a2bfc6753065, a415c0f10627 and 4ad23a976413 seems to fix
> the issue for v4.12.5.

Spent some time to check this one, unfortunately I can't find how that patch
makes rcu stall. the percpu part looks good to me too. Can you double check if
reverting 4ad23a976413aa57 makes the issue go away? When the rcu stall happens,
what the /sys/block/md/md0/array_state? please also attach /proc/mdstat. When
you say the mdx_raid1 threads are in 'R' state, can you double check if the
/proc/pid/stack always 0xff?

Thanks,
Shaohua
> In addition, I can provide the following stack traces, which appear in dmesg
> around the time the system becomes more or less unusuable, with one or more
> of the md[0123]_raid1 threads in the "R" state.
> 
> ...  ...
> [  142.275244] INFO: rcu_sched self-detected stall on CPU
> [  142.275386]  4-...: (5999 ticks this GP) idle=d8a/141/0 
> softirq=2404/2404 fqs=2954
> [  142.275441]   (t=6000 jiffies g=645 c=644 q=199031)
> [  142.275490] NMI backtrace for cpu 4
> [  142.275537] CPU: 4 PID: 1164 Comm: md2_raid1 Not tainted 4.12.4 #2
> [  142.275586] Hardware name: MSI MS-7522/MSI X58 Pro (MS-7522)  , BIOS 
> V8.14B8 11/09/2012
> [  142.275640] Call Trace:
> [  142.275683]  
> [  142.275728]  dump_stack+0x4d/0x6a
> [  142.275775]  nmi_cpu_backtrace+0x9b/0xa0
> [  142.275822]  ? irq_force_complete_move+0xf0/0xf0
> [  142.275869]  nmi_trigger_cpumask_backtrace+0x8f/0xc0
> [  142.275918]  arch_trigger_cpumask_backtrace+0x14/0x20
> [  142.275967]  rcu_dump_cpu_stacks+0x8f/0xd9
> [  142.276016]  rcu_check_callbacks+0x62e/0x780
> [  142.276064]  ? acct_account_cputime+0x17/0x20
> [  142.276111]  update_process_times+0x2a/0x50
> [  142.276159]  tick_sched_handle.isra.18+0x2d/0x30
> [  142.276222]  tick_sched_timer+0x38/0x70
> [  142.276283]  __hrtimer_run_queues+0xbe/0x120
> [  142.276345]  hrtimer_interrupt+0xa3/0x190
> [  142.276408]  local_apic_timer_interrupt+0x33/0x60
> [  142.276471]  smp_apic_timer_interrupt+0x33/0x50
> [  142.276534]  apic_timer_interrupt+0x86/0x90
> [  142.276598] RIP: 0010:__wake_up+0x44/0x50
> [  142.276658] RSP: 0018:c9f8fd88 EFLAGS: 0246 ORIG_RAX: 
> ff10
> [  142.276742] RAX: 81a84bc0 RBX: 880235cf8800 RCX: 
> 
> [  142.276809] RDX: 81a84bd8 RSI: 0246 RDI: 
> 81a84bd0
> [  142.276876] RBP: c9f8fd98 R08:  R09: 
> 0001
> [  142.276943] R10:  R11:  R12: 
> 880235cf8800
> [  142.277009] R13: 880235eb2c28 R14: 0001 R15: 
> 
> [  142.277076]  
> [  142.277136]  md_check_recovery+0x30b/0x4a0
> [  142.277199]  raid1d+0x4c/0x810
> [  142.277258]  md_thread+0x11a/0x150
> [  142.277319]  ? md_thread+0x11a/0x150
> [  142.277379]  ? __wake_up_common+0x80/0x80
> [  142.277442]  kthread+0x11a/0x150
> [  142.277502]  ? find_pers+0x70/0x70
> [  142.277562]  ? __kthread_create_on_node+0x140/0x140
> [  142.277625]  ret_from_fork+0x22/0x30
> 
> ... or this one (on v4.12.5):
> [ 1294.560172] INFO: rcu_sched self-detected stall on CPU  
> [ 1294.560285]  2-...: (6000 ticks this GP) idle=f06/141/0 
> softirq=140681/140681 fqs=2988
> [ 1294.560365]   (t=6001 jiffies g=28666 c=28665 q=129416)
> [ 1294.560426] NMI backtrace for cpu 2
> [ 1294.560483] CPU: 2 PID: 1173 Comm: md3_raid1 Not tainted 4.12.5 #1
> [ 1294.560543] Hardware name: MSI MS-75

Re: linux-next: Signed-off-by missing for commit in the scsi-mkp tree

2017-08-07 Thread Finn Thain
On Tue, 8 Aug 2017, Stephen Rothwell wrote:

> Hi Martin,
> 
> Commit
> 
>   facfc963ae92 ("scsi: g_NCR5380: Two DTC436 PDMA workarounds")
> 
> is missing a Signed-off-by from its author.
> 

Sorry about that. The patch was a joint effort.

Ondrej, would you please send your "Signed-off-by" tag so that Martin can 
amend this commit (if need be).

-- 


[PATCH] usb: dwc3: omap: fix error return code in dwc3_omap_probe()

2017-08-07 Thread Gustavo A. R. Silva
platform_get_irq() returns an error code, but the dwc3-omap driver
ignores it and always returns -EINVAL. This is not correct and
prevents -EPROBE_DEFER from being propagated properly.

Notice that platform_get_irq() no longer returns 0 on error:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e330b9a6bb35dc7097a4f02cb1ae7b6f96df92af

Print and propagate the return value of platform_get_irq on failure.

This issue was detected with the help of Coccinelle.

Signed-off-by: Gustavo A. R. Silva 
---
 drivers/usb/dwc3/dwc3-omap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/usb/dwc3/dwc3-omap.c b/drivers/usb/dwc3/dwc3-omap.c
index f5aaa0c..3530795 100644
--- a/drivers/usb/dwc3/dwc3-omap.c
+++ b/drivers/usb/dwc3/dwc3-omap.c
@@ -478,8 +478,8 @@ static int dwc3_omap_probe(struct platform_device *pdev)
 
irq = platform_get_irq(pdev, 0);
if (irq < 0) {
-   dev_err(dev, "missing IRQ resource\n");
-   return -EINVAL;
+   dev_err(dev, "missing IRQ resource: %d\n", irq);
+   return irq;
}
 
res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-- 
2.5.0



Re: [PATCH 0/6] In-kernel QMI handling

2017-08-07 Thread Bjorn Andersson
On Mon 07 Aug 12:19 PDT 2017, Marcel Holtmann wrote:

> Hi Bjorn,
> 
> >>> This series starts by moving the common definitions of the QMUX
> >>> protocol to the
> >>> uapi header, as they are shared with clients - both in kernel and
> >>> userspace.
> >>> 
> >>> This series then introduces in-kernel helper functions for aiding the
> >>> handling
> >>> of QMI encoded messages in the kernel. QMI encoding is a wire-format
> >>> used in
> >>> exchanging messages between the majority of QRTR clients and
> >>> services.
> >> 
> >> This raises a few red-flags for me.
> > 
> > I'm glad it does. In discussions with the responsible team within
> > Qualcomm I've highlighted a number of concerns about enabling this
> > support in the kernel. Together we're continuously looking into what
> > should be pushed out to user space, and trying to not introduce
> > unnecessary new users.
> > 
> >> So far, we've kept almost everything QMI related in userspace and
> >> handled all QMI control-channel messages from libraries like libqmi or
> >> uqmi via the cdc-wdm driver and the "rmnet" interface via the qmi_wwan
> >> driver.  The kernel drivers just serve as the transport.
> >> 
> > 
> > The path that was taken to support the MSM-style devices was to
> > implement net/qrtr, which exposes a socket interface to abstract the
> > physical transports (QMUX or IPCROUTER in Qualcomm terminology).
> > 
> > As I share you view on letting the kernel handle the transportation only
> > the task of keeping track of registered services (service id -> node and
> > port mapping) was done in a user space process and so far we've only
> > ever have to deal with QMI encoded messages in various user space tools.
> 
> I think that the transport and multiplexing can be in the kernel as
> long as it is done as proper subsystem. Similar to Phonet or CAIF.
> Meaning it should have a well defined socket interface that can be
> easily used from userspace, but also a clean in-kernel interface
> handling.
> 

In a mobile Qualcomm device there are a few different components
involved here: message routing, the QMUX protocol and QMI encoding.

The downstream Qualcomm kernel implements the two first in the
IPCROUTER, upstream this is split between the kernel net/qrtr and a user
space service-register implementing the QMUX protocol for knowing where
services are located.

The common encoding of messages passed between endpoints of the message
routing is QMI, which is entirely an affair of each individual client.

> If Qualcomm is supportive of this effort and is willing to actually
> assist and/or open some of the specs or interface descriptions, then
> this is a good thing. Service registration and cleanup is really done
> best in the kernel. Same applies to multiplexing. Trying to do
> multiplexing in userspace is always cumbersome and leads to overhead
> that is of no gain. For example within oFono, we had to force
> everything to go via oFono since it was the only sane way of handling
> it. Other approaches were error prone and full of race conditions. You
> need a central entity that can clean up.
> 

The current upstream solution depends on a collaboration between
net/qrtr and the user space service register for figuring out whom to
send messages to. After that, muxing and the like are handled by the
socket interface and the service register does not need to be involved.

Qualcomm is very supporting of this solution and we're collaborating on
transitioning "downstream" to use this implementation.

> For the definition of an UAPI to share some code, I am actually not
> sure that is such a good idea. For example the QMI code in oFono
> follows a way simpler approach. And I am not convinced that all the
> macros are actually beneficial. For example, the whole netlink macros
> are pretty cumbersome. Adding some Documentation/qmi.txt on how the
> wire format looks like and what is expected seems to be a way better
> approach.
> 

The socket interface provided by the kernel expects some knowledge of
the QMUX protocol, for service management. The majority of this
knowledge is already public, but I agree that it would be good to gather
this in a document. The common data structure for the control message is
what I've put in the uapi, as this is used by anyone dealing with
control messages.

When it comes to the QMI-encoded messages these are application
specific, just like e.g. protobuf definitions are application specific.

As the core infrastructure is becoming available upstream and boards
like the DB410c and DB820c aim to be supported by open solutions we will
have a natural place to discuss publication of at least some of the
application level protocols.

Regards,
Bjorn
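
For context, a minimal user-space sketch of talking over the qrtr socket
interface discussed above (illustrative only; it assumes kernel headers that
ship linux/qrtr.h, the destination node/port are placeholders that would
normally come from the QMUX service lookup, and the payload would really be
a QMI-encoded message):

#include <stdio.h>
#include <sys/socket.h>
#include <linux/qrtr.h>

#ifndef AF_QIPCRTR
#define AF_QIPCRTR 42   /* value from linux/socket.h; older libcs lack it */
#endif

int main(void)
{
        /* placeholder destination; real node/port come from service lookup */
        struct sockaddr_qrtr dst = {
                .sq_family = AF_QIPCRTR,
                .sq_node = 0,
                .sq_port = 0,
        };
        const char payload[] = "QMI-encoded request goes here";
        int sock = socket(AF_QIPCRTR, SOCK_DGRAM, 0);

        if (sock < 0) {
                perror("socket(AF_QIPCRTR)");
                return 1;
        }
        if (sendto(sock, payload, sizeof(payload), 0,
                   (struct sockaddr *)&dst, sizeof(dst)) < 0)
                perror("sendto");
        return 0;
}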


[PATCH] spi: xlp: fix error return code in xlp_spi_probe()

2017-08-07 Thread Gustavo A. R. Silva
platform_get_irq() returns an error code, but the spi-xlp driver ignores
it and always returns -EINVAL. This is not correct and prevents
-EPROBE_DEFER from being propagated properly.

Notice that platform_get_irq() no longer returns 0 on error:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e330b9a6bb35dc7097a4f02cb1ae7b6f96df92af

Print and propagate the return value of platform_get_irq on failure.

This issue was detected with the help of Coccinelle.

Signed-off-by: Gustavo A. R. Silva 
---
 drivers/spi/spi-xlp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/spi/spi-xlp.c b/drivers/spi/spi-xlp.c
index 80cb4d6..74a01b0 100644
--- a/drivers/spi/spi-xlp.c
+++ b/drivers/spi/spi-xlp.c
@@ -393,8 +393,8 @@ static int xlp_spi_probe(struct platform_device *pdev)
 
irq = platform_get_irq(pdev, 0);
if (irq < 0) {
-   dev_err(&pdev->dev, "no IRQ resource found\n");
-   return -EINVAL;
+   dev_err(&pdev->dev, "no IRQ resource found: %d\n", irq);
+   return irq;
}
err = devm_request_irq(&pdev->dev, irq, xlp_spi_interrupt, 0,
pdev->name, xspi);
-- 
2.5.0



[PATCH 1/2] PCI: iproc: Implement PCI hotplug support

2017-08-07 Thread Oza Pawandeep
This patch implements PCI hotplug support for iproc family chipsets.

iProc-based SoCs (e.g. Stingray) do not have an integrated hotplug
controller, so the standard PCI hotplug framework hooks (e.g. controlled
power up/down of the slot) cannot be used.

The mechanism that Stingray, for example, has adopted for PCI hotplug is
as follows: the PCI present lines are routed to GPIOs, depending on the
type of connector (x2, x4, x8).

A GPIO array needs to be present if hotplug is supported.
The HW implementation is SoC/board specific, and it also depends on how
the add-in card is designed (e.g. how many present pins are implemented).

If an x8 card is connected, it is possible that all 3 present pins go
low, or that at least one pin goes low.
If an x4 card is connected, it is possible that 2 present pins go low,
or that at least one pin goes low.

The implementation essentially takes care of the following:
> Initializing the hotplug irq thread.
> Detecting the endpoint device based on link state.
> Handling PERST and detecting the plugged devices.
> Ordered hot plug-out, where the user is expected
  to write 1 to /sys/bus/pci/devices//remove
> Handling spurious interrupts.
> Handling multiple interrupts and making sure that the card is
  enumerated only once.

Signed-off-by: Oza Pawandeep 
Reviewed-by: Ray Jui 

diff --git a/drivers/pci/host/pcie-iproc-platform.c b/drivers/pci/host/pcie-iproc-platform.c
index 9512960..e51bbd2 100644
--- a/drivers/pci/host/pcie-iproc-platform.c
+++ b/drivers/pci/host/pcie-iproc-platform.c
@@ -89,6 +89,9 @@ static int iproc_pcie_pltfm_probe(struct platform_device *pdev)
pcie->need_ob_cfg = true;
}
 
+   if (of_property_read_bool(np, "brcm,pci-hotplug"))
+   pcie->enable_hotplug = true;
+
/* PHY use is optional */
pcie->phy = devm_phy_get(dev, "pcie-phy");
if (IS_ERR(pcie->phy)) {
diff --git a/drivers/pci/host/pcie-iproc.c b/drivers/pci/host/pcie-iproc.c
index ee40651..c6d1add 100644
--- a/drivers/pci/host/pcie-iproc.c
+++ b/drivers/pci/host/pcie-iproc.c
@@ -28,6 +28,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "pcie-iproc.h"
 
@@ -65,6 +66,17 @@
 #define PCIE_DL_ACTIVE_SHIFT 2
 #define PCIE_DL_ACTIVE   BIT(PCIE_DL_ACTIVE_SHIFT)
 
+#define CFG_RC_LTSSM 0x1cf8
+#define CFG_RC_PHY_CTL   0x1804
+#define CFG_RC_LTSSM_TIMEOUT 1000
+#define CFG_RC_LTSSM_STATE_MASK  0xff
+#define CFG_RC_LTSSM_STATE_L1    0x1
+
+#define CFG_RC_CLR_LTSSM_HIST_SHIFT  29
+#define CFG_RC_CLR_LTSSM_HIST_MASK   BIT(CFG_RC_CLR_LTSSM_HIST_SHIFT)
+#define CFG_RC_CLR_RECOV_HIST_SHIFT  31
+#define CFG_RC_CLR_RECOV_HIST_MASK   BIT(CFG_RC_CLR_RECOV_HIST_SHIFT)
+
 #define APB_ERR_EN_SHIFT 0
 #define APB_ERR_EN   BIT(APB_ERR_EN_SHIFT)
 
@@ -1306,12 +1318,106 @@ static int iproc_pcie_rev_init(struct iproc_pcie *pcie)
return 0;
 }
 
+static bool iproc_pci_hp_check_ltssm(struct iproc_pcie *pcie)
+{
+   struct pci_bus *bus = pcie->root_bus;
+   u32 val, timeout = CFG_RC_LTSSM_TIMEOUT;
+
+   /* Clear LTSSM history. */
+   pci_bus_read_config_dword(pcie->root_bus, 0,
+ CFG_RC_PHY_CTL, &val);
+   pci_bus_write_config_dword(bus, 0, CFG_RC_PHY_CTL,
+  val | CFG_RC_CLR_RECOV_HIST_MASK |
+  CFG_RC_CLR_LTSSM_HIST_MASK);
+   /* write back the origional value. */
+   pci_bus_write_config_dword(bus, 0, CFG_RC_PHY_CTL, val);
+
+   do {
+   pci_bus_read_config_dword(pcie->root_bus, 0,
+ CFG_RC_LTSSM, &val);
+   /* check link state to see if link moved to L1 state. */
+   if ((val & CFG_RC_LTSSM_STATE_MASK) ==
+CFG_RC_LTSSM_STATE_L1)
+   return true;
+   timeout--;
+   usleep_range(500, 1000);
+   } while (timeout);
+
+   return false;
+}
+
+static irqreturn_t iproc_pci_hotplug_thread(int irq, void *data)
+{
+   struct iproc_pcie *pcie = data;
+   struct pci_bus *bus = pcie->root_bus, *child;
+   bool link_status;
+
+   iproc_pcie_perst_ctrl(pcie, true);
+   iproc_pcie_perst_ctrl(pcie, false);
+
+   link_status = iproc_pci_hp_check_ltssm(pcie);
+
+   if (link_status &&
+   !iproc_pcie_check_link(pcie, bus) &&
+   !pcie->ep_is_present) {
+   pci_rescan_bus(bus);
+   list_for_each_entry(child, &bus->children, node)
+   pcie_bus_configure_settings(child);
+   pcie->ep_is_present = true;
+   dev_info(pcie->dev,
+"PCI Hotplug: \n");
+   } else if (link_status && pcie->ep_is_present)
+   /*
+* ep_is_present makes sure, enumuration done only once.
+* So it can handle spurious intrrupts, and also if we
+* get multiple interrupts for a

[PATCH 0/2] PCI hotplug feature

2017-08-07 Thread Oza Pawandeep
These patches bring in PCI hotplug support for iproc family chipsets.

It includes DT binding documentation update and, implementation in
iproc pcie RC driver.

These patch set is made on top of following patches.
[PATCH v6 2/2] PCI: iproc: add device shutdown for PCI RC
[PATCH v6 1/2] PCI: iproc: Retry request when CRS returned from EP

Oza Pawandeep (2):
  PCI: iproc: Implement PCI hotplug support
  PCI: iproc: Add optional brcm,pci-hotplug

 .../devicetree/bindings/pci/brcm,iproc-pcie.txt|  23 
 drivers/pci/host/pcie-iproc-platform.c |   3 +
 drivers/pci/host/pcie-iproc.c  | 130 -
 drivers/pci/host/pcie-iproc.h  |   7 ++
 4 files changed, 157 insertions(+), 6 deletions(-)

-- 
1.9.1



[PATCH 2/2] PCI: iproc: Add optional brcm,pci-hotplug

2017-08-07 Thread Oza Pawandeep
Add a description for the optional device tree property
'brcm,pci-hotplug' used by the PCI hotplug feature.

Signed-off-by: Oza Pawandeep 
Reviewed-by: Ray Jui 

diff --git a/Documentation/devicetree/bindings/pci/brcm,iproc-pcie.txt b/Documentation/devicetree/bindings/pci/brcm,iproc-pcie.txt
index b8e48b4..a3bad24 100644
--- a/Documentation/devicetree/bindings/pci/brcm,iproc-pcie.txt
+++ b/Documentation/devicetree/bindings/pci/brcm,iproc-pcie.txt
@@ -72,6 +72,29 @@ Optional properties:
 - brcm,pcie-msi-inten: Needs to be present for some older iProc platforms that
 require the interrupt enable registers to be set explicitly to enable MSI
 
+Optional properties:
+- brcm,pci-hotplug: PCI hotplug feature is supported.
+
+If the brcm,pcie-hotplug property is present, the following properties become
+effective:
+
+- brcm,prsnt-gpio: Array of gpios, needs to be present if Hotplug is supported.
+
+PCI hotplug implementation is SOC/Board specific, and also it depends on
+how add-in card is designed (e.g. how many present pins are implemented).
+
+If x8 card is connected, then it might be possible that all the
+3 present pins could go low, or at least one pin goes low.
+
+If x4 card is connected, then it might be possible that 2 present
+pins go low, or at least one pin goes low.
+
+Example:
+brcm,prsnt-gpio: <&pca9505 32 1>, <&pca9505 33 1>;
+This is x4 connector: monitoring max 2 present lines.
+brcm,prsnt-gpio: <&pca9505 32 1>, <&pca9505 33 1>, <&pca9505 34 1>;
+This is x8 connector: monitoring max 3 present lines.
+
 Example:
pcie0: pcie@18012000 {
compatible = "brcm,iproc-pcie";
-- 
1.9.1



Re: [PATCH] arm64: correct modules range of kernel virtual memory layout

2017-08-07 Thread Miles Chen
On Mon, 2017-08-07 at 15:01 +0100, Will Deacon wrote:
> On Mon, Aug 07, 2017 at 02:18:00PM +0100, Ard Biesheuvel wrote:
> > On 7 August 2017 at 14:16, Will Deacon  wrote:
> > > On Mon, Aug 07, 2017 at 07:04:46PM +0800, Miles Chen wrote:
> > >> The commit f80fb3a3d508 ("arm64: add support for kernel ASLR")
> > >> moved module virtual address to
> > >> [module_alloc_base, module_alloc_base + MODULES_VSIZE).
> > >>
> > >> Display module information of the virtual kernel
> > >> memory layout by using module_alloc_base.
> > >>
> > >> testing output:
> > >> 1) Current implementation:
> > >> Virtual kernel memory layout:
> > >>   modules : 0xff80 - 0xff800800   (   128 MB)
> > >> 2) this patch + KASLR:
> > >> Virtual kernel memory layout:
> > >>   modules : 0xff800056 - 0xff800856   (   128 MB)
> > >> 3) this patch + KASLR and a dummy seed:
> > >> Virtual kernel memory layout:
> > >>   modules : 0xffa7df637000 - 0xffa7e7637000   (   128 MB)
> > >>
> > >> Signed-off-by: Miles Chen 
> > >> ---
> > >>  arch/arm64/mm/init.c | 5 +++--
> > >>  1 file changed, 3 insertions(+), 2 deletions(-)
> > >
> > > Does this mean the modules code in our pt dumper is busted
> > > (arch/arm64/mm/dump.c)? Also, what about KASAN, which uses these addresses
> > > too (in kasan_init)? Should we just remove MODULES_VADDR and MODULES_END
> > > altogether?
> > >
> > 
> > I don't think we need this patch. The 'module' line simply prints the
> > VA region that is reserved for modules. The fact that we end up
> > putting them elsewhere when running randomized does not necessarily
> > mean this line should reflect that.
> 
> I was more concerned by other users of MODULES_VADDR tbh, although I see
> now that we don't randomize the module region if kasan is enabled. Still,
> the kcore code adds the modules region as a separate area (distinct from
> vmalloc) if MODULES_VADDR is defined, the page table dumping code uses
> MODULES_VADDR to identify the module region and I think we'll get false
> positives from is_vmalloc_or_module_addr, which again uses the static
> region.
> 
> So, given that MODULES_VADDR never points at the module area, can't we get
> rid of it?

Agreed. MODULES_VADDR should be phased out, considering that the kernel
modules live somewhere between [VMALLOC_START, VMALLOC_END) now
(arch/arm64/kernel/module.c:module_alloc). I suggest the following
changes:

1. is_vmalloc_or_module_addr() should return is_vmalloc_addr() directly
   (see the sketch after this list)
2. arch/arm64/mm/dump.c does not need MODULES_VADDR and MODULES_END.
3. kasan uses [module_alloc_base, module_alloc_base + MODULES_VSIZE) to
get the shadow memory? (the kernel modules still live in this range when
kasan is enabled)
4. remove modules line in kernel memory layout
(optional, thanks for Ard's feedback)
5. remove the MODULES_VADDR and MODULES_END definitions
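
A minimal sketch of what suggestion 1 could look like (illustrative only, not
a tested change; today mm/vmalloc.c special-cases the [MODULES_VADDR,
MODULES_END) range when MODULES_VADDR is defined):

#include <linux/mm.h>   /* is_vmalloc_addr() */

/* mm/vmalloc.c -- sketch of suggestion 1 above */
int is_vmalloc_or_module_addr(const void *x)
{
        /*
         * With module allocations coming from the vmalloc area on arm64,
         * the separate [MODULES_VADDR, MODULES_END) range check is no
         * longer needed and the plain vmalloc test is enough.
         */
        return is_vmalloc_addr(x);
}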


Miles
> 
> Will




[PATCH v2] perf/core: Avoid context switch overheads

2017-08-07 Thread 石祤
From: "leilei.lin" 

A performance issue is caused by an insufficiently strict check at task
sched-in for tasks that were once attached to a per-task perf_event.

A task allocates task->perf_event_ctxp[ctxn] when perf_event_open() is
called on it, and task->perf_event_ctxp[ctxn] is never freed back to
NULL.

__perf_event_task_sched_in()
    if (task->perf_event_ctxp[ctxn]) // here is always true
        perf_event_context_sched_in() // operate pmu

At most a 50% performance overhead was observed under some extreme test
cases. Therefore, add a stricter check on ctx->nr_events: when
ctx->nr_events == 0, there is no need to continue.

Signed-off-by: leilei.lin 
---
 kernel/events/core.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 426c2ff..3d86695 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3180,6 +3180,13 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
return;
 
perf_ctx_lock(cpuctx, ctx);
+   /*
+* We must check ctx->nr_events while holding ctx->lock, such
+* that we serialize against perf_install_in_context().
+*/
+   if (!cpuctx->task_ctx && !ctx->nr_events)
+   goto unlock;
+
perf_pmu_disable(ctx->pmu);
/*
 * We want to keep the following priority order:
@@ -3193,6 +3200,8 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
perf_event_sched_in(cpuctx, ctx, task);
perf_pmu_enable(ctx->pmu);
+
+unlock:
perf_ctx_unlock(cpuctx, ctx);
 }
 
-- 
2.8.4.31.g9ed660f



[PATCH 2/3] autofs - make disc device user accessible

2017-08-07 Thread Ian Kent
The autofs miscellaneous device ioctls that shouldn't require
CAP_SYS_ADMIN need to be accessible to user space applications in
order to be able to get information about autofs mounts.

The module checks capabilities, so the miscellaneous device should
be fine with broad permissions.

Signed-off-by: Ian Kent 
Cc: Colin Walters 
Cc: Ondrej Holy 
---
 fs/autofs4/dev-ioctl.c |3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index dd9f1bebb5a3..218a4ecc75cc 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -733,7 +733,8 @@ static const struct file_operations _dev_ioctl_fops = {
 static struct miscdevice _autofs_dev_ioctl_misc = {
.minor  = AUTOFS_MINOR,
.name   = AUTOFS_DEVICE_NAME,
-   .fops   = &_dev_ioctl_fops
+   .fops   = &_dev_ioctl_fops,
+   .mode   = 0644,
 };
 
 MODULE_ALIAS_MISCDEV(AUTOFS_MINOR);



[PATCH 1/3] autofs - fix AT_NO_AUTOMOUNT not being honored

2017-08-07 Thread Ian Kent
The fstatat(2) and statx() calls can pass the flag AT_NO_AUTOMOUNT
which is meant to clear the LOOKUP_AUTOMOUNT flag and prevent triggering
of an automount by the call. But this flag is unconditionally cleared
for all stat family system calls except statx().

stat family system calls have always triggered mount requests for the
negative dentry case in follow_automount() which is intended but prevents
the fstatat(2) and statx() AT_NO_AUTOMOUNT case from being handled.

In order to handle the AT_NO_AUTOMOUNT for both system calls the
negative dentry case in follow_automount() needs to be changed to
return ENOENT when the LOOKUP_AUTOMOUNT flag is clear (and the other
required flags are clear).

AFAICT this change doesn't have any noticeable side effects and may,
in some use cases (although I didn't see it in testing), prevent
unnecessary callbacks to the automount daemon.

It's also possible that a stat family call has been made with a
path that is in the process of being mounted by some other process.
But stat family calls should return the automount state of the path
as it is "now" so it shouldn't wait for mount completion.

This is the same semantic as the positive dentry case already
handled.

Signed-off-by: Ian Kent 
Cc: David Howells 
Cc: Colin Walters 
Cc: Ondrej Holy 
---
 fs/namei.c |   15 ---
 include/linux/fs.h |3 +--
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index ddb6a7c2b3d4..1180f9c58093 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1129,9 +1129,18 @@ static int follow_automount(struct path *path, struct nameidata *nd,
 * of the daemon to instantiate them before they can be used.
 */
if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
-  LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
-   path->dentry->d_inode)
-   return -EISDIR;
+  LOOKUP_OPEN | LOOKUP_CREATE |
+  LOOKUP_AUTOMOUNT))) {
+   /* Positive dentry that isn't meant to trigger an
+* automount, EISDIR will allow it to be used,
+* otherwise there's no mount here "now" so return
+* ENOENT.
+*/
+   if (path->dentry->d_inode)
+   return -EISDIR;
+   else
+   return -ENOENT;
+   }
 
if (path->dentry->d_sb->s_user_ns != &init_user_ns)
return -EACCES;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6e1fd5d21248..37c96f52e48e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3022,8 +3022,7 @@ static inline int vfs_lstat(const char __user *name, struct kstat *stat)
 static inline int vfs_fstatat(int dfd, const char __user *filename,
  struct kstat *stat, int flags)
 {
-   return vfs_statx(dfd, filename, flags | AT_NO_AUTOMOUNT,
-stat, STATX_BASIC_STATS);
+   return vfs_statx(dfd, filename, flags, stat, STATX_BASIC_STATS);
 }
 static inline int vfs_fstat(int fd, struct kstat *stat)
 {
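
For illustration, a small user-space sketch of the behaviour this patch is
about (not part of the patch; the path is a placeholder for an automount
trigger point):

#define _GNU_SOURCE             /* for AT_NO_AUTOMOUNT */
#include <stdio.h>
#include <fcntl.h>
#include <sys/stat.h>

int main(void)
{
        struct stat st;

        /* stat the trigger without mounting it; with the patch above an
         * unmounted (negative) trigger can now report ENOENT instead of
         * forcing a callback to the automount daemon */
        if (fstatat(AT_FDCWD, "/net/some-host", &st, AT_NO_AUTOMOUNT) < 0) {
                perror("fstatat");
                return 1;
        }
        printf("ino=%llu\n", (unsigned long long)st.st_ino);
        return 0;
}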



[PATCH 3/3] autofs - make dev ioctl version and ismountpoint user accessible

2017-08-07 Thread Ian Kent
Some of the autofs miscellaneous device ioctls need to be accessible to
user space applications without CAP_SYS_ADMIN to get information about
autofs mounts.

Signed-off-by: Ian Kent 
Cc: Colin Walters 
Cc: Ondrej Holy 
---
 fs/autofs4/dev-ioctl.c  |   12 
 include/uapi/linux/auto_dev-ioctl.h |2 +-
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 218a4ecc75cc..ea8b3a1cddd2 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -628,10 +628,6 @@ static int _autofs_dev_ioctl(unsigned int command,
ioctl_fn fn = NULL;
int err = 0;
 
-   /* only root can play with this */
-   if (!capable(CAP_SYS_ADMIN))
-   return -EPERM;
-
cmd_first = _IOC_NR(AUTOFS_DEV_IOCTL_IOC_FIRST);
cmd = _IOC_NR(command);
 
@@ -640,6 +636,14 @@ static int _autofs_dev_ioctl(unsigned int command,
return -ENOTTY;
}
 
+   /* Only root can use ioctls other than AUTOFS_DEV_IOCTL_VERSION_CMD
+* and AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD
+*/
+   if (cmd != AUTOFS_DEV_IOCTL_VERSION_CMD &&
+   cmd != AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD &&
+   !capable(CAP_SYS_ADMIN))
+   return -EPERM;
+
/* Copy the parameters into kernel space. */
param = copy_dev_ioctl(user);
if (IS_ERR(param))
diff --git a/include/uapi/linux/auto_dev-ioctl.h b/include/uapi/linux/auto_dev-ioctl.h
index 744b3d060968..5558db8e6646 100644
--- a/include/uapi/linux/auto_dev-ioctl.h
+++ b/include/uapi/linux/auto_dev-ioctl.h
@@ -16,7 +16,7 @@
 #define AUTOFS_DEVICE_NAME "autofs"
 
 #define AUTOFS_DEV_IOCTL_VERSION_MAJOR 1
-#define AUTOFS_DEV_IOCTL_VERSION_MINOR 0
+#define AUTOFS_DEV_IOCTL_VERSION_MINOR 1
 
 #define AUTOFS_DEV_IOCTL_SIZE  sizeof(struct autofs_dev_ioctl)
 



[PATCH] thermal: rockchip: fix error return code in rockchip_thermal_probe()

2017-08-07 Thread Gustavo A. R. Silva
platform_get_irq() returns an error code, but the rockchip_thermal driver
ignores it and always returns -EINVAL. This is not correct and prevents
-EPROBE_DEFER from being propagated properly.

Notice that platform_get_irq() no longer returns 0 on error:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e330b9a6bb35dc7097a4f02cb1ae7b6f96df92af

Print and propagate the return value of platform_get_irq on failure.

This issue was detected with the help of Coccinelle.

Signed-off-by: Gustavo A. R. Silva 
---
 drivers/thermal/rockchip_thermal.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/thermal/rockchip_thermal.c b/drivers/thermal/rockchip_thermal.c
index 4c77965..6ca9747 100644
--- a/drivers/thermal/rockchip_thermal.c
+++ b/drivers/thermal/rockchip_thermal.c
@@ -1068,8 +1068,8 @@ static int rockchip_thermal_probe(struct platform_device *pdev)
 
irq = platform_get_irq(pdev, 0);
if (irq < 0) {
-   dev_err(&pdev->dev, "no irq resource?\n");
-   return -EINVAL;
+   dev_err(&pdev->dev, "no irq resource: %d\n", irq);
+   return irq;
}
 
thermal = devm_kzalloc(&pdev->dev, sizeof(struct rockchip_thermal_data),
-- 
2.5.0



Re: [PATCH 3/3] IPI: Avoid to use 2 cache lines for one call_single_data

2017-08-07 Thread Huang, Ying
Peter Zijlstra  writes:

> On Sat, Aug 05, 2017 at 08:47:02AM +0800, Huang, Ying wrote:
>> Yes.  That looks good.  So you will prepare the final patch?  Or you
>> hope me to do that?
>
> I was hoping you'd do it ;-)

Thanks!  Here is the updated patch

Best Regards,
Huang, Ying

-->8--
>From 957735e9ff3922368286540dab852986fc7b23b5 Mon Sep 17 00:00:00 2001
From: Huang Ying 
Date: Mon, 7 Aug 2017 16:55:33 +0800
Subject: [PATCH -v3] IPI: Avoid to use 2 cache lines for one
 call_single_data

struct call_single_data is used in IPIs to transfer information between
CPUs.  Its size is bigger than sizeof(unsigned long) and less than the
cache line size.  Now, it is allocated with no explicit alignment
requirement.  This makes it possible for an allocated call_single_data
to cross 2 cache lines, which doubles the number of cache lines that
need to be transferred among CPUs.

This is resolved by requiring call_single_data to be aligned with the
size of call_single_data.  Now the size of call_single_data is a power
of 2.  If we add new fields to call_single_data, we may need to add
padding to make sure the size of the new definition is a power of 2.
Fortunately, this is enforced by gcc, which will report an error for an
alignment requirement that is not a power of 2.

To set the alignment requirement of call_single_data to the size of
call_single_data, a struct definition and a typedef are used.

To test the effect of the patch, we use the vm-scalability multiple
thread swap test case (swap-w-seq-mt).  The test will create multiple
threads and each thread will eat memory until all RAM and part of swap
is used, so that a huge number of IPIs will be triggered when unmapping
memory.  In the test, the throughput of memory writing improves ~5%
compared with misaligned call_single_data because of faster IPIs.

[Add call_single_data_t and align with size of call_single_data]
Suggested-by: Peter Zijlstra 
Signed-off-by: "Huang, Ying" 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Michael Ellerman 
Cc: Borislav Petkov 
Cc: Thomas Gleixner 
Cc: Juergen Gross 
Cc: Aaron Lu 
---
 arch/mips/kernel/smp.c |  6 ++--
 block/blk-softirq.c|  2 +-
 drivers/block/null_blk.c   |  2 +-
 drivers/cpuidle/coupled.c  | 10 +++
 drivers/net/ethernet/cavium/liquidio/lio_main.c|  2 +-
 drivers/net/ethernet/cavium/liquidio/octeon_droq.h |  2 +-
 include/linux/blkdev.h |  2 +-
 include/linux/netdevice.h  |  2 +-
 include/linux/smp.h|  8 --
 kernel/sched/sched.h   |  2 +-
 kernel/smp.c   | 32 --
 kernel/up.c|  2 +-
 12 files changed, 39 insertions(+), 33 deletions(-)

diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c
index 770d4d1516cb..bd8ba5472bca 100644
--- a/arch/mips/kernel/smp.c
+++ b/arch/mips/kernel/smp.c
@@ -648,12 +648,12 @@ EXPORT_SYMBOL(flush_tlb_one);
 #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
 
 static DEFINE_PER_CPU(atomic_t, tick_broadcast_count);
-static DEFINE_PER_CPU(struct call_single_data, tick_broadcast_csd);
+static DEFINE_PER_CPU(call_single_data_t, tick_broadcast_csd);
 
 void tick_broadcast(const struct cpumask *mask)
 {
atomic_t *count;
-   struct call_single_data *csd;
+   call_single_data_t *csd;
int cpu;
 
for_each_cpu(cpu, mask) {
@@ -674,7 +674,7 @@ static void tick_broadcast_callee(void *info)
 
 static int __init tick_broadcast_init(void)
 {
-   struct call_single_data *csd;
+   call_single_data_t *csd;
int cpu;
 
for (cpu = 0; cpu < NR_CPUS; cpu++) {
diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index 87b7df4851bf..07125e7941f4 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -60,7 +60,7 @@ static void trigger_softirq(void *data)
 static int raise_blk_irq(int cpu, struct request *rq)
 {
if (cpu_online(cpu)) {
-   struct call_single_data *data = &rq->csd;
+   call_single_data_t *data = &rq->csd;
 
data->func = trigger_softirq;
data->info = rq;
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 85c24cace973..81142ce781da 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -13,7 +13,7 @@
 struct nullb_cmd {
struct list_head list;
struct llist_node ll_list;
-   struct call_single_data csd;
+   call_single_data_t csd;
struct request *rq;
struct bio *bio;
unsigned int tag;
diff --git a/drivers/cpuidle/coupled.c b/drivers/cpuidle/coupled.c
index 71e586d7df71..147f38ea0fcd 100644
--- a/drivers/cpuidle/coupled.c
+++ b/drivers/cpuidle/coupled.c
@@ -119,13 +119,13 @@ struct cpuidle_coupled {
 
 #define CPUIDLE_COUPLED_NOT_IDLE   (-1)
 
-static DEFINE_PER_CPU(struct call_single_data

[PATCH v5 1/1] usb:host:xhci support option to disable the xHCI USB2 HW LPM

2017-08-07 Thread Thang Q. Nguyen
The xHCI specification 1.1 does not require xHCI-compliant controllers
to always enable hardware USB2 LPM. However, the current xHCI
driver always enables it when seeing HLC=1.
This patch adds an option for users to disable USB2 hardware LPM via a
DT/ACPI attribute.
This option is needed in case the user would like to disable the
feature, for example because the xHCI controller's USB2 HW LPM is
broken.

Signed-off-by: Tung Nguyen 
Signed-off-by: Thang Q. Nguyen 
Acked-by: Rob Herring 
---
Changes since v4:
 - When HW LPM is optionally disabled, explicitly disable HLE, RWE, ...
 - Update codes to work with kernel 4.13-rc4
 - Add Acked-By from Rob Herring 
Changes since v3:
 - Bypass updating LPM parameters when HW LPM is optionally disabled.
Changes since v2:
 - Change code to disable HW LPM as an option for user which
   is set via ACPI/DT.
Changes since v1:
 - Update DT/ACPI attribute and corresponding codes from HLE to LPM to
   be consistent with other attribute names.
---
 Documentation/devicetree/bindings/usb/usb-xhci.txt |1 +
 drivers/usb/host/xhci-plat.c   |3 +++
 drivers/usb/host/xhci.c|2 +-
 drivers/usb/host/xhci.h|1 +
 4 files changed, 6 insertions(+), 1 deletions(-)

diff --git a/Documentation/devicetree/bindings/usb/usb-xhci.txt b/Documentation/devicetree/bindings/usb/usb-xhci.txt
index 2d80b60..ae6e484 100644
--- a/Documentation/devicetree/bindings/usb/usb-xhci.txt
+++ b/Documentation/devicetree/bindings/usb/usb-xhci.txt
@@ -26,6 +26,7 @@ Required properties:
 
 Optional properties:
   - clocks: reference to a clock
+  - usb2-lpm-disable: indicate if we don't want to enable USB2 HW LPM
   - usb3-lpm-capable: determines if platform is USB3 LPM capable
   - quirk-broken-port-ped: set if the controller has broken port disable mechanism
 
diff --git a/drivers/usb/host/xhci-plat.c b/drivers/usb/host/xhci-plat.c
index c04144b..9028fb5 100644
--- a/drivers/usb/host/xhci-plat.c
+++ b/drivers/usb/host/xhci-plat.c
@@ -267,6 +267,9 @@ static int xhci_plat_probe(struct platform_device *pdev)
goto disable_clk;
}
 
+   if (device_property_read_bool(sysdev, "usb2-lpm-disable"))
+   xhci->quirks |= XHCI_HW_LPM_DISABLE;
+
if (device_property_read_bool(sysdev, "usb3-lpm-capable"))
xhci->quirks |= XHCI_LPM_SUPPORT;
 
diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c
index b2ff1ff..3a8e75f 100644
--- a/drivers/usb/host/xhci.c
+++ b/drivers/usb/host/xhci.c
@@ -4087,7 +4087,7 @@ static int xhci_set_usb2_hardware_lpm(struct usb_hcd *hcd,
xhci_dbg(xhci, "%s port %d USB2 hardware LPM\n",
enable ? "enable" : "disable", port_num + 1);
 
-   if (enable) {
+   if (enable && !(xhci->quirks & XHCI_HW_LPM_DISABLE)) {
/* Host supports BESL timeout instead of HIRD */
if (udev->usb2_hw_lpm_besl_capable) {
/* if device doesn't have a preferred BESL value use a
diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h
index e3e9352..5d89c51 100644
--- a/drivers/usb/host/xhci.h
+++ b/drivers/usb/host/xhci.h
@@ -1821,6 +1821,7 @@ struct xhci_hcd {
 #define XHCI_LIMIT_ENDPOINT_INTERVAL_7 (1 << 26)
 #define XHCI_U2_DISABLE_WAKE   (1 << 27)
 #define XHCI_ASMEDIA_MODIFY_FLOWCONTROL   (1 << 28)
+#define XHCI_HW_LPM_DISABLE   (1 << 29)
 
 unsigned int   num_active_eps;
 unsigned int   limit_active_eps;
-- 
1.7.1



Re: [PATCH -mm] mm: Clear to access sub-page last when clearing huge page

2017-08-07 Thread Huang, Ying
Mike Kravetz  writes:

> On 08/07/2017 12:21 AM, Huang, Ying wrote:
>> From: Huang Ying 
>> 
>> Huge page helps to reduce TLB miss rate, but it has higher cache
>> footprint, sometimes this may cause some issue.  For example, when
>> clearing huge page on x86_64 platform, the cache footprint is 2M.  But
>> on a Xeon E5 v3 2699 CPU, there are 18 cores, 36 threads, and only 45M
>> LLC (last level cache).  That is, in average, there are 2.5M LLC for
>> each core and 1.25M LLC for each thread.  If the cache pressure is
>> heavy when clearing the huge page, and we clear the huge page from the
>> begin to the end, it is possible that the begin of huge page is
>> evicted from the cache after we finishing clearing the end of the huge
>> page.  And it is possible for the application to access the begin of
>> the huge page after clearing the huge page.
>> 
>> To help the above situation, in this patch, when we clear a huge page,
>> the order to clear sub-pages is changed.  In quite some situation, we
>> can get the address that the application will access after we clear
>> the huge page, for example, in a page fault handler.  Instead of
>> clearing the huge page from begin to end, we will clear the sub-pages
>> farthest from the the sub-page to access firstly, and clear the
>> sub-page to access last.  This will make the sub-page to access most
>> cache-hot and sub-pages around it more cache-hot too.  If we cannot
>> know the address the application will access, the begin of the huge
>> page is assumed to be the the address the application will access.
>> 
>> With this patch, the throughput increases ~28.3% in vm-scalability
>> anon-w-seq test case with 72 processes on a 2 socket Xeon E5 v3 2699
>> system (36 cores, 72 threads).  The test case creates 72 processes,
>> each process mmap a big anonymous memory area and writes to it from
>> the begin to the end.  For each process, other processes could be seen
>> as other workload which generates heavy cache pressure.  At the same
>> time, the cache miss rate reduced from ~33.4% to ~31.7%, the
>> IPC (instruction per cycle) increased from 0.56 to 0.74, and the time
>> spent in user space is reduced ~7.9%
>> 
>> Thanks Andi Kleen to propose to use address to access to determine the
>> order of sub-pages to clear.
>> 
>> The hugetlbfs access address could be improved, will do that in
>> another patch.
>
> hugetlb_fault masks off the actual faulting address with,
> address &= huge_page_mask(h);
> before calling hugetlb_no_page.
>
> But, we could pass down the actual (unmasked) address to take advantage
> of this optimization for hugetlb faults as well.  hugetlb_fault is the
> only caller of hugetlb_no_page, so this should be pretty straight forward.
>
> Were you thinking of additional improvements?

No.  I am thinking of something like this.  If the basic idea is
accepted, I plan to add better support like this for hugetlbfs in
another patch.

Best Regards,
Huang, Ying
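
To make the ordering concrete, a small user-space sketch of the idea
(illustrative only; the kernel patch works on struct page sub-pages via
clear_user_highpage() rather than a flat buffer, and its iteration order
differs in detail):

#include <string.h>

#define SUBPAGE_SIZE    4096u
#define NR_SUBPAGES     512u    /* 2MB huge page on x86_64 */

/*
 * Clear sub-pages in order of decreasing distance from the sub-page the
 * application is expected to touch ('target'), so that 'target' is
 * cleared last and stays the most cache-hot.
 */
static void clear_huge_towards(unsigned char *huge, unsigned int target)
{
        unsigned int left = 0, right = NR_SUBPAGES - 1;

        while (left < target || right > target) {
                /* clear whichever remaining end is farther from target */
                if (left < target && target - left >= right - target)
                        memset(huge + (size_t)left++ * SUBPAGE_SIZE, 0, SUBPAGE_SIZE);
                else
                        memset(huge + (size_t)right-- * SUBPAGE_SIZE, 0, SUBPAGE_SIZE);
        }
        memset(huge + (size_t)target * SUBPAGE_SIZE, 0, SUBPAGE_SIZE); /* last */
}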


Re: [lkp-robot] [mm] 7674270022: will-it-scale.per_process_ops -19.3% regression

2017-08-07 Thread Nadav Amit
Minchan Kim  wrote:

> Hi,
> 
> On Tue, Aug 08, 2017 at 09:19:23AM +0800, kernel test robot wrote:
>> Greeting,
>> 
>> FYI, we noticed a -19.3% regression of will-it-scale.per_process_ops due to 
>> commit:
>> 
>> 
>> commit: 76742700225cad9df49f05399381ac3f1ec3dc60 ("mm: fix 
>> MADV_[FREE|DONTNEED] TLB flush miss problem")
>> url: 
>> https://github.com/0day-ci/linux/commits/Nadav-Amit/mm-migrate-prevent-racy-access-to-tlb_flush_pending/20170802-205715
>> 
>> 
>> in testcase: will-it-scale
>> on test machine: 88 threads Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz with 
>> 64G memory
>> with following parameters:
>> 
>>  nr_task: 16
>>  mode: process
>>  test: brk1
>>  cpufreq_governor: performance
>> 
>> test-description: Will It Scale takes a testcase and runs it from 1 through 
>> to n parallel copies to see if the testcase will scale. It builds both a 
>> process and threads based test in order to see any differences between the 
>> two.
>> test-url: https://github.com/antonblanchard/will-it-scale
> 
> Thanks for the report.
> Could you explain what kinds of workload you are testing?
> 
> Does it calls frequently madvise(MADV_DONTNEED) in parallel on multiple
> threads?

According to the description it is "testcase:brk increase/decrease of one
page”. According to the mode it spawns multiple processes, not threads.

Since a single page is unmapped each time, and the iTLB-loads increase
dramatically, I would suspect that for some reason a full TLB flush is
caused during do_munmap().

If I find some free time, I’ll try to profile the workload - but feel free
to beat me to it.

Nadav 



Re: [v5] wlcore: add missing nvs file name info for wilink8

2017-08-07 Thread Tony Lindgren
* Reizer, Eyal  [170807 00:47]:
> Hi Tony,
> > 
> > * Reizer, Eyal  [170807 00:32]:
> > > The following commits:
> > > c815fde wlcore: spi: Populate config firmware data
> > > d776fc8 wlcore: sdio: Populate config firmware data
> > >
> > > Populated the nvs entry for wilink6 and wilink7 only while it is
> > > still needed for wilink8 as well.
> > > This broke user space backward compatibility when upgrading from older
> > > kernels, as the alternate mac address would not be read from the nvs that
> > is
> > > present in the file system (lib/firmware/ti-connectivity/wl1271-nvs.bin)
> > > causing mac address change of the wlan interface.
> > >
> > > This patch fix this and update the structure field with the same default
> > > nvs file name that has been used before.
> > >
> > > In addition, some distros hold a default wl1271-nvs.bin in the file
> > > system with a bogus mac address (deadbeef...) that for a wl18xx device
> > > also overrides the mac address that is stored inside the device.
> > > Warn users about this bogus mac address and use a random mac instead
> > 
> > Hmm looks pretty good to me except for one more thing I just noticed.
> > 
> > Why don't you just use the hardware mac address instead of a random
> > mac address on wl18xx device when you see a bogus nvs file?
> > 
> 
> I agree that this would have been better but the problem is that hardware 
> mac address is available for wilink8 only. Wilink6/7 don't have one stored.
> The wlcore code responsible for handling mac address is common code 
> and there is method for detecting between them in this module.

Care to clarify a bit.. Are you saying wilink8 will use the hardware
mac address in case of bogus nvs file?

Regards,

Tony


Re: [PATCH] gpio: uniphier: add UniPhier GPIO controller driver

2017-08-07 Thread Keerthy


On Tuesday 08 August 2017 06:36 AM, Masahiro Yamada wrote:
> Hi Linus,
> 
> 2017-08-08 0:37 GMT+09:00 Linus Walleij :
>> On Mon, Aug 7, 2017 at 3:50 PM, Masahiro Yamada
>>  wrote:
>>
>>> Adding "interrupts" property in DT causes
>>> of_pupulate_default_populate() to assign virtual IRQ numbers
>>> before driver probing.  So it does not work well with IRQ domain hierarchy.
>>
>> I think I heard some noise about this the week before.
>>
>>> For pinctrl/stm32/pinctrl-stm32.c,
>>> I do not see "interrupts", so it just straight maps the irq numbers.
>>
>> I think OMAP and DaVinci does someting similar too. This is from a recent
>> DaVinci patch from Keerthy:
>>
>> +Example for 66AK2G:
>> +
>> +gpio0: gpio@2603000 {
>> +   compatible = "ti,k2g-gpio", "ti,keystone-gpio";
>> +   reg = <0x02603000 0x100>;
>> +   gpio-controller;
>> +   #gpio-cells = <2>;
>> +   interrupts = ,
>> +   ,
>> +   ,
>> +   ,
>> +   ,
>> +   ,
>> +   ,
>> +   ,
>> +   ;
>> +   interrupt-controller;
>> +   #interrupt-cells = <2>;
>> +   ti,ngpio = <144>;
>> +   ti,davinci-gpio-unbanked = <0>;
>> +   clocks = <&k2g_clks 0x001b 0x0>;
>> +   clock-names = "gpio";
>> +};
>>
>>
>> That looks fairly similar.
>>
> 
> I do not think so.
> 
> 
> I do not see .alloc hook in drivers/gpio/gpio-davinci.c
> so this driver is unrelated to IRQ domain hierarchy.

Hi Masahiro,

Yes CONFIG_IRQ_DOMAIN_HIERARCHY is not enabled in keystone_defconfig or
davinci_all_defconfig.

Regards,
Keerthy

> 
> 
> 
> 
> 
> 


Re: [PATCH 13/18] power: supply: bq24190_charger: Export 5V boost converter as regulator

2017-08-07 Thread Tony Lindgren
* Hans de Goede  [170806 05:37]:
> Register the 5V boost converter as a regulator named
> "regulator-bq24190-usb-vbus". Note the name includes "bq24190" because
> the bq24190 family is also used on ACPI devices where there are no
> device-tree phandles, so regulator_get will fallback to the name and thus
> it must be unique on the system.

Nice, this makes VBUS easy to use for USB PHY drivers :)

Tony


[PATCH] usb: imx21-hcd: fix error return code in imx21_probe()

2017-08-07 Thread Gustavo A. R. Silva
platform_get_irq() returns an error code, but the imx21-hcd driver
ignores it and always returns -ENXIO. This is not correct, and
prevents -EPROBE_DEFER from being propagated properly.

Notice that platform_get_irq() no longer returns 0 on error:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e330b9a6bb35dc7097a4f02cb1ae7b6f96df92af

Print error message and propagate the return value of platform_get_irq
on failure.

This issue was detected with the help of Coccinelle.

Signed-off-by: Gustavo A. R. Silva 
---
 drivers/usb/host/imx21-hcd.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/usb/host/imx21-hcd.c b/drivers/usb/host/imx21-hcd.c
index f542045..e25d72e 100644
--- a/drivers/usb/host/imx21-hcd.c
+++ b/drivers/usb/host/imx21-hcd.c
@@ -1849,8 +1849,10 @@ static int imx21_probe(struct platform_device *pdev)
if (!res)
return -ENODEV;
irq = platform_get_irq(pdev, 0);
-   if (irq < 0)
-   return -ENXIO;
+   if (irq < 0) {
+   dev_err(&pdev->dev, "Failed to get IRQ: %d\n", irq);
+   return irq;
+   }
 
hcd = usb_create_hcd(&imx21_hc_driver,
&pdev->dev, dev_name(&pdev->dev));
-- 
2.5.0



Re: [RESEND PATCH] bcache: Don't reinvent the wheel but use existing llist API

2017-08-07 Thread Byungchul Park
On Mon, Aug 07, 2017 at 06:18:35PM +0800, Coly Li wrote:
> On 2017/8/7 4:38 PM, Byungchul Park wrote:
> > Although llist provides proper APIs, they are not used. Make them used.
> > 
> > Signed-off-by: Byungchul Park 
> 
> Only have a question about why not using llist_for_each_entry(), it's

Hello,

The reason is to keep the original logic unchanged. The logic already
behaves as if it were the safe version with respect to removal.

> still OK with llist_for_each_entry_safe(). The rested part is good to me.
> 
> Acked-by: Coly Li 
> 
> > ---
> >  drivers/md/bcache/closure.c | 17 +++--
> >  1 file changed, 3 insertions(+), 14 deletions(-)
> > 
> > diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
> > index 864e673..1841d03 100644
> > --- a/drivers/md/bcache/closure.c
> > +++ b/drivers/md/bcache/closure.c
> > @@ -64,27 +64,16 @@ void closure_put(struct closure *cl)
> >  void __closure_wake_up(struct closure_waitlist *wait_list)
> >  {
> > struct llist_node *list;
> > -   struct closure *cl;
> > +   struct closure *cl, *t;
> > struct llist_node *reverse = NULL;
> >  
> > list = llist_del_all(&wait_list->list);
> >  
> > /* We first reverse the list to preserve FIFO ordering and fairness */
> > -
> > -   while (list) {
> > -   struct llist_node *t = list;
> > -   list = llist_next(list);
> > -
> > -   t->next = reverse;
> > -   reverse = t;
> > -   }
> > +   reverse = llist_reverse_order(list);
> >  
> > /* Then do the wakeups */
> > -
> > -   while (reverse) {
> > -   cl = container_of(reverse, struct closure, list);
> > -   reverse = llist_next(reverse);
> > -
> > +   llist_for_each_entry_safe(cl, t, reverse, list) {
> 
> Just wondering why not using llist_for_each_entry(), or you use the
> _safe version on purpose ?

If I use llist_for_each_entry(), it would change the original
behavior. Is that OK?

Thank you,
Byungchul
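
To illustrate the point being made, a small sketch (not bcache code; struct
item and wake_one() are placeholders, and <linux/llist.h> is assumed):

#include <linux/llist.h>

struct item {
        struct llist_node list;
        /* ... payload ... */
};

extern void wake_one(struct item *it);

static void wake_all(struct llist_node *first)
{
        struct item *it, *next;

        /*
         * The _safe variant loads the next node before the body runs, so
         * wake_one() may free or reuse 'it' without breaking the walk --
         * plain llist_for_each_entry() would read it->list.next after the
         * body, i.e. after the entry may already be gone.
         */
        llist_for_each_entry_safe(it, next, first, list)
                wake_one(it);
}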



Re: [PATCH 3.18 00/50] 3.18.64-stable review

2017-08-07 Thread Guenter Roeck

On 08/07/2017 12:34 PM, Greg Kroah-Hartman wrote:

On Sat, Aug 05, 2017 at 12:11:19PM -0700, Guenter Roeck wrote:

On 08/05/2017 08:43 AM, Greg Kroah-Hartman wrote:

On Sat, Aug 05, 2017 at 08:02:17AM +0200, Willy Tarreau wrote:

On Sat, Aug 05, 2017 at 07:55:11AM +0200, Willy Tarreau wrote:

On Fri, Aug 04, 2017 at 07:51:07PM -0700, Greg Kroah-Hartman wrote:

On Fri, Aug 04, 2017 at 07:46:57PM -0700, Greg Kroah-Hartman wrote:

On Fri, Aug 04, 2017 at 06:43:50PM -0700, Guenter Roeck wrote:

On 08/04/2017 04:15 PM, Greg Kroah-Hartman wrote:

This is the start of the stable review cycle for the 3.18.64 release.
There are 50 patches in this series, all will be posted as a response
to this one.  If anyone has any issues with these being applied, please
let me know.

Responses should be made by Sun Aug  6 23:15:34 UTC 2017.
Anything received after that time might be too late.



Preliminary:

Lots of

lib/string.c:31:32: fatal error: asm/word-at-a-time.h

affecting several architectures.

alpha:

lib/string.c:217:4: error: implicit declaration of function 'zero_bytemask'


Hm, I think I need to add c753bf34c94e ("word-at-a-time.h: support
zero_bytemask() on alpha and tile"), right?  Any other arches failing?


Hm, that doesn't work, do we care about tile? :)

Let me see how deep this hole is, I just wanted to get strscpy into 3.18
to fix a bug...


I suspect you'll need this one which came as part of the strscpy() series
between 4.2 and 4.3 (though I have not tested) :

commit a6e2f029ae34f41adb6ae3812c32c5d326e1abd2
Author: Chris Metcalf 
Date:   Wed Apr 29 12:48:40 2015 -0400

  Make asm/word-at-a-time.h available on all architectures
  Added the x86 implementation of word-at-a-time to the
  generic version, which previously only supported big-endian.
  (...)


OK I just applied it on top of 3.18.64-rc1 and it allowed me to build mips
which previously broke. It will not apply as-is, you'll need to drop the
change for arch/nios2/include/asm/Kbuild, and after that it's OK.


Thanks for that, I've now queued that patch up.



Better, but there are still some errors.

powerpc:
lib/string.c: In function 'strscpy':
lib/string.c:217:4: error: implicit declaration of function 'zero_bytemask'

tile:
arch/tile/gxio/mpipe.c:46:15: error: conflicting types for 'strscpy'
include/linux/string.h:29:22: note: previous declaration of 'strscpy' was here

Missing patches:

7a5692e6e533 ("arch/powerpc: provide zero_bytemask() for big-endian")
30059d494a72 ("tile: use global strscpy() rather than private copy")


Thanks for these, I'll queue them up.  And do a -rc2 in a few days as
this was a mess...



Getting there. With v3.18.63-62-gc7d9ae0:

Build results:
total: 136 pass: 136 fail: 0
Qemu test results:
total: 111 pass: 111 fail: 0

Details are available at http://kerneltests.org/builders.

Guenter


[PATCH] f2fs: fix some cases with reserved_blocks

2017-08-07 Thread Yunlong Song
Signed-off-by: Yunlong Song 
---
 fs/f2fs/recovery.c | 3 ++-
 fs/f2fs/super.c| 9 +
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index a3d0261..e288319 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -51,7 +51,8 @@ bool space_for_roll_forward(struct f2fs_sb_info *sbi)
 {
s64 nalloc = percpu_counter_sum_positive(&sbi->alloc_valid_block_count);
 
-   if (sbi->last_valid_block_count + nalloc > sbi->user_block_count)
+   if (sbi->last_valid_block_count + nalloc +
+   sbi->reserved_blocks > sbi->user_block_count)
return false;
return true;
 }
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 4c1bdcb..c644bf5 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -946,6 +946,7 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf)
u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
block_t total_count, user_block_count, start_count, ovp_count;
u64 avail_node_count;
+   block_t avail_user_block_count;
 
total_count = le64_to_cpu(sbi->raw_super->block_count);
user_block_count = sbi->user_block_count;
@@ -953,16 +954,16 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf)
ovp_count = SM_I(sbi)->ovp_segments << sbi->log_blocks_per_seg;
buf->f_type = F2FS_SUPER_MAGIC;
buf->f_bsize = sbi->blocksize;
+   avail_user_block_count = user_block_count - sbi->reserved_blocks;
 
buf->f_blocks = total_count - start_count;
buf->f_bfree = user_block_count - valid_user_blocks(sbi) + ovp_count;
-   buf->f_bavail = user_block_count - valid_user_blocks(sbi) -
-   sbi->reserved_blocks;
+   buf->f_bavail = avail_user_block_count - valid_user_blocks(sbi);
 
avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM;
 
-   if (avail_node_count > user_block_count) {
-   buf->f_files = user_block_count;
+   if (avail_node_count > avail_user_block_count) {
+   buf->f_files = avail_user_block_count;
buf->f_ffree = buf->f_bavail;
} else {
buf->f_files = avail_node_count;
-- 
1.8.5.2



Re: Possible race in pc87413_wdt.ko

2017-08-07 Thread Guenter Roeck

On 08/07/2017 06:22 AM, Anton Volkov wrote:

Hello.

While searching for races in the Linux kernel I've come across the
"drivers/watchdog/pc87413_wdt.ko" module. Here is a question that I came up
with while analyzing the results. Lines are given using the info from Linux v4.12.

Consider the following case:

Thread 1:                               Thread 2:
pc87413_init
  misc_register(&pc87413_miscdev)
  -> pc87413_get_swc_base_addr          pc87413_open
                                        -> pc87413_refresh
                                           -> pc87413_swc_bank3
     swc_base_addr = ...                      reads swc_base_addr
     (pc87413_wdt.c: line 133)                (pc87413_wdt.c: line 146)

So in this case, premature registration of the device leads to a possible race
between the initialization process and a callback to the registered device.

Is this race feasible from your point of view? And if it is, is it possible to 
move the device registration a bit further down in the pc87413_init function?



Yes, the race is feasible, and it is possible to move the device registration
function (though the preferred solution would be to convert the driver to use
the watchdog subsystem). The code looks pretty bad as written.
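
A rough sketch of that reordering (a hypothetical skeleton, not the actual
driver code; the "remaining hardware setup" comment stands in for whatever
else pc87413_init() does before the line-133 assignment):

	static int __init pc87413_init(void)
	{
		int ret;

		/* ... remaining hardware setup ... */
		pc87413_get_swc_base_addr();	/* swc_base_addr now valid */

		/*
		 * Register the misc device only after swc_base_addr is
		 * initialized, so pc87413_open()/pc87413_refresh() can
		 * never observe it uninitialized.
		 */
		ret = misc_register(&pc87413_miscdev);
		if (ret)
			return ret;

		return 0;
	}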

Just not sure if it is worth bothering with. I suspect no one is using that
driver anymore (the datasheet is from 2001). Might as well just declare it
obsolete and wait for someone to scream.

Guenter


Re: [PATCH -mm] mm: Clear to access sub-page last when clearing huge page

2017-08-07 Thread Mike Kravetz
On 08/07/2017 12:21 AM, Huang, Ying wrote:
> From: Huang Ying 
> 
> Huge page helps to reduce TLB miss rate, but it has higher cache
> footprint, sometimes this may cause some issue.  For example, when
> clearing huge page on x86_64 platform, the cache footprint is 2M.  But
> on a Xeon E5 v3 2699 CPU, there are 18 cores, 36 threads, and only 45M
> LLC (last level cache).  That is, on average, there are 2.5M LLC for
> each core and 1.25M LLC for each thread.  If the cache pressure is
> heavy when clearing the huge page, and we clear the huge page from
> beginning to end, it is possible that the beginning of the huge page is
> evicted from the cache by the time we finish clearing its end.  And it
> is possible for the application to access the beginning of the huge
> page right after it has been cleared.
> 
> To help with the above situation, this patch changes the order in
> which sub-pages are cleared.  In quite a few situations, we can get
> the address that the application will access after we clear the huge
> page, for example, in a page fault handler.  Instead of clearing the
> huge page from beginning to end, we clear the sub-pages farthest from
> the sub-page to be accessed first, and clear the sub-page to be
> accessed last.  This makes the sub-page to be accessed the most
> cache-hot and the sub-pages around it more cache-hot too.  If we
> cannot know the address the application will access, the beginning of
> the huge page is assumed to be the address the application will access.
> 
> With this patch, the throughput increases ~28.3% in vm-scalability
> anon-w-seq test case with 72 processes on a 2 socket Xeon E5 v3 2699
> system (36 cores, 72 threads).  The test case creates 72 processes,
> each process mmap a big anonymous memory area and writes to it from
> the begin to the end.  For each process, other processes could be seen
> as other workload which generates heavy cache pressure.  At the same
> time, the cache miss rate reduced from ~33.4% to ~31.7%, the
> IPC (instruction per cycle) increased from 0.56 to 0.74, and the time
> spent in user space is reduced by ~7.9%.
> 
> Thanks to Andi Kleen for proposing to use the access address to
> determine the order in which sub-pages are cleared.
> 
> The hugetlbfs access address could be improved, will do that in
> another patch.
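
A minimal sketch of that clearing order (a hypothetical helper, not the code
in this patch; it assumes addr_hint lies within the huge page being cleared):

	/* Clear sub-pages farthest from the expected access address first,
	 * and the sub-page that will be accessed last of all, so that it
	 * and its neighbours stay cache-hot.  Uses clear_user_highpage()
	 * from <linux/highmem.h>.
	 */
	static void clear_huge_page_sketch(struct page *page, unsigned long addr,
					   unsigned long addr_hint,
					   unsigned int pages_per_huge_page)
	{
		unsigned int target = (addr_hint - addr) >> PAGE_SHIFT;
		int d;

		for (d = pages_per_huge_page - 1; d > 0; d--) {
			if (target >= d)
				clear_user_highpage(page + target - d,
					addr + ((target - d) << PAGE_SHIFT));
			if (target + d < pages_per_huge_page)
				clear_user_highpage(page + target + d,
					addr + ((target + d) << PAGE_SHIFT));
		}
		clear_user_highpage(page + target,
				    addr + (target << PAGE_SHIFT));
	}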

hugetlb_fault masks off the actual faulting address with,
address &= huge_page_mask(h);
before calling hugetlb_no_page.

But, we could pass down the actual (unmasked) address to take advantage
of this optimization for hugetlb faults as well.  hugetlb_fault is the
only caller of hugetlb_no_page, so this should be pretty straightforward.

Were you thinking of additional improvements?
-- 
Mike Kravetz


[PATCH v2 3/4] KVM: s390: implements the kvm_arch_vcpu_in_kernel()

2017-08-07 Thread Longpeng(Mike)
This implements the kvm_arch_vcpu_in_kernel() for s390.

Signed-off-by: Longpeng(Mike) 
---
 arch/s390/kvm/kvm-s390.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 0b0c689..e46177b 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -2449,7 +2449,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 
 bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
 {
-   return false;
+   return !(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE);
 }
 EXPORT_SYMBOL_GPL(kvm_arch_vcpu_in_kernel);
 
-- 
1.8.3.1




[PATCH v2 2/4] KVM: X86: implement the logic for spinlock optimization

2017-08-07 Thread Longpeng(Mike)
1. Implement kvm_arch_vcpu_in_kernel(). Because get_cpl() requires
vcpu_load(), we must cache the result (whether the vcpu was preempted
while its CPL was 0) in kvm_vcpu_arch.

2. Add a ->spin_in_kernel hook, because VMX can benefit from it (see
vmx_spin_in_kernel() below).

Signed-off-by: Longpeng(Mike) 
---
 arch/x86/include/asm/kvm_host.h |  5 +
 arch/x86/kvm/hyperv.c   |  2 +-
 arch/x86/kvm/svm.c  |  8 +++-
 arch/x86/kvm/vmx.c  | 16 +++-
 arch/x86/kvm/x86.c  |  7 ++-
 5 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 87ac4fb..d2b2d57 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -688,6 +688,9 @@ struct kvm_vcpu_arch {
 
/* GPA available (AMD only) */
bool gpa_available;
+
+   /* be preempted when it's in kernel-mode(cpl=0) */
+   bool preempted_in_kernel;
 };
 
 struct kvm_lpage_info {
@@ -1057,6 +1060,8 @@ struct kvm_x86_ops {
void (*cancel_hv_timer)(struct kvm_vcpu *vcpu);
 
void (*setup_mce)(struct kvm_vcpu *vcpu);
+
+   bool (*spin_in_kernel)(struct kvm_vcpu *vcpu);
 };
 
 struct kvm_arch_async_pf {
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index cd0e6e6..dec5e8a 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1268,7 +1268,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
 
switch (code) {
case HVCALL_NOTIFY_LONG_SPIN_WAIT:
-   kvm_vcpu_on_spin(vcpu, kvm_arch_vcpu_in_kernel(vcpu));
+   kvm_vcpu_on_spin(vcpu, kvm_x86_ops->spin_in_kernel(vcpu));
break;
case HVCALL_POST_MESSAGE:
case HVCALL_SIGNAL_EVENT:
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index e6ed24e..ccb6df7 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3751,7 +3751,7 @@ static int pause_interception(struct vcpu_svm *svm)
 {
struct kvm_vcpu *vcpu = &(svm->vcpu);
 
-   kvm_vcpu_on_spin(vcpu, kvm_arch_vcpu_in_kernel(vcpu));
+   kvm_vcpu_on_spin(vcpu, kvm_x86_ops->spin_in_kernel(vcpu));
return 1;
 }
 
@@ -5364,6 +5364,11 @@ static void svm_setup_mce(struct kvm_vcpu *vcpu)
vcpu->arch.mcg_cap &= 0x1ff;
 }
 
+static bool svm_spin_in_kernel(struct kvm_vcpu *vcpu)
+{
+   return svm_get_cpl(vcpu) == 0;
+}
+
 static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.cpu_has_kvm_support = has_svm,
.disabled_by_bios = is_disabled,
@@ -5476,6 +5481,7 @@ static void svm_setup_mce(struct kvm_vcpu *vcpu)
.deliver_posted_interrupt = svm_deliver_avic_intr,
.update_pi_irte = svm_update_pi_irte,
.setup_mce = svm_setup_mce,
+   .spin_in_kernel = svm_spin_in_kernel,
 };
 
 static int __init svm_init(void)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 9d6223a..297a158 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -6761,7 +6761,8 @@ static int handle_pause(struct kvm_vcpu *vcpu)
if (ple_gap)
grow_ple_window(vcpu);
 
-   kvm_vcpu_on_spin(vcpu, kvm_arch_vcpu_in_kernel(vcpu));
+   /* See comments in vmx_spin_in_kernel() */
+   kvm_vcpu_on_spin(vcpu, true);
return kvm_skip_emulated_instruction(vcpu);
 }
 
@@ -11636,6 +11637,17 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu)
~FEATURE_CONTROL_LMCE;
 }
 
+static bool vmx_spin_in_kernel(struct kvm_vcpu *vcpu)
+{
+   /*
+* Intel sdm vol3 ch-25.1.3 says: The “PAUSE-loop exiting”
+* VM-execution control is ignored if CPL > 0. OTOH, KVM
+* never set PAUSE_EXITING and just set PLE if supported,
+* so the vcpu must be CPL=0 if it gets a PAUSE exit.
+*/
+   return true;
+}
+
 static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.cpu_has_kvm_support = cpu_has_kvm_support,
.disabled_by_bios = vmx_disabled_by_bios,
@@ -11763,6 +11775,8 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu)
 #endif
 
.setup_mce = vmx_setup_mce,
+
+   .spin_in_kernel = vmx_spin_in_kernel,
 };
 
 static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4430be6..28299b9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2881,6 +2881,10 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu 
*vcpu)
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
int idx;
+
+   if (vcpu->preempted)
+   vcpu->arch.preempted_in_kernel = !kvm_x86_ops->get_cpl(vcpu);
+
/*
 * Disable page faults because we're in atomic context here.
 * kvm_write_guest_offset_cached() would call might_fault()
@@ -7992,6 +7996,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
kvm_pmu_init(vcpu);
 
vcpu->arch.pending_external_vector = -1;
+   vcpu->arch.preempted_in_kernel = false;
 
kvm_hv_vcpu_init(vcpu);
 
@@ -8441,7 +8446,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 
 bo

[PATCH v2 1/4] KVM: add spinlock optimization framework

2017-08-07 Thread Longpeng(Mike)
If the vcpu (the one spinning) exits because of a spinlock acquired in
user mode, the spinlock holder may have been preempted in either user
mode or kernel mode.

But if the spinning vcpu is in kernel mode, the holder must have been
preempted in kernel mode, so we should choose a vcpu in kernel mode as
the most eligible candidate.

This introduces kvm_arch_vcpu_in_kernel() to decide whether the vcpu was
in kernel mode when it was preempted or took the spinlock exit.
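
A simplified sketch of the candidate filter this enables inside
kvm_vcpu_on_spin() (illustrative only; the helper name is made up and this is
not the literal kvm_main.c hunk below):

	static bool candidate_worth_yielding_to(struct kvm_vcpu *candidate,
						bool yield_to_kernel_mode)
	{
		/* Only consider vCPUs that were actually preempted. */
		if (!candidate->preempted)
			return false;
		/*
		 * If the spinning vCPU was in kernel mode, the lock holder
		 * must have been preempted in kernel mode too, so skip
		 * candidates that were preempted while in user mode.
		 */
		if (yield_to_kernel_mode && !kvm_arch_vcpu_in_kernel(candidate))
			return false;
		return true;
	}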

Signed-off-by: Longpeng(Mike) 
---
 arch/arm/kvm/handle_exit.c   | 2 +-
 arch/arm64/kvm/handle_exit.c | 2 +-
 arch/mips/kvm/mips.c | 6 ++
 arch/powerpc/kvm/powerpc.c   | 6 ++
 arch/s390/kvm/diag.c | 2 +-
 arch/s390/kvm/kvm-s390.c | 6 ++
 arch/x86/kvm/hyperv.c| 2 +-
 arch/x86/kvm/svm.c   | 4 +++-
 arch/x86/kvm/vmx.c   | 2 +-
 arch/x86/kvm/x86.c   | 6 ++
 include/linux/kvm_host.h | 3 ++-
 virt/kvm/arm/arm.c   | 5 +
 virt/kvm/kvm_main.c  | 4 +++-
 13 files changed, 42 insertions(+), 8 deletions(-)

diff --git a/arch/arm/kvm/handle_exit.c b/arch/arm/kvm/handle_exit.c
index 54442e3..a7ea5db 100644
--- a/arch/arm/kvm/handle_exit.c
+++ b/arch/arm/kvm/handle_exit.c
@@ -67,7 +67,7 @@ static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct 
kvm_run *run)
if (kvm_vcpu_get_hsr(vcpu) & HSR_WFI_IS_WFE) {
trace_kvm_wfx(*vcpu_pc(vcpu), true);
vcpu->stat.wfe_exit_stat++;
-   kvm_vcpu_on_spin(vcpu);
+   kvm_vcpu_on_spin(vcpu, kvm_arch_vcpu_in_kernel(vcpu));
} else {
trace_kvm_wfx(*vcpu_pc(vcpu), false);
vcpu->stat.wfi_exit_stat++;
diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
index 17d8a16..d6c8cb6 100644
--- a/arch/arm64/kvm/handle_exit.c
+++ b/arch/arm64/kvm/handle_exit.c
@@ -84,7 +84,7 @@ static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct 
kvm_run *run)
if (kvm_vcpu_get_hsr(vcpu) & ESR_ELx_WFx_ISS_WFE) {
trace_kvm_wfx_arm64(*vcpu_pc(vcpu), true);
vcpu->stat.wfe_exit_stat++;
-   kvm_vcpu_on_spin(vcpu);
+   kvm_vcpu_on_spin(vcpu, kvm_arch_vcpu_in_kernel(vcpu));
} else {
trace_kvm_wfx_arm64(*vcpu_pc(vcpu), false);
vcpu->stat.wfi_exit_stat++;
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index d4b2ad1..70208be 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -98,6 +98,12 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
return !!(vcpu->arch.pending_exceptions);
 }
 
+bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
+{
+   return false;
+}
+EXPORT_SYMBOL_GPL(kvm_arch_vcpu_in_kernel);
+
 int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
 {
return 1;
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 1a75c0b..6184c45 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -58,6 +58,12 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
return !!(v->arch.pending_exceptions) || kvm_request_pending(v);
 }
 
+bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
+{
+   return false;
+}
+EXPORT_SYMBOL_GPL(kvm_arch_vcpu_in_kernel);
+
 int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
 {
return 1;
diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c
index ce865bd..4ea8c38 100644
--- a/arch/s390/kvm/diag.c
+++ b/arch/s390/kvm/diag.c
@@ -150,7 +150,7 @@ static int __diag_time_slice_end(struct kvm_vcpu *vcpu)
 {
VCPU_EVENT(vcpu, 5, "%s", "diag time slice end");
vcpu->stat.diagnose_44++;
-   kvm_vcpu_on_spin(vcpu);
+   kvm_vcpu_on_spin(vcpu, kvm_arch_vcpu_in_kernel(vcpu));
return 0;
 }
 
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index af09d34..0b0c689 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -2447,6 +2447,12 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
return kvm_s390_vcpu_has_irq(vcpu, 0);
 }
 
+bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
+{
+   return false;
+}
+EXPORT_SYMBOL_GPL(kvm_arch_vcpu_in_kernel);
+
 void kvm_s390_vcpu_block(struct kvm_vcpu *vcpu)
 {
atomic_or(PROG_BLOCK_SIE, &vcpu->arch.sie_block->prog20);
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 337b6d2..cd0e6e6 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1268,7 +1268,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
 
switch (code) {
case HVCALL_NOTIFY_LONG_SPIN_WAIT:
-   kvm_vcpu_on_spin(vcpu);
+   kvm_vcpu_on_spin(vcpu, kvm_arch_vcpu_in_kernel(vcpu));
break;
case HVCALL_POST_MESSAGE:
case HVCALL_SIGNAL_EVENT:
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 1107626..e6ed24e 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3749,7 +3749,9 @@ static int interrupt_window_interception(struct vcpu_svm 
*svm)
 
 static int pause_interception(struct vcpu_svm *svm)
 {
-   kvm_vc

[PATCH v2 4/4] KVM: arm: implements the kvm_arch_vcpu_in_kernel()

2017-08-07 Thread Longpeng(Mike)
This implements the kvm_arch_vcpu_in_kernel() for ARM.

Signed-off-by: Longpeng(Mike) 
---
 virt/kvm/arm/arm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 862f820..b9f68e4 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -418,7 +418,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 
 bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
 {
-   return false;
+   return vcpu_mode_priv(vcpu);
 }
 
 /* Just ensure a guest exit from a particular CPU */
-- 
1.8.3.1




Re: [PATCH v2 0/9] mfd: axp20x: Add basic support for AXP813

2017-08-07 Thread Chen-Yu Tsai
On Wed, Jul 26, 2017 at 4:32 PM, Maxime Ripard
 wrote:
> On Wed, Jul 26, 2017 at 04:28:23PM +0800, Chen-Yu Tsai wrote:
>> Hi everyone,
>>
>> This is v2 of my AXP813 support series. The device tree patches are
>> based on my A83T MMC support series. These will go through the sunxi
>> tree. The dt-binding and mfd patches are based on v4.13-rc1. These
>> will go through Lee's mfd tree.
>>
>> Changes since v1:
>>
>>   - Provided relative path for ac100.txt in dt-bindings/mfd/axp20x.txt
>>
>>   - Added Rob's acks to dt-binding patches
>>
>>   - Added Quentin's "mfd: axp20x: use correct platform device id for
>> many PEK" patch to this series. This patch depends on mfd changes
>> in this series. It is included so Lee can take them together in
>> one go.
>>
>>   - Added Lee's mfd-acks to mfd patches
>>
>>   - Added axp818 compatible with axp813 fallback. The two chips are
>> identical except for the markings. The added compatible matches
>> what is actually on the board, to avoid confusing readers.
>>
>>   - Fixed up device tree patches to mention which board is changed
>>
>>   - Added device tree patches for the H8 homlet
>
> For the whole series,
> Acked-by: Maxime Ripard 

Applied the dts patches for 4.14.

ChenYu


[PATCH v2 0/4] KVM: optimize the kvm_vcpu_on_spin

2017-08-07 Thread Longpeng(Mike)
This is a simple optimization for kvm_vcpu_on_spin(); the
main idea is described in patch 1's commit message.

I did some tests based on the RFC version; the results show
that it improves performance slightly.

== Geekbench-3.4.1 ==
VM1: 8U, 4G, vcpus 0...7 are 1:1 pinned to pcpus 6...11,18,19
running Geekbench-3.4.1 *10 runs*
VM2/VM3/VM4: configuration is the same as VM1,
stressing each vcpu's usage (as seen by top in the guest) to 40%

The comparison of each testcase's score:
(higher is better)
before  after   improve
Integer
 single 1176.7  1179.0  0.2%
 multi  3459.5  3426.5  -0.9%
Float
 single 1150.5  1150.9  0.0%
 multi  3364.5  3391.9  0.8%
Memory(stream)
 single 1768.7  1773.1  0.2%
 multi  2511.6  2557.2  1.8%
Overall
 single 1284.2  1286.2  0.2%
 multi  3231.4  3238.4  0.2%


== kernbench-0.42 ==
VM1: 8U, 12G, vcpus 0...7 are 1:1 pinned to pcpus 6...11,18,19
running "kernbench -n 10"
VM2/VM3/VM4: configuration is the same as VM1,
stressing each vcpu's usage (as seen by top in the guest) to 40%

The comparison of 'Elapsed Time':
(lower is better)
before  after   improve
load -j4    12.762  12.751  0.1%
load -j32   9.743   8.955   8.1%
load -j 9.688   9.229   4.7%


Physical Machine:
  Architecture:  x86_64
  CPU op-mode(s):32-bit, 64-bit
  Byte Order:Little Endian
  CPU(s):24
  On-line CPU(s) list:   0-23
  Thread(s) per core:2
  Core(s) per socket:6
  Socket(s): 2
  NUMA node(s):  2
  Vendor ID: GenuineIntel
  CPU family:6
  Model: 45
  Model name:Intel(R) Xeon(R) CPU E5-2640 0 @ 2.50GHz
  Stepping:  7
  CPU MHz:   2799.902
  BogoMIPS:  5004.67
  Virtualization:VT-x
  L1d cache: 32K
  L1i cache: 32K
  L2 cache:  256K
  L3 cache:  15360K
  NUMA node0 CPU(s): 0-5,12-17
  NUMA node1 CPU(s): 6-11,18-23

---
Changes since V1:
 - split the implementation of s390 & arm. [David]
 - refactor the impls according to the suggestion. [Paolo]

Changes since RFC:
 - only cache result for X86. [David & Cornlia & Paolo]
 - add performance numbers. [David]
 - impls arm/s390. [Christoffer & David]
 - refactor the impls. [me]

---
Longpeng(Mike) (4):
  KVM: add spinlock optimization framework
  KVM: X86: implement the logic for spinlock optimization
  KVM: s390: implements the kvm_arch_vcpu_in_kernel()
  KVM: arm: implements the kvm_arch_vcpu_in_kernel()

 arch/arm/kvm/handle_exit.c  |  2 +-
 arch/arm64/kvm/handle_exit.c|  2 +-
 arch/mips/kvm/mips.c|  6 ++
 arch/powerpc/kvm/powerpc.c  |  6 ++
 arch/s390/kvm/diag.c|  2 +-
 arch/s390/kvm/kvm-s390.c|  6 ++
 arch/x86/include/asm/kvm_host.h |  5 +
 arch/x86/kvm/hyperv.c   |  2 +-
 arch/x86/kvm/svm.c  | 10 +-
 arch/x86/kvm/vmx.c  | 16 +++-
 arch/x86/kvm/x86.c  | 11 +++
 include/linux/kvm_host.h|  3 ++-
 virt/kvm/arm/arm.c  |  5 +
 virt/kvm/kvm_main.c |  4 +++-
 14 files changed, 72 insertions(+), 8 deletions(-)

-- 
1.8.3.1




[PATCH v2] x86/xen/64: Rearrange the SYSCALL entries

2017-08-07 Thread Andy Lutomirski
Xen's raw SYSCALL entries are much less weird than native.  Rather
than fudging them to look like native entries, use the Xen-provided
stack frame directly.

This lets us eliminate entry_SYSCALL_64_after_swapgs and two uses of
the SWAPGS_UNSAFE_STACK paravirt hook.  The SYSENTER code would
benefit from similar treatment.

This makes one change to the native code path: the compat
instruction that clears the high 32 bits of %rax is moved slightly
later.  I'd be surprised if this affects performance at all.

Signed-off-by: Andy Lutomirski 
---

Changes from v1 (which I never actually emailed):
 - Fix zero-extension in the compat case.

 arch/x86/entry/entry_64.S|  9 ++---
 arch/x86/entry/entry_64_compat.S |  7 +++
 arch/x86/xen/xen-asm_64.S| 23 +--
 3 files changed, 14 insertions(+), 25 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index aa58155187c5..7cee92cf807f 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -142,14 +142,8 @@ ENTRY(entry_SYSCALL_64)
 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
 * it is too small to ever cause noticeable irq latency.
 */
-   SWAPGS_UNSAFE_STACK
-   /*
-* A hypervisor implementation might want to use a label
-* after the swapgs, so that it can do the swapgs
-* for the guest and jump here on syscall.
-*/
-GLOBAL(entry_SYSCALL_64_after_swapgs)
 
+   swapgs
movq%rsp, PER_CPU_VAR(rsp_scratch)
movqPER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
@@ -161,6 +155,7 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
pushq   %r11/* pt_regs->flags */
pushq   $__USER_CS  /* pt_regs->cs */
pushq   %rcx/* pt_regs->ip */
+GLOBAL(entry_SYSCALL_64_after_hwframe)
pushq   %rax/* pt_regs->orig_ax */
pushq   %rdi/* pt_regs->di */
pushq   %rsi/* pt_regs->si */
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index e1721dafbcb1..5314d7b8e5ad 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -183,21 +183,20 @@ ENDPROC(entry_SYSENTER_compat)
  */
 ENTRY(entry_SYSCALL_compat)
/* Interrupts are off on entry. */
-   SWAPGS_UNSAFE_STACK
+   swapgs
 
/* Stash user ESP and switch to the kernel stack. */
movl%esp, %r8d
movqPER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
-   /* Zero-extending 32-bit regs, do not remove */
-   movl%eax, %eax
-
/* Construct struct pt_regs on stack */
pushq   $__USER32_DS/* pt_regs->ss */
pushq   %r8 /* pt_regs->sp */
pushq   %r11/* pt_regs->flags */
pushq   $__USER32_CS/* pt_regs->cs */
pushq   %rcx/* pt_regs->ip */
+GLOBAL(entry_SYSCALL_compat_after_hwframe)
+   movl%eax, %eax  /* discard orig_ax high bits */
pushq   %rax/* pt_regs->orig_ax */
pushq   %rdi/* pt_regs->di */
pushq   %rsi/* pt_regs->si */
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index c3df43141e70..a8a4f4c460a6 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -82,34 +82,29 @@ RELOC(xen_sysret64, 1b+1)
  * rip
  * r11
  * rsp->rcx
- *
- * In all the entrypoints, we undo all that to make it look like a
- * CPU-generated syscall/sysenter and jump to the normal entrypoint.
  */
 
-.macro undo_xen_syscall
-   mov 0*8(%rsp), %rcx
-   mov 1*8(%rsp), %r11
-   mov 5*8(%rsp), %rsp
-.endm
-
 /* Normal 64-bit system call target */
 ENTRY(xen_syscall_target)
-   undo_xen_syscall
-   jmp entry_SYSCALL_64_after_swapgs
+   popq %rcx
+   popq %r11
+   jmp entry_SYSCALL_64_after_hwframe
 ENDPROC(xen_syscall_target)
 
 #ifdef CONFIG_IA32_EMULATION
 
 /* 32-bit compat syscall target */
 ENTRY(xen_syscall32_target)
-   undo_xen_syscall
-   jmp entry_SYSCALL_compat
+   popq %rcx
+   popq %r11
+   jmp entry_SYSCALL_compat_after_hwframe
 ENDPROC(xen_syscall32_target)
 
 /* 32-bit compat sysenter target */
 ENTRY(xen_sysenter_target)
-   undo_xen_syscall
+   mov 0*8(%rsp), %rcx
+   mov 1*8(%rsp), %r11
+   mov 5*8(%rsp), %rsp
jmp entry_SYSENTER_compat
 ENDPROC(xen_sysenter_target)
 
-- 
2.13.3



Re: [PATCH v2 00/13] mpt3sas driver NVMe support:

2017-08-07 Thread Keith Busch
On Mon, Aug 07, 2017 at 08:45:25AM -0700, James Bottomley wrote:
> On Mon, 2017-08-07 at 20:01 +0530, Kashyap Desai wrote:
> > 
> > We have to attempt this use case and see how it behaves. I have not
> > tried this, so not sure if things are really bad or just some tuning
> > may be helpful. I will revert back to you on this.
> > 
> > I understood request as -  We need some udev rules to be working well
> > for *same* NVME drives if it is behind  or native .
> > Example - If user has OS installed on NVME drive which is behind
> >  driver as SCSI disk should be able to boot if he/she hooked
> > same NVME drive which is detected by native  driver (and vice
> > versa.)
> 
> It's not just the udev rules, it's the tools as well; possibly things
> like that nvme-cli toolkit Intel is doing.

It looks like they can make existing nvme tooling work with little
effort if they have the driver implement NVME_IOCTL_ADMIN_COMMAND, and
then have their driver build the MPI NVMe Encapsulated Request from that.
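
Roughly along these lines (a hypothetical glue sketch, not mpt3sas code;
build_and_send_nvme_encapsulated() is a stand-in for the MPI Encapsulated
Request path mentioned above):

	#include <linux/fs.h>
	#include <linux/nvme_ioctl.h>
	#include <linux/uaccess.h>

	static long mpt3sas_nvme_admin_ioctl(struct file *file, unsigned int cmd,
					     void __user *uarg)
	{
		struct nvme_passthru_cmd pcmd;

		if (cmd != NVME_IOCTL_ADMIN_COMMAND)
			return -ENOTTY;
		if (copy_from_user(&pcmd, uarg, sizeof(pcmd)))
			return -EFAULT;

		/* Repackage the admin command as an MPI NVMe Encapsulated Request. */
		return build_and_send_nvme_encapsulated(file->private_data, &pcmd);
	}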


[RFC v1 0/4] ipmi_bmc: framework for IPMI on BMCs

2017-08-07 Thread Brendan Higgins
This introduces a framework for implementing the BMC side of the IPMI protocol,
roughly mirroring the host-side OpenIPMI framework; it attempts to abstract away
hardware interfaces, such as Block Transfer interface hardware implementations,
from IPMI command handlers.

It does this by implementing the traditional driver model of a bus with devices;
however, in this case a struct ipmi_bmc_bus represents a hardware interface,
while a struct ipmi_bmc_device represents a handler. A handler filters messages
by registering a function which returns whether a given message matches that
handler; the framework also has the concept of a default handler, which is
forwarded all messages that no other handler matched.

In this patchset, we introduce an example of a default handler: a misc device
file interface which implements the same interface as the device file
interface used by the Aspeed BT driver.

Currently, OpenBMC handles all IPMI message routing and handling in userland;
the existing drivers simply provide a file interface for the hardware on the
device. In this patchset, we propose a common file interface to be shared by all
IPMI hardware interfaces, but also a framework for implementing handlers at the
kernel level, similar to how the existing OpenIPMI framework supports both
kernel users and a misc device file interface.

This patchset depends on the "ipmi: bt-i2c: added IPMI Block Transfer over I2C"
patchset, which can be found here:
https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg1461960.html
However, I can fix this if desired.

Tested on the AST2500 EVB.
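
As a rough illustration of the handler model described above (the registration
calls match patch 1/4; the .match member name and the handler body are
hypothetical here, since include/linux/ipmi_bmc.h is not shown in this series
excerpt):

	static bool my_handler_match(struct ipmi_bmc_device *device,
				     struct bt_msg *request)
	{
		/* Claim only the requests this handler knows how to serve;
		 * my_handler_wants() is a placeholder. */
		return my_handler_wants(request);
	}

	static struct ipmi_bmc_device my_handler = {
		.match = my_handler_match,	/* hypothetical member name */
	};

	static int __init my_handler_init(void)
	{
		/* Messages no handler matches still reach the default handler. */
		return ipmi_bmc_register_device(ipmi_bmc_get_global_ctx(),
						&my_handler);
	}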


[RFC v1 2/4] ipmi_bmc: device interface to IPMI BMC framework

2017-08-07 Thread Brendan Higgins
From: Benjamin Fair 

This creates a char device which allows userspace programs to send and
receive IPMI messages. Messages are only routed to userspace if no other
kernel driver can handle them.

Signed-off-by: Benjamin Fair 
Signed-off-by: Brendan Higgins 
---
 drivers/char/ipmi_bmc/Kconfig|   6 +
 drivers/char/ipmi_bmc/Makefile   |   1 +
 drivers/char/ipmi_bmc/ipmi_bmc_devintf.c | 241 +++
 3 files changed, 248 insertions(+)
 create mode 100644 drivers/char/ipmi_bmc/ipmi_bmc_devintf.c

diff --git a/drivers/char/ipmi_bmc/Kconfig b/drivers/char/ipmi_bmc/Kconfig
index b6af38455702..262a17866aa2 100644
--- a/drivers/char/ipmi_bmc/Kconfig
+++ b/drivers/char/ipmi_bmc/Kconfig
@@ -11,6 +11,12 @@ menuconfig IPMI_BMC
 
 if IPMI_BMC
 
+config IPMI_BMC_DEVICE_INTERFACE
+   tristate 'Device interface for BMC-side IPMI'
+   help
+ This provides a file interface to the IPMI BMC core so userland
+ processes may use IPMI.
+
 config IPMI_BMC_BT_I2C
depends on I2C
select I2C_SLAVE
diff --git a/drivers/char/ipmi_bmc/Makefile b/drivers/char/ipmi_bmc/Makefile
index 9c7cd48d899f..ead8abffbd11 100644
--- a/drivers/char/ipmi_bmc/Makefile
+++ b/drivers/char/ipmi_bmc/Makefile
@@ -3,5 +3,6 @@
 #
 
 obj-$(CONFIG_IPMI_BMC) += ipmi_bmc.o
+obj-$(CONFIG_IPMI_BMC_DEVICE_INTERFACE) += ipmi_bmc_devintf.o
 obj-$(CONFIG_IPMI_BMC_BT_I2C) += ipmi_bmc_bt_i2c.o
 obj-$(CONFIG_ASPEED_BT_IPMI_BMC) += ipmi_bmc_bt_aspeed.o
diff --git a/drivers/char/ipmi_bmc/ipmi_bmc_devintf.c 
b/drivers/char/ipmi_bmc/ipmi_bmc_devintf.c
new file mode 100644
index ..2421237ed575
--- /dev/null
+++ b/drivers/char/ipmi_bmc/ipmi_bmc_devintf.c
@@ -0,0 +1,241 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define PFX "IPMI BMC devintf: "
+
+#define DEVICE_NAME "ipmi-bt-host"
+
+/* Must be a power of two */
+#define REQUEST_FIFO_SIZE roundup_pow_of_two(BT_MSG_SEQ_MAX)
+
+struct bmc_devintf_data {
+   struct miscdevice   miscdev;
+   struct ipmi_bmc_device  bmc_device;
+   struct ipmi_bmc_ctx *bmc_ctx;
+   wait_queue_head_t   wait_queue;
+   /* FIFO of waiting messages */
+   DECLARE_KFIFO(requests, struct bt_msg, REQUEST_FIFO_SIZE);
+};
+
+static inline struct bmc_devintf_data *file_to_bmc_devintf_data(
+   struct file *file)
+{
+   return container_of(file->private_data, struct bmc_devintf_data,
+   miscdev);
+}
+
+static ssize_t ipmi_bmc_devintf_read(struct file *file, char __user *buf,
+size_t count, loff_t *ppos)
+{
+   struct bmc_devintf_data *devintf_data = file_to_bmc_devintf_data(file);
+   bool non_blocking = file->f_flags & O_NONBLOCK;
+   struct bt_msg msg;
+
+   if (non_blocking && kfifo_is_empty(&devintf_data->requests)) {
+   return -EAGAIN;
+   } else if (!non_blocking) {
+   if (wait_event_interruptible(devintf_data->wait_queue,
+   !kfifo_is_empty(&devintf_data->requests)))
+   return -ERESTARTSYS;
+   }
+
+   /* TODO(benjaminfair): eliminate this extra copy */
+   if (unlikely(!kfifo_get(&devintf_data->requests, &msg))) {
+   pr_err(PFX "Unable to read request from fifo\n");
+   return -EIO;
+   }
+
+   /* TODO(benjaminfair): handle partial reads of a message */
+   if (count > bt_msg_len(&msg))
+   count = bt_msg_len(&msg);
+
+   if (copy_to_user(buf, &msg, count))
+   return -EFAULT;
+
+   return count;
+}
+
+static ssize_t ipmi_bmc_devintf_write(struct file *file, const char __user 
*buf,
+ size_t count, loff_t *ppos)
+{
+   struct bmc_devintf_data *devintf_data = file_to_bmc_devintf_data(file);
+   bool non_blocking = file->f_flags & O_NONBLOCK;
+   struct bt_msg msg;
+   ssize_t ret = 0;
+
+   if (count > sizeof(struct bt_msg))
+   return -EINVAL;
+
+   if (copy_from_user(&msg, buf, count))
+   return -EFAULT;
+
+   if (count != bt_msg_len(&msg))
+   return -EINVAL;
+
+   ret = ipmi_bmc_send_response(devintf_data->bmc_ctx, &msg);
+
+   /* Try again if blocking is allowed */
+   while (!non_blocking && ret == -EAGAIN) {
+   if (wait_event_inte

[RFC v1 4/4] ipmi_bmc: bt-aspeed: port driver to IPMI BMC framework

2017-08-07 Thread Brendan Higgins
From: Benjamin Fair 

The driver was handling interaction with userspace on its own. This
patch changes it to use the functionality of the ipmi_bmc framework
instead.

Note that this removes the ability for the BMC to set SMS_ATN by making
an ioctl. If this functionality is required, it can be added back in
with a later patch.

Signed-off-by: Benjamin Fair 
Signed-off-by: Brendan Higgins 
---
 drivers/char/ipmi_bmc/ipmi_bmc_bt_aspeed.c | 258 +
 include/uapi/linux/bt-bmc.h|  18 --
 2 files changed, 74 insertions(+), 202 deletions(-)
 delete mode 100644 include/uapi/linux/bt-bmc.h

diff --git a/drivers/char/ipmi_bmc/ipmi_bmc_bt_aspeed.c 
b/drivers/char/ipmi_bmc/ipmi_bmc_bt_aspeed.c
index 70d434bc1cbf..7c8082c511ee 100644
--- a/drivers/char/ipmi_bmc/ipmi_bmc_bt_aspeed.c
+++ b/drivers/char/ipmi_bmc/ipmi_bmc_bt_aspeed.c
@@ -7,25 +7,19 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#include 
-#include 
 #include 
 #include 
 #include 
+#include 
 #include 
-#include 
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
 
-/*
- * This is a BMC device used to communicate to the host
- */
-#define DEVICE_NAME"ipmi-bt-host"
+#define DEVICE_NAME "ipmi-bmc-bt-aspeed"
 
 #define BT_IO_BASE 0xe4
 #define BT_IRQ 10
@@ -61,18 +55,17 @@
 #define BT_BMC_BUFFER_SIZE 256
 
 struct bt_bmc {
+   struct ipmi_bmc_bus bus;
struct device   dev;
-   struct miscdevice   miscdev;
+   struct ipmi_bmc_ctx *bmc_ctx;
+   struct bt_msg   request;
struct regmap   *map;
int offset;
int irq;
-   wait_queue_head_t   queue;
struct timer_list   poll_timer;
-   struct mutexmutex;
+   spinlock_t  lock;
 };
 
-static atomic_t open_count = ATOMIC_INIT(0);
-
 static const struct regmap_config bt_regmap_cfg = {
.reg_bits = 32,
.val_bits = 32,
@@ -158,27 +151,28 @@ static ssize_t bt_writen(struct bt_bmc *bt_bmc, u8 *buf, 
size_t n)
return n;
 }
 
+/* TODO(benjaminfair): support ioctl BT_BMC_IOCTL_SMS_ATN */
 static void set_sms_atn(struct bt_bmc *bt_bmc)
 {
bt_outb(bt_bmc, BT_CTRL_SMS_ATN, BT_CTRL);
 }
 
-static struct bt_bmc *file_bt_bmc(struct file *file)
+/* Called with bt_bmc->lock held */
+static bool __is_request_avail(struct bt_bmc *bt_bmc)
 {
-   return container_of(file->private_data, struct bt_bmc, miscdev);
+   return bt_inb(bt_bmc, BT_CTRL) & BT_CTRL_H2B_ATN;
 }
 
-static int bt_bmc_open(struct inode *inode, struct file *file)
+static bool is_request_avail(struct bt_bmc *bt_bmc)
 {
-   struct bt_bmc *bt_bmc = file_bt_bmc(file);
+   unsigned long flags;
+   bool result;
 
-   if (atomic_inc_return(&open_count) == 1) {
-   clr_b_busy(bt_bmc);
-   return 0;
-   }
+   spin_lock_irqsave(&bt_bmc->lock, flags);
+   result = __is_request_avail(bt_bmc);
+   spin_unlock_irqrestore(&bt_bmc->lock, flags);
 
-   atomic_dec(&open_count);
-   return -EBUSY;
+   return result;
 }
 
 /*
@@ -194,67 +188,43 @@ static int bt_bmc_open(struct inode *inode, struct file 
*file)
  *Length  NetFn/LUN  Seq Cmd Data
  *
  */
-static ssize_t bt_bmc_read(struct file *file, char __user *buf,
-  size_t count, loff_t *ppos)
+static void get_request(struct bt_bmc *bt_bmc)
 {
-   struct bt_bmc *bt_bmc = file_bt_bmc(file);
-   u8 len;
-   int len_byte = 1;
-   u8 kbuffer[BT_BMC_BUFFER_SIZE];
-   ssize_t ret = 0;
-   ssize_t nread;
+   u8 *request_buf = (u8 *) &bt_bmc->request;
+   unsigned long flags;
 
-   if (!access_ok(VERIFY_WRITE, buf, count))
-   return -EFAULT;
+   spin_lock_irqsave(&bt_bmc->lock, flags);
 
-   WARN_ON(*ppos);
-
-   if (wait_event_interruptible(bt_bmc->queue,
-bt_inb(bt_bmc, BT_CTRL) & BT_CTRL_H2B_ATN))
-   return -ERESTARTSYS;
-
-   mutex_lock(&bt_bmc->mutex);
-
-   if (unlikely(!(bt_inb(bt_bmc, BT_CTRL) & BT_CTRL_H2B_ATN))) {
-   ret = -EIO;
-   goto out_unlock;
+   if (!__is_request_avail(bt_bmc)) {
+   spin_unlock_irqrestore(&bt_bmc->lock, flags);
+   return;
}
 
set_b_busy(bt_bmc);
clr_h2b_atn(bt_bmc);
clr_rd_ptr(bt_bmc);
 
-   /*
-* The BT frames start with the message length, which does not
-* include the length byte.
-*/
-   kbuffer[0] = bt_read(bt_bmc);
-   len = kbuffer[0];
-
-   /* We pass the length back to userspace as well */
-   if (len + 1 > count)
-   len = count - 1;
-
-   while (len) {
-   nread = min_t(ssize_t, len, sizeof(kbuffer) - len_byte);
-
-   bt_readn(bt_bmc, kbuffer + len_byte, nread);
-
-   if (copy_to_user(buf,

[RFC v1 3/4] ipmi_bmc: bt-i2c: port driver to IPMI BMC framework

2017-08-07 Thread Brendan Higgins
From: Benjamin Fair 

Instead of handling interaction with userspace and providing a file
interface, rely on the IPMI BMC framework to do this. This simplifies
the logic and eliminates duplicate code.

Signed-off-by: Benjamin Fair 
Signed-off-by: Brendan Higgins 
---
 drivers/char/ipmi_bmc/ipmi_bmc_bt_i2c.c | 202 +---
 1 file changed, 28 insertions(+), 174 deletions(-)

diff --git a/drivers/char/ipmi_bmc/ipmi_bmc_bt_i2c.c 
b/drivers/char/ipmi_bmc/ipmi_bmc_bt_i2c.c
index 686b83fa42a4..6665aa9d4300 100644
--- a/drivers/char/ipmi_bmc/ipmi_bmc_bt_i2c.c
+++ b/drivers/char/ipmi_bmc/ipmi_bmc_bt_i2c.c
@@ -14,102 +14,51 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
-#include 
 #include 
 #include 
-#include 
 
 #define PFX "IPMI BMC BT-I2C: "
 
-/*
- * TODO: This is "bt-host" to match the bt-host driver; however, I think this 
is
- * unclear in the context of a CPU side driver. Should probably name this
- * and the DEVICE_NAME in bt-host to something like "bt-bmc" or "bt-slave".
- */
-#define DEVICE_NAME"ipmi-bt-host"
-
-static const unsigned long request_queue_max_len = 256;
-
-struct bt_request_elem {
-   struct list_headlist;
-   struct bt_msg   request;
-};
-
 struct bt_i2c_slave {
+   struct ipmi_bmc_bus bus;
struct i2c_client   *client;
-   struct miscdevice   miscdev;
+   struct ipmi_bmc_ctx *bmc_ctx;
struct bt_msg   request;
-   struct list_headrequest_queue;
-   atomic_trequest_queue_len;
struct bt_msg   response;
boolresponse_in_progress;
size_t  msg_idx;
spinlock_t  lock;
-   wait_queue_head_t   wait_queue;
-   struct mutexfile_mutex;
 };
 
-static int receive_bt_request(struct bt_i2c_slave *bt_slave, bool non_blocking,
- struct bt_msg *bt_request)
+static bool bt_i2c_is_response_open(struct ipmi_bmc_bus *bus)
 {
-   int res;
+   struct bt_i2c_slave *bt_slave;
+   bool response_in_progress;
unsigned long flags;
-   struct bt_request_elem *queue_elem;
-
-   if (!non_blocking) {
-try_again:
-   res = wait_event_interruptible(
-   bt_slave->wait_queue,
-   atomic_read(&bt_slave->request_queue_len));
-   if (res)
-   return res;
-   }
 
-   spin_lock_irqsave(&bt_slave->lock, flags);
-   if (!atomic_read(&bt_slave->request_queue_len)) {
-   spin_unlock_irqrestore(&bt_slave->lock, flags);
-   if (non_blocking)
-   return -EAGAIN;
-   goto try_again;
-   }
+   bt_slave = container_of(bus, struct bt_i2c_slave, bus);
 
-   if (list_empty(&bt_slave->request_queue)) {
-   pr_err(PFX "request_queue was empty despite nonzero 
request_queue_len\n");
-   return -EIO;
-   }
-   queue_elem = list_first_entry(&bt_slave->request_queue,
- struct bt_request_elem, list);
-   memcpy(bt_request, &queue_elem->request, sizeof(*bt_request));
-   list_del(&queue_elem->list);
-   kfree(queue_elem);
-   atomic_dec(&bt_slave->request_queue_len);
+   spin_lock_irqsave(&bt_slave->lock, flags);
+   response_in_progress = bt_slave->response_in_progress;
spin_unlock_irqrestore(&bt_slave->lock, flags);
-   return 0;
+
+   return !response_in_progress;
 }
 
-static int send_bt_response(struct bt_i2c_slave *bt_slave, bool non_blocking,
-   struct bt_msg *bt_response)
+static int bt_i2c_send_response(struct ipmi_bmc_bus *bus,
+   struct bt_msg *bt_response)
 {
-   int res;
+   struct bt_i2c_slave *bt_slave;
unsigned long flags;
 
-   if (!non_blocking) {
-try_again:
-   res = wait_event_interruptible(bt_slave->wait_queue,
-  !bt_slave->response_in_progress);
-   if (res)
-   return res;
-   }
+   bt_slave = container_of(bus, struct bt_i2c_slave, bus);
 
spin_lock_irqsave(&bt_slave->lock, flags);
if (bt_slave->response_in_progress) {
spin_unlock_irqrestore(&bt_slave->lock, flags);
-   if (non_blocking)
-   return -EAGAIN;
-   goto try_again;
+   return -EAGAIN;
}
 
memcpy(&bt_slave->response, bt_response, sizeof(*bt_response));
@@ -118,106 +67,13 @@ static int send_bt_response(struct bt_i2c_slave 
*bt_slave, bool non_blocking,
return 0;
 }
 
-static inline struct bt_i2c_slave *to_bt_i2c_slave(struct file *file)
-{
-   return container_of(file->private_data, struct bt_i2c_slave, miscdev);
-}
-
-static ssize_t bt_read(struct file *file, char __user *buf, size_t

[RFC v1 1/4] ipmi_bmc: framework for BT IPMI on BMCs

2017-08-07 Thread Brendan Higgins
From: Benjamin Fair 

This patch introduces a framework for writing IPMI drivers which run on
a Board Management Controller. It is similar in function to OpenIPMI.
The framework handles registering devices and routing messages.

Signed-off-by: Benjamin Fair 
Signed-off-by: Brendan Higgins 
---
 drivers/char/ipmi_bmc/Makefile   |   1 +
 drivers/char/ipmi_bmc/ipmi_bmc.c | 294 +++
 include/linux/ipmi_bmc.h | 184 
 3 files changed, 479 insertions(+)
 create mode 100644 drivers/char/ipmi_bmc/ipmi_bmc.c

diff --git a/drivers/char/ipmi_bmc/Makefile b/drivers/char/ipmi_bmc/Makefile
index 8bff32b55c24..9c7cd48d899f 100644
--- a/drivers/char/ipmi_bmc/Makefile
+++ b/drivers/char/ipmi_bmc/Makefile
@@ -2,5 +2,6 @@
 # Makefile for the ipmi bmc drivers.
 #
 
+obj-$(CONFIG_IPMI_BMC) += ipmi_bmc.o
 obj-$(CONFIG_IPMI_BMC_BT_I2C) += ipmi_bmc_bt_i2c.o
 obj-$(CONFIG_ASPEED_BT_IPMI_BMC) += ipmi_bmc_bt_aspeed.o
diff --git a/drivers/char/ipmi_bmc/ipmi_bmc.c b/drivers/char/ipmi_bmc/ipmi_bmc.c
new file mode 100644
index ..c1324ac9a83c
--- /dev/null
+++ b/drivers/char/ipmi_bmc/ipmi_bmc.c
@@ -0,0 +1,294 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define PFX "IPMI BMC core: "
+
+struct ipmi_bmc_ctx *ipmi_bmc_get_global_ctx()
+{
+   static struct ipmi_bmc_ctx global_ctx;
+
+   return &global_ctx;
+}
+
+int ipmi_bmc_send_response(struct ipmi_bmc_ctx *ctx,
+  struct bt_msg *bt_response)
+{
+   struct ipmi_bmc_bus *bus;
+   int ret = -ENODEV;
+
+   rcu_read_lock();
+   bus = rcu_dereference(ctx->bus);
+
+   if (bus)
+   ret = bus->send_response(bus, bt_response);
+
+   rcu_read_unlock();
+   return ret;
+}
+EXPORT_SYMBOL(ipmi_bmc_send_response);
+
+bool ipmi_bmc_is_response_open(struct ipmi_bmc_ctx *ctx)
+{
+   struct ipmi_bmc_bus *bus;
+   bool ret = false;
+
+   rcu_read_lock();
+   bus = rcu_dereference(ctx->bus);
+
+   if (bus)
+   ret = bus->is_response_open(bus);
+
+   rcu_read_unlock();
+   return ret;
+}
+EXPORT_SYMBOL(ipmi_bmc_is_response_open);
+
+int ipmi_bmc_register_device(struct ipmi_bmc_ctx *ctx,
+struct ipmi_bmc_device *device_in)
+{
+   struct ipmi_bmc_device *device;
+
+   mutex_lock(&ctx->drivers_mutex);
+   /* Make sure it hasn't already been registered. */
+   list_for_each_entry(device, &ctx->devices, link) {
+   if (device == device_in) {
+   mutex_unlock(&ctx->drivers_mutex);
+   return -EINVAL;
+   }
+   }
+
+   list_add_rcu(&device_in->link, &ctx->devices);
+   mutex_unlock(&ctx->drivers_mutex);
+
+   return 0;
+}
+EXPORT_SYMBOL(ipmi_bmc_register_device);
+
+int ipmi_bmc_unregister_device(struct ipmi_bmc_ctx *ctx,
+  struct ipmi_bmc_device *device_in)
+{
+   struct ipmi_bmc_device *device;
+   bool found = false;
+
+   mutex_lock(&ctx->drivers_mutex);
+   /* Make sure it is currently registered. */
+   list_for_each_entry(device, &ctx->devices, link) {
+   if (device == device_in) {
+   found = true;
+   break;
+   }
+   }
+   if (!found) {
+   mutex_unlock(&ctx->drivers_mutex);
+   return -ENXIO;
+   }
+
+   list_del_rcu(&device_in->link);
+   mutex_unlock(&ctx->drivers_mutex);
+   synchronize_rcu();
+
+   return 0;
+}
+EXPORT_SYMBOL(ipmi_bmc_unregister_device);
+
+int ipmi_bmc_register_default_device(struct ipmi_bmc_ctx *ctx,
+struct ipmi_bmc_device *device)
+{
+   int ret;
+
+   mutex_lock(&ctx->drivers_mutex);
+   if (!ctx->default_device) {
+   ctx->default_device = device;
+   ret = 0;
+   } else {
+   ret = -EBUSY;
+   }
+   mutex_unlock(&ctx->drivers_mutex);
+
+   return ret;
+}
+EXPORT_SYMBOL(ipmi_bmc_register_default_device);
+
+int ipmi_bmc_unregister_default_device(struct ipmi_bmc_ctx *ctx,
+  struct ipmi_bmc_device *device)
+{
+   int ret;
+
+   mutex_lock(&ctx->drivers_mutex);
+   if (ctx->default_device == device) {
+   ctx->default_device = NULL;
+   ret = 0;
+   } else {
+   ret = -ENXIO;
+   }
+ 

Re: [PATCH 2/2] f2fs: introduce gc_urgent mode for background GC

2017-08-07 Thread Jaegeuk Kim
Hi Chao,

On 08/08, Chao Yu wrote:
> Hi Jaegeuk,
> 
> On 2017/8/8 9:42, Jaegeuk Kim wrote:
> > This patch adds a sysfs entry to control urgent mode for background GC.
> > If this is set, background GC thread conducts GC with gc_urgent_sleep_time
> > all the time.
> 
> Good idea.
> 
> If we want to add more gc policies, the current approach is not easy to
> extend, and the number of sysfs nodes keeps growing, which is not friendly to
> users. So I'd like to suggest adding /sys/fs/f2fs//gc_policy only, and
> exposing the original policy as normal_mode, then introducing urgent_mode and
> reusing gc_min_sleep_time as gc_urgent_sleep_time in this patch.
> 
> e.g.
> 
> enum gc_policy {
>   GC_NORMAL,
>   GC_URGENT,
> };
> 
> If we want to turn on urgent_mode, we could:
> echo 1 > /sys/fs/f2fs//gc_policy
> echo 1000 > /sys/fs/f2fs//gc_min_sleep_time

I want to keep the previous gc_min_sleep_time, so that the user can go back to
the normal state seamlessly.

Thanks,

> 
> How do you think?
> 
> Thanks,
> 
> > 
> > Signed-off-by: Jaegeuk Kim 
> > ---
> >  Documentation/ABI/testing/sysfs-fs-f2fs | 12 
> >  fs/f2fs/gc.c| 17 +++--
> >  fs/f2fs/gc.h|  4 
> >  fs/f2fs/sysfs.c |  9 +
> >  4 files changed, 40 insertions(+), 2 deletions(-)
> > 
> > diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs 
> > b/Documentation/ABI/testing/sysfs-fs-f2fs
> > index c579ce5e0ef5..11b7f4ebea7c 100644
> > --- a/Documentation/ABI/testing/sysfs-fs-f2fs
> > +++ b/Documentation/ABI/testing/sysfs-fs-f2fs
> > @@ -139,3 +139,15 @@ Date:  June 2017
> >  Contact:   "Chao Yu" 
> >  Description:
> >  Controls current reserved blocks in system.
> > +
> > +What:  /sys/fs/f2fs//gc_urgent
> > +Date:  August 2017
> > +Contact:   "Jaegeuk Kim" 
> > +Description:
> > +Do background GC aggressively
> > +
> > +What:  /sys/fs/f2fs//gc_urgent_sleep_time
> > +Date:  August 2017
> > +Contact:   "Jaegeuk Kim" 
> > +Description:
> > +Controls sleep time of GC urgent mode
> > diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
> > index 620dca443b29..8da7c14a9d29 100644
> > --- a/fs/f2fs/gc.c
> > +++ b/fs/f2fs/gc.c
> > @@ -35,9 +35,14 @@ static int gc_thread_func(void *data)
> > set_freezable();
> > do {
> > wait_event_interruptible_timeout(*wq,
> > -   kthread_should_stop() || freezing(current),
> > +   kthread_should_stop() || freezing(current) ||
> > +   gc_th->gc_wake,
> > msecs_to_jiffies(wait_ms));
> >  
> > +   /* give it a try one time */
> > +   if (gc_th->gc_wake)
> > +   gc_th->gc_wake = 0;
> > +
> > if (try_to_freeze())
> > continue;
> > if (kthread_should_stop())
> > @@ -74,6 +79,11 @@ static int gc_thread_func(void *data)
> > if (!mutex_trylock(&sbi->gc_mutex))
> > goto next;
> >  
> > +   if (gc_th->gc_urgent) {
> > +   wait_ms = gc_th->urgent_sleep_time;
> > +   goto do_gc;
> > +   }
> > +
> > if (!is_idle(sbi)) {
> > increase_sleep_time(gc_th, &wait_ms);
> > mutex_unlock(&sbi->gc_mutex);
> > @@ -84,7 +94,7 @@ static int gc_thread_func(void *data)
> > decrease_sleep_time(gc_th, &wait_ms);
> > else
> > increase_sleep_time(gc_th, &wait_ms);
> > -
> > +do_gc:
> > stat_inc_bggc_count(sbi);
> >  
> > /* if return value is not zero, no victim was selected */
> > @@ -115,11 +125,14 @@ int start_gc_thread(struct f2fs_sb_info *sbi)
> > goto out;
> > }
> >  
> > +   gc_th->urgent_sleep_time = DEF_GC_THREAD_URGENT_SLEEP_TIME;
> > gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME;
> > gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME;
> > gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME;
> >  
> > gc_th->gc_idle = 0;
> > +   gc_th->gc_urgent = 0;
> > +   gc_th->gc_wake= 0;
> >  
> > sbi->gc_thread = gc_th;
> > init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head);
> > diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
> > index a993967dcdb9..57a9000ce3af 100644
> > --- a/fs/f2fs/gc.h
> > +++ b/fs/f2fs/gc.h
> > @@ -13,6 +13,7 @@
> >  * whether IO subsystem is idle
> >  * or not
> >  */
> > +#define DEF_GC_THREAD_URGENT_SLEEP_TIME500 /* 500 ms */
> >  #define DEF_GC_THREAD_MIN_SLEEP_TIME   3   /* milliseconds */
> >  #define DEF_GC_THREAD_MAX_SLEEP_TIME   6
> >  #define DEF_GC_THREAD_NOGC_SLEEP_TIME  30  /* wait 5 min */
> > @@ -27,12 +28,15 @@ struct f2fs_gc_kthread {
> >

linux-next: Signed-off-by missing for commit in the scsi-mkp tree

2017-08-07 Thread Stephen Rothwell
Hi Martin,

Commit

  facfc963ae92 ("scsi: g_NCR5380: Two DTC436 PDMA workarounds")

is missing a Signed-off-by from its author.

-- 
Cheers,
Stephen Rothwell

