date:20210209

While building Linux next tag 20210209 mips uImage.gz failed with below configs

  - mips (cavium_octeon_defconfig) with gcc-8, gcc-9 and gcc-10 - FAILED
  - mips (malta_defconfig) with gcc-8, gcc-9 and gcc-10 - FAILED
  - mips (nlm_xlp_defconfig) with gcc-8, gcc-9 and gcc-10 - FAILED
  - mips (defconfig) with gcc-8, gcc-9 and gcc-10 - FAILED

make --silent --keep-going --jobs=8
O=/home/tuxbuild/.cache/tuxmake/builds/1/tmp ARCH=mips
CROSS_COMPILE=mips-linux-gnu- 'CC=sccache mips-linux-gnu-gcc'
'HOSTCC=sccache gcc' uImage.gz
In file included from /include/linux/spinlock.h:90,
 from /include/linux/ipc.h:5,
 from /include/uapi/linux/sem.h:5,
 from /include/linux/sem.h:5,
 from /include/linux/compat.h:14,
 from /arch/mips/kernel/asm-offsets.c:12:
/arch/mips/include/asm/spinlock.h:17:28: error: redefinition of
'queued_spin_unlock'
   17 | #define queued_spin_unlock queued_spin_unlock
  |^~

Reported-by: Naresh Kamboju 

build link,
https://builds.tuxbuild.com/1oF9lkBAeWM2WvR11O2Yun8ERNT/

-- 
Linaro LKFT
https://lkft.linaro.org

[next] [s390 ] net: mlx5: tc_tun.h:24:29: error: field 'match_level' has incomplete type

While building Linux next tag 20210209 s390 (defconfig) with gcc-9
make modules failed.
  - s390 (defconfig) with gcc-8 - FAILED
  - s390 (defconfig) with gcc-9 - FAILED
  - s390 (defconfig) with gcc-10 - FAILED

make --silent --keep-going --jobs=8
O=/home/tuxbuild/.cache/tuxmake/builds/1/tmp ARCH=s390
CROSS_COMPILE=s390x-linux-gnu- 'CC=sccache s390x-linux-gnu-gcc'
'HOSTCC=sccache gcc'
In file included from drivers/net/ethernet/mellanox/mlx5/core/en_tc.h:40,
 from drivers/net/ethernet/mellanox/mlx5/core/en_main.c:45:
drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.h:24:29: error:
field 'match_level' has incomplete type
   24 |  enum mlx5_flow_match_level match_level;
  | ^~~
drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.h:27:26: warning:
'struct mlx5e_encap_entry' declared inside parameter list will not be
visible outside of this definition or declaration
   27 |  int (*calc_hlen)(struct mlx5e_encap_entry *e);
  |  ^

Reported-by: Naresh Kamboju 

build link,
https://builds.tuxbuild.com/1oF9mT3pKaPfVIptyzGbiNjKW0m/

-- 
Linaro LKFT
https://lkft.linaro.org

Re: [PATCH 1/3] lib/test_printf: use KSTM_MODULE_GLOBALS macro


On 2/9/21 11:18 PM, Timur Tabi wrote:

Instead of defining the total/failed test counters manually,
test_printf should use the kselftest macro created for this
purpose.

Signed-off-by: Timur Tabi


Ugh, I really need to stop sending patches late at night.  This is again 
the wrong email address.


I'm sure I'll need to post another version of these patches, so I'll 
just fix it then.

[PATCH 3/3] lib/vsprintf: make-printk-non-secret printks all addresses as unhashed

If the make-printk-non-secret command line parameter is set, then
printk("%p") will print pointers as unhashed.  This is useful for
debugging purposes.

A large warning message is displayed if this option is enabled.
Unhashed pointers, while useful for debugging, expose kernel
addresses which can be a security risk.

Also update test_printf to skip the hashed pointer tests if the
command-line option is set.

Signed-off-by: Timur Tabi 
Acked-by: Petr Mladek 
Acked-by: Randy Dunlap 
Acked-by: Sergey Senozhatsky 
---
 .../admin-guide/kernel-parameters.txt | 15 
 lib/test_printf.c |  8 
 lib/vsprintf.c| 38 ++-
 3 files changed, 59 insertions(+), 2 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index a10b545c2070..6962379469e4 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2613,6 +2613,21 @@
different yeeloong laptops.
Example: machtype=lemote-yeeloong-2f-7inch
 
+make-printk-non-secret
+   Force pointers printed to the console to be unhashed.
+   By default, when a pointer is printed to the kernel
+   console (via %p format string), that pointer is
+   "hashed", i.e. obscured by hashing the pointer value.
+   This is a security feature that hides actual kernel
+   addresses from unprivileged users, but it also makes
+   debugging the kernel more difficult since unequal
+   pointers can no longer be compared.  If this option is
+   specified, then all normal pointers will have their
+   true value printed.  Pointers printed via %pK may
+   still be hashed.  This option should only be specified
+   when debugging the kernel.  Please do not use on
+   production kernels.
+
max_addr=nn[KMG][KNL,BOOT,ia64] All physical memory greater
than or equal to this physical address is ignored.
 
diff --git a/lib/test_printf.c b/lib/test_printf.c
index ad2bcfa8caa1..b0b62d76e598 100644
--- a/lib/test_printf.c
+++ b/lib/test_printf.c
@@ -35,6 +35,8 @@ KSTM_MODULE_GLOBALS();
 static char *test_buffer __initdata;
 static char *alloced_buffer __initdata;
 
+extern bool debug_never_hash_pointers;
+
 static int __printf(4, 0) __init
 do_test(int bufsize, const char *expect, int elen,
const char *fmt, va_list ap)
@@ -301,6 +303,12 @@ plain(void)
 {
int err;
 
+   if (debug_never_hash_pointers) {
+   pr_warn("skipping plain 'p' tests");
+   skipped_tests += 2;
+   return;
+   }
+
err = plain_hash();
if (err) {
pr_warn("plain 'p' does not appear to be hashed\n");
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 3b53c73580c5..1296d9b0b328 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -2090,6 +2090,34 @@ char *fwnode_string(char *buf, char *end, struct 
fwnode_handle *fwnode,
return widen_string(buf, buf - buf_start, end, spec);
 }
 
+/* Disable pointer hashing if requested */
+bool debug_never_hash_pointers __ro_after_init;
+EXPORT_SYMBOL_GPL(debug_never_hash_pointers);
+
+static int __init debug_never_hash_pointers_enable(char *str)
+{
+   debug_never_hash_pointers = true;
+
+   pr_warn("**\n");
+   pr_warn("**   NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE   **\n");
+   pr_warn("**  **\n");
+   pr_warn("** All pointers that are printed to the console will**\n");
+   pr_warn("** be printed as unhashed.  **\n");
+   pr_warn("**  **\n");
+   pr_warn("** Kernel memory addresses are exposed, which may   **\n");
+   pr_warn("** reduce the security of your system.  **\n");
+   pr_warn("**  **\n");
+   pr_warn("** If you see this message and you are not debugging**\n");
+   pr_warn("** the kernel, report this immediately to your system   **\n");
+   pr_warn("** administrator!   **\n");
+   pr_warn("**  **\n");
+   pr_warn("**   NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE   **\n");
+   pr_warn("**\n");
+
+   return 0;
+}
+early_param("make-printk-non-secret", debug_never_hash_pointers_enable);
+
 /*
  * Show a '%p' thing.  A kernel extension is that the '%p' is foll

[PATCH 2/3] kselftest: add support for skipped tests

Update the kselftest framework to allow testing clients to
specify that some tests were skipped.

Signed-off-by: Timur Tabi 
---
 tools/testing/selftests/kselftest_module.h | 18 --
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/tools/testing/selftests/kselftest_module.h 
b/tools/testing/selftests/kselftest_module.h
index e8eafaf0941a..e2ea41de3f35 100644
--- a/tools/testing/selftests/kselftest_module.h
+++ b/tools/testing/selftests/kselftest_module.h
@@ -11,7 +11,8 @@
 
 #define KSTM_MODULE_GLOBALS()  \
 static unsigned int total_tests __initdata;\
-static unsigned int failed_tests __initdata
+static unsigned int failed_tests __initdata;   \
+static unsigned int skipped_tests __initdata
 
 #define KSTM_CHECK_ZERO(x) do {
\
total_tests++;  \
@@ -21,11 +22,16 @@ static unsigned int failed_tests __initdata
}   \
 } while (0)
 
-static inline int kstm_report(unsigned int total_tests, unsigned int 
failed_tests)
+static inline int kstm_report(unsigned int total_tests, unsigned int 
failed_tests,
+ unsigned int skipped_tests)
 {
-   if (failed_tests == 0)
-   pr_info("all %u tests passed\n", total_tests);
-   else
+   if (failed_tests == 0) {
+   if (skipped_tests) {
+   pr_info("skipped %u tests\n", skipped_tests);
+   pr_info("remaining %u tests passed\n", total_tests);
+   } else
+   pr_info("all %u tests passed\n", total_tests);
+   } else
pr_warn("failed %u out of %u tests\n", failed_tests, 
total_tests);
 
return failed_tests ? -EINVAL : 0;
@@ -36,7 +42,7 @@ static int __init __module##_init(void)   
\
 {  \
pr_info("loaded.\n");   \
selftest(); \
-   return kstm_report(total_tests, failed_tests);  \
+   return kstm_report(total_tests, failed_tests, skipped_tests);   \
 }  \
 static void __exit __module##_exit(void)   \
 {  \
-- 
2.25.1

Re: WARNING: suspicious RCU usage (5.11.0-rc7+ #1812 Tainted: G)

2021-02-09 Thread Kalle Valo

"Rafael J. Wysocki"  writes:

>> > AFAICT that's a simple 'use RCU without holding rcu_read_lock' warning.
>> > I've not dug through ath10k to see who should be doing rcu_read_lock,
>> > but the few places I did look at don't seem to have changed recently.
>>
>> Just this morning I applied a patch which should fix this:
>>
>> https://git.kernel.org/pub/scm/linux/kernel/git/kvalo/ath.git/commit/?h=ath-next&id=2615e3cdbd9c0e864f5906279c952a309871d225
>>
>> Please let me know if it fixes the issue.
>
> The traces are gone after applying this patch, so it does help:
>
> Tested-by: Rafael J. Wysocki 

Good, thanks for testing.

-- 
https://patchwork.kernel.org/project/linux-wireless/list/

https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches

[PATCH 1/3] lib/test_printf: use KSTM_MODULE_GLOBALS macro

Instead of defining the total/failed test counters manually,
test_printf should use the kselftest macro created for this
purpose.

Signed-off-by: Timur Tabi 
---
 lib/test_printf.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/test_printf.c b/lib/test_printf.c
index 7ac87f18a10f..ad2bcfa8caa1 100644
--- a/lib/test_printf.c
+++ b/lib/test_printf.c
@@ -30,8 +30,8 @@
 #define PAD_SIZE 16
 #define FILL_CHAR '$'
 
-static unsigned total_tests __initdata;
-static unsigned failed_tests __initdata;
+KSTM_MODULE_GLOBALS();
+
 static char *test_buffer __initdata;
 static char *alloced_buffer __initdata;
 
-- 
2.25.1

[PATCH 0/3][RESEND] add support for never printing hashed addresses

[accidentally sent from the wrong email address, so resending]

[The list of email addresses on CC: is getting quite lengthy,
so I hope I've included everyone.]

Although hashing addresses printed via printk does make the
kernel more secure, it interferes with debugging, especially
with some functions like print_hex_dump() which always uses
hashed addresses.

To avoid having to choose between %p and %px, it's easier to
add a kernel command line that treats all %p as %px.  This
encourages developers to use %p more without making debugging
more difficult.

Patches #1 and #2 upgrade the kselftest framework so that
it can report on tests that were skipped outright.  This
is needed for the test_printf module which will now skip
%p hashing tests if hashing is disabled.

Patch #3 upgrades the printf library to check the command
line.  It also updates test_printf().

Timur Tabi (3):
  lib/test_printf: use KSTM_MODULE_GLOBALS macro
  kselftest: add support for skipped tests
  [v2] lib/vsprintf: make-printk-non-secret printks all addresses as
unhashed

 .../admin-guide/kernel-parameters.txt | 15 +++
 lib/test_printf.c | 12 +-
 lib/vsprintf.c| 40 ++-
 tools/testing/selftests/kselftest_module.h| 18 ++---
 4 files changed, 75 insertions(+), 10 deletions(-)

-- 
2.25.1

Re: [PATCH] mm: page-writeback: simplify memcg handling in test_clear_page_writeback()

2021-02-09 Thread Hugh Dickins

On Tue, 9 Feb 2021, Johannes Weiner wrote:

> Page writeback doesn't hold a page reference, which allows truncate to
> free a page the second PageWriteback is cleared. This used to require
> special attention in test_clear_page_writeback(), where we had to be
> careful not to rely on the unstable page->memcg binding and look up
> all the necessary information before clearing the writeback flag.
> 
> Since commit 073861ed77b6 ("mm: fix VM_BUG_ON(PageTail) and
> BUG_ON(PageWriteback)") test_clear_page_writeback() is called with an
> explicit reference on the page, and this dance is no longer needed.
> 
> Use unlock_page_memcg() and dec_lruvec_page_stat() directly.

s/stat()/state()/

This is a nice cleanup: I hadn't seen that connection at all.

But I think you should take it further:
__unlock_page_memcg() can then be static in mm/memcontrol.c,
and its declarations deleted from include/linux/memcontrol.h?

And further: delete __dec_lruvec_state() and dec_lruvec_state()
from include/linux/vmstat.h - unless you feel that every "inc"
ought to be matched by a "dec", even when unused.

> 
> Signed-off-by: Johannes Weiner 

Acked-by: Hugh Dickins 

> ---
>  mm/page-writeback.c | 9 +++--
>  1 file changed, 3 insertions(+), 6 deletions(-)
> 
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index eb34d204d4ee..f6c2c3165d4d 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -2722,12 +2722,9 @@ EXPORT_SYMBOL(clear_page_dirty_for_io);
>  int test_clear_page_writeback(struct page *page)
>  {
>   struct address_space *mapping = page_mapping(page);
> - struct mem_cgroup *memcg;
> - struct lruvec *lruvec;
>   int ret;
>  
> - memcg = lock_page_memcg(page);
> - lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
> + lock_page_memcg(page);
>   if (mapping && mapping_use_writeback_tags(mapping)) {
>   struct inode *inode = mapping->host;
>   struct backing_dev_info *bdi = inode_to_bdi(inode);
> @@ -2755,11 +2752,11 @@ int test_clear_page_writeback(struct page *page)
>   ret = TestClearPageWriteback(page);
>   }
>   if (ret) {
> - dec_lruvec_state(lruvec, NR_WRITEBACK);
> + dec_lruvec_page_state(page, NR_WRITEBACK);
>   dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
>   inc_node_page_state(page, NR_WRITTEN);
>   }
> - __unlock_page_memcg(memcg);
> + unlock_page_memcg(page);
>   return ret;
>  }
>  
> -- 
> 2.30.0

RE: [Linuxarm] Re: [PATCH for-next 00/32] spin lock usage optimization for SCSI drivers

2021-02-09 Thread Song Bao Hua (Barry Song)



> -Original Message-
> From: Finn Thain [mailto:fth...@telegraphics.com.au]
> Sent: Wednesday, February 10, 2021 5:16 PM
> To: Song Bao Hua (Barry Song) 
> Cc: tanxiaofei ; j...@linux.ibm.com;
> martin.peter...@oracle.com; linux-s...@vger.kernel.org;
> linux-kernel@vger.kernel.org; linux...@openeuler.org;
> linux-m...@vger.kernel.org
> Subject: [Linuxarm] Re: [PATCH for-next 00/32] spin lock usage optimization
> for SCSI drivers
> 
> On Tue, 9 Feb 2021, Song Bao Hua (Barry Song) wrote:
> 
> > > > sonic_interrupt() uses an irq lock within an interrupt handler to
> > > > avoid issues relating to this. This kind of locking may be needed in
> > > > the drivers you are trying to patch. Or it might not. Apparently,
> > > > no-one has looked.
> >
> > Is the comment in sonic_interrupt() outdated according to this:
> > m68k: irq: Remove IRQF_DISABLED
> >
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/
> ?id=77a4279
> > http://lkml.iu.edu/hypermail/linux/kernel/1109.2/01687.html
> >
> 
> The removal of IRQF_DISABLED isn't relevant to this driver. Commit
> 77a42796786c ("m68k: Remove deprecated IRQF_DISABLED") did not disable
> interrupts, it just removed some code to enable them.
> 
> The code and comments in sonic_interrupt() are correct. You can confirm
> this for yourself quite easily using QEMU and a cross-compiler.
> 
> > and this:
> > genirq: Warn when handler enables interrupts
> >
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/
> ?id=b738a50a
> >
> > wouldn't genirq report a warning on m68k?
> >
> 
> There is no warning from m68k builds. That's because arch_irqs_disabled()
> returns true when the IPL is non-zero.


So for m68k, the case is
arch_irqs_disabled() is true, but interrupts can still come?

Then it seems it is very confusing. If prioritized interrupts can still come
while arch_irqs_disabled() is true, how could spin_lock_irqsave() block the
prioritized interrupts? Isn't arch_irqs_disabled() a status reflection of
irq disable API?

Thanks
Barry

Re: [PATCH v4 5/7] cpufreq: qcom-hw: Implement CPRh aware OSM programming

2021-02-09 Thread Viresh Kumar

On 20-01-21, 13:05, Bjorn Andersson wrote:
> On Wed 20 Jan 12:25 CST 2021, Taniya Das wrote:
> 
> > The CPUFREQ-HW driver is intended to be used only for CPUFREQ HW designs
> > where the firmware programs the look up tables.
> > 
> 
> It's obvious that this is the intended target for the current version of
> the driver, but what are your technical arguments for keeping it that
> way?
> 
> > Suggestion is to separate out the driver where the programming is managed by
> > high level OS.
> > 
> 
> Can you please elaborate on the benefits of this approach?
> 
> PS. Please don't top-post on LKML.

Taniya, Can you please respond back to this ? We are waiting for
merging this patchset..

Bjorn, can you or someone else please review this patch ?

-- 
viresh

Re: [RFC PATCH 2/2] KVM: selftests: Add a test for kvm page table code

2021-02-09 Thread wangyanan (Y)




On 2021/2/10 1:38, Ben Gardon wrote:

On Mon, Feb 8, 2021 at 11:22 PM wangyanan (Y)  wrote:

Hi Ben,

On 2021/2/9 4:29, Ben Gardon wrote:

On Mon, Feb 8, 2021 at 1:08 AM Yanan Wang  wrote:

This test serves as a performance tester and a bug reproducer for
kvm page table code (GPA->HPA mappings), so it gives guidance for
people trying to make some improvement for kvm.

The function guest_code() is designed to cover conditions where a single vcpu
or multiple vcpus access guest pages within the same memory range, in three
VM stages(before dirty-logging, during dirty-logging, after dirty-logging).
Besides, the backing source memory type(ANONYMOUS/THP/HUGETLB) of the tested
memory region can be specified by users, which means normal page mappings or
block mappings can be chosen by users to be created in the test.

If use of ANONYMOUS memory is specified, kvm will create page mappings for the
tested memory region before dirty-logging, and update attributes of the page
mappings from RO to RW during dirty-logging. If use of THP/HUGETLB memory is
specified, kvm will create block mappings for the tested memory region before
dirty-logging, and split the block mappings into page mappings during
dirty-logging, and coalesce the page mappings back into block mappings after
dirty-logging is stopped.

So in summary, as a performance tester, this test can present the performance
of kvm creating/updating normal page mappings, or the performance of kvm
creating/splitting/recovering block mappings, through execution time.

When we need to coalesce the page mappings back to block mappings after dirty
logging is stopped, we have to firstly invalidate *all* the TLB entries for the
page mappings right before installation of the block entry, because a TLB 
conflict
abort error could occur if we can't invalidate the TLB entries fully. We have
hit this TLB conflict twice on aarch64 software implementation and fixed it.
As this test can simulate the process from dirty-logging enabled to dirty-logging
stopped of a VM with block mappings, so it can also reproduce this TLB conflict
abort due to inadequate TLB invalidation when coalescing tables.

Signed-off-by: Yanan Wang 

Thanks for sending this! Happy to see more tests for weird TLB
flushing edge cases and races.

Just out of curiosity, were you unable to replicate the bug with the
dirty_log_perf_test and setting the wr_fract option?
With "KVM: selftests: Disable dirty logging with vCPUs running"
(https://lkml.org/lkml/2021/2/2/1431), the dirty_log_perf_test has
most of the same features as this one.
Please correct me if I'm wrong, but it seems like the major difference
here is a more careful pattern of which pages are dirtied when.

Actually the procedures in KVM_UPDATE_MAPPINGS stage are specially
designed to
reproduce the TLB conflict bug. The following explains why.
In x86 implementation, the related page mappings will be all destroyed
in advance when
stopping dirty logging while vcpus are still running. So after dirty
logging is successfully
stopped, there will certainly be page faults when accessing memory, and
KVM will handle
the faults and create block mappings once again. (Is this right?)
So in this case, dirty_log_perf_test can replicate the bug theoretically.

But there is difference in ARM implementation. The related page mappings
will not be
destroyed immediately when stopping dirty logging and will  be kept
instead. And after
dirty logging, KVM will destroy these mappings together with creation of
block mappings
when handling a guest fault (page fault or permission fault).  So based
on guest_code() in
dirty_log_perf_test, there will not be any page faults after dirty
logging because all the
page mappings have been created and KVM has no chance to recover block
mappings
at all. So this is why I left half of the pages clean and another half
dirtied.

Ah okay, I'm sorry. I shouldn't have assumed that ARM does the same
thing as x86 when disabling dirty logging. It makes sense then why
your guest code is so carefully structured. Does that mean that if a
VM dirties all its memory during dirty logging, that it will never be
able to reconstitute the broken down mappings into large page / block
mappings?


Indeed, but it's really a rare case to happen. I think both the x86 way 
and ARM way have


their own benefits and are based on different considerations. Anyway, the 
more carefully


structured code is compatible for the TLB bug of different architectures.


Within Google we have a system for pre-specifying sets of arguments to
e.g. the dirty_log_perf_test. I wonder if something similar, even as
simple as a script that just runs dirty_log_perf_test several times
would be helpful for cases where different arguments are needed for
the test to cover different specific cases. Even with this test, for

I'm not sure I have got your point :), but it depends on what exactly the
specific cases are,
and sometimes we have to use different arguments. Is this right?

Exactly, it might be ki

[PATCH] mmc: core: add a power cycle when CMD11 fails

2021-02-09 Thread DooHyun Hwang

A power cycle is required if CMD11 fails.
CMD11 failure should be handled as no response.

If there is a timeout error that means no response to the CMD11,
do not send the CMD11 again and the power cycle is required.
Any other errors for CMD11 are the same because CMD11 failed.

On some bad SD Card, CMD11 may fail but the card may have already
invoked the voltage switch sequence.
In this case, it is necessary to retry without voltage switching
after power cycle.

Signed-off-by: DooHyun Hwang 
---
 drivers/mmc/core/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c
index 1136b859ddd8..a6674df2a7bb 100644
--- a/drivers/mmc/core/core.c
+++ b/drivers/mmc/core/core.c
@@ -1207,7 +1207,7 @@ int mmc_set_uhs_voltage(struct mmc_host *host, u32 ocr)
 
err = mmc_wait_for_cmd(host, &cmd, 0);
if (err)
-   return err;
+   goto power_cycle;
 
if (!mmc_host_is_spi(host) && (cmd.resp[0] & R1_ERROR))
return -EIO;
-- 
2.29.0

[PATCH 3/3] [v2] lib/vsprintf: make-printk-non-secret printks all addresses as unhashed

From: Timur Tabi 

If the make-printk-non-secret command line parameter is set, then
printk("%p") will print pointers as unhashed.  This is useful for
debugging purposes.

A large warning message is displayed if this option is enabled.
Unhashed pointers, while useful for debugging, expose kernel
addresses which can be a security risk.

Also update test_printf to skip the hashed pointer tests if the
command-line option is set.

Signed-off-by: Timur Tabi 
Acked-by: Petr Mladek 
Acked-by: Randy Dunlap 
Acked-by: Sergey Senozhatsky 
---
 .../admin-guide/kernel-parameters.txt | 15 
 lib/test_printf.c |  8 
 lib/vsprintf.c| 38 ++-
 3 files changed, 59 insertions(+), 2 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index 9e3cdb271d06..e639b0f32a6c 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2613,6 +2613,21 @@
different yeeloong laptops.
Example: machtype=lemote-yeeloong-2f-7inch
 
+make-printk-non-secret
+   Force pointers printed to the console to be unhashed.
+   By default, when a pointer is printed to the kernel
+   console (via %p format string), that pointer is
+   "hashed", i.e. obscured by hashing the pointer value.
+   This is a security feature that hides actual kernel
+   addresses from unprivileged users, but it also makes
+   debugging the kernel more difficult since unequal
+   pointers can no longer be compared.  If this option is
+   specified, then all normal pointers will have their
+   true value printed.  Pointers printed via %pK may
+   still be hashed.  This option should only be specified
+   when debugging the kernel.  Please do not use on
+   production kernels.
+
max_addr=nn[KMG][KNL,BOOT,ia64] All physical memory greater
than or equal to this physical address is ignored.
 
diff --git a/lib/test_printf.c b/lib/test_printf.c
index ad2bcfa8caa1..b0b62d76e598 100644
--- a/lib/test_printf.c
+++ b/lib/test_printf.c
@@ -35,6 +35,8 @@ KSTM_MODULE_GLOBALS();
 static char *test_buffer __initdata;
 static char *alloced_buffer __initdata;
 
+extern bool debug_never_hash_pointers;
+
 static int __printf(4, 0) __init
 do_test(int bufsize, const char *expect, int elen,
const char *fmt, va_list ap)
@@ -301,6 +303,12 @@ plain(void)
 {
int err;
 
+   if (debug_never_hash_pointers) {
+   pr_warn("skipping plain 'p' tests");
+   skipped_tests += 2;
+   return;
+   }
+
err = plain_hash();
if (err) {
pr_warn("plain 'p' does not appear to be hashed\n");
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 3b53c73580c5..1296d9b0b328 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -2090,6 +2090,34 @@ char *fwnode_string(char *buf, char *end, struct 
fwnode_handle *fwnode,
return widen_string(buf, buf - buf_start, end, spec);
 }
 
+/* Disable pointer hashing if requested */
+bool debug_never_hash_pointers __ro_after_init;
+EXPORT_SYMBOL_GPL(debug_never_hash_pointers);
+
+static int __init debug_never_hash_pointers_enable(char *str)
+{
+   debug_never_hash_pointers = true;
+
+   pr_warn("**\n");
+   pr_warn("**   NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE   **\n");
+   pr_warn("**  **\n");
+   pr_warn("** All pointers that are printed to the console will**\n");
+   pr_warn("** be printed as unhashed.  **\n");
+   pr_warn("**  **\n");
+   pr_warn("** Kernel memory addresses are exposed, which may   **\n");
+   pr_warn("** reduce the security of your system.  **\n");
+   pr_warn("**  **\n");
+   pr_warn("** If you see this message and you are not debugging**\n");
+   pr_warn("** the kernel, report this immediately to your system   **\n");
+   pr_warn("** administrator!   **\n");
+   pr_warn("**  **\n");
+   pr_warn("**   NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE   **\n");
+   pr_warn("**\n");
+
+   return 0;
+}
+early_param("make-printk-non-secret", debug_never_hash_pointers_enable);
+
 /*
  * Show a '%p' thing.  A kernel extension is th

[PATCH 2/3] kselftest: add support for skipped tests

Update the kselftest framework to allow testing clients to
specify that some tests were skipped.

Signed-off-by: Timur Tabi 
---
 tools/testing/selftests/kselftest_module.h | 18 --
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/tools/testing/selftests/kselftest_module.h 
b/tools/testing/selftests/kselftest_module.h
index e8eafaf0941a..e2ea41de3f35 100644
--- a/tools/testing/selftests/kselftest_module.h
+++ b/tools/testing/selftests/kselftest_module.h
@@ -11,7 +11,8 @@
 
 #define KSTM_MODULE_GLOBALS()  \
 static unsigned int total_tests __initdata;\
-static unsigned int failed_tests __initdata
+static unsigned int failed_tests __initdata;   \
+static unsigned int skipped_tests __initdata
 
 #define KSTM_CHECK_ZERO(x) do {
\
total_tests++;  \
@@ -21,11 +22,16 @@ static unsigned int failed_tests __initdata
}   \
 } while (0)
 
-static inline int kstm_report(unsigned int total_tests, unsigned int 
failed_tests)
+static inline int kstm_report(unsigned int total_tests, unsigned int 
failed_tests,
+ unsigned int skipped_tests)
 {
-   if (failed_tests == 0)
-   pr_info("all %u tests passed\n", total_tests);
-   else
+   if (failed_tests == 0) {
+   if (skipped_tests) {
+   pr_info("skipped %u tests\n", skipped_tests);
+   pr_info("remaining %u tests passed\n", total_tests);
+   } else
+   pr_info("all %u tests passed\n", total_tests);
+   } else
pr_warn("failed %u out of %u tests\n", failed_tests, 
total_tests);
 
return failed_tests ? -EINVAL : 0;
@@ -36,7 +42,7 @@ static int __init __module##_init(void)   
\
 {  \
pr_info("loaded.\n");   \
selftest(); \
-   return kstm_report(total_tests, failed_tests);  \
+   return kstm_report(total_tests, failed_tests, skipped_tests);   \
 }  \
 static void __exit __module##_exit(void)   \
 {  \
-- 
2.25.1

[PATCH 1/3] lib/test_printf: use KSTM_MODULE_GLOBALS macro

Instead of defining the total/failed test counters manually,
test_printf should use the kselftest macro created for this
purpose.

Signed-off-by: Timur Tabi 
---
 lib/test_printf.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/test_printf.c b/lib/test_printf.c
index 7ac87f18a10f..ad2bcfa8caa1 100644
--- a/lib/test_printf.c
+++ b/lib/test_printf.c
@@ -30,8 +30,8 @@
 #define PAD_SIZE 16
 #define FILL_CHAR '$'
 
-static unsigned total_tests __initdata;
-static unsigned failed_tests __initdata;
+KSTM_MODULE_GLOBALS();
+
 static char *test_buffer __initdata;
 static char *alloced_buffer __initdata;
 
-- 
2.25.1

[PATCH 0/3] add support for never printing hashed addresses

[The list of email addresses on CC: is getting quite lengthy,
so I hope I've included everyone.]

Although hashing addresses printed via printk does make the
kernel more secure, it interferes with debugging, especially
with some functions like print_hex_dump() which always uses
hashed addresses.

To avoid having to choose between %p and %px, it's easier to
add a kernel command line that treats all %p as %px.  This
encourages developers to use %p more without making debugging
more difficult.

Patches #1 and #2 upgrade the kselftest framework so that
it can report on tests that were skipped outright.  This
is needed for the test_printf module which will now skip
%p hashing tests if hashing is disabled.

Patch #3 upgrades the printf library to check the command
line.  It also updates test_printf().

Timur Tabi (3):
  lib/test_printf: use KSTM_MODULE_GLOBALS macro
  kselftest: add support for skipped tests
  [v2] lib/vsprintf: make-printk-non-secret printks all addresses as
unhashed

 .../admin-guide/kernel-parameters.txt | 15 +++
 lib/test_printf.c | 12 +-
 lib/vsprintf.c| 40 ++-
 tools/testing/selftests/kselftest_module.h| 18 ++---
 4 files changed, 75 insertions(+), 10 deletions(-)

-- 
2.25.1

Re: [PATCH 4.4 00/38] 4.4.257-rc1 review

On Mon, 8 Feb 2021 at 20:33, Greg Kroah-Hartman
 wrote:
>
> This is the start of the stable review cycle for the 4.4.257 release.
> There are 38 patches in this series, all will be posted as a response
> to this one.  If anyone has any issues with these being applied, please
> let me know.
>
> Responses should be made by Wed, 10 Feb 2021 14:57:55 +0000.
> Anything received after that time might be too late.
>
> The whole patch series can be found in one patch at:
> 
> https://www.kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.4.257-rc1.gz
> or in the git tree and branch at:
> 
> git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git 
> linux-4.4.y
> and the diffstat can be found below.
>
> thanks,
>
> greg k-h

Results from Linaro’s test farm.
No regressions on arm64, arm, x86_64, and i386.

Tested-by: Linux Kernel Functional Testing 

Summary


kernel: 4.4.257-rc1
git repo: 
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git
git branch: linux-4.4.y
git commit: 1a954f75c0ee3245a025a60f2a4cccd6722a1bb6
git describe: v4.4.256-39-g1a954f75c0ee
Test details: 
https://qa-reports.linaro.org/lkft/linux-stable-rc-linux-4.4.y/build/v4.4.256-39-g1a954f75c0ee


No regressions (compared to build v4.4.256)

No fixes (compared to build v4.4.256)

Ran 31608 total tests in the following environments and test suites.

Environments
--
- arm
- arm64
- i386
- juno-64k_page_size
- juno-r2 - arm64
- juno-r2-compat
- juno-r2-kasan
- mips
- qemu-arm64-kasan
- qemu-x86_64-kasan
- qemu_arm
- qemu_arm64
- qemu_arm64-compat
- qemu_i386
- qemu_x86_64
- qemu_x86_64-compat
- sparc
- x15 - arm
- x86_64
- x86-kasan
- x86_64

Test Suites
---
* build
* linux-log-parser
* kselftest-android
* kselftest-bpf
* kselftest-capabilities
* kselftest-cgroup
* kselftest-clone3
* kselftest-core
* kselftest-cpu-hotplug
* kselftest-cpufreq
* kselftest-efivarfs
* kselftest-filesystems
* kselftest-firmware
* kselftest-fpu
* kselftest-futex
* kselftest-gpio
* kselftest-intel_pstate
* kselftest-ipc
* kselftest-ir
* kselftest-kcmp
* kselftest-kexec
* kselftest-kvm
* kselftest-lib
* kselftest-livepatch
* kselftest-lkdtm
* kselftest-membarrier
* kselftest-ptrace
* kselftest-rseq
* kselftest-rtc
* kselftest-seccomp
* kselftest-sigaltstack
* kselftest-size
* kselftest-splice
* kselftest-static_keys
* kselftest-sync
* kselftest-sysctl
* kselftest-timens
* kselftest-timers
* kselftest-tmpfs
* kselftest-tpm2
* kselftest-user
* kselftest-vm
* kselftest-x86
* kselftest-zram
* libhugetlbfs
* ltp-cap_bounds-tests
* ltp-commands-tests
* ltp-containers-tests
* ltp-controllers-tests
* ltp-cpuhotplug-tests
* ltp-crypto-tests
* ltp-cve-tests
* ltp-dio-tests
* ltp-fcntl-locktests-tests
* ltp-filecaps-tests
* ltp-fs-tests
* ltp-fs_bind-tests
* ltp-fs_perms_simple-tests
* ltp-fsx-tests
* ltp-hugetlb-tests
* ltp-io-tests
* ltp-ipc-tests
* ltp-math-tests
* ltp-mm-tests
* ltp-nptl-tests
* ltp-open-posix-tests
* ltp-pty-tests
* ltp-sched-tests
* ltp-securebits-tests
* ltp-syscalls-tests
* ltp-tracing-tests
* network-basic-tests
* perf
* v4l2-compliance
* kvm-unit-tests
* fwts
* ssuite

Summary


kernel: 4.4.257-rc1
git repo: https://git.linaro.org/lkft/arm64-stable-rc.git
git branch: 4.4.257-rc1-hikey-20210208-927
git commit: 288b6b317ee80392b29cd493327d429385373359
git describe: 4.4.257-rc1-hikey-20210208-927
Test details: 
https://qa-reports.linaro.org/lkft/linaro-hikey-stable-rc-4.4-oe/build/4.4.257-rc1-hikey-20210208-927/


No regressions (compared to build 4.4.256-rc1-hikey-20210205-921)


No fixes (compared to build 4.4.256-rc1-hikey-20210205-921)

Ran 1953 total tests in the following environments and test suites.

Environments
--
- hi6220-hikey - arm64

Test Suites
---
* build
* install-android-platform-tools-r2600
* libhugetlbfs
* linux-log-parser
* ltp-cap_bounds-tests
* ltp-commands-tests
* ltp-containers-tests
* ltp-cpuhotplug-tests
* ltp-cve-tests
* ltp-dio-tests
* ltp-fcntl-locktests-tests
* ltp-filecaps-tests
* ltp-fs_bind-tests
* ltp-fs_perms_simple-tests
* ltp-fsx-tests
* ltp-hugetlb-tests
* ltp-io-tests
* ltp-ipc-tests
* ltp-math-tests
* ltp-mm-tests
* ltp-nptl-tests
* ltp-pty-tests
* ltp-sched-tests
* ltp-securebits-tests
* ltp-syscalls-tests
* perf
* spectre-meltdown-checker-test
* v4l2-compliance
* kselftest-android
* kselftest-bpf
* kselftest-capabilities
* kselftest-cgroup
* kselftest-clone3
* kselftest-core
* kselftest-cpu-hotplug
* kselftest-cpufreq
* kselftest-efivarfs
* kselftest-filesystems
* kselftest-firmware
* kselftest-fpu
* kselftest-futex
* kselftest-gpio
* kselftest-intel_pstate
* kselftest-ipc
* kselftest-ir
* kselftest-kcmp
* kselftest-kexec
* kselftest-kvm
* kselftest-lib
* kselftest-livepatch
* kselftest-lkdtm
* kselftest-membarrier
* kselftest-ptrace
* kselftest-rseq
* kselftest-rtc
* kselftest-se

Re: [PATCH v8 7/9] crypto: hisilicon/hpre - add 'ECDH' algorithm

2021-02-09 Thread Herbert Xu

On Mon, Feb 08, 2021 at 05:38:55PM +0800, Meng Yu wrote:
> 1. Enable 'ECDH' algorithm in Kunpeng 930;
> 2. HPRE ECDH Support: ecdh-nist-p192, ecdh-nist-p224,
>ecdh-nist-p256, ecdh-nist-p384, ecdh-nist-p521.

Where is the patch that adds the generic ecdh-nist-p384?

Thanks,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

Re: [PATCH v8 3/9] crypto: atmel-ecc - move curve_id of ECDH from the key to algorithm name

2021-02-09 Thread Herbert Xu

On Mon, Feb 08, 2021 at 05:38:51PM +0800, Meng Yu wrote:
> As curve id of ECDH will be moved from its key into algorithm name,
> we cannot use 'curve_id' in 'struct ecdh', so we should modify ECDH
> driver in atmel, and make ECDH algorithm name be the same as crypto
> (like 'ecdh-nist-pxxx');
> 
> Signed-off-by: Meng Yu 
> Reviewed-by: Zaibo Xu 
> ---
>  drivers/crypto/atmel-ecc.c | 14 +++---
>  1 file changed, 7 insertions(+), 7 deletions(-)

Patches 3-5 need to be squashed into one in order to avoid future
bisection failures.

The alternative is to let the new/old names coexist but it's probably
not worth it for this case as the number of drivers impacted is small.

Thanks,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

Re: linux-next: qemu boot failure after merge of the tip tree

2021-02-09 Thread Stephen Rothwell

Hi Peter,

On Mon, 1 Feb 2021 15:40:12 +0100 Peter Zijlstra  wrote:
>
> On Mon, Feb 01, 2021 at 01:04:30PM +0000, Valentin Schneider wrote:
> > On 01/02/21 20:09, Stephen Rothwell wrote:  
> > > Hi all,
> > >  
> > 
> > Hi Stephen,
> >   
> > > After merging the tip tree, today's linux-next qemu boot test (powerpc
> > > pseries_le_defconfig) failed like this:  
> > 
> > In case you haven't seen it, Dietmar did the dirty work and fixed my fail
> > at
> > 
> >   http://lore.kernel.org/r/6000e39e-7d28-c360-9cd6-8798fd22a...@arm.com  
> 
> Right, picked that up, I'll try and push it before the next next ;-)

This fix has not reached the tip auto-latest branch yet and so is not
in linux-next.

-- 
Cheers,
Stephen Rothwell


pgpe5gk1w2NRX.pgp
Description: OpenPGP digital signature

[next] [arm] ERROR: modpost: "udp_sock_create6" [net/rxrpc/rxrpc.ko] undefined!

Linux next tag 20210209 arm omap2plus_defconfig make modules failed.
   - arm (omap2plus_defconfig) with gcc-10 - FAILED
   - arm (omap2plus_defconfig) with gcc-9 - FAILED
   - arm (omap2plus_defconfig) with gcc-8 - FAILED

make --silent --keep-going --jobs=8
O=/home/tuxbuild/.cache/tuxmake/builds/1/tmp ARCH=arm
CROSS_COMPILE=arm-linux-gnueabihf- 'CC=sccache
arm-linux-gnueabihf-gcc' 'HOSTCC=sccache gcc'

ERROR: modpost: "udp_sock_create6" [net/rxrpc/rxrpc.ko] undefined!
ERROR: modpost: "setup_udp_tunnel_sock" [net/rxrpc/rxrpc.ko] undefined!
ERROR: modpost: "udp_sock_create4" [net/rxrpc/rxrpc.ko] undefined!
make[2]: *** [scripts/Makefile.modpost:132: Module.symvers] Error 1
make[2]: *** Deleting file 'Module.symvers'
make[2]: Target '__modpost' not remade because of errors.

Reported-by: Naresh Kamboju 

build log link,
https://builds.tuxbuild.com/1oF9lZzseBXx1Dl1IkVLgB4nvhM/

-- 
Linaro LKFT
https://lkft.linaro.org

RE: [ANNOUNCE] exfatprogs-1.1.0 version released

2021-02-09 Thread Namjae Jeon

> On Wed, Feb 10, 2021 at 12:50 AM Namjae Jeon  wrote:
> >
> > Hi folk,
> >
> > We have released exfatprogs 1.1.0 version. In this release, exfatlabel
> > has been added to print or re-write volume label and volume serial value.
> > Also, a new dump.exfat util has been added to display statistics from
> > a given device (requested by Mike Fleetwood, GParted developer).
> >
> > Any feedback is welcome!:)
> >
> 
Hi Sedat,
> Congrats to the new release and thanks to all involved people.
Thanks!

> 
> Hope Sven will do a new release for Debian.
> ( Note that Debian/bullseye release  plans "Milestone 2" this Friday, 
> February 12th (see [1] > "Key
> release dates" > "[2021-Feb-12] Soft Freeze"). Dunno which impact this might 
> have on this. )
I hope he will do it, too!

Thanks Sedat:)
> 
> - Sedat -
> 
> [1] https://release.debian.org/
> 
> 
> > CHANGES :
> >  * fsck.exfat: Recover corrupted boot region.
> >
> > NEW FEATURES :
> >  * exfatlabel: Print or set volume label and serial.
> >  * dump.exfat: Show the on-disk metadata information and the statistics.
> >
> > BUG FIXES :
> >  * Set _FILE_OFFSET_BITS=64 for Android build.
> >
> > The git tree is at:
> >
> > https://protect2.fireeye.com/v1/url?k=f588edef-aa13d460-f58966a0-0cc47
> > a31307c-ebe7fdcb9cce33c0&q=1&e=88dc7065-283e-4803-b82d-ffcf0f9d681e&u=
> > https%3A%2F%2Fgithub.com%2Fexfatprogs%2Fexfatprogs
> >
> > The tarballs can be found at:
> >
> > https://protect2.fireeye.com/v1/url?k=98eca6ac-c7779f23-98ed2de3-0cc47a31307c-
> c97058e3d3889dd3&q=1&e=88dc7065-283e-4803-b82d-
> ffcf0f9d681e&u=https%3A%2F%2Fgithub.com%2Fexfatprogs%2Fexfatprogs%2Freleases%2Fdownload%2F1.1.0%2Fexfa
> tprogs-1.1.0.tar.gz
> >

Re: [PATCH V3 16/19] virtio-pci: introduce modern device module




On 2021/2/9 下午10:20, Michael S. Tsirkin wrote:

On Mon, Jan 04, 2021 at 02:55:00PM +0800, Jason Wang wrote:

Signed-off-by: Jason Wang 
---
  drivers/virtio/Kconfig |  10 +-
  drivers/virtio/Makefile|   1 +
  drivers/virtio/virtio_pci_common.h |  27 +-
  drivers/virtio/virtio_pci_modern.c | 617 -
  drivers/virtio/virtio_pci_modern_dev.c | 599 
  include/linux/virtio_pci_modern.h  | 111 +
  6 files changed, 721 insertions(+), 644 deletions(-)
  create mode 100644 drivers/virtio/virtio_pci_modern_dev.c
  create mode 100644 include/linux/virtio_pci_modern.h

diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
index 7b41130d3f35..6b9b81f4b8c2 100644
--- a/drivers/virtio/Kconfig
+++ b/drivers/virtio/Kconfig
@@ -12,6 +12,14 @@ config ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS
  This option is selected if the architecture may need to enforce
  VIRTIO_F_ACCESS_PLATFORM
  
+config VIRTIO_PCI_MODERN

+   tristate "Modern Virtio PCI Device"
+   depends on PCI
+   help
+ Modern PCI device implementation. This module implements the
+ basic probe and control for devices which are based on modern
+ PCI device with possible vendor specific extensions.
+
  menuconfig VIRTIO_MENU
bool "Virtio drivers"
default y
@@ -20,7 +28,7 @@ if VIRTIO_MENU
  
  config VIRTIO_PCI

tristate "PCI driver for virtio devices"
-   depends on PCI
+   depends on VIRTIO_PCI_MODERN
select VIRTIO
help
  This driver provides support for virtio based paravirtual device

Looks like VIRTIO_PCI_MODERN is actually just a library that
virtio pci uses. Is that right?



Right.



In that case just select it
automatically, let's not make users enable it manually.



I've considered doing this, but the problem is that the module depends on
PCI, so I think it can't be selected.


Thanks

Re: [PATCH V3 16/19] virtio-pci: introduce modern device module




On 2021/2/9 下午6:15, Naresh Kamboju wrote:

Hi Jason,

On Mon, 4 Jan 2021 at 12:28, Jason Wang  wrote:

Signed-off-by: Jason Wang 
---
  drivers/virtio/Kconfig |  10 +-
  drivers/virtio/Makefile|   1 +
  drivers/virtio/virtio_pci_common.h |  27 +-
  drivers/virtio/virtio_pci_modern.c | 617 -
  drivers/virtio/virtio_pci_modern_dev.c | 599 
  include/linux/virtio_pci_modern.h  | 111 +
  6 files changed, 721 insertions(+), 644 deletions(-)
  create mode 100644 drivers/virtio/virtio_pci_modern_dev.c
  create mode 100644 include/linux/virtio_pci_modern.h

diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
index 7b41130d3f35..6b9b81f4b8c2 100644
--- a/drivers/virtio/Kconfig
+++ b/drivers/virtio/Kconfig
@@ -12,6 +12,14 @@ config ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS
   This option is selected if the architecture may need to enforce
   VIRTIO_F_ACCESS_PLATFORM

+config VIRTIO_PCI_MODERN
+   tristate "Modern Virtio PCI Device"
+   depends on PCI
+   help
+ Modern PCI device implementation. This module implements the
+ basic probe and control for devices which are based on modern
+ PCI device with possible vendor specific extensions.
+
  menuconfig VIRTIO_MENU
 bool "Virtio drivers"
 default y
@@ -20,7 +28,7 @@ if VIRTIO_MENU

  config VIRTIO_PCI
 tristate "PCI driver for virtio devices"
-   depends on PCI
+   depends on VIRTIO_PCI_MODERN

While booting Linux next tag 20210208 kernel on qemu_arm64 and qemu_arm
mount rootfs failed.  The root cause seems to be due to missing configs
CONFIG_VIRTIO_PCI=y
CONFIG_VIRTIO_PCI_LEGACY=y

Reported-by: Naresh Kamboju 

Then I had to force-enable this MODERN config
CONFIG_VIRTIO_PCI_MODERN=y
which in turn enabled
CONFIG_VIRTIO_PCI=y
CONFIG_VIRTIO_PCI_LEGACY=y

and the qemu_arm64 and qemu_arm boot pass.


New build link,
https://builds.tuxbuild.com/1oEse4EFsoQr1FkKBfiLmhMCe7j/



Thanks for the report.

I will post a patch to fix the def config to enable VIRTIO_PCI_MODERN.

Thanks

Re: [PATCH 1/1] PCI/RCEC: Fix failure to inject errors to some RCiEP devices

2021-02-09 Thread Kelley, Sean V




> On Feb 9, 2021, at 6:05 PM, Qiuxu Zhuo  wrote:
> 
> On a Sapphire Rapids server, it failed to inject correctable errors
> to the RCiEP device e8:02.0 which was associated with the RCEC device
> e8:00.4. See the following error log before applying the patch:
> 
> aer-inject -s e8:02.0 examples/correctable
> Error: Failed to write, No such device
> 
> This was because rcec_assoc_rciep() mistakenly used "rciep->devfn" as
> device number to check whether the corresponding bit was set in
> the RCiEPBitmap of the RCEC. So that the RCiEP device e8:02.0 wasn't
> linked to the RCEC and resulted in the above error.
> 
> Fix it by using PCI_SLOT() to convert rciep->devfn to device number.
> Ensure that the RCiEP devices associated with the RCEC are linked to
> the RCEC as the RCEC is enumerated. After applying the patch, correctable
> errors can be injected to the RCiEP successfully.
> 
> Reported-and-tested-by: Wen Jin 
> Signed-off-by: Qiuxu Zhuo 

Reviewed-by: Sean V Kelley 

Thanks,

Sean

> ---
> drivers/pci/pcie/rcec.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/pci/pcie/rcec.c b/drivers/pci/pcie/rcec.c
> index 2c5c552994e4..d0bcd141ac9c 100644
> --- a/drivers/pci/pcie/rcec.c
> +++ b/drivers/pci/pcie/rcec.c
> @@ -32,7 +32,7 @@ static bool rcec_assoc_rciep(struct pci_dev *rcec, struct 
> pci_dev *rciep)
> 
>   /* Same bus, so check bitmap */
>   for_each_set_bit(devn, &bitmap, 32)
> - if (devn == rciep->devfn)
> + if (devn == PCI_SLOT(rciep->devfn))
>   return true;
> 
>   return false;
> -- 
> 2.17.1
>

Re: [PATCH] kswapd: no need reclaim cma pages triggered by unmovable allocation

2021-02-09 Thread zhou xianrong




On 2021/2/9 下午5:23, Michal Hocko wrote:

On Tue 09-02-21 16:23:13, zhou wrote:

From: zhou xianrong 

For the purpose of better migration, cma pages are allocated after
failed movable allocations and are normally used for file pages
or anonymous pages.

In the reclaim path, many cma pages (if configured) are reclaimed
from the lru lists, mainly by kswapd or by direct reclaim triggered by
unmovable or reclaimable allocations. But these cma pages cannot
be used by the original unmovable or reclaimable allocations, so
reclaiming them is unnecessary.

In the same system, if the cma area is configured to be large, then
more unmovable (vmalloc etc.) or reclaimable (slab etc.) allocations
fail, which triggers more kswapd rounds, which in turn reclaim
more cma pages.

Could you be more specific? Do you have any numbers and an example
configuration when this is visible?

It should be implicit.

So this may cause a vicious cycle: when we are under low memory,
there are still many cma pages that cannot be allocated, due to
unnecessary cma reclaim and cma fallback allocations. So cma pages
are not used sufficiently.

The modification is straightforward: skip reclaiming cma pages
when the reclaim is triggered only by unmovable or reclaimable
allocations. This optimization can avoid ~3% unnecessary
cma isolations (cma isolated / total isolated).

Joonsoo used to have a patch series to drop many of the hacks we have
for CMA and made it part of a movable zone. That would solve many
problems, including this one. I am not sure where the work stands now
but it would be probably better to revive that rather than adding more
special casing on top of what we have right now.

Yes. This modification is simple and retains the existing cma logic.

Signed-off-by: zhou xianrong 
---
  include/linux/mmzone.h|  6 ++--
  include/trace/events/vmscan.h | 20 +++
  mm/page_alloc.c   |  5 +--
  mm/vmscan.c   | 63 +--
  4 files changed, 73 insertions(+), 21 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index b593316bff3d..7dd38d7372b9 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -301,6 +301,8 @@ struct lruvec {
  #define ISOLATE_ASYNC_MIGRATE ((__force isolate_mode_t)0x4)
  /* Isolate unevictable pages */
  #define ISOLATE_UNEVICTABLE   ((__force isolate_mode_t)0x8)
+/* Isolate none cma pages */
+#define ISOLATE_NONCMA ((__force isolate_mode_t)0x10)
  
  /* LRU Isolation modes. */

  typedef unsigned __bitwise isolate_mode_t;
@@ -756,7 +758,7 @@ typedef struct pglist_data {
wait_queue_head_t pfmemalloc_wait;
struct task_struct *kswapd; /* Protected by
   mem_hotplug_begin/end() */
-   int kswapd_order;
+   int kswapd_order, kswapd_migratetype;
enum zone_type kswapd_highest_zoneidx;
  
  	int kswapd_failures;		/* Number of 'reclaimed == 0' runs */

@@ -840,7 +842,7 @@ static inline bool pgdat_is_empty(pg_data_t *pgdat)
  
  void build_all_zonelists(pg_data_t *pgdat);

  void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order,
-  enum zone_type highest_zoneidx);
+  int migratetype, enum zone_type highest_zoneidx);
  bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long 
mark,
 int highest_zoneidx, unsigned int alloc_flags,
 long free_pages);
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index 2070df64958e..41bbafdfde84 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -51,37 +51,41 @@ TRACE_EVENT(mm_vmscan_kswapd_sleep,
  
  TRACE_EVENT(mm_vmscan_kswapd_wake,
  
-	TP_PROTO(int nid, int zid, int order),

+   TP_PROTO(int nid, int zid, int order, int mt),
  
-	TP_ARGS(nid, zid, order),

+   TP_ARGS(nid, zid, order, mt),
  
  	TP_STRUCT__entry(

__field(int,nid )
__field(int,zid )
__field(int,order   )
+   __field(int,mt  )
),
  
  	TP_fast_assign(

__entry->nid = nid;
__entry->zid= zid;
__entry->order   = order;
+   __entry->mt  = mt;
),
  
-	TP_printk("nid=%d order=%d",

+   TP_printk("nid=%d order=%d migratetype=%d",
__entry->nid,
-   __entry->order)
+   __entry->order,
+   __entry->mt)
  );
  
  TRACE_EVENT(mm_vmscan_wakeup_kswapd,
  
-	TP_PROTO(int nid, int zid, int order, gfp_t gfp_flags),

+   TP_PROTO(int nid, int zid, int order, int mt, gfp_t gfp_flags),
  
-	TP_ARGS(nid, zid, order, gfp_flags),

+   TP_ARGS(nid, zid, order, mt, gfp_flags),
  
  	TP_STRUCT__entry(

__field(int,nid )
__field(int,

Re: [PATCH RFC 6/6] btrfs: Add roundrobin raid1 read policy

2021-02-09 Thread Michał Mirosław

On Tue, Feb 09, 2021 at 09:30:40PM +0100, Michal Rostecki wrote:
[...]
> For the array with 3 HDDs, not adding any penalty resulted in 409MiB/s
> (429MB/s) performance. Adding the penalty value 1 resulted in a
> performance drop to 404MiB/s (424MB/s). Increasing the value towards 10
> was making the performance even worse.
> 
> For the array with 2 HDDs and 1 SSD, adding penalty value 1 to
> rotational disks resulted in the best performance - 541MiB/s (567MB/s).
> Not adding any value and increasing the value was making the performance
> worse.
> 
> Adding penalty value to non-rotational disks was always decreasing the
> performance, which motivated setting it as 0 by default. For the purpose
> of testing, it's still configurable.
[...]
> + bdev = map->stripes[mirror_index].dev->bdev;
> + inflight = mirror_load(fs_info, map, mirror_index, stripe_offset,
> +stripe_nr);
> + queue_depth = blk_queue_depth(bdev->bd_disk->queue);
> +
> + return inflight < queue_depth;
[...]
> + last_mirror = this_cpu_read(*fs_info->last_mirror);
[...]
> + for (i = last_mirror; i < first + num_stripes; i++) {
> + if (mirror_queue_not_filled(fs_info, map, i, stripe_offset,
> + stripe_nr)) {
> + preferred_mirror = i;
> + goto out;
> + }
> + }
> +
> + for (i = first; i < last_mirror; i++) {
> + if (mirror_queue_not_filled(fs_info, map, i, stripe_offset,
> + stripe_nr)) {
> + preferred_mirror = i;
> + goto out;
> + }
> + }
> +
> + preferred_mirror = last_mirror;
> +
> +out:
> + this_cpu_write(*fs_info->last_mirror, preferred_mirror);

This looks like it effectively decreases queue depth for non-last
device. After all devices are filled to queue_depth-penalty, only
a single mirror will be selected for next reads (until a read on
some other one completes).

Have you tried testing with many more jobs / non-sequential accesses?

Best Regards,
Michał Mirosław

[PATCH 2/2] Docs: add fs/eventpoll to docbooks

2021-02-09 Thread Randy Dunlap

Add fs/eventpoll.c to the filesystem api-summary book.

Signed-off-by: Randy Dunlap 
Cc: Jonathan Corbet 
Cc: linux-...@vger.kernel.org
Cc: Andrew Morton 
Cc: Alexander Viro 
---
 Documentation/filesystems/api-summary.rst |6 ++
 1 file changed, 6 insertions(+)

--- linux-next-20210205.orig/Documentation/filesystems/api-summary.rst
+++ linux-next-20210205/Documentation/filesystems/api-summary.rst
@@ -122,6 +122,12 @@ Events based on file descriptors
 .. kernel-doc:: fs/eventfd.c
:export:
 
+eventpoll (epoll) interfaces
+
+
+.. kernel-doc:: fs/eventpoll.c
+   :internal:
+
 The Filesystem for Exporting Kernel Objects
 ===

[PATCH 1/2] fs: eventpoll: fix comments & kernel-doc notation

2021-02-09 Thread Randy Dunlap

Use the documented kernel-doc format for function Return: descriptions.
Begin constant values in kernel-doc comments with '%'.

Remove kernel-doc "/**" from 2 functions that are not documented with
kernel-doc notation.

Fix typos, punctuation, & grammar.

Also fix a few kernel-doc warnings:

../fs/eventpoll.c:1883: warning: Function parameter or member 'ep' not 
described in 'ep_loop_check_proc'
../fs/eventpoll.c:1883: warning: Excess function parameter 'priv' description 
in 'ep_loop_check_proc'
../fs/eventpoll.c:1932: warning: Function parameter or member 'ep' not 
described in 'ep_loop_check'
../fs/eventpoll.c:1932: warning: Excess function parameter 'from' description 
in 'ep_loop_check'

Signed-off-by: Randy Dunlap 
Cc: Jonathan Corbet 
Cc: linux-...@vger.kernel.org
Cc: Andrew Morton 
Cc: Alexander Viro 
---
Jon: Al says that he is OK with one of you merging this fs/
 (only comments) patch.

 fs/eventpoll.c |   52 +++
 1 file changed, 26 insertions(+), 26 deletions(-)

--- linux-next-20210205.orig/fs/eventpoll.c
+++ linux-next-20210205/fs/eventpoll.c
@@ -366,8 +366,8 @@ static inline struct epitem *ep_item_fro
  *
  * @ep: Pointer to the eventpoll context.
  *
- * Returns: Returns a value different than zero if ready events are available,
- *  or zero otherwise.
+ * Return: a value different than %zero if ready events are available,
+ *  or %zero otherwise.
  */
 static inline int ep_events_available(struct eventpoll *ep)
 {
@@ -1023,7 +1023,7 @@ struct file *get_epoll_tfile_raw_ptr(str
 }
 #endif /* CONFIG_CHECKPOINT_RESTORE */
 
-/**
+/*
  * Adds a new entry to the tail of the list in a lockless way, i.e.
  * multiple CPUs are allowed to call this function concurrently.
  *
@@ -1035,10 +1035,10 @@ struct file *get_epoll_tfile_raw_ptr(str
  * completed.
  *
  *Also an element can be locklessly added to the list only in one
- *direction i.e. either to the tail either to the head, otherwise
+ *direction i.e. either to the tail or to the head, otherwise
  *concurrent access will corrupt the list.
  *
- * Returns %false if element has been already added to the list, %true
+ * Return: %false if element has been already added to the list, %true
  * otherwise.
  */
 static inline bool list_add_tail_lockless(struct list_head *new,
@@ -1076,11 +1076,11 @@ static inline bool list_add_tail_lockles
return true;
 }
 
-/**
+/*
  * Chains a new epi entry to the tail of the ep->ovflist in a lockless way,
  * i.e. multiple CPUs are allowed to call this function concurrently.
  *
- * Returns %false if epi element has been already chained, %true otherwise.
+ * Return: %false if epi element has been already chained, %true otherwise.
  */
 static inline bool chain_epi_lockless(struct epitem *epi)
 {
@@ -1105,8 +1105,8 @@ static inline bool chain_epi_lockless(st
  * mechanism. It is called by the stored file descriptors when they
  * have events to report.
  *
- * This callback takes a read lock in order not to content with concurrent
- * events from another file descriptors, thus all modifications to ->rdllist
+ * This callback takes a read lock in order not to contend with concurrent
+ * events from another file descriptor, thus all modifications to ->rdllist
  * or ->ovflist are lockless.  Read lock is paired with the write lock from
  * ep_scan_ready_list(), which stops all list modifications and guarantees
  * that lists state is seen correctly.
@@ -1335,8 +1335,8 @@ static int reverse_path_check_proc(struc
  *  paths such that we will spend all our time waking up
  *  eventpoll objects.
  *
- * Returns: Returns zero if the proposed links don't create too many paths,
- * -1 otherwise.
+ * Return: %zero if the proposed links don't create too many paths,
+ * %-1 otherwise.
  */
 static int reverse_path_check(void)
 {
@@ -1734,7 +1734,7 @@ static struct timespec64 *ep_timeout_to_
 }
 
 /**
- * ep_poll - Retrieves ready events, and delivers them to the caller supplied
+ * ep_poll - Retrieves ready events, and delivers them to the caller-supplied
  *   event buffer.
  *
  * @ep: Pointer to the eventpoll context.
@@ -1747,7 +1747,7 @@ static struct timespec64 *ep_timeout_to_
  *   until at least one event has been retrieved (or an error
  *   occurred).
  *
- * Returns: Returns the number of ready events which have been fetched, or an
+ * Return: the number of ready events which have been fetched, or an
  *  error code, in case of error.
  */
 static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
@@ -1774,9 +1774,9 @@ static int ep_poll(struct eventpoll *ep,
 
/*
 * This call is racy: We may or may not see events that are being added
-* to the ready list under the lock (e.g., in IRQ callbacks). For, cases
+* to the ready list under the lock (e.g., in IRQ callbacks). For cas

Re: [PATCH] vdpa/mlx5: fix param validation in mlx5_vdpa_get_config()




On 2021/2/9 下午5:00, Stefano Garzarella wrote:

On Tue, Feb 09, 2021 at 07:43:02AM +0200, Eli Cohen wrote:

On Mon, Feb 08, 2021 at 05:17:41PM +0100, Stefano Garzarella wrote:

It's legal to have 'offset + len' equal to
sizeof(struct virtio_net_config), since 'ndev->config' is a
'struct virtio_net_config', so we can safely copy its content under
this condition.

Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 
devices")

Cc: sta...@vger.kernel.org
Signed-off-by: Stefano Garzarella 


Acked-by: Eli Cohen 

BTW, same error in vdpa_sim you may want to fix.



Commit 65b709586e22 ("vdpa_sim: add get_config callback in 
vdpasim_dev_attr") unintentionally solved it.


Since it's a simulator, maybe we can avoid solving it in the stable 
branches. Or does it matter?



I think not, since the module depends on RUNTIME_TESTING_MENU.

Thanks






Thanks,
Stefano

RE: [Linuxarm] Re: [PATCH for-next 00/32] spin lock usage optimization for SCSI drivers

2021-02-09 Thread Finn Thain

On Tue, 9 Feb 2021, Song Bao Hua (Barry Song) wrote:

> > > sonic_interrupt() uses an irq lock within an interrupt handler to 
> > > avoid issues relating to this. This kind of locking may be needed in 
> > > the drivers you are trying to patch. Or it might not. Apparently, 
> > > no-one has looked.
> 
> Is the comment in sonic_interrupt() outdated according to this:
> m68k: irq: Remove IRQF_DISABLED
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=77a4279
> http://lkml.iu.edu/hypermail/linux/kernel/1109.2/01687.html
> 

The removal of IRQF_DISABLED isn't relevant to this driver. Commit 
77a42796786c ("m68k: Remove deprecated IRQF_DISABLED") did not disable 
interrupts, it just removed some code to enable them.

The code and comments in sonic_interrupt() are correct. You can confirm 
this for yourself quite easily using QEMU and a cross-compiler.

> and this:
> genirq: Warn when handler enables interrupts
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b738a50a
> 
> wouldn't genirq report a warning on m68k?
> 

There is no warning from m68k builds. That's because arch_irqs_disabled() 
returns true when the IPL is non-zero.

> > 
> > Thanks
> > Barry
> 
> Thanks
> Barry
> 
>

RE: [Linuxarm] Re: [PATCH for-next 00/32] spin lock usage optimization for SCSI drivers

2021-02-09 Thread Finn Thain

On Wed, 10 Feb 2021, Song Bao Hua (Barry Song) wrote:

> > On Tue, 9 Feb 2021, Song Bao Hua (Barry Song) wrote:
> > 
> > > > On Tue, 9 Feb 2021, Song Bao Hua (Barry Song) wrote:
> > > >
> > > > > > On Sun, 7 Feb 2021, Xiaofei Tan wrote:
> > > > > >
> > > > > > > Replace spin_lock_irqsave with spin_lock in hard IRQ of SCSI 
> > > > > > > drivers. There are no function changes, but may speed up if 
> > > > > > > interrupt happen too often.
> > > > > >
> > > > > > This change doesn't necessarily work on platforms that support 
> > > > > > nested interrupts.
> > > > > >
> > > > > > Were you able to measure any benefit from this change on some 
> > > > > > other platform?
> > > > >
> > > > > I think the code disabling irq in hardIRQ is simply wrong. Since 
> > > > > this commit
> > > > >
> > > > > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e58aa3d2d0cc
> > > > >  
> > > > > genirq: Run irq handlers with interrupts disabled
> > > > >
> > > > > interrupt handlers are definitely running in a irq-disabled 
> > > > > context unless irq handlers enable them explicitly in the 
> > > > > handler to permit other interrupts.
> > > > >
> > > >
> > > > Repeating the same claim does not somehow make it true.
> > >
> > > Sorry, I didn't realize xiaofei had replied.
> > >
> > 
> > I was referring to the claim in patch 00/32, i.e. that interrupt 
> > handlers only run when irqs are disabled.
> > 
> > > > If you put your claim to the test, you'll see that interrupts 
> > > > are not disabled on m68k when interrupt handlers execute.
> > >
> > > Sounds like an implementation issue of m68k since IRQF_DISABLED has 
> > > been totally removed.
> > >
> > 
> > It's true that IRQF_DISABLED could be used to avoid the need for irq 
> > locks in interrupt handlers. So, if you want to remove irq locks from 
> > interrupt handlers, today you can't use IRQF_DISABLED to help you. So 
> > what?
> > 
> > > >
> > > > The Interrupt Priority Level (IPL) can prevent any given irq 
> > > > handler from being re-entered, but an irq with a higher priority 
> > > > level may be handled during execution of a lower priority irq 
> > > > handler.
> > > >
> > >
> > > We used to have IRQF_DISABLED to support so-called "fast interrupt" 
> > > to avoid this.
> > >
> > > But the concept has been totally removed. That is interesting if 
> > > m68k still has this issue.
> > >
> > 
> > Prioritized interrupts are beneficial. Why would you want to avoid 
> > them?
> > 
> 
> I doubt this is true as it has been already thought as unnecessary
> in Linux:
> https://lwn.net/Articles/380931/
>

The article you cited does not refute what I said about prioritized 
interrupts.

The article is about eliminating the distinction between fast and slow 
interrupt handlers.

The article says that some developers convinced Linus that, although 
minimal interrupt latency is desirable, it isn't strictly necessary.

The article also warns of stack overflow from arbitrarily deep slow 
interrupt nesting, but that's not what m68k does.

> > Moreover, there's no reason to believe that m68k is the only platform 
> > that supports nested interrupts.
> 
> I doubt that is true as genirq is running under the assumption
> that hardIRQ is running in irq-disabled context:

I'm not going to guess whether other platforms might be affected -- you're 
supporting this patch so you will have to show that it is correct.

> "We run all handlers with interrupts disabled and expect them not to
> enable them. Warn when we catch one who does."
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b738a50a
> 
> If it is, m68k is against the assumption of genirq.
> 

Interrupt handlers on m68k do not enable interrupts. If they did, you 
would see that warning fire. It doesn't fire. Try it.

> > 
> > > > sonic_interrupt() uses an irq lock within an interrupt handler to
> > > > avoid issues relating to this. This kind of locking may be needed in
> > > > the drivers you are trying to patch. Or it might not. Apparently,
> > > > no-one has looked.
> > >
> 
> Thanks
> Barry
>

Re: [PATCH v4] checkpatch: do not apply "initialise globals to 0" check to BPF progs

2021-02-09 Thread Joe Perches

On Wed, 2021-02-10 at 04:07 +, Song Liu wrote:
> > On Feb 9, 2021, at 6:10 PM, Joe Perches  wrote:
> > On Tue, 2021-02-09 at 13:19 -0800, Song Liu wrote:
> > > BPF programs explicitly initialise global variables to 0 to make sure
> > > clang (v10 or older) do not put the variables in the common section.
> > 
> > Acked-by: Joe Perches 
> > 
> > So the patch is OK now, but I have a question about the concept:
> > 
> > Do you mean that these initialized to 0 global variables
> > should go into bss or another section?
> 
> We want these variables to go to bss. 

OK, then the patch is fine.

> > Perhaps it'd be useful to somehow mark variables into specific
> > sections rather than bss when initialized to 0 and data when not
> > initialized to 0.
> 
> Currently, libbpf expects zero initialized global data in bss. This 
> convention works well so far. Is there any reason that we specify 
> section for global data? 

There's no need I know of.

cheers, Joe

Re: [PATCH V3 11/14] coresight: sink: Add TRBE driver

2021-02-09 Thread Anshuman Khandual




On 2/9/21 11:09 PM, Mathieu Poirier wrote:
> On Fri, Feb 05, 2021 at 10:53:30AM -0700, Mathieu Poirier wrote:
>> On Wed, Jan 27, 2021 at 02:25:35PM +0530, Anshuman Khandual wrote:
>>> Trace Buffer Extension (TRBE) implements a trace buffer per CPU which is
>>> accessible via the system registers. The TRBE supports different addressing
>>> modes including CPU virtual address and buffer modes including the circular
>>> buffer mode. The TRBE buffer is addressed by a base pointer (TRBBASER_EL1),
>>> a write pointer (TRBPTR_EL1) and a limit pointer (TRBLIMITR_EL1). But the
>>> access to the trace buffer could be prohibited by a higher exception level
>>> (EL3 or EL2), indicated by TRBIDR_EL1.P. The TRBE can also generate a CPU
>>> private interrupt (PPI) on address translation errors and when the buffer
>>> is full. Overall implementation here is inspired from the Arm SPE driver.
>>>
>>
>> I got this message when applying the patch: 
>>
>> Applying: coresight: sink: Add TRBE driver
>> .git/rebase-apply/patch:76: new blank line at EOF.
>> +
>> warning: 1 line adds whitespace errors.
>>  
>>> Cc: Mathieu Poirier 
>>> Cc: Mike Leach 
>>> Cc: Suzuki K Poulose 
>>> Signed-off-by: Anshuman Khandual 
>>> ---
>>> Changes in V3:
>>>
>>> - Added new DT bindings document TRBE.yaml
>>> - Changed TRBLIMITR_TRIG_MODE_SHIFT from 2 to 3
>>> - Dropped isb() from trbe_reset_local()
>>> - Dropped gap between (void *) and buf->trbe_base
>>> - Changed 'int' to 'unsigned int' in is_trbe_available()
>>> - Dropped unused function set_trbe_running(), set_trbe_virtual_mode(),
>>>   set_trbe_enabled() and set_trbe_limit_pointer()
>>> - Changed get_trbe_flag_update(), is_trbe_programmable() and
>>>   get_trbe_address_align() to accept TRBIDR value
>>> - Changed is_trbe_running(), is_trbe_abort(), is_trbe_wrap(), is_trbe_trg(),
>>>   is_trbe_irq(), get_trbe_bsc() and get_trbe_ec() to accept TRBSR value
>>> - Dropped snapshot mode condition in arm_trbe_alloc_buffer()
>>> - Exit arm_trbe_init() when arm64_kernel_unmapped_at_el0() is enabled
>>> - Compute trbe_limit before trbe_write to get the updated handle
>>> - Added trbe_stop_and_truncate_event()
>>> - Dropped trbe_handle_fatal()
>>>
>>>  Documentation/trace/coresight/coresight-trbe.rst |   39 +
>>>  arch/arm64/include/asm/sysreg.h  |1 +
>>>  drivers/hwtracing/coresight/Kconfig  |   11 +
>>>  drivers/hwtracing/coresight/Makefile |1 +
>>>  drivers/hwtracing/coresight/coresight-trbe.c | 1023 
>>> ++
>>>  drivers/hwtracing/coresight/coresight-trbe.h |  160 
>>>  6 files changed, 1235 insertions(+)
>>>  create mode 100644 Documentation/trace/coresight/coresight-trbe.rst
>>>  create mode 100644 drivers/hwtracing/coresight/coresight-trbe.c
>>>  create mode 100644 drivers/hwtracing/coresight/coresight-trbe.h
>>>
>>> diff --git a/Documentation/trace/coresight/coresight-trbe.rst 
>>> b/Documentation/trace/coresight/coresight-trbe.rst
>>> new file mode 100644
>>> index 000..1cbb819
>>> --- /dev/null
>>> +++ b/Documentation/trace/coresight/coresight-trbe.rst
>>> @@ -0,0 +1,39 @@
>>> +.. SPDX-License-Identifier: GPL-2.0
>>> +
>>> +==
>>> +Trace Buffer Extension (TRBE).
>>> +==
>>> +
>>> +:Author:   Anshuman Khandual 
>>> +:Date: November 2020
>>> +
>>> +Hardware Description
>>> +
>>> +
>>> +Trace Buffer Extension (TRBE) is a percpu hardware which captures in system
>>> +memory, CPU traces generated from a corresponding percpu tracing unit. This
>>> +gets plugged in as a coresight sink device because the corresponding trace
>>> +generators (ETE), are plugged in as source device.
>>> +
>>> +The TRBE is not compliant to CoreSight architecture specifications, but is
>>> +driven via the CoreSight driver framework to support the ETE (which is
>>> +CoreSight compliant) integration.
>>> +
>>> +Sysfs files and directories
>>> +---
>>> +
>>> +The TRBE devices appear on the existing coresight bus alongside the other
>>> +coresight devices::
>>> +
>>> +   >$ ls /sys/bus/coresight/devices
>>> +   trbe0  trbe1  trbe2 trbe3
>>> +
>>> +The ``trbe`` named TRBEs are associated with a CPU.::
>>> +
>>> +   >$ ls /sys/bus/coresight/devices/trbe0/
>>> +align dbm
>>> +
>>> +*Key file items are:-*
>>> +   * ``align``: TRBE write pointer alignment
>>> +   * ``dbm``: TRBE updates memory with access and dirty flags
>>> +
>>
>> Please add documentation for these, the same way it was done for all the 
>> other CS
>> components [1].
>>
>> [1]. https://elixir.bootlin.com/linux/latest/source/Documentation/ABI/testing
>> (sysfs-bus-coresight-device-xyz)
>>
>>> diff --git a/arch/arm64/include/asm/sysreg.h 
>>> b/arch/arm64/include/asm/sysreg.h
>>> index 85ae4db..9e2e9b7 100644
>>> --- a/arch/arm64/include/asm/sysreg.h
>>> +++ b/arch/arm64/include/asm/sysreg.h
>>> @@ -97,6 +97,7 @@
>>>  #define SET_PSTATE_UAO(x)  __emit_i

Re: [RFC PATCH 1/2] KVM: selftests: Add a macro to get string of vm_mem_backing_src_type

2021-02-09 Thread wangyanan (Y)




On 2021/2/10 1:35, Sean Christopherson wrote:

On Tue, Feb 09, 2021, Ben Gardon wrote:

On Tue, Feb 9, 2021 at 3:21 AM wangyanan (Y)  wrote:


On 2021/2/9 2:13, Ben Gardon wrote:

On Mon, Feb 8, 2021 at 1:08 AM Yanan Wang  wrote:

Add a macro to get string of the backing source memory type, so that
application can add choices for source types in the help() function,
and users can specify which type to use for testing.

Coincidentally, I sent out a change last week to do the same thing:
"KVM: selftests: Add backing src parameter to dirty_log_perf_test"
(https://lkml.org/lkml/2021/2/2/1430)
Whichever way this ends up being implemented, I'm happy to see others
interested in testing different backing source types too.

Thanks Ben! I have a little question here.

Can we just present three IDs (0/1/2) but not strings for users to
choose which backing_src_type to use like the way of guest modes,

That would be fine with me. The string names are easier for me to read
than an ID number (especially if you were to add additional options
e.g. 1G hugetlb or file backed  / shared memory) but it's mostly an
aesthetic preference, so I don't have strong feelings either way.

I vote to expose/consume strings, being able to do "./dirty_log_perf_test --help"
and understand the backing options without having to dig into source was super
nice.
.
Fine then:), I will make some change based on 
(https://lkml.org/lkml/2021/2/2/1430), thanks!

[PATCH v1] media: ti-vpe: cal: fix ce

2021-02-09 Thread Tong Zhang

FIELD_GET requires the mask to be a compile-time constant; however, it is
wrapped by a function, which causes the following build error:

././include/linux/compiler_types.h:320:38: error: call to 
‘__compiletime_assert_270’ declared with attribute error: FIELD_GET: mask is
not constant
  320 |  _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__)

redefine cal_read_field as a macro

Signed-off-by: Tong Zhang 
---
 drivers/media/platform/ti-vpe/cal.h | 5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/media/platform/ti-vpe/cal.h 
b/drivers/media/platform/ti-vpe/cal.h
index 4123405ee0cf..d73a4a3b99cb 100644
--- a/drivers/media/platform/ti-vpe/cal.h
+++ b/drivers/media/platform/ti-vpe/cal.h
@@ -215,10 +215,7 @@ static inline void cal_write(struct cal_dev *cal, u32 
offset, u32 val)
iowrite32(val, cal->base + offset);
 }
 
-static inline u32 cal_read_field(struct cal_dev *cal, u32 offset, u32 mask)
-{
-   return FIELD_GET(mask, cal_read(cal, offset));
-}
+#define cal_read_field(cal, offset, mask) FIELD_GET(mask, cal_read(cal, 
offset))
 
 static inline void cal_write_field(struct cal_dev *cal, u32 offset, u32 value,
   u32 mask)
-- 
2.25.1

Re: [PATCH RFC 4/6] btrfs: Check if the filesystem is has mixed type of devices

2021-02-09 Thread Michał Mirosław

On Tue, Feb 09, 2021 at 09:30:38PM +0100, Michal Rostecki wrote:
> From: Michal Rostecki 
> 
> Add the btrfs_check_mixed() function which checks if the filesystem has
> the mixed type of devices (non-rotational and rotational). This
> information is going to be used in roundrobin raid1 read policy.
[...]
> @@ -669,8 +699,12 @@ static int btrfs_open_one_device(struct btrfs_fs_devices 
> *fs_devices,
>   }
>  
>   q = bdev_get_queue(bdev);
> - if (!blk_queue_nonrot(q))
> + rotating = !blk_queue_nonrot(q);
> + device->rotating = rotating;
> + if (rotating)
>   fs_devices->rotating = true;
> + if (!fs_devices->mixed)
> + fs_devices->mixed = btrfs_check_mixed(fs_devices, rotating);
[...]

Since this is adding to a set, a faster way is:

if (fs_devices->rotating != rotating)
fs_devices->mixed = true;

The scan might be necessary on device removal, though.

> - if (!blk_queue_nonrot(q))
> + rotating = !blk_queue_nonrot(q);
> + device->rotating = rotating;
> + if (rotating)
>   fs_devices->rotating = true;
> + if (!fs_devices->mixed)
> + fs_devices->mixed = btrfs_check_mixed(fs_devices, rotating);

Duplication. Maybe pull all this into a function?

Best Regards,
Michał Mirosław

Re: [PATCH v4] checkpatch: do not apply "initialise globals to 0" check to BPF progs

2021-02-09 Thread Song Liu




> On Feb 9, 2021, at 6:10 PM, Joe Perches  wrote:
> 
> On Tue, 2021-02-09 at 13:19 -0800, Song Liu wrote:
>> BPF programs explicitly initialise global variables to 0 to make sure
>> clang (v10 or older) do not put the variables in the common section.
> 
> Acked-by: Joe Perches 
> 
> So the patch is OK now, but I have a question about the concept:
> 
> Do you mean that these initialized to 0 global variables
> should go into bss or another section?

We want these variables to go to bss. 

> Perhaps it'd be useful to somehow mark variables into specific
> sections rather than bss when initialized to 0 and data when not
> initialized to 0.

Currently, libbpf expects zero initialized global data in bss. This 
convention works well so far. Is there any reason that we specify 
section for global data? 

Thanks,
Song

> 
> $ clang --version
> clang version 10.0.0 (git://github.com/llvm/llvm-project.git 
> 305b961f64b75e73110e309341535f6d5a48ed72)
> Target: x86_64-unknown-linux-gnu
> Thread model: posix
> 
> $ cat t_common.c
> int a = 0;
> int b = 1;
> 
> int foo_a(void)
> {
>   return a;
> }
> 
> int foo_b(void)
> {
>   return b;
> }
> 
> $ clang -c -O3 t_common.c
> 
> $ objdump -x t_common.o 
> 
> t_common.o: file format elf64-x86-64
> t_common.o
> architecture: i386:x86-64, flags 0x0011:
> HAS_RELOC, HAS_SYMS
> start address 0x
> 
> Sections:
> Idx Name  Size  VMA   LMA   File off  Algn
>  0 .text 0017      0040  2**4
>  CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE
>  1 .bss  0004      0058  2**2
>  ALLOC
>  2 .data 0004      0058  2**2
>  CONTENTS, ALLOC, LOAD, DATA
>  3 .comment  0068      005c  2**0
>  CONTENTS, READONLY
>  4 .note.GNU-stack       00c4  
> 2**0
>  CONTENTS, READONLY
>  5 .eh_frame 0040      00c8  2**3
>  CONTENTS, ALLOC, LOAD, RELOC, READONLY, DATA
>  6 .llvm_addrsig       0210  2**0
>  CONTENTS, READONLY, EXCLUDE
> SYMBOL TABLE:
>  ldf *ABS* t_common.c
>  ld  .text .text
>  g O .bss 0004 a
>  g O .data0004 b
>  g F .text0007 foo_a
> 0010 g F .text0007 foo_b
> 
> 
> RELOCATION RECORDS FOR [.text]:
> OFFSET   TYPE  VALUE 
> 0002 R_X86_64_PC32 a-0x0004
> 0012 R_X86_64_PC32 b-0x0004
> 
> 
> RELOCATION RECORDS FOR [.eh_frame]:
> OFFSET   TYPE  VALUE 
> 0020 R_X86_64_PC32 .text
> 0034 R_X86_64_PC32 .text+0x0010
> 
> 
> Perhaps instead something like:
> 
> $ cat t_common_bpf.c
> __attribute__((__section__("bpf"))) int a = 0;
> __attribute__((__section__("bpf"))) int b = 1;
> 
> int foo_a(void)
> {
>   return a;
> }
> 
> int foo_b(void)
> {
>   return b;
> }
> 
> $ clang -c -O3 t_common_bpf.c
> 
> $ objdump -x t_common_bpf.o 
> 
> t_common_bpf.o: file format elf64-x86-64
> t_common_bpf.o
> architecture: i386:x86-64, flags 0x0011:
> HAS_RELOC, HAS_SYMS
> start address 0x
> 
> Sections:
> Idx Name  Size  VMA   LMA   File off  Algn
>  0 .text 0017      0040  2**4
>  CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE
>  1 bpf   0008      0058  2**2
>  CONTENTS, ALLOC, LOAD, DATA
>  2 .comment  0068      0060  2**0
>  CONTENTS, READONLY
>  3 .note.GNU-stack       00c8  
> 2**0
>  CONTENTS, READONLY
>  4 .eh_frame 0040      00c8  2**3
>  CONTENTS, ALLOC, LOAD, RELOC, READONLY, DATA
>  5 .llvm_addrsig       0210  2**0
>  CONTENTS, READONLY, EXCLUDE
> SYMBOL TABLE:
>  ldf *ABS* t_common_bpf.c
>  ld  .text .text
>  g O bpf  0004 a
> 0004 g O bpf  0004 b
>  g F .text0007 foo_a
> 0010 g F .text0007 foo_b
> 
> 
> RELOCATION RECORDS FOR [.text]:
> OFFSET   TYPE  VALUE 
> 0002

[PATCH] psi: Use ONCPU state tracking machinery to detect reclaim

2021-02-09 Thread Chengming Zhou

Move the reclaim detection from the timer tick to the task state
tracking machinery using the recently added ONCPU state. And we
also add memstall state changes checking in the psi_task_switch()
optimization to update the parents properly.

Thanks to Johannes Weiner for pointing out the psi_task_switch()
optimization things and the clearer changelog.

Signed-off-by: Muchun Song 
Signed-off-by: Chengming Zhou 
---
 include/linux/psi.h  |  1 -
 kernel/sched/core.c  |  1 -
 kernel/sched/psi.c   | 52 
 kernel/sched/stats.h |  9 -
 4 files changed, 16 insertions(+), 47 deletions(-)

diff --git a/include/linux/psi.h b/include/linux/psi.h
index 7361023f3fdd..65eb1476ac70 100644
--- a/include/linux/psi.h
+++ b/include/linux/psi.h
@@ -20,7 +20,6 @@ void psi_task_change(struct task_struct *task, int clear, int 
set);
 void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 bool sleep);
 
-void psi_memstall_tick(struct task_struct *task, int cpu);
 void psi_memstall_enter(unsigned long *flags);
 void psi_memstall_leave(unsigned long *flags);
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 15d2562118d1..31788a9b335b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4533,7 +4533,6 @@ void scheduler_tick(void)
update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
curr->sched_class->task_tick(rq, curr, 0);
calc_global_load_tick(rq);
-   psi_task_tick(rq);
 
rq_unlock(rq, &rf);
 
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 2293c45d289d..11449fb8141e 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -644,8 +644,7 @@ static void poll_timer_fn(struct timer_list *t)
wake_up_interruptible(&group->poll_wait);
 }
 
-static void record_times(struct psi_group_cpu *groupc, int cpu,
-bool memstall_tick)
+static void record_times(struct psi_group_cpu *groupc, int cpu)
 {
u32 delta;
u64 now;
@@ -664,23 +663,6 @@ static void record_times(struct psi_group_cpu *groupc, int 
cpu,
groupc->times[PSI_MEM_SOME] += delta;
if (groupc->state_mask & (1 << PSI_MEM_FULL))
groupc->times[PSI_MEM_FULL] += delta;
-   else if (memstall_tick) {
-   u32 sample;
-   /*
-* Since we care about lost potential, a
-* memstall is FULL when there are no other
-* working tasks, but also when the CPU is
-* actively reclaiming and nothing productive
-* could run even if it were runnable.
-*
-* When the timer tick sees a reclaiming CPU,
-* regardless of runnable tasks, sample a FULL
-* tick (or less if it hasn't been a full tick
-* since the last state change).
-*/
-   sample = min(delta, (u32)jiffies_to_nsecs(1));
-   groupc->times[PSI_MEM_FULL] += sample;
-   }
}
 
if (groupc->state_mask & (1 << PSI_CPU_SOME)) {
@@ -714,7 +696,7 @@ static void psi_group_change(struct psi_group *group, int 
cpu,
 */
write_seqcount_begin(&groupc->seq);
 
-   record_times(groupc, cpu, false);
+   record_times(groupc, cpu);
 
for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
if (!(m & (1 << t)))
@@ -738,6 +720,18 @@ static void psi_group_change(struct psi_group *group, int 
cpu,
if (test_state(groupc->tasks, s))
state_mask |= (1 << s);
}
+
+   /*
+* Since we care about lost potential, a memstall is FULL
+* when there are no other working tasks, but also when
+* the CPU is actively reclaiming and nothing productive
+* could run even if it were runnable. So when the current
+* task in a cgroup is in_memstall, the corresponding groupc
+* on that cpu is in PSI_MEM_FULL state.
+*/
+   if (groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall)
+   state_mask |= (1 << PSI_MEM_FULL);
+
groupc->state_mask = state_mask;
 
write_seqcount_end(&groupc->seq);
@@ -833,7 +827,8 @@ void psi_task_switch(struct task_struct *prev, struct 
task_struct *next,
 */
iter = NULL;
while ((group = iterate_groups(next, &iter))) {
-   if (per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) {
+   if (per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU] &&
+   next->in_memstall == prev->in_memstall) {
common = group;
break;
}
@@ -859,21 +854,6 @@ void psi_task_switch(str

Re: [PATCH 1/2] ext4: Handle casefolding with encryption

2021-02-09 Thread Theodore Ts'o

On Tue, Feb 09, 2021 at 08:03:10PM -0700, Andreas Dilger wrote:
> Depending on the size of the "escape", it probably makes sense to move
> toward having e2fsck migrate from the current mechanism to using dirdata
> for all deployments.  In the current implementation, tools don't really
> know for sure if there is data beyond the filename in the dirent or not.

It's actually quite well defined.  If dirdata is enabled, then we
follow the dirdata rules.  If dirdata is *not* enabled, then if a
directory inode has the case folding and encryption flags set, then
there will be cryptographic data immediately following the filename.
Otherwise, there is no valid data after the filename.

> For example, what if casefold is enabled on an existing filesystem that
> already has an encrypted directory?  Does the code _assume_ that there is
> a hash beyond the name if the rec_len is long enough for this?

No, we will only expect there to be a hash beyond the name if
EXT4_CASEFOLD_FL and EXT4_ENCRYPT_FL flags are set on the inode.  (And
if the rec_len is not large enough, then that's a corrupted directory
entry.)

> I guess it is implicit with the casefold+encryption case for dirents in
> directories that have the encryption flag set in a filesystem that also
> has casefold enabled, but it's definitely not friendly to these features
> being enabled on an existing filesystem.

No, it's fine.  That's because the EXT4_CASEFOLD_FL inode flag can
only be set if the EXT4_FEATURE_INCOMPAT_CASEFOLD is set in the
superblock, and EXT4_ENCRYPT_FL inode flag can only be set if
EXT4_FEATURE_INCOMPAT_ENCRYPT is set in the superblock, this is why it
will be safe to enable both of these features, since merely enabling the
file system features only allows new directories to be created with
both CASEFOLD_FL and ENCRYPT_FL set.

The only restriction we would have is that if a file system has both the case
folding and encryption features, it will *not* be safe to set the
dirdata feature flag without first scanning all of the directories to
see if there are any directories that have both the casefold and
encrypt flags set on that inode, and if so, to convert all of the
directory entries to use dirdata.  I don't think this is going to be a
significant restriction in practice, though.

- Ted

Re: [PATCH 3/3] mlx5_vdpa: defer clear_virtqueues to until DRIVER_OK




On 2021/2/10 上午8:26, Si-Wei Liu wrote:



On 2/8/2021 7:37 PM, Jason Wang wrote:


On 2021/2/6 下午8:29, Si-Wei Liu wrote:

While virtq is stopped,  get_vq_state() is supposed to
be  called to  get  sync'ed  with  the latest internal
avail_index from device. The saved avail_index is used
to restate  the virtq  once device is started.  Commit
b35ccebe3ef7 introduced the clear_virtqueues() routine
to  reset  the saved  avail_index,  however, the index
gets cleared a bit earlier before get_vq_state() tries
to read it. This would cause consistency problems when
virtq is restarted, e.g. through a series of link down
and link up events. We  could  defer  the  clearing of
avail_index  to  until  the  device  is to be started,
i.e. until  VIRTIO_CONFIG_S_DRIVER_OK  is set again in
set_status().

Fixes: b35ccebe3ef7 ("vdpa/mlx5: Restore the hardware used index 
after change map")

Signed-off-by: Si-Wei Liu 
---
  drivers/vdpa/mlx5/net/mlx5_vnet.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c

index aa6f8cd..444ab58 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1785,7 +1785,6 @@ static void mlx5_vdpa_set_status(struct 
vdpa_device *vdev, u8 status)

  if (!status) {
  mlx5_vdpa_info(mvdev, "performing device reset\n");
  teardown_driver(ndev);
-    clear_virtqueues(ndev);
  mlx5_vdpa_destroy_mr(&ndev->mvdev);
  ndev->mvdev.status = 0;
  ++mvdev->generation;
@@ -1794,6 +1793,7 @@ static void mlx5_vdpa_set_status(struct 
vdpa_device *vdev, u8 status)
    if ((status ^ ndev->mvdev.status) & 
VIRTIO_CONFIG_S_DRIVER_OK) {

  if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
+    clear_virtqueues(ndev);



Rethink about this. As mentioned in another thread, this in fact 
breaks set_vq_state().  (See vhost_virtqueue_start() -> 
vhost_vdpa_set_vring_base() in qemu codes).

I assume that the clearing for vhost-vdpa would be done via (qemu code),

vhost_dev_start()->vhost_vdpa_dev_start()->vhost_vdpa_call(status | 
VIRTIO_CONFIG_S_DRIVER_OK)


which is _after_ vhost_virtqueue_start() gets called to restore the 
avail_idx to h/w in vhost_dev_start(). What am I missing here?


-Siwei



I think not. I thought clear_virtqueues() would clear the hardware index, but 
it looks like it does not. (I guess we need a better name than clear_virtqueues(); 
e.g. from the name it looks like it will clear the hardware state.)


Thanks







The issue is that the avail idx is forgotten; we need to keep it.

Thanks



  err = setup_driver(ndev);
  if (err) {
  mlx5_vdpa_warn(mvdev, "failed to setup driver\n");

Re: [PATCH v1] vdpa/mlx5: Restore the hardware used index after change map




On 2021/2/10 上午10:30, Si-Wei Liu wrote:



On 2/8/2021 10:37 PM, Jason Wang wrote:


On 2021/2/9 下午2:12, Eli Cohen wrote:

On Tue, Feb 09, 2021 at 11:20:14AM +0800, Jason Wang wrote:

On 2021/2/8 下午6:04, Eli Cohen wrote:

On Mon, Feb 08, 2021 at 05:04:27PM +0800, Jason Wang wrote:

On 2021/2/8 下午2:37, Eli Cohen wrote:

On Mon, Feb 08, 2021 at 12:27:18PM +0800, Jason Wang wrote:

On 2021/2/6 上午7:07, Si-Wei Liu wrote:

On 2/3/2021 11:36 PM, Eli Cohen wrote:
When a change of memory map occurs, the hardware resources 
are destroyed
and then re-created again with the new memory map. In such 
case, we need
to restore the hardware available and used indices. The 
driver failed to

restore the used index which is added here.

Also, since the driver also fails to reset the available and 
used
indices upon device reset, fix this here to avoid regression 
caused by

the fact that used index may not be zero upon device reset.

Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for 
supported mlx5

devices")
Signed-off-by: Eli Cohen
---
v0 -> v1:
Clear indices upon device reset

     drivers/vdpa/mlx5/net/mlx5_vnet.c | 18 ++
     1 file changed, 18 insertions(+)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 88dde3455bfd..b5fe6d2ad22f 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -87,6 +87,7 @@ struct mlx5_vq_restore_info {
     u64 device_addr;
     u64 driver_addr;
     u16 avail_index;
+    u16 used_index;
     bool ready;
     struct vdpa_callback cb;
     bool restore;
@@ -121,6 +122,7 @@ struct mlx5_vdpa_virtqueue {
     u32 virtq_id;
     struct mlx5_vdpa_net *ndev;
     u16 avail_idx;
+    u16 used_idx;
     int fw_state;
       /* keep last in the struct */
@@ -804,6 +806,7 @@ static int create_virtqueue(struct 
mlx5_vdpa_net

*ndev, struct mlx5_vdpa_virtque
       obj_context = MLX5_ADDR_OF(create_virtio_net_q_in, 
in,

obj_context);
     MLX5_SET(virtio_net_q_object, obj_context, 
hw_available_index,

mvq->avail_idx);
+    MLX5_SET(virtio_net_q_object, obj_context, hw_used_index,
mvq->used_idx);
     MLX5_SET(virtio_net_q_object, obj_context,
queue_feature_bit_mask_12_3,
get_features_12_3(ndev->mvdev.actual_features));
     vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context,
virtio_q_context);
@@ -1022,6 +1025,7 @@ static int connect_qps(struct 
mlx5_vdpa_net

*ndev, struct mlx5_vdpa_virtqueue *m
     struct mlx5_virtq_attr {
     u8 state;
     u16 available_index;
+    u16 used_index;
     };
       static int query_virtqueue(struct mlx5_vdpa_net *ndev, 
struct

mlx5_vdpa_virtqueue *mvq,
@@ -1052,6 +1056,7 @@ static int query_virtqueue(struct
mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueu
     memset(attr, 0, sizeof(*attr));
     attr->state = MLX5_GET(virtio_net_q_object, 
obj_context, state);

     attr->available_index = MLX5_GET(virtio_net_q_object,
obj_context, hw_available_index);
+    attr->used_index = MLX5_GET(virtio_net_q_object, 
obj_context,

hw_used_index);
     kfree(out);
     return 0;
     @@ -1535,6 +1540,16 @@ static void 
teardown_virtqueues(struct

mlx5_vdpa_net *ndev)
     }
     }
     +static void clear_virtqueues(struct mlx5_vdpa_net *ndev)
+{
+    int i;
+
+    for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {
+    ndev->vqs[i].avail_idx = 0;
+    ndev->vqs[i].used_idx = 0;
+    }
+}
+
     /* TODO: cross-endian support */
     static inline bool mlx5_vdpa_is_little_endian(struct 
mlx5_vdpa_dev

*mvdev)
     {
@@ -1610,6 +1625,7 @@ static int save_channel_info(struct
mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqu
     return err;
       ri->avail_index = attr.available_index;
+    ri->used_index = attr.used_index;
     ri->ready = mvq->ready;
     ri->num_ent = mvq->num_ent;
     ri->desc_addr = mvq->desc_addr;
@@ -1654,6 +1670,7 @@ static void restore_channels_info(struct
mlx5_vdpa_net *ndev)
     continue;
       mvq->avail_idx = ri->avail_index;
+    mvq->used_idx = ri->used_index;
     mvq->ready = ri->ready;
     mvq->num_ent = ri->num_ent;
     mvq->desc_addr = ri->desc_addr;
@@ -1768,6 +1785,7 @@ static void mlx5_vdpa_set_status(struct
vdpa_device *vdev, u8 status)
     if (!status) {
     mlx5_vdpa_info(mvdev, "performing device reset\n");
     teardown_driver(ndev);
+    clear_virtqueues(ndev);
The clearing looks fine at the first glance, as it aligns with 
the other
state cleanups floating around at the same place. However, the 
thing is
get_vq_state() is supposed to be called right after to get 
sync'ed with
the latest internal avail_index from device while vq is 
stopped. The
index was saved in the driver software at vq suspension, but 
before the
virtq object is destroyed. We shouldn't clear the avail_index 
too early.

Good point.

There's a limitation on the virtio spec

[PATCH v3] printk: fix deadlock when kernel panic

2021-02-09 Thread Muchun Song

printk_safe_flush_on_panic() caused the following deadlock on our
server:

CPU0: CPU1:
panic rcu_dump_cpu_stacks
  kdump_nmi_shootdown_cpus  nmi_trigger_cpumask_backtrace
register_nmi_handler(crash_nmi_callback)  printk_safe_flush
__printk_safe_flush
  
raw_spin_lock_irqsave(&read_lock)
// send NMI to other processors
apic_send_IPI_allbutself(NMI_VECTOR)
// NMI interrupt, dead 
loop
crash_nmi_callback
  printk_safe_flush_on_panic
printk_safe_flush
  __printk_safe_flush
// deadlock
raw_spin_lock_irqsave(&read_lock)

DEADLOCK: read_lock is taken on CPU1 and will never get released.

It happens when panic() stops a CPU by NMI while it has been in
the middle of printk_safe_flush().

Handle the lock the same way as logbuf_lock. The printk_safe buffers
are flushed only when both locks can be safely taken. It can avoid
the deadlock _in this particular case_ at expense of losing contents
of printk_safe buffers.

Note: It would actually be safe to re-init the locks when all CPUs were
  stopped by NMI. But it would require passing this information
  from arch-specific code. It is not worth the complexity.
  Especially because logbuf_lock and printk_safe buffers have been
  obsoleted by the lockless ring buffer.

Fixes: cf9b1106c81c ("printk/nmi: flush NMI messages on the system panic")
Signed-off-by: Muchun Song 
Reviewed-by: Petr Mladek 
Cc: 
---
 kernel/printk/printk_safe.c | 16 
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index a0e6f746de6c..2e9e3ed7d63e 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -45,6 +45,8 @@ struct printk_safe_seq_buf {
 static DEFINE_PER_CPU(struct printk_safe_seq_buf, safe_print_seq);
 static DEFINE_PER_CPU(int, printk_context);
 
+static DEFINE_RAW_SPINLOCK(safe_read_lock);
+
 #ifdef CONFIG_PRINTK_NMI
 static DEFINE_PER_CPU(struct printk_safe_seq_buf, nmi_print_seq);
 #endif
@@ -180,8 +182,6 @@ static void report_message_lost(struct printk_safe_seq_buf 
*s)
  */
 static void __printk_safe_flush(struct irq_work *work)
 {
-   static raw_spinlock_t read_lock =
-   __RAW_SPIN_LOCK_INITIALIZER(read_lock);
struct printk_safe_seq_buf *s =
container_of(work, struct printk_safe_seq_buf, work);
unsigned long flags;
@@ -195,7 +195,7 @@ static void __printk_safe_flush(struct irq_work *work)
 * different CPUs. This is especially important when printing
 * a backtrace.
 */
-   raw_spin_lock_irqsave(&read_lock, flags);
+   raw_spin_lock_irqsave(&safe_read_lock, flags);
 
i = 0;
 more:
@@ -232,7 +232,7 @@ static void __printk_safe_flush(struct irq_work *work)
 
 out:
report_message_lost(s);
-   raw_spin_unlock_irqrestore(&read_lock, flags);
+   raw_spin_unlock_irqrestore(&safe_read_lock, flags);
 }
 
 /**
@@ -278,6 +278,14 @@ void printk_safe_flush_on_panic(void)
raw_spin_lock_init(&logbuf_lock);
}
 
+   if (raw_spin_is_locked(&safe_read_lock)) {
+   if (num_online_cpus() > 1)
+   return;
+
+   debug_locks_off();
+   raw_spin_lock_init(&safe_read_lock);
+   }
+
printk_safe_flush();
 }
 
-- 
2.11.0

[PATCH] sched/autogroup: Use true and false for bool variable

2021-02-09 Thread Jiapeng Chong

Fix the following coccicheck warning:

kernel/sched/autogroup.h:46:8-9: WARNING: return of 0/1 in function
'task_group_is_autogroup' with return type bool.

Reported-by: Abaci Robot
Signed-off-by: Jiapeng Chong 
---
 kernel/sched/autogroup.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h
index b964199..238ac9e 100644
--- a/kernel/sched/autogroup.h
+++ b/kernel/sched/autogroup.h
@@ -43,7 +43,7 @@ static inline void autogroup_init(struct task_struct 
*init_task) {  }
 static inline void autogroup_free(struct task_group *tg) { }
 static inline bool task_group_is_autogroup(struct task_group *tg)
 {
-   return 0;
+   return false;
 }
 
 static inline struct task_group *
-- 
1.8.3.1

[PATCH v1] media: atomisp: fix compiler warning

2021-02-09 Thread Tong Zhang

should use %zu for size_t type, otherwise compiler will complain
drivers/staging/media/atomisp/pci/hmm/hmm.c:272:3: warning: format ‘%ld’ 
expects argument of type ‘long int’, but argument 6 has type ‘size_t’ {aka 
‘unsigned int’} [-Wformat=]
  272 |   "%s: pages: 0x%08x (%ld bytes), type: %d from highmem %d, user ptr 
%p, cached %d\n",
  |   
^~~

Signed-off-by: Tong Zhang 
---
 drivers/staging/media/atomisp/pci/hmm/hmm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/staging/media/atomisp/pci/hmm/hmm.c 
b/drivers/staging/media/atomisp/pci/hmm/hmm.c
index e0eaff0f8a22..6a5ee4607089 100644
--- a/drivers/staging/media/atomisp/pci/hmm/hmm.c
+++ b/drivers/staging/media/atomisp/pci/hmm/hmm.c
@@ -269,7 +269,7 @@ ia_css_ptr hmm_alloc(size_t bytes, enum hmm_bo_type type,
hmm_set(bo->start, 0, bytes);
 
dev_dbg(atomisp_dev,
-   "%s: pages: 0x%08x (%ld bytes), type: %d from highmem %d, user 
ptr %p, cached %d\n",
+   "%s: pages: 0x%08x (%zu bytes), type: %d from highmem %d, user 
ptr %p, cached %d\n",
__func__, bo->start, bytes, type, from_highmem, userptr, 
cached);
 
return bo->start;
-- 
2.25.1

Re: [Linuxarm] [PATCH for next v1 0/2] gpio: few clean up patches to replace spin_lock_irqsave with spin_lock

2021-02-09 Thread luojiaxing

On 2021/2/9 17:42, Andy Shevchenko wrote:

On Tue, Feb 9, 2021 at 11:24 AM luojiaxing wrote:

On 2021/2/8 21:28, Andy Shevchenko wrote:

On Mon, Feb 8, 2021 at 11:11 AM luojiaxing wrote:

Sorry, my operation error causes a patch missing from this patch set. I
re-send the patch set. Please check the new one.

What is the new one?! You have to give proper versioning and change
log for your series.

sure, I will send a new one later, but let me answer your question first.

On 2021/2/8 16:56, Luo Jiaxing wrote:

There is no need to use API with _irqsave in hard IRQ handler, So replace
those with spin_lock.

How do you know that another CPU in the system can't serve the

The keyword here is: *another*.

ooh, sorry, now I got your point.

As for me, I don't think another CPU can serve the IRQ while one CPU is
running the hard IRQ handler,

unless it's a per-CPU interrupt.

The following is a simple call logic when IRQ come.

elx_irq -> handle_arch_irq -> __handle_domain_irq -> desc->handle_irq ->
handle_irq_event

Assume that two CPUs receive the same IRQ and enter the preceding
process. Both of them will go to desc->handle_irq().

In handle_irq(), raw_spin_lock(&desc->lock) is always called first.
Therefore, even if two CPUs are running handle_irq(),

only one can get the spin lock. Assume that CPU A obtains the spin lock.
Then CPU A will set the status of irq_data to

IRQD_IRQ_INPROGRESS in handle_irq_event() and releases the spin lock.
Even though CPU B gets the spin lock later and

continue to run handle_irq(), but the check of irq_may_run(desc) causes
it to exit.

So, I think we don't have the situation where two CPUs serve the hard IRQ
handler at the same time.

following interrupt from the hardware at the same time?

Yes, I have some question before.

There are some similar discussion here, please take a look, Song baohua
explained it more professionally.

https://lore.kernel.org/lkml/e949a474a9284ac6951813bfc8b34...@hisilicon.com/

Here are some excerpts from the discussion:

I think the code disabling irq in hardIRQ is simply wrong.

Why?

I mention the following call before.

elx_irq -> handle_arch_irq -> __handle_domain_irq -> desc->handle_irq ->
handle_irq_event

__handle_domain_irq() will call irq_enter(), it ensures that the IRQ
processing of the current CPU can not be preempted.

So I think this is the reason why Song baohua said there is no need to
disable IRQs in a hardIRQ handler.

Since this commit
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e58aa3d2d0cc
genirq: Run irq handlers with interrupts disabled

interrupt handlers are definitely running in a irq-disabled context
unless irq handlers enable them explicitly in the handler to permit
other interrupts.

This doesn't explain any changes in the behaviour on SMP.
IRQ line can be disabled on a few stages:
a) on the source (IP that generates an event)
b) on IRQ router / controller
c) on CPU side

yes, you are right.

The commit above is discussing (rightfully!) the problem when all
interrupts are being served by a *single* core. Nobody prevents them
from being served by *different* cores simultaneously. Also, see [1].

[1]: https://www.kernel.org/doc/htmldocs/kernel-locking/cheatsheet.html

I check [1], quite useful description about locking, thanks. But you can

see Table of locking Requirements

Between IRQ Handler A and IRQ Handler A, there is no need for SLIS (spin_lock_irqsave).

Thanks

Jiaxing

Re: [PATCH 1/9] mm/migrate.c: Always allow device private pages to migrate

2021-02-09 Thread Alistair Popple

On Wednesday, 10 February 2021 12:39:32 AM AEDT Jason Gunthorpe wrote:
> On Tue, Feb 09, 2021 at 12:07:14PM +1100, Alistair Popple wrote:
> > Device private pages are used to represent device memory that is not
> > directly accessible from the CPU. Extra references to a device private
> > page are only used to ensure the struct page itself remains valid whilst
> > waiting for migration entries. Therefore extra references should not
> > prevent device private page migration as this can lead to failures to
> > migrate pages back to the CPU which are fatal to the user process.
> 
> This should identify the extra references in expected_count, just
> disabling this protection seems unsafe, ZONE_DEVICE is not so special
> that the refcount means nothing

This is similar to what migrate_vma_check_page() does now. The issue is that a 
migration wait takes a reference on the device private page so you can end up 
with one thread stuck waiting for migration whilst the other can't migrate due 
to the extra refcount.

Given device private pages can't undergo GUP and that it's not possible to 
differentiate the migration wait refcount from any other refcount we assume 
any possible extra reference must be from migration wait.

> Is this a side effect of the extra refcounts that Ralph was trying to
> get rid of? I'd rather see that work finished :)

I'd like to see that finished too but I don't think it would help here as this 
is not a side effect of that.

 - Alistair

> Jason

[PATCH v1] media: atomisp add auto selection to prevent ce

2021-02-09 Thread Tong Zhang

VIDEO_ATOMISP depends on VIDEO_V4L2_SUBDEV_API, if VIDEO_V4L2_SUBDEV_API
 is not selected, it will cause compilation error

drivers/staging/media/atomisp/pci/atomisp_cmd.c:6079:42: error:
 ‘struct v4l2_subdev_fh’ has no member named ‘pad’ atomisp_subdev_set_ffmt
(&asd->subdev, fh.pad, V4L2_SUBDEV_FORMAT_ACTIVE,

add auto select VIDEO_V4L2_SUBDEV_API if VIDEO_ATOMISP is selected

Signed-off-by: Tong Zhang 
---
 drivers/staging/media/atomisp/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/staging/media/atomisp/Kconfig 
b/drivers/staging/media/atomisp/Kconfig
index 37577bb72998..c34ef28d1ebc 100644
--- a/drivers/staging/media/atomisp/Kconfig
+++ b/drivers/staging/media/atomisp/Kconfig
@@ -15,6 +15,7 @@ config VIDEO_ATOMISP
depends on PMIC_OPREGION
select IOSF_MBI
select VIDEOBUF_VMALLOC
+   select VIDEO_V4L2_SUBDEV_API
help
  Say Y here if your platform supports Intel Atom SoC
  camera imaging subsystem.
-- 
2.25.1

Re: [PATCH] mmc: cb710: Use new tasklet API

2021-02-09 Thread Michał Mirosław

On Mon, Feb 08, 2021 at 02:45:51PM +0100, Emil Renner Berthing wrote:
> This converts the driver to use the new tasklet API introduced in
> commit 12cc923f1ccc ("tasklet: Introduce new initialization API")
> 
> Signed-off-by: Emil Renner Berthing 

Acked-by: Michał Mirosław 

> ---
>  drivers/mmc/host/cb710-mmc.c | 12 ++--
>  1 file changed, 6 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/mmc/host/cb710-mmc.c b/drivers/mmc/host/cb710-mmc.c
> index e84ed84ea4cc..6d623b2681c3 100644
> --- a/drivers/mmc/host/cb710-mmc.c
> +++ b/drivers/mmc/host/cb710-mmc.c
> @@ -646,14 +646,14 @@ static int cb710_mmc_irq_handler(struct cb710_slot 
> *slot)
>   return 1;
>  }
>  
> -static void cb710_mmc_finish_request_tasklet(unsigned long data)
> +static void cb710_mmc_finish_request_tasklet(struct tasklet_struct *t)
>  {
> - struct mmc_host *mmc = (void *)data;
> - struct cb710_mmc_reader *reader = mmc_priv(mmc);
> + struct cb710_mmc_reader *reader = from_tasklet(reader, t,
> +finish_req_tasklet);
>   struct mmc_request *mrq = reader->mrq;
>  
>   reader->mrq = NULL;
> - mmc_request_done(mmc, mrq);
> + mmc_request_done(mmc_from_priv(reader), mrq);
>  }
>  
>  static const struct mmc_host_ops cb710_mmc_host = {
> @@ -718,8 +718,8 @@ static int cb710_mmc_init(struct platform_device *pdev)
>  
>   reader = mmc_priv(mmc);
>  
> - tasklet_init(&reader->finish_req_tasklet,
> - cb710_mmc_finish_request_tasklet, (unsigned long)mmc);
> + tasklet_setup(&reader->finish_req_tasklet,
> +   cb710_mmc_finish_request_tasklet);
>   spin_lock_init(&reader->irq_lock);
>   cb710_dump_regs(chip, CB710_DUMP_REGS_MMC);
>

Re: [ANNOUNCE] exfatprogs-1.1.0 version released

2021-02-09 Thread Sedat Dilek

On Wed, Feb 10, 2021 at 12:50 AM Namjae Jeon  wrote:
>
> Hi folk,
>
> We have released exfatprogs 1.1.0 version. In this release, exfatlabel
> has been added to print or re-write volume label and volume serial value.
> Also, A new dump.exfat util has been added to display statistics from
> a given device(Requested by Mike Fleetwood(GParted Developer)).
>
> Any feedback is welcome!:)
>

Congrats on the new release and thanks to everyone involved.

Hope Sven will do a new release for Debian.
( Note that Debian/bullseye release  plans "Milestone 2" this Friday,
February 12th (see [1] > "Key release dates" > "[2021-Feb-12] Soft
Freeze"). Dunno which impact this might have on this. )

- Sedat -

[1] https://release.debian.org/


> CHANGES :
>  * fsck.exfat: Recover corrupted boot region.
>
> NEW FEATURES :
>  * exfatlabel: Print or set volume label and serial.
>  * dump.exfat: Show the on-disk metadata information and the statistics.
>
> BUG FIXES :
>  * Set _FILE_OFFSET_BITS=64 for Android build.
>
> The git tree is at:
>   https://github.com/exfatprogs/exfatprogs
>
> The tarballs can be found at:
>   
> https://github.com/exfatprogs/exfatprogs/releases/download/1.1.0/exfatprogs-1.1.0.tar.gz
>

[GIT] Networking

2021-02-09 Thread David Miller



Another pile of networking fixes:

1) ath9k build error fix from Arnd Bergmann

2) dma memory leak fix in mediatek driver from Lorenzo Bianconi.

3) bpf int3 kprobe fix from Alexei Starovoitov.

4) bpf stackmap integer overflow fix from Bui Quang Minh.

5) Add usb device ids for Cinterion MV31 to qmi_wwan driver, from
   Christoph Schemmel.

6) Don't update deleted entry in xt_recent netfilter module, from Jozsef 
Kadlecsik.

7) Use after free in nftables, fix from Pablo Neira Ayuso.

8) Header checksum fix in flowtable from Sven Auhagen.

9) Validate user controlled length in qrtr code, from Sabyrzhan Tasbolatov.

10) Fix race in xen/netback, from Juergen Gross.

11) New device ID in cxgb4, from Raju Rangoju.

12) Fix ring locking in rxrpc release call, from David Howells.

13) Don't return LAPB error codes from x25_open(), from Xie He.

14) Missing error returns in gsi_channel_setup() from Alex Elder.

15) Get skb_copy_and_csum_datagram working properly with odd segment sizes,
from Willem de Bruijn.

16) Missing RFS/RSS table init in enetc driver, from Vladimir Oltean.

17) Do teardown on probe failure in DSA, from Vladimir Oltean.

18) Fix compilation failures of txtimestamp selftest, from Vadim Fedorenko.

19) Limit rx per-napi gro queue size to fix latency regression,  from Eric 
Dumazet.

20) dpaa_eth xdp fixes from Camelia Groza.

21) Missing txq mode update when switching CBS off, in stmmac driver,
from Mohammad Athari Bin Ismail.

22) Failover pending logic fix in ibmvnic driver, from Sukadev Bhattiprolu.

23) Null deref fix in vmw_vsock, from Norbert Slusarek.

24) Missing verdict update in xdp paths of ena driver, from Shay Agroskin.

25) seq_file iteration fix in sctp from Neil Brown.

26) bpf 32-bit src register truncation fix on div/mod, from Daniel Borkmann.

27) Fix jmp32 pruning in bpf verifier, from  Daniel Borkmann.

28) Fix locking in vsock_shutdown(),  from Stefano Garzarella.

29) Various missing index bound checks in hns3 driver, from Yufeng Mo.

30) Flush ports on .phylink_mac_link_down() in dsa felix driver, from Vladimir 
Oltean.

31) Don't mix up stp and mrp port states in bridge layer, from Horatiu Vultur.

32) Fix locking during netif_tx_disable(), from Edwin Peer.

Please pull, thanks a lot!

The following changes since commit 3aaf0a27ffc29b19a62314edd684b9bc6346f9a8:

  Merge tag 'clang-format-for-linux-v5.11-rc7' of git://github.com/ojeda/linux 
(2021-02-02 10:46:59 -0800)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git 

for you to fetch changes up to b8776f14a47046796fe078c4a2e691f58e00ae06:

  Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf (2021-02-09 
18:55:17 -0800)


Alex Elder (1):
  net: ipa: set error code in gsi_channel_setup()

Alexei Starovoitov (1):
  bpf: Unbreak BPF_PROG_TYPE_KPROBE when kprobe is called via do_int3

Andrea Parri (Microsoft) (1):
  hv_netvsc: Reset the RSC count if NVSP_STAT_FAIL in netvsc_receive()

Arnd Bergmann (1):
  ath9k: fix build error with LEDS_CLASS=m

Bui Quang Minh (1):
  bpf: Check for integer overflow when using roundup_pow_of_two()

Camelia Groza (3):
  dpaa_eth: reserve space for the xdp_frame under the A050385 erratum
  dpaa_eth: reduce data alignment requirements for the A050385 erratum
  dpaa_eth: try to move the data in place for the A050385 erratum

Christoph Schemmel (1):
  NET: usb: qmi_wwan: Adding support for Cinterion MV31

Daniel Borkmann (3):
  bpf: Fix verifier jsgt branch analysis on max bound
  bpf: Fix verifier jmp32 pruning decision logic
  bpf: Fix 32 bit src register truncation on div/mod

David Howells (1):
  rxrpc: Fix clearance of Tx/Rx ring when releasing a call

David S. Miller (4):
  Merge branch 'bridge-mrp'
  Merge branch 'hns3-fixes'
  Merge git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf
  Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf

Edwin Peer (1):
  net: watchdog: hold device global xmit lock during tx disable

Eric Dumazet (1):
  net: gro: do not keep too many GRO packets in napi->rx_list

Fabian Frederick (1):
  selftests: netfilter: fix current year

Florian Westphal (1):
  netfilter: conntrack: skip identical origin tuple in same zone only

Horatiu Vultur (2):
  bridge: mrp: Fix the usage of br_mrp_port_switchdev_set_state
  switchdev: mrp: Remove SWITCHDEV_ATTR_ID_MRP_PORT_STAT

Jakub Kicinski (3):
  Merge git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf
  Merge branch 'dpaa_eth-a050385-erratum-workaround-fixes-under-xdp'
  Merge tag 'wireless-drivers-2021-02-05' of 
git://git.kernel.org/pub/scm/linux/kernel/git/kvalo/wireless-drivers

Jozsef Kadlecsik (1):
  netfilter: xt_recent: Fix attempt to update deleted entry

Juergen Gross (1):
  xen/netback: avoid race in xenvif_rx_ring_slots_available()

Lorenzo Biancon

Re: [PATCH v2 06/28] locking/rwlocks: Add contention detection for rwlocks

2021-02-09 Thread Waiman Long


On 2/2/21 1:57 PM, Ben Gardon wrote:

rwlocks do not currently have any facility to detect contention
like spinlocks do. In order to allow users of rwlocks to better manage
latency, add contention detection for queued rwlocks.

CC: Ingo Molnar 
CC: Will Deacon 
Acked-by: Peter Zijlstra 
Acked-by: Davidlohr Bueso 
Acked-by: Waiman Long 
Acked-by: Paolo Bonzini 
Signed-off-by: Ben Gardon 
---
  include/asm-generic/qrwlock.h | 24 ++--
  include/linux/rwlock.h|  7 +++
  2 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/include/asm-generic/qrwlock.h b/include/asm-generic/qrwlock.h
index 84ce841ce735..0020d3b820a7 100644
--- a/include/asm-generic/qrwlock.h
+++ b/include/asm-generic/qrwlock.h
@@ -14,6 +14,7 @@
  #include 
  
  #include 

+#include 


As said in another thread, qspinlock and qrwlock can be independently 
enabled for an architecture. So we shouldn't include qspinlock.h here. 
Instead, just include the regular linux/spinlock.h file to make sure 
that arch_spin_is_locked() is available.



  
  /*

   * Writer states & reader shift and bias.
@@ -116,15 +117,26 @@ static inline void queued_write_unlock(struct qrwlock 
*lock)
smp_store_release(&lock->wlocked, 0);
  }
  
+/**

+ * queued_rwlock_is_contended - check if the lock is contended
+ * @lock : Pointer to queue rwlock structure
+ * Return: 1 if lock contended, 0 otherwise
+ */
+static inline int queued_rwlock_is_contended(struct qrwlock *lock)
+{
+   return arch_spin_is_locked(&lock->wait_lock);
+}
+
  /*
   * Remapping rwlock architecture specific functions to the corresponding
   * queue rwlock functions.
   */
-#define arch_read_lock(l)  queued_read_lock(l)
-#define arch_write_lock(l) queued_write_lock(l)
-#define arch_read_trylock(l)   queued_read_trylock(l)
-#define arch_write_trylock(l)  queued_write_trylock(l)
-#define arch_read_unlock(l)queued_read_unlock(l)
-#define arch_write_unlock(l)   queued_write_unlock(l)
+#define arch_read_lock(l)  queued_read_lock(l)
+#define arch_write_lock(l) queued_write_lock(l)
+#define arch_read_trylock(l)   queued_read_trylock(l)
+#define arch_write_trylock(l)  queued_write_trylock(l)
+#define arch_read_unlock(l)queued_read_unlock(l)
+#define arch_write_unlock(l)   queued_write_unlock(l)
+#define arch_rwlock_is_contended(l)queued_rwlock_is_contended(l)
  
  #endif /* __ASM_GENERIC_QRWLOCK_H */

diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h
index 3dcd617e65ae..7ce9a51ae5c0 100644
--- a/include/linux/rwlock.h
+++ b/include/linux/rwlock.h
@@ -128,4 +128,11 @@ do {   
\
1 : ({ local_irq_restore(flags); 0; }); \
  })
  
+#ifdef arch_rwlock_is_contended

+#define rwlock_is_contended(lock) \
+arch_rwlock_is_contended(&(lock)->raw_lock)
+#else
+#define rwlock_is_contended(lock)  ((void)(lock), 0)
+#endif /* arch_rwlock_is_contended */
+
  #endif /* __LINUX_RWLOCK_H */


Cheers,
Longman

[PATCH v2] perf tools: Fix arm64 build error with gcc-11

2021-02-09 Thread Jianlin Lv

gcc version: 11.0.0 20210208 (experimental) (GCC)

Following build error on arm64:

...
In function ‘printf’,
inlined from ‘regs_dump__printf’ at util/session.c:1141:3,
inlined from ‘regs__printf’ at util/session.c:1169:2:
/usr/include/aarch64-linux-gnu/bits/stdio2.h:107:10: \
  error: ‘%-5s’ directive argument is null [-Werror=format-overflow=]

107 |   return __printf_chk (__USE_FORTIFY_LEVEL - 1, __fmt, \
__va_arg_pack ());

..
In function ‘fprintf’,
  inlined from ‘perf_sample__fprintf_regs.isra’ at \
builtin-script.c:622:14:
/usr/include/aarch64-linux-gnu/bits/stdio2.h:100:10: \
error: ‘%5s’ directive argument is null [-Werror=format-overflow=]
  100 |   return __fprintf_chk (__stream, __USE_FORTIFY_LEVEL - 1, __fmt,
  101 | __va_arg_pack ());

cc1: all warnings being treated as errors
...

This patch fixes the -Wformat-overflow warnings by adding a ternary operator
that evaluates to "Unknown" when reg_name == NULL.

Signed-off-by: Jianlin Lv 
---
v2: Add ternary operator to avoid similar errors in other arch.
---
 tools/perf/builtin-script.c| 4 +++-
 tools/perf/util/scripting-engines/trace-event-python.c | 4 +++-
 tools/perf/util/session.c  | 5 +++--
 3 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 42dad4a0f8cf..d59da3a063d0 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -635,6 +635,7 @@ static int perf_sample__fprintf_regs(struct regs_dump 
*regs, uint64_t mask,
 {
unsigned i = 0, r;
int printed = 0;
+   const char *reg_name;
 
if (!regs || !regs->regs)
return 0;
@@ -643,7 +644,8 @@ static int perf_sample__fprintf_regs(struct regs_dump 
*regs, uint64_t mask,
 
for_each_set_bit(r, (unsigned long *) &mask, sizeof(mask) * 8) {
u64 val = regs->regs[i++];
-   printed += fprintf(fp, "%5s:0x%"PRIx64" ", perf_reg_name(r), 
val);
+   reg_name = perf_reg_name(r);
+   printed += fprintf(fp, "%5s:0x%"PRIx64" ", reg_name ?: 
"Unknown", val);
}
 
return printed;
diff --git a/tools/perf/util/scripting-engines/trace-event-python.c 
b/tools/perf/util/scripting-engines/trace-event-python.c
index c83c2c6564e0..e1222cc6a699 100644
--- a/tools/perf/util/scripting-engines/trace-event-python.c
+++ b/tools/perf/util/scripting-engines/trace-event-python.c
@@ -691,6 +691,7 @@ static int regs_map(struct regs_dump *regs, uint64_t mask, 
char *bf, int size)
 {
unsigned int i = 0, r;
int printed = 0;
+   const char *reg_name;
 
bf[0] = 0;
 
@@ -700,9 +701,10 @@ static int regs_map(struct regs_dump *regs, uint64_t mask, 
char *bf, int size)
for_each_set_bit(r, (unsigned long *) &mask, sizeof(mask) * 8) {
u64 val = regs->regs[i++];
 
+   reg_name = perf_reg_name(r);
printed += scnprintf(bf + printed, size - printed,
 "%5s:0x%" PRIx64 " ",
-perf_reg_name(r), val);
+reg_name ?: "Unknown", val);
}
 
return printed;
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 25adbcce0281..1058d8487e98 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -1135,12 +1135,13 @@ static void branch_stack__printf(struct perf_sample 
*sample, bool callstack)
 static void regs_dump__printf(u64 mask, u64 *regs)
 {
unsigned rid, i = 0;
+   const char *reg_name;
 
for_each_set_bit(rid, (unsigned long *) &mask, sizeof(mask) * 8) {
u64 val = regs[i++];
-
+   reg_name = perf_reg_name(rid);
printf(" %-5s 0x%016" PRIx64 "\n",
-  perf_reg_name(rid), val);
+  reg_name ?: "Unknown", val);
}
 }
 
-- 
2.25.1

[PATCH] crypto: amlogic - Fix unnecessary check in meson_crypto_probe()

2021-02-09 Thread Tang Bin

The function meson_crypto_probe() is only called with an openfirmware
platform device. Therefore there is no need to check that the passed
in device is NULL.

Signed-off-by: Tang Bin 
---
 drivers/crypto/amlogic/amlogic-gxl-core.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/crypto/amlogic/amlogic-gxl-core.c 
b/drivers/crypto/amlogic/amlogic-gxl-core.c
index 466552acb..468a16f62 100644
--- a/drivers/crypto/amlogic/amlogic-gxl-core.c
+++ b/drivers/crypto/amlogic/amlogic-gxl-core.c
@@ -229,9 +229,6 @@ static int meson_crypto_probe(struct platform_device *pdev)
struct meson_dev *mc;
int err, i;
 
-   if (!pdev->dev.of_node)
-   return -ENODEV;
-
mc = devm_kzalloc(&pdev->dev, sizeof(*mc), GFP_KERNEL);
if (!mc)
return -ENOMEM;
-- 
2.20.1.windows.1

linux-next: build failure after merge of the drm-misc tree

2021-02-09 Thread Stephen Rothwell

Hi all,

After merging the drm-misc tree, today's linux-next build (x86_64
allmodconfig) failed like this:

drivers/gpu/drm/v3d/v3d_sched.c:263:1: error: return type is an incomplete type
  263 | v3d_gpu_reset_for_timeout(struct v3d_dev *v3d, struct drm_sched_job 
*sched_job)
  | ^
drivers/gpu/drm/v3d/v3d_sched.c: In function 'v3d_gpu_reset_for_timeout':
drivers/gpu/drm/v3d/v3d_sched.c:289:9: error: 'return' with a value, in 
function returning void [-Werror=return-type]
  289 |  return DRM_GPU_SCHED_STAT_NOMINAL;
  | ^~
drivers/gpu/drm/v3d/v3d_sched.c:263:1: note: declared here
  263 | v3d_gpu_reset_for_timeout(struct v3d_dev *v3d, struct drm_sched_job 
*sched_job)
  | ^
drivers/gpu/drm/v3d/v3d_sched.c: At top level:
drivers/gpu/drm/v3d/v3d_sched.c:298:1: error: return type is an incomplete type
  298 | v3d_cl_job_timedout(struct drm_sched_job *sched_job, enum v3d_queue q,
  | ^~~
drivers/gpu/drm/v3d/v3d_sched.c: In function 'v3d_cl_job_timedout':
drivers/gpu/drm/v3d/v3d_sched.c:309:10: error: 'return' with a value, in 
function returning void [-Werror=return-type]
  309 |   return DRM_GPU_SCHED_STAT_NOMINAL;
  |  ^~
drivers/gpu/drm/v3d/v3d_sched.c:298:1: note: declared here
  298 | v3d_cl_job_timedout(struct drm_sched_job *sched_job, enum v3d_queue q,
  | ^~~
drivers/gpu/drm/v3d/v3d_sched.c: At top level:
drivers/gpu/drm/v3d/v3d_sched.c:316:1: error: return type is an incomplete type
  316 | v3d_bin_job_timedout(struct drm_sched_job *sched_job)
  | ^~~~
drivers/gpu/drm/v3d/v3d_sched.c:325:1: error: return type is an incomplete type
  325 | v3d_render_job_timedout(struct drm_sched_job *sched_job)
  | ^~~
drivers/gpu/drm/v3d/v3d_sched.c:334:1: error: return type is an incomplete type
  334 | v3d_generic_job_timedout(struct drm_sched_job *sched_job)
  | ^~~~
drivers/gpu/drm/v3d/v3d_sched.c:342:1: error: return type is an incomplete type
  342 | v3d_csd_job_timedout(struct drm_sched_job *sched_job)
  | ^~~~
drivers/gpu/drm/v3d/v3d_sched.c: In function 'v3d_csd_job_timedout':
drivers/gpu/drm/v3d/v3d_sched.c:353:10: error: 'return' with a value, in 
function returning void [-Werror=return-type]
  353 |   return DRM_GPU_SCHED_STAT_NOMINAL;
  |  ^~
drivers/gpu/drm/v3d/v3d_sched.c:342:1: note: declared here
  342 | v3d_csd_job_timedout(struct drm_sched_job *sched_job)
  | ^~~~
drivers/gpu/drm/v3d/v3d_sched.c: At top level:
drivers/gpu/drm/v3d/v3d_sched.c:362:18: error: initialization of 'enum 
drm_gpu_sched_stat (*)(struct drm_sched_job *)' from incompatible pointer type 
'void (*)(struct drm_sched_job *)' [-Werror=incompatible-pointer-types]
  362 |  .timedout_job = v3d_bin_job_timedout,
  |  ^~~~
drivers/gpu/drm/v3d/v3d_sched.c:362:18: note: (near initialization for 
'v3d_bin_sched_ops.timedout_job')
drivers/gpu/drm/v3d/v3d_sched.c:369:18: error: initialization of 'enum 
drm_gpu_sched_stat (*)(struct drm_sched_job *)' from incompatible pointer type 
'void (*)(struct drm_sched_job *)' [-Werror=incompatible-pointer-types]
  369 |  .timedout_job = v3d_render_job_timedout,
  |  ^~~
drivers/gpu/drm/v3d/v3d_sched.c:369:18: note: (near initialization for 
'v3d_render_sched_ops.timedout_job')
drivers/gpu/drm/v3d/v3d_sched.c:376:18: error: initialization of 'enum 
drm_gpu_sched_stat (*)(struct drm_sched_job *)' from incompatible pointer type 
'void (*)(struct drm_sched_job *)' [-Werror=incompatible-pointer-types]
  376 |  .timedout_job = v3d_generic_job_timedout,
  |  ^~~~
drivers/gpu/drm/v3d/v3d_sched.c:376:18: note: (near initialization for 
'v3d_tfu_sched_ops.timedout_job')
drivers/gpu/drm/v3d/v3d_sched.c:383:18: error: initialization of 'enum 
drm_gpu_sched_stat (*)(struct drm_sched_job *)' from incompatible pointer type 
'void (*)(struct drm_sched_job *)' [-Werror=incompatible-pointer-types]
  383 |  .timedout_job = v3d_csd_job_timedout,
  |  ^~~~
drivers/gpu/drm/v3d/v3d_sched.c:383:18: note: (near initialization for 
'v3d_csd_sched_ops.timedout_job')
drivers/gpu/drm/v3d/v3d_sched.c:390:18: error: initialization of 'enum 
drm_gpu_sched_stat (*)(struct drm_sched_job *)' from incompatible pointer type 
'void (*)(struct drm_sched_job *)' [-Werror=incompatible-pointer-types]
  390 |  .timedout_job = v3d_generic_job_timedout,
  |  ^~~~
drivers/gpu/drm/v3d/v3d_sched.c:390:18: note: (near initialization for 
'v3d_c

Re: [PATCH v3 1/2] dt-bindings: input/touchscreen: add bindings for msg26xx

2021-02-09 Thread Jeff LaBundy

Hi Vincent,

On Tue, Feb 09, 2021 at 07:58:33PM +0100, Vincent Knecht wrote:
> Le mardi 09 février 2021 à 10:13 -0600, Rob Herring a écrit :
> > On Thu, Jan 21, 2021 at 06:43:47PM +0100, Vincent Knecht wrote:
> > > This adds dts bindings for the mstar msg26xx touchscreen.
> > > 
> > > Signed-off-by: Vincent Knecht 
> > > ---
> > > Changed in v3:
> > > - added `touchscreen-size-x: true` and `touchscreen-size-y: true` 
> > > properties
> > > Changed in v2:
> > > - changed M-Star to MStar in title line
> > > - changed reset gpio to active-low in example section
> > > ---
> > >  .../input/touchscreen/mstar,msg26xx.yaml  | 69 +++
> > >  1 file changed, 69 insertions(+)
> > >  create mode 100644 
> > > Documentation/devicetree/bindings/input/touchscreen/mstar,msg26xx.yaml
> > > 
> > > diff --git 
> > > a/Documentation/devicetree/bindings/input/touchscreen/mstar,msg26xx.yaml
> > > b/Documentation/devicetree/bindings/input/touchscreen/mstar,msg26xx.yaml
> > > new file mode 100644
> > > index ..5d26a1008bf1
> > > --- /dev/null
> > > +++ 
> > > b/Documentation/devicetree/bindings/input/touchscreen/mstar,msg26xx.yaml
> > > @@ -0,0 +1,69 @@
> > > +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
> > > +%YAML 1.2
> > > +---
> > > +$id: http://devicetree.org/schemas/input/touchscreen/mstar,msg26xx.yaml#
> > > +$schema: http://devicetree.org/meta-schemas/core.yaml#
> > > +
> > > +title: MStar msg26xx touchscreen controller Bindings
> > > +
> > > +maintainers:
> > > +  - Vincent Knecht 
> > > +
> > > +allOf:
> > > +  - $ref: touchscreen.yaml#
> > > +
> > > +properties:
> > > +  compatible:
> > > +    const: mstar,msg26xx
> > 
> > Don't use wildcards in compatible strings.
> 
> Thank you for the input...
> 
> Let's say I set it to "mstar,msg2638", is it better to rename the driver file 
> and functions too ?
> According to downstream source file naming, msg2638 is the model I have and 
> test this driver with.

This is ultimately Dmitry's call, but it's fairly common to use wildcards
for driver names and function calls if the driver is known to work across
all devices that fit in the wildcard (see iqs5xx and many others).

The risk with wildcards, however, is that vendors can introduce different
devices later with similar part numbers. Therefore, some subsystems (e.g.
iio) tend to frown upon wildcards for that reason.

You should try and make the driver cover as many devices as possible. But
if the driver is only known to work for one device then I don't think you
can use a wildcard in the name unless you support all other devices (just
my opinion).

In either case, however, compatible strings must be unique just as with a
part number in a schematic or bill of materials. As such, it is perfectly
fine to have multiple compatible strings in a single driver.

> 
> 
> There's a possibility this driver works as-is or with minor mods for msg2633 
> too,
> and a more remote one for msg21xx and msg22xx...
> 

Kind regards,
Jeff LaBundy

Re: [PATCH v2 2/2] drivers/clocksource: Fixup csky,mptimer compile error with CPU_CK610

2021-02-09 Thread Guo Ren

Hi Daniel,

On Wed, Feb 10, 2021 at 4:26 AM Daniel Lezcano
 wrote:
>
> On 09/02/2021 17:02, Guo Ren wrote:
> > Hi Daniel,
> >
> > On Sun, Feb 7, 2021 at 5:29 PM Daniel Lezcano  
> > wrote:
> >>
> >> On 07/02/2021 04:31, Guo Ren wrote:
> >>> Hi Daniel,
> >>>
> >>> On Thu, Feb 4, 2021 at 4:48 PM Daniel Lezcano  
> >>> wrote:
> 
>  On 04/02/2021 08:46, guo...@kernel.org wrote:
> > From: Guo Ren 
> >
> > The timer-mp-csky.c can only support CPU_CK860 and it fails to
> > compile with CPU_CK610.
> >
> > It has been selected in arch/csky/Kconfig.
> 
>  It would be better if you fix the root cause of the compilation error.
> >>> The timer-mp-csky.c uses specific instructions which are only
> >>> supported by CK860, and timer-mp-csky.c is only designed for CK860.
> >>
> >> I guess you are referring to mfcr() ?
> >>
> >>> In arch/csky/Kconfig we only select it with CK860.
> >>> select CSKY_MPINTC if CPU_CK860
> >>> select CSKY_MP_TIMER if CPU_CK860
> >>>
> >>> So here let's select timer-mp-csky.c in arch/csky/Kconfig, not in
> >>> drivers/clocksource/Kconfig.
> >>
> >> The COMPILE_TEST option is there to let other architectures compile
> >> drivers and increase the compilation test coverage.
> >>
> >> The proposed change just removes the driver from this coverage.
> > When we compile the csky arch with C860, it will be selected.
> >
> >>
> >> Ideally, it would be better to keep it with the COMPILE_TEST option, so
> >> changes impacting all the drivers can be caught before submitting the
> >> patches.
> >>
> >> By just adding
> >>
> >> #ifndef mfcr
> >> #define mfcr(a) 0
> >> #endif
> >
> > 610 couldn't support CSKY_MP_TIMER and it's only for 860. So it's not
> > a coding skill issue.
>
> I think there is a misunderstanding.
>
> When I want to compile on x64 all the timer drivers, I do enable
> COMPILE_TEST, then the strings appear and the drivers can be selected.
>
> If the COMPILE_TEST is not enabled, the string does not appear, it is
> not possible to enable/disable it and the platform must enable it from
> the aforementioned arch/csky/Kconfig.
>
> Actually, the timer drivers policy is : drivers can not be enabled from
> the drivers/clocksource/Kconfig, it is up to the platform Kconfig to
> select them. The exception is when the COMPILE_TEST option is set for
> testing purpose.
>
> The timer must compile on any other archs and the stubs for the platform
> specific calls must be provided.
>
> Did I miss something with your changes ?
I think our biggest difference is:
 - You think that CSKY_MPTIMER should not be tied to the
architecture, and should be compilable on any architecture.
 - But I think CSKY_MPTIMER can only be compiled for CSKY C860.

But from the perspective of easy maintenance, I agree with your
suggestion. I will adopt it in the next patch:
> >> #ifndef mfcr
> >> #define mfcr(a) 0
> >> #endif

Thx




--
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/

[PATCH] virtio-mmio: Use to_virtio_mmio_device() to simplify code

2021-02-09 Thread Tang Bin

The file virtio_mmio.c already defines the macro to_virtio_mmio_device(),
so use it instead of open-coding container_of() to simplify the code. Also
remove superfluous blank lines in this file.

Signed-off-by: Tang Bin 
---
 drivers/virtio/virtio_mmio.c | 16 +---
 1 file changed, 1 insertion(+), 15 deletions(-)

diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c
index 627ac0487..449d0f209 100644
--- a/drivers/virtio/virtio_mmio.c
+++ b/drivers/virtio/virtio_mmio.c
@@ -69,14 +69,10 @@
 #include 
 #include 
 
-
-
 /* The alignment to use between consumer and producer parts of vring.
  * Currently hardcoded to the page size. */
 #define VIRTIO_MMIO_VRING_ALIGNPAGE_SIZE
 
-
-
 #define to_virtio_mmio_device(_plat_dev) \
container_of(_plat_dev, struct virtio_mmio_device, vdev)
 
@@ -100,8 +96,6 @@ struct virtio_mmio_vq_info {
struct list_head node;
 };
 
-
-
 /* Configuration interface */
 
 static u64 vm_get_features(struct virtio_device *vdev)
@@ -264,8 +258,6 @@ static void vm_reset(struct virtio_device *vdev)
writel(0, vm_dev->base + VIRTIO_MMIO_STATUS);
 }
 
-
-
 /* Transport interface */
 
 /* the notify function used when creating a virt queue */
@@ -307,8 +299,6 @@ static irqreturn_t vm_interrupt(int irq, void *opaque)
return ret;
 }
 
-
-
 static void vm_del_vq(struct virtqueue *vq)
 {
struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vq->vdev);
@@ -512,13 +502,11 @@ static const struct virtio_config_ops 
virtio_mmio_config_ops = {
.bus_name   = vm_bus_name,
 };
 
-
 static void virtio_mmio_release_dev(struct device *_d)
 {
struct virtio_device *vdev =
container_of(_d, struct virtio_device, dev);
-   struct virtio_mmio_device *vm_dev =
-   container_of(vdev, struct virtio_mmio_device, vdev);
+   struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
struct platform_device *pdev = vm_dev->pdev;
 
devm_kfree(&pdev->dev, vm_dev);
@@ -608,8 +596,6 @@ static int virtio_mmio_remove(struct platform_device *pdev)
return 0;
 }
 
-
-
 /* Devices list parameter */
 
 #if defined(CONFIG_VIRTIO_MMIO_CMDLINE_DEVICES)
-- 
2.20.1.windows.1

Re: [PATCH 1/2] ext4: Handle casefolding with encryption

2021-02-09 Thread Andreas Dilger

On Feb 9, 2021, at 4:22 PM, Theodore Ts'o  wrote:
> 
> On Wed, Feb 03, 2021 at 11:31:28AM -0500, Theodore Ts'o wrote:
>> On Wed, Feb 03, 2021 at 03:55:06AM -0700, Andreas Dilger wrote:
>>> 
>>> It looks like this change will break the dirdata feature, which is similarly
>>> storing a data field beyond the end of the dirent. However, that feature 
>>> also
>>> provides for flags stored in the high bits of the type field to indicate
>>> which of the fields are in use there.
>>> The first byte of each field stores
>>> the length, so it can be skipped even if the content is not understood.
>> 
>> Daniel, for context, the dirdata field is an out-of-tree feature which
>> is used by Lustre, and so has fairly large deployed base.  So if there
>> is a way that we can accommodate not breaking dirdata, that would be
>> good.
>> 
>> Did the ext4 casefold+encryption implementation escape out to any
>> Android handsets?
> 
> So from an OOB chat with Daniel, it appears that the ext4
> casefold+encryption implementation did in fact escape out to Android
> handsets.  So I think what we will need to do, ultimately, is support
> one way of supporting the casefold IV in the case where "encryption &&
> casefold", and another way when "encryption && casefold && dirdata".
> 
> That's going to be a bit sucky, but I don't think it should be that
> complex.  Daniel, Andreas, does that make sense to you?

I was just going to ping you about this, whether it made sense to remove
this feature addition from the "maint" branch (i.e. make a 1.45.8 without
it), and keep it only in 1.46 or "next" to reduce its spread?

Depending on the size of the "escape", it probably makes sense to move
toward having e2fsck migrate from the current mechanism to using dirdata
for all deployments.  In the current implementation, tools don't really
know for sure if there is data beyond the filename in the dirent or not.

I guess it is implicit with the casefold+encryption case for dirents in
directories that have the encryption flag set in a filesystem that also
has casefold enabled, but it's definitely not friendly to these features
being enabled on an existing filesystem.

For example, what if casefold is enabled on an existing filesystem that
already has an encrypted directory?  Does the code _assume_ that there is
a hash beyond the name if the rec_len is long enough for this?  There will
definitely be some pre-existing dirents that will have a large rec_len
(e.g. those at the end of the block, or with deleted entries immediately
following), that do *not* have the proper hash stored in them.  There may
be random garbage at the end of the dirent, and since every value in the
hash is valid, there is no way to know whether it is good or bad.

With the dirdata mechanism, there would be a bit set in the "file_type"
field that will indicate if the hash was present, as well as a length
field (0x08) that is a second confirmation that this field is valid.

Cheers, Andreas

signature.asc
Description: Message signed with OpenPGP

Re: INFO: task hung in io_uring_cancel_task_requests

2021-02-09 Thread Jens Axboe

On 2/9/21 7:57 PM, Pavel Begunkov wrote:
> On 10/02/2021 00:54, syzbot wrote:
>> Hello,
>>
>> syzbot found the following issue on:
>>
>> HEAD commit:dd86e7fa Merge tag 'pci-v5.11-fixes-2' of git://git.kernel..
>> git tree:   upstream
>> console output: https://syzkaller.appspot.com/x/log.txt?x=13e43f90d0
>> kernel config:  https://syzkaller.appspot.com/x/.config?x=e83e68d0a6aba5f6
>> dashboard link: https://syzkaller.appspot.com/bug?extid=695b03d82fa8e4901b06
>> syz repro:  https://syzkaller.appspot.com/x/repro.syz?x=1490f0d4d0
>> C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=17aedf1cd0
>>
>> IMPORTANT: if you fix the issue, please add the following tag to the commit:
>> Reported-by: syzbot+695b03d82fa8e4901...@syzkaller.appspotmail.com
> 
> It looks like SQPOLL. I wonder if that's due to parked SQPOLL task that
> won't be able to do task_work run, and so reap poll-cancelled requests
> killed by io_put_deferred().
> 
> I'll test it out tomorrow.

It is indeed SQPOLL. From a quick look, it's doing a POLL_ADD on the ring
fd itself.

-- 
Jens Axboe

Re: INFO: task hung in io_uring_cancel_task_requests

2021-02-09 Thread Pavel Begunkov

On 10/02/2021 00:54, syzbot wrote:
> Hello,
> 
> syzbot found the following issue on:
> 
> HEAD commit:dd86e7fa Merge tag 'pci-v5.11-fixes-2' of git://git.kernel..
> git tree:   upstream
> console output: https://syzkaller.appspot.com/x/log.txt?x=13e43f90d0
> kernel config:  https://syzkaller.appspot.com/x/.config?x=e83e68d0a6aba5f6
> dashboard link: https://syzkaller.appspot.com/bug?extid=695b03d82fa8e4901b06
> syz repro:  https://syzkaller.appspot.com/x/repro.syz?x=1490f0d4d0
> C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=17aedf1cd0
> 
> IMPORTANT: if you fix the issue, please add the following tag to the commit:
> Reported-by: syzbot+695b03d82fa8e4901...@syzkaller.appspotmail.com

It looks like SQPOLL. I wonder if that's due to parked SQPOLL task that
won't be able to do task_work run, and so reap poll-cancelled requests
killed by io_put_deferred().

I'll test it out tomorrow.

> 
> INFO: task syz-executor893:8493 blocked for more than 143 seconds.
>   Not tainted 5.11.0-rc6-syzkaller #0
> "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> task:syz-executor893 state:D stack:28144 pid: 8493 ppid:  8480 
> flags:0x0004
> Call Trace:
>  context_switch kernel/sched/core.c:4327 [inline]
>  __schedule+0x90c/0x21a0 kernel/sched/core.c:5078
>  schedule+0xcf/0x270 kernel/sched/core.c:5157
>  io_uring_cancel_files fs/io_uring.c:8912 [inline]
>  io_uring_cancel_task_requests+0xe70/0x11a0 fs/io_uring.c:8979
>  __io_uring_files_cancel+0x110/0x1b0 fs/io_uring.c:9067
>  io_uring_files_cancel include/linux/io_uring.h:51 [inline]
>  do_exit+0x2fe/0x2ae0 kernel/exit.c:780
>  do_group_exit+0x125/0x310 kernel/exit.c:922
>  __do_sys_exit_group kernel/exit.c:933 [inline]
>  __se_sys_exit_group kernel/exit.c:931 [inline]
>  __x64_sys_exit_group+0x3a/0x50 kernel/exit.c:931
>  do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46
>  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> RIP: 0033:0x43eb19
> RSP: 002b:7ffda99d64d8 EFLAGS: 0246 ORIG_RAX: 00e7
> RAX: ffda RBX: 004b02f0 RCX: 0043eb19
> RDX: 003c RSI: 00e7 RDI: 
> RBP:  R08: ffc0 R09: 1000
> R10: 8011 R11: 0246 R12: 004b02f0
> R13: 0001 R14:  R15: 0001
> INFO: task syz-executor893:8571 blocked for more than 143 seconds.
>   Not tainted 5.11.0-rc6-syzkaller #0
> "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> task:syz-executor893 state:D stack:28144 pid: 8571 ppid:  8479 
> flags:0x4004
> Call Trace:
>  context_switch kernel/sched/core.c:4327 [inline]
>  __schedule+0x90c/0x21a0 kernel/sched/core.c:5078
>  schedule+0xcf/0x270 kernel/sched/core.c:5157
>  io_uring_cancel_files fs/io_uring.c:8912 [inline]
>  io_uring_cancel_task_requests+0xe70/0x11a0 fs/io_uring.c:8979
>  __io_uring_files_cancel+0x110/0x1b0 fs/io_uring.c:9067
>  io_uring_files_cancel include/linux/io_uring.h:51 [inline]
>  do_exit+0x2fe/0x2ae0 kernel/exit.c:780
>  do_group_exit+0x125/0x310 kernel/exit.c:922
>  __do_sys_exit_group kernel/exit.c:933 [inline]
>  __se_sys_exit_group kernel/exit.c:931 [inline]
>  __x64_sys_exit_group+0x3a/0x50 kernel/exit.c:931
>  do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46
>  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> RIP: 0033:0x43eb19
> RSP: 002b:7ffda99d64d8 EFLAGS: 0246 ORIG_RAX: 00e7
> RAX: ffda RBX: 004b02f0 RCX: 0043eb19
> RDX: 003c RSI: 00e7 RDI: 
> RBP:  R08: ffc0 R09: 1000
> R10: 8011 R11: 0246 R12: 004b02f0
> R13: 0001 R14:  R15: 0001
> INFO: task syz-executor893:8579 blocked for more than 143 seconds.
>   Not tainted 5.11.0-rc6-syzkaller #0
> "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> task:syz-executor893 state:D stack:28144 pid: 8579 ppid:  8482 
> flags:0x4004
> Call Trace:
>  context_switch kernel/sched/core.c:4327 [inline]
>  __schedule+0x90c/0x21a0 kernel/sched/core.c:5078
>  schedule+0xcf/0x270 kernel/sched/core.c:5157
>  io_uring_cancel_files fs/io_uring.c:8912 [inline]
>  io_uring_cancel_task_requests+0xe70/0x11a0 fs/io_uring.c:8979
>  __io_uring_files_cancel+0x110/0x1b0 fs/io_uring.c:9067
>  io_uring_files_cancel include/linux/io_uring.h:51 [inline]
>  do_exit+0x2fe/0x2ae0 kernel/exit.c:780
>  do_group_exit+0x125/0x310 kernel/exit.c:922
>  __do_sys_exit_group kernel/exit.c:933 [inline]
>  __se_sys_exit_group kernel/exit.c:931 [inline]
>  __x64_sys_exit_group+0x3a/0x50 kernel/exit.c:931
>  do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46
>  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> RIP: 0033:0x43eb19
> RSP: 002b:7ffda99d64d8 EFLAGS: 0246 ORIG_RAX: 00e7
> RAX: ffda RBX: 004b02f0 RCX: 0

Re: [PATCH 2/2] usb: misc: usb5744: Add support for USB hub controller

2021-02-09 Thread Andrew Lunn

On Tue, Feb 09, 2021 at 10:53:20AM +0100, Michal Simek wrote:
> +static int usb5744_i2c_probe(struct i2c_client *client,
> +  const struct i2c_device_id *id)
> +{
> + struct device *dev = &client->dev;
> + int ret;
> +
> + /* Trigger gpio reset to the hub. */
> + ret = usb5744_init_hw(dev);
> + if (ret)
> + return ret;
> +
> + /* Send SMBus command to boot hub. */
> + ret = i2c_smbus_write_word_data(client, 0xAA, swab16(0x5600));

Hi Michal

This is not my area of the kernel. But that swab16() stood out, and
made me wonder about endianness. Will this work correctly on big and
little endian hosts?

   Andrew

[PATCH v5 1/3] platform/chrome: cros_ec: Add SW_FRONT_PROXIMITY MKBP define

Some cros ECs support a front proximity MKBP event via
'EC_MKBP_FRONT_PROXIMITY'. Add this define so it can be used in a
future patch.

Cc: Dmitry Torokhov 
Cc: Benson Leung 
Cc: Guenter Roeck 
Cc: Douglas Anderson 
Cc: Gwendal Grignou 
Acked-by: Enric Balletbo i Serra 
Signed-off-by: Stephen Boyd 
---

No changes from last time.

 include/linux/platform_data/cros_ec_commands.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/linux/platform_data/cros_ec_commands.h 
b/include/linux/platform_data/cros_ec_commands.h
index 86376779ab31..776e0b2be0e9 100644
--- a/include/linux/platform_data/cros_ec_commands.h
+++ b/include/linux/platform_data/cros_ec_commands.h
@@ -3457,6 +3457,7 @@ struct ec_response_get_next_event_v1 {
 #define EC_MKBP_LID_OPEN   0
 #define EC_MKBP_TABLET_MODE1
 #define EC_MKBP_BASE_ATTACHED  2
+#define EC_MKBP_FRONT_PROXIMITY3
 
 /* Run keyboard factory test scanning */
 #define EC_CMD_KEYBOARD_FACTORY_TEST 0x0068
-- 
https://chromeos.dev

[PATCH v5 3/3] iio: proximity: Add a ChromeOS EC MKBP proximity driver

Add support for a ChromeOS EC proximity driver that exposes a "front"
proximity sensor via the IIO subsystem. The EC decides when front
proximity is near and sets an MKBP switch 'EC_MKBP_FRONT_PROXIMITY' to
notify the kernel of proximity. Similarly, when proximity detects
something far away it sets the switch bit to 0. For now this driver
exposes a single sensor, but it could be expanded in the future via more
MKBP bits if desired.

Cc: Dmitry Torokhov 
Cc: Benson Leung 
Cc: Guenter Roeck 
Cc: Douglas Anderson 
Cc: Gwendal Grignou 
Reviewed-by: Enric Balletbo i Serra 
Signed-off-by: Stephen Boyd 
---

Changes from v4:
 * Dropped of_match_ptr()

 drivers/iio/proximity/Kconfig |  11 +
 drivers/iio/proximity/Makefile|   1 +
 .../iio/proximity/cros_ec_mkbp_proximity.c| 242 ++
 3 files changed, 254 insertions(+)
 create mode 100644 drivers/iio/proximity/cros_ec_mkbp_proximity.c

diff --git a/drivers/iio/proximity/Kconfig b/drivers/iio/proximity/Kconfig
index 12672a0e89ed..7c7203ca3ac6 100644
--- a/drivers/iio/proximity/Kconfig
+++ b/drivers/iio/proximity/Kconfig
@@ -21,6 +21,17 @@ endmenu
 
 menu "Proximity and distance sensors"
 
+config CROS_EC_MKBP_PROXIMITY
+   tristate "ChromeOS EC MKBP Proximity sensor"
+   depends on CROS_EC
+   help
+ Say Y here to enable the proximity sensor implemented via the 
ChromeOS EC MKBP
+ switches protocol. You must enable one bus option (CROS_EC_I2C or 
CROS_EC_SPI)
+ to use this.
+
+ To compile this driver as a module, choose M here: the
+ module will be called cros_ec_mkbp_proximity.
+
 config ISL29501
tristate "Intersil ISL29501 Time Of Flight sensor"
depends on I2C
diff --git a/drivers/iio/proximity/Makefile b/drivers/iio/proximity/Makefile
index 9c1aca1a8b79..cbdac09433eb 100644
--- a/drivers/iio/proximity/Makefile
+++ b/drivers/iio/proximity/Makefile
@@ -5,6 +5,7 @@
 
 # When adding new entries keep the list in alphabetical order
 obj-$(CONFIG_AS3935)   += as3935.o
+obj-$(CONFIG_CROS_EC_MKBP_PROXIMITY) += cros_ec_mkbp_proximity.o
 obj-$(CONFIG_ISL29501) += isl29501.o
 obj-$(CONFIG_LIDAR_LITE_V2)+= pulsedlight-lidar-lite-v2.o
 obj-$(CONFIG_MB1232)   += mb1232.o
diff --git a/drivers/iio/proximity/cros_ec_mkbp_proximity.c 
b/drivers/iio/proximity/cros_ec_mkbp_proximity.c
new file mode 100644
index ..2cdaf05c0ec2
--- /dev/null
+++ b/drivers/iio/proximity/cros_ec_mkbp_proximity.c
@@ -0,0 +1,242 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Driver for cros-ec proximity sensor exposed through MKBP switch
+ *
+ * Copyright 2021 Google LLC.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+
+#include 
+#include 
+#include 
+
+#include 
+
+struct cros_ec_mkbp_proximity_data {
+   struct cros_ec_device *ec;
+   struct iio_dev *indio_dev;
+   struct mutex lock;
+   struct notifier_block notifier;
+   bool enabled;
+};
+
+static const struct iio_event_spec cros_ec_mkbp_proximity_events[] = {
+   {
+   .type = IIO_EV_TYPE_THRESH,
+   .dir = IIO_EV_DIR_EITHER,
+   .mask_separate = BIT(IIO_EV_INFO_ENABLE),
+   },
+};
+
+static const struct iio_chan_spec cros_ec_mkbp_proximity_chan_spec[] = {
+   {
+   .type = IIO_PROXIMITY,
+   .info_mask_separate = BIT(IIO_CHAN_INFO_RAW),
+   .event_spec = cros_ec_mkbp_proximity_events,
+   .num_event_specs = ARRAY_SIZE(cros_ec_mkbp_proximity_events),
+   },
+};
+
+static int cros_ec_mkbp_proximity_parse_state(const void *data)
+{
+   u32 switches = get_unaligned_le32(data);
+
+   return !!(switches & BIT(EC_MKBP_FRONT_PROXIMITY));
+}
+
+static int cros_ec_mkbp_proximity_query(struct cros_ec_device *ec_dev,
+   int *state)
+{
+   struct {
+   struct cros_ec_command msg;
+   union {
+   struct ec_params_mkbp_info params;
+   u32 switches;
+   };
+   } __packed buf = { };
+   struct ec_params_mkbp_info *params = &buf.params;
+   struct cros_ec_command *msg = &buf.msg;
+   u32 *switches = &buf.switches;
+   size_t insize = sizeof(*switches);
+   int ret;
+
+   msg->command = EC_CMD_MKBP_INFO;
+   msg->version = 1;
+   msg->outsize = sizeof(*params);
+   msg->insize = insize;
+
+   params->info_type = EC_MKBP_INFO_CURRENT;
+   params->event_type = EC_MKBP_EVENT_SWITCH;
+
+   ret = cros_ec_cmd_xfer_status(ec_dev, msg);
+   if (ret < 0)
+   return ret;
+
+   if (ret != insize) {
+   dev_warn(ec_dev->dev, "wrong result size: %d != %zu\n", ret,
+insize);
+   return -EPROTO;
+   }
+
+   *state = cros_ec_mkbp_proximity_parse_state(switches);
+   return IIO_VAL_INT;

[PATCH v5 2/3] dt-bindings: iio: Add cros ec proximity yaml doc

Some cros ECs support a front proximity MKBP event via
'EC_MKBP_FRONT_PROXIMITY'. Add a DT binding to document this feature via
a node that is a child of the main cros_ec device node. Devices that
have this ability will describe this in firmware.

Cc: Dmitry Torokhov 
Cc: Benson Leung 
Cc: Guenter Roeck 
Cc: Douglas Anderson 
Cc: Gwendal Grignou 
Cc: 
Cc: Rob Herring 
Cc: Enric Balletbo i Serra 
Signed-off-by: Stephen Boyd 
---

Changes from v4:
 * Reduced example in iio binding and moved to mfd
 * Dropped unevaluatedProperties

 .../google,cros-ec-mkbp-proximity.yaml| 37 +++
 .../bindings/mfd/google,cros-ec.yaml  |  7 
 2 files changed, 44 insertions(+)
 create mode 100644 
Documentation/devicetree/bindings/iio/proximity/google,cros-ec-mkbp-proximity.yaml

diff --git 
a/Documentation/devicetree/bindings/iio/proximity/google,cros-ec-mkbp-proximity.yaml
 
b/Documentation/devicetree/bindings/iio/proximity/google,cros-ec-mkbp-proximity.yaml
new file mode 100644
index ..099b4be927d4
--- /dev/null
+++ 
b/Documentation/devicetree/bindings/iio/proximity/google,cros-ec-mkbp-proximity.yaml
@@ -0,0 +1,37 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+
+$id: 
http://devicetree.org/schemas/iio/proximity/google,cros-ec-mkbp-proximity.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: ChromeOS EC MKBP Proximity Sensor
+
+maintainers:
+  - Stephen Boyd 
+  - Benson Leung 
+  - Enric Balletbo i Serra 
+
+description: |
+  Google's ChromeOS EC sometimes has the ability to detect user proximity.
+  This is implemented on the EC as near/far logic and exposed to the OS
+  via an MKBP switch bit.
+
+properties:
+  compatible:
+const: google,cros-ec-mkbp-proximity
+
+  label:
+description: Name for proximity sensor
+
+required:
+  - compatible
+
+additionalProperties: false
+
+examples:
+  - |
+proximity {
+  compatible = "google,cros-ec-mkbp-proximity";
+  label = "proximity-wifi-lte";
+};
diff --git a/Documentation/devicetree/bindings/mfd/google,cros-ec.yaml 
b/Documentation/devicetree/bindings/mfd/google,cros-ec.yaml
index 76bf16ee27ec..4dfa70a013ae 100644
--- a/Documentation/devicetree/bindings/mfd/google,cros-ec.yaml
+++ b/Documentation/devicetree/bindings/mfd/google,cros-ec.yaml
@@ -94,6 +94,9 @@ properties:
   keyboard-controller:
 $ref: "/schemas/input/google,cros-ec-keyb.yaml#"
 
+  proximity:
+$ref: "/schemas/iio/proximity/google,cros-ec-mkbp-proximity.yaml#"
+
   codecs:
 type: object
 additionalProperties: false
@@ -180,6 +183,10 @@ examples:
 interrupts = <99 0>;
 interrupt-parent = <&gpio7>;
 spi-max-frequency = <500>;
+
+proximity {
+compatible = "google,cros-ec-mkbp-proximity";
+};
 };
 };
 
-- 
https://chromeos.dev

[PATCHv5 0/3] iio: Add a ChromeOS EC MKBP proximity driver

This is a different approach to [1] where I tried to add this proximity
sensor logic to the input subsystem. Instead, we'll take the approach of
making a small IIO proximity driver that parses the EC switch bitmap to
find out if the front proximity sensor is detecting something or not.
This allows us to treat proximity sensors as IIO devices all the time in
userspace instead of handling this switch on the EC via the input
subsystem and then other proximity sensors via IIO.

I propose this is all merged through IIO subsystem. Please ack
the first patch so it can be merged that way.

Changes from v4:
 * Reduced binding and moved proximity node to mfd spi example
 * Dropped of_match_ptr()

Changes from v3:
 * Added SPI and cros-ec wrapper nodes to yaml example
 * Ignore notifier registration return code that is always zero

Changes from v2:
 * Check iio clock and use IIO time if not boottime

Changes from v1:
 * Driver moved location
 * Put mkbp everywhere
 * Fixed up DT binding to not fail and made sure it is a child of cros-ec
 * Simplified logic for sending a message
 * Dropped CONFIG_OF usage
 * Sorted includes

[1] https://lore.kernel.org/r/20201205004709.3126266-1-swb...@chromium.org

Cc: Dmitry Torokhov 
Cc: Benson Leung 
Cc: Guenter Roeck 
Cc: Douglas Anderson 
Cc: Gwendal Grignou 
Cc: 
Cc: Rob Herring 
Cc: Enric Balletbo i Serra 

Stephen Boyd (3):
  platform/chrome: cros_ec: Add SW_FRONT_PROXIMITY MKBP define
  dt-bindings: iio: Add cros ec proximity yaml doc
  iio: proximity: Add a ChromeOS EC MKBP proximity driver

 .../google,cros-ec-mkbp-proximity.yaml|  37 +++
 .../bindings/mfd/google,cros-ec.yaml  |   7 +
 drivers/iio/proximity/Kconfig |  11 +
 drivers/iio/proximity/Makefile|   1 +
 .../iio/proximity/cros_ec_mkbp_proximity.c| 242 ++
 .../linux/platform_data/cros_ec_commands.h|   1 +
 6 files changed, 299 insertions(+)
 create mode 100644 
Documentation/devicetree/bindings/iio/proximity/google,cros-ec-mkbp-proximity.yaml
 create mode 100644 drivers/iio/proximity/cros_ec_mkbp_proximity.c


base-commit: 19c329f6808995b142b3966301f217c831e7cf31
-- 
https://chromeos.dev

Re: [External] Re: [PATCH v2] psi: Remove the redundant psi_task_tick

2021-02-09 Thread Chengming Zhou

Hello Johannes,

在 2021/2/9 下午11:48, Johannes Weiner 写道:
> Hello Chengming,
>
> On Tue, Feb 09, 2021 at 03:10:33PM +0800, Chengming Zhou wrote:
>> When the current task in a cgroup is in_memstall, the corresponding groupc
>> on that cpu is in PSI_MEM_FULL state, so we can exploit that to remove the
>> redundant psi_task_tick from scheduler_tick to save this periodic cost.
> Can you please update the patch name and the changelog to the new
> version of the patch? It's not removing the redundant tick, it's
> moving the reclaim detection from the timer tick to the task state
> tracking machinery using the recently added ONCPU state.

Yes, I will change the name and changelog, it will be clearer for this patch : )

>> Signed-off-by: Muchun Song 
>> Signed-off-by: Chengming Zhou 
>> ---
>>  include/linux/psi.h  |  1 -
>>  kernel/sched/core.c  |  1 -
>>  kernel/sched/psi.c   | 49 ++---
>>  kernel/sched/stats.h |  9 -
>>  4 files changed, 14 insertions(+), 46 deletions(-)
>>
>> diff --git a/include/linux/psi.h b/include/linux/psi.h
>> index 7361023f3fdd..65eb1476ac70 100644
>> --- a/include/linux/psi.h
>> +++ b/include/linux/psi.h
>> @@ -20,7 +20,6 @@ void psi_task_change(struct task_struct *task, int clear, 
>> int set);
>>  void psi_task_switch(struct task_struct *prev, struct task_struct *next,
>>   bool sleep);
>>  
>> -void psi_memstall_tick(struct task_struct *task, int cpu);
>>  void psi_memstall_enter(unsigned long *flags);
>>  void psi_memstall_leave(unsigned long *flags);
>>  
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index 15d2562118d1..31788a9b335b 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -4533,7 +4533,6 @@ void scheduler_tick(void)
>>  update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
>>  curr->sched_class->task_tick(rq, curr, 0);
>>  calc_global_load_tick(rq);
>> -psi_task_tick(rq);
>>  
>>  rq_unlock(rq, &rf);
>>  
>> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
>> index 2293c45d289d..6e46d9eb279b 100644
>> --- a/kernel/sched/psi.c
>> +++ b/kernel/sched/psi.c
>> @@ -644,8 +644,7 @@ static void poll_timer_fn(struct timer_list *t)
>>  wake_up_interruptible(&group->poll_wait);
>>  }
>>  
>> -static void record_times(struct psi_group_cpu *groupc, int cpu,
>> - bool memstall_tick)
>> +static void record_times(struct psi_group_cpu *groupc, int cpu)
>>  {
>>  u32 delta;
>>  u64 now;
>> @@ -664,23 +663,6 @@ static void record_times(struct psi_group_cpu *groupc, 
>> int cpu,
>>  groupc->times[PSI_MEM_SOME] += delta;
>>  if (groupc->state_mask & (1 << PSI_MEM_FULL))
>>  groupc->times[PSI_MEM_FULL] += delta;
>> -else if (memstall_tick) {
>> -u32 sample;
>> -/*
>> - * Since we care about lost potential, a
>> - * memstall is FULL when there are no other
>> - * working tasks, but also when the CPU is
>> - * actively reclaiming and nothing productive
>> - * could run even if it were runnable.
>> - *
>> - * When the timer tick sees a reclaiming CPU,
>> - * regardless of runnable tasks, sample a FULL
>> - * tick (or less if it hasn't been a full tick
>> - * since the last state change).
>> - */
>> -sample = min(delta, (u32)jiffies_to_nsecs(1));
>> -groupc->times[PSI_MEM_FULL] += sample;
>> -}
>>  }
>>  
>>  if (groupc->state_mask & (1 << PSI_CPU_SOME)) {
>> @@ -714,7 +696,7 @@ static void psi_group_change(struct psi_group *group, 
>> int cpu,
>>   */
>>  write_seqcount_begin(&groupc->seq);
>>  
>> -record_times(groupc, cpu, false);
>> +record_times(groupc, cpu);
>>  
>>  for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
>>  if (!(m & (1 << t)))
>> @@ -738,6 +720,18 @@ static void psi_group_change(struct psi_group *group, 
>> int cpu,
>>  if (test_state(groupc->tasks, s))
>>  state_mask |= (1 << s);
>>  }
>> +
>> +/*
>> + * Since we care about lost potential, a memstall is FULL
>> + * when there are no other working tasks, but also when
>> + * the CPU is actively reclaiming and nothing productive
>> + * could run even if it were runnable. So when the current
>> + * task in a cgroup is in_memstall, the corresponding groupc
>> + * on that cpu is in PSI_MEM_FULL state.
>> + */
>> +if (groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall)
>> +state_mask |= (1 << PSI_MEM_FULL);
> This doesn't really work with the psi_task_switch() optimization. If
> we switch between two tasks inside a leaf group, where one is memstall
> and the other is not, we

Re: [PATCH v4 3/3] iio: proximity: Add a ChromeOS EC MKBP proximity driver

Quoting Stephen Boyd (2021-02-06 19:21:39)
> Quoting Jonathan Cameron (2021-02-06 08:17:11)
> > On Tue,  2 Feb 2021 10:44:34 -0800
> > Stephen Boyd  wrote:
> > 
> > > +static struct platform_driver cros_ec_mkbp_proximity_driver = {
> > > + .driver = {
> > > + .name = "cros-ec-mkbp-proximity",
> > > + .of_match_table = 
> > > of_match_ptr(cros_ec_mkbp_proximity_of_match),
> > I'm going to assume we know no one is going to use this with
> > ACPI via PRP0001 given presumably the firmware on these devices
> > is tightly controlled.
> 
> Correct.
> 
> > 
> > > However, we should still drop the of_match_ptr
> > > as it will lead to an unused warning for cros_ec_mkbp_proximity_of_match
> > > if anyone builds this without CONFIG_OF + it sets a general bad
> > > precedent that I'd rather wasn't around for people to copy.
> > Note that in general we are slowly ripping these out of IIO but
> > probably lots still there.
> > 
> > If this is all that is needed in this version I'll just do it
> > whilst applying unless anyone shouts.
> > 
> 
> Agreed. Thanks for fixing that last little bit.

Seems Rob wanted a small tweak to the binding so I'll resend this now
and drop the of_match_ptr() usage.

Re: [PATCH v4 2/3] dt-bindings: iio: Add cros ec proximity yaml doc

Quoting Rob Herring (2021-02-09 13:13:47)
> On Tue, Feb 02, 2021 at 10:44:33AM -0800, Stephen Boyd wrote:
> > +description: Name for proximity sensor
> > +
> > +required:
> > +  - compatible
> > +
> > +unevaluatedProperties: false
> > +additionalProperties: false
> 
> Only need one. In this case 'additionalProperties'.

Got it.

> 
> > +
> > +examples:
> > +  - |
> > +spi {
> > +  #address-cells = <1>;
> > +  #size-cells = <0>;
> > +  ec@0 {
> > +compatible = "google,cros-ec-spi";
> > +reg = <0>;
> > +proximity {
> > +  compatible = "google,cros-ec-mkbp-proximity";
> > +  label = "proximity-wifi-lte";
> > +};
> 
> For complete examples, I prefer one example for the whole MFD in the MFD
> schema and no example here.

Alright. I can add it to the mfd binding instead.

> 
> > +  };
> > +};
> > diff --git a/Documentation/devicetree/bindings/mfd/google,cros-ec.yaml 
> > b/Documentation/devicetree/bindings/mfd/google,cros-ec.yaml
> > index 76bf16ee27ec..479a9f15de32 100644
> > --- a/Documentation/devicetree/bindings/mfd/google,cros-ec.yaml
> > +++ b/Documentation/devicetree/bindings/mfd/google,cros-ec.yaml
> > @@ -94,6 +94,9 @@ properties:
> >keyboard-controller:
> >  $ref: "/schemas/input/google,cros-ec-keyb.yaml#"

Re: [PATCH 5/5] ath10k: reduce invalid ht params rate message noise

2021-02-09 Thread Wen Gong


On 2021-02-10 08:42, Shuah Khan wrote:
ath10k_mac_get_rate_flags_ht() floods dmesg with the following 
messages,

when it fails to find a match for mcs=7 and rate=1440.

supported_ht_mcs_rate_nss2:
{7,  {1300, 2700, 1444, 3000} }

ath10k_pci :02:00.0: invalid ht params rate 1440 100kbps nss 2 mcs 
7


dev_warn_ratelimited() isn't helping the noise. Use dev_warn_once()
instead.

Signed-off-by: Shuah Khan 
---
 drivers/net/wireless/ath/ath10k/mac.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/mac.c
b/drivers/net/wireless/ath/ath10k/mac.c
index 3545ce7dce0a..276321f0cfdd 100644
--- a/drivers/net/wireless/ath/ath10k/mac.c
+++ b/drivers/net/wireless/ath/ath10k/mac.c
@@ -8970,8 +8970,9 @@ static void ath10k_mac_get_rate_flags_ht(struct
ath10k *ar, u32 rate, u8 nss, u8
*bw |= RATE_INFO_BW_40;
*flags |= RATE_INFO_FLAGS_SHORT_GI;
} else {
-   ath10k_warn(ar, "invalid ht params rate %d 100kbps nss %d mcs 
%d",
-   rate, nss, mcs);
+   dev_warn_once(ar->dev,
+ "invalid ht params rate %d 100kbps nss %d mcs %d",
+ rate, nss, mcs);
}
 }

The {7,  {1300, 2700, 1444, 3000} } entry is a correct value.
The 1440 is reported by the firmware; it is a wrong value, and it has been
fixed in the firmware.
If this is changed to dev_warn_once(), there will be no chance to find
other wrong values reported by the firmware, and such a value indicates
a wrong rate to mac80211/cfg80211 and leads "iw wlan0 station dump" to
show a wrong bitrate.

[PATCH v2 10/14] x86/fault: Bypass no_context() for implicit kernel faults from usermode

We can drop an indentation level and remove the last
user_mode(regs) == true caller of no_context() by directly OOPSing for
implicit kernel faults from usermode.

Cc: Dave Hansen 
Cc: Peter Zijlstra 
Signed-off-by: Andy Lutomirski 
---
 arch/x86/mm/fault.c | 59 -
 1 file changed, 32 insertions(+), 27 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 8b8bd0a4f4b2..f735639455a5 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -825,44 +825,49 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned 
long error_code,
 {
struct task_struct *tsk = current;
 
-   /* User mode accesses just cause a SIGSEGV */
-   if (user_mode(regs) && (error_code & X86_PF_USER)) {
-   /*
-* It's possible to have interrupts off here:
-*/
-   local_irq_enable();
+   if (!user_mode(regs)) {
+   no_context(regs, error_code, address, pkey, si_code);
+   return;
+   }
 
-   /*
-* Valid to do another page fault here because this one came
-* from user space:
-*/
-   if (is_prefetch(regs, error_code, address))
-   return;
+   if (!(error_code & X86_PF_USER)) {
+   /* Implicit user access to kernel memory -- just oops */
+   page_fault_oops(regs, error_code, address);
+   return;
+   }
 
-   if (is_errata100(regs, address))
-   return;
+   /*
+* User mode accesses just cause a SIGSEGV.
+* It's possible to have interrupts off here:
+*/
+   local_irq_enable();
 
-   sanitize_error_code(address, &error_code);
+   /*
+* Valid to do another page fault here because this one came
+* from user space:
+*/
+   if (is_prefetch(regs, error_code, address))
+   return;
 
-   if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, 
address))
-   return;
+   if (is_errata100(regs, address))
+   return;
 
-   if (likely(show_unhandled_signals))
-   show_signal_msg(regs, error_code, address, tsk);
+   sanitize_error_code(address, &error_code);
 
-   set_signal_archinfo(address, error_code);
+   if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address))
+   return;
 
-   if (si_code == SEGV_PKUERR)
-   force_sig_pkuerr((void __user *)address, pkey);
+   if (likely(show_unhandled_signals))
+   show_signal_msg(regs, error_code, address, tsk);
 
-   force_sig_fault(SIGSEGV, si_code, (void __user *)address);
+   set_signal_archinfo(address, error_code);
 
-   local_irq_disable();
+   if (si_code == SEGV_PKUERR)
+   force_sig_pkuerr((void __user *)address, pkey);
 
-   return;
-   }
+   force_sig_fault(SIGSEGV, si_code, (void __user *)address);
 
-   no_context(regs, error_code, address, SIGSEGV, si_code);
+   local_irq_disable();
 }
 
 static noinline void
-- 
2.29.2

[PATCH v2 11/14] x86/fault: Rename no_context() to kernelmode_fixup_or_oops()

The name no_context() has never been very clear.  It's only called for
faults from kernel mode, so rename it and change the no-longer-useful
user_mode(regs) check to a WARN_ON_ONCE.

Cc: Dave Hansen 
Cc: Peter Zijlstra 
Signed-off-by: Andy Lutomirski 
---
 arch/x86/mm/fault.c | 28 ++--
 1 file changed, 10 insertions(+), 18 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index f735639455a5..9fb636b2a3da 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -729,17 +729,10 @@ page_fault_oops(struct pt_regs *regs, unsigned long 
error_code,
 }
 
 static noinline void
-no_context(struct pt_regs *regs, unsigned long error_code,
-  unsigned long address, int signal, int si_code)
+kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code,
+unsigned long address, int signal, int si_code)
 {
-   if (user_mode(regs)) {
-   /*
-* This is an implicit supervisor-mode access from user
-* mode.  Bypass all the kernel-mode recovery code and just
-* OOPS.
-*/
-   goto oops;
-   }
+   WARN_ON_ONCE(user_mode(regs));
 
/* Are we prepared to handle this kernel fault? */
if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
@@ -779,7 +772,6 @@ no_context(struct pt_regs *regs, unsigned long error_code,
if (is_prefetch(regs, error_code, address))
return;
 
-oops:
page_fault_oops(regs, error_code, address);
 }
 
@@ -826,7 +818,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long 
error_code,
struct task_struct *tsk = current;
 
if (!user_mode(regs)) {
-   no_context(regs, error_code, address, pkey, si_code);
+   kernelmode_fixup_or_oops(regs, error_code, address, pkey, 
si_code);
return;
}
 
@@ -958,7 +950,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, 
unsigned long address,
 {
/* Kernel mode? Handle exceptions or die: */
if (!user_mode(regs)) {
-   no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
+   kernelmode_fixup_or_oops(regs, error_code, address, SIGBUS, 
BUS_ADRERR);
return;
}
 
@@ -1420,8 +1412,8 @@ void do_user_addr_fault(struct pt_regs *regs,
 * has unlocked the mm for us if we get here.
 */
if (!user_mode(regs))
-   no_context(regs, error_code, address, SIGBUS,
-  BUS_ADRERR);
+   kernelmode_fixup_or_oops(regs, error_code, address,
+SIGBUS, BUS_ADRERR);
return;
}
 
@@ -1441,15 +1433,15 @@ void do_user_addr_fault(struct pt_regs *regs,
return;
 
if (fatal_signal_pending(current) && !user_mode(regs)) {
-   no_context(regs, error_code, address, 0, 0);
+   kernelmode_fixup_or_oops(regs, error_code, address, 0, 0);
return;
}
 
if (fault & VM_FAULT_OOM) {
/* Kernel mode? Handle exceptions or die: */
if (!user_mode(regs)) {
-   no_context(regs, error_code, address,
-  SIGSEGV, SEGV_MAPERR);
+   kernelmode_fixup_or_oops(regs, error_code, address,
+SIGSEGV, SEGV_MAPERR);
return;
}
 
-- 
2.29.2

[PATCH v2 13/14] x86/fault: Don't run fixups for SMAP violations

A SMAP-violating kernel access is not a recoverable condition.  Imagine
kernel code that, outside of a uaccess region, dereferences a pointer to
the user range by accident.  If SMAP is on, this will reliably generate
a fault, but that fault is handled the same way as a failed intentional
user access.  This makes it easy for bugs to be
overlooked if code is inadequately tested both with and without SMAP.

We discovered this because BPF can generate invalid accesses to user
memory, but those warnings only got printed if SMAP was off.  With this
patch, this type of error will be discovered with SMAP on as well.

Cc: Yonghong Song 
Cc: Dave Hansen 
Cc: Peter Zijlstra 
Cc: Alexei Starovoitov 
Cc: Daniel Borkmann 
Signed-off-by: Andy Lutomirski 
---
 arch/x86/mm/fault.c | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 466415bdf58c..eed217d4a877 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1278,9 +1278,12 @@ void do_user_addr_fault(struct pt_regs *regs,
 */
if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) &&
 !(error_code & X86_PF_USER) &&
-!(regs->flags & X86_EFLAGS_AC)))
-   {
-   bad_area_nosemaphore(regs, error_code, address);
+!(regs->flags & X86_EFLAGS_AC))) {
+   /*
+* No extable entry here.  This was a kernel access to an
+* invalid pointer.  get_kernel_nofault() will not get here.
+*/
+   page_fault_oops(regs, error_code, address);
return;
}
 
-- 
2.29.2

[PATCH v2 08/14] x86/fault: Skip erratum #93 workaround on new CPUs

Erratum #93 applies to the first generation of AMD K8 CPUs.  Skip the
workaround on newer CPUs.

Signed-off-by: Andy Lutomirski 
---
 arch/x86/mm/fault.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index cbb1a9754473..3fe2f4800b69 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -442,9 +442,8 @@ static void dump_pagetable(unsigned long address)
  */
 static int is_errata93(struct pt_regs *regs, unsigned long address)
 {
-#if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD)
-   if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
-   || boot_cpu_data.x86 != 0xf)
+#if defined(CONFIG_X86_64)
+   if (!is_amd_k8_pre_npt())
return 0;
 
if (user_mode(regs))
-- 
2.29.2

[PATCH v2 14/14] x86/fault, x86/efi: Fix and rename efi_recover_from_page_fault()

efi_recover_from_page_fault() doesn't recover -- it does a special EFI
mini-oops.  Rename it to make it clear that it crashes.

While renaming it, I noticed a blatant bug: a page fault oops in a
different thread happening concurrently with an EFI runtime service call
would be misinterpreted as an EFI page fault.  Fix that.

This isn't quite exact.  We could do better by using a special CS for
calls into EFI.

Cc: Dave Hansen 
Cc: Peter Zijlstra 
Cc: Ard Biesheuvel 
Cc: linux-...@vger.kernel.org
Signed-off-by: Andy Lutomirski 
---
 arch/x86/include/asm/efi.h |  2 +-
 arch/x86/mm/fault.c| 11 ++-
 arch/x86/platform/efi/quirks.c | 16 
 3 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index c98f78330b09..4b7706ddd8b6 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -150,7 +150,7 @@ extern void __init efi_apply_memmap_quirks(void);
 extern int __init efi_reuse_config(u64 tables, int nr_tables);
 extern void efi_delete_dummy_variable(void);
 extern void efi_switch_mm(struct mm_struct *mm);
-extern void efi_recover_from_page_fault(unsigned long phys_addr);
+extern void efi_crash_gracefully_on_page_fault(unsigned long phys_addr);
 extern void efi_free_boot_services(void);
 
 /* kexec external ABI */
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index eed217d4a877..dfdd56d9c020 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -16,7 +16,7 @@
 #include /* prefetchw*/
 #include /* exception_enter(), ...   */
 #include  /* faulthandler_disabled()  */
-#include  /* efi_recover_from_page_fault()*/
+#include  /* 
efi_crash_gracefully_on_page_fault()*/
 #include 
 
 #include /* boot_cpu_has, ...*/
@@ -25,7 +25,7 @@
 #include   /* emulate_vsyscall */
 #include   /* struct vm86  */
 #include/* vma_pkey()   */
-#include/* efi_recover_from_page_fault()*/
+#include/* 
efi_crash_gracefully_on_page_fault()*/
 #include   /* store_idt(), ... */
 #include /* exception stack  
*/
 #include  /* VMALLOC_START, ...   */
@@ -700,11 +700,12 @@ page_fault_oops(struct pt_regs *regs, unsigned long 
error_code,
 #endif
 
/*
-* Buggy firmware could access regions which might page fault, try to
-* recover from such faults.
+* Buggy firmware could access regions which might page fault.  If
+* this happens, EFI has a special OOPS path that will try to
+* avoid hanging the system.
 */
if (IS_ENABLED(CONFIG_EFI))
-   efi_recover_from_page_fault(address);
+   efi_crash_gracefully_on_page_fault(address);
 
 oops:
/*
diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index 5a40fe411ebd..0463ef9cddd6 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -687,15 +687,25 @@ int efi_capsule_setup_info(struct capsule_info *cap_info, 
void *kbuff,
  * @return: Returns, if the page fault is not handled. This function
  * will never return if the page fault is handled successfully.
  */
-void efi_recover_from_page_fault(unsigned long phys_addr)
+void efi_crash_gracefully_on_page_fault(unsigned long phys_addr)
 {
if (!IS_ENABLED(CONFIG_X86_64))
return;
 
+   /*
+* If we are in an interrupt nested inside an EFI runtime service,
+* then this is a regular OOPS, not an EFI failure.
+*/
+   if (in_interrupt() || in_nmi() || in_softirq())
+   return;
+
/*
 * Make sure that an efi runtime service caused the page fault.
+* READ_ONCE() because we might be OOPSing in a different thread,
+* and we don't want to trip KTSAN while trying to OOPS.
 */
-   if (efi_rts_work.efi_rts_id == EFI_NONE)
+   if (READ_ONCE(efi_rts_work.efi_rts_id) == EFI_NONE ||
+   current_work() != &efi_rts_work.work)
return;
 
/*
@@ -747,6 +757,4 @@ void efi_recover_from_page_fault(unsigned long phys_addr)
set_current_state(TASK_IDLE);
schedule();
}
-
-   return;
 }
-- 
2.29.2

[PATCH v2 12/14] x86/fault: Don't look for extable entries for SMEP violations

If we get a SMEP violation or a fault that would have been a SMEP
violation if we had SMEP, we shouldn't run fixups.  Just OOPS.

Cc: Dave Hansen 
Cc: Peter Zijlstra 
Signed-off-by: Andy Lutomirski 
---
 arch/x86/mm/fault.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 9fb636b2a3da..466415bdf58c 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1249,12 +1249,12 @@ void do_user_addr_fault(struct pt_regs *regs,
 * user memory.  Unless this is AMD erratum #93, which
 * corrupts RIP such that it looks like a user address,
 * this is unrecoverable.  Don't even try to look up the
-* VMA.
+* VMA or look for extable entries.
 */
if (is_errata93(regs, address))
return;
 
-   bad_area_nosemaphore(regs, error_code, address);
+   page_fault_oops(regs, error_code, address);
return;
}
 
-- 
2.29.2

[PATCH v2 09/14] x86/fault: Split the OOPS code out from no_context()

Not all callers of no_context() want to run exception fixups.
Separate the OOPS code out from the fixup code in no_context().

Cc: Dave Hansen 
Cc: Peter Zijlstra 
Signed-off-by: Andy Lutomirski 
---
 arch/x86/mm/fault.c | 116 +++-
 1 file changed, 62 insertions(+), 54 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 3fe2f4800b69..8b8bd0a4f4b2 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -654,53 +654,20 @@ static void set_signal_archinfo(unsigned long address,
 }
 
 static noinline void
-no_context(struct pt_regs *regs, unsigned long error_code,
-  unsigned long address, int signal, int si_code)
+page_fault_oops(struct pt_regs *regs, unsigned long error_code,
+   unsigned long address)
 {
-   struct task_struct *tsk = current;
unsigned long flags;
int sig;
 
if (user_mode(regs)) {
/*
-* This is an implicit supervisor-mode access from user
-* mode.  Bypass all the kernel-mode recovery code and just
-* OOPS.
+* Implicit kernel access from user mode?  Skip the stack
+* overflow and EFI special cases.
 */
goto oops;
}
 
-   /* Are we prepared to handle this kernel fault? */
-   if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
-   /*
-* Any interrupt that takes a fault gets the fixup. This makes
-* the below recursive fault logic only apply to a faults from
-* task context.
-*/
-   if (in_interrupt())
-   return;
-
-   /*
-* Per the above we're !in_interrupt(), aka. task context.
-*
-* In this case we need to make sure we're not recursively
-* faulting through the emulate_vsyscall() logic.
-*/
-   if (current->thread.sig_on_uaccess_err && signal) {
-   sanitize_error_code(address, &error_code);
-
-   set_signal_archinfo(address, error_code);
-
-   /* XXX: hwpoison faults will set the wrong code. */
-   force_sig_fault(signal, si_code, (void __user 
*)address);
-   }
-
-   /*
-* Barring that, we can do the fixup and be happy.
-*/
-   return;
-   }
-
 #ifdef CONFIG_VMAP_STACK
/*
 * Stack overflow?  During boot, we can fault near the initial
@@ -708,8 +675,8 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 * that we're in vmalloc space to avoid this.
 */
if (is_vmalloc_addr((void *)address) &&
-   (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) ||
-address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) {
+   (((unsigned long)current->stack - 1 - address < PAGE_SIZE) ||
+address - ((unsigned long)current->stack + THREAD_SIZE) < 
PAGE_SIZE)) {
unsigned long stack = __this_cpu_ist_top_va(DF) - sizeof(void 
*);
/*
 * We're likely to be running with very little stack space
@@ -732,20 +699,6 @@ no_context(struct pt_regs *regs, unsigned long error_code,
}
 #endif
 
-   /*
-* 32-bit:
-*
-*   Valid to do another page fault here, because if this fault
-*   had been triggered by is_prefetch fixup_exception would have
-*   handled it.
-*
-* 64-bit:
-*
-*   Hall of shame of CPU/BIOS bugs.
-*/
-   if (is_prefetch(regs, error_code, address))
-   return;
-
/*
 * Buggy firmware could access regions which might page fault, try to
 * recover from such faults.
@@ -762,7 +715,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 
show_fault_oops(regs, error_code, address);
 
-   if (task_stack_end_corrupted(tsk))
+   if (task_stack_end_corrupted(current))
printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
 
sig = SIGKILL;
@@ -775,6 +728,61 @@ no_context(struct pt_regs *regs, unsigned long error_code,
oops_end(flags, regs, sig);
 }
 
+static noinline void
+no_context(struct pt_regs *regs, unsigned long error_code,
+  unsigned long address, int signal, int si_code)
+{
+   if (user_mode(regs)) {
+   /*
+* This is an implicit supervisor-mode access from user
+* mode.  Bypass all the kernel-mode recovery code and just
+* OOPS.
+*/
+   goto oops;
+   }
+
+   /* Are we prepared to handle this kernel fault? */
+   if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
+   /*
+* Any interrupt that ta

[PATCH v2 06/14] x86/fault: Correct a few user vs kernel checks wrt WRUSS

In general, page fault errors for WRUSS should be just like get_user(),
etc.  Fix three bugs in this area:

There is a comment that says that, if the kernel can't handle a page fault
on a user address due to OOM, the OOM-kill-and-retry logic would be
skipped.  The code checked kernel *privilege*, not kernel mode, so it
missed WRUSS.  This means that the kernel would malfunction if it got OOM
on a WRUSS fault -- this would be a kernel-mode, user-privilege fault, and
the OOM killer would be invoked and the handler would retry the faulting
instruction.

A failed user access from kernel while a fatal signal is pending should
fail even if the instruction in question was WRUSS.

do_sigbus() should not send SIGBUS for WRUSS -- it should handle it like
any other kernel mode failure.

Cc: Dave Hansen 
Cc: Peter Zijlstra 
Signed-off-by: Andy Lutomirski 
---
 arch/x86/mm/fault.c | 15 +++
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 013910b7b93f..b1104844260d 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -945,7 +945,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, 
unsigned long address,
  vm_fault_t fault)
 {
/* Kernel mode? Handle exceptions or die: */
-   if (!(error_code & X86_PF_USER)) {
+   if (!user_mode(regs)) {
no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
return;
}
@@ -1217,7 +1217,14 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long 
hw_error_code,
 }
 NOKPROBE_SYMBOL(do_kern_addr_fault);
 
-/* Handle faults in the user portion of the address space */
+/*
+ * Handle faults in the user portion of the address space.  Nothing in here
+ * should check X86_PF_USER without a specific justification: for almost
+ * all purposes, we should treat a normal kernel access to user memory
+ * (e.g. get_user(), put_user(), etc.) the same as the WRUSS instruction.
+ * The one exception is AC flag handling, which is, per the x86
+ * architecture, special for WRUSS.
+ */
 static inline
 void do_user_addr_fault(struct pt_regs *regs,
unsigned long error_code,
@@ -1406,14 +1413,14 @@ void do_user_addr_fault(struct pt_regs *regs,
if (likely(!(fault & VM_FAULT_ERROR)))
return;
 
-   if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) {
+   if (fatal_signal_pending(current) && !user_mode(regs)) {
no_context(regs, error_code, address, 0, 0);
return;
}
 
if (fault & VM_FAULT_OOM) {
/* Kernel mode? Handle exceptions or die: */
-   if (!(error_code & X86_PF_USER)) {
+   if (!user_mode(regs)) {
no_context(regs, error_code, address,
   SIGSEGV, SEGV_MAPERR);
return;
-- 
2.29.2

[PATCH v2 07/14] x86/fault: Improve kernel-executing-user-memory handling

Right now we treat the case of the kernel trying to execute from user
memory more or less just like the kernel getting a page fault on a user
access.  In the failure path, we check for erratum #93, try to otherwise
fix up the error, and then oops.

If we manage to jump to the user address space, with or without SMEP, we
should not try to resolve the page fault.  This is an error, pure and
simple.  Rearrange the code so that we catch this case early, check for
erratum #93, and bail out.

Cc: Dave Hansen 
Cc: Peter Zijlstra 
Signed-off-by: Andy Lutomirski 
---
 arch/x86/mm/fault.c | 21 ++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index b1104844260d..cbb1a9754473 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -447,6 +447,9 @@ static int is_errata93(struct pt_regs *regs, unsigned long 
address)
|| boot_cpu_data.x86 != 0xf)
return 0;
 
+   if (user_mode(regs))
+   return 0;
+
if (address != regs->ip)
return 0;
 
@@ -744,9 +747,6 @@ no_context(struct pt_regs *regs, unsigned long error_code,
if (is_prefetch(regs, error_code, address))
return;
 
-   if (is_errata93(regs, address))
-   return;
-
/*
 * Buggy firmware could access regions which might page fault, try to
 * recover from such faults.
@@ -1239,6 +1239,21 @@ void do_user_addr_fault(struct pt_regs *regs,
tsk = current;
mm = tsk->mm;
 
+   if (unlikely((error_code & (X86_PF_USER | X86_PF_INSTR)) == 
X86_PF_INSTR)) {
+   /*
+* Whoops, this is kernel mode code trying to execute from
+* user memory.  Unless this is AMD erratum #93, which
+* corrupts RIP such that it looks like a user address,
+* this is unrecoverable.  Don't even try to look up the
+* VMA.
+*/
+   if (is_errata93(regs, address))
+   return;
+
+   bad_area_nosemaphore(regs, error_code, address);
+   return;
+   }
+
/* kprobes don't want to hook the spurious faults: */
if (unlikely(kprobe_page_fault(regs, X86_TRAP_PF)))
return;
-- 
2.29.2

[PATCH v2 03/14] x86/fault: Fold mm_fault_error() into do_user_addr_fault()

mm_fault_error() is logically just the end of do_user_addr_fault().
Combine the functions.  This makes the code easier to read.

Most of the churn here is from renaming hw_error_code to error_code in
do_user_addr_fault().

This makes no difference at all to the generated code (objdump -dr) as
compared to changing noinline to __always_inline in the definition of
mm_fault_error().

Cc: Dave Hansen 
Cc: Peter Zijlstra 
Signed-off-by: Andy Lutomirski 
---
 arch/x86/mm/fault.c | 97 +
 1 file changed, 45 insertions(+), 52 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 818902b08c52..91cf7a672c04 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -981,40 +981,6 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, 
unsigned long address,
force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
 }
 
-static noinline void
-mm_fault_error(struct pt_regs *regs, unsigned long error_code,
-  unsigned long address, vm_fault_t fault)
-{
-   if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) {
-   no_context(regs, error_code, address, 0, 0);
-   return;
-   }
-
-   if (fault & VM_FAULT_OOM) {
-   /* Kernel mode? Handle exceptions or die: */
-   if (!(error_code & X86_PF_USER)) {
-   no_context(regs, error_code, address,
-  SIGSEGV, SEGV_MAPERR);
-   return;
-   }
-
-   /*
-* We ran out of memory, call the OOM killer, and return the
-* userspace (which will retry the fault, or kill us if we got
-* oom-killed):
-*/
-   pagefault_out_of_memory();
-   } else {
-   if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
-VM_FAULT_HWPOISON_LARGE))
-   do_sigbus(regs, error_code, address, fault);
-   else if (fault & VM_FAULT_SIGSEGV)
-   bad_area_nosemaphore(regs, error_code, address);
-   else
-   BUG();
-   }
-}
-
 static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte)
 {
if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
@@ -1252,7 +1218,7 @@ NOKPROBE_SYMBOL(do_kern_addr_fault);
 /* Handle faults in the user portion of the address space */
 static inline
 void do_user_addr_fault(struct pt_regs *regs,
-   unsigned long hw_error_code,
+   unsigned long error_code,
unsigned long address)
 {
struct vm_area_struct *vma;
@@ -1272,8 +1238,8 @@ void do_user_addr_fault(struct pt_regs *regs,
 * Reserved bits are never expected to be set on
 * entries in the user portion of the page tables.
 */
-   if (unlikely(hw_error_code & X86_PF_RSVD))
-   pgtable_bad(regs, hw_error_code, address);
+   if (unlikely(error_code & X86_PF_RSVD))
+   pgtable_bad(regs, error_code, address);
 
/*
 * If SMAP is on, check for invalid kernel (supervisor) access to user
@@ -1283,10 +1249,10 @@ void do_user_addr_fault(struct pt_regs *regs,
 * enforcement appears to be consistent with the USER bit.
 */
if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) &&
-!(hw_error_code & X86_PF_USER) &&
+!(error_code & X86_PF_USER) &&
 !(regs->flags & X86_EFLAGS_AC)))
{
-   bad_area_nosemaphore(regs, hw_error_code, address);
+   bad_area_nosemaphore(regs, error_code, address);
return;
}
 
@@ -1295,7 +1261,7 @@ void do_user_addr_fault(struct pt_regs *regs,
 * in a region with pagefaults disabled then we must not take the fault
 */
if (unlikely(faulthandler_disabled() || !mm)) {
-   bad_area_nosemaphore(regs, hw_error_code, address);
+   bad_area_nosemaphore(regs, error_code, address);
return;
}
 
@@ -1316,9 +1282,9 @@ void do_user_addr_fault(struct pt_regs *regs,
 
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
 
-   if (hw_error_code & X86_PF_WRITE)
+   if (error_code & X86_PF_WRITE)
flags |= FAULT_FLAG_WRITE;
-   if (hw_error_code & X86_PF_INSTR)
+   if (error_code & X86_PF_INSTR)
flags |= FAULT_FLAG_INSTRUCTION;
 
 #ifdef CONFIG_X86_64
@@ -1334,7 +1300,7 @@ void do_user_addr_fault(struct pt_regs *regs,
 * to consider the PF_PK bit.
 */
if (is_vsyscall_vaddr(address)) {
-   if (emulate_vsyscall(hw_error_code, regs, address))
+   if (emulate_vsyscall(error_code, regs, address))
return;
}
 #endif
@@ -1357,7 +1323,7 @@ void do_user_addr_fault(struct pt_regs *regs,

[PATCH v2 04/14] x86/fault/32: Move is_f00f_bug() to do_kern_addr_fault()

bad_area() and its relatives are called from many places in fault.c, and
exactly one of them wants the F00F workaround.

__bad_area_nosemaphore() no longer contains any kernel fault code, which
prepares for further cleanups.

Cc: Dave Hansen 
Cc: Peter Zijlstra 
Signed-off-by: Andy Lutomirski 
---
 arch/x86/mm/fault.c | 12 +++-
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 91cf7a672c04..3ffed003f281 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -482,10 +482,12 @@ static int is_errata100(struct pt_regs *regs, unsigned 
long address)
 }
 
 /* Pentium F0 0F C7 C8 bug workaround: */
-static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
+static int is_f00f_bug(struct pt_regs *regs, unsigned long error_code,
+  unsigned long address)
 {
 #ifdef CONFIG_X86_F00F_BUG
-   if (boot_cpu_has_bug(X86_BUG_F00F) && idt_is_f00f_address(address)) {
+   if (boot_cpu_has_bug(X86_BUG_F00F) && !(error_code & X86_PF_USER) &&
+   idt_is_f00f_address(address)) {
handle_invalid_op(regs);
return 1;
}
@@ -853,9 +855,6 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long 
error_code,
return;
}
 
-   if (is_f00f_bug(regs, address))
-   return;
-
no_context(regs, error_code, address, SIGSEGV, si_code);
 }
 
@@ -1195,6 +1194,9 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long 
hw_error_code,
}
 #endif
 
+   if (is_f00f_bug(regs, hw_error_code, address))
+   return;
+
/* Was the fault spurious, caused by lazy TLB invalidation? */
if (spurious_kernel_fault(hw_error_code, address))
return;
-- 
2.29.2

[PATCH v2 02/14] x86/fault: Skip the AMD erratum #91 workaround on unaffected CPUs

According to the Revision Guide for AMD Athlon™ 64 and AMD Opteron™
Processors, only early revisions of family 0xF are affected.  This will
avoid unnecessarily fetching instruction bytes before sending SIGSEGV to
user programs.

Signed-off-by: Andy Lutomirski 
---
 arch/x86/mm/fault.c | 13 +
 1 file changed, 13 insertions(+)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 441c3e9b8971..818902b08c52 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -106,6 +106,15 @@ check_prefetch_opcode(struct pt_regs *regs, unsigned char 
*instr,
}
 }
 
+static bool is_amd_k8_pre_npt(void)
+{
+   struct cpuinfo_x86 *c = &boot_cpu_data;
+
+   return unlikely(IS_ENABLED(CONFIG_CPU_SUP_AMD) &&
+   c->x86_vendor == X86_VENDOR_AMD &&
+   c->x86 == 0xf && c->x86_model < 0x40);
+}
+
 static int
 is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
 {
@@ -113,6 +122,10 @@ is_prefetch(struct pt_regs *regs, unsigned long 
error_code, unsigned long addr)
unsigned char *instr;
int prefetch = 0;
 
+   /* Erratum #91 affects AMD K8, pre-NPT CPUs */
+   if (!is_amd_k8_pre_npt())
+   return 0;
+
/*
 * If it was a exec (instruction fetch) fault on NX page, then
 * do not ignore the fault:
-- 
2.29.2

[PATCH v2 05/14] x86/fault: Document the locking in the fault_signal_pending() path

If fault_signal_pending() returns true, then the core mm has unlocked the
mm for us.  Add a comment to help future readers of this code.

Cc: Dave Hansen 
Cc: Peter Zijlstra 
Signed-off-by: Andy Lutomirski 
---
 arch/x86/mm/fault.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 3ffed003f281..013910b7b93f 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1380,8 +1380,11 @@ void do_user_addr_fault(struct pt_regs *regs,
 */
fault = handle_mm_fault(vma, address, flags, regs);
 
-   /* Quick path to respond to signals */
if (fault_signal_pending(fault, regs)) {
+   /*
+* Quick path to respond to signals.  The core mm code
+* has unlocked the mm for us if we get here.
+*/
if (!user_mode(regs))
no_context(regs, error_code, address, SIGBUS,
   BUS_ADRERR);
-- 
2.29.2

[PATCH v2 01/14] x86/fault: Fix AMD erratum #91 errata fixup for user code

The recent rework of probe_kernel_address() and its conversion to
get_kernel_nofault() inadvertently broke is_prefetch(). Before this change,
probe_kernel_address() was used as a sloppy "read user or kernel memory"
helper, but it doesn't do that any more.  The new get_kernel_nofault()
reads *kernel* memory only, which completely broke is_prefetch() for user
access.

Adjust the code to use the correct accessor based on access mode.  The
manual address bounds check is no longer necessary, since the accessor
helpers (get_user() / get_kernel_nofault()) do the right thing all by
themselves.  As a bonus, by using the correct accessor, we don't need the
open-coded address bounds check.

Fixes: eab0c6089b68 ("maccess: unify the probe kernel arch hooks")
Cc: sta...@vger.kernel.org
Cc: Dave Hansen 
Cc: Peter Zijlstra 
Cc: Christoph Hellwig 
Cc: Alexei Starovoitov 
Cc: Daniel Borkmann 
Cc: Masami Hiramatsu 
Signed-off-by: Andy Lutomirski 
---
 arch/x86/mm/fault.c | 27 +--
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index f1f1b5a0956a..441c3e9b8971 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -54,7 +54,7 @@ kmmio_fault(struct pt_regs *regs, unsigned long addr)
  * 32-bit mode:
  *
  *   Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
- *   Check that here and ignore it.
+ *   Check that here and ignore it.  This is AMD erratum #91.
  *
  * 64-bit mode:
  *
@@ -83,11 +83,7 @@ check_prefetch_opcode(struct pt_regs *regs, unsigned char 
*instr,
 #ifdef CONFIG_X86_64
case 0x40:
/*
-* In AMD64 long mode 0x40..0x4F are valid REX prefixes
-* Need to figure out under what instruction mode the
-* instruction was issued. Could check the LDT for lm,
-* but for now it's good enough to assume that long
-* mode only uses well known segments or kernel.
+* In 64-bit mode 0x40..0x4F are valid REX prefixes
 */
return (!user_mode(regs) || user_64bit_mode(regs));
 #endif
@@ -127,20 +123,31 @@ is_prefetch(struct pt_regs *regs, unsigned long 
error_code, unsigned long addr)
instr = (void *)convert_ip_to_linear(current, regs);
max_instr = instr + 15;
 
-   if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE_MAX)
-   return 0;
+   /*
+* This code has historically always bailed out if IP points to a
+* not-present page (e.g. due to a race).  No one has ever
+* complained about this.
+*/
+   pagefault_disable();
 
while (instr < max_instr) {
unsigned char opcode;
 
-   if (get_kernel_nofault(opcode, instr))
-   break;
+   if (user_mode(regs)) {
+   if (get_user(opcode, instr))
+   break;
+   } else {
+   if (get_kernel_nofault(opcode, instr))
+   break;
+   }
 
instr++;
 
if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
break;
}
+
+   pagefault_enable();
return prefetch;
 }
 
-- 
2.29.2

[PATCH v2 00/14] x86/fault: #PF improvements, mostly related to USER bit

This series is a whole bunch of page fault cleanups, plus a couple
of OOPS diagnostic improvements.  The overall goals are to clean up
handling of the faulting CPL, the USER bit in the error_code, and
the log messages generated by #PF OOPSes.

This series can also be seen as CET preparation.  CET introduces the
WRUSS instruction, which is the very first way for CPL 0 code to
cause a #PF fault with the USER bit set.  Let's get the page fault
code into shape before we start using WRUSS :)

Changes from v1:
 - Various changelog improvements.
 - Reorder patches (SMAP moved after SMEP)
 - Add the efi_recover_from_page_fault() patch
 - Tidy up and improve the AMD erratum detection code

Andy Lutomirski (14):
  x86/fault: Fix AMD erratum #91 errata fixup for user code
  x86/fault: Skip the AMD erratum #91 workaround on unaffected CPUs
  x86/fault: Fold mm_fault_error() into do_user_addr_fault()
  x86/fault/32: Move is_f00f_bug() to do_kern_addr_fault()
  x86/fault: Document the locking in the fault_signal_pending() path
  x86/fault: Correct a few user vs kernel checks wrt WRUSS
  x86/fault: Improve kernel-executing-user-memory handling
  x86/fault: Skip erratum #93 workaround on new CPUs
  x86/fault: Split the OOPS code out from no_context()
  x86/fault: Bypass no_context() for implicit kernel faults from
usermode
  x86/fault: Rename no_context() to kernelmode_fixup_or_oops()
  x86/fault: Don't look for extable entries for SMEP violations
  x86/fault: Don't run fixups for SMAP violations
  x86/fault, x86/efi: Fix and rename efi_recover_from_page_fault()

 arch/x86/include/asm/efi.h |   2 +-
 arch/x86/mm/fault.c| 380 +++--
 arch/x86/platform/efi/quirks.c |  16 +-
 3 files changed, 227 insertions(+), 171 deletions(-)

-- 
2.29.2

RE: [PATCH] perf tools: Fix arm64 build error with gcc-11

2021-02-09 Thread Jianlin Lv



> -Original Message-
> From: Leo Yan 
> Sent: Tuesday, February 9, 2021 8:17 PM
> To: Jianlin Lv 
> Cc: john.ga...@huawei.com; w...@kernel.org; mathieu.poir...@linaro.org;
> pet...@infradead.org; mi...@redhat.com; a...@kernel.org; Mark Rutland
> ; alexander.shish...@linux.intel.com;
> jo...@redhat.com; namhy...@kernel.org; linux-arm-
> ker...@lists.infradead.org; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH] perf tools: Fix arm64 build error with gcc-11
> 
> Hi Jianlin,
> 
> On Tue, Feb 09, 2021 at 07:33:57PM +0800, Jianlin Lv wrote:
> > gcc version: 11.0.0 20210208 (experimental) (GCC)
> >
> > Following build error on arm64:
> >
> > ...
> > In function ‘printf’,
> > inlined from ‘regs_dump__printf’ at util/session.c:1141:3,
> > inlined from ‘regs__printf’ at util/session.c:1169:2:
> > /usr/include/aarch64-linux-gnu/bits/stdio2.h:107:10: \
> >   error: ‘%-5s’ directive argument is null [-Werror=format-overflow=]
> >
> > 107 |   return __printf_chk (__USE_FORTIFY_LEVEL - 1, __fmt, \
> > __va_arg_pack ());
> >
> > ..
> > In function ‘fprintf’,
> >   inlined from ‘perf_sample__fprintf_regs.isra’ at \
> > builtin-script.c:622:14:
> > /usr/include/aarch64-linux-gnu/bits/stdio2.h:100:10: \
> > error: ‘%5s’ directive argument is null [-Werror=format-overflow=]
> >   100 |   return __fprintf_chk (__stream, __USE_FORTIFY_LEVEL - 1, __fmt,
> >   101 | __va_arg_pack ());
> >
> > cc1: all warnings being treated as errors ...
> >
> > This patch fixes Wformat-overflow warnings by replacing the return
> > value NULL of perf_reg_name with "unknown".
> >
> > Signed-off-by: Jianlin Lv 
> > ---
> >  tools/perf/arch/arm64/include/perf_regs.h | 4 ++--
> >  1 file changed, 2 insertions(+), 2 deletions(-)
> >
> > diff --git a/tools/perf/arch/arm64/include/perf_regs.h
> > b/tools/perf/arch/arm64/include/perf_regs.h
> > index baaa5e64a3fb..901419f907c0 100644
> > --- a/tools/perf/arch/arm64/include/perf_regs.h
> > +++ b/tools/perf/arch/arm64/include/perf_regs.h
> > @@ -85,10 +85,10 @@ static inline const char *perf_reg_name(int id)
> > case PERF_REG_ARM64_PC:
> > return "pc";
> > default:
> > -   return NULL;
> > +   return "unknown";
> > }
> >
> > -   return NULL;
> > +   return "unknown";
> 
> This issue is a common issue crossing all archs.  So it's better to change the
> code in the places where calls perf_reg_name(), e.g. in
> util/session.c:
> 
> --- a/tools/perf/util/session.c
> +++ b/tools/perf/util/session.c
> @@ -1135,12 +1135,14 @@ static void branch_stack__printf(struct
> perf_sample *sample, bool callstack)  static void regs_dump__printf(u64
> mask, u64 *regs)  {
> unsigned rid, i = 0;
> +   char *reg_name;
> 
> for_each_set_bit(rid, (unsigned long *) &mask, sizeof(mask) * 8) {
> u64 val = regs[i++];
> 
> +   reg_name = perf_reg_name(rid);
> printf(" %-5s 0x%016" PRIx64 "\n",
> -  perf_reg_name(rid), val);
> +  reg_name ?: "Unknown", val);
> }
>  }
> 

Thanks for your comments, I will send a v2 of the patch today.

Jianlin


> And another potential issue is the format specifier "%-5s", it prints out
> maximum to 5 chars, but actually string "Unknown" has 7 chars.
> Actually the format specifier breaks other archs register names, e.g.
> [1][2], seems to me, it's better to change as "%-8s", you might need to use a
> dedicated patch for format specifier changes.
> 
> Thanks,
> Leo
> 
> 
> [1]
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/
> perf/arch/powerpc/include/perf_regs.h#n57
> [2]
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/
> perf/arch/csky/include/perf_regs.h#n83

Re: [PATCH] ext4: add .kunitconfig fragment to enable ext4-specific tests

2021-02-09 Thread Theodore Ts'o

On Tue, Feb 09, 2021 at 05:32:06PM -0800, Daniel Latypov wrote:
> 
> After [2]:
>   $ ./tools/testing/kunit.py run --kunitconfig=fs/ext4/.kunitconfig

Any chance that in the future this might become:

$ ./tools/testing/kunit.py run --kunitconfig=fs/ext4

Or better yet, syntactic sugar like:

$ ./tools/testing/kunit.py test fs/ext4

would be really nice.

- Ted

Re: [PATCH v1] vdpa/mlx5: Restore the hardware used index after change map

2021-02-09 Thread Si-Wei Liu





On 2/8/2021 10:37 PM, Jason Wang wrote:


On 2021/2/9 下午2:12, Eli Cohen wrote:

On Tue, Feb 09, 2021 at 11:20:14AM +0800, Jason Wang wrote:

On 2021/2/8 下午6:04, Eli Cohen wrote:

On Mon, Feb 08, 2021 at 05:04:27PM +0800, Jason Wang wrote:

On 2021/2/8 下午2:37, Eli Cohen wrote:

On Mon, Feb 08, 2021 at 12:27:18PM +0800, Jason Wang wrote:

On 2021/2/6 上午7:07, Si-Wei Liu wrote:

On 2/3/2021 11:36 PM, Eli Cohen wrote:
When a change of memory map occurs, the hardware resources are destroyed
and then re-created again with the new memory map. In such case, we need
to restore the hardware available and used indices. The driver failed to
restore the used index which is added here.

Also, since the driver also fails to reset the available and used
indices upon device reset, fix this here to avoid regression caused by
the fact that used index may not be zero upon device reset.

Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 devices")
Signed-off-by: Eli Cohen
---
v0 -> v1:
Clear indices upon device reset

     drivers/vdpa/mlx5/net/mlx5_vnet.c | 18 ++
     1 file changed, 18 insertions(+)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 88dde3455bfd..b5fe6d2ad22f 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -87,6 +87,7 @@ struct mlx5_vq_restore_info {
     u64 device_addr;
     u64 driver_addr;
     u16 avail_index;
+    u16 used_index;
     bool ready;
     struct vdpa_callback cb;
     bool restore;
@@ -121,6 +122,7 @@ struct mlx5_vdpa_virtqueue {
     u32 virtq_id;
     struct mlx5_vdpa_net *ndev;
     u16 avail_idx;
+    u16 used_idx;
     int fw_state;
       /* keep last in the struct */
@@ -804,6 +806,7 @@ static int create_virtqueue(struct 
mlx5_vdpa_net

*ndev, struct mlx5_vdpa_virtque
       obj_context = MLX5_ADDR_OF(create_virtio_net_q_in, in,
obj_context);
     MLX5_SET(virtio_net_q_object, obj_context, 
hw_available_index,

mvq->avail_idx);
+    MLX5_SET(virtio_net_q_object, obj_context, hw_used_index,
mvq->used_idx);
     MLX5_SET(virtio_net_q_object, obj_context,
queue_feature_bit_mask_12_3,
get_features_12_3(ndev->mvdev.actual_features));
     vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context,
virtio_q_context);
@@ -1022,6 +1025,7 @@ static int connect_qps(struct mlx5_vdpa_net
*ndev, struct mlx5_vdpa_virtqueue *m
     struct mlx5_virtq_attr {
     u8 state;
     u16 available_index;
+    u16 used_index;
     };
       static int query_virtqueue(struct mlx5_vdpa_net *ndev, 
struct

mlx5_vdpa_virtqueue *mvq,
@@ -1052,6 +1056,7 @@ static int query_virtqueue(struct
mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueu
     memset(attr, 0, sizeof(*attr));
     attr->state = MLX5_GET(virtio_net_q_object, 
obj_context, state);

     attr->available_index = MLX5_GET(virtio_net_q_object,
obj_context, hw_available_index);
+    attr->used_index = MLX5_GET(virtio_net_q_object, 
obj_context,

hw_used_index);
     kfree(out);
     return 0;
     @@ -1535,6 +1540,16 @@ static void 
teardown_virtqueues(struct

mlx5_vdpa_net *ndev)
     }
     }
     +static void clear_virtqueues(struct mlx5_vdpa_net *ndev)
+{
+    int i;
+
+    for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {
+    ndev->vqs[i].avail_idx = 0;
+    ndev->vqs[i].used_idx = 0;
+    }
+}
+
     /* TODO: cross-endian support */
     static inline bool mlx5_vdpa_is_little_endian(struct 
mlx5_vdpa_dev

*mvdev)
     {
@@ -1610,6 +1625,7 @@ static int save_channel_info(struct
mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqu
     return err;
       ri->avail_index = attr.available_index;
+    ri->used_index = attr.used_index;
     ri->ready = mvq->ready;
     ri->num_ent = mvq->num_ent;
     ri->desc_addr = mvq->desc_addr;
@@ -1654,6 +1670,7 @@ static void restore_channels_info(struct
mlx5_vdpa_net *ndev)
     continue;
       mvq->avail_idx = ri->avail_index;
+    mvq->used_idx = ri->used_index;
     mvq->ready = ri->ready;
     mvq->num_ent = ri->num_ent;
     mvq->desc_addr = ri->desc_addr;
@@ -1768,6 +1785,7 @@ static void mlx5_vdpa_set_status(struct
vdpa_device *vdev, u8 status)
     if (!status) {
     mlx5_vdpa_info(mvdev, "performing device reset\n");
     teardown_driver(ndev);
+    clear_virtqueues(ndev);
The clearing looks fine at the first glance, as it aligns with the other
state cleanups floating around at the same place. However, the thing is
get_vq_state() is supposed to be called right after to get sync'ed with
the latest internal avail_index from device while vq is stopped. The
index was saved in the driver software at vq suspension, but before the
virtq object is destroyed. We shouldn't clear the avail_index too early.

Good point.

There's a limitation on the virtio spec and vDPA framework that 
we can not

simply dif

Re: [PATCH 2/6] dt-bindings: clk: mstar msc313 mpll binding description