dw_mmc: IDMAC Invalidate cache after read
CPU may not see most up-to-date and correct copy of DMA buffer, when internal DMA controller is in use. Problem appears on The Altera SoC FPGA (uses integrated DMA controller), during higher CPU and system memory load Signed-off-by: Jan Jablonsky --- drivers/mmc/host/dw_mmc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c index 80dc2fd..63873d9 100644 --- a/drivers/mmc/host/dw_mmc.c +++ b/drivers/mmc/host/dw_mmc.c @@ -499,8 +499,7 @@ static void dw_mci_dmac_complete_dma(void *arg) dev_vdbg(host->dev, "DMA complete\n"); - if ((host->use_dma == TRANS_MODE_EDMAC) && - data && (data->flags & MMC_DATA_READ)) + if (data && (data->flags & MMC_DATA_READ)) /* Invalidate cache after read */ dma_sync_sg_for_cpu(mmc_dev(host->slot->mmc), data->sg,
dw_mmc: IDMAC Invalidate cache after read
CPU may not see most up-to-date and correct copy of DMA buffer, when internal DMA controller is in use. Problem appears on The Altera SoC FPGA (uses integrated DMA controller), during higher CPU and system memory load Signed-off-by: Jan Jablonsky --- drivers/mmc/host/dw_mmc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c index 80dc2fd..63873d9 100644 --- a/drivers/mmc/host/dw_mmc.c +++ b/drivers/mmc/host/dw_mmc.c @@ -499,8 +499,7 @@ static void dw_mci_dmac_complete_dma(void *arg) dev_vdbg(host->dev, "DMA complete\n"); - if ((host->use_dma == TRANS_MODE_EDMAC) && - data && (data->flags & MMC_DATA_READ)) + if (data && (data->flags & MMC_DATA_READ)) /* Invalidate cache after read */ dma_sync_sg_for_cpu(mmc_dev(host->slot->mmc), data->sg,
perf tools: remove option --tail-synthesize ?
Hi, I found perf-record --tail-synthesize without --overwrite breaks symbols for perf-script, perf-report, etc. For example: [root@]# ~/perf record -ag --tail-synthesize -- sleep 1 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 1.129 MB perf.data (3531 samples) ] [root@]# ~/perf script | head swapper 0 [000] 1250675.051971: 1 cycles:ppp: 81009e15 [unknown] ([unknown]) 81196b19 [unknown] ([unknown]) 81196579 [unknown] ([unknown]) 81110ca7 [unknown] ([unknown]) 81a01f4a [unknown] ([unknown]) 81a017bf [unknown] ([unknown]) 8180e17a [unknown] ([unknown]) perf-record with --overwrite does NOT have this issue. After digging into this, I found this issue is introduced by commit a73e24d240bc136619d382b1268f34d75c9d25ce. Reverting this commit does fix this issue. However, on a second thought, I feel it is probably better just drop --tail-synthesize, as it doesn't make much sense without --overwrite. All we need is to do tail_synthesize when --overwrite is set. Thoughts? Thanks, Song
perf tools: remove option --tail-synthesize ?
Hi, I found perf-record --tail-synthesize without --overwrite breaks symbols for perf-script, perf-report, etc. For example: [root@]# ~/perf record -ag --tail-synthesize -- sleep 1 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 1.129 MB perf.data (3531 samples) ] [root@]# ~/perf script | head swapper 0 [000] 1250675.051971: 1 cycles:ppp: 81009e15 [unknown] ([unknown]) 81196b19 [unknown] ([unknown]) 81196579 [unknown] ([unknown]) 81110ca7 [unknown] ([unknown]) 81a01f4a [unknown] ([unknown]) 81a017bf [unknown] ([unknown]) 8180e17a [unknown] ([unknown]) perf-record with --overwrite does NOT have this issue. After digging into this, I found this issue is introduced by commit a73e24d240bc136619d382b1268f34d75c9d25ce. Reverting this commit does fix this issue. However, on a second thought, I feel it is probably better just drop --tail-synthesize, as it doesn't make much sense without --overwrite. All we need is to do tail_synthesize when --overwrite is set. Thoughts? Thanks, Song
[PATCH 4/4] perf intel-pt/bts: fix potential NULL pointer dereference in intel_bts_process_auxtrace_info
This patch fixes a possible null pointer dereference in intel_bts_process_auxtrace_info, detected by the semantic patch deref_null.cocci, with the following warning: ./tools/perf/util/intel-bts.c:921:32-49: ERROR: session -> itrace_synth_opts is NULL but dereferenced. Signed-off-by: Wen Yang Reviewed-by: Tan Hu CC: Julia Lawall --- tools/perf/util/intel-bts.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/intel-bts.c b/tools/perf/util/intel-bts.c index 7b27d77..b0258f4 100644 --- a/tools/perf/util/intel-bts.c +++ b/tools/perf/util/intel-bts.c @@ -917,7 +917,8 @@ int intel_bts_process_auxtrace_info(union perf_event *event, if (session->itrace_synth_opts && session->itrace_synth_opts->set) { bts->synth_opts = *session->itrace_synth_opts; } else { - itrace_synth_opts__set_default(>synth_opts, + if (session->itrace_synth_opts) + itrace_synth_opts__set_default(>synth_opts, session->itrace_synth_opts->default_no_sample); if (session->itrace_synth_opts) bts->synth_opts.thread_stack = -- 2.9.5
[PATCH 4/4] perf intel-pt/bts: fix potential NULL pointer dereference in intel_bts_process_auxtrace_info
This patch fixes a possible null pointer dereference in intel_bts_process_auxtrace_info, detected by the semantic patch deref_null.cocci, with the following warning: ./tools/perf/util/intel-bts.c:921:32-49: ERROR: session -> itrace_synth_opts is NULL but dereferenced. Signed-off-by: Wen Yang Reviewed-by: Tan Hu CC: Julia Lawall --- tools/perf/util/intel-bts.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/intel-bts.c b/tools/perf/util/intel-bts.c index 7b27d77..b0258f4 100644 --- a/tools/perf/util/intel-bts.c +++ b/tools/perf/util/intel-bts.c @@ -917,7 +917,8 @@ int intel_bts_process_auxtrace_info(union perf_event *event, if (session->itrace_synth_opts && session->itrace_synth_opts->set) { bts->synth_opts = *session->itrace_synth_opts; } else { - itrace_synth_opts__set_default(>synth_opts, + if (session->itrace_synth_opts) + itrace_synth_opts__set_default(>synth_opts, session->itrace_synth_opts->default_no_sample); if (session->itrace_synth_opts) bts->synth_opts.thread_stack = -- 2.9.5
[PATCH 1/4] perf intel-pt: fix potential NULL pointer dereference in intel_pt_process_auxtrace_info
This patch fixes a possible null pointer dereference in intel_pt_process_auxtrace_info, detected by the semantic patch deref_null.cocci, with the following warning: ./tools/perf/util/intel-pt.c:2579:32-49: ERROR: session -> itrace_synth_opts is NULL but dereferenced. Signed-off-by: Wen Yang Reviewed-by: Tan Hu CC: Julia Lawall --- tools/perf/util/intel-pt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index 149ff36..bac20e8 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -2575,7 +2575,8 @@ int intel_pt_process_auxtrace_info(union perf_event *event, if (session->itrace_synth_opts && session->itrace_synth_opts->set) { pt->synth_opts = *session->itrace_synth_opts; } else { - itrace_synth_opts__set_default(>synth_opts, + if (session->itrace_synth_opts) + itrace_synth_opts__set_default(>synth_opts, session->itrace_synth_opts->default_no_sample); if (use_browser != -1) { pt->synth_opts.branches = false; -- 2.9.5
[PATCH 1/4] perf intel-pt: fix potential NULL pointer dereference in intel_pt_process_auxtrace_info
This patch fixes a possible null pointer dereference in intel_pt_process_auxtrace_info, detected by the semantic patch deref_null.cocci, with the following warning: ./tools/perf/util/intel-pt.c:2579:32-49: ERROR: session -> itrace_synth_opts is NULL but dereferenced. Signed-off-by: Wen Yang Reviewed-by: Tan Hu CC: Julia Lawall --- tools/perf/util/intel-pt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index 149ff36..bac20e8 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -2575,7 +2575,8 @@ int intel_pt_process_auxtrace_info(union perf_event *event, if (session->itrace_synth_opts && session->itrace_synth_opts->set) { pt->synth_opts = *session->itrace_synth_opts; } else { - itrace_synth_opts__set_default(>synth_opts, + if (session->itrace_synth_opts) + itrace_synth_opts__set_default(>synth_opts, session->itrace_synth_opts->default_no_sample); if (use_browser != -1) { pt->synth_opts.branches = false; -- 2.9.5
Re: [PATCH v2] tee: optee: log message if dynamic shm is enabled
Hi Victor, On Tue, Sep 11, 2018 at 06:09:02PM +0900, Victor Chong wrote: > When dynamic shared memory support is enabled in the OP-TEE Trusted > OS, it doesn't mean that the driver supports it, which can confuse > users during debugging. Log a message when dynamic shared memory is > enabled in the driver, to let users know for sure. > > Suggested-by: Jerome Forissier > Signed-off-by: Victor Chong > Reviewed-by: Jerome Forissier > --- > > v2: > * Added patch description > > [v1] lore.kernel.org/patchwork/patch/983949/ > > drivers/tee/optee/core.c | 3 +++ > 1 file changed, 3 insertions(+) > > diff --git a/drivers/tee/optee/core.c b/drivers/tee/optee/core.c > index e1aafe842d66..cca5c091e55a 100644 > --- a/drivers/tee/optee/core.c > +++ b/drivers/tee/optee/core.c > @@ -631,6 +631,9 @@ static struct optee *optee_probe(struct device_node *np) > > optee_enable_shm_cache(optee); > > + if (optee->sec_caps & OPTEE_SMC_SEC_CAP_DYNAMIC_SHM) > + pr_info("dynamic shared memory is enabled\n"); > + > pr_info("initialized driver\n"); > return optee; > err: > -- > 2.17.1 Looks good. I'm picking this up. Thanks, Jens
Re: [PATCH v2] tee: optee: log message if dynamic shm is enabled
Hi Victor, On Tue, Sep 11, 2018 at 06:09:02PM +0900, Victor Chong wrote: > When dynamic shared memory support is enabled in the OP-TEE Trusted > OS, it doesn't mean that the driver supports it, which can confuse > users during debugging. Log a message when dynamic shared memory is > enabled in the driver, to let users know for sure. > > Suggested-by: Jerome Forissier > Signed-off-by: Victor Chong > Reviewed-by: Jerome Forissier > --- > > v2: > * Added patch description > > [v1] lore.kernel.org/patchwork/patch/983949/ > > drivers/tee/optee/core.c | 3 +++ > 1 file changed, 3 insertions(+) > > diff --git a/drivers/tee/optee/core.c b/drivers/tee/optee/core.c > index e1aafe842d66..cca5c091e55a 100644 > --- a/drivers/tee/optee/core.c > +++ b/drivers/tee/optee/core.c > @@ -631,6 +631,9 @@ static struct optee *optee_probe(struct device_node *np) > > optee_enable_shm_cache(optee); > > + if (optee->sec_caps & OPTEE_SMC_SEC_CAP_DYNAMIC_SHM) > + pr_info("dynamic shared memory is enabled\n"); > + > pr_info("initialized driver\n"); > return optee; > err: > -- > 2.17.1 Looks good. I'm picking this up. Thanks, Jens
Re: linux-next: manual merge of the mlx5-next tree with the rdma tree
On Wed, Nov 21, 2018 at 11:04:32AM +1100, Stephen Rothwell wrote: > Hi Leon, > > Today's linux-next merge of the mlx5-next tree got a conflict in: > > drivers/infiniband/hw/mlx5/main.c > > between commit: > > 9afc97c29b03 ("mlx5: remove support for ib_get_vector_affinity") > > from the rdma tree and commit: > > f2f3df550139 ("net/mlx5: EQ, Privatize eq_table and friends") > > from the mlx5-next tree. > > I fixed it up (the former removed some of the code modified by the latter) > and can carry the fix as necessary. This is now fixed as far as linux-next > is concerned, but any non trivial conflicts should be mentioned to your > upstream maintainer when your tree is submitted for merging. You may > also want to consider cooperating with the maintainer of the conflicting > tree to minimise any particularly complex conflicts. > Thanks Stephen, You are absolutely right, the removal is correct. > > > -- > Cheers, > Stephen Rothwell signature.asc Description: PGP signature
Re: linux-next: manual merge of the mlx5-next tree with the rdma tree
On Wed, Nov 21, 2018 at 11:04:32AM +1100, Stephen Rothwell wrote: > Hi Leon, > > Today's linux-next merge of the mlx5-next tree got a conflict in: > > drivers/infiniband/hw/mlx5/main.c > > between commit: > > 9afc97c29b03 ("mlx5: remove support for ib_get_vector_affinity") > > from the rdma tree and commit: > > f2f3df550139 ("net/mlx5: EQ, Privatize eq_table and friends") > > from the mlx5-next tree. > > I fixed it up (the former removed some of the code modified by the latter) > and can carry the fix as necessary. This is now fixed as far as linux-next > is concerned, but any non trivial conflicts should be mentioned to your > upstream maintainer when your tree is submitted for merging. You may > also want to consider cooperating with the maintainer of the conflicting > tree to minimise any particularly complex conflicts. > Thanks Stephen, You are absolutely right, the removal is correct. > > > -- > Cheers, > Stephen Rothwell signature.asc Description: PGP signature
Re: [RFC PATCH 3/3] mm, fault_around: do not take a reference to a locked page
On Tue 20-11-18 17:47:21, Hugh Dickins wrote: > On Tue, 20 Nov 2018, Michal Hocko wrote: > > > From: Michal Hocko > > > > filemap_map_pages takes a speculative reference to each page in the > > range before it tries to lock that page. While this is correct it > > also can influence page migration which will bail out when seeing > > an elevated reference count. The faultaround code would bail on > > seeing a locked page so we can pro-actively check the PageLocked > > bit before page_cache_get_speculative and prevent from pointless > > reference count churn. > > > > Cc: "Kirill A. Shutemov" > > Suggested-by: Jan Kara > > Signed-off-by: Michal Hocko > > Acked-by: Hugh Dickins Thanks! > though I think this patch is more useful to the avoid atomic ops, > and unnecessary dirtying of the cacheline, than to avoid the very > transient elevation of refcount, which will not affect page migration > very much. Are you sure it would really be transient? In other words is it possible that the fault around can block migration repeatedly under refault heavy workload? I just couldn't convince myself, to be honest. -- Michal Hocko SUSE Labs
Re: [RFC PATCH 3/3] mm, fault_around: do not take a reference to a locked page
On Tue 20-11-18 17:47:21, Hugh Dickins wrote: > On Tue, 20 Nov 2018, Michal Hocko wrote: > > > From: Michal Hocko > > > > filemap_map_pages takes a speculative reference to each page in the > > range before it tries to lock that page. While this is correct it > > also can influence page migration which will bail out when seeing > > an elevated reference count. The faultaround code would bail on > > seeing a locked page so we can pro-actively check the PageLocked > > bit before page_cache_get_speculative and prevent from pointless > > reference count churn. > > > > Cc: "Kirill A. Shutemov" > > Suggested-by: Jan Kara > > Signed-off-by: Michal Hocko > > Acked-by: Hugh Dickins Thanks! > though I think this patch is more useful to the avoid atomic ops, > and unnecessary dirtying of the cacheline, than to avoid the very > transient elevation of refcount, which will not affect page migration > very much. Are you sure it would really be transient? In other words is it possible that the fault around can block migration repeatedly under refault heavy workload? I just couldn't convince myself, to be honest. -- Michal Hocko SUSE Labs
Re: [RFC PATCH 3/3] mm, fault_around: do not take a reference to a locked page
On Tue 20-11-18 21:51:39, William Kucharski wrote: > > > > On Nov 20, 2018, at 7:12 AM, Michal Hocko wrote: > > > > + /* > > +* Check the locked pages before taking a reference to not > > +* go in the way of migration. > > +*/ > > Could you make this a tiny bit more explanative, something like: > > + /* > + * Check for a locked page first, as a speculative > + * reference may adversely influence page migration. > + */ sure > > Reviewed-by: William Kucharski Thanks! -- Michal Hocko SUSE Labs
Re: [RFC PATCH 3/3] mm, fault_around: do not take a reference to a locked page
On Tue 20-11-18 21:51:39, William Kucharski wrote: > > > > On Nov 20, 2018, at 7:12 AM, Michal Hocko wrote: > > > > + /* > > +* Check the locked pages before taking a reference to not > > +* go in the way of migration. > > +*/ > > Could you make this a tiny bit more explanative, something like: > > + /* > + * Check for a locked page first, as a speculative > + * reference may adversely influence page migration. > + */ sure > > Reviewed-by: William Kucharski Thanks! -- Michal Hocko SUSE Labs
Re: [RFC PATCH 1/3] mm, proc: be more verbose about unstable VMA flags in /proc//smaps
On Tue 20-11-18 10:32:07, Dan Williams wrote: > On Tue, Nov 20, 2018 at 2:35 AM Michal Hocko wrote: > > > > From: Michal Hocko > > > > Even though vma flags exported via /proc//smaps are explicitly > > documented to be not guaranteed for future compatibility the warning > > doesn't go far enough because it doesn't mention semantic changes to > > those flags. And they are important as well because these flags are > > a deep implementation internal to the MM code and the semantic might > > change at any time. > > > > Let's consider two recent examples: > > http://lkml.kernel.org/r/20181002100531.gc4...@quack2.suse.cz > > : commit e1fb4a086495 "dax: remove VM_MIXEDMAP for fsdax and device dax" has > > : removed VM_MIXEDMAP flag from DAX VMAs. Now our testing shows that in the > > : mean time certain customer of ours started poking into /proc//smaps > > : and looks at VMA flags there and if VM_MIXEDMAP is missing among the VMA > > : flags, the application just fails to start complaining that DAX support is > > : missing in the kernel. > > > > http://lkml.kernel.org/r/alpine.deb.2.21.1809241054050.224...@chino.kir.corp.google.com > > : Commit 1860033237d4 ("mm: make PR_SET_THP_DISABLE immediately active") > > : introduced a regression in that userspace cannot always determine the set > > : of vmas where thp is ineligible. > > : Userspace relies on the "nh" flag being emitted as part of /proc/pid/smaps > > : to determine if a vma is eligible to be backed by hugepages. > > : Previous to this commit, prctl(PR_SET_THP_DISABLE, 1) would cause thp to > > : be disabled and emit "nh" as a flag for the corresponding vmas as part of > > : /proc/pid/smaps. After the commit, thp is disabled by means of an mm > > : flag and "nh" is not emitted. > > : This causes smaps parsing libraries to assume a vma is eligible for thp > > : and ends up puzzling the user on why its memory is not backed by thp. > > > > In both cases userspace was relying on a semantic of a specific VMA > > flag. The primary reason why that happened is a lack of a proper > > internface. While this has been worked on and it will be fixed properly, > > it seems that our wording could see some refinement and be more vocal > > about semantic aspect of these flags as well. > > > > Cc: Jan Kara > > Cc: Dan Williams > > Cc: David Rientjes > > Signed-off-by: Michal Hocko > > --- > > Documentation/filesystems/proc.txt | 4 +++- > > 1 file changed, 3 insertions(+), 1 deletion(-) > > > > diff --git a/Documentation/filesystems/proc.txt > > b/Documentation/filesystems/proc.txt > > index 12a5e6e693b6..b1fda309f067 100644 > > --- a/Documentation/filesystems/proc.txt > > +++ b/Documentation/filesystems/proc.txt > > @@ -496,7 +496,9 @@ flags associated with the particular virtual memory > > area in two letter encoded > > > > Note that there is no guarantee that every flag and associated mnemonic > > will > > be present in all further kernel releases. Things get changed, the flags > > may > > -be vanished or the reverse -- new added. > > +be vanished or the reverse -- new added. Interpretatation of their meaning > > +might change in future as well. So each consumnent of these flags have to > > +follow each specific kernel version for the exact semantic. > > Can we start to claw some of this back? Perhaps with a config option > to hide the flags to put applications on notice? I would love to. My knowledge of CRIU is very minimal, but my understanding is that this is the primary consumer of those flags. And checkpointing is so close to the specific kernel version that I assume that this abuse is somehow justified. We can hide it behind CONFIG_CHECKPOINT_RESTORE but does it going to help? I presume that many distro kernels will have the config enabled. > I recall that when I > introduced CONFIG_IO_STRICT_DEVMEM it caused enough regressions that > distros did not enable it, but now a few years out I'm finding that it > is enabled in more places. > > In any event, > > Acked-by: Dan Williams Thanks! -- Michal Hocko SUSE Labs
Re: [RFC PATCH 1/3] mm, proc: be more verbose about unstable VMA flags in /proc//smaps
On Tue 20-11-18 10:32:07, Dan Williams wrote: > On Tue, Nov 20, 2018 at 2:35 AM Michal Hocko wrote: > > > > From: Michal Hocko > > > > Even though vma flags exported via /proc//smaps are explicitly > > documented to be not guaranteed for future compatibility the warning > > doesn't go far enough because it doesn't mention semantic changes to > > those flags. And they are important as well because these flags are > > a deep implementation internal to the MM code and the semantic might > > change at any time. > > > > Let's consider two recent examples: > > http://lkml.kernel.org/r/20181002100531.gc4...@quack2.suse.cz > > : commit e1fb4a086495 "dax: remove VM_MIXEDMAP for fsdax and device dax" has > > : removed VM_MIXEDMAP flag from DAX VMAs. Now our testing shows that in the > > : mean time certain customer of ours started poking into /proc//smaps > > : and looks at VMA flags there and if VM_MIXEDMAP is missing among the VMA > > : flags, the application just fails to start complaining that DAX support is > > : missing in the kernel. > > > > http://lkml.kernel.org/r/alpine.deb.2.21.1809241054050.224...@chino.kir.corp.google.com > > : Commit 1860033237d4 ("mm: make PR_SET_THP_DISABLE immediately active") > > : introduced a regression in that userspace cannot always determine the set > > : of vmas where thp is ineligible. > > : Userspace relies on the "nh" flag being emitted as part of /proc/pid/smaps > > : to determine if a vma is eligible to be backed by hugepages. > > : Previous to this commit, prctl(PR_SET_THP_DISABLE, 1) would cause thp to > > : be disabled and emit "nh" as a flag for the corresponding vmas as part of > > : /proc/pid/smaps. After the commit, thp is disabled by means of an mm > > : flag and "nh" is not emitted. > > : This causes smaps parsing libraries to assume a vma is eligible for thp > > : and ends up puzzling the user on why its memory is not backed by thp. > > > > In both cases userspace was relying on a semantic of a specific VMA > > flag. The primary reason why that happened is a lack of a proper > > internface. While this has been worked on and it will be fixed properly, > > it seems that our wording could see some refinement and be more vocal > > about semantic aspect of these flags as well. > > > > Cc: Jan Kara > > Cc: Dan Williams > > Cc: David Rientjes > > Signed-off-by: Michal Hocko > > --- > > Documentation/filesystems/proc.txt | 4 +++- > > 1 file changed, 3 insertions(+), 1 deletion(-) > > > > diff --git a/Documentation/filesystems/proc.txt > > b/Documentation/filesystems/proc.txt > > index 12a5e6e693b6..b1fda309f067 100644 > > --- a/Documentation/filesystems/proc.txt > > +++ b/Documentation/filesystems/proc.txt > > @@ -496,7 +496,9 @@ flags associated with the particular virtual memory > > area in two letter encoded > > > > Note that there is no guarantee that every flag and associated mnemonic > > will > > be present in all further kernel releases. Things get changed, the flags > > may > > -be vanished or the reverse -- new added. > > +be vanished or the reverse -- new added. Interpretatation of their meaning > > +might change in future as well. So each consumnent of these flags have to > > +follow each specific kernel version for the exact semantic. > > Can we start to claw some of this back? Perhaps with a config option > to hide the flags to put applications on notice? I would love to. My knowledge of CRIU is very minimal, but my understanding is that this is the primary consumer of those flags. And checkpointing is so close to the specific kernel version that I assume that this abuse is somehow justified. We can hide it behind CONFIG_CHECKPOINT_RESTORE but does it going to help? I presume that many distro kernels will have the config enabled. > I recall that when I > introduced CONFIG_IO_STRICT_DEVMEM it caused enough regressions that > distros did not enable it, but now a few years out I'm finding that it > is enabled in more places. > > In any event, > > Acked-by: Dan Williams Thanks! -- Michal Hocko SUSE Labs
[PATCH] ARM: dts: imx6sll-evk: add debug LED support
On i.MX6SLL EVK board, there is a debug LED controlled by MX6SLL_PAD_EPDC_VCOM1__GPIO2_IO04 pin, add support for it. Signed-off-by: Anson Huang --- arch/arm/boot/dts/imx6sll-evk.dts | 18 ++ 1 file changed, 18 insertions(+) diff --git a/arch/arm/boot/dts/imx6sll-evk.dts b/arch/arm/boot/dts/imx6sll-evk.dts index c8e1155..9c90cbd 100644 --- a/arch/arm/boot/dts/imx6sll-evk.dts +++ b/arch/arm/boot/dts/imx6sll-evk.dts @@ -31,6 +31,18 @@ status = "okay"; }; + leds { + compatible = "gpio-leds"; + pinctrl-names = "default"; + pinctrl-0 = <_led>; + + user { + label = "debug"; + gpios = < 4 GPIO_ACTIVE_HIGH>; + linux,default-trigger = "heartbeat"; + }; + }; + reg_usb_otg1_vbus: regulator-otg1-vbus { compatible = "regulator-fixed"; pinctrl-names = "default"; @@ -455,6 +467,12 @@ >; }; + pinctrl_led: ledgrp { + fsl,pins = < + MX6SLL_PAD_EPDC_VCOM1__GPIO2_IO04 0x17059 + >; + }; + pinctrl_pwm1: pmw1grp { fsl,pins = < MX6SLL_PAD_PWM1__PWM1_OUT 0x110b0 -- 2.7.4
[PATCH] ARM: dts: imx6sll-evk: add debug LED support
On i.MX6SLL EVK board, there is a debug LED controlled by MX6SLL_PAD_EPDC_VCOM1__GPIO2_IO04 pin, add support for it. Signed-off-by: Anson Huang --- arch/arm/boot/dts/imx6sll-evk.dts | 18 ++ 1 file changed, 18 insertions(+) diff --git a/arch/arm/boot/dts/imx6sll-evk.dts b/arch/arm/boot/dts/imx6sll-evk.dts index c8e1155..9c90cbd 100644 --- a/arch/arm/boot/dts/imx6sll-evk.dts +++ b/arch/arm/boot/dts/imx6sll-evk.dts @@ -31,6 +31,18 @@ status = "okay"; }; + leds { + compatible = "gpio-leds"; + pinctrl-names = "default"; + pinctrl-0 = <_led>; + + user { + label = "debug"; + gpios = < 4 GPIO_ACTIVE_HIGH>; + linux,default-trigger = "heartbeat"; + }; + }; + reg_usb_otg1_vbus: regulator-otg1-vbus { compatible = "regulator-fixed"; pinctrl-names = "default"; @@ -455,6 +467,12 @@ >; }; + pinctrl_led: ledgrp { + fsl,pins = < + MX6SLL_PAD_EPDC_VCOM1__GPIO2_IO04 0x17059 + >; + }; + pinctrl_pwm1: pmw1grp { fsl,pins = < MX6SLL_PAD_PWM1__PWM1_OUT 0x110b0 -- 2.7.4
Re: Backed up kernels
Hi Jean, On Tue, Nov 20, 2018 at 10:40 PM Jean Delvare wrote: > > Hi Masahiro, Michal, > > When I run "make install", if a kernel by the same version number + > flavor string already exists, a backup is created with ".old" appended. > Over time, this adds many entries to my boot menu, makes some package > updates take much longer (e.g. when all initrds must be regenerated), > and ultimately confuses grub2, which fails to find the matching modules > directory under /lib/modules. > > You could argue that grub2 could be fixed to find the right modules > directory, but in fact there is no guarantee that the modules built for > the new kernel are fully compatible with the old kernel. Keeping a > backup copy of the old modules is also not possible, because both > kernels have the same $(uname -r) and therefore the modules of both > kernels must live under the same /lib/modules/$(uname -r), which > collides. > > Given that, is there really any practical value in saving a backup of > old kernels? I'm doing kernel development for 15 years and I can't > remember ever booting one of these ".old" kernels. If my latest > development kernel doesn't work for any reason, I will just boot back > to the distribution kernel. > > Therefore I am asking, can we change "make install" so that it does NOT > create a backup copy of an existing kernel? I think your suggestion makes sense, but "make install" is basically implemented by arch-specific shell script. (For example, arch/x86/boot/install.sh) Will you talk to the maintainers of architecture you are interested in? (or send it to linux-a...@vger.kernel.org) > Thanks, > -- > Jean Delvare > SUSE L3 Support -- Best Regards Masahiro Yamada
Re: Backed up kernels
Hi Jean, On Tue, Nov 20, 2018 at 10:40 PM Jean Delvare wrote: > > Hi Masahiro, Michal, > > When I run "make install", if a kernel by the same version number + > flavor string already exists, a backup is created with ".old" appended. > Over time, this adds many entries to my boot menu, makes some package > updates take much longer (e.g. when all initrds must be regenerated), > and ultimately confuses grub2, which fails to find the matching modules > directory under /lib/modules. > > You could argue that grub2 could be fixed to find the right modules > directory, but in fact there is no guarantee that the modules built for > the new kernel are fully compatible with the old kernel. Keeping a > backup copy of the old modules is also not possible, because both > kernels have the same $(uname -r) and therefore the modules of both > kernels must live under the same /lib/modules/$(uname -r), which > collides. > > Given that, is there really any practical value in saving a backup of > old kernels? I'm doing kernel development for 15 years and I can't > remember ever booting one of these ".old" kernels. If my latest > development kernel doesn't work for any reason, I will just boot back > to the distribution kernel. > > Therefore I am asking, can we change "make install" so that it does NOT > create a backup copy of an existing kernel? I think your suggestion makes sense, but "make install" is basically implemented by arch-specific shell script. (For example, arch/x86/boot/install.sh) Will you talk to the maintainers of architecture you are interested in? (or send it to linux-a...@vger.kernel.org) > Thanks, > -- > Jean Delvare > SUSE L3 Support -- Best Regards Masahiro Yamada
Re: [RFC PATCH 1/3] mm, proc: be more verbose about unstable VMA flags in /proc//smaps
On Tue 20-11-18 16:01:47, David Rientjes wrote: > On Tue, 20 Nov 2018, Jan Kara wrote: > > > > Even though vma flags exported via /proc//smaps are explicitly > > > documented to be not guaranteed for future compatibility the warning > > > doesn't go far enough because it doesn't mention semantic changes to > > > those flags. And they are important as well because these flags are > > > a deep implementation internal to the MM code and the semantic might > > > change at any time. > > > > > > Let's consider two recent examples: > > > http://lkml.kernel.org/r/20181002100531.gc4...@quack2.suse.cz > > > : commit e1fb4a086495 "dax: remove VM_MIXEDMAP for fsdax and device dax" > > > has > > > : removed VM_MIXEDMAP flag from DAX VMAs. Now our testing shows that in > > > the > > > : mean time certain customer of ours started poking into /proc//smaps > > > : and looks at VMA flags there and if VM_MIXEDMAP is missing among the VMA > > > : flags, the application just fails to start complaining that DAX support > > > is > > > : missing in the kernel. > > > > > > http://lkml.kernel.org/r/alpine.deb.2.21.1809241054050.224...@chino.kir.corp.google.com > > > : Commit 1860033237d4 ("mm: make PR_SET_THP_DISABLE immediately active") > > > : introduced a regression in that userspace cannot always determine the > > > set > > > : of vmas where thp is ineligible. > > > : Userspace relies on the "nh" flag being emitted as part of > > > /proc/pid/smaps > > > : to determine if a vma is eligible to be backed by hugepages. > > > : Previous to this commit, prctl(PR_SET_THP_DISABLE, 1) would cause thp to > > > : be disabled and emit "nh" as a flag for the corresponding vmas as part > > > of > > > : /proc/pid/smaps. After the commit, thp is disabled by means of an mm > > > : flag and "nh" is not emitted. > > > : This causes smaps parsing libraries to assume a vma is eligible for thp > > > : and ends up puzzling the user on why its memory is not backed by thp. > > > > > > In both cases userspace was relying on a semantic of a specific VMA > > > flag. The primary reason why that happened is a lack of a proper > > > internface. While this has been worked on and it will be fixed properly, > > > it seems that our wording could see some refinement and be more vocal > > > about semantic aspect of these flags as well. > > > > > > Cc: Jan Kara > > > Cc: Dan Williams > > > Cc: David Rientjes > > > Signed-off-by: Michal Hocko > > > > Honestly, it just shows that no amount of documentation is going to stop > > userspace from abusing API that's exposing too much if there's no better > > alternative. But this is a good clarification regardless. So feel free to > > add: > > > > Acked-by: Jan Kara > > > > I'm not sure what is expected of a userspace developer who finds they have > a single way to determine if something is enabled/disabled. Should they > refer to the documentation and see that the flag may be unstable so they > write a kernel patch and have it merged upstream before using it? What to > do when they don't control the kernel version they are running on? Well, I would treat it as any standard feature request. Ask for the feature upstream and work with the comunity to come up with a reasonable and a stable API. > Anyway, mentioning that the vm flags here only have meaning depending on > the kernel version seems like a worthwhile addition: > > Acked-by: David Rientjes Thanks! -- Michal Hocko SUSE Labs
Re: [RFC PATCH 1/3] mm, proc: be more verbose about unstable VMA flags in /proc//smaps
On Tue 20-11-18 16:01:47, David Rientjes wrote: > On Tue, 20 Nov 2018, Jan Kara wrote: > > > > Even though vma flags exported via /proc//smaps are explicitly > > > documented to be not guaranteed for future compatibility the warning > > > doesn't go far enough because it doesn't mention semantic changes to > > > those flags. And they are important as well because these flags are > > > a deep implementation internal to the MM code and the semantic might > > > change at any time. > > > > > > Let's consider two recent examples: > > > http://lkml.kernel.org/r/20181002100531.gc4...@quack2.suse.cz > > > : commit e1fb4a086495 "dax: remove VM_MIXEDMAP for fsdax and device dax" > > > has > > > : removed VM_MIXEDMAP flag from DAX VMAs. Now our testing shows that in > > > the > > > : mean time certain customer of ours started poking into /proc//smaps > > > : and looks at VMA flags there and if VM_MIXEDMAP is missing among the VMA > > > : flags, the application just fails to start complaining that DAX support > > > is > > > : missing in the kernel. > > > > > > http://lkml.kernel.org/r/alpine.deb.2.21.1809241054050.224...@chino.kir.corp.google.com > > > : Commit 1860033237d4 ("mm: make PR_SET_THP_DISABLE immediately active") > > > : introduced a regression in that userspace cannot always determine the > > > set > > > : of vmas where thp is ineligible. > > > : Userspace relies on the "nh" flag being emitted as part of > > > /proc/pid/smaps > > > : to determine if a vma is eligible to be backed by hugepages. > > > : Previous to this commit, prctl(PR_SET_THP_DISABLE, 1) would cause thp to > > > : be disabled and emit "nh" as a flag for the corresponding vmas as part > > > of > > > : /proc/pid/smaps. After the commit, thp is disabled by means of an mm > > > : flag and "nh" is not emitted. > > > : This causes smaps parsing libraries to assume a vma is eligible for thp > > > : and ends up puzzling the user on why its memory is not backed by thp. > > > > > > In both cases userspace was relying on a semantic of a specific VMA > > > flag. The primary reason why that happened is a lack of a proper > > > internface. While this has been worked on and it will be fixed properly, > > > it seems that our wording could see some refinement and be more vocal > > > about semantic aspect of these flags as well. > > > > > > Cc: Jan Kara > > > Cc: Dan Williams > > > Cc: David Rientjes > > > Signed-off-by: Michal Hocko > > > > Honestly, it just shows that no amount of documentation is going to stop > > userspace from abusing API that's exposing too much if there's no better > > alternative. But this is a good clarification regardless. So feel free to > > add: > > > > Acked-by: Jan Kara > > > > I'm not sure what is expected of a userspace developer who finds they have > a single way to determine if something is enabled/disabled. Should they > refer to the documentation and see that the flag may be unstable so they > write a kernel patch and have it merged upstream before using it? What to > do when they don't control the kernel version they are running on? Well, I would treat it as any standard feature request. Ask for the feature upstream and work with the comunity to come up with a reasonable and a stable API. > Anyway, mentioning that the vm flags here only have meaning depending on > the kernel version seems like a worthwhile addition: > > Acked-by: David Rientjes Thanks! -- Michal Hocko SUSE Labs
Re: [PATCH 4/4] PM / Domains: Propagate performance state updates
On 11/21/2018 12:06 PM, Viresh Kumar wrote: On 21-11-18, 11:12, Rajendra Nayak wrote: And the reason for that seems to be that we update the genpd status to GPD_STATE_ACTIVE *after* we try to set the performance state, so we always hit this check which bails out thinking the genpd is not ON. Thanks for looking at it. Here is the (untested) fix, please try it out. Thanks, yes, this does seem to work. diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index 84c13695af65..92be4a224b45 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -250,9 +250,6 @@ static int _genpd_set_performance_state(struct generic_pm_domain *genpd, unsigned int mstate; int ret; - if (!genpd_status_on(genpd)) - goto out; - /* Propagate to masters of genpd */ list_for_each_entry(link, >slave_links, slave_node) { master = link->master; @@ -286,7 +283,6 @@ static int _genpd_set_performance_state(struct generic_pm_domain *genpd, if (ret) goto err; -out: genpd->performance_state = state; return 0; @@ -361,6 +357,11 @@ static int _genpd_reeval_performance_state(struct generic_pm_domain *genpd, return 0; update_state: + if (!genpd_status_on(genpd)) { + genpd->performance_state = state; + return 0; + } + return _genpd_set_performance_state(genpd, state, depth); }
Re: [PATCH 4/4] PM / Domains: Propagate performance state updates
On 11/21/2018 12:06 PM, Viresh Kumar wrote: On 21-11-18, 11:12, Rajendra Nayak wrote: And the reason for that seems to be that we update the genpd status to GPD_STATE_ACTIVE *after* we try to set the performance state, so we always hit this check which bails out thinking the genpd is not ON. Thanks for looking at it. Here is the (untested) fix, please try it out. Thanks, yes, this does seem to work. diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index 84c13695af65..92be4a224b45 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -250,9 +250,6 @@ static int _genpd_set_performance_state(struct generic_pm_domain *genpd, unsigned int mstate; int ret; - if (!genpd_status_on(genpd)) - goto out; - /* Propagate to masters of genpd */ list_for_each_entry(link, >slave_links, slave_node) { master = link->master; @@ -286,7 +283,6 @@ static int _genpd_set_performance_state(struct generic_pm_domain *genpd, if (ret) goto err; -out: genpd->performance_state = state; return 0; @@ -361,6 +357,11 @@ static int _genpd_reeval_performance_state(struct generic_pm_domain *genpd, return 0; update_state: + if (!genpd_status_on(genpd)) { + genpd->performance_state = state; + return 0; + } + return _genpd_set_performance_state(genpd, state, depth); }
Re: [PATCH v5] tpm: add support for partial reads
On Tue, Nov 20, 2018 at 03:13:59PM -0800, Tadeusz Struk wrote: > On 11/20/18 3:07 PM, Jarkko Sakkinen wrote: > +/* Holds the resul of the last successful call to > tpm_transmit() */ > >>> This comment is cruft. > >> Do you want me to remove it? This is the comment you proposed. > > As I explained before it made sense before you made the remark that > > it can only get positive values i.e. the length. > > > So what do you want me to do with this one? I don't think it needs now any comment because the variable describes itself so well. /Jarkko
Re: [PATCH v5] tpm: add support for partial reads
On Tue, Nov 20, 2018 at 03:13:59PM -0800, Tadeusz Struk wrote: > On 11/20/18 3:07 PM, Jarkko Sakkinen wrote: > +/* Holds the resul of the last successful call to > tpm_transmit() */ > >>> This comment is cruft. > >> Do you want me to remove it? This is the comment you proposed. > > As I explained before it made sense before you made the remark that > > it can only get positive values i.e. the length. > > > So what do you want me to do with this one? I don't think it needs now any comment because the variable describes itself so well. /Jarkko
Re: [PATCH 4/4] PM / Domains: Propagate performance state updates
On 21-11-18, 11:12, Rajendra Nayak wrote: > And the reason for that seems to be that we update the genpd status to > GPD_STATE_ACTIVE > *after* we try to set the performance state, so we always hit this check > which bails out > thinking the genpd is not ON. Thanks for looking at it. Here is the (untested) fix, please try it out. diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index 84c13695af65..92be4a224b45 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -250,9 +250,6 @@ static int _genpd_set_performance_state(struct generic_pm_domain *genpd, unsigned int mstate; int ret; - if (!genpd_status_on(genpd)) - goto out; - /* Propagate to masters of genpd */ list_for_each_entry(link, >slave_links, slave_node) { master = link->master; @@ -286,7 +283,6 @@ static int _genpd_set_performance_state(struct generic_pm_domain *genpd, if (ret) goto err; -out: genpd->performance_state = state; return 0; @@ -361,6 +357,11 @@ static int _genpd_reeval_performance_state(struct generic_pm_domain *genpd, return 0; update_state: + if (!genpd_status_on(genpd)) { + genpd->performance_state = state; + return 0; + } + return _genpd_set_performance_state(genpd, state, depth); } -- viresh
Re: [PATCH 4/4] PM / Domains: Propagate performance state updates
On 21-11-18, 11:12, Rajendra Nayak wrote: > And the reason for that seems to be that we update the genpd status to > GPD_STATE_ACTIVE > *after* we try to set the performance state, so we always hit this check > which bails out > thinking the genpd is not ON. Thanks for looking at it. Here is the (untested) fix, please try it out. diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index 84c13695af65..92be4a224b45 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -250,9 +250,6 @@ static int _genpd_set_performance_state(struct generic_pm_domain *genpd, unsigned int mstate; int ret; - if (!genpd_status_on(genpd)) - goto out; - /* Propagate to masters of genpd */ list_for_each_entry(link, >slave_links, slave_node) { master = link->master; @@ -286,7 +283,6 @@ static int _genpd_set_performance_state(struct generic_pm_domain *genpd, if (ret) goto err; -out: genpd->performance_state = state; return 0; @@ -361,6 +357,11 @@ static int _genpd_reeval_performance_state(struct generic_pm_domain *genpd, return 0; update_state: + if (!genpd_status_on(genpd)) { + genpd->performance_state = state; + return 0; + } + return _genpd_set_performance_state(genpd, state, depth); } -- viresh
Re: [PATCH] x86: only use ERMS for user copies for larger sizes
[ Cc:-ed a few other gents and lkml. ] * Jens Axboe wrote: > Hi, > > So this is a fun one... While I was doing the aio polled work, I noticed > that the submitting process spent a substantial amount of time copying > data to/from userspace. For aio, that's iocb and io_event, which are 64 > and 32 bytes respectively. Looking closer at this, and it seems that > ERMS rep movsb is SLOWER for smaller copies, due to a higher startup > cost. > > I came up with this hack to test it out, and low and behold, we now cut > the time spent in copying in half. 50% less. > > Since these kinds of patches tend to lend themselves to bike shedding, I > also ran a string of kernel compilations out of RAM. Results are as > follows: > > Patched : 62.86s avg, stddev 0.65s > Stock : 63.73s avg, stddev 0.67s > > which would also seem to indicate that we're faster punting smaller > (< 128 byte) copies. > > CPU: Intel(R) Xeon(R) CPU E5-2650 v4 @ 2.20GHz > > Interestingly, text size is smaller with the patch as well?! > > I'm sure there are smarter ways to do this, but results look fairly > conclusive. FWIW, the behaviorial change was introduced by: > > commit 954e482bde20b0e208fd4d34ef26e10afd194600 > Author: Fenghua Yu > Date: Thu May 24 18:19:45 2012 -0700 > > x86/copy_user_generic: Optimize copy_user_generic with CPU erms feature > > which contains nothing in terms of benchmarking or results, just claims > that the new hotness is better. > > Signed-off-by: Jens Axboe > --- > > diff --git a/arch/x86/include/asm/uaccess_64.h > b/arch/x86/include/asm/uaccess_64.h > index a9d637bc301d..7dbb78827e64 100644 > --- a/arch/x86/include/asm/uaccess_64.h > +++ b/arch/x86/include/asm/uaccess_64.h > @@ -29,16 +29,27 @@ copy_user_generic(void *to, const void *from, unsigned > len) > { > unsigned ret; > > + /* > + * For smaller copies, don't use ERMS as it's slower. > + */ > + if (len < 128) { > + alternative_call(copy_user_generic_unrolled, > + copy_user_generic_string, X86_FEATURE_REP_GOOD, > + ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from), > + "=d" (len)), > + "1" (to), "2" (from), "3" (len) > + : "memory", "rcx", "r8", "r9", "r10", "r11"); > + return ret; > + } > + > /* >* If CPU has ERMS feature, use copy_user_enhanced_fast_string. >* Otherwise, if CPU has rep_good feature, use copy_user_generic_string. >* Otherwise, use copy_user_generic_unrolled. >*/ > alternative_call_2(copy_user_generic_unrolled, > - copy_user_generic_string, > - X86_FEATURE_REP_GOOD, > - copy_user_enhanced_fast_string, > - X86_FEATURE_ERMS, > + copy_user_generic_string, X86_FEATURE_REP_GOOD, > + copy_user_enhanced_fast_string, X86_FEATURE_ERMS, >ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from), >"=d" (len)), >"1" (to), "2" (from), "3" (len) So I'm inclined to do something like yours, because clearly the changelog of 954e482bde20 was at least partly false: Intel can say whatever they want, it's a fact that ERMS has high setup costs for low buffer sizes - ERMS is optimized for large size, cache-aligned copies mainly. But the result is counter-intuitive in terms of kernel text footprint, plus the '128' is pretty arbitrary - we should at least try to come up with a break-even point where manual copy is about as fast as ERMS - on at least a single CPU ... Thanks, Ingo
Re: [PATCH] x86: only use ERMS for user copies for larger sizes
[ Cc:-ed a few other gents and lkml. ] * Jens Axboe wrote: > Hi, > > So this is a fun one... While I was doing the aio polled work, I noticed > that the submitting process spent a substantial amount of time copying > data to/from userspace. For aio, that's iocb and io_event, which are 64 > and 32 bytes respectively. Looking closer at this, and it seems that > ERMS rep movsb is SLOWER for smaller copies, due to a higher startup > cost. > > I came up with this hack to test it out, and low and behold, we now cut > the time spent in copying in half. 50% less. > > Since these kinds of patches tend to lend themselves to bike shedding, I > also ran a string of kernel compilations out of RAM. Results are as > follows: > > Patched : 62.86s avg, stddev 0.65s > Stock : 63.73s avg, stddev 0.67s > > which would also seem to indicate that we're faster punting smaller > (< 128 byte) copies. > > CPU: Intel(R) Xeon(R) CPU E5-2650 v4 @ 2.20GHz > > Interestingly, text size is smaller with the patch as well?! > > I'm sure there are smarter ways to do this, but results look fairly > conclusive. FWIW, the behaviorial change was introduced by: > > commit 954e482bde20b0e208fd4d34ef26e10afd194600 > Author: Fenghua Yu > Date: Thu May 24 18:19:45 2012 -0700 > > x86/copy_user_generic: Optimize copy_user_generic with CPU erms feature > > which contains nothing in terms of benchmarking or results, just claims > that the new hotness is better. > > Signed-off-by: Jens Axboe > --- > > diff --git a/arch/x86/include/asm/uaccess_64.h > b/arch/x86/include/asm/uaccess_64.h > index a9d637bc301d..7dbb78827e64 100644 > --- a/arch/x86/include/asm/uaccess_64.h > +++ b/arch/x86/include/asm/uaccess_64.h > @@ -29,16 +29,27 @@ copy_user_generic(void *to, const void *from, unsigned > len) > { > unsigned ret; > > + /* > + * For smaller copies, don't use ERMS as it's slower. > + */ > + if (len < 128) { > + alternative_call(copy_user_generic_unrolled, > + copy_user_generic_string, X86_FEATURE_REP_GOOD, > + ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from), > + "=d" (len)), > + "1" (to), "2" (from), "3" (len) > + : "memory", "rcx", "r8", "r9", "r10", "r11"); > + return ret; > + } > + > /* >* If CPU has ERMS feature, use copy_user_enhanced_fast_string. >* Otherwise, if CPU has rep_good feature, use copy_user_generic_string. >* Otherwise, use copy_user_generic_unrolled. >*/ > alternative_call_2(copy_user_generic_unrolled, > - copy_user_generic_string, > - X86_FEATURE_REP_GOOD, > - copy_user_enhanced_fast_string, > - X86_FEATURE_ERMS, > + copy_user_generic_string, X86_FEATURE_REP_GOOD, > + copy_user_enhanced_fast_string, X86_FEATURE_ERMS, >ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from), >"=d" (len)), >"1" (to), "2" (from), "3" (len) So I'm inclined to do something like yours, because clearly the changelog of 954e482bde20 was at least partly false: Intel can say whatever they want, it's a fact that ERMS has high setup costs for low buffer sizes - ERMS is optimized for large size, cache-aligned copies mainly. But the result is counter-intuitive in terms of kernel text footprint, plus the '128' is pretty arbitrary - we should at least try to come up with a break-even point where manual copy is about as fast as ERMS - on at least a single CPU ... Thanks, Ingo
Re: [PATCH] thermal: tegra: add get_trend ops
On 20/11/2018 11:38 PM, Thierry Reding wrote: > On Tue, Nov 20, 2018 at 05:11:17PM +0800, Wei Ni wrote: >> Add support for get_trend ops that allows soctherm >> sensors to be used with the step-wise governor. >> >> Signed-off-by: Wei Ni >> --- >> drivers/thermal/tegra/soctherm.c | 34 ++ >> 1 file changed, 34 insertions(+) >> >> diff --git a/drivers/thermal/tegra/soctherm.c >> b/drivers/thermal/tegra/soctherm.c >> index ed28110a3535..d2951fbe2b7c 100644 >> --- a/drivers/thermal/tegra/soctherm.c >> +++ b/drivers/thermal/tegra/soctherm.c >> @@ -488,9 +488,43 @@ static int tegra_thermctl_set_trip_temp(void *data, int >> trip, int temp) >> return 0; >> } >> >> +static int tegra_thermctl_get_trend(void *data, int trip, >> +enum thermal_trend *trend) >> +{ >> +struct tegra_thermctl_zone *zone = data; >> +struct thermal_zone_device *tz = zone->tz; >> +int trip_temp, temp, last_temp, ret; >> + >> +if (!tz) >> +return -EINVAL; >> + >> +ret = tz->ops->get_trip_temp(zone->tz, trip, _temp); >> +if (ret) >> +return ret; >> + >> +mutex_lock(>lock); >> +temp = tz->temperature; >> +last_temp = tz->last_temperature; >> +mutex_unlock(>lock); >> + >> +if (temp > trip_temp) { >> +if (temp >= last_temp) >> +*trend = THERMAL_TREND_RAISING; >> +else >> +*trend = THERMAL_TREND_STABLE; >> +} else if (temp < trip_temp) { >> +*trend = THERMAL_TREND_DROPPING; >> +} else { >> +*trend = THERMAL_TREND_STABLE; >> +} >> + >> +return 0; >> +} > > This looks like a reimplementation of the get_tz_trend() helper. Is > seems like that helper already has everything we need. Perhaps this > isn't working because of-thermal installs of_thermal_get_trend(), a > function that returns -EINVAL if the driver doesn't implement the > ->get_trend() callback. 1. The get_tz_trend() helper can work, because it has: if (tz->emul_temperature || !tz->ops->get_trend || tz->ops->get_trend(tz, trip, )) { ... } the tz->ops->get_trend is of_thermal_get_trend(). If without special get_trend(), it will return -EINVAL, so it will implement the if block to get the "trend". If we have the special get_trend(), then the of_thermal_get_trend() will return 0, so this helper will not implement the if block, it will get the "trend" from the special get_trend(). 2. There has a little difference between the helper and our special callback. The tegra_thermctl_get_trend() consider the trip_temp, but the get_tz_trend() helper didn't. > > Perhaps a better way would be to do something like this in > thermal_zone_of_add_sensor(): > > if (ops->get_trend) > tzd->ops->get_trend = of_thermal_get_trend; > > That's similar to how ->set_trips() and ->set_emul_temp() are set up > and should make sure that get_tz_trend() will do the right thing for > all drivers that don't implement a special ->get_trend(). As above description, I think the of_thermal_get_trend() already can handle this case, doesn't need to change. Wei. > > Thierry >
Re: [PATCH] thermal: tegra: add get_trend ops
On 20/11/2018 11:38 PM, Thierry Reding wrote: > On Tue, Nov 20, 2018 at 05:11:17PM +0800, Wei Ni wrote: >> Add support for get_trend ops that allows soctherm >> sensors to be used with the step-wise governor. >> >> Signed-off-by: Wei Ni >> --- >> drivers/thermal/tegra/soctherm.c | 34 ++ >> 1 file changed, 34 insertions(+) >> >> diff --git a/drivers/thermal/tegra/soctherm.c >> b/drivers/thermal/tegra/soctherm.c >> index ed28110a3535..d2951fbe2b7c 100644 >> --- a/drivers/thermal/tegra/soctherm.c >> +++ b/drivers/thermal/tegra/soctherm.c >> @@ -488,9 +488,43 @@ static int tegra_thermctl_set_trip_temp(void *data, int >> trip, int temp) >> return 0; >> } >> >> +static int tegra_thermctl_get_trend(void *data, int trip, >> +enum thermal_trend *trend) >> +{ >> +struct tegra_thermctl_zone *zone = data; >> +struct thermal_zone_device *tz = zone->tz; >> +int trip_temp, temp, last_temp, ret; >> + >> +if (!tz) >> +return -EINVAL; >> + >> +ret = tz->ops->get_trip_temp(zone->tz, trip, _temp); >> +if (ret) >> +return ret; >> + >> +mutex_lock(>lock); >> +temp = tz->temperature; >> +last_temp = tz->last_temperature; >> +mutex_unlock(>lock); >> + >> +if (temp > trip_temp) { >> +if (temp >= last_temp) >> +*trend = THERMAL_TREND_RAISING; >> +else >> +*trend = THERMAL_TREND_STABLE; >> +} else if (temp < trip_temp) { >> +*trend = THERMAL_TREND_DROPPING; >> +} else { >> +*trend = THERMAL_TREND_STABLE; >> +} >> + >> +return 0; >> +} > > This looks like a reimplementation of the get_tz_trend() helper. Is > seems like that helper already has everything we need. Perhaps this > isn't working because of-thermal installs of_thermal_get_trend(), a > function that returns -EINVAL if the driver doesn't implement the > ->get_trend() callback. 1. The get_tz_trend() helper can work, because it has: if (tz->emul_temperature || !tz->ops->get_trend || tz->ops->get_trend(tz, trip, )) { ... } the tz->ops->get_trend is of_thermal_get_trend(). If without special get_trend(), it will return -EINVAL, so it will implement the if block to get the "trend". If we have the special get_trend(), then the of_thermal_get_trend() will return 0, so this helper will not implement the if block, it will get the "trend" from the special get_trend(). 2. There has a little difference between the helper and our special callback. The tegra_thermctl_get_trend() consider the trip_temp, but the get_tz_trend() helper didn't. > > Perhaps a better way would be to do something like this in > thermal_zone_of_add_sensor(): > > if (ops->get_trend) > tzd->ops->get_trend = of_thermal_get_trend; > > That's similar to how ->set_trips() and ->set_emul_temp() are set up > and should make sure that get_tz_trend() will do the right thing for > all drivers that don't implement a special ->get_trend(). As above description, I think the of_thermal_get_trend() already can handle this case, doesn't need to change. Wei. > > Thierry >
Re: [PATCH] modpost: validate symbol names also in find_elf_symbol
On Wed, Oct 24, 2018 at 7:15 AM Sami Tolvanen wrote: > > If an ARM mapping symbol shares an address with a valid symbol, > find_elf_symbol can currently return the mapping symbol instead, as the > symbol is not validated. This can result in confusing warnings: > > WARNING: vmlinux.o(.text+0x18f4028): Section mismatch in reference > from the function set_reset_devices() to the variable .init.text:$x.0 > > This change adds a call to is_valid_name to find_elf_symbol, similarly > to how it's already used in find_elf_symbol2. > > Signed-off-by: Sami Tolvanen > --- Applied to linux-kbuild. Thanks! > scripts/mod/modpost.c | 50 ++- > 1 file changed, 26 insertions(+), 24 deletions(-) > > diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c > index 0d998c54564d..b709b2e623d6 100644 > --- a/scripts/mod/modpost.c > +++ b/scripts/mod/modpost.c > @@ -1204,6 +1204,30 @@ static int secref_whitelist(const struct sectioncheck > *mismatch, > return 1; > } > > +static inline int is_arm_mapping_symbol(const char *str) > +{ > + return str[0] == '$' && strchr("axtd", str[1]) > + && (str[2] == '\0' || str[2] == '.'); > +} > + > +/* > + * If there's no name there, ignore it; likewise, ignore it if it's > + * one of the magic symbols emitted used by current ARM tools. > + * > + * Otherwise if find_symbols_between() returns those symbols, they'll > + * fail the whitelist tests and cause lots of false alarms ... fixable > + * only by merging __exit and __init sections into __text, bloating > + * the kernel (which is especially evil on embedded platforms). > + */ > +static inline int is_valid_name(struct elf_info *elf, Elf_Sym *sym) > +{ > + const char *name = elf->strtab + sym->st_name; > + > + if (!name || !strlen(name)) > + return 0; > + return !is_arm_mapping_symbol(name); > +} > + > /** > * Find symbol based on relocation record info. > * In some cases the symbol supplied is a valid symbol so > @@ -1229,6 +1253,8 @@ static Elf_Sym *find_elf_symbol(struct elf_info *elf, > Elf64_Sword addr, > continue; > if (ELF_ST_TYPE(sym->st_info) == STT_SECTION) > continue; > + if (!is_valid_name(elf, sym)) > + continue; > if (sym->st_value == addr) > return sym; > /* Find a symbol nearby - addr are maybe negative */ > @@ -1247,30 +1273,6 @@ static Elf_Sym *find_elf_symbol(struct elf_info *elf, > Elf64_Sword addr, > return NULL; > } > > -static inline int is_arm_mapping_symbol(const char *str) > -{ > - return str[0] == '$' && strchr("axtd", str[1]) > - && (str[2] == '\0' || str[2] == '.'); > -} > - > -/* > - * If there's no name there, ignore it; likewise, ignore it if it's > - * one of the magic symbols emitted used by current ARM tools. > - * > - * Otherwise if find_symbols_between() returns those symbols, they'll > - * fail the whitelist tests and cause lots of false alarms ... fixable > - * only by merging __exit and __init sections into __text, bloating > - * the kernel (which is especially evil on embedded platforms). > - */ > -static inline int is_valid_name(struct elf_info *elf, Elf_Sym *sym) > -{ > - const char *name = elf->strtab + sym->st_name; > - > - if (!name || !strlen(name)) > - return 0; > - return !is_arm_mapping_symbol(name); > -} > - > /* > * Find symbols before or equal addr and after addr - in the section sec. > * If we find two symbols with equal offset prefer one with a valid name. > -- > 2.19.1.568.g152ad8e336-goog > -- Best Regards Masahiro Yamada
Re: [PATCH] modpost: validate symbol names also in find_elf_symbol
On Wed, Oct 24, 2018 at 7:15 AM Sami Tolvanen wrote: > > If an ARM mapping symbol shares an address with a valid symbol, > find_elf_symbol can currently return the mapping symbol instead, as the > symbol is not validated. This can result in confusing warnings: > > WARNING: vmlinux.o(.text+0x18f4028): Section mismatch in reference > from the function set_reset_devices() to the variable .init.text:$x.0 > > This change adds a call to is_valid_name to find_elf_symbol, similarly > to how it's already used in find_elf_symbol2. > > Signed-off-by: Sami Tolvanen > --- Applied to linux-kbuild. Thanks! > scripts/mod/modpost.c | 50 ++- > 1 file changed, 26 insertions(+), 24 deletions(-) > > diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c > index 0d998c54564d..b709b2e623d6 100644 > --- a/scripts/mod/modpost.c > +++ b/scripts/mod/modpost.c > @@ -1204,6 +1204,30 @@ static int secref_whitelist(const struct sectioncheck > *mismatch, > return 1; > } > > +static inline int is_arm_mapping_symbol(const char *str) > +{ > + return str[0] == '$' && strchr("axtd", str[1]) > + && (str[2] == '\0' || str[2] == '.'); > +} > + > +/* > + * If there's no name there, ignore it; likewise, ignore it if it's > + * one of the magic symbols emitted used by current ARM tools. > + * > + * Otherwise if find_symbols_between() returns those symbols, they'll > + * fail the whitelist tests and cause lots of false alarms ... fixable > + * only by merging __exit and __init sections into __text, bloating > + * the kernel (which is especially evil on embedded platforms). > + */ > +static inline int is_valid_name(struct elf_info *elf, Elf_Sym *sym) > +{ > + const char *name = elf->strtab + sym->st_name; > + > + if (!name || !strlen(name)) > + return 0; > + return !is_arm_mapping_symbol(name); > +} > + > /** > * Find symbol based on relocation record info. > * In some cases the symbol supplied is a valid symbol so > @@ -1229,6 +1253,8 @@ static Elf_Sym *find_elf_symbol(struct elf_info *elf, > Elf64_Sword addr, > continue; > if (ELF_ST_TYPE(sym->st_info) == STT_SECTION) > continue; > + if (!is_valid_name(elf, sym)) > + continue; > if (sym->st_value == addr) > return sym; > /* Find a symbol nearby - addr are maybe negative */ > @@ -1247,30 +1273,6 @@ static Elf_Sym *find_elf_symbol(struct elf_info *elf, > Elf64_Sword addr, > return NULL; > } > > -static inline int is_arm_mapping_symbol(const char *str) > -{ > - return str[0] == '$' && strchr("axtd", str[1]) > - && (str[2] == '\0' || str[2] == '.'); > -} > - > -/* > - * If there's no name there, ignore it; likewise, ignore it if it's > - * one of the magic symbols emitted used by current ARM tools. > - * > - * Otherwise if find_symbols_between() returns those symbols, they'll > - * fail the whitelist tests and cause lots of false alarms ... fixable > - * only by merging __exit and __init sections into __text, bloating > - * the kernel (which is especially evil on embedded platforms). > - */ > -static inline int is_valid_name(struct elf_info *elf, Elf_Sym *sym) > -{ > - const char *name = elf->strtab + sym->st_name; > - > - if (!name || !strlen(name)) > - return 0; > - return !is_arm_mapping_symbol(name); > -} > - > /* > * Find symbols before or equal addr and after addr - in the section sec. > * If we find two symbols with equal offset prefer one with a valid name. > -- > 2.19.1.568.g152ad8e336-goog > -- Best Regards Masahiro Yamada
[PATCH RFC v3 1/4] mm: gup: rename "nonblocking" to "locked" where proper
There's plenty of places around __get_user_pages() that has a parameter "nonblocking" which does not really mean that "it won't block" (because it can really block) but instead it shows whether the mmap_sem is released by up_read() during the page fault handling mostly when VM_FAULT_RETRY is returned. We have the correct naming in e.g. get_user_pages_locked() or get_user_pages_remote() as "locked", however there're still many places that are using the "nonblocking" as name. Renaming the places to "locked" where proper to better suite the functionality of the variable. While at it, fixing up some of the comments accordingly. Signed-off-by: Peter Xu --- mm/gup.c | 44 +--- mm/hugetlb.c | 8 2 files changed, 25 insertions(+), 27 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index aa43620a3270..a40111c742ba 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -506,12 +506,12 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, } /* - * mmap_sem must be held on entry. If @nonblocking != NULL and - * *@flags does not include FOLL_NOWAIT, the mmap_sem may be released. - * If it is, *@nonblocking will be set to 0 and -EBUSY returned. + * mmap_sem must be held on entry. If @locked != NULL and *@flags + * does not include FOLL_NOWAIT, the mmap_sem may be released. If it + * is, *@locked will be set to 0 and -EBUSY returned. */ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, - unsigned long address, unsigned int *flags, int *nonblocking) + unsigned long address, unsigned int *flags, int *locked) { unsigned int fault_flags = 0; vm_fault_t ret; @@ -523,7 +523,7 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, fault_flags |= FAULT_FLAG_WRITE; if (*flags & FOLL_REMOTE) fault_flags |= FAULT_FLAG_REMOTE; - if (nonblocking) + if (locked) fault_flags |= FAULT_FLAG_ALLOW_RETRY; if (*flags & FOLL_NOWAIT) fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; @@ -549,8 +549,8 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, } if (ret & VM_FAULT_RETRY) { - if (nonblocking && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT)) - *nonblocking = 0; + if (locked && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT)) + *locked = 0; return -EBUSY; } @@ -627,7 +627,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * only intends to ensure the pages are faulted in. * @vmas: array of pointers to vmas corresponding to each page. * Or NULL if the caller does not require them. - * @nonblocking: whether waiting for disk IO or mmap_sem contention + * @locked: whether we're still with the mmap_sem held * * Returns number of pages pinned. This may be fewer than the number * requested. If nr_pages is 0 or negative, returns 0. If no pages @@ -656,13 +656,11 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * appropriate) must be called after the page is finished with, and * before put_page is called. * - * If @nonblocking != NULL, __get_user_pages will not wait for disk IO - * or mmap_sem contention, and if waiting is needed to pin all pages, - * *@nonblocking will be set to 0. Further, if @gup_flags does not - * include FOLL_NOWAIT, the mmap_sem will be released via up_read() in - * this case. + * If @locked != NULL, *@locked will be set to 0 when mmap_sem is + * released by an up_read(). That can happen if @gup_flags does not + * has FOLL_NOWAIT. * - * A caller using such a combination of @nonblocking and @gup_flags + * A caller using such a combination of @locked and @gup_flags * must therefore hold the mmap_sem for reading only, and recognize * when it's been released. Otherwise, it must be held for either * reading or writing and will not be released. @@ -674,7 +672,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas, int *nonblocking) + struct vm_area_struct **vmas, int *locked) { long ret = 0, i = 0; struct vm_area_struct *vma = NULL; @@ -719,7 +717,7 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (is_vm_hugetlb_page(vma)) { i = follow_hugetlb_page(mm, vma, pages, vmas, , _pages, i, - gup_flags, nonblocking); +
[PATCH RFC v3 2/4] mm: userfault: return VM_FAULT_RETRY on signals
There was a special path in handle_userfault() in the past that we'll return a VM_FAULT_NOPAGE when we detected non-fatal signals when waiting for userfault handling. We did that by reacquiring the mmap_sem before returning. However that brings a risk in that the vmas might have changed when we retake the mmap_sem and even we could be holding an invalid vma structure. The problem was reported by syzbot. This patch removes the special path and we'll return a VM_FAULT_RETRY with the common path even if we have got such signals. Then for all the architectures that is passing in VM_FAULT_ALLOW_RETRY into handle_mm_fault(), we check not only for SIGKILL but for all the rest of userspace pending signals right after we returned from handle_mm_fault(). The idea comes from the upstream discussion between Linus and Andrea: https://lkml.org/lkml/2017/10/30/560 (This patch contains a potential fix for a double-free of mmap_sem on ARC architecture; please see https://lkml.org/lkml/2018/11/1/723 for more information) Suggested-by: Linus Torvalds Suggested-by: Andrea Arcangeli Signed-off-by: Peter Xu --- arch/alpha/mm/fault.c | 2 +- arch/arc/mm/fault.c| 11 +++ arch/arm/mm/fault.c| 14 ++ arch/arm64/mm/fault.c | 6 +++--- arch/hexagon/mm/vm_fault.c | 2 +- arch/ia64/mm/fault.c | 2 +- arch/m68k/mm/fault.c | 2 +- arch/microblaze/mm/fault.c | 2 +- arch/mips/mm/fault.c | 2 +- arch/nds32/mm/fault.c | 6 +++--- arch/nios2/mm/fault.c | 2 +- arch/openrisc/mm/fault.c | 2 +- arch/parisc/mm/fault.c | 2 +- arch/powerpc/mm/fault.c| 4 +++- arch/riscv/mm/fault.c | 4 ++-- arch/s390/mm/fault.c | 9 ++--- arch/sh/mm/fault.c | 4 arch/sparc/mm/fault_32.c | 3 +++ arch/sparc/mm/fault_64.c | 3 +++ arch/um/kernel/trap.c | 5 - arch/unicore32/mm/fault.c | 4 ++-- arch/x86/mm/fault.c| 12 +++- arch/xtensa/mm/fault.c | 3 +++ fs/userfaultfd.c | 24 24 files changed, 73 insertions(+), 57 deletions(-) diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c index d73dc473fbb9..46e5e420ad2a 100644 --- a/arch/alpha/mm/fault.c +++ b/arch/alpha/mm/fault.c @@ -150,7 +150,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr, the fault. */ fault = handle_mm_fault(vma, address, flags); - if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) + if ((fault & VM_FAULT_RETRY) && signal_pending(current)) return; if (unlikely(fault & VM_FAULT_ERROR)) { diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c index c9da6102eb4f..52b6b71b74e2 100644 --- a/arch/arc/mm/fault.c +++ b/arch/arc/mm/fault.c @@ -142,11 +142,14 @@ void do_page_fault(unsigned long address, struct pt_regs *regs) fault = handle_mm_fault(vma, address, flags); /* If Pagefault was interrupted by SIGKILL, exit page fault "early" */ - if (unlikely(fatal_signal_pending(current))) { - if ((fault & VM_FAULT_ERROR) && !(fault & VM_FAULT_RETRY)) + if (unlikely(fatal_signal_pending(current) && user_mode(regs))) { + /* +* VM_FAULT_RETRY means we have released the mmap_sem, +* otherwise we need to drop it before leaving +*/ + if (!(fault & VM_FAULT_RETRY)) up_read(>mmap_sem); - if (user_mode(regs)) - return; + return; } perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index f4ea4c62c613..743077d19669 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -308,14 +308,20 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) fault = __do_page_fault(mm, addr, fsr, flags, tsk); - /* If we need to retry but a fatal signal is pending, handle the + /* If we need to retry but a signal is pending, handle the * signal first. We do not need to release the mmap_sem because * it would already be released in __lock_page_or_retry in * mm/filemap.c. */ - if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) { - if (!user_mode(regs)) + if (fault & VM_FAULT_RETRY) { + if (fatal_signal_pending(current) && !user_mode(regs)) goto no_context; - return 0; + else if (signal_pending(current)) + /* +* It's either a common signal, or a fatal +* signal but for the userspace, we return +* immediately. +*/ + return 0; } /* diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 7d9571f4ae3d..744d6451ea83 100644 ---
[PATCH RFC v3 1/4] mm: gup: rename "nonblocking" to "locked" where proper
There's plenty of places around __get_user_pages() that has a parameter "nonblocking" which does not really mean that "it won't block" (because it can really block) but instead it shows whether the mmap_sem is released by up_read() during the page fault handling mostly when VM_FAULT_RETRY is returned. We have the correct naming in e.g. get_user_pages_locked() or get_user_pages_remote() as "locked", however there're still many places that are using the "nonblocking" as name. Renaming the places to "locked" where proper to better suite the functionality of the variable. While at it, fixing up some of the comments accordingly. Signed-off-by: Peter Xu --- mm/gup.c | 44 +--- mm/hugetlb.c | 8 2 files changed, 25 insertions(+), 27 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index aa43620a3270..a40111c742ba 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -506,12 +506,12 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, } /* - * mmap_sem must be held on entry. If @nonblocking != NULL and - * *@flags does not include FOLL_NOWAIT, the mmap_sem may be released. - * If it is, *@nonblocking will be set to 0 and -EBUSY returned. + * mmap_sem must be held on entry. If @locked != NULL and *@flags + * does not include FOLL_NOWAIT, the mmap_sem may be released. If it + * is, *@locked will be set to 0 and -EBUSY returned. */ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, - unsigned long address, unsigned int *flags, int *nonblocking) + unsigned long address, unsigned int *flags, int *locked) { unsigned int fault_flags = 0; vm_fault_t ret; @@ -523,7 +523,7 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, fault_flags |= FAULT_FLAG_WRITE; if (*flags & FOLL_REMOTE) fault_flags |= FAULT_FLAG_REMOTE; - if (nonblocking) + if (locked) fault_flags |= FAULT_FLAG_ALLOW_RETRY; if (*flags & FOLL_NOWAIT) fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; @@ -549,8 +549,8 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, } if (ret & VM_FAULT_RETRY) { - if (nonblocking && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT)) - *nonblocking = 0; + if (locked && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT)) + *locked = 0; return -EBUSY; } @@ -627,7 +627,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * only intends to ensure the pages are faulted in. * @vmas: array of pointers to vmas corresponding to each page. * Or NULL if the caller does not require them. - * @nonblocking: whether waiting for disk IO or mmap_sem contention + * @locked: whether we're still with the mmap_sem held * * Returns number of pages pinned. This may be fewer than the number * requested. If nr_pages is 0 or negative, returns 0. If no pages @@ -656,13 +656,11 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * appropriate) must be called after the page is finished with, and * before put_page is called. * - * If @nonblocking != NULL, __get_user_pages will not wait for disk IO - * or mmap_sem contention, and if waiting is needed to pin all pages, - * *@nonblocking will be set to 0. Further, if @gup_flags does not - * include FOLL_NOWAIT, the mmap_sem will be released via up_read() in - * this case. + * If @locked != NULL, *@locked will be set to 0 when mmap_sem is + * released by an up_read(). That can happen if @gup_flags does not + * has FOLL_NOWAIT. * - * A caller using such a combination of @nonblocking and @gup_flags + * A caller using such a combination of @locked and @gup_flags * must therefore hold the mmap_sem for reading only, and recognize * when it's been released. Otherwise, it must be held for either * reading or writing and will not be released. @@ -674,7 +672,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas, int *nonblocking) + struct vm_area_struct **vmas, int *locked) { long ret = 0, i = 0; struct vm_area_struct *vma = NULL; @@ -719,7 +717,7 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (is_vm_hugetlb_page(vma)) { i = follow_hugetlb_page(mm, vma, pages, vmas, , _pages, i, - gup_flags, nonblocking); +
[PATCH RFC v3 2/4] mm: userfault: return VM_FAULT_RETRY on signals
There was a special path in handle_userfault() in the past that we'll return a VM_FAULT_NOPAGE when we detected non-fatal signals when waiting for userfault handling. We did that by reacquiring the mmap_sem before returning. However that brings a risk in that the vmas might have changed when we retake the mmap_sem and even we could be holding an invalid vma structure. The problem was reported by syzbot. This patch removes the special path and we'll return a VM_FAULT_RETRY with the common path even if we have got such signals. Then for all the architectures that is passing in VM_FAULT_ALLOW_RETRY into handle_mm_fault(), we check not only for SIGKILL but for all the rest of userspace pending signals right after we returned from handle_mm_fault(). The idea comes from the upstream discussion between Linus and Andrea: https://lkml.org/lkml/2017/10/30/560 (This patch contains a potential fix for a double-free of mmap_sem on ARC architecture; please see https://lkml.org/lkml/2018/11/1/723 for more information) Suggested-by: Linus Torvalds Suggested-by: Andrea Arcangeli Signed-off-by: Peter Xu --- arch/alpha/mm/fault.c | 2 +- arch/arc/mm/fault.c| 11 +++ arch/arm/mm/fault.c| 14 ++ arch/arm64/mm/fault.c | 6 +++--- arch/hexagon/mm/vm_fault.c | 2 +- arch/ia64/mm/fault.c | 2 +- arch/m68k/mm/fault.c | 2 +- arch/microblaze/mm/fault.c | 2 +- arch/mips/mm/fault.c | 2 +- arch/nds32/mm/fault.c | 6 +++--- arch/nios2/mm/fault.c | 2 +- arch/openrisc/mm/fault.c | 2 +- arch/parisc/mm/fault.c | 2 +- arch/powerpc/mm/fault.c| 4 +++- arch/riscv/mm/fault.c | 4 ++-- arch/s390/mm/fault.c | 9 ++--- arch/sh/mm/fault.c | 4 arch/sparc/mm/fault_32.c | 3 +++ arch/sparc/mm/fault_64.c | 3 +++ arch/um/kernel/trap.c | 5 - arch/unicore32/mm/fault.c | 4 ++-- arch/x86/mm/fault.c| 12 +++- arch/xtensa/mm/fault.c | 3 +++ fs/userfaultfd.c | 24 24 files changed, 73 insertions(+), 57 deletions(-) diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c index d73dc473fbb9..46e5e420ad2a 100644 --- a/arch/alpha/mm/fault.c +++ b/arch/alpha/mm/fault.c @@ -150,7 +150,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr, the fault. */ fault = handle_mm_fault(vma, address, flags); - if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) + if ((fault & VM_FAULT_RETRY) && signal_pending(current)) return; if (unlikely(fault & VM_FAULT_ERROR)) { diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c index c9da6102eb4f..52b6b71b74e2 100644 --- a/arch/arc/mm/fault.c +++ b/arch/arc/mm/fault.c @@ -142,11 +142,14 @@ void do_page_fault(unsigned long address, struct pt_regs *regs) fault = handle_mm_fault(vma, address, flags); /* If Pagefault was interrupted by SIGKILL, exit page fault "early" */ - if (unlikely(fatal_signal_pending(current))) { - if ((fault & VM_FAULT_ERROR) && !(fault & VM_FAULT_RETRY)) + if (unlikely(fatal_signal_pending(current) && user_mode(regs))) { + /* +* VM_FAULT_RETRY means we have released the mmap_sem, +* otherwise we need to drop it before leaving +*/ + if (!(fault & VM_FAULT_RETRY)) up_read(>mmap_sem); - if (user_mode(regs)) - return; + return; } perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index f4ea4c62c613..743077d19669 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -308,14 +308,20 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) fault = __do_page_fault(mm, addr, fsr, flags, tsk); - /* If we need to retry but a fatal signal is pending, handle the + /* If we need to retry but a signal is pending, handle the * signal first. We do not need to release the mmap_sem because * it would already be released in __lock_page_or_retry in * mm/filemap.c. */ - if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) { - if (!user_mode(regs)) + if (fault & VM_FAULT_RETRY) { + if (fatal_signal_pending(current) && !user_mode(regs)) goto no_context; - return 0; + else if (signal_pending(current)) + /* +* It's either a common signal, or a fatal +* signal but for the userspace, we return +* immediately. +*/ + return 0; } /* diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 7d9571f4ae3d..744d6451ea83 100644 ---
[PATCH RFC v3 3/4] mm: allow VM_FAULT_RETRY for multiple times
The idea comes from a discussion between Linus and Andrea [1]. Before this patch we only allow a page fault to retry once. We achieved this by clearing the FAULT_FLAG_ALLOW_RETRY flag when doing handle_mm_fault() the second time. This was majorly used to avoid unexpected starvation of the system by looping over forever to handle the page fault on a single page. However that should hardly happen, and after all for each code path to return a VM_FAULT_RETRY we'll first wait for a condition (during which time we should possibly yield the cpu) to happen before VM_FAULT_RETRY is really returned. This patch removes the restriction by keeping the FAULT_FLAG_ALLOW_RETRY flag when we receive VM_FAULT_RETRY. It means that the page fault handler now can retry the page fault for multiple times if necessary without the need to generate another page fault event. Meanwhile we still keep the FAULT_FLAG_TRIED flag so page fault handler can still identify whether a page fault is the first attempt or not. GUP code is not touched yet and will be covered in follow up patch. This will be a nice enhancement for current code at the same time a supporting material for the future userfaultfd-writeprotect work since in that work there will always be an explicit userfault writeprotect retry for protected pages, and if that cannot resolve the page fault (e.g., when userfaultfd-writeprotect is used in conjunction with shared memory) then we'll possibly need a 3rd retry of the page fault. It might also benefit other potential users who will have similar requirement like userfault write-protection. Please read the thread below for more information. [1] https://lkml.org/lkml/2017/11/2/833 Suggested-by: Linus Torvalds Suggested-by: Andrea Arcangeli Signed-off-by: Peter Xu --- arch/alpha/mm/fault.c | 2 +- arch/arc/mm/fault.c| 1 - arch/arm/mm/fault.c| 3 --- arch/arm64/mm/fault.c | 5 - arch/hexagon/mm/vm_fault.c | 1 - arch/ia64/mm/fault.c | 1 - arch/m68k/mm/fault.c | 3 --- arch/microblaze/mm/fault.c | 1 - arch/mips/mm/fault.c | 1 - arch/nds32/mm/fault.c | 1 - arch/nios2/mm/fault.c | 3 --- arch/openrisc/mm/fault.c | 1 - arch/parisc/mm/fault.c | 2 -- arch/powerpc/mm/fault.c| 5 - arch/riscv/mm/fault.c | 5 - arch/s390/mm/fault.c | 5 + arch/sh/mm/fault.c | 1 - arch/sparc/mm/fault_32.c | 1 - arch/sparc/mm/fault_64.c | 1 - arch/um/kernel/trap.c | 1 - arch/unicore32/mm/fault.c | 6 +- arch/x86/mm/fault.c| 1 - arch/xtensa/mm/fault.c | 1 - 23 files changed, 3 insertions(+), 49 deletions(-) diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c index 46e5e420ad2a..deae82bb83c1 100644 --- a/arch/alpha/mm/fault.c +++ b/arch/alpha/mm/fault.c @@ -169,7 +169,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr, else current->min_flt++; if (fault & VM_FAULT_RETRY) { - flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; /* No need to up_read(>mmap_sem) as we would * have already released it in __lock_page_or_retry diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c index 52b6b71b74e2..a8b02db45324 100644 --- a/arch/arc/mm/fault.c +++ b/arch/arc/mm/fault.c @@ -168,7 +168,6 @@ void do_page_fault(unsigned long address, struct pt_regs *regs) } if (fault & VM_FAULT_RETRY) { - flags &= ~FAULT_FLAG_ALLOW_RETRY; flags |= FAULT_FLAG_TRIED; goto retry; } diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index 743077d19669..377781d8491a 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -342,9 +342,6 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) regs, addr); } if (fault & VM_FAULT_RETRY) { - /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk - * of starvation. */ - flags &= ~FAULT_FLAG_ALLOW_RETRY; flags |= FAULT_FLAG_TRIED; goto retry; } diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 744d6451ea83..8a26e03fc2bf 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -510,12 +510,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, return 0; } - /* -* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk of -* starvation. -*/ if (mm_flags & FAULT_FLAG_ALLOW_RETRY) { - mm_flags &= ~FAULT_FLAG_ALLOW_RETRY; mm_flags |=
[PATCH RFC v3 4/4] mm: gup: allow VM_FAULT_RETRY for multiple times
This is the gup counterpart of the change that allows the VM_FAULT_RETRY to happen for more than once. Signed-off-by: Peter Xu --- mm/gup.c | 17 + 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index a40111c742ba..7c3b3ab6be88 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -528,7 +528,10 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, if (*flags & FOLL_NOWAIT) fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; if (*flags & FOLL_TRIED) { - VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_ALLOW_RETRY); + /* +* Note: FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_TRIED +* can co-exist +*/ fault_flags |= FAULT_FLAG_TRIED; } @@ -944,17 +947,23 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk, /* VM_FAULT_RETRY triggered, so seek to the faulting offset */ pages += ret; start += ret << PAGE_SHIFT; + lock_dropped = true; +retry: /* * Repeat on the address that fired VM_FAULT_RETRY -* without FAULT_FLAG_ALLOW_RETRY but with +* with both FAULT_FLAG_ALLOW_RETRY and * FAULT_FLAG_TRIED. */ *locked = 1; - lock_dropped = true; down_read(>mmap_sem); ret = __get_user_pages(tsk, mm, start, 1, flags | FOLL_TRIED, - pages, NULL, NULL); + pages, NULL, locked); + if (!*locked) { + /* Continue to retry until we succeeded */ + BUG_ON(ret != 0); + goto retry; + } if (ret != 1) { BUG_ON(ret > 1); if (!pages_done) -- 2.17.1
[PATCH RFC v3 3/4] mm: allow VM_FAULT_RETRY for multiple times
The idea comes from a discussion between Linus and Andrea [1]. Before this patch we only allow a page fault to retry once. We achieved this by clearing the FAULT_FLAG_ALLOW_RETRY flag when doing handle_mm_fault() the second time. This was majorly used to avoid unexpected starvation of the system by looping over forever to handle the page fault on a single page. However that should hardly happen, and after all for each code path to return a VM_FAULT_RETRY we'll first wait for a condition (during which time we should possibly yield the cpu) to happen before VM_FAULT_RETRY is really returned. This patch removes the restriction by keeping the FAULT_FLAG_ALLOW_RETRY flag when we receive VM_FAULT_RETRY. It means that the page fault handler now can retry the page fault for multiple times if necessary without the need to generate another page fault event. Meanwhile we still keep the FAULT_FLAG_TRIED flag so page fault handler can still identify whether a page fault is the first attempt or not. GUP code is not touched yet and will be covered in follow up patch. This will be a nice enhancement for current code at the same time a supporting material for the future userfaultfd-writeprotect work since in that work there will always be an explicit userfault writeprotect retry for protected pages, and if that cannot resolve the page fault (e.g., when userfaultfd-writeprotect is used in conjunction with shared memory) then we'll possibly need a 3rd retry of the page fault. It might also benefit other potential users who will have similar requirement like userfault write-protection. Please read the thread below for more information. [1] https://lkml.org/lkml/2017/11/2/833 Suggested-by: Linus Torvalds Suggested-by: Andrea Arcangeli Signed-off-by: Peter Xu --- arch/alpha/mm/fault.c | 2 +- arch/arc/mm/fault.c| 1 - arch/arm/mm/fault.c| 3 --- arch/arm64/mm/fault.c | 5 - arch/hexagon/mm/vm_fault.c | 1 - arch/ia64/mm/fault.c | 1 - arch/m68k/mm/fault.c | 3 --- arch/microblaze/mm/fault.c | 1 - arch/mips/mm/fault.c | 1 - arch/nds32/mm/fault.c | 1 - arch/nios2/mm/fault.c | 3 --- arch/openrisc/mm/fault.c | 1 - arch/parisc/mm/fault.c | 2 -- arch/powerpc/mm/fault.c| 5 - arch/riscv/mm/fault.c | 5 - arch/s390/mm/fault.c | 5 + arch/sh/mm/fault.c | 1 - arch/sparc/mm/fault_32.c | 1 - arch/sparc/mm/fault_64.c | 1 - arch/um/kernel/trap.c | 1 - arch/unicore32/mm/fault.c | 6 +- arch/x86/mm/fault.c| 1 - arch/xtensa/mm/fault.c | 1 - 23 files changed, 3 insertions(+), 49 deletions(-) diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c index 46e5e420ad2a..deae82bb83c1 100644 --- a/arch/alpha/mm/fault.c +++ b/arch/alpha/mm/fault.c @@ -169,7 +169,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr, else current->min_flt++; if (fault & VM_FAULT_RETRY) { - flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; /* No need to up_read(>mmap_sem) as we would * have already released it in __lock_page_or_retry diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c index 52b6b71b74e2..a8b02db45324 100644 --- a/arch/arc/mm/fault.c +++ b/arch/arc/mm/fault.c @@ -168,7 +168,6 @@ void do_page_fault(unsigned long address, struct pt_regs *regs) } if (fault & VM_FAULT_RETRY) { - flags &= ~FAULT_FLAG_ALLOW_RETRY; flags |= FAULT_FLAG_TRIED; goto retry; } diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index 743077d19669..377781d8491a 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -342,9 +342,6 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) regs, addr); } if (fault & VM_FAULT_RETRY) { - /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk - * of starvation. */ - flags &= ~FAULT_FLAG_ALLOW_RETRY; flags |= FAULT_FLAG_TRIED; goto retry; } diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 744d6451ea83..8a26e03fc2bf 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -510,12 +510,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, return 0; } - /* -* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk of -* starvation. -*/ if (mm_flags & FAULT_FLAG_ALLOW_RETRY) { - mm_flags &= ~FAULT_FLAG_ALLOW_RETRY; mm_flags |=
[PATCH RFC v3 4/4] mm: gup: allow VM_FAULT_RETRY for multiple times
This is the gup counterpart of the change that allows the VM_FAULT_RETRY to happen for more than once. Signed-off-by: Peter Xu --- mm/gup.c | 17 + 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index a40111c742ba..7c3b3ab6be88 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -528,7 +528,10 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, if (*flags & FOLL_NOWAIT) fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; if (*flags & FOLL_TRIED) { - VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_ALLOW_RETRY); + /* +* Note: FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_TRIED +* can co-exist +*/ fault_flags |= FAULT_FLAG_TRIED; } @@ -944,17 +947,23 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk, /* VM_FAULT_RETRY triggered, so seek to the faulting offset */ pages += ret; start += ret << PAGE_SHIFT; + lock_dropped = true; +retry: /* * Repeat on the address that fired VM_FAULT_RETRY -* without FAULT_FLAG_ALLOW_RETRY but with +* with both FAULT_FLAG_ALLOW_RETRY and * FAULT_FLAG_TRIED. */ *locked = 1; - lock_dropped = true; down_read(>mmap_sem); ret = __get_user_pages(tsk, mm, start, 1, flags | FOLL_TRIED, - pages, NULL, NULL); + pages, NULL, locked); + if (!*locked) { + /* Continue to retry until we succeeded */ + BUG_ON(ret != 0); + goto retry; + } if (ret != 1) { BUG_ON(ret > 1); if (!pages_done) -- 2.17.1
[PATCH RFC v3 0/4] mm: some enhancements to the page fault mechanism
v3: - fix up issues that krobot reported, rebase This is an RFC series as cleanup and enhancements to current page fault logic. The whole idea comes from the discussion between Andrea and Linus on the bug reported by syzbot here: https://lkml.org/lkml/2017/11/2/833 Basically it does two things: (a) Allows the page fault logic to be more interactive on not only SIGKILL, but also the rest of userspace signals, and, (b) Allows the page fault retry (VM_FAULT_RETRY) to happen for more than once. For (a): with the changes we should be able to react faster when page faults are working in parallel with userspace signals like SIGSTOP and SIGCONT (and more), and with that we can remove the buggy part in userfaultfd and benefit the whole page fault mechanism on faster signal processing to reach the userspace. For (b), we should be able to allow the page fault handler to loop for even more than twice. Some context: for now since we have FAULT_FLAG_ALLOW_RETRY we can allow to retry the page fault once with the same interrupt context, however never more than twice. This can be not only a potential cleanup to remove this assumption since AFAIU the code itself doesn't really have this twice-only limitation (though that should be a protective approach in the past), at the same time it'll greatly simplify future works like userfaultfd write-protect where it's possible to retry for more than twice (please have a look at [1] below for a possible user that might require the page fault to be handled for a third time; if we can remove the retry limitation we can simply drop that patch and those complexity). Some more details on each of the patch (even more in commit messages): Patch 1: A cleanup of existing GUP code to rename the confusing "nonblocking" parameter to "locked" which seems suite more. Patch 2: Complete the page fault faster for non-sigkill signals Patch 3: Remove the limitation to only allow to retry page fault for twice (page fault part) Patch 4: Similar work of patch 3, but for GUP. The series is only lightly tested. Before running more tests, I'd be really glad to see whether there's any feedback first. Looking forward to your comments. Thanks, [1] https://git.kernel.org/pub/scm/linux/kernel/git/andrea/aa.git/commit/?h=userfault=b245ecf6cf59156966f3da6e6b674f6695a5ffa5 Peter Xu (4): mm: gup: rename "nonblocking" to "locked" where proper mm: userfault: return VM_FAULT_RETRY on signals mm: allow VM_FAULT_RETRY for multiple times mm: gup: allow VM_FAULT_RETRY for multiple times arch/alpha/mm/fault.c | 4 +-- arch/arc/mm/fault.c| 12 arch/arm/mm/fault.c| 17 ++- arch/arm64/mm/fault.c | 11 ++- arch/hexagon/mm/vm_fault.c | 3 +- arch/ia64/mm/fault.c | 3 +- arch/m68k/mm/fault.c | 5 +--- arch/microblaze/mm/fault.c | 3 +- arch/mips/mm/fault.c | 3 +- arch/nds32/mm/fault.c | 7 ++--- arch/nios2/mm/fault.c | 5 +--- arch/openrisc/mm/fault.c | 3 +- arch/parisc/mm/fault.c | 4 +-- arch/powerpc/mm/fault.c| 9 ++ arch/riscv/mm/fault.c | 9 ++ arch/s390/mm/fault.c | 14 - arch/sh/mm/fault.c | 5 +++- arch/sparc/mm/fault_32.c | 4 ++- arch/sparc/mm/fault_64.c | 4 ++- arch/um/kernel/trap.c | 6 ++-- arch/unicore32/mm/fault.c | 10 ++- arch/x86/mm/fault.c| 13 ++-- arch/xtensa/mm/fault.c | 4 ++- fs/userfaultfd.c | 24 --- mm/gup.c | 61 +- mm/hugetlb.c | 8 ++--- 26 files changed, 114 insertions(+), 137 deletions(-) -- 2.17.1
[PATCH RFC v3 0/4] mm: some enhancements to the page fault mechanism
v3: - fix up issues that krobot reported, rebase This is an RFC series as cleanup and enhancements to current page fault logic. The whole idea comes from the discussion between Andrea and Linus on the bug reported by syzbot here: https://lkml.org/lkml/2017/11/2/833 Basically it does two things: (a) Allows the page fault logic to be more interactive on not only SIGKILL, but also the rest of userspace signals, and, (b) Allows the page fault retry (VM_FAULT_RETRY) to happen for more than once. For (a): with the changes we should be able to react faster when page faults are working in parallel with userspace signals like SIGSTOP and SIGCONT (and more), and with that we can remove the buggy part in userfaultfd and benefit the whole page fault mechanism on faster signal processing to reach the userspace. For (b), we should be able to allow the page fault handler to loop for even more than twice. Some context: for now since we have FAULT_FLAG_ALLOW_RETRY we can allow to retry the page fault once with the same interrupt context, however never more than twice. This can be not only a potential cleanup to remove this assumption since AFAIU the code itself doesn't really have this twice-only limitation (though that should be a protective approach in the past), at the same time it'll greatly simplify future works like userfaultfd write-protect where it's possible to retry for more than twice (please have a look at [1] below for a possible user that might require the page fault to be handled for a third time; if we can remove the retry limitation we can simply drop that patch and those complexity). Some more details on each of the patch (even more in commit messages): Patch 1: A cleanup of existing GUP code to rename the confusing "nonblocking" parameter to "locked" which seems suite more. Patch 2: Complete the page fault faster for non-sigkill signals Patch 3: Remove the limitation to only allow to retry page fault for twice (page fault part) Patch 4: Similar work of patch 3, but for GUP. The series is only lightly tested. Before running more tests, I'd be really glad to see whether there's any feedback first. Looking forward to your comments. Thanks, [1] https://git.kernel.org/pub/scm/linux/kernel/git/andrea/aa.git/commit/?h=userfault=b245ecf6cf59156966f3da6e6b674f6695a5ffa5 Peter Xu (4): mm: gup: rename "nonblocking" to "locked" where proper mm: userfault: return VM_FAULT_RETRY on signals mm: allow VM_FAULT_RETRY for multiple times mm: gup: allow VM_FAULT_RETRY for multiple times arch/alpha/mm/fault.c | 4 +-- arch/arc/mm/fault.c| 12 arch/arm/mm/fault.c| 17 ++- arch/arm64/mm/fault.c | 11 ++- arch/hexagon/mm/vm_fault.c | 3 +- arch/ia64/mm/fault.c | 3 +- arch/m68k/mm/fault.c | 5 +--- arch/microblaze/mm/fault.c | 3 +- arch/mips/mm/fault.c | 3 +- arch/nds32/mm/fault.c | 7 ++--- arch/nios2/mm/fault.c | 5 +--- arch/openrisc/mm/fault.c | 3 +- arch/parisc/mm/fault.c | 4 +-- arch/powerpc/mm/fault.c| 9 ++ arch/riscv/mm/fault.c | 9 ++ arch/s390/mm/fault.c | 14 - arch/sh/mm/fault.c | 5 +++- arch/sparc/mm/fault_32.c | 4 ++- arch/sparc/mm/fault_64.c | 4 ++- arch/um/kernel/trap.c | 6 ++-- arch/unicore32/mm/fault.c | 10 ++- arch/x86/mm/fault.c| 13 ++-- arch/xtensa/mm/fault.c | 4 ++- fs/userfaultfd.c | 24 --- mm/gup.c | 61 +- mm/hugetlb.c | 8 ++--- 26 files changed, 114 insertions(+), 137 deletions(-) -- 2.17.1
Re: [PATCH 1/4] OPP: Add dev_pm_opp_xlate_performance_state() helper
On 11/21/2018 11:36 AM, Viresh Kumar wrote: On 21-11-18, 10:47, Viresh Kumar wrote: On 21-11-18, 10:34, Rajendra Nayak wrote: On 11/5/2018 12:06 PM, Viresh Kumar wrote: Introduce a new helper dev_pm_opp_xlate_performance_state() which will be used to translate from pstate of a device to another one. Initially this will be used by genpd to find pstate of a master domain using its sub-domain's pstate. Signed-off-by: Viresh Kumar --- drivers/opp/core.c | 49 ++ include/linux/pm_opp.h | 7 ++ 2 files changed, 56 insertions(+) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index 0eaa954b3f6c..010a4268e8dd 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -1707,6 +1707,55 @@ void dev_pm_opp_put_genpd_virt_dev(struct opp_table *opp_table, dev_err(virt_dev, "Failed to find required device entry\n"); } +/** + * dev_pm_opp_xlate_performance_state() - Find required OPP's pstate for src_table. + * @src_table: OPP table which has dst_table as one of its required OPP table. So I have a case where the src_table and dst_table are shared/same. Can you explain how would it work in such a case? Can you give the example, as I am finding some issues with such shared tables. Though the code may work just fine btw. I may have found the problem you are facing here. Please try this diff and tell me if you hitting it, check this in dmesg. Yes, I do seem to be hitting this. diff --git a/drivers/opp/of.c b/drivers/opp/of.c index 3740822b4197..6c45774306b0 100644 --- a/drivers/opp/of.c +++ b/drivers/opp/of.c @@ -225,6 +225,7 @@ static void _opp_table_alloc_required_tables(struct opp_table *opp_table, free_required_tables: _opp_table_free_required_tables(opp_table); put_np: + dev_err(dev, "Failed to allocate required OPP tables\n"); of_node_put(np); }
Re: [PATCH 1/4] OPP: Add dev_pm_opp_xlate_performance_state() helper
On 11/21/2018 11:36 AM, Viresh Kumar wrote: On 21-11-18, 10:47, Viresh Kumar wrote: On 21-11-18, 10:34, Rajendra Nayak wrote: On 11/5/2018 12:06 PM, Viresh Kumar wrote: Introduce a new helper dev_pm_opp_xlate_performance_state() which will be used to translate from pstate of a device to another one. Initially this will be used by genpd to find pstate of a master domain using its sub-domain's pstate. Signed-off-by: Viresh Kumar --- drivers/opp/core.c | 49 ++ include/linux/pm_opp.h | 7 ++ 2 files changed, 56 insertions(+) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index 0eaa954b3f6c..010a4268e8dd 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -1707,6 +1707,55 @@ void dev_pm_opp_put_genpd_virt_dev(struct opp_table *opp_table, dev_err(virt_dev, "Failed to find required device entry\n"); } +/** + * dev_pm_opp_xlate_performance_state() - Find required OPP's pstate for src_table. + * @src_table: OPP table which has dst_table as one of its required OPP table. So I have a case where the src_table and dst_table are shared/same. Can you explain how would it work in such a case? Can you give the example, as I am finding some issues with such shared tables. Though the code may work just fine btw. I may have found the problem you are facing here. Please try this diff and tell me if you hitting it, check this in dmesg. Yes, I do seem to be hitting this. diff --git a/drivers/opp/of.c b/drivers/opp/of.c index 3740822b4197..6c45774306b0 100644 --- a/drivers/opp/of.c +++ b/drivers/opp/of.c @@ -225,6 +225,7 @@ static void _opp_table_alloc_required_tables(struct opp_table *opp_table, free_required_tables: _opp_table_free_required_tables(opp_table); put_np: + dev_err(dev, "Failed to allocate required OPP tables\n"); of_node_put(np); }
Re: [Patch v6 14/16] x86/speculation: Use STIBP to restrict speculation on non-dumpable task
On Tue, 20 Nov 2018, Linus Torvalds wrote: > > Implements arch_update_spec_restriction() for x86. Use STIBP to > > restrict speculative execution when running a task set to non-dumpable, > > or clear the restriction if the task is set to dumpable. > > I don't think this necessarily makes sense. > > The new "auto" behavior is that we aim to restrict untrusted code (and > the loader of such code uses prctrl to set that flag), then this whole > "set STIBP for non-dumpable" makes little sense. > > A non-dumpable app by definition is *more* trusted, not less trusted. I understand your argument. I believe actually both ways of protection do make sense in some way (but it doesn't mean we should do it by default). Basically: - process marks itself "I am loading untrusted code" via that prctl() in order to avoid its untrusted code to be used as spectrev2 gadgets - process marks itself "I am loading untrusted code" via that prctl() in order to have its all threads/subprocesses marked the same way, so that one thread can't influence speculative code flow of the other in order to read its memory (the "javascript in one browser tab reads secrets from another tab") - non-dumpable tasks have the branch predictor flushed when context switching to them (IBPB) or when sibling is running untrusted code (STIBP) in order not guarantee that its speculative code flow can never be influenced by previous / sibling process mistraining branch predictor for it, and therefore do not allow reading its secrets from memory through gadgets that'd have to be in the process code itself But I agree there are many reasons why this shouldn't be done by default if we accept 'prctl' as the default mode. Namely: - the whole "proper gadgets need to be present in the process' .text" is dubious by itself - the unavoidable overhead it'd impose on network daemons that you can't really get rid of The distiled patchset that Thomas will be sending around today is not have the dumpability restriction in it. Thanks, -- Jiri Kosina SUSE Labs
Re: [Patch v6 14/16] x86/speculation: Use STIBP to restrict speculation on non-dumpable task
On Tue, 20 Nov 2018, Linus Torvalds wrote: > > Implements arch_update_spec_restriction() for x86. Use STIBP to > > restrict speculative execution when running a task set to non-dumpable, > > or clear the restriction if the task is set to dumpable. > > I don't think this necessarily makes sense. > > The new "auto" behavior is that we aim to restrict untrusted code (and > the loader of such code uses prctrl to set that flag), then this whole > "set STIBP for non-dumpable" makes little sense. > > A non-dumpable app by definition is *more* trusted, not less trusted. I understand your argument. I believe actually both ways of protection do make sense in some way (but it doesn't mean we should do it by default). Basically: - process marks itself "I am loading untrusted code" via that prctl() in order to avoid its untrusted code to be used as spectrev2 gadgets - process marks itself "I am loading untrusted code" via that prctl() in order to have its all threads/subprocesses marked the same way, so that one thread can't influence speculative code flow of the other in order to read its memory (the "javascript in one browser tab reads secrets from another tab") - non-dumpable tasks have the branch predictor flushed when context switching to them (IBPB) or when sibling is running untrusted code (STIBP) in order not guarantee that its speculative code flow can never be influenced by previous / sibling process mistraining branch predictor for it, and therefore do not allow reading its secrets from memory through gadgets that'd have to be in the process code itself But I agree there are many reasons why this shouldn't be done by default if we accept 'prctl' as the default mode. Namely: - the whole "proper gadgets need to be present in the process' .text" is dubious by itself - the unavoidable overhead it'd impose on network daemons that you can't really get rid of The distiled patchset that Thomas will be sending around today is not have the dumpability restriction in it. Thanks, -- Jiri Kosina SUSE Labs
Re: [PATCH v2 1/2] modpost: drop unused command line switches
On Thu, Nov 15, 2018 at 9:56 AM Paul Walmsley wrote: > > Drop modpost command line switches that are no longer used by > makefile.modpost, upon request from Sam Ravnborg , > who wrote: > > modpost is not supposed to be used outside the kernel build. [...] > I checked if there were any options supported by modpost that > was not configurable in makefile.modpost. > And I could see that the -M and -K options in getopt() was leftovers. > The code that used these option was was dropped in: > commit a8773769d1a1 ("Kbuild: clear marker out of modpost") > > Could you add a patch that delete these on top of what you already have. > > https://lore.kernel.org/lkml/20181020140835.ga3...@ravnborg.org/ > > Cc: Sam Ravnborg > Cc: Masahiro Yamada > Cc: Michal Marek > Cc: linux-kbu...@vger.kernel.org > Cc: linux-kernel@vger.kernel.org > Signed-off-by: Paul Walmsley > Signed-off-by: Paul Walmsley > --- Applied to linux-kbuild. Thanks! > scripts/mod/modpost.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c > index 0d998c54564d..85bd93c63180 100644 > --- a/scripts/mod/modpost.c > +++ b/scripts/mod/modpost.c > @@ -2416,7 +2416,7 @@ int main(int argc, char **argv) > struct ext_sym_list *extsym_iter; > struct ext_sym_list *extsym_start = NULL; > > - while ((opt = getopt(argc, argv, "i:I:e:mnsST:o:awM:K:E")) != -1) { > + while ((opt = getopt(argc, argv, "i:I:e:mnsST:o:awE")) != -1) { > switch (opt) { > case 'i': > kernel_read = optarg; > -- > 2.19.1 > -- Best Regards Masahiro Yamada
Re: [PATCH v2 1/2] modpost: drop unused command line switches
On Thu, Nov 15, 2018 at 9:56 AM Paul Walmsley wrote: > > Drop modpost command line switches that are no longer used by > makefile.modpost, upon request from Sam Ravnborg , > who wrote: > > modpost is not supposed to be used outside the kernel build. [...] > I checked if there were any options supported by modpost that > was not configurable in makefile.modpost. > And I could see that the -M and -K options in getopt() was leftovers. > The code that used these option was was dropped in: > commit a8773769d1a1 ("Kbuild: clear marker out of modpost") > > Could you add a patch that delete these on top of what you already have. > > https://lore.kernel.org/lkml/20181020140835.ga3...@ravnborg.org/ > > Cc: Sam Ravnborg > Cc: Masahiro Yamada > Cc: Michal Marek > Cc: linux-kbu...@vger.kernel.org > Cc: linux-kernel@vger.kernel.org > Signed-off-by: Paul Walmsley > Signed-off-by: Paul Walmsley > --- Applied to linux-kbuild. Thanks! > scripts/mod/modpost.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c > index 0d998c54564d..85bd93c63180 100644 > --- a/scripts/mod/modpost.c > +++ b/scripts/mod/modpost.c > @@ -2416,7 +2416,7 @@ int main(int argc, char **argv) > struct ext_sym_list *extsym_iter; > struct ext_sym_list *extsym_start = NULL; > > - while ((opt = getopt(argc, argv, "i:I:e:mnsST:o:awM:K:E")) != -1) { > + while ((opt = getopt(argc, argv, "i:I:e:mnsST:o:awE")) != -1) { > switch (opt) { > case 'i': > kernel_read = optarg; > -- > 2.19.1 > -- Best Regards Masahiro Yamada
Re: [PATCH v2 2/2] modpost: skip ELF local symbols by default during section mismatch check
Hi Paul, On Thu, Nov 15, 2018 at 9:57 AM Paul Walmsley wrote: > > During development of a serial console driver with a gcc 8.2.0 > toolchain for RISC-V, the following modpost warning appeared: > > > WARNING: vmlinux.o(.data+0x19b10): Section mismatch in reference from the > variable .LANCHOR1 to the function .init.text:sifive_serial_console_setup() > The variable .LANCHOR1 references > the function __init sifive_serial_console_setup() > If the reference is valid then annotate the > variable with __init* or __refdata (see linux/init.h) or name the variable: > *_template, *_timer, *_sht, *_ops, *_probe, *_probe_one, *_console > > > ".LANCHOR1" is an ELF local symbol, automatically created by gcc's section > anchor generation code: > > https://gcc.gnu.org/onlinedocs/gccint/Anchored-Addresses.html > > https://gcc.gnu.org/git/?p=gcc.git;a=blob;f=gcc/varasm.c;h=cd9591a45617464946dcf9a126dde277d9de9804;hb=9fb89fa845c1b2e0a18d85ada0b077c84508ab78#l7473 > > This was verified by compiling the kernel with -fno-section-anchors. > The serial driver code idiom triggering the warning is standard serial > driver practice, and one that has a specific whitelist inclusion in > modpost.c. > > I'm neither a modpost nor an ELF expert, but naively, it doesn't seem > useful for modpost to report section mismatch warnings caused by ELF > local symbols by default. Local symbols have compiler-generated > names, and thus bypass modpost's whitelisting algorithm, which relies > on the presence of a non-autogenerated symbol name. This increases > the likelihood that false positive warnings will be generated (as in > the above case). > > Thus, disable section mismatch reporting on ELF local symbols. The > rationale here is similar to that of commit 2e3a10a1551d ("ARM: avoid > ARM binutils leaking ELF local symbols") and of similar code already > present in modpost.c: > > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/scripts/mod/modpost.c?h=v4.19-rc4=7876320f88802b22d4e2daf7eb027dd14175a0f8#n1256 > > This second version of the patch drops the option to keep section > mismatch warnings for local sections, based on feedback from Sam > Ravnborg ; and clarifies that these warnings > appear with gcc 8.2.0. > > Cc: Russell King > Cc: Jim Wilson > Cc: Masahiro Yamada > Cc: Michal Marek > Cc: Sam Ravnborg > Cc: linux-kbu...@vger.kernel.org > Cc: linux-kernel@vger.kernel.org > Signed-off-by: Paul Walmsley > Signed-off-by: Paul Walmsley > --- > scripts/mod/modpost.c | 14 ++ > 1 file changed, 14 insertions(+) > > diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c > index 85bd93c63180..0fb148171b78 100644 > --- a/scripts/mod/modpost.c > +++ b/scripts/mod/modpost.c > @@ -1253,6 +1253,17 @@ static inline int is_arm_mapping_symbol(const char > *str) >&& (str[2] == '\0' || str[2] == '.'); > } > > +/* > + * If a symbol name follows the convention for ELF-local symbols (i.e., the > + * name begins with a ".L"), return true; otherwise false. This is used to > + * skip section mismatch reporting on ELF-local symbols, due to the risk of > + * false positives. > + */ > +static inline int is_local_symbol(const char *str) > +{ > + return str[0] == '.' && str[1] == 'L'; > +} > + > /* > * If there's no name there, ignore it; likewise, ignore it if it's > * one of the magic symbols emitted used by current ARM tools. > @@ -1535,6 +1546,9 @@ static void default_mismatch_handler(const char > *modname, struct elf_info *elf, > if (strstarts(fromsym, "reference___initcall")) > return; > > + if (is_local_symbol(fromsym)) > + return; > + > tosec = sec_name(elf, get_secindex(elf, sym)); > to = find_elf_symbol(elf, r->r_addend, sym); > tosym = sym_name(elf, to); > -- > 2.19.1 > I think this is almost good. Just a nit. Maybe, putting this check in secref_whitelist() (with a comment "Pattern 6:") could look more consistency. Also, if you use strstart() helper, you can remove is_local_symbol() function. /* Check for pattern 6 */ if (strstarts(fromsym, ".L")) return 0; -- Best Regards Masahiro Yamada
Re: [PATCH v2 2/2] modpost: skip ELF local symbols by default during section mismatch check
Hi Paul, On Thu, Nov 15, 2018 at 9:57 AM Paul Walmsley wrote: > > During development of a serial console driver with a gcc 8.2.0 > toolchain for RISC-V, the following modpost warning appeared: > > > WARNING: vmlinux.o(.data+0x19b10): Section mismatch in reference from the > variable .LANCHOR1 to the function .init.text:sifive_serial_console_setup() > The variable .LANCHOR1 references > the function __init sifive_serial_console_setup() > If the reference is valid then annotate the > variable with __init* or __refdata (see linux/init.h) or name the variable: > *_template, *_timer, *_sht, *_ops, *_probe, *_probe_one, *_console > > > ".LANCHOR1" is an ELF local symbol, automatically created by gcc's section > anchor generation code: > > https://gcc.gnu.org/onlinedocs/gccint/Anchored-Addresses.html > > https://gcc.gnu.org/git/?p=gcc.git;a=blob;f=gcc/varasm.c;h=cd9591a45617464946dcf9a126dde277d9de9804;hb=9fb89fa845c1b2e0a18d85ada0b077c84508ab78#l7473 > > This was verified by compiling the kernel with -fno-section-anchors. > The serial driver code idiom triggering the warning is standard serial > driver practice, and one that has a specific whitelist inclusion in > modpost.c. > > I'm neither a modpost nor an ELF expert, but naively, it doesn't seem > useful for modpost to report section mismatch warnings caused by ELF > local symbols by default. Local symbols have compiler-generated > names, and thus bypass modpost's whitelisting algorithm, which relies > on the presence of a non-autogenerated symbol name. This increases > the likelihood that false positive warnings will be generated (as in > the above case). > > Thus, disable section mismatch reporting on ELF local symbols. The > rationale here is similar to that of commit 2e3a10a1551d ("ARM: avoid > ARM binutils leaking ELF local symbols") and of similar code already > present in modpost.c: > > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/scripts/mod/modpost.c?h=v4.19-rc4=7876320f88802b22d4e2daf7eb027dd14175a0f8#n1256 > > This second version of the patch drops the option to keep section > mismatch warnings for local sections, based on feedback from Sam > Ravnborg ; and clarifies that these warnings > appear with gcc 8.2.0. > > Cc: Russell King > Cc: Jim Wilson > Cc: Masahiro Yamada > Cc: Michal Marek > Cc: Sam Ravnborg > Cc: linux-kbu...@vger.kernel.org > Cc: linux-kernel@vger.kernel.org > Signed-off-by: Paul Walmsley > Signed-off-by: Paul Walmsley > --- > scripts/mod/modpost.c | 14 ++ > 1 file changed, 14 insertions(+) > > diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c > index 85bd93c63180..0fb148171b78 100644 > --- a/scripts/mod/modpost.c > +++ b/scripts/mod/modpost.c > @@ -1253,6 +1253,17 @@ static inline int is_arm_mapping_symbol(const char > *str) >&& (str[2] == '\0' || str[2] == '.'); > } > > +/* > + * If a symbol name follows the convention for ELF-local symbols (i.e., the > + * name begins with a ".L"), return true; otherwise false. This is used to > + * skip section mismatch reporting on ELF-local symbols, due to the risk of > + * false positives. > + */ > +static inline int is_local_symbol(const char *str) > +{ > + return str[0] == '.' && str[1] == 'L'; > +} > + > /* > * If there's no name there, ignore it; likewise, ignore it if it's > * one of the magic symbols emitted used by current ARM tools. > @@ -1535,6 +1546,9 @@ static void default_mismatch_handler(const char > *modname, struct elf_info *elf, > if (strstarts(fromsym, "reference___initcall")) > return; > > + if (is_local_symbol(fromsym)) > + return; > + > tosec = sec_name(elf, get_secindex(elf, sym)); > to = find_elf_symbol(elf, r->r_addend, sym); > tosym = sym_name(elf, to); > -- > 2.19.1 > I think this is almost good. Just a nit. Maybe, putting this check in secref_whitelist() (with a comment "Pattern 6:") could look more consistency. Also, if you use strstart() helper, you can remove is_local_symbol() function. /* Check for pattern 6 */ if (strstarts(fromsym, ".L")) return 0; -- Best Regards Masahiro Yamada
Re: [PATCH] staging: greybus: Parenthesis alignment
On Tue, Nov 20, 2018 at 11:42:05PM +0100, Cristian Sicilia wrote: > Thanks for note > > Can make sense do the patch for function that stay in the line but is not > aligned, or is better doesn't change anything? > Like this: > > static ssize_t gb_camera_debugfs_capabilities(struct gb_camera *gcam, > - char *buf, size_t len) > + char *buf, size_t len) If you really want to, sure :) greg k-h
Re: [PATCH] staging: greybus: Parenthesis alignment
On Tue, Nov 20, 2018 at 11:42:05PM +0100, Cristian Sicilia wrote: > Thanks for note > > Can make sense do the patch for function that stay in the line but is not > aligned, or is better doesn't change anything? > Like this: > > static ssize_t gb_camera_debugfs_capabilities(struct gb_camera *gcam, > - char *buf, size_t len) > + char *buf, size_t len) If you really want to, sure :) greg k-h
Re: [PATCH v2 0/6] RFC: gup+dma: tracking dma-pinned pages
On 11/19/18 10:57 AM, Tom Talpey wrote: > John, thanks for the discussion at LPC. One of the concerns we > raised however was the performance test. The numbers below are > rather obviously tainted. I think we need to get a better baseline > before concluding anything... > > Here's my main concern: > Hi Tom, Thanks again for looking at this! > On 11/10/2018 3:50 AM, john.hubb...@gmail.com wrote: >> From: John Hubbard >> ... >> -- >> WITHOUT the patch: >> -- >> reader: (g=0): rw=read, bs=(R) 4096B-4096B, (W) 4096B-4096B, (T) >> 4096B-4096B, ioengine=libaio, iodepth=64 >> fio-3.3 >> Starting 1 process >> Jobs: 1 (f=1): [R(1)][100.0%][r=55.5MiB/s,w=0KiB/s][r=14.2k,w=0 IOPS][eta >> 00m:00s] >> reader: (groupid=0, jobs=1): err= 0: pid=1750: Tue Nov 6 20:18:06 2018 >> read: IOPS=13.9k, BW=54.4MiB/s (57.0MB/s)(1024MiB/18826msec) > > ~14000 4KB read IOPS is really, really low for an NVMe disk. Yes, but Jan Kara's original config file for fio is *intended* to highlight the get_user_pages/put_user_pages changes. It was *not* intended to get max performance, as you can see by the numjobs and direct IO parameters: cat fio.conf [reader] direct=1 ioengine=libaio blocksize=4096 size=1g numjobs=1 rw=read iodepth=64 So I'm thinking that this is not a "tainted" test, but rather, we're constraining things a lot with these choices. It's hard to find a good test config to run that allows decisions, but so far, I'm not really seeing anything that says "this is so bad that we can't afford to fix the brokenness." I think. After talking with you and reading this email, I did a bunch more test runs, varying the following fio parameters: -- direct -- numjobs -- iodepth ...with both the baseline 4.20-rc3 kernel, and with my patches applied. (btw, if anyone cares, I'll post a github link that has a complete, testable patchset--not ready for submission as such, but it works cleanly and will allow others to attempt to reproduce my results). What I'm seeing is that I can get 10x or better improvements in IOPS and BW, just by going to 10 threads and turning off direct IO--as expected. So in the end, I increased the number of threads, and also increased iodepth a bit. Test results below... > >> cpu : usr=2.39%, sys=95.30%, ctx=669, majf=0, minf=72 > > CPU is obviously the limiting factor. At these IOPS, it should be far > less. >> -- >> OR, here's a better run WITH the patch applied, and you can see that this is >> nearly as good >> as the "without" case: >> -- >> >> reader: (g=0): rw=read, bs=(R) 4096B-4096B, (W) 4096B-4096B, (T) >> 4096B-4096B, ioengine=libaio, iodepth=64 >> fio-3.3 >> Starting 1 process >> Jobs: 1 (f=1): [R(1)][100.0%][r=53.2MiB/s,w=0KiB/s][r=13.6k,w=0 IOPS][eta >> 00m:00s] >> reader: (groupid=0, jobs=1): err= 0: pid=2521: Tue Nov 6 20:01:33 2018 >> read: IOPS=13.4k, BW=52.5MiB/s (55.1MB/s)(1024MiB/19499msec) > > Similar low IOPS. > >> cpu : usr=3.47%, sys=94.61%, ctx=370, majf=0, minf=73 > > Similar CPU saturation. > >> > > I get nearly 400,000 4KB IOPS on my tiny desktop, which has a 25W > i7-7500 and a Samsung PM961 128GB NVMe (stock Bionic 4.15 kernel > and fio version 3.1). Even then, the CPU saturates, so it's not > necessarily a perfect test. I'd like to see your runs both get to > "max" IOPS, i.e. CPU < 100%, and compare the CPU numbers. This would > give the best comparison for making a decision. I can get to CPU < 100% by increasing to 10 or 20 threads, although it makes latency ever so much worse. > > Can you confirm what type of hardware you're running this test on? > CPU, memory speed and capacity, and NVMe device especially? > > Tom. Yes, it's a nice new system, I don't expect any strange perf problems: CPU: Intel(R) Core(TM) i7-7800X CPU @ 3.50GHz (Intel X299 chipset) Block device: nvme-Samsung_SSD_970_EVO_250GB DRAM: 32 GB So, here's a comparison using 20 threads, direct IO, for the baseline vs. patched kernel (below). Highlights: -- IOPS are similar, around 60k. -- BW gets worse, dropping from 290 to 220 MB/s. -- CPU is well under 100%. -- latency is incredibly long, but...20 threads. Baseline: $ ./run.sh fio configuration: [reader] ioengine=libaio blocksize=4096 size=1g rw=read group_reporting iodepth=256 direct=1 numjobs=20 Running fio: reader: (g=0): rw=read, bs=(R) 4096B-4096B, (W) 4096B-4096B, (T) 4096B-4096B, ioengine=libaio, iodepth=256 ... fio-3.3 Starting 20 processes Jobs: 4 (f=4): [_(8),R(2),_(2),R(1),_(1),R(1),_(5)][95.9%][r=244MiB/s,w=0KiB/s][r=62.5k,w=0 IOPS][eta 00m:03s] reader: (groupid=0, jobs=20): err= 0: pid=14499: Tue Nov 20 16:20:35 2018 read: IOPS=74.2k, BW=290MiB/s (304MB/s)(20.0GiB/70644msec) slat (usec): min=26,
Re: [PATCH v2 0/6] RFC: gup+dma: tracking dma-pinned pages
On 11/19/18 10:57 AM, Tom Talpey wrote: > John, thanks for the discussion at LPC. One of the concerns we > raised however was the performance test. The numbers below are > rather obviously tainted. I think we need to get a better baseline > before concluding anything... > > Here's my main concern: > Hi Tom, Thanks again for looking at this! > On 11/10/2018 3:50 AM, john.hubb...@gmail.com wrote: >> From: John Hubbard >> ... >> -- >> WITHOUT the patch: >> -- >> reader: (g=0): rw=read, bs=(R) 4096B-4096B, (W) 4096B-4096B, (T) >> 4096B-4096B, ioengine=libaio, iodepth=64 >> fio-3.3 >> Starting 1 process >> Jobs: 1 (f=1): [R(1)][100.0%][r=55.5MiB/s,w=0KiB/s][r=14.2k,w=0 IOPS][eta >> 00m:00s] >> reader: (groupid=0, jobs=1): err= 0: pid=1750: Tue Nov 6 20:18:06 2018 >> read: IOPS=13.9k, BW=54.4MiB/s (57.0MB/s)(1024MiB/18826msec) > > ~14000 4KB read IOPS is really, really low for an NVMe disk. Yes, but Jan Kara's original config file for fio is *intended* to highlight the get_user_pages/put_user_pages changes. It was *not* intended to get max performance, as you can see by the numjobs and direct IO parameters: cat fio.conf [reader] direct=1 ioengine=libaio blocksize=4096 size=1g numjobs=1 rw=read iodepth=64 So I'm thinking that this is not a "tainted" test, but rather, we're constraining things a lot with these choices. It's hard to find a good test config to run that allows decisions, but so far, I'm not really seeing anything that says "this is so bad that we can't afford to fix the brokenness." I think. After talking with you and reading this email, I did a bunch more test runs, varying the following fio parameters: -- direct -- numjobs -- iodepth ...with both the baseline 4.20-rc3 kernel, and with my patches applied. (btw, if anyone cares, I'll post a github link that has a complete, testable patchset--not ready for submission as such, but it works cleanly and will allow others to attempt to reproduce my results). What I'm seeing is that I can get 10x or better improvements in IOPS and BW, just by going to 10 threads and turning off direct IO--as expected. So in the end, I increased the number of threads, and also increased iodepth a bit. Test results below... > >> cpu : usr=2.39%, sys=95.30%, ctx=669, majf=0, minf=72 > > CPU is obviously the limiting factor. At these IOPS, it should be far > less. >> -- >> OR, here's a better run WITH the patch applied, and you can see that this is >> nearly as good >> as the "without" case: >> -- >> >> reader: (g=0): rw=read, bs=(R) 4096B-4096B, (W) 4096B-4096B, (T) >> 4096B-4096B, ioengine=libaio, iodepth=64 >> fio-3.3 >> Starting 1 process >> Jobs: 1 (f=1): [R(1)][100.0%][r=53.2MiB/s,w=0KiB/s][r=13.6k,w=0 IOPS][eta >> 00m:00s] >> reader: (groupid=0, jobs=1): err= 0: pid=2521: Tue Nov 6 20:01:33 2018 >> read: IOPS=13.4k, BW=52.5MiB/s (55.1MB/s)(1024MiB/19499msec) > > Similar low IOPS. > >> cpu : usr=3.47%, sys=94.61%, ctx=370, majf=0, minf=73 > > Similar CPU saturation. > >> > > I get nearly 400,000 4KB IOPS on my tiny desktop, which has a 25W > i7-7500 and a Samsung PM961 128GB NVMe (stock Bionic 4.15 kernel > and fio version 3.1). Even then, the CPU saturates, so it's not > necessarily a perfect test. I'd like to see your runs both get to > "max" IOPS, i.e. CPU < 100%, and compare the CPU numbers. This would > give the best comparison for making a decision. I can get to CPU < 100% by increasing to 10 or 20 threads, although it makes latency ever so much worse. > > Can you confirm what type of hardware you're running this test on? > CPU, memory speed and capacity, and NVMe device especially? > > Tom. Yes, it's a nice new system, I don't expect any strange perf problems: CPU: Intel(R) Core(TM) i7-7800X CPU @ 3.50GHz (Intel X299 chipset) Block device: nvme-Samsung_SSD_970_EVO_250GB DRAM: 32 GB So, here's a comparison using 20 threads, direct IO, for the baseline vs. patched kernel (below). Highlights: -- IOPS are similar, around 60k. -- BW gets worse, dropping from 290 to 220 MB/s. -- CPU is well under 100%. -- latency is incredibly long, but...20 threads. Baseline: $ ./run.sh fio configuration: [reader] ioengine=libaio blocksize=4096 size=1g rw=read group_reporting iodepth=256 direct=1 numjobs=20 Running fio: reader: (g=0): rw=read, bs=(R) 4096B-4096B, (W) 4096B-4096B, (T) 4096B-4096B, ioengine=libaio, iodepth=256 ... fio-3.3 Starting 20 processes Jobs: 4 (f=4): [_(8),R(2),_(2),R(1),_(1),R(1),_(5)][95.9%][r=244MiB/s,w=0KiB/s][r=62.5k,w=0 IOPS][eta 00m:03s] reader: (groupid=0, jobs=20): err= 0: pid=14499: Tue Nov 20 16:20:35 2018 read: IOPS=74.2k, BW=290MiB/s (304MB/s)(20.0GiB/70644msec) slat (usec): min=26,
Re: [PATCH 1/4] OPP: Add dev_pm_opp_xlate_performance_state() helper
On 21-11-18, 10:47, Viresh Kumar wrote: > On 21-11-18, 10:34, Rajendra Nayak wrote: > > > > > > On 11/5/2018 12:06 PM, Viresh Kumar wrote: > > > Introduce a new helper dev_pm_opp_xlate_performance_state() which will > > > be used to translate from pstate of a device to another one. > > > > > > Initially this will be used by genpd to find pstate of a master domain > > > using its sub-domain's pstate. > > > > > > Signed-off-by: Viresh Kumar > > > --- > > > drivers/opp/core.c | 49 ++ > > > include/linux/pm_opp.h | 7 ++ > > > 2 files changed, 56 insertions(+) > > > > > > diff --git a/drivers/opp/core.c b/drivers/opp/core.c > > > index 0eaa954b3f6c..010a4268e8dd 100644 > > > --- a/drivers/opp/core.c > > > +++ b/drivers/opp/core.c > > > @@ -1707,6 +1707,55 @@ void dev_pm_opp_put_genpd_virt_dev(struct > > > opp_table *opp_table, > > > dev_err(virt_dev, "Failed to find required device > > > entry\n"); > > > } > > > +/** > > > + * dev_pm_opp_xlate_performance_state() - Find required OPP's pstate for > > > src_table. > > > + * @src_table: OPP table which has dst_table as one of its required OPP > > > table. > > > > So I have a case where the src_table and dst_table are shared/same. Can you > > explain how would > > it work in such a case? > > Can you give the example, as I am finding some issues with such shared > tables. Though the code may work just fine btw. I may have found the problem you are facing here. Please try this diff and tell me if you hitting it, check this in dmesg. diff --git a/drivers/opp/of.c b/drivers/opp/of.c index 3740822b4197..6c45774306b0 100644 --- a/drivers/opp/of.c +++ b/drivers/opp/of.c @@ -225,6 +225,7 @@ static void _opp_table_alloc_required_tables(struct opp_table *opp_table, free_required_tables: _opp_table_free_required_tables(opp_table); put_np: + dev_err(dev, "Failed to allocate required OPP tables\n"); of_node_put(np); } -- viresh
Re: [PATCH 1/4] OPP: Add dev_pm_opp_xlate_performance_state() helper
On 21-11-18, 10:47, Viresh Kumar wrote: > On 21-11-18, 10:34, Rajendra Nayak wrote: > > > > > > On 11/5/2018 12:06 PM, Viresh Kumar wrote: > > > Introduce a new helper dev_pm_opp_xlate_performance_state() which will > > > be used to translate from pstate of a device to another one. > > > > > > Initially this will be used by genpd to find pstate of a master domain > > > using its sub-domain's pstate. > > > > > > Signed-off-by: Viresh Kumar > > > --- > > > drivers/opp/core.c | 49 ++ > > > include/linux/pm_opp.h | 7 ++ > > > 2 files changed, 56 insertions(+) > > > > > > diff --git a/drivers/opp/core.c b/drivers/opp/core.c > > > index 0eaa954b3f6c..010a4268e8dd 100644 > > > --- a/drivers/opp/core.c > > > +++ b/drivers/opp/core.c > > > @@ -1707,6 +1707,55 @@ void dev_pm_opp_put_genpd_virt_dev(struct > > > opp_table *opp_table, > > > dev_err(virt_dev, "Failed to find required device > > > entry\n"); > > > } > > > +/** > > > + * dev_pm_opp_xlate_performance_state() - Find required OPP's pstate for > > > src_table. > > > + * @src_table: OPP table which has dst_table as one of its required OPP > > > table. > > > > So I have a case where the src_table and dst_table are shared/same. Can you > > explain how would > > it work in such a case? > > Can you give the example, as I am finding some issues with such shared > tables. Though the code may work just fine btw. I may have found the problem you are facing here. Please try this diff and tell me if you hitting it, check this in dmesg. diff --git a/drivers/opp/of.c b/drivers/opp/of.c index 3740822b4197..6c45774306b0 100644 --- a/drivers/opp/of.c +++ b/drivers/opp/of.c @@ -225,6 +225,7 @@ static void _opp_table_alloc_required_tables(struct opp_table *opp_table, free_required_tables: _opp_table_free_required_tables(opp_table); put_np: + dev_err(dev, "Failed to allocate required OPP tables\n"); of_node_put(np); } -- viresh
Re: [RFCv3 PATCH 1/6] uacce: Add documents for WarpDrive/uacce
On Mon, Nov 19, 2018 at 08:29:39PM -0700, Jason Gunthorpe wrote: > Date: Mon, 19 Nov 2018 20:29:39 -0700 > From: Jason Gunthorpe > To: Kenneth Lee > CC: Leon Romanovsky , Kenneth Lee , > Tim Sell , linux-...@vger.kernel.org, Alexander > Shishkin , Zaibo Xu > , zhangfei@foxmail.com, linux...@huawei.com, > haojian.zhu...@linaro.org, Christoph Lameter , Hao Fang > , Gavin Schenk , RDMA mailing > list , Zhou Wang , > Doug Ledford , Uwe Kleine-König > , David Kershner > , Johan Hovold , Cyrille > Pitchen , Sagar Dharia > , Jens Axboe , > guodong...@linaro.org, linux-netdev , Randy Dunlap > , linux-kernel@vger.kernel.org, Vinod Koul > , linux-cry...@vger.kernel.org, Philippe Ombredanne > , Sanyog Kale , "David S. > Miller" , linux-accelerat...@lists.ozlabs.org > Subject: Re: [RFCv3 PATCH 1/6] uacce: Add documents for WarpDrive/uacce > User-Agent: Mutt/1.9.4 (2018-02-28) > Message-ID: <20181120032939.gr4...@ziepe.ca> > > On Tue, Nov 20, 2018 at 11:07:02AM +0800, Kenneth Lee wrote: > > On Mon, Nov 19, 2018 at 11:49:54AM -0700, Jason Gunthorpe wrote: > > > Date: Mon, 19 Nov 2018 11:49:54 -0700 > > > From: Jason Gunthorpe > > > To: Kenneth Lee > > > CC: Leon Romanovsky , Kenneth Lee , > > > Tim Sell , linux-...@vger.kernel.org, Alexander > > > Shishkin , Zaibo Xu > > > , zhangfei@foxmail.com, linux...@huawei.com, > > > haojian.zhu...@linaro.org, Christoph Lameter , Hao Fang > > > , Gavin Schenk , RDMA > > > mailing > > > list , Zhou Wang , > > > Doug Ledford , Uwe Kleine-König > > > , David Kershner > > > , Johan Hovold , Cyrille > > > Pitchen , Sagar Dharia > > > , Jens Axboe , > > > guodong...@linaro.org, linux-netdev , Randy > > > Dunlap > > > , linux-kernel@vger.kernel.org, Vinod Koul > > > , linux-cry...@vger.kernel.org, Philippe Ombredanne > > > , Sanyog Kale , "David S. > > > Miller" , linux-accelerat...@lists.ozlabs.org > > > Subject: Re: [RFCv3 PATCH 1/6] uacce: Add documents for WarpDrive/uacce > > > User-Agent: Mutt/1.9.4 (2018-02-28) > > > Message-ID: <20181119184954.gb4...@ziepe.ca> > > > > > > On Mon, Nov 19, 2018 at 05:14:05PM +0800, Kenneth Lee wrote: > > > > > > > If the hardware cannot share page table with the CPU, we then need to > > > > have > > > > some way to change the device page table. This is what happen in ODP. It > > > > invalidates the page table in device upon mmu_notifier call back. But > > > > this cannot > > > > solve the COW problem: if the user process A share a page P with > > > > device, and A > > > > forks a new process B, and it continue to write to the page. By COW, the > > > > process B will keep the page P, while A will get a new page P'. But you > > > > have > > > > no way to let the device know it should use P' rather than P. > > > > > > Is this true? I thought mmu_notifiers covered all these cases. > > > > > > The mm_notifier for A should fire if B causes the physical address of > > > A's pages to change via COW. > > > > > > And this causes the device page tables to re-synchronize. > > > > I don't see such code. The current do_cow_fault() implemenation has nothing > > to > > do with mm_notifer. > > Well, that sure sounds like it would be a bug in mmu_notifiers.. Yes, it can be taken that way:) But it is going to be a tough bug. > > But considering Jean's SVA stuff seems based on mmu notifiers, I have > a hard time believing that it has any different behavior from RDMA's > ODP, and if it does have different behavior, then it is probably just > a bug in the ODP implementation. As Jean has explained, his solution is based on page table sharing. I think ODP should also consider this new feature. > > > > > In WarpDrive/uacce, we make this simple. If you support IOMMU and it > > > > support > > > > SVM/SVA. Everything will be fine just like ODP implicit mode. And you > > > > don't need > > > > to write any code for that. Because it has been done by IOMMU > > > > framework. If it > > > > > > Looks like the IOMMU code uses mmu_notifier, so it is identical to > > > IB's ODP. The only difference is that IB tends to have the IOMMU page > > > table in the device, not in the CPU. > > > > > > The only case I know if that is different is the new-fangled CAPI > > > stuff where the IOMMU can directly use the CPU's page table and the > > > IOMMU page table (in device or CPU) is eliminated. > > > > Yes. We are not focusing on the current implementation. As mentioned in the > > cover letter. We are expecting Jean Philips' SVA patch: > > git://linux-arm.org/linux-jpb. > > This SVA stuff does not look comparable to CAPI as it still requires > maintaining seperate IOMMU page tables. > > Also, those patches from Jean have a lot of references to > mmu_notifiers (ie look at iommu_mmu_notifier). > > Are you really sure it is actually any different at all? > > > > Anyhow, I don't think a single instance of hardware should justify an > > > entire new subsystem. Subsystems are hard to make and without multiple > > >
Re: [RFCv3 PATCH 1/6] uacce: Add documents for WarpDrive/uacce
On Mon, Nov 19, 2018 at 08:29:39PM -0700, Jason Gunthorpe wrote: > Date: Mon, 19 Nov 2018 20:29:39 -0700 > From: Jason Gunthorpe > To: Kenneth Lee > CC: Leon Romanovsky , Kenneth Lee , > Tim Sell , linux-...@vger.kernel.org, Alexander > Shishkin , Zaibo Xu > , zhangfei@foxmail.com, linux...@huawei.com, > haojian.zhu...@linaro.org, Christoph Lameter , Hao Fang > , Gavin Schenk , RDMA mailing > list , Zhou Wang , > Doug Ledford , Uwe Kleine-König > , David Kershner > , Johan Hovold , Cyrille > Pitchen , Sagar Dharia > , Jens Axboe , > guodong...@linaro.org, linux-netdev , Randy Dunlap > , linux-kernel@vger.kernel.org, Vinod Koul > , linux-cry...@vger.kernel.org, Philippe Ombredanne > , Sanyog Kale , "David S. > Miller" , linux-accelerat...@lists.ozlabs.org > Subject: Re: [RFCv3 PATCH 1/6] uacce: Add documents for WarpDrive/uacce > User-Agent: Mutt/1.9.4 (2018-02-28) > Message-ID: <20181120032939.gr4...@ziepe.ca> > > On Tue, Nov 20, 2018 at 11:07:02AM +0800, Kenneth Lee wrote: > > On Mon, Nov 19, 2018 at 11:49:54AM -0700, Jason Gunthorpe wrote: > > > Date: Mon, 19 Nov 2018 11:49:54 -0700 > > > From: Jason Gunthorpe > > > To: Kenneth Lee > > > CC: Leon Romanovsky , Kenneth Lee , > > > Tim Sell , linux-...@vger.kernel.org, Alexander > > > Shishkin , Zaibo Xu > > > , zhangfei@foxmail.com, linux...@huawei.com, > > > haojian.zhu...@linaro.org, Christoph Lameter , Hao Fang > > > , Gavin Schenk , RDMA > > > mailing > > > list , Zhou Wang , > > > Doug Ledford , Uwe Kleine-König > > > , David Kershner > > > , Johan Hovold , Cyrille > > > Pitchen , Sagar Dharia > > > , Jens Axboe , > > > guodong...@linaro.org, linux-netdev , Randy > > > Dunlap > > > , linux-kernel@vger.kernel.org, Vinod Koul > > > , linux-cry...@vger.kernel.org, Philippe Ombredanne > > > , Sanyog Kale , "David S. > > > Miller" , linux-accelerat...@lists.ozlabs.org > > > Subject: Re: [RFCv3 PATCH 1/6] uacce: Add documents for WarpDrive/uacce > > > User-Agent: Mutt/1.9.4 (2018-02-28) > > > Message-ID: <20181119184954.gb4...@ziepe.ca> > > > > > > On Mon, Nov 19, 2018 at 05:14:05PM +0800, Kenneth Lee wrote: > > > > > > > If the hardware cannot share page table with the CPU, we then need to > > > > have > > > > some way to change the device page table. This is what happen in ODP. It > > > > invalidates the page table in device upon mmu_notifier call back. But > > > > this cannot > > > > solve the COW problem: if the user process A share a page P with > > > > device, and A > > > > forks a new process B, and it continue to write to the page. By COW, the > > > > process B will keep the page P, while A will get a new page P'. But you > > > > have > > > > no way to let the device know it should use P' rather than P. > > > > > > Is this true? I thought mmu_notifiers covered all these cases. > > > > > > The mm_notifier for A should fire if B causes the physical address of > > > A's pages to change via COW. > > > > > > And this causes the device page tables to re-synchronize. > > > > I don't see such code. The current do_cow_fault() implemenation has nothing > > to > > do with mm_notifer. > > Well, that sure sounds like it would be a bug in mmu_notifiers.. Yes, it can be taken that way:) But it is going to be a tough bug. > > But considering Jean's SVA stuff seems based on mmu notifiers, I have > a hard time believing that it has any different behavior from RDMA's > ODP, and if it does have different behavior, then it is probably just > a bug in the ODP implementation. As Jean has explained, his solution is based on page table sharing. I think ODP should also consider this new feature. > > > > > In WarpDrive/uacce, we make this simple. If you support IOMMU and it > > > > support > > > > SVM/SVA. Everything will be fine just like ODP implicit mode. And you > > > > don't need > > > > to write any code for that. Because it has been done by IOMMU > > > > framework. If it > > > > > > Looks like the IOMMU code uses mmu_notifier, so it is identical to > > > IB's ODP. The only difference is that IB tends to have the IOMMU page > > > table in the device, not in the CPU. > > > > > > The only case I know if that is different is the new-fangled CAPI > > > stuff where the IOMMU can directly use the CPU's page table and the > > > IOMMU page table (in device or CPU) is eliminated. > > > > Yes. We are not focusing on the current implementation. As mentioned in the > > cover letter. We are expecting Jean Philips' SVA patch: > > git://linux-arm.org/linux-jpb. > > This SVA stuff does not look comparable to CAPI as it still requires > maintaining seperate IOMMU page tables. > > Also, those patches from Jean have a lot of references to > mmu_notifiers (ie look at iommu_mmu_notifier). > > Are you really sure it is actually any different at all? > > > > Anyhow, I don't think a single instance of hardware should justify an > > > entire new subsystem. Subsystems are hard to make and without multiple > > >
Re: [PATCH V2] thermal: imx: fix for dependency on cpu-freq
Hi Anson, Thank you for the patch! Perhaps something to improve: [auto build test WARNING on soc-thermal/next] [also build test WARNING on v4.20-rc3 next-20181120] [if your patch is applied to the wrong git tree, please drop us a note to help improve the system] url: https://github.com/0day-ci/linux/commits/Anson-Huang/thermal-imx-fix-for-dependency-on-cpu-freq/20181121-124056 base: https://git.kernel.org/pub/scm/linux/kernel/git/evalenti/linux-soc-thermal.git next config: sparc64-allyesconfig (attached as .config) compiler: sparc64-linux-gnu-gcc (Debian 7.2.0-11) 7.2.0 reproduce: wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross # save the attached .config to linux build tree GCC_VERSION=7.2.0 make.cross ARCH=sparc64 All warnings (new ones prefixed by >>): drivers//thermal/imx_thermal.c: In function 'imx_thermal_unregister_legacy_cooling': >> drivers//thermal/imx_thermal.c:685:1: warning: no return statement in >> function returning non-void [-Wreturn-type] } ^ vim +685 drivers//thermal/imx_thermal.c 680 681 static int imx_thermal_unregister_legacy_cooling(struct imx_thermal_data *data) 682 { 683 cpufreq_cooling_unregister(data->cdev); 684 cpufreq_cpu_put(data->policy); > 685 } 686 #else 687 static inline int imx_thermal_register_legacy_cooling(struct imx_thermal_data *data) 688 { 689 return 0; 690 } 691 --- 0-DAY kernel test infrastructureOpen Source Technology Center https://lists.01.org/pipermail/kbuild-all Intel Corporation .config.gz Description: application/gzip
Re: [PATCH V2] thermal: imx: fix for dependency on cpu-freq
Hi Anson, Thank you for the patch! Perhaps something to improve: [auto build test WARNING on soc-thermal/next] [also build test WARNING on v4.20-rc3 next-20181120] [if your patch is applied to the wrong git tree, please drop us a note to help improve the system] url: https://github.com/0day-ci/linux/commits/Anson-Huang/thermal-imx-fix-for-dependency-on-cpu-freq/20181121-124056 base: https://git.kernel.org/pub/scm/linux/kernel/git/evalenti/linux-soc-thermal.git next config: sparc64-allyesconfig (attached as .config) compiler: sparc64-linux-gnu-gcc (Debian 7.2.0-11) 7.2.0 reproduce: wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross # save the attached .config to linux build tree GCC_VERSION=7.2.0 make.cross ARCH=sparc64 All warnings (new ones prefixed by >>): drivers//thermal/imx_thermal.c: In function 'imx_thermal_unregister_legacy_cooling': >> drivers//thermal/imx_thermal.c:685:1: warning: no return statement in >> function returning non-void [-Wreturn-type] } ^ vim +685 drivers//thermal/imx_thermal.c 680 681 static int imx_thermal_unregister_legacy_cooling(struct imx_thermal_data *data) 682 { 683 cpufreq_cooling_unregister(data->cdev); 684 cpufreq_cpu_put(data->policy); > 685 } 686 #else 687 static inline int imx_thermal_register_legacy_cooling(struct imx_thermal_data *data) 688 { 689 return 0; 690 } 691 --- 0-DAY kernel test infrastructureOpen Source Technology Center https://lists.01.org/pipermail/kbuild-all Intel Corporation .config.gz Description: application/gzip
Re: [PATCH V5 1/2] thermal: imx: fix for dependency on cpu-freq
On 21-11-18, 05:49, Anson Huang wrote: > The thermal driver is a standalone driver for monitoring SoC temperature > by enabling thermal sensor, so it can be enabled even when CONFIG_CPU_FREQ > is NOT set. So remove the dependency with CPU_THERMAL. > > Introduce dummy function of legacy cooling register/unregister to make > thermal driver probe successfully when CONFIG_CPU_FREQ is NOT set. > > Signed-off-by: Anson Huang > --- > drivers/thermal/Kconfig | 2 +- > drivers/thermal/imx_thermal.c | 47 > --- > 2 files changed, 36 insertions(+), 13 deletions(-) Acked-by: Viresh Kumar -- viresh
Re: [PATCH V5 2/2] thermal: imx: save one condition block for normal case of nvmem initialization
On 21-11-18, 05:49, Anson Huang wrote: > Put return value checks of calling imx_init_from_nvmem_cells() > into one block to save one condition block for normal case. > > Signed-off-by: Anson Huang > --- > drivers/thermal/imx_thermal.c | 5 +++-- > 1 file changed, 3 insertions(+), 2 deletions(-) > > diff --git a/drivers/thermal/imx_thermal.c b/drivers/thermal/imx_thermal.c > index c924396..bb6754a 100644 > --- a/drivers/thermal/imx_thermal.c > +++ b/drivers/thermal/imx_thermal.c > @@ -742,9 +742,10 @@ static int imx_thermal_probe(struct platform_device > *pdev) > > if (of_find_property(pdev->dev.of_node, "nvmem-cells", NULL)) { > ret = imx_init_from_nvmem_cells(pdev); > - if (ret == -EPROBE_DEFER) > - return ret; > if (ret) { > + if (ret == -EPROBE_DEFER) > + return ret; > + > dev_err(>dev, "failed to init from nvmem: %d\n", > ret); > return ret; Acked-by: Viresh Kumar -- viresh
Re: [PATCH V5 1/2] thermal: imx: fix for dependency on cpu-freq
On 21-11-18, 05:49, Anson Huang wrote: > The thermal driver is a standalone driver for monitoring SoC temperature > by enabling thermal sensor, so it can be enabled even when CONFIG_CPU_FREQ > is NOT set. So remove the dependency with CPU_THERMAL. > > Introduce dummy function of legacy cooling register/unregister to make > thermal driver probe successfully when CONFIG_CPU_FREQ is NOT set. > > Signed-off-by: Anson Huang > --- > drivers/thermal/Kconfig | 2 +- > drivers/thermal/imx_thermal.c | 47 > --- > 2 files changed, 36 insertions(+), 13 deletions(-) Acked-by: Viresh Kumar -- viresh
Re: [PATCH V5 2/2] thermal: imx: save one condition block for normal case of nvmem initialization
On 21-11-18, 05:49, Anson Huang wrote: > Put return value checks of calling imx_init_from_nvmem_cells() > into one block to save one condition block for normal case. > > Signed-off-by: Anson Huang > --- > drivers/thermal/imx_thermal.c | 5 +++-- > 1 file changed, 3 insertions(+), 2 deletions(-) > > diff --git a/drivers/thermal/imx_thermal.c b/drivers/thermal/imx_thermal.c > index c924396..bb6754a 100644 > --- a/drivers/thermal/imx_thermal.c > +++ b/drivers/thermal/imx_thermal.c > @@ -742,9 +742,10 @@ static int imx_thermal_probe(struct platform_device > *pdev) > > if (of_find_property(pdev->dev.of_node, "nvmem-cells", NULL)) { > ret = imx_init_from_nvmem_cells(pdev); > - if (ret == -EPROBE_DEFER) > - return ret; > if (ret) { > + if (ret == -EPROBE_DEFER) > + return ret; > + > dev_err(>dev, "failed to init from nvmem: %d\n", > ret); > return ret; Acked-by: Viresh Kumar -- viresh
[PATCH V5 2/2] thermal: imx: save one condition block for normal case of nvmem initialization
Put return value checks of calling imx_init_from_nvmem_cells() into one block to save one condition block for normal case. Signed-off-by: Anson Huang --- drivers/thermal/imx_thermal.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/thermal/imx_thermal.c b/drivers/thermal/imx_thermal.c index c924396..bb6754a 100644 --- a/drivers/thermal/imx_thermal.c +++ b/drivers/thermal/imx_thermal.c @@ -742,9 +742,10 @@ static int imx_thermal_probe(struct platform_device *pdev) if (of_find_property(pdev->dev.of_node, "nvmem-cells", NULL)) { ret = imx_init_from_nvmem_cells(pdev); - if (ret == -EPROBE_DEFER) - return ret; if (ret) { + if (ret == -EPROBE_DEFER) + return ret; + dev_err(>dev, "failed to init from nvmem: %d\n", ret); return ret; -- 2.7.4
[PATCH V5 1/2] thermal: imx: fix for dependency on cpu-freq
The thermal driver is a standalone driver for monitoring SoC temperature by enabling thermal sensor, so it can be enabled even when CONFIG_CPU_FREQ is NOT set. So remove the dependency with CPU_THERMAL. Introduce dummy function of legacy cooling register/unregister to make thermal driver probe successfully when CONFIG_CPU_FREQ is NOT set. Signed-off-by: Anson Huang --- drivers/thermal/Kconfig | 2 +- drivers/thermal/imx_thermal.c | 47 --- 2 files changed, 36 insertions(+), 13 deletions(-) diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig index 5422523..93bd3bb 100644 --- a/drivers/thermal/Kconfig +++ b/drivers/thermal/Kconfig @@ -212,7 +212,7 @@ config HISI_THERMAL config IMX_THERMAL tristate "Temperature sensor driver for Freescale i.MX SoCs" - depends on (ARCH_MXC && CPU_THERMAL) || COMPILE_TEST + depends on ARCH_MXC || COMPILE_TEST depends on NVMEM || !NVMEM depends on MFD_SYSCON depends on OF diff --git a/drivers/thermal/imx_thermal.c b/drivers/thermal/imx_thermal.c index 1566154..c924396 100644 --- a/drivers/thermal/imx_thermal.c +++ b/drivers/thermal/imx_thermal.c @@ -648,15 +648,24 @@ static const struct of_device_id of_imx_thermal_match[] = { }; MODULE_DEVICE_TABLE(of, of_imx_thermal_match); +#ifdef CONFIG_CPU_FREQ /* * Create cooling device in case no #cooling-cells property is available in * CPU node */ static int imx_thermal_register_legacy_cooling(struct imx_thermal_data *data) { - struct device_node *np = of_get_cpu_node(data->policy->cpu, NULL); + struct device_node *np; int ret; + data->policy = cpufreq_cpu_get(0); + if (!data->policy) { + pr_debug("%s: CPUFreq policy not found\n", __func__); + return -EPROBE_DEFER; + } + + np = of_get_cpu_node(data->policy->cpu, NULL); + if (!np || !of_find_property(np, "#cooling-cells", NULL)) { data->cdev = cpufreq_cooling_register(data->policy); if (IS_ERR(data->cdev)) { @@ -669,6 +678,24 @@ static int imx_thermal_register_legacy_cooling(struct imx_thermal_data *data) return 0; } +static void imx_thermal_unregister_legacy_cooling(struct imx_thermal_data *data) +{ + cpufreq_cooling_unregister(data->cdev); + cpufreq_cpu_put(data->policy); +} + +#else + +static inline int imx_thermal_register_legacy_cooling(struct imx_thermal_data *data) +{ + return 0; +} + +static inline void imx_thermal_unregister_legacy_cooling(struct imx_thermal_data *data) +{ +} +#endif + static int imx_thermal_probe(struct platform_device *pdev) { struct imx_thermal_data *data; @@ -743,14 +770,11 @@ static int imx_thermal_probe(struct platform_device *pdev) regmap_write(map, data->socdata->sensor_ctrl + REG_SET, data->socdata->power_down_mask); - data->policy = cpufreq_cpu_get(0); - if (!data->policy) { - pr_debug("%s: CPUFreq policy not found\n", __func__); - return -EPROBE_DEFER; - } - ret = imx_thermal_register_legacy_cooling(data); if (ret) { + if (ret == -EPROBE_DEFER) + return ret; + dev_err(>dev, "failed to register cpufreq cooling device: %d\n", ret); return ret; @@ -762,7 +786,7 @@ static int imx_thermal_probe(struct platform_device *pdev) if (ret != -EPROBE_DEFER) dev_err(>dev, "failed to get thermal clk: %d\n", ret); - goto cpufreq_put; + goto legacy_cleanup; } /* @@ -775,7 +799,7 @@ static int imx_thermal_probe(struct platform_device *pdev) ret = clk_prepare_enable(data->thermal_clk); if (ret) { dev_err(>dev, "failed to enable thermal clk: %d\n", ret); - goto cpufreq_put; + goto legacy_cleanup; } data->tz = thermal_zone_device_register("imx_thermal_zone", @@ -829,9 +853,8 @@ static int imx_thermal_probe(struct platform_device *pdev) thermal_zone_device_unregister(data->tz); clk_disable: clk_disable_unprepare(data->thermal_clk); -cpufreq_put: - cpufreq_cooling_unregister(data->cdev); - cpufreq_cpu_put(data->policy); +legacy_cleanup: + imx_thermal_unregister_legacy_cooling(data); return ret; } -- 2.7.4
[PATCH V5 2/2] thermal: imx: save one condition block for normal case of nvmem initialization
Put return value checks of calling imx_init_from_nvmem_cells() into one block to save one condition block for normal case. Signed-off-by: Anson Huang --- drivers/thermal/imx_thermal.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/thermal/imx_thermal.c b/drivers/thermal/imx_thermal.c index c924396..bb6754a 100644 --- a/drivers/thermal/imx_thermal.c +++ b/drivers/thermal/imx_thermal.c @@ -742,9 +742,10 @@ static int imx_thermal_probe(struct platform_device *pdev) if (of_find_property(pdev->dev.of_node, "nvmem-cells", NULL)) { ret = imx_init_from_nvmem_cells(pdev); - if (ret == -EPROBE_DEFER) - return ret; if (ret) { + if (ret == -EPROBE_DEFER) + return ret; + dev_err(>dev, "failed to init from nvmem: %d\n", ret); return ret; -- 2.7.4
[PATCH V5 1/2] thermal: imx: fix for dependency on cpu-freq
The thermal driver is a standalone driver for monitoring SoC temperature by enabling thermal sensor, so it can be enabled even when CONFIG_CPU_FREQ is NOT set. So remove the dependency with CPU_THERMAL. Introduce dummy function of legacy cooling register/unregister to make thermal driver probe successfully when CONFIG_CPU_FREQ is NOT set. Signed-off-by: Anson Huang --- drivers/thermal/Kconfig | 2 +- drivers/thermal/imx_thermal.c | 47 --- 2 files changed, 36 insertions(+), 13 deletions(-) diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig index 5422523..93bd3bb 100644 --- a/drivers/thermal/Kconfig +++ b/drivers/thermal/Kconfig @@ -212,7 +212,7 @@ config HISI_THERMAL config IMX_THERMAL tristate "Temperature sensor driver for Freescale i.MX SoCs" - depends on (ARCH_MXC && CPU_THERMAL) || COMPILE_TEST + depends on ARCH_MXC || COMPILE_TEST depends on NVMEM || !NVMEM depends on MFD_SYSCON depends on OF diff --git a/drivers/thermal/imx_thermal.c b/drivers/thermal/imx_thermal.c index 1566154..c924396 100644 --- a/drivers/thermal/imx_thermal.c +++ b/drivers/thermal/imx_thermal.c @@ -648,15 +648,24 @@ static const struct of_device_id of_imx_thermal_match[] = { }; MODULE_DEVICE_TABLE(of, of_imx_thermal_match); +#ifdef CONFIG_CPU_FREQ /* * Create cooling device in case no #cooling-cells property is available in * CPU node */ static int imx_thermal_register_legacy_cooling(struct imx_thermal_data *data) { - struct device_node *np = of_get_cpu_node(data->policy->cpu, NULL); + struct device_node *np; int ret; + data->policy = cpufreq_cpu_get(0); + if (!data->policy) { + pr_debug("%s: CPUFreq policy not found\n", __func__); + return -EPROBE_DEFER; + } + + np = of_get_cpu_node(data->policy->cpu, NULL); + if (!np || !of_find_property(np, "#cooling-cells", NULL)) { data->cdev = cpufreq_cooling_register(data->policy); if (IS_ERR(data->cdev)) { @@ -669,6 +678,24 @@ static int imx_thermal_register_legacy_cooling(struct imx_thermal_data *data) return 0; } +static void imx_thermal_unregister_legacy_cooling(struct imx_thermal_data *data) +{ + cpufreq_cooling_unregister(data->cdev); + cpufreq_cpu_put(data->policy); +} + +#else + +static inline int imx_thermal_register_legacy_cooling(struct imx_thermal_data *data) +{ + return 0; +} + +static inline void imx_thermal_unregister_legacy_cooling(struct imx_thermal_data *data) +{ +} +#endif + static int imx_thermal_probe(struct platform_device *pdev) { struct imx_thermal_data *data; @@ -743,14 +770,11 @@ static int imx_thermal_probe(struct platform_device *pdev) regmap_write(map, data->socdata->sensor_ctrl + REG_SET, data->socdata->power_down_mask); - data->policy = cpufreq_cpu_get(0); - if (!data->policy) { - pr_debug("%s: CPUFreq policy not found\n", __func__); - return -EPROBE_DEFER; - } - ret = imx_thermal_register_legacy_cooling(data); if (ret) { + if (ret == -EPROBE_DEFER) + return ret; + dev_err(>dev, "failed to register cpufreq cooling device: %d\n", ret); return ret; @@ -762,7 +786,7 @@ static int imx_thermal_probe(struct platform_device *pdev) if (ret != -EPROBE_DEFER) dev_err(>dev, "failed to get thermal clk: %d\n", ret); - goto cpufreq_put; + goto legacy_cleanup; } /* @@ -775,7 +799,7 @@ static int imx_thermal_probe(struct platform_device *pdev) ret = clk_prepare_enable(data->thermal_clk); if (ret) { dev_err(>dev, "failed to enable thermal clk: %d\n", ret); - goto cpufreq_put; + goto legacy_cleanup; } data->tz = thermal_zone_device_register("imx_thermal_zone", @@ -829,9 +853,8 @@ static int imx_thermal_probe(struct platform_device *pdev) thermal_zone_device_unregister(data->tz); clk_disable: clk_disable_unprepare(data->thermal_clk); -cpufreq_put: - cpufreq_cooling_unregister(data->cdev); - cpufreq_cpu_put(data->policy); +legacy_cleanup: + imx_thermal_unregister_legacy_cooling(data); return ret; } -- 2.7.4
Re: [PATCH v2] PCI: mediatek: Use devm_of_pci_get_host_bridge_resources() to parse DT
On Thu, 2018-11-08 at 10:56 +0800, honghui.zh...@mediatek.com wrote: > From: Honghui Zhang > > Use the devm_of_pci_get_host_bridge_resources() API in place of the PCI OF > DT parser. > > Signed-off-by: Honghui Zhang > --- > drivers/pci/controller/pcie-mediatek.c | 98 > +- > 1 file changed, 24 insertions(+), 74 deletions(-) > > diff --git a/drivers/pci/controller/pcie-mediatek.c > b/drivers/pci/controller/pcie-mediatek.c > index 2a1f97c..0590a93 100644 > --- a/drivers/pci/controller/pcie-mediatek.c > +++ b/drivers/pci/controller/pcie-mediatek.c > @@ -197,11 +197,7 @@ struct mtk_pcie_port { > * @dev: pointer to PCIe device > * @base: IO mapped register base > * @free_ck: free-run reference clock > - * @io: IO resource > - * @pio: PIO resource > * @mem: non-prefetchable memory resource > - * @busn: bus range > - * @offset: IO / Memory offset > * @ports: pointer to PCIe port information > * @soc: pointer to SoC-dependent operations > */ > @@ -210,14 +206,7 @@ struct mtk_pcie { > void __iomem *base; > struct clk *free_ck; > > - struct resource io; > - struct resource pio; > struct resource mem; > - struct resource busn; > - struct { > - resource_size_t mem; > - resource_size_t io; > - } offset; > struct list_head ports; > const struct mtk_pcie_soc *soc; > }; > @@ -1045,55 +1034,43 @@ static int mtk_pcie_setup(struct mtk_pcie *pcie) > { > struct device *dev = pcie->dev; > struct device_node *node = dev->of_node, *child; > - struct of_pci_range_parser parser; > - struct of_pci_range range; > - struct resource res; > struct mtk_pcie_port *port, *tmp; > + struct pci_host_bridge *host = pci_host_bridge_from_priv(pcie); > + struct list_head *windows = >windows; > + struct resource_entry *win, *tmp_win; > + resource_size_t io_base; > int err; > > - if (of_pci_range_parser_init(, node)) { > - dev_err(dev, "missing \"ranges\" property\n"); > - return -EINVAL; > - } > + err = devm_of_pci_get_host_bridge_resources(dev, 0, 0xff, > + windows, _base); > + if (err) > + return err; > > - for_each_of_pci_range(, ) { > - err = of_pci_range_to_resource(, node, ); > - if (err < 0) > - return err; > + err = devm_request_pci_bus_resources(dev, windows); > + if (err < 0) > + return err; > > - switch (res.flags & IORESOURCE_TYPE_BITS) { > + /* Get the I/O and memory ranges from DT */ > + resource_list_for_each_entry_safe(win, tmp_win, windows) { > + switch (resource_type(win->res)) { > case IORESOURCE_IO: > - pcie->offset.io = res.start - range.pci_addr; > - > - memcpy(>pio, , sizeof(res)); > - pcie->pio.name = node->full_name; > - > - pcie->io.start = range.cpu_addr; > - pcie->io.end = range.cpu_addr + range.size - 1; > - pcie->io.flags = IORESOURCE_MEM; > - pcie->io.name = "I/O"; > - > - memcpy(, >io, sizeof(res)); > + err = devm_pci_remap_iospace(dev, win->res, io_base); > + if (err) { > + dev_warn(dev, "error %d: failed to map resource > %pR\n", > + err, win->res); > + resource_list_destroy_entry(win); > + } > break; > - > case IORESOURCE_MEM: > - pcie->offset.mem = res.start - range.pci_addr; > - > - memcpy(>mem, , sizeof(res)); > + memcpy(>mem, win->res, sizeof(*win->res)); > pcie->mem.name = "non-prefetchable"; > break; > + case IORESOURCE_BUS: > + host->busnr = win->res->start; > + break; > } > } > > - err = of_pci_parse_bus_range(node, >busn); > - if (err < 0) { > - dev_err(dev, "failed to parse bus ranges property: %d\n", err); > - pcie->busn.name = node->name; > - pcie->busn.start = 0; > - pcie->busn.end = 0xff; > - pcie->busn.flags = IORESOURCE_BUS; > - } > - > for_each_available_child_of_node(node, child) { > int slot; > > @@ -1125,28 +1102,6 @@ static int mtk_pcie_setup(struct mtk_pcie *pcie) > return 0; > } > > -static int mtk_pcie_request_resources(struct mtk_pcie *pcie) > -{ > - struct pci_host_bridge *host = pci_host_bridge_from_priv(pcie); > - struct list_head *windows = >windows; > - struct device *dev = pcie->dev; > - int err; > - > - pci_add_resource_offset(windows, >pio,
Re: [PATCH v2] PCI: mediatek: Use devm_of_pci_get_host_bridge_resources() to parse DT
On Thu, 2018-11-08 at 10:56 +0800, honghui.zh...@mediatek.com wrote: > From: Honghui Zhang > > Use the devm_of_pci_get_host_bridge_resources() API in place of the PCI OF > DT parser. > > Signed-off-by: Honghui Zhang > --- > drivers/pci/controller/pcie-mediatek.c | 98 > +- > 1 file changed, 24 insertions(+), 74 deletions(-) > > diff --git a/drivers/pci/controller/pcie-mediatek.c > b/drivers/pci/controller/pcie-mediatek.c > index 2a1f97c..0590a93 100644 > --- a/drivers/pci/controller/pcie-mediatek.c > +++ b/drivers/pci/controller/pcie-mediatek.c > @@ -197,11 +197,7 @@ struct mtk_pcie_port { > * @dev: pointer to PCIe device > * @base: IO mapped register base > * @free_ck: free-run reference clock > - * @io: IO resource > - * @pio: PIO resource > * @mem: non-prefetchable memory resource > - * @busn: bus range > - * @offset: IO / Memory offset > * @ports: pointer to PCIe port information > * @soc: pointer to SoC-dependent operations > */ > @@ -210,14 +206,7 @@ struct mtk_pcie { > void __iomem *base; > struct clk *free_ck; > > - struct resource io; > - struct resource pio; > struct resource mem; > - struct resource busn; > - struct { > - resource_size_t mem; > - resource_size_t io; > - } offset; > struct list_head ports; > const struct mtk_pcie_soc *soc; > }; > @@ -1045,55 +1034,43 @@ static int mtk_pcie_setup(struct mtk_pcie *pcie) > { > struct device *dev = pcie->dev; > struct device_node *node = dev->of_node, *child; > - struct of_pci_range_parser parser; > - struct of_pci_range range; > - struct resource res; > struct mtk_pcie_port *port, *tmp; > + struct pci_host_bridge *host = pci_host_bridge_from_priv(pcie); > + struct list_head *windows = >windows; > + struct resource_entry *win, *tmp_win; > + resource_size_t io_base; > int err; > > - if (of_pci_range_parser_init(, node)) { > - dev_err(dev, "missing \"ranges\" property\n"); > - return -EINVAL; > - } > + err = devm_of_pci_get_host_bridge_resources(dev, 0, 0xff, > + windows, _base); > + if (err) > + return err; > > - for_each_of_pci_range(, ) { > - err = of_pci_range_to_resource(, node, ); > - if (err < 0) > - return err; > + err = devm_request_pci_bus_resources(dev, windows); > + if (err < 0) > + return err; > > - switch (res.flags & IORESOURCE_TYPE_BITS) { > + /* Get the I/O and memory ranges from DT */ > + resource_list_for_each_entry_safe(win, tmp_win, windows) { > + switch (resource_type(win->res)) { > case IORESOURCE_IO: > - pcie->offset.io = res.start - range.pci_addr; > - > - memcpy(>pio, , sizeof(res)); > - pcie->pio.name = node->full_name; > - > - pcie->io.start = range.cpu_addr; > - pcie->io.end = range.cpu_addr + range.size - 1; > - pcie->io.flags = IORESOURCE_MEM; > - pcie->io.name = "I/O"; > - > - memcpy(, >io, sizeof(res)); > + err = devm_pci_remap_iospace(dev, win->res, io_base); > + if (err) { > + dev_warn(dev, "error %d: failed to map resource > %pR\n", > + err, win->res); > + resource_list_destroy_entry(win); > + } > break; > - > case IORESOURCE_MEM: > - pcie->offset.mem = res.start - range.pci_addr; > - > - memcpy(>mem, , sizeof(res)); > + memcpy(>mem, win->res, sizeof(*win->res)); > pcie->mem.name = "non-prefetchable"; > break; > + case IORESOURCE_BUS: > + host->busnr = win->res->start; > + break; > } > } > > - err = of_pci_parse_bus_range(node, >busn); > - if (err < 0) { > - dev_err(dev, "failed to parse bus ranges property: %d\n", err); > - pcie->busn.name = node->name; > - pcie->busn.start = 0; > - pcie->busn.end = 0xff; > - pcie->busn.flags = IORESOURCE_BUS; > - } > - > for_each_available_child_of_node(node, child) { > int slot; > > @@ -1125,28 +1102,6 @@ static int mtk_pcie_setup(struct mtk_pcie *pcie) > return 0; > } > > -static int mtk_pcie_request_resources(struct mtk_pcie *pcie) > -{ > - struct pci_host_bridge *host = pci_host_bridge_from_priv(pcie); > - struct list_head *windows = >windows; > - struct device *dev = pcie->dev; > - int err; > - > - pci_add_resource_offset(windows, >pio,
Re: [PATCH 4/4] PM / Domains: Propagate performance state updates
On 11/21/2018 11:01 AM, Rajendra Nayak wrote: On 11/21/2018 10:46 AM, Viresh Kumar wrote: On 21-11-18, 10:33, Rajendra Nayak wrote: Hi Viresh, On 11/5/2018 12:06 PM, Viresh Kumar wrote: This commit updates genpd core to start propagating performance state updates to master domains that have their set_performance_state() callback set. A genpd handles two type of performance states now. The first one is the performance state requirement put on the genpd by the devices and sub-domains under it, which is already represented by genpd->performance_state. The second type, introduced in this commit, is the performance state requirement(s) put by the genpd on its master domain(s). There is a separate value required for each master that the genpd has and so a new field is added to the struct gpd_link (link->performance_state), which represents the link between a genpd and its master. The struct gpd_link also got another field prev_performance_state, which is used by genpd core as a temporary variable during transitions. We need to propagate setting performance state while powering-on a genpd as well, as we ignore performance state requirements from sub-domains which are powered-off. For this reason _genpd_power_on() also received the additional parameter, depth, which is used for hierarchical locking within genpd. Signed-off-by: Viresh Kumar --- drivers/base/power/domain.c | 107 +--- include/linux/pm_domain.h | 4 ++ 2 files changed, 92 insertions(+), 19 deletions(-) diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index 6d2e9b3406f1..81e02c5f753f 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -239,28 +239,86 @@ static void genpd_update_accounting(struct generic_pm_domain *genpd) static inline void genpd_update_accounting(struct generic_pm_domain *genpd) {} #endif +static int _genpd_reeval_performance_state(struct generic_pm_domain *genpd, + unsigned int state, int depth); + static int _genpd_set_performance_state(struct generic_pm_domain *genpd, - unsigned int state) + unsigned int state, int depth) { + struct generic_pm_domain *master; + struct gpd_link *link; + unsigned int mstate; int ret; if (!genpd_status_on(genpd)) goto out; This check here would mean we only propogate performance state to masters if the genpd is ON? Yeah, isn't that what should we do anyway? It is similar to clk-enable etc. Why increase the reference count if the device isn't enabled and isn't using the clock ? I would think this is analogous to a driver calling clk_set_rate() first and then a clk_enable(), which is certainly valid. So my question is, if calling a dev_pm_genpd_set_performance_state() and then runtime enabling the device would work (and take care of propagating the performance state). In my testing I found it does not. And the reason for that seems to be that we update the genpd status to GPD_STATE_ACTIVE *after* we try to set the performance state, so we always hit this check which bails out thinking the genpd is not ON. Also you can see that I have updated the genpd power-on code to start propagation to make sure we don't miss anything.
Re: [PATCH 4/4] PM / Domains: Propagate performance state updates
On 11/21/2018 11:01 AM, Rajendra Nayak wrote: On 11/21/2018 10:46 AM, Viresh Kumar wrote: On 21-11-18, 10:33, Rajendra Nayak wrote: Hi Viresh, On 11/5/2018 12:06 PM, Viresh Kumar wrote: This commit updates genpd core to start propagating performance state updates to master domains that have their set_performance_state() callback set. A genpd handles two type of performance states now. The first one is the performance state requirement put on the genpd by the devices and sub-domains under it, which is already represented by genpd->performance_state. The second type, introduced in this commit, is the performance state requirement(s) put by the genpd on its master domain(s). There is a separate value required for each master that the genpd has and so a new field is added to the struct gpd_link (link->performance_state), which represents the link between a genpd and its master. The struct gpd_link also got another field prev_performance_state, which is used by genpd core as a temporary variable during transitions. We need to propagate setting performance state while powering-on a genpd as well, as we ignore performance state requirements from sub-domains which are powered-off. For this reason _genpd_power_on() also received the additional parameter, depth, which is used for hierarchical locking within genpd. Signed-off-by: Viresh Kumar --- drivers/base/power/domain.c | 107 +--- include/linux/pm_domain.h | 4 ++ 2 files changed, 92 insertions(+), 19 deletions(-) diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index 6d2e9b3406f1..81e02c5f753f 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -239,28 +239,86 @@ static void genpd_update_accounting(struct generic_pm_domain *genpd) static inline void genpd_update_accounting(struct generic_pm_domain *genpd) {} #endif +static int _genpd_reeval_performance_state(struct generic_pm_domain *genpd, + unsigned int state, int depth); + static int _genpd_set_performance_state(struct generic_pm_domain *genpd, - unsigned int state) + unsigned int state, int depth) { + struct generic_pm_domain *master; + struct gpd_link *link; + unsigned int mstate; int ret; if (!genpd_status_on(genpd)) goto out; This check here would mean we only propogate performance state to masters if the genpd is ON? Yeah, isn't that what should we do anyway? It is similar to clk-enable etc. Why increase the reference count if the device isn't enabled and isn't using the clock ? I would think this is analogous to a driver calling clk_set_rate() first and then a clk_enable(), which is certainly valid. So my question is, if calling a dev_pm_genpd_set_performance_state() and then runtime enabling the device would work (and take care of propagating the performance state). In my testing I found it does not. And the reason for that seems to be that we update the genpd status to GPD_STATE_ACTIVE *after* we try to set the performance state, so we always hit this check which bails out thinking the genpd is not ON. Also you can see that I have updated the genpd power-on code to start propagation to make sure we don't miss anything.
Re: [PATCH 4/4] PM / Domains: Propagate performance state updates
On 21-11-18, 11:01, Rajendra Nayak wrote: > I would think this is analogous to a driver calling clk_set_rate() first and > then a clk_enable(), which is certainly valid. > So my question is, if calling a dev_pm_genpd_set_performance_state() > and then runtime enabling the device would work (and take care of propagating > the performance > state). Two things here.. - First, it should work. - Second, we don't look at device RPM states but only if a genpd is on or off. Maybe we should look at device states as well, but then we need to initiate set-performance-state routine from runtime enable as well. But that is completely different work and would require its own series. > In my testing I found it does not. As I said, we don't do anything in RPM enable of the device currently, but only during genpd_power_on(). So if the genpd was previously disabled, RPM enable of the device should have done propagation of states as well. Otherwise, we should already have take care of the vote from the device. So, it should have worked. Needs some investigation on why this isn't working for you. -- viresh
Re: [PATCH 4/4] PM / Domains: Propagate performance state updates
On 21-11-18, 11:01, Rajendra Nayak wrote: > I would think this is analogous to a driver calling clk_set_rate() first and > then a clk_enable(), which is certainly valid. > So my question is, if calling a dev_pm_genpd_set_performance_state() > and then runtime enabling the device would work (and take care of propagating > the performance > state). Two things here.. - First, it should work. - Second, we don't look at device RPM states but only if a genpd is on or off. Maybe we should look at device states as well, but then we need to initiate set-performance-state routine from runtime enable as well. But that is completely different work and would require its own series. > In my testing I found it does not. As I said, we don't do anything in RPM enable of the device currently, but only during genpd_power_on(). So if the genpd was previously disabled, RPM enable of the device should have done propagation of states as well. Otherwise, we should already have take care of the vote from the device. So, it should have worked. Needs some investigation on why this isn't working for you. -- viresh
Re: [PATCH 4/4] PM / Domains: Propagate performance state updates
On 11/21/2018 10:46 AM, Viresh Kumar wrote: On 21-11-18, 10:33, Rajendra Nayak wrote: Hi Viresh, On 11/5/2018 12:06 PM, Viresh Kumar wrote: This commit updates genpd core to start propagating performance state updates to master domains that have their set_performance_state() callback set. A genpd handles two type of performance states now. The first one is the performance state requirement put on the genpd by the devices and sub-domains under it, which is already represented by genpd->performance_state. The second type, introduced in this commit, is the performance state requirement(s) put by the genpd on its master domain(s). There is a separate value required for each master that the genpd has and so a new field is added to the struct gpd_link (link->performance_state), which represents the link between a genpd and its master. The struct gpd_link also got another field prev_performance_state, which is used by genpd core as a temporary variable during transitions. We need to propagate setting performance state while powering-on a genpd as well, as we ignore performance state requirements from sub-domains which are powered-off. For this reason _genpd_power_on() also received the additional parameter, depth, which is used for hierarchical locking within genpd. Signed-off-by: Viresh Kumar --- drivers/base/power/domain.c | 107 +--- include/linux/pm_domain.h | 4 ++ 2 files changed, 92 insertions(+), 19 deletions(-) diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index 6d2e9b3406f1..81e02c5f753f 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -239,28 +239,86 @@ static void genpd_update_accounting(struct generic_pm_domain *genpd) static inline void genpd_update_accounting(struct generic_pm_domain *genpd) {} #endif +static int _genpd_reeval_performance_state(struct generic_pm_domain *genpd, + unsigned int state, int depth); + static int _genpd_set_performance_state(struct generic_pm_domain *genpd, - unsigned int state) + unsigned int state, int depth) { + struct generic_pm_domain *master; + struct gpd_link *link; + unsigned int mstate; int ret; if (!genpd_status_on(genpd)) goto out; This check here would mean we only propogate performance state to masters if the genpd is ON? Yeah, isn't that what should we do anyway? It is similar to clk-enable etc. Why increase the reference count if the device isn't enabled and isn't using the clock ? I would think this is analogous to a driver calling clk_set_rate() first and then a clk_enable(), which is certainly valid. So my question is, if calling a dev_pm_genpd_set_performance_state() and then runtime enabling the device would work (and take care of propagating the performance state). In my testing I found it does not. Also you can see that I have updated the genpd power-on code to start propagation to make sure we don't miss anything.
Re: [PATCH 4/4] PM / Domains: Propagate performance state updates
On 11/21/2018 10:46 AM, Viresh Kumar wrote: On 21-11-18, 10:33, Rajendra Nayak wrote: Hi Viresh, On 11/5/2018 12:06 PM, Viresh Kumar wrote: This commit updates genpd core to start propagating performance state updates to master domains that have their set_performance_state() callback set. A genpd handles two type of performance states now. The first one is the performance state requirement put on the genpd by the devices and sub-domains under it, which is already represented by genpd->performance_state. The second type, introduced in this commit, is the performance state requirement(s) put by the genpd on its master domain(s). There is a separate value required for each master that the genpd has and so a new field is added to the struct gpd_link (link->performance_state), which represents the link between a genpd and its master. The struct gpd_link also got another field prev_performance_state, which is used by genpd core as a temporary variable during transitions. We need to propagate setting performance state while powering-on a genpd as well, as we ignore performance state requirements from sub-domains which are powered-off. For this reason _genpd_power_on() also received the additional parameter, depth, which is used for hierarchical locking within genpd. Signed-off-by: Viresh Kumar --- drivers/base/power/domain.c | 107 +--- include/linux/pm_domain.h | 4 ++ 2 files changed, 92 insertions(+), 19 deletions(-) diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index 6d2e9b3406f1..81e02c5f753f 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -239,28 +239,86 @@ static void genpd_update_accounting(struct generic_pm_domain *genpd) static inline void genpd_update_accounting(struct generic_pm_domain *genpd) {} #endif +static int _genpd_reeval_performance_state(struct generic_pm_domain *genpd, + unsigned int state, int depth); + static int _genpd_set_performance_state(struct generic_pm_domain *genpd, - unsigned int state) + unsigned int state, int depth) { + struct generic_pm_domain *master; + struct gpd_link *link; + unsigned int mstate; int ret; if (!genpd_status_on(genpd)) goto out; This check here would mean we only propogate performance state to masters if the genpd is ON? Yeah, isn't that what should we do anyway? It is similar to clk-enable etc. Why increase the reference count if the device isn't enabled and isn't using the clock ? I would think this is analogous to a driver calling clk_set_rate() first and then a clk_enable(), which is certainly valid. So my question is, if calling a dev_pm_genpd_set_performance_state() and then runtime enabling the device would work (and take care of propagating the performance state). In my testing I found it does not. Also you can see that I have updated the genpd power-on code to start propagation to make sure we don't miss anything.
Re: [PATCH V4] thermal: imx: fix for dependency on cpu-freq
On 21-11-18, 05:28, Anson Huang wrote: > Since there is similar case of DEFER PROBE for the case of > imx_init_from_nvmem_cells > check, should I create another patch of same fix for it in V5 patch set? Send that as a separate patch please. Thanks. -- viresh
Re: [PATCH V4] thermal: imx: fix for dependency on cpu-freq
On 21-11-18, 05:28, Anson Huang wrote: > Since there is similar case of DEFER PROBE for the case of > imx_init_from_nvmem_cells > check, should I create another patch of same fix for it in V5 patch set? Send that as a separate patch please. Thanks. -- viresh
RE: [PATCH V4] thermal: imx: fix for dependency on cpu-freq
Best Regards! Anson Huang > -Original Message- > From: Viresh Kumar [mailto:viresh.ku...@linaro.org] > Sent: 2018年11月21日 13:21 > To: Anson Huang > Cc: rui.zh...@intel.com; edubez...@gmail.com; daniel.lezc...@linaro.org; > linux...@vger.kernel.org; linux-kernel@vger.kernel.org; > l.st...@pengutronix.de; dl-linux-imx > Subject: Re: [PATCH V4] thermal: imx: fix for dependency on cpu-freq > > On 21-11-18, 05:08, Anson Huang wrote: > > The thermal driver is a standalone driver for monitoring SoC > > temperature by enabling thermal sensor, so it can be enabled even when > > CONFIG_CPU_FREQ is NOT set. So remove the dependency with > CPU_THERMAL. > > > > Introduce dummy function of legacy cooling register/unregister to make > > thermal driver probe successfully when CONFIG_CPU_FREQ is NOT set. > > > > Signed-off-by: Anson Huang > > --- > > changes since V3: > > rename the label of "cpufreq_put" with "legacy_cleanup". > > drivers/thermal/Kconfig | 2 +- > > drivers/thermal/imx_thermal.c | 44 > > +++ > > 2 files changed, 33 insertions(+), 13 deletions(-) > > > > diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig index > > 5422523..93bd3bb 100644 > > --- a/drivers/thermal/Kconfig > > +++ b/drivers/thermal/Kconfig > > @@ -212,7 +212,7 @@ config HISI_THERMAL > > > > config IMX_THERMAL > > tristate "Temperature sensor driver for Freescale i.MX SoCs" > > - depends on (ARCH_MXC && CPU_THERMAL) || COMPILE_TEST > > + depends on ARCH_MXC || COMPILE_TEST > > depends on NVMEM || !NVMEM > > depends on MFD_SYSCON > > depends on OF > > diff --git a/drivers/thermal/imx_thermal.c > > b/drivers/thermal/imx_thermal.c index 1566154..328ee05 100644 > > --- a/drivers/thermal/imx_thermal.c > > +++ b/drivers/thermal/imx_thermal.c > > @@ -648,15 +648,24 @@ static const struct of_device_id > > of_imx_thermal_match[] = { }; MODULE_DEVICE_TABLE(of, > > of_imx_thermal_match); > > > > +#ifdef CONFIG_CPU_FREQ > > /* > > * Create cooling device in case no #cooling-cells property is available in > > * CPU node > > */ > > static int imx_thermal_register_legacy_cooling(struct > > imx_thermal_data *data) { > > - struct device_node *np = of_get_cpu_node(data->policy->cpu, NULL); > > + struct device_node *np; > > int ret; > > > > + data->policy = cpufreq_cpu_get(0); > > + if (!data->policy) { > > + pr_debug("%s: CPUFreq policy not found\n", __func__); > > + return -EPROBE_DEFER; > > + } > > + > > + np = of_get_cpu_node(data->policy->cpu, NULL); > > + > > if (!np || !of_find_property(np, "#cooling-cells", NULL)) { > > data->cdev = cpufreq_cooling_register(data->policy); > > if (IS_ERR(data->cdev)) { > > @@ -669,6 +678,22 @@ static int > imx_thermal_register_legacy_cooling(struct imx_thermal_data *data) > > return 0; > > } > > > > +static void imx_thermal_unregister_legacy_cooling(struct > > +imx_thermal_data *data) { > > + cpufreq_cooling_unregister(data->cdev); > > + cpufreq_cpu_put(data->policy); > > +} > > +#else > > +static inline int imx_thermal_register_legacy_cooling(struct > > +imx_thermal_data *data) { > > + return 0; > > +} > > + > > +static inline void imx_thermal_unregister_legacy_cooling(struct > > +imx_thermal_data *data) { } #endif > > + > > static int imx_thermal_probe(struct platform_device *pdev) { > > struct imx_thermal_data *data; > > @@ -743,13 +768,9 @@ static int imx_thermal_probe(struct > platform_device *pdev) > > regmap_write(map, data->socdata->sensor_ctrl + REG_SET, > > data->socdata->power_down_mask); > > > > - data->policy = cpufreq_cpu_get(0); > > - if (!data->policy) { > > - pr_debug("%s: CPUFreq policy not found\n", __func__); > > - return -EPROBE_DEFER; > > - } > > - > > ret = imx_thermal_register_legacy_cooling(data); > > + if (ret == -EPROBE_DEFER) > > + return ret; > > if (ret) { > > Sorry for not noticing earlier, but wouldn't it be better to move the > EPROBE_DEFER check inside of this if block ? Otherwise we will have two > conditional blocks in the success (normal) case. Since there is similar case of DEFER PROBE for the case of imx_init_from_nvmem_cells check, should I create another patch of same fix for it in V5 patch set? Anson. > > Something like this: > > if (ret) { > if (ret == -EPROBE_DEFER) > return ret; > > dev_err(..); > ... > } > > > dev_err(>dev, > > "failed to register cpufreq cooling device: %d\n", > > ret); @@ > -762,7 > > +783,7 @@ static int imx_thermal_probe(struct platform_device *pdev) > > if (ret != -EPROBE_DEFER) > > dev_err(>dev, > > "failed to get thermal clk: %d\n", ret); > > - goto cpufreq_put; > > + goto legacy_cleanup; > > } > > > > /* > > @@ -775,7 +796,7 @@
RE: [PATCH V4] thermal: imx: fix for dependency on cpu-freq
Best Regards! Anson Huang > -Original Message- > From: Viresh Kumar [mailto:viresh.ku...@linaro.org] > Sent: 2018年11月21日 13:21 > To: Anson Huang > Cc: rui.zh...@intel.com; edubez...@gmail.com; daniel.lezc...@linaro.org; > linux...@vger.kernel.org; linux-kernel@vger.kernel.org; > l.st...@pengutronix.de; dl-linux-imx > Subject: Re: [PATCH V4] thermal: imx: fix for dependency on cpu-freq > > On 21-11-18, 05:08, Anson Huang wrote: > > The thermal driver is a standalone driver for monitoring SoC > > temperature by enabling thermal sensor, so it can be enabled even when > > CONFIG_CPU_FREQ is NOT set. So remove the dependency with > CPU_THERMAL. > > > > Introduce dummy function of legacy cooling register/unregister to make > > thermal driver probe successfully when CONFIG_CPU_FREQ is NOT set. > > > > Signed-off-by: Anson Huang > > --- > > changes since V3: > > rename the label of "cpufreq_put" with "legacy_cleanup". > > drivers/thermal/Kconfig | 2 +- > > drivers/thermal/imx_thermal.c | 44 > > +++ > > 2 files changed, 33 insertions(+), 13 deletions(-) > > > > diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig index > > 5422523..93bd3bb 100644 > > --- a/drivers/thermal/Kconfig > > +++ b/drivers/thermal/Kconfig > > @@ -212,7 +212,7 @@ config HISI_THERMAL > > > > config IMX_THERMAL > > tristate "Temperature sensor driver for Freescale i.MX SoCs" > > - depends on (ARCH_MXC && CPU_THERMAL) || COMPILE_TEST > > + depends on ARCH_MXC || COMPILE_TEST > > depends on NVMEM || !NVMEM > > depends on MFD_SYSCON > > depends on OF > > diff --git a/drivers/thermal/imx_thermal.c > > b/drivers/thermal/imx_thermal.c index 1566154..328ee05 100644 > > --- a/drivers/thermal/imx_thermal.c > > +++ b/drivers/thermal/imx_thermal.c > > @@ -648,15 +648,24 @@ static const struct of_device_id > > of_imx_thermal_match[] = { }; MODULE_DEVICE_TABLE(of, > > of_imx_thermal_match); > > > > +#ifdef CONFIG_CPU_FREQ > > /* > > * Create cooling device in case no #cooling-cells property is available in > > * CPU node > > */ > > static int imx_thermal_register_legacy_cooling(struct > > imx_thermal_data *data) { > > - struct device_node *np = of_get_cpu_node(data->policy->cpu, NULL); > > + struct device_node *np; > > int ret; > > > > + data->policy = cpufreq_cpu_get(0); > > + if (!data->policy) { > > + pr_debug("%s: CPUFreq policy not found\n", __func__); > > + return -EPROBE_DEFER; > > + } > > + > > + np = of_get_cpu_node(data->policy->cpu, NULL); > > + > > if (!np || !of_find_property(np, "#cooling-cells", NULL)) { > > data->cdev = cpufreq_cooling_register(data->policy); > > if (IS_ERR(data->cdev)) { > > @@ -669,6 +678,22 @@ static int > imx_thermal_register_legacy_cooling(struct imx_thermal_data *data) > > return 0; > > } > > > > +static void imx_thermal_unregister_legacy_cooling(struct > > +imx_thermal_data *data) { > > + cpufreq_cooling_unregister(data->cdev); > > + cpufreq_cpu_put(data->policy); > > +} > > +#else > > +static inline int imx_thermal_register_legacy_cooling(struct > > +imx_thermal_data *data) { > > + return 0; > > +} > > + > > +static inline void imx_thermal_unregister_legacy_cooling(struct > > +imx_thermal_data *data) { } #endif > > + > > static int imx_thermal_probe(struct platform_device *pdev) { > > struct imx_thermal_data *data; > > @@ -743,13 +768,9 @@ static int imx_thermal_probe(struct > platform_device *pdev) > > regmap_write(map, data->socdata->sensor_ctrl + REG_SET, > > data->socdata->power_down_mask); > > > > - data->policy = cpufreq_cpu_get(0); > > - if (!data->policy) { > > - pr_debug("%s: CPUFreq policy not found\n", __func__); > > - return -EPROBE_DEFER; > > - } > > - > > ret = imx_thermal_register_legacy_cooling(data); > > + if (ret == -EPROBE_DEFER) > > + return ret; > > if (ret) { > > Sorry for not noticing earlier, but wouldn't it be better to move the > EPROBE_DEFER check inside of this if block ? Otherwise we will have two > conditional blocks in the success (normal) case. Since there is similar case of DEFER PROBE for the case of imx_init_from_nvmem_cells check, should I create another patch of same fix for it in V5 patch set? Anson. > > Something like this: > > if (ret) { > if (ret == -EPROBE_DEFER) > return ret; > > dev_err(..); > ... > } > > > dev_err(>dev, > > "failed to register cpufreq cooling device: %d\n", > > ret); @@ > -762,7 > > +783,7 @@ static int imx_thermal_probe(struct platform_device *pdev) > > if (ret != -EPROBE_DEFER) > > dev_err(>dev, > > "failed to get thermal clk: %d\n", ret); > > - goto cpufreq_put; > > + goto legacy_cleanup; > > } > > > > /* > > @@ -775,7 +796,7 @@
Re: [PATCH V4] thermal: imx: fix for dependency on cpu-freq
On 21-11-18, 05:08, Anson Huang wrote: > +static void imx_thermal_unregister_legacy_cooling(struct imx_thermal_data > *data) > +{ > + cpufreq_cooling_unregister(data->cdev); > + cpufreq_cpu_put(data->policy); > +} > +#else > +static inline int imx_thermal_register_legacy_cooling(struct > imx_thermal_data *data) > +{ > + return 0; > +} And now that you are going to send V5, please add a blank line before and after #else here. -- viresh
Re: [PATCH V4] thermal: imx: fix for dependency on cpu-freq
On 21-11-18, 05:08, Anson Huang wrote: > +static void imx_thermal_unregister_legacy_cooling(struct imx_thermal_data > *data) > +{ > + cpufreq_cooling_unregister(data->cdev); > + cpufreq_cpu_put(data->policy); > +} > +#else > +static inline int imx_thermal_register_legacy_cooling(struct > imx_thermal_data *data) > +{ > + return 0; > +} And now that you are going to send V5, please add a blank line before and after #else here. -- viresh
Re: [PATCH V4] thermal: imx: fix for dependency on cpu-freq
On 21-11-18, 05:08, Anson Huang wrote: > The thermal driver is a standalone driver for monitoring SoC temperature > by enabling thermal sensor, so it can be enabled even when CONFIG_CPU_FREQ > is NOT set. So remove the dependency with CPU_THERMAL. > > Introduce dummy function of legacy cooling register/unregister to make > thermal driver probe successfully when CONFIG_CPU_FREQ is NOT set. > > Signed-off-by: Anson Huang > --- > changes since V3: > rename the label of "cpufreq_put" with "legacy_cleanup". > drivers/thermal/Kconfig | 2 +- > drivers/thermal/imx_thermal.c | 44 > +++ > 2 files changed, 33 insertions(+), 13 deletions(-) > > diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig > index 5422523..93bd3bb 100644 > --- a/drivers/thermal/Kconfig > +++ b/drivers/thermal/Kconfig > @@ -212,7 +212,7 @@ config HISI_THERMAL > > config IMX_THERMAL > tristate "Temperature sensor driver for Freescale i.MX SoCs" > - depends on (ARCH_MXC && CPU_THERMAL) || COMPILE_TEST > + depends on ARCH_MXC || COMPILE_TEST > depends on NVMEM || !NVMEM > depends on MFD_SYSCON > depends on OF > diff --git a/drivers/thermal/imx_thermal.c b/drivers/thermal/imx_thermal.c > index 1566154..328ee05 100644 > --- a/drivers/thermal/imx_thermal.c > +++ b/drivers/thermal/imx_thermal.c > @@ -648,15 +648,24 @@ static const struct of_device_id of_imx_thermal_match[] > = { > }; > MODULE_DEVICE_TABLE(of, of_imx_thermal_match); > > +#ifdef CONFIG_CPU_FREQ > /* > * Create cooling device in case no #cooling-cells property is available in > * CPU node > */ > static int imx_thermal_register_legacy_cooling(struct imx_thermal_data *data) > { > - struct device_node *np = of_get_cpu_node(data->policy->cpu, NULL); > + struct device_node *np; > int ret; > > + data->policy = cpufreq_cpu_get(0); > + if (!data->policy) { > + pr_debug("%s: CPUFreq policy not found\n", __func__); > + return -EPROBE_DEFER; > + } > + > + np = of_get_cpu_node(data->policy->cpu, NULL); > + > if (!np || !of_find_property(np, "#cooling-cells", NULL)) { > data->cdev = cpufreq_cooling_register(data->policy); > if (IS_ERR(data->cdev)) { > @@ -669,6 +678,22 @@ static int imx_thermal_register_legacy_cooling(struct > imx_thermal_data *data) > return 0; > } > > +static void imx_thermal_unregister_legacy_cooling(struct imx_thermal_data > *data) > +{ > + cpufreq_cooling_unregister(data->cdev); > + cpufreq_cpu_put(data->policy); > +} > +#else > +static inline int imx_thermal_register_legacy_cooling(struct > imx_thermal_data *data) > +{ > + return 0; > +} > + > +static inline void imx_thermal_unregister_legacy_cooling(struct > imx_thermal_data *data) > +{ > +} > +#endif > + > static int imx_thermal_probe(struct platform_device *pdev) > { > struct imx_thermal_data *data; > @@ -743,13 +768,9 @@ static int imx_thermal_probe(struct platform_device > *pdev) > regmap_write(map, data->socdata->sensor_ctrl + REG_SET, >data->socdata->power_down_mask); > > - data->policy = cpufreq_cpu_get(0); > - if (!data->policy) { > - pr_debug("%s: CPUFreq policy not found\n", __func__); > - return -EPROBE_DEFER; > - } > - > ret = imx_thermal_register_legacy_cooling(data); > + if (ret == -EPROBE_DEFER) > + return ret; > if (ret) { Sorry for not noticing earlier, but wouldn't it be better to move the EPROBE_DEFER check inside of this if block ? Otherwise we will have two conditional blocks in the success (normal) case. Something like this: if (ret) { if (ret == -EPROBE_DEFER) return ret; dev_err(..); ... } > dev_err(>dev, > "failed to register cpufreq cooling device: %d\n", ret); > @@ -762,7 +783,7 @@ static int imx_thermal_probe(struct platform_device *pdev) > if (ret != -EPROBE_DEFER) > dev_err(>dev, > "failed to get thermal clk: %d\n", ret); > - goto cpufreq_put; > + goto legacy_cleanup; > } > > /* > @@ -775,7 +796,7 @@ static int imx_thermal_probe(struct platform_device *pdev) > ret = clk_prepare_enable(data->thermal_clk); > if (ret) { > dev_err(>dev, "failed to enable thermal clk: %d\n", ret); > - goto cpufreq_put; > + goto legacy_cleanup; > } > > data->tz = thermal_zone_device_register("imx_thermal_zone", > @@ -829,9 +850,8 @@ static int imx_thermal_probe(struct platform_device *pdev) > thermal_zone_device_unregister(data->tz); > clk_disable: > clk_disable_unprepare(data->thermal_clk); > -cpufreq_put: > - cpufreq_cooling_unregister(data->cdev); > - cpufreq_cpu_put(data->policy); > +legacy_cleanup: > +
Re: [PATCH V4] thermal: imx: fix for dependency on cpu-freq
On 21-11-18, 05:08, Anson Huang wrote: > The thermal driver is a standalone driver for monitoring SoC temperature > by enabling thermal sensor, so it can be enabled even when CONFIG_CPU_FREQ > is NOT set. So remove the dependency with CPU_THERMAL. > > Introduce dummy function of legacy cooling register/unregister to make > thermal driver probe successfully when CONFIG_CPU_FREQ is NOT set. > > Signed-off-by: Anson Huang > --- > changes since V3: > rename the label of "cpufreq_put" with "legacy_cleanup". > drivers/thermal/Kconfig | 2 +- > drivers/thermal/imx_thermal.c | 44 > +++ > 2 files changed, 33 insertions(+), 13 deletions(-) > > diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig > index 5422523..93bd3bb 100644 > --- a/drivers/thermal/Kconfig > +++ b/drivers/thermal/Kconfig > @@ -212,7 +212,7 @@ config HISI_THERMAL > > config IMX_THERMAL > tristate "Temperature sensor driver for Freescale i.MX SoCs" > - depends on (ARCH_MXC && CPU_THERMAL) || COMPILE_TEST > + depends on ARCH_MXC || COMPILE_TEST > depends on NVMEM || !NVMEM > depends on MFD_SYSCON > depends on OF > diff --git a/drivers/thermal/imx_thermal.c b/drivers/thermal/imx_thermal.c > index 1566154..328ee05 100644 > --- a/drivers/thermal/imx_thermal.c > +++ b/drivers/thermal/imx_thermal.c > @@ -648,15 +648,24 @@ static const struct of_device_id of_imx_thermal_match[] > = { > }; > MODULE_DEVICE_TABLE(of, of_imx_thermal_match); > > +#ifdef CONFIG_CPU_FREQ > /* > * Create cooling device in case no #cooling-cells property is available in > * CPU node > */ > static int imx_thermal_register_legacy_cooling(struct imx_thermal_data *data) > { > - struct device_node *np = of_get_cpu_node(data->policy->cpu, NULL); > + struct device_node *np; > int ret; > > + data->policy = cpufreq_cpu_get(0); > + if (!data->policy) { > + pr_debug("%s: CPUFreq policy not found\n", __func__); > + return -EPROBE_DEFER; > + } > + > + np = of_get_cpu_node(data->policy->cpu, NULL); > + > if (!np || !of_find_property(np, "#cooling-cells", NULL)) { > data->cdev = cpufreq_cooling_register(data->policy); > if (IS_ERR(data->cdev)) { > @@ -669,6 +678,22 @@ static int imx_thermal_register_legacy_cooling(struct > imx_thermal_data *data) > return 0; > } > > +static void imx_thermal_unregister_legacy_cooling(struct imx_thermal_data > *data) > +{ > + cpufreq_cooling_unregister(data->cdev); > + cpufreq_cpu_put(data->policy); > +} > +#else > +static inline int imx_thermal_register_legacy_cooling(struct > imx_thermal_data *data) > +{ > + return 0; > +} > + > +static inline void imx_thermal_unregister_legacy_cooling(struct > imx_thermal_data *data) > +{ > +} > +#endif > + > static int imx_thermal_probe(struct platform_device *pdev) > { > struct imx_thermal_data *data; > @@ -743,13 +768,9 @@ static int imx_thermal_probe(struct platform_device > *pdev) > regmap_write(map, data->socdata->sensor_ctrl + REG_SET, >data->socdata->power_down_mask); > > - data->policy = cpufreq_cpu_get(0); > - if (!data->policy) { > - pr_debug("%s: CPUFreq policy not found\n", __func__); > - return -EPROBE_DEFER; > - } > - > ret = imx_thermal_register_legacy_cooling(data); > + if (ret == -EPROBE_DEFER) > + return ret; > if (ret) { Sorry for not noticing earlier, but wouldn't it be better to move the EPROBE_DEFER check inside of this if block ? Otherwise we will have two conditional blocks in the success (normal) case. Something like this: if (ret) { if (ret == -EPROBE_DEFER) return ret; dev_err(..); ... } > dev_err(>dev, > "failed to register cpufreq cooling device: %d\n", ret); > @@ -762,7 +783,7 @@ static int imx_thermal_probe(struct platform_device *pdev) > if (ret != -EPROBE_DEFER) > dev_err(>dev, > "failed to get thermal clk: %d\n", ret); > - goto cpufreq_put; > + goto legacy_cleanup; > } > > /* > @@ -775,7 +796,7 @@ static int imx_thermal_probe(struct platform_device *pdev) > ret = clk_prepare_enable(data->thermal_clk); > if (ret) { > dev_err(>dev, "failed to enable thermal clk: %d\n", ret); > - goto cpufreq_put; > + goto legacy_cleanup; > } > > data->tz = thermal_zone_device_register("imx_thermal_zone", > @@ -829,9 +850,8 @@ static int imx_thermal_probe(struct platform_device *pdev) > thermal_zone_device_unregister(data->tz); > clk_disable: > clk_disable_unprepare(data->thermal_clk); > -cpufreq_put: > - cpufreq_cooling_unregister(data->cdev); > - cpufreq_cpu_put(data->policy); > +legacy_cleanup: > +
Re: RFC: userspace exception fixups
On 2018-11-21 04:25, Jarkko Sakkinen wrote: On Tue, Nov 20, 2018 at 07:19:37AM -0800, Andy Lutomirski wrote: general by mucking with some regs and retrying -- that will infinite loop and confuse everyone. I'm not even 100% convinced that decoding the insn stream is useful -- AEP can point to something that isn't ENCLU. In my return-to-AEP approach to whole point was not to do any decoding but instead have something else always in the AEP handler than just ENCLU. No instruction decoding. No RIP manipulation. IOW the kernel needs to know *when* to apply this special behavior. Sadly there is no bit in the exception frame that says "came from SGX". Jarkko, can you please explain you solution in detail? The CPU receives an exception. This will be handled by the kernel exception handler. What information does the kernel exception handler use to determine whether to deliver the exception as a regular signal to the process, or whether to set the special registers values for userspace and just continue executing the process manually? -- Jethro Beekman | Fortanix smime.p7s Description: S/MIME Cryptographic Signature
Re: RFC: userspace exception fixups
On 2018-11-21 04:25, Jarkko Sakkinen wrote: On Tue, Nov 20, 2018 at 07:19:37AM -0800, Andy Lutomirski wrote: general by mucking with some regs and retrying -- that will infinite loop and confuse everyone. I'm not even 100% convinced that decoding the insn stream is useful -- AEP can point to something that isn't ENCLU. In my return-to-AEP approach to whole point was not to do any decoding but instead have something else always in the AEP handler than just ENCLU. No instruction decoding. No RIP manipulation. IOW the kernel needs to know *when* to apply this special behavior. Sadly there is no bit in the exception frame that says "came from SGX". Jarkko, can you please explain you solution in detail? The CPU receives an exception. This will be handled by the kernel exception handler. What information does the kernel exception handler use to determine whether to deliver the exception as a regular signal to the process, or whether to set the special registers values for userspace and just continue executing the process manually? -- Jethro Beekman | Fortanix smime.p7s Description: S/MIME Cryptographic Signature
Re: [PATCH 1/4] OPP: Add dev_pm_opp_xlate_performance_state() helper
On 21-11-18, 10:34, Rajendra Nayak wrote: > > > On 11/5/2018 12:06 PM, Viresh Kumar wrote: > > Introduce a new helper dev_pm_opp_xlate_performance_state() which will > > be used to translate from pstate of a device to another one. > > > > Initially this will be used by genpd to find pstate of a master domain > > using its sub-domain's pstate. > > > > Signed-off-by: Viresh Kumar > > --- > > drivers/opp/core.c | 49 ++ > > include/linux/pm_opp.h | 7 ++ > > 2 files changed, 56 insertions(+) > > > > diff --git a/drivers/opp/core.c b/drivers/opp/core.c > > index 0eaa954b3f6c..010a4268e8dd 100644 > > --- a/drivers/opp/core.c > > +++ b/drivers/opp/core.c > > @@ -1707,6 +1707,55 @@ void dev_pm_opp_put_genpd_virt_dev(struct opp_table > > *opp_table, > > dev_err(virt_dev, "Failed to find required device entry\n"); > > } > > +/** > > + * dev_pm_opp_xlate_performance_state() - Find required OPP's pstate for > > src_table. > > + * @src_table: OPP table which has dst_table as one of its required OPP > > table. > > So I have a case where the src_table and dst_table are shared/same. Can you > explain how would > it work in such a case? Can you give the example, as I am finding some issues with such shared tables. Though the code may work just fine btw. -- viresh
Re: [PATCH 1/4] OPP: Add dev_pm_opp_xlate_performance_state() helper
On 21-11-18, 10:34, Rajendra Nayak wrote: > > > On 11/5/2018 12:06 PM, Viresh Kumar wrote: > > Introduce a new helper dev_pm_opp_xlate_performance_state() which will > > be used to translate from pstate of a device to another one. > > > > Initially this will be used by genpd to find pstate of a master domain > > using its sub-domain's pstate. > > > > Signed-off-by: Viresh Kumar > > --- > > drivers/opp/core.c | 49 ++ > > include/linux/pm_opp.h | 7 ++ > > 2 files changed, 56 insertions(+) > > > > diff --git a/drivers/opp/core.c b/drivers/opp/core.c > > index 0eaa954b3f6c..010a4268e8dd 100644 > > --- a/drivers/opp/core.c > > +++ b/drivers/opp/core.c > > @@ -1707,6 +1707,55 @@ void dev_pm_opp_put_genpd_virt_dev(struct opp_table > > *opp_table, > > dev_err(virt_dev, "Failed to find required device entry\n"); > > } > > +/** > > + * dev_pm_opp_xlate_performance_state() - Find required OPP's pstate for > > src_table. > > + * @src_table: OPP table which has dst_table as one of its required OPP > > table. > > So I have a case where the src_table and dst_table are shared/same. Can you > explain how would > it work in such a case? Can you give the example, as I am finding some issues with such shared tables. Though the code may work just fine btw. -- viresh
Re: [PATCH 4/4] PM / Domains: Propagate performance state updates
On 21-11-18, 10:33, Rajendra Nayak wrote: > Hi Viresh, > > On 11/5/2018 12:06 PM, Viresh Kumar wrote: > > This commit updates genpd core to start propagating performance state > > updates to master domains that have their set_performance_state() > > callback set. > > > > A genpd handles two type of performance states now. The first one is the > > performance state requirement put on the genpd by the devices and > > sub-domains under it, which is already represented by > > genpd->performance_state. The second type, introduced in this commit, is > > the performance state requirement(s) put by the genpd on its master > > domain(s). There is a separate value required for each master that the > > genpd has and so a new field is added to the struct gpd_link > > (link->performance_state), which represents the link between a genpd and > > its master. The struct gpd_link also got another field > > prev_performance_state, which is used by genpd core as a temporary > > variable during transitions. > > > > We need to propagate setting performance state while powering-on a genpd > > as well, as we ignore performance state requirements from sub-domains > > which are powered-off. For this reason _genpd_power_on() also received > > the additional parameter, depth, which is used for hierarchical locking > > within genpd. > > > > Signed-off-by: Viresh Kumar > > --- > > drivers/base/power/domain.c | 107 +--- > > include/linux/pm_domain.h | 4 ++ > > 2 files changed, 92 insertions(+), 19 deletions(-) > > > > diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c > > index 6d2e9b3406f1..81e02c5f753f 100644 > > --- a/drivers/base/power/domain.c > > +++ b/drivers/base/power/domain.c > > @@ -239,28 +239,86 @@ static void genpd_update_accounting(struct > > generic_pm_domain *genpd) > > static inline void genpd_update_accounting(struct generic_pm_domain > > *genpd) {} > > #endif > > +static int _genpd_reeval_performance_state(struct generic_pm_domain *genpd, > > + unsigned int state, int depth); > > + > > static int _genpd_set_performance_state(struct generic_pm_domain *genpd, > > - unsigned int state) > > + unsigned int state, int depth) > > { > > + struct generic_pm_domain *master; > > + struct gpd_link *link; > > + unsigned int mstate; > > int ret; > > if (!genpd_status_on(genpd)) > > goto out; > > This check here would mean we only propogate performance state > to masters if the genpd is ON? Yeah, isn't that what should we do anyway? It is similar to clk-enable etc. Why increase the reference count if the device isn't enabled and isn't using the clock ? Also you can see that I have updated the genpd power-on code to start propagation to make sure we don't miss anything. -- viresh
Re: [PATCH 4/4] PM / Domains: Propagate performance state updates
On 21-11-18, 10:33, Rajendra Nayak wrote: > Hi Viresh, > > On 11/5/2018 12:06 PM, Viresh Kumar wrote: > > This commit updates genpd core to start propagating performance state > > updates to master domains that have their set_performance_state() > > callback set. > > > > A genpd handles two type of performance states now. The first one is the > > performance state requirement put on the genpd by the devices and > > sub-domains under it, which is already represented by > > genpd->performance_state. The second type, introduced in this commit, is > > the performance state requirement(s) put by the genpd on its master > > domain(s). There is a separate value required for each master that the > > genpd has and so a new field is added to the struct gpd_link > > (link->performance_state), which represents the link between a genpd and > > its master. The struct gpd_link also got another field > > prev_performance_state, which is used by genpd core as a temporary > > variable during transitions. > > > > We need to propagate setting performance state while powering-on a genpd > > as well, as we ignore performance state requirements from sub-domains > > which are powered-off. For this reason _genpd_power_on() also received > > the additional parameter, depth, which is used for hierarchical locking > > within genpd. > > > > Signed-off-by: Viresh Kumar > > --- > > drivers/base/power/domain.c | 107 +--- > > include/linux/pm_domain.h | 4 ++ > > 2 files changed, 92 insertions(+), 19 deletions(-) > > > > diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c > > index 6d2e9b3406f1..81e02c5f753f 100644 > > --- a/drivers/base/power/domain.c > > +++ b/drivers/base/power/domain.c > > @@ -239,28 +239,86 @@ static void genpd_update_accounting(struct > > generic_pm_domain *genpd) > > static inline void genpd_update_accounting(struct generic_pm_domain > > *genpd) {} > > #endif > > +static int _genpd_reeval_performance_state(struct generic_pm_domain *genpd, > > + unsigned int state, int depth); > > + > > static int _genpd_set_performance_state(struct generic_pm_domain *genpd, > > - unsigned int state) > > + unsigned int state, int depth) > > { > > + struct generic_pm_domain *master; > > + struct gpd_link *link; > > + unsigned int mstate; > > int ret; > > if (!genpd_status_on(genpd)) > > goto out; > > This check here would mean we only propogate performance state > to masters if the genpd is ON? Yeah, isn't that what should we do anyway? It is similar to clk-enable etc. Why increase the reference count if the device isn't enabled and isn't using the clock ? Also you can see that I have updated the genpd power-on code to start propagation to make sure we don't miss anything. -- viresh
Re: [PATCH] iio: st_sensors: Fix the sleep time for sampling
Denis CIOCCA 於 2018年11月20日 週二 上午3:05寫道: > > Hi Jian, > > Not clear to me why should be + instead of *. > > ODR is expressed in Hz, so (1/Hz) = period in seconds (1 sample sampling > time) [s] > 1000 * (1/Hz) = period in milliseconds (1 sample sampling time) [ms] > n * 1000 * (1/Hz) = n times period in milliseconds (n times sample sampling > time) [ms] > > In your case you assume bootime is in milliseconds. Yes, I assume that according to the original comment. >Maybe we can change the comment and use 'number of samples ...'. Making the meaning more clear is better. However, does the bootime of the measurement need as the long time to be enabled? If the sampling rate is 1Hz and n is 2, then they will do msleep with 2000 ms for each st_sensors_read_info_raw. Regards, Jian-Hong Pan > > > -Original Message- > From: linux-iio-ow...@vger.kernel.org On > Behalf Of Jian-Hong Pan > Sent: Sunday, November 18, 2018 10:12 PM > To: Jonathan Cameron ; Hartmut Knaack ; > Lars-Peter Clausen ; Peter Meerwald-Stadler > ; Dominique Martinet > Cc: linux-...@vger.kernel.org; linux-kernel@vger.kernel.org; > li...@endlessm.com; Jian-Hong Pan > Subject: [PATCH] iio: st_sensors: Fix the sleep time for sampling > > According to the description of st_sensor_settings and st_sensor_data > structures' comments: > - bootime: samples to discard when sensor passing from power-down to power-up. > - odr: Output data rate of the sensor [Hz]. > > The sleep time should be > sdata->sensor_settings->bootime + 1000 / sdata->odr ms. > > Signed-off-by: Jian-Hong Pan > --- > drivers/iio/common/st_sensors/st_sensors_core.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/drivers/iio/common/st_sensors/st_sensors_core.c > b/drivers/iio/common/st_sensors/st_sensors_core.c > index 26fbd1bd9413..6b87ea657a92 100644 > --- a/drivers/iio/common/st_sensors/st_sensors_core.c > +++ b/drivers/iio/common/st_sensors/st_sensors_core.c > @@ -594,7 +594,7 @@ int st_sensors_read_info_raw(struct iio_dev *indio_dev, > if (err < 0) > goto out; > > - msleep((sdata->sensor_settings->bootime * 1000) / sdata->odr); > + msleep(sdata->sensor_settings->bootime + 1000 / sdata->odr); > err = st_sensors_read_axis_data(indio_dev, ch, val); > if (err < 0) > goto out; > -- > 2.11.0 >
Re: [PATCH] iio: st_sensors: Fix the sleep time for sampling
Denis CIOCCA 於 2018年11月20日 週二 上午3:05寫道: > > Hi Jian, > > Not clear to me why should be + instead of *. > > ODR is expressed in Hz, so (1/Hz) = period in seconds (1 sample sampling > time) [s] > 1000 * (1/Hz) = period in milliseconds (1 sample sampling time) [ms] > n * 1000 * (1/Hz) = n times period in milliseconds (n times sample sampling > time) [ms] > > In your case you assume bootime is in milliseconds. Yes, I assume that according to the original comment. >Maybe we can change the comment and use 'number of samples ...'. Making the meaning more clear is better. However, does the bootime of the measurement need as the long time to be enabled? If the sampling rate is 1Hz and n is 2, then they will do msleep with 2000 ms for each st_sensors_read_info_raw. Regards, Jian-Hong Pan > > > -Original Message- > From: linux-iio-ow...@vger.kernel.org On > Behalf Of Jian-Hong Pan > Sent: Sunday, November 18, 2018 10:12 PM > To: Jonathan Cameron ; Hartmut Knaack ; > Lars-Peter Clausen ; Peter Meerwald-Stadler > ; Dominique Martinet > Cc: linux-...@vger.kernel.org; linux-kernel@vger.kernel.org; > li...@endlessm.com; Jian-Hong Pan > Subject: [PATCH] iio: st_sensors: Fix the sleep time for sampling > > According to the description of st_sensor_settings and st_sensor_data > structures' comments: > - bootime: samples to discard when sensor passing from power-down to power-up. > - odr: Output data rate of the sensor [Hz]. > > The sleep time should be > sdata->sensor_settings->bootime + 1000 / sdata->odr ms. > > Signed-off-by: Jian-Hong Pan > --- > drivers/iio/common/st_sensors/st_sensors_core.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/drivers/iio/common/st_sensors/st_sensors_core.c > b/drivers/iio/common/st_sensors/st_sensors_core.c > index 26fbd1bd9413..6b87ea657a92 100644 > --- a/drivers/iio/common/st_sensors/st_sensors_core.c > +++ b/drivers/iio/common/st_sensors/st_sensors_core.c > @@ -594,7 +594,7 @@ int st_sensors_read_info_raw(struct iio_dev *indio_dev, > if (err < 0) > goto out; > > - msleep((sdata->sensor_settings->bootime * 1000) / sdata->odr); > + msleep(sdata->sensor_settings->bootime + 1000 / sdata->odr); > err = st_sensors_read_axis_data(indio_dev, ch, val); > if (err < 0) > goto out; > -- > 2.11.0 >
RE: [PATCH V3] thermal: imx: fix for dependency on cpu-freq
Best Regards! Anson Huang > -Original Message- > From: Viresh Kumar [mailto:viresh.ku...@linaro.org] > Sent: 2018年11月21日 11:54 > To: Anson Huang > Cc: Zhang Rui ; Eduardo Valentin > ; Daniel Lezcano ; Linux > PM list ; Linux Kernel Mailing List > ; l.st...@pengutronix.de; dl-linux-imx > > Subject: Re: [PATCH V3] thermal: imx: fix for dependency on cpu-freq > > On Wed, Nov 21, 2018 at 8:11 AM Anson Huang > wrote: > > > @@ -830,8 +851,7 @@ static int imx_thermal_probe(struct platform_device > *pdev) > > clk_disable: > > clk_disable_unprepare(data->thermal_clk); > > cpufreq_put: > > While at it, rename this label as well to something like legacy_cleanup. Done with V4 patch, thanks. Anson. > > > - cpufreq_cooling_unregister(data->cdev); > > - cpufreq_cpu_put(data->policy); > > + imx_thermal_unregister_legacy_cooling(data); > > > > return ret; > > }
RE: [PATCH V3] thermal: imx: fix for dependency on cpu-freq
Best Regards! Anson Huang > -Original Message- > From: Viresh Kumar [mailto:viresh.ku...@linaro.org] > Sent: 2018年11月21日 11:54 > To: Anson Huang > Cc: Zhang Rui ; Eduardo Valentin > ; Daniel Lezcano ; Linux > PM list ; Linux Kernel Mailing List > ; l.st...@pengutronix.de; dl-linux-imx > > Subject: Re: [PATCH V3] thermal: imx: fix for dependency on cpu-freq > > On Wed, Nov 21, 2018 at 8:11 AM Anson Huang > wrote: > > > @@ -830,8 +851,7 @@ static int imx_thermal_probe(struct platform_device > *pdev) > > clk_disable: > > clk_disable_unprepare(data->thermal_clk); > > cpufreq_put: > > While at it, rename this label as well to something like legacy_cleanup. Done with V4 patch, thanks. Anson. > > > - cpufreq_cooling_unregister(data->cdev); > > - cpufreq_cpu_put(data->policy); > > + imx_thermal_unregister_legacy_cooling(data); > > > > return ret; > > }
[PATCH V4] thermal: imx: fix for dependency on cpu-freq
The thermal driver is a standalone driver for monitoring SoC temperature by enabling thermal sensor, so it can be enabled even when CONFIG_CPU_FREQ is NOT set. So remove the dependency with CPU_THERMAL. Introduce dummy function of legacy cooling register/unregister to make thermal driver probe successfully when CONFIG_CPU_FREQ is NOT set. Signed-off-by: Anson Huang --- changes since V3: rename the label of "cpufreq_put" with "legacy_cleanup". drivers/thermal/Kconfig | 2 +- drivers/thermal/imx_thermal.c | 44 +++ 2 files changed, 33 insertions(+), 13 deletions(-) diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig index 5422523..93bd3bb 100644 --- a/drivers/thermal/Kconfig +++ b/drivers/thermal/Kconfig @@ -212,7 +212,7 @@ config HISI_THERMAL config IMX_THERMAL tristate "Temperature sensor driver for Freescale i.MX SoCs" - depends on (ARCH_MXC && CPU_THERMAL) || COMPILE_TEST + depends on ARCH_MXC || COMPILE_TEST depends on NVMEM || !NVMEM depends on MFD_SYSCON depends on OF diff --git a/drivers/thermal/imx_thermal.c b/drivers/thermal/imx_thermal.c index 1566154..328ee05 100644 --- a/drivers/thermal/imx_thermal.c +++ b/drivers/thermal/imx_thermal.c @@ -648,15 +648,24 @@ static const struct of_device_id of_imx_thermal_match[] = { }; MODULE_DEVICE_TABLE(of, of_imx_thermal_match); +#ifdef CONFIG_CPU_FREQ /* * Create cooling device in case no #cooling-cells property is available in * CPU node */ static int imx_thermal_register_legacy_cooling(struct imx_thermal_data *data) { - struct device_node *np = of_get_cpu_node(data->policy->cpu, NULL); + struct device_node *np; int ret; + data->policy = cpufreq_cpu_get(0); + if (!data->policy) { + pr_debug("%s: CPUFreq policy not found\n", __func__); + return -EPROBE_DEFER; + } + + np = of_get_cpu_node(data->policy->cpu, NULL); + if (!np || !of_find_property(np, "#cooling-cells", NULL)) { data->cdev = cpufreq_cooling_register(data->policy); if (IS_ERR(data->cdev)) { @@ -669,6 +678,22 @@ static int imx_thermal_register_legacy_cooling(struct imx_thermal_data *data) return 0; } +static void imx_thermal_unregister_legacy_cooling(struct imx_thermal_data *data) +{ + cpufreq_cooling_unregister(data->cdev); + cpufreq_cpu_put(data->policy); +} +#else +static inline int imx_thermal_register_legacy_cooling(struct imx_thermal_data *data) +{ + return 0; +} + +static inline void imx_thermal_unregister_legacy_cooling(struct imx_thermal_data *data) +{ +} +#endif + static int imx_thermal_probe(struct platform_device *pdev) { struct imx_thermal_data *data; @@ -743,13 +768,9 @@ static int imx_thermal_probe(struct platform_device *pdev) regmap_write(map, data->socdata->sensor_ctrl + REG_SET, data->socdata->power_down_mask); - data->policy = cpufreq_cpu_get(0); - if (!data->policy) { - pr_debug("%s: CPUFreq policy not found\n", __func__); - return -EPROBE_DEFER; - } - ret = imx_thermal_register_legacy_cooling(data); + if (ret == -EPROBE_DEFER) + return ret; if (ret) { dev_err(>dev, "failed to register cpufreq cooling device: %d\n", ret); @@ -762,7 +783,7 @@ static int imx_thermal_probe(struct platform_device *pdev) if (ret != -EPROBE_DEFER) dev_err(>dev, "failed to get thermal clk: %d\n", ret); - goto cpufreq_put; + goto legacy_cleanup; } /* @@ -775,7 +796,7 @@ static int imx_thermal_probe(struct platform_device *pdev) ret = clk_prepare_enable(data->thermal_clk); if (ret) { dev_err(>dev, "failed to enable thermal clk: %d\n", ret); - goto cpufreq_put; + goto legacy_cleanup; } data->tz = thermal_zone_device_register("imx_thermal_zone", @@ -829,9 +850,8 @@ static int imx_thermal_probe(struct platform_device *pdev) thermal_zone_device_unregister(data->tz); clk_disable: clk_disable_unprepare(data->thermal_clk); -cpufreq_put: - cpufreq_cooling_unregister(data->cdev); - cpufreq_cpu_put(data->policy); +legacy_cleanup: + imx_thermal_unregister_legacy_cooling(data); return ret; } -- 2.7.4
[PATCH V4] thermal: imx: fix for dependency on cpu-freq
The thermal driver is a standalone driver for monitoring SoC temperature by enabling thermal sensor, so it can be enabled even when CONFIG_CPU_FREQ is NOT set. So remove the dependency with CPU_THERMAL. Introduce dummy function of legacy cooling register/unregister to make thermal driver probe successfully when CONFIG_CPU_FREQ is NOT set. Signed-off-by: Anson Huang --- changes since V3: rename the label of "cpufreq_put" with "legacy_cleanup". drivers/thermal/Kconfig | 2 +- drivers/thermal/imx_thermal.c | 44 +++ 2 files changed, 33 insertions(+), 13 deletions(-) diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig index 5422523..93bd3bb 100644 --- a/drivers/thermal/Kconfig +++ b/drivers/thermal/Kconfig @@ -212,7 +212,7 @@ config HISI_THERMAL config IMX_THERMAL tristate "Temperature sensor driver for Freescale i.MX SoCs" - depends on (ARCH_MXC && CPU_THERMAL) || COMPILE_TEST + depends on ARCH_MXC || COMPILE_TEST depends on NVMEM || !NVMEM depends on MFD_SYSCON depends on OF diff --git a/drivers/thermal/imx_thermal.c b/drivers/thermal/imx_thermal.c index 1566154..328ee05 100644 --- a/drivers/thermal/imx_thermal.c +++ b/drivers/thermal/imx_thermal.c @@ -648,15 +648,24 @@ static const struct of_device_id of_imx_thermal_match[] = { }; MODULE_DEVICE_TABLE(of, of_imx_thermal_match); +#ifdef CONFIG_CPU_FREQ /* * Create cooling device in case no #cooling-cells property is available in * CPU node */ static int imx_thermal_register_legacy_cooling(struct imx_thermal_data *data) { - struct device_node *np = of_get_cpu_node(data->policy->cpu, NULL); + struct device_node *np; int ret; + data->policy = cpufreq_cpu_get(0); + if (!data->policy) { + pr_debug("%s: CPUFreq policy not found\n", __func__); + return -EPROBE_DEFER; + } + + np = of_get_cpu_node(data->policy->cpu, NULL); + if (!np || !of_find_property(np, "#cooling-cells", NULL)) { data->cdev = cpufreq_cooling_register(data->policy); if (IS_ERR(data->cdev)) { @@ -669,6 +678,22 @@ static int imx_thermal_register_legacy_cooling(struct imx_thermal_data *data) return 0; } +static void imx_thermal_unregister_legacy_cooling(struct imx_thermal_data *data) +{ + cpufreq_cooling_unregister(data->cdev); + cpufreq_cpu_put(data->policy); +} +#else +static inline int imx_thermal_register_legacy_cooling(struct imx_thermal_data *data) +{ + return 0; +} + +static inline void imx_thermal_unregister_legacy_cooling(struct imx_thermal_data *data) +{ +} +#endif + static int imx_thermal_probe(struct platform_device *pdev) { struct imx_thermal_data *data; @@ -743,13 +768,9 @@ static int imx_thermal_probe(struct platform_device *pdev) regmap_write(map, data->socdata->sensor_ctrl + REG_SET, data->socdata->power_down_mask); - data->policy = cpufreq_cpu_get(0); - if (!data->policy) { - pr_debug("%s: CPUFreq policy not found\n", __func__); - return -EPROBE_DEFER; - } - ret = imx_thermal_register_legacy_cooling(data); + if (ret == -EPROBE_DEFER) + return ret; if (ret) { dev_err(>dev, "failed to register cpufreq cooling device: %d\n", ret); @@ -762,7 +783,7 @@ static int imx_thermal_probe(struct platform_device *pdev) if (ret != -EPROBE_DEFER) dev_err(>dev, "failed to get thermal clk: %d\n", ret); - goto cpufreq_put; + goto legacy_cleanup; } /* @@ -775,7 +796,7 @@ static int imx_thermal_probe(struct platform_device *pdev) ret = clk_prepare_enable(data->thermal_clk); if (ret) { dev_err(>dev, "failed to enable thermal clk: %d\n", ret); - goto cpufreq_put; + goto legacy_cleanup; } data->tz = thermal_zone_device_register("imx_thermal_zone", @@ -829,9 +850,8 @@ static int imx_thermal_probe(struct platform_device *pdev) thermal_zone_device_unregister(data->tz); clk_disable: clk_disable_unprepare(data->thermal_clk); -cpufreq_put: - cpufreq_cooling_unregister(data->cdev); - cpufreq_cpu_put(data->policy); +legacy_cleanup: + imx_thermal_unregister_legacy_cooling(data); return ret; } -- 2.7.4