Re: [PATCH 1/2] init/main: Fix double "the" in comment

2017-04-16 Thread Viresh Kumar
On 23-03-17, 17:00, Viresh Kumar wrote:
> s/the\ the/the
> 
> Signed-off-by: Viresh Kumar 
> ---
>  init/main.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/init/main.c b/init/main.c
> index f9c9d9948203..717b2ab803e5 100644
> --- a/init/main.c
> +++ b/init/main.c
> @@ -495,7 +495,7 @@ asmlinkage __visible void __init start_kernel(void)
>   debug_objects_early_init();
>  
>   /*
> -  * Set up the the initial canary ASAP:
> +  * Set up the initial canary ASAP:
>*/
>   boot_init_stack_canary();
>  

Ping.

-- 
viresh


Re: [PATCH 1/2] init/main: Fix double "the" in comment

2017-04-16 Thread Viresh Kumar
On 23-03-17, 17:00, Viresh Kumar wrote:
> s/the\ the/the
> 
> Signed-off-by: Viresh Kumar 
> ---
>  init/main.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/init/main.c b/init/main.c
> index f9c9d9948203..717b2ab803e5 100644
> --- a/init/main.c
> +++ b/init/main.c
> @@ -495,7 +495,7 @@ asmlinkage __visible void __init start_kernel(void)
>   debug_objects_early_init();
>  
>   /*
> -  * Set up the the initial canary ASAP:
> +  * Set up the initial canary ASAP:
>*/
>   boot_init_stack_canary();
>  

Ping.

-- 
viresh


[PATCH] pci-sysfs: Make PCI bridge attribute visible in sysfs

2017-04-16 Thread Wong Vee Khee
From: vwong 

Export the PCIe link attributes of PCI bridges to sysfs.

Signed-off-by: Wong Vee Khee 
Signed-off-by: Hui Chun Ong 
---
 drivers/pci/pci-sysfs.c   | 197 +-
 include/uapi/linux/pci_regs.h |   4 +
 2 files changed, 197 insertions(+), 4 deletions(-)

diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index 25d010d..a218c43 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -154,6 +154,127 @@ static ssize_t resource_show(struct device *dev, struct 
device_attribute *attr,
 }
 static DEVICE_ATTR_RO(resource);
 
+static ssize_t max_link_speed_show(struct device *dev,
+  struct device_attribute *attr, char *buf)
+{
+   struct pci_dev *pci_dev = to_pci_dev(dev);
+   u32 linkcap;
+   int err;
+   const char *speed;
+
+   err = pcie_capability_read_dword(pci_dev, PCI_EXP_LNKCAP, );
+
+   if (!err && linkcap) {
+   switch (linkcap & PCI_EXP_LNKCAP_MLS) {
+   case PCI_EXP_LNKCAP_MLS_8_0GB:
+   speed = "8 GT/s";
+   break;
+   case PCI_EXP_LNKCAP_MLS_5_0GB:
+   speed = "5 GT/s";
+   break;
+   case PCI_EXP_LNKCAP_MLS_2_5GB:
+   speed = "2.5 GT/s";
+   break;
+   default:
+   speed = "Unknown speed";
+   }
+
+   return sprintf(buf, "%s\n", speed);
+   }
+
+   return -EINVAL;
+}
+static DEVICE_ATTR_RO(max_link_speed);
+
+static ssize_t max_link_width_show(struct device *dev,
+  struct device_attribute *attr, char *buf)
+{
+   struct pci_dev *pci_dev = to_pci_dev(dev);
+   u32 linkcap;
+   int err;
+
+   err = pcie_capability_read_dword(pci_dev, PCI_EXP_LNKCAP, );
+
+   return err ? -EINVAL : sprintf(
+   buf, "%u\n", (linkcap & PCI_EXP_LNKCAP_MLW) >> 4);
+}
+static DEVICE_ATTR_RO(max_link_width);
+
+static ssize_t current_link_speed_show(struct device *dev,
+  struct device_attribute *attr, char *buf)
+{
+   struct pci_dev *pci_dev = to_pci_dev(dev);
+   u16 linkstat;
+   int err;
+   const char *speed;
+
+   err = pcie_capability_read_word(pci_dev, PCI_EXP_LNKSTA, );
+
+   if (!err && linkstat) {
+   switch (linkstat & PCI_EXP_LNKSTA_CLS) {
+   case PCI_EXP_LNKSTA_CLS_8_0GB:
+   speed = "8 GT/s";
+   break;
+   case PCI_EXP_LNKSTA_CLS_5_0GB:
+   speed = "5 GT/s";
+   break;
+   case PCI_EXP_LNKSTA_CLS_2_5GB:
+   speed = "2.5 GT/s";
+   break;
+   default:
+   speed = "Unknown speed";
+   }
+
+   return sprintf(buf, "%s\n", speed);
+   }
+
+   return -EINVAL;
+}
+static DEVICE_ATTR_RO(current_link_speed);
+
+static ssize_t current_link_width_show(struct device *dev,
+  struct device_attribute *attr, char *buf)
+{
+   struct pci_dev *pci_dev = to_pci_dev(dev);
+   u16 linkstat;
+   int err;
+
+   err = pcie_capability_read_word(pci_dev, PCI_EXP_LNKSTA, );
+
+   return err ? -EINVAL : sprintf(
+   buf, "%u\n",
+   (linkstat & PCI_EXP_LNKSTA_NLW) >> PCI_EXP_LNKSTA_NLW_SHIFT);
+}
+static DEVICE_ATTR_RO(current_link_width);
+
+static ssize_t secondary_bus_number_show(struct device *dev,
+struct device_attribute *attr,
+char *buf)
+{
+   struct pci_dev *pci_dev = to_pci_dev(dev);
+   u8 sec_bus;
+   int err;
+
+   err = pci_read_config_byte(pci_dev, PCI_SECONDARY_BUS, _bus);
+
+   return err ? -EINVAL : sprintf(buf, "%u\n", sec_bus);
+}
+static DEVICE_ATTR_RO(secondary_bus_number);
+
+static ssize_t subordinate_bus_number_show(struct device *dev,
+  struct device_attribute *attr,
+  char *buf)
+{
+   struct pci_dev *pci_dev = to_pci_dev(dev);
+   u8 sub_bus;
+   int err;
+
+   err = pci_read_config_byte(pci_dev, PCI_SUBORDINATE_BUS, _bus);
+
+   return err ? -EINVAL : sprintf(buf, "%u\n", sub_bus);
+}
+static DEVICE_ATTR_RO(subordinate_bus_number);
+
 static ssize_t modalias_show(struct device *dev, struct device_attribute *attr,
 char *buf)
 {
@@ -602,12 +723,17 @@ static struct attribute *pci_dev_attrs[] = {
NULL,
 };
 
-static const struct attribute_group pci_dev_group = {
-   .attrs = pci_dev_attrs,
+static struct attribute *pci_bridge_attrs[] = {
+   _attr_subordinate_bus_number.attr,
+   

Re: [PATCH] soc/tegra: pmc: Don't allocate struct tegra_powergate on stack

2017-04-16 Thread Viresh Kumar
On 21-03-17, 16:09, Viresh Kumar wrote:
> On 21-03-17, 10:37, Jon Hunter wrote:
> > 
> > On 21/03/17 05:24, Viresh Kumar wrote:
> > > The size of the struct tegra_powergate is quite big and if any more
> > > fields are added to the internal genpd structure, following warnings are
> > > thrown:
> > > 
> > > drivers/soc/tegra/pmc.c:577:1: warning: the frame size of 1176 bytes is 
> > > larger than 1024 bytes [-Wframe-larger-than=]
> > 
> > Hmmm ... AFAICT the size of the tegra_powergate struct is 312 bytes
> > (based upon next-20170321) and so it looks like something massive needs
> > to be added to the genpd struct to blow this up to over 1024 bytes. Are
> > there some genpd changes in-flight that are causing this?
> 
> https://marc.info/?l=linux-kernel=149000247329743=2
> 
> This is up for discussion right now though and we don't know if it
> will surely get merged or not.

@Jon: Regardless of the above series, do you want this patch to be merged, as it
will still be better to avoid keeping large structures on the stack?

Else I would be required to keep this in my above series from now on.

-- 
viresh


[PATCH] pci-sysfs: Make PCI bridge attribute visible in sysfs

2017-04-16 Thread Wong Vee Khee
From: vwong 

Export the PCIe link attributes of PCI bridges to sysfs.

Signed-off-by: Wong Vee Khee 
Signed-off-by: Hui Chun Ong 
---
 drivers/pci/pci-sysfs.c   | 197 +-
 include/uapi/linux/pci_regs.h |   4 +
 2 files changed, 197 insertions(+), 4 deletions(-)

diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index 25d010d..a218c43 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -154,6 +154,127 @@ static ssize_t resource_show(struct device *dev, struct 
device_attribute *attr,
 }
 static DEVICE_ATTR_RO(resource);
 
+static ssize_t max_link_speed_show(struct device *dev,
+  struct device_attribute *attr, char *buf)
+{
+   struct pci_dev *pci_dev = to_pci_dev(dev);
+   u32 linkcap;
+   int err;
+   const char *speed;
+
+   err = pcie_capability_read_dword(pci_dev, PCI_EXP_LNKCAP, );
+
+   if (!err && linkcap) {
+   switch (linkcap & PCI_EXP_LNKCAP_MLS) {
+   case PCI_EXP_LNKCAP_MLS_8_0GB:
+   speed = "8 GT/s";
+   break;
+   case PCI_EXP_LNKCAP_MLS_5_0GB:
+   speed = "5 GT/s";
+   break;
+   case PCI_EXP_LNKCAP_MLS_2_5GB:
+   speed = "2.5 GT/s";
+   break;
+   default:
+   speed = "Unknown speed";
+   }
+
+   return sprintf(buf, "%s\n", speed);
+   }
+
+   return -EINVAL;
+}
+static DEVICE_ATTR_RO(max_link_speed);
+
+static ssize_t max_link_width_show(struct device *dev,
+  struct device_attribute *attr, char *buf)
+{
+   struct pci_dev *pci_dev = to_pci_dev(dev);
+   u32 linkcap;
+   int err;
+
+   err = pcie_capability_read_dword(pci_dev, PCI_EXP_LNKCAP, );
+
+   return err ? -EINVAL : sprintf(
+   buf, "%u\n", (linkcap & PCI_EXP_LNKCAP_MLW) >> 4);
+}
+static DEVICE_ATTR_RO(max_link_width);
+
+static ssize_t current_link_speed_show(struct device *dev,
+  struct device_attribute *attr, char *buf)
+{
+   struct pci_dev *pci_dev = to_pci_dev(dev);
+   u16 linkstat;
+   int err;
+   const char *speed;
+
+   err = pcie_capability_read_word(pci_dev, PCI_EXP_LNKSTA, );
+
+   if (!err && linkstat) {
+   switch (linkstat & PCI_EXP_LNKSTA_CLS) {
+   case PCI_EXP_LNKSTA_CLS_8_0GB:
+   speed = "8 GT/s";
+   break;
+   case PCI_EXP_LNKSTA_CLS_5_0GB:
+   speed = "5 GT/s";
+   break;
+   case PCI_EXP_LNKSTA_CLS_2_5GB:
+   speed = "2.5 GT/s";
+   break;
+   default:
+   speed = "Unknown speed";
+   }
+
+   return sprintf(buf, "%s\n", speed);
+   }
+
+   return -EINVAL;
+}
+static DEVICE_ATTR_RO(current_link_speed);
+
+static ssize_t current_link_width_show(struct device *dev,
+  struct device_attribute *attr, char *buf)
+{
+   struct pci_dev *pci_dev = to_pci_dev(dev);
+   u16 linkstat;
+   int err;
+
+   err = pcie_capability_read_word(pci_dev, PCI_EXP_LNKSTA, );
+
+   return err ? -EINVAL : sprintf(
+   buf, "%u\n",
+   (linkstat & PCI_EXP_LNKSTA_NLW) >> PCI_EXP_LNKSTA_NLW_SHIFT);
+}
+static DEVICE_ATTR_RO(current_link_width);
+
+static ssize_t secondary_bus_number_show(struct device *dev,
+struct device_attribute *attr,
+char *buf)
+{
+   struct pci_dev *pci_dev = to_pci_dev(dev);
+   u8 sec_bus;
+   int err;
+
+   err = pci_read_config_byte(pci_dev, PCI_SECONDARY_BUS, _bus);
+
+   return err ? -EINVAL : sprintf(buf, "%u\n", sec_bus);
+}
+static DEVICE_ATTR_RO(secondary_bus_number);
+
+static ssize_t subordinate_bus_number_show(struct device *dev,
+  struct device_attribute *attr,
+  char *buf)
+{
+   struct pci_dev *pci_dev = to_pci_dev(dev);
+   u8 sub_bus;
+   int err;
+
+   err = pci_read_config_byte(pci_dev, PCI_SUBORDINATE_BUS, _bus);
+
+   return err ? -EINVAL : sprintf(buf, "%u\n", sub_bus);
+}
+static DEVICE_ATTR_RO(subordinate_bus_number);
+
 static ssize_t modalias_show(struct device *dev, struct device_attribute *attr,
 char *buf)
 {
@@ -602,12 +723,17 @@ static struct attribute *pci_dev_attrs[] = {
NULL,
 };
 
-static const struct attribute_group pci_dev_group = {
-   .attrs = pci_dev_attrs,
+static struct attribute *pci_bridge_attrs[] = {
+   _attr_subordinate_bus_number.attr,
+   _attr_secondary_bus_number.attr,
+   NULL,
 };
 
-const struct attribute_group 

Re: [PATCH] soc/tegra: pmc: Don't allocate struct tegra_powergate on stack

2017-04-16 Thread Viresh Kumar
On 21-03-17, 16:09, Viresh Kumar wrote:
> On 21-03-17, 10:37, Jon Hunter wrote:
> > 
> > On 21/03/17 05:24, Viresh Kumar wrote:
> > > The size of the struct tegra_powergate is quite big and if any more
> > > fields are added to the internal genpd structure, following warnings are
> > > thrown:
> > > 
> > > drivers/soc/tegra/pmc.c:577:1: warning: the frame size of 1176 bytes is 
> > > larger than 1024 bytes [-Wframe-larger-than=]
> > 
> > Hmmm ... AFAICT the size of the tegra_powergate struct is 312 bytes
> > (based upon next-20170321) and so it looks like something massive needs
> > to be added to the genpd struct to blow this up to over 1024 bytes. Are
> > there some genpd changes in-flight that are causing this?
> 
> https://marc.info/?l=linux-kernel=149000247329743=2
> 
> This is up for discussion right now though and we don't know if it
> will surely get merged or not.

@Jon: Regardless of the above series, do you want this patch to be merged, as it
will still be better to avoid keeping large structures on the stack?

Else I would be required to keep this in my above series from now on.

-- 
viresh


Re: [PATCH] MAINTAINERS: drop akarwar from mwifiex

2017-04-16 Thread amit karwar
On Thu, Apr 13, 2017 at 6:47 PM, Kalle Valo  wrote:
> Brian Norris  writes:
>
>> His email is bouncing, and I expect he's not doing this work any more.
>>
>> Cc: Amitkumar Karwar 
>> Cc: Nishant Sarmukadam 
>> Cc: Ganapathi Bhat 
>> Cc: Xinming Hu 
>> Signed-off-by: Brian Norris 
>> ---
>> Or alternatively, we can update his email address. I just don't want to keep
>> seeing bounced emails :)
>
> Me neither :) So Amitkumar, which one do you prefer?
>

Updating email address would be fine. Please replace
akar...@marvell.com with amitkar...@gmail.com

Regards,
Amitkumar Karwar


Re: [PATCH] MAINTAINERS: drop akarwar from mwifiex

2017-04-16 Thread amit karwar
On Thu, Apr 13, 2017 at 6:47 PM, Kalle Valo  wrote:
> Brian Norris  writes:
>
>> His email is bouncing, and I expect he's not doing this work any more.
>>
>> Cc: Amitkumar Karwar 
>> Cc: Nishant Sarmukadam 
>> Cc: Ganapathi Bhat 
>> Cc: Xinming Hu 
>> Signed-off-by: Brian Norris 
>> ---
>> Or alternatively, we can update his email address. I just don't want to keep
>> seeing bounced emails :)
>
> Me neither :) So Amitkumar, which one do you prefer?
>

Updating email address would be fine. Please replace
akar...@marvell.com with amitkar...@gmail.com

Regards,
Amitkumar Karwar


Re: your mail

2017-04-16 Thread Joonsoo Kim
On Sat, Apr 15, 2017 at 02:17:31PM +0200, Michal Hocko wrote:
> Hi,
> here are 3 more preparatory patches which I meant to send on Thursday but
> forgot... After more thinking about pfn walkers I have realized that
> the current code doesn't check offline holes in zones. From a quick
> review that doesn't seem to be a problem currently. Pfn walkers can race
> with memory offlining and with the original hotplug implementation those
> offline pages can change the zone but I wasn't able to find any serious
> problem other than small confusion. The new hotplug code, will not have
> any valid zone, though so those code paths should check PageReserved
> to rule offline holes. I hope I have addressed all of them in these 3
> patches. I would appreciate if Vlastimil and Jonsoo double check after
> me.

Hello, Michal.

s/Jonsoo/Joonsoo. :)

I'm not sure that it's a good idea to add a PageReserved() check in pfn
walkers. First, this makes the struct page validity check a two-step
process, pfn_valid() and then PageReserved(). If we should not use struct page
in this case, it's better for pfn_valid() to return false rather than
adding a separate check. Anyway, we need to fix more places (all pfn
walkers?) if we want to check validity in two steps.

The other problem I found is that your change will make some
contiguous zones be considered as non-contiguous. Memory allocated
by the memblock API is also marked as PageReserved. If we consider this as
a hole, we will set such a zone as non-contiguous.

And, I guess that it's not enough to check PageReserved() in
pageblock_pfn_to_page() in order to skip these pages in compaction. If
holes are in the middle of the pageblock, pageblock_pfn_to_page()
cannot catch it and compaction will use struct page for this hole.

Therefore, I think that making pfn_valid() return false for memory that
is not onlined is a better solution for this problem. I don't know the
implementation details for hotplug and I haven't seen your recent change,
but we may defer memmap initialization until the zone is determined.
It will make pfn_valid() return false for an un-initialized range.

Thanks.


Re: your mail

2017-04-16 Thread Joonsoo Kim
On Sat, Apr 15, 2017 at 02:17:31PM +0200, Michal Hocko wrote:
> Hi,
> here are 3 more preparatory patches which I meant to send on Thursday but
> forgot... After more thinking about pfn walkers I have realized that
> the current code doesn't check offline holes in zones. From a quick
> review that doesn't seem to be a problem currently. Pfn walkers can race
> with memory offlining and with the original hotplug implementation those
> offline pages can change the zone but I wasn't able to find any serious
> problem other than small confusion. The new hotplug code, will not have
> any valid zone, though so those code paths should check PageReserved
> to rule offline holes. I hope I have addressed all of them in these 3
> patches. I would appreciate if Vlastimil and Jonsoo double check after
> me.

Hello, Michal.

s/Jonsoo/Joonsoo. :)

I'm not sure that it's a good idea to add a PageReserved() check in pfn
walkers. First, this makes the struct page validity check a two-step
process, pfn_valid() and then PageReserved(). If we should not use struct page
in this case, it's better for pfn_valid() to return false rather than
adding a separate check. Anyway, we need to fix more places (all pfn
walkers?) if we want to check validity in two steps.

The other problem I found is that your change will make some
contiguous zones be considered as non-contiguous. Memory allocated
by the memblock API is also marked as PageReserved. If we consider this as
a hole, we will set such a zone as non-contiguous.

And, I guess that it's not enough to check PageReserved() in
pageblock_pfn_to_page() in order to skip these pages in compaction. If
holes are in the middle of the pageblock, pageblock_pfn_to_page()
cannot catch it and compaction will use struct page for this hole.

Therefore, I think that making pfn_valid() return false for memory that
is not onlined is a better solution for this problem. I don't know the
implementation details for hotplug and I haven't seen your recent change,
but we may defer memmap initialization until the zone is determined.
It will make pfn_valid() return false for an un-initialized range.

Thanks.


Re: [RFC/RFT][PATCH 1/2] cpufreq: schedutil: Use policy-dependent transition delays

2017-04-16 Thread Viresh Kumar
On 11-04-17, 00:20, Rafael J. Wysocki wrote:
> From: Rafael J. Wysocki 
> 
> Make the schedutil governor take the initial (default) value of the
> rate_limit_us sysfs attribute from the (new) transition_delay_us
> policy parameter (to be set by the scaling driver).
> 
> That will allow scaling drivers to make schedutil use smaller default
> values of rate_limit_us and reduce the default average time interval
> between consecutive frequency changes.
> 
> Make intel_pstate set transition_delay_us to 500.
> 
> Signed-off-by: Rafael J. Wysocki 
> ---
> 
> This is a replacement for https://patchwork.kernel.org/patch/9671831/

Acked-by: Viresh Kumar 

-- 
viresh


Re: [RFC/RFT][PATCH 1/2] cpufreq: schedutil: Use policy-dependent transition delays

2017-04-16 Thread Viresh Kumar
On 11-04-17, 00:20, Rafael J. Wysocki wrote:
> From: Rafael J. Wysocki 
> 
> Make the schedutil governor take the initial (default) value of the
> rate_limit_us sysfs attribute from the (new) transition_delay_us
> policy parameter (to be set by the scaling driver).
> 
> That will allow scaling drivers to make schedutil use smaller default
> values of rate_limit_us and reduce the default average time interval
> between consecutive frequency changes.
> 
> Make intel_pstate set transition_delay_us to 500.
> 
> Signed-off-by: Rafael J. Wysocki 
> ---
> 
> This is a replacement for https://patchwork.kernel.org/patch/9671831/

Acked-by: Viresh Kumar 

-- 
viresh


Re: [PATCH V4 1/9] PM / OPP: Allow OPP table to be used for power-domains

2017-04-16 Thread Viresh Kumar
On 13-04-17, 14:43, Sudeep Holla wrote:
> Interesting. My understanding of power domains, and in particular power
> domain performance, was that it would control both. The abstract number
> you introduce would hide clocks and regulators.
> 
> But if the concept treats it just as yet another regulator, why do we
> need these at all? Why don't we relate this performance to regulator
> values and be done with it?
> 
> Sorry if I am missing to understand something here. I would look this as
> replacement for both clocks and regulators, something similar to ACPI
> CPPC. If not, it looks unnecessary to me with the information I have got
> so far.

I kind of answered that in the other email.

Some background may be good here. So Qcom tried to solve all this with virtual
regulators, but the problem was that they need to talk in terms of integer
values (1, 2, 3..) and not voltages and so they can't use the regulator
framework straight away. And so we are doing all this.

-- 
viresh


Re: [PATCH V4 1/9] PM / OPP: Allow OPP table to be used for power-domains

2017-04-16 Thread Viresh Kumar
On 13-04-17, 14:43, Sudeep Holla wrote:
> Interesting. My understanding of power domains, and in particular power
> domain performance, was that it would control both. The abstract number
> you introduce would hide clocks and regulators.
> 
> But if the concept treats it just as yet another regulator, why do we
> need these at all? Why don't we relate this performance to regulator
> values and be done with it?
> 
> Sorry if I am missing to understand something here. I would look this as
> replacement for both clocks and regulators, something similar to ACPI
> CPPC. If not, it looks unnecessary to me with the information I have got
> so far.

I kind of answered that in the other email.

Some background may be good here. So Qcom tried to solve all this with virtual
regulators, but the problem was that they need to talk in terms of integer
values (1, 2, 3..) and not voltages and so they can't use the regulator
framework straight away. And so we are doing all this.

-- 
viresh


Re: [PATCH V2] mm/madvise: Move up the behavior parameter validation

2017-04-16 Thread Naoya Horiguchi
On Fri, Apr 14, 2017 at 07:21:41PM +0530, Anshuman Khandual wrote:
> The madvise_behavior_valid() function should be called before
> acting upon the behavior parameter. Hence move up the function.
> This also includes MADV_SOFT_OFFLINE and MADV_HWPOISON options
> as valid behavior parameter for the system call madvise().
> 
> Signed-off-by: Anshuman Khandual 
> ---
> Changes in V2:
> 
> Added CONFIG_MEMORY_FAILURE check before using MADV_SOFT_OFFLINE
> and MADV_HWPOISON constants.
> 
>  mm/madvise.c | 9 +++--
>  1 file changed, 7 insertions(+), 2 deletions(-)
> 
> diff --git a/mm/madvise.c b/mm/madvise.c
> index efd4721..ccff186 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -694,6 +694,10 @@ static int madvise_inject_error(int behavior,
>  #endif
>   case MADV_DONTDUMP:
>   case MADV_DODUMP:
> +#ifdef CONFIG_MEMORY_FAILURE
> + case MADV_SOFT_OFFLINE:
> + case MADV_HWPOISON:
> +#endif
>   return true;
>  
>   default:
> @@ -767,12 +771,13 @@ static int madvise_inject_error(int behavior,
>   size_t len;
>   struct blk_plug plug;
>  
> + if (!madvise_behavior_valid(behavior))
> + return error;
> +
>  #ifdef CONFIG_MEMORY_FAILURE
>   if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
>   return madvise_inject_error(behavior, start, start + len_in);
>  #endif
> - if (!madvise_behavior_valid(behavior))
> - return error;

Hi Anshuman,

I'm wondering why the current code calls madvise_inject_error() at the beginning
of SYSCALL_DEFINE3(madvise), without any boundary checks on the address or length.
I agree with checking madvise_behavior_valid() for MADV_{HWPOISON,SOFT_OFFLINE},
but checking the boundaries of the other arguments is also helpful, so how about
moving the existing #ifdef block down, as below?

diff --git a/mm/madvise.c b/mm/madvise.c
index a09d2d3dfae9..7b36912e1f4a 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -754,10 +754,6 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, 
len_in, int, behavior)
size_t len;
struct blk_plug plug;
 
-#ifdef CONFIG_MEMORY_FAILURE
-   if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
-   return madvise_inject_error(behavior, start, start+len_in);
-#endif
if (!madvise_behavior_valid(behavior))
return error;
 
@@ -777,6 +773,11 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, 
len_in, int, behavior)
if (end == start)
return error;
 
+#ifdef CONFIG_MEMORY_FAILURE
+   if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
+   return madvise_inject_error(behavior, start, start+len_in);
+#endif
+
write = madvise_need_mmap_write(behavior);
if (write) {
if (down_write_killable(>mm->mmap_sem))


Thanks,
Naoya Horiguchi


Re: [PATCH V2] mm/madvise: Move up the behavior parameter validation

2017-04-16 Thread Naoya Horiguchi
On Fri, Apr 14, 2017 at 07:21:41PM +0530, Anshuman Khandual wrote:
> The madvise_behavior_valid() function should be called before
> acting upon the behavior parameter. Hence move up the function.
> This also includes MADV_SOFT_OFFLINE and MADV_HWPOISON options
> as valid behavior parameter for the system call madvise().
> 
> Signed-off-by: Anshuman Khandual 
> ---
> Changes in V2:
> 
> Added CONFIG_MEMORY_FAILURE check before using MADV_SOFT_OFFLINE
> and MADV_HWPOISON constants.
> 
>  mm/madvise.c | 9 +++--
>  1 file changed, 7 insertions(+), 2 deletions(-)
> 
> diff --git a/mm/madvise.c b/mm/madvise.c
> index efd4721..ccff186 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -694,6 +694,10 @@ static int madvise_inject_error(int behavior,
>  #endif
>   case MADV_DONTDUMP:
>   case MADV_DODUMP:
> +#ifdef CONFIG_MEMORY_FAILURE
> + case MADV_SOFT_OFFLINE:
> + case MADV_HWPOISON:
> +#endif
>   return true;
>  
>   default:
> @@ -767,12 +771,13 @@ static int madvise_inject_error(int behavior,
>   size_t len;
>   struct blk_plug plug;
>  
> + if (!madvise_behavior_valid(behavior))
> + return error;
> +
>  #ifdef CONFIG_MEMORY_FAILURE
>   if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
>   return madvise_inject_error(behavior, start, start + len_in);
>  #endif
> - if (!madvise_behavior_valid(behavior))
> - return error;

Hi Anshuman,

I'm wondering why the current code calls madvise_inject_error() at the beginning
of SYSCALL_DEFINE3(madvise), without any boundary checks on the address or length.
I agree with checking madvise_behavior_valid() for MADV_{HWPOISON,SOFT_OFFLINE},
but checking the boundaries of the other arguments is also helpful, so how about
moving the existing #ifdef block down, as below?

diff --git a/mm/madvise.c b/mm/madvise.c
index a09d2d3dfae9..7b36912e1f4a 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -754,10 +754,6 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, 
len_in, int, behavior)
size_t len;
struct blk_plug plug;
 
-#ifdef CONFIG_MEMORY_FAILURE
-   if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
-   return madvise_inject_error(behavior, start, start+len_in);
-#endif
if (!madvise_behavior_valid(behavior))
return error;
 
@@ -777,6 +773,11 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, 
len_in, int, behavior)
if (end == start)
return error;
 
+#ifdef CONFIG_MEMORY_FAILURE
+   if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
+   return madvise_inject_error(behavior, start, start+len_in);
+#endif
+
write = madvise_need_mmap_write(behavior);
if (write) {
if (down_write_killable(&current->mm->mmap_sem))


Thanks,
Naoya Horiguchi


Re: [PATCH V4 1/9] PM / OPP: Allow OPP table to be used for power-domains

2017-04-16 Thread Viresh Kumar
On 13-04-17, 14:42, Sudeep Holla wrote:
> What I was referring is about power domain provider with multiple power
> domains(simply #power-domain-cells=<1> case as explained in the
> power-domain specification.

I am not sure if we should be looking to target such a situation for now, as
that would be like this:

Device controlled by Domain A. Domain A itself is controlled by Domain B and
Domain C.

Though we will end up converting the domain-performance-state property to an
array if that is required in near future.

> Yes. To simplify, why not we just have power-domain for a device and
> change state of that domain to change the performance of that device.

Consider this case to understand what I have in Mind.

The power domain have its states as A, B, C, D. There can be multiple devices
regulated by that domain and one of the devices have its power states as: A1,
A2, A3, B1, B2, B3, C1, C2, C3, D1, D2, D3 and all these states have different
frequency/voltages.

IOW, the devices can have regulators as well and may want to fine tune within
the domain performance-state.

> Then put this in the hierarchy. Something similar to what we already
> have with new domain-idle states. In that way, we can move any
> performance control to the domain and abstract the clocks and regulators
> from the devices as the first step and from the OSPM view if there's
> firmware support.
> 
> If we are looking this power-domains with performance as just some
> *advanced regulators*, I don't like the complexity added.

In the particular case I am trying to solve (Qcom), we have some sort of
regulators which are only programmed by a M3 core. The M3 core needs integer
numbers representing state we want the domain to be in and it will put the
regulators (or whatever) in a particular state.

-- 
viresh


Re: [PATCH V4 1/9] PM / OPP: Allow OPP table to be used for power-domains

2017-04-16 Thread Viresh Kumar
On 13-04-17, 14:42, Sudeep Holla wrote:
> What I was referring is about power domain provider with multiple power
> domains(simply #power-domain-cells=<1> case as explained in the
> power-domain specification.

I am not sure if we should be looking to target such a situation for now, as
that would be like this:

Device controlled by Domain A. Domain A itself is controlled by Domain B and
Domain C.

Though we will end up converting the domain-performance-state property to an
array if that is required in near future.

> Yes. To simplify, why not we just have power-domain for a device and
> change state of that domain to change the performance of that device.

Consider this case to understand what I have in Mind.

The power domain have its states as A, B, C, D. There can be multiple devices
regulated by that domain and one of the devices have its power states as: A1,
A2, A3, B1, B2, B3, C1, C2, C3, D1, D2, D3 and all these states have different
frequency/voltages.

IOW, the devices can have regulators as well and may want to fine tune within
the domain performance-state.

> Then put this in the hierarchy. Something similar to what we already
> have with new domain-idle states. In that way, we can move any
> performance control to the domain and abstract the clocks and regulators
> from the devices as the first step and from the OSPM view if there's
> firmware support.
> 
> If we are looking this power-domains with performance as just some
> *advanced regulators*, I don't like the complexity added.

In the particular case I am trying to solve (Qcom), we have some sort of
regulators which are only programmed by a M3 core. The M3 core needs integer
numbers representing state we want the domain to be in and it will put the
regulators (or whatever) in a particular state.

-- 
viresh


RE: [EXT] Re: [PATCH] MAINTAINERS: drop akarwar from mwifiex

2017-04-16 Thread Ganapathi Bhat
Hi Kalle/Brian,

> Brian Norris  writes:
> 
> > His email is bouncing, and I expect he's not doing this work any
> more.
> >
> > Cc: Amitkumar Karwar 
> > Cc: Nishant Sarmukadam 
> > Cc: Ganapathi Bhat 
> > Cc: Xinming Hu 
> > Signed-off-by: Brian Norris 
> > ---
> > Or alternatively, we can update his email address. I just don't want
> > to keep seeing bounced emails :)
> 
> Me neither :) So Amitkumar, which one do you prefer?
> 
Yes. Amit's official mail id (akar...@marvell.com) can be removed.
> --
> Kalle Valo


Regards,
Ganapathi


RE: [EXT] Re: [PATCH] MAINTAINERS: drop akarwar from mwifiex

2017-04-16 Thread Ganapathi Bhat
Hi Kalle/Brian,

> Brian Norris  writes:
> 
> > His email is bouncing, and I expect he's not doing this work any
> more.
> >
> > Cc: Amitkumar Karwar 
> > Cc: Nishant Sarmukadam 
> > Cc: Ganapathi Bhat 
> > Cc: Xinming Hu 
> > Signed-off-by: Brian Norris 
> > ---
> > Or alternatively, we can update his email address. I just don't want
> > to keep seeing bounced emails :)
> 
> Me neither :) So Amitkumar, which one do you prefer?
> 
Yes. Amit's official mail id (akar...@marvell.com) can be removed.
> --
> Kalle Valo


Regards,
Ganapathi


Re: [RFC 0/8] Copy Offload with Peer-to-Peer PCI Memory

2017-04-16 Thread Logan Gunthorpe


On 16/04/17 04:32 PM, Benjamin Herrenschmidt wrote:
>> I'll consider this. Given the fact I can use your existing
>> get_dev_pagemap infrastructure to look up the p2pmem device this
>> probably isn't as hard as I thought it would be anyway (we probably
>> don't even need a page flag). We'd just have to look up the dev_pagemap,
>> test if it's a p2pmem device, and if so, call a p2pmem_dma_map function
>> which could apply the offset or do any other arch specific logic (if
>> necessary).
> 
> I'm still not 100% why do you need a "p2mem device" mind you ...

Well, you don't "need" it but it is a design choice that I think makes a
lot of sense for the following reasons:

1) p2pmem is in fact a device on the pci bus. A pci driver will need to
set it up and create the device and thus it will have a natural parent
pci device. Instantiating a struct device for it means it will appear in
the device hierarchy and one can use that to reason about its position
in the topology.

2) In order to create the struct pages we use the ZONE_DEVICE
infrastructure which requires a struct device. (See
devm_memremap_pages.) This amazingly gets us the get_dev_pagemap
architecture which also uses a struct device. So by using a p2pmem
device we can go from struct page to struct device to p2pmem device
quickly and effortlessly.

3) You wouldn't want to use the pci's struct device because it doesn't
really describe what's going on. For example, there may be multiple
devices on the pci device in question: eg. an NVME card and some p2pmem.
Or it could be a NIC with some p2pmem. Or it could just be p2pmem by
itself. And the logic to figure out what memory is available and where
the address is will be non-standard so it's really straightforward to
have any pci driver just instantiate a p2pmem device.

It is probably worth you reading the RFC patches at this point to get a
better feel for this.

Logan


Re: [RFC 0/8] Copy Offload with Peer-to-Peer PCI Memory

2017-04-16 Thread Logan Gunthorpe


On 16/04/17 04:32 PM, Benjamin Herrenschmidt wrote:
>> I'll consider this. Given the fact I can use your existing
>> get_dev_pagemap infrastructure to look up the p2pmem device this
>> probably isn't as hard as I thought it would be anyway (we probably
>> don't even need a page flag). We'd just have to look up the dev_pagemap,
>> test if it's a p2pmem device, and if so, call a p2pmem_dma_map function
>> which could apply the offset or do any other arch specific logic (if
>> necessary).
> 
> I'm still not 100% why do you need a "p2mem device" mind you ...

Well, you don't "need" it but it is a design choice that I think makes a
lot of sense for the following reasons:

1) p2pmem is in fact a device on the pci bus. A pci driver will need to
set it up and create the device and thus it will have a natural parent
pci device. Instantiating a struct device for it means it will appear in
the device hierarchy and one can use that to reason about its position
in the topology.

2) In order to create the struct pages we use the ZONE_DEVICE
infrastructure which requires a struct device. (See
devm_memremap_pages.) This amazingly gets us the get_dev_pagemap
architecture which also uses a struct device. So by using a p2pmem
device we can go from struct page to struct device to p2pmem device
quickly and effortlessly.

3) You wouldn't want to use the pci's struct device because it doesn't
really describe what's going on. For example, there may be multiple
devices on the pci device in question: eg. an NVME card and some p2pmem.
Or it could be a NIC with some p2pmem. Or it could just be p2pmem by
itself. And the logic to figure out what memory is available and where
the address is will be non-standard so it's really straightforward to
have any pci driver just instantiate a p2pmem device.

It is probably worth you reading the RFC patches at this point to get a
better feel for this.

Logan


Re: [PATCH v2 0/15] [dt-bindings] [media] Add document file and driver for Sony CXD2880 DVB-T2/T tuner + demodulator

2017-04-16 Thread Takiguchi, Yasunari
On 2017/04/14 10:50, Takiguchi, Yasunari wrote:
> From: Yasunari Takiguchi 
> 
> Hi,
> 
> This is the patch series (version 2) of Sony CXD2880 DVB-T2/T tuner + 
> demodulator driver.
> The driver supports DVB-API and interfaces through SPI.
> 
> We have tested the driver on Raspberry Pi 3 and got picture and sound from a 
> media player.
> 
> Thanks,
> Takiguchi
> ---
>  Documentation/devicetree/bindings/media/spi/sony-cxd2880.txt|   14 
> ++
>  drivers/media/spi/cxd2880-spi.c | 728 
> 
>  drivers/media/dvb-frontends/cxd2880/cxd2880.h   |   46 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_common.c|   84 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_common.h|   86 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_io.c|   68 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_io.h|   62 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_stdlib.h|   35 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_stopwatch_port.c|   71 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_math.c  |   89 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_math.h  |   40 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_devio_spi.c |  147 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_devio_spi.h |   40 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_spi.h   |   51 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_spi_device.c|  130 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_spi_device.h|   45 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_dtv.h   |   50 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_tnrdmd.c| 3925 
> 
>  drivers/media/dvb-frontends/cxd2880/cxd2880_tnrdmd.h|  395 ++
>  drivers/media/dvb-frontends/cxd2880/cxd2880_tnrdmd_driver_version.h |   29 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_tnrdmd_mon.c|  207 ++
>  drivers/media/dvb-frontends/cxd2880/cxd2880_tnrdmd_mon.h|   52 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_integ.c |   99 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_integ.h |   44 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_top.c   | 1550 
> 
>  drivers/media/dvb-frontends/cxd2880/cxd2880_dvbt.h  |   91 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_tnrdmd_dvbt.c   | 1072 
> +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_tnrdmd_dvbt.h   |   62 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_integ_dvbt.c|  197 ++
>  drivers/media/dvb-frontends/cxd2880/cxd2880_integ_dvbt.h|   58 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_tnrdmd_dvbt_mon.c   | 1190 
> +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_tnrdmd_dvbt_mon.h   |  106 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_dvbt2.h |  402 
> 
>  drivers/media/dvb-frontends/cxd2880/cxd2880_tnrdmd_dvbt2.c  | 1309 
> ++
>  drivers/media/dvb-frontends/cxd2880/cxd2880_tnrdmd_dvbt2.h  |   82 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_integ_dvbt2.c   |  311 
> +++
>  drivers/media/dvb-frontends/cxd2880/cxd2880_integ_dvbt2.h   |   64 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_tnrdmd_dvbt2_mon.c  | 2523 
> 
>  drivers/media/dvb-frontends/cxd2880/cxd2880_tnrdmd_dvbt2_mon.h  |  170 ++
>  drivers/media/dvb-frontends/Makefile|1 +
>  drivers/media/dvb-frontends/cxd2880/Makefile|   21 
> +
>  drivers/media/spi/Makefile  |5 
> +
>  drivers/media/dvb-frontends/Kconfig |2 ++
>  drivers/media/dvb-frontends/cxd2880/Kconfig |6 
> ++
>  drivers/media/spi/Kconfig   |   14 
> ++
>  MAINTAINERS |9 
> +
> 
>  46 files changed, 15782 insertions(+)
> 
>  create mode 100644 
> Documentation/devicetree/bindings/media/spi/sony-cxd2880.txt
>  create mode 100644 drivers/media/spi/cxd2880-spi.c
>  create mode 100644 drivers/media/dvb-frontends/cxd2880/cxd2880.h
>  create mode 100644 drivers/media/dvb-frontends/cxd2880/cxd2880_common.c
>  create mode 100644 drivers/media/dvb-frontends/cxd2880/cxd2880_common.h
>  create mode 100644 drivers/media/dvb-frontends/cxd2880/cxd2880_io.c
>  create mode 100644 drivers/media/dvb-frontends/cxd2880/cxd2880_io.h
>  create mode 100644 drivers/media/dvb-frontends/cxd2880/cxd2880_stdlib.h
>  create mode 100644 
> 

Re: [PATCH v2 0/15] [dt-bindings] [media] Add document file and driver for Sony CXD2880 DVB-T2/T tuner + demodulator

2017-04-16 Thread Takiguchi, Yasunari
On 2017/04/14 10:50, Takiguchi, Yasunari wrote:
> From: Yasunari Takiguchi 
> 
> Hi,
> 
> This is the patch series (version 2) of Sony CXD2880 DVB-T2/T tuner + 
> demodulator driver.
> The driver supports DVB-API and interfaces through SPI.
> 
> We have tested the driver on Raspberry Pi 3 and got picture and sound from a 
> media player.
> 
> Thanks,
> Takiguchi
> ---
>  Documentation/devicetree/bindings/media/spi/sony-cxd2880.txt|   14 
> ++
>  drivers/media/spi/cxd2880-spi.c | 728 
> 
>  drivers/media/dvb-frontends/cxd2880/cxd2880.h   |   46 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_common.c|   84 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_common.h|   86 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_io.c|   68 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_io.h|   62 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_stdlib.h|   35 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_stopwatch_port.c|   71 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_math.c  |   89 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_math.h  |   40 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_devio_spi.c |  147 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_devio_spi.h |   40 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_spi.h   |   51 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_spi_device.c|  130 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_spi_device.h|   45 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_dtv.h   |   50 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_tnrdmd.c| 3925 
> 
>  drivers/media/dvb-frontends/cxd2880/cxd2880_tnrdmd.h|  395 ++
>  drivers/media/dvb-frontends/cxd2880/cxd2880_tnrdmd_driver_version.h |   29 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_tnrdmd_mon.c|  207 ++
>  drivers/media/dvb-frontends/cxd2880/cxd2880_tnrdmd_mon.h|   52 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_integ.c |   99 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_integ.h |   44 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_top.c   | 1550 
> 
>  drivers/media/dvb-frontends/cxd2880/cxd2880_dvbt.h  |   91 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_tnrdmd_dvbt.c   | 1072 
> +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_tnrdmd_dvbt.h   |   62 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_integ_dvbt.c|  197 ++
>  drivers/media/dvb-frontends/cxd2880/cxd2880_integ_dvbt.h|   58 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_tnrdmd_dvbt_mon.c   | 1190 
> +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_tnrdmd_dvbt_mon.h   |  106 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_dvbt2.h |  402 
> 
>  drivers/media/dvb-frontends/cxd2880/cxd2880_tnrdmd_dvbt2.c  | 1309 
> ++
>  drivers/media/dvb-frontends/cxd2880/cxd2880_tnrdmd_dvbt2.h  |   82 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_integ_dvbt2.c   |  311 
> +++
>  drivers/media/dvb-frontends/cxd2880/cxd2880_integ_dvbt2.h   |   64 +
>  drivers/media/dvb-frontends/cxd2880/cxd2880_tnrdmd_dvbt2_mon.c  | 2523 
> 
>  drivers/media/dvb-frontends/cxd2880/cxd2880_tnrdmd_dvbt2_mon.h  |  170 ++
>  drivers/media/dvb-frontends/Makefile|1 +
>  drivers/media/dvb-frontends/cxd2880/Makefile|   21 
> +
>  drivers/media/spi/Makefile  |5 
> +
>  drivers/media/dvb-frontends/Kconfig |2 ++
>  drivers/media/dvb-frontends/cxd2880/Kconfig |6 
> ++
>  drivers/media/spi/Kconfig   |   14 
> ++
>  MAINTAINERS |9 
> +
> 
>  46 files changed, 15782 insertions(+)
> 
>  create mode 100644 
> Documentation/devicetree/bindings/media/spi/sony-cxd2880.txt
>  create mode 100644 drivers/media/spi/cxd2880-spi.c
>  create mode 100644 drivers/media/dvb-frontends/cxd2880/cxd2880.h
>  create mode 100644 drivers/media/dvb-frontends/cxd2880/cxd2880_common.c
>  create mode 100644 drivers/media/dvb-frontends/cxd2880/cxd2880_common.h
>  create mode 100644 drivers/media/dvb-frontends/cxd2880/cxd2880_io.c
>  create mode 100644 drivers/media/dvb-frontends/cxd2880/cxd2880_io.h
>  create mode 100644 drivers/media/dvb-frontends/cxd2880/cxd2880_stdlib.h
>  create mode 100644 
> drivers/media/dvb-frontends/cxd2880/cxd2880_stopwatch_port.c
>  create 

Re: [PATCH v2 01/15] [dt-bindings] [media] Add document file for CXD2880 SPI I/F

2017-04-16 Thread Takiguchi, Yasunari
> From: Yasunari Takiguchi 
> 
> This is the document file for Sony CXD2880 DVB-T2/T tuner + demodulator.
> It contains the description of the SPI adapter binding.
> 
> Signed-off-by: Yasunari Takiguchi 
> Signed-off-by: Masayuki Yamamoto 
> Signed-off-by: Hideki Nozawa 
> Signed-off-by: Kota Yonezawa 
> Signed-off-by: Toshihiko Matsumoto 
> Signed-off-by: Satoshi Watanabe 
No changes since version 1, I should have carried the ack forward:

Acked-by: Rob Herring 


> ---
>  .../devicetree/bindings/media/spi/sony-cxd2880.txt | 14 
> ++
>  1 file changed, 14 insertions(+)
>  create mode 100644 
> Documentation/devicetree/bindings/media/spi/sony-cxd2880.txt
> 
> diff --git a/Documentation/devicetree/bindings/media/spi/sony-cxd2880.txt 
> b/Documentation/devicetree/bindings/media/spi/sony-cxd2880.txt
> new file mode 100644
> index ..fc5aa263abe5
> --- /dev/null
> +++ b/Documentation/devicetree/bindings/media/spi/sony-cxd2880.txt
> @@ -0,0 +1,14 @@
> +Sony CXD2880 DVB-T2/T tuner + demodulator driver SPI adapter
> +
> +Required properties:
> +- compatible: Should be "sony,cxd2880".
> +- reg: SPI chip select number for the device.
> +- spi-max-frequency: Maximum bus speed, should be set to <5500> (55MHz).
> +
> +Example:
> +
> +cxd2880@0 {
> + compatible = "sony,cxd2880";
> + reg = <0>; /* CE0 */
> + spi-max-frequency = <5500>; /* 55MHz */
> +};
> 

Takiguchi


Re: [PATCH v2 01/15] [dt-bindings] [media] Add document file for CXD2880 SPI I/F

2017-04-16 Thread Takiguchi, Yasunari
> From: Yasunari Takiguchi 
> 
> This is the document file for Sony CXD2880 DVB-T2/T tuner + demodulator.
> It contains the description of the SPI adapter binding.
> 
> Signed-off-by: Yasunari Takiguchi 
> Signed-off-by: Masayuki Yamamoto 
> Signed-off-by: Hideki Nozawa 
> Signed-off-by: Kota Yonezawa 
> Signed-off-by: Toshihiko Matsumoto 
> Signed-off-by: Satoshi Watanabe 
No changes since version 1, I should have carried the ack forward:

Acked-by: Rob Herring 


> ---
>  .../devicetree/bindings/media/spi/sony-cxd2880.txt | 14 
> ++
>  1 file changed, 14 insertions(+)
>  create mode 100644 
> Documentation/devicetree/bindings/media/spi/sony-cxd2880.txt
> 
> diff --git a/Documentation/devicetree/bindings/media/spi/sony-cxd2880.txt 
> b/Documentation/devicetree/bindings/media/spi/sony-cxd2880.txt
> new file mode 100644
> index ..fc5aa263abe5
> --- /dev/null
> +++ b/Documentation/devicetree/bindings/media/spi/sony-cxd2880.txt
> @@ -0,0 +1,14 @@
> +Sony CXD2880 DVB-T2/T tuner + demodulator driver SPI adapter
> +
> +Required properties:
> +- compatible: Should be "sony,cxd2880".
> +- reg: SPI chip select number for the device.
> +- spi-max-frequency: Maximum bus speed, should be set to <5500> (55MHz).
> +
> +Example:
> +
> +cxd2880@0 {
> + compatible = "sony,cxd2880";
> + reg = <0>; /* CE0 */
> + spi-max-frequency = <5500>; /* 55MHz */
> +};
> 

Takiguchi


Re: [PATCH v3] powerpc: mm: support ARCH_MMAP_RND_BITS

2017-04-16 Thread Bhupesh SHARMA
On Thu, Apr 13, 2017 at 12:39 PM, Balbir Singh  wrote:
>>>
>>> Yes. It was derived from TASK_SIZE :
>>>
>>> http://lxr.free-electrons.com/source/arch/powerpc/include/asm/processor.h#L105
>>>
>>
>> That is getting update to 128TB by default and conditionally to 512TB
>>
>
> Since this is compile time, we should probably keep the scope to 128TB
> for now and see if we want to change things at run time later, since
> the expansion is based on a hint. Suggestions?
>

I think this makes sense. If the conditional expansion to 512TB is
protected by a kconfig symbol, we can use the same to have separate
ranges for 128TB and 512TB using the kconfig symbol as the
differentiating factor.

Also please let me know which branch/tree to use where we are done
with the change making the default to 128TB. My v2 was based on
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
(master branch), where the TASK_SIZE is set to 46-bits inside
'arch/powerpc/include/asm/processor.h'.

So that I can spin the v3 accordingly.

Thanks,
Bhupesh


Re: [PATCH v3] powerpc: mm: support ARCH_MMAP_RND_BITS

2017-04-16 Thread Bhupesh SHARMA
On Thu, Apr 13, 2017 at 12:39 PM, Balbir Singh  wrote:
>>>
>>> Yes. It was derived from TASK_SIZE :
>>>
>>> http://lxr.free-electrons.com/source/arch/powerpc/include/asm/processor.h#L105
>>>
>>
>> That is getting update to 128TB by default and conditionally to 512TB
>>
>
> Since this is compile time, we should probably keep the scope to 128TB
> for now and see if we want to change things at run time later, since
> the expansion is based on a hint. Suggestions?
>

I think this makes sense. If the conditional expansion to 512TB is
protected by a kconfig symbol, we can use the same to have separate
ranges for 128TB and 512TB using the kconfig symbol as the
differentiating factor.

Also please let me know which branch/tree to use where we are done
with the change making the default to 128TB. My v2 was based on
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
(master branch), where the TASK_SIZE is set to 46-bits inside
'arch/powerpc/include/asm/processor.h'.

So that I can spin the v3 accordingly.

Thanks,
Bhupesh


Re: [RFC 0/1] add support for reclaiming priorities per mem cgroup

2017-04-16 Thread Minchan Kim
Hi Johannes,

On Thu, Apr 13, 2017 at 12:01:47PM -0400, Johannes Weiner wrote:
> On Thu, Apr 13, 2017 at 01:30:47PM +0900, Minchan Kim wrote:
> > On Thu, Mar 30, 2017 at 12:40:32PM -0700, Tim Murray wrote:
> > > As a result, I think there's still a need for relative priority
> > > between mem cgroups, not just an absolute limit.
> > > 
> > > Does that make sense?
> > 
> > I agree with it.
> > 
> > Recently, embedded platform's workload for smart things would be much
> > diverse(from game to alarm) so it's hard to handle the absolute limit
> > proactively and userspace has more hints about what workloads are
> > more important(ie, greedy) compared to others although it would be
> > harmful for something(e.g., it's not visible effect to user)
> > 
> > As a such point of view, I support this idea as basic approach.
> > And with thrashing detector from Johannes, we can do fine-tune of
> > LRU balancing and vmpressure shooting time better.
> > 
> > Johannes,
> > 
> > Do you have any concern about this memcg prority idea?
> 
> While I fully agree that relative priority levels would be easier to
> configure, this patch doesn't really do that. It allows you to set a
> scan window divider to a fixed amount and, as I already pointed out,
> the scan window is no longer representative of memory pressure.
> 
> [ Really, sc->priority should probably just be called LRU lookahead
>   factor or something, there is not much about it being representative
>   of any kind of urgency anymore. ]

I agree that sc->priority is not memory pressure indication.
I should have clarified my intention. Sorry about that.

I'm not saying I like this implementation as I mentioned with
previous reply.
http://lkml.kernel.org/r/20170322052013.GE30149@bbox

Just about general idea, in global OOM case, break proportional
reclaim and then preferring low-priority group's reclaim would be
good for some workload like current embedded platform. And to
achieve it, aging velocity control via scan window adjusting seems
to be reasonable.

> 
> With this patch, if you configure the priorities of two 8G groups to 0
> and 4, reclaim will treat them exactly the same*. If you configure the
> priorities of two 100G groups to 0 and 7, reclaim will treat them
> exactly the same. The bigger the group, the more of the lower range of
> the priority range becomes meaningless, because once the divider
> produces outcomes bigger than SWAP_CLUSTER_MAX(32), it doesn't
> actually bias reclaim anymore.

It seems it's the logic of memcg reclaim not global which is major
concern for current problem because there is no set up limitation for
each memcg.

> 
> So that's not a portable relative scale of pressure discrimination.
> 
> But the bigger problem with this is that, as sc->priority doesn't
> represent memory pressure anymore, it is merely a cut-off for which
> groups to scan and which groups not to scan *based on their size*.

Yes, because there is no measurable pressure concept in current VM
and you are trying to add the notion which is really good!

> 
> That is the same as setting memory.low!
> 
> * For simplicity, I'm glossing over the fact here that LRUs are split
>   by type and into inactive/active, so in reality the numbers are a
>   little different, but you get the point.
> 
> > Or
> > Do you think the patchset you are preparing solve this situation?
> 
> It's certainly a requirement. In order to implement a relative scale
> of memory pressure discrimination, we first need to be able to really
> quantify memory pressure.

Yeb. If we can get it, it would be better than unconditional
discriminated aging by the static priority which would leave
non-workingset pages in high priority group while workingset in
low-priority group would be evicted.

Rather than it, we ages every group's LRU fairly and if high-priority
group makes memory pressure beyond his threshold, VM should feedback
to low priority groups to be reclaimed more, which would be better.

> 
> Then we can either allow setting absolute latency/slowdown minimums
> for each group, with reclaim skipping groups above those thresholds,
> or we can map a relative priority scale against the total slowdown due
> to lack of memory in the system, and each group gets a relative share
> based on its priority compared to other groups.

Fully agreed.

> 
> But there is no way around first having a working measure of memory
> pressure before we can meaningfully distribute it among the groups.

Yeb. I'm looking forward to seeing it.

Thanks for the thoughtful comment, Johannes!


Re: [RFC 0/1] add support for reclaiming priorities per mem cgroup

2017-04-16 Thread Minchan Kim
Hi Johannes,

On Thu, Apr 13, 2017 at 12:01:47PM -0400, Johannes Weiner wrote:
> On Thu, Apr 13, 2017 at 01:30:47PM +0900, Minchan Kim wrote:
> > On Thu, Mar 30, 2017 at 12:40:32PM -0700, Tim Murray wrote:
> > > As a result, I think there's still a need for relative priority
> > > between mem cgroups, not just an absolute limit.
> > > 
> > > Does that make sense?
> > 
> > I agree with it.
> > 
> > Recently, embedded platform's workload for smart things would be much
> > diverse(from game to alarm) so it's hard to handle the absolute limit
> > proactively and userspace has more hints about what workloads are
> > more important(ie, greedy) compared to others although it would be
> > harmful for something(e.g., it's not visible effect to user)
> > 
> > As a such point of view, I support this idea as basic approach.
> > And with thrashing detector from Johannes, we can do fine-tune of
> > LRU balancing and vmpressure shooting time better.
> > 
> > Johannes,
> > 
> > Do you have any concern about this memcg prority idea?
> 
> While I fully agree that relative priority levels would be easier to
> configure, this patch doesn't really do that. It allows you to set a
> scan window divider to a fixed amount and, as I already pointed out,
> the scan window is no longer representative of memory pressure.
> 
> [ Really, sc->priority should probably just be called LRU lookahead
>   factor or something, there is not much about it being representative
>   of any kind of urgency anymore. ]

I agree that sc->priority is not memory pressure indication.
I should have clarified my intention. Sorry about that.

I'm not saying I like this implementation as I mentioned with
previous reply.
http://lkml.kernel.org/r/20170322052013.GE30149@bbox

Just about general idea, in global OOM case, break proportional
reclaim and then preferring low-priority group's reclaim would be
good for some workload like current embedded platform. And to
achieve it, aging velocity control via scan window adjusting seems
to be reasonable.

> 
> With this patch, if you configure the priorities of two 8G groups to 0
> and 4, reclaim will treat them exactly the same*. If you configure the
> priorities of two 100G groups to 0 and 7, reclaim will treat them
> exactly the same. The bigger the group, the more of the lower range of
> the priority range becomes meaningless, because once the divider
> produces outcomes bigger than SWAP_CLUSTER_MAX(32), it doesn't
> actually bias reclaim anymore.

It seems it's the logic of memcg reclaim not global which is major
concern for current problem because there is no set up limitation for
each memcg.

> 
> So that's not a portable relative scale of pressure discrimination.
> 
> But the bigger problem with this is that, as sc->priority doesn't
> represent memory pressure anymore, it is merely a cut-off for which
> groups to scan and which groups not to scan *based on their size*.

Yes, because there is no measurable pressure concept in current VM
and you are trying to add the notion which is really good!

> 
> That is the same as setting memory.low!
> 
> * For simplicity, I'm glossing over the fact here that LRUs are split
>   by type and into inactive/active, so in reality the numbers are a
>   little different, but you get the point.
> 
> > Or
> > Do you think the patchset you are preparing solve this situation?
> 
> It's certainly a requirement. In order to implement a relative scale
> of memory pressure discrimination, we first need to be able to really
> quantify memory pressure.

Yeb. If we can get it, it would be better than unconditional
discriminated aging by the static priority which would leave
non-workingset pages in high priority group while workingset in
low-priority group would be evicted.

Rather than it, we ages every group's LRU fairly and if high-priority
group makes memory pressure beyond his threshold, VM should feedback
to low priority groups to be reclaimed more, which would be better.

> 
> Then we can either allow setting absolute latency/slowdown minimums
> for each group, with reclaim skipping groups above those thresholds,
> or we can map a relative priority scale against the total slowdown due
> to lack of memory in the system, and each group gets a relative share
> based on its priority compared to other groups.

Fully agreed.

> 
> But there is no way around first having a working measure of memory
> pressure before we can meaningfully distribute it among the groups.

Yeb. I'm looking forward to seeing it.

Thanks for the thoughtful comment, Johannes!


Re: [patch 06/20] cpufreq: Use cpuhp_setup_state_nocalls_locked()

2017-04-16 Thread Viresh Kumar
On 15-04-17, 19:01, Thomas Gleixner wrote:
> From: Sebastian Andrzej Siewior 
> 
> cpufreq holds get_online_cpus() while invoking cpuhp_setup_state_nocalls()
> to make subsys_interface_register() and the registration of hotplug calls
> atomic versus cpu hotplug.
> 
> cpuhp_setup_state_nocalls() invokes get_online_cpus() as well. This is
> correct, but prevents the conversion of the hotplug locking to a percpu
> rwsem.
> 
> Use cpuhp_setup_state_nocalls_locked() to avoid the nested call.
> 
> Signed-off-by: Sebastian Andrzej Siewior 
> Signed-off-by: Thomas Gleixner 
> Cc: "Rafael J. Wysocki" 
> Cc: Viresh Kumar 
> Cc: linux...@vger.kernel.org
> 
> ---
>  drivers/cpufreq/cpufreq.c |9 +
>  1 file changed, 5 insertions(+), 4 deletions(-)
> 
> --- a/drivers/cpufreq/cpufreq.c
> +++ b/drivers/cpufreq/cpufreq.c
> @@ -2473,9 +2473,10 @@ int cpufreq_register_driver(struct cpufr
>   goto err_if_unreg;
>   }
>  
> - ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "cpufreq:online",
> - cpuhp_cpufreq_online,
> - cpuhp_cpufreq_offline);
> + ret = cpuhp_setup_state_nocalls_locked(CPUHP_AP_ONLINE_DYN,
> +"cpufreq:online",
> +cpuhp_cpufreq_online,
> +cpuhp_cpufreq_offline);
>   if (ret < 0)
>   goto err_if_unreg;
>   hp_online = ret;
> @@ -2519,7 +2520,7 @@ int cpufreq_unregister_driver(struct cpu
>   get_online_cpus();
>   subsys_interface_unregister(_interface);
>   remove_boost_sysfs_file();
> - cpuhp_remove_state_nocalls(hp_online);
> + cpuhp_remove_state_nocalls_locked(hp_online);
>  
>   write_lock_irqsave(_driver_lock, flags);

Acked-by: Viresh Kumar 

-- 
viresh


Re: [patch 06/20] cpufreq: Use cpuhp_setup_state_nocalls_locked()

2017-04-16 Thread Viresh Kumar
On 15-04-17, 19:01, Thomas Gleixner wrote:
> From: Sebastian Andrzej Siewior 
> 
> cpufreq holds get_online_cpus() while invoking cpuhp_setup_state_nocalls()
> to make subsys_interface_register() and the registration of hotplug calls
> atomic versus cpu hotplug.
> 
> cpuhp_setup_state_nocalls() invokes get_online_cpus() as well. This is
> correct, but prevents the conversion of the hotplug locking to a percpu
> rwsem.
> 
> Use cpuhp_setup_state_nocalls_locked() to avoid the nested call.
> 
> Signed-off-by: Sebastian Andrzej Siewior 
> Signed-off-by: Thomas Gleixner 
> Cc: "Rafael J. Wysocki" 
> Cc: Viresh Kumar 
> Cc: linux...@vger.kernel.org
> 
> ---
>  drivers/cpufreq/cpufreq.c |9 +
>  1 file changed, 5 insertions(+), 4 deletions(-)
> 
> --- a/drivers/cpufreq/cpufreq.c
> +++ b/drivers/cpufreq/cpufreq.c
> @@ -2473,9 +2473,10 @@ int cpufreq_register_driver(struct cpufr
>   goto err_if_unreg;
>   }
>  
> - ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "cpufreq:online",
> - cpuhp_cpufreq_online,
> - cpuhp_cpufreq_offline);
> + ret = cpuhp_setup_state_nocalls_locked(CPUHP_AP_ONLINE_DYN,
> +"cpufreq:online",
> +cpuhp_cpufreq_online,
> +cpuhp_cpufreq_offline);
>   if (ret < 0)
>   goto err_if_unreg;
>   hp_online = ret;
> @@ -2519,7 +2520,7 @@ int cpufreq_unregister_driver(struct cpu
>   get_online_cpus();
>   subsys_interface_unregister(_interface);
>   remove_boost_sysfs_file();
> - cpuhp_remove_state_nocalls(hp_online);
> + cpuhp_remove_state_nocalls_locked(hp_online);
>  
>   write_lock_irqsave(_driver_lock, flags);

Acked-by: Viresh Kumar 

-- 
viresh


Re: [PATCH v3 1/6] powerpc/perf: Define big-endian version of perf_mem_data_src

2017-04-16 Thread Madhavan Srinivasan



On Thursday 13 April 2017 06:08 PM, Peter Zijlstra wrote:

On Tue, Apr 11, 2017 at 07:21:05AM +0530, Madhavan Srinivasan wrote:

From: Sukadev Bhattiprolu 

perf_mem_data_src is a union that is initialized via the ->val field
and accessed via the bitmap fields. For this to work on big endian
platforms (which is broken now), we also need a big-endian representation
of perf_mem_data_src. I.e., in a big endian system, if the user requests
PERF_SAMPLE_DATA_SRC (perf report -d), they will get the default value from
perf_sample_data_init(), which is PERF_MEM_NA. The value for PERF_MEM_NA
is constructed using shifts:

   /* TLB access */
   #define PERF_MEM_TLB_NA  0x01 /* not available */
   ...
   #define PERF_MEM_TLB_SHIFT   26

   #define PERF_MEM_S(a, s) \
(((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT)

   #define PERF_MEM_NA (PERF_MEM_S(OP, NA)   |\
PERF_MEM_S(LVL, NA)   |\
PERF_MEM_S(SNOOP, NA) |\
PERF_MEM_S(LOCK, NA)  |\
PERF_MEM_S(TLB, NA))

Which works out as:

   ((0x01 << 0) | (0x01 << 5) | (0x01 << 19) | (0x01 << 24) | (0x01 << 26))

Which means the PERF_MEM_NA value comes out of the kernel as 0x5080021
in CPU endian.

But then in the perf tool, the code uses the bitfields to inspect the
value, and currently the bitfields are defined using little endian
ordering.

So eg. in perf_mem__tlb_scnprintf() we see:
   data_src->val = 0x5080021
  op = 0x0
 lvl = 0x0
   snoop = 0x0
lock = 0x0
dtlb = 0x0
rsvd = 0x5080021

Patch does a minimal fix of adding big endian definition of the bitfields
to match the values that are already exported by the kernel on big endian.
And it makes no change on little endian.

I think it is important to note that there are no current big-endian
users. So 'fixing' this will not break anybody and will ensure future
users (next patch) will work correctly.

Aside from that amendment,

Acked-by: Peter Zijlstra (Intel) 


Thanks
Maddy




Re: [PATCH v3 1/6] powerpc/perf: Define big-endian version of perf_mem_data_src

2017-04-16 Thread Madhavan Srinivasan



On Thursday 13 April 2017 06:08 PM, Peter Zijlstra wrote:

On Tue, Apr 11, 2017 at 07:21:05AM +0530, Madhavan Srinivasan wrote:

From: Sukadev Bhattiprolu 

perf_mem_data_src is a union that is initialized via the ->val field
and accessed via the bitmap fields. For this to work on big endian
platforms (which is broken now), we also need a big-endian representation
of perf_mem_data_src. I.e., in a big endian system, if the user requests
PERF_SAMPLE_DATA_SRC (perf report -d), they will get the default value from
perf_sample_data_init(), which is PERF_MEM_NA. The value for PERF_MEM_NA
is constructed using shifts:

   /* TLB access */
   #define PERF_MEM_TLB_NA  0x01 /* not available */
   ...
   #define PERF_MEM_TLB_SHIFT   26

   #define PERF_MEM_S(a, s) \
(((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT)

   #define PERF_MEM_NA (PERF_MEM_S(OP, NA)   |\
PERF_MEM_S(LVL, NA)   |\
PERF_MEM_S(SNOOP, NA) |\
PERF_MEM_S(LOCK, NA)  |\
PERF_MEM_S(TLB, NA))

Which works out as:

   ((0x01 << 0) | (0x01 << 5) | (0x01 << 19) | (0x01 << 24) | (0x01 << 26))

Which means the PERF_MEM_NA value comes out of the kernel as 0x5080021
in CPU endian.

But then in the perf tool, the code uses the bitfields to inspect the
value, and currently the bitfields are defined using little endian
ordering.

So eg. in perf_mem__tlb_scnprintf() we see:
   data_src->val = 0x5080021
  op = 0x0
 lvl = 0x0
   snoop = 0x0
lock = 0x0
dtlb = 0x0
rsvd = 0x5080021

Patch does a minimal fix of adding big endian definition of the bitfields
to match the values that are already exported by the kernel on big endian.
And it makes no change on little endian.

I think it is important to note that there are no current big-endian
users. So 'fixing' this will not break anybody and will ensure future
users (next patch) will work correctly.

Aside from that amendment,

Acked-by: Peter Zijlstra (Intel) 


Thanks
Maddy




Re: [PATCH v3 1/6] powerpc/perf: Define big-endian version of perf_mem_data_src

2017-04-16 Thread Madhavan Srinivasan



On Thursday 13 April 2017 06:53 PM, Michael Ellerman wrote:

Peter Zijlstra  writes:


On Tue, Apr 11, 2017 at 07:21:05AM +0530, Madhavan Srinivasan wrote:

From: Sukadev Bhattiprolu 

perf_mem_data_src is a union that is initialized via the ->val field
and accessed via the bitmap fields. For this to work on big endian
platforms (which is broken now), we also need a big-endian representation
of perf_mem_data_src. I.e., in a big endian system, if the user requests
PERF_SAMPLE_DATA_SRC (perf report -d), they will get the default value from
perf_sample_data_init(), which is PERF_MEM_NA. The value for PERF_MEM_NA
is constructed using shifts:

   /* TLB access */
   #define PERF_MEM_TLB_NA  0x01 /* not available */
   ...
   #define PERF_MEM_TLB_SHIFT   26

   #define PERF_MEM_S(a, s) \
(((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT)

   #define PERF_MEM_NA (PERF_MEM_S(OP, NA)   |\
PERF_MEM_S(LVL, NA)   |\
PERF_MEM_S(SNOOP, NA) |\
PERF_MEM_S(LOCK, NA)  |\
PERF_MEM_S(TLB, NA))

Which works out as:

   ((0x01 << 0) | (0x01 << 5) | (0x01 << 19) | (0x01 << 24) | (0x01 << 26))

Which means the PERF_MEM_NA value comes out of the kernel as 0x5080021
in CPU endian.

But then in the perf tool, the code uses the bitfields to inspect the
value, and currently the bitfields are defined using little endian
ordering.

So eg. in perf_mem__tlb_scnprintf() we see:
   data_src->val = 0x5080021
  op = 0x0
 lvl = 0x0
   snoop = 0x0
lock = 0x0
dtlb = 0x0
rsvd = 0x5080021

Patch does a minimal fix of adding big endian definition of the bitfields
to match the values that are already exported by the kernel on big endian.
And it makes no change on little endian.

I think it is important to note that there are no current big-endian
users. So 'fixing' this will not break anybody and will ensure future
users (next patch) will work correctly.

Sure I'll fold in something along those lines.


Thanks mpe.

Maddy




Aside from that amendment,

Acked-by: Peter Zijlstra (Intel) 

Thanks.

cheers





Re: [PATCH v3 1/6] powerpc/perf: Define big-endian version of perf_mem_data_src

2017-04-16 Thread Madhavan Srinivasan



On Thursday 13 April 2017 06:53 PM, Michael Ellerman wrote:

Peter Zijlstra  writes:


On Tue, Apr 11, 2017 at 07:21:05AM +0530, Madhavan Srinivasan wrote:

From: Sukadev Bhattiprolu 

perf_mem_data_src is a union that is initialized via the ->val field
and accessed via the bitmap fields. For this to work on big endian
platforms (which is broken now), we also need a big-endian representation
of perf_mem_data_src. I.e., in a big endian system, if the user requests
PERF_SAMPLE_DATA_SRC (perf report -d), they will get the default value from
perf_sample_data_init(), which is PERF_MEM_NA. The value for PERF_MEM_NA
is constructed using shifts:

   /* TLB access */
   #define PERF_MEM_TLB_NA  0x01 /* not available */
   ...
   #define PERF_MEM_TLB_SHIFT   26

   #define PERF_MEM_S(a, s) \
(((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT)

   #define PERF_MEM_NA (PERF_MEM_S(OP, NA)   |\
PERF_MEM_S(LVL, NA)   |\
PERF_MEM_S(SNOOP, NA) |\
PERF_MEM_S(LOCK, NA)  |\
PERF_MEM_S(TLB, NA))

Which works out as:

   ((0x01 << 0) | (0x01 << 5) | (0x01 << 19) | (0x01 << 24) | (0x01 << 26))

Which means the PERF_MEM_NA value comes out of the kernel as 0x5080021
in CPU endian.

But then in the perf tool, the code uses the bitfields to inspect the
value, and currently the bitfields are defined using little endian
ordering.

So eg. in perf_mem__tlb_scnprintf() we see:
   data_src->val = 0x5080021
  op = 0x0
 lvl = 0x0
   snoop = 0x0
lock = 0x0
dtlb = 0x0
rsvd = 0x5080021

Patch does a minimal fix of adding big endian definition of the bitfields
to match the values that are already exported by the kernel on big endian.
And it makes no change on little endian.

I think it is important to note that there are no current big-endian
users. So 'fixing' this will not break anybody and will ensure future
users (next patch) will work correctly.

Sure I'll fold in something along those lines.


Thanks mpe.

Maddy




Aside from that amendment,

Acked-by: Peter Zijlstra (Intel) 

Thanks.

cheers





Re: [PATCH v1 1/1] mtd: mtk-nor: set controller's address width according to nor flash

2017-04-16 Thread Guochun Mao
Hi Cyrille,

On Sun, 2017-04-16 at 19:18 +0200, Cyrille Pitchen wrote:
> Le 13/04/2017 à 10:24, Cyrille Pitchen a écrit :
> > Hi Guochun,
> > 
> > Le 13/04/2017 à 04:40, Guochun Mao a écrit :
> >> Hi Cyrille,
> >>
> >> On Wed, 2017-04-12 at 22:57 +0200, Cyrille Pitchen wrote:
> >>> Hi Guochun,
> >>>
> >>> Le 05/04/2017 à 10:37, Guochun Mao a écrit :
>  When nor's size larger than 16MByte, nor's address width maybe
>  set to 3 or 4, and controller should change address width according
>  to nor's setting.
> 
>  Signed-off-by: Guochun Mao st
> > 
> > Acked-by: Cyrille Pitchen 
> > 
> 
> Applied to github/spi-nor

Thank you very much!

Best regards,
Guochun



Re: [PATCH v1 1/1] mtd: mtk-nor: set controller's address width according to nor flash

2017-04-16 Thread Guochun Mao
Hi Cyrille,

On Sun, 2017-04-16 at 19:18 +0200, Cyrille Pitchen wrote:
> Le 13/04/2017 à 10:24, Cyrille Pitchen a écrit :
> > Hi Guochun,
> > 
> > Le 13/04/2017 à 04:40, Guochun Mao a écrit :
> >> Hi Cyrille,
> >>
> >> On Wed, 2017-04-12 at 22:57 +0200, Cyrille Pitchen wrote:
> >>> Hi Guochun,
> >>>
> >>> Le 05/04/2017 à 10:37, Guochun Mao a écrit :
>  When nor's size larger than 16MByte, nor's address width maybe
>  set to 3 or 4, and controller should change address width according
>  to nor's setting.
> 
>  Signed-off-by: Guochun Mao st
> > 
> > Acked-by: Cyrille Pitchen 
> > 
> 
> Applied to github/spi-nor

Thank you very much!

Best regards,
Guochun



Re: [virtio-dev] Re: [PATCH v9 2/5] virtio-balloon: VIRTIO_BALLOON_F_BALLOON_CHUNKS

2017-04-16 Thread Wei Wang

On 04/15/2017 05:38 AM, Michael S. Tsirkin wrote:

On Fri, Apr 14, 2017 at 04:37:52PM +0800, Wei Wang wrote:

On 04/14/2017 12:34 AM, Michael S. Tsirkin wrote:

On Thu, Apr 13, 2017 at 05:35:05PM +0800, Wei Wang wrote:

So we don't need the bitmap to talk to host, it is just
a data structure we chose to maintain lists of pages, right?

Right. bitmap is the way to gather pages to chunk.
It's only needed in the balloon page case.
For the unused page case, we don't need it, since the free
page blocks are already chunks.


OK as far as it goes but you need much better isolation for it.
Build a data structure with APIs such as _init, _cleanup, _add, _clear,
_find_first, _find_next.
Completely unrelated to pages, it just maintains bits.
Then use it here.



   static int oom_pages = OOM_VBALLOON_DEFAULT_PAGES;
   module_param(oom_pages, int, S_IRUSR | S_IWUSR);
   MODULE_PARM_DESC(oom_pages, "pages to free on OOM");
@@ -50,6 +54,10 @@ MODULE_PARM_DESC(oom_pages, "pages to free on OOM");
   static struct vfsmount *balloon_mnt;
   #endif
+/* Types of pages to chunk */
+#define PAGE_CHUNK_TYPE_BALLOON 0
+

Doesn't look like you are ever adding more types in this
patchset.  Pls keep code simple, generalize it later.


"#define PAGE_CHUNK_TYPE_UNUSED 1" is added in another patch.

I would say add the extra code there too. Or maybe we can avoid
adding it altogether.


I'm trying to have the two features (i.e. "balloon pages" and
"unused pages") decoupled while trying to use common functions
to deal with the commonalities. That's the reason to define
the above macro.
Without the macro, we will need to have separate functions,
for example, instead of one "add_one_chunk()", we need to
have add_one_balloon_page_chunk() and
add_one_unused_page_chunk(),
and some of the implementations will be kind of duplicate in the
two functions.
Probably we can add it when the second feature comes to
the code.




Types of page to chunk are treated differently. Different types of page
chunks are sent to the host via different protocols.

1) PAGE_CHUNK_TYPE_BALLOON: Ballooned (i.e. inflated/deflated) pages
to chunk.  For the ballooned type, it uses the basic chunk msg format:

virtio_balloon_page_chunk_hdr +
virtio_balloon_page_chunk * MAX_PAGE_CHUNKS

2) PAGE_CHUNK_TYPE_UNUSED: unused pages to chunk. It uses this miscq msg
format:
miscq_hdr +
virtio_balloon_page_chunk_hdr +
virtio_balloon_page_chunk * MAX_PAGE_CHUNKS

The chunk msg is actually the payload of the miscq msg.



So just combine the two message formats and then it'll all be easier?



Yes, it'll be simple with only one msg format. But the problem I see
here is that miscq hdr is something necessary for the "unused page"
usage, but not needed by the "balloon page" usage. To be more
precise,
struct virtio_balloon_miscq_hdr {
 __le16 cmd;
 __le16 flags;
};
'cmd' specifies the command from the miscq (I envision that
miscq will be further used to handle other possible miscellaneous
requests either from the host or to the host), so 'cmd' is necessary
for the miscq. But the inflateq is exclusively used for inflating
pages, so adding a command to it would be redundant and look a little
bewildered there.
'flags': We currently use bit 0 of flags to indicate the completion
ofa command, this is also useful in the "unused page" usage, and not
needed by the "balloon page" usage.

+#define MAX_PAGE_CHUNKS 4096

This is an order-4 allocation. I'd make it 4095 and then it's
an order-3 one.

Sounds good, thanks.
I think it would be better to make it 4090. Leave some space for the hdr
as well.

And miscq hdr. In fact just let compiler do the math - something like:
(8 * PAGE_SIZE - sizeof(hdr)) / sizeof(chunk)

Agree, thanks.



I skimmed explanation of algorithms below but please make sure
code speaks for itself and add comments inline to document it.
Whenever you answered me inline this is where you want to
try to make code clearer and add comments.

Also, pls find ways to abstract the data structure so we don't
need to deal with its internals all over the code.





   {
struct scatterlist sg;
+   struct virtio_balloon_page_chunk_hdr *hdr;
+   void *buf;
unsigned int len;
-   sg_init_one(, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns);
+   switch (type) {
+   case PAGE_CHUNK_TYPE_BALLOON:
+   hdr = vb->balloon_page_chunk_hdr;
+   len = 0;
+   break;
+   default:
+   dev_warn(>vdev->dev, "%s: chunk %d of unknown pages\n",
+__func__, type);
+   return;
+   }
-   /* We should always be able to add one buffer to an empty queue. */
-   virtqueue_add_outbuf(vq, , 1, vb, GFP_KERNEL);
-   virtqueue_kick(vq);
+   buf = (void *)hdr - len;

Moving back to before the header? How can this make sense?
It works fine since len is 0, so just buf = hdr.


For the unused page chunk case, it follows its own protocol:
miscq_hdr + payload(chunk msg).
  "buf = (void 

Re: [virtio-dev] Re: [PATCH v9 2/5] virtio-balloon: VIRTIO_BALLOON_F_BALLOON_CHUNKS

2017-04-16 Thread Wei Wang

On 04/15/2017 05:38 AM, Michael S. Tsirkin wrote:

On Fri, Apr 14, 2017 at 04:37:52PM +0800, Wei Wang wrote:

On 04/14/2017 12:34 AM, Michael S. Tsirkin wrote:

On Thu, Apr 13, 2017 at 05:35:05PM +0800, Wei Wang wrote:

So we don't need the bitmap to talk to host, it is just
a data structure we chose to maintain lists of pages, right?

Right. bitmap is the way to gather pages to chunk.
It's only needed in the balloon page case.
For the unused page case, we don't need it, since the free
page blocks are already chunks.


OK as far as it goes but you need much better isolation for it.
Build a data structure with APIs such as _init, _cleanup, _add, _clear,
_find_first, _find_next.
Completely unrelated to pages, it just maintains bits.
Then use it here.



   static int oom_pages = OOM_VBALLOON_DEFAULT_PAGES;
   module_param(oom_pages, int, S_IRUSR | S_IWUSR);
   MODULE_PARM_DESC(oom_pages, "pages to free on OOM");
@@ -50,6 +54,10 @@ MODULE_PARM_DESC(oom_pages, "pages to free on OOM");
   static struct vfsmount *balloon_mnt;
   #endif
+/* Types of pages to chunk */
+#define PAGE_CHUNK_TYPE_BALLOON 0
+

Doesn't look like you are ever adding more types in this
patchset.  Pls keep code simple, generalize it later.


"#define PAGE_CHUNK_TYPE_UNUSED 1" is added in another patch.

I would say add the extra code there too. Or maybe we can avoid
adding it altogether.


I'm trying to have the two features (i.e. "balloon pages" and
"unused pages") decoupled while trying to use common functions
to deal with the commonalities. That's the reason to define
the above macro.
Without the macro, we will need to have separate functions,
for example, instead of one "add_one_chunk()", we need to
have add_one_balloon_page_chunk() and
add_one_unused_page_chunk(),
and some of the implementations will be kind of duplicate in the
two functions.
Probably we can add it when the second feature comes to
the code.




Types of page to chunk are treated differently. Different types of page
chunks are sent to the host via different protocols.

1) PAGE_CHUNK_TYPE_BALLOON: Ballooned (i.e. inflated/deflated) pages
to chunk.  For the ballooned type, it uses the basic chunk msg format:

virtio_balloon_page_chunk_hdr +
virtio_balloon_page_chunk * MAX_PAGE_CHUNKS

2) PAGE_CHUNK_TYPE_UNUSED: unused pages to chunk. It uses this miscq msg
format:
miscq_hdr +
virtio_balloon_page_chunk_hdr +
virtio_balloon_page_chunk * MAX_PAGE_CHUNKS

The chunk msg is actually the payload of the miscq msg.



So just combine the two message formats and then it'll all be easier?



Yes, it'll be simple with only one msg format. But the problem I see
here is that miscq hdr is something necessary for the "unused page"
usage, but not needed by the "balloon page" usage. To be more
precise,
struct virtio_balloon_miscq_hdr {
 __le16 cmd;
 __le16 flags;
};
'cmd' specifies the command from the miscq (I envision that
miscq will be further used to handle other possible miscellaneous
requests either from the host or to the host), so 'cmd' is necessary
for the miscq. But the inflateq is exclusively used for inflating
pages, so adding a command to it would be redundant and look a little
bewildered there.
'flags': We currently use bit 0 of flags to indicate the completion
ofa command, this is also useful in the "unused page" usage, and not
needed by the "balloon page" usage.

+#define MAX_PAGE_CHUNKS 4096

This is an order-4 allocation. I'd make it 4095 and then it's
an order-3 one.

Sounds good, thanks.
I think it would be better to make it 4090. Leave some space for the hdr
as well.

And miscq hdr. In fact just let compiler do the math - something like:
(8 * PAGE_SIZE - sizeof(hdr)) / sizeof(chunk)

Agree, thanks.



I skimmed explanation of algorithms below but please make sure
code speaks for itself and add comments inline to document it.
Whenever you answered me inline this is where you want to
try to make code clearer and add comments.

Also, pls find ways to abstract the data structure so we don't
need to deal with its internals all over the code.





   {
struct scatterlist sg;
+   struct virtio_balloon_page_chunk_hdr *hdr;
+   void *buf;
unsigned int len;
-   sg_init_one(, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns);
+   switch (type) {
+   case PAGE_CHUNK_TYPE_BALLOON:
+   hdr = vb->balloon_page_chunk_hdr;
+   len = 0;
+   break;
+   default:
+   dev_warn(>vdev->dev, "%s: chunk %d of unknown pages\n",
+__func__, type);
+   return;
+   }
-   /* We should always be able to add one buffer to an empty queue. */
-   virtqueue_add_outbuf(vq, , 1, vb, GFP_KERNEL);
-   virtqueue_kick(vq);
+   buf = (void *)hdr - len;

Moving back to before the header? How can this make sense?
It works fine since len is 0, so just buf = hdr.


For the unused page chunk case, it follows its own protocol:
miscq_hdr + payload(chunk msg).
  "buf = (void 

Re: [PATCH V5 1/4] arm64: dts: Add basic DT to support Spreadtrum's SP9860G

2017-04-16 Thread Chunyan Zhang
Hi Arnd,

Could you please take this patch through arm-soc git if there are no
further comments? (The three other patches in this series have been
taken by Greg.)

Thanks,
Chunyan

On 27 March 2017 at 15:32, Chunyan Zhang  wrote:
> From: Orson Zhai 
>
> SC9860G is a 8 cores of A53 SoC with 4G LTE support SoC from Spreadtrum.
>
> According to regular hierarchy of sprd dts, whale2.dtsi contains SoC
> peripherals IP nodes, sc9860.dtsi contains stuff related to ARM core stuff
> and sp9860g dts is for the board level.
>
> Signed-off-by: Orson Zhai 
> Signed-off-by: Chunyan Zhang 
> Reviewed-by: Mathieu Poirier 
> ---
>  arch/arm64/boot/dts/sprd/Makefile |   3 +-
>  arch/arm64/boot/dts/sprd/sc9860.dtsi  | 569 
> ++
>  arch/arm64/boot/dts/sprd/sp9860g-1h10.dts |  56 +++
>  arch/arm64/boot/dts/sprd/whale2.dtsi  |  71 
>  4 files changed, 698 insertions(+), 1 deletion(-)
>  create mode 100644 arch/arm64/boot/dts/sprd/sc9860.dtsi
>  create mode 100644 arch/arm64/boot/dts/sprd/sp9860g-1h10.dts
>  create mode 100644 arch/arm64/boot/dts/sprd/whale2.dtsi
>
> diff --git a/arch/arm64/boot/dts/sprd/Makefile 
> b/arch/arm64/boot/dts/sprd/Makefile
> index b658c5e..f0535e6 100644
> --- a/arch/arm64/boot/dts/sprd/Makefile
> +++ b/arch/arm64/boot/dts/sprd/Makefile
> @@ -1,4 +1,5 @@
> -dtb-$(CONFIG_ARCH_SPRD) += sc9836-openphone.dtb
> +dtb-$(CONFIG_ARCH_SPRD) += sc9836-openphone.dtb \
> +   sp9860g-1h10.dtb
>
>  always := $(dtb-y)
>  subdir-y   := $(dts-dirs)
> diff --git a/arch/arm64/boot/dts/sprd/sc9860.dtsi 
> b/arch/arm64/boot/dts/sprd/sc9860.dtsi
> new file mode 100644
> index 000..7b7d8ce
> --- /dev/null
> +++ b/arch/arm64/boot/dts/sprd/sc9860.dtsi
> @@ -0,0 +1,569 @@
> +/*
> + * Spreadtrum SC9860 SoC
> + *
> + * Copyright (C) 2016, Spreadtrum Communications Inc.
> + *
> + * SPDX-License-Identifier: (GPL-2.0+ OR MIT)
> + */
> +
> +#include 
> +#include "whale2.dtsi"
> +
> +/ {
> +   cpus {
> +   #address-cells = <2>;
> +   #size-cells = <0>;
> +
> +   cpu-map {
> +   cluster0 {
> +   core0 {
> +   cpu = <>;
> +   };
> +   core1 {
> +   cpu = <>;
> +   };
> +   core2 {
> +   cpu = <>;
> +   };
> +   core3 {
> +   cpu = <>;
> +   };
> +   };
> +
> +   cluster1 {
> +   core0 {
> +   cpu = <>;
> +   };
> +   core1 {
> +   cpu = <>;
> +   };
> +   core2 {
> +   cpu = <>;
> +   };
> +   core3 {
> +   cpu = <>;
> +   };
> +   };
> +   };
> +
> +   CPU0: cpu@53 {
> +   device_type = "cpu";
> +   compatible = "arm,cortex-a53", "arm,armv8";
> +   reg = <0x0 0x53>;
> +   enable-method = "psci";
> +   cpu-idle-states = <_PD _PD>;
> +   };
> +
> +   CPU1: cpu@530001 {
> +   device_type = "cpu";
> +   compatible = "arm,cortex-a53", "arm,armv8";
> +   reg = <0x0 0x530001>;
> +   enable-method = "psci";
> +   cpu-idle-states = <_PD _PD>;
> +   };
> +
> +   CPU2: cpu@530002 {
> +   device_type = "cpu";
> +   compatible = "arm,cortex-a53", "arm,armv8";
> +   reg = <0x0 0x530002>;
> +   enable-method = "psci";
> +   cpu-idle-states = <_PD _PD>;
> +   };
> +
> +   CPU3: cpu@530003 {
> +   device_type = "cpu";
> +   compatible = "arm,cortex-a53", "arm,armv8";
> +   reg = <0x0 0x530003>;
> +   enable-method = "psci";
> +   cpu-idle-states = <_PD _PD>;
> +   };
> +
> +   CPU4: cpu@530100 {
> +   device_type = "cpu";
> +   compatible = "arm,cortex-a53", "arm,armv8";
> +   reg = <0x0 0x530100>;
> +

Re: [PATCH V5 1/4] arm64: dts: Add basic DT to support Spreadtrum's SP9860G

2017-04-16 Thread Chunyan Zhang
Hi Arnd,

Could you please take this patch through arm-soc git if there are no
further comments? (The three other patches in this series have been
taken by Greg.)

Thanks,
Chunyan

On 27 March 2017 at 15:32, Chunyan Zhang  wrote:
> From: Orson Zhai 
>
> SC9860G is a 8 cores of A53 SoC with 4G LTE support SoC from Spreadtrum.
>
> According to regular hierarchy of sprd dts, whale2.dtsi contains SoC
> peripherals IP nodes, sc9860.dtsi contains stuff related to ARM core stuff
> and sp9860g dts is for the board level.
>
> Signed-off-by: Orson Zhai 
> Signed-off-by: Chunyan Zhang 
> Reviewed-by: Mathieu Poirier 
> ---
>  arch/arm64/boot/dts/sprd/Makefile |   3 +-
>  arch/arm64/boot/dts/sprd/sc9860.dtsi  | 569 
> ++
>  arch/arm64/boot/dts/sprd/sp9860g-1h10.dts |  56 +++
>  arch/arm64/boot/dts/sprd/whale2.dtsi  |  71 
>  4 files changed, 698 insertions(+), 1 deletion(-)
>  create mode 100644 arch/arm64/boot/dts/sprd/sc9860.dtsi
>  create mode 100644 arch/arm64/boot/dts/sprd/sp9860g-1h10.dts
>  create mode 100644 arch/arm64/boot/dts/sprd/whale2.dtsi
>
> diff --git a/arch/arm64/boot/dts/sprd/Makefile 
> b/arch/arm64/boot/dts/sprd/Makefile
> index b658c5e..f0535e6 100644
> --- a/arch/arm64/boot/dts/sprd/Makefile
> +++ b/arch/arm64/boot/dts/sprd/Makefile
> @@ -1,4 +1,5 @@
> -dtb-$(CONFIG_ARCH_SPRD) += sc9836-openphone.dtb
> +dtb-$(CONFIG_ARCH_SPRD) += sc9836-openphone.dtb \
> +   sp9860g-1h10.dtb
>
>  always := $(dtb-y)
>  subdir-y   := $(dts-dirs)
> diff --git a/arch/arm64/boot/dts/sprd/sc9860.dtsi 
> b/arch/arm64/boot/dts/sprd/sc9860.dtsi
> new file mode 100644
> index 000..7b7d8ce
> --- /dev/null
> +++ b/arch/arm64/boot/dts/sprd/sc9860.dtsi
> @@ -0,0 +1,569 @@
> +/*
> + * Spreadtrum SC9860 SoC
> + *
> + * Copyright (C) 2016, Spreadtrum Communications Inc.
> + *
> + * SPDX-License-Identifier: (GPL-2.0+ OR MIT)
> + */
> +
> +#include 
> +#include "whale2.dtsi"
> +
> +/ {
> +   cpus {
> +   #address-cells = <2>;
> +   #size-cells = <0>;
> +
> +   cpu-map {
> +   cluster0 {
> +   core0 {
> +   cpu = <>;
> +   };
> +   core1 {
> +   cpu = <>;
> +   };
> +   core2 {
> +   cpu = <>;
> +   };
> +   core3 {
> +   cpu = <>;
> +   };
> +   };
> +
> +   cluster1 {
> +   core0 {
> +   cpu = <>;
> +   };
> +   core1 {
> +   cpu = <>;
> +   };
> +   core2 {
> +   cpu = <>;
> +   };
> +   core3 {
> +   cpu = <>;
> +   };
> +   };
> +   };
> +
> +   CPU0: cpu@53 {
> +   device_type = "cpu";
> +   compatible = "arm,cortex-a53", "arm,armv8";
> +   reg = <0x0 0x53>;
> +   enable-method = "psci";
> +   cpu-idle-states = <_PD _PD>;
> +   };
> +
> +   CPU1: cpu@530001 {
> +   device_type = "cpu";
> +   compatible = "arm,cortex-a53", "arm,armv8";
> +   reg = <0x0 0x530001>;
> +   enable-method = "psci";
> +   cpu-idle-states = <_PD _PD>;
> +   };
> +
> +   CPU2: cpu@530002 {
> +   device_type = "cpu";
> +   compatible = "arm,cortex-a53", "arm,armv8";
> +   reg = <0x0 0x530002>;
> +   enable-method = "psci";
> +   cpu-idle-states = <_PD _PD>;
> +   };
> +
> +   CPU3: cpu@530003 {
> +   device_type = "cpu";
> +   compatible = "arm,cortex-a53", "arm,armv8";
> +   reg = <0x0 0x530003>;
> +   enable-method = "psci";
> +   cpu-idle-states = <_PD _PD>;
> +   };
> +
> +   CPU4: cpu@530100 {
> +   device_type = "cpu";
> +   compatible = "arm,cortex-a53", "arm,armv8";
> +   reg = <0x0 0x530100>;
> +   enable-method = "psci";
> +   cpu-idle-states = <_PD _PD>;
> +   };
> +
> +   CPU5: 

Re: [PATCH v3 1/8] trace: ras: add ARM processor error information trace event

2017-04-16 Thread Xie XiuQi
Hi Tyler,

On 2017/4/17 11:08, Xie XiuQi wrote:
> Hi Tyler,
> 
> Thanks for your comments and testing.
> 
> On 2017/4/15 4:36, Baicar, Tyler wrote:
>> On 3/30/2017 4:31 AM, Xie XiuQi wrote:
>>> Add a new trace event for ARM processor error information, so that
>>> the user will know what error occurred. With this information the
>>> user may take appropriate action.
>>>
>>> These trace events are consistent with the ARM processor error
>>> information table which defined in UEFI 2.6 spec section N.2.4.4.1.
>>>
>>> ---
>>> v2: add trace enabled condition as Steven's suggestion.
>>>  fix a typo.
>>> ---
>>>
>>> Cc: Steven Rostedt 
>>> Cc: Tyler Baicar 
>>> Signed-off-by: Xie XiuQi 
>>> ---
>> ...
>>>   +#define ARM_PROC_ERR_TYPE\
>>> +EM ( CPER_ARM_INFO_TYPE_CACHE, "cache error" )\
>>> +EM ( CPER_ARM_INFO_TYPE_TLB,  "TLB error" )\
>>> +EM ( CPER_ARM_INFO_TYPE_BUS, "bus error" )\
>>> +EMe ( CPER_ARM_INFO_TYPE_UARCH, "micro-architectural error" )
>>> +
>>> +#define ARM_PROC_ERR_FLAGS\
>>> +EM ( CPER_ARM_INFO_FLAGS_FIRST, "First error captured" )\
>>> +EM ( CPER_ARM_INFO_FLAGS_LAST,  "Last error captured" )\
>>> +EM ( CPER_ARM_INFO_FLAGS_PROPAGATED, "Propagated" )\
>>> +EMe ( CPER_ARM_INFO_FLAGS_OVERFLOW, "Overflow" )
>>> +
>> Hello Xie XiuQi,
>>
>> This isn't compiling for me because of these definitions. Here you are using 
>> ARM_*, but below in the TP_printk you are using ARCH_*. The compiler 
>> complains the ARCH_* ones are undefined:
>>
>> ./include/trace/../../include/ras/ras_event.h:278:37: error: 
>> 'ARCH_PROC_ERR_TYPE' undeclared (first use in this function)
>>  __print_symbolic(__entry->type, ARCH_PROC_ERR_TYPE),
>> ./include/trace/../../include/ras/ras_event.h:280:38: error: 
>> 'ARCH_PROC_ERR_FLAGS' undeclared (first use in this function)
>>  __print_symbolic(__entry->flags, ARCH_PROC_ERR_FLAGS),
> 
> Sorry, it's a typo. It should be ARM_xxx.
> 
>>
>>> +/*
>>> + * First define the enums in MM_ACTION_RESULT to be exported to userspace
>>> + * via TRACE_DEFINE_ENUM().
>>> + */
>>> +#undef EM
>>> +#undef EMe
>>> +#define EM(a, b) TRACE_DEFINE_ENUM(a);
>>> +#define EMe(a, b)TRACE_DEFINE_ENUM(a);
>>> +
>>> +ARM_PROC_ERR_TYPE
>>> +ARM_PROC_ERR_FLAGS
>> Are the above two lines supposed to be here?
>>> +
>>> +/*
>>> + * Now redefine the EM() and EMe() macros to map the enums to the strings
>>> + * that will be printed in the output.
>>> + */
>>> +#undef EM
>>> +#undef EMe
>>> +#define EM(a, b){ a, b },
>>> +#define EMe(a, b){ a, b }
>>> +
>>> +TRACE_EVENT(arm_proc_err,
>> I think it would be better to keep this similar to the naming of the current 
>> RAS trace events (right now we have mc_event, arm_event, aer_event, etc.). I 
>> would suggest using "arm_err_info_event" since this is handling the error 
>> information structures of the arm errors.
>>> +
>>> +TP_PROTO(const struct cper_arm_err_info *err),
>>> +
>>> +TP_ARGS(err),
>>> +
>>> +TP_STRUCT__entry(
>>> +__field(u8, type)
>>> +__field(u16, multiple_error)
>>> +__field(u8, flags)
>>> +__field(u64, error_info)
>>> +__field(u64, virt_fault_addr)
>>> +__field(u64, physical_fault_addr)
>> Validation bits should also be a part of this structure that way user space 
>> tools will know which of these fields are valid.
> 
> Could we use the default value to check the validation which we have checked 
> in TP_fast_assign?
> 
>>> +),
>>> +
>>> +TP_fast_assign(
>>> +__entry->type = err->type;
>>> +
>>> +if (err->validation_bits & CPER_ARM_INFO_VALID_MULTI_ERR)
>>> +__entry->multiple_error = err->multiple_error;
>>> +else
>>> +__entry->multiple_error = ~0;
>>> +
>>> +if (err->validation_bits & CPER_ARM_INFO_VALID_FLAGS)
>>> +__entry->flags = err->flags;
>>> +else
>>> +__entry->flags = ~0;
>>> +
>>> +if (err->validation_bits & CPER_ARM_INFO_VALID_ERR_INFO)
>>> +__entry->error_info = err->error_info;
>>> +else
>>> +__entry->error_info = 0ULL;
>>> +
>>> +if (err->validation_bits & CPER_ARM_INFO_VALID_VIRT_ADDR)
>>> +__entry->virt_fault_addr = err->virt_fault_addr;
>>> +else
>>> +__entry->virt_fault_addr = 0ULL;
>>> +
>>> +if (err->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR)
>>> +__entry->physical_fault_addr = err->physical_fault_addr;
>>> +else
>>> +__entry->physical_fault_addr = 0ULL;
>>> +),
>>> +
>>> +TP_printk("ARM Processor Error: type %s; count: %u; flags: %s;"
>> I think the "ARM Processor Error:" part of this should just be removed. 
>> Here's the output with this removed and the trace event renamed to 
>> arm_err_info_event. I think this looks much cleaner and matches the style 
>> used with the arm_event.
>>

Re: [PATCH v3 1/8] trace: ras: add ARM processor error information trace event

2017-04-16 Thread Xie XiuQi
Hi Tyler,

On 2017/4/17 11:08, Xie XiuQi wrote:
> Hi Tyler,
> 
> Thanks for your comments and testing.
> 
> On 2017/4/15 4:36, Baicar, Tyler wrote:
>> On 3/30/2017 4:31 AM, Xie XiuQi wrote:
>>> Add a new trace event for ARM processor error information, so that
>>> the user will know what error occurred. With this information the
>>> user may take appropriate action.
>>>
>>> These trace events are consistent with the ARM processor error
>>> information table which defined in UEFI 2.6 spec section N.2.4.4.1.
>>>
>>> ---
>>> v2: add trace enabled condition as Steven's suggestion.
>>>  fix a typo.
>>> ---
>>>
>>> Cc: Steven Rostedt 
>>> Cc: Tyler Baicar 
>>> Signed-off-by: Xie XiuQi 
>>> ---
>> ...
>>>   +#define ARM_PROC_ERR_TYPE\
>>> +EM ( CPER_ARM_INFO_TYPE_CACHE, "cache error" )\
>>> +EM ( CPER_ARM_INFO_TYPE_TLB,  "TLB error" )\
>>> +EM ( CPER_ARM_INFO_TYPE_BUS, "bus error" )\
>>> +EMe ( CPER_ARM_INFO_TYPE_UARCH, "micro-architectural error" )
>>> +
>>> +#define ARM_PROC_ERR_FLAGS\
>>> +EM ( CPER_ARM_INFO_FLAGS_FIRST, "First error captured" )\
>>> +EM ( CPER_ARM_INFO_FLAGS_LAST,  "Last error captured" )\
>>> +EM ( CPER_ARM_INFO_FLAGS_PROPAGATED, "Propagated" )\
>>> +EMe ( CPER_ARM_INFO_FLAGS_OVERFLOW, "Overflow" )
>>> +
>> Hello Xie XiuQi,
>>
>> This isn't compiling for me because of these definitions. Here you are using 
>> ARM_*, but below in the TP_printk you are using ARCH_*. The compiler 
>> complains the ARCH_* ones are undefined:
>>
>> ./include/trace/../../include/ras/ras_event.h:278:37: error: 
>> 'ARCH_PROC_ERR_TYPE' undeclared (first use in this function)
>>  __print_symbolic(__entry->type, ARCH_PROC_ERR_TYPE),
>> ./include/trace/../../include/ras/ras_event.h:280:38: error: 
>> 'ARCH_PROC_ERR_FLAGS' undeclared (first use in this function)
>>  __print_symbolic(__entry->flags, ARCH_PROC_ERR_FLAGS),
> 
> Sorry, it's a typo. It should be ARM_xxx.
> 
>>
>>> +/*
>>> + * First define the enums in MM_ACTION_RESULT to be exported to userspace
>>> + * via TRACE_DEFINE_ENUM().
>>> + */
>>> +#undef EM
>>> +#undef EMe
>>> +#define EM(a, b) TRACE_DEFINE_ENUM(a);
>>> +#define EMe(a, b)TRACE_DEFINE_ENUM(a);
>>> +
>>> +ARM_PROC_ERR_TYPE
>>> +ARM_PROC_ERR_FLAGS
>> Are the above two lines supposed to be here?
>>> +
>>> +/*
>>> + * Now redefine the EM() and EMe() macros to map the enums to the strings
>>> + * that will be printed in the output.
>>> + */
>>> +#undef EM
>>> +#undef EMe
>>> +#define EM(a, b){ a, b },
>>> +#define EMe(a, b){ a, b }
>>> +
>>> +TRACE_EVENT(arm_proc_err,
>> I think it would be better to keep this similar to the naming of the current 
>> RAS trace events (right now we have mc_event, arm_event, aer_event, etc.). I 
>> would suggest using "arm_err_info_event" since this is handling the error 
>> information structures of the arm errors.
>>> +
>>> +TP_PROTO(const struct cper_arm_err_info *err),
>>> +
>>> +TP_ARGS(err),
>>> +
>>> +TP_STRUCT__entry(
>>> +__field(u8, type)
>>> +__field(u16, multiple_error)
>>> +__field(u8, flags)
>>> +__field(u64, error_info)
>>> +__field(u64, virt_fault_addr)
>>> +__field(u64, physical_fault_addr)
>> Validation bits should also be a part of this structure that way user space 
>> tools will know which of these fields are valid.
> 
> Could we use the default value to check the validation which we have checked 
> in TP_fast_assign?
> 
>>> +),
>>> +
>>> +TP_fast_assign(
>>> +__entry->type = err->type;
>>> +
>>> +if (err->validation_bits & CPER_ARM_INFO_VALID_MULTI_ERR)
>>> +__entry->multiple_error = err->multiple_error;
>>> +else
>>> +__entry->multiple_error = ~0;
>>> +
>>> +if (err->validation_bits & CPER_ARM_INFO_VALID_FLAGS)
>>> +__entry->flags = err->flags;
>>> +else
>>> +__entry->flags = ~0;
>>> +
>>> +if (err->validation_bits & CPER_ARM_INFO_VALID_ERR_INFO)
>>> +__entry->error_info = err->error_info;
>>> +else
>>> +__entry->error_info = 0ULL;
>>> +
>>> +if (err->validation_bits & CPER_ARM_INFO_VALID_VIRT_ADDR)
>>> +__entry->virt_fault_addr = err->virt_fault_addr;
>>> +else
>>> +__entry->virt_fault_addr = 0ULL;
>>> +
>>> +if (err->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR)
>>> +__entry->physical_fault_addr = err->physical_fault_addr;
>>> +else
>>> +__entry->physical_fault_addr = 0ULL;
>>> +),
>>> +
>>> +TP_printk("ARM Processor Error: type %s; count: %u; flags: %s;"
>> I think the "ARM Processor Error:" part of this should just be removed. 
>> Here's the output with this removed and the trace event renamed to 
>> arm_err_info_event. I think this looks much cleaner and matches the style 
>> used with the arm_event.
>>
>>   -0 [020] .ns.   366.592434: arm_event: affinity 

[PATCH] usb: gadget: remove redundant self assignment

2017-04-16 Thread Stefan Agner
The assignment ret = ret is redundant and can be removed.

Signed-off-by: Stefan Agner 
---
A very similar patch has been applied already last year, but there is
a second such assignment...

--
Stefan

 drivers/usb/gadget/udc/core.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/usb/gadget/udc/core.c b/drivers/usb/gadget/udc/core.c
index d685d82dcf48..b57bd53812fe 100644
--- a/drivers/usb/gadget/udc/core.c
+++ b/drivers/usb/gadget/udc/core.c
@@ -139,10 +139,8 @@ int usb_ep_disable(struct usb_ep *ep)
goto out;
 
ret = ep->ops->disable(ep);
-   if (ret) {
-   ret = ret;
+   if (ret)
goto out;
-   }
 
ep->enabled = false;
 
-- 
2.12.2



[PATCH] usb: gadget: remove redundant self assignment

2017-04-16 Thread Stefan Agner
The assignment ret = ret is redundant and can be removed.

Signed-off-by: Stefan Agner 
---
A very similar patch has been applied already last year, but there is
a second such assignment...

--
Stefan

 drivers/usb/gadget/udc/core.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/usb/gadget/udc/core.c b/drivers/usb/gadget/udc/core.c
index d685d82dcf48..b57bd53812fe 100644
--- a/drivers/usb/gadget/udc/core.c
+++ b/drivers/usb/gadget/udc/core.c
@@ -139,10 +139,8 @@ int usb_ep_disable(struct usb_ep *ep)
goto out;
 
ret = ep->ops->disable(ep);
-   if (ret) {
-   ret = ret;
+   if (ret)
goto out;
-   }
 
ep->enabled = false;
 
-- 
2.12.2



Re: [PATCH v3 1/8] trace: ras: add ARM processor error information trace event

2017-04-16 Thread Xie XiuQi
Hi Tyler,

Thanks for your comments and testing.

On 2017/4/15 4:36, Baicar, Tyler wrote:
> On 3/30/2017 4:31 AM, Xie XiuQi wrote:
>> Add a new trace event for ARM processor error information, so that
>> the user will know what error occurred. With this information the
>> user may take appropriate action.
>>
>> These trace events are consistent with the ARM processor error
>> information table which defined in UEFI 2.6 spec section N.2.4.4.1.
>>
>> ---
>> v2: add trace enabled condition as Steven's suggestion.
>>  fix a typo.
>> ---
>>
>> Cc: Steven Rostedt 
>> Cc: Tyler Baicar 
>> Signed-off-by: Xie XiuQi 
>> ---
> ...
>>   +#define ARM_PROC_ERR_TYPE\
>> +EM ( CPER_ARM_INFO_TYPE_CACHE, "cache error" )\
>> +EM ( CPER_ARM_INFO_TYPE_TLB,  "TLB error" )\
>> +EM ( CPER_ARM_INFO_TYPE_BUS, "bus error" )\
>> +EMe ( CPER_ARM_INFO_TYPE_UARCH, "micro-architectural error" )
>> +
>> +#define ARM_PROC_ERR_FLAGS\
>> +EM ( CPER_ARM_INFO_FLAGS_FIRST, "First error captured" )\
>> +EM ( CPER_ARM_INFO_FLAGS_LAST,  "Last error captured" )\
>> +EM ( CPER_ARM_INFO_FLAGS_PROPAGATED, "Propagated" )\
>> +EMe ( CPER_ARM_INFO_FLAGS_OVERFLOW, "Overflow" )
>> +
> Hello Xie XiuQi,
> 
> This isn't compiling for me because of these definitions. Here you are using 
> ARM_*, but below in the TP_printk you are using ARCH_*. The compiler 
> complains the ARCH_* ones are undefined:
> 
> ./include/trace/../../include/ras/ras_event.h:278:37: error: 
> 'ARCH_PROC_ERR_TYPE' undeclared (first use in this function)
>  __print_symbolic(__entry->type, ARCH_PROC_ERR_TYPE),
> ./include/trace/../../include/ras/ras_event.h:280:38: error: 
> 'ARCH_PROC_ERR_FLAGS' undeclared (first use in this function)
>  __print_symbolic(__entry->flags, ARCH_PROC_ERR_FLAGS),

Sorry, it's a typo. It should be ARM_xxx.

> 
>> +/*
>> + * First define the enums in MM_ACTION_RESULT to be exported to userspace
>> + * via TRACE_DEFINE_ENUM().
>> + */
>> +#undef EM
>> +#undef EMe
>> +#define EM(a, b) TRACE_DEFINE_ENUM(a);
>> +#define EMe(a, b)TRACE_DEFINE_ENUM(a);
>> +
>> +ARM_PROC_ERR_TYPE
>> +ARM_PROC_ERR_FLAGS
> Are the above two lines supposed to be here?
>> +
>> +/*
>> + * Now redefine the EM() and EMe() macros to map the enums to the strings
>> + * that will be printed in the output.
>> + */
>> +#undef EM
>> +#undef EMe
>> +#define EM(a, b){ a, b },
>> +#define EMe(a, b){ a, b }
>> +
>> +TRACE_EVENT(arm_proc_err,
> I think it would be better to keep this similar to the naming of the current 
> RAS trace events (right now we have mc_event, arm_event, aer_event, etc.). I 
> would suggest using "arm_err_info_event" since this is handling the error 
> information structures of the arm errors.
>> +
>> +TP_PROTO(const struct cper_arm_err_info *err),
>> +
>> +TP_ARGS(err),
>> +
>> +TP_STRUCT__entry(
>> +__field(u8, type)
>> +__field(u16, multiple_error)
>> +__field(u8, flags)
>> +__field(u64, error_info)
>> +__field(u64, virt_fault_addr)
>> +__field(u64, physical_fault_addr)
> Validation bits should also be a part of this structure that way user space 
> tools will know which of these fields are valid.

Could we use the default value to indicate validity, since we have already checked the
validation bits in TP_fast_assign?

>> +),
>> +
>> +TP_fast_assign(
>> +__entry->type = err->type;
>> +
>> +if (err->validation_bits & CPER_ARM_INFO_VALID_MULTI_ERR)
>> +__entry->multiple_error = err->multiple_error;
>> +else
>> +__entry->multiple_error = ~0;
>> +
>> +if (err->validation_bits & CPER_ARM_INFO_VALID_FLAGS)
>> +__entry->flags = err->flags;
>> +else
>> +__entry->flags = ~0;
>> +
>> +if (err->validation_bits & CPER_ARM_INFO_VALID_ERR_INFO)
>> +__entry->error_info = err->error_info;
>> +else
>> +__entry->error_info = 0ULL;
>> +
>> +if (err->validation_bits & CPER_ARM_INFO_VALID_VIRT_ADDR)
>> +__entry->virt_fault_addr = err->virt_fault_addr;
>> +else
>> +__entry->virt_fault_addr = 0ULL;
>> +
>> +if (err->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR)
>> +__entry->physical_fault_addr = err->physical_fault_addr;
>> +else
>> +__entry->physical_fault_addr = 0ULL;
>> +),
>> +
>> +TP_printk("ARM Processor Error: type %s; count: %u; flags: %s;"
> I think the "ARM Processor Error:" part of this should just be removed. 
> Here's the output with this removed and the trace event renamed to 
> arm_err_info_event. I think this looks much cleaner and matches the style 
> used with the arm_event.
> 
>   -0 [020] .ns.   366.592434: arm_event: affinity level: 2; 
> MPIDR: ; MIDR: 510f8000; running state: 1; PSCI 
> state: 0
>   -0 [020] 

Re: [PATCH v3 1/8] trace: ras: add ARM processor error information trace event

2017-04-16 Thread Xie XiuQi
Hi Tyler,

Thanks for your comments and testing.

On 2017/4/15 4:36, Baicar, Tyler wrote:
> On 3/30/2017 4:31 AM, Xie XiuQi wrote:
>> Add a new trace event for ARM processor error information, so that
>> the user will know what error occurred. With this information the
>> user may take appropriate action.
>>
>> These trace events are consistent with the ARM processor error
>> information table which defined in UEFI 2.6 spec section N.2.4.4.1.
>>
>> ---
>> v2: add trace enabled condition as Steven's suggestion.
>>  fix a typo.
>> ---
>>
>> Cc: Steven Rostedt 
>> Cc: Tyler Baicar 
>> Signed-off-by: Xie XiuQi 
>> ---
> ...
>>   +#define ARM_PROC_ERR_TYPE\
>> +EM ( CPER_ARM_INFO_TYPE_CACHE, "cache error" )\
>> +EM ( CPER_ARM_INFO_TYPE_TLB,  "TLB error" )\
>> +EM ( CPER_ARM_INFO_TYPE_BUS, "bus error" )\
>> +EMe ( CPER_ARM_INFO_TYPE_UARCH, "micro-architectural error" )
>> +
>> +#define ARM_PROC_ERR_FLAGS\
>> +EM ( CPER_ARM_INFO_FLAGS_FIRST, "First error captured" )\
>> +EM ( CPER_ARM_INFO_FLAGS_LAST,  "Last error captured" )\
>> +EM ( CPER_ARM_INFO_FLAGS_PROPAGATED, "Propagated" )\
>> +EMe ( CPER_ARM_INFO_FLAGS_OVERFLOW, "Overflow" )
>> +
> Hello Xie XiuQi,
> 
> This isn't compiling for me because of these definitions. Here you are using 
> ARM_*, but below in the TP_printk you are using ARCH_*. The compiler 
> complains the ARCH_* ones are undefined:
> 
> ./include/trace/../../include/ras/ras_event.h:278:37: error: 
> 'ARCH_PROC_ERR_TYPE' undeclared (first use in this function)
>  __print_symbolic(__entry->type, ARCH_PROC_ERR_TYPE),
> ./include/trace/../../include/ras/ras_event.h:280:38: error: 
> 'ARCH_PROC_ERR_FLAGS' undeclared (first use in this function)
>  __print_symbolic(__entry->flags, ARCH_PROC_ERR_FLAGS),

Sorry, it's a typo. It should be ARM_xxx.

> 
>> +/*
>> + * First define the enums in MM_ACTION_RESULT to be exported to userspace
>> + * via TRACE_DEFINE_ENUM().
>> + */
>> +#undef EM
>> +#undef EMe
>> +#define EM(a, b) TRACE_DEFINE_ENUM(a);
>> +#define EMe(a, b)TRACE_DEFINE_ENUM(a);
>> +
>> +ARM_PROC_ERR_TYPE
>> +ARM_PROC_ERR_FLAGS
> Are the above two lines supposed to be here?
>> +
>> +/*
>> + * Now redefine the EM() and EMe() macros to map the enums to the strings
>> + * that will be printed in the output.
>> + */
>> +#undef EM
>> +#undef EMe
>> +#define EM(a, b){ a, b },
>> +#define EMe(a, b){ a, b }
>> +
>> +TRACE_EVENT(arm_proc_err,
> I think it would be better to keep this similar to the naming of the current 
> RAS trace events (right now we have mc_event, arm_event, aer_event, etc.). I 
> would suggest using "arm_err_info_event" since this is handling the error 
> information structures of the arm errors.
>> +
>> +TP_PROTO(const struct cper_arm_err_info *err),
>> +
>> +TP_ARGS(err),
>> +
>> +TP_STRUCT__entry(
>> +__field(u8, type)
>> +__field(u16, multiple_error)
>> +__field(u8, flags)
>> +__field(u64, error_info)
>> +__field(u64, virt_fault_addr)
>> +__field(u64, physical_fault_addr)
> Validation bits should also be a part of this structure that way user space 
> tools will know which of these fields are valid.

Could we use the default value to indicate validity, since we have already checked the
validation bits in TP_fast_assign?

>> +),
>> +
>> +TP_fast_assign(
>> +__entry->type = err->type;
>> +
>> +if (err->validation_bits & CPER_ARM_INFO_VALID_MULTI_ERR)
>> +__entry->multiple_error = err->multiple_error;
>> +else
>> +__entry->multiple_error = ~0;
>> +
>> +if (err->validation_bits & CPER_ARM_INFO_VALID_FLAGS)
>> +__entry->flags = err->flags;
>> +else
>> +__entry->flags = ~0;
>> +
>> +if (err->validation_bits & CPER_ARM_INFO_VALID_ERR_INFO)
>> +__entry->error_info = err->error_info;
>> +else
>> +__entry->error_info = 0ULL;
>> +
>> +if (err->validation_bits & CPER_ARM_INFO_VALID_VIRT_ADDR)
>> +__entry->virt_fault_addr = err->virt_fault_addr;
>> +else
>> +__entry->virt_fault_addr = 0ULL;
>> +
>> +if (err->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR)
>> +__entry->physical_fault_addr = err->physical_fault_addr;
>> +else
>> +__entry->physical_fault_addr = 0ULL;
>> +),
>> +
>> +TP_printk("ARM Processor Error: type %s; count: %u; flags: %s;"
> I think the "ARM Processor Error:" part of this should just be removed. 
> Here's the output with this removed and the trace event renamed to 
> arm_err_info_event. I think this looks much cleaner and matches the style 
> used with the arm_event.
> 
>   -0 [020] .ns.   366.592434: arm_event: affinity level: 2; 
> MPIDR: ; MIDR: 510f8000; running state: 1; PSCI 
> state: 0
>   -0 [020] .ns.   366.592437: arm_err_info_event: type 
> cache error; count: 

Re: [PATCH] acpica: trivial changes on comments

2017-04-16 Thread Cao jin
Re-to: linux-a...@vger.kernel.org

On 04/17/2017 10:56 AM, Cao jin wrote:
> Remove superfluous word; unify comments and function prototype.
> 
> Signed-off-by: Cao jin 
> ---
>  drivers/acpi/acpica/tbfadt.c  | 2 +-
>  drivers/acpi/acpica/tbutils.c | 4 ++--
>  2 files changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/acpi/acpica/tbfadt.c b/drivers/acpi/acpica/tbfadt.c
> index 51860bf..ba5fcbe 100644
> --- a/drivers/acpi/acpica/tbfadt.c
> +++ b/drivers/acpi/acpica/tbfadt.c
> @@ -449,7 +449,7 @@ void acpi_tb_create_local_fadt(struct acpi_table_header 
> *table, u32 length)
>   * The 64-bit X fields are optional extensions to the original 32-bit FADT
>   * V1.0 fields. Even if they are present in the FADT, they are optional and
>   * are unused if the BIOS sets them to zero. Therefore, we must copy/expand
> - * 32-bit V1.0 fields to the 64-bit X fields if the the 64-bit X field is
> + * 32-bit V1.0 fields to the 64-bit X fields if the 64-bit X field is
>   * originally zero.
>   *
>   * For ACPI 1.0 FADTs (that contain no 64-bit addresses), all 32-bit address
> diff --git a/drivers/acpi/acpica/tbutils.c b/drivers/acpi/acpica/tbutils.c
> index 5a968a7..e0c 100644
> --- a/drivers/acpi/acpica/tbutils.c
> +++ b/drivers/acpi/acpica/tbutils.c
> @@ -141,7 +141,7 @@ void acpi_tb_check_dsdt_header(void)
>   *
>   * FUNCTION:acpi_tb_copy_dsdt
>   *
> - * PARAMETERS:  table_desc  - Installed table to copy
> + * PARAMETERS:  table_index - Index of installed table to copy
>   *
>   * RETURN:  None
>   *
> @@ -239,7 +239,7 @@ acpi_tb_get_root_table_entry(u8 *table_entry, u32 
> table_entry_size)
>   *
>   * FUNCTION:acpi_tb_parse_root_table
>   *
> - * PARAMETERS:  rsdp- Pointer to the RSDP
> + * PARAMETERS:  rsdp_address- Pointer to the RSDP
>   *
>   * RETURN:  Status
>   *
> 

-- 
Sincerely,
Cao jin




Re: [PATCH] acpica: trivial changes on comments

2017-04-16 Thread Cao jin
Re-to: linux-a...@vger.kernel.org

On 04/17/2017 10:56 AM, Cao jin wrote:
> Remove superfluous word; unify comments and function prototype.
> 
> Signed-off-by: Cao jin 
> ---
>  drivers/acpi/acpica/tbfadt.c  | 2 +-
>  drivers/acpi/acpica/tbutils.c | 4 ++--
>  2 files changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/acpi/acpica/tbfadt.c b/drivers/acpi/acpica/tbfadt.c
> index 51860bf..ba5fcbe 100644
> --- a/drivers/acpi/acpica/tbfadt.c
> +++ b/drivers/acpi/acpica/tbfadt.c
> @@ -449,7 +449,7 @@ void acpi_tb_create_local_fadt(struct acpi_table_header 
> *table, u32 length)
>   * The 64-bit X fields are optional extensions to the original 32-bit FADT
>   * V1.0 fields. Even if they are present in the FADT, they are optional and
>   * are unused if the BIOS sets them to zero. Therefore, we must copy/expand
> - * 32-bit V1.0 fields to the 64-bit X fields if the the 64-bit X field is
> + * 32-bit V1.0 fields to the 64-bit X fields if the 64-bit X field is
>   * originally zero.
>   *
>   * For ACPI 1.0 FADTs (that contain no 64-bit addresses), all 32-bit address
> diff --git a/drivers/acpi/acpica/tbutils.c b/drivers/acpi/acpica/tbutils.c
> index 5a968a7..e0c 100644
> --- a/drivers/acpi/acpica/tbutils.c
> +++ b/drivers/acpi/acpica/tbutils.c
> @@ -141,7 +141,7 @@ void acpi_tb_check_dsdt_header(void)
>   *
>   * FUNCTION:acpi_tb_copy_dsdt
>   *
> - * PARAMETERS:  table_desc  - Installed table to copy
> + * PARAMETERS:  table_index - Index of installed table to copy
>   *
>   * RETURN:  None
>   *
> @@ -239,7 +239,7 @@ acpi_tb_get_root_table_entry(u8 *table_entry, u32 
> table_entry_size)
>   *
>   * FUNCTION:acpi_tb_parse_root_table
>   *
> - * PARAMETERS:  rsdp- Pointer to the RSDP
> + * PARAMETERS:  rsdp_address- Pointer to the RSDP
>   *
>   * RETURN:  Status
>   *
> 

-- 
Sincerely,
Cao jin




[PATCH] acpica: trivial changes on comments

2017-04-16 Thread Cao jin
Remove a superfluous word; make the comments consistent with the function prototypes.

Signed-off-by: Cao jin 
---
 drivers/acpi/acpica/tbfadt.c  | 2 +-
 drivers/acpi/acpica/tbutils.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/acpi/acpica/tbfadt.c b/drivers/acpi/acpica/tbfadt.c
index 51860bf..ba5fcbe 100644
--- a/drivers/acpi/acpica/tbfadt.c
+++ b/drivers/acpi/acpica/tbfadt.c
@@ -449,7 +449,7 @@ void acpi_tb_create_local_fadt(struct acpi_table_header 
*table, u32 length)
  * The 64-bit X fields are optional extensions to the original 32-bit FADT
  * V1.0 fields. Even if they are present in the FADT, they are optional and
  * are unused if the BIOS sets them to zero. Therefore, we must copy/expand
- * 32-bit V1.0 fields to the 64-bit X fields if the the 64-bit X field is
+ * 32-bit V1.0 fields to the 64-bit X fields if the 64-bit X field is
  * originally zero.
  *
  * For ACPI 1.0 FADTs (that contain no 64-bit addresses), all 32-bit address
diff --git a/drivers/acpi/acpica/tbutils.c b/drivers/acpi/acpica/tbutils.c
index 5a968a7..e0c 100644
--- a/drivers/acpi/acpica/tbutils.c
+++ b/drivers/acpi/acpica/tbutils.c
@@ -141,7 +141,7 @@ void acpi_tb_check_dsdt_header(void)
  *
  * FUNCTION:acpi_tb_copy_dsdt
  *
- * PARAMETERS:  table_desc  - Installed table to copy
+ * PARAMETERS:  table_index - Index of installed table to copy
  *
  * RETURN:  None
  *
@@ -239,7 +239,7 @@ acpi_tb_get_root_table_entry(u8 *table_entry, u32 
table_entry_size)
  *
  * FUNCTION:acpi_tb_parse_root_table
  *
- * PARAMETERS:  rsdp- Pointer to the RSDP
+ * PARAMETERS:  rsdp_address- Pointer to the RSDP
  *
  * RETURN:  Status
  *
-- 
2.1.0





[PATCH] acpica: trivial changes on comments

2017-04-16 Thread Cao jin
Remove a superfluous word; make the comments consistent with the function prototypes.

Signed-off-by: Cao jin 
---
 drivers/acpi/acpica/tbfadt.c  | 2 +-
 drivers/acpi/acpica/tbutils.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/acpi/acpica/tbfadt.c b/drivers/acpi/acpica/tbfadt.c
index 51860bf..ba5fcbe 100644
--- a/drivers/acpi/acpica/tbfadt.c
+++ b/drivers/acpi/acpica/tbfadt.c
@@ -449,7 +449,7 @@ void acpi_tb_create_local_fadt(struct acpi_table_header 
*table, u32 length)
  * The 64-bit X fields are optional extensions to the original 32-bit FADT
  * V1.0 fields. Even if they are present in the FADT, they are optional and
  * are unused if the BIOS sets them to zero. Therefore, we must copy/expand
- * 32-bit V1.0 fields to the 64-bit X fields if the the 64-bit X field is
+ * 32-bit V1.0 fields to the 64-bit X fields if the 64-bit X field is
  * originally zero.
  *
  * For ACPI 1.0 FADTs (that contain no 64-bit addresses), all 32-bit address
diff --git a/drivers/acpi/acpica/tbutils.c b/drivers/acpi/acpica/tbutils.c
index 5a968a7..e0c 100644
--- a/drivers/acpi/acpica/tbutils.c
+++ b/drivers/acpi/acpica/tbutils.c
@@ -141,7 +141,7 @@ void acpi_tb_check_dsdt_header(void)
  *
  * FUNCTION:acpi_tb_copy_dsdt
  *
- * PARAMETERS:  table_desc  - Installed table to copy
+ * PARAMETERS:  table_index - Index of installed table to copy
  *
  * RETURN:  None
  *
@@ -239,7 +239,7 @@ acpi_tb_get_root_table_entry(u8 *table_entry, u32 
table_entry_size)
  *
  * FUNCTION:acpi_tb_parse_root_table
  *
- * PARAMETERS:  rsdp- Pointer to the RSDP
+ * PARAMETERS:  rsdp_address- Pointer to the RSDP
  *
  * RETURN:  Status
  *
-- 
2.1.0





[lkp-robot] [cpu/hotplug] 94380da276: INFO:possible_recursive_locking_detected

2017-04-16 Thread kernel test robot

FYI, we noticed the following commit:

commit: 94380da2765a391335c2326ba327e835c2e7aa03 ("cpu/hotplug: Convert hotplug 
locking to percpu rwsem")
https://git.kernel.org/cgit/linux/kernel/git/tip/tip.git WIP.hotplug

in testcase: trinity
with following parameters:

runtime: 300s

test-description: Trinity is a linux system call fuzz tester.
test-url: http://codemonkey.org.uk/projects/trinity/


on test machine: qemu-system-x86_64 -enable-kvm -cpu IvyBridge -m 420M

caused below changes (please refer to attached dmesg/kmsg for entire 
log/backtrace):


+--+++
|  | d632ccfe1c | 94380da276 |
+--+++
| boot_successes   | 0  | 4  |
| boot_failures| 0  | 4  |
| INFO:possible_recursive_locking_detected | 0  | 4  |
| calltrace:SyS_perf_event_open| 0  | 4  |
+--+++



[   23.381603] [ INFO: possible recursive locking detected ]
[   23.383344] 4.11.0-rc6-00236-g94380da #32 Not tainted
[   23.385029] -
[   23.386830] trinity-main/2266 is trying to acquire lock:
[   23.388554]  (cpu_hotplug_lock.rw_sem){.+.+.+}, at: [] 
perf_swevent_init+0x75/0x286
[   23.391640] 
[   23.391640] but task is already holding lock:
[   23.394084]  (cpu_hotplug_lock.rw_sem){.+.+.+}, at: [] 
SyS_perf_event_open+0x547/0xe3c
[   23.397273] 
[   23.397273] other info that might help us debug this:
[   23.399855]  Possible unsafe locking scenario:
[   23.399855] 
[   23.402314]CPU0
[   23.403507]
[   23.405813]   lock(cpu_hotplug_lock.rw_sem);
[   23.407449]   lock(cpu_hotplug_lock.rw_sem);
[   23.409117] 
[   23.409117]  *** DEADLOCK ***
[   23.409117] 
[   23.412453]  May be due to missing lock nesting notation
[   23.412453] 
[   23.415292] 3 locks held by trinity-main/2266:
[   23.416986]  #0:  (cpu_hotplug_lock.rw_sem){.+.+.+}, at: 
[] SyS_perf_event_open+0x547/0xe3c
[   23.433473]  #1:  (>cred_guard_mutex){+.+.+.}, at: [] 
SyS_perf_event_open+0x5b2/0xe3c
[   23.436931]  #2:  (_srcu){..}, at: [] 
perf_event_alloc+0x2fc/0x7a1
[   23.457210] 
[   23.457210] stack backtrace:
[   23.459813] CPU: 0 PID: 2266 Comm: trinity-main Not tainted 
4.11.0-rc6-00236-g94380da #32
[   23.463175] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
1.9.3-20161025_171302-gandalf 04/01/2014
[   23.481993] Call Trace:
[   23.483305]  dump_stack+0x86/0xc0
[   23.484930]  __lock_acquire+0x9a4/0x1580
[   23.486552]  ? kvm_sched_clock_read+0x9/0x12
[   23.488380]  lock_acquire+0x180/0x21b
[   23.489918]  ? lock_acquire+0x180/0x21b
[   23.491528]  ? perf_swevent_init+0x75/0x286
[   23.493332]  get_online_cpus+0x42/0x80
[   23.494906]  ? perf_swevent_init+0x75/0x286
[   23.496605]  perf_swevent_init+0x75/0x286
[   23.498260]  ? perf_event_alloc+0x346/0x7a1
[   23.499973]  perf_try_init_event+0x46/0x76
[   23.501662]  perf_event_alloc+0x46d/0x7a1
[   23.503339]  SyS_perf_event_open+0x587/0xe3c
[   23.505078]  do_int80_syscall_32+0x66/0x12c
[   23.506792]  entry_INT80_compat+0x38/0x50
[   23.508468] RIP: 0023:0x8090aa2
[   23.509962] RSP: 002b:ffd738d8 EFLAGS: 0282 ORIG_RAX: 
0150
[   23.513094] RAX: ffda RBX: 08b1e790 RCX: 
[   23.515432] RDX:  RSI:  RDI: 0009
[   23.517789] RBP: 6d656d0c R08:  R09: 
[   23.520128] R10:  R11:  R12: 
[   23.522463] R13:  R14:  R15: 
[   23.574529] trinity-main uses obsolete (PF_INET,SOCK_PACKET)
[   23.591467] kernel msg: ebtables bug: please report to author: Wrong len 
argument
[   24.396248] VFS: Warning: trinity-c0 using old stat() call. Recompile your 
binary.
[   24.812470] VFS: Warning: trinity-c1 using old stat() call. Recompile your 
binary.
[   25.016003] VFS: Warning: trinity-c2 using old stat() call. Recompile your 
binary.
[   25.298865] VFS: Warning: trinity-c0 using old stat() call. Recompile your 
binary.
[   25.641396] VFS: Warning: trinity-c2 using old stat() call. Recompile your 
binary.
[   62.647743] rcu-torture: rtc: 836b9d00 ver: 1 tfle: 0 rta: 1 rtaf: 0 
rtf: 0 rtmbe: 0 rtbe: 0 rtbke: 0 rtbre: 0 rtbf: 0 rtb: 0 nt: 1 onoff: 0/0:0/0 
-1,0:-1,0 0:0 (HZ=1000) barrier: 0/0:0 cbflood: 1
[   62.655884] rcu-torture: Reader Pipe:  2 0 0 0 0 0 0 0 0 0 0
[   62.661435] rcu-torture: Reader Batch:  0 2 0 0 0 0 0 0 0 0 0
[   62.664264] rcu-torture: Free-Block Circulation:  0 0 0 0 0 0 0 0 0 0 0
[   95.628115] trinity-c1 (3029) used greatest stack depth: 12776 bytes left
[  124.087732] rcu-torture: rtc: 836b9d00 ver: 1 tfle: 0 rta: 1 

[lkp-robot] [cpu/hotplug] 94380da276: INFO:possible_recursive_locking_detected

2017-04-16 Thread kernel test robot

FYI, we noticed the following commit:

commit: 94380da2765a391335c2326ba327e835c2e7aa03 ("cpu/hotplug: Convert hotplug 
locking to percpu rwsem")
https://git.kernel.org/cgit/linux/kernel/git/tip/tip.git WIP.hotplug

in testcase: trinity
with following parameters:

runtime: 300s

test-description: Trinity is a linux system call fuzz tester.
test-url: http://codemonkey.org.uk/projects/trinity/


on test machine: qemu-system-x86_64 -enable-kvm -cpu IvyBridge -m 420M

caused below changes (please refer to attached dmesg/kmsg for entire 
log/backtrace):


+--+++
|  | d632ccfe1c | 94380da276 |
+--+++
| boot_successes   | 0  | 4  |
| boot_failures| 0  | 4  |
| INFO:possible_recursive_locking_detected | 0  | 4  |
| calltrace:SyS_perf_event_open| 0  | 4  |
+--+++



[   23.381603] [ INFO: possible recursive locking detected ]
[   23.383344] 4.11.0-rc6-00236-g94380da #32 Not tainted
[   23.385029] -
[   23.386830] trinity-main/2266 is trying to acquire lock:
[   23.388554]  (cpu_hotplug_lock.rw_sem){.+.+.+}, at: [] 
perf_swevent_init+0x75/0x286
[   23.391640] 
[   23.391640] but task is already holding lock:
[   23.394084]  (cpu_hotplug_lock.rw_sem){.+.+.+}, at: [] 
SyS_perf_event_open+0x547/0xe3c
[   23.397273] 
[   23.397273] other info that might help us debug this:
[   23.399855]  Possible unsafe locking scenario:
[   23.399855] 
[   23.402314]CPU0
[   23.403507]
[   23.405813]   lock(cpu_hotplug_lock.rw_sem);
[   23.407449]   lock(cpu_hotplug_lock.rw_sem);
[   23.409117] 
[   23.409117]  *** DEADLOCK ***
[   23.409117] 
[   23.412453]  May be due to missing lock nesting notation
[   23.412453] 
[   23.415292] 3 locks held by trinity-main/2266:
[   23.416986]  #0:  (cpu_hotplug_lock.rw_sem){.+.+.+}, at: 
[] SyS_perf_event_open+0x547/0xe3c
[   23.433473]  #1:  (>cred_guard_mutex){+.+.+.}, at: [] 
SyS_perf_event_open+0x5b2/0xe3c
[   23.436931]  #2:  (_srcu){..}, at: [] 
perf_event_alloc+0x2fc/0x7a1
[   23.457210] 
[   23.457210] stack backtrace:
[   23.459813] CPU: 0 PID: 2266 Comm: trinity-main Not tainted 
4.11.0-rc6-00236-g94380da #32
[   23.463175] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
1.9.3-20161025_171302-gandalf 04/01/2014
[   23.481993] Call Trace:
[   23.483305]  dump_stack+0x86/0xc0
[   23.484930]  __lock_acquire+0x9a4/0x1580
[   23.486552]  ? kvm_sched_clock_read+0x9/0x12
[   23.488380]  lock_acquire+0x180/0x21b
[   23.489918]  ? lock_acquire+0x180/0x21b
[   23.491528]  ? perf_swevent_init+0x75/0x286
[   23.493332]  get_online_cpus+0x42/0x80
[   23.494906]  ? perf_swevent_init+0x75/0x286
[   23.496605]  perf_swevent_init+0x75/0x286
[   23.498260]  ? perf_event_alloc+0x346/0x7a1
[   23.499973]  perf_try_init_event+0x46/0x76
[   23.501662]  perf_event_alloc+0x46d/0x7a1
[   23.503339]  SyS_perf_event_open+0x587/0xe3c
[   23.505078]  do_int80_syscall_32+0x66/0x12c
[   23.506792]  entry_INT80_compat+0x38/0x50
[   23.508468] RIP: 0023:0x8090aa2
[   23.509962] RSP: 002b:ffd738d8 EFLAGS: 0282 ORIG_RAX: 
0150
[   23.513094] RAX: ffda RBX: 08b1e790 RCX: 
[   23.515432] RDX:  RSI:  RDI: 0009
[   23.517789] RBP: 6d656d0c R08:  R09: 
[   23.520128] R10:  R11:  R12: 
[   23.522463] R13:  R14:  R15: 
[   23.574529] trinity-main uses obsolete (PF_INET,SOCK_PACKET)
[   23.591467] kernel msg: ebtables bug: please report to author: Wrong len 
argument
[   24.396248] VFS: Warning: trinity-c0 using old stat() call. Recompile your 
binary.
[   24.812470] VFS: Warning: trinity-c1 using old stat() call. Recompile your 
binary.
[   25.016003] VFS: Warning: trinity-c2 using old stat() call. Recompile your 
binary.
[   25.298865] VFS: Warning: trinity-c0 using old stat() call. Recompile your 
binary.
[   25.641396] VFS: Warning: trinity-c2 using old stat() call. Recompile your 
binary.
[   62.647743] rcu-torture: rtc: 836b9d00 ver: 1 tfle: 0 rta: 1 rtaf: 0 
rtf: 0 rtmbe: 0 rtbe: 0 rtbke: 0 rtbre: 0 rtbf: 0 rtb: 0 nt: 1 onoff: 0/0:0/0 
-1,0:-1,0 0:0 (HZ=1000) barrier: 0/0:0 cbflood: 1
[   62.655884] rcu-torture: Reader Pipe:  2 0 0 0 0 0 0 0 0 0 0
[   62.661435] rcu-torture: Reader Batch:  0 2 0 0 0 0 0 0 0 0 0
[   62.664264] rcu-torture: Free-Block Circulation:  0 0 0 0 0 0 0 0 0 0 0
[   95.628115] trinity-c1 (3029) used greatest stack depth: 12776 bytes left
[  124.087732] rcu-torture: rtc: 836b9d00 ver: 1 tfle: 0 rta: 1 

[PATCH 2/4] ftrace: Add 'function-fork' trace option

2017-04-16 Thread Namhyung Kim
The function-fork option is the same as event-fork in that it tracks task
fork/exit and sets the pid filter properly.  This can be useful if a user
wants to trace only selected tasks, including their children.

Signed-off-by: Namhyung Kim 
---
 kernel/trace/ftrace.c | 37 +
 kernel/trace/trace.c  |  5 -
 kernel/trace/trace.h  |  6 +-
 3 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 7b27066c5ed0..eb0303f8afa0 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5587,6 +5587,43 @@ ftrace_filter_pid_sched_switch_probe(void *data, bool 
preempt,
   trace_ignore_this_task(pid_list, next));
 }
 
+static void
+ftrace_pid_follow_sched_process_fork(void *data,
+struct task_struct *self,
+struct task_struct *task)
+{
+   struct trace_pid_list *pid_list;
+   struct trace_array *tr = data;
+
+   pid_list = rcu_dereference_sched(tr->function_pids);
+   trace_filter_add_remove_task(pid_list, self, task);
+}
+
+static void
+ftrace_pid_follow_sched_process_exit(void *data, struct task_struct *task)
+{
+   struct trace_pid_list *pid_list;
+   struct trace_array *tr = data;
+
+   pid_list = rcu_dereference_sched(tr->function_pids);
+   trace_filter_add_remove_task(pid_list, NULL, task);
+}
+
+void ftrace_pid_follow_fork(struct trace_array *tr, bool enable)
+{
+   if (enable) {
+   
register_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork,
+ tr);
+   
register_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit,
+ tr);
+   } else {
+   
unregister_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork,
+   tr);
+   
unregister_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit,
+   tr);
+   }
+}
+
 static void clear_ftrace_pids(struct trace_array *tr)
 {
struct trace_pid_list *pid_list;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0bf623c182da..c008bf0f9f93 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -257,7 +257,7 @@ unsigned long long ns2usecs(u64 nsec)
 
 /* trace_flags that are default zero for instances */
 #define ZEROED_TRACE_FLAGS \
-   TRACE_ITER_EVENT_FORK
+   (TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK)
 
 /*
  * The global_trace is the descriptor that holds the top-level tracing
@@ -4205,6 +4205,9 @@ int set_tracer_flag(struct trace_array *tr, unsigned int 
mask, int enabled)
if (mask == TRACE_ITER_EVENT_FORK)
trace_event_follow_fork(tr, enabled);
 
+   if (mask == TRACE_ITER_FUNC_FORK)
+   ftrace_pid_follow_fork(tr, enabled);
+
if (mask == TRACE_ITER_OVERWRITE) {
ring_buffer_change_overwrite(tr->trace_buffer.buffer, enabled);
 #ifdef CONFIG_TRACER_MAX_TRACE
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index ee27163b7549..7dedf9cf0a6d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -898,6 +898,7 @@ void ftrace_init_tracefs_toplevel(struct trace_array *tr,
  struct dentry *d_tracer);
 int init_function_trace(void);
 void ftrace_clear_pids(struct trace_array *tr);
+void ftrace_pid_follow_fork(struct trace_array *tr, bool enable);
 #else
 static inline int ftrace_trace_task(struct trace_array *tr)
 {
@@ -918,6 +919,7 @@ static inline void ftrace_init_tracefs(struct trace_array 
*tr, struct dentry *d)
 static inline void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct 
dentry *d) { }
 static inline int init_function_trace(void) { return 0; }
 static inline void ftrace_clear_pids(struct trace_array *tr) { }
+static inline void ftrace_pid_follow_fork(struct trace_array *tr, bool enable) 
{ }
 /* ftace_func_t type is not defined, use macro instead of static inline */
 #define ftrace_init_array_ops(tr, func) do { } while (0)
 #endif /* CONFIG_FUNCTION_TRACER */
@@ -991,11 +993,13 @@ extern int trace_get_user(struct trace_parser *parser, 
const char __user *ubuf,
 
 #ifdef CONFIG_FUNCTION_TRACER
 # define FUNCTION_FLAGS\
-   C(FUNCTION, "function-trace"),
+   C(FUNCTION, "function-trace"),  \
+   C(FUNC_FORK,"function-fork"),
 # define FUNCTION_DEFAULT_FLAGSTRACE_ITER_FUNCTION
 #else
 # define FUNCTION_FLAGS
 # define FUNCTION_DEFAULT_FLAGS0UL
+# define TRACE_ITER_FUNC_FORK  0UL
 #endif
 
 #ifdef CONFIG_STACKTRACE
-- 
2.12.2



[PATCH 2/4] ftrace: Add 'function-fork' trace option

2017-04-16 Thread Namhyung Kim
The function-fork option is the same as event-fork in that it tracks task
fork/exit and sets the pid filter properly.  This can be useful if a user
wants to trace selected tasks including their children only.

Signed-off-by: Namhyung Kim 
---
 kernel/trace/ftrace.c | 37 +
 kernel/trace/trace.c  |  5 -
 kernel/trace/trace.h  |  6 +-
 3 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 7b27066c5ed0..eb0303f8afa0 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5587,6 +5587,43 @@ ftrace_filter_pid_sched_switch_probe(void *data, bool 
preempt,
   trace_ignore_this_task(pid_list, next));
 }
 
+static void
+ftrace_pid_follow_sched_process_fork(void *data,
+struct task_struct *self,
+struct task_struct *task)
+{
+   struct trace_pid_list *pid_list;
+   struct trace_array *tr = data;
+
+   pid_list = rcu_dereference_sched(tr->function_pids);
+   trace_filter_add_remove_task(pid_list, self, task);
+}
+
+static void
+ftrace_pid_follow_sched_process_exit(void *data, struct task_struct *task)
+{
+   struct trace_pid_list *pid_list;
+   struct trace_array *tr = data;
+
+   pid_list = rcu_dereference_sched(tr->function_pids);
+   trace_filter_add_remove_task(pid_list, NULL, task);
+}
+
+void ftrace_pid_follow_fork(struct trace_array *tr, bool enable)
+{
+   if (enable) {
+   
register_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork,
+ tr);
+   
register_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit,
+ tr);
+   } else {
+   
unregister_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork,
+   tr);
+   
unregister_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit,
+   tr);
+   }
+}
+
 static void clear_ftrace_pids(struct trace_array *tr)
 {
struct trace_pid_list *pid_list;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0bf623c182da..c008bf0f9f93 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -257,7 +257,7 @@ unsigned long long ns2usecs(u64 nsec)
 
 /* trace_flags that are default zero for instances */
 #define ZEROED_TRACE_FLAGS \
-   TRACE_ITER_EVENT_FORK
+   (TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK)
 
 /*
  * The global_trace is the descriptor that holds the top-level tracing
@@ -4205,6 +4205,9 @@ int set_tracer_flag(struct trace_array *tr, unsigned int 
mask, int enabled)
if (mask == TRACE_ITER_EVENT_FORK)
trace_event_follow_fork(tr, enabled);
 
+   if (mask == TRACE_ITER_FUNC_FORK)
+   ftrace_pid_follow_fork(tr, enabled);
+
if (mask == TRACE_ITER_OVERWRITE) {
ring_buffer_change_overwrite(tr->trace_buffer.buffer, enabled);
 #ifdef CONFIG_TRACER_MAX_TRACE
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index ee27163b7549..7dedf9cf0a6d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -898,6 +898,7 @@ void ftrace_init_tracefs_toplevel(struct trace_array *tr,
  struct dentry *d_tracer);
 int init_function_trace(void);
 void ftrace_clear_pids(struct trace_array *tr);
+void ftrace_pid_follow_fork(struct trace_array *tr, bool enable);
 #else
 static inline int ftrace_trace_task(struct trace_array *tr)
 {
@@ -918,6 +919,7 @@ static inline void ftrace_init_tracefs(struct trace_array 
*tr, struct dentry *d)
 static inline void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct 
dentry *d) { }
 static inline int init_function_trace(void) { return 0; }
 static inline void ftrace_clear_pids(struct trace_array *tr) { }
+static inline void ftrace_pid_follow_fork(struct trace_array *tr, bool enable) 
{ }
 /* ftace_func_t type is not defined, use macro instead of static inline */
 #define ftrace_init_array_ops(tr, func) do { } while (0)
 #endif /* CONFIG_FUNCTION_TRACER */
@@ -991,11 +993,13 @@ extern int trace_get_user(struct trace_parser *parser, 
const char __user *ubuf,
 
 #ifdef CONFIG_FUNCTION_TRACER
 # define FUNCTION_FLAGS\
-   C(FUNCTION, "function-trace"),
+   C(FUNCTION, "function-trace"),  \
+   C(FUNC_FORK,"function-fork"),
 # define FUNCTION_DEFAULT_FLAGSTRACE_ITER_FUNCTION
 #else
 # define FUNCTION_FLAGS
 # define FUNCTION_DEFAULT_FLAGS0UL
+# define TRACE_ITER_FUNC_FORK  0UL
 #endif
 
 #ifdef CONFIG_STACKTRACE
-- 
2.12.2



[PATCH 0/4] ftrace: Add 'function-fork' trace option (v2)

2017-04-16 Thread Namhyung Kim
Hello,

This patchset adds the 'function-fork' option to the function tracer which
makes the pid filter inherited like 'event-fork' does.  During the
test, I found a bug in the pid filter on an instance directory.  The patch
1 fixes it and maybe it should go to the stable tree.

The function-fork option is disabled by default as event-fork is,
but we might consider changing the default since it seems to be more
natural from a user's perspective IMHO.

v2 changes)
 * use ftrace_clear_pids()  (Steve)
 * add Ack from Masami
 
The code is also available at 'ftrace/function-fork-v2' branch on

  git://git.kernel.org/pub/scm/linux/kernel/git/namhyung/linux-perf.git

Thanks,
Namhyung


Namhyung Kim (4):
  ftrace: Fix function pid filter on instances
  ftrace: Add 'function-fork' trace option
  selftests: ftrace: Add -l/--logdir option
  selftests: ftrace: Add a testcase for function PID filter

 kernel/trace/ftrace.c  | 46 ++
 kernel/trace/trace.c   |  6 +-
 kernel/trace/trace.h   |  8 +-
 tools/testing/selftests/ftrace/ftracetest  |  5 ++
 .../ftrace/test.d/ftrace/func-filter-pid.tc| 98 ++
 5 files changed, 161 insertions(+), 2 deletions(-)
 create mode 100644 
tools/testing/selftests/ftrace/test.d/ftrace/func-filter-pid.tc

-- 
2.12.2



[PATCH 4/4] selftests: ftrace: Add a testcase for function PID filter

2017-04-16 Thread Namhyung Kim
Like the event pid filtering test, add a function pid filtering test with the
new "function-fork" option.  It also tests it on an instance directory
so that it can verify the bug related to pid filtering on instances.

Cc: Masami Hiramatsu 
Cc: Steven Rostedt 
Cc: Shuah Khan 
Signed-off-by: Namhyung Kim 
---
 .../ftrace/test.d/ftrace/func-filter-pid.tc| 98 ++
 1 file changed, 98 insertions(+)
 create mode 100644 
tools/testing/selftests/ftrace/test.d/ftrace/func-filter-pid.tc

diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-pid.tc 
b/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-pid.tc
new file mode 100644
index ..cd552f44c3b4
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-pid.tc
@@ -0,0 +1,98 @@
+#!/bin/sh
+# description: ftrace - function pid filters
+
+# Make sure that function pid matching filter works.
+# Also test it on an instance directory
+
+if ! grep -q function available_tracers; then
+echo "no function tracer configured"
+exit_unsupported
+fi
+
+if [ ! -f set_ftrace_pid ]; then
+echo "set_ftrace_pid not found? Is function tracer not set?"
+exit_unsupported
+fi
+
+if [ ! -f set_ftrace_filter ]; then
+echo "set_ftrace_filter not found? Is function tracer not set?"
+exit_unsupported
+fi
+
+read PID _ < /proc/self/stat
+
+# default value of function-fork option
+orig_value=`grep function-fork trace_options`
+
+do_reset() {
+reset_tracer
+clear_trace
+enable_tracing
+echo > set_ftrace_filter
+echo > set_ftrace_pid
+
+echo $orig_value > trace_options
+}
+
+fail() { # msg
+do_reset
+echo $1
+exit $FAIL
+}
+
+yield() {
+ping localhost -c 1 || sleep .001 || usleep 1 || sleep 1
+}
+
+do_test() {
+disable_tracing
+
+echo do_execve* > set_ftrace_filter
+echo *do_fork >> set_ftrace_filter
+
+echo $PID > set_ftrace_pid
+echo function > current_tracer
+
+# don't allow children to be traced
+echo nofunction-fork > trace_options
+
+enable_tracing
+yield
+
+count_pid=`cat trace | grep -v ^# | grep $PID | wc -l`
+count_other=`cat trace | grep -v ^# | grep -v $PID | wc -l`
+
+# count_other should be 0
+if [ $count_pid -eq 0 -o $count_other -ne 0 ]; then
+   fail "PID filtering not working?"
+fi
+
+disable_tracing
+clear_trace
+
+# allow children to be traced
+echo function-fork > trace_options
+
+enable_tracing
+yield
+
+count_pid=`cat trace | grep -v ^# | grep $PID | wc -l`
+count_other=`cat trace | grep -v ^# | grep -v $PID | wc -l`
+
+# count_other should NOT be 0
+if [ $count_pid -eq 0 -o $count_other -eq 0 ]; then
+   fail "PID filtering not following fork?"
+fi
+}
+
+do_test
+
+mkdir instances/foo
+cd instances/foo
+do_test
+cd ../../
+rmdir instances/foo
+
+do_reset
+
+exit 0
-- 
2.12.2



[PATCH 0/4] ftrace: Add 'function-fork' trace option (v2)

2017-04-16 Thread Namhyung Kim
Hello,

This patchset adds the 'function-fork' option to the function tracer which
makes the pid filter inherited like 'event-fork' does.  During the
test, I found a bug in the pid filter on an instance directory.  The patch
1 fixes it and maybe it should go to the stable tree.

The function-fork option is disabled by default as event-fork is,
but we might consider changing the default since it seems to be more
natural from a user's perspective IMHO.

v2 changes)
 * use ftrace_clear_pids()  (Steve)
 * add Ack from Masami
 
The code is also available at 'ftrace/function-fork-v2' branch on

  git://git.kernel.org/pub/scm/linux/kernel/git/namhyung/linux-perf.git

Thanks,
Namhyung


Namhyung Kim (4):
  ftrace: Fix function pid filter on instances
  ftrace: Add 'function-fork' trace option
  selftests: ftrace: Add -l/--logdir option
  selftests: ftrace: Add a testcase for function PID filter

 kernel/trace/ftrace.c  | 46 ++
 kernel/trace/trace.c   |  6 +-
 kernel/trace/trace.h   |  8 +-
 tools/testing/selftests/ftrace/ftracetest  |  5 ++
 .../ftrace/test.d/ftrace/func-filter-pid.tc| 98 ++
 5 files changed, 161 insertions(+), 2 deletions(-)
 create mode 100644 
tools/testing/selftests/ftrace/test.d/ftrace/func-filter-pid.tc

-- 
2.12.2



[PATCH 4/4] selftests: ftrace: Add a testcase for function PID filter

2017-04-16 Thread Namhyung Kim
Like the event pid filtering test, add a function pid filtering test with the
new "function-fork" option.  It also tests it on an instance directory
so that it can verify the bug related to pid filtering on instances.

Cc: Masami Hiramatsu 
Cc: Steven Rostedt 
Cc: Shuah Khan 
Signed-off-by: Namhyung Kim 
---
 .../ftrace/test.d/ftrace/func-filter-pid.tc| 98 ++
 1 file changed, 98 insertions(+)
 create mode 100644 
tools/testing/selftests/ftrace/test.d/ftrace/func-filter-pid.tc

diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-pid.tc 
b/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-pid.tc
new file mode 100644
index ..cd552f44c3b4
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-pid.tc
@@ -0,0 +1,98 @@
+#!/bin/sh
+# description: ftrace - function pid filters
+
+# Make sure that function pid matching filter works.
+# Also test it on an instance directory
+
+if ! grep -q function available_tracers; then
+echo "no function tracer configured"
+exit_unsupported
+fi
+
+if [ ! -f set_ftrace_pid ]; then
+echo "set_ftrace_pid not found? Is function tracer not set?"
+exit_unsupported
+fi
+
+if [ ! -f set_ftrace_filter ]; then
+echo "set_ftrace_filter not found? Is function tracer not set?"
+exit_unsupported
+fi
+
+read PID _ < /proc/self/stat
+
+# default value of function-fork option
+orig_value=`grep function-fork trace_options`
+
+do_reset() {
+reset_tracer
+clear_trace
+enable_tracing
+echo > set_ftrace_filter
+echo > set_ftrace_pid
+
+echo $orig_value > trace_options
+}
+
+fail() { # msg
+do_reset
+echo $1
+exit $FAIL
+}
+
+yield() {
+ping localhost -c 1 || sleep .001 || usleep 1 || sleep 1
+}
+
+do_test() {
+disable_tracing
+
+echo do_execve* > set_ftrace_filter
+echo *do_fork >> set_ftrace_filter
+
+echo $PID > set_ftrace_pid
+echo function > current_tracer
+
+# don't allow children to be traced
+echo nofunction-fork > trace_options
+
+enable_tracing
+yield
+
+count_pid=`cat trace | grep -v ^# | grep $PID | wc -l`
+count_other=`cat trace | grep -v ^# | grep -v $PID | wc -l`
+
+# count_other should be 0
+if [ $count_pid -eq 0 -o $count_other -ne 0 ]; then
+   fail "PID filtering not working?"
+fi
+
+disable_tracing
+clear_trace
+
+# allow children to be traced
+echo function-fork > trace_options
+
+enable_tracing
+yield
+
+count_pid=`cat trace | grep -v ^# | grep $PID | wc -l`
+count_other=`cat trace | grep -v ^# | grep -v $PID | wc -l`
+
+# count_other should NOT be 0
+if [ $count_pid -eq 0 -o $count_other -eq 0 ]; then
+   fail "PID filtering not following fork?"
+fi
+}
+
+do_test
+
+mkdir instances/foo
+cd instances/foo
+do_test
+cd ../../
+rmdir instances/foo
+
+do_reset
+
+exit 0
-- 
2.12.2



[PATCH 1/4] ftrace: Fix function pid filter on instances

2017-04-16 Thread Namhyung Kim
When function tracer has a pid filter, it adds a probe to sched_switch
to track if current task can be ignored.  The probe checks the
ftrace_ignore_pid from current tr to filter tasks.  But it misses to
delete the probe when removing an instance so that it can cause a crash
due to the invalid tr pointer (use-after-free).

This is easily reproducible with the following:

  # cd /sys/kernel/debug/tracing
  # mkdir instances/buggy
  # echo $$ > instances/buggy/set_ftrace_pid
  # rmdir instances/buggy

  
  BUG: KASAN: use-after-free in ftrace_filter_pid_sched_switch_probe+0x3d/0x90
  Read of size 8 by task kworker/0:1/17
  CPU: 0 PID: 17 Comm: kworker/0:1 Tainted: GB   4.11.0-rc3  #198
  Call Trace:
   dump_stack+0x68/0x9f
   kasan_object_err+0x21/0x70
   kasan_report.part.1+0x22b/0x500
   ? ftrace_filter_pid_sched_switch_probe+0x3d/0x90
   kasan_report+0x25/0x30
   __asan_load8+0x5e/0x70
   ftrace_filter_pid_sched_switch_probe+0x3d/0x90
   ? fpid_start+0x130/0x130
   __schedule+0x571/0xce0
   ...

To fix it, use ftrace_pid_reset() to unregister the probe.  As
instance_rmdir() already updated ftrace codes, it can just free the
filter safely.

Fixes: 0c8916c34203 ("tracing: Add rmdir to remove multibuffer instances")
Signed-off-by: Namhyung Kim 
---
 kernel/trace/ftrace.c | 9 +
 kernel/trace/trace.c  | 1 +
 kernel/trace/trace.h  | 2 ++
 3 files changed, 12 insertions(+)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 34f63e78d661..7b27066c5ed0 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5610,6 +5610,15 @@ static void clear_ftrace_pids(struct trace_array *tr)
trace_free_pid_list(pid_list);
 }
 
+void ftrace_clear_pids(struct trace_array *tr)
+{
+   mutex_lock(_lock);
+
+   clear_ftrace_pids(tr);
+
+   mutex_unlock(_lock);
+}
+
 static void ftrace_pid_reset(struct trace_array *tr)
 {
mutex_lock(_lock);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b5d4b80f2d45..0bf623c182da 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7485,6 +7485,7 @@ static int instance_rmdir(const char *name)
 
tracing_set_nop(tr);
event_trace_del_tracer(tr);
+   ftrace_clear_pids(tr);
ftrace_destroy_function_files(tr);
tracefs_remove_recursive(tr->dir);
free_trace_buffers(tr);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 571acee52a32..ee27163b7549 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -897,6 +897,7 @@ void ftrace_init_tracefs(struct trace_array *tr, struct 
dentry *d_tracer);
 void ftrace_init_tracefs_toplevel(struct trace_array *tr,
  struct dentry *d_tracer);
 int init_function_trace(void);
+void ftrace_clear_pids(struct trace_array *tr);
 #else
 static inline int ftrace_trace_task(struct trace_array *tr)
 {
@@ -916,6 +917,7 @@ static inline void ftrace_reset_array_ops(struct 
trace_array *tr) { }
 static inline void ftrace_init_tracefs(struct trace_array *tr, struct dentry 
*d) { }
 static inline void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct 
dentry *d) { }
 static inline int init_function_trace(void) { return 0; }
+static inline void ftrace_clear_pids(struct trace_array *tr) { }
 /* ftace_func_t type is not defined, use macro instead of static inline */
 #define ftrace_init_array_ops(tr, func) do { } while (0)
 #endif /* CONFIG_FUNCTION_TRACER */
-- 
2.12.2



[PATCH 1/4] ftrace: Fix function pid filter on instances

2017-04-16 Thread Namhyung Kim
When function tracer has a pid filter, it adds a probe to sched_switch
to track if current task can be ignored.  The probe checks the
ftrace_ignore_pid from current tr to filter tasks.  But it misses to
delete the probe when removing an instance so that it can cause a crash
due to the invalid tr pointer (use-after-free).

This is easily reproducible with the following:

  # cd /sys/kernel/debug/tracing
  # mkdir instances/buggy
  # echo $$ > instances/buggy/set_ftrace_pid
  # rmdir instances/buggy

  
  BUG: KASAN: use-after-free in ftrace_filter_pid_sched_switch_probe+0x3d/0x90
  Read of size 8 by task kworker/0:1/17
  CPU: 0 PID: 17 Comm: kworker/0:1 Tainted: GB   4.11.0-rc3  #198
  Call Trace:
   dump_stack+0x68/0x9f
   kasan_object_err+0x21/0x70
   kasan_report.part.1+0x22b/0x500
   ? ftrace_filter_pid_sched_switch_probe+0x3d/0x90
   kasan_report+0x25/0x30
   __asan_load8+0x5e/0x70
   ftrace_filter_pid_sched_switch_probe+0x3d/0x90
   ? fpid_start+0x130/0x130
   __schedule+0x571/0xce0
   ...

To fix it, use ftrace_pid_reset() to unregister the probe.  As
instance_rmdir() already updated ftrace codes, it can just free the
filter safely.

Fixes: 0c8916c34203 ("tracing: Add rmdir to remove multibuffer instances")
Signed-off-by: Namhyung Kim 
---
 kernel/trace/ftrace.c | 9 +
 kernel/trace/trace.c  | 1 +
 kernel/trace/trace.h  | 2 ++
 3 files changed, 12 insertions(+)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 34f63e78d661..7b27066c5ed0 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5610,6 +5610,15 @@ static void clear_ftrace_pids(struct trace_array *tr)
trace_free_pid_list(pid_list);
 }
 
+void ftrace_clear_pids(struct trace_array *tr)
+{
+   mutex_lock(_lock);
+
+   clear_ftrace_pids(tr);
+
+   mutex_unlock(_lock);
+}
+
 static void ftrace_pid_reset(struct trace_array *tr)
 {
mutex_lock(_lock);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b5d4b80f2d45..0bf623c182da 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7485,6 +7485,7 @@ static int instance_rmdir(const char *name)
 
tracing_set_nop(tr);
event_trace_del_tracer(tr);
+   ftrace_clear_pids(tr);
ftrace_destroy_function_files(tr);
tracefs_remove_recursive(tr->dir);
free_trace_buffers(tr);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 571acee52a32..ee27163b7549 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -897,6 +897,7 @@ void ftrace_init_tracefs(struct trace_array *tr, struct 
dentry *d_tracer);
 void ftrace_init_tracefs_toplevel(struct trace_array *tr,
  struct dentry *d_tracer);
 int init_function_trace(void);
+void ftrace_clear_pids(struct trace_array *tr);
 #else
 static inline int ftrace_trace_task(struct trace_array *tr)
 {
@@ -916,6 +917,7 @@ static inline void ftrace_reset_array_ops(struct 
trace_array *tr) { }
 static inline void ftrace_init_tracefs(struct trace_array *tr, struct dentry 
*d) { }
 static inline void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct 
dentry *d) { }
 static inline int init_function_trace(void) { return 0; }
+static inline void ftrace_clear_pids(struct trace_array *tr) { }
 /* ftace_func_t type is not defined, use macro instead of static inline */
 #define ftrace_init_array_ops(tr, func) do { } while (0)
 #endif /* CONFIG_FUNCTION_TRACER */
-- 
2.12.2



[PATCH 3/4] selftests: ftrace: Add -l/--logdir option

2017-04-16 Thread Namhyung Kim
In my virtual machine setup, running ftracetest failed on creating
LOG_DIR on a read-only filesystem.  It'd be convenient to provide an
option to specify a different directory as log directory.

Acked-by: Masami Hiramatsu 
Cc: Steven Rostedt 
Cc: Shuah Khan 
Signed-off-by: Namhyung Kim 
---
 tools/testing/selftests/ftrace/ftracetest | 5 +
 1 file changed, 5 insertions(+)

diff --git a/tools/testing/selftests/ftrace/ftracetest 
b/tools/testing/selftests/ftrace/ftracetest
index 52e3c4df28d6..a8631d978725 100755
--- a/tools/testing/selftests/ftrace/ftracetest
+++ b/tools/testing/selftests/ftrace/ftracetest
@@ -16,6 +16,7 @@ echo "-k|--keep  Keep passed test logs"
 echo " -v|--verbose Increase verbosity of test messages"
 echo " -vvAlias of -v -v (Show all results in stdout)"
 echo " -d|--debug Debug mode (trace all shell commands)"
+echo " -l|--logdir  Save logs on the "
 exit $1
 }
 
@@ -64,6 +65,10 @@ parse_opts() { # opts
   DEBUG=1
   shift 1
 ;;
+--logdir|-l)
+  LOG_DIR=$2
+  shift 2
+;;
 *.tc)
   if [ -f "$1" ]; then
 OPT_TEST_CASES="$OPT_TEST_CASES `abspath $1`"
-- 
2.12.2



[PATCH 3/4] selftests: ftrace: Add -l/--logdir option

2017-04-16 Thread Namhyung Kim
In my virtual machine setup, running ftracetest failed on creating
LOG_DIR on a read-only filesystem.  It'd be convenient to provide an
option to specify a different directory as log directory.

Acked-by: Masami Hiramatsu 
Cc: Steven Rostedt 
Cc: Shuah Khan 
Signed-off-by: Namhyung Kim 
---
 tools/testing/selftests/ftrace/ftracetest | 5 +
 1 file changed, 5 insertions(+)

diff --git a/tools/testing/selftests/ftrace/ftracetest 
b/tools/testing/selftests/ftrace/ftracetest
index 52e3c4df28d6..a8631d978725 100755
--- a/tools/testing/selftests/ftrace/ftracetest
+++ b/tools/testing/selftests/ftrace/ftracetest
@@ -16,6 +16,7 @@ echo "-k|--keep  Keep passed test logs"
 echo " -v|--verbose Increase verbosity of test messages"
 echo " -vvAlias of -v -v (Show all results in stdout)"
 echo " -d|--debug Debug mode (trace all shell commands)"
+echo " -l|--logdir  Save logs on the "
 exit $1
 }
 
@@ -64,6 +65,10 @@ parse_opts() { # opts
   DEBUG=1
   shift 1
 ;;
+--logdir|-l)
+  LOG_DIR=$2
+  shift 2
+;;
 *.tc)
   if [ -f "$1" ]; then
 OPT_TEST_CASES="$OPT_TEST_CASES `abspath $1`"
-- 
2.12.2



Re: [PATCH v2 2/4] zram: implement deduplication in zram

2017-04-16 Thread Joonsoo Kim
On Mon, Apr 17, 2017 at 10:38:16AM +0900, Minchan Kim wrote:
> Hi Joonsoo,
> 
> I reviewed this patch and overall, looks great! Thanks.

Thanks!

> However, as you know, recently, zram had lots of clean up so
> this patchset should be rebased on it massively.
> Sorry for the inconvenience.

No problem.

> 
> And there are some minor issues below. I hope you handle them in
> the next submission, please.

Okay.

> On Thu, Mar 30, 2017 at 02:38:07PM +0900, js1...@gmail.com wrote:
> > From: Joonsoo Kim 
> > 
> > This patch implements deduplication feature in zram. The purpose
> > of this work is naturally to save amount of memory usage by zram.
> > 
> > Android is one of the biggest users to use zram as swap and it's
> > really important to save amount of memory usage. There is a paper
> > that reports that duplication ratio of Android's memory content is
> > rather high [1]. And, there is a similar work on zswap that also
> > reports that experiments have shown that around 10-15% of pages
> > stored in zswap are duplicates and deduplicating them provides some
> > benefits [2].
> > 
> > Also, there is a different kind of workload that uses zram as blockdev
> > and store build outputs into it to reduce wear-out problem of real
> > blockdev. In this workload, deduplication hit is very high due to
> > temporary files and intermediate object files. Detailed analysis is
> > on the bottom of this description.
> > 
> > Anyway, if we can detect duplicated content and avoid to store duplicated
> > content at different memory space, we can save memory. This patch
> > tries to do that.
> > 
> > Implementation is almost simple and intuitive but I should note
> > one thing about implementation detail.
> > 
> > To check duplication, this patch uses checksum of the page and
> > collision of this checksum could be possible. There would be
> > many choices to handle this situation but this patch chooses
> > to allow entry with duplicated checksum to be added to the hash,
> > but, not to compare all entries with duplicated checksum
> > when checking duplication. I guess that checksum collision is quite
> > rare event and we don't need to pay any attention to such a case.
> > Therefore, I decided on the simplest way to implement the feature.
> > If there is a different opinion, I can accept and go that way.
> > 
> > Following is the result of this patch.
> > 
> > Test result #1 (Swap):
> > Android Marshmallow, emulator, x86_64, Backporting to kernel v3.18
> > 
> > orig_data_size: 145297408
> > compr_data_size: 32408125
> > mem_used_total: 32276480
> > dup_data_size: 3188134
> > meta_data_size: 1444272
> > 
> > Last two metrics added to mm_stat are related to this work.
> > First one, dup_data_size, is amount of saved memory by avoiding
> > to store duplicated page. Later one, meta_data_size, is the amount of
> > data structure to support deduplication. If dup > meta, we can judge
> > that the patch improves memory usage.
> > 
> > In Android, we can save 5% of memory usage by this work.
> > 
> > Test result #2 (Blockdev):
> > build the kernel and store output to ext4 FS on zram
> > 
> > 
> > Elapsed time: 249 s
> > mm_stat: 430845952 191014886 196898816 0 196898816 28320 0 0 0
> > 
> > 
> > Elapsed time: 250 s
> > mm_stat: 430505984 190971334 148365312 0 148365312 28404 0 47287038  3945792
> > 
> > There is no performance degradation and it saves 23% memory.
> > 
> > Test result #3 (Blockdev):
> > copy android build output dir(out/host) to ext4 FS on zram
> > 
> > 
> > Elapsed time: out/host: 88 s
> > mm_stat: 8834420736 3658184579 3834208256 0 3834208256 32889 0 0 0
> > 
> > 
> > Elapsed time: out/host: 100 s
> > mm_stat: 8832929792 3657329322 2832015360 0 2832015360 32609 0 952568877 
> > 80880336
> > 
> > It shows performance degradation roughly 13% and save 24% memory. Maybe,
> > it is due to overhead of calculating checksum and comparison.
> > 
> > Test result #4 (Blockdev):
> > copy android build output dir(out/target/common) to ext4 FS on zram
> > 
> > 
> > Elapsed time: out/host: 203 s
> > mm_stat: 4041678848 2310355010 2346577920 0 2346582016 500 4 0 0
> > 
> > 
> > Elapsed time: out/host: 201 s
> > mm_stat: 4041666560 2310488276 1338150912 0 1338150912 476 0 989088794 
> > 24564336
> > 
> > Memory is saved by 42% and performance is the same. Even if there is 
> > overhead
> > of calculating checksum and comparison, large hit ratio compensate it since
> > hit leads to less compression attempt.
> > 
> > I checked the detailed reason of savings on kernel build workload and
> > there are some cases that deduplication happens.
> > 
> > 1) *.cmd
> > Build command is usually similar in one directory so content of these file
> > are very similar. In my system, more than 789 lines in fs/ext4/.namei.o.cmd
> > and fs/ext4/.inode.o.cmd are the same in 944 and 938 lines of the file,
> > respectively.
> > 
> > 2) intermediate object files
> > built-in.o and temporary object file have the similar contents. More than
> > 50% of 

Re: [PATCH v2 2/4] zram: implement deduplication in zram

2017-04-16 Thread Joonsoo Kim
On Mon, Apr 17, 2017 at 10:38:16AM +0900, Minchan Kim wrote:
> Hi Joonsoo,
> 
> I reviewed this patch and overall, looks great! Thanks.

Thanks!

> However, as you know, recently, zram had lots of clean up so
> this patchset should be rebased on it massively.
> Sorry for the inconvenience.

No problem.

> 
> And there are some minor issues below. I hope you handle them in
> the next submission, please.

Okay.

> On Thu, Mar 30, 2017 at 02:38:07PM +0900, js1...@gmail.com wrote:
> > From: Joonsoo Kim 
> > 
> > This patch implements deduplication feature in zram. The purpose
> > of this work is naturally to save amount of memory usage by zram.
> > 
> > Android is one of the biggest users to use zram as swap and it's
> > really important to save amount of memory usage. There is a paper
> > that reports that duplication ratio of Android's memory content is
> > rather high [1]. And, there is a similar work on zswap that also
> > reports that experiments have shown that around 10-15% of pages
> > stored in zswap are duplicates and deduplicating them provides some
> > benefits [2].
> > 
> > Also, there is a different kind of workload that uses zram as blockdev
> > and store build outputs into it to reduce wear-out problem of real
> > blockdev. In this workload, deduplication hit is very high due to
> > temporary files and intermediate object files. Detailed analysis is
> > on the bottom of this description.
> > 
> > Anyway, if we can detect duplicated content and avoid to store duplicated
> > content at different memory space, we can save memory. This patch
> > tries to do that.
> > 
> > Implementation is almost simple and intuitive but I should note
> > one thing about implementation detail.
> > 
> > To check duplication, this patch uses checksum of the page and
> > collision of this checksum could be possible. There would be
> > many choices to handle this situation but this patch chooses
> > to allow entry with duplicated checksum to be added to the hash,
> > but, not to compare all entries with duplicated checksum
> > when checking duplication. I guess that checksum collision is quite
> > rare event and we don't need to pay any attention to such a case.
> > Therefore, I decided the most simplest way to implement the feature.
> > If there is a different opinion, I can accept and go that way.
> > 
> > Following is the result of this patch.
> > 
> > Test result #1 (Swap):
> > Android Marshmallow, emulator, x86_64, Backporting to kernel v3.18
> > 
> > orig_data_size: 145297408
> > compr_data_size: 32408125
> > mem_used_total: 32276480
> > dup_data_size: 3188134
> > meta_data_size: 1444272
> > 
> > Last two metrics added to mm_stat are related to this work.
> > First one, dup_data_size, is amount of saved memory by avoiding
> > to store duplicated page. Later one, meta_data_size, is the amount of
> > data structure to support deduplication. If dup > meta, we can judge
> > that the patch improves memory usage.
> > 
> > In Adnroid, we can save 5% of memory usage by this work.
> > 
> > Test result #2 (Blockdev):
> > build the kernel and store output to ext4 FS on zram
> > 
> > 
> > Elapsed time: 249 s
> > mm_stat: 430845952 191014886 196898816 0 196898816 28320 0 0 0
> > 
> > 
> > Elapsed time: 250 s
> > mm_stat: 430505984 190971334 148365312 0 148365312 28404 0 47287038  3945792
> > 
> > There is no performance degradation and save 23% memory.
> > 
> > Test result #3 (Blockdev):
> > copy android build output dir(out/host) to ext4 FS on zram
> > 
> > 
> > Elapsed time: out/host: 88 s
> > mm_stat: 8834420736 3658184579 3834208256 0 3834208256 32889 0 0 0
> > 
> > 
> > Elapsed time: out/host: 100 s
> > mm_stat: 8832929792 3657329322 2832015360 0 2832015360 32609 0 952568877 
> > 80880336
> > 
> > It shows performance degradation roughly 13% and save 24% memory. Maybe,
> > it is due to overhead of calculating checksum and comparison.
> > 
> > Test result #4 (Blockdev):
> > copy android build output dir(out/target/common) to ext4 FS on zram
> > 
> > 
> > Elapsed time: out/host: 203 s
> > mm_stat: 4041678848 2310355010 2346577920 0 2346582016 500 4 0 0
> > 
> > 
> > Elapsed time: out/host: 201 s
> > mm_stat: 4041666560 2310488276 1338150912 0 1338150912 476 0 989088794 
> > 24564336
> > 
> > Memory is saved by 42% and performance is the same. Even if there is 
> > overhead
> > of calculating checksum and comparison, large hit ratio compensate it since
> > hit leads to less compression attempt.
> > 
> > I checked the detailed reason of savings on kernel build workload and
> > there are some cases that deduplication happens.
> > 
> > 1) *.cmd
> > Build command is usually similar in one directory so content of these file
> > are very similar. In my system, more than 789 lines in fs/ext4/.namei.o.cmd
> > and fs/ext4/.inode.o.cmd are the same in 944 and 938 lines of the file,
> > respectively.
> > 
> > 2) intermediate object files
> > built-in.o and temporary object file have the similar contents. More than
> > 50% of fs/ext4/ext4.o is the 

Re: [PATCH 1/3] zram: fix operator precedence to get offset

2017-04-16 Thread Minchan Kim
Hi Sergey,

On Mon, Apr 17, 2017 at 10:54:29AM +0900, Sergey Senozhatsky wrote:
> On (04/17/17 10:21), Sergey Senozhatsky wrote:
> > > However, it should be *fixed* to prevent confusion in future
> 
> or may be something like below? can save us some cycles.
> 
> remove this calculation
> 
> -   offset = sector & (SECTORS_PER_PAGE - 1) << SECTOR_SHIFT;
> 
> 
> and pass 0 to zram_bvec_rw()
> 
> -   err = zram_bvec_rw(zram, , index, offset, is_write);
> +   err = zram_bvec_rw(zram, , index, 0, is_write);

That was one I wrote but have thought it more.

Because I suspect fs can submit page-size IO in non-aligned PAGE_SIZE
sector? For example, it can submit PAGE_SIZE read request from 9 sector.
Is it possible? I don't know.

As well, FS can format zram from sector 1, not sector 0? IOW, can't it
use starting sector as non-page aligned sector?
We can do it via fdisk?

Anyway, If one of scenario I mentioned is possible, zram_rw_page will
be broken.

If it's hard to check all of scenario in this moment, it would be
better to not remove it and then add WARN_ON(offset) in there.

While I am writing this, I found this.

/**
 * bdev_read_page() - Start reading a page from a block device
 * @bdev: The device to read the page from
 * @sector: The offset on the device to read the page to (need not be aligned)
 * @page: The page to read
 *

Hmm, need investigation but no time.


Re: [PATCH 1/3] zram: fix operator precedence to get offset

2017-04-16 Thread Minchan Kim
Hi Sergey,

On Mon, Apr 17, 2017 at 10:54:29AM +0900, Sergey Senozhatsky wrote:
> On (04/17/17 10:21), Sergey Senozhatsky wrote:
> > > However, it should be *fixed* to prevent confusion in future
> 
> or may be something like below? can save us some cycles.
> 
> remove this calculation
> 
> -   offset = sector & (SECTORS_PER_PAGE - 1) << SECTOR_SHIFT;
> 
> 
> and pass 0 to zram_bvec_rw()
> 
> -   err = zram_bvec_rw(zram, , index, offset, is_write);
> +   err = zram_bvec_rw(zram, , index, 0, is_write);

That was one I wrote but have thought it more.

Because I suspect fs can submit page-size IO in non-aligned PAGE_SIZE
sector? For example, it can submit PAGE_SIZE read request from 9 sector.
Is it possible? I don't know.

As well, FS can format zram from sector 1, not sector 0? IOW, can't it
use starting sector as non-page aligned sector?
We can do it via fdisk?

Anyway, If one of scenario I mentioned is possible, zram_rw_page will
be broken.

If it's hard to check all of scenario in this moment, it would be
better to not remove it and then add WARN_ON(offset) in there.

While I am writing this, I found this.

/**
 * bdev_read_page() - Start reading a page from a block device
 * @bdev: The device to read the page from
 * @sector: The offset on the device to read the page to (need not be aligned)
 * @page: The page to read
 *

Hmm, need investigation but no time.


Re: [PATCH V2] PM / OPP: Use - instead of @ for DT entries

2017-04-16 Thread Masahiro Yamada
2017-04-15 7:47 GMT+09:00 Rafael J. Wysocki :
> On Monday, April 10, 2017 02:51:35 PM Viresh Kumar wrote:
>> Compiling the DT file with W=1, DTC warns like follows:
>>
>> Warning (unit_address_vs_reg): Node /opp_table0/opp@10 has a
>> unit name, but no reg property
>>
>> Fix this by replacing '@' with '-' as the OPP nodes will never have a
>> "reg" property.
>>
>> Reported-by: Krzysztof Kozlowski 
>> Reported-by: Masahiro Yamada 
>> Suggested-by: Mark Rutland 
>> Signed-off-by: Viresh Kumar 
>> Acked-by: Maxime Ripard  (sunxi)
>> Reviewed-by: Chanwoo Choi 
>> Reviewed-by: Krzysztof Kozlowski 
>
> OK, so any ACKs from the DT side?  Rob?
>
> Thanks,
> Rafael



I see Rob's Acked-by.

https://lkml.org/lkml/2017/4/13/648


-- 
Best Regards
Masahiro Yamada


Re: [PATCH V2] PM / OPP: Use - instead of @ for DT entries

2017-04-16 Thread Masahiro Yamada
2017-04-15 7:47 GMT+09:00 Rafael J. Wysocki :
> On Monday, April 10, 2017 02:51:35 PM Viresh Kumar wrote:
>> Compiling the DT file with W=1, DTC warns like follows:
>>
>> Warning (unit_address_vs_reg): Node /opp_table0/opp@10 has a
>> unit name, but no reg property
>>
>> Fix this by replacing '@' with '-' as the OPP nodes will never have a
>> "reg" property.
>>
>> Reported-by: Krzysztof Kozlowski 
>> Reported-by: Masahiro Yamada 
>> Suggested-by: Mark Rutland 
>> Signed-off-by: Viresh Kumar 
>> Acked-by: Maxime Ripard  (sunxi)
>> Reviewed-by: Chanwoo Choi 
>> Reviewed-by: Krzysztof Kozlowski 
>
> OK, so any ACKs from the DT side?  Rob?
>
> Thanks,
> Rafael



I see Rob's Acked-by.

https://lkml.org/lkml/2017/4/13/648


-- 
Best Regards
Masahiro Yamada


Re: [PATCH] slab: avoid IPIs when creating kmem caches

2017-04-16 Thread Joonsoo Kim
On Sun, Apr 16, 2017 at 02:45:44PM -0700, Greg Thelen wrote:
> Each slab kmem cache has per cpu array caches.  The array caches are
> created when the kmem_cache is created, either via kmem_cache_create()
> or lazily when the first object is allocated in context of a kmem
> enabled memcg.  Array caches are replaced by writing to /proc/slabinfo.
> 
> Array caches are protected by holding slab_mutex or disabling
> interrupts.  Array cache allocation and replacement is done by
> __do_tune_cpucache() which holds slab_mutex and calls
> kick_all_cpus_sync() to interrupt all remote processors which confirms
> there are no references to the old array caches.
> 
> IPIs are needed when replacing array caches.  But when creating a new
> array cache, there's no need to send IPIs because there cannot be any
> references to the new cache.  Outside of memcg kmem accounting these
> IPIs occur at boot time, so they're not a problem.  But with memcg kmem
> accounting each container can create kmem caches, so the IPIs are
> wasteful.
> 
> Avoid unnecessary IPIs when creating array caches.
> 
> Test which reports the IPI count of allocating slab in 1 memcg:
>   import os
> 
>   def ipi_count():
>   with open("/proc/interrupts") as f:
>   for l in f:
>   if 'Function call interrupts' in l:
>   return int(l.split()[1])
> 
>   def echo(val, path):
>   with open(path, "w") as f:
>   f.write(val)
> 
>   n = 1
>   os.chdir("/mnt/cgroup/memory")
>   pid = str(os.getpid())
>   a = ipi_count()
>   for i in range(n):
>   os.mkdir(str(i))
>   echo("1G\n", "%d/memory.limit_in_bytes" % i)
>   echo("1G\n", "%d/memory.kmem.limit_in_bytes" % i)
>   echo(pid, "%d/cgroup.procs" % i)
>   open("/tmp/x", "w").close()
>   os.unlink("/tmp/x")
>   b = ipi_count()
>   print "%d loops: %d => %d (+%d ipis)" % (n, a, b, b-a)
>   echo(pid, "cgroup.procs")
>   for i in range(n):
>   os.rmdir(str(i))
> 
> patched:   1 loops: 1069 => 1170 (+101 ipis)
> unpatched: 1 loops: 1192 => 48933 (+47741 ipis)
> 
> Signed-off-by: Greg Thelen 

Acked-by: Joonsoo Kim 



Re: [PATCH] slab: avoid IPIs when creating kmem caches

2017-04-16 Thread Joonsoo Kim
On Sun, Apr 16, 2017 at 02:45:44PM -0700, Greg Thelen wrote:
> Each slab kmem cache has per cpu array caches.  The array caches are
> created when the kmem_cache is created, either via kmem_cache_create()
> or lazily when the first object is allocated in context of a kmem
> enabled memcg.  Array caches are replaced by writing to /proc/slabinfo.
> 
> Array caches are protected by holding slab_mutex or disabling
> interrupts.  Array cache allocation and replacement is done by
> __do_tune_cpucache() which holds slab_mutex and calls
> kick_all_cpus_sync() to interrupt all remote processors which confirms
> there are no references to the old array caches.
> 
> IPIs are needed when replacing array caches.  But when creating a new
> array cache, there's no need to send IPIs because there cannot be any
> references to the new cache.  Outside of memcg kmem accounting these
> IPIs occur at boot time, so they're not a problem.  But with memcg kmem
> accounting each container can create kmem caches, so the IPIs are
> wasteful.
> 
> Avoid unnecessary IPIs when creating array caches.
> 
> Test which reports the IPI count of allocating slab in 1 memcg:
>   import os
> 
>   def ipi_count():
>   with open("/proc/interrupts") as f:
>   for l in f:
>   if 'Function call interrupts' in l:
>   return int(l.split()[1])
> 
>   def echo(val, path):
>   with open(path, "w") as f:
>   f.write(val)
> 
>   n = 1
>   os.chdir("/mnt/cgroup/memory")
>   pid = str(os.getpid())
>   a = ipi_count()
>   for i in range(n):
>   os.mkdir(str(i))
>   echo("1G\n", "%d/memory.limit_in_bytes" % i)
>   echo("1G\n", "%d/memory.kmem.limit_in_bytes" % i)
>   echo(pid, "%d/cgroup.procs" % i)
>   open("/tmp/x", "w").close()
>   os.unlink("/tmp/x")
>   b = ipi_count()
>   print "%d loops: %d => %d (+%d ipis)" % (n, a, b, b-a)
>   echo(pid, "cgroup.procs")
>   for i in range(n):
>   os.rmdir(str(i))
> 
> patched:   1 loops: 1069 => 1170 (+101 ipis)
> unpatched: 1 loops: 1192 => 48933 (+47741 ipis)
> 
> Signed-off-by: Greg Thelen 

Acked-by: Joonsoo Kim 



Re: [PATCH v7 0/7] Introduce ZONE_CMA

2017-04-16 Thread Joonsoo Kim
On Thu, Apr 13, 2017 at 01:56:15PM +0200, Michal Hocko wrote:
> On Wed 12-04-17 10:35:06, Joonsoo Kim wrote:
> > On Tue, Apr 11, 2017 at 08:15:20PM +0200, Michal Hocko wrote:
> > > Hi,
> > > I didn't get to read though patches yet but the cover letter didn't
> > > really help me to understand the basic concepts to have a good starting
> > > point before diving into implementation details. It contains a lot of
> > > history remarks which is not bad but IMHO too excessive here. I would
> > > appreciate the following information (some of that is already provided
> > > in the cover but could benefit from some rewording/text reorganization).
> > > 
> > > - what is ZONE_CMA and how it is configured (from admin POV)
> > > - how does ZONE_CMA compare to other zones
> > > - who is allowed to allocate from this zone and what are the
> > >   guarantees/requirements for successful allocation
> > > - how does the zone compare to a preallocate allocation pool
> > > - how is ZONE_CMA balanced/reclaimed due to internal memory pressure
> > >   (from CMA users)
> > > - is this zone reclaimable for the global memory reclaim
> > > - why this was/is controversial
> > 
> > Hello,
> > 
> > I hope that following summary helps you to understand this patchset.
> > I skip some basic things about CMA. I will attach this description to
> > the cover-letter if re-spin is needed.
> 
> I believe that sorting out these questions is more important than what
> you have in the current cover letter. Andrew tends to fold the cover
> into the first patch so I think you should update.

Okay.
 
> > 2. How does ZONE_CMA compare to other zones
> > 
> > ZONE_CMA is conceptually the same with ZONE_MOVABLE. There is a software
> > constraint to guarantee the success of future allocation request from
> > the device. If the device requests the specific range of the memory in CMA
> > area at the runtime, page that allocated by MM will be migrated to
> > the other page and it will be returned to the device. To guarantee it,
> > ZONE_CMA only takes the allocation request with GFP_MOVABLE.
> 
> The immediate follow up question is. Why cannot we reuse ZONE_MOVABLE
> for that purpose?

I can make CMA reuses the ZONE_MOVABLE but I don't want it. Reasons
are that

1. If ZONE_MOVABLE has two different types of memory, hotpluggable and
CMA, it may need special handling for each type. This would lead to a new
migratetype again (to distinguish them) and easy to be error-prone. I
don't want that case.

2. CMA users want to see usage stat separately since CMA often causes
the problems and separate stat would help to debug it.

> > The other important point about ZONE_CMA is that span of ZONE_CMA would be
> > overlapped with the other zone. This is not new to MM subsystem and
> > MM subsystem has enough logic to handle such situation
> > so there would be no problem.
> 
> I am not really sure this is actually true. Zones are disjoint from the
> early beginning. I remember that we had something like numa nodes
> interleaving but that is such a rare configuration that I wouldn't be
> surprised if it wasn't very well tested and actually broken in some
> subtle ways.

I agree with your concern however if something is broken for them, it
just shows that we need to fix it. MM should handle this situation
since we already know that such architecture exists.

> 
> There are many page_zone(page) != zone checks sprinkled in the code but
> I do not see anything consistent there. Similarly pageblock_pfn_to_page
> is only used by compaction but there are other pfn walkers which do
> ad-hoc checking. I was staring into that code these days due to my
> hotplug patches.
>
> That being said, I think that interleaving zones are an interesting
> concept but I would be rather nervous to consider this as working
> currently without a deeper review.

I have tried to audit all the pfn walkers before and have added above
mentioned check. Perhaps, I missed something however I believe not
that much. Our production already have used ZONE_CMA and I haven't get
the report about such problem.

> 
> > Other things are completely the same with other zones. For MM POV, there is
> > no difference in allocation process except that it only takes
> > GFP_MOVABLE request. In reclaim, pages that are allocated by MM will
> > be reclaimed by the same policy of the MM. So, no difference.
> 
> OK, so essentially this is yet another "highmem" zone. We already know
> that only GFP_MOVABLE are allowed to fallback to ZONE_CMA but do CMA
> allocations fallback to other zones and punch new holes? In which zone
> order?

Hmm... I don't understand your question. Could you elaborate it more?

> > This 'no difference' is a strong point of this approach. ZONE_CMA is
> > naturally handled by MM subsystem unlike as before (special handling is
> > required for MIGRATE_CMA).
> > 
> > 3. Controversial Point
> > 
> > Major concern from Mel is that zone concept is abused. ZONE is originally
> > introduced to solve some issues 

Re: [PATCH v7 0/7] Introduce ZONE_CMA

2017-04-16 Thread Joonsoo Kim
On Thu, Apr 13, 2017 at 01:56:15PM +0200, Michal Hocko wrote:
> On Wed 12-04-17 10:35:06, Joonsoo Kim wrote:
> > On Tue, Apr 11, 2017 at 08:15:20PM +0200, Michal Hocko wrote:
> > > Hi,
> > > I didn't get to read though patches yet but the cover letter didn't
> > > really help me to understand the basic concepts to have a good starting
> > > point before diving into implementation details. It contains a lot of
> > > history remarks which is not bad but IMHO too excessive here. I would
> > > appreciate the following information (some of that is already provided
> > > in the cover but could benefit from some rewording/text reorganization).
> > > 
> > > - what is ZONE_CMA and how it is configured (from admin POV)
> > > - how does ZONE_CMA compare to other zones
> > > - who is allowed to allocate from this zone and what are the
> > >   guarantees/requirements for successful allocation
> > > - how does the zone compare to a preallocate allocation pool
> > > - how is ZONE_CMA balanced/reclaimed due to internal memory pressure
> > >   (from CMA users)
> > > - is this zone reclaimable for the global memory reclaim
> > > - why this was/is controversial
> > 
> > Hello,
> > 
> > I hope that following summary helps you to understand this patchset.
> > I skip some basic things about CMA. I will attach this description to
> > the cover-letter if re-spin is needed.
> 
> I believe that sorting out these questions is more important than what
> you have in the current cover letter. Andrew tends to fold the cover
> into the first patch so I think you should update.

Okay.
 
> > 2. How does ZONE_CMA compare to other zones
> > 
> > ZONE_CMA is conceptually the same with ZONE_MOVABLE. There is a software
> > constraint to guarantee the success of future allocation request from
> > the device. If the device requests the specific range of the memory in CMA
> > area at the runtime, page that allocated by MM will be migrated to
> > the other page and it will be returned to the device. To guarantee it,
> > ZONE_CMA only takes the allocation request with GFP_MOVABLE.
> 
> The immediate follow up question is. Why cannot we reuse ZONE_MOVABLE
> for that purpose?

I can make CMA reuses the ZONE_MOVABLE but I don't want it. Reasons
are that

1. If ZONE_MOVABLE has two different types of memory, hotpluggable and
CMA, it may need special handling for each type. This would lead to a new
migratetype again (to distinguish them) and easy to be error-prone. I
don't want that case.

2. CMA users want to see usage stat separately since CMA often causes
the problems and separate stat would help to debug it.

> > The other important point about ZONE_CMA is that span of ZONE_CMA would be
> > overlapped with the other zone. This is not new to MM subsystem and
> > MM subsystem has enough logic to handle such situation
> > so there would be no problem.
> 
> I am not really sure this is actually true. Zones are disjoint from the
> early beginning. I remember that we had something like numa nodes
> interleaving but that is such a rare configuration that I wouldn't be
> surprised if it wasn't very well tested and actually broken in some
> subtle ways.

I agree with your concern however if something is broken for them, it
just shows that we need to fix it. MM should handle this situation
since we already know that such architecture exists.

> 
> There are many page_zone(page) != zone checks sprinkled in the code but
> I do not see anything consistent there. Similarly pageblock_pfn_to_page
> is only used by compaction but there are other pfn walkers which do
> ad-hoc checking. I was staring into that code these days due to my
> hotplug patches.
>
> That being said, I think that interleaving zones are an interesting
> concept but I would be rather nervous to consider this as working
> currently without a deeper review.

I have tried to audit all the pfn walkers before and have added above
mentioned check. Perhaps, I missed something however I believe not
that much. Our production already have used ZONE_CMA and I haven't get
the report about such problem.

> 
> > Other things are completely the same with other zones. For MM POV, there is
> > no difference in allocation process except that it only takes
> > GFP_MOVABLE request. In reclaim, pages that are allocated by MM will
> > be reclaimed by the same policy of the MM. So, no difference.
> 
> OK, so essentially this is yet another "highmem" zone. We already know
> that only GFP_MOVABLE are allowed to fallback to ZONE_CMA but do CMA
> allocations fallback to other zones and punch new holes? In which zone
> order?

Hmm... I don't understand your question. Could you elaborate it more?

> > This 'no difference' is a strong point of this approach. ZONE_CMA is
> > naturally handled by MM subsystem unlike as before (special handling is
> > required for MIGRATE_CMA).
> > 
> > 3. Controversial Point
> > 
> > Major concern from Mel is that zone concept is abused. ZONE is originally
> > introduced to solve some issues 

Re: [kbuild-all] [tip:x86/cpu 8/12] arch/x86/kernel/cpu/intel_rdt.c:63: error: unknown field 'cache' specified in initializer

2017-04-16 Thread Fengguang Wu

On Sat, Apr 15, 2017 at 07:40:34AM +0200, Thomas Gleixner wrote:

On Sat, 15 Apr 2017, kbuild test robot wrote:


tree:   https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/cpu
head:   64e8ed3d4a6dcd6139a869a3e760e625cb0d3022
commit: 05b93417ce5b924c6652de19fdcc27439ab37c90 [8/12] x86/intel_rdt/mba: Add 
primary support for Memory Bandwidth Allocation (MBA)
config: x86_64-randconfig-s0-04150438 (attached as .config)
compiler: gcc-4.4 (Debian 4.4.7-8) 4.4.7
reproduce:
git checkout 05b93417ce5b924c6652de19fdcc27439ab37c90
# save the attached .config to linux build tree
make ARCH=x86_64


That's weird.


c1c7c3f9 Fenghua Yu  2016-10-22  57 {
c1c7c3f9 Fenghua Yu  2016-10-22  58 .name   = 
"L3",
c1c7c3f9 Fenghua Yu  2016-10-22  59 .domains
= domain_init(RDT_RESOURCE_L3),
c1c7c3f9 Fenghua Yu  2016-10-22  60 .msr_base   
= IA32_L3_CBM_BASE,
0921c547 Thomas Gleixner 2017-04-14  61 .msr_update 
= cat_wrmsr,
c1c7c3f9 Fenghua Yu  2016-10-22  62 .cache_level
= 3,
d3e11b4d Thomas Gleixner 2017-04-14 @63 .cache = {
d3e11b4d Thomas Gleixner 2017-04-14 @64 .min_cbm_bits   
= 1,
d3e11b4d Thomas Gleixner 2017-04-14 @65 .cbm_idx_mult   
= 1,
d3e11b4d Thomas Gleixner 2017-04-14  66 .cbm_idx_offset 
= 0,
d3e11b4d Thomas Gleixner 2017-04-14  67 },
c1c7c3f9 Fenghua Yu  2016-10-22  68 },

:: The code at line 63 was first introduced by commit
:: d3e11b4d6ffd363747ac6e6b5522baa9ca5a20c0 x86/intel_rdt: Move CBM 
specific data into a struct



So the compiler fails to handle the anon union, which was introduced in
05b93417ce5b924. No idea why, but that concept is not new and widely used
in the kernel already.


It looks like a problem with gcc 4.4:

   Bug 42875 - gcc disallows named initializers for anonymous unions
   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=42875
   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=10676

   gcc-4.4.3 lets you statically initialize named fields, and
   lets you assign to anonymous union members, but you cannot
   statically initialize a named member of an anonymous union.

Thanks,
Fengguang


Re: [kbuild-all] [tip:x86/cpu 8/12] arch/x86/kernel/cpu/intel_rdt.c:63: error: unknown field 'cache' specified in initializer

2017-04-16 Thread Fengguang Wu

On Sat, Apr 15, 2017 at 07:40:34AM +0200, Thomas Gleixner wrote:

On Sat, 15 Apr 2017, kbuild test robot wrote:


tree:   https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/cpu
head:   64e8ed3d4a6dcd6139a869a3e760e625cb0d3022
commit: 05b93417ce5b924c6652de19fdcc27439ab37c90 [8/12] x86/intel_rdt/mba: Add 
primary support for Memory Bandwidth Allocation (MBA)
config: x86_64-randconfig-s0-04150438 (attached as .config)
compiler: gcc-4.4 (Debian 4.4.7-8) 4.4.7
reproduce:
git checkout 05b93417ce5b924c6652de19fdcc27439ab37c90
# save the attached .config to linux build tree
make ARCH=x86_64


That's weird.


c1c7c3f9 Fenghua Yu  2016-10-22  57 {
c1c7c3f9 Fenghua Yu  2016-10-22  58 .name   = 
"L3",
c1c7c3f9 Fenghua Yu  2016-10-22  59 .domains
= domain_init(RDT_RESOURCE_L3),
c1c7c3f9 Fenghua Yu  2016-10-22  60 .msr_base   
= IA32_L3_CBM_BASE,
0921c547 Thomas Gleixner 2017-04-14  61 .msr_update 
= cat_wrmsr,
c1c7c3f9 Fenghua Yu  2016-10-22  62 .cache_level
= 3,
d3e11b4d Thomas Gleixner 2017-04-14 @63 .cache = {
d3e11b4d Thomas Gleixner 2017-04-14 @64 .min_cbm_bits   
= 1,
d3e11b4d Thomas Gleixner 2017-04-14 @65 .cbm_idx_mult   
= 1,
d3e11b4d Thomas Gleixner 2017-04-14  66 .cbm_idx_offset 
= 0,
d3e11b4d Thomas Gleixner 2017-04-14  67 },
c1c7c3f9 Fenghua Yu  2016-10-22  68 },

:: The code at line 63 was first introduced by commit
:: d3e11b4d6ffd363747ac6e6b5522baa9ca5a20c0 x86/intel_rdt: Move CBM 
specific data into a struct



So the compiler fails to handle the anon union, which was introduced in
05b93417ce5b924. No idea why, but that concept is not new and widely used
in the kernel already.


It looks like a problem with gcc 4.4:

   Bug 42875 - gcc disallows named initializers for anonymous unions
   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=42875
   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=10676

   gcc-4.4.3 lets you statically initialize named fields, and
   lets you assign to anonymous union members, but you cannot
   statically initialize a named member of an anonymous union.

Thanks,
Fengguang


Re: [PATCH 1/3] zram: fix operator precedence to get offset

2017-04-16 Thread Sergey Senozhatsky
On (04/17/17 10:21), Sergey Senozhatsky wrote:
> > However, it should be *fixed* to prevent confusion in future

or may be something like below? can save us some cycles.

remove this calculation

-   offset = sector & (SECTORS_PER_PAGE - 1) << SECTOR_SHIFT;


and pass 0 to zram_bvec_rw()

-   err = zram_bvec_rw(zram, , index, offset, is_write);
+   err = zram_bvec_rw(zram, , index, 0, is_write);


-ss


Re: [PATCH 1/3] zram: fix operator precedence to get offset

2017-04-16 Thread Sergey Senozhatsky
On (04/17/17 10:21), Sergey Senozhatsky wrote:
> > However, it should be *fixed* to prevent confusion in future

or may be something like below? can save us some cycles.

remove this calculation

-   offset = sector & (SECTORS_PER_PAGE - 1) << SECTOR_SHIFT;


and pass 0 to zram_bvec_rw()

-   err = zram_bvec_rw(zram, , index, offset, is_write);
+   err = zram_bvec_rw(zram, , index, 0, is_write);


-ss


Re: [PATCH 00/10] mac68k: Miscellaneous fixes, cleanup and modernization

2017-04-16 Thread Finn Thain

On Sun, 16 Apr 2017, Geert Uytterhoeven wrote:

> Hi Finn,
> 
> On Sun, Apr 9, 2017 at 1:51 AM, Finn Thain  
> wrote:
> > This series has various patches from several different people. Two 
> > printk modernization patches were originally from Geert Uytterhoeven 
> > and three Nubus patches were originally committed to the Linux/mac68k 
> > CVS by David Huggins-Daines.
> 
> Thanks, most of them look sane enough to apply and still queue for 
> v4.12.
> 
> I'm a bit reluctant about the nubus changes (patches 6 and 8), though. 
> Do you think they need more testing?
> 
> Thanks!

Patch 6 is partly dead code removal. In principle, this patch is a 
reversion to the old code (pre-v2.3.17). The old code was thoroughly 
tested in Debian Sarge. I suppose a reviewer might wonder whether we want 
to keep new code for probing fake slot resources in Apple's on-board ROMs. 
That would be useful if it could eliminate the macintosh_config struct. 
But it can't, and we don't want both mechanisms. Hence the reversion in 
the mac68k CVS.

Patch 8 changes the pointer validation code and although this has been 
tested on the valid path, you are right that it could use some negative 
testing. But that would seem to require cards with flawed ROMs. I don't 
know of any of such cards. So I think that all we can do is more review.

Maybe Michael or Laurent would be willing to review these two patches?

-- 

> 
> > Finn Thain (10):
> >   m68k/mac: IOP - Modernize printing of kernel messages
> >   m68k/mac: Modernize printing of kernel messages
> >   m68k/mac: Adopt platform_device_register_simple()
> >   m68k/mac: Clarify IOP message alloc/free confusion
> >   nubus: Fix nubus_rewinddir (from mac68k CVS)
> >   nubus: Remove slot zero probe (from mac68k CVS)
> >   nubus: Clean up printk calls (from mac68k CVS)
> >   nubus: Fix pointer validation
> >   nubus: Clean up whitespace
> >   nubus: Add MVC and VSC video card definitions
> 
> Gr{oetje,eeting}s,
> 
> Geert
> 


Re: [PATCH 00/10] mac68k: Miscellaneous fixes, cleanup and modernization

2017-04-16 Thread Finn Thain

On Sun, 16 Apr 2017, Geert Uytterhoeven wrote:

> Hi Finn,
> 
> On Sun, Apr 9, 2017 at 1:51 AM, Finn Thain  
> wrote:
> > This series has various patches from several different people. Two 
> > printk modernization patches were originally from Geert Uytterhoeven 
> > and three Nubus patches were originally committed to the Linux/mac68k 
> > CVS by David Huggins-Daines.
> 
> Thanks, most of them look sane enough to apply and still queue for 
> v4.12.
> 
> I'm a bit reluctant about the nubus changes (patches 6 and 8), though. 
> Do you think they need more testing?
> 
> Thanks!

Patch 6 is partly dead code removal. In principle, this patch is a 
reversion to the old code (pre-v2.3.17). The old code was thoroughly 
tested in Debian Sarge. I suppose a reviewer might wonder whether we want 
to keep new code for probing fake slot resources in Apple's on-board ROMs. 
That would be useful if it could eliminate the macintosh_config struct. 
But it can't, and we don't want both mechanisms. Hence the reversion in 
the mac68k CVS.

Patch 8 changes the pointer validation code and although this has been 
tested on the valid path, you are right that it could use some negative 
testing. But that would seem to require cards with flawed ROMs. I don't 
know of any of such cards. So I think that all we can do is more review.

Maybe Michael or Laurent would be willing to review these two patches?

-- 

> 
> > Finn Thain (10):
> >   m68k/mac: IOP - Modernize printing of kernel messages
> >   m68k/mac: Modernize printing of kernel messages
> >   m68k/mac: Adopt platform_device_register_simple()
> >   m68k/mac: Clarify IOP message alloc/free confusion
> >   nubus: Fix nubus_rewinddir (from mac68k CVS)
> >   nubus: Remove slot zero probe (from mac68k CVS)
> >   nubus: Clean up printk calls (from mac68k CVS)
> >   nubus: Fix pointer validation
> >   nubus: Clean up whitespace
> >   nubus: Add MVC and VSC video card definitions
> 
> Gr{oetje,eeting}s,
> 
> Geert
> 


Re: [PATCH 2/3] zram: do not use copy_page with non-page alinged address

2017-04-16 Thread Sergey Senozhatsky
On (04/13/17 09:17), Minchan Kim wrote:
> The copy_page is optimized memcpy for page-alinged address.
> If it is used with non-page aligned address, it can corrupt memory which
> means system corruption. With zram, it can happen with
> 
> 1. 64K architecture
> 2. partial IO
> 3. slub debug
> 
> Partial IO need to allocate a page and zram allocates it via kmalloc.
> With slub debug, kmalloc(PAGE_SIZE) doesn't return page-size aligned
> address. And finally, copy_page(mem, cmem) corrupts memory.
> 
> So, this patch changes it to memcpy.
> 
> Acutaully, we don't need to change zram_bvec_write part because zsmalloc
> returns page-aligned address in case of PAGE_SIZE class but it's not
> good to rely on the internal of zsmalloc.
> 
> Note:
> When this patch is merged to stable, clear_page should be fixed, too.
> Unfortunately, recent zram removes it by "same page merge" feature
> so it's hard to backport this patch to -stable tree.
> 
> I will handle it when I receive the mail from stable tree maintainer
> to merge this patch to backport.
> 
> Fixes: 42e99bd ("zram: optimize memory operations with 
> clear_page()/copy_page()")
> Cc: sta...@vger.kernel.org
> Signed-off-by: Minchan Kim 

Reviewed-by: Sergey Senozhatsky 

-ss


Re: [PATCH 2/3] zram: do not use copy_page with non-page alinged address

2017-04-16 Thread Sergey Senozhatsky
On (04/13/17 09:17), Minchan Kim wrote:
> The copy_page is optimized memcpy for page-alinged address.
> If it is used with non-page aligned address, it can corrupt memory which
> means system corruption. With zram, it can happen with
> 
> 1. 64K architecture
> 2. partial IO
> 3. slub debug
> 
> Partial IO need to allocate a page and zram allocates it via kmalloc.
> With slub debug, kmalloc(PAGE_SIZE) doesn't return page-size aligned
> address. And finally, copy_page(mem, cmem) corrupts memory.
> 
> So, this patch changes it to memcpy.
> 
> Acutaully, we don't need to change zram_bvec_write part because zsmalloc
> returns page-aligned address in case of PAGE_SIZE class but it's not
> good to rely on the internal of zsmalloc.
> 
> Note:
> When this patch is merged to stable, clear_page should be fixed, too.
> Unfortunately, recent zram removes it by "same page merge" feature
> so it's hard to backport this patch to -stable tree.
> 
> I will handle it when I receive the mail from stable tree maintainer
> to merge this patch to backport.
> 
> Fixes: 42e99bd ("zram: optimize memory operations with 
> clear_page()/copy_page()")
> Cc: sta...@vger.kernel.org
> Signed-off-by: Minchan Kim 

Reviewed-by: Sergey Senozhatsky 

-ss


copy_page() on a kmalloc-ed page with DEBUG_SLAB enabled (was "zram: do not use copy_page with non-page alinged address")

2017-04-16 Thread Sergey Senozhatsky
Hello,

I'll fork it into a separate thread and Cc more MM people.
sorry for top-posting.

Minchan reported that doing copy_page() on a kmalloc(PAGE_SIZE) page
with DEBUG_SLAB enabled can cause a memory corruption (See below or
lkml.kernel.org/r/1492042622-12074-2-git-send-email-minc...@kernel.org )

that's an interesting problem. arm64 copy_page(), for instance, wants src
and dst to be page aligned, which is reasonable, while generic copy_page(),
on the contrary, simply does memcpy(). there are, probably, other callpaths
that do copy_page() on kmalloc-ed pages and I'm wondering if there is some
sort of a generic fix to the problem.

> > On (04/13/17 09:17), Minchan Kim wrote:
> > > The copy_page is optimized memcpy for page-alinged address.
> > > If it is used with non-page aligned address, it can corrupt memory which
> > > means system corruption. With zram, it can happen with
> > > 
> > > 1. 64K architecture
> > > 2. partial IO
> > > 3. slub debug
> > > 
> > > Partial IO need to allocate a page and zram allocates it via kmalloc.
> > > With slub debug, kmalloc(PAGE_SIZE) doesn't return page-size aligned
> > > address. And finally, copy_page(mem, cmem) corrupts memory.
> > 
> > which would be the case for many other copy_page() calls in the kernel.
> > right? if so - should the fix be in copy_page() then?
> 
> I thought about it but was not sure it's good idea by several reasons
> (but don't want to discuss it in this thread).
> 
> Anyway, it's stable stuff so I don't want to make the patch bloat.
> If you believe it is right direction and valuable, you could be
> a volunteer. :)

-ss


copy_page() on a kmalloc-ed page with DEBUG_SLAB enabled (was "zram: do not use copy_page with non-page alinged address")

2017-04-16 Thread Sergey Senozhatsky
Hello,

I'll fork it into a separate thread and Cc more MM people.
sorry for top-posting.

Minchan reported that doing copy_page() on a kmalloc(PAGE_SIZE) page
with DEBUG_SLAB enabled can cause a memory corruption (See below or
lkml.kernel.org/r/1492042622-12074-2-git-send-email-minc...@kernel.org )

that's an interesting problem. arm64 copy_page(), for instance, wants src
and dst to be page aligned, which is reasonable, while generic copy_page(),
on the contrary, simply does memcpy(). there are, probably, other callpaths
that do copy_page() on kmalloc-ed pages and I'm wondering if there is some
sort of a generic fix to the problem.

> > On (04/13/17 09:17), Minchan Kim wrote:
> > > The copy_page is optimized memcpy for page-alinged address.
> > > If it is used with non-page aligned address, it can corrupt memory which
> > > means system corruption. With zram, it can happen with
> > > 
> > > 1. 64K architecture
> > > 2. partial IO
> > > 3. slub debug
> > > 
> > > Partial IO need to allocate a page and zram allocates it via kmalloc.
> > > With slub debug, kmalloc(PAGE_SIZE) doesn't return page-size aligned
> > > address. And finally, copy_page(mem, cmem) corrupts memory.
> > 
> > which would be the case for many other copy_page() calls in the kernel.
> > right? if so - should the fix be in copy_page() then?
> 
> I thought about it but was not sure it's good idea by several reasons
> (but don't want to discuss it in this thread).
> 
> Anyway, it's stable stuff so I don't want to make the patch bloat.
> If you believe it is right direction and valuable, you could be
> a volunteer. :)

-ss


[PATCH v4 2/2] vfio/type1: Prune vfio_pin_page_external()

2017-04-16 Thread Alex Williamson
With vfio_lock_acct() testing the locked memory limit under mmap_sem,
it's redundant to do it here for a single page.  We can also reorder
our tests such that we can avoid testing for reserved pages if we're
not doing accounting, and test the process CAP_IPC_LOCK only if we
are doing accounting.  Finally, this function oddly returns 1 on
success.  Update to return zero on success, -errno on error.  Since
the function only pins a single page, there's no need to return the
number of pages pinned.

N.B. vfio_pin_pages_remote() can pin a large contiguous range of pages
before calling vfio_lock_acct().  If we were to similarly remove the
extra test there, a user could temporarily pin far more pages than
they're allowed.

Suggested-by: Kirti Wankhede 
Suggested-by: Peter Xu 
Signed-off-by: Alex Williamson 
---
 drivers/vfio/vfio_iommu_type1.c |   34 +-
 1 file changed, 5 insertions(+), 29 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index fb18e4a5df62..07e0e58f22e9 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -479,43 +479,21 @@ static long vfio_unpin_pages_remote(struct vfio_dma *dma, 
dma_addr_t iova,
 static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
  unsigned long *pfn_base, bool do_accounting)
 {
-   unsigned long limit;
-   bool lock_cap = has_capability(dma->task, CAP_IPC_LOCK);
struct mm_struct *mm;
int ret;
-   bool rsvd;
 
mm = get_task_mm(dma->task);
if (!mm)
return -ENODEV;
 
ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base);
-   if (ret)
-   goto pin_page_exit;
-
-   rsvd = is_invalid_reserved_pfn(*pfn_base);
-   limit = task_rlimit(dma->task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
-   if (!rsvd && !lock_cap && mm->locked_vm + 1 > limit) {
-   put_pfn(*pfn_base, dma->prot);
-   pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK (%ld) exceeded\n",
-   __func__, dma->task->comm, task_pid_nr(dma->task),
-   limit << PAGE_SHIFT);
-   ret = -ENOMEM;
-   goto pin_page_exit;
-   }
-
-   if (!rsvd && do_accounting) {
-   ret = vfio_lock_acct(dma->task, 1, lock_cap);
-   if (ret) {
+   if (!ret && do_accounting && !is_invalid_reserved_pfn(*pfn_base)) {
+   ret = vfio_lock_acct(dma->task, 1,
+has_capability(dma->task, CAP_IPC_LOCK));
+   if (ret)
put_pfn(*pfn_base, dma->prot);
-   goto pin_page_exit;
-   }
}
 
-   ret = 1;
-
-pin_page_exit:
mmput(mm);
return ret;
 }
@@ -595,10 +573,8 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
remote_vaddr = dma->vaddr + iova - dma->iova;
ret = vfio_pin_page_external(dma, remote_vaddr, _pfn[i],
 do_accounting);
-   if (ret <= 0) {
-   WARN_ON(!ret);
+   if (ret)
goto pin_unwind;
-   }
 
ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
if (ret) {



[PATCH v4 2/2] vfio/type1: Prune vfio_pin_page_external()

2017-04-16 Thread Alex Williamson
With vfio_lock_acct() testing the locked memory limit under mmap_sem,
it's redundant to do it here for a single page.  We can also reorder
our tests such that we can avoid testing for reserved pages if we're
not doing accounting, and test the process CAP_IPC_LOCK only if we
are doing accounting.  Finally, this function oddly returns 1 on
success.  Update to return zero on success, -errno on error.  Since
the function only pins a single page, there's no need to return the
number of pages pinned.

N.B. vfio_pin_pages_remote() can pin a large contiguous range of pages
before calling vfio_lock_acct().  If we were to similarly remove the
extra test there, a user could temporarily pin far more pages than
they're allowed.

Suggested-by: Kirti Wankhede 
Suggested-by: Peter Xu 
Signed-off-by: Alex Williamson 
---
 drivers/vfio/vfio_iommu_type1.c |   34 +-
 1 file changed, 5 insertions(+), 29 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index fb18e4a5df62..07e0e58f22e9 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -479,43 +479,21 @@ static long vfio_unpin_pages_remote(struct vfio_dma *dma, 
dma_addr_t iova,
 static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
  unsigned long *pfn_base, bool do_accounting)
 {
-   unsigned long limit;
-   bool lock_cap = has_capability(dma->task, CAP_IPC_LOCK);
struct mm_struct *mm;
int ret;
-   bool rsvd;
 
mm = get_task_mm(dma->task);
if (!mm)
return -ENODEV;
 
ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base);
-   if (ret)
-   goto pin_page_exit;
-
-   rsvd = is_invalid_reserved_pfn(*pfn_base);
-   limit = task_rlimit(dma->task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
-   if (!rsvd && !lock_cap && mm->locked_vm + 1 > limit) {
-   put_pfn(*pfn_base, dma->prot);
-   pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK (%ld) exceeded\n",
-   __func__, dma->task->comm, task_pid_nr(dma->task),
-   limit << PAGE_SHIFT);
-   ret = -ENOMEM;
-   goto pin_page_exit;
-   }
-
-   if (!rsvd && do_accounting) {
-   ret = vfio_lock_acct(dma->task, 1, lock_cap);
-   if (ret) {
+   if (!ret && do_accounting && !is_invalid_reserved_pfn(*pfn_base)) {
+   ret = vfio_lock_acct(dma->task, 1,
+has_capability(dma->task, CAP_IPC_LOCK));
+   if (ret)
put_pfn(*pfn_base, dma->prot);
-   goto pin_page_exit;
-   }
}
 
-   ret = 1;
-
-pin_page_exit:
mmput(mm);
return ret;
 }
@@ -595,10 +573,8 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
remote_vaddr = dma->vaddr + iova - dma->iova;
ret = vfio_pin_page_external(dma, remote_vaddr, _pfn[i],
 do_accounting);
-   if (ret <= 0) {
-   WARN_ON(!ret);
+   if (ret)
goto pin_unwind;
-   }
 
ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
if (ret) {



[PATCH v4 1/2] vfio/type1: Remove locked page accounting workqueue

2017-04-16 Thread Alex Williamson
If the mmap_sem is contented then the vfio type1 IOMMU backend will
defer locked page accounting updates to a workqueue task.  This has a
few problems and depending on which side the user tries to play, they
might be over-penalized for unmaps that haven't yet been accounted or
race the workqueue to enter more mappings than they're allowed.  The
original intent of this workqueue mechanism seems to be focused on
reducing latency through the ioctl, but we cannot do so at the cost
of correctness.  Remove this workqueue mechanism and update the
callers to allow for failure.  We can also now recheck the limit under
write lock to make sure we don't exceed it.

vfio_pin_pages_remote() also now necessarily includes an unwind path
which we can jump to directly if the consecutive page pinning finds
that we're exceeding the user's memory limits.  This avoids the
current lazy approach which does accounting and mapping up to the
fault, only to return an error on the next iteration to unwind the
entire vfio_dma.

Cc: sta...@vger.kernel.org
Signed-off-by: Alex Williamson 
---
 drivers/vfio/vfio_iommu_type1.c |  107 +--
 1 file changed, 48 insertions(+), 59 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 32d2633092a3..fb18e4a5df62 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -246,69 +246,43 @@ static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, 
struct vfio_pfn *vpfn)
return ret;
 }
 
-struct vwork {
-   struct mm_struct*mm;
-   longnpage;
-   struct work_struct  work;
-};
-
-/* delayed decrement/increment for locked_vm */
-static void vfio_lock_acct_bg(struct work_struct *work)
-{
-   struct vwork *vwork = container_of(work, struct vwork, work);
-   struct mm_struct *mm;
-
-   mm = vwork->mm;
-   down_write(>mmap_sem);
-   mm->locked_vm += vwork->npage;
-   up_write(>mmap_sem);
-   mmput(mm);
-   kfree(vwork);
-}
-
-static void vfio_lock_acct(struct task_struct *task, long npage)
+static int vfio_lock_acct(struct task_struct *task, long npage, bool lock_cap)
 {
-   struct vwork *vwork;
struct mm_struct *mm;
bool is_current;
+   int ret;
 
if (!npage)
-   return;
+   return 0;
 
is_current = (task->mm == current->mm);
 
mm = is_current ? task->mm : get_task_mm(task);
if (!mm)
-   return; /* process exited */
+   return -ESRCH; /* process exited */
 
-   if (down_write_trylock(>mmap_sem)) {
-   mm->locked_vm += npage;
-   up_write(>mmap_sem);
-   if (!is_current)
-   mmput(mm);
-   return;
-   }
+   ret = down_write_killable(>mmap_sem);
+   if (!ret) {
+   if (npage < 0 || lock_cap) {
+   mm->locked_vm += npage;
+   } else {
+   unsigned long limit;
 
-   if (is_current) {
-   mm = get_task_mm(task);
-   if (!mm)
-   return;
+   limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+   if (mm->locked_vm + npage <= limit)
+   mm->locked_vm += npage;
+   else
+   ret = -ENOMEM;
+   }
+
+   up_write(>mmap_sem);
}
 
-   /*
-* Couldn't get mmap_sem lock, so must setup to update
-* mm->locked_vm later. If locked_vm were atomic, we
-* wouldn't need this silliness
-*/
-   vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
-   if (WARN_ON(!vwork)) {
+   if (!is_current)
mmput(mm);
-   return;
-   }
-   INIT_WORK(>work, vfio_lock_acct_bg);
-   vwork->mm = mm;
-   vwork->npage = npage;
-   schedule_work(>work);
+
+   return ret;
 }
 
 /*
@@ -405,7 +379,7 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned 
long vaddr,
 static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
  long npage, unsigned long *pfn_base)
 {
-   unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+   unsigned long pfn = 0, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
bool lock_cap = capable(CAP_IPC_LOCK);
long ret, pinned = 0, lock_acct = 0;
bool rsvd;
@@ -442,8 +416,6 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, 
unsigned long vaddr,
/* Lock all the consecutive pages from pfn_base */
for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage;
 pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) {
-   unsigned long pfn = 0;
-
ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, );
if (ret)
   

[PATCH v4 1/2] vfio/type1: Remove locked page accounting workqueue

2017-04-16 Thread Alex Williamson
If the mmap_sem is contented then the vfio type1 IOMMU backend will
defer locked page accounting updates to a workqueue task.  This has a
few problems and depending on which side the user tries to play, they
might be over-penalized for unmaps that haven't yet been accounted or
race the workqueue to enter more mappings than they're allowed.  The
original intent of this workqueue mechanism seems to be focused on
reducing latency through the ioctl, but we cannot do so at the cost
of correctness.  Remove this workqueue mechanism and update the
callers to allow for failure.  We can also now recheck the limit under
write lock to make sure we don't exceed it.

vfio_pin_pages_remote() also now necessarily includes an unwind path
which we can jump to directly if the consecutive page pinning finds
that we're exceeding the user's memory limits.  This avoids the
current lazy approach which does accounting and mapping up to the
fault, only to return an error on the next iteration to unwind the
entire vfio_dma.

Cc: sta...@vger.kernel.org
Signed-off-by: Alex Williamson 
---
 drivers/vfio/vfio_iommu_type1.c |  107 +--
 1 file changed, 48 insertions(+), 59 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 32d2633092a3..fb18e4a5df62 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -246,69 +246,43 @@ static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, 
struct vfio_pfn *vpfn)
return ret;
 }
 
-struct vwork {
-   struct mm_struct*mm;
-   longnpage;
-   struct work_struct  work;
-};
-
-/* delayed decrement/increment for locked_vm */
-static void vfio_lock_acct_bg(struct work_struct *work)
-{
-   struct vwork *vwork = container_of(work, struct vwork, work);
-   struct mm_struct *mm;
-
-   mm = vwork->mm;
-   down_write(>mmap_sem);
-   mm->locked_vm += vwork->npage;
-   up_write(>mmap_sem);
-   mmput(mm);
-   kfree(vwork);
-}
-
-static void vfio_lock_acct(struct task_struct *task, long npage)
+static int vfio_lock_acct(struct task_struct *task, long npage, bool lock_cap)
 {
-   struct vwork *vwork;
struct mm_struct *mm;
bool is_current;
+   int ret;
 
if (!npage)
-   return;
+   return 0;
 
is_current = (task->mm == current->mm);
 
mm = is_current ? task->mm : get_task_mm(task);
if (!mm)
-   return; /* process exited */
+   return -ESRCH; /* process exited */
 
-   if (down_write_trylock(>mmap_sem)) {
-   mm->locked_vm += npage;
-   up_write(>mmap_sem);
-   if (!is_current)
-   mmput(mm);
-   return;
-   }
+   ret = down_write_killable(>mmap_sem);
+   if (!ret) {
+   if (npage < 0 || lock_cap) {
+   mm->locked_vm += npage;
+   } else {
+   unsigned long limit;
 
-   if (is_current) {
-   mm = get_task_mm(task);
-   if (!mm)
-   return;
+   limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+   if (mm->locked_vm + npage <= limit)
+   mm->locked_vm += npage;
+   else
+   ret = -ENOMEM;
+   }
+
+   up_write(>mmap_sem);
}
 
-   /*
-* Couldn't get mmap_sem lock, so must setup to update
-* mm->locked_vm later. If locked_vm were atomic, we
-* wouldn't need this silliness
-*/
-   vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
-   if (WARN_ON(!vwork)) {
+   if (!is_current)
mmput(mm);
-   return;
-   }
-   INIT_WORK(>work, vfio_lock_acct_bg);
-   vwork->mm = mm;
-   vwork->npage = npage;
-   schedule_work(>work);
+
+   return ret;
 }
 
 /*
@@ -405,7 +379,7 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned 
long vaddr,
 static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
  long npage, unsigned long *pfn_base)
 {
-   unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+   unsigned long pfn = 0, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
bool lock_cap = capable(CAP_IPC_LOCK);
long ret, pinned = 0, lock_acct = 0;
bool rsvd;
@@ -442,8 +416,6 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, 
unsigned long vaddr,
/* Lock all the consecutive pages from pfn_base */
for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage;
 pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) {
-   unsigned long pfn = 0;
-
ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, );
if (ret)
break;
@@ 

[PATCH v4 0/2] vfio/type1: Synchronous locked page accounting

2017-04-16 Thread Alex Williamson
v4: vfio_lock_acct() should not fail due to RLIMIT_MEMLOCK if task
has CAP_IPC_LOCK capability.  Introduced 2nd patch to remove
redundancy from vfio_pin_page_external() and fix return value.

Please re-review.  Thanks!

Alex

---

Alex Williamson (2):
  vfio/type1: Remove locked page accounting workqueue
  vfio/type1: Prune vfio_pin_page_external()


 drivers/vfio/vfio_iommu_type1.c |  127 ++-
 1 file changed, 46 insertions(+), 81 deletions(-)


[PATCH v4 0/2] vfio/type1: Synchronous locked page accounting

2017-04-16 Thread Alex Williamson
v4: vfio_lock_acct() should not fail due to RLIMIT_MEMLOCK if task
has CAP_IPC_LOCK capability.  Introduced 2nd patch to remove
redundancy from vfio_pin_page_external() and fix return value.

Please re-review.  Thanks!

Alex

---

Alex Williamson (2):
  vfio/type1: Remove locked page accounting workqueue
  vfio/type1: Prune vfio_pin_page_external()


 drivers/vfio/vfio_iommu_type1.c |  127 ++-
 1 file changed, 46 insertions(+), 81 deletions(-)


Re: [PATCH v2 2/4] zram: implement deduplication in zram

2017-04-16 Thread Minchan Kim
Hi Joonsoo,

I reviewed this patch and overall, looks great! Thanks.

However, as you know, recently, zram had lots of clean up so
this patchset should be rebased on it massively.
Sorry for the inconvenience.

And there are some minor issues noted below. I hope you can address them in
your next submission, please.

On Thu, Mar 30, 2017 at 02:38:07PM +0900, js1...@gmail.com wrote:
> From: Joonsoo Kim 
> 
> This patch implements deduplication feature in zram. The purpose
> of this work is naturally to save amount of memory usage by zram.
> 
> Android is one of the biggest users to use zram as swap and it's
> really important to save amount of memory usage. There is a paper
> that reports that duplication ratio of Android's memory content is
> rather high [1]. And, there is a similar work on zswap that also
> reports that experiments has shown that around 10-15% of pages
> stored in zswp are duplicates and deduplicate them provides some
> benefits [2].
> 
> Also, there is a different kind of workload that uses zram as blockdev
> and store build outputs into it to reduce wear-out problem of real
> blockdev. In this workload, deduplication hit is very high due to
> temporary files and intermediate object files. Detailed analysis is
> on the bottom of this description.
> 
> Anyway, if we can detect duplicated content and avoid to store duplicated
> content at different memory space, we can save memory. This patch
> tries to do that.
> 
> Implementation is almost simple and intuitive but I should note
> one thing about implementation detail.
> 
> To check duplication, this patch uses checksum of the page and
> collision of this checksum could be possible. There would be
> many choices to handle this situation but this patch chooses
> to allow entry with duplicated checksum to be added to the hash,
> but, not to compare all entries with duplicated checksum
> when checking duplication. I guess that checksum collision is quite
> rare event and we don't need to pay any attention to such a case.
> Therefore, I decided the most simplest way to implement the feature.
> If there is a different opinion, I can accept and go that way.
> 
> Following is the result of this patch.
> 
> Test result #1 (Swap):
> Android Marshmallow, emulator, x86_64, Backporting to kernel v3.18
> 
> orig_data_size: 145297408
> compr_data_size: 32408125
> mem_used_total: 32276480
> dup_data_size: 3188134
> meta_data_size: 1444272
> 
> Last two metrics added to mm_stat are related to this work.
> First one, dup_data_size, is amount of saved memory by avoiding
> to store duplicated page. Later one, meta_data_size, is the amount of
> data structure to support deduplication. If dup > meta, we can judge
> that the patch improves memory usage.
> 
> In Adnroid, we can save 5% of memory usage by this work.
> 
> Test result #2 (Blockdev):
> build the kernel and store output to ext4 FS on zram
> 
> 
> Elapsed time: 249 s
> mm_stat: 430845952 191014886 196898816 0 196898816 28320 0 0 0
> 
> 
> Elapsed time: 250 s
> mm_stat: 430505984 190971334 148365312 0 148365312 28404 0 47287038  3945792
> 
> There is no performance degradation and save 23% memory.
> 
> Test result #3 (Blockdev):
> copy android build output dir(out/host) to ext4 FS on zram
> 
> 
> Elapsed time: out/host: 88 s
> mm_stat: 8834420736 3658184579 3834208256 0 3834208256 32889 0 0 0
> 
> 
> Elapsed time: out/host: 100 s
> mm_stat: 8832929792 3657329322 2832015360 0 2832015360 32609 0 952568877 
> 80880336
> 
> It shows performance degradation roughly 13% and save 24% memory. Maybe,
> it is due to overhead of calculating checksum and comparison.
> 
> Test result #4 (Blockdev):
> copy android build output dir(out/target/common) to ext4 FS on zram
> 
> 
> Elapsed time: out/host: 203 s
> mm_stat: 4041678848 2310355010 2346577920 0 2346582016 500 4 0 0
> 
> 
> Elapsed time: out/host: 201 s
> mm_stat: 4041666560 2310488276 1338150912 0 1338150912 476 0 989088794 
> 24564336
> 
> Memory is saved by 42% and performance is the same. Even if there is overhead
> of calculating checksum and comparison, large hit ratio compensate it since
> hit leads to less compression attempt.
> 
> I checked the detailed reason of savings on kernel build workload and
> there are some cases that deduplication happens.
> 
> 1) *.cmd
> Build command is usually similar in one directory so content of these file
> are very similar. In my system, more than 789 lines in fs/ext4/.namei.o.cmd
> and fs/ext4/.inode.o.cmd are the same in 944 and 938 lines of the file,
> respectively.
> 
> 2) intermediate object files
> built-in.o and temporary object file have the similar contents. More than
> 50% of fs/ext4/ext4.o is the same with fs/ext4/built-in.o.
> 
> 3) vmlinux
> .tmp_vmlinux1 and .tmp_vmlinux2 and arch/x86/boo/compressed/vmlinux.bin
> have the similar contents.
> 
> Android test has similar case that some of object files(.class
> and .so) are similar with another ones.
> (./host/linux-x86/lib/libartd.so and
> 

Re: [PATCH v2 2/4] zram: implement deduplication in zram

2017-04-16 Thread Minchan Kim
Hi Joonsoo,

I reviewed this patch and overall, looks great! Thanks.

However, as you know, recently, zram had lots of clean up so
this patchset should be rebased on it massively.
Sorry for the inconvenience.

And there are some minor issues noted below. I hope you can address them in
your next submission, please.

On Thu, Mar 30, 2017 at 02:38:07PM +0900, js1...@gmail.com wrote:
> From: Joonsoo Kim 
> 
> This patch implements deduplication feature in zram. The purpose
> of this work is naturally to save amount of memory usage by zram.
> 
> Android is one of the biggest users to use zram as swap and it's
> really important to save amount of memory usage. There is a paper
> that reports that duplication ratio of Android's memory content is
> rather high [1]. And, there is a similar work on zswap that also
> reports that experiments has shown that around 10-15% of pages
> stored in zswp are duplicates and deduplicate them provides some
> benefits [2].
> 
> Also, there is a different kind of workload that uses zram as blockdev
> and store build outputs into it to reduce wear-out problem of real
> blockdev. In this workload, deduplication hit is very high due to
> temporary files and intermediate object files. Detailed analysis is
> on the bottom of this description.
> 
> Anyway, if we can detect duplicated content and avoid to store duplicated
> content at different memory space, we can save memory. This patch
> tries to do that.
> 
> Implementation is almost simple and intuitive but I should note
> one thing about implementation detail.
> 
> To check duplication, this patch uses checksum of the page and
> collision of this checksum could be possible. There would be
> many choices to handle this situation but this patch chooses
> to allow entry with duplicated checksum to be added to the hash,
> but, not to compare all entries with duplicated checksum
> when checking duplication. I guess that checksum collision is quite
> rare event and we don't need to pay any attention to such a case.
> Therefore, I decided the most simplest way to implement the feature.
> If there is a different opinion, I can accept and go that way.
> 
> Following is the result of this patch.
> 
> Test result #1 (Swap):
> Android Marshmallow, emulator, x86_64, Backporting to kernel v3.18
> 
> orig_data_size: 145297408
> compr_data_size: 32408125
> mem_used_total: 32276480
> dup_data_size: 3188134
> meta_data_size: 1444272
> 
> Last two metrics added to mm_stat are related to this work.
> First one, dup_data_size, is amount of saved memory by avoiding
> to store duplicated page. Later one, meta_data_size, is the amount of
> data structure to support deduplication. If dup > meta, we can judge
> that the patch improves memory usage.
> 
> In Adnroid, we can save 5% of memory usage by this work.
> 
> Test result #2 (Blockdev):
> build the kernel and store output to ext4 FS on zram
> 
> 
> Elapsed time: 249 s
> mm_stat: 430845952 191014886 196898816 0 196898816 28320 0 0 0
> 
> 
> Elapsed time: 250 s
> mm_stat: 430505984 190971334 148365312 0 148365312 28404 0 47287038  3945792
> 
> There is no performance degradation and save 23% memory.
> 
> Test result #3 (Blockdev):
> copy android build output dir(out/host) to ext4 FS on zram
> 
> 
> Elapsed time: out/host: 88 s
> mm_stat: 8834420736 3658184579 3834208256 0 3834208256 32889 0 0 0
> 
> 
> Elapsed time: out/host: 100 s
> mm_stat: 8832929792 3657329322 2832015360 0 2832015360 32609 0 952568877 
> 80880336
> 
> It shows performance degradation roughly 13% and save 24% memory. Maybe,
> it is due to overhead of calculating checksum and comparison.
> 
> Test result #4 (Blockdev):
> copy android build output dir(out/target/common) to ext4 FS on zram
> 
> 
> Elapsed time: out/host: 203 s
> mm_stat: 4041678848 2310355010 2346577920 0 2346582016 500 4 0 0
> 
> 
> Elapsed time: out/host: 201 s
> mm_stat: 4041666560 2310488276 1338150912 0 1338150912 476 0 989088794 
> 24564336
> 
> Memory is saved by 42% and performance is the same. Even if there is overhead
> of calculating checksum and comparison, large hit ratio compensate it since
> hit leads to less compression attempt.
> 
> I checked the detailed reason of savings on kernel build workload and
> there are some cases that deduplication happens.
> 
> 1) *.cmd
> Build command is usually similar in one directory so content of these file
> are very similar. In my system, more than 789 lines in fs/ext4/.namei.o.cmd
> and fs/ext4/.inode.o.cmd are the same in 944 and 938 lines of the file,
> respectively.
> 
> 2) intermediate object files
> built-in.o and temporary object file have the similar contents. More than
> 50% of fs/ext4/ext4.o is the same with fs/ext4/built-in.o.
> 
> 3) vmlinux
> .tmp_vmlinux1 and .tmp_vmlinux2 and arch/x86/boo/compressed/vmlinux.bin
> have the similar contents.
> 
> Android test has similar case that some of object files(.class
> and .so) are similar with another ones.
> (./host/linux-x86/lib/libartd.so and
> ./host/linux-x86-lib/libartd-comiler.so)

RE: [PATCH 1/2 v2] dt-bindings: qoriq-clock: Add coreclk

2017-04-16 Thread Andy Tang
Hi Stephen and Michael,

This patch set has been pending for more than two months since it was first 
sent.
I have not received any response from you so far.

Could you give some comments on it?

Regards,
Andy

-Original Message-
From: Andy Tang 
Sent: Wednesday, April 05, 2017 2:16 PM
To: mturque...@baylibre.com; sb...@codeaurora.org
Cc: robh...@kernel.org; mark.rutl...@arm.com; linux-...@vger.kernel.org; 
devicet...@vger.kernel.org; linux-kernel@vger.kernel.org; 
linux-arm-ker...@lists.infradead.org; Scott Wood 
Subject: RE: [PATCH 1/2 v2] dt-bindings: qoriq-clock: Add coreclk

Hello 

Do you have any comments on this patch set which was acked by Rob?

Regards,
Andy

> -Original Message-
> From: Yuantian Tang [mailto:andy.t...@nxp.com]
> Sent: Monday, March 20, 2017 10:37 AM
> To: mturque...@baylibre.com
> Cc: sb...@codeaurora.org; robh...@kernel.org; mark.rutl...@arm.com; 
> linux-...@vger.kernel.org; devicet...@vger.kernel.org; linux- 
> ker...@vger.kernel.org; linux-arm-ker...@lists.infradead.org; Scott 
> Wood ; Andy Tang 
> Subject: [PATCH 1/2 v2] dt-bindings: qoriq-clock: Add coreclk
> 
> From: Scott Wood 
> 
> ls1012a has separate input root clocks for core PLLs versus the 
> platform PLL, with the latter described as sysclk in the hw docs.
> Update the qoriq-clock binding to allow a second input clock, named 
> "coreclk".  If present, this clock will be used for the core PLLs.
> 
> Signed-off-by: Scott Wood 
> Signed-off-by: Tang Yuantian 
> Acked-by: Rob Herring 
> ---
> v2:
>   -- change the author to Scott
>  Documentation/devicetree/bindings/clock/qoriq-clock.txt | 6 ++
>  1 file changed, 6 insertions(+)
> 
> diff --git a/Documentation/devicetree/bindings/clock/qoriq-clock.txt
> b/Documentation/devicetree/bindings/clock/qoriq-clock.txt
> index aa3526f..119cafd 100644
> --- a/Documentation/devicetree/bindings/clock/qoriq-clock.txt
> +++ b/Documentation/devicetree/bindings/clock/qoriq-clock.txt
> @@ -56,6 +56,11 @@ Optional properties:
>  - clocks: If clock-frequency is not specified, sysclk may be provided
>   as an input clock.  Either clock-frequency or clocks must be
>   provided.
> + A second input clock, called "coreclk", may be provided if
> + core PLLs are based on a different input clock from the
> + platform PLL.
> +- clock-names: Required if a coreclk is present.  Valid names are
> + "sysclk" and "coreclk".
> 
>  2. Clock Provider
> 
> @@ -72,6 +77,7 @@ second cell is the clock index for the specified type.
>   2   hwaccel index (n in CLKCGnHWACSR)
>   3   fman0 for fm1, 1 for fm2
>   4   platform pll0=pll, 1=pll/2, 2=pll/3, 3=pll/4
> + 5   coreclk must be 0
> 
>  3. Example
> 
> --
> 2.1.0.27.g96db324



RE: [PATCH 1/2 v2] dt-bindings: qoriq-clock: Add coreclk

2017-04-16 Thread Andy Tang
Hi Stephen and Michael,

This patch set has been pending for more than two months since it was first 
sent.
I have not received any response from you so far.

Could you give some comments on it?

Regards,
Andy

-Original Message-
From: Andy Tang 
Sent: Wednesday, April 05, 2017 2:16 PM
To: mturque...@baylibre.com; sb...@codeaurora.org
Cc: robh...@kernel.org; mark.rutl...@arm.com; linux-...@vger.kernel.org; 
devicet...@vger.kernel.org; linux-kernel@vger.kernel.org; 
linux-arm-ker...@lists.infradead.org; Scott Wood 
Subject: RE: [PATCH 1/2 v2] dt-bindings: qoriq-clock: Add coreclk

Hello 

Do you have any comments on this patch set which was acked by Rob?

Regards,
Andy

> -Original Message-
> From: Yuantian Tang [mailto:andy.t...@nxp.com]
> Sent: Monday, March 20, 2017 10:37 AM
> To: mturque...@baylibre.com
> Cc: sb...@codeaurora.org; robh...@kernel.org; mark.rutl...@arm.com; 
> linux-...@vger.kernel.org; devicet...@vger.kernel.org; linux- 
> ker...@vger.kernel.org; linux-arm-ker...@lists.infradead.org; Scott 
> Wood ; Andy Tang 
> Subject: [PATCH 1/2 v2] dt-bindings: qoriq-clock: Add coreclk
> 
> From: Scott Wood 
> 
> ls1012a has separate input root clocks for core PLLs versus the 
> platform PLL, with the latter described as sysclk in the hw docs.
> Update the qoriq-clock binding to allow a second input clock, named 
> "coreclk".  If present, this clock will be used for the core PLLs.
> 
> Signed-off-by: Scott Wood 
> Signed-off-by: Tang Yuantian 
> Acked-by: Rob Herring 
> ---
> v2:
>   -- change the author to Scott
>  Documentation/devicetree/bindings/clock/qoriq-clock.txt | 6 ++
>  1 file changed, 6 insertions(+)
> 
> diff --git a/Documentation/devicetree/bindings/clock/qoriq-clock.txt
> b/Documentation/devicetree/bindings/clock/qoriq-clock.txt
> index aa3526f..119cafd 100644
> --- a/Documentation/devicetree/bindings/clock/qoriq-clock.txt
> +++ b/Documentation/devicetree/bindings/clock/qoriq-clock.txt
> @@ -56,6 +56,11 @@ Optional properties:
>  - clocks: If clock-frequency is not specified, sysclk may be provided
>   as an input clock.  Either clock-frequency or clocks must be
>   provided.
> + A second input clock, called "coreclk", may be provided if
> + core PLLs are based on a different input clock from the
> + platform PLL.
> +- clock-names: Required if a coreclk is present.  Valid names are
> + "sysclk" and "coreclk".
> 
>  2. Clock Provider
> 
> @@ -72,6 +77,7 @@ second cell is the clock index for the specified type.
>   2   hwaccel index (n in CLKCGnHWACSR)
>   3   fman0 for fm1, 1 for fm2
>   4   platform pll0=pll, 1=pll/2, 2=pll/3, 3=pll/4
> + 5   coreclk must be 0
> 
>  3. Example
> 
> --
> 2.1.0.27.g96db324



[PATCH 2/2] staging:skein: skein_base.h, skein_block.h: move macros into appropriate header files

2017-04-16 Thread Karim Eshapa
Macros more related to BLK operations.

Signed-off-by: Karim Eshapa 
---
 drivers/staging/skein/skein_base.h  | 28 
 drivers/staging/skein/skein_block.h | 28 
 2 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/drivers/staging/skein/skein_base.h 
b/drivers/staging/skein/skein_base.h
index cd794c1..a159a32 100644
--- a/drivers/staging/skein/skein_base.h
+++ b/drivers/staging/skein/skein_base.h
@@ -169,34 +169,6 @@ int skein_1024_output(struct skein_1024_ctx *ctx, u8 
*hash_val);
 #define SKEIN_T1_TREE_LVL_MASK  (((u64)0x7F) << SKEIN_T1_POS_TREE_LVL)
 #define SKEIN_T1_TREE_LEVEL(n)  (((u64)(n))  << SKEIN_T1_POS_TREE_LVL)
 
-/* tweak word tweak[1]: block type field */
-#define SKEIN_BLK_TYPE_KEY   (0) /* key, for MAC and KDF */
-#define SKEIN_BLK_TYPE_CFG   (4) /* configuration block */
-#define SKEIN_BLK_TYPE_PERS  (8) /* personalization string */
-#define SKEIN_BLK_TYPE_PK   (12) /* pubkey (for digital sigs) */
-#define SKEIN_BLK_TYPE_KDF  (16) /* key identifier for KDF */
-#define SKEIN_BLK_TYPE_NONCE(20) /* nonce for PRNG */
-#define SKEIN_BLK_TYPE_MSG  (48) /* message processing */
-#define SKEIN_BLK_TYPE_OUT  (63) /* output stage */
-#define SKEIN_BLK_TYPE_MASK (63) /* bit field mask */
-
-#define SKEIN_T1_BLK_TYPE(T)   (((u64)(SKEIN_BLK_TYPE_##T)) << \
-   SKEIN_T1_POS_BLK_TYPE)
-#define SKEIN_T1_BLK_TYPE_KEY   SKEIN_T1_BLK_TYPE(KEY)  /* for MAC and KDF */
-#define SKEIN_T1_BLK_TYPE_CFG   SKEIN_T1_BLK_TYPE(CFG)  /* config block */
-#define SKEIN_T1_BLK_TYPE_PERS  SKEIN_T1_BLK_TYPE(PERS) /* personalization */
-#define SKEIN_T1_BLK_TYPE_PKSKEIN_T1_BLK_TYPE(PK)   /* pubkey (for sigs) */
-#define SKEIN_T1_BLK_TYPE_KDF   SKEIN_T1_BLK_TYPE(KDF)  /* key ident for KDF */
-#define SKEIN_T1_BLK_TYPE_NONCE SKEIN_T1_BLK_TYPE(NONCE)/* nonce for PRNG */
-#define SKEIN_T1_BLK_TYPE_MSG   SKEIN_T1_BLK_TYPE(MSG)  /* message processing 
*/
-#define SKEIN_T1_BLK_TYPE_OUT   SKEIN_T1_BLK_TYPE(OUT)  /* output stage */
-#define SKEIN_T1_BLK_TYPE_MASK  SKEIN_T1_BLK_TYPE(MASK) /* field bit mask */
-
-#define SKEIN_T1_BLK_TYPE_CFG_FINAL(SKEIN_T1_BLK_TYPE_CFG | \
-   SKEIN_T1_FLAG_FINAL)
-#define SKEIN_T1_BLK_TYPE_OUT_FINAL(SKEIN_T1_BLK_TYPE_OUT | \
-   SKEIN_T1_FLAG_FINAL)
-
 #define SKEIN_VERSION   (1)
 
 #ifndef SKEIN_ID_STRING_LE  /* allow compile-time personalization */
diff --git a/drivers/staging/skein/skein_block.h 
b/drivers/staging/skein/skein_block.h
index ec1baea..0dfceef 100644
--- a/drivers/staging/skein/skein_block.h
+++ b/drivers/staging/skein/skein_block.h
@@ -14,6 +14,34 @@
 
 #include "skein_base.h" /* get the Skein API definitions   */
 
+/* tweak word tweak[1]: block type field */
+#define SKEIN_BLK_TYPE_KEY   (0) /* key, for MAC and KDF */
+#define SKEIN_BLK_TYPE_CFG   (4) /* configuration block */
+#define SKEIN_BLK_TYPE_PERS  (8) /* personalization string */
+#define SKEIN_BLK_TYPE_PK   (12) /* pubkey (for digital sigs) */
+#define SKEIN_BLK_TYPE_KDF  (16) /* key identifier for KDF */
+#define SKEIN_BLK_TYPE_NONCE(20) /* nonce for PRNG */
+#define SKEIN_BLK_TYPE_MSG  (48) /* message processing */
+#define SKEIN_BLK_TYPE_OUT  (63) /* output stage */
+#define SKEIN_BLK_TYPE_MASK (63) /* bit field mask */
+
+#define SKEIN_T1_BLK_TYPE(T)   (((u64)(SKEIN_BLK_TYPE_##T)) << \
+   SKEIN_T1_POS_BLK_TYPE)
+#define SKEIN_T1_BLK_TYPE_KEY   SKEIN_T1_BLK_TYPE(KEY)  /* for MAC and KDF */
+#define SKEIN_T1_BLK_TYPE_CFG   SKEIN_T1_BLK_TYPE(CFG)  /* config block */
+#define SKEIN_T1_BLK_TYPE_PERS  SKEIN_T1_BLK_TYPE(PERS) /* personalization */
+#define SKEIN_T1_BLK_TYPE_PKSKEIN_T1_BLK_TYPE(PK)   /* pubkey (for sigs) */
+#define SKEIN_T1_BLK_TYPE_KDF   SKEIN_T1_BLK_TYPE(KDF)  /* key ident for KDF */
+#define SKEIN_T1_BLK_TYPE_NONCE SKEIN_T1_BLK_TYPE(NONCE)/* nonce for PRNG */
+#define SKEIN_T1_BLK_TYPE_MSG   SKEIN_T1_BLK_TYPE(MSG)  /* message processing 
*/
+#define SKEIN_T1_BLK_TYPE_OUT   SKEIN_T1_BLK_TYPE(OUT)  /* output stage */
+#define SKEIN_T1_BLK_TYPE_MASK  SKEIN_T1_BLK_TYPE(MASK) /* field bit mask */
+
+#define SKEIN_T1_BLK_TYPE_CFG_FINAL(SKEIN_T1_BLK_TYPE_CFG | \
+   SKEIN_T1_FLAG_FINAL)
+#define SKEIN_T1_BLK_TYPE_OUT_FINAL(SKEIN_T1_BLK_TYPE_OUT | \
+   SKEIN_T1_FLAG_FINAL)
+
 void skein_256_process_block(struct skein_256_ctx *ctx, const u8 *blk_ptr,
 size_t blk_cnt, size_t byte_cnt_add);
 void skein_512_process_block(struct skein_512_ctx *ctx, const u8 *blk_ptr,
-- 
2.7.4



[PATCH 2/2] staging:skein: skein_base.h, skein_block.h: move macros into appropriate header files

2017-04-16 Thread Karim Eshapa
Macros more related to BLK operations.

Signed-off-by: Karim Eshapa 
---
 drivers/staging/skein/skein_base.h  | 28 
 drivers/staging/skein/skein_block.h | 28 
 2 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/drivers/staging/skein/skein_base.h 
b/drivers/staging/skein/skein_base.h
index cd794c1..a159a32 100644
--- a/drivers/staging/skein/skein_base.h
+++ b/drivers/staging/skein/skein_base.h
@@ -169,34 +169,6 @@ int skein_1024_output(struct skein_1024_ctx *ctx, u8 
*hash_val);
 #define SKEIN_T1_TREE_LVL_MASK  (((u64)0x7F) << SKEIN_T1_POS_TREE_LVL)
 #define SKEIN_T1_TREE_LEVEL(n)  (((u64)(n))  << SKEIN_T1_POS_TREE_LVL)
 
-/* tweak word tweak[1]: block type field */
-#define SKEIN_BLK_TYPE_KEY   (0) /* key, for MAC and KDF */
-#define SKEIN_BLK_TYPE_CFG   (4) /* configuration block */
-#define SKEIN_BLK_TYPE_PERS  (8) /* personalization string */
-#define SKEIN_BLK_TYPE_PK   (12) /* pubkey (for digital sigs) */
-#define SKEIN_BLK_TYPE_KDF  (16) /* key identifier for KDF */
-#define SKEIN_BLK_TYPE_NONCE(20) /* nonce for PRNG */
-#define SKEIN_BLK_TYPE_MSG  (48) /* message processing */
-#define SKEIN_BLK_TYPE_OUT  (63) /* output stage */
-#define SKEIN_BLK_TYPE_MASK (63) /* bit field mask */
-
-#define SKEIN_T1_BLK_TYPE(T)   (((u64)(SKEIN_BLK_TYPE_##T)) << \
-   SKEIN_T1_POS_BLK_TYPE)
-#define SKEIN_T1_BLK_TYPE_KEY   SKEIN_T1_BLK_TYPE(KEY)  /* for MAC and KDF */
-#define SKEIN_T1_BLK_TYPE_CFG   SKEIN_T1_BLK_TYPE(CFG)  /* config block */
-#define SKEIN_T1_BLK_TYPE_PERS  SKEIN_T1_BLK_TYPE(PERS) /* personalization */
-#define SKEIN_T1_BLK_TYPE_PKSKEIN_T1_BLK_TYPE(PK)   /* pubkey (for sigs) */
-#define SKEIN_T1_BLK_TYPE_KDF   SKEIN_T1_BLK_TYPE(KDF)  /* key ident for KDF */
-#define SKEIN_T1_BLK_TYPE_NONCE SKEIN_T1_BLK_TYPE(NONCE)/* nonce for PRNG */
-#define SKEIN_T1_BLK_TYPE_MSG   SKEIN_T1_BLK_TYPE(MSG)  /* message processing 
*/
-#define SKEIN_T1_BLK_TYPE_OUT   SKEIN_T1_BLK_TYPE(OUT)  /* output stage */
-#define SKEIN_T1_BLK_TYPE_MASK  SKEIN_T1_BLK_TYPE(MASK) /* field bit mask */
-
-#define SKEIN_T1_BLK_TYPE_CFG_FINAL(SKEIN_T1_BLK_TYPE_CFG | \
-   SKEIN_T1_FLAG_FINAL)
-#define SKEIN_T1_BLK_TYPE_OUT_FINAL(SKEIN_T1_BLK_TYPE_OUT | \
-   SKEIN_T1_FLAG_FINAL)
-
 #define SKEIN_VERSION   (1)
 
 #ifndef SKEIN_ID_STRING_LE  /* allow compile-time personalization */
diff --git a/drivers/staging/skein/skein_block.h 
b/drivers/staging/skein/skein_block.h
index ec1baea..0dfceef 100644
--- a/drivers/staging/skein/skein_block.h
+++ b/drivers/staging/skein/skein_block.h
@@ -14,6 +14,34 @@
 
 #include "skein_base.h" /* get the Skein API definitions   */
 
+/* tweak word tweak[1]: block type field */
+#define SKEIN_BLK_TYPE_KEY   (0) /* key, for MAC and KDF */
+#define SKEIN_BLK_TYPE_CFG   (4) /* configuration block */
+#define SKEIN_BLK_TYPE_PERS  (8) /* personalization string */
+#define SKEIN_BLK_TYPE_PK   (12) /* pubkey (for digital sigs) */
+#define SKEIN_BLK_TYPE_KDF  (16) /* key identifier for KDF */
+#define SKEIN_BLK_TYPE_NONCE(20) /* nonce for PRNG */
+#define SKEIN_BLK_TYPE_MSG  (48) /* message processing */
+#define SKEIN_BLK_TYPE_OUT  (63) /* output stage */
+#define SKEIN_BLK_TYPE_MASK (63) /* bit field mask */
+
+#define SKEIN_T1_BLK_TYPE(T)   (((u64)(SKEIN_BLK_TYPE_##T)) << \
+   SKEIN_T1_POS_BLK_TYPE)
+#define SKEIN_T1_BLK_TYPE_KEY   SKEIN_T1_BLK_TYPE(KEY)  /* for MAC and KDF */
+#define SKEIN_T1_BLK_TYPE_CFG   SKEIN_T1_BLK_TYPE(CFG)  /* config block */
+#define SKEIN_T1_BLK_TYPE_PERS  SKEIN_T1_BLK_TYPE(PERS) /* personalization */
+#define SKEIN_T1_BLK_TYPE_PKSKEIN_T1_BLK_TYPE(PK)   /* pubkey (for sigs) */
+#define SKEIN_T1_BLK_TYPE_KDF   SKEIN_T1_BLK_TYPE(KDF)  /* key ident for KDF */
+#define SKEIN_T1_BLK_TYPE_NONCE SKEIN_T1_BLK_TYPE(NONCE)/* nonce for PRNG */
+#define SKEIN_T1_BLK_TYPE_MSG   SKEIN_T1_BLK_TYPE(MSG)  /* message processing 
*/
+#define SKEIN_T1_BLK_TYPE_OUT   SKEIN_T1_BLK_TYPE(OUT)  /* output stage */
+#define SKEIN_T1_BLK_TYPE_MASK  SKEIN_T1_BLK_TYPE(MASK) /* field bit mask */
+
+#define SKEIN_T1_BLK_TYPE_CFG_FINAL(SKEIN_T1_BLK_TYPE_CFG | \
+   SKEIN_T1_FLAG_FINAL)
+#define SKEIN_T1_BLK_TYPE_OUT_FINAL(SKEIN_T1_BLK_TYPE_OUT | \
+   SKEIN_T1_FLAG_FINAL)
+
 void skein_256_process_block(struct skein_256_ctx *ctx, const u8 *blk_ptr,
 size_t blk_cnt, size_t byte_cnt_add);
 void skein_512_process_block(struct skein_512_ctx *ctx, const u8 *blk_ptr,
-- 
2.7.4



Re: [PATCH 1/3] zram: fix operator precedence to get offset

2017-04-16 Thread Sergey Senozhatsky
Hello,

On (04/15/17 00:33), Minchan Kim wrote:
> On Fri, Apr 14, 2017 at 02:07:47PM +0900, Sergey Senozhatsky wrote:
> > On (04/13/17 09:17), Minchan Kim wrote:
> > [..]
> > > diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> > > index 9e2199060040..83c38a123242 100644
> > > --- a/drivers/block/zram/zram_drv.c
> > > +++ b/drivers/block/zram/zram_drv.c
> > > @@ -930,7 +930,7 @@ static int zram_rw_page(struct block_device *bdev, 
> > > sector_t sector,
> > >   }
> > >  
> > >   index = sector >> SECTORS_PER_PAGE_SHIFT;
> > > - offset = sector & (SECTORS_PER_PAGE - 1) << SECTOR_SHIFT;
> > > + offset = (sector & (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT;
> > 
> > sorry, can it actually produce different results?
> 
> I got your point. Actually, offset was wrong but rw_page is called
> with PAGE_SIZE io while that offset is related to only partial io
> (non-PAGE size io). IOW, although the offset is wrong, it is never used
> in the functions.
> 
> To find subtle corruption in ppc64, I added some debug code to
> catch up wrong buffer overflow and found it with other bugs but
> didn't prove the specific case is valid case or not. Good catch, Sergey!
> 
> However, it should be *fixed* to prevent confusion in the future but surely,
> there is no need for it to go to stable. I will send a reply to Greg to prevent
> merging it to *stable* when he sends a review asking to merge.

cool. thanks!

> And next week I will send another fix which *maybe* removes code to get the
> offset in zram_rw_page.

sounds interesting!

-ss


Re: [PATCH 1/3] zram: fix operator precedence to get offset

2017-04-16 Thread Sergey Senozhatsky
Hello,

On (04/15/17 00:33), Minchan Kim wrote:
> On Fri, Apr 14, 2017 at 02:07:47PM +0900, Sergey Senozhatsky wrote:
> > On (04/13/17 09:17), Minchan Kim wrote:
> > [..]
> > > diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> > > index 9e2199060040..83c38a123242 100644
> > > --- a/drivers/block/zram/zram_drv.c
> > > +++ b/drivers/block/zram/zram_drv.c
> > > @@ -930,7 +930,7 @@ static int zram_rw_page(struct block_device *bdev, 
> > > sector_t sector,
> > >   }
> > >  
> > >   index = sector >> SECTORS_PER_PAGE_SHIFT;
> > > - offset = sector & (SECTORS_PER_PAGE - 1) << SECTOR_SHIFT;
> > > + offset = (sector & (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT;
> > 
> > sorry, can it actually produce different results?
> 
> I got your point. Actually, offset was wrong but rw_page is called
> with PAGE_SIZE io while that offset is related to only partial io
> (non-PAGE size io). IOW, although the offset is wrong, it is never used
> in the functions.
> 
> To find subtle corruption in ppc64, I added some debug code to
> catch up wrong buffer overflow and found it with other bugs but
> didn't prove the specific case is valid case or not. Good catch, Sergey!
> 
> However, it should be *fixed* to prevent confusion in the future but surely,
> there is no need for it to go to stable. I will send a reply to Greg to prevent
> merging it to *stable* when he sends a review asking to merge.

cool. thanks!

> And next week I will send another fix which *maybe* removes code to get the
> offset in zram_rw_page.

sounds interesting!

-ss


Re: [PATCH 1/3] zram: fix operator precedence to get offset

2017-04-16 Thread Sergey Senozhatsky
On (04/13/17 09:17), Minchan Kim wrote:
> Date: Thu, 13 Apr 2017 09:17:00 +0900
> From: Minchan Kim 
> To: Andrew Morton 
> CC: linux-kernel@vger.kernel.org, Sergey Senozhatsky
>  , kernel-t...@lge.com, Minchan Kim
>  , sta...@vger.kernel.org
> Subject: [PATCH 1/3] zram: fix operator precedence to get offset
> X-Mailer: git-send-email 2.7.4
> 
> In zram_rw_page, the logic to get offset is wrong by operator precedence
> (i.e., "<<" is higher than "&"). With wrong offset, zram can corrupt the
> user's data. This patch fixes it.
> 
> Fixes: 8c7f01025 ("zram: implement rw_page operation of zram")
> Cc: sta...@vger.kernel.org
> Signed-off-by: Minchan Kim 

Reviewed-by: Sergey Senozhatsky 

Re: [PATCH 1/3] zram: fix operator precedence to get offset

2017-04-16 Thread Sergey Senozhatsky
On (04/13/17 09:17), Minchan Kim wrote:
> Date: Thu, 13 Apr 2017 09:17:00 +0900
> From: Minchan Kim 
> To: Andrew Morton 
> CC: linux-kernel@vger.kernel.org, Sergey Senozhatsky
>  , kernel-t...@lge.com, Minchan Kim
>  , sta...@vger.kernel.org
> Subject: [PATCH 1/3] zram: fix operator precedence to get offset
> X-Mailer: git-send-email 2.7.4
> 
> In zram_rw_page, the logic to get offset is wrong by operator precedence
> (i.e., "<<" is higher than "&"). With wrong offset, zram can corrupt the
> user's data. This patch fixes it.
> 
> Fixes: 8c7f01025 ("zram: implement rw_page operation of zram")
> Cc: sta...@vger.kernel.org
> Signed-off-by: Minchan Kim 

Reviewed-by: Sergey Senozhatsky 

  1   2   3   4   5   6   7   8   >