RE: [v3, 5/9] fsl/fman: Add Frame Manager support

2015-07-23 Thread Liberman Igal

Regards,
Igal Liberman.

 +static struct platform_driver fm_driver = {
 + .driver = {
 +.name = fsl-fman,
 +.of_match_table = fm_match,
 +},
 + .probe = fm_probe,
 +};
 +
 +builtin_platform_driver(fm_driver);
 +
 +static int __init __cold fm_load(void)
 +{
 + if (platform_driver_register(fm_driver)) {
 + pr_crit(platform_driver_register() failed\n);
 + return -ENODEV;
 + }
 +
 + pr_info(Freescale FMan module\n);
 + return 0;
 +}
 +
 +device_initcall(fm_load);

Please notice, when using builtin_platform_driver, device_initcall(fm_load); 
becomes redundant. 
Same issue in 2 other places.
I have patches which fix that which were left out of this submission, I'll add 
them to v4.


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v7 3/3] leds/powernv: Add driver for PowerNV platform

2015-07-23 Thread Jacek Anaszewski

Vasant,

On 23.07.2015 10:08, Vasant Hegde wrote:

On 07/23/2015 01:25 PM, Jacek Anaszewski wrote:

Hi Vasant,



Jacek,

.../...


+/* PowerNV LED data */
+struct powernv_led_data {
+struct led_classdevcdev;
+char*loc_code;/* LED location code */
+intled_type;/* OPAL_SLOT_LED_TYPE_* */
+enum led_brightnessvalue;/* Brightness value */


You don't need 'value' as brightness value is already stored in
cdev.brightness.



Agree. I'll remove.


+/*
+ * LED classdev 'brightness_get' function. This schedules work
+ * to update LED state.
+ */
+static void powernv_brightness_set(struct led_classdev *led_cdev,
+   enum led_brightness value)
+{
+struct powernv_led_data *powernv_led =
+container_of(led_cdev, struct powernv_led_data, cdev);
+
+/* Do not modify LED in unload path */
+if (led_disabled)
+return;
+
+/* Prepare the request */
+powernv_led-value = value;
+
+ return powernv_led_set(powernv_led);


Isn't mutex_lock/unlock missing here?


Yes. I realized this after sending out the patch. I will fix this.
.../...


+
+max_led_type = devm_kzalloc(dev, sizeof(*max_led_type), GFP_KERNEL);
+if (!max_led_type)
+return -ENOMEM;
+
+mutex_init(lock);
+*max_led_type = cpu_to_be64(OPAL_SLOT_LED_TYPE_MAX);
+
+platform_set_drvdata(pdev, lock);


Setting only lock as drvdata looks odd to me and I haven't noticed
anyone doing that.
I'd prefer to put lock, led_disabled and max_led_type in a common
struct and set it as drvdata. I know that I accepted this design, but
I didn't take into account these details.


Yeah. Even I looked into existing code and I don't see anyone using like this.
Since it's void * and we just need lock, I added like this.

If I break this into two structure, then I've to use platform_get_drvdata() call
in multiple functions like powernv_brightness_set() to get max_let_type etc. Is
that fine?


You could add wrappers for grouping instructions required to
retrieve given properties of the common struct.
Alternatively you could add a pointer to the common struct
in the struct powernv_led_data. You can play with these approaches
and choose the one which looks better.

--
Best Regards,
Jacek Anaszewski
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 2/2] powerpc/config: enable aquantia PHY

2015-07-23 Thread shh.xie
From: Shaohui Xie shaohui@freescale.com

Aquantia PHYs used on platforms such as T2080RDB, T1024RDB.

Signed-off-by: Shaohui Xie shaohui@freescale.com
---
 arch/powerpc/configs/corenet32_smp_defconfig | 1 +
 arch/powerpc/configs/corenet64_smp_defconfig | 1 +
 2 files changed, 2 insertions(+)

diff --git a/arch/powerpc/configs/corenet32_smp_defconfig 
b/arch/powerpc/configs/corenet32_smp_defconfig
index fe77d60..7e98cc2 100644
--- a/arch/powerpc/configs/corenet32_smp_defconfig
+++ b/arch/powerpc/configs/corenet32_smp_defconfig
@@ -96,6 +96,7 @@ CONFIG_FSL_PQ_MDIO=y
 CONFIG_FSL_XGMAC_MDIO=y
 CONFIG_E1000=y
 CONFIG_E1000E=y
+CONFIG_AQUANTIA_PHY=y
 CONFIG_AT803X_PHY=y
 CONFIG_TERANETICS_PHY=y
 CONFIG_VITESSE_PHY=y
diff --git a/arch/powerpc/configs/corenet64_smp_defconfig 
b/arch/powerpc/configs/corenet64_smp_defconfig
index 3c5b5ac..b935260 100644
--- a/arch/powerpc/configs/corenet64_smp_defconfig
+++ b/arch/powerpc/configs/corenet64_smp_defconfig
@@ -91,6 +91,7 @@ CONFIG_DUMMY=y
 CONFIG_FSL_PQ_MDIO=y
 CONFIG_FSL_XGMAC_MDIO=y
 CONFIG_E1000E=y
+CONFIG_AQUANTIA_PHY=y
 CONFIG_TERANETICS_PHY=y
 CONFIG_VITESSE_PHY=y
 CONFIG_FIXED_PHY=y
-- 
1.8.4.1

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 1/2] powerpc/config: enable teranetics PHY

2015-07-23 Thread shh.xie
From: Shaohui Xie shaohui@freescale.com

The PHY uses XAUI interface to connect to MAC, mostly the PHY used on
riser card.

Signed-off-by: Shaohui Xie shaohui@freescale.com
---
 arch/powerpc/configs/corenet32_smp_defconfig | 1 +
 arch/powerpc/configs/corenet64_smp_defconfig | 1 +
 2 files changed, 2 insertions(+)

diff --git a/arch/powerpc/configs/corenet32_smp_defconfig 
b/arch/powerpc/configs/corenet32_smp_defconfig
index 3765993..fe77d60 100644
--- a/arch/powerpc/configs/corenet32_smp_defconfig
+++ b/arch/powerpc/configs/corenet32_smp_defconfig
@@ -97,6 +97,7 @@ CONFIG_FSL_XGMAC_MDIO=y
 CONFIG_E1000=y
 CONFIG_E1000E=y
 CONFIG_AT803X_PHY=y
+CONFIG_TERANETICS_PHY=y
 CONFIG_VITESSE_PHY=y
 CONFIG_FIXED_PHY=y
 CONFIG_MDIO_BUS_MUX_GPIO=y
diff --git a/arch/powerpc/configs/corenet64_smp_defconfig 
b/arch/powerpc/configs/corenet64_smp_defconfig
index 33cd1df..3c5b5ac 100644
--- a/arch/powerpc/configs/corenet64_smp_defconfig
+++ b/arch/powerpc/configs/corenet64_smp_defconfig
@@ -91,6 +91,7 @@ CONFIG_DUMMY=y
 CONFIG_FSL_PQ_MDIO=y
 CONFIG_FSL_XGMAC_MDIO=y
 CONFIG_E1000E=y
+CONFIG_TERANETICS_PHY=y
 CONFIG_VITESSE_PHY=y
 CONFIG_FIXED_PHY=y
 CONFIG_MDIO_BUS_MUX_GPIO=y
-- 
1.8.4.1

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH] powerpc/mm: Fix pte_pagesize_index() crash on 4K w/64K hash

2015-07-23 Thread Michael Ellerman
The powerpc kernel can be built to have either a 4K PAGE_SIZE or a 64K
PAGE_SIZE.

However when built with a 4K PAGE_SIZE there is an additional config
option which can be enabled, PPC_HAS_HASH_64K, which means the kernel
also knows how to hash a 64K page even though the base PAGE_SIZE is 4K.

This is used in one obscure configuration, to support 64K pages for SPU
local store on the Cell processor when the rest of the kernel is using
4K pages.

In this configuration, pte_pagesize_index() is defined to just pass
through its arguments to get_slice_psize(). However pte_pagesize_index()
is called for both user and kernel addresses, whereas get_slice_psize()
only knows how to handle user addresses.

This has been broken forever, however until recently it happened to
work. That was because in get_slice_psize() the large kernel address
would cause the right shift of the slize mask to return zero.

However in commit 7aa0727f3302 powerpc/mm: Increase the slice range to
64TB, the get_slice_psize() code was changed so that instead of a right
shift we do an array lookup based on the address. When passed a kernel
address this means we index way off the end of the slice array and
return random junk.

That is only fatal if we happen to hit something non-zero, but when we
do return a non-zero value we confuse the MMU code and eventually cause
a check stop.

This fix is ugly, but simple. When we're called for a kernel address we
return 4K, which is always correct in this configuration, otherwise we
use the slice mask.

Fixes: 7aa0727f3302 (powerpc/mm: Increase the slice range to 64TB)
Reported-by: Cyril Bur cyril...@gmail.com
Signed-off-by: Michael Ellerman m...@ellerman.id.au
---
 arch/powerpc/include/asm/pgtable-ppc64.h | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h 
b/arch/powerpc/include/asm/pgtable-ppc64.h
index 3bb7488bd24b..330ae1d81662 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -135,7 +135,15 @@
 #define pte_iterate_hashed_end() } while(0)
 
 #ifdef CONFIG_PPC_HAS_HASH_64K
-#define pte_pagesize_index(mm, addr, pte)  get_slice_psize(mm, addr)
+#define pte_pagesize_index(mm, addr, pte)  \
+   ({  \
+   unsigned int psize; \
+   if (is_kernel_addr(addr))   \
+   psize = MMU_PAGE_4K;\
+   else\
+   psize = get_slice_psize(mm, addr);  \
+   psize;  \
+   })
 #else
 #define pte_pagesize_index(mm, addr, pte)  MMU_PAGE_4K
 #endif
-- 
2.1.0

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [RFC v4 03/25] m68k/atari: Move Atari-specific code out of drivers/char/nvram.c

2015-07-23 Thread Michael Schmitz
Hi Christian,

here's what Finn asked me to run as tests:

# dmesg | grep this_id  nvram.out
# cat /proc/driver/nvram  nvram.out
# hexdump -C /dev/nvram  nvram.out
# cp /dev/nvram /tmp/nvram
# cp /tmp/nvram /dev/nvram
# md5sum /dev/nvram /tmp/nvram  nvram.out

What you sent so far looks OK. I've tested the change of SCSI ID
(using EmuTOS) along with a trivial patch to atari_scsi.c (replace
offset 14 by 16) and the driver now uses the stored ID properly. I'll
sent kernels to test Finn's NVRAM patch for regression ASAP.

Thanks for your offer of help!

Cheers,

  Michael


On Thu, Jul 23, 2015 at 9:21 PM, Christian T. Steigies c...@debian.org wrote:
 On Wed, Jul 22, 2015 at 02:22:21PM +1000, Finn Thain wrote:

 Anyone with a suitable Atari, i.e. ATARIHW_PRESENT(TT_CLK), who can boot
 both TOS and Linux could resolve the question. (Perhaps with an emulator?)

 Any old kernel binary would do, since atari_scsi should print either
 HOSTID=n or this_id n at startup.

 If n doesn't agree with what TOS says about the host's SCSI ID, then I
 think a trivial patch is safe enough. Especially if cat /proc/driver/nvram
 produces a SCSI host ID : m that does agree with TOS.

 root@garkin:~cat /proc/hardware
 Model:  Atari Falcon
 System Memory:  522752K
 510 MB at 0x0100 (alternate RAM)
 Detected hardware:
 Falcon Shifter
 Programmable Sound Generator
 PCM 8 Bit Sound
 CODEC Sound
 SCSI Controller NCR5380 (Falcon style)
 IDE Interface
 8/16 Mhz Switch for FDC
 Multi Function Peripheral MFP 68901
 Serial Communications Controller SCC 8530
 Paddle Interface
 DMA Controller for SCC
 Clock Chip MC146818A
 Blitter
 DSP56001 processor

 root@garkin:~dmesg |grep SCSI
 [0.00] Atari hardware found: VIDEL STDMA-SCSI ST_MFP YM2149 PCM CODEC 
 DSP56K SCC_DMA SCC ANALOG_JOY BLITTER IDE TT_CLK FDC_SPEED
 [0.41] SCSI subsystem initialized
 [0.85] Block layer SCSI generic (bsg) driver version 0.4 loaded 
 (major 251)
 [4.23] Atari SCSI: resetting the SCSI bus... done
 [6.75] scsi host0: Atari native SCSI, io_port 0x0, n_io_port 0, base 
 0x0, irq 15, can_queue 8, cmd_per_lun 1, sg_tablesize 0, this_id 7, flags { 
 }, options { REAL_DMA SUPPORT_TAGS }

 root@garkin:~cat /proc/driver/nvram
 Checksum status  : not valid
 Boot preference  : 0xff (undefined)
 SCSI arbitration : on
 SCSI host ID : 7
 OS language  : 255 (undefined)
 Keyboard language: 255 (undefined)
 Date format  : 7 (undefined), 24h clock
 Boot delay   : 255s
 Video mode   : 4 colors, 40 columns, TV NTSC monitor
no overscan, compat. mode off


 Let me know if you need more info.

 Christian
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v3 7/8] perf: Define PMU_TXN_READ interface

2015-07-23 Thread Sukadev Bhattiprolu
Peter Zijlstra [pet...@infradead.org] wrote:
| On Wed, Jul 22, 2015 at 04:19:16PM -0700, Sukadev Bhattiprolu wrote:
|  Peter Zijlstra [pet...@infradead.org] wrote:
|  | I've not woken up yet, and not actually fully read the email, but can
|  | you stuff the entire above chunk inside the IPI?
|  | 
|  | I think you could then actually optimize __perf_event_read() as well,
|  | because all these events should be on the same context, so no point in
|  | calling update_*time*() for every event or so.
|  | 
|  
|  Do you mean something like this (will move the rename to a separate
|  patch before posting):
| 
| More like so.. please double check, I've not even had tea yet.

Yeah, I realized I had ignored the 'event-cpu' spec.
Will try this out. Thanks,

Sukadev

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH] mm: rename and document alloc_pages_exact_node

2015-07-23 Thread David Rientjes
On Thu, 23 Jul 2015, Christoph Lameter wrote:

  The only possible downside would be existing users of
  alloc_pages_node() that are calling it with an offline node.  Since it's a
  VM_BUG_ON() that would catch that, I think it should be changed to a
  VM_WARN_ON() and eventually fixed up because it's nonsensical.
  VM_BUG_ON() here should be avoided.
 
 The offline node thing could be addresses by using numa_mem_id()?
 

I was concerned about any callers that were passing an offline node, not 
NUMA_NO_NODE, today.  One of the alloc-node functions has a VM_BUG_ON() 
for it, the other silently calls node_zonelist() on it.

I suppose the final alloc_pages_node() implementation could be

if (nid == NUMA_NO_NODE || VM_WARN_ON(!node_online(nid)))
nid = numa_mem_id();

VM_BUG_ON(nid  0 || nid = MAX_NUMNODES);
return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));

though.
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH] mm: rename and document alloc_pages_exact_node

2015-07-23 Thread Christoph Lameter
On Wed, 22 Jul 2015, David Rientjes wrote:

 Eek, yeah, that does look bad.  I'm not even sure the

   if (nid  0)
   nid = numa_node_id();

 is correct; I think this should be comparing to NUMA_NO_NODE rather than
 all negative numbers, otherwise we silently ignore overflow and nobody
 ever knows.

Comparing to NUMA_NO_NODE would be better. Also use numa_mem_id() instead
to support memoryless nodes better?

 The only possible downside would be existing users of
 alloc_pages_node() that are calling it with an offline node.  Since it's a
 VM_BUG_ON() that would catch that, I think it should be changed to a
 VM_WARN_ON() and eventually fixed up because it's nonsensical.
 VM_BUG_ON() here should be avoided.

The offline node thing could be addresses by using numa_mem_id()?

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH] powerpc: Update corenet32_smp_defconfig for modern distros

2015-07-23 Thread Michael Ellerman
corenet32_smp_defconfig is missing some things that modern distros
require, enable them.

Signed-off-by: Michael Ellerman m...@ellerman.id.au
---
 arch/powerpc/configs/corenet32_smp_defconfig | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/arch/powerpc/configs/corenet32_smp_defconfig 
b/arch/powerpc/configs/corenet32_smp_defconfig
index 37659937bd12..12551385234c 100644
--- a/arch/powerpc/configs/corenet32_smp_defconfig
+++ b/arch/powerpc/configs/corenet32_smp_defconfig
@@ -2,6 +2,7 @@ CONFIG_PPC_85xx=y
 CONFIG_SMP=y
 CONFIG_NR_CPUS=8
 CONFIG_SYSVIPC=y
+CONFIG_FHANDLE=y
 CONFIG_POSIX_MQUEUE=y
 CONFIG_AUDIT=y
 CONFIG_NO_HZ=y
@@ -10,6 +11,10 @@ CONFIG_BSD_PROCESS_ACCT=y
 CONFIG_IKCONFIG=y
 CONFIG_IKCONFIG_PROC=y
 CONFIG_LOG_BUF_SHIFT=14
+CONFIG_CGROUPS=y
+CONFIG_CPUSETS=y
+CONFIG_CGROUP_CPUACCT=y
+CONFIG_CGROUP_SCHED=y
 CONFIG_BLK_DEV_INITRD=y
 CONFIG_KALLSYMS_ALL=y
 CONFIG_EMBEDDED=y
@@ -172,6 +177,7 @@ CONFIG_NLS_CODEPAGE_850=y
 CONFIG_NLS_ISO8859_1=y
 CONFIG_NLS_UTF8=m
 CONFIG_DEBUG_INFO=y
+CONFIG_DEBUG_FS=y
 CONFIG_MAGIC_SYSRQ=y
 CONFIG_DEBUG_SHIRQ=y
 CONFIG_DETECT_HUNG_TASK=y
-- 
2.1.0

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v5 5/7] powerpc/powernv: add event attribute and group to nest pmu

2015-07-23 Thread Madhavan Srinivasan


On Wednesday 22 July 2015 10:14 AM, Daniel Axtens wrote:
 On Thu, 2015-07-16 at 16:43 +0530, Madhavan Srinivasan wrote:
 Add code to create event/format attributes and attribute groups for
 each nest pmu.

 Cc: Michael Ellerman m...@ellerman.id.au
 Cc: Benjamin Herrenschmidt b...@kernel.crashing.org
 Cc: Paul Mackerras pau...@samba.org
 Cc: Anton Blanchard an...@samba.org
 Cc: Sukadev Bhattiprolu suka...@linux.vnet.ibm.com
 Cc: Anshuman Khandual khand...@linux.vnet.ibm.com
 Cc: Stephane Eranian eran...@google.com
 Signed-off-by: Madhavan Srinivasan ma...@linux.vnet.ibm.com
 ---
  arch/powerpc/perf/nest-pmu.c | 65 
 
  1 file changed, 65 insertions(+)

 diff --git a/arch/powerpc/perf/nest-pmu.c b/arch/powerpc/perf/nest-pmu.c
 index c4c08e4dee55..f3418bdec1cd 100644
 --- a/arch/powerpc/perf/nest-pmu.c
 +++ b/arch/powerpc/perf/nest-pmu.c
 @@ -13,6 +13,17 @@
  static struct perchip_nest_info p8_nest_perchip_info[P8_NEST_MAX_CHIPS];
  static struct nest_pmu *per_nest_pmu_arr[P8_NEST_MAX_PMUS];
  
 +PMU_FORMAT_ATTR(event, config:0-20);
 +static struct attribute *p8_nest_format_attrs[] = {
 +format_attr_event.attr,
 +NULL,
 +};
 +
 +static struct attribute_group p8_nest_format_group = {
 +.name = format,
 +.attrs = p8_nest_format_attrs,
 +};
 +
  static int nest_event_info(struct property *pp, char *name,
  struct nest_ima_events *p8_events, int string, u32 val)
  {
 @@ -46,6 +57,56 @@ static int nest_event_info(struct property *pp, char 
 *name,
  return 0;
  }
  
 +/*
 + * Populate event name and string in attribute
 + */
 +static struct attribute *dev_str_attr(const char *name, const char *str)
 +{
 +struct perf_pmu_events_attr *attr;
 +
 +attr = kzalloc(sizeof(*attr), GFP_KERNEL);
 +
 +sysfs_attr_init(attr-attr.attr);
 +
 +attr-event_str = str;
 +attr-attr.attr.name = name;
 +attr-attr.attr.mode = 0444;
 +attr-attr.show = perf_event_sysfs_show;
 +
 +return attr-attr.attr;
 So I asked you about this before, and you pointed me to
 perf_event_sysfs_show. Looking at that in kernel/events/core.c, it looks
 like that uses container_of to pull out the perf_pmu_events_attr. So I
 guess that is at least mostly correct.

 I'm hoping something else uses container_of to pull out attr-attr, so
 that they can actually grab the attr-attr.show function pointer, so
 that perf_event_sysfs_show actually gets called. Where would that be?

OK, what we return is the device attribute struct which also have sysfs_ops.
So -show and -store are those entries in the strucutre and here we only
populate show ops using perf_event_sysfs_show. Now at the time of
pmu registering, we end up calling device_add-device_create_file-
sysfs_create_file which will end up adding a sysfs device file linked to
this
-show ops.

 +}
 +
 +static int update_events_in_group(
 +struct nest_ima_events *p8_events, int idx, struct nest_pmu *pmu)
 +{
 +struct attribute_group *attr_group;
 +struct attribute **attrs;
 +int i;
 +
 +/*
 + * Allocate memory for both event attribute group and for
 + * event attributes array.
 + */
 +attr_group = kzalloc(((sizeof(struct attribute *) * (idx + 1)) +
 +sizeof(*attr_group)), GFP_KERNEL);
 +if (!attr_group)
 +return -ENOMEM;
 +
 +/*
 + * Assign memory for event attribute array
 + */
 +attrs = (struct attribute **)(attr_group + 1);
 +attr_group-name = events;
 +attr_group-attrs = attrs;
 I am super uncomfortable with this block, especially the assignment to
 attrs. I *think* you're trying to allocate an attribute group and a set
 of attributes, but you've combined the allocation into one big
 contiguous chunk, and then you're trying to tease them apart. Is that
 necessary? Could it be two allocs, one for the attribute_group and one
 for the attribute?

I wanted to avoid two function calls here, but this is not a hot path
This happens at the pmu init time (booting), so I guess we can have
two allocs here.  
 


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v5 4/7] powerpc/powernv: detect supported nest pmus and its events

2015-07-23 Thread Madhavan Srinivasan


On Wednesday 22 July 2015 09:37 AM, Daniel Axtens wrote:
  
  static struct perchip_nest_info p8_nest_perchip_info[P8_NEST_MAX_CHIPS];
 +static struct nest_pmu *per_nest_pmu_arr[P8_NEST_MAX_PMUS];
 +
 +static int nest_event_info(struct property *pp, char *name,
 +struct nest_ima_events *p8_events, int string, u32 val)
 'int string' is a bit confusing. 'bool is_string' might be clearer, but
 I think it would be even better still to have different functions for
 string and non-string cases, especially because you only need val in the
 non-string case.

I would perfer to be a single function, since most of the code is same
just of the sting or val part. yes. We can make is as is_string and will
add
comment explaining what is done here? 

 That will also allow you to give the functions clearer names. I think
 the function is populating the event with info from the dt property (in
 the string case) or the val argument (non-string case) - maybe the names
 could reflect that somehow?
 +{
 +char *buf;
 +

 +
 +static int nest_pmu_create(struct device_node *dev, int pmu_index)
 +{
 +struct nest_ima_events **p8_events_arr, *p8_events;
 +struct nest_pmu *pmu_ptr;
 +struct property *pp;
 +char *buf, *start;
 +const __be32 *lval;
 +u32 val;
 +int idx = 0, ret;
 +
 +if (!dev)
 +return -EINVAL;
 +
 +/* memory for nest pmus */
 +pmu_ptr = kzalloc(sizeof(struct nest_pmu), GFP_KERNEL);
 +if (!pmu_ptr)
 +return -ENOMEM;
 +
 +/* Needed for hotplug/migration */
 +per_nest_pmu_arr[pmu_index] = pmu_ptr;
 +
 +/* memory for nest pmu events */
 +p8_events_arr = kzalloc((sizeof(struct nest_ima_events) * 64),
 +GFP_KERNEL);
 +if (!p8_events_arr)
 +return -ENOMEM;
 +p8_events = (struct nest_ima_events *)p8_events_arr;
 I'm still quite uncomfortable with this.
  - Why * 64? Should it be * P8_NEST_MAX_EVENTS_SUPPORTED? Or is it a
 different constant?

Yes it should be P8_NEST_MAX_EVENTS_SUPPORTED, and it should be
define as 64. Missed it to replace the macro. Nice catch.

  - p8_events = p8_events_arr[0] would be clearer

 +
 +/*
 + * Loop through each property
 + */
 +for_each_property_of_node(dev, pp) {
 +start = pp-name;
 +
 +if (!strcmp(pp-name, name)) {
 +if (!pp-value ||
 +   (strnlen(pp-value, pp-length) == pp-length) ||
 +   (pp-length  P8_NEST_MAX_PMU_NAME_LEN))
 +return -EINVAL;
 +
 +buf = kzalloc(P8_NEST_MAX_PMU_NAME_LEN, GFP_KERNEL);
 +if (!buf)
 +return -ENOMEM;
 +
 +/* Save the name to register it later */
 +sprintf(buf, Nest_%s, (char *)pp-value);
 +pmu_ptr-pmu.name = (char *)buf;
 +continue;
 +}
 +
 +/* Skip these, we dont need it */
 don't instead of dont.

Will change it.

 +if (!strcmp(pp-name, phandle) ||
 +!strcmp(pp-name, device_type) ||
 +!strcmp(pp-name, linux,phandle))
 +continue;
 +
 +if (strncmp(pp-name, unit., 5) == 0) {
 +/* Skip first few chars in the name */
 The whole comment is pretty uninformative, as is the similar comment
 below. If you need a comment at all, maybe something along the lines of
 Strip the prefix because reason we don't need/want the prefix?

Yes will change it. Thanks

 +start += 5;
 +ret = nest_event_info(pp, start, p8_events++, 1, 0);
 +} else if (strncmp(pp-name, scale., 6) == 0) {
 +/* Skip first few chars in the name */
 +start += 6;
 +ret = nest_event_info(pp, start, p8_events++, 1, 0);
 +} else {
 +lval = of_get_property(dev, pp-name, NULL);
 +val = (uint32_t)be32_to_cpup(lval);
 +
 +ret = nest_event_info(pp, start, p8_events++, 0, val);
 +}
 +
 +if (ret)
 +return ret;
 +
 +/* book keeping */
 +idx++;
 You don't seem to use idx in this function, apart from incrementing it
 here...?

Used in next patch.

 +}
 +
 +return 0;
 +}


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH V4 4/6] mm: mlock: Introduce VM_LOCKONFAULT and add mlock flags to enable it

2015-07-23 Thread Eric B Munson
On Thu, 23 Jul 2015, Vlastimil Babka wrote:

 On 07/22/2015 08:43 PM, Eric B Munson wrote:
  On Wed, 22 Jul 2015, Vlastimil Babka wrote:
  
  
  Hi,
  
  I think you should include a complete description of which
  transitions for vma states and mlock2/munlock2 flags applied on them
  are valid and what they do. It will also help with the manpages.
  You explained some to Jon in the last thread, but I think there
  should be a canonical description in changelog (if not also
  Documentation, if mlock is covered there).
  
  For example the scenario Jon asked, what happens after a
  mlock2(MLOCK_ONFAULT) followed by mlock2(MLOCK_LOCKED), and that the
  answer is nothing. Your promised code comment for
  apply_vma_flags() doesn't suffice IMHO (and I'm not sure it's there,
  anyway?).
  
  I missed adding that comment to the code, will be there in V5 along with
  the description in the changelog.
 
 Thanks!
 
  
  But the more I think about the scenario and your new VM_LOCKONFAULT
  vma flag, it seems awkward to me. Why should munlocking at all care
  if the vma was mlocked with MLOCK_LOCKED or MLOCK_ONFAULT? In either
  case the result is that all pages currently populated are munlocked.
  So the flags for munlock2 should be unnecessary.
  
  Say a user has a large area of interleaved MLOCK_LOCK and MLOCK_ONFAULT
  mappings and they want to unlock only the ones with MLOCK_LOCK.  With
  the current implementation, this is possible in a single system call
  that spans the entire region.  With your suggestion, the user would have
  to know what regions where locked with MLOCK_LOCK and call munlock() on
  each of them.  IMO, the way munlock2() works better mirrors the way
  munlock() currently works when called on a large area of interleaved
  locked and unlocked areas.
 
 Um OK, that scenario is possible in theory. But I have a hard time imagining
 that somebody would really want to do that. I think much more people would
 benefit from a simpler API.

It wasn't about imagining a scenario, more about keeping parity with
something that currently works (unlocking a large area of interleaved
locked and unlocked regions).  However, there is no reason we can't add
the new munlock2 later if it is desired.

 
  
  
  I also think VM_LOCKONFAULT is unnecessary. VM_LOCKED should be
  enough - see how you had to handle the new flag in all places that
  had to handle the old flag? I think the information whether mlock
  was supposed to fault the whole vma is obsolete at the moment mlock
  returns. VM_LOCKED should be enough for both modes, and the flag to
  mlock2 could just control whether the pre-faulting is done.
  
  So what should be IMHO enough:
  - munlock can stay without flags
  - mlock2 has only one new flag MLOCK_ONFAULT. If specified,
  pre-faulting is not done, just set VM_LOCKED and mlock pages already
  present.
  - same with mmap(MAP_LOCKONFAULT) (need to define what happens when
  both MAP_LOCKED and MAP_LOCKONFAULT are specified).
  
  Now mlockall(MCL_FUTURE) muddles the situation in that it stores the
  information for future VMA's in current-mm-def_flags, and this
  def_flags would need to distinguish VM_LOCKED with population and
  without. But that could be still solvable without introducing a new
  vma flag everywhere.
  
  With you right up until that last paragraph.  I have been staring at
  this a while and I cannot come up a way to handle the
  mlockall(MCL_ONFAULT) without introducing a new vm flag.  It doesn't
  have to be VM_LOCKONFAULT, we could use the model that Michal Hocko
  suggested with something like VM_FAULTPOPULATE.  However, we can't
  really use this flag anywhere except the mlock code becuase we have to
  be able to distinguish a caller that wants to use MLOCK_LOCK with
  whatever control VM_FAULTPOPULATE might grant outside of mlock and a
  caller that wants MLOCK_ONFAULT.  That was a long way of saying we need
  an extra vma flag regardless.  However, if that flag only controls if
  mlock pre-populates it would work and it would do away with most of the
  places I had to touch to handle VM_LOCKONFAULT properly.
 
 Yes, it would be a good way. Adding a new vma flag is probably cleanest after
 all, but the flag would be set *in addition* to VM_LOCKED, *just* to prevent
 pre-faulting. The places that check VM_LOCKED for the actual page mlocking 
 (i.e.
 try_to_unmap_one) would just keep checking VM_LOCKED. The places where 
 VM_LOCKED
 is checked to trigger prepopulation, would skip that if VM_LOCKONFAULT is also
 set. Having VM_LOCKONFAULT set without also VM_LOCKED itself would be invalid 
 state.
 
 This should work fine with the simplified API as I proposed so let me 
 reiterate
 and try fill in the blanks:
 
 - mlock2 has only one new flag MLOCK_ONFAULT. If specified, VM_LOCKONFAULT is
 set in addition to VM_LOCKED and no prefaulting is done
   - old mlock syscall naturally behaves as mlock2 without MLOCK_ONFAULT
   - calling mlock/mlock2 on an already-mlocked area (if 

RE: [v3, 2/9] fsl/fman: Add the FMan port FLIB

2015-07-23 Thread Liberman Igal


Regards,
Igal Liberman.

 -Original Message-
 From: Stephen Hemminger [mailto:step...@networkplumber.org]
 Sent: Wednesday, July 22, 2015 7:56 PM
 To: Liberman Igal-B31950
 Cc: net...@vger.kernel.org; linuxppc-dev@lists.ozlabs.org; linux-
 ker...@vger.kernel.org; Wood Scott-B07421; Bucur Madalin-Cristian-B32716;
 pebo...@tiscali.nl; joakim.tjernl...@transmode.se; p...@mindchasers.com
 Subject: Re: [v3, 2/9] fsl/fman: Add the FMan port FLIB
 
 On Wed, 22 Jul 2015 14:21:48 +0300
 igal.liber...@freescale.com wrote:
 
  diff --git a/drivers/net/ethernet/freescale/fman/Kconfig
 b/drivers/net/ethernet/freescale/fman/Kconfig
  index 8aeae29..af42c3a 100644
  --- a/drivers/net/ethernet/freescale/fman/Kconfig
  +++ b/drivers/net/ethernet/freescale/fman/Kconfig
  @@ -5,3 +5,4 @@ config FSL_FMAN
  help
  Freescale Data-Path Acceleration Architecture Frame
 Manager
  (FMan) support
  +
 
 Bogus blank line introduced at end of file?
 Why was this not in patch 1?

It was added during the patch splitting.
I'll fix that in the next submission, thank you for noticing.

Igal.
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [4.2-rc3] macintosh/ans-lcd.c: Missing include causes compile failure

2015-07-23 Thread Tim Gardner

On 07/23/2015 04:18 AM, Michael Ellerman wrote:

On Mon, 2015-20-07 at 19:01:42 UTC, Tim Gardner wrote:

From: Tim Gardner tim.gard...@canonical.com

drivers/macintosh/ans-lcd.c:201:1: warning: data definition has no type or 
storage class
  module_init(anslcd_init);
  ^
drivers/macintosh/ans-lcd.c:201:1: error: type defaults to 'int' in declaration 
of 'module_init' [-Werror=implicit-int]

gcc version 4.9.3 (Ubuntu 4.9.3-1ubuntu1)


Thanks Tim but Luis beat you to it:

   http://patchwork.ozlabs.org/patch/497138/


I'll send the fix to Linus in the next day or two.

cheers



No problem. It looks like a better fix then mine anyway.

rtg
--
Tim Gardner tim.gard...@canonical.com
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v5 6/7] powerpc/powernv: generic nest pmu event functions

2015-07-23 Thread Madhavan Srinivasan


On Wednesday 22 July 2015 10:26 AM, Daniel Axtens wrote:
 +static void p8_nest_read_counter(struct perf_event *event)
 +{
 +uint64_t *addr;
 +u64 data = 0;
 You've got a u64 and a uint64_t, and then...
 +
 +addr = (u64 *)event-hw.event_base;
 ... you cast to event_base to a u64 pointer, which you assign to a
 uint64_t pointer.
 +data = __be64_to_cpu(*addr);
 And now you dereference the pointer.
 Could you just have:
 data = __be64_to_cpu(*event-hw.event_base);
 +local64_set(event-hw.prev_count, data);
 +}
 +
 +static void p8_nest_perf_event_update(struct perf_event *event)
 +{
 +u64 counter_prev, counter_new, final_count;
 +uint64_t *addr;
 +
 +addr = (uint64_t *)event-hw.event_base;
 Here at least the cast type is the same as the type of addr, but again,
 why do you need the different types, and why local variable?

Damn sorry, copy paste errors. When I added debug prints i messed
the type case in both the functions. I will make them as uint64_t.

Thanks for this detail review
Maddy

 +counter_prev = local64_read(event-hw.prev_count);
 +counter_new = __be64_to_cpu(*addr);
 +final_count = counter_new - counter_prev;
 +
 +local64_set(event-hw.prev_count, counter_new);
 +local64_add(final_count, event-count);
 +}
 +
 +static void p8_nest_event_start(struct perf_event *event, int flags)
 +{
 +event-hw.state = 0;
 Should this be an enum or a #define rather than a bare 0? (It may not
 need to be, I was just wondering because I don't know what 0 means.)

I could remove it since was just initializing at the start.

 +p8_nest_read_counter(event);
 +}
 +

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v5 7/7] powerpc/powernv: nest pmu cpumask and cpu hotplug support

2015-07-23 Thread Madhavan Srinivasan


On Wednesday 22 July 2015 10:33 AM, Daniel Axtens wrote:
 +static void nest_change_cpu_context(int old_cpu, int new_cpu)
 +{
 +int i;
 +
 +for (i = 0; per_nest_pmu_arr[i] != NULL; i++)
 +perf_pmu_migrate_context(per_nest_pmu_arr[i]-pmu,
 +old_cpu, new_cpu);
 From patch 4, I see per_nest_pmu_arr is defined as:
  +static struct nest_pmu *per_nest_pmu_arr[P8_NEST_MAX_PMUS];

 Therefore, does this loop need to have a check that 
 i  P8_NEST_MAX_PMUS?

No, that is max possible pmu, but we may have only couple for nest pmus
registered.

Thanks for the review comments
Maddy


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v3 7/8] perf: Define PMU_TXN_READ interface

2015-07-23 Thread Peter Zijlstra
On Wed, Jul 22, 2015 at 04:19:16PM -0700, Sukadev Bhattiprolu wrote:
 Peter Zijlstra [pet...@infradead.org] wrote:
 | I've not woken up yet, and not actually fully read the email, but can
 | you stuff the entire above chunk inside the IPI?
 | 
 | I think you could then actually optimize __perf_event_read() as well,
 | because all these events should be on the same context, so no point in
 | calling update_*time*() for every event or so.
 | 
 
 Do you mean something like this (will move the rename to a separate
 patch before posting):

More like so.. please double check, I've not even had tea yet.

--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3174,14 +3174,22 @@ void perf_event_exec(void)
rcu_read_unlock();
 }
 
+struct perf_read_data {
+   struct perf_event *event;
+   bool group;
+   int ret;
+};
+
 /*
  * Cross CPU call to read the hardware event
  */
 static void __perf_event_read(void *info)
 {
-   struct perf_event *event = info;
+   struct perf_read_data *data = info;
+   struct perf_event *sub, *event = data-event;
struct perf_event_context *ctx = event-ctx;
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+   struct pmu *pmu = event-pmu;
 
/*
 * If this is a task context, we need to check whether it is
@@ -3199,8 +3207,23 @@ static void __perf_event_read(void *info
update_cgrp_time_from_event(event);
}
update_event_times(event);
-   if (event-state == PERF_EVENT_STATE_ACTIVE)
-   event-pmu-read(event);
+   if (event-state != PERF_EVENT_STATE_ACTIVE)
+   goto unlock;
+
+   if (!data-group) {
+   pmu-read(event);
+   goto unlock;
+   }
+
+   pmu-start_txn(pmu, PERF_PMU_TXN_READ);
+   pmu-read(event);
+   list_for_each_entry(sub, event-sibling_list, group_entry) {
+   if (sub-state == PERF_EVENT_STATE_ACTIVE)
+   pmu-read(sub);
+   }
+   data-ret = pmu-commit_txn(pmu);
+
+unlock:
raw_spin_unlock(ctx-lock);
 }
 
@@ -3212,15 +3235,23 @@ static inline u64 perf_event_count(struc
return __perf_event_count(event);
 }
 
-static void perf_event_read(struct perf_event *event)
+static int perf_event_read(struct perf_event *event, bool group)
 {
+   int ret = 0
+
/*
 * If event is enabled and currently active on a CPU, update the
 * value in the event structure:
 */
if (event-state == PERF_EVENT_STATE_ACTIVE) {
+   struct perf_read_data data = {
+   .event = event,
+   .group = group,
+   .ret = 0,
+   };
smp_call_function_single(event-oncpu,
-__perf_event_read, event, 1);
+__perf_event_read, data, 1);
+   ret = data.ret;
} else if (event-state == PERF_EVENT_STATE_INACTIVE) {
struct perf_event_context *ctx = event-ctx;
unsigned long flags;
@@ -3235,9 +3266,14 @@ static void perf_event_read(struct perf_
update_context_time(ctx);
update_cgrp_time_from_event(event);
}
-   update_event_times(event);
+   if (group)
+   update_group_times(event);
+   else
+   update_event_times(event);
raw_spin_unlock_irqrestore(ctx-lock, flags);
}
+
+   return ret;
 }
 
 /*
@@ -3718,7 +3754,6 @@ static u64 perf_event_compute(struct per
atomic64_read(event-child_total_time_running);
 
list_for_each_entry(child, event-child_list, child_list) {
-   perf_event_read(child);
total += perf_event_count(child);
*enabled += child-total_time_enabled;
*running += child-total_time_running;
@@ -3772,7 +3807,7 @@ u64 perf_event_read_value(struct perf_ev
 
mutex_lock(event-child_mutex);
 
-   perf_event_read(event);
+   perf_event_read(event, false);
total = perf_event_compute(event, enabled, running);
 
mutex_unlock(event-child_mutex);
@@ -3792,7 +3827,11 @@ static int perf_read_group(struct perf_e
 
lockdep_assert_held(ctx-mutex);
 
-   count = perf_event_read_value(leader, enabled, running);
+   ret = perf_event_read(leader, true);
+   if (ret)
+   return ret;
+
+   count = perf_event_compute(leader, enabled, running);
 
values[n++] = 1 + leader-nr_siblings;
if (read_format  PERF_FORMAT_TOTAL_TIME_ENABLED)
@@ -3813,7 +3852,7 @@ static int perf_read_group(struct perf_e
list_for_each_entry(sub, leader-sibling_list, group_entry) {
n = 0;
 
-   values[n++] = perf_event_read_value(sub, enabled, running);
+   values[n++] = perf_event_compute(sub, 

Re: [PATCH v5 0/7]powerpc/powernv: Nest Instrumentation support

2015-07-23 Thread Madhavan Srinivasan


On Tuesday 21 July 2015 12:13 AM, Sukadev Bhattiprolu wrote:
 Madhavan Srinivasan [ma...@linux.vnet.ibm.com] wrote:
 | This patchset enables Nest Instrumentation support on powerpc.
 | POWER8 has per-chip Nest Intrumentation which provides various
 | per-chip metrics like memory, powerbus, Xlink and Alink
 | bandwidth.
 | 
 snip

 | Cc: Michael Ellerman m...@ellerman.id.au
 | Cc: Benjamin Herrenschmidt b...@kernel.crashing.org
 | Cc: Paul Mackerras pau...@samba.org
 | Cc: Anton Blanchard an...@samba.org
 | Cc: Sukadev Bhattiprolu suka...@linux.vnet.ibm.com
 | Cc: Anshuman Khandual khand...@linux.vnet.ibm.com
 | Cc: Stephane Eranian eran...@google.com
 | Signed-off-by: Madhavan Srinivasan ma...@linux.vnet.ibm.com

 Thanks for addressing my comments from earlier version.

 Reviewed-by: Sukadev Bhattiprolu suka...@linux.vnet.ibm.com

Thanks for the review
Maddy

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v5 7/7] powerpc/powernv: nest pmu cpumask and cpu hotplug support

2015-07-23 Thread Daniel Axtens
On Thu, 2015-07-23 at 12:18 +0530, Madhavan Srinivasan wrote:
 
 On Wednesday 22 July 2015 10:33 AM, Daniel Axtens wrote:
  +static void nest_change_cpu_context(int old_cpu, int new_cpu)
  +{
  +  int i;
  +
  +  for (i = 0; per_nest_pmu_arr[i] != NULL; i++)
  +  perf_pmu_migrate_context(per_nest_pmu_arr[i]-pmu,
  +  old_cpu, new_cpu);
  From patch 4, I see per_nest_pmu_arr is defined as:
   +static struct nest_pmu *per_nest_pmu_arr[P8_NEST_MAX_PMUS];
 
  Therefore, does this loop need to have a check that 
  i  P8_NEST_MAX_PMUS?
 
 No, that is max possible pmu, but we may have only couple for nest pmus
 registered.
 
What if we have P8_NEST_MAX_PMUS registered? Then we'll check beyond the
end of the array...

 Thanks for the review comments
 Maddy
 
 

-- 
Regards,
Daniel


signature.asc
Description: This is a digitally signed message part
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH V4 2/6] mm: mlock: Add new mlock, munlock, and munlockall system calls

2015-07-23 Thread Ralf Baechle
On Wed, Jul 22, 2015 at 10:15:01AM -0400, Eric B Munson wrote:

  
  You haven't wired it up properly on powerpc, but I haven't mentioned it 
  because
  I'd rather we did it.
  
  cheers
 
 It looks like I will be spinning a V5, so I will drop all but the x86
 system calls additions in that version.

The MIPS bits are looking good however, so

Acked-by: Ralf Baechle r...@linux-mips.org

With my ack, will you keep them or maybe carry them as a separate patch?

Cheers,

  Ralf
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v5 7/7] powerpc/powernv: nest pmu cpumask and cpu hotplug support

2015-07-23 Thread Madhavan Srinivasan


On Thursday 23 July 2015 12:19 PM, Daniel Axtens wrote:
 On Thu, 2015-07-23 at 12:18 +0530, Madhavan Srinivasan wrote:
 On Wednesday 22 July 2015 10:33 AM, Daniel Axtens wrote:
 +static void nest_change_cpu_context(int old_cpu, int new_cpu)
 +{
 +  int i;
 +
 +  for (i = 0; per_nest_pmu_arr[i] != NULL; i++)
 +  perf_pmu_migrate_context(per_nest_pmu_arr[i]-pmu,
 +  old_cpu, new_cpu);
 From patch 4, I see per_nest_pmu_arr is defined as:
  +static struct nest_pmu *per_nest_pmu_arr[P8_NEST_MAX_PMUS];

 Therefore, does this loop need to have a check that 
 i  P8_NEST_MAX_PMUS?
 No, that is max possible pmu, but we may have only couple for nest pmus
 registered.

 What if we have P8_NEST_MAX_PMUS registered? Then we'll check beyond the
 end of the array...

OK, i will add check for P8_NEST_MAX_PMUS also.

 Thanks for the review comments
 Maddy

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v7 3/3] leds/powernv: Add driver for PowerNV platform

2015-07-23 Thread Vasant Hegde
On 07/23/2015 01:25 PM, Jacek Anaszewski wrote:
 Hi Vasant,
 

Jacek,

.../...

 +/* PowerNV LED data */
 +struct powernv_led_data {
 +struct led_classdevcdev;
 +char*loc_code;/* LED location code */
 +intled_type;/* OPAL_SLOT_LED_TYPE_* */
 +enum led_brightnessvalue;/* Brightness value */
 
 You don't need 'value' as brightness value is already stored in
 cdev.brightness.
 

Agree. I'll remove.

 +/*
 + * LED classdev 'brightness_get' function. This schedules work
 + * to update LED state.
 + */
 +static void powernv_brightness_set(struct led_classdev *led_cdev,
 +   enum led_brightness value)
 +{
 +struct powernv_led_data *powernv_led =
 +container_of(led_cdev, struct powernv_led_data, cdev);
 +
 +/* Do not modify LED in unload path */
 +if (led_disabled)
 +return;
 +
 +/* Prepare the request */
 +powernv_led-value = value;
 +
 + return powernv_led_set(powernv_led);
 
 Isn't mutex_lock/unlock missing here?

Yes. I realized this after sending out the patch. I will fix this.
.../...

 +
 +max_led_type = devm_kzalloc(dev, sizeof(*max_led_type), GFP_KERNEL);
 +if (!max_led_type)
 +return -ENOMEM;
 +
 +mutex_init(lock);
 +*max_led_type = cpu_to_be64(OPAL_SLOT_LED_TYPE_MAX);
 +
 +platform_set_drvdata(pdev, lock);
 
 Setting only lock as drvdata looks odd to me and I haven't noticed
 anyone doing that.
 I'd prefer to put lock, led_disabled and max_led_type in a common
 struct and set it as drvdata. I know that I accepted this design, but
 I didn't take into account these details.

Yeah. Even I looked into existing code and I don't see anyone using like this.
Since it's void * and we just need lock, I added like this.

If I break this into two structure, then I've to use platform_get_drvdata() call
in multiple functions like powernv_brightness_set() to get max_let_type etc. Is
that fine?

-Vasant

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH] cxl: Add alternate MMIO error handling

2015-07-23 Thread Ian Munsie
From: Ian Munsie imun...@au1.ibm.com

userspace programs using cxl currently have to use two strategies for
dealing with MMIO errors simultaneously. They have to check every read
for a return of all Fs in case the adapter has gone away and the kernel
has not yet noticed, and they have to deal with SIGBUS in case the
kernel has already noticed, invalidated the mapping and marked the
context as failed.

In order to simplify things, this patch adds an alternative approach
where the kernel will return a page filled with Fs instead of delivering
a SIGBUS. This allows userspace to only need to deal with one of these
two error paths, and is intended for use in libraries that use cxl
transparently and may not be able to safely install a signal handler.

This approach will only work if certain constraints are met. Namely, if
the application is both reading and writing to an address in the problem
state area it cannot assume that a non-FF read is OK, as it may just be
reading out a value it has previously written. Further - since only one
page is used per context a write to a given offset would be visible when
reading the same offset from a different page in the mapping (this only
applies within a single context, not between contexts).

An application could deal with this by e.g. making sure it also reads
from a read-only offset after any reads to a read/write offset.

Due to these constraints, this functionality must be explicitly
requested by userspace when starting the context by passing in the
CXL_START_WORK_ERR_FF flag.

Signed-off-by: Ian Munsie imun...@au1.ibm.com
---
 drivers/misc/cxl/context.c | 14 ++
 drivers/misc/cxl/cxl.h |  4 +++-
 drivers/misc/cxl/file.c|  4 +++-
 include/uapi/misc/cxl.h|  4 +++-
 4 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c
index 1287148..6570f10 100644
--- a/drivers/misc/cxl/context.c
+++ b/drivers/misc/cxl/context.c
@@ -126,6 +126,18 @@ static int cxl_mmap_fault(struct vm_area_struct *vma, 
struct vm_fault *vmf)
if (ctx-status != STARTED) {
mutex_unlock(ctx-status_mutex);
pr_devel(%s: Context not started, failing problem state 
access\n, __func__);
+   if (ctx-mmio_err_ff) {
+   if (!ctx-ff_page) {
+   ctx-ff_page = alloc_page(GFP_USER);
+   if (!ctx-ff_page)
+   return VM_FAULT_OOM;
+   memset(page_address(ctx-ff_page), 0xff, 
PAGE_SIZE);
+   }
+   get_page(ctx-ff_page);
+   vmf-page = ctx-ff_page;
+   vma-vm_page_prot = pgprot_cached(vma-vm_page_prot);
+   return 0;
+   }
return VM_FAULT_SIGBUS;
}
 
@@ -253,6 +265,8 @@ static void reclaim_ctx(struct rcu_head *rcu)
struct cxl_context *ctx = container_of(rcu, struct cxl_context, rcu);
 
free_page((u64)ctx-sstp);
+   if (ctx-ff_page)
+   __free_page(ctx-ff_page);
ctx-sstp = NULL;
 
kfree(ctx);
diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
index 4fd66ca..b7293a4 100644
--- a/drivers/misc/cxl/cxl.h
+++ b/drivers/misc/cxl/cxl.h
@@ -34,7 +34,7 @@ extern uint cxl_verbose;
  * Bump version each time a user API change is made, whether it is
  * backwards compatible ot not.
  */
-#define CXL_API_VERSION 1
+#define CXL_API_VERSION 2
 #define CXL_API_VERSION_COMPATIBLE 1
 
 /*
@@ -418,6 +418,8 @@ struct cxl_context {
/* Used to unmap any mmaps when force detaching */
struct address_space *mapping;
struct mutex mapping_lock;
+   struct page *ff_page;
+   bool mmio_err_ff;
 
spinlock_t sste_lock; /* Protects segment table entries */
struct cxl_sste *sstp;
diff --git a/drivers/misc/cxl/file.c b/drivers/misc/cxl/file.c
index e3f4b69..34c7a5e 100644
--- a/drivers/misc/cxl/file.c
+++ b/drivers/misc/cxl/file.c
@@ -179,6 +179,8 @@ static long afu_ioctl_start_work(struct cxl_context *ctx,
if (work.flags  CXL_START_WORK_AMR)
amr = work.amr  mfspr(SPRN_UAMOR);
 
+   ctx-mmio_err_ff = !!(work.flags  CXL_START_WORK_ERR_FF);
+
/*
 * We grab the PID here and not in the file open to allow for the case
 * where a process (master, some daemon, etc) has opened the chardev on
@@ -519,7 +521,7 @@ int __init cxl_file_init(void)
 * If these change we really need to update API.  Either change some
 * flags or update API version number CXL_API_VERSION.
 */
-   BUILD_BUG_ON(CXL_API_VERSION != 1);
+   BUILD_BUG_ON(CXL_API_VERSION != 2);
BUILD_BUG_ON(sizeof(struct cxl_ioctl_start_work) != 64);
BUILD_BUG_ON(sizeof(struct cxl_event_header) != 8);
BUILD_BUG_ON(sizeof(struct cxl_event_afu_interrupt) != 8);
diff --git 

Re: [PATCH v3 5/8] perf: Split perf_event_read_value()

2015-07-23 Thread Peter Zijlstra
On Tue, Jul 14, 2015 at 08:01:52PM -0700, Sukadev Bhattiprolu wrote:
 Move the part of perf_event_read_value() that computes the event
 counts and event times into a new function, perf_event_compute().
 
 This would allow us to call perf_event_compute() independently.
 
 Signed-off-by: Sukadev Bhattiprolu suka...@linux.vnet.ibm.com
 
 Changelog[v3]
   Rather than move perf_event_read() into callers and then
   rename, just move the computations into a separate function
   (redesign to address comment from Peter Zijlstra).
 ---
  kernel/events/core.c |   37 -
  1 file changed, 24 insertions(+), 13 deletions(-)
 
 diff --git a/kernel/events/core.c b/kernel/events/core.c
 index 44fb89d..b1e9a42 100644
 --- a/kernel/events/core.c
 +++ b/kernel/events/core.c
 @@ -3704,6 +3704,29 @@ static int perf_release(struct inode *inode, struct 
 file *file)
   return 0;
  }
  
 +static u64 perf_event_compute(struct perf_event *event, u64 *enabled,
 +   u64 *running)
 +{
 + struct perf_event *child;
 + u64 total;
 +
 + total = perf_event_count(event);
 +
 + *enabled += event-total_time_enabled +
 + atomic64_read(event-child_total_time_enabled);
 + *running += event-total_time_running +
 + atomic64_read(event-child_total_time_running);
 +
 + list_for_each_entry(child, event-child_list, child_list) {
 + perf_event_read(child);

Sure we don't want that..

 + total += perf_event_count(child);
 + *enabled += child-total_time_enabled;
 + *running += child-total_time_running;
 + }
 +
 + return total;
 +}
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v7 3/3] leds/powernv: Add driver for PowerNV platform

2015-07-23 Thread Jacek Anaszewski

Hi Vasant,

Thanks for the update.

On 22.07.2015 16:52, Vasant Hegde wrote:

This patch implements LED driver for PowerNV platform using the existing
generic LED class framework.

PowerNV platform has below type of LEDs:
   - System attention
   Indicates there is a problem with the system that needs attention.
   - Identify
   Helps the user locate/identify a particular FRU or resource in the
   system.
   - Fault
   Indicates there is a problem with the FRU or resource at the
   location with which the indicator is associated.

We register classdev structures for all individual LEDs detected on the
system through LED specific device tree nodes. Device tree nodes specify
what all kind of LEDs present on the same location code. It registers
LED classdev structure for each of them.

All the system LEDs can be found in the same regular path /sys/class/leds/.
We don't use LED colors. We use LED node and led-types property to form
LED classdev. Our LEDs have names in this format.

 location_code:attention|identify|fault

Any positive brightness value would turn on the LED and a zero value would
turn off the LED. The driver will return LED_FULL (255) for any turned on
LED and LED_OFF (0) for any turned off LED.

As per the LED class framework, the 'brightness_set' function should not
sleep. Hence these functions have been implemented through global work
queue tasks which might sleep on OPAL async call completion.

The platform level implementation of LED get and set state has been
achieved through OPAL calls. These calls are made available for the
driver by exporting from architecture specific codes.

Signed-off-by: Vasant Hegde hegdevas...@linux.vnet.ibm.com
Signed-off-by: Anshuman Khandual khand...@linux.vnet.ibm.com
Acked-by: Stewart Smith stew...@linux.vnet.ibm.com
Tested-by: Stewart Smith stew...@linux.vnet.ibm.com
---
  .../devicetree/bindings/leds/leds-powernv.txt  |  26 ++
  drivers/leds/Kconfig   |  11 +
  drivers/leds/Makefile  |   1 +
  drivers/leds/leds-powernv.c| 342 +
  4 files changed, 380 insertions(+)
  create mode 100644 Documentation/devicetree/bindings/leds/leds-powernv.txt
  create mode 100644 drivers/leds/leds-powernv.c

diff --git a/Documentation/devicetree/bindings/leds/leds-powernv.txt 
b/Documentation/devicetree/bindings/leds/leds-powernv.txt
new file mode 100644
index 000..6665569
--- /dev/null
+++ b/Documentation/devicetree/bindings/leds/leds-powernv.txt
@@ -0,0 +1,26 @@
+Device Tree binding for LEDs on IBM Power Systems
+-
+
+Required properties:
+- compatible : Should be ibm,opal-v3-led.
+- led-mode   : Should be lightpath or guidinglight.
+
+Each location code of FRU/Enclosure must be expressed in the
+form of a sub-node.
+
+Required properties for the sub nodes:
+- led-types : Supported LED types (attention/identify/fault) provided
+  in the form of string array.
+
+Example:
+
+leds {
+   compatible = ibm,opal-v3-led;
+   led-mode = lightpath;
+
+   U78C9.001.RST0027-P1-C1 {
+   led-types = identify, fault;
+   };
+   ...
+   ...
+};
diff --git a/drivers/leds/Kconfig b/drivers/leds/Kconfig
index 9ad35f7..f218cc3a 100644
--- a/drivers/leds/Kconfig
+++ b/drivers/leds/Kconfig
@@ -560,6 +560,17 @@ config LEDS_BLINKM
  This option enables support for the BlinkM RGB LED connected
  through I2C. Say Y to enable support for the BlinkM LED.

+config LEDS_POWERNV
+   tristate LED support for PowerNV Platform
+   depends on LEDS_CLASS
+   depends on PPC_POWERNV
+   depends on OF
+   help
+ This option enables support for the system LEDs present on
+ PowerNV platforms. Say 'y' to enable this support in kernel.
+ To compile this driver as a module, choose 'm' here: the module
+ will be called leds-powernv.
+
  config LEDS_SYSCON
bool LED support for LEDs on system controllers
depends on LEDS_CLASS=y
diff --git a/drivers/leds/Makefile b/drivers/leds/Makefile
index 8d6a24a..6a943d1 100644
--- a/drivers/leds/Makefile
+++ b/drivers/leds/Makefile
@@ -65,6 +65,7 @@ obj-$(CONFIG_LEDS_VERSATILE)  += leds-versatile.o
  obj-$(CONFIG_LEDS_MENF21BMC)  += leds-menf21bmc.o
  obj-$(CONFIG_LEDS_PM8941_WLED)+= leds-pm8941-wled.o
  obj-$(CONFIG_LEDS_KTD2692)+= leds-ktd2692.o
+obj-$(CONFIG_LEDS_POWERNV) += leds-powernv.o

  # LED SPI Drivers
  obj-$(CONFIG_LEDS_DAC124S085) += leds-dac124s085.o
diff --git a/drivers/leds/leds-powernv.c b/drivers/leds/leds-powernv.c
new file mode 100644
index 000..9e70291
--- /dev/null
+++ b/drivers/leds/leds-powernv.c
@@ -0,0 +1,342 @@
+/*
+ * PowerNV LED Driver
+ *
+ * Copyright IBM Corp. 2015
+ *
+ * Author: Vasant Hegde hegdevas...@linux.vnet.ibm.com
+ * Author: Anshuman Khandual 

Re: [PATCH V4 4/6] mm: mlock: Introduce VM_LOCKONFAULT and add mlock flags to enable it

2015-07-23 Thread Vlastimil Babka
On 07/22/2015 08:43 PM, Eric B Munson wrote:
 On Wed, 22 Jul 2015, Vlastimil Babka wrote:
 
 
 Hi,
 
 I think you should include a complete description of which
 transitions for vma states and mlock2/munlock2 flags applied on them
 are valid and what they do. It will also help with the manpages.
 You explained some to Jon in the last thread, but I think there
 should be a canonical description in changelog (if not also
 Documentation, if mlock is covered there).
 
 For example the scenario Jon asked, what happens after a
 mlock2(MLOCK_ONFAULT) followed by mlock2(MLOCK_LOCKED), and that the
 answer is nothing. Your promised code comment for
 apply_vma_flags() doesn't suffice IMHO (and I'm not sure it's there,
 anyway?).
 
 I missed adding that comment to the code, will be there in V5 along with
 the description in the changelog.

Thanks!

 
 But the more I think about the scenario and your new VM_LOCKONFAULT
 vma flag, it seems awkward to me. Why should munlocking at all care
 if the vma was mlocked with MLOCK_LOCKED or MLOCK_ONFAULT? In either
 case the result is that all pages currently populated are munlocked.
 So the flags for munlock2 should be unnecessary.
 
 Say a user has a large area of interleaved MLOCK_LOCK and MLOCK_ONFAULT
 mappings and they want to unlock only the ones with MLOCK_LOCK.  With
 the current implementation, this is possible in a single system call
 that spans the entire region.  With your suggestion, the user would have
 to know what regions where locked with MLOCK_LOCK and call munlock() on
 each of them.  IMO, the way munlock2() works better mirrors the way
 munlock() currently works when called on a large area of interleaved
 locked and unlocked areas.

Um OK, that scenario is possible in theory. But I have a hard time imagining
that somebody would really want to do that. I think much more people would
benefit from a simpler API.

 
 
 I also think VM_LOCKONFAULT is unnecessary. VM_LOCKED should be
 enough - see how you had to handle the new flag in all places that
 had to handle the old flag? I think the information whether mlock
 was supposed to fault the whole vma is obsolete at the moment mlock
 returns. VM_LOCKED should be enough for both modes, and the flag to
 mlock2 could just control whether the pre-faulting is done.
 
 So what should be IMHO enough:
 - munlock can stay without flags
 - mlock2 has only one new flag MLOCK_ONFAULT. If specified,
 pre-faulting is not done, just set VM_LOCKED and mlock pages already
 present.
 - same with mmap(MAP_LOCKONFAULT) (need to define what happens when
 both MAP_LOCKED and MAP_LOCKONFAULT are specified).
 
 Now mlockall(MCL_FUTURE) muddles the situation in that it stores the
 information for future VMA's in current-mm-def_flags, and this
 def_flags would need to distinguish VM_LOCKED with population and
 without. But that could be still solvable without introducing a new
 vma flag everywhere.
 
 With you right up until that last paragraph.  I have been staring at
 this a while and I cannot come up a way to handle the
 mlockall(MCL_ONFAULT) without introducing a new vm flag.  It doesn't
 have to be VM_LOCKONFAULT, we could use the model that Michal Hocko
 suggested with something like VM_FAULTPOPULATE.  However, we can't
 really use this flag anywhere except the mlock code becuase we have to
 be able to distinguish a caller that wants to use MLOCK_LOCK with
 whatever control VM_FAULTPOPULATE might grant outside of mlock and a
 caller that wants MLOCK_ONFAULT.  That was a long way of saying we need
 an extra vma flag regardless.  However, if that flag only controls if
 mlock pre-populates it would work and it would do away with most of the
 places I had to touch to handle VM_LOCKONFAULT properly.

Yes, it would be a good way. Adding a new vma flag is probably cleanest after
all, but the flag would be set *in addition* to VM_LOCKED, *just* to prevent
pre-faulting. The places that check VM_LOCKED for the actual page mlocking (i.e.
try_to_unmap_one) would just keep checking VM_LOCKED. The places where VM_LOCKED
is checked to trigger prepopulation, would skip that if VM_LOCKONFAULT is also
set. Having VM_LOCKONFAULT set without also VM_LOCKED itself would be invalid 
state.

This should work fine with the simplified API as I proposed so let me reiterate
and try fill in the blanks:

- mlock2 has only one new flag MLOCK_ONFAULT. If specified, VM_LOCKONFAULT is
set in addition to VM_LOCKED and no prefaulting is done
  - old mlock syscall naturally behaves as mlock2 without MLOCK_ONFAULT
  - calling mlock/mlock2 on an already-mlocked area (if that's permitted
already?) will add/remove VM_LOCKONFAULT as needed. If it's removing,
prepopulate whole range. Of course adding VM_LOCKONFAULT to a vma that was
already prefaulted doesn't make any difference, but it's consistent with the 
rest.
- munlock removes both VM_LOCKED and VM_LOCKONFAULT
- mmap could treat MAP_LOCKONFAULT as a modifier to MAP_LOCKED to be consistent?
or not? I'm not sure here, 

Re: [4.2-rc3] macintosh/ans-lcd.c: Missing include causes compile failure

2015-07-23 Thread Michael Ellerman
On Mon, 2015-20-07 at 19:01:42 UTC, Tim Gardner wrote:
 From: Tim Gardner tim.gard...@canonical.com
 
 drivers/macintosh/ans-lcd.c:201:1: warning: data definition has no type or 
 storage class
  module_init(anslcd_init);
  ^
 drivers/macintosh/ans-lcd.c:201:1: error: type defaults to 'int' in 
 declaration of 'module_init' [-Werror=implicit-int]
 
 gcc version 4.9.3 (Ubuntu 4.9.3-1ubuntu1)

Thanks Tim but Luis beat you to it:

  http://patchwork.ozlabs.org/patch/497138/


I'll send the fix to Linus in the next day or two.

cheers
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 01/11] powerpc/kernel: Switch to using MAX_ERRNO

2015-07-23 Thread Michael Ellerman
Currently on powerpc we have our own #define for the highest (negative)
errno value, called _LAST_ERRNO. This is defined to be 516, for reasons
which are not clear.

The generic code, and x86, use MAX_ERRNO, which is defined to be 4095.

In particular seccomp uses MAX_ERRNO to restrict the value that a
seccomp filter can return.

Currently with the mismatch between _LAST_ERRNO and MAX_ERRNO, a seccomp
tracer wanting to return 600, expecting it to be seen as an error, would
instead find on powerpc that userspace sees a successful syscall with a
return value of 600.

To avoid this inconsistency, switch powerpc to use MAX_ERRNO.

We are somewhat confident that generic syscalls that can return a
non-error value above negative MAX_ERRNO have already been updated to
use force_successful_syscall_return().

I have also checked all the powerpc specific syscalls, and believe that
none of them expect to return a non-error value between -MAX_ERRNO and
-516. So this change should be safe ...

Acked-by: Benjamin Herrenschmidt b...@kernel.crashing.org
Signed-off-by: Michael Ellerman m...@ellerman.id.au
---
 arch/powerpc/include/uapi/asm/errno.h | 2 --
 arch/powerpc/kernel/entry_32.S| 3 ++-
 arch/powerpc/kernel/entry_64.S| 5 +++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/uapi/asm/errno.h 
b/arch/powerpc/include/uapi/asm/errno.h
index 8c145fd17d86..e8b6b5f7de7c 100644
--- a/arch/powerpc/include/uapi/asm/errno.h
+++ b/arch/powerpc/include/uapi/asm/errno.h
@@ -6,6 +6,4 @@
 #undef EDEADLOCK
 #defineEDEADLOCK   58  /* File locking deadlock error */
 
-#define _LAST_ERRNO516
-
 #endif /* _ASM_POWERPC_ERRNO_H */
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index 46fc0f4d8982..67ecdf61f4e3 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -20,6 +20,7 @@
  */
 
 #include linux/errno.h
+#include linux/err.h
 #include linux/sys.h
 #include linux/threads.h
 #include asm/reg.h
@@ -354,7 +355,7 @@ ret_from_syscall:
SYNC
MTMSRD(r10)
lwz r9,TI_FLAGS(r12)
-   li  r8,-_LAST_ERRNO
+   li  r8,-MAX_ERRNO
andi.   
r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK)
bne-syscall_exit_work
cmplw   0,r3,r8
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 579e0f9a2d57..ee15d3c62e26 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -19,6 +19,7 @@
  */
 
 #include linux/errno.h
+#include linux/err.h
 #include asm/unistd.h
 #include asm/processor.h
 #include asm/page.h
@@ -207,7 +208,7 @@ system_call:/* label this so stack 
traces look sane */
 #endif /* CONFIG_PPC_BOOK3E */
 
ld  r9,TI_FLAGS(r12)
-   li  r11,-_LAST_ERRNO
+   li  r11,-MAX_ERRNO
andi.   
r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK)
bne-syscall_exit_work
cmpld   r3,r11
@@ -277,7 +278,7 @@ syscall_exit_work:
beq+0f
REST_NVGPRS(r1)
b   2f
-0: cmpld   r3,r11  /* r10 is -LAST_ERRNO */
+0: cmpld   r3,r11  /* r11 is -MAX_ERRNO */
blt+1f
andi.   r0,r9,_TIF_NOERROR
bne-1f
-- 
2.1.0

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 02/11] powerpc/kernel: Change the do_syscall_trace_enter() API

2015-07-23 Thread Michael Ellerman
The API for calling do_syscall_trace_enter() is currently sensible
enough, it just returns the (modified) syscall number.

However once we enable seccomp filter it will get more complicated. When
seccomp filter runs, the seccomp kernel code (via SECCOMP_RET_ERRNO), or
a ptracer (via SECCOMP_RET_TRACE), may reject the syscall and *may* or may
*not* set a return value in r3.

That means the assembler that calls do_syscall_trace_enter() can not
blindly return ENOSYS, it needs to only return ENOSYS if a return value
has not already been set.

There is no way to implement that logic with the current API. So change
the do_syscall_trace_enter() API to make it deal with the return code
juggling, and the assembler can then just return whatever return code it
is given.

Signed-off-by: Michael Ellerman m...@ellerman.id.au
---
 arch/powerpc/kernel/entry_32.S |  4 
 arch/powerpc/kernel/entry_64.S | 23 ++--
 arch/powerpc/kernel/ptrace.c   | 48 --
 3 files changed, 58 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index 67ecdf61f4e3..2405631e91a2 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -458,6 +458,10 @@ syscall_dotrace:
lwz r7,GPR7(r1)
lwz r8,GPR8(r1)
REST_NVGPRS(r1)
+
+   cmplwi  r0,NR_syscalls
+   /* Return code is already in r3 thanks to do_syscall_trace_enter() */
+   bge-ret_from_syscall
b   syscall_dotrace_cont
 
 syscall_exit_work:
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index ee15d3c62e26..a94f155db78e 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -151,8 +151,7 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR)
CURRENT_THREAD_INFO(r11, r1)
ld  r10,TI_FLAGS(r11)
andi.   r11,r10,_TIF_SYSCALL_DOTRACE
-   bne syscall_dotrace
-.Lsyscall_dotrace_cont:
+   bne syscall_dotrace /* does not return */
cmpldi  0,r0,NR_syscalls
bge-syscall_enosys
 
@@ -246,22 +245,34 @@ syscall_dotrace:
bl  save_nvgprs
addir3,r1,STACK_FRAME_OVERHEAD
bl  do_syscall_trace_enter
+
/*
-* Restore argument registers possibly just changed.
-* We use the return value of do_syscall_trace_enter
-* for the call number to look up in the table (r0).
+* We use the return value of do_syscall_trace_enter() as the syscall
+* number. If the syscall was rejected for any reason 
do_syscall_trace_enter()
+* returns an invalid syscall number and the test below against
+* NR_syscalls will fail.
 */
mr  r0,r3
+
+   /* Restore argument registers just clobbered and/or possibly changed. */
ld  r3,GPR3(r1)
ld  r4,GPR4(r1)
ld  r5,GPR5(r1)
ld  r6,GPR6(r1)
ld  r7,GPR7(r1)
ld  r8,GPR8(r1)
+
+   /* Repopulate r9 and r10 for the system_call path */
addir9,r1,STACK_FRAME_OVERHEAD
CURRENT_THREAD_INFO(r10, r1)
ld  r10,TI_FLAGS(r10)
-   b   .Lsyscall_dotrace_cont
+
+   cmpldi  r0,NR_syscalls
+   blt+system_call
+
+   /* Return code is already in r3 thanks to do_syscall_trace_enter() */
+   b   .Lsyscall_exit
+
 
 syscall_enosys:
li  r3,-ENOSYS
diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
index f21897b42057..7484221bb3f8 100644
--- a/arch/powerpc/kernel/ptrace.c
+++ b/arch/powerpc/kernel/ptrace.c
@@ -1762,26 +1762,42 @@ long arch_ptrace(struct task_struct *child, long 
request,
return ret;
 }
 
-/*
- * We must return the syscall number to actually look up in the table.
- * This can be -1L to skip running any syscall at all.
+/**
+ * do_syscall_trace_enter() - Do syscall tracing on kernel entry.
+ * @regs: the pt_regs of the task to trace (current)
+ *
+ * Performs various types of tracing on syscall entry. This includes seccomp,
+ * ptrace, syscall tracepoints and audit.
+ *
+ * The pt_regs are potentially visible to userspace via ptrace, so their
+ * contents is ABI.
+ *
+ * One or more of the tracers may modify the contents of pt_regs, in particular
+ * to modify arguments or even the syscall number itself.
+ *
+ * It's also possible that a tracer can choose to reject the system call. In
+ * that case this function will return an illegal syscall number, and will put
+ * an appropriate return value in regs-r3.
+ *
+ * Return: the (possibly changed) syscall number.
  */
 long do_syscall_trace_enter(struct pt_regs *regs)
 {
-   long ret = 0;
+   bool abort = false;
 
user_exit();
 
secure_computing_strict(regs-gpr[0]);
 
-   if (test_thread_flag(TIF_SYSCALL_TRACE) 
-   tracehook_report_syscall_entry(regs))
+   if (test_thread_flag(TIF_SYSCALL_TRACE)) {
/*
-  

Re: [PATCH v5 3/7] powerpc/powernv: Nest PMU detection and device tree parser

2015-07-23 Thread Michael Ellerman
On Thu, 2015-07-23 at 11:24 +0530, Madhavan Srinivasan wrote:
 
 On Wednesday 22 July 2015 09:19 AM, Daniel Axtens wrote:
  Hi,
 
  +static struct perchip_nest_info p8_nest_perchip_info[P8_NEST_MAX_CHIPS];
  +
  +static int nest_ima_dt_parser(void)
  +{
  +  const __be32 *gcid;
  +  const __be64 *chip_ima_reg;
  +  const __be64 *chip_ima_size;
  +  struct device_node *dev;
  +  struct perchip_nest_info *p8ni;
  +  int idx;
  +
  +  /*
  +   * nest-ima folder contains two things,
  +   * a) per-chip reserved memory region for Nest PMU Counter data
  +   * b) Support Nest PMU units and their event files
  +   */
  +  for_each_node_with_property(dev, ibm,ima-chip) {
  +  gcid = of_get_property(dev, ibm,chip-id, NULL);
  +  chip_ima_reg = of_get_property(dev, reg, NULL);
  +  chip_ima_size = of_get_property(dev, size, NULL);
  +
  +  if ((!gcid) || (!chip_ima_reg) || (!chip_ima_size)) {
  +  pr_err(Nest_PMU: device %s missing property\n,
  +  dev-full_name);
  +  return -ENODEV;
  +  }
  +
  +  /* chip id to save reserve memory region */
  +  idx = (uint32_t)be32_to_cpup(gcid);
  So be32_to_cpup returns a __u32. You're casting to a uint32_t and then
  assigning to an int.
   - Do you need the intermediate cast?
   - Should idx be an unsigned type?
 
 my bad, sorry abt type case of uint to int.
 And your are right, idx can be __u32 (__u32 and uint32_t are same i
 guess). 

It should be u32. Don't use the uintx_t types in kernel code unless there's
some good reason for it.

The __u32 etc. types are for things that are exposed to userspace, which this
is not, so u32 is correct.

Having said that, this code should be using of_property_read_u32() etc.

And having said that, this is all based on a device tree binding that hasn't
been reviewed yet on the OPAL side, so it's subject to change too.

cheers


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v5 6/7] powerpc/powernv: generic nest pmu event functions

2015-07-23 Thread Madhavan Srinivasan


On Thursday 23 July 2015 02:34 PM, Michael Ellerman wrote:
 On Thu, 2015-07-23 at 12:14 +0530, Madhavan Srinivasan wrote:
 On Wednesday 22 July 2015 10:26 AM, Daniel Axtens wrote:
 +static void p8_nest_read_counter(struct perf_event *event)
 +{
 +  uint64_t *addr;
 +  u64 data = 0;
 You've got a u64 and a uint64_t, and then...
 +
 +  addr = (u64 *)event-hw.event_base;
 ... you cast to event_base to a u64 pointer, which you assign to a
 uint64_t pointer.
 +  data = __be64_to_cpu(*addr);
 And now you dereference the pointer.
 Could you just have:
 data = __be64_to_cpu(*event-hw.event_base);
 +  local64_set(event-hw.prev_count, data);
 +}
 +
 +static void p8_nest_perf_event_update(struct perf_event *event)
 +{
 +  u64 counter_prev, counter_new, final_count;
 +  uint64_t *addr;
 +
 +  addr = (uint64_t *)event-hw.event_base;
 Here at least the cast type is the same as the type of addr, but again,
 why do you need the different types, and why local variable?
 Damn sorry, copy paste errors. When I added debug prints i messed
 the type case in both the functions. I will make them as uint64_t.
 No please use u64/u32 etc. Most code in powerpc does and I prefer them.

 cheers
Ok Sure. Will use only u64 and u32.

Thanks
maddy



___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v5 4/7] powerpc/powernv: detect supported nest pmus and its events

2015-07-23 Thread Madhavan Srinivasan


On Thursday 23 July 2015 02:41 PM, Michael Ellerman wrote:
 On Thu, 2015-07-23 at 11:33 +0530, Madhavan Srinivasan wrote:
 On Wednesday 22 July 2015 09:37 AM, Daniel Axtens wrote:
  
  static struct perchip_nest_info p8_nest_perchip_info[P8_NEST_MAX_CHIPS];
 +static struct nest_pmu *per_nest_pmu_arr[P8_NEST_MAX_PMUS];
 +
 +static int nest_event_info(struct property *pp, char *name,
 +  struct nest_ima_events *p8_events, int string, u32 val)
 'int string' is a bit confusing. 'bool is_string' might be clearer, but
 I think it would be even better still to have different functions for
 string and non-string cases, especially because you only need val in the
 non-string case.
 I would perfer to be a single function, since most of the code is same
 just of the sting or val part. yes. We can make is as is_string and will
 add comment explaining what is done here? 
 I think Daniel's right, it would be better as two functions.

 The only part that is common after the if (string) check is the
 p8_events-ev_value = buf assignment.

 So you should be able to keep all the code up to the if (string) check in a
 shared function and just have two wrappers that use it.

 cheers

Sure. Will do.

Maddy



 ___
 Linuxppc-dev mailing list
 Linuxppc-dev@lists.ozlabs.org
 https://lists.ozlabs.org/listinfo/linuxppc-dev

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v5 3/7] powerpc/powernv: Nest PMU detection and device tree parser

2015-07-23 Thread Madhavan Srinivasan


On Thursday 23 July 2015 02:46 PM, Michael Ellerman wrote:
 On Thu, 2015-07-23 at 11:24 +0530, Madhavan Srinivasan wrote:
 On Wednesday 22 July 2015 09:19 AM, Daniel Axtens wrote:
 Hi,

 +static struct perchip_nest_info p8_nest_perchip_info[P8_NEST_MAX_CHIPS];
 +
 +static int nest_ima_dt_parser(void)
 +{
 +  const __be32 *gcid;
 +  const __be64 *chip_ima_reg;
 +  const __be64 *chip_ima_size;
 +  struct device_node *dev;
 +  struct perchip_nest_info *p8ni;
 +  int idx;
 +
 +  /*
 +   * nest-ima folder contains two things,
 +   * a) per-chip reserved memory region for Nest PMU Counter data
 +   * b) Support Nest PMU units and their event files
 +   */
 +  for_each_node_with_property(dev, ibm,ima-chip) {
 +  gcid = of_get_property(dev, ibm,chip-id, NULL);
 +  chip_ima_reg = of_get_property(dev, reg, NULL);
 +  chip_ima_size = of_get_property(dev, size, NULL);
 +
 +  if ((!gcid) || (!chip_ima_reg) || (!chip_ima_size)) {
 +  pr_err(Nest_PMU: device %s missing property\n,
 +  dev-full_name);
 +  return -ENODEV;
 +  }
 +
 +  /* chip id to save reserve memory region */
 +  idx = (uint32_t)be32_to_cpup(gcid);
 So be32_to_cpup returns a __u32. You're casting to a uint32_t and then
 assigning to an int.
  - Do you need the intermediate cast?
  - Should idx be an unsigned type?
 my bad, sorry abt type case of uint to int.
 And your are right, idx can be __u32 (__u32 and uint32_t are same i
 guess). 
 It should be u32. Don't use the uintx_t types in kernel code unless there's
 some good reason for it.

 The __u32 etc. types are for things that are exposed to userspace, which this
 is not, so u32 is correct.

 Having said that, this code should be using of_property_read_u32() etc.

Ok will change it to use of_property_read_u32.

 And having said that, this is all based on a device tree binding that hasn't
 been reviewed yet on the OPAL side, so it's subject to change too.

Have posted new version in the skiboot mailinglist based on the reviews.
hoping to get it reviewed soon.

Maddy
 cheers



___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [RFC v4 03/25] m68k/atari: Move Atari-specific code out of drivers/char/nvram.c

2015-07-23 Thread Christian T. Steigies
On Wed, Jul 22, 2015 at 02:22:21PM +1000, Finn Thain wrote:
 
 Anyone with a suitable Atari, i.e. ATARIHW_PRESENT(TT_CLK), who can boot 
 both TOS and Linux could resolve the question. (Perhaps with an emulator?)
 
 Any old kernel binary would do, since atari_scsi should print either 
 HOSTID=n or this_id n at startup.
 
 If n doesn't agree with what TOS says about the host's SCSI ID, then I 
 think a trivial patch is safe enough. Especially if cat /proc/driver/nvram 
 produces a SCSI host ID : m that does agree with TOS.

root@garkin:~cat /proc/hardware 
Model:  Atari Falcon
System Memory:  522752K
510 MB at 0x0100 (alternate RAM)
Detected hardware:
Falcon Shifter
Programmable Sound Generator
PCM 8 Bit Sound
CODEC Sound
SCSI Controller NCR5380 (Falcon style)
IDE Interface
8/16 Mhz Switch for FDC
Multi Function Peripheral MFP 68901
Serial Communications Controller SCC 8530
Paddle Interface
DMA Controller for SCC
Clock Chip MC146818A
Blitter
DSP56001 processor

root@garkin:~dmesg |grep SCSI
[0.00] Atari hardware found: VIDEL STDMA-SCSI ST_MFP YM2149 PCM CODEC 
DSP56K SCC_DMA SCC ANALOG_JOY BLITTER IDE TT_CLK FDC_SPEED 
[0.41] SCSI subsystem initialized
[0.85] Block layer SCSI generic (bsg) driver version 0.4 loaded (major 
251)
[4.23] Atari SCSI: resetting the SCSI bus... done
[6.75] scsi host0: Atari native SCSI, io_port 0x0, n_io_port 0, base 
0x0, irq 15, can_queue 8, cmd_per_lun 1, sg_tablesize 0, this_id 7, flags { }, 
options { REAL_DMA SUPPORT_TAGS } 

root@garkin:~cat /proc/driver/nvram
Checksum status  : not valid
Boot preference  : 0xff (undefined)
SCSI arbitration : on
SCSI host ID : 7
OS language  : 255 (undefined)
Keyboard language: 255 (undefined)
Date format  : 7 (undefined), 24h clock
Boot delay   : 255s
Video mode   : 4 colors, 40 columns, TV NTSC monitor
   no overscan, compat. mode off


Let me know if you need more info.

Christian
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH kernel] powerpc/powernv/ioda2: Fix calculation for memory allocated for TCE table

2015-07-23 Thread Alexey Kardashevskiy

On 07/21/2015 04:24 PM, Michael Ellerman wrote:

On Mon, 2015-07-20 at 20:45 +1000, Alexey Kardashevskiy wrote:

The existing code stores the amount of memory allocated for a TCE table.
At the moment it uses @offset which is a virtual offset in the TCE table
which is only correct for a one level tables and it does not include
memory allocated for intermediate levels. When multilevel TCE table is
requested, WARN_ON in tce_iommu_create_table() prints a warning.

This adds an additional counter to pnv_pci_ioda2_table_do_alloc_pages()
to count actually allocated memory.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---

I was sure I sent it already but could not find it anywhere so reposting.
Sorry if it is a duplicate.


Stable?


No, this is for DDW patchset I recently posted and which did not get to any 
release yet.




--
Alexey
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v5 4/7] powerpc/powernv: detect supported nest pmus and its events

2015-07-23 Thread Michael Ellerman
On Thu, 2015-07-23 at 11:33 +0530, Madhavan Srinivasan wrote:
 
 On Wednesday 22 July 2015 09:37 AM, Daniel Axtens wrote:
   
   static struct perchip_nest_info p8_nest_perchip_info[P8_NEST_MAX_CHIPS];
  +static struct nest_pmu *per_nest_pmu_arr[P8_NEST_MAX_PMUS];
  +
  +static int nest_event_info(struct property *pp, char *name,
  +  struct nest_ima_events *p8_events, int string, u32 val)
  'int string' is a bit confusing. 'bool is_string' might be clearer, but
  I think it would be even better still to have different functions for
  string and non-string cases, especially because you only need val in the
  non-string case.
 
 I would perfer to be a single function, since most of the code is same
 just of the sting or val part. yes. We can make is as is_string and will
 add comment explaining what is done here? 

I think Daniel's right, it would be better as two functions.

The only part that is common after the if (string) check is the
p8_events-ev_value = buf assignment.

So you should be able to keep all the code up to the if (string) check in a
shared function and just have two wrappers that use it.

cheers


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v5 6/7] powerpc/powernv: generic nest pmu event functions

2015-07-23 Thread Michael Ellerman
On Thu, 2015-07-23 at 12:14 +0530, Madhavan Srinivasan wrote:
 
 On Wednesday 22 July 2015 10:26 AM, Daniel Axtens wrote:
  +static void p8_nest_read_counter(struct perf_event *event)
  +{
  +  uint64_t *addr;
  +  u64 data = 0;
  You've got a u64 and a uint64_t, and then...
  +
  +  addr = (u64 *)event-hw.event_base;
  ... you cast to event_base to a u64 pointer, which you assign to a
  uint64_t pointer.
  +  data = __be64_to_cpu(*addr);
  And now you dereference the pointer.
  Could you just have:
  data = __be64_to_cpu(*event-hw.event_base);
  +  local64_set(event-hw.prev_count, data);
  +}
  +
  +static void p8_nest_perf_event_update(struct perf_event *event)
  +{
  +  u64 counter_prev, counter_new, final_count;
  +  uint64_t *addr;
  +
  +  addr = (uint64_t *)event-hw.event_base;
  Here at least the cast type is the same as the type of addr, but again,
  why do you need the different types, and why local variable?
 
 Damn sorry, copy paste errors. When I added debug prints i messed
 the type case in both the functions. I will make them as uint64_t.

No please use u64/u32 etc. Most code in powerpc does and I prefer them.

cheers


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 03/11] powerpc: Drop unused syscall_get_error()

2015-07-23 Thread Michael Ellerman
syscall_get_error() is unused, and never has been.

It's also probably wrong, as it negates r3 before returning it, but that
depends on what the caller is expecting.

It also doesn't deal with compat, and doesn't deal with TIF_NOERROR.

Although we could fix those, until it has a caller and it's clear what
semantics the caller wants it's just untested code. So drop it.

Signed-off-by: Michael Ellerman m...@ellerman.id.au
---
 arch/powerpc/include/asm/syscall.h | 6 --
 1 file changed, 6 deletions(-)

diff --git a/arch/powerpc/include/asm/syscall.h 
b/arch/powerpc/include/asm/syscall.h
index ff21b7a2f0cc..c6239dabcfb1 100644
--- a/arch/powerpc/include/asm/syscall.h
+++ b/arch/powerpc/include/asm/syscall.h
@@ -34,12 +34,6 @@ static inline void syscall_rollback(struct task_struct *task,
regs-gpr[3] = regs-orig_gpr3;
 }
 
-static inline long syscall_get_error(struct task_struct *task,
-struct pt_regs *regs)
-{
-   return (regs-ccr  0x1000) ? -regs-gpr[3] : 0;
-}
-
 static inline long syscall_get_return_value(struct task_struct *task,
struct pt_regs *regs)
 {
-- 
2.1.0

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 04/11] powerpc: Don't negate error in syscall_set_return_value()

2015-07-23 Thread Michael Ellerman
Currently the only caller of syscall_set_return_value() is seccomp
filter, which is not enabled on powerpc.

This means we have not noticed that our implementation of
syscall_set_return_value() negates error, even though the value passed
in is already negative.

So remove the negation in syscall_set_return_value(), and expect the
caller to do it like all other implementations do.

Also add a comment about the ccr handling.

Signed-off-by: Michael Ellerman m...@ellerman.id.au
---
 arch/powerpc/include/asm/syscall.h | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/syscall.h 
b/arch/powerpc/include/asm/syscall.h
index c6239dabcfb1..cabe90133e69 100644
--- a/arch/powerpc/include/asm/syscall.h
+++ b/arch/powerpc/include/asm/syscall.h
@@ -44,9 +44,15 @@ static inline void syscall_set_return_value(struct 
task_struct *task,
struct pt_regs *regs,
int error, long val)
 {
+   /*
+* In the general case it's not obvious that we must deal with CCR
+* here, as the syscall exit path will also do that for us. However
+* there are some places, eg. the signal code, which check ccr to
+* decide if the value in r3 is actually an error.
+*/
if (error) {
regs-ccr |= 0x1000L;
-   regs-gpr[3] = -error;
+   regs-gpr[3] = error;
} else {
regs-ccr = ~0x1000L;
regs-gpr[3] = val;
-- 
2.1.0

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 05/11] powerpc: Rework syscall_get_arguments() so there is only one loop

2015-07-23 Thread Michael Ellerman
Currently syscall_get_arguments() has two loops, one for compat and one
for regular tasks. In prepartion for the next patch, which changes which
registers we use, switch it to only have one loop, so we only have one
place to update.

Signed-off-by: Michael Ellerman m...@ellerman.id.au
---
 arch/powerpc/include/asm/syscall.h | 19 ---
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/include/asm/syscall.h 
b/arch/powerpc/include/asm/syscall.h
index cabe90133e69..403e2303fe18 100644
--- a/arch/powerpc/include/asm/syscall.h
+++ b/arch/powerpc/include/asm/syscall.h
@@ -64,19 +64,16 @@ static inline void syscall_get_arguments(struct task_struct 
*task,
 unsigned int i, unsigned int n,
 unsigned long *args)
 {
+   unsigned long mask = -1UL;
+
BUG_ON(i + n  6);
-#ifdef CONFIG_PPC64
-   if (test_tsk_thread_flag(task, TIF_32BIT)) {
-   /*
-* Zero-extend 32-bit argument values.  The high bits are
-* garbage ignored by the actual syscall dispatch.
-*/
-   while (n--  0)
-   args[n] = (u32) regs-gpr[3 + i + n];
-   return;
-   }
+
+#ifdef CONFIG_COMPAT
+   if (test_tsk_thread_flag(task, TIF_32BIT))
+   mask = 0x;
 #endif
-   memcpy(args, regs-gpr[3 + i], n * sizeof(args[0]));
+   while (n--)
+   args[n] = regs-gpr[3 + i + n]  mask;
 }
 
 static inline void syscall_set_arguments(struct task_struct *task,
-- 
2.1.0

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 06/11] powerpc: Use orig_gpr3 in syscall_get_arguments()

2015-07-23 Thread Michael Ellerman
Currently syscall_get_arguments() is used by syscall tracepoints, and
collect_syscall() which is used in some debugging as well as
/proc/pid/syscall.

The current implementation just copies regs-gpr[3 .. 5] out, which is
fine for all the current use cases.

When we enable seccomp filter, that will also start using
syscall_get_arguments(). However for seccomp filter we want to use r3
as the return value of the syscall, and orig_gpr3 as the first
parameter. This will allow seccomp to modify the return value in r3.

To support this we need to modify syscall_get_arguments() to return
orig_gpr3 instead of r3. This is safe for all uses because orig_gpr3
always contains the r3 value that was passed to the syscall. We store it
in the syscall entry path and never modify it.

Update syscall_set_arguments() while we're here, even though it's never
used.

Signed-off-by: Michael Ellerman m...@ellerman.id.au
---
 arch/powerpc/include/asm/syscall.h | 16 +---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/syscall.h 
b/arch/powerpc/include/asm/syscall.h
index 403e2303fe18..8d79a87c0511 100644
--- a/arch/powerpc/include/asm/syscall.h
+++ b/arch/powerpc/include/asm/syscall.h
@@ -64,7 +64,7 @@ static inline void syscall_get_arguments(struct task_struct 
*task,
 unsigned int i, unsigned int n,
 unsigned long *args)
 {
-   unsigned long mask = -1UL;
+   unsigned long val, mask = -1UL;
 
BUG_ON(i + n  6);
 
@@ -72,8 +72,14 @@ static inline void syscall_get_arguments(struct task_struct 
*task,
if (test_tsk_thread_flag(task, TIF_32BIT))
mask = 0x;
 #endif
-   while (n--)
-   args[n] = regs-gpr[3 + i + n]  mask;
+   while (n--) {
+   if (n == 0  i == 0)
+   val = regs-orig_gpr3;
+   else
+   val = regs-gpr[3 + i + n];
+
+   args[n] = val  mask;
+   }
 }
 
 static inline void syscall_set_arguments(struct task_struct *task,
@@ -83,6 +89,10 @@ static inline void syscall_set_arguments(struct task_struct 
*task,
 {
BUG_ON(i + n  6);
memcpy(regs-gpr[3 + i], args, n * sizeof(args[0]));
+
+   /* Also copy the first argument into orig_gpr3 */
+   if (i == 0  n  0)
+   regs-orig_gpr3 = args[0];
 }
 
 static inline int syscall_get_arch(void)
-- 
2.1.0

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 07/11] powerpc: Change syscall_get_nr() to return int

2015-07-23 Thread Michael Ellerman
The documentation for syscall_get_nr() in asm-generic says:

 Note this returns int even on 64-bit machines. Only 32 bits of
 system call number can be meaningful. If the actual arch value
 is 64 bits, this truncates to 32 bits so 0x means -1.

However our implementation was never updated to reflect this.

Generally it's not important, but there is once case where it matters.

For seccomp filter with SECCOMP_RET_TRACE, the tracer will set
regs-gpr[0] to -1 to reject the syscall. When the task is a compat
task, this means we end up with 0x in r0 because ptrace will
zero extend the 32-bit value.

If syscall_get_nr() returns an unsigned long, then a 64-bit kernel will
see a positive value in r0 and will incorrectly allow the syscall
through seccomp.

Signed-off-by: Michael Ellerman m...@ellerman.id.au
---
 arch/powerpc/include/asm/syscall.h | 11 ---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/syscall.h 
b/arch/powerpc/include/asm/syscall.h
index 8d79a87c0511..ab9f3f0a8637 100644
--- a/arch/powerpc/include/asm/syscall.h
+++ b/arch/powerpc/include/asm/syscall.h
@@ -22,10 +22,15 @@
 extern const unsigned long sys_call_table[];
 #endif /* CONFIG_FTRACE_SYSCALLS */
 
-static inline long syscall_get_nr(struct task_struct *task,
- struct pt_regs *regs)
+static inline int syscall_get_nr(struct task_struct *task, struct pt_regs 
*regs)
 {
-   return TRAP(regs) == 0xc00 ? regs-gpr[0] : -1L;
+   /*
+* Note that we are returning an int here. That means 0x, ie.
+* 32-bit negative 1, will be interpreted as -1 on a 64-bit kernel.
+* This is important for seccomp so that compat tasks can set r0 = -1
+* to reject the syscall.
+*/
+   return TRAP(regs) == 0xc00 ? regs-gpr[0] : -1;
 }
 
 static inline void syscall_rollback(struct task_struct *task,
-- 
2.1.0

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 08/11] powerpc/kernel: Add SIG_SYS support for compat tasks

2015-07-23 Thread Michael Ellerman
SIG_SYS was added in commit a0727e8ce513 signal, x86: add SIGSYS info
and make it synchronous.

Because we use the asm-generic struct siginfo, we got support for
SIG_SYS for free as part of that commit.

However there was no compat handling added for powerpc. That means we've
been advertising the existence of signfo._sifields._sigsys to compat
tasks, but not actually filling in the fields correctly.

Luckily it looks like no one has noticed, presumably because the only
user of SIGSYS in the kernel is seccomp filter, which we don't support
yet.

So before we enable seccomp filter, add compat handling for SIGSYS.

Signed-off-by: Michael Ellerman m...@ellerman.id.au
---
 arch/powerpc/include/asm/compat.h | 7 +++
 arch/powerpc/kernel/signal_32.c   | 5 +
 2 files changed, 12 insertions(+)

diff --git a/arch/powerpc/include/asm/compat.h 
b/arch/powerpc/include/asm/compat.h
index b142b8e0ed9e..4f2df589ec1d 100644
--- a/arch/powerpc/include/asm/compat.h
+++ b/arch/powerpc/include/asm/compat.h
@@ -174,6 +174,13 @@ typedef struct compat_siginfo {
int _band;  /* POLL_IN, POLL_OUT, POLL_MSG */
int _fd;
} _sigpoll;
+
+   /* SIGSYS */
+   struct {
+   unsigned int _call_addr; /* calling insn */
+   int _syscall;/* triggering system call 
number */
+   unsigned int _arch;  /* AUDIT_ARCH_* of syscall */
+   } _sigsys;
} _sifields;
 } compat_siginfo_t;
 
diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index d3a831ac0f92..77f97284124e 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -949,6 +949,11 @@ int copy_siginfo_to_user32(struct compat_siginfo __user 
*d, const siginfo_t *s)
err |= __put_user(s-si_overrun, d-si_overrun);
err |= __put_user(s-si_int, d-si_int);
break;
+   case __SI_SYS  16:
+   err |= __put_user(ptr_to_compat(s-si_call_addr), 
d-si_call_addr);
+   err |= __put_user(s-si_syscall, d-si_syscall);
+   err |= __put_user(s-si_arch, d-si_arch);
+   break;
case __SI_RT  16: /* This is not generated by the kernel as of now.  
*/
case __SI_MESGQ  16:
err |= __put_user(s-si_int, d-si_int);
-- 
2.1.0

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 09/11] powerpc/kernel: Enable seccomp filter

2015-07-23 Thread Michael Ellerman
This commit enables seccomp filter on powerpc, now that we have all the
necessary pieces in place.

To support seccomp's desire to modify the syscall return value under
some circumstances, we use a different ABI to the ptrace ABI. That is we
use r3 as the syscall return value, and orig_gpr3 is the first syscall
parameter.

This means the seccomp code, or a ptracer via SECCOMP_RET_TRACE, will
see -ENOSYS preloaded in r3. This is identical to the behaviour on x86,
and allows seccomp or the ptracer to either leave the -ENOSYS or change
it to something else, as well as rejecting or not the syscall by
modifying r0.

If seccomp does not reject the syscall, we restore the register state to
match what ptrace and audit expect, ie. r3 is the first syscall
parameter again. We do this restore using orig_gpr3, which may have been
modified by seccomp, which allows seccomp to modify the first syscall
paramater and allow the syscall to proceed.

Signed-off-by: Michael Ellerman m...@ellerman.id.au
---
 arch/powerpc/Kconfig |  1 +
 arch/powerpc/kernel/ptrace.c | 28 +++-
 2 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 5ef27113b898..b6cb6a87b7a2 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -155,6 +155,7 @@ config PPC
select HAVE_PERF_EVENTS_NMI if PPC64
select EDAC_SUPPORT
select EDAC_ATOMIC_SCRUB
+   select HAVE_ARCH_SECCOMP_FILTER
 
 config GENERIC_CSUM
def_bool CPU_LITTLE_ENDIAN
diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
index 7484221bb3f8..de79eb5218c6 100644
--- a/arch/powerpc/kernel/ptrace.c
+++ b/arch/powerpc/kernel/ptrace.c
@@ -1787,7 +1787,33 @@ long do_syscall_trace_enter(struct pt_regs *regs)
 
user_exit();
 
-   secure_computing_strict(regs-gpr[0]);
+   if (test_thread_flag(TIF_SECCOMP)) {
+   /*
+* The ABI we present to seccomp tracers is that r3 contains
+* the syscall return value and orig_gpr3 contains the first
+* syscall parameter. This is different to the ptrace ABI where
+* both r3 and orig_gpr3 contain the first syscall parameter.
+*/
+   regs-gpr[3] = -ENOSYS;
+
+   /*
+* We use the __ version here because we have already checked
+* TIF_SECCOMP. If this fails, there is nothing left to do, we
+* have already loaded -ENOSYS into r3, or seccomp has put
+* something else in r3 (via SECCOMP_RET_ERRNO/TRACE).
+*/
+   if (__secure_computing())
+   return -1;
+
+   /*
+* The syscall was allowed by seccomp, restore the register
+* state to what ptrace and audit expect.
+* Note that we use orig_gpr3, which means a seccomp tracer can
+* modify the first syscall parameter (in orig_gpr3) and also
+* allow the syscall to proceed.
+*/
+   regs-gpr[3] = regs-orig_gpr3;
+   }
 
if (test_thread_flag(TIF_SYSCALL_TRACE)) {
/*
-- 
2.1.0

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 10/11] selftests/seccomp: Make seccomp tests work on big endian

2015-07-23 Thread Michael Ellerman
The seccomp_bpf test uses BPF_LD|BPF_W|BPF_ABS to load 32-bit values
from seccomp_data-args. On big endian machines this will load the high
word of the argument, which is not what the test wants.

Borrow a hack from samples/seccomp/bpf-helper.h which changes the offset
on big endian to account for this.

Signed-off-by: Michael Ellerman m...@ellerman.id.au
---
 tools/testing/selftests/seccomp/seccomp_bpf.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c 
b/tools/testing/selftests/seccomp/seccomp_bpf.c
index c5abe7fd7590..2303a8dff9a2 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -82,7 +82,13 @@ struct seccomp_data {
 };
 #endif
 
+#if __BYTE_ORDER == __LITTLE_ENDIAN
 #define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n]))
+#elif __BYTE_ORDER == __BIG_ENDIAN
+#define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n]) + 
sizeof(__u32))
+#else
+#error wut? Unknown __BYTE_ORDER?!
+#endif
 
 #define SIBLING_EXIT_UNKILLED  0xbadbeef
 #define SIBLING_EXIT_FAILURE   0xbadface
-- 
2.1.0

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 11/11] selftests/seccomp: Add powerpc support

2015-07-23 Thread Michael Ellerman
Wire up the syscall number and regs so the tests work on powerpc.

Acked-by: Kees Cook keesc...@chromium.org
Signed-off-by: Michael Ellerman m...@ellerman.id.au
---
 tools/testing/selftests/seccomp/seccomp_bpf.c | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c 
b/tools/testing/selftests/seccomp/seccomp_bpf.c
index 2303a8dff9a2..a004b4cce99e 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -14,6 +14,7 @@
 #include linux/filter.h
 #include sys/prctl.h
 #include sys/ptrace.h
+#include sys/types.h
 #include sys/user.h
 #include linux/prctl.h
 #include linux/ptrace.h
@@ -1205,6 +1206,10 @@ TEST_F(TRACE_poke, getpid_runs_normally)
 # define ARCH_REGS struct user_pt_regs
 # define SYSCALL_NUM   regs[8]
 # define SYSCALL_RET   regs[0]
+#elif defined(__powerpc__)
+# define ARCH_REGS struct pt_regs
+# define SYSCALL_NUM   gpr[0]
+# define SYSCALL_RET   gpr[3]
 #else
 # error Do not know how to find your architecture's registers and syscalls
 #endif
@@ -1238,7 +1243,7 @@ void change_syscall(struct __test_metadata *_metadata,
ret = ptrace(PTRACE_GETREGSET, tracee, NT_PRSTATUS, iov);
EXPECT_EQ(0, ret);
 
-#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__)
+#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__) || 
defined(__powerpc__)
{
regs.SYSCALL_NUM = syscall;
}
@@ -1402,6 +1407,8 @@ TEST_F(TRACE_syscall, syscall_dropped)
 #  define __NR_seccomp 383
 # elif defined(__aarch64__)
 #  define __NR_seccomp 277
+# elif defined(__powerpc__)
+#  define __NR_seccomp 358
 # else
 #  warning seccomp syscall number unknown for this architecture
 #  define __NR_seccomp 0x
-- 
2.1.0

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev