[PATCH v4 2/13] Correct buffer parsing in update_dt_node()

2013-04-24 Thread Nathan Fontenot
Correct parsing of the buffer returned from ibm,update-properties. The first
element is a length and the path to the property which is slightly different
from the list of properties in the buffer so we need to specifically
handle this.

Signed-off-by: Nathan Fontenot 
---
 arch/powerpc/platforms/pseries/mobility.c |   20 
 1 file changed, 16 insertions(+), 4 deletions(-)

Index: powerpc/arch/powerpc/platforms/pseries/mobility.c
===
--- powerpc.orig/arch/powerpc/platforms/pseries/mobility.c  2013-04-23 
13:22:05.0 -0500
+++ powerpc/arch/powerpc/platforms/pseries/mobility.c   2013-04-23 
13:39:36.0 -0500
@@ -134,6 +134,7 @@
char *prop_data;
char *rtas_buf;
int update_properties_token;
+   u32 vd;
 
update_properties_token = rtas_token("ibm,update-properties");
if (update_properties_token == RTAS_UNKNOWN_SERVICE)
@@ -160,13 +161,24 @@
 
prop_data = rtas_buf + sizeof(*upwa);
 
-   for (i = 0; i < upwa->nprops; i++) {
+   /* The first element of the buffer is the path of the node
+* being updated in the form of a 8 byte string length
+* followed by the string. Skip past this to get to the
+* properties being updated.
+*/
+   vd = *prop_data++;
+   prop_data += vd;
+
+   /* The path we skipped over is counted as one of the elements
+* returned so start counting at one.
+*/
+   for (i = 1; i < upwa->nprops; i++) {
char *prop_name;
-   u32 vd;
 
-   prop_name = prop_data + 1;
+   prop_name = prop_data;
prop_data += strlen(prop_name) + 1;
-   vd = *prop_data++;
+   vd = *(u32 *)prop_data;
+   prop_data += sizeof(vd);
 
switch (vd) {
case 0x:

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH v4 3/13] Add PRRN RTAS event handler

2013-04-24 Thread Nathan Fontenot
From: Jesse Larrew 

A PRRN event is signaled via the RTAS event-scan mechanism, which
returns a Hot Plug Event message "fixed part" indicating "Platform
Resource Reassignment". In response to the Hot Plug Event message,
we must call ibm,update-nodes to determine which resources were
reassigned and then ibm,update-properties to obtain the new affinity
information about those resources.

The PRRN event-scan RTAS message contains only the "fixed part" with
the "Type" field set to the value 160 and no Extended Event Log. The
four-byte Extended Event Log Length field is re-purposed (since no
Extended Event Log message is included) to pass the "scope" parameter
that causes the ibm,update-nodes to return the nodes affected by the
specific resource reassignment.

This patch adds a handler for RTAS events. The function
pseries_devicetree_update() (from mobility.c) is used to make the
ibm,update-nodes/ibm,update-properties RTAS calls. Updating the NUMA maps
(handled by a subsequent patch) will require significant processing,
so pseries_devicetree_update() is called from an asynchronous workqueue
to allow event processing to continue. 

PRRN RTAS events on pseries systems are rare events that have to be
initiated from the HMC console for the system by an IBM tech. This allows
us to assume that these events are widely spaced. Additionally, all work
on the queue is flushed before handling any new work to ensure we only have
one event in flight being handled at a time.

Signed-off-by: Nathan Fontenot 
---
 arch/powerpc/include/asm/rtas.h |2 +
 arch/powerpc/kernel/rtasd.c |   46 +++-
 2 files changed, 47 insertions(+), 1 deletion(-)

Index: powerpc/arch/powerpc/include/asm/rtas.h
===
--- powerpc.orig/arch/powerpc/include/asm/rtas.h2013-04-23 
13:22:37.0 -0500
+++ powerpc/arch/powerpc/include/asm/rtas.h 2013-04-23 13:40:36.0 
-0500
@@ -143,6 +143,8 @@
 #define RTAS_TYPE_PMGM_TIME_ALARM  0x6f
 #define RTAS_TYPE_PMGM_CONFIG_CHANGE   0x70
 #define RTAS_TYPE_PMGM_SERVICE_PROC0x71
+/* Platform Resource Reassignment Notification */
+#define RTAS_TYPE_PRRN 0xA0
 
 /* RTAS check-exception vector offset */
 #define RTAS_VECTOR_EXTERNAL_INTERRUPT 0x500
Index: powerpc/arch/powerpc/kernel/rtasd.c
===
--- powerpc.orig/arch/powerpc/kernel/rtasd.c2013-04-23 12:54:23.0 
-0500
+++ powerpc/arch/powerpc/kernel/rtasd.c 2013-04-23 13:52:08.0 -0500
@@ -87,6 +87,8 @@
return "Resource Deallocation Event";
case RTAS_TYPE_DUMP:
return "Dump Notification Event";
+   case RTAS_TYPE_PRRN:
+   return "Platform Resource Reassignment Event";
}
 
return rtas_type[0];
@@ -265,9 +267,49 @@
spin_unlock_irqrestore(&rtasd_log_lock, s);
return;
}
+}
+
+#ifdef CONFIG_PPC_PSERIES
+static s32 prrn_update_scope;
+
+static void prrn_work_fn(struct work_struct *work)
+{
+   /*
+* For PRRN, we must pass the negative of the scope value in
+* the RTAS event.
+*/
+   pseries_devicetree_update(-prrn_update_scope);
+}
+
+static DECLARE_WORK(prrn_work, prrn_work_fn);
+
+void prrn_schedule_update(u32 scope)
+{
+   flush_work(&prrn_work);
+   prrn_update_scope = scope;
+   schedule_work(&prrn_work);
+}
+
+static void handle_rtas_event(const struct rtas_error_log *log)
+{
+   if (log->type == RTAS_TYPE_PRRN)
+   /* For PRRN Events the extended log length is used to denote
+* the scope for calling rtas update-nodes.
+*/
+   prrn_schedule_update(log->extended_log_length);
+
+   return;
+}
+
+#else
 
+static void handle_rtas_event(const struct rtas_error_log *log)
+{
+   return;
 }
 
+#endif
+
 static int rtas_log_open(struct inode * inode, struct file * file)
 {
return 0;
@@ -388,8 +430,10 @@
break;
}
 
-   if (error == 0)
+   if (error == 0) {
pSeries_log_error(logdata, ERR_TYPE_RTAS_LOG, 0);
+   handle_rtas_event((struct rtas_error_log *)logdata);
+   }
 
} while(error == 0);
 }

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH v4 1/13] Expose pseries devicetree_update()

2013-04-24 Thread Nathan Fontenot
Newer firmware on Power systems can transparently reassign platform resources
(CPU and Memory) in use. For instance, if a processor or memory unit is
predicted to fail, the platform may transparently move the processing to an
equivalent unused processor or the memory state to an equivalent unused
memory unit. However, reassigning resources across NUMA boundaries may alter
the performance of the partition. When such reassignment is necessary, the
Platform Resource Reassignment Notification (PRRN) option provides a
mechanism to inform the Linux kernel of changes to the NUMA affinity of
its platform resources.

When rtasd receives a PRRN event, it needs to make a series of RTAS
calls (ibm,update-nodes and ibm,update-properties) to retrieve the
updated device tree information. These calls are already handled in the
pseries_devicetree_update() routine used in partition migration.

This patch exposes pseries_devicetree_update() to make it accessible
to other pseries routines. This patch also updates pseries_devicetree_update()
to take a 32-bit scope parameter. The scope value, which was previously hard
coded to 1 for partition migration, is used for the RTAS calls 
ibm,update-nodes/properties to update the device tree.

Signed-off-by: Nathan Fontenot 
---
 arch/powerpc/include/asm/rtas.h   |4 
 arch/powerpc/platforms/pseries/mobility.c |   21 -
 2 files changed, 16 insertions(+), 9 deletions(-)

Index: powerpc/arch/powerpc/platforms/pseries/mobility.c
===
--- powerpc.orig/arch/powerpc/platforms/pseries/mobility.c  2013-04-15 
09:18:10.0 -0500
+++ powerpc/arch/powerpc/platforms/pseries/mobility.c   2013-04-23 
13:22:05.0 -0500
@@ -37,14 +37,16 @@
 #define UPDATE_DT_NODE 0x0200
 #define ADD_DT_NODE0x0300
 
-static int mobility_rtas_call(int token, char *buf)
+#define MIGRATION_SCOPE(1)
+
+static int mobility_rtas_call(int token, char *buf, s32 scope)
 {
int rc;
 
spin_lock(&rtas_data_buf_lock);
 
memcpy(rtas_data_buf, buf, RTAS_DATA_BUF_SIZE);
-   rc = rtas_call(token, 2, 1, NULL, rtas_data_buf, 1);
+   rc = rtas_call(token, 2, 1, NULL, rtas_data_buf, scope);
memcpy(buf, rtas_data_buf, RTAS_DATA_BUF_SIZE);
 
spin_unlock(&rtas_data_buf_lock);
@@ -123,7 +125,7 @@
return 0;
 }
 
-static int update_dt_node(u32 phandle)
+static int update_dt_node(u32 phandle, s32 scope)
 {
struct update_props_workarea *upwa;
struct device_node *dn;
@@ -151,7 +153,8 @@
upwa->phandle = phandle;
 
do {
-   rc = mobility_rtas_call(update_properties_token, rtas_buf);
+   rc = mobility_rtas_call(update_properties_token, rtas_buf,
+   scope);
if (rc < 0)
break;
 
@@ -219,7 +222,7 @@
return rc;
 }
 
-static int pseries_devicetree_update(void)
+int pseries_devicetree_update(s32 scope)
 {
char *rtas_buf;
u32 *data;
@@ -235,7 +238,7 @@
return -ENOMEM;
 
do {
-   rc = mobility_rtas_call(update_nodes_token, rtas_buf);
+   rc = mobility_rtas_call(update_nodes_token, rtas_buf, scope);
if (rc && rc != 1)
break;
 
@@ -256,7 +259,7 @@
delete_dt_node(phandle);
break;
case UPDATE_DT_NODE:
-   update_dt_node(phandle);
+   update_dt_node(phandle, scope);
break;
case ADD_DT_NODE:
drc_index = *data++;
@@ -276,7 +279,7 @@
int rc;
int activate_fw_token;
 
-   rc = pseries_devicetree_update();
+   rc = pseries_devicetree_update(MIGRATION_SCOPE);
if (rc) {
printk(KERN_ERR "Initial post-mobility device tree update "
   "failed: %d\n", rc);
@@ -292,7 +295,7 @@
 
rc = rtas_call(activate_fw_token, 0, 1, NULL);
if (!rc) {
-   rc = pseries_devicetree_update();
+   rc = pseries_devicetree_update(MIGRATION_SCOPE);
if (rc)
printk(KERN_ERR "Secondary post-mobility device tree "
   "update failed: %d\n", rc);
Index: powerpc/arch/powerpc/include/asm/rtas.h
===
--- powerpc.orig/arch/powerpc/include/asm/rtas.h2013-04-15 
09:18:10.0 -0500
+++ powerpc/arch/powerpc/include/asm/rtas.h 2013-04-23 13:22:37.0 
-0500
@@ -277,6 +277,10 @@
 
 extern void pSeries_log_error(char *buf, unsigned int err_type, int fatal);
 
+#ifdef CONF

[PATCH v4 0/13] NUMA CPU Reconfiguration using PRRN

2013-04-24 Thread Nathan Fontenot
Newer firmware on Power systems can transparently reassign platform resources
(CPU and Memory) in use. For instance, if a processor or memory unit is
predicted to fail, the platform may transparently move the processing to an
equivalent unused processor or the memory state to an equivalent unused
memory unit. However, reassigning resources across NUMA boundaries may alter
the performance of the partition. When such reassignment is necessary, the
Platform Resource Reassignment Notification (PRRN) option provides a
mechanism to inform the Linux kernel of changes to the NUMA affinity of
its platform resources.

PRRN Events are RTAS events sent up through the event-scan mechanism on
Power. When these events are received the system needs to get the updated
device tree affinity information for the affected CPUs/memory via the
rtas update-nodes and update-properties calls. This information is then
used to update the NUMA affinity of the CPUs/Memory in the kernel.

This patch set adds the ability to recognize PRRN events, update the device
tree and kernel information for CPUs (memory will be handled in a later
patch), and add an interface to enable/disable topology updates from /proc.

Additionally, these updates solve an existing problem with the VPHN (Virtual
Processor Home Node) capability and allow us to re-enable this feature.

Nathan Fontenot

 arch/powerpc/include/asm/firmware.h   |7 
 arch/powerpc/include/asm/prom.h   |   46 ++--
 arch/powerpc/include/asm/rtas.h   |2 
 arch/powerpc/kernel/prom_init.c   |   98 ++
 arch/powerpc/kernel/rtasd.c   |   46 
 arch/powerpc/mm/numa.c|  214 +++---
 arch/powerpc/platforms/pseries/firmware.c |   50 -
 arch/powerpc/platforms/pseries/mobility.c |   21 +-
 powerpc/arch/powerpc/include/asm/firmware.h   |1 
 powerpc/arch/powerpc/include/asm/prom.h   |   71 +++
 powerpc/arch/powerpc/include/asm/rtas.h   |4 
 powerpc/arch/powerpc/include/asm/topology.h   |5 
 powerpc/arch/powerpc/kernel/prom_init.c   |2 
 powerpc/arch/powerpc/kernel/rtasd.c   |7 
 powerpc/arch/powerpc/mm/numa.c|   62 ++
 powerpc/arch/powerpc/platforms/pseries/firmware.c |8 
 powerpc/arch/powerpc/platforms/pseries/mobility.c |   20 +-
 powerpc/arch/powerpc/platforms/pseries/pseries.h  |5 
 powerpc/arch/powerpc/platforms/pseries/setup.c|   40 ++--
 19 files changed, 496 insertions(+), 213 deletions(-)

Updates for v4 of the patchset:
--
1/13 - Remove the hook in ppc_md for updating the device tree.

3/13 - Put the rtas code to handle PRRN events in #ifdef CONFIG_PPC_PSERIES

4/13 - New patch. Update the iteration over arrays in firmware.c to use
ARRAY_SIZE()

5/13 (was 4/12) - Remove the unnecessary #ifdef

6/13 (was 5/12) - Removed the references to platform_has_feature() and update
the firmware.c updates to use ARRAY_SIZE() for iteration.

8/13 (was 7/12) - Correct subject.

13/13 (was 12/12) - Remove inlining of prrn_is_enabled().

Updates for v3 of the patchset:
--
1/12 - Updated to use a ppc_md interface to invoke device tree updates, this
corrects the build break previously seen in patch 2/12 for non-pseries
platforms.

2/12 - New patch in the series to correct the parsing of the buffer returned
from ibm,update-properties rtas call.

5/12 - The parsing of architecture vector 5 has been made more efficient.

7/12 - Correct the #define used in the call to firmware_has_feature()

8/12 - Updated calling of stop_machine() to only call it once per PRRN event.

12/12 - Added inclusion of topology.h to rtasd.c to correct a build failure
on non-pseries platforms.

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH v3 12/12] Add /proc interface to control topology updates

2013-04-23 Thread Nathan Fontenot
On 04/22/2013 09:49 PM, Michael Ellerman wrote:
> On Tue, Apr 23, 2013 at 12:00:26PM +1000, Stephen Rothwell wrote:
>> Hi Nathan,
>>
>> On Mon, 22 Apr 2013 13:47:55 -0500 Nathan Fontenot 
>>  wrote:
>>>
>>>  #if defined(CONFIG_NUMA) && defined(CONFIG_PPC_SPLPAR)
>>>  extern int start_topology_update(void);
>>>  extern int stop_topology_update(void);
>>> +extern inline int prrn_is_enabled(void);
>>
>> You really can't do "extern inline" with no body ...
> 
> No you can't, and at least with my compiler it causes a build error.
> 

Easy enough, no more inline for this.

Thanks for looking,
-Nathan

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH v3 7/12] Use stop machine to update cpu maps

2013-04-23 Thread Nathan Fontenot
On 04/22/2013 07:24 PM, Benjamin Herrenschmidt wrote:
> On Mon, 2013-04-22 at 13:41 -0500, Nathan Fontenot wrote:
>> From: Jesse Larrew 
>>
>> Platform events such as partition migration or the new PRRN firmware
>> feature can cause the NUMA characteristics of a CPU to change, and these
>> changes will be reflected in the device tree nodes for the affected
>> CPUs.
>>
>> This patch registers a handler for Open Firmware device tree updates
>> and reconfigures the CPU and node maps whenever the associativity
>> changes. Currently, this is accomplished by marking the affected CPUs in
>> the cpu_associativity_changes_mask and allowing
>> arch_update_cpu_topology() to retrieve the new associativity information
>> using hcall_vphn().
>>
>> Protecting the NUMA cpu maps from concurrent access during an update
>> operation will be addressed in a subsequent patch in this series.
> 
> I see no more mention of stop_machine() ... is the patch subject stale ?
> 

Nope, just me mistakenly putting the wrong subject for this patch. I'll
correct it in the next version.

-Nathan

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH v3 5/12] Update firmware_has_feature() to check architecture bits

2013-04-23 Thread Nathan Fontenot
On 04/22/2013 08:50 PM, Stephen Rothwell wrote:
> Hi Nathan,
> 
> On Mon, 22 Apr 2013 13:38:47 -0500 Nathan Fontenot  
> wrote:
>>
>> -/* Option vector 5: PAPR/OF options supported */
>> -#define OV5_LPAR0x80/* logical partitioning supported */
>> -#define OV5_SPLPAR  0x40/* shared-processor LPAR supported */
>> +/* Option vector 5: PAPR/OF options supported
>> + * Thses bits are also used for the platform_has_feature() call so
>   ^
> typo

will fix.

> 
>> + * we encode the vector index in the define and use the OV5_FEAT()
>> + * and OV5_INDX() macros to extract the desired information.
>> + */
>> +#define OV5_FEAT(x) ((x) & 0xff)
>> +#define OV5_INDX(x) ((x) >> 8)
>> +#define OV5_LPAR0x0280  /* logical partitioning supported */
>> +#define OV5_SPLPAR  0x0240  /* shared-processor LPAR supported */
> 
> Wouldn't it be clearer to say
> 
> #define OV5_LPAR  (OV5_INDX(0x2) | OV5_FEAT(0x80))

The defines won't work the way you used them, they were designed to take the
combined value, i.e. 0x0280, and parse out the index and the feature.

I do think having macros to create the actual values as your example does is 
easier
to read. We could do something like...

#define OV5_FEAT(x) ((x) & 0xff)
#define OV5_SETINDX(x)  ((x) << 8)
#define OV5_GETINDX(x)  ((x) >> 8)

#define OV5_LPAR(OV5_SETINDX(0x2) | OV5_FEAT(0x80))

Thoughts?

> 
> etc?
> 
>> @@ -145,6 +141,7 @@
>>   * followed by # option vectors - 1, followed by the option vectors.
>>   */
>>  extern unsigned char ibm_architecture_vec[];
>> +bool platform_has_feature(unsigned int);
> 
> "extern", please (if nothing else, for consistency).
> 

That shouldn't really be there; it's an artifact from a previous patch. I'll 
remove it.

>> +static __initdata struct vec5_fw_feature
>> +vec5_fw_features_table[FIRMWARE_MAX_FEATURES] = {
> 
> Why make this array FIRMWARE_MAX_FEATURES (63) long?  You could just
> restrict the for loop below to ARRAY_SIZE(vec5_fw_features_table).
> 
>> +{FW_FEATURE_TYPE1_AFFINITY, OV5_TYPE1_AFFINITY},
>> +};
>> +
>> +void __init fw_vec5_feature_init(const char *vec5, unsigned long len)
>> +{
>> +unsigned int index, feat;
>> +int i;
>> +
>> +pr_debug(" -> fw_vec5_feature_init()\n");
>> +
>> +for (i = 0; i < FIRMWARE_MAX_FEATURES; i++) {
>> +if (!vec5_fw_features_table[i].feature)
>> +continue;
> 
> And this test could go away.
> 
> I realise that you have just copied the existing code, but you should not
> do that blindly.  Maybe you could even add an (earlier) patch that fixes
> the existing code.

I think that could be done easily enough.

Thanks for looking,
-Nathan

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH v3 1/12] Create a powerpc update_devicetree interface

2013-04-23 Thread Nathan Fontenot
On 04/22/2013 07:15 PM, Benjamin Herrenschmidt wrote:
> On Mon, 2013-04-22 at 13:30 -0500, Nathan Fontenot wrote:
> 
>> This patch exposes a method for updating the device tree via
>> ppc_md.update_devicetree that takes a single 32-bit value as a parameter.
>> For pseries platforms this is the existing pseries_devicetree_update routine
>> which is updated to take the new parameter which is a scope value
>> to indicate the the reason for making the rtas calls. This parameter is
>> required by the ibm,update-nodes/ibm,update-properties RTAS calls, and
>> the appropriate value is contained within the RTAS event for PRRN
>> notifications. In pseries_devicetree_update() it was previously
>> hard-coded to 1, the scope value for partition migration.
> 
> I think that's too much abstraction (see below)
> 
> Also you add this helper:
> 
>> Index: powerpc/arch/powerpc/kernel/rtas.c
>> ===
>> --- powerpc.orig/arch/powerpc/kernel/rtas.c  2013-03-08 19:23:06.0 
>> -0600
>> +++ powerpc/arch/powerpc/kernel/rtas.c   2013-04-17 13:02:29.0 
>> -0500
>> @@ -1085,3 +1085,13 @@
>>  timebase = 0;
>>  arch_spin_unlock(&timebase_lock);
>>  }
>> +
>> +int update_devicetree(s32 scope)
>> +{
>> +int rc = 0;
>> +
>> +if (ppc_md.update_devicetree)
>> +rc = ppc_md.update_devicetree(scope);
>> +
>> +return rc;
>> +}
> 
> But then don't use it afaik (you call directly ppc_md.update_... from
> prrn_work_fn().
> 
> In the end, the caller (PRRN stuff), while in rtasd, is really pseries
> specific and the resulting update_device_tree() as well, so I don't
> think we need the ppc_md. hook in the middle with that "oddball" scope
> parameter which is not defined outside of pseries specific areas.
> 
> In this case, it might be better to make sure the PRRN related stuff in
> rtasd is inside an ifdef CONFIG_PPC_PSERIES and have it call directly
> into pseries_update_devicetree().
> 
> It makes the code somewhat easier to follow and I doubt anybody else
> will ever use that specific hook, at least not in its current form. If
> we need an abstraction later, we can add one then.
> 

ok, good. I was not crazy about using ppc_md to do this and if you're fine
with putting the pseries specific stuff in ifdef CONFIG_PPC_PSERIES I'll
update the code to do that.

Question concerning rtas code. There is quite a bit of pseries specific 
pieces in the general powerpc/kernel directory. Has there been, or should
there be, any effort to move that to the pseries directory?

-Nathan

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH v3 12/12] Add /proc interface to control topology updates

2013-04-22 Thread Nathan Fontenot
There are instances in which we do not want topology updates to occur.
In order to allow this a /proc interface (/proc/powerpc/topology_updates)
is introduced so that topology updates can be enabled and disabled.

This patch also adds a prrn_is_enabled() call so that PRRN events are
handled in the kernel only if topology updating is enabled.

Signed-off-by: Nathan Fontenot 
---
 arch/powerpc/include/asm/topology.h |5 ++
 arch/powerpc/kernel/rtasd.c |7 ++--
 arch/powerpc/mm/numa.c  |   62 +++-
 3 files changed, 71 insertions(+), 3 deletions(-)

Index: powerpc/arch/powerpc/mm/numa.c
===
--- powerpc.orig/arch/powerpc/mm/numa.c 2013-04-22 09:46:13.0 -0500
+++ powerpc/arch/powerpc/mm/numa.c  2013-04-22 09:51:10.0 -0500
@@ -23,6 +23,9 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
 #include 
 #include 
 #include 
@@ -1585,7 +1588,6 @@
 
return rc;
 }
-__initcall(start_topology_update);
 
 /*
  * Disable polling for VPHN associativity changes.
@@ -1604,4 +1606,62 @@
 
return rc;
 }
+
+inline int prrn_is_enabled(void)
+{
+   return prrn_enabled;
+}
+
+static int topology_read(struct seq_file *file, void *v)
+{
+   if (vphn_enabled || prrn_enabled)
+   seq_puts(file, "on\n");
+   else
+   seq_puts(file, "off\n");
+
+   return 0;
+}
+
+static int topology_open(struct inode *inode, struct file *file)
+{
+   return single_open(file, topology_read, NULL);
+}
+
+static ssize_t topology_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *off)
+{
+   char kbuf[4]; /* "on" or "off" plus null. */
+   int read_len;
+
+   read_len = count < 3 ? count : 3;
+   if (copy_from_user(kbuf, buf, read_len))
+   return -EINVAL;
+
+   kbuf[read_len] = '\0';
+
+   if (!strncmp(kbuf, "on", 2))
+   start_topology_update();
+   else if (!strncmp(kbuf, "off", 3))
+   stop_topology_update();
+   else
+   return -EINVAL;
+
+   return count;
+}
+
+static const struct file_operations topology_ops = {
+   .read = seq_read,
+   .write = topology_write,
+   .open = topology_open,
+   .release = single_release
+};
+
+static int topology_update_init(void)
+{
+   start_topology_update();
+   proc_create("powerpc/topology_updates", 644, NULL, &topology_ops);
+
+   return 0;
+}
+device_initcall(topology_update_init);
 #endif /* CONFIG_PPC_SPLPAR */
Index: powerpc/arch/powerpc/include/asm/topology.h
===
--- powerpc.orig/arch/powerpc/include/asm/topology.h2013-04-18 
09:09:21.0 -0500
+++ powerpc/arch/powerpc/include/asm/topology.h 2013-04-22 09:51:10.0 
-0500
@@ -71,6 +71,7 @@
 #if defined(CONFIG_NUMA) && defined(CONFIG_PPC_SPLPAR)
 extern int start_topology_update(void);
 extern int stop_topology_update(void);
+extern inline int prrn_is_enabled(void);
 #else
 static inline int start_topology_update(void)
 {
@@ -80,6 +81,10 @@
 {
return 0;
 }
+static inline int prrn_is_enabled(void)
+{
+   return 0;
+}
 #endif /* CONFIG_NUMA && CONFIG_PPC_SPLPAR */
 
 #include 
Index: powerpc/arch/powerpc/kernel/rtasd.c
===
--- powerpc.orig/arch/powerpc/kernel/rtasd.c2013-04-18 09:09:21.0 
-0500
+++ powerpc/arch/powerpc/kernel/rtasd.c 2013-04-22 09:51:10.0 -0500
@@ -29,6 +29,7 @@
 #include 
 #include 
 #include 
+#include 
 
 
 static DEFINE_SPINLOCK(rtasd_log_lock);
@@ -294,11 +295,13 @@
 {
pSeries_log_error((char *)log, ERR_TYPE_RTAS_LOG, 0);
 
-   if (log->type == RTAS_TYPE_PRRN)
+   if (log->type == RTAS_TYPE_PRRN) {
/* For PRRN Events the extended log length is used to denote
 * the scope for calling rtas update-nodes.
 */
-   prrn_schedule_update(log->extended_log_length);
+   if (prrn_is_enabled())
+   prrn_schedule_update(log->extended_log_length);
+   }
 
return;
 }

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH v3 11/12] Enable PRRN Event handling

2013-04-22 Thread Nathan Fontenot
The Linux kernel and platform firmware negotiate their mutual support
of the PRRN option via the ibm,client-architecture-support interface.
This patch simply sets the appropriate fields in the client architecture
vector to indicate Linux support and will cause the firmware to begin
sending PRRN events via the RTAS event-scan mechanism.

Signed-off-by: Nathan Fontenot 
---
 arch/powerpc/kernel/prom_init.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

Index: powerpc/arch/powerpc/kernel/prom_init.c
===
--- powerpc.orig/arch/powerpc/kernel/prom_init.c2013-04-18 
09:09:22.0 -0500
+++ powerpc/arch/powerpc/kernel/prom_init.c 2013-04-22 09:49:28.0 
-0500
@@ -698,7 +698,7 @@
 #else
0,
 #endif
-   OV5_FEAT(OV5_TYPE1_AFFINITY),
+   OV5_FEAT(OV5_TYPE1_AFFINITY) | OV5_FEAT(OV5_PRRN),
0,
0,
0,

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH v3 10/12] Re-enable Virtual Processor Home Node capabilities

2013-04-22 Thread Nathan Fontenot
From: Jesse Larrew 

The new PRRN firmware feature provides a more convenient and event-driven
interface than VPHN for notifying Linux of changes to the NUMA affinity of
platform resources. However, for practical reasons, it may not be feasible
for some customers to update to the latest firmware. For these customers,
the VPHN feature supported on previous firmware versions may still be the
best option.

The VPHN feature was previously disabled due to races with the load
balancing code when accessing the NUMA cpu maps, but the new stop_machine()
approach protects the NUMA cpu maps from these concurrent accesses. It
should be safe to re-enable this feature now.

Signed-off-by: Nathan Fontenot 
---
 arch/powerpc/mm/numa.c |3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

Index: powerpc/arch/powerpc/mm/numa.c
===
--- powerpc.orig/arch/powerpc/mm/numa.c 2013-04-22 09:39:02.0 -0500
+++ powerpc/arch/powerpc/mm/numa.c  2013-04-22 09:46:13.0 -0500
@@ -1572,9 +1572,8 @@
vphn_enabled = 0;
rc = of_reconfig_notifier_register(&dt_update_nb);
}
-   } else if (0 && firmware_has_feature(FW_FEATURE_VPHN) &&
+   } else if (firmware_has_feature(FW_FEATURE_VPHN) &&
   get_lppaca()->shared_proc) {
-   /* Disabled until races with load balancing are fixed */
if (!vphn_enabled) {
prrn_enabled = 0;
vphn_enabled = 1;

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH v3 9/12] Update NUMA VDSO information

2013-04-22 Thread Nathan Fontenot
From: Jesse Larrew 

The following patch adds vdso_getcpu_init(), which stores the NUMA node for
a cpu in SPRG3:

Commit 18ad51dd34 ("powerpc: Add VDSO version of getcpu") adds
vdso_getcpu_init(), which stores the NUMA node for a cpu in SPRG3.

This patch ensures that this information is also updated when the NUMA
affinity of a cpu changes.

Signed-off-by: Nathan Fontenot 
---
 arch/powerpc/mm/numa.c |8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

Index: powerpc/arch/powerpc/mm/numa.c
===
--- powerpc.orig/arch/powerpc/mm/numa.c 2013-04-18 09:10:11.0 -0500
+++ powerpc/arch/powerpc/mm/numa.c  2013-04-22 09:39:02.0 -0500
@@ -30,6 +30,7 @@
 #include 
 #include 
 #include 
+#include 
 
 static int numa_enabled = 1;
 
@@ -1434,6 +1435,7 @@
unregister_cpu_under_node(update->cpu, update->old_nid);
unmap_cpu_from_node(update->cpu);
map_cpu_to_node(update->cpu, update->new_nid);
+   vdso_getcpu_init();
register_cpu_under_node(update->cpu, update->new_nid);
}
 
@@ -1449,6 +1451,7 @@
unsigned int cpu, changed = 0;
struct topology_update_data *updates, *ud;
unsigned int associativity[VPHN_ASSOC_BUFSIZE] = {0};
+   cpumask_t updated_cpus;
struct device *dev;
int weight, i = 0;
 
@@ -1460,6 +1463,8 @@
if (!updates)
return 0;
 
+   cpumask_clear(&updated_cpus);
+
for_each_cpu(cpu, &cpu_associativity_changes_mask) {
ud = &updates[i++];
ud->cpu = cpu;
@@ -1470,12 +1475,13 @@
ud->new_nid = first_online_node;
 
ud->old_nid = numa_cpu_lookup_table[cpu];
+   cpumask_set_cpu(cpu, &updated_cpus);
 
if (i < weight)
ud->next = &updates[i];
}
 
-   stop_machine(update_cpu_topology, &updates[0], cpu_online_mask);
+   stop_machine(update_cpu_topology, &updates[0], &updated_cpus);
 
for (ud = &updates[0]; ud; ud = ud->next) {
dev = get_cpu_device(ud->cpu);

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH v3 8/12] Use stop machine to update cpu maps

2013-04-22 Thread Nathan Fontenot
The new PRRN firmware feature allows CPU and memory resources to be
transparently reassigned across NUMA boundaries. When this happens, the
kernel must update the node maps to reflect the new affinity information.

Although the NUMA maps can be protected by locking primitives during the
update itself, this is insufficient to prevent concurrent accesses to these
structures. Since cpumask_of_node() hands out a pointer to these
structures, they can still be modified outside of the lock. Furthermore,
tracking down each usage of these pointers and adding locks would be quite
invasive and difficult to maintain.

The approach used is to make a list of affected cpus and call stop_machine
to have the update routine run on each of the affected cpus allowing them
to update themselves. Each cpu finds itself in the list of cpus and makes
the appropriate updates. We need to have each cpu do this for itself to
handle calls to vdso_getcpu_init(), which is added in a subsequent patch.

Situations like these are best handled using stop_machine(). Since the NUMA
affinity updates are exceptionally rare events, this approach has the
benefit of not adding any overhead while accessing the NUMA maps during
normal operation.

Signed-off-by: Nathan Fontenot 
---
 arch/powerpc/mm/numa.c |   82 ++---
 1 file changed, 64 insertions(+), 18 deletions(-)

Index: powerpc/arch/powerpc/mm/numa.c
===
--- powerpc.orig/arch/powerpc/mm/numa.c 2013-04-17 14:04:12.0 -0500
+++ powerpc/arch/powerpc/mm/numa.c  2013-04-18 09:10:11.0 -0500
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -1254,6 +1255,13 @@
 
 /* Virtual Processor Home Node (VPHN) support */
 #ifdef CONFIG_PPC_SPLPAR
+struct topology_update_data {
+   struct topology_update_data *next;
+   unsigned int cpu;
+   int old_nid;
+   int new_nid;
+};
+
 static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS];
 static cpumask_t cpu_associativity_changes_mask;
 static int vphn_enabled;
@@ -1405,41 +1413,79 @@
 }
 
 /*
+ * Update the CPU maps and sysfs entries for a single CPU when its NUMA
+ * characteristics change. This function doesn't perform any locking and is
+ * only safe to call from stop_machine().
+ */
+static int update_cpu_topology(void *data)
+{
+   struct topology_update_data *update;
+   unsigned long cpu;
+
+   if (!data)
+   return -EINVAL;
+
+   cpu = get_cpu();
+
+   for (update = data; update; update = update->next) {
+   if (cpu != update->cpu)
+   continue;
+
+   unregister_cpu_under_node(update->cpu, update->old_nid);
+   unmap_cpu_from_node(update->cpu);
+   map_cpu_to_node(update->cpu, update->new_nid);
+   register_cpu_under_node(update->cpu, update->new_nid);
+   }
+
+   return 0;
+}
+
+/*
  * Update the node maps and sysfs entries for each cpu whose home node
  * has changed. Returns 1 when the topology has changed, and 0 otherwise.
  */
 int arch_update_cpu_topology(void)
 {
-   int cpu, nid, old_nid, changed = 0;
+   unsigned int cpu, changed = 0;
+   struct topology_update_data *updates, *ud;
unsigned int associativity[VPHN_ASSOC_BUFSIZE] = {0};
struct device *dev;
+   int weight, i = 0;
+
+   weight = cpumask_weight(&cpu_associativity_changes_mask);
+   if (!weight)
+   return 0;
+
+   updates = kzalloc(weight * (sizeof(*updates)), GFP_KERNEL);
+   if (!updates)
+   return 0;
 
for_each_cpu(cpu, &cpu_associativity_changes_mask) {
+   ud = &updates[i++];
+   ud->cpu = cpu;
vphn_get_associativity(cpu, associativity);
-   nid = associativity_to_nid(associativity);
+   ud->new_nid = associativity_to_nid(associativity);
 
-   if (nid < 0 || !node_online(nid))
-   nid = first_online_node;
+   if (ud->new_nid < 0 || !node_online(ud->new_nid))
+   ud->new_nid = first_online_node;
 
-   old_nid = numa_cpu_lookup_table[cpu];
+   ud->old_nid = numa_cpu_lookup_table[cpu];
 
-   /* Disable hotplug while we update the cpu
-* masks and sysfs.
-*/
-   get_online_cpus();
-   unregister_cpu_under_node(cpu, old_nid);
-   unmap_cpu_from_node(cpu);
-   map_cpu_to_node(cpu, nid);
-   register_cpu_under_node(cpu, nid);
-   put_online_cpus();
+   if (i < weight)
+   ud->next = &updates[i];
+   }
+
+   stop_machine(update_cpu_topology, &updates[0], cpu_online_mask);
 
-   dev = get_cpu_de

[PATCH v3 7/12] Use stop machine to update cpu maps

2013-04-22 Thread Nathan Fontenot
From: Jesse Larrew 

Platform events such as partition migration or the new PRRN firmware
feature can cause the NUMA characteristics of a CPU to change, and these
changes will be reflected in the device tree nodes for the affected
CPUs.

This patch registers a handler for Open Firmware device tree updates
and reconfigures the CPU and node maps whenever the associativity
changes. Currently, this is accomplished by marking the affected CPUs in
the cpu_associativity_changes_mask and allowing
arch_update_cpu_topology() to retrieve the new associativity information
using hcall_vphn().

Protecting the NUMA cpu maps from concurrent access during an update
operation will be addressed in a subsequent patch in this series.

Signed-off-by: Nathan Fontenot 
---

 arch/powerpc/include/asm/firmware.h   |3 
 arch/powerpc/include/asm/prom.h   |1 
 arch/powerpc/mm/numa.c|   99 ++
 arch/powerpc/platforms/pseries/firmware.c |1 
 4 files changed, 79 insertions(+), 25 deletions(-)

Index: powerpc/arch/powerpc/include/asm/prom.h
===
--- powerpc.orig/arch/powerpc/include/asm/prom.h2013-04-15 
14:03:52.0 -0500
+++ powerpc/arch/powerpc/include/asm/prom.h 2013-04-15 14:04:47.0 
-0500
@@ -128,6 +128,7 @@
 #define OV5_CMO0x0480  /* Cooperative Memory 
Overcommitment */
 #define OV5_XCMO   0x0440  /* Page Coalescing */
 #define OV5_TYPE1_AFFINITY 0x0580  /* Type 1 NUMA affinity */
+#define OV5_PRRN   0x0540  /* Platform Resource Reassignment */
 #define OV5_PFO_HW_RNG 0x0E80  /* PFO Random Number Generator */
 #define OV5_PFO_HW_842 0x0E40  /* PFO Compression Accelerator */
 #define OV5_PFO_HW_ENCR0x0E20  /* PFO Encryption Accelerator */
Index: powerpc/arch/powerpc/mm/numa.c
===
--- powerpc.orig/arch/powerpc/mm/numa.c 2013-04-15 14:04:46.0 -0500
+++ powerpc/arch/powerpc/mm/numa.c  2013-04-15 14:06:20.0 -0500
@@ -1257,7 +1257,8 @@
 static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS];
 static cpumask_t cpu_associativity_changes_mask;
 static int vphn_enabled;
-static void set_topology_timer(void);
+static int prrn_enabled;
+static void reset_topology_timer(void);
 
 /*
  * Store the current values of the associativity change counters in the
@@ -1293,11 +1294,9 @@
  */
 static int update_cpu_associativity_changes_mask(void)
 {
-   int cpu, nr_cpus = 0;
+   int cpu;
cpumask_t *changes = &cpu_associativity_changes_mask;
 
-   cpumask_clear(changes);
-
for_each_possible_cpu(cpu) {
int i, changed = 0;
u8 *counts = vphn_cpu_change_counts[cpu];
@@ -1311,11 +1310,10 @@
}
if (changed) {
cpumask_set_cpu(cpu, changes);
-   nr_cpus++;
}
}
 
-   return nr_cpus;
+   return cpumask_weight(changes);
 }
 
 /*
@@ -1416,7 +1414,7 @@
unsigned int associativity[VPHN_ASSOC_BUFSIZE] = {0};
struct device *dev;
 
-   for_each_cpu(cpu,&cpu_associativity_changes_mask) {
+   for_each_cpu(cpu, &cpu_associativity_changes_mask) {
vphn_get_associativity(cpu, associativity);
nid = associativity_to_nid(associativity);
 
@@ -1438,6 +1436,7 @@
dev = get_cpu_device(cpu);
if (dev)
kobject_uevent(&dev->kobj, KOBJ_CHANGE);
+   cpumask_clear_cpu(cpu, &cpu_associativity_changes_mask);
changed = 1;
}
 
@@ -1457,37 +1456,80 @@
 
 static void topology_timer_fn(unsigned long ignored)
 {
-   if (!vphn_enabled)
-   return;
-   if (update_cpu_associativity_changes_mask() > 0)
+   if (prrn_enabled && cpumask_weight(&cpu_associativity_changes_mask))
topology_schedule_update();
-   set_topology_timer();
+   else if (vphn_enabled) {
+   if (update_cpu_associativity_changes_mask() > 0)
+   topology_schedule_update();
+   reset_topology_timer();
+   }
 }
 static struct timer_list topology_timer =
TIMER_INITIALIZER(topology_timer_fn, 0, 0);
 
-static void set_topology_timer(void)
+static void reset_topology_timer(void)
 {
topology_timer.data = 0;
topology_timer.expires = jiffies + 60 * HZ;
-   add_timer(&topology_timer);
+   mod_timer(&topology_timer, topology_timer.expires);
+}
+
+static void stage_topology_update(int core_id)
+{
+   cpumask_or(&cpu_associativity_changes_mask,
+   &cpu_associativity_changes_mask, cpu_sibling_mask(core_id));
+   reset_topology_timer();
 }
 
+static int dt_update_callback(struct notifier_block *nb,
+  

[PATCH v3 6/12] Update numa.c to use updated firmware_has_feature()

2013-04-22 Thread Nathan Fontenot
Update the numa code to use the updated firmware_has_feature() when checking
for type 1 affinity.

Signed-off-by: Nathan Fontenot 
---
 arch/powerpc/mm/numa.c |   22 +++---
 1 file changed, 3 insertions(+), 19 deletions(-)

Index: powerpc/arch/powerpc/mm/numa.c
===
--- powerpc.orig/arch/powerpc/mm/numa.c 2013-04-15 09:18:07.0 -0500
+++ powerpc/arch/powerpc/mm/numa.c  2013-04-15 09:54:59.0 -0500
@@ -291,9 +291,7 @@
 static int __init find_min_common_depth(void)
 {
int depth;
-   struct device_node *chosen;
struct device_node *root;
-   const char *vec5;
 
if (firmware_has_feature(FW_FEATURE_OPAL))
root = of_find_node_by_path("/ibm,opal");
@@ -325,24 +323,10 @@
 
distance_ref_points_depth /= sizeof(int);
 
-#define VEC5_AFFINITY_BYTE 5
-#define VEC5_AFFINITY  0x80
-
-   if (firmware_has_feature(FW_FEATURE_OPAL))
+   if (firmware_has_feature(FW_FEATURE_OPAL) ||
+   firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) {
+   dbg("Using form 1 affinity\n");
form1_affinity = 1;
-   else {
-   chosen = of_find_node_by_path("/chosen");
-   if (chosen) {
-   vec5 = of_get_property(chosen,
-  "ibm,architecture-vec-5", NULL);
-   if (vec5 && (vec5[VEC5_AFFINITY_BYTE] &
-   VEC5_AFFINITY)) {
-   dbg("Using form 1 affinity\n");
-   form1_affinity = 1;
-   }
-
-   of_node_put(chosen);
-   }
}
 
if (form1_affinity) {

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH v3 5/12] Update firmware_has_feature() to check architecture bits

2013-04-22 Thread Nathan Fontenot
The firmware_has_feature() function makes it easy to check for supported
features of the hypervisor. This patch extends the capability of the
firmware_has_feature() function to include checking for specified bits
in vector 5 of the architecture vector as is reported in the device tree.

As part of this the #defines used for the architecture vector are
re-defined such that the vector 5 options have the vector
index and the feature bits encoded into them. This makes for a much
simpler design to update firmware_has_feature() to check for bits
in the architecture vector.

Signed-off-by: Nathan Fontenot 
---
 arch/powerpc/include/asm/firmware.h   |4 +-
 arch/powerpc/include/asm/prom.h   |   45 ---
 arch/powerpc/kernel/prom_init.c   |   23 ++
 arch/powerpc/platforms/pseries/firmware.c |   49 +-
 arch/powerpc/platforms/pseries/pseries.h  |5 ++-
 arch/powerpc/platforms/pseries/setup.c|   40 
 6 files changed, 113 insertions(+), 53 deletions(-)

Index: powerpc/arch/powerpc/include/asm/prom.h
===
--- powerpc.orig/arch/powerpc/include/asm/prom.h2013-04-17 
13:43:13.0 -0500
+++ powerpc/arch/powerpc/include/asm/prom.h 2013-04-17 13:51:46.0 
-0500
@@ -111,31 +111,27 @@
 /* Option vector 4: IBM PAPR implementation */
 #define OV4_MIN_ENT_CAP0x01/* minimum VP entitled capacity 
*/
 
-/* Option vector 5: PAPR/OF options supported */
-#define OV5_LPAR   0x80/* logical partitioning supported */
-#define OV5_SPLPAR 0x40/* shared-processor LPAR supported */
+/* Option vector 5: PAPR/OF options supported
+ * Thses bits are also used for the platform_has_feature() call so
+ * we encode the vector index in the define and use the OV5_FEAT()
+ * and OV5_INDX() macros to extract the desired information.
+ */
+#define OV5_FEAT(x)((x) & 0xff)
+#define OV5_INDX(x)((x) >> 8)
+#define OV5_LPAR   0x0280  /* logical partitioning supported */
+#define OV5_SPLPAR 0x0240  /* shared-processor LPAR supported */
 /* ibm,dynamic-reconfiguration-memory property supported */
-#define OV5_DRCONF_MEMORY  0x20
-#define OV5_LARGE_PAGES0x10/* large pages supported */
-#define OV5_DONATE_DEDICATE_CPU0x02/* donate dedicated CPU support 
*/
-/* PCIe/MSI support.  Without MSI full PCIe is not supported */
-#ifdef CONFIG_PCI_MSI
-#define OV5_MSI0x01/* PCIe/MSI support */
-#else
-#define OV5_MSI0x00
-#endif /* CONFIG_PCI_MSI */
-#ifdef CONFIG_PPC_SMLPAR
-#define OV5_CMO0x80/* Cooperative Memory 
Overcommitment */
-#define OV5_XCMO   0x40/* Page Coalescing */
-#else
-#define OV5_CMO0x00
-#define OV5_XCMO   0x00
-#endif
-#define OV5_TYPE1_AFFINITY 0x80/* Type 1 NUMA affinity */
-#define OV5_PFO_HW_RNG 0x80/* PFO Random Number Generator */
-#define OV5_PFO_HW_842 0x40/* PFO Compression Accelerator */
-#define OV5_PFO_HW_ENCR0x20/* PFO Encryption Accelerator */
-#define OV5_SUB_PROCESSORS 0x01/* 1,2,or 4 Sub-Processors supported */
+#define OV5_DRCONF_MEMORY  0x0220
+#define OV5_LARGE_PAGES0x0210  /* large pages supported */
+#define OV5_DONATE_DEDICATE_CPU0x0202  /* donate dedicated CPU support 
*/
+#define OV5_MSI0x0201  /* PCIe/MSI support */
+#define OV5_CMO0x0480  /* Cooperative Memory 
Overcommitment */
+#define OV5_XCMO   0x0440  /* Page Coalescing */
+#define OV5_TYPE1_AFFINITY 0x0580  /* Type 1 NUMA affinity */
+#define OV5_PFO_HW_RNG 0x0E80  /* PFO Random Number Generator */
+#define OV5_PFO_HW_842 0x0E40  /* PFO Compression Accelerator */
+#define OV5_PFO_HW_ENCR0x0E20  /* PFO Encryption Accelerator */
+#define OV5_SUB_PROCESSORS 0x0F01  /* 1,2,or 4 Sub-Processors supported */
 
 /* Option Vector 6: IBM PAPR hints */
 #define OV6_LINUX  0x02/* Linux is our OS */
@@ -145,6 +141,7 @@
  * followed by # option vectors - 1, followed by the option vectors.
  */
 extern unsigned char ibm_architecture_vec[];
+bool platform_has_feature(unsigned int);
 #endif
 
 /* These includes are put at the bottom because they may contain things
Index: powerpc/arch/powerpc/kernel/prom_init.c
===
--- powerpc.orig/arch/powerpc/kernel/prom_init.c2013-04-17 
13:43:13.0 -0500
+++ powerpc/arch/powerpc/kernel/prom_init.c 2013-04-17 13:51:46.0 
-0500
@@ -684,11 +684,21 @@
/* option vector 5: PAPR/OF options */
19 - 2, /* length */
0,  /* don't ign

[PATCH v3 4/12] Move architecture vector definitions to prom.h

2013-04-22 Thread Nathan Fontenot
As part of handling PRRN events we will need to check the
vector 5 portion of the architecture bits reported in the device tree
to ensure that PRRN event handling is enabled. In order to do this
firmware_has_feature is updated (in a subsequent patch) to
make this check.  To avoid having to re-define bits in the architecture
vector the bits are moved to prom.h.

This patch is the first step in updating firmware_has_feature
by simply moving the bit definitions from prom_init.c to asm/prom.h.
There are no functional changes.

Signed-off-by: Nathan Fontenot 

---
 arch/powerpc/include/asm/prom.h |   73 ++
 arch/powerpc/kernel/prom_init.c |   75 +++-
 2 files changed, 79 insertions(+), 69 deletions(-)

Index: powerpc/arch/powerpc/include/asm/prom.h
===
--- powerpc.orig/arch/powerpc/include/asm/prom.h2013-04-16 
21:25:16.0 -0500
+++ powerpc/arch/powerpc/include/asm/prom.h 2013-04-17 13:43:13.0 
-0500
@@ -74,6 +74,79 @@
 #define DRCONF_MEM_AI_INVALID  0x0040
 #define DRCONF_MEM_RESERVED0x0080
 
+#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV)
+/*
+ * There are two methods for telling firmware what our capabilities are.
+ * Newer machines have an "ibm,client-architecture-support" method on the
+ * root node.  For older machines, we have to call the "process-elf-header"
+ * method in the /packages/elf-loader node, passing it a fake 32-bit
+ * ELF header containing a couple of PT_NOTE sections that contain
+ * structures that contain various information.
+ */
+
+/* New method - extensible architecture description vector. */
+
+/* Option vector bits - generic bits in byte 1 */
+#define OV_IGNORE  0x80/* ignore this vector */
+#define OV_CESSATION_POLICY0x40/* halt if unsupported option present*/
+
+/* Option vector 1: processor architectures supported */
+#define OV1_PPC_2_00   0x80/* set if we support PowerPC 2.00 */
+#define OV1_PPC_2_01   0x40/* set if we support PowerPC 2.01 */
+#define OV1_PPC_2_02   0x20/* set if we support PowerPC 2.02 */
+#define OV1_PPC_2_03   0x10/* set if we support PowerPC 2.03 */
+#define OV1_PPC_2_04   0x08/* set if we support PowerPC 2.04 */
+#define OV1_PPC_2_05   0x04/* set if we support PowerPC 2.05 */
+#define OV1_PPC_2_06   0x02/* set if we support PowerPC 2.06 */
+#define OV1_PPC_2_07   0x01/* set if we support PowerPC 2.07 */
+
+/* Option vector 2: Open Firmware options supported */
+#define OV2_REAL_MODE  0x20/* set if we want OF in real mode */
+
+/* Option vector 3: processor options supported */
+#define OV3_FP 0x80/* floating point */
+#define OV3_VMX0x40/* VMX/Altivec */
+#define OV3_DFP0x20/* decimal FP */
+
+/* Option vector 4: IBM PAPR implementation */
+#define OV4_MIN_ENT_CAP0x01/* minimum VP entitled capacity 
*/
+
+/* Option vector 5: PAPR/OF options supported */
+#define OV5_LPAR   0x80/* logical partitioning supported */
+#define OV5_SPLPAR 0x40/* shared-processor LPAR supported */
+/* ibm,dynamic-reconfiguration-memory property supported */
+#define OV5_DRCONF_MEMORY  0x20
+#define OV5_LARGE_PAGES0x10/* large pages supported */
+#define OV5_DONATE_DEDICATE_CPU0x02/* donate dedicated CPU support 
*/
+/* PCIe/MSI support.  Without MSI full PCIe is not supported */
+#ifdef CONFIG_PCI_MSI
+#define OV5_MSI0x01/* PCIe/MSI support */
+#else
+#define OV5_MSI0x00
+#endif /* CONFIG_PCI_MSI */
+#ifdef CONFIG_PPC_SMLPAR
+#define OV5_CMO0x80/* Cooperative Memory 
Overcommitment */
+#define OV5_XCMO   0x40/* Page Coalescing */
+#else
+#define OV5_CMO0x00
+#define OV5_XCMO   0x00
+#endif
+#define OV5_TYPE1_AFFINITY 0x80/* Type 1 NUMA affinity */
+#define OV5_PFO_HW_RNG 0x80/* PFO Random Number Generator */
+#define OV5_PFO_HW_842 0x40/* PFO Compression Accelerator */
+#define OV5_PFO_HW_ENCR0x20/* PFO Encryption Accelerator */
+#define OV5_SUB_PROCESSORS 0x01/* 1,2,or 4 Sub-Processors supported */
+
+/* Option Vector 6: IBM PAPR hints */
+#define OV6_LINUX  0x02/* Linux is our OS */
+
+/*
+ * The architecture vector has an array of PVR mask/value pairs,
+ * followed by # option vectors - 1, followed by the option vectors.
+ */
+extern unsigned char ibm_architecture_vec[];
+#endif
+
 /* These includes are put at the bottom because they may contain things
  * that are overridden by this file.  Ideally they shouldn't be included
  * by this file, but there are

[PATCH v3 3/12] Add PRRN event handler

2013-04-22 Thread Nathan Fontenot
From: Jesse Larrew 

A PRRN event is signaled via the RTAS event-scan mechanism, which
returns a Hot Plug Event message "fixed part" indicating "Platform
Resource Reassignment". In response to the Hot Plug Event message,
we must call ibm,update-nodes to determine which resources were
reassigned and then ibm,update-properties to obtain the new affinity
information about those resources.

The PRRN event-scan RTAS message contains only the "fixed part" with
the "Type" field set to the value 160 and no Extended Event Log. The
four-byte Extended Event Log Length field is repurposed (since no
Extended Event Log message is included) to pass the "scope" parameter
that causes the ibm,update-nodes to return the nodes affected by the
specific resource reassignment.

This patch adds a handler for PRRN RTAS events. The function
pseries_devicetree_update() (from mobility.c) is used to make the
ibm,update-nodes/ibm,update-properties RTAS calls. Updating the NUMA maps
(handled by a subsequent patch) will require significant processing,
so pseries_devicetree_update() is called from an asynchronous workqueue
to allow event processing to continue. 

PRRN RTAS events on pseries systems are rare events that have to be
initiated from the HMC console for the system by an IBM tech. This allows
us to assume that these events are widely spaced. Additionally, all work
on the queue is flushed before handling any new work to ensure we only have
one event in flight being handled at a time.

Signed-off-by: Nathan Fontenot 
---
 arch/powerpc/include/asm/rtas.h |2 ++
 arch/powerpc/kernel/rtasd.c |   37 -
 2 files changed, 38 insertions(+), 1 deletion(-)

Index: powerpc/arch/powerpc/include/asm/rtas.h
===
--- powerpc.orig/arch/powerpc/include/asm/rtas.h2013-04-17 
12:58:33.0 -0500
+++ powerpc/arch/powerpc/include/asm/rtas.h 2013-04-17 13:24:06.0 
-0500
@@ -143,6 +143,8 @@
 #define RTAS_TYPE_PMGM_TIME_ALARM  0x6f
 #define RTAS_TYPE_PMGM_CONFIG_CHANGE   0x70
 #define RTAS_TYPE_PMGM_SERVICE_PROC0x71
+/* Platform Resource Reassignment Notification */
+#define RTAS_TYPE_PRRN 0xA0
 
 /* RTAS check-exception vector offset */
 #define RTAS_VECTOR_EXTERNAL_INTERRUPT 0x500
Index: powerpc/arch/powerpc/kernel/rtasd.c
===
--- powerpc.orig/arch/powerpc/kernel/rtasd.c2013-04-17 12:55:11.0 
-0500
+++ powerpc/arch/powerpc/kernel/rtasd.c 2013-04-17 13:27:00.0 -0500
@@ -87,6 +87,8 @@
return "Resource Deallocation Event";
case RTAS_TYPE_DUMP:
return "Dump Notification Event";
+   case RTAS_TYPE_PRRN:
+   return "Platform Resource Reassignment Event";
}
 
return rtas_type[0];
@@ -265,7 +267,40 @@
spin_unlock_irqrestore(&rtasd_log_lock, s);
return;
}
+}
+
+static s32 update_scope;
+
+static void prrn_work_fn(struct work_struct *work)
+{
+   /*
+* For PRRN, we must pass the negative of the scope value in
+* the RTAS event.
+*/
+   if (ppc_md.update_devicetree)
+   ppc_md.update_devicetree(-update_scope);
+}
+
+static DECLARE_WORK(prrn_work, prrn_work_fn);
+
+void prrn_schedule_update(u32 scope)
+{
+   flush_work(&prrn_work);
+   update_scope = scope;
+   schedule_work(&prrn_work);
+}
+
+static void pseries_handle_event(const struct rtas_error_log *log)
+{
+   pSeries_log_error((char *)log, ERR_TYPE_RTAS_LOG, 0);
+
+   if (log->type == RTAS_TYPE_PRRN)
+   /* For PRRN Events the extended log length is used to denote
+* the scope for calling rtas update-nodes.
+*/
+   prrn_schedule_update(log->extended_log_length);
 
+   return;
 }
 
 static int rtas_log_open(struct inode * inode, struct file * file)
@@ -389,7 +424,7 @@
}
 
if (error == 0)
-   pSeries_log_error(logdata, ERR_TYPE_RTAS_LOG, 0);
+   pseries_handle_event((struct rtas_error_log *)logdata);
 
} while(error == 0);
 }

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH v3 2/12] Correct buffer parsing in update-properties

2013-04-22 Thread Nathan Fontenot
Correct parsing of the buffer returned from ibm,update-properties. The first
element is a length and the path to the property which is slightly different
from the list of properties in the buffer so we need to specifically
handle this.

Signed-off-by: Nathan Fontenot 
---
 arch/powerpc/platforms/pseries/mobility.c |   20 
 1 file changed, 16 insertions(+), 4 deletions(-)

Index: powerpc/arch/powerpc/platforms/pseries/mobility.c
===
--- powerpc.orig/arch/powerpc/platforms/pseries/mobility.c  2013-04-17 
13:27:23.0 -0500
+++ powerpc/arch/powerpc/platforms/pseries/mobility.c   2013-04-17 
13:28:58.0 -0500
@@ -135,6 +135,7 @@
char *prop_data;
char *rtas_buf;
int update_properties_token;
+   u32 vd;
 
update_properties_token = rtas_token("ibm,update-properties");
if (update_properties_token == RTAS_UNKNOWN_SERVICE)
@@ -161,13 +162,24 @@
 
prop_data = rtas_buf + sizeof(*upwa);
 
-   for (i = 0; i < upwa->nprops; i++) {
+   /* The first element of the buffer is the path of the node
+* being updated in the form of a 8 byte string length
+* followed by the string. Skip past this to get to the
+* properties being updated.
+*/
+   vd = *prop_data++;
+   prop_data += vd;
+
+   /* The path we skipped over is counted as one of the elements
+* returned so start counting at one.
+*/
+   for (i = 1; i < upwa->nprops; i++) {
char *prop_name;
-   u32 vd;
 
-   prop_name = prop_data + 1;
+   prop_name = prop_data;
prop_data += strlen(prop_name) + 1;
-   vd = *prop_data++;
+   vd = *(u32 *)prop_data;
+   prop_data += sizeof(vd);
 
switch (vd) {
case 0x:

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH v3 1/12] Create a powerpc update_devicetree interface

2013-04-22 Thread Nathan Fontenot
Newer firmware on Power systems can transparently reassign platform resources
(CPU and Memory) in use. For instance, if a processor or memory unit is
predicted to fail, the platform may transparently move the processing to an
equivalent unused processor or the memory state to an equivalent unused
memory unit. However, reassigning resources across NUMA boundaries may alter
the performance of the partition. When such reassignment is necessary, the
Platform Resource Reassignment Notification (PRRN) option provides a
mechanism to inform the Linux kernel of changes to the NUMA affinity of
its platform resources.

When rtasd receives a PRRN event, it needs to make a series of RTAS
calls (ibm,update-nodes and ibm,update-properties) to retrieve the
updated device tree information. These calls are already handled in the
pseries_devicetree_update() routine used in partition migration.

This patch exposes a method for updating the device tree via
ppc_md.update_devicetree that takes a single 32-bit value as a parameter.
For pseries platforms this is the existing pseries_devicetree_update routine
which is updated to take the new parameter which is a scope value
to indicate the reason for making the rtas calls. This parameter is
required by the ibm,update-nodes/ibm,update-properties RTAS calls, and
the appropriate value is contained within the RTAS event for PRRN
notifications. In pseries_devicetree_update() it was previously
hard-coded to 1, the scope value for partition migration.

Signed-off-by: Nathan Fontenot 
---
 arch/powerpc/include/asm/machdep.h|2 ++
 arch/powerpc/include/asm/rtas.h   |1 +
 arch/powerpc/kernel/rtas.c|   10 ++
 arch/powerpc/platforms/pseries/mobility.c |   24 +++-
 4 files changed, 28 insertions(+), 9 deletions(-)

Index: powerpc/arch/powerpc/include/asm/rtas.h
===
--- powerpc.orig/arch/powerpc/include/asm/rtas.h2013-04-15 
09:18:10.0 -0500
+++ powerpc/arch/powerpc/include/asm/rtas.h 2013-04-17 12:58:33.0 
-0500
@@ -276,6 +276,7 @@
const char *uname, int depth, void *data);
 
 extern void pSeries_log_error(char *buf, unsigned int err_type, int fatal);
+extern int update_devicetree(s32 scope);
 
 #ifdef CONFIG_PPC_RTAS_DAEMON
 extern void rtas_cancel_event_scan(void);
Index: powerpc/arch/powerpc/platforms/pseries/mobility.c
===
--- powerpc.orig/arch/powerpc/platforms/pseries/mobility.c  2013-04-15 
09:18:10.0 -0500
+++ powerpc/arch/powerpc/platforms/pseries/mobility.c   2013-04-17 
13:01:08.0 -0500
@@ -19,6 +19,7 @@
 #include 
 
 #include 
+#include 
 #include "pseries.h"
 
 static struct kobject *mobility_kobj;
@@ -37,14 +38,16 @@
 #define UPDATE_DT_NODE 0x0200
 #define ADD_DT_NODE0x0300
 
-static int mobility_rtas_call(int token, char *buf)
+#define MIGRATION_SCOPE(1)
+
+static int mobility_rtas_call(int token, char *buf, s32 scope)
 {
int rc;
 
spin_lock(&rtas_data_buf_lock);
 
memcpy(rtas_data_buf, buf, RTAS_DATA_BUF_SIZE);
-   rc = rtas_call(token, 2, 1, NULL, rtas_data_buf, 1);
+   rc = rtas_call(token, 2, 1, NULL, rtas_data_buf, scope);
memcpy(buf, rtas_data_buf, RTAS_DATA_BUF_SIZE);
 
spin_unlock(&rtas_data_buf_lock);
@@ -123,7 +126,7 @@
return 0;
 }
 
-static int update_dt_node(u32 phandle)
+static int update_dt_node(u32 phandle, s32 scope)
 {
struct update_props_workarea *upwa;
struct device_node *dn;
@@ -151,7 +154,8 @@
upwa->phandle = phandle;
 
do {
-   rc = mobility_rtas_call(update_properties_token, rtas_buf);
+   rc = mobility_rtas_call(update_properties_token, rtas_buf,
+   scope);
if (rc < 0)
break;
 
@@ -219,7 +223,7 @@
return rc;
 }
 
-static int pseries_devicetree_update(void)
+static int pseries_devicetree_update(s32 scope)
 {
char *rtas_buf;
u32 *data;
@@ -235,7 +239,7 @@
return -ENOMEM;
 
do {
-   rc = mobility_rtas_call(update_nodes_token, rtas_buf);
+   rc = mobility_rtas_call(update_nodes_token, rtas_buf, scope);
if (rc && rc != 1)
break;
 
@@ -256,7 +260,7 @@
delete_dt_node(phandle);
break;
case UPDATE_DT_NODE:
-   update_dt_node(phandle);
+   update_dt_node(phandle, scope);
break;
case ADD_DT_NODE:
drc_index = *data++;
@@ -276,7 +280,7 @@
int rc;
int

[PATCH v3 0/12] NUMA CPU Reconfiguration using PRRN

2013-04-22 Thread Nathan Fontenot
Newer firmware on Power systems can transparently reassign platform resources
(CPU and Memory) in use. For instance, if a processor or memory unit is
predicted to fail, the platform may transparently move the processing to an
equivalent unused processor or the memory state to an equivalent unused
memory unit. However, reassigning resources across NUMA boundaries may alter
the performance of the partition. When such reassignment is necessary, the
Platform Resource Reassignment Notification (PRRN) option provides a
mechanism to inform the Linux kernel of changes to the NUMA affinity of
its platform resources.

PRRN Events are RTAS events sent up through the event-scan mechanism on
Power. When these events are received the system needs to get the updated
device tree affinity information for the affected CPUs/memory via the
rtas update-nodes and update-properties calls. This information is then
used to update the NUMA affinity of the CPUs/Memory in the kernel.

This patch set adds the ability to recognize PRRN events, update the device
tree and kernel information for CPUs (memory will be handled in a later
patch), and add an interface to enable/disable topology updates from /proc.

Additionally, these updates solve an existing problem with the VPHN (Virtual
Processor Home Node) capability and allow us to re-enable this feature.

Nathan Fontenot

 arch/powerpc/include/asm/firmware.h   |3 
 arch/powerpc/include/asm/prom.h   |   46 ++--
 arch/powerpc/include/asm/rtas.h   |2 
 arch/powerpc/kernel/prom_init.c   |   98 ++
 arch/powerpc/kernel/rtasd.c   |   37 +++
 arch/powerpc/mm/numa.c|  214 +++---
 arch/powerpc/platforms/pseries/firmware.c |1 
 arch/powerpc/platforms/pseries/mobility.c |   24 +-
 powerpc/arch/powerpc/include/asm/firmware.h   |4 
 powerpc/arch/powerpc/include/asm/machdep.h|2 
 powerpc/arch/powerpc/include/asm/prom.h   |   73 +++
 powerpc/arch/powerpc/include/asm/rtas.h   |1 
 powerpc/arch/powerpc/include/asm/topology.h   |5 
 powerpc/arch/powerpc/kernel/prom_init.c   |2 
 powerpc/arch/powerpc/kernel/rtas.c|   10 +
 powerpc/arch/powerpc/kernel/rtasd.c   |7 
 powerpc/arch/powerpc/mm/numa.c|   62 ++
 powerpc/arch/powerpc/platforms/pseries/firmware.c |   49 -
 powerpc/arch/powerpc/platforms/pseries/mobility.c |   20 +-
 powerpc/arch/powerpc/platforms/pseries/pseries.h  |5 
 powerpc/arch/powerpc/platforms/pseries/setup.c|   40 ++--
 21 files changed, 500 insertions(+), 205 deletions(-)

Updates for v3 of the patchset:

1/12 - Updated to use a ppc_md interface to invoke device tree updates, this
corrects the build break previously seen in patch 2/12 for non-pseries
platforms.

2/12 - New patch in the series to correct the parsing of the buffer returned
from ibm,update-properties rtas call.

5/12 - The parsing of architecture vector 5 has been made more efficient.

7/12 - Correct the #define used in the call to firmware_has_feature()

8/12 - Updated calling of stop_machine() to only call it once per PRRN event.

12/12 - Added inclusion of topology.h to rtasd.c to correct a build failure
on non-pseries platforms.

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH v2 2/11] Add PRRN Event Handler

2013-04-15 Thread Nathan Fontenot
On 04/10/2013 03:30 AM, Michael Ellerman wrote:
> On Mon, Mar 25, 2013 at 01:52:32PM -0500, Nathan Fontenot wrote:
>> From: Jesse Larrew 
>>
>> A PRRN event is signaled via the RTAS event-scan mechanism, which
>> returns a Hot Plug Event message "fixed part" indicating "Platform
>> Resource Reassignment". In response to the Hot Plug Event message,
>> we must call ibm,update-nodes to determine which resources were
>> reassigned and then ibm,update-properties to obtain the new affinity
>> information about those resources.
> ..
> 
>> Index: powerpc/arch/powerpc/kernel/rtasd.c
>> ===
>> --- powerpc.orig/arch/powerpc/kernel/rtasd.c 2013-03-20 08:24:14.0 
>> -0500
>> +++ powerpc/arch/powerpc/kernel/rtasd.c  2013-03-20 08:52:08.0 
>> -0500
>> @@ -87,6 +87,8 @@
>>  return "Resource Deallocation Event";
>>  case RTAS_TYPE_DUMP:
>>  return "Dump Notification Event";
>> +case RTAS_TYPE_PRRN:
>> +return "Platform Resource Reassignment Event";
>>  }
>>  
>>  return rtas_type[0];
>> @@ -265,7 +267,38 @@
>>  spin_unlock_irqrestore(&rtasd_log_lock, s);
>>  return;
>>  }
>> +}
>> +
>> +static s32 update_scope;
>> +
>> +static void prrn_work_fn(struct work_struct *work)
>> +{
>> +/*
>> + * For PRRN, we must pass the negative of the scope value in
>> + * the RTAS event.
>> + */
>> +pseries_devicetree_update(-update_scope);
>> +}
>> +static DECLARE_WORK(prrn_work, prrn_work_fn);
> 
> This breaks the 32-bit build (ppc6xx_defconfig):
> 
> arch/powerpc/kernel/rtasd.c:280: undefined reference to 
> `pseries_devicetree_update'
> 

I'm not seeing this error. rtasd.c compiles fine, but I am hitting another
error later in the build that keeps it from finishing.

arch/powerpc/platforms/52xx/mpc52xx_pic.c: In function ‘mpc52xx_irqhost_map’:
arch/powerpc/platforms/52xx/mpc52xx_pic.c:343: error: ‘irqchip’ may be used 
uninitialized in this function


-Nathan 

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH v2 7/11] Use stop machine to update cpu maps

2013-04-05 Thread Nathan Fontenot
On 04/03/2013 11:46 PM, Paul Mackerras wrote:
> On Mon, Mar 25, 2013 at 01:58:04PM -0500, Nathan Fontenot wrote:
>> From: Jesse Larrew 
>>
>> The new PRRN firmware feature allows CPU and memory resources to be
>> transparently reassigned across NUMA boundaries. When this happens, the
>> kernel must update the node maps to reflect the new affinity
>> information.
>>
>> Although the NUMA maps can be protected by locking primitives during the
>> update itself, this is insufficient to prevent concurrent accesses to these
>> structures. Since cpumask_of_node() hands out a pointer to these
>> structures, they can still be modified outside of the lock. Furthermore,
>> tracking down each usage of these pointers and adding locks would be quite
>> invasive and difficult to maintain.
>>
>> Situations like these are best handled using stop_machine(). Since the NUMA
>> affinity updates are exceptionally rare events, this approach has the
>> benefit of not adding any overhead while accessing the NUMA maps during
>> normal operation.
> 
> I notice you do one stop_machine() call for every cpu whose affinity
> has changed.  Couldn't we update the affinity for them all in one
> stop_machine call?  Given that stopping the whole machine can be quite
> slow, wouldn't it be better to do one call rather than potentially
> many?
> 

Agreed, having to call stop_machine() for each cpu that gets updated is
pretty brutal. The plus side is that PRRN events should be a rare occurrence
and not cause too much pain.

The current design ties into the of notification chain so that we can do
the affinity update when the affinity property in the device tree is updated.
Switching to doing one stop and updating all of the cpus would require a
design change.

I went back and looked at the code again and there is another issue with the
way this is done. Tying into the of notification chain is great for
being informed of when a property changes but the code (from patch 6/11)

+   case OF_RECONFIG_ADD_PROPERTY:
+   case OF_RECONFIG_UPDATE_PROPERTY:
+   update = (struct of_prop_reconfig *)data;
+   if (!of_prop_cmp(update->dn->type, "cpu")) {
+   u32 core_id;
+   of_property_read_u32(update->dn, "reg", &core_id);
+   stage_topology_update(core_id);
+   rc = NOTIFY_OK;
+   }
+   break;

Does not check to see which property is being updated and just assumes
the affinity is being updated. This code as is will do an affinity update
every time any property of a cpu is updated or added.

Since this needs an update I will also look at possibly doing this so
that we call stop_machine only once.

-- 
-Nathan

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH v2 6/11] Update CPU Maps

2013-04-05 Thread Nathan Fontenot
On 04/03/2013 11:42 PM, Paul Mackerras wrote:
> On Mon, Mar 25, 2013 at 01:57:08PM -0500, Nathan Fontenot wrote:
>> From: Jesse Larrew 
>>
>> Platform events such as partition migration or the new PRRN firmware
>> feature can cause the NUMA characteristics of a CPU to change, and these
>> changes will be reflected in the device tree nodes for the affected
>> CPUs.
>>
>> This patch registers a handler for Open Firmware device tree updates
>> and reconfigures the CPU and node maps whenever the associativity
>> changes. Currently, this is accomplished by marking the affected CPUs in
>> the cpu_associativity_changes_mask and allowing
>> arch_update_cpu_topology() to retrieve the new associativity information
>> using hcall_vphn().
>>
>> Protecting the NUMA cpu maps from concurrent access during an update
>> operation will be addressed in a subsequent patch in this series.
>>
>> Signed-off-by: Nathan Fontenot 
> 
> [snip]
> 
>> +if (firmware_has_feature(OV5_PRRN)) {
> 
> Shouldn't this be FW_FEATURE_PRRN?  How well has this patch been
> tested? :-/

Yes this should have been FW_FEATURE_PRRN.

I know I tested this and it took some digging to find out why my test succeeded
even though I used the wrong value in the call to firmware_has_feature. The 
value
for OV5_PRRN (0x0540) just happens to match some of the bits that are set in
powerpc_firmware_features bit field and cause the check to return true. My test
worked out of sheer luck. I'll update this patch and re-test to ensure it works
with the real value.

This does make me think: should we update firmware_has_feature() to avoid this
kind of false positive in the future? Something like

#define firmware_has_feature(feature)   
   \
((FW_FEATURE_ALWAYS & (feature)) == (feature) ||
   \
 (FW_FEATURE_POSSIBLE & powerpc_firmware_features & (feature)) == 
(feature)

-Nathan

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH v2 2/11] Add PRRN Event Handler

2013-04-05 Thread Nathan Fontenot
On 04/03/2013 10:34 PM, Paul Mackerras wrote:
> On Mon, Mar 25, 2013 at 01:52:32PM -0500, Nathan Fontenot wrote:
>> From: Jesse Larrew 
>>
>> A PRRN event is signaled via the RTAS event-scan mechanism, which
>> returns a Hot Plug Event message "fixed part" indicating "Platform
>> Resource Reassignment". In response to the Hot Plug Event message,
>> we must call ibm,update-nodes to determine which resources were
>> reassigned and then ibm,update-properties to obtain the new affinity
>> information about those resources.
>>
>> The PRRN event-scan RTAS message contains only the "fixed part" with
>> the "Type" field set to the value 160 and no Extended Event Log. The
>> four-byte Extended Event Log Length field is repurposed (since no
>> Extended Event Log message is included) to pass the "scope" parameter
>> that causes the ibm,update-nodes to return the nodes affected by the
>> specific resource reassignment.
>>
>> This patch adds a handler in rtasd for PRRN RTAS events. The function
>> pseries_devicetree_update() (from mobility.c) is used to make the
>> ibm,update-nodes/ibm,update-properties RTAS calls. Updating the NUMA maps
>> (handled by a subsequent patch) will require significant processing,
>> so pseries_devicetree_update() is called from an asynchronous workqueue
>> to allow rtasd to continue processing events. Since we flush all work
>> on the queue before handling any new work there should only be one event
>> in flight of being handled at a time.
> ^^ "of" is superfluous

will remove it.

> 
> In the worst case where PRRN events come close together in time, the
> flush_work will block for however long it takes to do this
> "significant processing", meaning that we're no better off using a
> workqueue.  Do we have any reason to think that these PRRN events will
> normally be widely spaced in time?  If so you should mention it in the
> patch description.

Yes. PRRN events can only be triggered from the HMC by an IBM tech who has
to actually log into a customer system and initiate the PRRN event. There
is no method for a user to initiate a PRRN event. Given this, it is safe
to assume that these events will be widely spaced in time.

> 
> Also, rtasd isn't actually a task, it's just a function that gets run
> via schedule_delayed_work_on() and re-schedules itself each time it
> runs.  Is there any deadlock possibility in calling flush_work from a
> work function?

I don't know of any but I will investigate.

Thanks for the feedback.
-Nathan

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH v2 10/11] Enable PRRN

2013-03-25 Thread Nathan Fontenot
The Linux kernel and platform firmware negotiate their mutual support
of the PRRN option via the ibm,client-architecture-support interface.
This patch simply sets the appropriate fields in the client architecture
vector to indicate Linux support and will cause the firmware to begin
sending PRRN events via the RTAS event-scan mechanism.

Signed-off-by: Nathan Fontenot 
---
 arch/powerpc/kernel/prom_init.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

Index: powerpc/arch/powerpc/kernel/prom_init.c
===
--- powerpc.orig/arch/powerpc/kernel/prom_init.c2013-03-20 
12:25:38.0 -0500
+++ powerpc/arch/powerpc/kernel/prom_init.c 2013-03-20 12:27:50.0 
-0500
@@ -698,7 +698,7 @@
 #else
0,
 #endif
-   OV5_FEAT(OV5_TYPE1_AFFINITY),
+   OV5_FEAT(OV5_TYPE1_AFFINITY) | OV5_FEAT(OV5_PRRN),
0,
0,
0,

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH v2 11/11] Add /proc interface to control topology updates

2013-03-25 Thread Nathan Fontenot
There are instances in which we do not want topology updates to occur.
In order to allow this a /proc interface (/proc/powerpc/topology_updates)
is introduced so that topology updates can be enabled and disabled.

This patch also adds a prrn_is_enabled() call so that PRRN events are
handled in the kernel only if topology updating is enabled.

Signed-off-by: Nathan Fontenot 
---
 arch/powerpc/include/asm/topology.h |5 ++
 arch/powerpc/kernel/rtasd.c |6 ++-
 arch/powerpc/mm/numa.c  |   62 +++-
 3 files changed, 70 insertions(+), 3 deletions(-)

Index: powerpc/arch/powerpc/mm/numa.c
===
--- powerpc.orig/arch/powerpc/mm/numa.c 2013-03-20 12:27:48.0 -0500
+++ powerpc/arch/powerpc/mm/numa.c  2013-03-20 12:27:52.0 -0500
@@ -23,6 +23,9 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
 #include 
 #include 
 #include 
@@ -1558,7 +1561,6 @@
 
return rc;
 }
-__initcall(start_topology_update);
 
 /*
  * Disable polling for VPHN associativity changes.
@@ -1577,4 +1579,62 @@
 
return rc;
 }
+
+inline int prrn_is_enabled(void)
+{
+   return prrn_enabled;
+}
+
+static int topology_read(struct seq_file *file, void *v)
+{
+   if (vphn_enabled || prrn_enabled)
+   seq_puts(file, "on\n");
+   else
+   seq_puts(file, "off\n");
+
+   return 0;
+}
+
+static int topology_open(struct inode *inode, struct file *file)
+{
+   return single_open(file, topology_read, NULL);
+}
+
+static ssize_t topology_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *off)
+{
+   char kbuf[4]; /* "on" or "off" plus null. */
+   int read_len;
+
+   read_len = count < 3 ? count : 3;
+   if (copy_from_user(kbuf, buf, read_len))
+   return -EINVAL;
+
+   kbuf[read_len] = '\0';
+
+   if (!strncmp(kbuf, "on", 2))
+   start_topology_update();
+   else if (!strncmp(kbuf, "off", 3))
+   stop_topology_update();
+   else
+   return -EINVAL;
+
+   return count;
+}
+
+static const struct file_operations topology_ops = {
+   .read = seq_read,
+   .write = topology_write,
+   .open = topology_open,
+   .release = single_release
+};
+
+static int topology_update_init(void)
+{
+   start_topology_update();
+   proc_create("powerpc/topology_updates", 644, NULL, &topology_ops);
+
+   return 0;
+}
+device_initcall(topology_update_init);
 #endif /* CONFIG_PPC_SPLPAR */
Index: powerpc/arch/powerpc/include/asm/topology.h
===
--- powerpc.orig/arch/powerpc/include/asm/topology.h2013-03-20 
12:25:37.0 -0500
+++ powerpc/arch/powerpc/include/asm/topology.h 2013-03-20 12:27:52.0 
-0500
@@ -71,6 +71,7 @@
 #if defined(CONFIG_NUMA) && defined(CONFIG_PPC_SPLPAR)
 extern int start_topology_update(void);
 extern int stop_topology_update(void);
+extern inline int prrn_is_enabled(void);
 #else
 static inline int start_topology_update(void)
 {
@@ -80,6 +81,10 @@
 {
return 0;
 }
+static inline int prrn_is_enabled(void)
+{
+   return 0;
+}
 #endif /* CONFIG_NUMA && CONFIG_PPC_SPLPAR */
 
 #include 
Index: powerpc/arch/powerpc/kernel/rtasd.c
===
--- powerpc.orig/arch/powerpc/kernel/rtasd.c2013-03-20 12:25:37.0 
-0500
+++ powerpc/arch/powerpc/kernel/rtasd.c 2013-03-20 12:27:52.0 -0500
@@ -292,11 +292,13 @@
 {
pSeries_log_error((char *)log, ERR_TYPE_RTAS_LOG, 0);
 
-   if (log->type == RTAS_TYPE_PRRN)
+   if (log->type == RTAS_TYPE_PRRN) {
/* For PRRN Events the extended log length is used to denote
 * the scope for calling rtas update-nodes.
 */
-   prrn_schedule_update(log->extended_log_length);
+   if (prrn_is_enabled())
+   prrn_schedule_update(log->extended_log_length);
+   }
 
return;
 }

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH v2 9/11] Re-enable Virtual Private Home Node capabilities

2013-03-25 Thread Nathan Fontenot
From: Jesse Larrew 

The new PRRN firmware feature provides a more convenient and event-driven
interface than VPHN for notifying Linux of changes to the NUMA affinity of
platform resources. However, for practical reasons, it may not be feasible
for some customers to update to the latest firmware. For these customers,
the VPHN feature supported on previous firmware versions may still be the
best option.

The VPHN feature was previously disabled due to races with the load
balancing code when accessing the NUMA cpu maps, but the new stop_machine()
approach protects the NUMA cpu maps from these concurrent accesses. It
should be safe to re-enable this feature now.

Signed-off-by: Nathan Fontenot 
---
 arch/powerpc/mm/numa.c |3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

Index: powerpc/arch/powerpc/mm/numa.c
===
--- powerpc.orig/arch/powerpc/mm/numa.c 2013-03-20 12:27:46.0 -0500
+++ powerpc/arch/powerpc/mm/numa.c  2013-03-20 12:27:48.0 -0500
@@ -1545,9 +1545,8 @@
vphn_enabled = 0;
rc = of_reconfig_notifier_register(&dt_update_nb);
}
-   } else if (0 && firmware_has_feature(FW_FEATURE_VPHN) &&
+   } else if (firmware_has_feature(FW_FEATURE_VPHN) &&
   get_lppaca()->shared_proc) {
-   /* Disabled until races with load balancing are fixed */
if (!vphn_enabled) {
prrn_enabled = 0;
vphn_enabled = 1;

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH v2 8/11] Update numa cpu vdso info

2013-03-25 Thread Nathan Fontenot
From: Jesse Larrew 

The following patch adds vdso_getcpu_init(), which stores the NUMA node for
a cpu in SPRG3:

Commit 18ad51dd34 ("powerpc: Add VDSO version of getcpu") adds
vdso_getcpu_init(), which stores the NUMA node for a cpu in SPRG3.

This patch ensures that this information is also updated when the NUMA
affinity of a cpu changes.

Signed-off-by: Nathan Fontenot 
---
 arch/powerpc/mm/numa.c |8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

Index: powerpc/arch/powerpc/mm/numa.c
===
--- powerpc.orig/arch/powerpc/mm/numa.c 2013-03-20 12:27:43.0 -0500
+++ powerpc/arch/powerpc/mm/numa.c  2013-03-20 12:27:46.0 -0500
@@ -30,6 +30,7 @@
 #include 
 #include 
 #include 
+#include 
 
 static int numa_enabled = 1;
 
@@ -1426,6 +1427,7 @@
unregister_cpu_under_node(update->cpu, update->old_nid);
unmap_cpu_from_node(update->cpu);
map_cpu_to_node(update->cpu, update->new_nid);
+   vdso_getcpu_init();
register_cpu_under_node(update->cpu, update->new_nid);
 
return 0;
@@ -1440,8 +1442,11 @@
int cpu, changed = 0;
struct topology_update_data update;
unsigned int associativity[VPHN_ASSOC_BUFSIZE] = {0};
+   cpumask_t updated_cpu;
struct device *dev;
 
+   cpumask_clear(&updated_cpu);
+
for_each_cpu(cpu, &cpu_associativity_changes_mask) {
update.cpu = cpu;
vphn_get_associativity(cpu, associativity);
@@ -1451,7 +1456,8 @@
update.new_nid = first_online_node;
 
update.old_nid = numa_cpu_lookup_table[cpu];
-   stop_machine(update_cpu_topology, &update, cpu_online_mask);
+   cpumask_set_cpu(cpu, &updated_cpu);
+   stop_machine(update_cpu_topology, &update, &updated_cpu);
dev = get_cpu_device(cpu);
if (dev)
kobject_uevent(&dev->kobj, KOBJ_CHANGE);

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH v2 7/11] Use stop machine to update cpu maps

2013-03-25 Thread Nathan Fontenot
From: Jesse Larrew 

The new PRRN firmware feature allows CPU and memory resources to be
transparently reassigned across NUMA boundaries. When this happens, the
kernel must update the node maps to reflect the new affinity
information.

Although the NUMA maps can be protected by locking primitives during the
update itself, this is insufficient to prevent concurrent accesses to these
structures. Since cpumask_of_node() hands out a pointer to these
structures, they can still be modified outside of the lock. Furthermore,
tracking down each usage of these pointers and adding locks would be quite
invasive and difficult to maintain.

Situations like these are best handled using stop_machine(). Since the NUMA
affinity updates are exceptionally rare events, this approach has the
benefit of not adding any overhead while accessing the NUMA maps during
normal operation.

Signed-off-by: Nathan Fontenot 
---
 arch/powerpc/mm/numa.c |   51 +
 1 file changed, 35 insertions(+), 16 deletions(-)

Index: powerpc/arch/powerpc/mm/numa.c
===
--- powerpc.orig/arch/powerpc/mm/numa.c 2013-03-20 12:26:36.0 -0500
+++ powerpc/arch/powerpc/mm/numa.c  2013-03-20 12:27:43.0 -0500
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -1254,6 +1255,12 @@
 
 /* Virtual Processor Home Node (VPHN) support */
 #ifdef CONFIG_PPC_SPLPAR
+struct topology_update_data {
+   int cpu;
+   int old_nid;
+   int new_nid;
+};
+
 static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS];
 static cpumask_t cpu_associativity_changes_mask;
 static int vphn_enabled;
@@ -1405,34 +1412,46 @@
 }
 
 /*
+ * Update the CPU maps and sysfs entries for a single CPU when its NUMA
+ * characteristics change. This function doesn't perform any locking and is
+ * only safe to call from stop_machine().
+ */
+static int update_cpu_topology(void *data)
+{
+   struct topology_update_data *update = data;
+
+   if (!update)
+   return -EINVAL;
+
+   unregister_cpu_under_node(update->cpu, update->old_nid);
+   unmap_cpu_from_node(update->cpu);
+   map_cpu_to_node(update->cpu, update->new_nid);
+   register_cpu_under_node(update->cpu, update->new_nid);
+
+   return 0;
+}
+
+/*
  * Update the node maps and sysfs entries for each cpu whose home node
  * has changed. Returns 1 when the topology has changed, and 0 otherwise.
  */
 int arch_update_cpu_topology(void)
 {
-   int cpu, nid, old_nid, changed = 0;
+   int cpu, changed = 0;
+   struct topology_update_data update;
unsigned int associativity[VPHN_ASSOC_BUFSIZE] = {0};
struct device *dev;
 
for_each_cpu(cpu, &cpu_associativity_changes_mask) {
+   update.cpu = cpu;
vphn_get_associativity(cpu, associativity);
-   nid = associativity_to_nid(associativity);
-
-   if (nid < 0 || !node_online(nid))
-   nid = first_online_node;
+   update.new_nid = associativity_to_nid(associativity);
 
-   old_nid = numa_cpu_lookup_table[cpu];
-
-   /* Disable hotplug while we update the cpu
-* masks and sysfs.
-*/
-   get_online_cpus();
-   unregister_cpu_under_node(cpu, old_nid);
-   unmap_cpu_from_node(cpu);
-   map_cpu_to_node(cpu, nid);
-   register_cpu_under_node(cpu, nid);
-   put_online_cpus();
+   if (update.new_nid < 0 || !node_online(update.new_nid))
+   update.new_nid = first_online_node;
 
+   update.old_nid = numa_cpu_lookup_table[cpu];
+   stop_machine(update_cpu_topology, &update, cpu_online_mask);
dev = get_cpu_device(cpu);
if (dev)
kobject_uevent(&dev->kobj, KOBJ_CHANGE);

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH v2 6/11] Update CPU Maps

2013-03-25 Thread Nathan Fontenot
From: Jesse Larrew 

Platform events such as partition migration or the new PRRN firmware
feature can cause the NUMA characteristics of a CPU to change, and these
changes will be reflected in the device tree nodes for the affected
CPUs.

This patch registers a handler for Open Firmware device tree updates
and reconfigures the CPU and node maps whenever the associativity
changes. Currently, this is accomplished by marking the affected CPUs in
the cpu_associativity_changes_mask and allowing
arch_update_cpu_topology() to retrieve the new associativity information
using hcall_vphn().

Protecting the NUMA cpu maps from concurrent access during an update
operation will be addressed in a subsequent patch in this series.

Signed-off-by: Nathan Fontenot 
---

 arch/powerpc/include/asm/firmware.h   |3 
 arch/powerpc/include/asm/prom.h   |1 
 arch/powerpc/mm/numa.c|   99 ++
 arch/powerpc/platforms/pseries/firmware.c |1 
 4 files changed, 79 insertions(+), 25 deletions(-)

Index: powerpc/arch/powerpc/include/asm/prom.h
===
--- powerpc.orig/arch/powerpc/include/asm/prom.h2013-03-25 
11:07:56.0 -0500
+++ powerpc/arch/powerpc/include/asm/prom.h 2013-03-25 11:27:11.0 
-0500
@@ -128,6 +128,7 @@
 #define OV5_CMO0x0480  /* Cooperative Memory 
Overcommitment */
 #define OV5_XCMO   0x0440  /* Page Coalescing */
 #define OV5_TYPE1_AFFINITY 0x0580  /* Type 1 NUMA affinity */
+#define OV5_PRRN   0x0540  /* Platform Resource Reassignment */
 #define OV5_PFO_HW_RNG 0x0E80  /* PFO Random Number Generator */
 #define OV5_PFO_HW_842 0x0E40  /* PFO Compression Accelerator */
 #define OV5_PFO_HW_ENCR0x0E20  /* PFO Encryption Accelerator */
Index: powerpc/arch/powerpc/mm/numa.c
===
--- powerpc.orig/arch/powerpc/mm/numa.c 2013-03-25 11:22:44.0 -0500
+++ powerpc/arch/powerpc/mm/numa.c  2013-03-25 11:27:11.0 -0500
@@ -1257,7 +1257,8 @@
 static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS];
 static cpumask_t cpu_associativity_changes_mask;
 static int vphn_enabled;
-static void set_topology_timer(void);
+static int prrn_enabled;
+static void reset_topology_timer(void);
 
 /*
  * Store the current values of the associativity change counters in the
@@ -1293,11 +1294,9 @@
  */
 static int update_cpu_associativity_changes_mask(void)
 {
-   int cpu, nr_cpus = 0;
+   int cpu;
cpumask_t *changes = &cpu_associativity_changes_mask;
 
-   cpumask_clear(changes);
-
for_each_possible_cpu(cpu) {
int i, changed = 0;
u8 *counts = vphn_cpu_change_counts[cpu];
@@ -1311,11 +1310,10 @@
}
if (changed) {
cpumask_set_cpu(cpu, changes);
-   nr_cpus++;
}
}
 
-   return nr_cpus;
+   return cpumask_weight(changes);
 }
 
 /*
@@ -1416,7 +1414,7 @@
unsigned int associativity[VPHN_ASSOC_BUFSIZE] = {0};
struct device *dev;
 
-   for_each_cpu(cpu,&cpu_associativity_changes_mask) {
+   for_each_cpu(cpu, &cpu_associativity_changes_mask) {
vphn_get_associativity(cpu, associativity);
nid = associativity_to_nid(associativity);
 
@@ -1438,6 +1436,7 @@
dev = get_cpu_device(cpu);
if (dev)
kobject_uevent(&dev->kobj, KOBJ_CHANGE);
+   cpumask_clear_cpu(cpu, &cpu_associativity_changes_mask);
changed = 1;
}
 
@@ -1457,37 +1456,80 @@
 
 static void topology_timer_fn(unsigned long ignored)
 {
-   if (!vphn_enabled)
-   return;
-   if (update_cpu_associativity_changes_mask() > 0)
+   if (prrn_enabled && cpumask_weight(&cpu_associativity_changes_mask))
topology_schedule_update();
-   set_topology_timer();
+   else if (vphn_enabled) {
+   if (update_cpu_associativity_changes_mask() > 0)
+   topology_schedule_update();
+   reset_topology_timer();
+   }
 }
 static struct timer_list topology_timer =
TIMER_INITIALIZER(topology_timer_fn, 0, 0);
 
-static void set_topology_timer(void)
+static void reset_topology_timer(void)
 {
topology_timer.data = 0;
topology_timer.expires = jiffies + 60 * HZ;
-   add_timer(&topology_timer);
+   mod_timer(&topology_timer, topology_timer.expires);
+}
+
+static void stage_topology_update(int core_id)
+{
+   cpumask_or(&cpu_associativity_changes_mask,
+   &cpu_associativity_changes_mask, cpu_sibling_mask(core_id));
+   reset_topology_timer();
 }
 
+static int dt_update_callback(struct notifier_block *nb,
+  

[PATCH v2 5/11] Update numa.c to use updated firmware_has_feature()

2013-03-25 Thread Nathan Fontenot
Update the numa code to use the updated firmware_has_feature() when checking
for type 1 affinity.

Signed-off-by: Nathan Fontenot 
---
 arch/powerpc/mm/numa.c |   22 +++---
 1 file changed, 3 insertions(+), 19 deletions(-)

Index: powerpc/arch/powerpc/mm/numa.c
===
--- powerpc.orig/arch/powerpc/mm/numa.c 2013-03-20 12:25:42.0 -0500
+++ powerpc/arch/powerpc/mm/numa.c  2013-03-20 12:26:29.0 -0500
@@ -291,9 +291,7 @@
 static int __init find_min_common_depth(void)
 {
int depth;
-   struct device_node *chosen;
struct device_node *root;
-   const char *vec5;
 
if (firmware_has_feature(FW_FEATURE_OPAL))
root = of_find_node_by_path("/ibm,opal");
@@ -325,24 +323,10 @@
 
distance_ref_points_depth /= sizeof(int);
 
-#define VEC5_AFFINITY_BYTE 5
-#define VEC5_AFFINITY  0x80
-
-   if (firmware_has_feature(FW_FEATURE_OPAL))
+   if (firmware_has_feature(FW_FEATURE_OPAL) ||
+   firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) {
+   dbg("Using form 1 affinity\n");
form1_affinity = 1;
-   else {
-   chosen = of_find_node_by_path("/chosen");
-   if (chosen) {
-   vec5 = of_get_property(chosen,
-  "ibm,architecture-vec-5", NULL);
-   if (vec5 && (vec5[VEC5_AFFINITY_BYTE] &
-   VEC5_AFFINITY)) {
-   dbg("Using form 1 affinity\n");
-   form1_affinity = 1;
-   }
-
-   of_node_put(chosen);
-   }
}
 
if (form1_affinity) {

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH v2 4/11] Update firmware_has_feature() to check architecture bits

2013-03-25 Thread Nathan Fontenot
The firmware_has_feature() function makes it easy to check for supported
features of the hypervisor. This patch extends the capability of the
firmware_has_feature() function to include checking for specified bits
in vector 5 of the architecture vector as is reported in the device tree.

As part of this the #defines used for the architecture vector are
moved to prom.h and re-defined such that the vector 5 options have the vector
index and the feature bits encoded into them. This makes for a much
simpler design for adding bits from the architecture vector to
the checking done in firmware_has_feature().

Signed-off-by: Nathan Fontenot 
---
 arch/powerpc/include/asm/firmware.h   |4 +
 arch/powerpc/include/asm/prom.h   |   45 +---
 arch/powerpc/kernel/prom_init.c   |   23 +++---
 arch/powerpc/platforms/pseries/firmware.c |   67 ++
 arch/powerpc/platforms/pseries/pseries.h  |5 +-
 arch/powerpc/platforms/pseries/setup.c|   40 -
 6 files changed, 131 insertions(+), 53 deletions(-)

Index: powerpc/arch/powerpc/include/asm/prom.h
===
--- powerpc.orig/arch/powerpc/include/asm/prom.h2013-03-25 
10:47:54.0 -0500
+++ powerpc/arch/powerpc/include/asm/prom.h 2013-03-25 11:07:56.0 
-0500
@@ -111,31 +111,27 @@
 /* Option vector 4: IBM PAPR implementation */
 #define OV4_MIN_ENT_CAP0x01/* minimum VP entitled capacity 
*/
 
-/* Option vector 5: PAPR/OF options supported */
-#define OV5_LPAR   0x80/* logical partitioning supported */
-#define OV5_SPLPAR 0x40/* shared-processor LPAR supported */
+/* Option vector 5: PAPR/OF options supported
+ * These bits are also used for the platform_has_feature() call so
+ * we encode the vector index in the define and use the OV5_FEAT()
+ * and OV5_INDX() macros to extract the desired information.
+ */
+#define OV5_FEAT(x)((x) & 0xff)
+#define OV5_INDX(x)((x) >> 8)
+#define OV5_LPAR   0x0280  /* logical partitioning supported */
+#define OV5_SPLPAR 0x0240  /* shared-processor LPAR supported */
 /* ibm,dynamic-reconfiguration-memory property supported */
-#define OV5_DRCONF_MEMORY  0x20
-#define OV5_LARGE_PAGES0x10/* large pages supported */
-#define OV5_DONATE_DEDICATE_CPU0x02/* donate dedicated CPU support 
*/
-/* PCIe/MSI support.  Without MSI full PCIe is not supported */
-#ifdef CONFIG_PCI_MSI
-#define OV5_MSI0x01/* PCIe/MSI support */
-#else
-#define OV5_MSI0x00
-#endif /* CONFIG_PCI_MSI */
-#ifdef CONFIG_PPC_SMLPAR
-#define OV5_CMO0x80/* Cooperative Memory 
Overcommitment */
-#define OV5_XCMO   0x40/* Page Coalescing */
-#else
-#define OV5_CMO0x00
-#define OV5_XCMO   0x00
-#endif
-#define OV5_TYPE1_AFFINITY 0x80/* Type 1 NUMA affinity */
-#define OV5_PFO_HW_RNG 0x80/* PFO Random Number Generator */
-#define OV5_PFO_HW_842 0x40/* PFO Compression Accelerator */
-#define OV5_PFO_HW_ENCR0x20/* PFO Encryption Accelerator */
-#define OV5_SUB_PROCESSORS 0x01/* 1,2,or 4 Sub-Processors supported */
+#define OV5_DRCONF_MEMORY  0x0220
+#define OV5_LARGE_PAGES0x0210  /* large pages supported */
+#define OV5_DONATE_DEDICATE_CPU0x0202  /* donate dedicated CPU support 
*/
+#define OV5_MSI0x0201  /* PCIe/MSI support */
+#define OV5_CMO0x0480  /* Cooperative Memory 
Overcommitment */
+#define OV5_XCMO   0x0440  /* Page Coalescing */
+#define OV5_TYPE1_AFFINITY 0x0580  /* Type 1 NUMA affinity */
+#define OV5_PFO_HW_RNG 0x0E80  /* PFO Random Number Generator */
+#define OV5_PFO_HW_842 0x0E40  /* PFO Compression Accelerator */
+#define OV5_PFO_HW_ENCR0x0E20  /* PFO Encryption Accelerator */
+#define OV5_SUB_PROCESSORS 0x0F01  /* 1,2,or 4 Sub-Processors supported */
 
 /* Option Vector 6: IBM PAPR hints */
 #define OV6_LINUX  0x02/* Linux is our OS */
@@ -145,6 +141,7 @@
  * followed by # option vectors - 1, followed by the option vectors.
  */
 extern unsigned char ibm_architecture_vec[];
+bool platform_has_feature(unsigned int);
 #endif
 
 /* These includes are put at the bottom because they may contain things
Index: powerpc/arch/powerpc/kernel/prom_init.c
===
--- powerpc.orig/arch/powerpc/kernel/prom_init.c2013-03-25 
10:47:54.0 -0500
+++ powerpc/arch/powerpc/kernel/prom_init.c 2013-03-25 11:07:56.0 
-0500
@@ -684,11 +684,21 @@
/* option vector 5: PAPR/OF options */
19 - 2, /* length */

[PATCH v2 3/11] Move architecture vector definitions to prom.h

2013-03-25 Thread Nathan Fontenot
As part of handling PRRN events we will need to check the
vector 5 portion of the architecture bits reported in the device tree
to ensure that PRRN event handling is enabled. In order to do this a
new platform_has_feature call is introduced (in a subsequent patch) to
make this check.  To avoid having to re-define bits in the architecture
vector the bits are moved to prom.h.

This patch is the first step in implementing the platform_has_feature
call by simply moving the bit definitions from prom_init.c to asm/prom.h.
There are no functional changes.

Signed-off-by: Nathan Fontenot 

---
 arch/powerpc/include/asm/prom.h |   73 ++
 arch/powerpc/kernel/prom_init.c |   75 +++-
 2 files changed, 79 insertions(+), 69 deletions(-)

Index: powerpc/arch/powerpc/include/asm/prom.h
===
--- powerpc.orig/arch/powerpc/include/asm/prom.h2013-03-20 
08:24:13.0 -0500
+++ powerpc/arch/powerpc/include/asm/prom.h 2013-03-20 08:52:59.0 
-0500
@@ -74,6 +74,79 @@
 #define DRCONF_MEM_AI_INVALID  0x0040
 #define DRCONF_MEM_RESERVED0x0080
 
+#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV)
+/*
+ * There are two methods for telling firmware what our capabilities are.
+ * Newer machines have an "ibm,client-architecture-support" method on the
+ * root node.  For older machines, we have to call the "process-elf-header"
+ * method in the /packages/elf-loader node, passing it a fake 32-bit
+ * ELF header containing a couple of PT_NOTE sections that contain
+ * structures that contain various information.
+ */
+
+/* New method - extensible architecture description vector. */
+
+/* Option vector bits - generic bits in byte 1 */
+#define OV_IGNORE  0x80/* ignore this vector */
+#define OV_CESSATION_POLICY0x40/* halt if unsupported option present*/
+
+/* Option vector 1: processor architectures supported */
+#define OV1_PPC_2_00   0x80/* set if we support PowerPC 2.00 */
+#define OV1_PPC_2_01   0x40/* set if we support PowerPC 2.01 */
+#define OV1_PPC_2_02   0x20/* set if we support PowerPC 2.02 */
+#define OV1_PPC_2_03   0x10/* set if we support PowerPC 2.03 */
+#define OV1_PPC_2_04   0x08/* set if we support PowerPC 2.04 */
+#define OV1_PPC_2_05   0x04/* set if we support PowerPC 2.05 */
+#define OV1_PPC_2_06   0x02/* set if we support PowerPC 2.06 */
+#define OV1_PPC_2_07   0x01/* set if we support PowerPC 2.07 */
+
+/* Option vector 2: Open Firmware options supported */
+#define OV2_REAL_MODE  0x20/* set if we want OF in real mode */
+
+/* Option vector 3: processor options supported */
+#define OV3_FP 0x80/* floating point */
+#define OV3_VMX0x40/* VMX/Altivec */
+#define OV3_DFP0x20/* decimal FP */
+
+/* Option vector 4: IBM PAPR implementation */
+#define OV4_MIN_ENT_CAP0x01/* minimum VP entitled capacity 
*/
+
+/* Option vector 5: PAPR/OF options supported */
+#define OV5_LPAR   0x80/* logical partitioning supported */
+#define OV5_SPLPAR 0x40/* shared-processor LPAR supported */
+/* ibm,dynamic-reconfiguration-memory property supported */
+#define OV5_DRCONF_MEMORY  0x20
+#define OV5_LARGE_PAGES0x10/* large pages supported */
+#define OV5_DONATE_DEDICATE_CPU0x02/* donate dedicated CPU support 
*/
+/* PCIe/MSI support.  Without MSI full PCIe is not supported */
+#ifdef CONFIG_PCI_MSI
+#define OV5_MSI0x01/* PCIe/MSI support */
+#else
+#define OV5_MSI0x00
+#endif /* CONFIG_PCI_MSI */
+#ifdef CONFIG_PPC_SMLPAR
+#define OV5_CMO0x80/* Cooperative Memory 
Overcommitment */
+#define OV5_XCMO   0x40/* Page Coalescing */
+#else
+#define OV5_CMO0x00
+#define OV5_XCMO   0x00
+#endif
+#define OV5_TYPE1_AFFINITY 0x80/* Type 1 NUMA affinity */
+#define OV5_PFO_HW_RNG 0x80/* PFO Random Number Generator */
+#define OV5_PFO_HW_842 0x40/* PFO Compression Accelerator */
+#define OV5_PFO_HW_ENCR0x20/* PFO Encryption Accelerator */
+#define OV5_SUB_PROCESSORS 0x01/* 1,2,or 4 Sub-Processors supported */
+
+/* Option Vector 6: IBM PAPR hints */
+#define OV6_LINUX  0x02/* Linux is our OS */
+
+/*
+ * The architecture vector has an array of PVR mask/value pairs,
+ * followed by # option vectors - 1, followed by the option vectors.
+ */
+extern unsigned char ibm_architecture_vec[];
+#endif
+
 /* These includes are put at the bottom because they may contain things
  * that are overridden by this file.  Ideally they shouldn't be included
  * by this fi

[PATCH v2 2/11] Add PRRN Event Handler

2013-03-25 Thread Nathan Fontenot
From: Jesse Larrew 

A PRRN event is signaled via the RTAS event-scan mechanism, which
returns a Hot Plug Event message "fixed part" indicating "Platform
Resource Reassignment". In response to the Hot Plug Event message,
we must call ibm,update-nodes to determine which resources were
reassigned and then ibm,update-properties to obtain the new affinity
information about those resources.

The PRRN event-scan RTAS message contains only the "fixed part" with
the "Type" field set to the value 160 and no Extended Event Log. The
four-byte Extended Event Log Length field is repurposed (since no
Extended Event Log message is included) to pass the "scope" parameter
that causes the ibm,update-nodes to return the nodes affected by the
specific resource reassignment.

This patch adds a handler in rtasd for PRRN RTAS events. The function
pseries_devicetree_update() (from mobility.c) is used to make the
ibm,update-nodes/ibm,update-properties RTAS calls. Updating the NUMA maps
(handled by a subsequent patch) will require significant processing,
so pseries_devicetree_update() is called from an asynchronous workqueue
to allow rtasd to continue processing events. Since we flush all work
on the queue before handling any new work there should only be one event
in flight being handled at a time.

Signed-off-by: Nathan Fontenot 
---
 arch/powerpc/include/asm/rtas.h |2 ++
 arch/powerpc/kernel/rtasd.c |   35 ++-
 2 files changed, 36 insertions(+), 1 deletion(-)

Index: powerpc/arch/powerpc/include/asm/rtas.h
===
--- powerpc.orig/arch/powerpc/include/asm/rtas.h2013-03-20 
08:51:59.0 -0500
+++ powerpc/arch/powerpc/include/asm/rtas.h 2013-03-20 08:52:08.0 
-0500
@@ -143,6 +143,8 @@
 #define RTAS_TYPE_PMGM_TIME_ALARM  0x6f
 #define RTAS_TYPE_PMGM_CONFIG_CHANGE   0x70
 #define RTAS_TYPE_PMGM_SERVICE_PROC0x71
+/* Platform Resource Reassignment Notification */
+#define RTAS_TYPE_PRRN 0xA0
 
 /* RTAS check-exception vector offset */
 #define RTAS_VECTOR_EXTERNAL_INTERRUPT 0x500
Index: powerpc/arch/powerpc/kernel/rtasd.c
===
--- powerpc.orig/arch/powerpc/kernel/rtasd.c2013-03-20 08:24:14.0 
-0500
+++ powerpc/arch/powerpc/kernel/rtasd.c 2013-03-20 08:52:08.0 -0500
@@ -87,6 +87,8 @@
return "Resource Deallocation Event";
case RTAS_TYPE_DUMP:
return "Dump Notification Event";
+   case RTAS_TYPE_PRRN:
+   return "Platform Resource Reassignment Event";
}
 
return rtas_type[0];
@@ -265,7 +267,38 @@
spin_unlock_irqrestore(&rtasd_log_lock, s);
return;
}
+}
+
+static s32 update_scope;
+
+static void prrn_work_fn(struct work_struct *work)
+{
+   /*
+* For PRRN, we must pass the negative of the scope value in
+* the RTAS event.
+*/
+   pseries_devicetree_update(-update_scope);
+}
+static DECLARE_WORK(prrn_work, prrn_work_fn);
+
+void prrn_schedule_update(u32 scope)
+{
+   flush_work(&prrn_work);
+   update_scope = scope;
+   schedule_work(&prrn_work);
+}
+
+static void pseries_handle_event(const struct rtas_error_log *log)
+{
+   pSeries_log_error((char *)log, ERR_TYPE_RTAS_LOG, 0);
+
+   if (log->type == RTAS_TYPE_PRRN)
+   /* For PRRN Events the extended log length is used to denote
+* the scope for calling rtas update-nodes.
+*/
+   prrn_schedule_update(log->extended_log_length);
 
+   return;
 }
 
 static int rtas_log_open(struct inode * inode, struct file * file)
@@ -389,7 +422,7 @@
}
 
if (error == 0)
-   pSeries_log_error(logdata, ERR_TYPE_RTAS_LOG, 0);
+   pseries_handle_event((struct rtas_error_log *)logdata);
 
} while(error == 0);
 }

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH v2 1/11] Expose pseries devicetree_update()

2013-03-25 Thread Nathan Fontenot
From: Jesse Larrew 

Newer firmware on Power systems can transparently reassign platform resources
(CPU and Memory) in use. For instance, if a processor or memory unit is
predicted to fail, the platform may transparently move the processing to an
equivalent unused processor or the memory state to an equivalent unused
memory unit. However, reassigning resources across NUMA boundaries may alter
the performance of the partition. When such reassignment is necessary, the
Platform Resource Reassignment Notification (PRRN) option provides a
mechanism to inform the Linux kernel of changes to the NUMA affinity of
its platform resources.

When rtasd receives a PRRN event, it needs to make a series of RTAS
calls (ibm,update-nodes and ibm,update-properties) to retrieve the
updated device tree information. These calls are already handled in the
pseries_devicetree_update() routine used in partition migration.

This patch simply exposes pseries_devicetree_update() so it can be
called by rtasd. pseries_devicetree_update() and supporting functions
are also modified to take a 32-bit 'scope' parameter. This parameter is
required by the ibm,update-nodes/ibm,update-properties RTAS calls, and
the appropriate value is contained within the RTAS event for PRRN
notifications. In pseries_devicetree_update() it was previously
hard-coded to 1, the scope value for partition migration.

Signed-off-by: Nathan Fontenot 
---
 arch/powerpc/include/asm/rtas.h   |1 +
 arch/powerpc/platforms/pseries/mobility.c |   21 -
 2 files changed, 13 insertions(+), 9 deletions(-)

Index: powerpc/arch/powerpc/include/asm/rtas.h
===
--- powerpc.orig/arch/powerpc/include/asm/rtas.h2013-03-20 
08:24:15.0 -0500
+++ powerpc/arch/powerpc/include/asm/rtas.h 2013-03-20 08:51:59.0 
-0500
@@ -276,6 +276,7 @@
const char *uname, int depth, void *data);
 
 extern void pSeries_log_error(char *buf, unsigned int err_type, int fatal);
+extern int pseries_devicetree_update(s32 scope);
 
 #ifdef CONFIG_PPC_RTAS_DAEMON
 extern void rtas_cancel_event_scan(void);
Index: powerpc/arch/powerpc/platforms/pseries/mobility.c
===
--- powerpc.orig/arch/powerpc/platforms/pseries/mobility.c  2013-03-20 
08:24:15.0 -0500
+++ powerpc/arch/powerpc/platforms/pseries/mobility.c   2013-03-20 
08:51:59.0 -0500
@@ -37,14 +37,16 @@
 #define UPDATE_DT_NODE 0x0200
 #define ADD_DT_NODE0x0300
 
-static int mobility_rtas_call(int token, char *buf)
+#define MIGRATION_SCOPE(1)
+
+static int mobility_rtas_call(int token, char *buf, s32 scope)
 {
int rc;
 
spin_lock(&rtas_data_buf_lock);
 
memcpy(rtas_data_buf, buf, RTAS_DATA_BUF_SIZE);
-   rc = rtas_call(token, 2, 1, NULL, rtas_data_buf, 1);
+   rc = rtas_call(token, 2, 1, NULL, rtas_data_buf, scope);
memcpy(buf, rtas_data_buf, RTAS_DATA_BUF_SIZE);
 
spin_unlock(&rtas_data_buf_lock);
@@ -123,7 +125,7 @@
return 0;
 }
 
-static int update_dt_node(u32 phandle)
+static int update_dt_node(u32 phandle, s32 scope)
 {
struct update_props_workarea *upwa;
struct device_node *dn;
@@ -151,7 +153,8 @@
upwa->phandle = phandle;
 
do {
-   rc = mobility_rtas_call(update_properties_token, rtas_buf);
+   rc = mobility_rtas_call(update_properties_token, rtas_buf,
+   scope);
if (rc < 0)
break;
 
@@ -219,7 +222,7 @@
return rc;
 }
 
-static int pseries_devicetree_update(void)
+int pseries_devicetree_update(s32 scope)
 {
char *rtas_buf;
u32 *data;
@@ -235,7 +238,7 @@
return -ENOMEM;
 
do {
-   rc = mobility_rtas_call(update_nodes_token, rtas_buf);
+   rc = mobility_rtas_call(update_nodes_token, rtas_buf, scope);
if (rc && rc != 1)
break;
 
@@ -256,7 +259,7 @@
delete_dt_node(phandle);
break;
case UPDATE_DT_NODE:
-   update_dt_node(phandle);
+   update_dt_node(phandle, scope);
break;
case ADD_DT_NODE:
drc_index = *data++;
@@ -276,7 +279,7 @@
int rc;
int activate_fw_token;
 
-   rc = pseries_devicetree_update();
+   rc = pseries_devicetree_update(MIGRATION_SCOPE);
if (rc) {
printk(KERN_ERR "Initial post-mobility device tree update "
   "failed: %d\n", rc);
@@ -292,7 +295,7 @@
 
rc = rtas_call(activate_fw_token

[PATCH v2 0/11] NUMA CPU Reconfiguration using PRRN

2013-03-25 Thread Nathan Fontenot
Newer firmware on Power systems can transparently reassign platform resources
(CPU and Memory) in use. For instance, if a processor or memory unit is
predicted to fail, the platform may transparently move the processing to an
equivalent unused processor or the memory state to an equivalent unused
memory unit. However, reassigning resources across NUMA boundaries may alter
the performance of the partition. When such reassignment is necessary, the
Platform Resource Reassignment Notification (PRRN) option provides a
mechanism to inform the Linux kernel of changes to the NUMA affinity of
its platform resources.

PRRN Events are RTAS events sent up through the event-scan mechanism on
Power. When these events are received the system needs to get the updated
device tree affinity information for the affected CPUs/memory via the
rtas update-nodes and update-properties calls. This information is then
used to update the NUMA affinity of the CPUs/Memory in the kernel.

This patch set adds the ability to recognize PRRN events, update the device
tree and kernel information for CPUs (memory will be handled in a later
patch), and add an interface to enable/disable topology updates from /proc.

Additionally, these updates solve an existing problem with the VPHN (Virtual
Processor Home Node) capability and allow us to re-enable this feature.

Nathan Fontenot

Updates for Version 2 of this patchset

- Merged the functionality of platform_has_feature into the existing
  firmware_has_feature routine.
- Corrected the new way certain bits in the architecture vector are
  defined based on config options.
---

 arch/powerpc/include/asm/firmware.h   |3 
 arch/powerpc/include/asm/prom.h   |   46 ++---
 arch/powerpc/include/asm/rtas.h   |2 
 arch/powerpc/kernel/prom_init.c   |   98 ++-
 arch/powerpc/kernel/rtasd.c   |   35 
 arch/powerpc/mm/numa.c|  183 ++
 arch/powerpc/platforms/pseries/firmware.c |1 
 powerpc/arch/powerpc/include/asm/firmware.h   |4 
 powerpc/arch/powerpc/include/asm/prom.h   |   73 
 powerpc/arch/powerpc/include/asm/rtas.h   |1 
 powerpc/arch/powerpc/include/asm/topology.h   |5 
 powerpc/arch/powerpc/kernel/prom_init.c   |2 
 powerpc/arch/powerpc/kernel/rtasd.c   |6 
 powerpc/arch/powerpc/mm/numa.c|   62 +++
 powerpc/arch/powerpc/platforms/pseries/firmware.c |   67 +++-
 powerpc/arch/powerpc/platforms/pseries/mobility.c |   21 +-
 powerpc/arch/powerpc/platforms/pseries/pseries.h  |5 
 powerpc/arch/powerpc/platforms/pseries/setup.c|   40 +++-
 18 files changed, 455 insertions(+), 199 deletions(-)

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 4/11] Add platform_has_feature()

2013-03-19 Thread Nathan Fontenot
On 03/14/2013 08:42 AM, Michael Ellerman wrote:
> On Fri, Mar 08, 2013 at 10:02:31PM -0600, Nathan Fontenot wrote:
>> The firmware_has_feature() function makes it easy to check for supported
>> features of the hardware. There is not corresponding function to check for
>> features supported by the client architecture.
> 
> Actually it doesn't tell you about features of the hardware, it tells
> you about features of the firmware, or the platform ..
> 
> So I think you should really just be adding a new firmware feature flag,
> and adding whatever glue code is required to set it based on what you
> find in the device tree.
> 
> Also notice where you end up using it:
> 
> -   if (firmware_has_feature(FW_FEATURE_OPAL))
> +   if (firmware_has_feature(FW_FEATURE_OPAL) ||
> +   platform_has_feature(OV5_TYPE1_AFFINITY)) {
> +   dbg("Using form 1 affinity\n");
>   form1_affinity = 1;
> 
> Could be:
> 
> +   if (firmware_has_feature(FW_FEATURE_FORM1_AFFINITY) ||
> 

To make sure I understand what you're suggesting...

You think there should be a single firmware_has_feature() for all current
uses and also for checking items such as FORM1_AFFINITY and PRRN 
features as reported by the device tree for vector 5 portions of the
client architecture bits. I think this could be done by checking the
device tree ibm,architecture-vec-5 node for a specified feature and
setting the appropriate bit in powerpc_firmware_features.

I like this more than separate firmware_has_feature() and platform_has_feature()
routines to check.

-- 
-Nathan

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 4/11] Add platform_has_feature()

2013-03-19 Thread Nathan Fontenot
On 03/14/2013 03:59 AM, Paul Mackerras wrote:
> On Fri, Mar 08, 2013 at 10:02:31PM -0600, Nathan Fontenot wrote:
>> This patch adds a platform_has_feature() function to check features selected
>> by firmware and reported via the device tree 'ibm,architecture-vec5'
>> property. As part of this the #defines used for the architecture vector are
>> moved to prom.h and re-defined such that the vector 5 options have the vector
>> index and the feature bits encoded into them. This allows for callers of
>> platform_has_feature() to pass in a single pre-defined value.
> 
> One other comment...
> 
>> +#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV)
>> +bool platform_has_feature(unsigned int feature)
>> +{
>> +struct device_node *chosen;
>> +const char *vec5;
>> +bool has_option;
>> +
>> +chosen = of_find_node_by_path("/chosen");
>> +if (!chosen)
>> +return false;
>> +
>> +vec5 = of_get_property(chosen, "ibm,architecture-vec-5", NULL);
>> +has_option = vec5 && (vec5[OV5_INDX(feature)] & OV5_FEAT(feature));
> 
> You access vec5[index] without checking that the vector is at least
> index+1 bytes long, according to either the length byte at the
> beginning of the vector, or the total length of the property.
> Checking both would be a good idea.
> 

Yep. Thanks for letting me know.

-- 
-Nathan

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 4/11] Add platform_has_feature()

2013-03-19 Thread Nathan Fontenot
On 03/14/2013 03:56 AM, Paul Mackerras wrote:
> On Fri, Mar 08, 2013 at 10:02:31PM -0600, Nathan Fontenot wrote:
>> The firmware_has_feature() function makes it easy to check for supported
>> features of the hardware. There is not corresponding function to check for
>> features supported by the client architecture.
> 
> Actually, firmware_has_feature checks for supported features of the
> hypervisor, or in a sense the platform, rather than hardware.

Ahh, thanks for clarifying that for me. I'll update the description.

> 
>> This patch adds a platform_has_feature() function to check features selected
>> by firmware and reported via the device tree 'ibm,architecture-vec5'
>> property. As part of this the #defines used for the architecture vector are
>> moved to prom.h and re-defined such that the vector 5 options have the vector
>> index and the feature bits encoded into them. This allows for callers of
>> platform_has_feature() to pass in a single pre-defined value.
> 
> One other comment below...
> 
>>  /* PCIe/MSI support.  Without MSI full PCIe is not supported */
>>  #ifdef CONFIG_PCI_MSI
>> -#define OV5_MSI 0x01/* PCIe/MSI support */
>> +#define OV5_MSI 0x0201  /* PCIe/MSI support */
>>  #else
>> -#define OV5_MSI 0x00
>> +#define OV5_MSI 0x0200
>>  #endif /* CONFIG_PCI_MSI */
> 
> The #ifdef was done this way in order to control what ended up in the
> option vector we pass to the platform firmware.  For checking what the
> platform supports, wouldn't we want OV5_MSI to be 0x0201 always?
> Similarly for OV5_CMO, OV5_XCMO, etc.?

Yes, you're correct. I will update this.

-- 
-Nathan

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH2/11] Add PRRN Event Handler

2013-03-19 Thread Nathan Fontenot
On 03/14/2013 03:51 AM, Paul Mackerras wrote:
> On Fri, Mar 08, 2013 at 10:00:09PM -0600, Nathan Fontenot wrote:
>> From: Jesse Larrew 
>>
>> A PRRN event is signaled via the RTAS event-scan mechanism, which
>> returns a Hot Plug Event message "fixed part" indicating "Platform
>> Resource Reassignment". In response to the Hot Plug Event message,
>> we must call ibm,update-nodes to determine which resources were
>> reassigned and then ibm,update-properties to obtain the new affinity
>> information about those resources.
>>
>> The PRRN event-scan RTAS message contains only the "fixed part" with
>> the "Type" field set to the value 160 and no Extended Event Log. The
>> four-byte Extended Event Log Length field is repurposed (since no
>> Extended Event Log message is included) to pass the "scope" parameter
>> that causes the ibm,update-nodes to return the nodes affected by the
>> specific resource reassignment.
>>
>> This patch adds a handler in rtasd for PRRN RTAS events. The function
>> pseries_devicetree_update() (from mobility.c) is used to make the
>> ibm,update-nodes/ibm,update-properties RTAS calls. Updating the NUMA maps
>> (handled by a subsequent patch) will require significant processing,
>> so pseries_devicetree_update() is called from an asynchronous workqueue
>> to allow rtasd to continue processing events.
>>
>> Signed-off-by: Nathan Fontenot 
> 
> [snip]
> 
>> +static s32 update_scope;
> 
> Do we have a guarantee that there can only be one of these events
> outstanding at a time?  If so it would be nice to document that in a
> comment next to this declaration, so we know in future that this is
> why this is safe.
> 

We only allow for one event to be outstanding. When a PRRN Event is
received we flush any work currently queued up and add the new event
to the workqueue (see prrn_schedule_update() from the patch).

As I understand flush_work(), this would wait for any work in flight
to complete, then remove all work before returning. I'll add a comment
and update the patch description.

-Nathan

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 11/11] Add /proc interface to control topology updates

2013-03-08 Thread Nathan Fontenot
There are instances in which we do not want topology updates to occur.
In order to allow this a /proc interface (/proc/powerpc/topology_updates)
is introduced so that topology updates can be enabled and disabled.

This patch also adds a prrn_is_enabled() call so that PRRN events are
handled in the kernel only if topology updating is enabled.

Signed-off-by: Nathan Fontenot 
---
 arch/powerpc/include/asm/topology.h |5 ++
 arch/powerpc/kernel/rtasd.c |6 ++-
 arch/powerpc/mm/numa.c  |   62 +++-
 3 files changed, 70 insertions(+), 3 deletions(-)

Index: powerpc/arch/powerpc/mm/numa.c
===
--- powerpc.orig/arch/powerpc/mm/numa.c 2013-03-08 19:58:09.0 -0600
+++ powerpc/arch/powerpc/mm/numa.c  2013-03-08 19:58:37.0 -0600
@@ -23,6 +23,9 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
 #include 
 #include 
 #include 
@@ -1558,7 +1561,6 @@
 
return rc;
 }
-__initcall(start_topology_update);
 
 /*
  * Disable polling for VPHN associativity changes.
@@ -1577,4 +1579,62 @@
 
return rc;
 }
+
+inline int prrn_is_enabled(void)
+{
+   return prrn_enabled;
+}
+
+static int topology_read(struct seq_file *file, void *v)
+{
+   if (vphn_enabled || prrn_enabled)
+   seq_puts(file, "on\n");
+   else
+   seq_puts(file, "off\n");
+
+   return 0;
+}
+
+static int topology_open(struct inode *inode, struct file *file)
+{
+   return single_open(file, topology_read, NULL);
+}
+
+static ssize_t topology_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *off)
+{
+   char kbuf[4]; /* "on" or "off" plus null. */
+   int read_len;
+
+   read_len = count < 3 ? count : 3;
+   if (copy_from_user(kbuf, buf, read_len))
+   return -EINVAL;
+
+   kbuf[read_len] = '\0';
+
+   if (!strncmp(kbuf, "on", 2))
+   start_topology_update();
+   else if (!strncmp(kbuf, "off", 3))
+   stop_topology_update();
+   else
+   return -EINVAL;
+
+   return count;
+}
+
+static const struct file_operations topology_ops = {
+   .read = seq_read,
+   .write = topology_write,
+   .open = topology_open,
+   .release = single_release
+};
+
+static int topology_update_init(void)
+{
+   start_topology_update();
+   proc_create("powerpc/topology_updates", 644, NULL, &topology_ops);
+
+   return 0;
+}
+device_initcall(topology_update_init);
 #endif /* CONFIG_PPC_SPLPAR */
Index: powerpc/arch/powerpc/include/asm/topology.h
===
--- powerpc.orig/arch/powerpc/include/asm/topology.h2013-03-08 
19:23:06.0 -0600
+++ powerpc/arch/powerpc/include/asm/topology.h 2013-03-08 19:58:37.0 
-0600
@@ -71,6 +71,7 @@
 #if defined(CONFIG_NUMA) && defined(CONFIG_PPC_SPLPAR)
 extern int start_topology_update(void);
 extern int stop_topology_update(void);
+extern inline int prrn_is_enabled(void);
 #else
 static inline int start_topology_update(void)
 {
@@ -80,6 +81,10 @@
 {
return 0;
 }
+static inline int prrn_is_enabled(void)
+{
+   return 0;
+}
 #endif /* CONFIG_NUMA && CONFIG_PPC_SPLPAR */
 
 #include 
Index: powerpc/arch/powerpc/kernel/rtasd.c
===
--- powerpc.orig/arch/powerpc/kernel/rtasd.c2013-03-08 19:56:48.0 
-0600
+++ powerpc/arch/powerpc/kernel/rtasd.c 2013-03-08 19:58:37.0 -0600
@@ -292,11 +292,13 @@
 {
pSeries_log_error((char *)log, ERR_TYPE_RTAS_LOG, 0);
 
-   if (log->type == RTAS_TYPE_PRRN)
+   if (log->type == RTAS_TYPE_PRRN) {
/* For PRRN Events the extended log length is used to denote
 * the scope for calling rtas update-nodes.
 */
-   prrn_schedule_update(log->extended_log_length);
+   if (prrn_is_enabled())
+   prrn_schedule_update(log->extended_log_length);
+   }
 
return;
 }

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 10/11] Enable PRRN

2013-03-08 Thread Nathan Fontenot
The Linux kernel and platform firmware negotiate their mutual support
of the PRRN option via the ibm,client-architecture-support interface.
This patch simply sets the appropriate fields in the client architecture
vector to indicate Linux support and will cause the firmware to begin
sending PRRN events via the RTAS event-scan mechanism.

Signed-off-by: Nathan Fontenot 
---
 arch/powerpc/kernel/prom_init.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

Index: powerpc/arch/powerpc/kernel/prom_init.c
===
--- powerpc.orig/arch/powerpc/kernel/prom_init.c2013-03-08 
19:57:14.0 -0600
+++ powerpc/arch/powerpc/kernel/prom_init.c 2013-03-08 19:58:18.0 
-0600
@@ -689,7 +689,7 @@
OV5_FEAT(OV5_MSI),
0,
OV5_FEAT(OV5_CMO) | OV5_FEAT(OV5_XCMO),
-   OV5_FEAT(OV5_TYPE1_AFFINITY),
+   OV5_FEAT(OV5_TYPE1_AFFINITY) | OV5_FEAT(OV5_PRRN),
0,
0,
0,

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 9/11] Re-enable Virtual Processor Home Node capabilities

2013-03-08 Thread Nathan Fontenot
From: Jesse Larrew 

The new PRRN firmware feature provides a more convenient and event-driven
interface than VPHN for notifying Linux of changes to the NUMA affinity of
platform resources. However, for practical reasons, it may not be feasible
for some customers to update to the latest firmware. For these customers,
the VPHN feature supported on previous firmware versions may still be the
best option.

The VPHN feature was previously disabled due to races with the load
balancing code when accessing the NUMA cpu maps, but the new stop_machine()
approach protects the NUMA cpu maps from these concurrent accesses. It
should be safe to re-enable this feature now.

Signed-off-by: Nathan Fontenot 
---
 arch/powerpc/mm/numa.c |3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

Index: powerpc/arch/powerpc/mm/numa.c
===
--- powerpc.orig/arch/powerpc/mm/numa.c 2013-03-08 19:57:59.0 -0600
+++ powerpc/arch/powerpc/mm/numa.c  2013-03-08 19:58:09.0 -0600
@@ -1545,9 +1545,8 @@
vphn_enabled = 0;
rc = of_reconfig_notifier_register(&dt_update_nb);
}
-   } else if (0 && firmware_has_feature(FW_FEATURE_VPHN) &&
+   } else if (firmware_has_feature(FW_FEATURE_VPHN) &&
   get_lppaca()->shared_proc) {
-   /* Disabled until races with load balancing are fixed */
if (!vphn_enabled) {
prrn_enabled = 0;
vphn_enabled = 1;

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 8/11] Update numa cpu vdso info

2013-03-08 Thread Nathan Fontenot
From: Jesse Larrew 

The following patch adds vdso_getcpu_init(), which stores the NUMA node for
a cpu in SPRG3:

http://patchwork.ozlabs.org/patch/169070/

This patch ensures that this information is also updated when the NUMA
affinity of a cpu changes.

Signed-off-by: Nathan Fontenot 
---
 arch/powerpc/mm/numa.c |8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

Index: powerpc/arch/powerpc/mm/numa.c
===
--- powerpc.orig/arch/powerpc/mm/numa.c 2013-03-08 19:57:47.0 -0600
+++ powerpc/arch/powerpc/mm/numa.c  2013-03-08 19:57:59.0 -0600
@@ -30,6 +30,7 @@
 #include 
 #include 
 #include 
+#include 
 
 static int numa_enabled = 1;
 
@@ -1426,6 +1427,7 @@
unregister_cpu_under_node(update->cpu, update->old_nid);
unmap_cpu_from_node(update->cpu);
map_cpu_to_node(update->cpu, update->new_nid);
+   vdso_getcpu_init();
register_cpu_under_node(update->cpu, update->new_nid);
 
return 0;
@@ -1440,8 +1442,11 @@
int cpu, changed = 0;
struct topology_update_data update;
unsigned int associativity[VPHN_ASSOC_BUFSIZE] = {0};
+   cpumask_t updated_cpu;
struct device *dev;
 
+   cpumask_clear(&updated_cpu);
+
for_each_cpu(cpu, &cpu_associativity_changes_mask) {
update.cpu = cpu;
vphn_get_associativity(cpu, associativity);
@@ -1451,7 +1456,8 @@
update.new_nid = first_online_node;
 
update.old_nid = numa_cpu_lookup_table[cpu];
-   stop_machine(update_cpu_topology, &update, cpu_online_mask);
+   cpumask_set_cpu(cpu, &updated_cpu);
+   stop_machine(update_cpu_topology, &update, &updated_cpu);
dev = get_cpu_device(cpu);
if (dev)
kobject_uevent(&dev->kobj, KOBJ_CHANGE);

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 7/11] Use stop machine to update cpu maps

2013-03-08 Thread Nathan Fontenot
From: Jesse Larrew 

The new PRRN firmware feature allows CPU and memory resources to be
transparently reassigned across NUMA boundaries. When this happens, the
kernel must update the node maps to reflect the new affinity
information.

Although the NUMA maps can be protected by locking primitives during the
update itself, this is insufficient to prevent concurrent accesses to these
structures. Since cpumask_of_node() hands out a pointer to these
structures, they can still be modified outside of the lock. Furthermore,
tracking down each usage of these pointers and adding locks would be quite
invasive and difficult to maintain.

Situations like these are best handled using stop_machine(). Since the NUMA
affinity updates are exceptionally rare events, this approach has the
benefit of not adding any overhead while accessing the NUMA maps during
normal operation.

Signed-off-by: Nathan Fontenot 
---
 arch/powerpc/mm/numa.c |   51 +
 1 file changed, 35 insertions(+), 16 deletions(-)

Index: powerpc/arch/powerpc/mm/numa.c
===
--- powerpc.orig/arch/powerpc/mm/numa.c 2013-03-08 19:57:38.0 -0600
+++ powerpc/arch/powerpc/mm/numa.c  2013-03-08 19:57:47.0 -0600
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -1254,6 +1255,12 @@
 
 /* Virtual Processor Home Node (VPHN) support */
 #ifdef CONFIG_PPC_SPLPAR
+struct topology_update_data {
+   int cpu;
+   int old_nid;
+   int new_nid;
+};
+
 static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS];
 static cpumask_t cpu_associativity_changes_mask;
 static int vphn_enabled;
@@ -1405,34 +1412,46 @@
 }
 
 /*
+ * Update the CPU maps and sysfs entries for a single CPU when its NUMA
+ * characteristics change. This function doesn't perform any locking and is
+ * only safe to call from stop_machine().
+ */
+static int update_cpu_topology(void *data)
+{
+   struct topology_update_data *update = data;
+
+   if (!update)
+   return -EINVAL;
+
+   unregister_cpu_under_node(update->cpu, update->old_nid);
+   unmap_cpu_from_node(update->cpu);
+   map_cpu_to_node(update->cpu, update->new_nid);
+   register_cpu_under_node(update->cpu, update->new_nid);
+
+   return 0;
+}
+
+/*
  * Update the node maps and sysfs entries for each cpu whose home node
  * has changed. Returns 1 when the topology has changed, and 0 otherwise.
  */
 int arch_update_cpu_topology(void)
 {
-   int cpu, nid, old_nid, changed = 0;
+   int cpu, changed = 0;
+   struct topology_update_data update;
unsigned int associativity[VPHN_ASSOC_BUFSIZE] = {0};
struct device *dev;
 
for_each_cpu(cpu, &cpu_associativity_changes_mask) {
+   update.cpu = cpu;
vphn_get_associativity(cpu, associativity);
-   nid = associativity_to_nid(associativity);
-
-   if (nid < 0 || !node_online(nid))
-   nid = first_online_node;
+   update.new_nid = associativity_to_nid(associativity);
 
-   old_nid = numa_cpu_lookup_table[cpu];
-
-   /* Disable hotplug while we update the cpu
-* masks and sysfs.
-*/
-   get_online_cpus();
-   unregister_cpu_under_node(cpu, old_nid);
-   unmap_cpu_from_node(cpu);
-   map_cpu_to_node(cpu, nid);
-   register_cpu_under_node(cpu, nid);
-   put_online_cpus();
+   if (update.new_nid < 0 || !node_online(update.new_nid))
+   update.new_nid = first_online_node;
 
+   update.old_nid = numa_cpu_lookup_table[cpu];
+   stop_machine(update_cpu_topology, &update, cpu_online_mask);
dev = get_cpu_device(cpu);
if (dev)
kobject_uevent(&dev->kobj, KOBJ_CHANGE);

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 6/11] Update CPU maps

2013-03-08 Thread Nathan Fontenot
From: Jesse Larrew 

Platform events such as partition migration or the new PRRN firmware
feature can cause the NUMA characteristics of a CPU to change, and these
changes will be reflected in the device tree nodes for the affected
CPUs.

This patch registers a handler for Open Firmware device tree updates
and reconfigures the CPU and node maps whenever the associativity
changes. Currently, this is accomplished by marking the affected CPUs in
the cpu_associativity_changes_mask and allowing
arch_update_cpu_topology() to retrieve the new associativity information
using hcall_vphn().

Protecting the NUMA cpu maps from concurrent access during an update
operation will be addressed in a subsequent patch in this series.

Signed-off-by: Nathan Fontenot 
---

 arch/powerpc/include/asm/prom.h |1 
 arch/powerpc/mm/numa.c  |   99 ++--
 2 files changed, 76 insertions(+), 24 deletions(-)

Index: powerpc/arch/powerpc/include/asm/prom.h
===
--- powerpc.orig/arch/powerpc/include/asm/prom.h2013-03-08 
19:57:14.0 -0600
+++ powerpc/arch/powerpc/include/asm/prom.h 2013-03-08 19:57:38.0 
-0600
@@ -138,6 +138,7 @@
 #define OV5_XCMO   0x0400
 #endif
 #define OV5_TYPE1_AFFINITY 0x0580  /* Type 1 NUMA affinity */
+#define OV5_PRRN   0x0540  /* Platform Resource Reassignment */
 #define OV5_PFO_HW_RNG 0x0E80  /* PFO Random Number Generator */
 #define OV5_PFO_HW_842 0x0E40  /* PFO Compression Accelerator */
 #define OV5_PFO_HW_ENCR0x0E20  /* PFO Encryption Accelerator */
Index: powerpc/arch/powerpc/mm/numa.c
===
--- powerpc.orig/arch/powerpc/mm/numa.c 2013-03-08 19:57:27.0 -0600
+++ powerpc/arch/powerpc/mm/numa.c  2013-03-08 19:57:38.0 -0600
@@ -1257,7 +1257,8 @@
 static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS];
 static cpumask_t cpu_associativity_changes_mask;
 static int vphn_enabled;
-static void set_topology_timer(void);
+static int prrn_enabled;
+static void reset_topology_timer(void);
 
 /*
  * Store the current values of the associativity change counters in the
@@ -1293,11 +1294,9 @@
  */
 static int update_cpu_associativity_changes_mask(void)
 {
-   int cpu, nr_cpus = 0;
+   int cpu;
cpumask_t *changes = &cpu_associativity_changes_mask;
 
-   cpumask_clear(changes);
-
for_each_possible_cpu(cpu) {
int i, changed = 0;
u8 *counts = vphn_cpu_change_counts[cpu];
@@ -1311,11 +1310,10 @@
}
if (changed) {
cpumask_set_cpu(cpu, changes);
-   nr_cpus++;
}
}
 
-   return nr_cpus;
+   return cpumask_weight(changes);
 }
 
 /*
@@ -1416,7 +1414,7 @@
unsigned int associativity[VPHN_ASSOC_BUFSIZE] = {0};
struct device *dev;
 
-   for_each_cpu(cpu,&cpu_associativity_changes_mask) {
+   for_each_cpu(cpu, &cpu_associativity_changes_mask) {
vphn_get_associativity(cpu, associativity);
nid = associativity_to_nid(associativity);
 
@@ -1438,6 +1436,7 @@
dev = get_cpu_device(cpu);
if (dev)
kobject_uevent(&dev->kobj, KOBJ_CHANGE);
+   cpumask_clear_cpu(cpu, &cpu_associativity_changes_mask);
changed = 1;
}
 
@@ -1457,37 +1456,80 @@
 
 static void topology_timer_fn(unsigned long ignored)
 {
-   if (!vphn_enabled)
-   return;
-   if (update_cpu_associativity_changes_mask() > 0)
+   if (prrn_enabled && cpumask_weight(&cpu_associativity_changes_mask))
topology_schedule_update();
-   set_topology_timer();
+   else if (vphn_enabled) {
+   if (update_cpu_associativity_changes_mask() > 0)
+   topology_schedule_update();
+   reset_topology_timer();
+   }
 }
 static struct timer_list topology_timer =
TIMER_INITIALIZER(topology_timer_fn, 0, 0);
 
-static void set_topology_timer(void)
+static void reset_topology_timer(void)
 {
topology_timer.data = 0;
topology_timer.expires = jiffies + 60 * HZ;
-   add_timer(&topology_timer);
+   mod_timer(&topology_timer, topology_timer.expires);
+}
+
+static void stage_topology_update(int core_id)
+{
+   cpumask_or(&cpu_associativity_changes_mask,
+   &cpu_associativity_changes_mask, cpu_sibling_mask(core_id));
+   reset_topology_timer();
 }
 
+static int dt_update_callback(struct notifier_block *nb,
+   unsigned long action, void *data)
+{
+   struct of_prop_reconfig *update;
+   int rc = NOTIFY_DONE;
+
+   switch (action) {
+   case OF_RECONFIG_ADD_PROPERTY:
+   case OF_R

[PATCH 5/11] Update numa.c to use platform_has_feature()

2013-03-08 Thread Nathan Fontenot
Update the numa code to use the new platform_has_feature() when checking
for type 1 affinity.

Signed-off-by: Nathan Fontenot 
---
 arch/powerpc/mm/numa.c |   22 +++---
 1 file changed, 3 insertions(+), 19 deletions(-)

Index: powerpc/arch/powerpc/mm/numa.c
===
--- powerpc.orig/arch/powerpc/mm/numa.c 2013-03-08 19:23:06.0 -0600
+++ powerpc/arch/powerpc/mm/numa.c  2013-03-08 19:57:27.0 -0600
@@ -291,9 +291,7 @@
 static int __init find_min_common_depth(void)
 {
int depth;
-   struct device_node *chosen;
struct device_node *root;
-   const char *vec5;
 
if (firmware_has_feature(FW_FEATURE_OPAL))
root = of_find_node_by_path("/ibm,opal");
@@ -325,24 +323,10 @@
 
distance_ref_points_depth /= sizeof(int);
 
-#define VEC5_AFFINITY_BYTE 5
-#define VEC5_AFFINITY  0x80
-
-   if (firmware_has_feature(FW_FEATURE_OPAL))
+   if (firmware_has_feature(FW_FEATURE_OPAL) ||
+   platform_has_feature(OV5_TYPE1_AFFINITY)) {
+   dbg("Using form 1 affinity\n");
form1_affinity = 1;
-   else {
-   chosen = of_find_node_by_path("/chosen");
-   if (chosen) {
-   vec5 = of_get_property(chosen,
-  "ibm,architecture-vec-5", NULL);
-   if (vec5 && (vec5[VEC5_AFFINITY_BYTE] &
-   VEC5_AFFINITY)) {
-   dbg("Using form 1 affinity\n");
-   form1_affinity = 1;
-   }
-
-   of_node_put(chosen);
-   }
}
 
if (form1_affinity) {

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 4/11] Add platform_has_feature()

2013-03-08 Thread Nathan Fontenot
The firmware_has_feature() function makes it easy to check for supported
features of the hardware. There is no corresponding function to check for
features supported by the client architecture.

This patch adds a platform_has_feature() function to check features selected
by firmware and reported via the device tree 'ibm,architecture-vec-5'
property. As part of this the #defines used for the architecture vector are
moved to prom.h and re-defined such that the vector 5 options have the vector
index and the feature bits encoded into them. This allows for callers of
platform_has_feature() to pass in a single pre-defined value.

Signed-off-by: Nathan Fontenot 
---
 arch/powerpc/include/asm/prom.h |   41 +++-
 arch/powerpc/kernel/prom.c  |   19 ++
 arch/powerpc/kernel/prom_init.c |   14 +++--
 3 files changed, 51 insertions(+), 23 deletions(-)

Index: powerpc/arch/powerpc/include/asm/prom.h
===
--- powerpc.orig/arch/powerpc/include/asm/prom.h2013-03-08 
19:57:05.0 -0600
+++ powerpc/arch/powerpc/include/asm/prom.h 2013-03-08 19:57:14.0 
-0600
@@ -111,31 +111,37 @@
 /* Option vector 4: IBM PAPR implementation */
 #define OV4_MIN_ENT_CAP0x01/* minimum VP entitled capacity 
*/
 
-/* Option vector 5: PAPR/OF options supported */
-#define OV5_LPAR   0x80/* logical partitioning supported */
-#define OV5_SPLPAR 0x40/* shared-processor LPAR supported */
+/* Option vector 5: PAPR/OF options supported
+ * These bits are also used for the platform_has_feature() call so
+ * we encode the vector index in the define and use the OV5_FEAT()
+ * and OV5_INDX() macros to extract the desired information.
+ */
+#define OV5_FEAT(x)((x) & 0xff)
+#define OV5_INDX(x)((x) >> 8)
+#define OV5_LPAR   0x0280  /* logical partitioning supported */
+#define OV5_SPLPAR 0x0240  /* shared-processor LPAR supported */
 /* ibm,dynamic-reconfiguration-memory property supported */
-#define OV5_DRCONF_MEMORY  0x20
-#define OV5_LARGE_PAGES0x10/* large pages supported */
-#define OV5_DONATE_DEDICATE_CPU0x02/* donate dedicated CPU support 
*/
+#define OV5_DRCONF_MEMORY  0x0220
+#define OV5_LARGE_PAGES0x0210  /* large pages supported */
+#define OV5_DONATE_DEDICATE_CPU0x0202  /* donate dedicated CPU support 
*/
 /* PCIe/MSI support.  Without MSI full PCIe is not supported */
 #ifdef CONFIG_PCI_MSI
-#define OV5_MSI0x01/* PCIe/MSI support */
+#define OV5_MSI0x0201  /* PCIe/MSI support */
 #else
-#define OV5_MSI0x00
+#define OV5_MSI0x0200
 #endif /* CONFIG_PCI_MSI */
 #ifdef CONFIG_PPC_SMLPAR
-#define OV5_CMO0x80/* Cooperative Memory 
Overcommitment */
-#define OV5_XCMO   0x40/* Page Coalescing */
+#define OV5_CMO0x0480  /* Cooperative Memory 
Overcommitment */
+#define OV5_XCMO   0x0440  /* Page Coalescing */
 #else
-#define OV5_CMO0x00
-#define OV5_XCMO   0x00
+#define OV5_CMO0x0400
+#define OV5_XCMO   0x0400
 #endif
-#define OV5_TYPE1_AFFINITY 0x80/* Type 1 NUMA affinity */
-#define OV5_PFO_HW_RNG 0x80/* PFO Random Number Generator */
-#define OV5_PFO_HW_842 0x40/* PFO Compression Accelerator */
-#define OV5_PFO_HW_ENCR0x20/* PFO Encryption Accelerator */
-#define OV5_SUB_PROCESSORS 0x01/* 1,2,or 4 Sub-Processors supported */
+#define OV5_TYPE1_AFFINITY 0x0580  /* Type 1 NUMA affinity */
+#define OV5_PFO_HW_RNG 0x0E80  /* PFO Random Number Generator */
+#define OV5_PFO_HW_842 0x0E40  /* PFO Compression Accelerator */
+#define OV5_PFO_HW_ENCR0x0E20  /* PFO Encryption Accelerator */
+#define OV5_SUB_PROCESSORS 0x0F01  /* 1,2,or 4 Sub-Processors supported */
 
 /* Option Vector 6: IBM PAPR hints */
 #define OV6_LINUX  0x02/* Linux is our OS */
@@ -145,6 +151,7 @@
  * followed by # option vectors - 1, followed by the option vectors.
  */
 extern unsigned char ibm_architecture_vec[];
+bool platform_has_feature(unsigned int);
 #endif
 
 /* These includes are put at the bottom because they may contain things
Index: powerpc/arch/powerpc/kernel/prom_init.c
===
--- powerpc.orig/arch/powerpc/kernel/prom_init.c2013-03-08 
19:57:05.0 -0600
+++ powerpc/arch/powerpc/kernel/prom_init.c 2013-03-08 19:57:14.0 
-0600
@@ -684,11 +684,12 @@
/* option vector 5: PAPR/OF options */
19 - 2, /* length */
0,  /* don't ignore, don&#

[PATCH 3/11] Move architecture vector definitions to prom.h

2013-03-08 Thread Nathan Fontenot
As part of handling PRRN events we will need to check the
vector 5 portion of the architecture bits reported in the device tree
to ensure that PRRN event handling is enabled. In order to do this a
new platform_has_feature call is introduced (in a subsequent patch) to
make this check.  To avoid having to re-define bits in the architecture
vector the bits are moved to prom.h.

This patch is the first step in implementing the platform_has_feature
call by simply moving the bit definitions from prom_init.c to asm/prom.h.
There are no functional changes.

Signed-off-by: Nathan Fontenot 

---
 arch/powerpc/include/asm/prom.h |   73 ++
 arch/powerpc/kernel/prom_init.c |   75 +++-
 2 files changed, 79 insertions(+), 69 deletions(-)

Index: powerpc/arch/powerpc/include/asm/prom.h
===
--- powerpc.orig/arch/powerpc/include/asm/prom.h2013-03-08 
19:23:06.0 -0600
+++ powerpc/arch/powerpc/include/asm/prom.h 2013-03-08 19:57:05.0 
-0600
@@ -74,6 +74,79 @@
 #define DRCONF_MEM_AI_INVALID  0x0040
 #define DRCONF_MEM_RESERVED0x0080
 
+#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV)
+/*
+ * There are two methods for telling firmware what our capabilities are.
+ * Newer machines have an "ibm,client-architecture-support" method on the
+ * root node.  For older machines, we have to call the "process-elf-header"
+ * method in the /packages/elf-loader node, passing it a fake 32-bit
+ * ELF header containing a couple of PT_NOTE sections that contain
+ * structures that contain various information.
+ */
+
+/* New method - extensible architecture description vector. */
+
+/* Option vector bits - generic bits in byte 1 */
+#define OV_IGNORE  0x80/* ignore this vector */
+#define OV_CESSATION_POLICY0x40/* halt if unsupported option present*/
+
+/* Option vector 1: processor architectures supported */
+#define OV1_PPC_2_00   0x80/* set if we support PowerPC 2.00 */
+#define OV1_PPC_2_01   0x40/* set if we support PowerPC 2.01 */
+#define OV1_PPC_2_02   0x20/* set if we support PowerPC 2.02 */
+#define OV1_PPC_2_03   0x10/* set if we support PowerPC 2.03 */
+#define OV1_PPC_2_04   0x08/* set if we support PowerPC 2.04 */
+#define OV1_PPC_2_05   0x04/* set if we support PowerPC 2.05 */
+#define OV1_PPC_2_06   0x02/* set if we support PowerPC 2.06 */
+#define OV1_PPC_2_07   0x01/* set if we support PowerPC 2.07 */
+
+/* Option vector 2: Open Firmware options supported */
+#define OV2_REAL_MODE  0x20/* set if we want OF in real mode */
+
+/* Option vector 3: processor options supported */
+#define OV3_FP 0x80/* floating point */
+#define OV3_VMX0x40/* VMX/Altivec */
+#define OV3_DFP0x20/* decimal FP */
+
+/* Option vector 4: IBM PAPR implementation */
+#define OV4_MIN_ENT_CAP0x01/* minimum VP entitled capacity 
*/
+
+/* Option vector 5: PAPR/OF options supported */
+#define OV5_LPAR   0x80/* logical partitioning supported */
+#define OV5_SPLPAR 0x40/* shared-processor LPAR supported */
+/* ibm,dynamic-reconfiguration-memory property supported */
+#define OV5_DRCONF_MEMORY  0x20
+#define OV5_LARGE_PAGES0x10/* large pages supported */
+#define OV5_DONATE_DEDICATE_CPU0x02/* donate dedicated CPU support 
*/
+/* PCIe/MSI support.  Without MSI full PCIe is not supported */
+#ifdef CONFIG_PCI_MSI
+#define OV5_MSI0x01/* PCIe/MSI support */
+#else
+#define OV5_MSI0x00
+#endif /* CONFIG_PCI_MSI */
+#ifdef CONFIG_PPC_SMLPAR
+#define OV5_CMO0x80/* Cooperative Memory 
Overcommitment */
+#define OV5_XCMO   0x40/* Page Coalescing */
+#else
+#define OV5_CMO0x00
+#define OV5_XCMO   0x00
+#endif
+#define OV5_TYPE1_AFFINITY 0x80/* Type 1 NUMA affinity */
+#define OV5_PFO_HW_RNG 0x80/* PFO Random Number Generator */
+#define OV5_PFO_HW_842 0x40/* PFO Compression Accelerator */
+#define OV5_PFO_HW_ENCR0x20/* PFO Encryption Accelerator */
+#define OV5_SUB_PROCESSORS 0x01/* 1,2,or 4 Sub-Processors supported */
+
+/* Option Vector 6: IBM PAPR hints */
+#define OV6_LINUX  0x02/* Linux is our OS */
+
+/*
+ * The architecture vector has an array of PVR mask/value pairs,
+ * followed by # option vectors - 1, followed by the option vectors.
+ */
+extern unsigned char ibm_architecture_vec[];
+#endif
+
 /* These includes are put at the bottom because they may contain things
  * that are overridden by this file.  Ideally they shouldn't be included
  * by this fi

[PATCH 2/11] Add PRRN Event Handler

2013-03-08 Thread Nathan Fontenot
From: Jesse Larrew 

A PRRN event is signaled via the RTAS event-scan mechanism, which
returns a Hot Plug Event message "fixed part" indicating "Platform
Resource Reassignment". In response to the Hot Plug Event message,
we must call ibm,update-nodes to determine which resources were
reassigned and then ibm,update-properties to obtain the new affinity
information about those resources.

The PRRN event-scan RTAS message contains only the "fixed part" with
the "Type" field set to the value 160 and no Extended Event Log. The
four-byte Extended Event Log Length field is repurposed (since no
Extended Event Log message is included) to pass the "scope" parameter
that causes the ibm,update-nodes to return the nodes affected by the
specific resource reassignment.

This patch adds a handler in rtasd for PRRN RTAS events. The function
pseries_devicetree_update() (from mobility.c) is used to make the
ibm,update-nodes/ibm,update-properties RTAS calls. Updating the NUMA maps
(handled by a subsequent patch) will require significant processing,
so pseries_devicetree_update() is called from an asynchronous workqueue
to allow rtasd to continue processing events.

Signed-off-by: Nathan Fontenot 
---
 arch/powerpc/include/asm/rtas.h |2 ++
 arch/powerpc/kernel/rtasd.c |   35 ++-
 2 files changed, 36 insertions(+), 1 deletion(-)

Index: powerpc/arch/powerpc/include/asm/rtas.h
===
--- powerpc.orig/arch/powerpc/include/asm/rtas.h2013-03-08 
19:56:13.0 -0600
+++ powerpc/arch/powerpc/include/asm/rtas.h 2013-03-08 19:56:48.0 
-0600
@@ -143,6 +143,8 @@
 #define RTAS_TYPE_PMGM_TIME_ALARM  0x6f
 #define RTAS_TYPE_PMGM_CONFIG_CHANGE   0x70
 #define RTAS_TYPE_PMGM_SERVICE_PROC0x71
+/* Platform Resource Reassignment Notification */
+#define RTAS_TYPE_PRRN 0xA0
 
 /* RTAS check-exception vector offset */
 #define RTAS_VECTOR_EXTERNAL_INTERRUPT 0x500
Index: powerpc/arch/powerpc/kernel/rtasd.c
===
--- powerpc.orig/arch/powerpc/kernel/rtasd.c2013-03-08 19:23:06.0 
-0600
+++ powerpc/arch/powerpc/kernel/rtasd.c 2013-03-08 19:56:48.0 -0600
@@ -87,6 +87,8 @@
return "Resource Deallocation Event";
case RTAS_TYPE_DUMP:
return "Dump Notification Event";
+   case RTAS_TYPE_PRRN:
+   return "Platform Resource Reassignment Event";
}
 
return rtas_type[0];
@@ -265,7 +267,38 @@
spin_unlock_irqrestore(&rtasd_log_lock, s);
return;
}
+}
+
+static s32 update_scope;
+
+static void prrn_work_fn(struct work_struct *work)
+{
+   /*
+* For PRRN, we must pass the negative of the scope value in
+* the RTAS event.
+*/
+   pseries_devicetree_update(-update_scope);
+}
+static DECLARE_WORK(prrn_work, prrn_work_fn);
+
+void prrn_schedule_update(u32 scope)
+{
+   flush_work(&prrn_work);
+   update_scope = scope;
+   schedule_work(&prrn_work);
+}
+
+static void pseries_handle_event(const struct rtas_error_log *log)
+{
+   pSeries_log_error((char *)log, ERR_TYPE_RTAS_LOG, 0);
+
+   if (log->type == RTAS_TYPE_PRRN)
+   /* For PRRN Events the extended log length is used to denote
+* the scope for calling rtas update-nodes.
+*/
+   prrn_schedule_update(log->extended_log_length);
 
+   return;
 }
 
 static int rtas_log_open(struct inode * inode, struct file * file)
@@ -389,7 +422,7 @@
}
 
if (error == 0)
-   pSeries_log_error(logdata, ERR_TYPE_RTAS_LOG, 0);
+   pseries_handle_event((struct rtas_error_log *)logdata);
 
} while(error == 0);
 }

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 1/11] Expose pseries devicetree_update()

2013-03-08 Thread Nathan Fontenot
From: Jesse Larrew 

When rtasd receives a PRRN event, it needs to make a series of RTAS
calls (ibm,update-nodes and ibm,update-properties) to retrieve the
updated device tree information. These calls are already handled in the
pseries_devicetree_update() routine used in partition migration.

This patch simply exposes pseries_devicetree_update() so it can be
called by rtasd. pseries_devicetree_update() and supporting functions
are also modified to take a 32-bit 'scope' parameter. This parameter is
required by the ibm,update-nodes/ibm,update-properties RTAS calls, and
the appropriate value is contained within the RTAS event for PRRN
notifications. In pseries_devicetree_update() it was previously
hard-coded to 1, the scope value for partition migration.

Signed-off-by: Nathan Fontenot 
---
 arch/powerpc/include/asm/rtas.h   |1 +
 arch/powerpc/platforms/pseries/mobility.c |   21 -
 2 files changed, 13 insertions(+), 9 deletions(-)

Index: powerpc/arch/powerpc/include/asm/rtas.h
===
--- powerpc.orig/arch/powerpc/include/asm/rtas.h2013-03-08 
19:23:06.0 -0600
+++ powerpc/arch/powerpc/include/asm/rtas.h 2013-03-08 19:56:13.0 
-0600
@@ -276,6 +276,7 @@
const char *uname, int depth, void *data);
 
 extern void pSeries_log_error(char *buf, unsigned int err_type, int fatal);
+extern int pseries_devicetree_update(s32 scope);
 
 #ifdef CONFIG_PPC_RTAS_DAEMON
 extern void rtas_cancel_event_scan(void);
Index: powerpc/arch/powerpc/platforms/pseries/mobility.c
===
--- powerpc.orig/arch/powerpc/platforms/pseries/mobility.c  2013-03-08 
19:23:07.0 -0600
+++ powerpc/arch/powerpc/platforms/pseries/mobility.c   2013-03-08 
19:56:13.0 -0600
@@ -37,14 +37,16 @@
 #define UPDATE_DT_NODE 0x0200
 #define ADD_DT_NODE0x0300
 
-static int mobility_rtas_call(int token, char *buf)
+#define MIGRATION_SCOPE(1)
+
+static int mobility_rtas_call(int token, char *buf, s32 scope)
 {
int rc;
 
spin_lock(&rtas_data_buf_lock);
 
memcpy(rtas_data_buf, buf, RTAS_DATA_BUF_SIZE);
-   rc = rtas_call(token, 2, 1, NULL, rtas_data_buf, 1);
+   rc = rtas_call(token, 2, 1, NULL, rtas_data_buf, scope);
memcpy(buf, rtas_data_buf, RTAS_DATA_BUF_SIZE);
 
spin_unlock(&rtas_data_buf_lock);
@@ -123,7 +125,7 @@
return 0;
 }
 
-static int update_dt_node(u32 phandle)
+static int update_dt_node(u32 phandle, s32 scope)
 {
struct update_props_workarea *upwa;
struct device_node *dn;
@@ -151,7 +153,8 @@
upwa->phandle = phandle;
 
do {
-   rc = mobility_rtas_call(update_properties_token, rtas_buf);
+   rc = mobility_rtas_call(update_properties_token, rtas_buf,
+   scope);
if (rc < 0)
break;
 
@@ -219,7 +222,7 @@
return rc;
 }
 
-static int pseries_devicetree_update(void)
+int pseries_devicetree_update(s32 scope)
 {
char *rtas_buf;
u32 *data;
@@ -235,7 +238,7 @@
return -ENOMEM;
 
do {
-   rc = mobility_rtas_call(update_nodes_token, rtas_buf);
+   rc = mobility_rtas_call(update_nodes_token, rtas_buf, scope);
if (rc && rc != 1)
break;
 
@@ -256,7 +259,7 @@
delete_dt_node(phandle);
break;
case UPDATE_DT_NODE:
-   update_dt_node(phandle);
+   update_dt_node(phandle, scope);
break;
case ADD_DT_NODE:
drc_index = *data++;
@@ -276,7 +279,7 @@
int rc;
int activate_fw_token;
 
-   rc = pseries_devicetree_update();
+   rc = pseries_devicetree_update(MIGRATION_SCOPE);
if (rc) {
printk(KERN_ERR "Initial post-mobility device tree update "
   "failed: %d\n", rc);
@@ -292,7 +295,7 @@
 
rc = rtas_call(activate_fw_token, 0, 1, NULL);
if (!rc) {
-   rc = pseries_devicetree_update();
+   rc = pseries_devicetree_update(MIGRATION_SCOPE);
if (rc)
printk(KERN_ERR "Secondary post-mobility device tree "
   "update failed: %d\n", rc);

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 0/11] NUMA CPU Reconfiguration using PRRN

2013-03-08 Thread Nathan Fontenot
Newer firmware on Power systems can transparently reassign platform resources
(CPU and Memory) in use. For instance, if a processor or memory unit is
predicted to fail, the platform may transparently move the processing to an
equivalent unused processor or the memory state to an equivalent unused
memory unit. However, reassigning resources across NUMA boundaries may alter
the performance of the partition. When such reassignment is necessary, the
Platform Resource Reassignment Notification (PRRN) option provides a
mechanism to inform the Linux kernel of changes to the NUMA affinity of
its platform resources.

PRRN Events are RTAS events sent up through the event-scan mechanism on
Power. When these events are received the system needs to get the updated
device tree affinity information for the affected CPUs/memory via the
rtas update-nodes and update-properties calls. This information is then
used to update the NUMA affinity of the CPUs/Memory in the kernel.

This patch set adds the ability to recognize PRRN events, update the device
tree and kernel information for CPUs (memory will be handled in a later
patch), and add an interface to enable/disable topology updates from /proc.

Additionally, these updates solve an existing problem with the VPHN (Virtual
Processor Home Node) capability and allow us to re-enable this feature.

Nathan Fontenot
---

 arch/powerpc/include/asm/prom.h   |   42 +++--
 arch/powerpc/include/asm/rtas.h   |2 
 arch/powerpc/kernel/prom_init.c   |   89 +-
 arch/powerpc/kernel/rtasd.c   |   35 
 arch/powerpc/mm/numa.c|  183 ++
 powerpc/arch/powerpc/include/asm/prom.h   |   73 
 powerpc/arch/powerpc/include/asm/rtas.h   |1 
 powerpc/arch/powerpc/include/asm/topology.h   |5 
 powerpc/arch/powerpc/kernel/prom.c|   19 ++
 powerpc/arch/powerpc/kernel/prom_init.c   |2 
 powerpc/arch/powerpc/kernel/rtasd.c   |6 
 powerpc/arch/powerpc/mm/numa.c|   62 +++
 powerpc/arch/powerpc/platforms/pseries/mobility.c |   21 +-
 13 files changed, 372 insertions(+), 168 deletions(-)

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH] lsprop: Fixes to work correctly when built little endian

2013-01-07 Thread Nathan Fontenot
On 01/07/2013 08:12 PM, Benjamin Herrenschmidt wrote:
> On Mon, 2013-01-07 at 15:23 +1100, Michael Ellerman wrote:
>> Add and use dt_swap_int() to byte swap on little endian.
>>
>> Also declare buf as unsigned char, so that we don't sign extend when
>> printing values from it.
>>
>> Signed-off-by: Michael Ellerman 
>> ---
>>
>> Ben, based on your patch, can you add your s-o-b? :
>>   https://lists.ozlabs.org/pipermail/linuxppc-dev/2008-May/056088.html
> 
> I didn't know powerpc-utils required sob's :-)

Not technically, it's more a CYA thing. It (hopefully) keeps big blue legal
happy, which keeps me happy.

-Nathan

> 
> Signed-off-by: Benjamin Herrenschmidt 
> 
>> ---
>>  src/lsprop.c |   17 ++---
>>  1 file changed, 14 insertions(+), 3 deletions(-)
>>
>> diff --git a/src/lsprop.c b/src/lsprop.c
>> index 5969a97..38a8fa5 100644
>> --- a/src/lsprop.c
>> +++ b/src/lsprop.c
>> @@ -13,11 +13,22 @@
>>  #include 
>>  #include 
>>  #include 
>> +#include 
>> +#include 
>> +
>> +static inline unsigned int dt_swap_int(unsigned int data)
>> +{
>> +#if __BYTE_ORDER == __LITTLE_ENDIAN
>> +return bswap_32(data);
>> +#else
>> +return data;
>> +#endif
>> +}
>>  
>>  int recurse;
>>  int maxbytes = 128;
>>  int words_per_line = 0;
>> -char *buf;
>> +unsigned char *buf;
>>  
>>  void lsprop(FILE *f, char *name);
>>  void lsdir(char *name);
>> @@ -183,7 +194,7 @@ void lsprop(FILE *f, char *name)
>>  } else if ((n & 3) == 0) {
>>  nw = n >> 2;
>>  if (nw == 1) {
>> -i = *(int *)buf;
>> +i = dt_swap_int(*(int *)buf);
>>  printf(" %.8x", i);
>>  if (i > -0x1 && !(i >= 0 && i <= 9))
>>  printf(" (%d)", i);
>> @@ -201,7 +212,7 @@ void lsprop(FILE *f, char *name)
>>  if (i != 0)
>>  printf("\n\t\t");
>>  for (j = 0; j < npl && i + j < nw; ++j)
>> -printf(" %.8x", ((unsigned int *)buf)[i+j]);
>> +printf(" %.8x", dt_swap_int(((unsigned int *)buf)[i+j]));
>>  }
>>  }
>>  } else {
> 
> 


-- 
-Nathan

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH] export of_reconfig_notifier_[register,unregister]

2012-11-28 Thread Nathan Fontenot
The OF reconfiguration notification chains should be exported for use
by modules.

Signed-off-by: Nathan Fontenot 
---
Index: linux-next/drivers/of/base.c
===
--- linux-next.orig/drivers/of/base.c   2012-11-28 09:18:02.0 -0600
+++ linux-next/drivers/of/base.c2012-11-28 11:05:00.0 -0600
@@ -1282,11 +1282,13 @@
 {
return blocking_notifier_chain_register(&of_reconfig_chain, nb);
 }
+EXPORT_SYMBOL_GPL(of_reconfig_notifier_register);
 
 int of_reconfig_notifier_unregister(struct notifier_block *nb)
 {
return blocking_notifier_chain_unregister(&of_reconfig_chain, nb);
 }
+EXPORT_SYMBOL_GPL(of_reconfig_notifier_unregister);
 
 int of_reconfig_notify(unsigned long action, void *p)
 {

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 5/5] Remove the pSeries_reconfig.h file

2012-10-02 Thread Nathan Fontenot
Remove the pSeries_reconfig.h header file. At this point there is only one
definition in the file, pSeries_coalesce_init(), which can be
moved to rtas.h.

Signed-off-by: Nathan Fontenot 
---
 arch/powerpc/include/asm/pSeries_reconfig.h |   15 ---
 arch/powerpc/include/asm/rtas.h |5 +
 arch/powerpc/kernel/rtas.c  |1 -
 arch/powerpc/platforms/pseries/smp.c|1 -
 4 files changed, 5 insertions(+), 17 deletions(-)

Index: dt-next/arch/powerpc/include/asm/pSeries_reconfig.h
===
--- dt-next.orig/arch/powerpc/include/asm/pSeries_reconfig.h2012-10-02 
09:14:01.0 -0500
+++ /dev/null   1970-01-01 00:00:00.0 +
@@ -1,15 +0,0 @@
-#ifndef _PPC64_PSERIES_RECONFIG_H
-#define _PPC64_PSERIES_RECONFIG_H
-#ifdef __KERNEL__
-
-#ifdef CONFIG_PPC_PSERIES
-/* Not the best place to put this, will be fixed when we move some
- * of the rtas suspend-me stuff to pseries */
-extern void pSeries_coalesce_init(void);
-#else /* !CONFIG_PPC_PSERIES */
-static inline void pSeries_coalesce_init(void) { }
-#endif /* CONFIG_PPC_PSERIES */
-
-
-#endif /* __KERNEL__ */
-#endif /* _PPC64_PSERIES_RECONFIG_H */
Index: dt-next/arch/powerpc/include/asm/rtas.h
===
--- dt-next.orig/arch/powerpc/include/asm/rtas.h2012-10-02 
09:14:01.0 -0500
+++ dt-next/arch/powerpc/include/asm/rtas.h 2012-10-02 09:14:40.0 
-0500
@@ -353,8 +353,13 @@
return 1;
return 0;
 }
+
+/* Not the best place to put pSeries_coalesce_init, will be fixed when we
+ * move some of the rtas suspend-me stuff to pseries */
+extern void pSeries_coalesce_init(void);
 #else
 static inline int page_is_rtas_user_buf(unsigned long pfn) { return 0;}
+static inline void pSeries_coalesce_init(void) { }
 #endif
 
 extern int call_rtas(const char *, int, int, unsigned long *, ...);
Index: dt-next/arch/powerpc/kernel/rtas.c
===
--- dt-next.orig/arch/powerpc/kernel/rtas.c 2012-10-02 09:14:01.0 
-0500
+++ dt-next/arch/powerpc/kernel/rtas.c  2012-10-02 09:14:40.0 -0500
@@ -42,7 +42,6 @@
 #include 
 #include 
 #include 
-#include 
 
 struct rtas_t rtas = {
.lock = __ARCH_SPIN_LOCK_UNLOCKED
Index: dt-next/arch/powerpc/platforms/pseries/smp.c
===
--- dt-next.orig/arch/powerpc/platforms/pseries/smp.c   2012-10-02 
09:14:01.0 -0500
+++ dt-next/arch/powerpc/platforms/pseries/smp.c2012-10-02 
09:14:40.0 -0500
@@ -38,7 +38,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 4/5] Rename the drivers/of prom_* functions to of_*

2012-10-02 Thread Nathan Fontenot
Rename the prom_*_property routines of the generic OF code to of_*_property.
This brings them in line with the naming used by the rest of the OF code.

Signed-off-by: Nathan Fontenot 
---
 arch/powerpc/kernel/machine_kexec.c   |   12 ++--
 arch/powerpc/kernel/machine_kexec_64.c|8 
 arch/powerpc/kernel/pci_32.c  |2 +-
 arch/powerpc/platforms/ps3/os-area.c  |6 +++---
 arch/powerpc/platforms/pseries/iommu.c|4 ++--
 arch/powerpc/platforms/pseries/mobility.c |6 +++---
 arch/powerpc/platforms/pseries/reconfig.c |8 
 drivers/macintosh/smu.c   |2 +-
 drivers/of/base.c |   15 +++
 include/linux/of.h|9 -
 10 files changed, 35 insertions(+), 37 deletions(-)

Index: dt-next/include/linux/of.h
===
--- dt-next.orig/include/linux/of.h 2012-10-02 08:50:22.0 -0500
+++ dt-next/include/linux/of.h  2012-10-02 09:07:23.0 -0500
@@ -263,11 +263,10 @@
 
 extern int of_machine_is_compatible(const char *compat);
 
-extern int prom_add_property(struct device_node* np, struct property* prop);
-extern int prom_remove_property(struct device_node *np, struct property *prop);
-extern int prom_update_property(struct device_node *np,
-   struct property *newprop,
-   struct property *oldprop);
+extern int of_add_property(struct device_node *np, struct property *prop);
+extern int of_remove_property(struct device_node *np, struct property *prop);
+extern int of_update_property(struct device_node *np, struct property *newprop,
+ struct property *oldprop);
 
 #if defined(CONFIG_OF_DYNAMIC)
 /* For updating the device tree at runtime */
Index: dt-next/arch/powerpc/kernel/pci_32.c
===
--- dt-next.orig/arch/powerpc/kernel/pci_32.c   2012-10-02 08:30:22.0 
-0500
+++ dt-next/arch/powerpc/kernel/pci_32.c2012-10-02 09:01:10.0 
-0500
@@ -208,7 +208,7 @@
of_prop->name = "pci-OF-bus-map";
of_prop->length = 256;
of_prop->value = &of_prop[1];
-   prom_add_property(dn, of_prop);
+   of_add_property(dn, of_prop);
of_node_put(dn);
}
 }
Index: dt-next/arch/powerpc/kernel/machine_kexec.c
===
--- dt-next.orig/arch/powerpc/kernel/machine_kexec.c2012-10-02 
08:30:22.0 -0500
+++ dt-next/arch/powerpc/kernel/machine_kexec.c 2012-10-02 09:01:10.0 
-0500
@@ -212,16 +212,16 @@
 * be sure what's in them, so remove them. */
prop = of_find_property(node, "linux,crashkernel-base", NULL);
if (prop)
-   prom_remove_property(node, prop);
+   of_remove_property(node, prop);
 
prop = of_find_property(node, "linux,crashkernel-size", NULL);
if (prop)
-   prom_remove_property(node, prop);
+   of_remove_property(node, prop);
 
if (crashk_res.start != 0) {
-   prom_add_property(node, &crashk_base_prop);
+   of_add_property(node, &crashk_base_prop);
crashk_size = resource_size(&crashk_res);
-   prom_add_property(node, &crashk_size_prop);
+   of_add_property(node, &crashk_size_prop);
}
 }
 
@@ -237,11 +237,11 @@
/* remove any stale properties so ours can be found */
prop = of_find_property(node, kernel_end_prop.name, NULL);
if (prop)
-   prom_remove_property(node, prop);
+   of_remove_property(node, prop);
 
/* information needed by userspace when using default_machine_kexec */
kernel_end = __pa(_end);
-   prom_add_property(node, &kernel_end_prop);
+   of_add_property(node, &kernel_end_prop);
 
export_crashk_values(node);
 
Index: dt-next/arch/powerpc/kernel/machine_kexec_64.c
===
--- dt-next.orig/arch/powerpc/kernel/machine_kexec_64.c 2012-10-02 
08:30:22.0 -0500
+++ dt-next/arch/powerpc/kernel/machine_kexec_64.c  2012-10-02 
09:01:10.0 -0500
@@ -389,14 +389,14 @@
/* remove any stale propertys so ours can be found */
prop = of_find_property(node, htab_base_prop.name, NULL);
if (prop)
-   prom_remove_property(node, prop);
+   of_remove_property(node, prop);
prop = of_find_property(node, htab_size_prop.name, NULL);
if (prop)
-   prom_remove_property(node, prop);
+   of_remove_property(node, prop);
 
htab_base = __pa(htab_address);
-   prom_add_property(node, &htab_base_prop)

[PATCH 3/5] Add of node/property notification chain for adds and removes

2012-10-02 Thread Nathan Fontenot
This patch moves the notification chain for updates to the device tree
from the powerpc/pseries code to the base OF code. This makes this
functionality available to all architectures.

Additionally the notification chain is updated to allow notifications
for property add/remove/update. To make this work a pointer to a new
struct (of_prop_reconfig) is passed to the routines in the notification chain.
The of_prop_reconfig struct contains a pointer to the node containing the
property and a pointer to the property itself. In the case of property
updates, the property pointer refers to the new property.

Signed-off-by: Nathan Fontenot 
---
 arch/powerpc/include/asm/pSeries_reconfig.h |   32 --
 arch/powerpc/kernel/prom.c  |6 -
 arch/powerpc/platforms/pseries/dlpar.c  |   14 ++--
 arch/powerpc/platforms/pseries/hotplug-cpu.c|8 +-
 arch/powerpc/platforms/pseries/hotplug-memory.c |   60 +--
 arch/powerpc/platforms/pseries/iommu.c  |6 -
 arch/powerpc/platforms/pseries/reconfig.c   |   65 -
 arch/powerpc/platforms/pseries/setup.c  |6 -
 drivers/of/base.c   |   74 ++--
 include/linux/of.h  |   20 +-
 10 files changed, 154 insertions(+), 137 deletions(-)

Index: dt-next/arch/powerpc/platforms/pseries/reconfig.c
===
--- dt-next.orig/arch/powerpc/platforms/pseries/reconfig.c  2012-10-02 
08:40:51.0 -0500
+++ dt-next/arch/powerpc/platforms/pseries/reconfig.c   2012-10-02 
08:45:12.0 -0500
@@ -16,11 +16,11 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
 #include 
-#include 
 #include 
 
 /**
@@ -55,28 +55,6 @@
return parent;
 }
 
-static BLOCKING_NOTIFIER_HEAD(pSeries_reconfig_chain);
-
-int pSeries_reconfig_notifier_register(struct notifier_block *nb)
-{
-   return blocking_notifier_chain_register(&pSeries_reconfig_chain, nb);
-}
-EXPORT_SYMBOL_GPL(pSeries_reconfig_notifier_register);
-
-void pSeries_reconfig_notifier_unregister(struct notifier_block *nb)
-{
-   blocking_notifier_chain_unregister(&pSeries_reconfig_chain, nb);
-}
-EXPORT_SYMBOL_GPL(pSeries_reconfig_notifier_unregister);
-
-int pSeries_reconfig_notify(unsigned long action, void *p)
-{
-   int err = blocking_notifier_call_chain(&pSeries_reconfig_chain,
-   action, p);
-
-   return notifier_to_errno(err);
-}
-
 static int pSeries_reconfig_add_node(const char *path, struct property 
*proplist)
 {
struct device_node *np;
@@ -100,13 +78,12 @@
goto out_err;
}
 
-   err = pSeries_reconfig_notify(PSERIES_RECONFIG_ADD, np);
+   err = of_attach_node(np);
if (err) {
printk(KERN_ERR "Failed to add device node %s\n", path);
goto out_err;
}
 
-   of_attach_node(np);
of_node_put(np->parent);
 
return 0;
@@ -134,9 +111,7 @@
return -EBUSY;
}
 
-   pSeries_reconfig_notify(PSERIES_RECONFIG_REMOVE, np);
of_detach_node(np);
-
of_node_put(parent);
of_node_put(np); /* Must decrement the refcount */
return 0;
@@ -381,7 +356,6 @@
 static int do_update_property(char *buf, size_t bufsize)
 {
struct device_node *np;
-   struct pSeries_reconfig_prop_update upd_value;
unsigned char *value;
char *name, *end, *next_prop;
int rc, length;
@@ -410,41 +384,8 @@
return -ENODEV;
}
 
-   upd_value.node = np;
-   upd_value.property = newprop;
-   pSeries_reconfig_notify(PSERIES_UPDATE_PROPERTY, &upd_value);
-
rc = prom_update_property(np, newprop, oldprop);
-   if (rc)
-   return rc;
-
-   /* For memory under the ibm,dynamic-reconfiguration-memory node
-* of the device tree, adding and removing memory is just an update
-* to the ibm,dynamic-memory property instead of adding/removing a
-* memory node in the device tree.  For these cases we still need to
-* involve the notifier chain.
-*/
-   if (!strcmp(name, "ibm,dynamic-memory")) {
-   int action;
-
-   next_prop = parse_next_property(next_prop, end, &name,
-   &length, &value);
-   if (!next_prop)
-   return -EINVAL;
-
-   if (!strcmp(name, "add"))
-   action = PSERIES_DRCONF_MEM_ADD;
-   else
-   action = PSERIES_DRCONF_MEM_REMOVE;
-
-   rc = pSeries_reconfig_notify(action, value);
-   if (rc) {
-   prom_update_property(np, oldprop, newprop);
-   return rc;
-   }
-  

[PATCH 2/5] Move of_drconf_cell struct definition to asm/prom.h

2012-10-02 Thread Nathan Fontenot
This patch moves the definition of the of_drconf_cell struct to asm/prom.h 
to make it available for all powerpc/pseries code.

Signed-off-by: Nathan Fontenot 

---
 arch/powerpc/include/asm/prom.h |   16 
 arch/powerpc/mm/numa.c  |   12 
 2 files changed, 16 insertions(+), 12 deletions(-)

Index: dt-next/arch/powerpc/mm/numa.c
===
--- dt-next.orig/arch/powerpc/mm/numa.c 2012-10-02 08:30:23.0 -0500
+++ dt-next/arch/powerpc/mm/numa.c  2012-10-02 08:41:42.0 -0500
@@ -397,18 +397,6 @@
return result;
 }
 
-struct of_drconf_cell {
-   u64 base_addr;
-   u32 drc_index;
-   u32 reserved;
-   u32 aa_index;
-   u32 flags;
-};
-
-#define DRCONF_MEM_ASSIGNED0x0008
-#define DRCONF_MEM_AI_INVALID  0x0040
-#define DRCONF_MEM_RESERVED0x0080
-
 /*
  * Read the next memblock list entry from the ibm,dynamic-memory property
  * and return the information in the provided of_drconf_cell structure.
Index: dt-next/arch/powerpc/include/asm/prom.h
===
--- dt-next.orig/arch/powerpc/include/asm/prom.h2011-11-17 
09:12:07.0 -0600
+++ dt-next/arch/powerpc/include/asm/prom.h 2012-10-02 08:41:42.0 
-0500
@@ -58,6 +58,22 @@
 
 extern void of_instantiate_rtc(void);
 
+/* The of_drconf_cell struct defines the layout of the LMB array
+ * specified in the device tree property
+ * ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory
+ */
+struct of_drconf_cell {
+   u64 base_addr;
+   u32 drc_index;
+   u32 reserved;
+   u32 aa_index;
+   u32 flags;
+};
+
+#define DRCONF_MEM_ASSIGNED0x0008
+#define DRCONF_MEM_AI_INVALID  0x0040
+#define DRCONF_MEM_RESERVED0x0080
+
 /* These includes are put at the bottom because they may contain things
  * that are overridden by this file.  Ideally they shouldn't be included
  * by this file, but there are a bunch of .c files that currently depend

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 1/5] Add /proc device tree updating to of node add/remove

2012-10-02 Thread Nathan Fontenot
When adding or removing a device tree node we should also update
the device tree in /proc/device-tree. This action is already done in the
generic OF code for adding/removing properties of a node. This patch adds
this functionality for nodes.

Signed-off-by: Nathan Fontenot  
---
 arch/powerpc/platforms/pseries/dlpar.c|   24 -
 arch/powerpc/platforms/pseries/reconfig.c |   47 -
 drivers/of/base.c |   55 +++---
 3 files changed, 51 insertions(+), 75 deletions(-)

Index: dt-next/arch/powerpc/platforms/pseries/dlpar.c
===
--- dt-next.orig/arch/powerpc/platforms/pseries/dlpar.c 2012-10-02 
08:30:23.0 -0500
+++ dt-next/arch/powerpc/platforms/pseries/dlpar.c  2012-10-02 
08:40:51.0 -0500
@@ -13,7 +13,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -255,9 +254,6 @@
 
 int dlpar_attach_node(struct device_node *dn)
 {
-#ifdef CONFIG_PROC_DEVICETREE
-   struct proc_dir_entry *ent;
-#endif
int rc;
 
of_node_set_flag(dn, OF_DYNAMIC);
@@ -274,32 +270,12 @@
}
 
of_attach_node(dn);
-
-#ifdef CONFIG_PROC_DEVICETREE
-   ent = proc_mkdir(strrchr(dn->full_name, '/') + 1, dn->parent->pde);
-   if (ent)
-   proc_device_tree_add_node(dn, ent);
-#endif
-
of_node_put(dn->parent);
return 0;
 }
 
 int dlpar_detach_node(struct device_node *dn)
 {
-#ifdef CONFIG_PROC_DEVICETREE
-   struct device_node *parent = dn->parent;
-   struct property *prop = dn->properties;
-
-   while (prop) {
-   remove_proc_entry(prop->name, dn->pde);
-   prop = prop->next;
-   }
-
-   if (dn->pde)
-   remove_proc_entry(dn->pde->name, parent->pde);
-#endif
-
pSeries_reconfig_notify(PSERIES_RECONFIG_REMOVE, dn);
of_detach_node(dn);
of_node_put(dn); /* Must decrement the refcount */
Index: dt-next/arch/powerpc/platforms/pseries/reconfig.c
===
--- dt-next.orig/arch/powerpc/platforms/pseries/reconfig.c  2012-10-02 
08:30:23.0 -0500
+++ dt-next/arch/powerpc/platforms/pseries/reconfig.c   2012-10-02 
08:40:51.0 -0500
@@ -23,48 +23,6 @@
 #include 
 #include 
 
-
-
-/*
- * Routines for "runtime" addition and removal of device tree nodes.
- */
-#ifdef CONFIG_PROC_DEVICETREE
-/*
- * Add a node to /proc/device-tree.
- */
-static void add_node_proc_entries(struct device_node *np)
-{
-   struct proc_dir_entry *ent;
-
-   ent = proc_mkdir(strrchr(np->full_name, '/') + 1, np->parent->pde);
-   if (ent)
-   proc_device_tree_add_node(np, ent);
-}
-
-static void remove_node_proc_entries(struct device_node *np)
-{
-   struct property *pp = np->properties;
-   struct device_node *parent = np->parent;
-
-   while (pp) {
-   remove_proc_entry(pp->name, np->pde);
-   pp = pp->next;
-   }
-   if (np->pde)
-   remove_proc_entry(np->pde->name, parent->pde);
-}
-#else /* !CONFIG_PROC_DEVICETREE */
-static void add_node_proc_entries(struct device_node *np)
-{
-   return;
-}
-
-static void remove_node_proc_entries(struct device_node *np)
-{
-   return;
-}
-#endif /* CONFIG_PROC_DEVICETREE */
-
 /**
  * derive_parent - basically like dirname(1)
  * @path:  the full_name of a node to be added to the tree
@@ -149,9 +107,6 @@
}
 
of_attach_node(np);
-
-   add_node_proc_entries(np);
-
of_node_put(np->parent);
 
return 0;
@@ -179,8 +134,6 @@
return -EBUSY;
}
 
-   remove_node_proc_entries(np);
-
pSeries_reconfig_notify(PSERIES_RECONFIG_REMOVE, np);
of_detach_node(np);
 
Index: dt-next/drivers/of/base.c
===
--- dt-next.orig/drivers/of/base.c  2012-10-02 08:30:47.0 -0500
+++ dt-next/drivers/of/base.c   2012-10-02 08:40:51.0 -0500
@@ -1103,6 +1103,22 @@
  * device tree nodes.
  */
 
+#ifdef CONFIG_PROC_DEVICETREE
+static void of_add_proc_dt_entry(struct device_node *dn)
+{
+   struct proc_dir_entry *ent;
+
+   ent = proc_mkdir(strrchr(dn->full_name, '/') + 1, dn->parent->pde);
+   if (ent)
+   proc_device_tree_add_node(dn, ent);
+}
+#else
+static void of_add_proc_dt_entry(struct device_node *dn)
+{
+   return;
+}
+#endif
+
 /**
  * of_attach_node - Plug a device node into the tree and global list.
  */
@@ -1116,7 +1132,30 @@
np->parent->child = np;
allnodes = np;
write_unlock_irqrestore(&devtree_lock, flags);
+
+   of_add_proc_dt_entry(np);
+}
+
+#ifdef CONFIG_PROC_DEVICETREE
+static void of_remove

[PATCH 0/5] Move some OF functionality from pseries to generic OF code

2012-10-02 Thread Nathan Fontenot
This set of patches moves some OF code that has been living
in the pseries tree over to the generic OF code base. The
functionality being migrated over is something that, I believe,
should live in the generic code base. The specific functionality
being migrated to generic OF code is:

o Updating the device tree in /proc when adding/removing a node.
o Adding a notification chain for adding/removing nodes and
  properties of the device tree.
o Re-naming the base OF code prom_* routines to of_* to better go
  with the naming used for OF code.

-Nathan 

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH] memory hotplug: Correct page reservation checking

2011-10-04 Thread Nathan Fontenot
On 10/03/2011 07:50 PM, Andrew Morton wrote:
> On Mon, 26 Sep 2011 10:22:33 -0500
> Nathan Fontenot  wrote:
> 
>> The check to ensure that pages of recently added memory sections are 
>> correctly
>> marked as reserved before trying to online the memory is broken.  The request
>> to online the memory fails with the following:
>>
>> kernel: section number XXX page number 256 not reserved, was it already 
>> online?
>>
>> This updates the page reservation checking to check the pages of each memory
>> section of the memory block being onlined individually.
> 
> Why was this only noticed now?  Is there something unusual about the
> way in which you're using it, or has nobody ever used this code, or...?
> 

As far as I know it is only the powerpc/pseries code that uses the feature that
allows memory blocks in sysfs to span multiple memory sections.  We do this
because on pseries memory add/remove is done on a per-LMB basis and we can have
machines where an LMB spans multiple memory sections.

This was just noticed due to a lack of testing between the 2.6.38/39 kernels,
where this feature originally went in, and the current mainline kernel.

-Nathan

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH] memory hotplug: Correct page reservation checking

2011-09-26 Thread Nathan Fontenot
The check to ensure that pages of recently added memory sections are correctly
marked as reserved before trying to online the memory is broken.  The request
to online the memory fails with the following:

kernel: section number XXX page number 256 not reserved, was it already online?

This updates the page reservation checking to check the pages of each memory
section of the memory block being onlined individually.

Signed-off-by: Nathan Fontenot 
---
 drivers/base/memory.c |   60 ++
 1 file changed, 37 insertions(+), 23 deletions(-)

Index: linux/drivers/base/memory.c
===
--- linux.orig/drivers/base/memory.c2011-09-26 08:33:14.0 -0500
+++ linux/drivers/base/memory.c 2011-09-26 08:42:14.0 -0500
@@ -227,41 +227,42 @@
  * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
  * OK to have direct references to sparsemem variables in here.
  */
+static int check_page_reservations(unsigned long phys_index)
+{
+   int i;
+   struct page *page;
+
+   page = pfn_to_page(phys_index << PFN_SECTION_SHIFT);
+
+   for (i = 0; i < PAGES_PER_SECTION; i++) {
+   if (PageReserved(page + i))
+   continue;
+
+   printk(KERN_WARNING "section number %ld page number %d "
+   "not reserved, was it already online?\n", phys_index, 
i);
+   return -EBUSY;
+   }
+
+   return 0;
+}
+
 static int
 memory_block_action(unsigned long phys_index, unsigned long action)
 {
-   int i;
unsigned long start_pfn, start_paddr;
unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
-   struct page *first_page;
+   struct page *page;
int ret;
 
-   first_page = pfn_to_page(phys_index << PFN_SECTION_SHIFT);
-
-   /*
-* The probe routines leave the pages reserved, just
-* as the bootmem code does.  Make sure they're still
-* that way.
-*/
-   if (action == MEM_ONLINE) {
-   for (i = 0; i < nr_pages; i++) {
-   if (PageReserved(first_page+i))
-   continue;
-
-   printk(KERN_WARNING "section number %ld page number %d "
-   "not reserved, was it already online?\n",
-   phys_index, i);
-   return -EBUSY;
-   }
-   }
+   page = pfn_to_page(phys_index << PFN_SECTION_SHIFT);
 
switch (action) {
case MEM_ONLINE:
-   start_pfn = page_to_pfn(first_page);
+   start_pfn = page_to_pfn(page);
ret = online_pages(start_pfn, nr_pages);
break;
case MEM_OFFLINE:
-   start_paddr = page_to_pfn(first_page) << PAGE_SHIFT;
+   start_paddr = page_to_pfn(page) << PAGE_SHIFT;
ret = remove_memory(start_paddr,
nr_pages << PAGE_SHIFT);
break;
@@ -277,7 +278,7 @@
 static int memory_block_change_state(struct memory_block *mem,
unsigned long to_state, unsigned long from_state_req)
 {
-   int ret = 0;
+   int i, ret = 0;
 
mutex_lock(&mem->state_mutex);
 
@@ -289,6 +290,19 @@
if (to_state == MEM_OFFLINE)
mem->state = MEM_GOING_OFFLINE;
 
+   if (to_state == MEM_ONLINE) {
+   /*
+* The probe routines leave the pages reserved, just
+* as the bootmem code does.  Make sure they're still
+* that way.
+*/
+   for (i = 0; i < sections_per_block; i++) {
+   ret = check_page_reservations(mem->start_section_nr + 
i);
+   if (ret)
+   return ret;
+   }
+   }
+
ret = memory_block_action(mem->start_section_nr, to_state);
 
if (ret)
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 0/4] De-couple sysfs memory directories from memory sections

2011-01-20 Thread Nathan Fontenot
On 01/20/2011 10:45 AM, Greg KH wrote:
> On Thu, Jan 20, 2011 at 10:36:40AM -0600, Nathan Fontenot wrote:
>> The root of this issue is in sysfs directory creation. Every time
>> a directory is created a string compare is done against sibling
>> directories ( see sysfs_find_dirent() ) to ensure we do not create 
>> duplicates.  The list of directory nodes in sysfs is kept as an
>> unsorted list which results in this being an exponentially longer
>> operation as the number of directories are created.
> 
> Again, are you sure about this?  I thought we resolved this issue in the
> past, but you were going to check it.  Did you?
> 

Yes, the string compare is still present in the sysfs code.  There was
discussion around this sometime last year when I sent a patch out that
stored the directory entries in something other than a linked list.
That patch was rejected but it was agreed that something should be done.

-Nathan
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 4/4] Define memory_block_size_bytes for x86_64 with CONFIG_X86_UV

2011-01-20 Thread Nathan Fontenot
Define a version of memory_block_size_bytes for x86_64 when CONFIG_X86_UV is
set.

Signed-off-by: Robin Holt 
Signed-off-by: Jack Steiner 
Signed-off-by: Nathan Fontenot 

---
 arch/x86/mm/init_64.c |   14 ++
 1 file changed, 14 insertions(+)

Index: linux-2.6/arch/x86/mm/init_64.c
===
--- linux-2.6.orig/arch/x86/mm/init_64.c2011-01-20 08:18:20.0 
-0600
+++ linux-2.6/arch/x86/mm/init_64.c 2011-01-20 08:21:10.0 -0600
@@ -51,6 +51,7 @@
 #include 
 #include 
 #include 
+#include 
 
 static int __init parse_direct_gbpages_off(char *arg)
 {
@@ -908,6 +909,19 @@ const char *arch_vma_name(struct vm_area
return NULL;
 }
 
+#ifdef CONFIG_X86_UV
+#define MIN_MEMORY_BLOCK_SIZE   (1 << SECTION_SIZE_BITS)
+
+unsigned long memory_block_size_bytes(void)
+{
+   if (is_uv_system()) {
+   printk(KERN_INFO "UV: memory block size 2GB\n");
+   return 2UL * 1024 * 1024 * 1024;
+   }
+   return MIN_MEMORY_BLOCK_SIZE;
+}
+#endif
+
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 /*
  * Initialise the sparsemem vmemmap using huge-pages at the PMD level.


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 3/4]Define memory_block_size_bytes for powerpc/pseries

2011-01-20 Thread Nathan Fontenot
Define a version of memory_block_size_bytes() for powerpc/pseries such that
a memory block spans an entire lmb.

Signed-off-by: Nathan Fontenot 
Reviewed-by: Robin Holt 

---
 arch/powerpc/platforms/pseries/hotplug-memory.c |   66 +++-
 1 file changed, 53 insertions(+), 13 deletions(-)

Index: linux-2.6/arch/powerpc/platforms/pseries/hotplug-memory.c
===
--- linux-2.6.orig/arch/powerpc/platforms/pseries/hotplug-memory.c  
2011-01-20 08:18:21.0 -0600
+++ linux-2.6/arch/powerpc/platforms/pseries/hotplug-memory.c   2011-01-20 
08:21:07.0 -0600
@@ -17,6 +17,54 @@
 #include 
 #include 
 
+static unsigned long get_memblock_size(void)
+{
+   struct device_node *np;
+   unsigned int memblock_size = 0;
+
+   np = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
+   if (np) {
+   const unsigned long *size;
+
+   size = of_get_property(np, "ibm,lmb-size", NULL);
+   memblock_size = size ? *size : 0;
+
+   of_node_put(np);
+   } else {
+   unsigned int memzero_size = 0;
+   const unsigned int *regs;
+
+   np = of_find_node_by_path("/memory@0");
+   if (np) {
+   regs = of_get_property(np, "reg", NULL);
+   memzero_size = regs ? regs[3] : 0;
+   of_node_put(np);
+   }
+
+   if (memzero_size) {
+   /* We now know the size of memory@0, use this to find
+* the first memoryblock and get its size.
+*/
+   char buf[64];
+
+   sprintf(buf, "/memory@%x", memzero_size);
+   np = of_find_node_by_path(buf);
+   if (np) {
+   regs = of_get_property(np, "reg", NULL);
+   memblock_size = regs ? regs[3] : 0;
+   of_node_put(np);
+   }
+   }
+   }
+
+   return memblock_size;
+}
+
+unsigned long memory_block_size_bytes(void)
+{
+   return get_memblock_size();
+}
+
 static int pseries_remove_memblock(unsigned long base, unsigned int 
memblock_size)
 {
unsigned long start, start_pfn;
@@ -127,30 +175,22 @@ static int pseries_add_memory(struct dev
 
 static int pseries_drconf_memory(unsigned long *base, unsigned int action)
 {
-   struct device_node *np;
-   const unsigned long *lmb_size;
+   unsigned long memblock_size;
int rc;
 
-   np = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
-   if (!np)
+   memblock_size = get_memblock_size();
+   if (!memblock_size)
return -EINVAL;
 
-   lmb_size = of_get_property(np, "ibm,lmb-size", NULL);
-   if (!lmb_size) {
-   of_node_put(np);
-   return -EINVAL;
-   }
-
if (action == PSERIES_DRCONF_MEM_ADD) {
-   rc = memblock_add(*base, *lmb_size);
+   rc = memblock_add(*base, memblock_size);
rc = (rc < 0) ? -EINVAL : 0;
} else if (action == PSERIES_DRCONF_MEM_REMOVE) {
-   rc = pseries_remove_memblock(*base, *lmb_size);
+   rc = pseries_remove_memblock(*base, memblock_size);
} else {
rc = -EINVAL;
}
 
-   of_node_put(np);
return rc;
 }
 

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 2/4] Update phys_index to [start|end]_section_nr

2011-01-20 Thread Nathan Fontenot
Update the 'phys_index' property of the memory_block struct to be
called start_section_nr, and add an end_section_nr property.  The
data tracked here is the same but the updated naming is more in line
with what is stored here, namely the first and last section number
that the memory block spans.

The names presented to userspace remain the same, phys_index for
start_section_nr and end_phys_index for end_section_nr, to avoid breaking
anything in userspace.

This also updates the node sysfs code to be aware of the new capability for
a memory block to contain multiple memory sections and be aware of the memory
block structure name changes (start_section_nr).  This requires an additional
parameter to unregister_mem_sect_under_nodes so that we know which memory
section of the memory block to unregister.

Signed-off-by: Nathan Fontenot 
Reviewed-by: Robin Holt 
Reviewed-by: KAMEZAWA Hiroyuki 

---
 drivers/base/memory.c  |   41 +++--
 drivers/base/node.c|   12 
 include/linux/memory.h |3 ++-
 include/linux/node.h   |6 --
 4 files changed, 45 insertions(+), 17 deletions(-)

Index: linux-2.6/drivers/base/memory.c
===
--- linux-2.6.orig/drivers/base/memory.c2011-01-20 08:20:54.0 
-0600
+++ linux-2.6/drivers/base/memory.c 2011-01-20 08:20:56.0 -0600
@@ -97,7 +97,7 @@ int register_memory(struct memory_block
int error;
 
memory->sysdev.cls = &memory_sysdev_class;
-   memory->sysdev.id = memory->phys_index / sections_per_block;
+   memory->sysdev.id = memory->start_section_nr / sections_per_block;
 
error = sysdev_register(&memory->sysdev);
return error;
@@ -138,12 +138,26 @@ static unsigned long get_memory_block_si
  * uses.
  */
 
-static ssize_t show_mem_phys_index(struct sys_device *dev,
+static ssize_t show_mem_start_phys_index(struct sys_device *dev,
struct sysdev_attribute *attr, char *buf)
 {
struct memory_block *mem =
container_of(dev, struct memory_block, sysdev);
-   return sprintf(buf, "%08lx\n", mem->phys_index / sections_per_block);
+   unsigned long phys_index;
+
+   phys_index = mem->start_section_nr / sections_per_block;
+   return sprintf(buf, "%08lx\n", phys_index);
+}
+
+static ssize_t show_mem_end_phys_index(struct sys_device *dev,
+   struct sysdev_attribute *attr, char *buf)
+{
+   struct memory_block *mem =
+   container_of(dev, struct memory_block, sysdev);
+   unsigned long phys_index;
+
+   phys_index = mem->end_section_nr / sections_per_block;
+   return sprintf(buf, "%08lx\n", phys_index);
 }
 
 /*
@@ -158,7 +172,7 @@ static ssize_t show_mem_removable(struct
container_of(dev, struct memory_block, sysdev);
 
for (i = 0; i < sections_per_block; i++) {
-   pfn = section_nr_to_pfn(mem->phys_index + i);
+   pfn = section_nr_to_pfn(mem->start_section_nr + i);
ret &= is_mem_section_removable(pfn, PAGES_PER_SECTION);
}
 
@@ -275,14 +289,15 @@ static int memory_block_change_state(str
mem->state = MEM_GOING_OFFLINE;
 
for (i = 0; i < sections_per_block; i++) {
-   ret = memory_section_action(mem->phys_index + i, to_state);
+   ret = memory_section_action(mem->start_section_nr + i,
+   to_state);
if (ret)
break;
}
 
if (ret) {
for (i = 0; i < sections_per_block; i++)
-   memory_section_action(mem->phys_index + i,
+   memory_section_action(mem->start_section_nr + i,
  from_state_req);
 
mem->state = from_state_req;
@@ -330,7 +345,8 @@ static ssize_t show_phys_device(struct s
return sprintf(buf, "%d\n", mem->phys_device);
 }
 
-static SYSDEV_ATTR(phys_index, 0444, show_mem_phys_index, NULL);
+static SYSDEV_ATTR(phys_index, 0444, show_mem_start_phys_index, NULL);
+static SYSDEV_ATTR(end_phys_index, 0444, show_mem_end_phys_index, NULL);
 static SYSDEV_ATTR(state, 0644, show_mem_state, store_mem_state);
 static SYSDEV_ATTR(phys_device, 0444, show_phys_device, NULL);
 static SYSDEV_ATTR(removable, 0444, show_mem_removable, NULL);
@@ -522,17 +538,21 @@ static int init_memory_block(struct memo
return -ENOMEM;
 
scn_nr = __section_nr(section);
-   mem->phys_index = base_memory_block_id(scn_nr) * sections_per_block;
+   mem->start_section_nr =
+   base_memory_block_id(scn_nr) * sections_per_block;
+   mem->end_section_nr = mem->start_section_nr + sections_per_block - 1;
mem-&

[PATCH 1/4] Allow memory blocks to span multiple memory sections

2011-01-20 Thread Nathan Fontenot
Update the memory sysfs code such that each sysfs memory directory is now
considered a memory block that can span multiple memory sections.
The default size of each memory block is one memory section
(1 << SECTION_SIZE_BITS bytes), which maintains the current behavior of
having a single memory section per memory block (i.e. one sysfs directory
per memory section).

For architectures that want to have memory blocks span multiple
memory sections they need only define their own memory_block_size_bytes()
routine.

Update the memory hotplug documentation to reflect the new behaviors of
memory blocks reflected in sysfs.

Signed-off-by: Nathan Fontenot 
Reviewed-by: Robin Holt 
Reviewed-by: KAMEZAWA Hiroyuki 

---
 Documentation/memory-hotplug.txt |   47 +++
 drivers/base/memory.c|  155 +++
 2 files changed, 139 insertions(+), 63 deletions(-)

Index: linux-2.6/Documentation/memory-hotplug.txt
===
--- linux-2.6.orig/Documentation/memory-hotplug.txt 2011-01-05 
10:08:16.0 -0600
+++ linux-2.6/Documentation/memory-hotplug.txt  2011-01-05 10:17:37.0 
-0600
@@ -126,36 +126,51 @@ config options.
 
 4 sysfs files for memory hotplug
 
-All sections have their device information under /sys/devices/system/memory as
+All sections have their device information in sysfs.  Each section is part of
+a memory block under /sys/devices/system/memory as
 
 /sys/devices/system/memory/memoryXXX
-(XXX is section id.)
+(XXX is the section id.)
 
-Now, XXX is defined as start_address_of_section / section_size.
+Now, XXX is defined as (start_address_of_section / section_size) of the first
+section contained in the memory block.  The files 'phys_index' and
+'end_phys_index' under each directory report the beginning and end section id's
+for the memory block covered by the sysfs directory.  It is expected that all
+memory sections in this range are present and no memory holes exist in the
+range. Currently there is no way to determine if there is a memory hole, but
+the existence of one should not affect the hotplug capabilities of the memory
+block.
 
 For example, assume 1GiB section size. A device for a memory starting at
 0x1 is /sys/device/system/memory/memory4
 (0x1 / 1Gib = 4)
 This device covers address range [0x1 ... 0x14000)
 
-Under each section, you can see 4 files.
+Under each section, you can see 4 or 5 files, the end_phys_index file being
+a recent addition and not present on older kernels.
 
-/sys/devices/system/memory/memoryXXX/phys_index
+/sys/devices/system/memory/memoryXXX/start_phys_index
+/sys/devices/system/memory/memoryXXX/end_phys_index
 /sys/devices/system/memory/memoryXXX/phys_device
 /sys/devices/system/memory/memoryXXX/state
 /sys/devices/system/memory/memoryXXX/removable
 
-'phys_index' : read-only and contains section id, same as XXX.
-'state'  : read-write
-   at read:  contains online/offline state of memory.
-   at write: user can specify "online", "offline" command
-'phys_device': read-only: designed to show the name of physical memory device.
-   This is not well implemented now.
-'removable'  : read-only: contains an integer value indicating
-   whether the memory section is removable or not
-   removable.  A value of 1 indicates that the memory
-   section is removable and a value of 0 indicates that
-   it is not removable.
+'phys_index'  : read-only and contains section id of the first section
+   in the memory block, same as XXX.
+'end_phys_index'  : read-only and contains section id of the last section
+   in the memory block.
+'state'   : read-write
+at read:  contains online/offline state of memory.
+at write: user can specify "online", "offline" command
+which will be performed on all sections in the block.
+'phys_device' : read-only: designed to show the name of physical memory
+device.  This is not well implemented now.
+'removable'   : read-only: contains an integer value indicating
+whether the memory block is removable or not
+removable.  A value of 1 indicates that the memory
+block is removable and a value of 0 indicates that
+it is not removable. A memory block is removable only if
+every section in the block is removable.
 
 NOTE:
   These directories/files appear after physical memory hotplug phase.
Index: linux-2.6/drivers/base/memory.c
===
--- l

[PATCH 0/4] De-couple sysfs memory directories from memory sections

2011-01-20 Thread Nathan Fontenot
This is a re-send of the remaining patches that did not make it
into the last kernel release for de-coupling sysfs memory
directories from memory sections.  The first three patches of the
previous set went in, and these are the remaining patches that
need to be applied.

The patches decouple the concept that a single memory section corresponds
to a single directory in /sys/devices/system/memory/.  On systems
with large amounts of memory (1+ TB) there are performance issues
related to creating the large number of sysfs directories.  For
a powerpc machine with 1 TB of memory we are creating 63,000+
directories.  This is resulting in boot times of around 45-50
minutes for systems with 1 TB of memory and 8+ hours for systems
with 2 TB of memory.  With this patch set applied I am now seeing
boot times of 5 minutes or less.

The root of this issue is in sysfs directory creation. Every time
a directory is created a string compare is done against sibling
directories ( see sysfs_find_dirent() ) to ensure we do not create
duplicates.  The list of directory nodes in sysfs is kept as an
unsorted list, so each creation takes longer as the number of
existing directories grows.

The solution provided by this patch set is to allow a single
directory in sysfs to span multiple memory sections.  This is
controlled by an optional architecturally defined function
memory_block_size_bytes().  The default definition of this
routine returns a memory block size equal to the memory section
size. This keeps the layout of sysfs memory directories, as it
appears to userspace, the same as it is today.

For architectures that define their own version of this routine,
as is done for powerpc and x86 in this patchset, the view in userspace
would change such that each memoryXXX directory would span
multiple memory sections.  The number of sections spanned would
depend on the value reported by memory_block_size_bytes.

-Nathan Fontenot
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 0/4] De-couple sysfs memory directories from memory sections

2011-01-10 Thread Nathan Fontenot
On 01/10/2011 12:44 PM, Greg KH wrote:
> On Mon, Jan 10, 2011 at 12:08:56PM -0600, Nathan Fontenot wrote:
>> This is a re-send of the remaining patches that did not make it
>> into the last kernel release for de-coupling sysfs memory
>> directories from memory sections.  The first three patches of the
>> previous set went in, and this is the remaining patches that
>> need to be applied.
> 
> Well, it's a bit late right now, as we are merging stuff that is already
> in our trees, and we are busy with that, so this is likely to be ignored
> until after .38-rc1 is out.
> 
> So, care to resend this after .38-rc1 is out so people can pay attention
> to it?

I was afraid of this. I didn't get a chance to get it out sooner but thought
I would send it out anyway.

> 
> 
>> The root of this issue is in sysfs directory creation. Every time
>> a directory is created a string compare is done against all sibling
>> directories to ensure we do not create duplicates.  The list of
>> directory nodes in sysfs is kept as an unsorted list which results
>> in this being an exponentially longer operation as the number of
>> directories are created.
> 
> Are you sure this is still an issue?  I thought we solved this last
> kernel or so with a simple patch?

I'll go back and look at this again.

thanks,
-Nathan
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 4/4] Define memory_block_size_bytes for x86_64 with CONFIG_X86_UV defined

2011-01-10 Thread Nathan Fontenot
Define a version of memory_block_size_bytes for x86_64 when CONFIG_X86_UV is
set.

Signed-off-by: Robin Holt 
Signed-off-by: Jack Steiner 
Signed-off-by: Nathan Fontenot 

---
 arch/x86/mm/init_64.c |   14 ++
 1 file changed, 14 insertions(+)

Index: linux-2.6/arch/x86/mm/init_64.c
===
--- linux-2.6.orig/arch/x86/mm/init_64.c2011-01-05 10:08:13.0 
-0600
+++ linux-2.6/arch/x86/mm/init_64.c 2011-01-05 10:17:51.0 -0600
@@ -51,6 +51,7 @@
 #include 
 #include 
 #include 
+#include 
 
 static int __init parse_direct_gbpages_off(char *arg)
 {
@@ -908,6 +909,19 @@ const char *arch_vma_name(struct vm_area
return NULL;
 }
 
+#ifdef CONFIG_X86_UV
+#define MIN_MEMORY_BLOCK_SIZE   (1 << SECTION_SIZE_BITS)
+
+unsigned long memory_block_size_bytes(void)
+{
+   if (is_uv_system()) {
+   printk(KERN_INFO "UV: memory block size 2GB\n");
+   return 2UL * 1024 * 1024 * 1024;
+   }
+   return MIN_MEMORY_BLOCK_SIZE;
+}
+#endif
+
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 /*
  * Initialise the sparsemem vmemmap using huge-pages at the PMD level.

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 3/4] Define memory_block_size_bytes for powerpc/pseries

2011-01-10 Thread Nathan Fontenot
Define a version of memory_block_size_bytes() for powerpc/pseries such that
a memory block spans an entire lmb.

Signed-off-by: Nathan Fontenot 
Reviewed-by: Robin Holt 

---
 arch/powerpc/platforms/pseries/hotplug-memory.c |   66 +++-
 1 file changed, 53 insertions(+), 13 deletions(-)

Index: linux-2.6/arch/powerpc/platforms/pseries/hotplug-memory.c
===
--- linux-2.6.orig/arch/powerpc/platforms/pseries/hotplug-memory.c  
2011-01-05 10:08:14.0 -0600
+++ linux-2.6/arch/powerpc/platforms/pseries/hotplug-memory.c   2011-01-05 
10:17:49.0 -0600
@@ -17,6 +17,54 @@
 #include 
 #include 
 
+static unsigned long get_memblock_size(void)
+{
+   struct device_node *np;
+   unsigned int memblock_size = 0;
+
+   np = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
+   if (np) {
+   const unsigned long *size;
+
+   size = of_get_property(np, "ibm,lmb-size", NULL);
+   memblock_size = size ? *size : 0;
+
+   of_node_put(np);
+   } else {
+   unsigned int memzero_size = 0;
+   const unsigned int *regs;
+
+   np = of_find_node_by_path("/memory@0");
+   if (np) {
+   regs = of_get_property(np, "reg", NULL);
+   memzero_size = regs ? regs[3] : 0;
+   of_node_put(np);
+   }
+
+   if (memzero_size) {
+   /* We now know the size of memory@0, use this to find
+* the first memoryblock and get its size.
+*/
+   char buf[64];
+
+   sprintf(buf, "/memory@%x", memzero_size);
+   np = of_find_node_by_path(buf);
+   if (np) {
+   regs = of_get_property(np, "reg", NULL);
+   memblock_size = regs ? regs[3] : 0;
+   of_node_put(np);
+   }
+   }
+   }
+
+   return memblock_size;
+}
+
+unsigned long memory_block_size_bytes(void)
+{
+   return get_memblock_size();
+}
+
 static int pseries_remove_memblock(unsigned long base, unsigned int 
memblock_size)
 {
unsigned long start, start_pfn;
@@ -127,30 +175,22 @@ static int pseries_add_memory(struct dev
 
 static int pseries_drconf_memory(unsigned long *base, unsigned int action)
 {
-   struct device_node *np;
-   const unsigned long *lmb_size;
+   unsigned long memblock_size;
int rc;
 
-   np = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
-   if (!np)
+   memblock_size = get_memblock_size();
+   if (!memblock_size)
return -EINVAL;
 
-   lmb_size = of_get_property(np, "ibm,lmb-size", NULL);
-   if (!lmb_size) {
-   of_node_put(np);
-   return -EINVAL;
-   }
-
if (action == PSERIES_DRCONF_MEM_ADD) {
-   rc = memblock_add(*base, *lmb_size);
+   rc = memblock_add(*base, memblock_size);
rc = (rc < 0) ? -EINVAL : 0;
} else if (action == PSERIES_DRCONF_MEM_REMOVE) {
-   rc = pseries_remove_memblock(*base, *lmb_size);
+   rc = pseries_remove_memblock(*base, memblock_size);
} else {
rc = -EINVAL;
}
 
-   of_node_put(np);
return rc;
 }
 

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 2/4] Update phys_index to [start|end]_section_nr

2011-01-10 Thread Nathan Fontenot
Update the 'phys_index' property of the memory_block struct to be
called start_section_nr, and add a end_section_nr property.  The
data tracked here is the same but the updated naming is more in line
with what is stored here, namely the first and last section number
that the memory block spans.

The names presented to userspace remain the same, phys_index for
start_section_nr and end_phys_index for end_section_nr, to avoid breaking
anything in userspace.

This also updates the node sysfs code to be aware of the new capability for
a memory block to contain multiple memory sections and be aware of the memory
block structure name changes (start_section_nr).  This requires an additional
parameter to unregister_mem_sect_under_nodes so that we know which memory
section of the memory block to unregister.

Signed-off-by: Nathan Fontenot 
Reviewed-by: Robin Holt 
Reviewed-by: KAMEZAWA Hiroyuki 

---
 drivers/base/memory.c  |   41 +++--
 drivers/base/node.c|   12 
 include/linux/memory.h |3 ++-
 include/linux/node.h   |6 --
 4 files changed, 45 insertions(+), 17 deletions(-)

Index: linux-2.6/drivers/base/memory.c
===
--- linux-2.6.orig/drivers/base/memory.c2011-01-05 10:17:37.0 
-0600
+++ linux-2.6/drivers/base/memory.c 2011-01-05 10:17:46.0 -0600
@@ -97,7 +97,7 @@ int register_memory(struct memory_block
int error;
 
memory->sysdev.cls = &memory_sysdev_class;
-   memory->sysdev.id = memory->phys_index / sections_per_block;
+   memory->sysdev.id = memory->start_section_nr / sections_per_block;
 
error = sysdev_register(&memory->sysdev);
return error;
@@ -138,12 +138,26 @@ static unsigned long get_memory_block_si
  * uses.
  */
 
-static ssize_t show_mem_phys_index(struct sys_device *dev,
+static ssize_t show_mem_start_phys_index(struct sys_device *dev,
struct sysdev_attribute *attr, char *buf)
 {
struct memory_block *mem =
container_of(dev, struct memory_block, sysdev);
-   return sprintf(buf, "%08lx\n", mem->phys_index / sections_per_block);
+   unsigned long phys_index;
+
+   phys_index = mem->start_section_nr / sections_per_block;
+   return sprintf(buf, "%08lx\n", phys_index);
+}
+
+static ssize_t show_mem_end_phys_index(struct sys_device *dev,
+   struct sysdev_attribute *attr, char *buf)
+{
+   struct memory_block *mem =
+   container_of(dev, struct memory_block, sysdev);
+   unsigned long phys_index;
+
+   phys_index = mem->end_section_nr / sections_per_block;
+   return sprintf(buf, "%08lx\n", phys_index);
 }
 
 /*
@@ -158,7 +172,7 @@ static ssize_t show_mem_removable(struct
container_of(dev, struct memory_block, sysdev);
 
for (i = 0; i < sections_per_block; i++) {
-   pfn = section_nr_to_pfn(mem->phys_index + i);
+   pfn = section_nr_to_pfn(mem->start_section_nr + i);
ret &= is_mem_section_removable(pfn, PAGES_PER_SECTION);
}
 
@@ -275,14 +289,15 @@ static int memory_block_change_state(str
mem->state = MEM_GOING_OFFLINE;
 
for (i = 0; i < sections_per_block; i++) {
-   ret = memory_section_action(mem->phys_index + i, to_state);
+   ret = memory_section_action(mem->start_section_nr + i,
+   to_state);
if (ret)
break;
}
 
if (ret) {
for (i = 0; i < sections_per_block; i++)
-   memory_section_action(mem->phys_index + i,
+   memory_section_action(mem->start_section_nr + i,
  from_state_req);
 
mem->state = from_state_req;
@@ -330,7 +345,8 @@ static ssize_t show_phys_device(struct s
return sprintf(buf, "%d\n", mem->phys_device);
 }
 
-static SYSDEV_ATTR(phys_index, 0444, show_mem_phys_index, NULL);
+static SYSDEV_ATTR(phys_index, 0444, show_mem_start_phys_index, NULL);
+static SYSDEV_ATTR(end_phys_index, 0444, show_mem_end_phys_index, NULL);
 static SYSDEV_ATTR(state, 0644, show_mem_state, store_mem_state);
 static SYSDEV_ATTR(phys_device, 0444, show_phys_device, NULL);
 static SYSDEV_ATTR(removable, 0444, show_mem_removable, NULL);
@@ -522,17 +538,21 @@ static int init_memory_block(struct memo
return -ENOMEM;
 
scn_nr = __section_nr(section);
-   mem->phys_index = base_memory_block_id(scn_nr) * sections_per_block;
+   mem->start_section_nr =
+   base_memory_block_id(scn_nr) * sections_per_block;
+   mem->end_section_nr = mem->start_section_nr + sections_per_block - 1;
mem-&

[PATCH 1/4] allow memory blocks to span multiple memory sections

2011-01-10 Thread Nathan Fontenot
Update the memory sysfs code such that each sysfs memory directory is now
considered a memory block that can span multiple memory sections per
memory block.  The default size of each memory block is SECTION_SIZE_BITS
to maintain the current behavior of having a single memory section per
memory block (i.e. one sysfs directory per memory section).

For architectures that want to have memory blocks span multiple
memory sections they need only define their own memory_block_size_bytes()
routine.

Update the memory hotplug documentation to reflect the new behaviors of
memory blocks reflected in sysfs.

Signed-off-by: Nathan Fontenot 
Reviewed-by: Robin Holt 
Reviewed-by: KAMEZAWA Hiroyuki 

---
 Documentation/memory-hotplug.txt |   47 +++
 drivers/base/memory.c|  155 +++
 2 files changed, 139 insertions(+), 63 deletions(-)

Index: linux-2.6/Documentation/memory-hotplug.txt
===
--- linux-2.6.orig/Documentation/memory-hotplug.txt 2011-01-05 
10:08:16.0 -0600
+++ linux-2.6/Documentation/memory-hotplug.txt  2011-01-05 10:17:37.0 
-0600
@@ -126,36 +126,51 @@ config options.
 
 4 sysfs files for memory hotplug
 
-All sections have their device information under /sys/devices/system/memory as
+All sections have their device information in sysfs.  Each section is part of
+a memory block under /sys/devices/system/memory as
 
 /sys/devices/system/memory/memoryXXX
-(XXX is section id.)
+(XXX is the section id.)
 
-Now, XXX is defined as start_address_of_section / section_size.
+Now, XXX is defined as (start_address_of_section / section_size) of the first
+section contained in the memory block.  The files 'phys_index' and
+'end_phys_index' under each directory report the beginning and end section id's
+for the memory block covered by the sysfs directory.  It is expected that all
+memory sections in this range are present and no memory holes exist in the
+range. Currently there is no way to determine if there is a memory hole, but
+the existence of one should not affect the hotplug capabilities of the memory
+block.
 
 For example, assume 1GiB section size. A device for a memory starting at
 0x100000000 is /sys/device/system/memory/memory4
 (0x100000000 / 1Gib = 4)
 This device covers address range [0x100000000 ... 0x140000000)
 
-Under each section, you can see 4 files.
+Under each section, you can see 4 or 5 files, the end_phys_index file being
+a recent addition and not present on older kernels.
 
-/sys/devices/system/memory/memoryXXX/phys_index
+/sys/devices/system/memory/memoryXXX/start_phys_index
+/sys/devices/system/memory/memoryXXX/end_phys_index
 /sys/devices/system/memory/memoryXXX/phys_device
 /sys/devices/system/memory/memoryXXX/state
 /sys/devices/system/memory/memoryXXX/removable
 
-'phys_index' : read-only and contains section id, same as XXX.
-'state'  : read-write
-   at read:  contains online/offline state of memory.
-   at write: user can specify "online", "offline" command
-'phys_device': read-only: designed to show the name of physical memory device.
-   This is not well implemented now.
-'removable'  : read-only: contains an integer value indicating
-   whether the memory section is removable or not
-   removable.  A value of 1 indicates that the memory
-   section is removable and a value of 0 indicates that
-   it is not removable.
+'phys_index'  : read-only and contains section id of the first section
+   in the memory block, same as XXX.
+'end_phys_index'  : read-only and contains section id of the last section
+   in the memory block.
+'state'   : read-write
+at read:  contains online/offline state of memory.
+at write: user can specify "online", "offline" command
+which will be performed on all sections in the block.
+'phys_device' : read-only: designed to show the name of physical memory
+device.  This is not well implemented now.
+'removable'   : read-only: contains an integer value indicating
+whether the memory block is removable or not
+removable.  A value of 1 indicates that the memory
+block is removable and a value of 0 indicates that
+it is not removable. A memory block is removable only if
+every section in the block is removable.
 
 NOTE:
   These directories/files appear after physical memory hotplug phase.
Index: linux-2.6/drivers/base/memory.c
===
--- l

[PATCH 0/4] De-couple sysfs memory directories from memory sections

2011-01-10 Thread Nathan Fontenot
This is a re-send of the remaining patches that did not make it
into the last kernel release for de-coupling sysfs memory
directories from memory sections.  The first three patches of the
previous set went in, and this is the remaining patches that
need to be applied.

The patches decouple the concept that a single memory
section corresponds to a single directory in 
/sys/devices/system/memory/.  On systems
with large amounts of memory (1+ TB) there are performance issues
related to creating the large number of sysfs directories.  For
a powerpc machine with 1 TB of memory we are creating 63,000+
directories.  This is resulting in boot times of around 45-50
minutes for systems with 1 TB of memory and 8 hours for systems
with 2 TB of memory.  With this patch set applied I am now seeing
boot times of 5 minutes or less.

The root of this issue is in sysfs directory creation. Every time
a directory is created a string compare is done against all sibling
directories to ensure we do not create duplicates.  The list of
directory nodes in sysfs is kept as an unsorted list which results
in this becoming a progressively longer operation as the number of
directories grows.

The solution provided by this patch set is to allow a single
directory in sysfs to span multiple memory sections.  This is
controlled by an optional architecturally defined function
memory_block_size_bytes().  The default definition of this
routine returns a memory block size equal to the memory section
size. This keeps the current layout of sysfs memory
directories, as it appears to userspace, the same as it
is today.

For architectures that define their own version of this routine,
as is done for powerpc and x86 in this patchset, the view in userspace
would change such that each memoryXXX directory would span
multiple memory sections.  The number of sections spanned would
depend on the value reported by memory_block_size_bytes.

-Nathan Fontenot
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 7/9] v3 Define memory_block_size_bytes for powerpc/pseries

2010-10-04 Thread Nathan Fontenot
On 10/03/2010 01:27 PM, Balbir Singh wrote:
> * Dave Hansen  [2010-10-03 11:11:01]:
> 
>> On Sun, 2010-10-03 at 13:07 -0500, Robin Holt wrote:
>>> On Sun, Oct 03, 2010 at 11:25:00PM +0530, Balbir Singh wrote:
>>>> * Nathan Fontenot  [2010-10-01 13:35:54]:
>>>>
>>>>> Define a version of memory_block_size_bytes() for powerpc/pseries such 
>>>>> that
>>>>> a memory block spans an entire lmb.
>>>>
>>>> I hope I am not missing anything obvious, but why not just call it
>>>> lmb_size, why do we need memblock_size?
>>>>
>>>> Is lmb_size == memblock_size after your changes true for all
>>>> platforms?
>>>
>>> What is an lmb?  I don't recall anything like lmb being referred to in
>>> the rest of the kernel.
>>
>> Heh.  It's the OpenFirmware name for a Logical Memory Block.  Basically
>> what we use to determine the SECTION_SIZE on powerpc.  Probably not the
>> best terminology to use elsewhere in the kernel.
> 
> Agreed for the kernel, this patch was for powerpc/pseries, hence was
> checking in this context.
> 

I don't really see a reason to name it lmb_size, it seems easier
to stick with the naming used by the rest of the kernel.

-Nathan
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 4/9] v3 Allow memory blocks to span multiple memory sections

2010-10-01 Thread Nathan Fontenot
Update the memory sysfs code such that each sysfs memory directory is now
considered a memory block that can span multiple memory sections per
memory block.  The default size of each memory block is SECTION_SIZE_BITS
to maintain the current behavior of having a single memory section per
memory block (i.e. one sysfs directory per memory section).

For architectures that want to have memory blocks span multiple
memory sections they need only define their own memory_block_size_bytes()
routine.

Signed-off-by: Nathan Fontenot 

Updated patch to correct get_memory_block_size() variable block_sz to be
an unsigned long.

---
 drivers/base/memory.c |  155 ++
 1 file changed, 108 insertions(+), 47 deletions(-)

Index: linux-next/drivers/base/memory.c
===
--- linux-next.orig/drivers/base/memory.c   2010-09-30 14:13:50.0 
-0500
+++ linux-next/drivers/base/memory.c2010-10-01 13:50:19.0 -0500
@@ -30,6 +30,14 @@
 static DEFINE_MUTEX(mem_sysfs_mutex);
 
 #define MEMORY_CLASS_NAME  "memory"
+#define MIN_MEMORY_BLOCK_SIZE  (1 << SECTION_SIZE_BITS)
+
+static int sections_per_block;
+
+static inline int base_memory_block_id(int section_nr)
+{
+   return section_nr / sections_per_block;
+}
 
 static struct sysdev_class memory_sysdev_class = {
.name = MEMORY_CLASS_NAME,
@@ -84,28 +92,47 @@
  * register_memory - Setup a sysfs device for a memory block
  */
 static
-int register_memory(struct memory_block *memory, struct mem_section *section)
+int register_memory(struct memory_block *memory)
 {
int error;
 
memory->sysdev.cls = &memory_sysdev_class;
-   memory->sysdev.id = __section_nr(section);
+   memory->sysdev.id = memory->phys_index / sections_per_block;
 
error = sysdev_register(&memory->sysdev);
return error;
 }
 
 static void
-unregister_memory(struct memory_block *memory, struct mem_section *section)
+unregister_memory(struct memory_block *memory)
 {
BUG_ON(memory->sysdev.cls != &memory_sysdev_class);
-   BUG_ON(memory->sysdev.id != __section_nr(section));
 
/* drop the ref. we got in remove_memory_block() */
kobject_put(&memory->sysdev.kobj);
sysdev_unregister(&memory->sysdev);
 }
 
+unsigned long __weak memory_block_size_bytes(void)
+{
+   return MIN_MEMORY_BLOCK_SIZE;
+}
+
+static unsigned long get_memory_block_size(void)
+{
+   unsigned long block_sz;
+
+   block_sz = memory_block_size_bytes();
+
+   /* Validate blk_sz is a power of 2 and not less than section size */
+   if ((block_sz & (block_sz - 1)) || (block_sz < MIN_MEMORY_BLOCK_SIZE)) {
+   WARN_ON(1);
+   block_sz = MIN_MEMORY_BLOCK_SIZE;
+   }
+
+   return block_sz;
+}
+
 /*
  * use this as the physical section index that this memsection
  * uses.
@@ -116,7 +143,7 @@
 {
struct memory_block *mem =
container_of(dev, struct memory_block, sysdev);
-   return sprintf(buf, "%08lx\n", mem->phys_index);
+   return sprintf(buf, "%08lx\n", mem->phys_index / sections_per_block);
 }
 
 /*
@@ -125,13 +152,16 @@
 static ssize_t show_mem_removable(struct sys_device *dev,
struct sysdev_attribute *attr, char *buf)
 {
-   unsigned long start_pfn;
-   int ret;
+   unsigned long i, pfn;
+   int ret = 1;
struct memory_block *mem =
container_of(dev, struct memory_block, sysdev);
 
-   start_pfn = section_nr_to_pfn(mem->phys_index);
-   ret = is_mem_section_removable(start_pfn, PAGES_PER_SECTION);
+   for (i = 0; i < sections_per_block; i++) {
+   pfn = section_nr_to_pfn(mem->phys_index + i);
+   ret &= is_mem_section_removable(pfn, PAGES_PER_SECTION);
+   }
+
return sprintf(buf, "%d\n", ret);
 }
 
@@ -184,17 +214,14 @@
  * OK to have direct references to sparsemem variables in here.
  */
 static int
-memory_block_action(struct memory_block *mem, unsigned long action)
+memory_section_action(unsigned long phys_index, unsigned long action)
 {
int i;
-   unsigned long psection;
unsigned long start_pfn, start_paddr;
struct page *first_page;
int ret;
-   int old_state = mem->state;
 
-   psection = mem->phys_index;
-   first_page = pfn_to_page(psection << PFN_SECTION_SHIFT);
+   first_page = pfn_to_page(phys_index << PFN_SECTION_SHIFT);
 
/*
 * The probe routines leave the pages reserved, just
@@ -207,8 +234,8 @@
continue;
 
printk(KERN_WARNING "section number %ld page number %d "
-   "not reserved, was it already online? \n",
-   

Re: [PATCH 4/9] v3 Allow memory blocks to span multiple memory sections

2010-10-01 Thread Nathan Fontenot
On 10/01/2010 01:52 PM, Robin Holt wrote:
> On Fri, Oct 01, 2010 at 01:31:51PM -0500, Nathan Fontenot wrote:
>> Update the memory sysfs code such that each sysfs memory directory is now
>> considered a memory block that can span multiple memory sections per
>> memory block.  The default size of each memory block is SECTION_SIZE_BITS
>> to maintain the current behavior of having a single memory section per
>> memory block (i.e. one sysfs directory per memory section).
>>
>> For architectures that want to have memory blocks span multiple
>> memory sections they need only define their own memory_block_size_bytes()
>> routine.
>>
>> Signed-off-by: Nathan Fontenot 
>>
>> ---
>>  drivers/base/memory.c |  155 
>> ++
>>  1 file changed, 108 insertions(+), 47 deletions(-)
>>
>> Index: linux-next/drivers/base/memory.c
>> ===
>> --- linux-next.orig/drivers/base/memory.c2010-09-30 14:13:50.0 
>> -0500
>> +++ linux-next/drivers/base/memory.c 2010-09-30 14:46:00.0 -0500
> ...
>> +static unsigned long get_memory_block_size(void)
>> +{
>> +u32 block_sz;
> ^^^
> 
> I think this should be unsigned long.  u32 will work, but everything
> else has been changed to use unsigned long.  If you disagree, I will
> happily acquiesce as nothing is currently broken.  If SGI decides to make
> memory_block_size_bytes more dynamic, we will fix this up at that time.

You're right, that should have been made an unsigned long also.  I'll attach a 
new
patch with that corrected.

-Nathan
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 9/9] v3 Update memory hotplug documentation

2010-10-01 Thread Nathan Fontenot
Update the memory hotplug documentation to reflect the new behaviors of
memory blocks reflected in sysfs.

Signed-off-by: Nathan Fontenot 

---
 Documentation/memory-hotplug.txt |   47 +--
 1 file changed, 31 insertions(+), 16 deletions(-)

Index: linux-next/Documentation/memory-hotplug.txt
===
--- linux-next.orig/Documentation/memory-hotplug.txt2010-09-29 
14:56:24.0 -0500
+++ linux-next/Documentation/memory-hotplug.txt 2010-09-30 14:59:47.0 
-0500
@@ -126,36 +126,51 @@
 
 4 sysfs files for memory hotplug
 
-All sections have their device information under /sys/devices/system/memory as
+All sections have their device information in sysfs.  Each section is part of
+a memory block under /sys/devices/system/memory as
 
 /sys/devices/system/memory/memoryXXX
-(XXX is section id.)
+(XXX is the section id.)
 
-Now, XXX is defined as start_address_of_section / section_size.
+Now, XXX is defined as (start_address_of_section / section_size) of the first
+section contained in the memory block.  The files 'phys_index' and
+'end_phys_index' under each directory report the beginning and end section id's
+for the memory block covered by the sysfs directory.  It is expected that all
+memory sections in this range are present and no memory holes exist in the
+range. Currently there is no way to determine if there is a memory hole, but
+the existence of one should not affect the hotplug capabilities of the memory
+block.
 
 For example, assume 1GiB section size. A device for a memory starting at
 0x100000000 is /sys/device/system/memory/memory4
 (0x100000000 / 1Gib = 4)
 This device covers address range [0x100000000 ... 0x140000000)
 
-Under each section, you can see 4 files.
+Under each section, you can see 4 or 5 files, the end_phys_index file being
+a recent addition and not present on older kernels.
 
-/sys/devices/system/memory/memoryXXX/phys_index
+/sys/devices/system/memory/memoryXXX/start_phys_index
+/sys/devices/system/memory/memoryXXX/end_phys_index
 /sys/devices/system/memory/memoryXXX/phys_device
 /sys/devices/system/memory/memoryXXX/state
 /sys/devices/system/memory/memoryXXX/removable
 
-'phys_index' : read-only and contains section id, same as XXX.
-'state'  : read-write
-   at read:  contains online/offline state of memory.
-   at write: user can specify "online", "offline" command
-'phys_device': read-only: designed to show the name of physical memory device.
-   This is not well implemented now.
-'removable'  : read-only: contains an integer value indicating
-   whether the memory section is removable or not
-   removable.  A value of 1 indicates that the memory
-   section is removable and a value of 0 indicates that
-   it is not removable.
+'phys_index'  : read-only and contains section id of the first section
+   in the memory block, same as XXX.
+'end_phys_index'  : read-only and contains section id of the last section
+   in the memory block.
+'state'   : read-write
+at read:  contains online/offline state of memory.
+at write: user can specify "online", "offline" command
+which will be performed on all sections in the block.
+'phys_device' : read-only: designed to show the name of physical memory
+device.  This is not well implemented now.
+'removable'   : read-only: contains an integer value indicating
+whether the memory block is removable or not
+removable.  A value of 1 indicates that the memory
+block is removable and a value of 0 indicates that
+it is not removable. A memory block is removable only if
+every section in the block is removable.
 
 NOTE:
   These directories/files appear after physical memory hotplug phase.


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 8/9] v3 Define memory_block_size_bytes for x86_64 with CONFIG_X86_UV set

2010-10-01 Thread Nathan Fontenot
Define a version of memory_block_size_bytes for x86_64 when CONFIG_X86_UV is
set.

Signed-off-by: Robin Holt 
Signed-off-by: Jack Steiner 

---
 arch/x86/mm/init_64.c |   14 ++
 1 file changed, 14 insertions(+)

Index: linux-next/arch/x86/mm/init_64.c
===
--- linux-next.orig/arch/x86/mm/init_64.c   2010-09-29 14:56:25.0 
-0500
+++ linux-next/arch/x86/mm/init_64.c2010-10-01 13:00:50.0 -0500
@@ -51,6 +51,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 static int __init parse_direct_gbpages_off(char *arg)
@@ -902,6 +903,19 @@
return NULL;
 }
 
+#ifdef CONFIG_X86_UV
+#define MIN_MEMORY_BLOCK_SIZE   (1 << SECTION_SIZE_BITS)
+
+unsigned long memory_block_size_bytes(void)
+{
+   if (is_uv_system()) {
+   printk(KERN_INFO "UV: memory block size 2GB\n");
+   return 2UL * 1024 * 1024 * 1024;
+   }
+   return MIN_MEMORY_BLOCK_SIZE;
+}
+#endif
+
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 /*
  * Initialise the sparsemem vmemmap using huge-pages at the PMD level.


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 7/9] v3 Define memory_block_size_bytes for powerpc/pseries

2010-10-01 Thread Nathan Fontenot
Define a version of memory_block_size_bytes() for powerpc/pseries such that
a memory block spans an entire lmb.

Signed-off-by: Nathan Fontenot 

---
 arch/powerpc/platforms/pseries/hotplug-memory.c |   66 +++-
 1 file changed, 53 insertions(+), 13 deletions(-)

Index: linux-next/arch/powerpc/platforms/pseries/hotplug-memory.c
===
--- linux-next.orig/arch/powerpc/platforms/pseries/hotplug-memory.c 
2010-09-30 14:44:37.0 -0500
+++ linux-next/arch/powerpc/platforms/pseries/hotplug-memory.c  2010-09-30 
14:47:04.0 -0500
@@ -17,6 +17,54 @@
 #include 
 #include 
 
+static unsigned long get_memblock_size(void)
+{
+   struct device_node *np;
+   unsigned int memblock_size = 0;
+
+   np = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
+   if (np) {
+   const unsigned long *size;
+
+   size = of_get_property(np, "ibm,lmb-size", NULL);
+   memblock_size = size ? *size : 0;
+
+   of_node_put(np);
+   } else {
+   unsigned int memzero_size = 0;
+   const unsigned int *regs;
+
+   np = of_find_node_by_path("/mem...@0");
+   if (np) {
+   regs = of_get_property(np, "reg", NULL);
+   memzero_size = regs ? regs[3] : 0;
+   of_node_put(np);
+   }
+
+   if (memzero_size) {
+   /* We now know the size of mem...@0, use this to find
+* the first memoryblock and get its size.
+*/
+   char buf[64];
+
+   sprintf(buf, "/mem...@%x", memzero_size);
+   np = of_find_node_by_path(buf);
+   if (np) {
+   regs = of_get_property(np, "reg", NULL);
+   memblock_size = regs ? regs[3] : 0;
+   of_node_put(np);
+   }
+   }
+   }
+
+   return memblock_size;
+}
+
+unsigned long memory_block_size_bytes(void)
+{
+   return get_memblock_size();
+}
+
 static int pseries_remove_memblock(unsigned long base, unsigned int 
memblock_size)
 {
unsigned long start, start_pfn;
@@ -127,30 +175,22 @@
 
 static int pseries_drconf_memory(unsigned long *base, unsigned int action)
 {
-   struct device_node *np;
-   const unsigned long *lmb_size;
+   unsigned long memblock_size;
int rc;
 
-   np = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
-   if (!np)
+   memblock_size = get_memblock_size();
+   if (!memblock_size)
return -EINVAL;
 
-   lmb_size = of_get_property(np, "ibm,lmb-size", NULL);
-   if (!lmb_size) {
-   of_node_put(np);
-   return -EINVAL;
-   }
-
if (action == PSERIES_DRCONF_MEM_ADD) {
-   rc = memblock_add(*base, *lmb_size);
+   rc = memblock_add(*base, memblock_size);
rc = (rc < 0) ? -EINVAL : 0;
} else if (action == PSERIES_DRCONF_MEM_REMOVE) {
-   rc = pseries_remove_memblock(*base, *lmb_size);
+   rc = pseries_remove_memblock(*base, memblock_size);
} else {
rc = -EINVAL;
}
 
-   of_node_put(np);
return rc;
 }
 

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 6/9] v3 Update node sysfs code

2010-10-01 Thread Nathan Fontenot
Update the node sysfs code to be aware of the new capability for a memory
block to contain multiple memory sections and be aware of the memory block
structure name changes (start_section_nr).  This requires an additional
parameter to unregister_mem_sect_under_nodes so that we know which memory
section of the memory block to unregister.

Signed-off-by: Nathan Fontenot 

---
 drivers/base/memory.c |2 +-
 drivers/base/node.c   |   12 
 include/linux/node.h  |6 --
 3 files changed, 13 insertions(+), 7 deletions(-)

Index: linux-next/drivers/base/node.c
===
--- linux-next.orig/drivers/base/node.c 2010-09-30 14:44:38.0 -0500
+++ linux-next/drivers/base/node.c  2010-09-30 14:46:12.0 -0500
@@ -346,8 +346,10 @@
return -EFAULT;
if (!node_online(nid))
return 0;
-   sect_start_pfn = section_nr_to_pfn(mem_blk->phys_index);
-   sect_end_pfn = sect_start_pfn + PAGES_PER_SECTION - 1;
+
+   sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr);
+   sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr);
+   sect_end_pfn += PAGES_PER_SECTION - 1;
for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) {
int page_nid;
 
@@ -371,7 +373,8 @@
 }
 
 /* unregister memory section under all nodes that it spans */
-int unregister_mem_sect_under_nodes(struct memory_block *mem_blk)
+int unregister_mem_sect_under_nodes(struct memory_block *mem_blk,
+   unsigned long phys_index)
 {
NODEMASK_ALLOC(nodemask_t, unlinked_nodes, GFP_KERNEL);
unsigned long pfn, sect_start_pfn, sect_end_pfn;
@@ -383,7 +386,8 @@
if (!unlinked_nodes)
return -ENOMEM;
nodes_clear(*unlinked_nodes);
-   sect_start_pfn = section_nr_to_pfn(mem_blk->phys_index);
+
+   sect_start_pfn = section_nr_to_pfn(phys_index);
sect_end_pfn = sect_start_pfn + PAGES_PER_SECTION - 1;
for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) {
int nid;
Index: linux-next/drivers/base/memory.c
===
--- linux-next.orig/drivers/base/memory.c   2010-09-30 14:46:09.0 
-0500
+++ linux-next/drivers/base/memory.c2010-09-30 14:46:12.0 -0500
@@ -587,10 +587,10 @@
 
mutex_lock(&mem_sysfs_mutex);
mem = find_memory_block(section);
+   unregister_mem_sect_under_nodes(mem, __section_nr(section));
 
mem->section_count--;
if (mem->section_count == 0) {
-   unregister_mem_sect_under_nodes(mem);
mem_remove_simple_file(mem, phys_index);
mem_remove_simple_file(mem, end_phys_index);
mem_remove_simple_file(mem, state);
Index: linux-next/include/linux/node.h
===
--- linux-next.orig/include/linux/node.h2010-09-30 14:44:38.0 
-0500
+++ linux-next/include/linux/node.h 2010-09-30 14:46:12.0 -0500
@@ -44,7 +44,8 @@
 extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid);
 extern int register_mem_sect_under_node(struct memory_block *mem_blk,
int nid);
-extern int unregister_mem_sect_under_nodes(struct memory_block *mem_blk);
+extern int unregister_mem_sect_under_nodes(struct memory_block *mem_blk,
+  unsigned long phys_index);
 
 #ifdef CONFIG_HUGETLBFS
 extern void register_hugetlbfs_with_node(node_registration_func_t doregister,
@@ -72,7 +73,8 @@
 {
return 0;
 }
-static inline int unregister_mem_sect_under_nodes(struct memory_block *mem_blk)
+static inline int unregister_mem_sect_under_nodes(struct memory_block *mem_blk,
+ unsigned long phys_index)
 {
return 0;
 }


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 5/9] v3 rename phys_index properties of memory block struct

2010-10-01 Thread Nathan Fontenot
Update the 'phys_index' property of the memory_block struct to be
called start_section_nr, and add a end_section_nr property.  The
data tracked here is the same but the updated naming is more in line
with what is stored here, namely the first and last section number
that the memory block spans.

The names presented to userspace remain the same, phys_index for
start_section_nr and end_phys_index for end_section_nr, to avoid breaking
anything in userspace.

Signed-off-by: Nathan Fontenot 

---
 drivers/base/memory.c  |   39 ++-
 include/linux/memory.h |3 ++-
 2 files changed, 32 insertions(+), 10 deletions(-)

Index: linux-next/drivers/base/memory.c
===
--- linux-next.orig/drivers/base/memory.c   2010-09-30 14:46:00.0 
-0500
+++ linux-next/drivers/base/memory.c2010-09-30 14:46:09.0 -0500
@@ -97,7 +97,7 @@
int error;
 
memory->sysdev.cls = &memory_sysdev_class;
-   memory->sysdev.id = memory->phys_index / sections_per_block;
+   memory->sysdev.id = memory->start_section_nr / sections_per_block;
 
error = sysdev_register(&memory->sysdev);
return error;
@@ -138,12 +138,26 @@
  * uses.
  */
 
-static ssize_t show_mem_phys_index(struct sys_device *dev,
+static ssize_t show_mem_start_phys_index(struct sys_device *dev,
struct sysdev_attribute *attr, char *buf)
 {
struct memory_block *mem =
container_of(dev, struct memory_block, sysdev);
-   return sprintf(buf, "%08lx\n", mem->phys_index / sections_per_block);
+   unsigned long phys_index;
+
+   phys_index = mem->start_section_nr / sections_per_block;
+   return sprintf(buf, "%08lx\n", phys_index);
+}
+
+static ssize_t show_mem_end_phys_index(struct sys_device *dev,
+   struct sysdev_attribute *attr, char *buf)
+{
+   struct memory_block *mem =
+   container_of(dev, struct memory_block, sysdev);
+   unsigned long phys_index;
+
+   phys_index = mem->end_section_nr / sections_per_block;
+   return sprintf(buf, "%08lx\n", phys_index);
 }
 
 /*
@@ -158,7 +172,7 @@
container_of(dev, struct memory_block, sysdev);
 
for (i = 0; i < sections_per_block; i++) {
-   pfn = section_nr_to_pfn(mem->phys_index + i);
+   pfn = section_nr_to_pfn(mem->start_section_nr + i);
ret &= is_mem_section_removable(pfn, PAGES_PER_SECTION);
}
 
@@ -275,14 +289,15 @@
mem->state = MEM_GOING_OFFLINE;
 
for (i = 0; i < sections_per_block; i++) {
-   ret = memory_section_action(mem->phys_index + i, to_state);
+   ret = memory_section_action(mem->start_section_nr + i,
+   to_state);
if (ret)
break;
}
 
if (ret) {
for (i = 0; i < sections_per_block; i++)
-   memory_section_action(mem->phys_index + i,
+   memory_section_action(mem->start_section_nr + i,
  from_state_req);
 
mem->state = from_state_req;
@@ -330,7 +345,8 @@
return sprintf(buf, "%d\n", mem->phys_device);
 }
 
-static SYSDEV_ATTR(phys_index, 0444, show_mem_phys_index, NULL);
+static SYSDEV_ATTR(phys_index, 0444, show_mem_start_phys_index, NULL);
+static SYSDEV_ATTR(end_phys_index, 0444, show_mem_end_phys_index, NULL);
 static SYSDEV_ATTR(state, 0644, show_mem_state, store_mem_state);
 static SYSDEV_ATTR(phys_device, 0444, show_phys_device, NULL);
 static SYSDEV_ATTR(removable, 0444, show_mem_removable, NULL);
@@ -514,17 +530,21 @@
return -ENOMEM;
 
scn_nr = __section_nr(section);
-   mem->phys_index = base_memory_block_id(scn_nr) * sections_per_block;
+   mem->start_section_nr =
+   base_memory_block_id(scn_nr) * sections_per_block;
+   mem->end_section_nr = mem->start_section_nr + sections_per_block - 1;
mem->state = state;
mem->section_count++;
mutex_init(&mem->state_mutex);
-   start_pfn = section_nr_to_pfn(mem->phys_index);
+   start_pfn = section_nr_to_pfn(mem->start_section_nr);
mem->phys_device = arch_get_memory_phys_device(start_pfn);
 
ret = register_memory(mem);
if (!ret)
ret = mem_create_simple_file(mem, phys_index);
if (!ret)
+   ret = mem_create_simple_file(mem, end_phys_index);
+   if (!ret)
ret = mem_create_simple_file(mem, state);
if (!ret)
ret = mem_create_simple_file(mem, phys_device);
@@ -572,6 +592,7 @@
if (mem->section_count == 0) {
   

[PATCH 4/9] v3 Allow memory blocks to span multiple memory sections

2010-10-01 Thread Nathan Fontenot
Update the memory sysfs code such that each sysfs memory directory is now
considered a memory block that can span multiple memory sections per
memory block.  The default size of each memory block is SECTION_SIZE_BITS
to maintain the current behavior of having a single memory section per
memory block (i.e. one sysfs directory per memory section).

For architectures that want to have memory blocks span multiple
memory sections they need only define their own memory_block_size_bytes()
routine.

Signed-off-by: Nathan Fontenot 

---
 drivers/base/memory.c |  155 ++
 1 file changed, 108 insertions(+), 47 deletions(-)

Index: linux-next/drivers/base/memory.c
===
--- linux-next.orig/drivers/base/memory.c   2010-09-30 14:13:50.0 
-0500
+++ linux-next/drivers/base/memory.c2010-09-30 14:46:00.0 -0500
@@ -30,6 +30,14 @@
 static DEFINE_MUTEX(mem_sysfs_mutex);
 
 #define MEMORY_CLASS_NAME  "memory"
+#define MIN_MEMORY_BLOCK_SIZE  (1 << SECTION_SIZE_BITS)
+
+static int sections_per_block;
+
+static inline int base_memory_block_id(int section_nr)
+{
+   return section_nr / sections_per_block;
+}
 
 static struct sysdev_class memory_sysdev_class = {
.name = MEMORY_CLASS_NAME,
@@ -84,28 +92,47 @@
  * register_memory - Setup a sysfs device for a memory block
  */
 static
-int register_memory(struct memory_block *memory, struct mem_section *section)
+int register_memory(struct memory_block *memory)
 {
int error;
 
memory->sysdev.cls = &memory_sysdev_class;
-   memory->sysdev.id = __section_nr(section);
+   memory->sysdev.id = memory->phys_index / sections_per_block;
 
error = sysdev_register(&memory->sysdev);
return error;
 }
 
 static void
-unregister_memory(struct memory_block *memory, struct mem_section *section)
+unregister_memory(struct memory_block *memory)
 {
BUG_ON(memory->sysdev.cls != &memory_sysdev_class);
-   BUG_ON(memory->sysdev.id != __section_nr(section));
 
/* drop the ref. we got in remove_memory_block() */
kobject_put(&memory->sysdev.kobj);
sysdev_unregister(&memory->sysdev);
 }
 
+unsigned long __weak memory_block_size_bytes(void)
+{
+   return MIN_MEMORY_BLOCK_SIZE;
+}
+
+static unsigned long get_memory_block_size(void)
+{
+   u32 block_sz;
+
+   block_sz = memory_block_size_bytes();
+
+   /* Validate blk_sz is a power of 2 and not less than section size */
+   if ((block_sz & (block_sz - 1)) || (block_sz < MIN_MEMORY_BLOCK_SIZE)) {
+   WARN_ON(1);
+   block_sz = MIN_MEMORY_BLOCK_SIZE;
+   }
+
+   return block_sz;
+}
+
 /*
  * use this as the physical section index that this memsection
  * uses.
@@ -116,7 +143,7 @@
 {
struct memory_block *mem =
container_of(dev, struct memory_block, sysdev);
-   return sprintf(buf, "%08lx\n", mem->phys_index);
+   return sprintf(buf, "%08lx\n", mem->phys_index / sections_per_block);
 }
 
 /*
@@ -125,13 +152,16 @@
 static ssize_t show_mem_removable(struct sys_device *dev,
struct sysdev_attribute *attr, char *buf)
 {
-   unsigned long start_pfn;
-   int ret;
+   unsigned long i, pfn;
+   int ret = 1;
struct memory_block *mem =
container_of(dev, struct memory_block, sysdev);
 
-   start_pfn = section_nr_to_pfn(mem->phys_index);
-   ret = is_mem_section_removable(start_pfn, PAGES_PER_SECTION);
+   for (i = 0; i < sections_per_block; i++) {
+   pfn = section_nr_to_pfn(mem->phys_index + i);
+   ret &= is_mem_section_removable(pfn, PAGES_PER_SECTION);
+   }
+
return sprintf(buf, "%d\n", ret);
 }
 
@@ -184,17 +214,14 @@
  * OK to have direct references to sparsemem variables in here.
  */
 static int
-memory_block_action(struct memory_block *mem, unsigned long action)
+memory_section_action(unsigned long phys_index, unsigned long action)
 {
int i;
-   unsigned long psection;
unsigned long start_pfn, start_paddr;
struct page *first_page;
int ret;
-   int old_state = mem->state;
 
-   psection = mem->phys_index;
-   first_page = pfn_to_page(psection << PFN_SECTION_SHIFT);
+   first_page = pfn_to_page(phys_index << PFN_SECTION_SHIFT);
 
/*
 * The probe routines leave the pages reserved, just
@@ -207,8 +234,8 @@
continue;
 
printk(KERN_WARNING "section number %ld page number %d "
-   "not reserved, was it already online? \n",
-   psection, i);
+   "not reserved, was it already online?\n"

[PATCH 3/9] v3 Add section count to memory_block struct

2010-10-01 Thread Nathan Fontenot
Add a section count property to the memory_block struct to track the number
of memory sections that have been added/removed from a memory block. This
allows us to know when the last memory section of a memory block has been
removed so we can remove the memory block.

Signed-off-by: Nathan Fontenot 

---
 drivers/base/memory.c  |   17 +++--
 include/linux/memory.h |2 ++
 2 files changed, 13 insertions(+), 6 deletions(-)

Index: linux-next/drivers/base/memory.c
===
--- linux-next.orig/drivers/base/memory.c   2010-09-30 14:12:41.0 
-0500
+++ linux-next/drivers/base/memory.c2010-09-30 14:13:50.0 -0500
@@ -482,6 +482,7 @@
 
mem->phys_index = __section_nr(section);
mem->state = state;
+   mem->section_count++;
mutex_init(&mem->state_mutex);
start_pfn = section_nr_to_pfn(mem->phys_index);
mem->phys_device = arch_get_memory_phys_device(start_pfn);
@@ -511,12 +512,16 @@
 
mutex_lock(&mem_sysfs_mutex);
mem = find_memory_block(section);
-   unregister_mem_sect_under_nodes(mem);
-   mem_remove_simple_file(mem, phys_index);
-   mem_remove_simple_file(mem, state);
-   mem_remove_simple_file(mem, phys_device);
-   mem_remove_simple_file(mem, removable);
-   unregister_memory(mem, section);
+
+   mem->section_count--;
+   if (mem->section_count == 0) {
+   unregister_mem_sect_under_nodes(mem);
+   mem_remove_simple_file(mem, phys_index);
+   mem_remove_simple_file(mem, state);
+   mem_remove_simple_file(mem, phys_device);
+   mem_remove_simple_file(mem, removable);
+   unregister_memory(mem, section);
+   }
 
mutex_unlock(&mem_sysfs_mutex);
return 0;
Index: linux-next/include/linux/memory.h
===
--- linux-next.orig/include/linux/memory.h  2010-09-29 14:56:29.0 
-0500
+++ linux-next/include/linux/memory.h   2010-09-30 14:13:50.0 -0500
@@ -23,6 +23,8 @@
 struct memory_block {
unsigned long phys_index;
unsigned long state;
+   int section_count;
+
/*
 * This serializes all state change requests.  It isn't
 * held during creation because the control files are
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 2/9] v3 Add mutex for adding/removing memory blocks

2010-10-01 Thread Nathan Fontenot
Add a new mutex for use in adding and removing of memory blocks.  This
is needed to avoid any race conditions in which the same memory block could
be added and removed at the same time.

Signed-off-by: Nathan Fontenot 

---
 drivers/base/memory.c |7 +++
 1 file changed, 7 insertions(+)

Index: linux-next/drivers/base/memory.c
===
--- linux-next.orig/drivers/base/memory.c   2010-09-30 14:09:36.0 
-0500
+++ linux-next/drivers/base/memory.c2010-09-30 14:12:41.0 -0500
@@ -27,6 +27,8 @@
 #include 
 #include 
 
+static DEFINE_MUTEX(mem_sysfs_mutex);
+
 #define MEMORY_CLASS_NAME  "memory"
 
 static struct sysdev_class memory_sysdev_class = {
@@ -476,6 +478,8 @@
if (!mem)
return -ENOMEM;
 
+   mutex_lock(&mem_sysfs_mutex);
+
mem->phys_index = __section_nr(section);
mem->state = state;
mutex_init(&mem->state_mutex);
@@ -496,6 +500,7 @@
ret = register_mem_sect_under_node(mem, nid);
}
 
+   mutex_unlock(&mem_sysfs_mutex);
return ret;
 }
 
@@ -504,6 +509,7 @@
 {
struct memory_block *mem;
 
+   mutex_lock(&mem_sysfs_mutex);
mem = find_memory_block(section);
unregister_mem_sect_under_nodes(mem);
mem_remove_simple_file(mem, phys_index);
@@ -512,6 +518,7 @@
mem_remove_simple_file(mem, removable);
unregister_memory(mem, section);
 
+   mutex_unlock(&mem_sysfs_mutex);
return 0;
 }
 

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 1/9] v3 Move find_memory_block routine

2010-10-01 Thread Nathan Fontenot
Move the find_memory_block() routine up to avoid needing a forward
declaration in subsequent patches.

Signed-off-by: Nathan Fontenot 

---
 drivers/base/memory.c |   62 +-
 1 file changed, 31 insertions(+), 31 deletions(-)

Index: linux-next/drivers/base/memory.c
===
--- linux-next.orig/drivers/base/memory.c   2010-09-29 14:56:26.0 
-0500
+++ linux-next/drivers/base/memory.c2010-09-30 14:09:36.0 -0500
@@ -435,6 +435,37 @@
return 0;
 }
 
+/*
+ * For now, we have a linear search to go find the appropriate
+ * memory_block corresponding to a particular phys_index. If
+ * this gets to be a real problem, we can always use a radix
+ * tree or something here.
+ *
+ * This could be made generic for all sysdev classes.
+ */
+struct memory_block *find_memory_block(struct mem_section *section)
+{
+   struct kobject *kobj;
+   struct sys_device *sysdev;
+   struct memory_block *mem;
+   char name[sizeof(MEMORY_CLASS_NAME) + 9 + 1];
+
+   /*
+* This only works because we know that section == sysdev->id
+* slightly redundant with sysdev_register()
+*/
+   sprintf(&name[0], "%s%d", MEMORY_CLASS_NAME, __section_nr(section));
+
+   kobj = kset_find_obj(&memory_sysdev_class.kset, name);
+   if (!kobj)
+   return NULL;
+
+   sysdev = container_of(kobj, struct sys_device, kobj);
+   mem = container_of(sysdev, struct memory_block, sysdev);
+
+   return mem;
+}
+
 static int add_memory_block(int nid, struct mem_section *section,
unsigned long state, enum mem_add_context context)
 {
@@ -468,37 +499,6 @@
return ret;
 }
 
-/*
- * For now, we have a linear search to go find the appropriate
- * memory_block corresponding to a particular phys_index. If
- * this gets to be a real problem, we can always use a radix
- * tree or something here.
- *
- * This could be made generic for all sysdev classes.
- */
-struct memory_block *find_memory_block(struct mem_section *section)
-{
-   struct kobject *kobj;
-   struct sys_device *sysdev;
-   struct memory_block *mem;
-   char name[sizeof(MEMORY_CLASS_NAME) + 9 + 1];
-
-   /*
-* This only works because we know that section == sysdev->id
-* slightly redundant with sysdev_register()
-*/
-   sprintf(&name[0], "%s%d", MEMORY_CLASS_NAME, __section_nr(section));
-
-   kobj = kset_find_obj(&memory_sysdev_class.kset, name);
-   if (!kobj)
-   return NULL;
-
-   sysdev = container_of(kobj, struct sys_device, kobj);
-   mem = container_of(sysdev, struct memory_block, sysdev);
-
-   return mem;
-}
-
 int remove_memory_block(unsigned long node_id, struct mem_section *section,
int phys_device)
 {

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 0/9] v3 De-couple sysfs memory directories from memory sections

2010-10-01 Thread Nathan Fontenot
This set of patches decouples the concept that a single memory
section corresponds to a single directory in 
/sys/devices/system/memory/.  On systems
with large amounts of memory (1+ TB) there are performance issues
related to creating the large number of sysfs directories.  For
a powerpc machine with 1 TB of memory we are creating 63,000+
directories.  This is resulting in boot times of around 45-50
minutes for systems with 1 TB of memory and 8 hours for systems
with 2 TB of memory.  With this patch set applied I am now seeing
boot times of 5 minutes or less.

The root of this issue is in sysfs directory creation. Every time
a directory is created a string compare is done against all sibling
directories to ensure we do not create duplicates.  The list of
directory nodes in sysfs is kept as an unsorted list which results
in this being an exponentially longer operation as the number of
directories are created.

The solution provided by this patch set is to allow a single
directory in sysfs to span multiple memory sections.  This is
controlled by an optional architecturally defined function
memory_block_size_bytes().  The default definition of this
routine returns a memory block size equal to the memory section
size. This maintains the current layout of sysfs memory
directories as it appears to userspace to remain the same as it
is today.

For architectures that define their own version of this routine,
as is done for powerpc and x86_64 in this patchset, the view in userspace
would change such that each memoryXXX directory would span
multiple memory sections.  The number of sections spanned would
depend on the value reported by memory_block_size_bytes.

In both cases a new file 'end_phys_index' is created in each
memoryXXX directory.  This file will contain the physical id
of the last memory section covered by the sysfs directory.  For
the default case, the value in 'end_phys_index' will be the same
as in the existing 'phys_index' file.

Updates for this version of the patch:

- Patches 2 and 3 have been swapped which has alleviated the need for the
  section count in the memory_block struct to be an atomic.

- The get_memory_block_size and memory_block_size_bytes routines now return
  an unsigned long instead of a u32.  This affects patches 4, 7, and 8.

- [Patch 5/9] The phys_index member of the memory block struct is changed to
  start_section_nr and the new end_phys_index is now named end_section_nr.

- [Patch 8/9] A new patch added to the set to define a version of
  memory_block_size_bytes() for x86_64 when CONFIG_X86_UV is set.

- [Patch 9/9] Correct the updates to hotplug documentation to indicate that
  4 or 5 files may be seen for each memory directory in sysfs.

-Nathan Fontenot
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 0/8] v2 De-Couple sysfs memory directories from memory sections

2010-09-30 Thread Nathan Fontenot
On 09/29/2010 02:28 PM, Robin Holt wrote:
> On Tue, Sep 28, 2010 at 01:17:33PM -0500, Nathan Fontenot wrote:
>> On 09/28/2010 07:38 AM, Robin Holt wrote:
>>> I was tasked with looking at a slowdown in similar sized SGI machines
>>> booting x86_64.  Jack Steiner had already looked into the memory_dev_init.
>>> I was looking at link_mem_sections().
>>>
>>> I made a dramatic improvement on a 16TB machine in that function by
>>> merely caching the most recent memory section and checking to see if
>>> the next memory section happens to be the subsequent in the linked list
>>> of kobjects.
>>>
>>> That simple cache reduced the time for link_mem_sections from 1 hour 27
>>> minutes down to 46 seconds.
>>
>> Nice!
>>
>>>
>>> I would like to propose we implement something along those lines also,
>>> but I am currently swamped.  I can probably get you a patch tomorrow
>>> afternoon that applies at the end of this set.
>>
>> Should this be done as a separate patch?  This patch set concentrates on
>> updates to the memory code with the node updates only being done due to the
>> memory changes.
>>
>> I think it's a good idea to do the caching and have no problem adding on to
>> this patchset if no one else has any objections.
> 
> I am sorry.  I had meant to include you on the Cc: list.  I just posted a
> set of patches (3 small patches) which implement the cache most recent bit
> I alluded to above.  Search for a subject of "Speed up link_mem_sections
> during boot" and you will find them.  I did add you to the Cc: list for
> the next time I end up sending the set.
> 
> My next task is to implement a x86_64 SGI UV specific chunk of code
> to memory_block_size_bytes().  Would you consider adding that to your
> patch set?  I expect to have that either later today or early tomorrow.
> 

No problem. I'm putting together a new patch set with updates from all of
the comments now so go ahead and send it to me when you have it ready.

-Nathan
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 4/8] v2 Allow memory block to span multiple memory sections

2010-09-28 Thread Nathan Fontenot
On 09/28/2010 07:48 AM, Robin Holt wrote:
>> +u32 __weak memory_block_size_bytes(void)
>> +{
>> +return MIN_MEMORY_BLOCK_SIZE;
>> +}
>> +
>> +static u32 get_memory_block_size(void)
> 
> Can we make this an unsigned long?  We are testing on a system whose
> smallest possible configuration is 4GB per socket with 512 sockets.
> We would like to be able to specify this as 2GB by default (results
> in the least lost memory) and suggest we add a command line option
> which overrides this value.  We have many installations where 16GB may
> be optimal.  Large configurations will certainly become more prevalent.

Works for me.

> 
> ...
>> @@ -551,12 +608,16 @@
>>  unsigned int i;
>>  int ret;
>>  int err;
>> +int block_sz;
> 
> This one needs to match the return above.  In our tests, we ended up
> with a negative sections_per_block which caused very unexpected results.

Oh, nice catch.  I'll update both of these.

-Nathan

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 8/8] v2 Update memory hotplug documentation

2010-09-28 Thread Nathan Fontenot
On 09/28/2010 07:45 AM, Avi Kivity wrote:
>  On 09/27/2010 09:28 PM, Nathan Fontenot wrote:
>>
>>   For example, assume 1GiB section size. A device for a memory
>> starting at
>>   0x100000000 is /sys/device/system/memory/memory4
>>   (0x100000000 / 1Gib = 4)
>>   This device covers address range [0x100000000 ... 0x140000000)
>>
>> -Under each section, you can see 4 files.
>> +Under each section, you can see 5 files.
> 
> Shouldn't this be, 4 or 5 files depending on kernel version?
> 

Correct,  I'll update this.  Thanks.

-Nathan
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 0/8] v2 De-Couple sysfs memory directories from memory sections

2010-09-28 Thread Nathan Fontenot
On 09/28/2010 07:38 AM, Robin Holt wrote:
> I was tasked with looking at a slowdown in similar sized SGI machines
> booting x86_64.  Jack Steiner had already looked into the memory_dev_init.
> I was looking at link_mem_sections().
> 
> I made a dramatic improvement on a 16TB machine in that function by
> merely caching the most recent memory section and checking to see if
> the next memory section happens to be the subsequent in the linked list
> of kobjects.
> 
> That simple cache reduced the time for link_mem_sections from 1 hour 27
> minutes down to 46 seconds.

Nice!

> 
> I would like to propose we implement something along those lines also,
> but I am currently swamped.  I can probably get you a patch tomorrow
> afternoon that applies at the end of this set.

Should this be done as a separate patch?  This patch set concentrates on
updates to the memory code with the node updates only being done due to the
memory changes.

I think it's a good idea to do the caching and have no problem adding on to
this patchset if no one else has any objections.

-Nathan

> 
> Thanks,
> Robin
> 
> On Mon, Sep 27, 2010 at 02:09:31PM -0500, Nathan Fontenot wrote:
>> This set of patches decouples the concept that a single memory
>> section corresponds to a single directory in 
>> /sys/devices/system/memory/.  On systems
>> with large amounts of memory (1+ TB) there are performance issues
>> related to creating the large number of sysfs directories.  For
>> a powerpc machine with 1 TB of memory we are creating 63,000+
>> directories.  This is resulting in boot times of around 45-50
>> minutes for systems with 1 TB of memory and 8 hours for systems
>> with 2 TB of memory.  With this patch set applied I am now seeing
>> boot times of 5 minutes or less.
>>
>> The root of this issue is in sysfs directory creation. Every time
>> a directory is created a string compare is done against all sibling
>> directories to ensure we do not create duplicates.  The list of
>> directory nodes in sysfs is kept as an unsorted list which results
>> in this being an exponentially longer operation as the number of
>> directories are created.
>>
>> The solution solved by this patch set is to allow a single
>> directory in sysfs to span multiple memory sections.  This is
>> controlled by an optional architecturally defined function
>> memory_block_size_bytes().  The default definition of this
>> routine returns a memory block size equal to the memory section
>> size. This maintains the current layout of sysfs memory
>> directories as it appears to userspace to remain the same as it
>> is today.
>>
>> For architectures that define their own version of this routine,
>> as is done for powerpc in this patchset, the view in userspace
>> would change such that each memoryXXX directory would span
>> multiple memory sections.  The number of sections spanned would
>> depend on the value reported by memory_block_size_bytes.
>>
>> In both cases a new file 'end_phys_index' is created in each
>> memoryXXX directory.  This file will contain the physical id
>> of the last memory section covered by the sysfs directory.  For
>> the default case, the value in 'end_phys_index' will be the same
>> as in the existing 'phys_index' file.
>>
>> This version of the patch set includes an update to properly
>> report block_size_bytes, phys_index, and end_phys_index.  Additionally,
>> the patch that adds the end_phys_index sysfs file is now patch 5/8
>> instead of being patch 2/8 as in the previous version of the patches.
>>
>> -Nathan Fontenot
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>> the body of a message to majord...@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>> Please read the FAQ at  http://www.tux.org/lkml/

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 2/8] v2 Add section count to memory_block struct

2010-09-28 Thread Nathan Fontenot
On 09/28/2010 04:31 AM, Robin Holt wrote:
> In the next patch, you introduce a mutex for adding/removing memory blocks.
> Is there really a need for this to be atomic?  If you reorder the patches
> so the mutex comes first, would the atomic be needed any longer?
> 

I think you're right.  Looking at the code with all patches applied I am only
updating the atomic when holding the mem_sysfs_mutex.  I think the atomic
could safely be changed to a regular int.

-Nathan

> Robin
> 
> On Mon, Sep 27, 2010 at 02:22:24PM -0500, Nathan Fontenot wrote:
>> Add a section count property to the memory_block struct to track the number
>> of memory sections that have been added/removed from a memory block. This
>> allows us to know when the last memory section of a memory block has been
>> removed so we can remove the memory block.
>>
>> Signed-off-by: Nathan Fontenot 
>>
>> ---
>>  drivers/base/memory.c  |   16 ++--
>>  include/linux/memory.h |3 +++
>>  2 files changed, 13 insertions(+), 6 deletions(-)
>>
>> Index: linux-next/drivers/base/memory.c
>> ===
>> --- linux-next.orig/drivers/base/memory.c2010-09-27 09:17:20.0 
>> -0500
>> +++ linux-next/drivers/base/memory.c 2010-09-27 09:31:35.0 -0500
>> @@ -478,6 +478,7 @@
>>  
>>  mem->phys_index = __section_nr(section);
>>  mem->state = state;
>> +atomic_inc(&mem->section_count);
>>  mutex_init(&mem->state_mutex);
>>  start_pfn = section_nr_to_pfn(mem->phys_index);
>>  mem->phys_device = arch_get_memory_phys_device(start_pfn);
>> @@ -505,12 +506,15 @@
>>  struct memory_block *mem;
>>  
>>  mem = find_memory_block(section);
>> -unregister_mem_sect_under_nodes(mem);
>> -mem_remove_simple_file(mem, phys_index);
>> -mem_remove_simple_file(mem, state);
>> -mem_remove_simple_file(mem, phys_device);
>> -mem_remove_simple_file(mem, removable);
>> -unregister_memory(mem, section);
>> +
>> +if (atomic_dec_and_test(&mem->section_count)) {
>> +unregister_mem_sect_under_nodes(mem);
>> +mem_remove_simple_file(mem, phys_index);
>> +mem_remove_simple_file(mem, state);
>> +mem_remove_simple_file(mem, phys_device);
>> +mem_remove_simple_file(mem, removable);
>> +unregister_memory(mem, section);
>> +}
>>  
>>  return 0;
>>  }
>> Index: linux-next/include/linux/memory.h
>> ===
>> --- linux-next.orig/include/linux/memory.h   2010-09-27 09:17:20.0 
>> -0500
>> +++ linux-next/include/linux/memory.h2010-09-27 09:22:56.0 
>> -0500
>> @@ -19,10 +19,13 @@
>>  #include 
>>  #include 
>>  #include 
>> +#include 
>>  
>>  struct memory_block {
>>  unsigned long phys_index;
>>  unsigned long state;
>> +atomic_t section_count;
>> +
>>  /*
>>   * This serializes all state change requests.  It isn't
>>   * held during creation because the control files are
>>
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>> the body of a message to majord...@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>> Please read the FAQ at  http://www.tux.org/lkml/

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 4/8] v2 Allow memory block to span multiple memory sections

2010-09-28 Thread Nathan Fontenot
On 09/27/2010 06:55 PM, Dave Hansen wrote:
> On Mon, 2010-09-27 at 14:25 -0500, Nathan Fontenot wrote:
>> +static inline int base_memory_block_id(int section_nr)
>> +{
>> +   return section_nr / sections_per_block;
>> +}
> ...
>> -   mutex_lock(&mem_sysfs_mutex);
>> -
>> -   mem->phys_index = __section_nr(section);
>> +   scn_nr = __section_nr(section);
>> +   mem->phys_index = base_memory_block_id(scn_nr) * sections_per_block; 
> 
> I'm really regretting giving this variable such a horrid name.  I suck.
> 
> I think this is correct now:
> 
>   mem->phys_index = base_memory_block_id(scn_nr) * sections_per_block;
>   mem->phys_index = section_nr / sections_per_block * sections_per_block;
>   mem->phys_index = section_nr
> 
> Since it gets exported to userspace this way:
> 
>> +static ssize_t show_mem_start_phys_index(struct sys_device *dev,
>> struct sysdev_attribute *attr, char *buf)
>>  {
>> struct memory_block *mem =
>> container_of(dev, struct memory_block, sysdev);
>> -   return sprintf(buf, "%08lx\n", mem->phys_index / sections_per_block);
>> +   unsigned long phys_index;
>> +
>> +   phys_index = mem->start_phys_index / sections_per_block;
>> +   return sprintf(buf, "%08lx\n", phys_index);
>> +}
> 
> The only other thing I'd say is that we need to put phys_index out of
> its misery and call it what it is now: a section number.  I think it's
> OK to call them "start/end_section_nr", at least inside the kernel.  I
> intentionally used "phys_index" terminology in sysfs so that we _could_
> eventually do this stuff and break the relationship between sections and
> the sysfs dirs, but I think keeping the terminology around inside the
> kernel is confusing now.

Yes, it took me a couple of looks to get the phys_index <-> section number
correlation.  I think changing the kernel names to start/end_section_number
is a good idea.

-Nathan

> 
> -- Dave
> 

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 8/8] v2 Update memory hotplug documentation

2010-09-27 Thread Nathan Fontenot
Update the memory hotplug documentation to reflect the new behaviors of
memory blocks reflected in sysfs.

Signed-off-by: Nathan Fontenot 

---
 Documentation/memory-hotplug.txt |   46 +--
 1 file changed, 30 insertions(+), 16 deletions(-)

Index: linux-next/Documentation/memory-hotplug.txt
===
--- linux-next.orig/Documentation/memory-hotplug.txt2010-09-27 
13:49:33.0 -0500
+++ linux-next/Documentation/memory-hotplug.txt 2010-09-27 13:50:48.0 
-0500
@@ -126,36 +126,50 @@
 
 4 sysfs files for memory hotplug
 
-All sections have their device information under /sys/devices/system/memory as
+All sections have their device information in sysfs.  Each section is part of
+a memory block under /sys/devices/system/memory as
 
 /sys/devices/system/memory/memoryXXX
-(XXX is section id.)
+(XXX is the section id.)
 
-Now, XXX is defined as start_address_of_section / section_size.
+Now, XXX is defined as (start_address_of_section / section_size) of the first
+section contained in the memory block.  The files 'phys_index' and
+'end_phys_index' under each directory report the beginning and end section id's
+for the memory block covered by the sysfs directory.  It is expected that all
+memory sections in this range are present and no memory holes exist in the
+range. Currently there is no way to determine if there is a memory hole, but
+the existence of one should not affect the hotplug capabilities of the memory
+block.
 
 For example, assume 1GiB section size. A device for a memory starting at
 0x100000000 is /sys/device/system/memory/memory4
 (0x100000000 / 1Gib = 4)
 This device covers address range [0x100000000 ... 0x140000000)
 
-Under each section, you can see 4 files.
+Under each section, you can see 5 files.
 
-/sys/devices/system/memory/memoryXXX/phys_index
+/sys/devices/system/memory/memoryXXX/start_phys_index
+/sys/devices/system/memory/memoryXXX/end_phys_index
 /sys/devices/system/memory/memoryXXX/phys_device
 /sys/devices/system/memory/memoryXXX/state
 /sys/devices/system/memory/memoryXXX/removable
 
-'phys_index' : read-only and contains section id, same as XXX.
-'state'  : read-write
-   at read:  contains online/offline state of memory.
-   at write: user can specify "online", "offline" command
-'phys_device': read-only: designed to show the name of physical memory device.
-   This is not well implemented now.
-'removable'  : read-only: contains an integer value indicating
-   whether the memory section is removable or not
-   removable.  A value of 1 indicates that the memory
-   section is removable and a value of 0 indicates that
-   it is not removable.
+'phys_index'  : read-only and contains section id of the first section
+   in the memory block, same as XXX.
+'end_phys_index'  : read-only and contains section id of the last section
+   in the memory block.
+'state'   : read-write
+at read:  contains online/offline state of memory.
+at write: user can specify "online", "offline" command
+which will be performed on all sections in the block.
+'phys_device' : read-only: designed to show the name of physical memory
+device.  This is not well implemented now.
+'removable'   : read-only: contains an integer value indicating
+whether the memory block is removable or not
+removable.  A value of 1 indicates that the memory
+block is removable and a value of 0 indicates that
+it is not removable. A memory block is removable only if
+every section in the block is removable.
 
 NOTE:
   These directories/files appear after physical memory hotplug phase.


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 7/8] v2 Define memory_block_size_bytes() for powerpc/pseries

2010-09-27 Thread Nathan Fontenot
Define a version of memory_block_size_bytes() for powerpc/pseries such that
a memory block spans an entire lmb.

Signed-off-by: Nathan Fontenot 

---
 arch/powerpc/platforms/pseries/hotplug-memory.c |   66 +++-
 1 file changed, 53 insertions(+), 13 deletions(-)

Index: linux-next/arch/powerpc/platforms/pseries/hotplug-memory.c
===
--- linux-next.orig/arch/powerpc/platforms/pseries/hotplug-memory.c 
2010-09-27 13:49:34.0 -0500
+++ linux-next/arch/powerpc/platforms/pseries/hotplug-memory.c  2010-09-27 
13:50:45.0 -0500
@@ -17,6 +17,54 @@
 #include 
 #include 
 
+static u32 get_memblock_size(void)
+{
+   struct device_node *np;
+   unsigned int memblock_size = 0;
+
+   np = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
+   if (np) {
+   const unsigned long *size;
+
+   size = of_get_property(np, "ibm,lmb-size", NULL);
+   memblock_size = size ? *size : 0;
+
+   of_node_put(np);
+   } else {
+   unsigned int memzero_size = 0;
+   const unsigned int *regs;
+
+   np = of_find_node_by_path("/memory@0");
+   if (np) {
+   regs = of_get_property(np, "reg", NULL);
+   memzero_size = regs ? regs[3] : 0;
+   of_node_put(np);
+   }
+
+   if (memzero_size) {
+   /* We now know the size of memory@0, use this to find
+* the first memoryblock and get its size.
+*/
+   char buf[64];
+
+   sprintf(buf, "/memory@%x", memzero_size);
+   np = of_find_node_by_path(buf);
+   if (np) {
+   regs = of_get_property(np, "reg", NULL);
+   memblock_size = regs ? regs[3] : 0;
+   of_node_put(np);
+   }
+   }
+   }
+
+   return memblock_size;
+}
+
+u32 memory_block_size_bytes(void)
+{
+   return get_memblock_size();
+}
+
 static int pseries_remove_memblock(unsigned long base, unsigned int 
memblock_size)
 {
unsigned long start, start_pfn;
@@ -127,30 +175,22 @@
 
 static int pseries_drconf_memory(unsigned long *base, unsigned int action)
 {
-   struct device_node *np;
-   const unsigned long *lmb_size;
+   unsigned long memblock_size;
int rc;
 
-   np = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
-   if (!np)
+   memblock_size = get_memblock_size();
+   if (!memblock_size)
return -EINVAL;
 
-   lmb_size = of_get_property(np, "ibm,lmb-size", NULL);
-   if (!lmb_size) {
-   of_node_put(np);
-   return -EINVAL;
-   }
-
if (action == PSERIES_DRCONF_MEM_ADD) {
-   rc = memblock_add(*base, *lmb_size);
+   rc = memblock_add(*base, memblock_size);
rc = (rc < 0) ? -EINVAL : 0;
} else if (action == PSERIES_DRCONF_MEM_REMOVE) {
-   rc = pseries_remove_memblock(*base, *lmb_size);
+   rc = pseries_remove_memblock(*base, memblock_size);
} else {
rc = -EINVAL;
}
 
-   of_node_put(np);
return rc;
 }

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 6/8] v2 Update node sysfs code

2010-09-27 Thread Nathan Fontenot
Update the node sysfs code to be aware of the new capability for a memory
block to contain multiple memory sections.  This requires an additional
parameter to unregister_mem_sect_under_nodes so that we know which memory
section of the memory block to unregister.

Signed-off-by: Nathan Fontenot 

---
 drivers/base/memory.c |2 +-
 drivers/base/node.c   |   12 
 include/linux/node.h  |6 --
 3 files changed, 13 insertions(+), 7 deletions(-)

Index: linux-next/drivers/base/node.c
===
--- linux-next.orig/drivers/base/node.c 2010-09-27 13:49:36.0 -0500
+++ linux-next/drivers/base/node.c  2010-09-27 13:50:43.0 -0500
@@ -346,8 +346,10 @@
return -EFAULT;
if (!node_online(nid))
return 0;
-   sect_start_pfn = section_nr_to_pfn(mem_blk->phys_index);
-   sect_end_pfn = sect_start_pfn + PAGES_PER_SECTION - 1;
+
+   sect_start_pfn = section_nr_to_pfn(mem_blk->start_phys_index);
+   sect_end_pfn = section_nr_to_pfn(mem_blk->end_phys_index);
+   sect_end_pfn += PAGES_PER_SECTION - 1;
for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) {
int page_nid;
 
@@ -371,7 +373,8 @@
 }
 
 /* unregister memory section under all nodes that it spans */
-int unregister_mem_sect_under_nodes(struct memory_block *mem_blk)
+int unregister_mem_sect_under_nodes(struct memory_block *mem_blk,
+   unsigned long phys_index)
 {
NODEMASK_ALLOC(nodemask_t, unlinked_nodes, GFP_KERNEL);
unsigned long pfn, sect_start_pfn, sect_end_pfn;
@@ -383,7 +386,8 @@
if (!unlinked_nodes)
return -ENOMEM;
nodes_clear(*unlinked_nodes);
-   sect_start_pfn = section_nr_to_pfn(mem_blk->phys_index);
+
+   sect_start_pfn = section_nr_to_pfn(phys_index);
sect_end_pfn = sect_start_pfn + PAGES_PER_SECTION - 1;
for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) {
int nid;
Index: linux-next/drivers/base/memory.c
===
--- linux-next.orig/drivers/base/memory.c   2010-09-27 13:50:38.0 
-0500
+++ linux-next/drivers/base/memory.c2010-09-27 13:50:43.0 -0500
@@ -587,9 +587,9 @@
 
mutex_lock(&mem_sysfs_mutex);
mem = find_memory_block(section);
+   unregister_mem_sect_under_nodes(mem, __section_nr(section));
 
if (atomic_dec_and_test(&mem->section_count)) {
-   unregister_mem_sect_under_nodes(mem);
mem_remove_simple_file(mem, phys_index);
mem_remove_simple_file(mem, end_phys_index);
mem_remove_simple_file(mem, state);
Index: linux-next/include/linux/node.h
===
--- linux-next.orig/include/linux/node.h2010-09-27 13:49:36.0 
-0500
+++ linux-next/include/linux/node.h 2010-09-27 13:50:43.0 -0500
@@ -44,7 +44,8 @@
 extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid);
 extern int register_mem_sect_under_node(struct memory_block *mem_blk,
int nid);
-extern int unregister_mem_sect_under_nodes(struct memory_block *mem_blk);
+extern int unregister_mem_sect_under_nodes(struct memory_block *mem_blk,
+  unsigned long phys_index);
 
 #ifdef CONFIG_HUGETLBFS
 extern void register_hugetlbfs_with_node(node_registration_func_t doregister,
@@ -72,7 +73,8 @@
 {
return 0;
 }
-static inline int unregister_mem_sect_under_nodes(struct memory_block *mem_blk)
+static inline int unregister_mem_sect_under_nodes(struct memory_block *mem_blk,
+ unsigned long phys_index)
 {
return 0;
 }


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 5/8] v2 Add end_phys_index file

2010-09-27 Thread Nathan Fontenot
Update the 'phys_index' properties of a memory block to include a
'start_phys_index' which is the same as the current 'phys_index' property.
The property still appears as 'phys_index' in sysfs but the memory_block
struct name is updated to indicate the start and end values.
This also adds an 'end_phys_index' property to indicate the id of the
last section in the memory block.

Signed-off-by: Nathan Fontenot 

---
 drivers/base/memory.c  |   39 ++-
 include/linux/memory.h |3 ++-
 2 files changed, 32 insertions(+), 10 deletions(-)

Index: linux-next/drivers/base/memory.c
===
--- linux-next.orig/drivers/base/memory.c   2010-09-27 13:50:18.0 
-0500
+++ linux-next/drivers/base/memory.c2010-09-27 13:50:38.0 -0500
@@ -97,7 +97,7 @@
int error;
 
memory->sysdev.cls = &memory_sysdev_class;
-   memory->sysdev.id = memory->phys_index / sections_per_block;
+   memory->sysdev.id = memory->start_phys_index / sections_per_block;
 
error = sysdev_register(&memory->sysdev);
return error;
@@ -138,12 +138,26 @@
  * uses.
  */
 
-static ssize_t show_mem_phys_index(struct sys_device *dev,
+static ssize_t show_mem_start_phys_index(struct sys_device *dev,
struct sysdev_attribute *attr, char *buf)
 {
struct memory_block *mem =
container_of(dev, struct memory_block, sysdev);
-   return sprintf(buf, "%08lx\n", mem->phys_index / sections_per_block);
+   unsigned long phys_index;
+
+   phys_index = mem->start_phys_index / sections_per_block;
+   return sprintf(buf, "%08lx\n", phys_index);
+}
+
+static ssize_t show_mem_end_phys_index(struct sys_device *dev,
+   struct sysdev_attribute *attr, char *buf)
+{
+   struct memory_block *mem =
+   container_of(dev, struct memory_block, sysdev);
+   unsigned long phys_index;
+
+   phys_index = mem->end_phys_index / sections_per_block;
+   return sprintf(buf, "%08lx\n", phys_index);
 }
 
 /*
@@ -158,7 +172,7 @@
container_of(dev, struct memory_block, sysdev);
 
for (i = 0; i < sections_per_block; i++) {
-   pfn = section_nr_to_pfn(mem->phys_index + i);
+   pfn = section_nr_to_pfn(mem->start_phys_index + i);
ret &= is_mem_section_removable(pfn, PAGES_PER_SECTION);
}
 
@@ -275,14 +289,15 @@
mem->state = MEM_GOING_OFFLINE;
 
for (i = 0; i < sections_per_block; i++) {
-   ret = memory_section_action(mem->phys_index + i, to_state);
+   ret = memory_section_action(mem->start_phys_index + i,
+   to_state);
if (ret)
break;
}
 
if (ret) {
for (i = 0; i < sections_per_block; i++)
-   memory_section_action(mem->phys_index + i,
+   memory_section_action(mem->start_phys_index + i,
  from_state_req);
 
mem->state = from_state_req;
@@ -330,7 +345,8 @@
return sprintf(buf, "%d\n", mem->phys_device);
 }
 
-static SYSDEV_ATTR(phys_index, 0444, show_mem_phys_index, NULL);
+static SYSDEV_ATTR(phys_index, 0444, show_mem_start_phys_index, NULL);
+static SYSDEV_ATTR(end_phys_index, 0444, show_mem_end_phys_index, NULL);
 static SYSDEV_ATTR(state, 0644, show_mem_state, store_mem_state);
 static SYSDEV_ATTR(phys_device, 0444, show_phys_device, NULL);
 static SYSDEV_ATTR(removable, 0444, show_mem_removable, NULL);
@@ -514,17 +530,21 @@
return -ENOMEM;
 
scn_nr = __section_nr(section);
-   mem->phys_index = base_memory_block_id(scn_nr) * sections_per_block;
+   mem->start_phys_index =
+   base_memory_block_id(scn_nr) * sections_per_block;
+   mem->end_phys_index = mem->start_phys_index + sections_per_block - 1;
mem->state = state;
atomic_inc(&mem->section_count);
mutex_init(&mem->state_mutex);
-   start_pfn = section_nr_to_pfn(mem->phys_index);
+   start_pfn = section_nr_to_pfn(mem->start_phys_index);
mem->phys_device = arch_get_memory_phys_device(start_pfn);
 
ret = register_memory(mem);
if (!ret)
ret = mem_create_simple_file(mem, phys_index);
if (!ret)
+   ret = mem_create_simple_file(mem, end_phys_index);
+   if (!ret)
ret = mem_create_simple_file(mem, state);
if (!ret)
ret = mem_create_simple_file(mem, phys_device);
@@ -571,6 +591,7 @@
if (atomic_dec_and_test(&mem->section_count)) {
   

[PATCH 4/8] v2 Allow memory block to span multiple memory sections

2010-09-27 Thread Nathan Fontenot
Update the memory sysfs code such that each sysfs memory directory is now
considered a memory block that can span multiple memory sections per
memory block.  The default size of each memory block is SECTION_SIZE_BITS
to maintain the current behavior of having a single memory section per
memory block (i.e. one sysfs directory per memory section).

For architectures that want to have memory blocks span multiple
memory sections they need only define their own memory_block_size_bytes()
routine.

Signed-off-by: Nathan Fontenot 

---
 drivers/base/memory.c |  155 ++
 1 file changed, 108 insertions(+), 47 deletions(-)

Index: linux-next/drivers/base/memory.c
===
--- linux-next.orig/drivers/base/memory.c   2010-09-27 09:31:57.0 
-0500
+++ linux-next/drivers/base/memory.c2010-09-27 13:50:18.0 -0500
@@ -30,6 +30,14 @@
 static DEFINE_MUTEX(mem_sysfs_mutex);
 
 #define MEMORY_CLASS_NAME  "memory"
+#define MIN_MEMORY_BLOCK_SIZE  (1 << SECTION_SIZE_BITS)
+
+static int sections_per_block;
+
+static inline int base_memory_block_id(int section_nr)
+{
+   return section_nr / sections_per_block;
+}
 
 static struct sysdev_class memory_sysdev_class = {
.name = MEMORY_CLASS_NAME,
@@ -84,28 +92,47 @@
  * register_memory - Setup a sysfs device for a memory block
  */
 static
-int register_memory(struct memory_block *memory, struct mem_section *section)
+int register_memory(struct memory_block *memory)
 {
int error;
 
memory->sysdev.cls = &memory_sysdev_class;
-   memory->sysdev.id = __section_nr(section);
+   memory->sysdev.id = memory->phys_index / sections_per_block;
 
error = sysdev_register(&memory->sysdev);
return error;
 }
 
 static void
-unregister_memory(struct memory_block *memory, struct mem_section *section)
+unregister_memory(struct memory_block *memory)
 {
BUG_ON(memory->sysdev.cls != &memory_sysdev_class);
-   BUG_ON(memory->sysdev.id != __section_nr(section));
 
/* drop the ref. we got in remove_memory_block() */
kobject_put(&memory->sysdev.kobj);
sysdev_unregister(&memory->sysdev);
 }
 
+u32 __weak memory_block_size_bytes(void)
+{
+   return MIN_MEMORY_BLOCK_SIZE;
+}
+
+static u32 get_memory_block_size(void)
+{
+   u32 block_sz;
+
+   block_sz = memory_block_size_bytes();
+
+   /* Validate block_sz is a power of 2 and not less than section size */
+   if ((block_sz & (block_sz - 1)) || (block_sz < MIN_MEMORY_BLOCK_SIZE)) {
+   WARN_ON(1);
+   block_sz = MIN_MEMORY_BLOCK_SIZE;
+   }
+
+   return block_sz;
+}
+
 /*
  * use this as the physical section index that this memsection
  * uses.
@@ -116,7 +143,7 @@
 {
struct memory_block *mem =
container_of(dev, struct memory_block, sysdev);
-   return sprintf(buf, "%08lx\n", mem->phys_index);
+   return sprintf(buf, "%08lx\n", mem->phys_index / sections_per_block);
 }
 
 /*
@@ -125,13 +152,16 @@
 static ssize_t show_mem_removable(struct sys_device *dev,
struct sysdev_attribute *attr, char *buf)
 {
-   unsigned long start_pfn;
-   int ret;
+   unsigned long i, pfn;
+   int ret = 1;
struct memory_block *mem =
container_of(dev, struct memory_block, sysdev);
 
-   start_pfn = section_nr_to_pfn(mem->phys_index);
-   ret = is_mem_section_removable(start_pfn, PAGES_PER_SECTION);
+   for (i = 0; i < sections_per_block; i++) {
+   pfn = section_nr_to_pfn(mem->phys_index + i);
+   ret &= is_mem_section_removable(pfn, PAGES_PER_SECTION);
+   }
+
return sprintf(buf, "%d\n", ret);
 }
 
@@ -184,17 +214,14 @@
  * OK to have direct references to sparsemem variables in here.
  */
 static int
-memory_block_action(struct memory_block *mem, unsigned long action)
+memory_section_action(unsigned long phys_index, unsigned long action)
 {
int i;
-   unsigned long psection;
unsigned long start_pfn, start_paddr;
struct page *first_page;
int ret;
-   int old_state = mem->state;
 
-   psection = mem->phys_index;
-   first_page = pfn_to_page(psection << PFN_SECTION_SHIFT);
+   first_page = pfn_to_page(phys_index << PFN_SECTION_SHIFT);
 
/*
 * The probe routines leave the pages reserved, just
@@ -207,8 +234,8 @@
continue;
 
printk(KERN_WARNING "section number %ld page number %d "
-   "not reserved, was it already online? \n",
-   psection, i);
+   "not reserved, was it already online?\n",
+ 

[PATCH 3/8] v2 Add mutex for adding/removing memory blocks

2010-09-27 Thread Nathan Fontenot
Add a new mutex for use in adding and removing of memory blocks.  This
is needed to avoid any race conditions in which the same memory block could
be added and removed at the same time.

Signed-off-by: Nathan Fontenot 

---
 drivers/base/memory.c |7 +++
 1 file changed, 7 insertions(+)

Index: linux-next/drivers/base/memory.c
===
--- linux-next.orig/drivers/base/memory.c   2010-09-27 09:31:35.0 
-0500
+++ linux-next/drivers/base/memory.c2010-09-27 09:31:57.0 -0500
@@ -27,6 +27,8 @@
 #include 
 #include 
 
+static DEFINE_MUTEX(mem_sysfs_mutex);
+
 #define MEMORY_CLASS_NAME  "memory"
 
 static struct sysdev_class memory_sysdev_class = {
@@ -476,6 +478,8 @@
if (!mem)
return -ENOMEM;
 
+   mutex_lock(&mem_sysfs_mutex);
+
mem->phys_index = __section_nr(section);
mem->state = state;
atomic_inc(&mem->section_count);
@@ -497,6 +501,7 @@
ret = register_mem_sect_under_node(mem, nid);
}
 
+   mutex_unlock(&mem_sysfs_mutex);
return ret;
 }
 
@@ -505,6 +510,7 @@
 {
struct memory_block *mem;
 
+   mutex_lock(&mem_sysfs_mutex);
mem = find_memory_block(section);
 
if (atomic_dec_and_test(&mem->section_count)) {
@@ -516,6 +522,7 @@
unregister_memory(mem, section);
}
 
+   mutex_unlock(&mem_sysfs_mutex);
return 0;
 }


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


<    1   2   3   4   5   6   7   8   >