Changelog V2 -> V3

- removed zone ordering selection knobs...

This version is much simpler: it just changes the zonelist ordering.
Tested on ia64 NUMA; it works as expected.

-Kame


change zonelist order on NUMA v3.

[Description]
Assume a 2-node NUMA system in which only node(0) has ZONE_DMA.
(ia64's ZONE_DMA is the memory below 4GB, like x86_64's ZONE_DMA32.)

In this case, current default (node0's) zonelist order is

Node(0)'s NORMAL -> Node(0)'s DMA -> Node(1)'s NORMAL.

This means Node(0)'s DMA will be used before Node(1)'s NORMAL,
which can easily cause OOM in ZONE_DMA.

This patch changes *default* zone order to

Node(0)'s NORMAL -> Node(1)'s NORMAL -> Node(0)'s DMA.

Tested on ia64 2-node NUMA; works well.

Signed-Off-By: KAMEZAWA Hiroyuki <[EMAIL PROTECTED]>

Index: linux-2.6.21-rc7-mm2/mm/page_alloc.c
===================================================================
--- linux-2.6.21-rc7-mm2.orig/mm/page_alloc.c
+++ linux-2.6.21-rc7-mm2/mm/page_alloc.c
@@ -2023,6 +2023,7 @@ void show_free_areas(void)
  *
  * Add all populated zones of a node to the zonelist.
  */
+#ifndef CONFIG_NUMA
 static int __meminit build_zonelists_node(pg_data_t *pgdat,
                        struct zonelist *zonelist, int nr_zones, enum zone_type 
zone_type)
 {
@@ -2042,6 +2043,7 @@ static int __meminit build_zonelists_nod
        } while (zone_type);
        return nr_zones;
 }
+#endif
 
 #ifdef CONFIG_NUMA
 #define MAX_NODE_LOAD (num_online_nodes())
@@ -2106,52 +2108,51 @@ static int __meminit find_next_best_node
        return best_node;
 }
 
+/*
+ * Build zonelist based on zone priority.
+ */
+static int __meminitdata node_order[MAX_NUMNODES];
 static void __meminit build_zonelists(pg_data_t *pgdat)
 {
-       int j, node, local_node;
-       enum zone_type i;
-       int prev_node, load;
-       struct zonelist *zonelist;
+       int i, j, pos, zone_type, node, load;
        nodemask_t used_mask;
+       int local_node, prev_node;
+       struct zone *z;
+       struct zonelist *zonelist;
 
-       /* initialize zonelists */
        for (i = 0; i < MAX_NR_ZONES; i++) {
                zonelist = pgdat->node_zonelists + i;
                zonelist->zones[0] = NULL;
        }
-
-       /* NUMA-aware ordering of nodes */
+       memset(node_order, 0, sizeof(node_order));
        local_node = pgdat->node_id;
        load = num_online_nodes();
        prev_node = local_node;
        nodes_clear(used_mask);
+       j = 0;
        while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
                int distance = node_distance(local_node, node);
-
-               /*
-                * If another node is sufficiently far away then it is better
-                * to reclaim pages in a zone before going off node.
-                */
                if (distance > RECLAIM_DISTANCE)
                        zone_reclaim_mode = 1;
-
-               /*
-                * We don't want to pressure a particular node.
-                * So adding penalty to the first node in same
-                * distance group to make it round-robin.
-                */
-
                if (distance != node_distance(local_node, prev_node))
-                       node_load[node] += load;
+                       node_load[node] = load;
+               node_order[j++] = node;
                prev_node = node;
                load--;
-               for (i = 0; i < MAX_NR_ZONES; i++) {
-                       zonelist = pgdat->node_zonelists + i;
-                       for (j = 0; zonelist->zones[j] != NULL; j++);
-
-                       j = build_zonelists_node(NODE_DATA(node), zonelist, j, 
i);
-                       zonelist->zones[j] = NULL;
+       }
+       /* calculate node order */
+       for (i = 0; i < MAX_NR_ZONES; i++) {
+               zonelist = pgdat->node_zonelists + i;
+               pos = 0;
+               for (zone_type = i; zone_type >= 0; zone_type--) {
+                       for (j = 0; j < num_online_nodes(); j++) {
+                               node = node_order[j];
+                               z = &NODE_DATA(node)->node_zones[zone_type];
+                               if (populated_zone(z))
+                                       zonelist->zones[pos++] = z;
+                       }
                }
+               zonelist->zones[pos] = NULL;
        }
 }
 
@@ -2239,6 +2240,7 @@ void __meminit build_all_zonelists(void)
                __build_all_zonelists(NULL);
                cpuset_init_current_mems_allowed();
        } else {
+               memset(node_load, 0, sizeof(node_load));
                /* we have to stop all cpus to guaranntee there is no user
                   of zonelist */
                stop_machine_run(__build_all_zonelists, NULL, NR_CPUS);

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to