Hello.

I'm sending v2 of the patch set based on the discussion I had with Honza.
Changes from previous version:
- I changed type of edge count from uint32_t to uint64_t.
- The algorithm traverses recursively inline clones.
- TDF_DUMP_DETAILS is supported and provides more information.
- I added cgraph_node_cmp_by_text_sorted function that is used
  for symbol sorting both in cgraphunit.c and lto-cgraph.c.

I made the same testing as before I get following numbers of tramp3d w/ -O2:
Total runs: 15, before: 13.89, after: 13.81, cmp: 99.383%

and iTLB-load-misses:u goes from 15,450,298 to 11,633,215.

The corresponding binutils patch part was partially accepted (for BFD) and
today I was given a feedback for ld.gold.

Thoughts?
Thanks,
Martin
>From 9d62da95dc8efd4194565fca6471b3ddb6109c9b Mon Sep 17 00:00:00 2001
From: Martin Liska <mli...@suse.cz>
Date: Thu, 5 Sep 2019 13:32:41 +0200
Subject: [PATCH] Add new ipa-reorder pass.

gcc/ChangeLog:

2019-11-25  Martin Liska  <mli...@suse.cz>

	* Makefile.in: Add ipa-reorder.o.
	* cgraph.c (cgraph_node::dump): Dump
	text_sorted_order.
	(cgraph_node_cmp_by_text_sorted):
	New function that sorts functions based
	on text_sorted_order.
	* cgraph.h (cgraph_node): Add text_sorted_order.
	(cgraph_node_cmp_by_text_sorted): New.
	* cgraphclones.c (cgraph_node::create_clone):
	Clone also text_sorted_order.
	* cgraphunit.c (node_cmp): Remove.
	(expand_all_functions): Use new function
	cgraph_node_cmp_by_text_sorted.
	* common.opt: Add new option reorder_functions_algorithm.
	* flag-types.h (enum reorder_functions_algorithm):
	New enum.
	* ipa-reorder.c: New file.
	* lto-cgraph.c (lto_output_node): Stream in and out
	text_sorted_order.
	(input_node): Likewise.
	* passes.def: Add pass_ipa_reorder.
	* timevar.def (TV_IPA_REORDER): New.
	* tree-pass.h (make_pass_ipa_reorder): New.
	* varasm.c (default_function_section): Assign text.sorted.X
	section.

gcc/lto/ChangeLog:

2019-11-25  Martin Liska  <mli...@suse.cz>

	* lto-partition.c (node_cmp): Remove.
	(lto_balanced_map): Use new cgraph_node_cmp_by_text_sorted.
---
 gcc/Makefile.in         |   1 +
 gcc/cgraph.c            |  20 +++
 gcc/cgraph.h            |   3 +
 gcc/cgraphclones.c      |   1 +
 gcc/cgraphunit.c        |  21 +--
 gcc/common.opt          |  14 ++
 gcc/flag-types.h        |   8 +
 gcc/ipa-reorder.c       | 383 ++++++++++++++++++++++++++++++++++++++++
 gcc/lto-cgraph.c        |   2 +
 gcc/lto/lto-partition.c |  33 +---
 gcc/passes.def          |   1 +
 gcc/timevar.def         |   1 +
 gcc/tree-pass.h         |   1 +
 gcc/varasm.c            |   9 +
 14 files changed, 448 insertions(+), 50 deletions(-)
 create mode 100644 gcc/ipa-reorder.c

diff --git a/gcc/Makefile.in b/gcc/Makefile.in
index 7d3c13230e4..163d47c47f1 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -1369,6 +1369,7 @@ OBJS = \
 	init-regs.o \
 	internal-fn.o \
 	ipa-cp.o \
+	ipa-reorder.o \
 	ipa-sra.o \
 	ipa-devirt.o \
 	ipa-fnsummary.o \
diff --git a/gcc/cgraph.c b/gcc/cgraph.c
index 1f7a5c58d98..0e31582d2ce 100644
--- a/gcc/cgraph.c
+++ b/gcc/cgraph.c
@@ -1951,6 +1951,8 @@ cgraph_node::dump (FILE *f)
     }
   if (tp_first_run > 0)
     fprintf (f, " first_run:%i", tp_first_run);
+  if (text_sorted_order > 0)
+    fprintf (f, " text_sorted_order:%i", text_sorted_order);
   if (origin)
     fprintf (f, " nested in:%s", origin->asm_name ());
   if (gimple_has_body_p (decl))
@@ -3696,6 +3698,24 @@ cgraph_edge::possibly_call_in_translation_unit_p (void)
   return node->get_availability () >= AVAIL_INTERPOSABLE;
 }
 
+/* Sort cgraph_nodes by text_sorted_order if available, or by order.  */
+
+int
+cgraph_node_cmp_by_text_sorted (const void *pa, const void *pb)
+{
+  const cgraph_node *a = *(const cgraph_node * const *) pa;
+  const cgraph_node *b = *(const cgraph_node * const *) pb;
+
+  /* Functions with text_sorted_order should be before these
+     without profile.  */
+  if (a->text_sorted_order == 0 || b->text_sorted_order == 0)
+    return a->text_sorted_order - b->text_sorted_order;
+
+  return a->text_sorted_order != b->text_sorted_order
+	 ? b->text_sorted_order - a->text_sorted_order
+	 : b->order - a->order;
+}
+
 /* A stashed copy of "symtab" for use by selftest::symbol_table_test.
    This needs to be a global so that it can be a GC root, and thus
    prevent the stashed copy from being garbage-collected if the GC runs
diff --git a/gcc/cgraph.h b/gcc/cgraph.h
index a4f14743f00..ad6ad7ef513 100644
--- a/gcc/cgraph.h
+++ b/gcc/cgraph.h
@@ -1431,6 +1431,8 @@ struct GTY((tag ("SYMTAB_FUNCTION"))) cgraph_node : public symtab_node
   unsigned int profile_id;
   /* Time profiler: first run of function.  */
   int tp_first_run;
+  /* Order in .text.sorted.* section.  */
+  int text_sorted_order;
 
   /* Set when decl is an abstract function pointed to by the
      ABSTRACT_DECL_ORIGIN of a reachable function.  */
@@ -2447,6 +2449,7 @@ bool cgraph_function_possibly_inlined_p (tree);
 
 const char* cgraph_inline_failed_string (cgraph_inline_failed_t);
 cgraph_inline_failed_type_t cgraph_inline_failed_type (cgraph_inline_failed_t);
+int cgraph_node_cmp_by_text_sorted (const void *pa, const void *pb);
 
 /* In cgraphunit.c  */
 void cgraphunit_c_finalize (void);
diff --git a/gcc/cgraphclones.c b/gcc/cgraphclones.c
index bfcebb20495..c15e171d40c 100644
--- a/gcc/cgraphclones.c
+++ b/gcc/cgraphclones.c
@@ -364,6 +364,7 @@ cgraph_node::create_clone (tree new_decl, profile_count prof_count,
   new_node->rtl = rtl;
   new_node->frequency = frequency;
   new_node->tp_first_run = tp_first_run;
+  new_node->text_sorted_order = text_sorted_order;
   new_node->tm_clone = tm_clone;
   new_node->icf_merged = icf_merged;
   new_node->merged_comdat = merged_comdat;
diff --git a/gcc/cgraphunit.c b/gcc/cgraphunit.c
index aa26160bf3f..bacf01ccb22 100644
--- a/gcc/cgraphunit.c
+++ b/gcc/cgraphunit.c
@@ -2267,24 +2267,6 @@ cgraph_node::expand (void)
   remove_all_references ();
 }
 
-/* Node comparator that is responsible for the order that corresponds
-   to time when a function was launched for the first time.  */
-
-static int
-node_cmp (const void *pa, const void *pb)
-{
-  const cgraph_node *a = *(const cgraph_node * const *) pa;
-  const cgraph_node *b = *(const cgraph_node * const *) pb;
-
-  /* Functions with time profile must be before these without profile.  */
-  if (!a->tp_first_run || !b->tp_first_run)
-    return a->tp_first_run - b->tp_first_run;
-
-  return a->tp_first_run != b->tp_first_run
-	 ? b->tp_first_run - a->tp_first_run
-	 : b->order - a->order;
-}
-
 /* Expand all functions that must be output.
 
    Attempt to topologically sort the nodes so function is output when
@@ -2315,7 +2297,8 @@ expand_all_functions (void)
       order[new_order_pos++] = order[i];
 
   if (flag_profile_reorder_functions)
-    qsort (order, new_order_pos, sizeof (cgraph_node *), node_cmp);
+    qsort (order, new_order_pos, sizeof (cgraph_node *),
+	   cgraph_node_cmp_by_text_sorted);
 
   for (i = new_order_pos - 1; i >= 0; i--)
     {
diff --git a/gcc/common.opt b/gcc/common.opt
index 404b6aac298..c2ac1f1f29b 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -2276,6 +2276,20 @@ freorder-functions
 Common Report Var(flag_reorder_functions) Optimization
 Reorder functions to improve code placement.
 
+freorder-functions-algorithm=
+Common Joined RejectNegative Enum(reorder_functions_algorithm) Var(flag_reorder_functions_algorithm) Init(REORDER_FUNCTIONS_ALGORITHM_CALL_CHAIN_CLUSTERING) Optimization
+-freorder-functions-algorithm=[first-run|call-chain-clustering]	Set the used function reordering algorithm.
+
+Enum
+Name(reorder_functions_algorithm) Type(enum reorder_functions_algorithm) UnknownError(unknown function reordering algorithm %qs)
+
+EnumValue
+Enum(reorder_functions_algorithm) String(first-run) Value(REORDER_FUNCTIONS_ALGORITHM_FIRST_RUN)
+
+EnumValue
+Enum(reorder_functions_algorithm) String(call-chain-clustering) Value(REORDER_FUNCTIONS_ALGORITHM_CALL_CHAIN_CLUSTERING)
+
+
 frerun-cse-after-loop
 Common Report Var(flag_rerun_cse_after_loop) Optimization
 Add a common subexpression elimination pass after loop optimizations.
diff --git a/gcc/flag-types.h b/gcc/flag-types.h
index 0c23aadefed..15901280383 100644
--- a/gcc/flag-types.h
+++ b/gcc/flag-types.h
@@ -138,6 +138,14 @@ enum reorder_blocks_algorithm
   REORDER_BLOCKS_ALGORITHM_STC
 };
 
+/* The algorithm used for function reordering.  */
+enum reorder_functions_algorithm
+{
+  REORDER_FUNCTIONS_ALGORITHM_FIRST_RUN,
+  REORDER_FUNCTIONS_ALGORITHM_CALL_CHAIN_CLUSTERING
+};
+
+
 /* The algorithm used for the integrated register allocator (IRA).  */
 enum ira_algorithm
 {
diff --git a/gcc/ipa-reorder.c b/gcc/ipa-reorder.c
new file mode 100644
index 00000000000..36eb5d760b3
--- /dev/null
+++ b/gcc/ipa-reorder.c
@@ -0,0 +1,383 @@
+/* Reorder functions based on profile.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "tm.h"
+#include "tree.h"
+#include "tree-pass.h"
+#include "cgraph.h"
+#include "alloc-pool.h"
+#include "symbol-summary.h"
+#include "tree-vrp.h"
+#include "alloc-pool.h"
+#include "ipa-prop.h"
+#include "ipa-fnsummary.h"
+#include "fibonacci_heap.h"
+#include <limits>
+
+using namespace std;
+
+namespace {
+
+#define C3_CLUSTER_THRESHOLD 1024
+
+struct cluster_edge;
+
+/* Cluster is set of functions considered in C3 algorithm.  */
+
+struct cluster
+{
+  cluster (cgraph_node *node, int size, sreal time):
+    m_functions (), m_callers (), m_size (size), m_time (time)
+  {
+    m_functions.safe_push (node);
+  }
+
+  vec<cgraph_node *> m_functions;
+  hash_map <cluster *, cluster_edge *> m_callers;
+  int m_size;
+  sreal m_time;
+};
+
+/* Cluster edge is an oriented edge in between two clusters.  */
+
+struct cluster_edge
+{
+  cluster_edge (cluster *caller, cluster *callee, uint64_t count):
+    m_caller (caller), m_callee (callee), m_count (count), m_heap_node (NULL)
+  {}
+
+
+  uint64_t inverted_count ()
+  {
+    return numeric_limits<uint64_t>::max () - m_count;
+  }
+
+  cluster *m_caller;
+  cluster *m_callee;
+  uint64_t m_count;
+  fibonacci_node<uint64_t, cluster_edge> *m_heap_node;
+};
+
+/* Sort functions based of first execution of the function.  */
+
+static void
+sort_functions_by_first_run (void)
+{
+  cgraph_node *node;
+
+  FOR_EACH_DEFINED_FUNCTION (node)
+    if (node->tp_first_run && !node->alias)
+      node->text_sorted_order = node->tp_first_run;
+}
+
+/* Compare clusters by density after that are established.  */
+
+static int
+cluster_cmp (const void *a_p, const void *b_p)
+{
+  const cluster *a = *(cluster * const *)a_p;
+  const cluster *b = *(cluster * const *)b_p;
+
+  unsigned fncounta = a->m_functions.length ();
+  unsigned fncountb = b->m_functions.length ();
+  if (fncounta <= 1 || fncountb <= 1)
+    return fncountb - fncounta;
+
+  sreal r = b->m_time * a->m_size - a->m_time * b->m_size;
+  return (r < 0) ? -1 : ((r > 0) ? 1 : 0);
+}
+
+/* Visit callgraph edge CS until we reach a real cgraph_node (not a clone).
+   Record such edge to EDGES or traverse recursively.  */
+
+static void
+visit_all_edges_for_caller (auto_vec<cluster_edge *> *edges,
+			    cgraph_node *node, cgraph_edge *cs)
+{
+  if (cs->inline_failed)
+    {
+      gcov_type count;
+      profile_count pcount = cs->count.ipa ();
+      /* A real call edge.  */
+      if (!cs->callee->alias
+	  && cs->callee->definition
+	  && pcount.initialized_p ()
+	  && (count = pcount.to_gcov_type ()) > 0)
+	{
+	  cluster *caller = (cluster *)node->aux;
+	  cluster *callee = (cluster *)cs->callee->aux;
+	  cluster_edge **cedge = callee->m_callers.get (caller);
+
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    fprintf (dump_file, "Adding edge:%s->%s:%" PRIu64 "\n",
+		     node->dump_name (), cs->callee->dump_name (), count);
+
+	  if (cedge != NULL)
+	    (*cedge)->m_count += count;
+	  else
+	    {
+	      cluster_edge *cedge = new cluster_edge (caller, callee, count);
+	      edges->safe_push (cedge);
+	      callee->m_callers.put (caller, cedge);
+	    }
+	}
+    }
+  else
+    {
+      cgraph_node *clone = cs->callee;
+      for (cgraph_edge *cs = clone->callees; cs; cs = cs->next_callee)
+	visit_all_edges_for_caller (edges, node, cs);
+    }
+}
+
+/* Sort functions based on call chain clustering, which is an algorithm
+   mentioned in the following article:
+   https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf
+   .  */
+
+static void
+sort_functions_by_c3 (void)
+{
+  cgraph_node *node;
+  auto_vec<cluster *> clusters;
+
+  /* Create a cluster for each function.  */
+  FOR_EACH_DEFINED_FUNCTION (node)
+    if (!node->alias
+	&& !node->inlined_to)
+      {
+	if (dump_file && (dump_flags & TDF_DETAILS))
+	  fprintf (dump_file, "Adding node:%s\n", node->dump_name ());
+
+	ipa_size_summary *size_summary = ipa_size_summaries->get (node);
+	ipa_fn_summary *fn_summary = ipa_fn_summaries->get (node);
+	cluster *c = new cluster (node, size_summary->size, fn_summary->time);
+	node->aux = c;
+	clusters.safe_push (c);
+      }
+
+  auto_vec<cluster_edge *> edges;
+
+  /* Insert edges between clusters that have a profile.  */
+  for (unsigned i = 0; i < clusters.length (); i++)
+    {
+      cgraph_node *node = clusters[i]->m_functions[0];
+      for (cgraph_edge *cs = node->callees; cs; cs = cs->next_callee)
+	visit_all_edges_for_caller (&edges, node, cs);
+    }
+
+  /* Now insert all created edges into a heap.  */
+  fibonacci_heap <uint64_t, cluster_edge> heap (0);
+
+  for (unsigned i = 0; i < clusters.length (); i++)
+    {
+      cluster *c = clusters[i];
+      for (hash_map<cluster *, cluster_edge *>::iterator it
+	   = c->m_callers.begin (); it != c->m_callers.end (); ++it)
+	{
+	  cluster_edge *cedge = (*it).second;
+	  cedge->m_heap_node = heap.insert (cedge->inverted_count (), cedge);
+	}
+    }
+
+  while (!heap.empty ())
+    {
+      cluster_edge *cedge = heap.extract_min ();
+      cluster *caller = cedge->m_caller;
+      cluster *callee = cedge->m_callee;
+      cedge->m_heap_node = NULL;
+
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	{
+	  fprintf (dump_file, "Processing cluster edge: %p->%p, count: %"
+		   PRIu64 "\n", (void *)caller, (void *)callee, cedge->m_count);
+	  fprintf (dump_file, "  source functions (%u): ", caller->m_size);
+	  for (unsigned j = 0; j < caller->m_functions.length (); j++)
+	    fprintf (dump_file, "%s ", caller->m_functions[j]->dump_name ());
+	  fprintf (dump_file, "\n  target functions (%u): ", callee->m_size);
+	  for (unsigned j = 0; j < callee->m_functions.length (); j++)
+	    fprintf (dump_file, "%s ", callee->m_functions[j]->dump_name ());
+	  fprintf (dump_file, "\n");
+	}
+
+      if (caller == callee)
+	continue;
+      if (caller->m_size + callee->m_size <= C3_CLUSTER_THRESHOLD)
+	{
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    fprintf (dump_file, "  (clusters merged)\n");
+
+	  caller->m_size += callee->m_size;
+	  caller->m_time += callee->m_time;
+
+	  /* Append all cgraph_nodes from callee to caller.  */
+	  for (unsigned i = 0; i < callee->m_functions.length (); i++)
+	    caller->m_functions.safe_push (callee->m_functions[i]);
+
+	  callee->m_functions.truncate (0);
+
+	  /* Iterate all cluster_edges of callee and add them to the caller.  */
+	  for (hash_map<cluster *, cluster_edge *>::iterator it
+	       = callee->m_callers.begin (); it != callee->m_callers.end ();
+	       ++it)
+	    {
+	      (*it).second->m_callee = caller;
+	      cluster_edge **ce = caller->m_callers.get ((*it).first);
+
+	      if (ce != NULL)
+		{
+		  (*ce)->m_count += (*it).second->m_count;
+		  if ((*ce)->m_heap_node != NULL)
+		    heap.decrease_key ((*ce)->m_heap_node,
+				       (*ce)->inverted_count ());
+		}
+	      else
+		caller->m_callers.put ((*it).first, (*it).second);
+	    }
+	}
+      else if (dump_file && (dump_flags & TDF_DETAILS))
+	fprintf (dump_file, "  (clusters too big to be merged)\n");
+    }
+
+  /* Sort the candidate clusters.  */
+  clusters.qsort (cluster_cmp);
+
+  /* Dump clusters.  */
+  if (dump_file)
+    {
+      for (unsigned i = 0; i < clusters.length (); i++)
+	{
+	  cluster *c = clusters[i];
+	  if (c->m_functions.length () == 0)
+	    continue;
+
+	  fprintf (dump_file, "\nCluster %d with functions: %d, size: %d,"
+		   " density: %f\n", i, c->m_functions.length (), c->m_size,
+		   (c->m_time / c->m_size).to_double ());
+	  fprintf (dump_file, "  functions: ");
+	  for (unsigned j = 0; j < c->m_functions.length (); j++)
+	    fprintf (dump_file, "%s ", c->m_functions[j]->dump_name ());
+	  fprintf (dump_file, "\n");
+	}
+      fprintf (dump_file, "\n");
+    }
+
+  /* Assign .text.sorted.* section names.  */
+  int counter = 1;
+  for (unsigned i = 0; i < clusters.length (); i++)
+    {
+      cluster *c = clusters[i];
+      if (c->m_functions.length () <= 1)
+	continue;
+
+      for (unsigned j = 0; j < c->m_functions.length (); j++)
+	{
+	  cgraph_node *node = c->m_functions[j];
+
+	  if (dump_file)
+	    fprintf (dump_file, "setting: %d for %s with size:%d\n",
+		     counter, node->dump_asm_name (),
+		     ipa_size_summaries->get (node)->size);
+	  node->text_sorted_order = counter++;
+	}
+    }
+
+  /* Release memory.  */
+  FOR_EACH_DEFINED_FUNCTION (node)
+    if (!node->alias)
+      node->aux = NULL;
+
+  for (unsigned i = 0; i < clusters.length (); i++)
+    delete clusters[i];
+
+  for (unsigned i = 0; i < edges.length (); i++)
+    delete edges[i];
+}
+
+/* The main function for function sorting.  */
+
+static unsigned int
+ipa_reorder (void)
+{
+  switch (flag_reorder_functions_algorithm)
+    {
+    case REORDER_FUNCTIONS_ALGORITHM_CALL_CHAIN_CLUSTERING:
+      sort_functions_by_c3 ();
+      break;
+    case REORDER_FUNCTIONS_ALGORITHM_FIRST_RUN:
+      sort_functions_by_first_run ();
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  return 0;
+}
+
+const pass_data pass_data_ipa_reorder =
+{
+  IPA_PASS, /* type */
+  "reorder", /* name */
+  OPTGROUP_NONE, /* optinfo_flags */
+  TV_IPA_REORDER, /* tv_id */
+  0, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  0, /* todo_flags_finish */
+};
+
+class pass_ipa_reorder : public ipa_opt_pass_d
+{
+public:
+  pass_ipa_reorder (gcc::context *ctxt)
+    : ipa_opt_pass_d (pass_data_ipa_reorder, ctxt,
+		      NULL, /* generate_summary */
+		      NULL, /* write_summary */
+		      NULL, /* read_summary */
+		      NULL, /* write_optimization_summary */
+		      NULL, /* read_optimization_summary */
+		      NULL, /* stmt_fixup */
+		      0, /* function_transform_todo_flags_start */
+		      NULL, /* function_transform */
+		      NULL) /* variable_transform */
+  {}
+
+  /* opt_pass methods: */
+  virtual bool gate (function *);
+  virtual unsigned int execute (function *) { return ipa_reorder (); }
+
+}; // class pass_ipa_reorder
+
+bool
+pass_ipa_reorder::gate (function *)
+{
+  return flag_profile_reorder_functions && flag_profile_use && flag_wpa;
+}
+
+} // anon namespace
+
+ipa_opt_pass_d *
+make_pass_ipa_reorder (gcc::context *ctxt)
+{
+  return new pass_ipa_reorder (ctxt);
+}
diff --git a/gcc/lto-cgraph.c b/gcc/lto-cgraph.c
index a4a70e7848c..6ff1ac87ad9 100644
--- a/gcc/lto-cgraph.c
+++ b/gcc/lto-cgraph.c
@@ -502,6 +502,7 @@ lto_output_node (struct lto_simple_output_block *ob, struct cgraph_node *node,
     section = "";
 
   streamer_write_hwi_stream (ob->main_stream, node->tp_first_run);
+  streamer_write_hwi_stream (ob->main_stream, node->text_sorted_order);
 
   bp = bitpack_create (ob->main_stream);
   bp_pack_value (&bp, node->local, 1);
@@ -1274,6 +1275,7 @@ input_node (struct lto_file_decl_data *file_data,
 		    "node with uid %d", node->get_uid ());
 
   node->tp_first_run = streamer_read_uhwi (ib);
+  node->text_sorted_order = streamer_read_uhwi (ib);
 
   bp = streamer_read_bitpack (ib);
 
diff --git a/gcc/lto/lto-partition.c b/gcc/lto/lto-partition.c
index 75cfdaabf42..a798243ea6d 100644
--- a/gcc/lto/lto-partition.c
+++ b/gcc/lto/lto-partition.c
@@ -372,35 +372,6 @@ lto_max_map (void)
     new_partition ("empty");
 }
 
-/* Helper function for qsort; sort nodes by order. noreorder functions must have
-   been removed earlier.  */
-static int
-node_cmp (const void *pa, const void *pb)
-{
-  const struct cgraph_node *a = *(const struct cgraph_node * const *) pa;
-  const struct cgraph_node *b = *(const struct cgraph_node * const *) pb;
-
-  /* Profile reorder flag enables function reordering based on first execution
-     of a function. All functions with profile are placed in ascending
-     order at the beginning.  */
-
-  if (flag_profile_reorder_functions)
-  {
-    /* Functions with time profile are sorted in ascending order.  */
-    if (a->tp_first_run && b->tp_first_run)
-      return a->tp_first_run != b->tp_first_run
-	? a->tp_first_run - b->tp_first_run
-        : a->order - b->order;
-
-    /* Functions with time profile are sorted before the functions
-       that do not have the profile.  */
-    if (a->tp_first_run || b->tp_first_run)
-      return b->tp_first_run - a->tp_first_run;
-  }
-
-  return b->order - a->order;
-}
-
 /* Helper function for qsort; sort nodes by order.  */
 static int
 varpool_node_cmp (const void *pa, const void *pb)
@@ -537,8 +508,8 @@ lto_balanced_map (int n_lto_partitions, int max_partition_size)
      unit tends to import a lot of global trees defined there.  We should
      get better about minimizing the function bounday, but until that
      things works smoother if we order in source order.  */
-  order.qsort (node_cmp);
-  noreorder.qsort (node_cmp);
+  order.qsort (cgraph_node_cmp_by_text_sorted);
+  noreorder.qsort (cgraph_node_cmp_by_text_sorted);
 
   if (dump_file)
     {
diff --git a/gcc/passes.def b/gcc/passes.def
index 798a391bd35..983be6ebe3c 100644
--- a/gcc/passes.def
+++ b/gcc/passes.def
@@ -153,6 +153,7 @@ along with GCC; see the file COPYING3.  If not see
   NEXT_PASS (pass_ipa_fn_summary);
   NEXT_PASS (pass_ipa_inline);
   NEXT_PASS (pass_ipa_pure_const);
+  NEXT_PASS (pass_ipa_reorder);
   NEXT_PASS (pass_ipa_free_fn_summary, false /* small_p */);
   NEXT_PASS (pass_ipa_reference);
   /* This pass needs to be scheduled after any IP code duplication.   */
diff --git a/gcc/timevar.def b/gcc/timevar.def
index 357fcfd65c5..4e575ee406c 100644
--- a/gcc/timevar.def
+++ b/gcc/timevar.def
@@ -76,6 +76,7 @@ DEFTIMEVAR (TV_IPA_INHERITANCE       , "ipa inheritance graph")
 DEFTIMEVAR (TV_IPA_VIRTUAL_CALL      , "ipa virtual call target")
 DEFTIMEVAR (TV_IPA_DEVIRT	     , "ipa devirtualization")
 DEFTIMEVAR (TV_IPA_CONSTANT_PROP     , "ipa cp")
+DEFTIMEVAR (TV_IPA_REORDER	     , "ipa reorder")
 DEFTIMEVAR (TV_IPA_INLINING          , "ipa inlining heuristics")
 DEFTIMEVAR (TV_IPA_FNSPLIT           , "ipa function splitting")
 DEFTIMEVAR (TV_IPA_COMDATS	     , "ipa comdats")
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
index a987661530e..d59ceef0dc2 100644
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@@ -499,6 +499,7 @@ extern ipa_opt_pass_d *make_pass_ipa_inline (gcc::context *ctxt);
 extern simple_ipa_opt_pass *make_pass_ipa_free_lang_data (gcc::context *ctxt);
 extern simple_ipa_opt_pass *make_pass_ipa_free_fn_summary (gcc::context *ctxt);
 extern ipa_opt_pass_d *make_pass_ipa_cp (gcc::context *ctxt);
+extern ipa_opt_pass_d *make_pass_ipa_reorder (gcc::context *ctxt);
 extern ipa_opt_pass_d *make_pass_ipa_sra (gcc::context *ctxt);
 extern ipa_opt_pass_d *make_pass_ipa_icf (gcc::context *ctxt);
 extern ipa_opt_pass_d *make_pass_ipa_devirt (gcc::context *ctxt);
diff --git a/gcc/varasm.c b/gcc/varasm.c
index 57365ad2e76..2fbbdc0526b 100644
--- a/gcc/varasm.c
+++ b/gcc/varasm.c
@@ -601,6 +601,15 @@ default_function_section (tree decl, enum node_frequency freq,
   if (exit && freq != NODE_FREQUENCY_UNLIKELY_EXECUTED)
     return get_named_text_section (decl, ".text.exit", NULL);
 
+  cgraph_node *node = cgraph_node::get (decl);
+  if (node->text_sorted_order > 0)
+    {
+      char section_name[64];
+      sprintf (section_name, ".text.sorted.%010d",
+	       node->text_sorted_order);
+      return get_named_text_section (decl, section_name, NULL);
+    }
+
   /* Group cold functions together, similarly for hot code.  */
   switch (freq)
     {
-- 
2.24.0

Reply via email to