On October 11, 2019 9:03:53 AM GMT+02:00, Jan Hubicka <hubi...@ucw.cz> wrote:
>Hi,
>this patch adds ggc_trim that releases free pages used by GGC memory
>to system.  This is useful to reduce memory footprint of WPA streaming:
>WPA streaming ought to not use any more GGC memory (patches in testing
>for that) and trimming the memory makes it available to fork&malloc
>used by stream out machinery.
>
>I collected some stats for cc1 for both GGC and heap (using mallinfo).
>Memory footprints are as follows:
>
>After streaming in global stream: 123MB GGC;  25MB of heap.
>After streaming in callgraph    : 228MB GGC;  45MB of heap.
>After streaming in summaries    : 373MB GGC; 126MB of heap.
>After symbol merging           : 348MB GGC; 130MB of heap.
>After IPA-ICF                  : 501MB GGC; 160MB of heap. (this is all ICF)
>After IPA-CP                   : 528MB GGC; 163MB of heap.
>After IPA-SRA                  : 532MB GGC; 163MB of heap.
>After Inline                   : 644MB GGC; 173MB of heap
>                               This is after collecting of 118MB of
>                               garbage and returning 740k to system
>                               by madvise_dontneed
>After ipa-reference            : 644MB GGC; 370MB of heap
>                               I checked this all goes into the
>                               bitmaps; I have WIP patch for that
>After releasing summaries      : 431MB GGC; 383MB of heap
>                               Trim releases 43MB by unmap
>                               and 321MB by madvise_dontneed
>
>At least I learnt a new fact about ipa-reference consuming 200MB of
>memory, which was not obvious from our detailed mem stats.
>
>I think the lowest hanging fruit after this patch is to add
>malloc_madvise, which further reduces footprint, and to fix ipa-reference.
>Hopefully Martin will do a bit about ipa-icf.
>
>I will dig into what inliner does but it produces a lot of clones so I
>think it is mostly clone and summary duplication. Perhaps we can avoid
>copying some of summaries for inline clones.
>
>In TOP I see about 900MB instead of 1.4GB before WPA streaming starts
>with both ggc_trim and madvise.
>
>Note that I also tried to hack ggc_free to recognize free pages but at
>least in simple implementation it is a loss since it makes ggc_alloc
>more expensive (it needs to bring pages back and add into freelists)
>which hurts stream-in performance.
>
>I think sweeping once per WPA is no problem, it is definitely less than
>1% of WPA time.
>
>Bootstrapped/regtested x86_64-linux, OK?

Ok.

Richard. 

>       * ggc-page.c (release_pages): Output statistics when !quiet_flag.
>       (ggc_collect): Dump later to not interfere with release_page dump.
>       (ggc_trim): New function.
>       * ggc-none.c (ggc_trim): New.
>
>       * lto.c (lto_wpa_write_files): Call ggc_trim.
>Index: ggc-page.c
>===================================================================
>--- ggc-page.c (revision 276707)
>+++ ggc-page.c (working copy)
>@@ -529,7 +529,6 @@ static void clear_page_group_in_use (pag
> #endif
> static struct page_entry * alloc_page (unsigned);
> static void free_page (struct page_entry *);
>-static void release_pages (void);
> static void clear_marks (void);
> static void sweep_pages (void);
> static void ggc_recalculate_in_use_p (page_entry *);
>@@ -1016,6 +1015,8 @@ free_page (page_entry *entry)
> static void
> release_pages (void)
> {
>+  size_t n1 = 0;
>+  size_t n2 = 0;
> #ifdef USING_MADVISE
>   page_entry *p, *start_p;
>   char *start;
>@@ -1061,6 +1062,7 @@ release_pages (void)
>           else
>             G.free_pages = p;
>           G.bytes_mapped -= mapped_len;
>+        n1 += len;
>         continue;
>         }
>       prev = newprev;
>@@ -1092,6 +1094,7 @@ release_pages (void)
>/* Don't count those pages as mapped to not touch the garbage collector
>          unnecessarily. */
>       G.bytes_mapped -= len;
>+      n2 += len;
>       while (start_p != p)
>         {
>           start_p->discarded = true;
>@@ -1124,6 +1127,7 @@ release_pages (void)
>       }
> 
>       munmap (start, len);
>+      n1 += len;
>       G.bytes_mapped -= len;
>     }
> 
>@@ -1152,10 +1156,20 @@ release_pages (void)
>       *gp = g->next;
>       G.bytes_mapped -= g->alloc_size;
>       free (g->allocation);
>+      n1 += g->alloc_size;
>       }
>     else
>       gp = &g->next;
> #endif
>+  if (!quiet_flag && (n1 || n2))
>+    {
>+      fprintf (stderr, " {GC");
>+      if (n1)
>+      fprintf (stderr, " released %luk", (unsigned long)(n1 / 1024));
>+      if (n2)
>+      fprintf (stderr, " madv_dontneed %luk", (unsigned long)(n2 / 1024));
>+      fprintf (stderr, "}");
>+    }
> }
> 
> /* This table provides a fast way to determine ceil(log_2(size)) for
>@@ -2178,19 +2192,22 @@ ggc_collect (void)
>     return;
> 
>   timevar_push (TV_GC);
>-  if (!quiet_flag)
>-    fprintf (stderr, " {GC %luk -> ", (unsigned long) G.allocated /
>1024);
>   if (GGC_DEBUG_LEVEL >= 2)
>     fprintf (G.debug_file, "BEGIN COLLECTING\n");
> 
>   /* Zero the total allocated bytes.  This will be recalculated in the
>      sweep phase.  */
>+  size_t allocated = G.allocated;
>   G.allocated = 0;
> 
>   /* Release the pages we freed the last time we collected, but didn't
>      reuse in the interim.  */
>   release_pages ();
> 
>+  /* Output this later so we do not interfere with release_pages.  */
>+  if (!quiet_flag)
>+    fprintf (stderr, " {GC %luk -> ", (unsigned long) allocated /
>1024);
>+
>   /* Indicate that we've seen collections at this context depth.  */
>G.context_depth_collections = ((unsigned long)1 << (G.context_depth +
>1)) - 1;
> 
>@@ -2221,9 +2238,25 @@ ggc_collect (void)
>     fprintf (G.debug_file, "END COLLECTING\n");
> }
> 
>-/* Assume that all GGC memory is reachable and grow the limits for
>next collection.
>-   With checking, trigger GGC so -Q compilation outputs how much of
>memory really is
>-   reachable.  */
>+/* Return free pages to the system.  */
>+
>+void
>+ggc_trim ()
>+{
>+  timevar_push (TV_GC);
>+  G.allocated = 0;
>+  sweep_pages ();
>+  release_pages ();
>+  if (!quiet_flag)
>+    fprintf (stderr, " {GC trimmed to %luk, %luk mapped}",
>+           (unsigned long) G.allocated / 1024,
>+           (unsigned long) G.bytes_mapped / 1024);
>+  timevar_pop (TV_GC);
>+}
>+
>+/* Assume that all GGC memory is reachable and grow the limits for
>next
>+   collection.  With checking, trigger GGC so -Q compilation outputs
>how much
>+   of memory really is reachable.  */
> 
> void
> ggc_grow (void)
>Index: ggc-none.c
>===================================================================
>--- ggc-none.c (revision 276707)
>+++ ggc-none.c (working copy)
>@@ -72,3 +72,8 @@ void
> ggc_grow (void)
> {
> }
>+
>+void
>+ggc_trim (void)
>+{
>+}
>Index: lto/lto.c
>===================================================================
>--- lto/lto.c  (revision 276707)
>+++ lto/lto.c  (working copy)
>@@ -304,6 +306,7 @@ lto_wpa_write_files (void)
> 
>   timevar_push (TV_WHOPR_WPA_IO);
> 
>+  ggc_trim ();
>   /* Generate a prefix for the LTRANS unit files.  */
>   blen = strlen (ltrans_output_list);
>   temp_filename = (char *) xmalloc (blen + sizeof ("2147483648.o"));

Reply via email to