On Wed, 15 Feb 2017, Philip Guenther wrote:
...
> I have a followup diff that then alters uvm_unix.c to treat unshared amaps 
> (anonymous maps) as multiple ranges such that pages that were never 
> faulted don't get written to the coredump: it's useless to write them out 
> (they're just zeros) and it'll fail if faulting in those pages would hit 
> the process's memory limit.  That diff takes advantage of the API change 
> in the present diff to hold extra references on some amaps between the two 
> walks of the VA.

Here's that followup diff.  If you examine a core file generated by a 
kernel with both these diffs, you'll find segments with filesz < memsz in 
the output of "readelf -lW whatever.core".  Those are ranges in amaps 
where this diff suppressed the faulting in of never-touched pages.
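
For a consumer of such a core file, the rule is the usual ELF one: the
bytes of a PT_LOAD segment past p_filesz, up to p_memsz, are defined
to be zero.  Here's a minimal user-space sketch of a reader applying
that rule; it's illustrative only (it assumes the Elf64_Phdr
definitions from <elf.h> and is not part of the diff):

#include <elf.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Load one PT_LOAD segment of a core file into a malloc'd buffer.
 * Bytes beyond p_filesz, up to p_memsz, were never written to the
 * file and are defined to be zero -- exactly the never-faulted
 * anonymous pages this diff stops dumping.
 */
static void *
load_segment(FILE *fp, const Elf64_Phdr *ph)
{
	void *buf;

	if ((buf = calloc(1, ph->p_memsz)) == NULL)
		return NULL;
	if (fseeko(fp, (off_t)ph->p_offset, SEEK_SET) == -1 ||
	    fread(buf, 1, ph->p_filesz, fp) != ph->p_filesz) {
		free(buf);
		return NULL;
	}
	/* calloc() already zeroed buf[p_filesz..p_memsz-1] */
	return buf;
}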


Philip Guenther

--- uvm/uvm_unix.c      Wed Feb 15 22:31:50 2017
+++ uvm/uvm_unix.c      Wed Feb 15 23:37:26 2017
@@ -134,7 +134,79 @@
 
 #ifndef SMALL_KERNEL
 
+#define WALK_CHUNK     32
 /*
+ * Not all the pages in an amap may be present.  When dumping core,
+ * we don't want to force all the pages to be present: it's a waste
+ * of time and memory when we already know what they contain (zeros)
+ * and the ELF format at least can adequately represent them as a
+ * segment with memory size larger than its file size.
+ *
+ * So, we walk the amap with calls to amap_lookups() and scan the
+ * resulting pointers to find ranges of zero or more present pages
+ * followed by at least one absent page or the end of the amap.
+ * We then pass that range to the walk callback with 'start'
+ * pointing to the start of the present range, 'realend' pointing
+ * to the first absent page (or the end of the entry), and 'end'
+ * pointing to the page past the last absent page (or the end of
+ * the entry).
+ *
+ * Note that if the first page of the amap is absent then the callback
+ * must be invoked with 'start' == 'realend' so it can represent that
+ * leading range of absent pages.
+ */
+int
+uvm_coredump_walk_amap(struct vm_map_entry *entry, int *nsegmentp,
+    uvm_coredump_walk_cb *walk, void *cookie)
+{
+       struct vm_anon *anons[WALK_CHUNK];
+       vaddr_t pos, start, realend, end, entry_end;
+       vm_prot_t prot;
+       int nsegment, absent, npages, i, error;
+
+       prot = entry->protection;
+       nsegment = *nsegmentp;
+       start = entry->start;
+       entry_end = MIN(entry->end, VM_MAXUSER_ADDRESS);
+
+       absent = 0;
+       for (pos = start; pos < entry_end; pos += npages << PAGE_SHIFT) {
+               npages = (entry_end - pos) >> PAGE_SHIFT;
+               if (npages > WALK_CHUNK)
+                       npages = WALK_CHUNK;
+               amap_lookups(&entry->aref, pos - entry->start, anons, npages);
+               for (i = 0; i < npages; i++) {
+                       if ((anons[i] == NULL) == absent)
+                               continue;
+                       if (!absent) {
+                               /* going from present to absent: set realend */
+                               realend = pos + (i << PAGE_SHIFT);
+                               absent = 1;
+                               continue;
+                       }
+
+                       /* going from absent to present: invoke callback */
+                       end = pos + (i << PAGE_SHIFT);
+                       if (start != end) {
+                               error = (*walk)(start, realend, end, prot,
+                                   nsegment, cookie);
+                               if (error)
+                                       return error;
+                               nsegment++;
+                       }
+                       start = realend = end;
+                       absent = 0;
+               }
+       }
+
+       if (!absent)
+               realend = entry_end;
+       error = (*walk)(start, realend, entry_end, prot, nsegment, cookie);
+       *nsegmentp = nsegment + 1;
+       return error;
+}
+
+/*
  * Common logic for whether a map entry should be included in a coredump
  */
 static inline int
@@ -166,6 +238,15 @@
        return 1;
 }
 
+
+/* do nothing callback for uvm_coredump_walk_amap() */
+static int
+noop(vaddr_t start, vaddr_t realend, vaddr_t end, vm_prot_t prot,
+    int nsegment, void *cookie)
+{
+       return 0;
+}
+
 /*
  * Walk the VA space for a process to identify what to write to
  * a coredump.  First the number of contiguous ranges is counted,
@@ -183,10 +264,16 @@
        struct vm_map *map = &vm->vm_map;
        struct vm_map_entry *entry;
        vaddr_t end;
+       int refed_amaps = 0;
        int nsegment, error;
 
        /*
-        * Walk the map once to count the segments.
+        * Walk the map once to count the segments.  If an amap is
+        * referenced more than once then take *another* reference
+        * and treat the amap as exactly one segment instead of
+        * checking page presence inside it.  On the second pass
+        * we'll recognize which amaps we did that for by the ref
+        * count being >1 and drop the extra reference then.
         */
        nsegment = 0;
        RBT_FOREACH(entry, uvm_map_addr, &map->addr) {
@@ -198,25 +285,52 @@
                if (! uvm_should_coredump(p, entry))
                        continue;
 
+               if (entry->aref.ar_amap != NULL) {
+                       if (entry->aref.ar_amap->am_ref == 1) {
+                               uvm_coredump_walk_amap(entry, &nsegment,
+                                   &noop, cookie);
+                               continue;
+                       }
+
+                       /*
+                        * Multiple refs currently, so take another and
+                        * treat it as a single segment
+                        */
+                       entry->aref.ar_amap->am_ref++;
+                       refed_amaps++;
+               }
+
                nsegment++;
        }
 
        /*
-        * Okay, we have a count in nsegment; invoke the setup callback.
+        * Okay, we have a count in nsegment.  Prepare to
+        * walk the map again, then invoke the setup callback.
         */
+       entry = RBT_MIN(uvm_map_addr, &map->addr);
        error = (*setup)(nsegment, cookie);
        if (error)
                goto cleanup;
 
        /*
         * Setup went okay, so do the second walk, invoking the walk
-        * callback on the counted segments.
+        * callback on the counted segments and cleaning up references
+        * as we go.
         */
        nsegment = 0;
-       RBT_FOREACH(entry, uvm_map_addr, &map->addr) {
+       for (; entry != NULL; entry = RBT_NEXT(uvm_map_addr, entry)) {
                if (! uvm_should_coredump(p, entry))
                        continue;
 
+               if (entry->aref.ar_amap != NULL &&
+                   entry->aref.ar_amap->am_ref == 1) {
+                       error = uvm_coredump_walk_amap(entry, &nsegment,
+                           walk, cookie);
+                       if (error)
+                               break;
+                       continue;
+               }
+
                end = entry->end;
                if (end > VM_MAXUSER_ADDRESS)
                        end = VM_MAXUSER_ADDRESS;
@@ -226,9 +340,33 @@
                if (error)
                        break;
                nsegment++;
+
+               if (entry->aref.ar_amap != NULL &&
+                   entry->aref.ar_amap->am_ref > 1) {
+                       /* multiple refs, so we need to drop one */
+                       entry->aref.ar_amap->am_ref--;
+                       refed_amaps--;
+               }
        }
 
+       if (error) {
 cleanup:
+               /* clean up the extra references from where we left off */
+               if (refed_amaps > 0) {
+                       for (; entry != NULL;
+                           entry = RBT_NEXT(uvm_map_addr, entry)) {
+                               if (entry->aref.ar_amap == NULL ||
+                                   entry->aref.ar_amap->am_ref == 1)
+                                       continue;
+                               if (! uvm_should_coredump(p, entry))
+                                       continue;
+                               entry->aref.ar_amap->am_ref--;
+                               if (--refed_amaps == 0)
+                                       break;
+                       }
+               }
+       }
+
        return error;
 }
 
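
To make the start/realend/end contract concrete, here's the same
run-scanning idea as a standalone user-space sketch over an array of
"page present" flags (hypothetical names; only the scan logic mirrors
uvm_coredump_walk_amap()):

#include <stdio.h>

/*
 * Mirrors the walk callback contract: [start, realend) are present
 * pages, [realend, end) are absent (all-zero) pages.
 */
static int
show(int start, int realend, int end)
{
	printf("segment: [%d,%d) present, [%d,%d) absent\n",
	    start, realend, realend, end);
	return 0;
}

static void
scan(const int *present, int npages)
{
	int i, start = 0, realend = 0, absent = 0;

	for (i = 0; i < npages; i++) {
		if (!present[i] == absent)
			continue;	/* no change of state */
		if (!absent) {
			/* going from present to absent: set realend */
			realend = i;
			absent = 1;
		} else {
			/* going from absent to present: emit a segment */
			if (start != i)
				show(start, realend, i);
			start = realend = i;
			absent = 0;
		}
	}
	if (!absent)
		realend = npages;
	show(start, realend, npages);
}

int
main(void)
{
	int present[] = { 1, 1, 0, 0, 1, 0 };

	/* emits [0,2) present/[2,4) absent, then [4,5)/[5,6) */
	scan(present, sizeof present / sizeof present[0]);
	return 0;
}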

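One more note on the extra references: pass 2 recognizes the amaps
counted as single segments by am_ref > 1, and holding the extra
reference between the two walks also keeps a shared amap from
collapsing to am_ref == 1 in the meantime, which (presumably) would
make it walk as a different number of segments the second time.  A toy
rendition of that take-count-release bookkeeping, including the
partial-failure cleanup (hypothetical types, nothing kernel-specific):

#include <stddef.h>

struct obj {
	int refs;		/* like am_ref: > 1 means shared */
};

/*
 * Pass 1: pin every shared object with an extra reference and
 * count how many we took, so the error path can release exactly
 * that many even if pass 2 stops early.
 */
static int
pin_shared(struct obj **v, size_t n)
{
	int pinned = 0;
	size_t i;

	for (i = 0; i < n; i++) {
		if (v[i] != NULL && v[i]->refs > 1) {
			v[i]->refs++;
			pinned++;
		}
	}
	return pinned;
}

/*
 * Cleanup: starting from where pass 2 stopped, drop the extra
 * references that are still held.
 */
static void
unpin_from(struct obj **v, size_t n, size_t i, int pinned)
{
	for (; i < n && pinned > 0; i++) {
		if (v[i] != NULL && v[i]->refs > 1) {
			v[i]->refs--;
			pinned--;
		}
	}
}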