Tao Liu <[email protected]> writes:
> pfn and num are the data which extensions give to makedumpfile for mm page
> filtering. Since makedumpfile iterates pfns in ascending order in
> __exclude_unnecessary_pages(), the (pfn, num) ranges are stored in
> ft_page_info linked lists kept sorted in ascending order of pfn. If a pfn
> hits one list node, the next pfn is most likely to hit either the same
> node or the one that follows it, so a cur variable saves the current
> list position to speed up the pfn checking process.

I'm wondering about the trade-off for using a linked list versus an
array. Using the linked list, we are forced to maintain the sorted
order as we construct the list, which is an O(N^2) insertion sort.

If instead we used an array, we could sort it with qsort() once, at the
end. Then we could merge any overlapping ranges. Lookup could be
implemented cheaply with bsearch(), and we could continue to use the
optimization where we maintain a "cur" pointer. I believe the overall
runtime complexity of the array approach would be O(N*log(N)) rather
than O(N^2), without requiring us to hand-implement anything complex.
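
To illustrate, here's a rough sketch of what I mean (untested, and the
names are mine, not from your patch):

#include <stdlib.h>     /* qsort */

struct ft_range { unsigned long pfn, num; };

static int cmp_range(const void *a, const void *b)
{
        const struct ft_range *x = a, *y = b;

        return (x->pfn > y->pfn) - (x->pfn < y->pfn);
}

/* Sort once after all insertions, then merge overlapping or adjacent
 * ranges in place; returns the new element count. Lookups can then
 * use bsearch() or a "cur" index. */
static size_t sort_and_merge(struct ft_range *r, size_t n)
{
        size_t i, out = 0;

        qsort(r, n, sizeof(*r), cmp_range);
        for (i = 0; i < n; i++) {
                if (out && r[i].pfn <= r[out-1].pfn + r[out-1].num) {
                        unsigned long end = r[i].pfn + r[i].num;

                        if (end > r[out-1].pfn + r[out-1].num)
                                r[out-1].num = end - r[out-1].pfn;
                } else {
                        r[out++] = r[i];
                }
        }
        return out;
}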

Depending on the number of pages (and how fragmented they are), this
may or may not be an issue.

In my testing for userspace tasks, the number of pages retained can be
on the order of ~100k. However -- my use case can't really use a list of
PFNs, which I'll explain below. So my use case doesn't really matter too
much here -- maybe your use case has relatively few page ranges, so the
cost of O(N^2) is not bad.

So I guess I don't have a strong preference - but it's worth
considering.

> In addition, 2 ft_page_info linked list chains are used, one for mm page
> discarding and the other for page keeping.
>
> Signed-off-by: Tao Liu <[email protected]>
> ---
>  erase_info.c   | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  erase_info.h   | 12 +++++++
>  makedumpfile.c | 28 ++++++++++++---
>  3 files changed, 134 insertions(+), 4 deletions(-)
>
> diff --git a/erase_info.c b/erase_info.c
> index b67d1d0..8838bea 100644
> --- a/erase_info.c
> +++ b/erase_info.c
> @@ -2466,3 +2466,101 @@ get_size_eraseinfo(void)
>       return size_eraseinfo;
>  }
>  
> +/* Pages to be discarded */
> +static struct ft_page_info *ft_head_discard = NULL;
> +/* Pages to be kept */
> +static struct ft_page_info *ft_head_keep = NULL;
> +
> +/*
> + * Insert the ft_page_info blocks into ft_head by ascending pfn.
> + */
> +bool
> +update_filter_pages_info(unsigned long pfn, unsigned long num, bool to_discard)
> +{
> +     struct ft_page_info *p, **ft_head;
> +     struct ft_page_info *new_p = malloc(sizeof(struct ft_page_info));
> +
> +     ft_head = to_discard ? &ft_head_discard : &ft_head_keep;
> +
> +     if (!new_p) {
> +             ERRMSG("Can't allocate memory for ft_page_info at %lx\n", pfn);
> +             return false;
> +     }
> +     new_p->pfn = pfn;
> +     new_p->num = num;
> +     new_p->next = NULL;
> +
> +     if (!(*ft_head) || (*ft_head)->pfn > new_p->pfn) {
> +             new_p->next = (*ft_head);
> +             (*ft_head) = new_p;
> +             return true;
> +     }
> +
> +     p = (*ft_head);
> +     while (p->next != NULL && p->next->pfn < new_p->pfn) {
> +             p = p->next;
> +     }
> +
> +     new_p->next = p->next;
> +     p->next = new_p;

It might be wise to defensively handle the case of overlapping
PFN ranges by merging them.
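
For instance, at the insertion point (untested sketch reusing your
names; a complete version would also need to fold new_p into p->next
when those overlap):

/* If the new range overlaps its predecessor, extend the
 * predecessor instead of inserting a duplicate node. */
if (new_p->pfn <= p->pfn + p->num) {
        unsigned long end = new_p->pfn + new_p->num;

        if (end > p->pfn + p->num)
                p->num = end - p->pfn;
        free(new_p);
        return true;
}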

> +     return true;
> +}
> +
> +/*
> + * Check if the pfn hits a ft_page_info block.
> + *
> + * pfn and ft_head are in ascending order, so save the current ft_page_info
> + * block into **p because it is likely to hit again next time.
> + */
> +bool
> +filter_page(unsigned long pfn, struct ft_page_info **p, bool handle_discard)
> +{
> +     struct ft_page_info *ft_head;
> +
> +     ft_head = handle_discard ? ft_head_discard : ft_head_keep;
> +
> +     if (ft_head == NULL)
> +             return false;
> +
> +     if (*p == NULL)
> +             *p = ft_head;
> +
> +     /* The gap before 1st block */
> +     if (pfn < ft_head->pfn)
> +             return false;
> +
> +     /* Handle 1~(n-1) blocks and following gaps */
> +     while ((*p)->next) {
> +             if (pfn >= (*p)->pfn && pfn < (*p)->pfn + (*p)->num)
> +                     return true; // hit the block
> +             if (pfn >= (*p)->pfn + (*p)->num && pfn < (*p)->next->pfn)
> +                     return false; // the gap after the block
> +             *p = (*p)->next;
> +     }
> +
> +     /* The last block and gap */
> +     if (pfn >= (*p)->pfn + (*p)->num)
> +             return false;
> +     else
> +             return true;
> +}
> +
> +static void
> +do_cleanup(struct ft_page_info **ft_head)
> +{
> +     struct ft_page_info *p, *p_tmp;
> +
> +     for (p = *ft_head; p;) {
> +             p_tmp = p;
> +             p = p->next;
> +             free(p_tmp);
> +     }
> +     *ft_head = NULL;
> +}
> +
> +void
> +cleanup_filter_pages_info(void)
> +{
> +     do_cleanup(&ft_head_discard);
> +     do_cleanup(&ft_head_keep);
> +}
> diff --git a/erase_info.h b/erase_info.h
> index b363a40..6c60706 100644
> --- a/erase_info.h
> +++ b/erase_info.h
> @@ -20,6 +20,7 @@
>  #define _ERASE_INFO_H
>  
>  #define MAX_SIZE_STR_LEN (26)
> +#include <stdbool.h>
>  
>  /*
>   * Erase information, original symbol expressions.
> @@ -65,5 +66,16 @@ void filter_data_buffer_parallel(unsigned char *buf, unsigned long long paddr,
>  unsigned long get_size_eraseinfo(void);
>  int update_filter_info_raw(unsigned long long, int, int);
>  
> +bool update_filter_pages_info(unsigned long, unsigned long, bool);
> +
> +struct ft_page_info {
> +     unsigned long pfn;
> +     unsigned long num;
> +     struct ft_page_info *next;
> +} __attribute__((packed));
> +
> +bool filter_page(unsigned long, struct ft_page_info **p, bool handle_discard);
> +void cleanup_filter_pages_info(void);
> +
>  #endif /* _ERASE_INFO_H */
>  
> diff --git a/makedumpfile.c b/makedumpfile.c
> index ca8ed8a..ebac8da 100644
> --- a/makedumpfile.c
> +++ b/makedumpfile.c
> @@ -102,6 +102,7 @@ mdf_pfn_t pfn_free;
>  mdf_pfn_t pfn_hwpoison;
>  mdf_pfn_t pfn_offline;
>  mdf_pfn_t pfn_elf_excluded;
> +mdf_pfn_t pfn_extension;
>  
>  mdf_pfn_t num_dumped;
>  
> @@ -6459,6 +6460,8 @@ __exclude_unnecessary_pages(unsigned long mem_map,
>       unsigned int order_offset, dtor_offset;
>       unsigned long flags, mapping, private = 0;
>       unsigned long compound_dtor, compound_head = 0;
> +     struct ft_page_info *cur_discard = NULL;
> +     struct ft_page_info *cur_keep = NULL;
>  
>       /*
>        * If a multi-page exclusion is pending, do it first
> @@ -6495,6 +6498,13 @@ __exclude_unnecessary_pages(unsigned long mem_map,
>               if (info->flag_cyclic && !is_cyclic_region(pfn, cycle))
>                       continue;
>  
> +             /*
> +              * Keep pages specified by the user via
> +              * makedumpfile extensions.
> +              */
> +             if (filter_page(pfn, &cur_keep, false))
> +                     continue;
> +

It makes sense to allow plugins to enumerate a list of PFNs to override
and include. I like that - it's simple enough. But it's not flexible
enough for my use case with userspace stacks :(

The userspace stack region is an anon_vma. My plugin can enumerate the
anon_vmas that it wants to save, but it's prohibitively expensive and
complex to enumerate the list of pages associated with each anon_vma. We
would need to do a page table walk for each process.

There's a simpler way: from the struct page mapping and index fields,
it's possible to determine which anon_vma the page is associated with,
and what index it has within the VMA. And from this, we can make the
determination of whether to include a page or not. This is what I had
implemented in this patch:

https://github.com/brenns10/makedumpfile/commit/1c0a828ef80962480f771915c2d494272721b659#diff-2593512d7ec329b34b1ca5686a7b6b073d0ca636df8ff20fea04684da2c8e063R6692-R12150
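
In rough terms, the check boils down to something like this (heavily
simplified from that patch; saved_anon_vma_contains() is a stand-in
for the real lookup):

#define PAGE_MAPPING_ANON 1UL   /* the kernel's anon tag bit */

/* Decide from struct page's raw mapping/index values, with no page
 * table walk required. */
static bool keep_anon_page(unsigned long mapping, unsigned long index)
{
        unsigned long anon_vma;

        if (!(mapping & PAGE_MAPPING_ANON))
                return false;   /* file-backed page */
        anon_vma = mapping & ~PAGE_MAPPING_ANON;
        /* Is this one of the anon_vmas the plugin chose to save, and
         * does index fall inside the recorded VMA bounds? */
        return saved_anon_vma_contains(anon_vma, index);
}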

So, I wonder if it makes sense to allow a plugin to register a callback
to be called here, so the plugin can make the more complex decision?
This would keep the logic outside of the core makedumpfile code, but
allow the necessary flexibility.

Something like:

if (plugin_keep_page_callback && plugin_keep_page_callback(pfn, pcache))
    continue;

And then the extension system could allow an extension to register that
callback. It would need to keep the extension loaded for the duration of
the execution of makedumpfile (rather than calling dlclose()
immediately).
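
Concretely, the registration side could be as small as this (all names
hypothetical):

typedef bool (*keep_page_fn)(unsigned long pfn, void *pcache);

static keep_page_fn plugin_keep_page_callback;

/* Called by an extension from its init entry point. This is also why
 * the extension must stay dlopen()ed until makedumpfile exits. */
void register_keep_page_callback(keep_page_fn fn)
{
        plugin_keep_page_callback = fn;
}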

What do you think about this? I'm happy to implement this part of it
separate from your patch series -- you could simply drop the stuff
related to page inclusion, and I can add the necessary pieces when I
submit my extension patches.

Thanks,
Stephen

>               /*
>                * Exclude the memory hole.
>                */
> @@ -6687,6 +6697,14 @@ check_order:
>               else if (isOffline(flags, _mapcount)) {
>                       pfn_counter = &pfn_offline;
>               }
> +             /*
> +              * Exclude pages specified by the user via
> +              * makedumpfile extensions.
> +              */
> +             else if (filter_page(pfn, &cur_discard, true)) {
> +                     nr_pages = 1;
> +                     pfn_counter = &pfn_extension;
> +             }
>               /*
>                * Unexcludable page
>                */
> @@ -6748,6 +6766,7 @@ exclude_unnecessary_pages(struct cycle *cycle)
>               print_progress(PROGRESS_UNN_PAGES, info->num_mem_map, info->num_mem_map, NULL);
>               print_execution_time(PROGRESS_UNN_PAGES, &ts_start);
>       }
> +     cleanup_filter_pages_info();
>  
>       return TRUE;
>  }
> @@ -8234,7 +8253,7 @@ write_elf_pages_cyclic(struct cache_data *cd_header, struct cache_data *cd_page)
>        */
>       if (info->flag_cyclic) {
>               pfn_zero = pfn_cache = pfn_cache_private = 0;
> -             pfn_user = pfn_free = pfn_hwpoison = pfn_offline = 0;
> +             pfn_user = pfn_free = pfn_hwpoison = pfn_offline = pfn_extension = 0;
>               pfn_memhole = info->max_mapnr;
>       }
>  
> @@ -9579,7 +9598,7 @@ write_kdump_pages_and_bitmap_cyclic(struct cache_data *cd_header, struct cache_d
>                * Reset counter for debug message.
>                */
>               pfn_zero = pfn_cache = pfn_cache_private = 0;
> -             pfn_user = pfn_free = pfn_hwpoison = pfn_offline = 0;
> +             pfn_user = pfn_free = pfn_hwpoison = pfn_offline = pfn_extension = 0;
>               pfn_memhole = info->max_mapnr;
>  
>               /*
> @@ -10528,7 +10547,7 @@ print_report(void)
>       pfn_original = info->max_mapnr - pfn_memhole;
>  
>       pfn_excluded = pfn_zero + pfn_cache + pfn_cache_private
> -         + pfn_user + pfn_free + pfn_hwpoison + pfn_offline;
> +         + pfn_user + pfn_free + pfn_hwpoison + pfn_offline + pfn_extension;
>  
>       REPORT_MSG("\n");
>       REPORT_MSG("Original pages  : 0x%016llx\n", pfn_original);
> @@ -10544,6 +10563,7 @@ print_report(void)
>       REPORT_MSG("    Free pages              : 0x%016llx\n", pfn_free);
>       REPORT_MSG("    Hwpoison pages          : 0x%016llx\n", pfn_hwpoison);
>       REPORT_MSG("    Offline pages           : 0x%016llx\n", pfn_offline);
> +     REPORT_MSG("    Extension filter pages  : 0x%016llx\n", pfn_extension);
>       REPORT_MSG("  Remaining pages  : 0x%016llx\n",
>           pfn_original - pfn_excluded);
>  
> @@ -10584,7 +10604,7 @@ print_mem_usage(void)
>       pfn_original = info->max_mapnr - pfn_memhole;
>  
>       pfn_excluded = pfn_zero + pfn_cache + pfn_cache_private
> -         + pfn_user + pfn_free + pfn_hwpoison + pfn_offline;
> +         + pfn_user + pfn_free + pfn_hwpoison + pfn_offline + pfn_extension;
>       shrinking = (pfn_original - pfn_excluded) * 100;
>       shrinking = shrinking / pfn_original;
>       total_size = info->page_size * pfn_original;
> -- 
> 2.47.0
