Hi Pingfan,

a few nits in addition to what is mentioned in the cover letter.

On Tue, 19 Aug 2025 09:24:21 +0800
Pingfan Liu <[email protected]> wrote:

> As UEFI becomes popular, a few architectures support to boot a PE format
> kernel image directly. But the internal of PE format varies, which means
> each parser for each format.
> 
> This patch (with the rest in this series) introduces a common skeleton
> to all parsers, and leave the format parsing in
> bpf-prog, so the kernel code can keep relative stable.
> 
> A new kexec_file_ops is implementation, named pe_image_ops.
> 
> There are some place holder function in this patch. (They will take
> effect after the introduction of kexec bpf light skeleton and bpf
> helpers). Overall the parsing progress is a pipeline, the current
> bpf-prog parser is attached to bpf_handle_pefile(), and detatched at the
> end of the current stage 'disarm_bpf_prog()' the current parsed result
> by the current bpf-prog will be buffered in kernel 'prepare_nested_pe()'
> , and deliver to the next stage.  For each stage, the bpf bytecode is
> extracted from the '.bpf' section in the PE file.
> 
> Signed-off-by: Pingfan Liu <[email protected]>
> Cc: Baoquan He <[email protected]>
> Cc: Dave Young <[email protected]>
> Cc: Andrew Morton <[email protected]>
> Cc: Philipp Rudo <[email protected]>
> To: [email protected]
> ---
>  include/linux/kexec.h   |   1 +
>  kernel/Kconfig.kexec    |   9 ++
>  kernel/Makefile         |   1 +
>  kernel/kexec_pe_image.c | 348 ++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 359 insertions(+)
>  create mode 100644 kernel/kexec_pe_image.c

[...]

> new file mode 100644
> index 0000000000000..b0cf9942e68d2
> --- /dev/null
> +++ b/kernel/kexec_pe_image.c
> @@ -0,0 +1,348 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Kexec PE image loader
> +
> + * Copyright (C) 2025 Red Hat, Inc
> + */
> +
> +#define pr_fmt(fmt)  "kexec_file(Image): " fmt
> +
> +#include <linux/err.h>
> +#include <linux/errno.h>
> +#include <linux/list.h>
> +#include <linux/kernel.h>
> +#include <linux/vmalloc.h>
> +#include <linux/kexec.h>
> +#include <linux/pe.h>
> +#include <linux/string.h>
> +#include <linux/bpf.h>
> +#include <linux/filter.h>
> +#include <asm/byteorder.h>
> +#include <asm/image.h>
> +#include <asm/memory.h>
> +
> +
> +#define KEXEC_RES_KERNEL_NAME "kexec:kernel"
> +#define KEXEC_RES_INITRD_NAME "kexec:initrd"
> +#define KEXEC_RES_CMDLINE_NAME "kexec:cmdline"
> +
> +struct kexec_res {
> +     char *name;
> +     /* The free of buffer is deferred to kimage_file_post_load_cleanup */
> +     struct mem_range_result *r;
> +};
> +
> +static struct kexec_res parsed_resource[3] = {
> +     { KEXEC_RES_KERNEL_NAME, },
> +     { KEXEC_RES_INITRD_NAME, },
> +     { KEXEC_RES_CMDLINE_NAME, },
> +};
> +
> +static bool pe_has_bpf_section(const char *file_buf, unsigned long pe_sz);
> +
> +static bool is_valid_pe(const char *kernel_buf, unsigned long kernel_len)
> +{
> +     struct mz_hdr *mz;
> +     struct pe_hdr *pe;
> +
> +     if (!kernel_buf)
> +             return false;
> +     mz = (struct mz_hdr *)kernel_buf;
> +     if (mz->magic != IMAGE_DOS_SIGNATURE)
> +             return false;
> +     pe = (struct pe_hdr *)(kernel_buf + mz->peaddr);
> +     if (pe->magic != IMAGE_NT_SIGNATURE)
> +             return false;
> +     if (pe->opt_hdr_size == 0) {
> +             pr_err("optional header is missing\n");
> +             return false;
> +     }
> +
> +     return pe_has_bpf_section(kernel_buf, kernel_len);
> +}

Also check for 
pe32plus_opt_hdr->subsys == IMAGE_SUBSYSTEM_EFI_APPLICATION?

> +
> +static bool is_valid_format(const char *kernel_buf, unsigned long kernel_len)
> +{
> +     return is_valid_pe(kernel_buf, kernel_len);
> +}
> +
> +/*
> + * The UEFI Terse Executable (TE) image has MZ header.
> + */
> +static int pe_image_probe(const char *kernel_buf, unsigned long kernel_len)
> +{
> +     return is_valid_pe(kernel_buf, kernel_len) ? 0 : -1;
> +}
> +
> +static int pe_get_section(const char *file_buf, const char *sect_name,
> +             char **sect_start, unsigned long *sect_sz)
> +{
> +     struct pe_hdr *pe_hdr;
> +     struct pe32plus_opt_hdr *opt_hdr;
> +     struct section_header *sect_hdr;
> +     int section_nr, i;
> +     struct mz_hdr *mz = (struct mz_hdr *)file_buf;
> +
> +     *sect_start = NULL;
> +     *sect_sz = 0;
> +     pe_hdr = (struct pe_hdr *)(file_buf + mz->peaddr);
> +     section_nr = pe_hdr->sections;
> +     opt_hdr = (struct pe32plus_opt_hdr *)(file_buf + mz->peaddr + 
> sizeof(struct pe_hdr));
> +     sect_hdr = (struct section_header *)((char *)opt_hdr + 
> pe_hdr->opt_hdr_size);
> +
> +     for (i = 0; i < section_nr; i++) {
> +             if (strcmp(sect_hdr->name, sect_name) == 0) {
> +                     *sect_start = (char *)file_buf + sect_hdr->data_addr;
> +                     *sect_sz = sect_hdr->raw_data_size;
> +                     return 0;
> +             }
> +             sect_hdr++;
> +     }
> +
> +     return -1;
> +}
> +
> +static bool pe_has_bpf_section(const char *file_buf, unsigned long pe_sz)
> +{
> +     char *sect_start = NULL;
> +     unsigned long sect_sz = 0;
> +     int ret;
> +
> +     ret = pe_get_section(file_buf, ".bpf", &sect_start, &sect_sz);
> +     if (ret < 0)
> +             return false;
> +     return true;
> +}
> +
> +/* Load a ELF */
> +static int arm_bpf_prog(char *bpf_elf, unsigned long sz)
> +{
> +     return 0;
> +}
> +
> +static void disarm_bpf_prog(void)
> +{
> +}
> +
> +struct kexec_context {
> +     bool kdump;
> +     char *image;
> +     int image_sz;
> +     char *initrd;
> +     int initrd_sz;
> +     char *cmdline;
> +     int cmdline_sz;
> +};
> +
> +void bpf_handle_pefile(struct kexec_context *context);
> +void bpf_post_handle_pefile(struct kexec_context *context);
> +
> +
> +/*
> + * optimize("O0") prevents inline, compiler constant propagation
> + */
> +__attribute__((used, optimize("O0"))) void bpf_handle_pefile(struct 
> kexec_context *context)
> +{
> +     /*
> +      * To prevent linker from Identical Code Folding (ICF) with 
> bpf_handle_pefile,
> +      * making them have different code.
> +      */
> +     volatile int dummy = 0;
> +
> +     dummy += 1;
> +}
> +
> +__attribute__((used, optimize("O0"))) void bpf_post_handle_pefile(struct 
> kexec_context *context)
> +{
> +     volatile int dummy = 0;
> +
> +     dummy += 2;
> +}
> +
> +/*
> + * PE file may be nested and should be unfold one by one.
> + * Query 'kernel', 'initrd', 'cmdline' in cur_phase, as they are inputs for 
> the
> + * next phase.
> + */
> +static int prepare_nested_pe(char **kernel, unsigned long *kernel_len, char 
> **initrd,
> +             unsigned long *initrd_len, char **cmdline)
> +{
> +     struct kexec_res *res;
> +     int ret = -1;
> +
> +     *kernel = NULL;
> +     *kernel_len = 0;
> +
> +     res = &parsed_resource[0];
> +     if (!!res->r) {
> +             *kernel = res->r->buf;
> +             *kernel_len = res->r->data_sz;
> +             ret = 0;
> +     }
> +
> +     res = &parsed_resource[1];
> +     if (!!res->r) {
> +             *initrd = res->r->buf;
> +             *initrd_len = res->r->data_sz;
> +     }
> +
> +     res = &parsed_resource[2];
> +     if (!!res->r) {
> +             *cmdline = res->r->buf;
> +     }
> +
> +     return ret;
> +}
> +
> +static void *pe_image_load(struct kimage *image,
> +                             char *kernel, unsigned long kernel_len,
> +                             char *initrd, unsigned long initrd_len,
> +                             char *cmdline, unsigned long cmdline_len)
> +{
> +     char *linux_start, *initrd_start, *cmdline_start, *bpf_start;
> +     unsigned long linux_sz, initrd_sz, cmdline_sz, bpf_sz;

I don't see a point in defining all the
{linux,initrd,cmdline}_{start,sz} variables. Either you could reuse
the corresponding {kernel,initrd,cmdline} variables from the function
definition. Or better use a kexec_context that contains the same
information.

> +     struct kexec_res *res;
> +     struct mem_range_result *r;
> +     void *ldata;
> +     int ret;
> +
> +     linux_start = kernel;
> +     linux_sz = kernel_len;
> +     initrd_start = initrd;
> +     initrd_sz = initrd_len;
> +     cmdline_start = cmdline;
> +     cmdline_sz = cmdline_len;
> +
> +     while (is_valid_format(linux_start, linux_sz) &&
> +            pe_has_bpf_section(linux_start, linux_sz)) {
> +             struct kexec_context context;
> +
> +             pe_get_section((const char *)linux_start, ".bpf", &bpf_start, 
> &bpf_sz);
> +             if (!!bpf_sz) {
> +                     /* load and attach bpf-prog */
> +                     ret = arm_bpf_prog(bpf_start, bpf_sz);
> +                     if (ret) {
> +                             pr_err("Fail to load .bpf section\n");
> +                             ldata = ERR_PTR(ret);
> +                             goto err;
> +                     }
> +             }
> +             if (image->type != KEXEC_TYPE_CRASH)
> +                     context.kdump = false;
> +             else
> +                     context.kdump = true;

The bpf_prog cannot change whether kexec is used to load a kdump or
normal kernel. So this check can be moved outside the loop.

> +             context.image = linux_start;
> +             context.image_sz = linux_sz;
> +             context.initrd = initrd_start;
> +             context.initrd_sz = initrd_sz;
> +             context.cmdline = cmdline_start;
> +             context.cmdline_sz = strlen(cmdline_start);
> +             /* bpf-prog fentry, which handle above buffers. */
> +             bpf_handle_pefile(&context);
> +
> +             prepare_nested_pe(&linux_start, &linux_sz, &initrd_start,
> +                                     &initrd_sz, &cmdline_start);
> +             /* bpf-prog fentry */
> +             bpf_post_handle_pefile(&context);
> +             /*
> +              * detach the current bpf-prog from their attachment points.
> +              */
> +             disarm_bpf_prog();
> +     }
> +
> +     /*
> +      * image's kernel_buf, initrd_buf, cmdline_buf are set. Now they should
> +      * be updated to the new content.
> +      */
> +
> +     res = &parsed_resource[0];
> +     /* Kernel part should always be parsed */
> +     if (!res->r) {
> +             pr_err("Can not parse kernel\n");
> +             ldata = ERR_PTR(-EINVAL);
> +             goto err;
> +     }
> +     kernel = res->r->buf;
> +     kernel_len = res->r->data_sz;
> +     vfree(image->kernel_buf);
> +     image->kernel_buf = kernel;
> +     image->kernel_buf_len = kernel_len;

Can't you assign the output to image->kernel_buf{_len} directly? Same
for initrd and cmdline.

> +
> +     res = &parsed_resource[1];
> +     if (!!res->r) {
> +             initrd = res->r->buf;
> +             initrd_len = res->r->data_sz;
> +             vfree(image->initrd_buf);
> +             image->initrd_buf = initrd;
> +             image->initrd_buf_len = initrd_len;
> +     }
> +     res = &parsed_resource[2];
> +     if (!!res->r) {
> +             cmdline = res->r->buf;
> +             cmdline_len = res->r->data_sz;
> +             kfree(image->cmdline_buf);
> +             image->cmdline_buf = cmdline;
> +             image->cmdline_buf_len = cmdline_len;
> +     }
> +
> +     if (kernel == NULL || initrd == NULL || cmdline == NULL) {
> +             char *c, buf[64];
> +
> +             c = buf;
> +             if (kernel == NULL) {
> +                     strcpy(c, "kernel ");
> +                     c += strlen("kernel ");
> +             }
> +             if (initrd == NULL) {
> +                     strcpy(c, "initrd ");
> +                     c += strlen("initrd ");
> +             }
> +             if (cmdline == NULL) {
> +                     strcpy(c, "cmdline ");
> +                     c += strlen("cmdline ");
> +             }
> +             c = '\0';
> +             pr_err("Can not extract data for %s", buf);
> +             ldata = ERR_PTR(-EINVAL);
> +             goto err;
> +     }

This check needs to go. Not having a initrd or cmdline is not an error
plus not having a kernel already throws an error above. In case you
want to keep the error message for debugging purpose you can add it to
the 'else' paths above.

> +
> +     ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
> +                                         image->kernel_buf_len);
> +     if (ret) {
> +             pr_err("Fail to find suitable image loader\n");
> +             ldata = ERR_PTR(ret);
> +             goto err;
> +     }
> +     ldata = kexec_image_load_default(image);
> +     if (IS_ERR(ldata)) {
> +             pr_err("architecture code fails to load image\n");
> +             goto err;
> +     }
> +     image->image_loader_data = ldata;
> +
> +err:
> +     for (int i = 0; i < 3; i++) {

Can you please get rid of the magic '3', e.g. by using ARRAY_SIZE.

Thanks
Philipp

> +             r = parsed_resource[i].r;
> +             if (!r)
> +                     continue;
> +             parsed_resource[i].r = NULL;
> +             /*
> +              * The release of buffer defers to
> +              * kimage_file_post_load_cleanup()
> +              */
> +             r->buf = NULL;
> +             r->buf_sz = 0;
> +             mem_range_result_put(r);
> +     }
> +
> +     return ldata;
> +}
> +
> +const struct kexec_file_ops kexec_pe_image_ops = {
> +     .probe = pe_image_probe,
> +     .load = pe_image_load,
> +#ifdef CONFIG_KEXEC_IMAGE_VERIFY_SIG
> +     .verify_sig = kexec_kernel_verify_pe_sig,
> +#endif
> +};

Reply via email to