On 9/15/23 05:13, Zhijian Li (Fujitsu) wrote:


I'm okay with "RDMA isn't touched".
BTW, could you share your reproducing program/hack to poison the page, so
that I am able to take a look at the RDMA part later when I'm free.

Not sure it's suitable to acknowledge a not touched part. Anyway
Acked-by: Li Zhijian <lizhij...@fujitsu.com> # RDMA


Thanks.
As you asked for a procedure to inject memory errors into a running VM,
I've attached to this email the source code (mce_process_react.c) of a
program that will help to target the error injection in the VM.

(Be careful: error injection is currently not working on AMD
platforms -- this is a work in progress in a separate qemu thread)


The general idea:
We are going to target a memory page of a process running inside a VM to
see what happens when we inject an error on the underlying physical page
at the platform (hypervisor) level.
To have a better view of what's going on, we'll use a process made for
this: Its goal is to allocate a memory page, and create a SIGBUS
handler to inform when it receives this signal. It will also wait before
touching this page to see what happens next.

    Compiling this tool:
    $ gcc -o mce_process_react_x86 mce_process_react.c


Let's try that:
This procedure shows the best case scenario, where an error injected at
the platform level is reported up to the guest process using it.
Note that qemu should be started with root privilege.

    1. Choose a process running in the VM (and identify a memory page
you want to target, and get its physical address – crash(8) vtop can
help with that) or run the attached mce_process_react example (compiled
for your platform mce_process_react_[x86|arm]) with an option to be
informed early of an _AO error (-e) and to wait for ENTER before
continuing with reading the allocated page (-w 0):

[root@VM ]# ./mce_process_react_x86 -e -w 0
Setting Early kill... Ok

Data pages at 0x7fa0f9b25000  physically 0x200f2fa000

Press ENTER to continue with page reading


    2. Go into the VM monitor to get the translation from "Guest
Physical Address to Host Physical Address" or "Host Virtual Address":

 (qemu) gpa2hpa 0x200f2fa000
Host physical address for 0x200f2fa000 (ram-node1) is 0x46f12fa000


    3. Before we inject the error, we want to keep track of the VM
console output (in a separate window).
If you are using libvirt: # virsh console myvm


    4. We now prepare for the error injection at the platform level to
the address we found.  To do so, we'll need to use the hwpoison-inject
module (x86).
Be careful, as hwpoison takes Page Frame Numbers and this PFN is not the
physical address – you need to remove the last 12 bits (the last 3 zeros
of the above address)!

[root@hv ]# modprobe hwpoison-inject
[root@hv ]# echo 0x46f12fa > /sys/kernel/debug/hwpoison/corrupt-pfn

       If you see "Operation not permitted" error when writing as root
on corrupt-pfn, you may be facing a "kernel_lockdown(7)" which is
enabled on SecureBoot systems (can be verified with
"mokutil --sb-state"). In this case, turn SecureBoot off  (at the UEFI
level for example)

    5. Look at the qemu output (either on the terminal where qemu was
started or, if you are using libvirt: tail /var/log/libvirt/qemu/myvm)

2022-08-31T13:52:25.645398Z qemu-system-x86_64: warning: Guest MCE Memory Error at QEMU addr 0x7eeeace00000 and GUEST addr 0x200f200 of type BUS_MCEERR_AO injected

    6. On the guest console:
We'll see the VM reaction to the injected error:

[  155.805149] Disabling lock debugging due to kernel taint
[  155.806174] mce: [Hardware Error]: Machine check events logged
[ 155.807120] Memory failure: 0x200f200: Killing mce_process_rea:3548 due to hardware memory corruption
[ 155.808877] Memory failure: 0x200f200: recovery action for dirty LRU page: Recovered

    7. The Guest process that we started at the first step gives:

Signal 7 received
BUS_MCEERR_AO on vaddr: 0x7fa0f9b25000

At this stage, the VM has a poisoned page, and a migration of this VM
needs to be fixed in order to avoid accessing the poisoned page.

    8. The process continues to run (as it handled the SIGBUS).
Now if you press ENTER on this process terminal, it will try to read the
page which will generate a new MCE (a synchronous one) at VM level which
will be sent to this process:

Signal 7 received
BUS_MCEERR_AR on vaddr: 0x7fa0f9b25000
Exit from the signal handler on BUS_MCEERR_AR

    9. The VM console shows:
[ 2520.895263] MCE: Killing mce_process_rea:3548 due to hardware memory corruption fault at 7f45e5265000

    10. The VM continues to run...
With a poisoned page in its address space

HTH,
William.
#include <sys/types.h>
#include <sys/prctl.h>
#include <sys/mman.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <assert.h>
#include <errno.h>
#include <stdint.h>
#include <signal.h>
#include <string.h>

/* Size in bytes of one entry in /proc/<pid>/pagemap. */
#define PAGEMAP_ENTRY 8
/*
 * Value (0 or 1) of bit Y of X. Arguments and the whole expansion are
 * parenthesized so the macro can be used safely inside larger expressions.
 */
#define GET_BIT(X,Y) (((X) & ((uint64_t)1 << (Y))) >> (Y))
/* Low 55 bits (bits 0-54) of X, used as the page frame number field. */
#define GET_PFN(X) ((X) & 0x7FFFFFFFFFFFFF)
/* Number of 4096-byte pages allocated by the program. */
#define ALLOC_PAGES 1

/* Integer whose first byte in memory tells the host byte order. */
const int __endian_bit = 1;
/* Non-zero when the first byte of __endian_bit is 0, i.e. on a big-endian host. */
#define is_bigendian() ( (*(char*)&__endian_bit) == 0 )

/*
 * Request the PR_MCE_KILL_EARLY machine-check kill policy for this process
 * through prctl(), and print on stdout whether the request succeeded.
 */
static void early_reaction() {
   const char *outcome;

   printf("Setting Early kill... ");
   outcome = (prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0) != 0)
      ? "Failure !\n"
      : "Ok\n";
   printf("%s", outcome);
}

/*
 * Generate an error on the given page by calling madvise(MADV_HWPOISON)
 * on the 4096 bytes starting at virtual_page.
 *
 * A failure is not fatal: it is reported on stdout, together with the
 * errno description, and the function returns normally.
 */
static void memory_error_advise(void* virtual_page) {
   if (madvise(virtual_page, 4096, MADV_HWPOISON) != 0)
      /* Terminate the line so the message does not run into later output. */
      printf("Poisoning failed - madvise: %s\n", strerror(errno));
}

/*
 * Translate a virtual address of the current process into the physical
 * address of the page backing it, using /proc/<pid>/pagemap.
 *
 * Returns the physical address of the start of that page, or (uint64_t)-1
 * when the pagemap file cannot be opened, positioned or read, or when no
 * page frame number could be obtained for the page.
 */
static uint64_t physical_address(uint64_t virt_addr) {
   char pagemap_path[0x100];
   FILE *pagemap;
   uint64_t entry = 0, entry_offset, frame = 0;
   long page_size;
   int idx;

   sprintf(pagemap_path, "/proc/%u/pagemap", getpid());

   pagemap = fopen(pagemap_path, "rb");
   if (pagemap == NULL) {
      printf("Error! Cannot open %s\n", pagemap_path);
      return (uint64_t)-1;
   }

   // The file holds one PAGEMAP_ENTRY-byte record per virtual page,
   // indexed by virtual page number.
   page_size = sysconf(_SC_PAGESIZE);
   entry_offset = virt_addr / (uint64_t)page_size * PAGEMAP_ENTRY;
   if (fseek(pagemap, (long)entry_offset, SEEK_SET) != 0) {
      perror("Failed to do fseek!");
      fclose(pagemap);
      return (uint64_t)-1;
   }

   // Assemble the record in host byte order, one byte at a time.
   for (idx = 0; idx < PAGEMAP_ENTRY; idx++) {
      int byte = getc(pagemap);
      int shift;

      if (byte == EOF) {
         fclose(pagemap);
         return (uint64_t)-1;
      }
      if (is_bigendian())
         shift = 8 * (PAGEMAP_ENTRY - 1 - idx);
      else
         shift = 8 * idx;
      entry |= (uint64_t)(unsigned char)byte << shift;
   }
   fclose(pagemap);

   if (GET_BIT(entry, 63)) {   // Bit 63: page present
      frame = GET_PFN(entry);
   } else {
      printf("Page not present !\n");
   }
   if (GET_BIT(entry, 62))     // Bit 62: page swapped
      printf("Page swapped\n");

   // A zero frame number means no usable translation was found.
   return (frame != 0) ? frame * (uint64_t)page_size : (uint64_t)-1;
}

/*
 * SIGBUS handler to display the given information: the signal number, the
 * kind of memory error and the faulting virtual address from siginfo.
 *
 * An si_code of 4 is reported as BUS_MCEERR_AR; any other code is reported
 * as BUS_MCEERR_AO. On BUS_MCEERR_AR the process terminates with status 1
 * from inside the handler (presumably because returning would retry the
 * faulting access). The ctx argument is not used.
 */
static void sigbus_action(int signum, siginfo_t *siginfo, void *ctx) {
   (void)ctx;

   printf("Signal %d received: ", signum);
   /* si_addr is a pointer: convert it explicitly to match the %llx format. */
   printf("%s on vaddr: %llx\n",
      (siginfo->si_code == 4? "BUS_MCEERR_AR":"BUS_MCEERR_AO"),
      (unsigned long long)(uintptr_t)siginfo->si_addr);

   if (siginfo->si_code == 4) { /* BUS_MCEERR_AR */
      /* _exit() does not flush stdio buffers, so push the report out first. */
      fflush(stdout);
      fprintf(stderr, "Exit from the signal handler on BUS_MCEERR_AR\n");
      _exit(1);
   }
}

/*
 * Allocate and dirty ALLOC_PAGES private pages, print their virtual and
 * physical addresses, wait, then read the pages back and print the
 * translation again.
 *
 * Options:
 *   -e          request the early-kill policy (see early_reaction()).
 *   -m          poison the first page explicitly with madvise().
 *   -w seconds  time to sleep before reading the pages (default 5); a value
 *               that atoi() converts to 0 or less waits for ENTER instead.
 *
 * Must be run as root. Returns 0 on normal completion; exits with
 * EXIT_FAILURE on a usage error, a setup failure or a failed first
 * address translation.
 */
int main(int argc, char ** argv) {
   int opt, early_react = 0, madvise_error = 0, wait_time = 5, i;
   struct sigaction my_sigaction;
   uint64_t virt_addr = 0, phys_addr;
   void *local_pnt;

   // Need to have the CAP_SYS_ADMIN capability to get PFNs values in pagemap.
   if (getuid() != 0) {
      fprintf(stderr, "Usage: %s needs to run as root\n", argv[0]);
      exit(EXIT_FAILURE);
   }

   while ((opt = getopt(argc, argv, "emw:")) != -1) {
      switch (opt) {
      case 'e':
         early_react = 1;
         break;
      case 'm':
         madvise_error = 1;
         break;
      case 'w':
         wait_time = atoi(optarg);
         break;
      default: /* '?' */
         fprintf(stderr, "Usage: %s [-e] [-m] [-w seconds]\n", argv[0]);
         exit(EXIT_FAILURE);
      }
   }

   // attach our SIGBUS handler.
   // Start from a fully defined structure: sa_mask must not be left
   // holding whatever happened to be on the stack.
   memset(&my_sigaction, 0, sizeof my_sigaction);
   sigemptyset(&my_sigaction.sa_mask);
   my_sigaction.sa_sigaction = sigbus_action;
   my_sigaction.sa_flags = SA_SIGINFO | SA_NODEFER;
   if (sigaction(SIGBUS, &my_sigaction, NULL) == -1) {
      perror("Signal handler attach failed");
      exit(EXIT_FAILURE);
   }

   if (early_react)
      early_reaction();

   // Allocate nx4K private pages.
   local_pnt = mmap(NULL, ALLOC_PAGES*4096, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0);
   if (local_pnt == MAP_FAILED) {
      fprintf(stderr, "Memory Allocation failed !\n");
      exit(EXIT_FAILURE);
   }
   virt_addr = (uint64_t)(uintptr_t)local_pnt;

   // Dirty / map the pages.
   for (i = 0; i < ALLOC_PAGES; i++) {
      sprintf(((char *)local_pnt + i*4096), "My page number %d\n", i);
   }

   phys_addr = physical_address(virt_addr);
   if (phys_addr == (uint64_t)-1) {
      fprintf(stderr, "Virtual address translation 0x%llx failed\n",
         (unsigned long long)virt_addr);
      exit(EXIT_FAILURE);
   }
   printf("\nData pages at 0x%llx  physically 0x%llx\n",
      (unsigned long long)virt_addr, (unsigned long long)phys_addr);
   fflush(stdout);

   // Explicit error
   if (madvise_error)
      memory_error_advise(local_pnt);

   // Now Wait !
   if (wait_time > 0) {
      sleep((unsigned int)wait_time);
   } else {
      printf("\nPress ENTER to continue with page reading\n");
      // Make sure the prompt is visible even when stdout is not a terminal.
      fflush(stdout);
      (void)fgetc(stdin);
   }

   // read the strings at the beginning of each page.
   for (i = 0; i < ALLOC_PAGES; i++) {
      printf("%s", ((char *)local_pnt + i*4096));
   }

   phys_addr = physical_address(virt_addr);
   if (phys_addr == (uint64_t)-1) {
      fprintf(stderr, "Virtual address translation 0x%llx failed\n",
         (unsigned long long)virt_addr);
   } else {
      printf("\nData pages at 0x%llx  physically 0x%llx\n",
         (unsigned long long)virt_addr, (unsigned long long)phys_addr);
   }

   return 0;
}

Reply via email to