Hi Mauro,

On 12/1/25 10:17 PM, Mauro Carvalho Chehab wrote:
On Thu, 27 Nov 2025 10:44:30 +1000
Gavin Shan <[email protected]> wrote:

This series is carved out of the memory error handling improvement
series [1], based on the comments received, to improve the error object
handling in various aspects.

[1] https://lists.nongnu.org/archive/html/qemu-arm/2025-11/msg00534.html

Gavin Shan (5):
   acpi/ghes: Automate data block cleanup in acpi_ghes_memory_errors()
   acpi/ghes: Abort in acpi_ghes_memory_errors() if necessary
   target/arm/kvm: Exit on error from acpi_ghes_memory_errors()
   acpi/ghes: Bail early on error from get_ghes_source_offsets()
   acpi/ghes: Use error_fatal in acpi_ghes_memory_errors()

The patch series looks OK to my eyes.

Reviewed-by: Mauro Carvalho Chehab <[email protected]>


Thanks.

-

Btw, what setup are you using to test memory errors? It would be
nice to have it documented somewhere, maybe at
docs/specs/acpi_hest_ghes.rst.


I don't think docs/specs/acpi_hest_ghes.rst is the right place for that
as it's for specifications. I'm sharing how this is tested here to make
the thread complete.

- Both host and guest have a 4KB page size

- Start the guest with the following command line

  /home/gavin/sandbox/qemu.main/build/qemu-system-aarch64                  \
  -accel kvm -machine virt,gic-version=host,nvdimm=on,ras=on               \
  -cpu host -smp maxcpus=8,cpus=8,sockets=2,clusters=2,cores=2,threads=1   \
  -m 4096M,slots=16,maxmem=128G                                            \
  -object memory-backend-ram,id=mem0,size=4096M                            \
  -numa node,nodeid=0,cpus=0-7,memdev=mem0                                 \
  -L /home/gavin/sandbox/qemu.main/build/pc-bios                           \
  -monitor none -serial mon:stdio -nographic                               \
  -gdb tcp::6666 -qmp tcp:localhost:5555,server,wait=off                   \
  -bios /home/gavin/sandbox/qemu.main/build/pc-bios/edk2-aarch64-code.fd   \
  -boot c                                                                  \
  -device pcie-root-port,bus=pcie.0,chassis=1,id=pcie.1                    \
  -device pcie-root-port,bus=pcie.0,chassis=2,id=pcie.2                    \
  -device pcie-root-port,bus=pcie.0,chassis=3,id=pcie.3                    \
     :                                                                     \
  -device pcie-root-port,bus=pcie.0,chassis=16,id=pcie.16                  \
  -drive file=/home/gavin/sandbox/images/disk.qcow2,if=none,id=drive0      \
  -device virtio-blk-pci,id=virtblk0,bus=pcie.1,drive=drive0,num-queues=4  \
  -netdev tap,id=tap1,vhost=true,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown \
  -device virtio-net-pci,bus=pcie.8,netdev=tap1,mac=52:54:00:f1:26:b0

- Trigger 'victim -d' in the guest

  guest$ ./victim -d
  physical address of (0xffff8d9b7000) = 0x1251d6000
  Hit any key to trigger error:

- Inject an error at the GPA. "test.c" is attached

  host$ ./test 0x1251d6000

- Press enter on the guest so that 'victim' continues its execution

  [  435.467481] EDAC MC0: 1 UE unknown on unknown memory ( page:0x1251d6 offset:0x0 grain:1 - APEI location: )
  [  435.467542] {1}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 0
  [  435.467543] {1}[Hardware Error]: event severity: recoverable
  [  435.467544] {1}[Hardware Error]:  Error 0, type: recoverable
  [  435.467545] {1}[Hardware Error]:   section_type: memory error
  [  435.467546] {1}[Hardware Error]:   physical_address: 0x00000001251d6000
  [  435.467547] {1}[Hardware Error]:   error_type: 0, unknown
  [  435.468380] Memory failure: 0x1251d6: recovery action for dirty LRU page: Recovered
  Bus error (core dumped)

Thanks,
Gavin





Thanks,
Mauro

// SPDX-License-Identifier: GPL-2.0+
/*
 * This test program runs on the host. It takes the GPA printed by 'victim'
 * in the guest, translates it to the HPA, and injects a recoverable error
 * at that HPA.
 *
 * NOTE: We have the assumption that the guest has only one NUMA node and
 * the memory capacity is 4GB. The test program won't work if the assumption
 * is broken.
 *
 * Author: Gavin Shan <[email protected]>
 */

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <assert.h>
#include <errno.h>
#include <time.h>
#include <fcntl.h>
#include <dirent.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <sys/wait.h>

/*
 * Fixed guest layout this program relies on:
 *
 * - TEST_GUEST_MEM_SIZE is the size of the guest RAM. It is also the size
 *   of the host mapping that is searched for to locate the guest RAM.
 * - TEST_GUEST_MEM_START is the lowest GPA accepted on the command line and
 *   the GPA corresponding to the start of that host mapping.
 *   NOTE(review): presumably the RAM base of the guest machine - confirm.
 * - TEST_INJECT_ERROR_TYPE is the value written to the EINJ "error_type"
 *   file. NOTE(review): 0x10 looks like "Memory Uncorrectable non-fatal"
 *   in the kernel's EINJ documentation - confirm.
 */
#define TEST_GUEST_MEM_SIZE	0x100000000	/* 4GB */
#define TEST_GUEST_MEM_START	0x040000000	/* 1GB */
#define TEST_INJECT_ERROR_TYPE	0x10

struct test_struct {
	int pid;
	unsigned long	guest_mem_size;
	unsigned long	gpa;
	unsigned long	hva;
	unsigned long	hpa;
};

/*
 * Print the command line synopsis of this program to stdout.
 */
static void usage(void)
{
	static const char *const lines[] = {
		"\n",
		"./test <gpa>\n",
		"gpa  The GPA (Guest Physical Address) where the error is injected\n",
		"\n",
	};
	size_t i;

	for (i = 0; i < sizeof(lines) / sizeof(lines[0]); i++)
		fputs(lines[i], stdout);
}

/*
 * Put @test into its initial state: the guest memory size is set to
 * TEST_GUEST_MEM_SIZE, and the PID as well as every address are set to an
 * invalid value (-1) until the corresponding step resolves them.
 *
 * All fields are initialized, including @hva, so that no later step or
 * printout can ever read an indeterminate value.
 */
static void init_test_struct(struct test_struct *test)
{
	test->pid		= -1;
	test->guest_mem_size	= TEST_GUEST_MEM_SIZE;
	test->gpa		= -1UL;
	test->hva		= -1UL;
	test->hpa		= -1UL;
}

/*
 * Parse the GPA from the command line into @test->gpa.
 *
 * The only accepted form is "./test <gpa>", where <gpa> is a hexadecimal
 * number (an optional "0x" prefix is accepted by strtoul()). The GPA must
 * fall inside the guest RAM, which is the half-open range
 * [TEST_GUEST_MEM_START, TEST_GUEST_MEM_START + test->guest_mem_size).
 *
 * Return: 0 on success, -EINVAL on a wrong argument count, a malformed
 * number or an out-of-range GPA.
 */
static int fetch_gpa(struct test_struct *test, int argc, char **argv)
{
	unsigned long gpa_end = TEST_GUEST_MEM_START + test->guest_mem_size;
	char *end = NULL;

	if (argc != 2) {
		usage();
		return -EINVAL;
	}

	/* strtoul() reports overflow through errno only, so clear it first */
	errno = 0;
	test->gpa = strtoul(argv[1], &end, 16);
	if (errno || end == argv[1] || *end != '\0') {
		fprintf(stderr, "%s: Invalid GPA <%s>\n", __func__, argv[1]);
		return -EINVAL;
	}

	/* The first address past the guest RAM isn't a valid target */
	if (test->gpa < TEST_GUEST_MEM_START || test->gpa >= gpa_end) {
		fprintf(stderr, "%s: GPA 0x%lx out of range [1GB, 1GB+0x%lx)\n",
			__func__, test->gpa, test->guest_mem_size);
		return -EINVAL;
	}

	return 0;
}

/*
 * Find the QEMU process and store its PID in @test->pid.
 *
 * Every numeric directory under /proc is visited and the first one whose
 * "comm" file contains the string "qemu" is picked. No attempt is made to
 * tell several matching processes apart.
 *
 * Return: 0 on success, -EIO if /proc can't be opened, -ENODEV if no
 * matching process is found.
 */
static int find_qemu_pid(struct test_struct *test)
{
	DIR *dir;
	FILE *fp;
	struct dirent *entry;
	char path[64], data[256], *end;
	size_t sz;
	long pid;
	int ret = -ENODEV;

	dir = opendir("/proc");
	if (!dir) {
		fprintf(stderr, "%s: unable to open </proc>\n", __func__);
		return -EIO;
	}

	while ((entry = readdir(dir)) != NULL) {
		if (entry->d_type != DT_DIR)
			continue;

		/* Only the directories with a purely numeric name are processes */
		if (entry->d_name[0] < '0' || entry->d_name[0] > '9')
			continue;

		errno = 0;
		pid = strtol(entry->d_name, &end, 10);
		if (errno || *end != '\0' || pid <= 0 || (long)(int)pid != pid)
			continue;

		snprintf(path, sizeof(path), "/proc/%ld/comm", pid);
		fp = fopen(path, "r");
		if (!fp)
			continue;

		/* Keep one byte for the terminator needed by strstr() */
		sz = fread(data, 1, sizeof(data) - 1, fp);
		fclose(fp);
		if (sz == 0)
			continue;

		data[sz] = '\0';
		if (strstr(data, "qemu")) {
			ret = 0;
			test->pid = (int)pid;
			break;
		}
	}

	if (ret != 0)
		fprintf(stderr, "%s: Unable to find QEMU PID\n", __func__);

	closedir(dir);
	return ret;
}

/*
 * Translate @test->gpa to the corresponding address in the address space
 * of the process @test->pid and store it in @test->hva.
 *
 * /proc/<pid>/smaps is scanned for the first private read-write mapping
 * ("rw-p") whose size is exactly @test->guest_mem_size. That mapping is
 * taken as the guest RAM, whose start corresponds to TEST_GUEST_MEM_START.
 *
 * Return: 0 on success, -EIO if the file can't be opened or no mapping
 * of the expected size exists.
 */
static int fetch_hva(struct test_struct *test)
{
	FILE *fp;
	char filename[64], perms[5], *line = NULL;
	unsigned long start, end;
	size_t cap = 0;
	int ret = -EIO;

	snprintf(filename, sizeof(filename), "/proc/%d/smaps", test->pid);
	fp = fopen(filename, "r");
	if (!fp) {
		fprintf(stderr, "%s: Unable to open <%s>\n", __func__, filename);
		return ret;
	}

	while (getline(&line, &cap, fp) != -1) {
		/*
		 * A mapping header looks like "<start>-<end> <perms> ...". The
		 * other lines don't match all three fields and are skipped.
		 */
		if (sscanf(line, "%lx-%lx %4s", &start, &end, perms) != 3)
			continue;

		if (strcmp(perms, "rw-p") != 0)
			continue;

		if (end - start == test->guest_mem_size) {
			ret = 0;
			test->hva = start + (test->gpa - TEST_GUEST_MEM_START);
			break;
		}
	}

	if (ret != 0)
		fprintf(stderr, "%s: Unable to find 0x%lx bytes of guest memory in <%s>\n",
			__func__, test->guest_mem_size, filename);

	free(line);
	fclose(fp);
	return ret;
}

/*
 * Translate @test->hva to the host physical address and store it in
 * @test->hpa.
 *
 * The 64-bit entry describing the page that contains @test->hva is read
 * from /proc/<pid>/pagemap. Bit 63 of the entry tells whether the page is
 * present and the low 55 bits hold its page frame number (PFN).
 *
 * Return: 0 on success, -EIO if the entry can't be read, -EINVAL if the
 * page isn't present, -EPERM if the PFN isn't disclosed.
 */
static int fetch_hpa(struct test_struct *test)
{
	const unsigned long present = 1UL << 63;
	const unsigned long pfn_mask = 0x007fffffffffffffUL;
	int fd;
	unsigned long pinfo, pfn, pgsize = getpagesize();
	off_t offset = (test->hva / pgsize) * sizeof(pinfo);
	char filename[128];
	ssize_t sz;

	snprintf(filename, sizeof(filename), "/proc/%d/pagemap", test->pid);
	fd = open(filename, O_RDONLY);
	if (fd < 0) {
		fprintf(stderr, "%s: Unable to open <%s>\n", __func__, filename);
		return -EIO;
	}

	sz = pread(fd, &pinfo, sizeof(pinfo), offset);
	close(fd);
	if (sz != sizeof(pinfo)) {
		fprintf(stderr, "%s: Unable to read from <%s>\n", __func__, filename);
		return -EIO;
	}

	if (!(pinfo & present)) {
		fprintf(stderr, "%s: Page not present\n", __func__);
		return -EINVAL;
	}

	/*
	 * The kernel reports a zero PFN to the callers that aren't allowed to
	 * see physical addresses (CAP_SYS_ADMIN is needed, see the pagemap
	 * documentation). Never inject an error based on such a bogus PFN.
	 */
	pfn = pinfo & pfn_mask;
	if (pfn == 0) {
		fprintf(stderr, "%s: PFN not available from <%s>, run as root\n",
			__func__, filename);
		return -EPERM;
	}

	test->hpa = (pfn * pgsize) + (test->hva & (pgsize - 1));
	return 0;
}

/*
 * Write @val, formatted as "0x<hex>" without a trailing newline, to the
 * existing file @filename using a single write() call.
 *
 * Return: 0 on success, -EIO if the file can't be opened for writing or
 * the value isn't written completely.
 */
static int write_file(const char *filename, unsigned long val)
{
	char text[128];
	size_t len;
	ssize_t written;
	int fd;

	memset(text, 0, sizeof(text));
	len = snprintf(text, sizeof(text), "0x%lx", val);

	fd = open(filename, O_WRONLY);
	if (fd < 0) {
		fprintf(stderr, "%s: Unable to open <%s>\n", __func__, filename);
		return -EIO;
	}

	written = write(fd, text, len);
	if (written == len) {
		close(fd);
		return 0;
	}

	fprintf(stderr, "%s: Unable to write <%s>\n", __func__, filename);
	close(fd);
	return -EIO;
}

/*
 * Inject the error at @test->hpa through the EINJ debugfs interface.
 *
 * The resolved PID and addresses are printed first. The "einj" module is
 * then loaded on a best-effort basis and the EINJ files are written in
 * order, "error_inject" being the last one. The first failing write stops
 * the sequence.
 *
 * Return: 0 on success, -EIO if any of the EINJ files can't be written.
 */
static int inject_error(struct test_struct *test)
{
	const struct {
		const char	*path;
		unsigned long	val;
	} writes[] = {
		{ "/sys/kernel/debug/apei/einj/param1",		test->hpa },
		/* NOTE(review): address mask, looks like 4KB granularity - confirm */
		{ "/sys/kernel/debug/apei/einj/param2",		0xfffffffffffff000 },
		{ "/sys/kernel/debug/apei/einj/flags",		0x0 },
		{ "/sys/kernel/debug/apei/einj/error_type",	TEST_INJECT_ERROR_TYPE },
		{ "/sys/kernel/debug/apei/einj/notrigger",	1 },
		{ "/sys/kernel/debug/apei/einj/error_inject",	1 },
	};
	size_t i;

	fprintf(stdout, "pid:	%d\n", test->pid);
	fprintf(stdout, "gpa:	0x%lx\n", test->gpa);
	fprintf(stdout, "hva:	0x%lx\n", test->hva);
	fprintf(stdout, "hpa:	0x%lx\n", test->hpa);

	/*
	 * Not fatal: the EINJ files may exist without loading the module.
	 * A real problem is caught by the writes below.
	 */
	if (system("modprobe einj > /dev/null") != 0)
		fprintf(stderr, "%s: Unable to load einj module, continuing\n",
			__func__);

	for (i = 0; i < sizeof(writes) / sizeof(writes[0]); i++) {
		if (write_file(writes[i].path, writes[i].val))
			return -EIO;
	}

	return 0;
}

/*
 * Entry point: parse the GPA, locate the QEMU process, translate the GPA
 * to HVA and then HPA, and inject the error. The steps run in order and
 * the first failure stops the program.
 *
 * Return: EXIT_SUCCESS if the error has been injected, EXIT_FAILURE
 * otherwise. A negative errno isn't returned since the exit status seen
 * by the parent process is truncated to 8 bits.
 */
int main(int argc, char **argv)
{
	struct test_struct test;

	init_test_struct(&test);

	if (fetch_gpa(&test, argc, argv) ||
	    find_qemu_pid(&test)	 ||
	    fetch_hva(&test)		 ||
	    fetch_hpa(&test)		 ||
	    inject_error(&test))
		return EXIT_FAILURE;

	return EXIT_SUCCESS;
}

Reply via email to