Module Name: src
Committed By: mrg
Date: Fri Oct 27 05:45:00 UTC 2023
Modified Files:
src/sys/arch/x86/x86: errata.c
Log Message:
x86: handle AMD errata 1474: A CPU core may hang after about 1044 days
from the new comment:
* This requires disabling CC6 power level, which can be a performance
* issue since it stops full turbo in some implementations (eg, half the
* cores must be in CC6 to achieve the highest boost level.) Set a timer
* to fire in 1000 days -- except NetBSD timers end up having a signed
* 32-bit hz-based value, which rolls over in under 25 days with HZ=1000,
* and doing xcall(9) or kthread(9) from a callout is not allowed anyway,
* so just have a kthread wait 1 day for 1000 times.
documented in:
https://www.amd.com/content/dam/amd/en/documents/processor-tech-docs/revision-guides/56323-PUB_1_01.pdf
To generate a diff of this commit:
cvs rdiff -u -r1.34 -r1.35 src/sys/arch/x86/x86/errata.c
Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
Modified files:
Index: src/sys/arch/x86/x86/errata.c
diff -u src/sys/arch/x86/x86/errata.c:1.34 src/sys/arch/x86/x86/errata.c:1.35
--- src/sys/arch/x86/x86/errata.c:1.34 Fri Oct 27 03:06:04 2023
+++ src/sys/arch/x86/x86/errata.c Fri Oct 27 05:45:00 2023
@@ -1,4 +1,4 @@
-/* $NetBSD: errata.c,v 1.34 2023/10/27 03:06:04 mrg Exp $ */
+/* $NetBSD: errata.c,v 1.35 2023/10/27 05:45:00 mrg Exp $ */
/*-
* Copyright (c) 2007 The NetBSD Foundation, Inc.
@@ -47,10 +47,13 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: errata.c,v 1.34 2023/10/27 03:06:04 mrg Exp $");
+__KERNEL_RCSID(0, "$NetBSD: errata.c,v 1.35 2023/10/27 05:45:00 mrg Exp $");
-#include <sys/types.h>
+#include <sys/param.h>
#include <sys/systm.h>
+#include <sys/xcall.h>
+#include <sys/kthread.h>
+#include <sys/clock.h>
#include <machine/cpu.h>
#include <machine/cpufunc.h>
@@ -255,6 +258,7 @@ static const uint8_t x86_errata_zen2[] =
static bool x86_errata_setmsr(struct cpu_info *, errata_t *);
static bool x86_errata_testmsr(struct cpu_info *, errata_t *);
+static bool x86_errata_amd_1474(struct cpu_info *, errata_t *);
static errata_t errata[] = {
/*
@@ -453,6 +457,13 @@ static errata_t errata[] = {
x86_errata_setmsr, LS_CFG_ERRATA_1095, NULL
},
/*
+ * 1474: A CPU core may hang after about 1044 days
+ */
+ {
+ 1474, FALSE, MSR_CC6_CFG, x86_errata_zen2,
+ x86_errata_amd_1474, CC6_CFG_DISABLE_BITS, NULL
+ },
+ /*
* Zenbleed:
* https://www.amd.com/en/resources/product-security/bulletin/amd-sb-7008.html
* https://github.com/google/security-research/security/advisories/GHSA-v6wh-rxpg-cmm8
@@ -465,6 +476,96 @@ static errata_t errata[] = {
},
};
+/*
+ * 1474: A CPU core may hang after about 1044 days
+ *
+ * This requires disabling CC6 power level, which can be a performance
+ * issue since it stops full turbo in some implementations (eg, half the
+ * cores must be in CC6 to achieve the highest boost level.) Set a timer
+ * to fire in 1000 days -- except NetBSD timers end up having a signed
+ * 32-bit hz-based value, which rolls over in under 25 days with HZ=1000,
+ * and doing xcall(9) or kthread(9) from a callout is not allowed anyway,
+ * so just have a kthread wait 1 day for 1000 times.
+ */
+
+/* Day count at which the errata thread warns that the workaround is near. */
+#define AMD_ERRATA_1474_WARN_DAYS 950
+/* Day count after which the errata thread disables CC6. */
+#define AMD_ERRATA_1474_BAD_DAYS 1000
+
+/*
+ * Cross-call handler passed to xc_broadcast() by amd_errata_1474_thread().
+ *
+ * a1 is the errata_t entry for this erratum: e_data1 names the MSR to
+ * modify and e_data2 is the mask of bits to clear in it.  a2 is unused.
+ *
+ * If none of the masked bits are set the MSR is left untouched and
+ * nothing is logged.
+ */
+static void
+amd_errata_1474_disable_cc6(void *a1, void *a2)
+{
+	errata_t *erratum = a1;
+	const uint64_t cur = rdmsr_locked(erratum->e_data1);
+
+	if ((cur & erratum->e_data2) != 0) {
+		/* At least one of the bits is still set: clear them all. */
+		wrmsr_locked(erratum->e_data1, cur & ~erratum->e_data2);
+		aprint_debug_dev(curcpu()->ci_dev, "erratum %u patched\n",
+		    erratum->e_num);
+	}
+}
+
+/*
+ * Kernel thread body for the erratum 1474 workaround.
+ *
+ * Sleeps in one-day steps (kpause() takes a tick count, so the whole
+ * period cannot be requested in one call).  Once
+ * AMD_ERRATA_1474_WARN_DAYS periods have elapsed a warning is printed;
+ * once AMD_ERRATA_1474_BAD_DAYS periods have elapsed, the CC6 disable
+ * handler is broadcast with xc_broadcast() and the thread exits.
+ *
+ * arg is the errata_t entry, handed through unchanged to
+ * amd_errata_1474_disable_cc6().
+ */
+static void
+amd_errata_1474_thread(void *arg)
+{
+	int loops = 0;		/* number of sleep periods completed */
+	int ticks;
+
+	ticks = hz * SECS_PER_DAY;
+#ifdef X86_ERRATA_TEST_AMD_1474
+	/*
+	 * Make this trigger warning after 50 seconds, and workaround
+	 * at 100 seconds, for easy testing.
+	 */
+	ticks = hz;
+	loops = 900;
+#endif
+
+	/*
+	 * Test the counter before bumping it, so that the warning is
+	 * printed after exactly WARN_DAYS periods, leaving exactly
+	 * (BAD_DAYS - WARN_DAYS) periods before the workaround, as the
+	 * message states.
+	 */
+	for (; loops < AMD_ERRATA_1474_BAD_DAYS; loops++) {
+		if (loops == AMD_ERRATA_1474_WARN_DAYS) {
+			printf("warning: AMD Errata 1474 workaround scheduled "
+			    "for %u days.\n", AMD_ERRATA_1474_BAD_DAYS -
+			    AMD_ERRATA_1474_WARN_DAYS);
+			printf("warning: reboot required to avoid.\n");
+		}
+		kpause("amd1474", false, ticks, NULL);
+	}
+
+	/* Been 1000 days, disable CC6 and warn about it. */
+	uint64_t xc = xc_broadcast(0, amd_errata_1474_disable_cc6, arg, NULL);
+	xc_wait(xc);
+
+	printf("warning: AMD CC6 disabled due to errata 1474.\n");
+	printf("warning: reboot required to restore full turbo speeds.\n");
+
+	kthread_exit(0);
+}
+
+/*
+ * Errata-table handler for AMD erratum 1474.
+ *
+ * On the primary CPU, start the "amd1474" kernel thread that applies
+ * the workaround after AMD_ERRATA_1474_BAD_DAYS days; on every other
+ * CPU do nothing.  If the thread cannot be created, tell the operator
+ * to reboot before the deadline instead.
+ *
+ * Always returns FALSE: this handler prints its own messages, so
+ * (presumably) the generic errata code should not report it as it does
+ * for the other entries.
+ */
+static bool
+x86_errata_amd_1474(struct cpu_info *ci, errata_t *e)
+{
+	/* One thread serves the whole system; only the primary starts it. */
+	if (CPU_IS_PRIMARY(ci)) {
+		const int rv = kthread_create(PRI_NONE, KTHREAD_MPSAFE, NULL,
+		    amd_errata_1474_thread, e, NULL, "amd1474");
+
+		if (rv == 0) {
+			aprint_debug_dev(ci->ci_dev, "workaround for erratum %u "
+			    "scheduled for %u days\n", e->e_num,
+			    AMD_ERRATA_1474_BAD_DAYS);
+		} else {
+			printf("WARNING: Unable to disable AMD errata 1474!\n");
+			printf("WARNING: reboot system after %u days to avoid CPU "
+			    "hangs.\n", AMD_ERRATA_1474_BAD_DAYS);
+		}
+	}
+
+	return FALSE;
+}
+
static void
x86_errata_log(device_t dev, errata_t *e, const char *msg)
{