The branch main has been updated by jhibbits:

URL: https://cgit.FreeBSD.org/src/commit/?id=1ae25866767d686067fe6678b62681b7a8f0d361

commit 1ae25866767d686067fe6678b62681b7a8f0d361
Author:     Justin Hibbits <[email protected]>
AuthorDate: 2025-10-26 02:45:00 +0000
Commit:     Justin Hibbits <[email protected]>
CommitDate: 2025-10-27 14:33:50 +0000

    kexec: Introduce basic arm64 support
    
    This works on older arm64 platforms, but may not work on arm64 devices
    using GICv3, due to a quirk in the GICv3 where some registers are
    write-once.
    
    Most of the kexec reboot work on arm64 can be done entirely in C code
    by disabling the MMU: the kernel is carved out of the vm_phys_segs
    array, so it cannot be overwritten.
    
    Reviewed by:    andrew
    Sponsored by:   Juniper Networks, Inc.
    Differential Revision:  https://reviews.freebsd.org/D51621
---
 sys/arm64/arm64/kexec_support.c | 188 ++++++++++++++++++++++++++++++++++++++++
 sys/arm64/arm64/locore.S        |  44 ++++++++++
 sys/arm64/arm64/mp_machdep.c    |  78 +++++++++++++++++
 sys/arm64/include/cpufunc.h     |   7 ++
 sys/arm64/include/kexec.h       |  33 +++++++
 sys/arm64/include/pcpu.h        |   3 +-
 sys/arm64/include/smp.h         |   1 +
 sys/conf/files.arm64            |   1 +
 sys/dev/psci/psci.c             |  13 +++
 sys/dev/psci/psci.h             |   1 +
 10 files changed, 368 insertions(+), 1 deletion(-)
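
At a high level, the reboot path added below reduces to the shape sketched
here.  This is an illustrative condensation only, not the committed code:
copy_segments_phys() is a hypothetical stand-in for the per-page copy loop
in kexec_reboot_bottom(), and the real code also pre-populates the identity
map in kexec_load_md() and hops onto a physical stack via switch_stack().

/*
 * Illustrative sketch of the MMU-off handoff described in the commit
 * message above (not the committed code; see kexec_support.c below).
 */
static void __dead2
kexec_handoff_sketch(struct kexec_image *image)
{
        void (*entry)(void) = (void *)image->entry;

        /* Switch TTBR0 to the bootstrap identity map set up at load time. */
        set_ttbr0(pmap_kextract((vm_offset_t)pagetable_l0_ttbr0_bootstrap));
        cpu_tlb_flushID();

        /* Turn the MMU off; from here on only physical addresses work. */
        WRITE_SPECIALREG(sctlr_el1, SCTLR_EL1_NO_MMU);
        isb();

        /* Copy each staged segment to its target physical address. */
        copy_segments_phys(image);      /* stand-in for the per-page loop */

        /* Jump to the new kernel's entry point. */
        invalidate_icache();
        entry();
        for (;;)
                ;
}

The diff also adds IPI_OFF and cpu_mp_stop() so the secondary CPUs can be
parked first, either via PSCI CPU_OFF or by sending them back into the
spin-table loop (mp_cpu_spinloop).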

diff --git a/sys/arm64/arm64/kexec_support.c b/sys/arm64/arm64/kexec_support.c
new file mode 100644
index 000000000000..8b9719c05b67
--- /dev/null
+++ b/sys/arm64/arm64/kexec_support.c
@@ -0,0 +1,188 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2025 Juniper Networks, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/kexec.h>
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_object.h>
+#include <vm/vm_phys.h>
+#include <vm/vm_radix.h>
+#include <vm/pmap.h>
+#include <vm/vm_page.h>
+
+#include <machine/armreg.h>
+#include <machine/pmap.h>
+#include <machine/pte.h>
+
+/*
+ * Idea behind this:
+ *
+ * kexec_load_md():
+ * - Update boot page tables (identity map) to include all pages needed before
+ *   disabling MMU.
+ *
+ * kexec_reboot_md():
+ * - Copy pages into target(s)
+ * - Do "other stuff"
+ * - Does not return
+ */
+
+extern pt_entry_t pagetable_l0_ttbr0_bootstrap[];
+extern unsigned long initstack_end[];
+void switch_stack(void *, void (*)(void *, void *, struct kexec_image *), void *);
+
+#define        SCTLR_EL1_NO_MMU        (SCTLR_RES1 | SCTLR_LSMAOE | SCTLR_nTLSMD | \
+               SCTLR_EIS | SCTLR_TSCXT | SCTLR_EOS)
+#define        vm_page_offset(m)       ((vm_offset_t)(m) - vm_page_base)
+static inline vm_page_t
+phys_vm_page(vm_page_t m, vm_offset_t vm_page_v, vm_paddr_t vm_page_p)
+{
+       return ((vm_page_t)((vm_offset_t)m - vm_page_v + vm_page_p));
+}
+
+/* First 2 args are filler for switch_stack() */
+static void __aligned(16) __dead2
+kexec_reboot_bottom( void *arg1 __unused, void *arg2 __unused,
+    struct kexec_image *image)
+{
+       void (*e)(void) = (void *)image->entry;
+       vm_offset_t     vm_page_base = (vm_offset_t)vm_page_array;
+       vm_paddr_t      vm_page_phys = pmap_kextract((vm_offset_t)vm_page_array);
+       struct kexec_segment_stage *phys_segs =
+           (void *)pmap_kextract((vm_offset_t)&image->segments);
+       vm_paddr_t from_pa, to_pa;
+       vm_size_t size;
+       vm_page_t       first, m, mp;
+       struct pctrie_iter pct_i;
+
+       /*
+        * Create a linked list of all pages in the object before we disable the
+        * MMU.  Once the MMU is disabled we can't use the vm_radix iterators,
+        * as they rely on virtual address pointers.
+        */
+       first = NULL;
+       vm_radix_iter_init(&pct_i, &image->map_obj->rtree);
+       VM_RADIX_FORALL(m, &pct_i) {
+               if (first == NULL)
+                       first = m;
+               else
+                       SLIST_INSERT_AFTER(mp, m, plinks.s.ss);
+               mp = m;
+       }
+
+       /*
+        * We're running out of the identity map now, disable the MMU before we
+        * continue.  It's possible page tables can be overwritten, which would
+        * be very bad if we were running with the MMU enabled.
+        */
+       WRITE_SPECIALREG(sctlr_el1, SCTLR_EL1_NO_MMU);
+       isb();
+       for (int i = 0; i < KEXEC_SEGMENT_MAX; i++) {
+               if (phys_segs[i].size == 0)
+                       break;
+               to_pa = phys_segs[i].target;
+               /* Copy the segment here... */
+               for (vm_page_t p = phys_segs[i].first_page;
+                   p != NULL && to_pa - phys_segs[i].target < phys_segs[i].size;
+                   p = SLIST_NEXT(p, plinks.s.ss)) {
+                       p = phys_vm_page(p, vm_page_base, vm_page_phys);
+                       from_pa = p->phys_addr;
+                       if (p->phys_addr == to_pa) {
+                               to_pa += PAGE_SIZE;
+                               continue;
+                       }
+                       for (size = PAGE_SIZE / sizeof(register_t);
+                           size > 0; --size) {
+                               *(register_t *)to_pa = *(register_t *)from_pa;
+                               to_pa += sizeof(register_t);
+                               from_pa += sizeof(register_t);
+                       }
+               }
+       }
+       invalidate_icache();
+       e();
+       while (1)
+               ;
+}
+
+void
+kexec_reboot_md(struct kexec_image *image)
+{
+       uintptr_t ptr;
+       register_t reg;
+
+       for (int i = 0; i < KEXEC_SEGMENT_MAX; i++) {
+               if (image->segments[i].size > 0)
+                       cpu_dcache_inv_range((void *)PHYS_TO_DMAP(image->segments[i].target),
+                           image->segments[i].size);
+       }
+       ptr = pmap_kextract((vm_offset_t)kexec_reboot_bottom);
+       serror_disable();
+
+       reg = pmap_kextract((vm_offset_t)pagetable_l0_ttbr0_bootstrap);
+       set_ttbr0(reg);
+       cpu_tlb_flushID();
+
+       typeof(kexec_reboot_bottom) *p = (void *)ptr;
+       switch_stack((void *)pmap_kextract((vm_offset_t)initstack_end),
+           p, image);
+       while (1)
+               ;
+}
+
+int
+kexec_load_md(struct kexec_image *image)
+{
+       vm_paddr_t tmp;
+       pt_entry_t *pte;
+
+       /* Create L2 page blocks for the trampoline. L0/L1 are from the startup. */
+
+       /*
+        * There are exactly 2 pages before the pagetable_l0_ttbr0_bootstrap, so
+        * move to there.
+        */
+       pte = pagetable_l0_ttbr0_bootstrap;
+       pte -= (Ln_ENTRIES * 2);        /* move to start of L2 pages */
+
+       /*
+        * Populate the identity map with symbols we know we'll need before we
+        * turn off the MMU.
+        */
+       tmp = pmap_kextract((vm_offset_t)kexec_reboot_bottom);
+       pte[pmap_l2_index(tmp)] = (tmp | L2_BLOCK | ATTR_AF | ATTR_S1_UXN);
+       tmp = pmap_kextract((vm_offset_t)initstack_end);
+       pte[pmap_l2_index(tmp)] = (tmp | L2_BLOCK | ATTR_AF | ATTR_S1_UXN);
+       /* We'll need vm_page_array for doing offset calculations. */
+       tmp = pmap_kextract((vm_offset_t)&vm_page_array);
+       pte[pmap_l2_index(tmp)] = (tmp | L2_BLOCK | ATTR_AF | ATTR_S1_UXN);
+
+       return (0);
+}
diff --git a/sys/arm64/arm64/locore.S b/sys/arm64/arm64/locore.S
index d35e334905a7..3ec12140f139 100644
--- a/sys/arm64/arm64/locore.S
+++ b/sys/arm64/arm64/locore.S
@@ -325,6 +325,19 @@ mp_virtdone:
 
        b       init_secondary
 LEND(mpentry_common)
+
+ENTRY(mp_cpu_spinloop)
+0:
+       wfe
+       ldr     x0, mp_cpu_spin_table_release_addr
+       cbz     x0, 0b
+       blr     x0
+       .globl mp_cpu_spin_table_release_addr
+mp_cpu_spin_table_release_addr:
+       .quad   0
+       .globl mp_cpu_spinloop_end
+mp_cpu_spinloop_end:
+END(mp_cpu_spinloop)
 #endif
 
 /*
@@ -475,6 +488,29 @@ LENTRY(enter_kernel_el)
        eret
 LEND(enter_kernel_el)
 
+/* Turn off the MMU.  Install ttbr0 from the bootstrap page table, and go there.
+ * Does not return.
+ * - x0 - target address to jump to after stopping the MMU.
+ * - x1 - kernel load address
+ */
+ENTRY(stop_mmu)
+       mov     x16, x0 /* Save target. */
+       ldr     x2, =(1f - KERNBASE)
+       add     x17, x1, x2
+       ldr     x3, =(pagetable_l0_ttbr0_bootstrap - KERNBASE)
+       add     x1, x1, x3
+       msr     ttbr0_el1, x1
+       isb
+       br      x17
+1:
+       BTI_J
+       mrs     x0, sctlr_el1
+       bic     x0, x0, SCTLR_M
+       bic     x0, x0, SCTLR_C
+       msr     sctlr_el1, x0
+       isb
+       br      x16
+END(stop_mmu)
 /*
  * Get the physical address the kernel was loaded at.
  */
@@ -1094,12 +1130,19 @@ tcr:
            TCR_SH0_IS | TCR_ORGN0_WBWA | TCR_IRGN0_WBWA)
 LEND(start_mmu)
 
+ENTRY(switch_stack)
+       mov     sp, x0
+       mov     x16, x1
+       br      x16
+END(switch_stack)
+
 ENTRY(abort)
        b abort
 END(abort)
 
 .bss
        .align  PAGE_SHIFT
+       .globl  initstack_end
 initstack:
        .space  BOOT_STACK_SIZE
 initstack_end:
@@ -1116,6 +1159,7 @@ initstack_end:
         *           L0 for user
         */
        .globl pagetable_l0_ttbr1
+       .globl pagetable_l0_ttbr0_bootstrap
 pagetable:
 pagetable_l3_ttbr1:
        .space  (PAGE_SIZE * L3_PAGE_COUNT)
diff --git a/sys/arm64/arm64/mp_machdep.c b/sys/arm64/arm64/mp_machdep.c
index e4d011df3a06..0bdd2ecfd8a7 100644
--- a/sys/arm64/arm64/mp_machdep.c
+++ b/sys/arm64/arm64/mp_machdep.c
@@ -60,6 +60,7 @@
 #include <machine/debug_monitor.h>
 #include <machine/intr.h>
 #include <machine/smp.h>
+#include <machine/vmparam.h>
 #ifdef VFP
 #include <machine/vfp.h>
 #endif
@@ -103,6 +104,7 @@ static void ipi_hardclock(void *);
 static void ipi_preempt(void *);
 static void ipi_rendezvous(void *);
 static void ipi_stop(void *);
+static void ipi_off(void *);
 
 #ifdef FDT
 static u_int fdt_cpuid;
@@ -193,6 +195,7 @@ release_aps(void *dummy __unused)
        intr_ipi_setup(IPI_STOP, "stop", ipi_stop, NULL);
        intr_ipi_setup(IPI_STOP_HARD, "stop hard", ipi_stop, NULL);
        intr_ipi_setup(IPI_HARDCLOCK, "hardclock", ipi_hardclock, NULL);
+       intr_ipi_setup(IPI_OFF, "off", ipi_off, NULL);
 
        atomic_store_int(&aps_started, 0);
        atomic_store_rel_int(&aps_ready, 1);
@@ -390,6 +393,34 @@ ipi_stop(void *dummy __unused)
        CTR0(KTR_SMP, "IPI_STOP (restart)");
 }
 
+void stop_mmu(vm_paddr_t, vm_paddr_t) __dead2;
+extern uint32_t mp_cpu_spinloop[];
+extern uint32_t mp_cpu_spinloop_end[];
+extern uint64_t mp_cpu_spin_table_release_addr;
+static void
+ipi_off(void *dummy __unused)
+{
+       CTR0(KTR_SMP, "IPI_OFF");
+       if (psci_present)
+               psci_cpu_off();
+       else {
+               uint64_t release_addr;
+               vm_size_t size;
+
+               size = (vm_offset_t)&mp_cpu_spin_table_release_addr -
+                   (vm_offset_t)mp_cpu_spinloop;
+               release_addr = PCPU_GET(release_addr) - size;
+               isb();
+               invalidate_icache();
+               /* Go catatonic, don't take any interrupts. */
+               intr_disable();
+               stop_mmu(release_addr, pmap_kextract(KERNBASE));
+
+
+       }
+       CTR0(KTR_SMP, "IPI_OFF failed");
+}
+
 struct cpu_group *
 cpu_topo(void)
 {
@@ -511,6 +542,7 @@ start_cpu(u_int cpuid, uint64_t target_cpu, int domain, vm_paddr_t release_addr)
        pcpu_init(pcpup, cpuid, sizeof(struct pcpu));
        pcpup->pc_mpidr = target_cpu & CPU_AFF_MASK;
        bootpcpu = pcpup;
+       pcpup->pc_release_addr = release_addr;
 
        dpcpu[cpuid - 1] = (void *)(pcpup + 1);
        dpcpu_init(dpcpu[cpuid - 1], cpuid);
@@ -752,6 +784,52 @@ cpu_mp_start(void)
        }
 }
 
+void
+cpu_mp_stop(void)
+{
+
+       /* Short-circuit for single-CPU */
+       if (CPU_COUNT(&all_cpus) == 1)
+               return;
+
+       KASSERT(PCPU_GET(cpuid) == CPU_FIRST(), ("Not on the first CPU!\n"));
+
+       /*
+        * If we use spin-table, assume U-boot method for now (single address
+        * shared by all CPUs).
+        */
+       if (!psci_present) {
+               int cpu;
+               vm_paddr_t release_addr;
+               void *release_vaddr;
+               vm_size_t size;
+
+               /* Find the shared release address. */
+               CPU_FOREACH(cpu) {
+                       release_addr = pcpu_find(cpu)->pc_release_addr;
+                       if (release_addr != 0)
+                               break;
+               }
+               /* No release address? No way of notifying other CPUs. */
+               if (release_addr == 0)
+                       return;
+
+               size = (vm_offset_t)&mp_cpu_spinloop_end -
+                   (vm_offset_t)&mp_cpu_spinloop;
+
+               release_addr -= (vm_offset_t)&mp_cpu_spin_table_release_addr -
+                   (vm_offset_t)mp_cpu_spinloop;
+
+               release_vaddr = pmap_mapdev(release_addr, size);
+               bcopy(mp_cpu_spinloop, release_vaddr, size);
+               cpu_dcache_wbinv_range(release_vaddr, size);
+               pmap_unmapdev(release_vaddr, size);
+               invalidate_icache();
+       }
+       ipi_all_but_self(IPI_OFF);
+       DELAY(1000000);
+}
+
 /* Introduce rest of cores to the world */
 void
 cpu_mp_announce(void)
diff --git a/sys/arm64/include/cpufunc.h b/sys/arm64/include/cpufunc.h
index e6e1f682794e..e9eee643216b 100644
--- a/sys/arm64/include/cpufunc.h
+++ b/sys/arm64/include/cpufunc.h
@@ -96,6 +96,13 @@ serror_enable(void)
        __asm __volatile("msr daifclr, #(" __XSTRING(DAIF_A) ")");
 }
 
+static __inline void
+serror_disable(void)
+{
+
+       __asm __volatile("msr daifset, #(" __XSTRING(DAIF_A) ")");
+}
+
 static __inline register_t
 get_midr(void)
 {
diff --git a/sys/arm64/include/kexec.h b/sys/arm64/include/kexec.h
new file mode 100644
index 000000000000..0a8c7a053331
--- /dev/null
+++ b/sys/arm64/include/kexec.h
@@ -0,0 +1,33 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2025 Juniper Networks, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef        _ARM64_KEXEC_H_
+#define        _ARM64_KEXEC_H_
+
+#define KEXEC_MD_PAGES(x) 0
+
+#endif /* _ARM64_KEXEC_H_ */
diff --git a/sys/arm64/include/pcpu.h b/sys/arm64/include/pcpu.h
index 09bd8fa8a966..73399d2c3f8c 100644
--- a/sys/arm64/include/pcpu.h
+++ b/sys/arm64/include/pcpu.h
@@ -50,7 +50,8 @@ struct debug_monitor_state;
        struct pmap *pc_curvmpmap;                                      \
        uint64_t pc_mpidr;                                              \
        u_int   pc_bcast_tlbi_workaround;                               \
-       char __pad[197]
+       uint64_t pc_release_addr;                                       \
+       char __pad[189]
 
 #ifdef _KERNEL
 
diff --git a/sys/arm64/include/smp.h b/sys/arm64/include/smp.h
index 500cd1ef4f02..4a5bfda3ac1c 100644
--- a/sys/arm64/include/smp.h
+++ b/sys/arm64/include/smp.h
@@ -40,6 +40,7 @@ enum {
        IPI_STOP,
        IPI_STOP_HARD,
        IPI_HARDCLOCK,
+       IPI_OFF,
        INTR_IPI_COUNT,
 };
 
diff --git a/sys/conf/files.arm64 b/sys/conf/files.arm64
index 2f412fa3cb1b..882aca705336 100644
--- a/sys/conf/files.arm64
+++ b/sys/conf/files.arm64
@@ -55,6 +55,7 @@ arm64/arm64/gic_v3_acpi.c                     optional acpi
 arm64/arm64/gic_v3_fdt.c                       optional fdt
 arm64/arm64/hyp_stub.S                         standard
 arm64/arm64/identcpu.c                         standard
+arm64/arm64/kexec_support.c                    standard
 arm64/arm64/locore.S                           standard no-obj
 arm64/arm64/machdep.c                          standard
 arm64/arm64/machdep_boot.c                     standard
diff --git a/sys/dev/psci/psci.c b/sys/dev/psci/psci.c
index 497b23d2d4c3..2b250401ae83 100644
--- a/sys/dev/psci/psci.c
+++ b/sys/dev/psci/psci.c
@@ -474,6 +474,19 @@ psci_cpu_on(unsigned long cpu, unsigned long entry, unsigned long context_id)
        return (psci_call(fnid, cpu, entry, context_id));
 }
 
+int
+psci_cpu_off(void)
+{
+       uint32_t fnid;
+
+       fnid = PSCI_FNID_CPU_OFF;
+       if (psci_softc != NULL)
+               fnid = psci_softc->psci_fnids[PSCI_FN_CPU_OFF];
+
+       /* Returns PSCI_RETVAL_DENIED on error. */
+       return (psci_call(fnid, 0, 0, 0));
+}
+
 static void
 psci_shutdown(void *xsc, int howto)
 {
diff --git a/sys/dev/psci/psci.h b/sys/dev/psci/psci.h
index 451d40c0178d..6704eaf26c71 100644
--- a/sys/dev/psci/psci.h
+++ b/sys/dev/psci/psci.h
@@ -39,6 +39,7 @@ typedef int (*psci_callfn_t)(register_t, register_t, register_t, register_t,
 extern bool psci_present;
 
 int    psci_cpu_on(unsigned long, unsigned long, unsigned long);
+int    psci_cpu_off(void);     /* Operates on caller. */
 void   psci_reset(void);
 int32_t        psci_features(uint32_t);
 int    psci_get_version(void);
