[PATCH v11 1/3] x86/boot: Skip video memory access in the decompressor for SEV-ES/SNP

2024-07-02 Thread Ashish Kalra
From: Ashish Kalra 

Accessing guest video memory/RAM in the decompressor causes guest
termination as the boot stage2 #VC handler for SEV-ES/SNP systems does
not support MMIO handling.

This issue is observed during a SEV-ES/SNP guest kexec as kexec -c adds
screen_info to the boot parameters passed to the second kernel, which
causes console output to be dumped to both video and serial.

As the decompressor output gets cleared really fast, it is preferable to
get the console output only on serial, hence, skip accessing the video
RAM during decompressor stage to prevent guest termination.

Serial console output during decompressor stage works as boot stage2 #VC
handler already supports handling port I/O.

  [ bp: Massage. ]

Suggested-by: Borislav Petkov (AMD) 
Suggested-by: Thomas Lendacky 
Signed-off-by: Ashish Kalra 
Signed-off-by: Borislav Petkov (AMD) 
Reviewed-by: Kuppuswamy Sathyanarayanan 

Reviewed-by: Tom Lendacky 
---
 arch/x86/boot/compressed/misc.c | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 944454306ef4..826b4d5cb1f0 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -385,6 +385,19 @@ static void parse_mem_encrypt(struct setup_header *hdr)
hdr->xloadflags |= XLF_MEM_ENCRYPTION;
 }
 
+static void early_sev_detect(void)
+{
+   /*
+* Accessing video memory causes guest termination because
+* the boot stage2 #VC handler of SEV-ES/SNP guests does not
+* support MMIO handling and kexec -c adds screen_info to the
+* boot parameters passed to the kexec kernel, which causes
+* console output to be dumped to both video and serial.
+*/
+   if (sev_status & MSR_AMD64_SEV_ES_ENABLED)
+   lines = cols = 0;
+}
+
 /*
  * The compressed kernel image (ZO), has been moved so that its position
  * is against the end of the buffer used to hold the uncompressed kernel
@@ -440,6 +453,8 @@ asmlinkage __visible void *extract_kernel(void *rmode, 
unsigned char *output)
 */
early_tdx_detect();
 
+   early_sev_detect();
+
console_init();
 
/*
-- 
2.34.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v11 3/3] x86/snp: Convert shared memory back to private on kexec

2024-07-02 Thread Ashish Kalra
From: Ashish Kalra 

SNP guests allocate shared buffers to perform I/O. It is done by
allocating pages normally from the buddy allocator and converting them
to shared with set_memory_decrypted().

The second, kexec-ed, kernel has no idea what memory is converted this way.
It only sees E820_TYPE_RAM.

Accessing shared memory via private mapping will cause unrecoverable RMP
page-faults.

On kexec, walk the direct mapping and convert all shared memory back to
private. This makes all RAM private again, and the second kernel may use
it normally. Additionally, for SNP guests, convert all bss decrypted
section pages back to private.

The conversion occurs in two steps: stopping new conversions and
unsharing all memory. In the case of normal kexec, the stopping of
conversions takes place while scheduling is still functioning. This
allows for waiting until any ongoing conversions are finished. The
second step is carried out when all CPUs except one are inactive and
interrupts are disabled. This prevents any conflicts with code that may
access shared memory.

Reviewed-by: Tom Lendacky 
Signed-off-by: Ashish Kalra 
---
 arch/x86/coco/sev/core.c  | 148 ++
 arch/x86/include/asm/sev.h|   4 +
 arch/x86/mm/mem_encrypt_amd.c |   2 +
 3 files changed, 154 insertions(+)

diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c
index 082d61d85dfc..0c90a8a74a88 100644
--- a/arch/x86/coco/sev/core.c
+++ b/arch/x86/coco/sev/core.c
@@ -1010,6 +1010,154 @@ void snp_accept_memory(phys_addr_t start, phys_addr_t 
end)
set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE);
 }
 
+static void set_pte_enc(pte_t *kpte, int level, void *va)
+{
+   unsigned long pfn;
+   pgprot_t new_prot;
+
+   prep_set_clr_pte_enc(kpte, level, 1, va, , NULL, NULL, _prot);
+   set_pte_enc_mask(kpte, pfn, new_prot);
+}
+
+static bool make_pte_private(pte_t *pte, unsigned long addr, int pages, int 
level)
+{
+   struct sev_es_runtime_data *data;
+   struct ghcb *ghcb;
+   int cpu;
+
+   /*
+* Ensure that all the per-cpu GHCBs are made private
+* at the end of unshared loop so that we continue to use the
+* optimized GHCB protocol and not force the switch to
+* MSR protocol till the very end.
+*/
+   for_each_possible_cpu(cpu) {
+   data = per_cpu(runtime_data, cpu);
+   ghcb = >ghcb_page;
+   /* Check for GHCB for being part of a PMD range */
+   if ((unsigned long)ghcb >= addr &&
+   (unsigned long)ghcb <= (addr + (pages * PAGE_SIZE)))
+   return true;
+   }
+
+   set_pte_enc(pte, level, (void *)addr);
+   snp_set_memory_private(addr, pages);
+
+   return true;
+}
+
+static void unshare_all_bss_decrypted_memory(void)
+{
+   unsigned long vaddr, vaddr_end;
+   unsigned int level;
+   unsigned int npages;
+   pte_t *pte;
+
+   vaddr = (unsigned long)__start_bss_decrypted;
+   vaddr_end = (unsigned long)__start_bss_decrypted_unused;
+   npages = (vaddr_end - vaddr) >> PAGE_SHIFT;
+   for (; vaddr < vaddr_end; vaddr += PAGE_SIZE) {
+   pte = lookup_address(vaddr, );
+   if (!pte || !pte_decrypted(*pte) || pte_none(*pte))
+   continue;
+
+   set_pte_enc(pte, level, (void *)vaddr);
+   }
+   vaddr = (unsigned long)__start_bss_decrypted;
+   snp_set_memory_private(vaddr, npages);
+}
+
+static void unshare_all_memory(void)
+{
+   unsigned long addr, end;
+
+   /*
+* Walk direct mapping and convert all shared memory back to private.
+*/
+
+   addr = PAGE_OFFSET;
+   end  = PAGE_OFFSET + get_max_mapped();
+
+   while (addr < end) {
+   unsigned long size;
+   unsigned int level;
+   pte_t *pte;
+
+   pte = lookup_address(addr, );
+   size = page_level_size(level);
+
+   if (pte && pte_decrypted(*pte) && !pte_none(*pte)) {
+   int pages = size / PAGE_SIZE;
+
+   if (!make_pte_private(pte, addr, pages, level)) {
+   pr_err("Failed to unshare range %#lx-%#lx\n",
+  addr, addr + size);
+   }
+   }
+   addr += size;
+   }
+
+   unshare_all_bss_decrypted_memory();
+
+   __flush_tlb_all();
+
+}
+
+/* Stop new private<->shared conversions */
+void snp_kexec_begin(void)
+{
+   if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+   return;
+
+   if (!IS_ENABLED(CONFIG_KEXEC_CORE))
+   return;
+   /*
+* Crash kernel reaches here with interrupts disabled: can't wait for
+* conversions to finish.
+*
+* If race happened, just report and proceed.
+*/
+   if (!set_memory_e

[PATCH v11 2/3] x86/mm: refactor __set_clr_pte_enc()

2024-07-02 Thread Ashish Kalra
From: Ashish Kalra 

Refactor __set_clr_pte_enc() and add two new helper functions to
set/clear PTE C-bit from early SEV/SNP initialization code and
later during normal system operations and shutdown/kexec.

Signed-off-by: Ashish Kalra 
---
 arch/x86/include/asm/sev.h|  9 +++
 arch/x86/mm/mem_encrypt_amd.c | 47 +--
 2 files changed, 48 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
index ac5886ce252e..4f3fd913aadb 100644
--- a/arch/x86/include/asm/sev.h
+++ b/arch/x86/include/asm/sev.h
@@ -348,6 +348,10 @@ u64 snp_get_unsupported_features(u64 status);
 u64 sev_get_status(void);
 void sev_show_status(void);
 void snp_update_svsm_ca(void);
+int prep_set_clr_pte_enc(pte_t *kpte, int level, int enc, void *va,
+unsigned long *ret_pfn, unsigned long *ret_pa,
+unsigned long *ret_size, pgprot_t *ret_new_prot);
+void set_pte_enc_mask(pte_t *kpte, unsigned long pfn, pgprot_t new_prot);
 
 #else  /* !CONFIG_AMD_MEM_ENCRYPT */
 
@@ -384,6 +388,11 @@ static inline u64 snp_get_unsupported_features(u64 status) 
{ return 0; }
 static inline u64 sev_get_status(void) { return 0; }
 static inline void sev_show_status(void) { }
 static inline void snp_update_svsm_ca(void) { }
+static inline int
+prep_set_clr_pte_enc(pte_t *kpte, int level, int enc, void *va,
+unsigned long *ret_pfn, unsigned long *ret_pa,
+unsigned long *ret_size, pgprot_t *ret_new_prot) { }
+static inline void set_pte_enc_mask(pte_t *kpte, unsigned long pfn, pgprot_t 
new_prot) { }
 
 #endif /* CONFIG_AMD_MEM_ENCRYPT */
 
diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c
index 86a476a426c2..42a35040aaf9 100644
--- a/arch/x86/mm/mem_encrypt_amd.c
+++ b/arch/x86/mm/mem_encrypt_amd.c
@@ -311,15 +311,16 @@ static int amd_enc_status_change_finish(unsigned long 
vaddr, int npages, bool en
return 0;
 }
 
-static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc)
+int prep_set_clr_pte_enc(pte_t *kpte, int level, int enc, void *va,
+unsigned long *ret_pfn, unsigned long *ret_pa,
+unsigned long *ret_size, pgprot_t *ret_new_prot)
 {
pgprot_t old_prot, new_prot;
unsigned long pfn, pa, size;
-   pte_t new_pte;
 
pfn = pg_level_to_pfn(level, kpte, _prot);
if (!pfn)
-   return;
+   return 1;
 
new_prot = old_prot;
if (enc)
@@ -329,7 +330,7 @@ static void __init __set_clr_pte_enc(pte_t *kpte, int 
level, bool enc)
 
/* If prot is same then do nothing. */
if (pgprot_val(old_prot) == pgprot_val(new_prot))
-   return;
+   return 1;
 
pa = pfn << PAGE_SHIFT;
size = page_level_size(level);
@@ -339,7 +340,39 @@ static void __init __set_clr_pte_enc(pte_t *kpte, int 
level, bool enc)
 * physical page attribute from C=1 to C=0 or vice versa. Flush the
 * caches to ensure that data gets accessed with the correct C-bit.
 */
-   clflush_cache_range(__va(pa), size);
+   if (va)
+   clflush_cache_range(va, size);
+   else
+   clflush_cache_range(__va(pa), size);
+
+   if (ret_new_prot)
+   *ret_new_prot = new_prot;
+   if (ret_size)
+   *ret_size = size;
+   if (ret_pfn)
+   *ret_pfn = pfn;
+   if (ret_pa)
+   *ret_pa = pa;
+
+   return 0;
+}
+
+void set_pte_enc_mask(pte_t *kpte, unsigned long pfn, pgprot_t new_prot)
+{
+   pte_t new_pte;
+
+   /* Change the page encryption mask. */
+   new_pte = pfn_pte(pfn, new_prot);
+   set_pte_atomic(kpte, new_pte);
+}
+
+static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc)
+{
+   unsigned long pfn, pa, size;
+   pgprot_t new_prot;
+
+   if (prep_set_clr_pte_enc(kpte, level, enc, NULL, , , , 
_prot))
+   return;
 
/* Encrypt/decrypt the contents in-place */
if (enc) {
@@ -354,9 +387,7 @@ static void __init __set_clr_pte_enc(pte_t *kpte, int 
level, bool enc)
early_snp_set_memory_shared((unsigned long)__va(pa), pa, 1);
}
 
-   /* Change the page encryption mask. */
-   new_pte = pfn_pte(pfn, new_prot);
-   set_pte_atomic(kpte, new_pte);
+   set_pte_enc_mask(kpte, pfn, new_prot);
 
/*
 * If page is set encrypted in the page table, then update the RMP 
table to
-- 
2.34.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v11 0/3] x86/snp: Add kexec support

2024-07-02 Thread Ashish Kalra
From: Ashish Kalra 

The patchset adds bits and pieces to get kexec (and crashkernel) work on
SNP guest.

This patchset requires the following fix for preventing EFI memory map
corruption while doing SNP guest kexec:
  
https://lore.kernel.org/all/16131a10-b473-41cc-a96e-d71a4d930...@amd.com/T/#m77f2f33f5521d1369b0e8d461802b99005b4ffd6

The series is based off and tested against tree:
  https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git



v11:
- Refactored __set_clr_pte_enc() and added two new helper functions to
  set/clear PTE C-bit from early SEV/SNP initialization code and
  later during normal system operations and shutdown/kexec.
- Removed kexec_last_addr_to_make_private and now skip per-cpu
  GHCB addresses when making all pages private and then after 
  converting all pages to private in snp_kexec_finish(), go over
  the per-cpu GHCB addresses and convert them to private explicitly.
- Fixed comments and commit logs as per upstream review.

v10:
- Removed pr_debug() calls as per upstream review feedback.
- Add review tags.

v9:
- Rebased onto current tip/master;
- Rebased on top of [PATCH] x86/sev: Move SEV compilation units 
  and uses the coco directory hierarchy for SEV guest kexec patches.
- Includes the above mentioned patch as part of this patch-set to
  fix any kernel test robot/build issues.
- Includes the massaged version of patch 2/3 as per upstream
  review/feedback.

v8:
- removed fix EFI memory map corruption with kexec patch as this
  is a use-after-free bug that is not specific to SNP/TDX or kexec
  and a generic fix for the same has been posted. 
- Add new early_sev_detect() and move detection of SEV-ES/SNP guest
  and skip accessing video RAM during decompressor stage into
  this function as per feedback from upstream review.

v7:
- Rebased onto current tip/master;
- Moved back to checking the md attribute instead of checking the
  efi_setup for detecting if running under kexec kernel as 
  suggested in upstream review feedback.

v6:
- Updated and restructured the commit message for patch 1/3 to
  explain the issue in detail.
- Updated inline comments in patch 1/3 to explain the issue in 
  detail.
- Moved back to checking efi_setup for detecting if running
  under kexec kernel.

v5:
- Removed sev_es_enabled() function and using sev_status directly to
  check for SEV-ES/SEV-SNP guest.
- used --base option to generate patches to specify Kirill's TDX guest
  kexec patches as prerequisite patches to fix kernel test robot
  build errors.

v4:
- Rebased to current tip/master.
- Reviewed-bys from Sathya.
- Remove snp_kexec_unprep_rom_memory() as it is not needed any more as 
  SEV-SNP code is not validating the ROM range in probe_roms() anymore.
- Fix kernel test robot build error/warnings.

v3:
- Rebased;
- moved Keep page tables that maps E820_TYPE_ACPI patch to Kirill's tdx
  guest kexec patch series.
- checking the md attribute instead of checking the efi_setup for
  detecting if running under kexec kernel.
- added new sev_es_enabled() function.
- skip video memory access in decompressor for SEV-ES/SNP systems to 
  prevent guest termination as boot stage2 #VC handler does not handle
  MMIO.

v2:
- address zeroing of unaccepted memory table mappings at all page table levels
  adding phys_pte_init(), phys_pud_init() and phys_p4d_init().
- include skip efi_arch_mem_reserve() in case of kexec as part of this 
  patch set.
- rename last_address_shd_kexec to a more appropriate 
  kexec_last_address_to_make_private.
- remove duplicate code shared with TDX and use common interfaces
  defined for SNP and TDX for kexec/kdump.
- remove set_pte_enc() dependency on pg_level_to_pfn() and make the 
  function simpler.
- rename unshare_pte() to make_pte_private().
- clarify and make the comment for using kexec_last_address_to_make_private  
  more understandable.
- general cleanup. 

Ashish Kalra (3):
  x86/boot: Skip video memory access in the decompressor for SEV-ES/SNP
  x86/mm: refactor __set_clr_pte_enc()
  x86/snp: Convert shared memory back to private on kexec

 arch/x86/boot/compressed/misc.c |  15 
 arch/x86/coco/sev/core.c| 148 
 arch/x86/include/asm/sev.h  |  13 +++
 arch/x86/mm/mem_encrypt_amd.c   |  49 +--
 4 files changed, 217 insertions(+), 8 deletions(-)

-- 
2.34.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v10 2/2] Subject: [PATCH v9 3/3] x86/snp: Convert shared memory back to private on kexec

2024-06-24 Thread Ashish Kalra
From: Ashish Kalra 

SNP guests allocate shared buffers to perform I/O. It is done by
allocating pages normally from the buddy allocator and converting them
to shared with set_memory_decrypted().

The second kernel has no idea what memory is converted this way. It only
sees E820_TYPE_RAM.

Accessing shared memory via private mapping will cause unrecoverable RMP
page-faults.

On kexec, walk the direct mapping and convert all shared memory back to
private. This makes all RAM private again, and the second kernel may use
it normally. Additionally, for SNP guests, convert all bss decrypted
section pages back to private.

The conversion occurs in two steps: stopping new conversions and
unsharing all memory. In the case of normal kexec, the stopping of
conversions takes place while scheduling is still functioning. This
allows for waiting until any ongoing conversions are finished. The
second step is carried out when all CPUs except one are inactive and
interrupts are disabled. This prevents any conflicts with code that may
access shared memory.

Reviewed-by: Tom Lendacky 
Signed-off-by: Ashish Kalra 
---
 arch/x86/coco/sev/core.c  | 166 ++
 arch/x86/include/asm/sev.h|   4 +
 arch/x86/mm/mem_encrypt_amd.c |   2 +
 3 files changed, 172 insertions(+)

diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c
index 082d61d85dfc..9b405237f2c5 100644
--- a/arch/x86/coco/sev/core.c
+++ b/arch/x86/coco/sev/core.c
@@ -42,6 +42,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #define DR7_RESET_VALUE0x400
 
@@ -92,6 +94,9 @@ static struct ghcb *boot_ghcb __section(".data");
 /* Bitmap of SEV features supported by the hypervisor */
 static u64 sev_hv_features __ro_after_init;
 
+/* Last address to be switched to private during kexec */
+static unsigned long kexec_last_addr_to_make_private;
+
 /* #VC handler runtime per-CPU data */
 struct sev_es_runtime_data {
struct ghcb ghcb_page;
@@ -1010,6 +1015,167 @@ void snp_accept_memory(phys_addr_t start, phys_addr_t 
end)
set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE);
 }
 
+static bool set_pte_enc(pte_t *kpte, int level, void *va)
+{
+   pte_t new_pte;
+
+   if (pte_none(*kpte))
+   return false;
+
+   /*
+* Change the physical page attribute from C=0 to C=1. Flush the
+* caches to ensure that data gets accessed with the correct C-bit.
+*/
+   if (pte_present(*kpte))
+   clflush_cache_range(va, page_level_size(level));
+
+   new_pte = __pte(cc_mkenc(pte_val(*kpte)));
+   set_pte_atomic(kpte, new_pte);
+
+   return true;
+}
+
+static bool make_pte_private(pte_t *pte, unsigned long addr, int pages, int 
level)
+{
+   struct sev_es_runtime_data *data;
+   struct ghcb *ghcb;
+
+   data = this_cpu_read(runtime_data);
+   ghcb = >ghcb_page;
+
+   /* Check for GHCB for being part of a PMD range. */
+   if ((unsigned long)ghcb >= addr &&
+   (unsigned long)ghcb <= (addr + (pages * PAGE_SIZE))) {
+   /*
+* Ensure that the current cpu's GHCB is made private
+* at the end of unshared loop so that we continue to use the
+* optimized GHCB protocol and not force the switch to
+* MSR protocol till the very end.
+*/
+   kexec_last_addr_to_make_private = addr;
+   return true;
+   }
+
+   if (!set_pte_enc(pte, level, (void *)addr))
+   return false;
+
+   snp_set_memory_private(addr, pages);
+
+   return true;
+}
+
+static void unshare_all_memory(void)
+{
+   unsigned long addr, end;
+
+   /*
+* Walk direct mapping and convert all shared memory back to private,
+*/
+
+   addr = PAGE_OFFSET;
+   end  = PAGE_OFFSET + get_max_mapped();
+
+   while (addr < end) {
+   unsigned long size;
+   unsigned int level;
+   pte_t *pte;
+
+   pte = lookup_address(addr, );
+   size = page_level_size(level);
+
+   /*
+* pte_none() check is required to skip physical memory holes 
in direct mapped.
+*/
+   if (pte && pte_decrypted(*pte) && !pte_none(*pte)) {
+   int pages = size / PAGE_SIZE;
+
+   if (!make_pte_private(pte, addr, pages, level)) {
+   pr_err("Failed to unshare range %#lx-%#lx\n",
+  addr, addr + size);
+   }
+
+   }
+
+   addr += size;
+   }
+   __flush_tlb_all();
+
+}
+
+static void unshare_all_bss_decrypted_memory(void)
+{
+   unsigned long vaddr, vaddr_end;
+   unsigned int level;
+   unsigned int npages;
+   pte_t *pte;
+
+   vaddr = (unsigned long)__start_bss_decrypted;
+   vaddr_end = (un

[PATCH v10 0/2] x86/snp: Add kexec support

2024-06-24 Thread Ashish Kalra
From: Ashish Kalra 

The patchset adds bits and pieces to get kexec (and crashkernel) work on
SNP guest.

This patchset requires the following fix for preventing EFI memory map
corruption while doing SNP guest kexec:
  
https://lore.kernel.org/all/16131a10-b473-41cc-a96e-d71a4d930...@amd.com/T/#m77f2f33f5521d1369b0e8d461802b99005b4ffd6

The series is based off and tested against tree:
  https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git



v10:
- Removed pr_debug() calls as per upstream review feedback.
- Add review tags.

v9:
- Rebased onto current tip/master;
- Rebased on top of [PATCH] x86/sev: Move SEV compilation units 
  and uses the coco directory hierarchy for SEV guest kexec patches.
- Includes the above mentioned patch as part of this patch-set to
  fix any kernel test robot/build issues.
- Includes the massaged version of patch 2/3 as per upstream
  review/feedback.

v8:
- removed fix EFI memory map corruption with kexec patch as this
  is a use-after-free bug that is not specific to SNP/TDX or kexec
  and a generic fix for the same has been posted. 
- Add new early_sev_detect() and move detection of SEV-ES/SNP guest
  and skip accessing video RAM during decompressor stage into
  this function as per feedback from upstream review.

v7:
- Rebased onto current tip/master;
- Moved back to checking the md attribute instead of checking the
  efi_setup for detecting if running under kexec kernel as 
  suggested in upstream review feedback.

v6:
- Updated and restructured the commit message for patch 1/3 to
  explain the issue in detail.
- Updated inline comments in patch 1/3 to explain the issue in 
  detail.
- Moved back to checking efi_setup for detecting if running
  under kexec kernel.

v5:
- Removed sev_es_enabled() function and using sev_status directly to
  check for SEV-ES/SEV-SNP guest.
- used --base option to generate patches to specify Kirill's TDX guest
  kexec patches as prerequisite patches to fix kernel test robot
  build errors.

v4:
- Rebased to current tip/master.
- Reviewed-bys from Sathya.
- Remove snp_kexec_unprep_rom_memory() as it is not needed any more as 
  SEV-SNP code is not validating the ROM range in probe_roms() anymore.
- Fix kernel test robot build error/warnings.

v3:
- Rebased;
- moved Keep page tables that maps E820_TYPE_ACPI patch to Kirill's tdx
  guest kexec patch series.
- checking the md attribute instead of checking the efi_setup for
  detecting if running under kexec kernel.
- added new sev_es_enabled() function.
- skip video memory access in decompressor for SEV-ES/SNP systems to 
  prevent guest termination as boot stage2 #VC handler does not handle
  MMIO.

v2:
- address zeroing of unaccepted memory table mappings at all page table levels
  adding phys_pte_init(), phys_pud_init() and phys_p4d_init().
- include skip efi_arch_mem_reserve() in case of kexec as part of this 
  patch set.
- rename last_address_shd_kexec to a more appropriate 
  kexec_last_address_to_make_private.
- remove duplicate code shared with TDX and use common interfaces
  defined for SNP and TDX for kexec/kdump.
- remove set_pte_enc() dependency on pg_level_to_pfn() and make the 
  function simpler.
- rename unshare_pte() to make_pte_private().
- clarify and make the comment for using kexec_last_address_to_make_private  
  more understandable.
- general cleanup. 

Ashish Kalra (2):
  x86/boot: Skip video memory access in the decompressor for SEV-ES/SNP
  Subject: [PATCH v9 3/3] x86/snp: Convert shared memory back to private
on  kexec

 arch/x86/boot/compressed/misc.c |  15 +++
 arch/x86/coco/sev/core.c| 166 
 arch/x86/include/asm/sev.h  |   4 +
 arch/x86/mm/mem_encrypt_amd.c   |   2 +
 4 files changed, 187 insertions(+)

-- 
2.34.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v10 1/2] x86/boot: Skip video memory access in the decompressor for SEV-ES/SNP

2024-06-24 Thread Ashish Kalra
From: Ashish Kalra 

Accessing guest video memory/RAM in the decompressor causes guest
termination as the boot stage2 #VC handler for SEV-ES/SNP systems does
not support MMIO handling.

This issue is observed during a SEV-ES/SNP guest kexec as kexec -c adds
screen_info to the boot parameters passed to the second kernel, which
causes console output to be dumped to both video and serial.

As the decompressor output gets cleared really fast, it is preferable to
get the console output only on serial, hence, skip accessing the video
RAM during decompressor stage to prevent guest termination.

Serial console output during decompressor stage works as boot stage2 #VC
handler already supports handling port I/O.

  [ bp: Massage. ]

Suggested-by: Borislav Petkov (AMD) 
Suggested-by: Thomas Lendacky 
Signed-off-by: Ashish Kalra 
Signed-off-by: Borislav Petkov (AMD) 
Reviewed-by: Kuppuswamy Sathyanarayanan 

Reviewed-by: Tom Lendacky 
---
 arch/x86/boot/compressed/misc.c | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 944454306ef4..826b4d5cb1f0 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -385,6 +385,19 @@ static void parse_mem_encrypt(struct setup_header *hdr)
hdr->xloadflags |= XLF_MEM_ENCRYPTION;
 }
 
+static void early_sev_detect(void)
+{
+   /*
+* Accessing video memory causes guest termination because
+* the boot stage2 #VC handler of SEV-ES/SNP guests does not
+* support MMIO handling and kexec -c adds screen_info to the
+* boot parameters passed to the kexec kernel, which causes
+* console output to be dumped to both video and serial.
+*/
+   if (sev_status & MSR_AMD64_SEV_ES_ENABLED)
+   lines = cols = 0;
+}
+
 /*
  * The compressed kernel image (ZO), has been moved so that its position
  * is against the end of the buffer used to hold the uncompressed kernel
@@ -440,6 +453,8 @@ asmlinkage __visible void *extract_kernel(void *rmode, 
unsigned char *output)
 */
early_tdx_detect();
 
+   early_sev_detect();
+
console_init();
 
/*
-- 
2.34.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v9 2/3] x86/boot: Skip video memory access in the decompressor for SEV-ES/SNP

2024-06-20 Thread Ashish Kalra
From: Ashish Kalra 

Accessing guest video memory/RAM in the decompressor causes guest
termination as the boot stage2 #VC handler for SEV-ES/SNP systems does
not support MMIO handling.

This issue is observed during a SEV-ES/SNP guest kexec as kexec -c adds
screen_info to the boot parameters passed to the second kernel, which
causes console output to be dumped to both video and serial.

As the decompressor output gets cleared really fast, it is preferable to
get the console output only on serial, hence, skip accessing the video
RAM during decompressor stage to prevent guest termination.

Serial console output during decompressor stage works as boot stage2 #VC
handler already supports handling port I/O.

  [ bp: Massage. ]

Suggested-by: Borislav Petkov (AMD) 
Suggested-by: Thomas Lendacky 
Signed-off-by: Ashish Kalra 
Signed-off-by: Borislav Petkov (AMD) 
Reviewed-by: Kuppuswamy Sathyanarayanan 

---
 arch/x86/boot/compressed/misc.c | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 944454306ef4..826b4d5cb1f0 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -385,6 +385,19 @@ static void parse_mem_encrypt(struct setup_header *hdr)
hdr->xloadflags |= XLF_MEM_ENCRYPTION;
 }
 
+static void early_sev_detect(void)
+{
+   /*
+* Accessing video memory causes guest termination because
+* the boot stage2 #VC handler of SEV-ES/SNP guests does not
+* support MMIO handling and kexec -c adds screen_info to the
+* boot parameters passed to the kexec kernel, which causes
+* console output to be dumped to both video and serial.
+*/
+   if (sev_status & MSR_AMD64_SEV_ES_ENABLED)
+   lines = cols = 0;
+}
+
 /*
  * The compressed kernel image (ZO), has been moved so that its position
  * is against the end of the buffer used to hold the uncompressed kernel
@@ -440,6 +453,8 @@ asmlinkage __visible void *extract_kernel(void *rmode, 
unsigned char *output)
 */
early_tdx_detect();
 
+   early_sev_detect();
+
console_init();
 
/*
-- 
2.34.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v9 3/3] x86/snp: Convert shared memory back to private on kexec

2024-06-20 Thread Ashish Kalra
From: Ashish Kalra 

SNP guests allocate shared buffers to perform I/O. It is done by
allocating pages normally from the buddy allocator and converting them
to shared with set_memory_decrypted().

The second kernel has no idea what memory is converted this way. It only
sees E820_TYPE_RAM.

Accessing shared memory via private mapping will cause unrecoverable RMP
page-faults.

On kexec, walk the direct mapping and convert all shared memory back to
private. This makes all RAM private again, and the second kernel may use
it normally. Additionally, for SNP guests, convert all bss decrypted
section pages back to private.

The conversion occurs in two steps: stopping new conversions and
unsharing all memory. In the case of normal kexec, the stopping of
conversions takes place while scheduling is still functioning. This
allows for waiting until any ongoing conversions are finished. The
second step is carried out when all CPUs except one are inactive and
interrupts are disabled. This prevents any conflicts with code that may
access shared memory.

Signed-off-by: Ashish Kalra 
---
 arch/x86/coco/sev/core.c  | 168 ++
 arch/x86/include/asm/sev.h|   4 +
 arch/x86/mm/mem_encrypt_amd.c |   2 +
 3 files changed, 174 insertions(+)

diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c
index 082d61d85dfc..0ce96123b684 100644
--- a/arch/x86/coco/sev/core.c
+++ b/arch/x86/coco/sev/core.c
@@ -42,6 +42,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #define DR7_RESET_VALUE0x400
 
@@ -92,6 +94,9 @@ static struct ghcb *boot_ghcb __section(".data");
 /* Bitmap of SEV features supported by the hypervisor */
 static u64 sev_hv_features __ro_after_init;
 
+/* Last address to be switched to private during kexec */
+static unsigned long kexec_last_addr_to_make_private;
+
 /* #VC handler runtime per-CPU data */
 struct sev_es_runtime_data {
struct ghcb ghcb_page;
@@ -1010,6 +1015,169 @@ void snp_accept_memory(phys_addr_t start, phys_addr_t 
end)
set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE);
 }
 
+static bool set_pte_enc(pte_t *kpte, int level, void *va)
+{
+   pte_t new_pte;
+
+   if (pte_none(*kpte))
+   return false;
+
+   /*
+* Change the physical page attribute from C=0 to C=1. Flush the
+* caches to ensure that data gets accessed with the correct C-bit.
+*/
+   if (pte_present(*kpte))
+   clflush_cache_range(va, page_level_size(level));
+
+   new_pte = __pte(cc_mkenc(pte_val(*kpte)));
+   set_pte_atomic(kpte, new_pte);
+
+   return true;
+}
+
+static bool make_pte_private(pte_t *pte, unsigned long addr, int pages, int 
level)
+{
+   struct sev_es_runtime_data *data;
+   struct ghcb *ghcb;
+
+   data = this_cpu_read(runtime_data);
+   ghcb = >ghcb_page;
+
+   /* Check if the GHCB is part of a PMD range. */
+   if ((unsigned long)ghcb >= addr &&
+   (unsigned long)ghcb <= (addr + (pages * PAGE_SIZE))) {
+   /*
+* Ensure that the current cpu's GHCB is made private
+* at the end of unshared loop so that we continue to use the
+* optimized GHCB protocol and not force the switch to
+* MSR protocol till the very end.
+*/
+   pr_debug("setting boot_ghcb to NULL for this cpu ghcb\n");
+   kexec_last_addr_to_make_private = addr;
+   return true;
+   }
+
+   if (!set_pte_enc(pte, level, (void *)addr))
+   return false;
+
+   snp_set_memory_private(addr, pages);
+
+   return true;
+}
+
+static void unshare_all_memory(void)
+{
+   unsigned long addr, end;
+
+   /*
+* Walk the direct mapping and convert all shared memory back to private.
+*/
+
+   addr = PAGE_OFFSET;
+   end  = PAGE_OFFSET + get_max_mapped();
+
+   while (addr < end) {
+   unsigned long size;
+   unsigned int level;
+   pte_t *pte;
+
+   pte = lookup_address(addr, );
+   size = page_level_size(level);
+
+   /*
+* pte_none() check is required to skip physical memory holes in the direct mapping.
+*/
+   if (pte && pte_decrypted(*pte) && !pte_none(*pte)) {
+   int pages = size / PAGE_SIZE;
+
+   if (!make_pte_private(pte, addr, pages, level)) {
+   pr_err("Failed to unshare range %#lx-%#lx\n",
+  addr, addr + size);
+   }
+
+   }
+
+   addr += size;
+   }
+   __flush_tlb_all();
+
+}
+
+static void unshare_all_bss_decrypted_memory(void)
+{
+   unsigned long vaddr, vaddr_end;
+   unsigned int level;
+   unsigned int npages;
+   pte_t *pte;
+
+   vaddr = (

[PATCH v9 1/3] x86/sev: Move SEV compilation units

2024-06-20 Thread Ashish Kalra
From: "Borislav Petkov (AMD)" 

From: "Borislav Petkov (AMD)" 

A long time ago we said that we're going to move the coco stuff where it
belongs

  https://lore.kernel.org/all/yg5nh1rknprwi...@zn.tnic

and not keep it in arch/x86/kernel. TDX did that and SEV can't find time
to do so. So lemme do it. If people have trouble converting their
ongoing featuritis patches, ask me for a sed script.

No functional changes.

Cc: Ashish Kalra 
Cc: Joerg Roedel 
Cc: Michael Roth 
Cc: Nikunj A Dadhania 
Cc: Tom Lendacky 
Signed-off-by: Borislav Petkov (AMD) 
---
 arch/x86/boot/compressed/sev.c  | 2 +-
 arch/x86/coco/Makefile  | 1 +
 arch/x86/coco/sev/Makefile  | 3 +++
 arch/x86/{kernel/sev.c => coco/sev/core.c}  | 2 +-
 arch/x86/{kernel/sev-shared.c => coco/sev/shared.c} | 0
 arch/x86/kernel/Makefile| 2 --
 6 files changed, 6 insertions(+), 4 deletions(-)
 create mode 100644 arch/x86/coco/sev/Makefile
 rename arch/x86/{kernel/sev.c => coco/sev/core.c} (99%)
 rename arch/x86/{kernel/sev-shared.c => coco/sev/shared.c} (100%)

diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
index 697057250faa..cd44e120fe53 100644
--- a/arch/x86/boot/compressed/sev.c
+++ b/arch/x86/boot/compressed/sev.c
@@ -127,7 +127,7 @@ static bool fault_in_kernel_space(unsigned long address)
 #include "../../lib/insn.c"
 
 /* Include code for early handlers */
-#include "../../kernel/sev-shared.c"
+#include "../../coco/sev/shared.c"
 
 static struct svsm_ca *svsm_get_caa(void)
 {
diff --git a/arch/x86/coco/Makefile b/arch/x86/coco/Makefile
index c816acf78b6a..eabdc7486538 100644
--- a/arch/x86/coco/Makefile
+++ b/arch/x86/coco/Makefile
@@ -6,3 +6,4 @@ CFLAGS_core.o   += -fno-stack-protector
 obj-y += core.o
 
 obj-$(CONFIG_INTEL_TDX_GUEST)  += tdx/
+obj-$(CONFIG_AMD_MEM_ENCRYPT)   += sev/
diff --git a/arch/x86/coco/sev/Makefile b/arch/x86/coco/sev/Makefile
new file mode 100644
index ..b89ba3fba343
--- /dev/null
+++ b/arch/x86/coco/sev/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-y += core.o
diff --git a/arch/x86/kernel/sev.c b/arch/x86/coco/sev/core.c
similarity index 99%
rename from arch/x86/kernel/sev.c
rename to arch/x86/coco/sev/core.c
index 726d9df505e7..082d61d85dfc 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/coco/sev/core.c
@@ -613,7 +613,7 @@ static __always_inline void vc_forward_exception(struct 
es_em_ctxt *ctxt)
 }
 
 /* Include code shared with pre-decompression boot stage */
-#include "sev-shared.c"
+#include "shared.c"
 
 static inline struct svsm_ca *svsm_get_caa(void)
 {
diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/coco/sev/shared.c
similarity index 100%
rename from arch/x86/kernel/sev-shared.c
rename to arch/x86/coco/sev/shared.c
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 20a0dd51700a..b22ceb9fdf57 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -142,8 +142,6 @@ obj-$(CONFIG_UNWINDER_ORC)  += unwind_orc.o
 obj-$(CONFIG_UNWINDER_FRAME_POINTER)   += unwind_frame.o
 obj-$(CONFIG_UNWINDER_GUESS)   += unwind_guess.o
 
-obj-$(CONFIG_AMD_MEM_ENCRYPT)  += sev.o
-
 obj-$(CONFIG_CFI_CLANG)+= cfi.o
 
 obj-$(CONFIG_CALL_THUNKS)  += callthunks.o
-- 
2.34.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v9 0/3] x86/snp: Add kexec support

2024-06-20 Thread Ashish Kalra
From: Ashish Kalra 

The patchset adds bits and pieces to get kexec (and crashkernel) work on
SNP guest.

This patchset requires the following fix for preventing EFI memory map
corruption while doing SNP guest kexec:
  
https://lore.kernel.org/all/16131a10-b473-41cc-a96e-d71a4d930...@amd.com/T/#m77f2f33f5521d1369b0e8d461802b99005b4ffd6

The series is based off and tested against tree:
  https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git



v9:
- Rebased onto current tip/master;
- Rebased on top of [PATCH] x86/sev: Move SEV compilation units 
  and uses the coco directory hierarchy for SEV guest kexec patches.
- Includes the above mentioned patch as part of this patch-set to
  fix any kernel test robot/build issues.
- Includes the massaged version of patch 2/3 as per upstream
  review/feedback.

v8:
- removed fix EFI memory map corruption with kexec patch as this
  is a use-after-free bug that is not specific to SNP/TDX or kexec
  and a generic fix for the same has been posted. 
- Add new early_sev_detect() and move detection of SEV-ES/SNP guest
  and skip accessing video RAM during decompressor stage into
  this function as per feedback from upstream review.

v7:
- Rebased onto current tip/master;
- Moved back to checking the md attribute instead of checking the
  efi_setup for detecting if running under kexec kernel as 
  suggested in upstream review feedback.

v6:
- Updated and restructured the commit message for patch 1/3 to
  explain the issue in detail.
- Updated inline comments in patch 1/3 to explain the issue in 
  detail.
- Moved back to checking efi_setup for detecting if running
  under kexec kernel.

v5:
- Removed sev_es_enabled() function and using sev_status directly to
  check for SEV-ES/SEV-SNP guest.
- used --base option to generate patches to specify Kirill's TDX guest
  kexec patches as prerequisite patches to fix kernel test robot
  build errors.

v4:
- Rebased to current tip/master.
- Reviewed-bys from Sathya.
- Remove snp_kexec_unprep_rom_memory() as it is not needed any more as 
  SEV-SNP code is not validating the ROM range in probe_roms() anymore.
- Fix kernel test robot build error/warnings.

v3:
- Rebased;
- moved Keep page tables that maps E820_TYPE_ACPI patch to Kirill's tdx
  guest kexec patch series.
- checking the md attribute instead of checking the efi_setup for
  detecting if running under kexec kernel.
- added new sev_es_enabled() function.
- skip video memory access in decompressor for SEV-ES/SNP systems to 
  prevent guest termination as boot stage2 #VC handler does not handle
  MMIO.

v2:
- address zeroing of unaccepted memory table mappings at all page table levels
  adding phys_pte_init(), phys_pud_init() and phys_p4d_init().
- include skip efi_arch_mem_reserve() in case of kexec as part of this 
  patch set.
- rename last_address_shd_kexec to a more appropriate 
  kexec_last_address_to_make_private.
- remove duplicate code shared with TDX and use common interfaces
  defined for SNP and TDX for kexec/kdump.
- remove set_pte_enc() dependency on pg_level_to_pfn() and make the 
  function simpler.
- rename unshare_pte() to make_pte_private().
- clarify and make the comment for using kexec_last_address_to_make_private  
  more understandable.
- general cleanup. 

Ashish Kalra (2):
  x86/boot: Skip video memory access in the decompressor for SEV-ES/SNP
  x86/snp: Convert shared memory back to private on kexec

Borislav Petkov (AMD) (1):
  x86/sev: Move SEV compilation units

 arch/x86/boot/compressed/misc.c   |  15 ++
 arch/x86/boot/compressed/sev.c|   2 +-
 arch/x86/coco/Makefile|   1 +
 arch/x86/coco/sev/Makefile|   3 +
 arch/x86/{kernel/sev.c => coco/sev/core.c}| 170 +-
 .../sev-shared.c => coco/sev/shared.c}|   0
 arch/x86/include/asm/sev.h|   4 +
 arch/x86/kernel/Makefile  |   2 -
 arch/x86/mm/mem_encrypt_amd.c |   2 +
 9 files changed, 195 insertions(+), 4 deletions(-)
 create mode 100644 arch/x86/coco/sev/Makefile
 rename arch/x86/{kernel/sev.c => coco/sev/core.c} (93%)
 rename arch/x86/{kernel/sev-shared.c => coco/sev/shared.c} (100%)

-- 
2.34.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v8 2/2] x86/snp: Convert shared memory back to private on kexec

2024-06-17 Thread Ashish Kalra
From: Ashish Kalra 

SNP guests allocate shared buffers to perform I/O. It is done by
allocating pages normally from the buddy allocator and converting them
to shared with set_memory_decrypted().

The second kernel has no idea what memory is converted this way. It only
sees E820_TYPE_RAM.

Accessing shared memory via private mapping will cause unrecoverable RMP
page-faults.

On kexec walk direct mapping and convert all shared memory back to
private. It makes all RAM private again and second kernel may use it
normally. Additionally for SNP guests convert all bss decrypted section
pages back to private.

The conversion occurs in two steps: stopping new conversions and
unsharing all memory. In the case of normal kexec, the stopping of
conversions takes place while scheduling is still functioning. This
allows for waiting until any ongoing conversions are finished. The
second step is carried out when all CPUs except one are inactive and
interrupts are disabled. This prevents any conflicts with code that may
access shared memory.

Signed-off-by: Ashish Kalra 
---
 arch/x86/include/asm/sev.h|   4 +
 arch/x86/kernel/sev.c | 168 ++
 arch/x86/mm/mem_encrypt_amd.c |   3 +
 3 files changed, 175 insertions(+)

diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
index ca20cc4e5826..68c08458bb87 100644
--- a/arch/x86/include/asm/sev.h
+++ b/arch/x86/include/asm/sev.h
@@ -229,6 +229,8 @@ void snp_accept_memory(phys_addr_t start, phys_addr_t end);
 u64 snp_get_unsupported_features(u64 status);
 u64 sev_get_status(void);
 void sev_show_status(void);
+void snp_kexec_finish(void);
+void snp_kexec_begin(void);
 #else
 static inline void sev_es_ist_enter(struct pt_regs *regs) { }
 static inline void sev_es_ist_exit(void) { }
@@ -258,6 +260,8 @@ static inline void snp_accept_memory(phys_addr_t start, 
phys_addr_t end) { }
 static inline u64 snp_get_unsupported_features(u64 status) { return 0; }
 static inline u64 sev_get_status(void) { return 0; }
 static inline void sev_show_status(void) { }
+static inline void snp_kexec_finish(void) { }
+static inline void snp_kexec_begin(void) { }
 #endif
 
 #ifdef CONFIG_KVM_AMD_SEV
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index 3342ed58e168..ff2f385642e2 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -42,6 +42,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #define DR7_RESET_VALUE0x400
 
@@ -92,6 +94,9 @@ static struct ghcb *boot_ghcb __section(".data");
 /* Bitmap of SEV features supported by the hypervisor */
 static u64 sev_hv_features __ro_after_init;
 
+/* Last address to be switched to private during kexec */
+static unsigned long kexec_last_addr_to_make_private;
+
 /* #VC handler runtime per-CPU data */
 struct sev_es_runtime_data {
struct ghcb ghcb_page;
@@ -913,6 +918,169 @@ void snp_accept_memory(phys_addr_t start, phys_addr_t end)
set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE);
 }
 
+static bool set_pte_enc(pte_t *kpte, int level, void *va)
+{
+   pte_t new_pte;
+
+   if (pte_none(*kpte))
+   return false;
+
+   /*
+* Change the physical page attribute from C=0 to C=1. Flush the
+* caches to ensure that data gets accessed with the correct C-bit.
+*/
+   if (pte_present(*kpte))
+   clflush_cache_range(va, page_level_size(level));
+
+   new_pte = __pte(cc_mkenc(pte_val(*kpte)));
+   set_pte_atomic(kpte, new_pte);
+
+   return true;
+}
+
+static bool make_pte_private(pte_t *pte, unsigned long addr, int pages, int 
level)
+{
+   struct sev_es_runtime_data *data;
+   struct ghcb *ghcb;
+
+   data = this_cpu_read(runtime_data);
+   ghcb = >ghcb_page;
+
+   /* Check if the GHCB is part of a PMD range. */
+   if ((unsigned long)ghcb >= addr &&
+   (unsigned long)ghcb <= (addr + (pages * PAGE_SIZE))) {
+   /*
+* Ensure that the current cpu's GHCB is made private
+* at the end of unshared loop so that we continue to use the
+* optimized GHCB protocol and not force the switch to
+* MSR protocol till the very end.
+*/
+   pr_debug("setting boot_ghcb to NULL for this cpu ghcb\n");
+   kexec_last_addr_to_make_private = addr;
+   return true;
+   }
+
+   if (!set_pte_enc(pte, level, (void *)addr))
+   return false;
+
+   snp_set_memory_private(addr, pages);
+
+   return true;
+}
+
+static void unshare_all_memory(void)
+{
+   unsigned long addr, end;
+
+   /*
+* Walk direct mapping and convert all shared memory back to private,
+*/
+
+   addr = PAGE_OFFSET;
+   end  = PAGE_OFFSET + get_max_mapped();
+
+   while (addr < end) {
+   unsigned long size;
+   unsigned int level;
+

[PATCH v8 1/2] x86/boot/compressed: Skip Video Memory access in Decompressor for SEV-ES/SNP.

2024-06-17 Thread Ashish Kalra
From: Ashish Kalra 

Accessing guest video memory/RAM during kernel decompressor
causes guest termination as boot stage2 #VC handler for
SEV-ES/SNP systems does not support MMIO handling.

This issue is observed with SEV-ES/SNP guest kexec as
kexec -c adds screen_info to the boot parameters
passed to the kexec kernel, which causes console output to
be dumped to both video and serial.

As the decompressor output gets cleared really fast, it is
preferable to get the console output only on serial, hence,
skip accessing video RAM during decompressor stage to
prevent guest termination.

Add early_sev_detect() to detect SEV-ES/SNP guest and skip
accessing video RAM during decompressor stage.

Serial console output during decompressor stage works as
boot stage2 #VC handler already supports handling port I/O.

Suggested-by: Borislav Petkov 
Suggested-by: Thomas Lendacky 
Signed-off-by: Ashish Kalra 
Reviewed-by: Kuppuswamy Sathyanarayanan 

---
 arch/x86/boot/compressed/misc.c | 23 +++
 1 file changed, 23 insertions(+)

diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index b70e4a21c15f..bad924f20a3a 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -385,6 +385,27 @@ static void parse_mem_encrypt(struct setup_header *hdr)
hdr->xloadflags |= XLF_MEM_ENCRYPTION;
 }
 
+static void early_sev_detect(void)
+{
+   /*
+* Accessing guest video memory/RAM during kernel decompressor
+* causes guest termination as boot stage2 #VC handler for
+* SEV-ES/SNP systems does not support MMIO handling.
+*
+* This issue is observed with SEV-ES/SNP guest kexec as
+* kexec -c adds screen_info to the boot parameters
+* passed to the kexec kernel, which causes console output to
+* be dumped to both video and serial.
+*
+* As the decompressor output gets cleared really fast, it is
+* preferable to get the console output only on serial, hence,
+* skip accessing video RAM during decompressor stage to
+* prevent guest termination.
+*/
+   if (sev_status & MSR_AMD64_SEV_ES_ENABLED)
+   lines = cols = 0;
+}
+
 /*
  * The compressed kernel image (ZO), has been moved so that its position
  * is against the end of the buffer used to hold the uncompressed kernel
@@ -440,6 +461,8 @@ asmlinkage __visible void *extract_kernel(void *rmode, 
unsigned char *output)
 */
early_tdx_detect();
 
+   early_sev_detect();
+
console_init();
 
/*
-- 
2.34.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v8 0/2] x86/snp: Add kexec support

2024-06-17 Thread Ashish Kalra
From: Ashish Kalra 

The patchset adds bits and pieces to get kexec (and crashkernel) work on
SNP guest.

This patchset requires the following fix for preventing EFI memory map
corruption while doing SNP guest kexec:
https://lore.kernel.org/all/16131a10-b473-41cc-a96e-d71a4d930...@amd.com/T/#m77f2f33f5521d1369b0e8d461802b99005b4ffd6

The series is based off of and tested against Kirill Shutemov's tree:
  https://github.com/intel/tdx.git guest-kexec



v8:
- removed fix EFI memory map corruption with kexec patch as this
  is a use-after-free bug that is not specific to SNP/TDX or kexec
  and a generic fix for the same has been posted. 
- Add new early_sev_detect() and move detection of SEV-ES/SNP guest
  and skip accessing video RAM during decompressor stage into
  this function as per feedback from upstream review.

v7:
- Rebased onto current tip/master;
- Moved back to checking the md attribute instead of checking the
  efi_setup for detecting if running under kexec kernel as 
  suggested in upstream review feedback.

v6:
- Updated and restructured the commit message for patch 1/3 to
  explain the issue in detail.
- Updated inline comments in patch 1/3 to explain the issue in 
  detail.
- Moved back to checking efi_setup for detecting if running
  under kexec kernel.

v5:
- Removed sev_es_enabled() function and using sev_status directly to
  check for SEV-ES/SEV-SNP guest.
- used --base option to generate patches to specify Kirill's TDX guest
  kexec patches as prerequisite patches to fix kernel test robot
  build errors.

v4:
- Rebased to current tip/master.
- Reviewed-bys from Sathya.
- Remove snp_kexec_unprep_rom_memory() as it is not needed any more as 
  SEV-SNP code is not validating the ROM range in probe_roms() anymore.
- Fix kernel test robot build error/warnings.

v3:
- Rebased;
- moved Keep page tables that maps E820_TYPE_ACPI patch to Kirill's tdx
  guest kexec patch series.
- checking the md attribute instead of checking the efi_setup for
  detecting if running under kexec kernel.
- added new sev_es_enabled() function.
- skip video memory access in decompressor for SEV-ES/SNP systems to 
  prevent guest termination as boot stage2 #VC handler does not handle
  MMIO.

v2:
- address zeroing of unaccepted memory table mappings at all page table levels
  adding phys_pte_init(), phys_pud_init() and phys_p4d_init().
- include skip efi_arch_mem_reserve() in case of kexec as part of this 
  patch set.
- rename last_address_shd_kexec to a more appropriate 
  kexec_last_address_to_make_private.
- remove duplicate code shared with TDX and use common interfaces
  defined for SNP and TDX for kexec/kdump.
- remove set_pte_enc() dependency on pg_level_to_pfn() and make the 
  function simpler.
- rename unshare_pte() to make_pte_private().
- clarify and make the comment for using kexec_last_address_to_make_private  
  more understandable.
- general cleanup. 

Ashish Kalra (2):
  x86/boot/compressed: Skip Video Memory access in Decompressor for
SEV-ES/SNP.
  x86/snp: Convert shared memory back to private on kexec

 arch/x86/boot/compressed/misc.c |  23 +
 arch/x86/include/asm/sev.h  |   4 +
 arch/x86/kernel/sev.c   | 168 
 arch/x86/mm/mem_encrypt_amd.c   |   3 +
 4 files changed, 198 insertions(+)


base-commit: f87c20c019e22be5f2efd11bf9141a532ae876da
prerequisite-patch-id: a911f230c2524bd791c47f62f17f0a93cbf726b6
prerequisite-patch-id: bfe2fa046349978ac1825275eb205acecfbc22f3
prerequisite-patch-id: 36fe38a0547bcc26048bd1c5568d736344173d0a
prerequisite-patch-id: 1f97d0a2edb7509dd58276f628d1a4bda62c154c
prerequisite-patch-id: c890aed9c68e5f6dec8e640194950f0abeddb68c
prerequisite-patch-id: 17a7d996d9af56c6b24a2374e9e498feafe18216
prerequisite-patch-id: 6a8bda2b3cf9bfab8177acdcfc8dd0408ed129fa
prerequisite-patch-id: 99382c42348b9a076ba930eca0dfc9d000ec951d
prerequisite-patch-id: 469a0a3c78b0eca82527cd85e2205fb8fb89d645
prerequisite-patch-id: fda4eb74abfdee49760e508ee6f3b661d52ceb26
prerequisite-patch-id: 6da1f25b8b1646f326911eb10c05f3821343313e
prerequisite-patch-id: 95356474298029468750a9c1bc2224fb09a86eed
prerequisite-patch-id: d4966ae63e86d24b0bf578da4dae871cd9002b12
prerequisite-patch-id: fccde6f1fa385b5af0195f81fcb95acd71822428
prerequisite-patch-id: 16048ee15e392b0b9217b8923939b0059311abd2
prerequisite-patch-id: 5c9ae9aa294f72f63ae2c3551507dfbd92525803
prerequisite-patch-id: 6bd2e291bfdb1f61b6d194899d3bb3c678d534dd
prerequisite-patch-id: c85fd0bb6d183a40da73720eaa607481b1d51daf
prerequisite-patch-id: 60760e0c98ab7ccd2ca22ae3e9f20ff5a94c6e91
-- 
2.34.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v7 0/3] x86/snp: Add kexec support

2024-05-30 Thread Ashish Kalra
From: Ashish Kalra 

The patchset adds bits and pieces to get kexec (and crashkernel) work on
SNP guest.

The series is based off of and tested against Kirill Shutemov's tree:
  https://github.com/intel/tdx.git guest-kexec



v7:
- Rebased onto current tip/master;
- Moved back to checking the md attribute instead of checking the
  efi_setup for detecting if running under kexec kernel as 
  suggested in upstream review feedback.

v6:
- Updated and restructured the commit message for patch 1/3 to
  explain the issue in detail.
- Updated inline comments in patch 1/3 to explain the issue in 
  detail.
- Moved back to checking efi_setup for detecting if running
  under kexec kernel.

v5:
- Removed sev_es_enabled() function and using sev_status directly to
  check for SEV-ES/SEV-SNP guest.
- used --base option to generate patches to specify Kirill's TDX guest
  kexec patches as prerequisite patches to fix kernel test robot
  build errors.

v4:
- Rebased to current tip/master.
- Reviewed-bys from Sathya.
- Remove snp_kexec_unprep_rom_memory() as it is not needed any more as 
  SEV-SNP code is not validating the ROM range in probe_roms() anymore.
- Fix kernel test robot build error/warnings.

v3:
- Rebased;
- moved Keep page tables that maps E820_TYPE_ACPI patch to Kirill's tdx
  guest kexec patch series.
- checking the md attribute instead of checking the efi_setup for
  detecting if running under kexec kernel.
- added new sev_es_enabled() function.
- skip video memory access in decompressor for SEV-ES/SNP systems to 
  prevent guest termination as boot stage2 #VC handler does not handle
  MMIO.

v2:
- address zeroing of unaccepted memory table mappings at all page table levels
  adding phys_pte_init(), phys_pud_init() and phys_p4d_init().
- include skip efi_arch_mem_reserve() in case of kexec as part of this 
  patch set.
- rename last_address_shd_kexec to a more appropriate 
  kexec_last_address_to_make_private.
- remove duplicate code shared with TDX and use common interfaces
  defined for SNP and TDX for kexec/kdump.
- remove set_pte_enc() dependency on pg_level_to_pfn() and make the 
  function simpler.
- rename unshare_pte() to make_pte_private().
- clarify and make the comment for using kexec_last_address_to_make_private  
  more understandable.
- general cleanup. 

Ashish Kalra (3):
  efi/x86: Fix EFI memory map corruption with kexec
  x86/boot/compressed: Skip Video Memory access in Decompressor for
SEV-ES/SNP.
  x86/snp: Convert shared memory back to private on kexec

 arch/x86/boot/compressed/misc.c |   6 +-
 arch/x86/include/asm/sev.h  |   4 +
 arch/x86/kernel/sev.c   | 162 
 arch/x86/mm/mem_encrypt_amd.c   |   3 +
 arch/x86/platform/efi/quirks.c  |  30 +-
 5 files changed, 200 insertions(+), 5 deletions(-)


base-commit: f8441cd55885e43eb0d4e8eedc6c5ab15d2dabf1
prerequisite-patch-id: a911f230c2524bd791c47f62f17f0a93cbf726b6
prerequisite-patch-id: bfe2fa046349978ac1825275eb205acecfbc22f3
prerequisite-patch-id: 5e60d292457c7cd98fd3e45c23127e9463b56a69
prerequisite-patch-id: 1f97d0a2edb7509dd58276f628d1a4bda62c154c
prerequisite-patch-id: 6e07f4d4ac95ad1d2c7750ebd3e87483fb9fd48f
prerequisite-patch-id: 24ec385d6a89cf2c8553c6d29515cc513643a68a
prerequisite-patch-id: 6a8bda2b3cf9bfab8177acdcfc8dd0408ed129fa
prerequisite-patch-id: 99382c42348b9a076ba930eca0dfc9d000ec951d
prerequisite-patch-id: 469a0a3c78b0eca82527cd85e2205fb8fb89d645
prerequisite-patch-id: 2be870cdf58bdc6a10ca3c18bf874e5c6cfb7e42
prerequisite-patch-id: 7fc62697fb6bdade0bab66ba2b45a19759008f9e
prerequisite-patch-id: 95356474298029468750a9c1bc2224fb09a86eed
prerequisite-patch-id: d4966ae63e86d24b0bf578da4dae871cd9002b12
prerequisite-patch-id: fccde6f1fa385b5af0195f81fcb95acd71822428
prerequisite-patch-id: 16048ee15e392b0b9217b8923939b0059311abd2
prerequisite-patch-id: 5c9ae9aa294f72f63ae2c3551507dfbd92525803
prerequisite-patch-id: 758bdb686290c018cbd5b7d005354019f9d15248
prerequisite-patch-id: c85fd0bb6d183a40da73720eaa607481b1d51daf
prerequisite-patch-id: 60760e0c98ab7ccd2ca22ae3e9f20ff5a94c6e91
-- 
2.34.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v7 3/3] x86/snp: Convert shared memory back to private on kexec

2024-05-30 Thread Ashish Kalra
From: Ashish Kalra 

SNP guests allocate shared buffers to perform I/O. It is done by
allocating pages normally from the buddy allocator and converting them
to shared with set_memory_decrypted().

The second kernel has no idea what memory is converted this way. It only
sees E820_TYPE_RAM.

Accessing shared memory via private mapping will cause unrecoverable RMP
page-faults.

On kexec walk direct mapping and convert all shared memory back to
private. It makes all RAM private again and second kernel may use it
normally. Additionally for SNP guests convert all bss decrypted section
pages back to private.

The conversion occurs in two steps: stopping new conversions and
unsharing all memory. In the case of normal kexec, the stopping of
conversions takes place while scheduling is still functioning. This
allows for waiting until any ongoing conversions are finished. The
second step is carried out when all CPUs except one are inactive and
interrupts are disabled. This prevents any conflicts with code that may
access shared memory.

Signed-off-by: Ashish Kalra 
---
 arch/x86/include/asm/sev.h|   4 +
 arch/x86/kernel/sev.c | 162 ++
 arch/x86/mm/mem_encrypt_amd.c |   3 +
 3 files changed, 169 insertions(+)

diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
index ca20cc4e5826..f9b0a4eb1980 100644
--- a/arch/x86/include/asm/sev.h
+++ b/arch/x86/include/asm/sev.h
@@ -229,6 +229,8 @@ void snp_accept_memory(phys_addr_t start, phys_addr_t end);
 u64 snp_get_unsupported_features(u64 status);
 u64 sev_get_status(void);
 void sev_show_status(void);
+void snp_kexec_finish(void);
+void snp_kexec_begin(bool crash);
 #else
 static inline void sev_es_ist_enter(struct pt_regs *regs) { }
 static inline void sev_es_ist_exit(void) { }
@@ -258,6 +260,8 @@ static inline void snp_accept_memory(phys_addr_t start, 
phys_addr_t end) { }
 static inline u64 snp_get_unsupported_features(u64 status) { return 0; }
 static inline u64 sev_get_status(void) { return 0; }
 static inline void sev_show_status(void) { }
+static inline void snp_kexec_finish(void) { }
+static inline void snp_kexec_begin(bool crash) { }
 #endif
 
 #ifdef CONFIG_KVM_AMD_SEV
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index 3342ed58e168..941f3996a9b6 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -42,6 +42,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #define DR7_RESET_VALUE0x400
 
@@ -92,6 +94,9 @@ static struct ghcb *boot_ghcb __section(".data");
 /* Bitmap of SEV features supported by the hypervisor */
 static u64 sev_hv_features __ro_after_init;
 
+/* Last address to be switched to private during kexec */
+static unsigned long kexec_last_addr_to_make_private;
+
 /* #VC handler runtime per-CPU data */
 struct sev_es_runtime_data {
struct ghcb ghcb_page;
@@ -913,6 +918,163 @@ void snp_accept_memory(phys_addr_t start, phys_addr_t end)
set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE);
 }
 
+static bool set_pte_enc(pte_t *kpte, int level, void *va)
+{
+   pte_t new_pte;
+
+   if (pte_none(*kpte))
+   return false;
+
+   /*
+* Change the physical page attribute from C=0 to C=1. Flush the
+* caches to ensure that data gets accessed with the correct C-bit.
+*/
+   if (pte_present(*kpte))
+   clflush_cache_range(va, page_level_size(level));
+
+   new_pte = __pte(cc_mkenc(pte_val(*kpte)));
+   set_pte_atomic(kpte, new_pte);
+
+   return true;
+}
+
+static bool make_pte_private(pte_t *pte, unsigned long addr, int pages, int 
level)
+{
+   struct sev_es_runtime_data *data;
+   struct ghcb *ghcb;
+
+   data = this_cpu_read(runtime_data);
+   ghcb = >ghcb_page;
+
+   /* Check if the GHCB is part of a PMD range. */
+   if ((unsigned long)ghcb >= addr &&
+   (unsigned long)ghcb <= (addr + (pages * PAGE_SIZE))) {
+   /*
+* Ensure that the current cpu's GHCB is made private
+* at the end of unshared loop so that we continue to use the
+* optimized GHCB protocol and not force the switch to
+* MSR protocol till the very end.
+*/
+   pr_debug("setting boot_ghcb to NULL for this cpu ghcb\n");
+   kexec_last_addr_to_make_private = addr;
+   return true;
+   }
+
+   if (!set_pte_enc(pte, level, (void *)addr))
+   return false;
+
+   snp_set_memory_private(addr, pages);
+
+   return true;
+}
+
+static void unshare_all_memory(void)
+{
+   unsigned long addr, end;
+
+   /*
+* Walk direct mapping and convert all shared memory back to private,
+*/
+
+   addr = PAGE_OFFSET;
+   end  = PAGE_OFFSET + get_max_mapped();
+
+   while (addr < end) {
+   unsigned long size;
+   unsign

[PATCH v7 2/3] x86/boot/compressed: Skip Video Memory access in Decompressor for SEV-ES/SNP.

2024-05-30 Thread Ashish Kalra
From: Ashish Kalra 

Accessing guest video memory/RAM during kernel decompressor
causes guest termination as boot stage2 #VC handler for
SEV-ES/SNP systems does not support MMIO handling.

This issue is observed with SEV-ES/SNP guest kexec as
kexec -c adds screen_info to the boot parameters
passed to the kexec kernel, which causes console output to
be dumped to both video and serial.

As the decompressor output gets cleared really fast, it is
preferable to get the console output only on serial, hence,
skip accessing video RAM during decompressor stage to
prevent guest termination.

Serial console output during decompressor stage works as
boot stage2 #VC handler already supports handling port I/O.

Suggested-by: Thomas Lendacky 
Signed-off-by: Ashish Kalra 
Reviewed-by: Kuppuswamy Sathyanarayanan 

---
 arch/x86/boot/compressed/misc.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index b70e4a21c15f..3b9f96b3dbcc 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -427,8 +427,10 @@ asmlinkage __visible void *extract_kernel(void *rmode, 
unsigned char *output)
vidport = 0x3d4;
}
 
-   lines = boot_params_ptr->screen_info.orig_video_lines;
-   cols = boot_params_ptr->screen_info.orig_video_cols;
+   if (!(sev_status & MSR_AMD64_SEV_ES_ENABLED)) {
+   lines = boot_params_ptr->screen_info.orig_video_lines;
+   cols = boot_params_ptr->screen_info.orig_video_cols;
+   }
 
init_default_io_ops();
 
-- 
2.34.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v7 1/3] efi/x86: Fix EFI memory map corruption with kexec

2024-05-30 Thread Ashish Kalra
From: Ashish Kalra 

With SNP guest kexec, the following EFI memmap corruption is observed:

[0.00] efi: EFI v2.7 by EDK II
[0.00] efi: SMBIOS=0x7e33f000 SMBIOS 3.0=0x7e33d000 ACPI=0x7e57e000 
ACPI 2.0=0x7e57e014 MEMATTR=0x7cc3c018 Unaccepted=0x7c09e018
[0.00] efi: [Firmware Bug]: Invalid EFI memory map entries:
[0.00] efi: mem03: [type=269370880|attr=0x0e42100e42180e41] 
range=[0x0486200e41038c18-0x200e898a0eee713ac17] (invalid)
[0.00] efi: mem04: [type=12336|attr=0x0e410686300e4105] 
range=[0x100e42000176-0x8c290f26248d200e175] (invalid)
[0.00] efi: mem06: [type=1124304408|attr=0x30b40028] 
range=[0x0e51300e45280e77-0xb44ed2142f460c1e76] (invalid)
[0.00] efi: mem08: [type=68|attr=0x300e540583280e41] 
range=[0x011a3cd8-0x486200e54b38c0bcd7] (invalid)
[0.00] efi: mem10: [type=1107529240|attr=0x0e42280e41300e41] 
range=[0x300e41058c280e42-0x38010ae54c5c328ee41] (invalid)
[0.00] efi: mem11: [type=189335566|attr=0x048d200e42038e18] 
range=[0x318c0048-0xe42029228ce4200047] (invalid)
[0.00] efi: mem12: [type=239142534|attr=0x00240b4b] 
range=[0x0e41380e0a7d700e-0x80f26238f22bfe500d] (invalid)
[0.00] efi: mem14: [type=239207055|attr=0x0e41300e43380e0a] 
range=[0x8c280e42048d200e-0xc70b028f2f27cc0a00d] (invalid)
[0.00] efi: mem15: [type=239210510|attr=0x00080e660b47080e] 
range=[0x324c001c-0xa78028634ce490001b] (invalid)
[0.00] efi: mem16: [type=4294848528|attr=0x32940014] 
range=[0x0e410286100e4100-0x80f252036a218f20ff] (invalid)
[0.00] efi: mem19: [type=2250772033|attr=0x42180e42200e4328] 
range=[0x41280e0ab9020683-0xe0e538c28b39e62682] (invalid)
[0.00] efi: mem20: [type=16|   |  |  |  |  |  |  |  |  |   |WB|  |WC|  
] range=[0x00084438-0x44340090333c437] (invalid)
[0.00] efi: mem22: [Reserved|attr=0x00c14420] 
range=[0x44243398-0x1033a04240003f397] (invalid)
[0.00] efi: mem23: [type=1141080856|attr=0x080e41100e43180e] 
range=[0x280e66300e4b280e-0x440dc5ee7141f4c080d] (invalid)
[0.00] efi: mem25: [Reserved|attr=0x000a44a0] 
range=[0x44a43428-0x1034304a400013427] (invalid)
[0.00] efi: mem28: [type=16|   |  |  |  |  |  |  |  |  |   |WB|  |WC|  
] range=[0x000a4488-0x448400b034bc487] (invalid)
[0.00] efi: mem30: [Reserved|attr=0x000a4470] 
range=[0x44743518-0x10352047400013517] (invalid)
[0.00] efi: mem33: [type=16|   |  |  |  |  |  |  |  |  |   |WB|  |WC|  
] range=[0x000a4458-0x445400b035ac457] (invalid)
[0.00] efi: mem35: [type=269372416|attr=0x0e42100e42180e41] 
range=[0x0486200e44038c18-0x200e8b8a0eee823ac17] (invalid)
[0.00] efi: mem37: [type=2351435330|attr=0x0e42100e42180e42] 
range=[0x470783380e410686-0x2002b2a041c2141e685] (invalid)
[0.00] efi: mem38: [type=1093668417|attr=0x100e42000270] 
range=[0x42100e42180e4220-0xfff366a4e421b78c21f] (invalid)
[0.00] efi: mem39: [type=76357646|attr=0x180e42200e42280e] 
range=[0x0e410686300e4105-0x4130f251a0710ae5104] (invalid)
[0.00] efi: mem40: [type=940444268|attr=0x0e42200e42280e41] 
range=[0x180e42200e42280e-0x300fc71c300b4f2480d] (invalid)
[0.00] efi: mem41: [MMIO|attr=0x8c280e42048d200e] 
range=[0x47943728-0x42138e0c87820292727] (invalid)
[0.00] efi: mem42: [type=1191674680|attr=0x004c000b] 
range=[0x300e41380e0a0246-0x470b0f26238f22b8245] (invalid)
[0.00] efi: mem43: [type=2010|attr=0x0301f00e4d078338] 
range=[0x45038e180e42028f-0xe4556bf118f282528e] (invalid)
[0.00] efi: mem44: [type=1109921345|attr=0x300e446c] 
range=[0x44080e42100e4218-0xfff39254e42138ac217] (invalid)
...

This EFI memmap corruption is happening with the efi_arch_mem_reserve() invocation 
in case of kexec boot.

( efi_arch_mem_reserve() is invoked with the following call-stack: )

[0.310010]  efi_arch_mem_reserve+0xb1/0x220
[0.311382]  efi_mem_reserve+0x36/0x60
[0.311973]  efi_bgrt_init+0x17d/0x1a0
[0.313265]  acpi_parse_bgrt+0x12/0x20
[0.313858]  acpi_table_parse+0x77/0xd0
[0.314463]  acpi_boot_init+0x362/0x630
[0.315069]  setup_arch+0xa88/0xf80
[0.315629]  start_kernel+0x68/0xa90
[0.316194]  x86_64_start_reservations+0x1c/0x30
[0.316921]  x86_64_start_kernel+0xbf/0x110
[0.317582]  common_startup_64+0x13e/0x141

efi_arch_mem_reserve() calls efi_memmap_alloc() to allocate memory for
EFI memory map and due to early allocation it uses memblock allocation.

Later during boot, efi_enter_virtual_mode() calls kexec_enter_virtual_mode()
in case of a kexec-ed kernel boot.

This function kexec_enter_virtual_mode() installs the new EFI memory map by
calling efi_memmap_init_late() which remaps the efi_memmap physically allocated
in efi_arch_mem_reserve(), but this remapping is still using memblock 
allocation.

Subsequently, when memblock is freed

[PATCH v6 3/3] x86/snp: Convert shared memory back to private on kexec

2024-04-26 Thread Ashish Kalra
From: Ashish Kalra 

SNP guests allocate shared buffers to perform I/O. It is done by
allocating pages normally from the buddy allocator and converting them
to shared with set_memory_decrypted().

The second kernel has no idea what memory is converted this way. It only
sees E820_TYPE_RAM.

Accessing shared memory via private mapping will cause unrecoverable RMP
page-faults.

On kexec walk direct mapping and convert all shared memory back to
private. It makes all RAM private again and second kernel may use it
normally. Additionally for SNP guests convert all bss decrypted section
pages back to private and switch back ROM regions to shared so that
their revalidation does not fail during kexec kernel boot.

The conversion occurs in two steps: stopping new conversions and
unsharing all memory. In the case of normal kexec, the stopping of
conversions takes place while scheduling is still functioning. This
allows for waiting until any ongoing conversions are finished. The
second step is carried out when all CPUs except one are inactive and
interrupts are disabled. This prevents any conflicts with code that may
access shared memory.

Signed-off-by: Ashish Kalra 
---
 arch/x86/include/asm/sev.h|   4 +
 arch/x86/kernel/sev.c | 161 ++
 arch/x86/mm/mem_encrypt_amd.c |   3 +
 3 files changed, 168 insertions(+)

diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
index 7f57382afee4..78d40d08d201 100644
--- a/arch/x86/include/asm/sev.h
+++ b/arch/x86/include/asm/sev.h
@@ -229,6 +229,8 @@ void snp_accept_memory(phys_addr_t start, phys_addr_t end);
 u64 snp_get_unsupported_features(u64 status);
 u64 sev_get_status(void);
 void sev_show_status(void);
+void snp_kexec_unshare_mem(void);
+void snp_kexec_stop_conversion(bool crash);
 #else
 static inline void sev_es_ist_enter(struct pt_regs *regs) { }
 static inline void sev_es_ist_exit(void) { }
@@ -258,6 +260,8 @@ static inline void snp_accept_memory(phys_addr_t start, 
phys_addr_t end) { }
 static inline u64 snp_get_unsupported_features(u64 status) { return 0; }
 static inline u64 sev_get_status(void) { return 0; }
 static inline void sev_show_status(void) { }
+static inline void snp_kexec_unshare_mem(void) { }
+static inline void snp_kexec_stop_conversion(bool crash) { }
 #endif
 
 #ifdef CONFIG_KVM_AMD_SEV
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index 995f94467101..891257fde810 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -42,6 +42,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #define DR7_RESET_VALUE0x400
 
@@ -92,6 +94,9 @@ static struct ghcb *boot_ghcb __section(".data");
 /* Bitmap of SEV features supported by the hypervisor */
 static u64 sev_hv_features __ro_after_init;
 
+/* Last address to be switched to private during kexec */
+static unsigned long kexec_last_addr_to_make_private;
+
 /* #VC handler runtime per-CPU data */
 struct sev_es_runtime_data {
struct ghcb ghcb_page;
@@ -913,6 +918,162 @@ void snp_accept_memory(phys_addr_t start, phys_addr_t end)
set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE);
 }
 
+static bool set_pte_enc(pte_t *kpte, int level, void *va)
+{
+   pte_t new_pte;
+
+   if (pte_none(*kpte))
+   return false;
+
+   /*
+* Change the physical page attribute from C=0 to C=1. Flush the
+* caches to ensure that data gets accessed with the correct C-bit.
+*/
+   if (pte_present(*kpte))
+   clflush_cache_range(va, page_level_size(level));
+
+   new_pte = __pte(cc_mkenc(pte_val(*kpte)));
+   set_pte_atomic(kpte, new_pte);
+
+   return true;
+}
+
+static bool make_pte_private(pte_t *pte, unsigned long addr, int pages, int 
level)
+{
+   struct sev_es_runtime_data *data;
+   struct ghcb *ghcb;
+
+   data = this_cpu_read(runtime_data);
+   ghcb = >ghcb_page;
+
+   /* Check for GHCB for being part of a PMD range. */
+   if ((unsigned long)ghcb >= addr &&
+   (unsigned long)ghcb <= (addr + (pages * PAGE_SIZE))) {
+   /*
+* Ensure that the current cpu's GHCB is made private
+* at the end of unshared loop so that we continue to use the
+* optimized GHCB protocol and not force the switch to
+* MSR protocol till the very end.
+*/
+   pr_debug("setting boot_ghcb to NULL for this cpu ghcb\n");
+   kexec_last_addr_to_make_private = addr;
+   return true;
+   }
+
+   if (!set_pte_enc(pte, level, (void *)addr))
+   return false;
+
+   snp_set_memory_private(addr, pages);
+
+   return true;
+}
+
+static void unshare_all_memory(void)
+{
+   unsigned long addr, end;
+
+   /*
+* Walk direct mapping and convert all shared memory back to private,
+*/
+
+   addr = PAGE_OFFSET;
+   end  = PAGE_OFFSE

[PATCH v6 2/3] x86/boot/compressed: Skip Video Memory access in Decompressor for SEV-ES/SNP.

2024-04-26 Thread Ashish Kalra
From: Ashish Kalra 

Accessing guest video memory/RAM during kernel decompressor
causes guest termination as boot stage2 #VC handler for
SEV-ES/SNP systems does not support MMIO handling.

This issue is observed with SEV-ES/SNP guest kexec as
kexec -c adds screen_info to the boot parameters
passed to the kexec kernel, which causes console output to
be dumped to both video and serial.

As the decompressor output gets cleared really fast, it is
preferable to get the console output only on serial, hence,
skip accessing video RAM during decompressor stage to
prevent guest termination.

Serial console output during decompressor stage works as
boot stage2 #VC handler already supports handling port I/O.

Suggested-by: Thomas Lendacky 
Signed-off-by: Ashish Kalra 
Reviewed-by: Kuppuswamy Sathyanarayanan 

---
 arch/x86/boot/compressed/misc.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index b70e4a21c15f..3b9f96b3dbcc 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -427,8 +427,10 @@ asmlinkage __visible void *extract_kernel(void *rmode, 
unsigned char *output)
vidport = 0x3d4;
}
 
-   lines = boot_params_ptr->screen_info.orig_video_lines;
-   cols = boot_params_ptr->screen_info.orig_video_cols;
+   if (!(sev_status & MSR_AMD64_SEV_ES_ENABLED)) {
+   lines = boot_params_ptr->screen_info.orig_video_lines;
+   cols = boot_params_ptr->screen_info.orig_video_cols;
+   }
 
init_default_io_ops();
 
-- 
2.34.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v6 1/3] efi/x86: Fix EFI memory map corruption with kexec

2024-04-26 Thread Ashish Kalra
From: Ashish Kalra 

With SNP guest kexec, the following EFI memmap corruption is observed:

[0.00] efi: EFI v2.7 by EDK II
[0.00] efi: SMBIOS=0x7e33f000 SMBIOS 3.0=0x7e33d000 ACPI=0x7e57e000 
ACPI 2.0=0x7e57e014 MEMATTR=0x7cc3c018 Unaccepted=0x7c09e018
[0.00] efi: [Firmware Bug]: Invalid EFI memory map entries:
[0.00] efi: mem03: [type=269370880|attr=0x0e42100e42180e41] 
range=[0x0486200e41038c18-0x200e898a0eee713ac17] (invalid)
[0.00] efi: mem04: [type=12336|attr=0x0e410686300e4105] 
range=[0x100e42000176-0x8c290f26248d200e175] (invalid)
[0.00] efi: mem06: [type=1124304408|attr=0x30b40028] 
range=[0x0e51300e45280e77-0xb44ed2142f460c1e76] (invalid)
[0.00] efi: mem08: [type=68|attr=0x300e540583280e41] 
range=[0x011a3cd8-0x486200e54b38c0bcd7] (invalid)
[0.00] efi: mem10: [type=1107529240|attr=0x0e42280e41300e41] 
range=[0x300e41058c280e42-0x38010ae54c5c328ee41] (invalid)
[0.00] efi: mem11: [type=189335566|attr=0x048d200e42038e18] 
range=[0x318c0048-0xe42029228ce4200047] (invalid)
[0.00] efi: mem12: [type=239142534|attr=0x00240b4b] 
range=[0x0e41380e0a7d700e-0x80f26238f22bfe500d] (invalid)
[0.00] efi: mem14: [type=239207055|attr=0x0e41300e43380e0a] 
range=[0x8c280e42048d200e-0xc70b028f2f27cc0a00d] (invalid)
[0.00] efi: mem15: [type=239210510|attr=0x00080e660b47080e] 
range=[0x324c001c-0xa78028634ce490001b] (invalid)
[0.00] efi: mem16: [type=4294848528|attr=0x32940014] 
range=[0x0e410286100e4100-0x80f252036a218f20ff] (invalid)
[0.00] efi: mem19: [type=2250772033|attr=0x42180e42200e4328] 
range=[0x41280e0ab9020683-0xe0e538c28b39e62682] (invalid)
[0.00] efi: mem20: [type=16|   |  |  |  |  |  |  |  |  |   |WB|  |WC|  
] range=[0x00084438-0x44340090333c437] (invalid)
[0.00] efi: mem22: [Reserved|attr=0x00c14420] 
range=[0x44243398-0x1033a04240003f397] (invalid)
[0.00] efi: mem23: [type=1141080856|attr=0x080e41100e43180e] 
range=[0x280e66300e4b280e-0x440dc5ee7141f4c080d] (invalid)
[0.00] efi: mem25: [Reserved|attr=0x000a44a0] 
range=[0x44a43428-0x1034304a400013427] (invalid)
[0.00] efi: mem28: [type=16|   |  |  |  |  |  |  |  |  |   |WB|  |WC|  
] range=[0x000a4488-0x448400b034bc487] (invalid)
[0.00] efi: mem30: [Reserved|attr=0x000a4470] 
range=[0x44743518-0x10352047400013517] (invalid)
[0.00] efi: mem33: [type=16|   |  |  |  |  |  |  |  |  |   |WB|  |WC|  
] range=[0x000a4458-0x445400b035ac457] (invalid)
[0.00] efi: mem35: [type=269372416|attr=0x0e42100e42180e41] 
range=[0x0486200e44038c18-0x200e8b8a0eee823ac17] (invalid)
[0.00] efi: mem37: [type=2351435330|attr=0x0e42100e42180e42] 
range=[0x470783380e410686-0x2002b2a041c2141e685] (invalid)
[0.00] efi: mem38: [type=1093668417|attr=0x100e42000270] 
range=[0x42100e42180e4220-0xfff366a4e421b78c21f] (invalid)
[0.00] efi: mem39: [type=76357646|attr=0x180e42200e42280e] 
range=[0x0e410686300e4105-0x4130f251a0710ae5104] (invalid)
[0.00] efi: mem40: [type=940444268|attr=0x0e42200e42280e41] 
range=[0x180e42200e42280e-0x300fc71c300b4f2480d] (invalid)
[0.00] efi: mem41: [MMIO|attr=0x8c280e42048d200e] 
range=[0x47943728-0x42138e0c87820292727] (invalid)
[0.00] efi: mem42: [type=1191674680|attr=0x004c000b] 
range=[0x300e41380e0a0246-0x470b0f26238f22b8245] (invalid)
[0.00] efi: mem43: [type=2010|attr=0x0301f00e4d078338] 
range=[0x45038e180e42028f-0xe4556bf118f282528e] (invalid)
[0.00] efi: mem44: [type=1109921345|attr=0x300e446c] 
range=[0x44080e42100e4218-0xfff39254e42138ac217] (invalid)
...

This EFI memmap corruption is happening with the efi_arch_mem_reserve() invocation 
in case of kexec boot.

( efi_arch_mem_reserve() is invoked with the following call-stack: )

[0.310010]  efi_arch_mem_reserve+0xb1/0x220
[0.311382]  efi_mem_reserve+0x36/0x60
[0.311973]  efi_bgrt_init+0x17d/0x1a0
[0.313265]  acpi_parse_bgrt+0x12/0x20
[0.313858]  acpi_table_parse+0x77/0xd0
[0.314463]  acpi_boot_init+0x362/0x630
[0.315069]  setup_arch+0xa88/0xf80
[0.315629]  start_kernel+0x68/0xa90
[0.316194]  x86_64_start_reservations+0x1c/0x30
[0.316921]  x86_64_start_kernel+0xbf/0x110
[0.317582]  common_startup_64+0x13e/0x141

efi_arch_mem_reserve() calls efi_memmap_alloc() to allocate memory for
EFI memory map and due to early allocation it uses memblock allocation.

Later during boot, efi_enter_virtual_mode() calls kexec_enter_virtual_mode()
in case of a kexec-ed kernel boot.

This function kexec_enter_virtual_mode() installs the new EFI memory map by
calling efi_memmap_init_late() which remaps the efi_memmap physically allocated
in efi_arch_mem_reserve(), but this remapping is still using memblock 
allocation.

Subsequently, when memblock is freed

[PATCH v6 0/3] x86/snp: Add kexec support

2024-04-26 Thread Ashish Kalra
From: Ashish Kalra 

The patchset adds bits and pieces to get kexec (and crashkernel) work on
SNP guest.

The series is based off of and tested against Kirill Shutemov's tree:
  https://github.com/intel/tdx.git guest-kexec



v6:
- Updated and restructured the commit message for patch 1/3 to
  explain the issue in detail.
- Updated inline comments in patch 1/3 to explain the issue in 
  detail.
- Moved back to checking efi_setup for detecting if running
  under kexec kernel.

v5:
- Removed sev_es_enabled() function and using sev_status directly to
  check for SEV-ES/SEV-SNP guest.
- used --base option to generate patches to specify Kirill's TDX guest
  kexec patches as prerequisite patches to fix kernel test robot
  build errors.

v4:
- Rebased to current tip/master.
- Reviewed-bys from Sathya.
- Remove snp_kexec_unprep_rom_memory() as it is not needed any more as 
  SEV-SNP code is not validating the ROM range in probe_roms() anymore.
- Fix kernel test robot build error/warnings.

v3:
- Rebased;
- moved Keep page tables that maps E820_TYPE_ACPI patch to Kirill's tdx
  guest kexec patch series.
- checking the md attribute instead of checking the efi_setup for
  detecting if running under kexec kernel.
- added new sev_es_enabled() function.
- skip video memory access in decompressor for SEV-ES/SNP systems to 
  prevent guest termination as boot stage2 #VC handler does not handle
  MMIO.

v2:
- address zeroing of unaccepted memory table mappings at all page table levels
  adding phys_pte_init(), phys_pud_init() and phys_p4d_init().
- include skip efi_arch_mem_reserve() in case of kexec as part of this 
  patch set.
- rename last_address_shd_kexec to a more appropriate 
  kexec_last_address_to_make_private.
- remove duplicate code shared with TDX and use common interfaces
  defined for SNP and TDX for kexec/kdump.
- remove set_pte_enc() dependency on pg_level_to_pfn() and make the 
  function simpler.
- rename unshare_pte() to make_pte_private().
- clarify and make the comment for using kexec_last_address_to_make_private  
  more understandable.
- general cleanup. 


Ashish Kalra (3):
  efi/x86: Fix EFI memory map corruption with kexec
  x86/boot/compressed: Skip Video Memory access in Decompressor for
SEV-ES/SNP.
  x86/snp: Convert shared memory back to private on kexec

 arch/x86/boot/compressed/misc.c |   6 +-
 arch/x86/include/asm/sev.h  |   4 +
 arch/x86/kernel/sev.c   | 161 
 arch/x86/mm/mem_encrypt_amd.c   |   3 +
 arch/x86/platform/efi/quirks.c  |  20 
 5 files changed, 192 insertions(+), 2 deletions(-)


base-commit: 7fcd76de8a7bc12e930ef383a157ce99d711715d
prerequisite-patch-id: a911f230c2524bd791c47f62f17f0a93cbf726b6
prerequisite-patch-id: bfe2fa046349978ac1825275eb205acecfbc22f3
prerequisite-patch-id: 5e60d292457c7cd98fd3e45c23127e9463b56a69
prerequisite-patch-id: 1f97d0a2edb7509dd58276f628d1a4bda62c154c
prerequisite-patch-id: cbc2507b5c2810c3015aaf836d774d32f969c19a
prerequisite-patch-id: cbdfea1e50ecb3b4cee3a25a27df4d35bd95d532
prerequisite-patch-id: 99382c42348b9a076ba930eca0dfc9d000ec951d
prerequisite-patch-id: 469a0a3c78b0eca82527cd85e2205fb8fb89d645
prerequisite-patch-id: 2974ef211db5253d9782018e352d2a6ff0b0ef54
prerequisite-patch-id: 2cfffd80947941892421dae99b7fa0f9f9715884
prerequisite-patch-id: 466c2cb9f0a107bbd1dbd8526f4eff2bdb55f1ce
prerequisite-patch-id: d4966ae63e86d24b0bf578da4dae871cd9002b12
prerequisite-patch-id: fccde6f1fa385b5af0195f81fcb95acd71822428
prerequisite-patch-id: 16048ee15e392b0b9217b8923939b0059311abd2
prerequisite-patch-id: 5c9ae9aa294f72f63ae2c3551507dfbd92525803
prerequisite-patch-id: 758bdb686290c018cbd5b7d005354019f9d15248
prerequisite-patch-id: c85fd0bb6d183a40da73720eaa607481b1d51daf
prerequisite-patch-id: 60760e0c98ab7ccd2ca22ae3e9f20ff5a94c6e91
-- 
2.34.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v3 2/2] x86/sev: Add callback to apply RMP table fixups for kexec

2024-04-25 Thread Ashish Kalra
From: Ashish Kalra 

Handle cases where the RMP table placement in the BIOS is
not 2M aligned and then the kexec kernel could try to allocate
from within that chunk and that causes a fatal RMP fault.

The kexec failure is illustrated below from the kernel logs:

[0.00] SEV-SNP: RMP table physical range [0x007ffe80 - 
0x00807f0f]
[0.00] BIOS-provided physical RAM map:
[0.00] BIOS-e820: [mem 0x-0x0008efff] usable
[0.00] BIOS-e820: [mem 0x0008f000-0x0008] ACPI NVS
[0.00] BIOS-e820: [mem 0x0009-0x0009] usable
[0.00] BIOS-e820: [mem 0x0010-0x5a14afff] usable
[0.00] BIOS-e820: [mem 0x5a14b000-0x5a34afff] reserved
[0.00] BIOS-e820: [mem 0x5a34b000-0x67acefff] usable
[0.00] BIOS-e820: [mem 0x67acf000-0x6dfcefff] reserved
[0.00] BIOS-e820: [mem 0x6dfcf000-0x6edfefff] ACPI NVS
[0.00] BIOS-e820: [mem 0x6edff000-0x6effefff] ACPI data
[0.00] BIOS-e820: [mem 0x6efff000-0x6eff] usable
[0.00] BIOS-e820: [mem 0x6f00-0x6f00afff] ACPI NVS
[0.00] BIOS-e820: [mem 0x6f00b000-0x6fff] usable
[0.00] BIOS-e820: [mem 0x7000-0x8fff] reserved
[0.00] BIOS-e820: [mem 0xaa00-0xaaff] reserved
[0.00] BIOS-e820: [mem 0xc500-0xc5ff] reserved
[0.00] BIOS-e820: [mem 0xe000-0xe0ff] reserved
[0.00] BIOS-e820: [mem 0xfd00-0x] reserved
[0.00] BIOS-e820: [mem 0x0001-0x00407fcf] usable
[0.00] BIOS-e820: [mem 0x00407fd0-0x00407fff] reserved
[0.00] BIOS-e820: [mem 0x00408000-0x007ffe7f] usable
[0.00] BIOS-e820: [mem 0x007ffe80-0x00807f0f] reserved
[0.00] BIOS-e820: [mem 0x00807f10-0x00807f1fefff] usable

As seen here in the e820 memory map, the end range of the RMP table is not
aligned to 2MB and is not reserved, but rather usable as RAM.

Subsequently, kexec -s (KEXEC_FILE_LOAD syscall) loads its purgatory code and
boot_param, command line and other setup data into this RAM region as seen in 
the
kexec logs below, which leads to fatal RMP fault during kexec boot.

[  173.113085] Loaded purgatory at 0x807f1fa000
[  173.113099] Loaded boot_param, command line and misc at 0x807f1f8000 
bufsz=0x1350 memsz=0x2000
[  173.113107] Loaded 64bit kernel at 0x7ffae0 bufsz=0xd06200 
memsz=0x3894000
[  173.113291] Loaded initrd at 0x7ff6c89000 bufsz=0x4176014 memsz=0x4176014
[  173.113296] E820 memmap:
[  173.113298] -0008efff (1)
[  173.113300] 0008f000-0008 (4)
[  173.113302] 0009-0009 (1)
[  173.113303] 0010-5a14afff (1)
[  173.113305] 5a14b000-5a34afff (2)
[  173.113306] 5a34b000-67acefff (1)
[  173.113308] 67acf000-6dfcefff (2)
[  173.113309] 6dfcf000-6edfefff (4)
[  173.113311] 6edff000-6effefff (3)
[  173.113312] 6efff000-6eff (1)
[  173.113314] 6f00-6f00afff (4)
[  173.113315] 6f00b000-6fff (1)
[  173.113317] 7000-8fff (2)
[  173.113318] aa00-aaff (2)
[  173.113319] c500-c5ff (2)
[  173.113321] e000-e0ff (2)
[  173.113322] fd00- (2)
[  173.113324] 0001-00407fcf (1)
[  173.113325] 00407fd0-00407fff (2)
[  173.113327] 00408000-007ffe7f (1)
[  173.113328] 007ffe80-00807f0f (2)
[  173.113330] 00807f10-00807f1fefff (1)
[  173.113331] 00807f1ff000-00807fff (2)
[  173.690528] nr_segments = 4
[  173.690533] segment[0]: buf=0xe626d1a2 bufsz=0x4000 mem=0x807f1fa000 
memsz=0x5000
[  173.690546] segment[1]: buf=0x29c67bd6 bufsz=0x1350 mem=0x807f1f8000 
memsz=0x2000
[  173.690552] segment[2]: buf=0x45c60183 bufsz=0xd06200 
mem=0x7ffae0 memsz=0x3894000
[  173.697994] segment[3]: buf=0x6e54f08d bufsz=0x4176014 
mem=0x7ff6c89000 memsz=0x4177000
[  173.708672] kexec_file_load: type:0, start:0x807f1fa150 head:0x1184d0002 
flags:0x0

Check if the RMP table start & end physical range in the e820 tables
is not aligned to 2MB, and in that case mark this range as reserved in
all three e820 tables.

Fixes: c3b86e61b756 ("x86/cpufeatures: Enable/unmask SEV-SNP CPU feature")
Signed-off-by: Ashish Kalra 
---
 arch/x86/include/asm/sev.h |  2 ++
 arch/x86/mm/mem_encrypt.c  | 13 
 arch/x86/virt/svm/sev.c| 42 ++
 3 files changed,

[PATCH v3 1/2] x86/e820: Expose API to update e820 kexec and firmware tables externally.

2024-04-25 Thread Ashish Kalra
From: Ashish Kalra 

Export a new API helper function e820__range_update_table() to update both
e820_table_kexec and e820_table_firmware. Move all current users of
e820__range_update_kexec() to use this new helper function.

Signed-off-by: Ashish Kalra 
---
 arch/x86/include/asm/e820/api.h | 2 ++
 arch/x86/kernel/e820.c  | 6 +++---
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h
index e8f58ddd06d9..eeb44e71aa66 100644
--- a/arch/x86/include/asm/e820/api.h
+++ b/arch/x86/include/asm/e820/api.h
@@ -18,6 +18,8 @@ extern void e820__range_add   (u64 start, u64 size, enum 
e820_type type);
 extern u64  e820__range_update(u64 start, u64 size, enum e820_type old_type, 
enum e820_type new_type);
 extern u64  e820__range_remove(u64 start, u64 size, enum e820_type old_type, 
bool check_type);
 
+extern u64  e820__range_update_table(struct e820_table *t, u64 start, u64 
size, enum e820_type old_type, enum e820_type new_type);
+
 extern void e820__print_table(char *who);
 extern int  e820__update_table(struct e820_table *table);
 extern void e820__update_table_print(void);
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 6f1b379e3b38..872e133d2718 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -532,9 +532,9 @@ u64 __init e820__range_update(u64 start, u64 size, enum 
e820_type old_type, enum
return __e820__range_update(e820_table, start, size, old_type, 
new_type);
 }
 
-static u64 __init e820__range_update_kexec(u64 start, u64 size, enum e820_type 
old_type, enum e820_type  new_type)
+u64 __init e820__range_update_table(struct e820_table *t, u64 start, u64 size, 
enum e820_type old_type, enum e820_type new_type)
 {
-   return __e820__range_update(e820_table_kexec, start, size, old_type, 
new_type);
+   return __e820__range_update(t, start, size, old_type, new_type);
 }
 
 /* Remove a range of memory from the E820 table: */
@@ -806,7 +806,7 @@ u64 __init e820__memblock_alloc_reserved(u64 size, u64 
align)
 
addr = memblock_phys_alloc(size, align);
if (addr) {
-   e820__range_update_kexec(addr, size, E820_TYPE_RAM, 
E820_TYPE_RESERVED);
+   e820__range_update_table(e820_table_kexec, addr, size, 
E820_TYPE_RAM, E820_TYPE_RESERVED);
pr_info("update e820_table_kexec for 
e820__memblock_alloc_reserved()\n");
e820__update_table_kexec();
}
-- 
2.34.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v3 0/2] Apply RMP table fixups for kexec.

2024-04-25 Thread Ashish Kalra
From: Ashish Kalra 

Handle cases where the RMP table placement in the BIOS is
not 2M aligned and then the kexec kernel could try to allocate
from within that chunk and that causes a fatal RMP fault.

Check if the RMP table start & end physical range in the e820 tables
is not aligned to 2MB, and in that case mark this range as reserved
in all three e820 tables.

The callback to apply these RMP table fixups needs to be called
after the e820 tables are setup/populated and before the e820 map
has been converted to the standard Linux memory resources and e820 map
is no longer used and modifying it has no effect.

v3:
- Added new e820__range_update_table() helper and updated all
  internal and external modifiers of e820_table_kexec and 
  e820_table_firmware to call this new helper function.
- Updated and restructured the commit message for patch 2/2 to
  explain the issue in detail.
- Added, merged and cleaned up inline comments in patch 2/2.
- Added new __snp_e820_tables_fixup() function to be avoid
  duplication of code for fixing both RMP table start and end
  physical ranges.

v2:
- Remove overriding e820__memory_setup_default() to invoke
  snp_rmptable_e820_fixup() to apply the RMP table fixups.
- This callback snp_rmptable_e820_fixup() is now invoked
  after e820__memory_setup() and before e820__reserve_resources().
- Expose e820 API interfaces to update e820_table_kexec and
  e820_table_firmware externally.
- snp_rmptable_e820_fixup() now calls these new external API
  interfaces to update e820_table_kexec and e820_table_firmware.

Ashish Kalra (2):
  x86/e820: Expose API to update e820 kexec and firmware tables
externally.
  x86/sev: Add callback to apply RMP table fixups for kexec

 arch/x86/include/asm/e820/api.h |  2 ++
 arch/x86/include/asm/sev.h  |  2 ++
 arch/x86/kernel/e820.c  |  6 ++---
 arch/x86/mm/mem_encrypt.c   | 13 ++
 arch/x86/virt/svm/sev.c | 42 +
 5 files changed, 62 insertions(+), 3 deletions(-)

-- 
2.34.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v5 3/3] x86/snp: Convert shared memory back to private on kexec

2024-04-15 Thread Ashish Kalra
From: Ashish Kalra 

SNP guests allocate shared buffers to perform I/O. It is done by
allocating pages normally from the buddy allocator and converting them
to shared with set_memory_decrypted().

The second kernel has no idea what memory is converted this way. It only
sees E820_TYPE_RAM.

Accessing shared memory via private mapping will cause unrecoverable RMP
page-faults.

On kexec walk direct mapping and convert all shared memory back to
private. It makes all RAM private again and second kernel may use it
normally. Additionally for SNP guests convert all bss decrypted section
pages back to private and switch back ROM regions to shared so that
their revalidation does not fail during kexec kernel boot.

The conversion occurs in two steps: stopping new conversions and
unsharing all memory. In the case of normal kexec, the stopping of
conversions takes place while scheduling is still functioning. This
allows for waiting until any ongoing conversions are finished. The
second step is carried out when all CPUs except one are inactive and
interrupts are disabled. This prevents any conflicts with code that may
access shared memory.

Signed-off-by: Ashish Kalra 
---
 arch/x86/include/asm/sev.h|   4 +
 arch/x86/kernel/sev.c | 161 ++
 arch/x86/mm/mem_encrypt_amd.c |   3 +
 3 files changed, 168 insertions(+)

diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
index 7f57382afee4..78d40d08d201 100644
--- a/arch/x86/include/asm/sev.h
+++ b/arch/x86/include/asm/sev.h
@@ -229,6 +229,8 @@ void snp_accept_memory(phys_addr_t start, phys_addr_t end);
 u64 snp_get_unsupported_features(u64 status);
 u64 sev_get_status(void);
 void sev_show_status(void);
+void snp_kexec_unshare_mem(void);
+void snp_kexec_stop_conversion(bool crash);
 #else
 static inline void sev_es_ist_enter(struct pt_regs *regs) { }
 static inline void sev_es_ist_exit(void) { }
@@ -258,6 +260,8 @@ static inline void snp_accept_memory(phys_addr_t start, 
phys_addr_t end) { }
 static inline u64 snp_get_unsupported_features(u64 status) { return 0; }
 static inline u64 sev_get_status(void) { return 0; }
 static inline void sev_show_status(void) { }
+static inline void snp_kexec_unshare_mem(void) { }
+static inline void snp_kexec_stop_conversion(bool crash) { }
 #endif
 
 #ifdef CONFIG_KVM_AMD_SEV
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index 38ad066179d8..17f616963beb 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -42,6 +42,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #define DR7_RESET_VALUE0x400
 
@@ -92,6 +94,9 @@ static struct ghcb *boot_ghcb __section(".data");
 /* Bitmap of SEV features supported by the hypervisor */
 static u64 sev_hv_features __ro_after_init;
 
+/* Last address to be switched to private during kexec */
+static unsigned long kexec_last_addr_to_make_private;
+
 /* #VC handler runtime per-CPU data */
 struct sev_es_runtime_data {
struct ghcb ghcb_page;
@@ -913,6 +918,162 @@ void snp_accept_memory(phys_addr_t start, phys_addr_t end)
set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE);
 }
 
+static bool set_pte_enc(pte_t *kpte, int level, void *va)
+{
+   pte_t new_pte;
+
+   if (pte_none(*kpte))
+   return false;
+
+   /*
+* Change the physical page attribute from C=0 to C=1. Flush the
+* caches to ensure that data gets accessed with the correct C-bit.
+*/
+   if (pte_present(*kpte))
+   clflush_cache_range(va, page_level_size(level));
+
+   new_pte = __pte(cc_mkenc(pte_val(*kpte)));
+   set_pte_atomic(kpte, new_pte);
+
+   return true;
+}
+
+static bool make_pte_private(pte_t *pte, unsigned long addr, int pages, int 
level)
+{
+   struct sev_es_runtime_data *data;
+   struct ghcb *ghcb;
+
+   data = this_cpu_read(runtime_data);
+   ghcb = >ghcb_page;
+
+   /* Check for GHCB for being part of a PMD range. */
+   if ((unsigned long)ghcb >= addr &&
+   (unsigned long)ghcb <= (addr + (pages * PAGE_SIZE))) {
+   /*
+* Ensure that the current cpu's GHCB is made private
+* at the end of unshared loop so that we continue to use the
+* optimized GHCB protocol and not force the switch to
+* MSR protocol till the very end.
+*/
+   pr_debug("setting boot_ghcb to NULL for this cpu ghcb\n");
+   kexec_last_addr_to_make_private = addr;
+   return true;
+   }
+
+   if (!set_pte_enc(pte, level, (void *)addr))
+   return false;
+
+   snp_set_memory_private(addr, pages);
+
+   return true;
+}
+
+static void unshare_all_memory(void)
+{
+   unsigned long addr, end;
+
+   /*
+* Walk direct mapping and convert all shared memory back to private,
+*/
+
+   addr = PAGE_OFFSET;
+   end  = PAGE_OFFSE

[PATCH v5 2/3] x86/boot/compressed: Skip Video Memory access in Decompressor for SEV-ES/SNP.

2024-04-15 Thread Ashish Kalra
From: Ashish Kalra 

Accessing guest video memory/RAM during kernel decompressor
causes guest termination as boot stage2 #VC handler for
SEV-ES/SNP systems does not support MMIO handling.

This issue is observed with SEV-ES/SNP guest kexec as
kexec -c adds screen_info to the boot parameters
passed to the kexec kernel, which causes console output to
be dumped to both video and serial.

As the decompressor output gets cleared really fast, it is
preferable to get the console output only on serial, hence,
skip accessing video RAM during decompressor stage to
prevent guest termination.

Serial console output during decompressor stage works as
boot stage2 #VC handler already supports handling port I/O.

Suggested-by: Thomas Lendacky 
Signed-off-by: Ashish Kalra 
Reviewed-by: Kuppuswamy Sathyanarayanan 

---
 arch/x86/boot/compressed/misc.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index b70e4a21c15f..3b9f96b3dbcc 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -427,8 +427,10 @@ asmlinkage __visible void *extract_kernel(void *rmode, 
unsigned char *output)
vidport = 0x3d4;
}
 
-   lines = boot_params_ptr->screen_info.orig_video_lines;
-   cols = boot_params_ptr->screen_info.orig_video_cols;
+   if (!(sev_status & MSR_AMD64_SEV_ES_ENABLED)) {
+   lines = boot_params_ptr->screen_info.orig_video_lines;
+   cols = boot_params_ptr->screen_info.orig_video_cols;
+   }
 
init_default_io_ops();
 
-- 
2.34.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v5 1/3] efi/x86: skip efi_arch_mem_reserve() in case of kexec.

2024-04-15 Thread Ashish Kalra
From: Ashish Kalra 

For kexec use case, need to use and stick to the EFI memmap passed
from the first kernel via boot-params/setup data, hence,
skip efi_arch_mem_reserve() during kexec.

Additionally during SNP guest kexec testing discovered that EFI memmap
is corrupted during chained kexec. kexec_enter_virtual_mode() during
late init will remap the efi_memmap physical pages allocated in
efi_arch_mem_reserve() via memblock & then subsequently cause random
EFI memmap corruption once memblock is freed/torn down.

Suggested-by: Dave Young 
[Dave Young: checking the md attribute instead of checking the efi_setup]
Signed-off-by: Ashish Kalra 
---
 arch/x86/platform/efi/quirks.c | 20 ++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index f0cc00032751..982f5e50a4b3 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -258,12 +258,28 @@ void __init efi_arch_mem_reserve(phys_addr_t addr, u64 
size)
int num_entries;
void *new;
 
-   if (efi_mem_desc_lookup(addr, ) ||
-   md.type != EFI_BOOT_SERVICES_DATA) {
+   /*
+* For kexec use case, we need to use the EFI memmap passed from the 
first
+* kernel via setup data, so we need to skip this.
+* Additionally kexec_enter_virtual_mode() during late init will remap
+* the efi_memmap physical pages allocated here via memboot & then
+* subsequently cause random EFI memmap corruption once memblock is 
freed.
+*/
+
+   if (efi_mem_desc_lookup(addr, )) {
pr_err("Failed to lookup EFI memory descriptor for %pa\n", 
);
return;
}
 
+   if (md.type != EFI_BOOT_SERVICES_DATA) {
+   pr_err("Skip reserving non EFI Boot Service Data memory for 
%pa\n", );
+   return;
+   }
+
+   /* Kexec copied the efi memmap from the first kernel, thus skip the 
case */
+   if (md.attribute & EFI_MEMORY_RUNTIME)
+   return;
+
if (addr + size > md.phys_addr + (md.num_pages << EFI_PAGE_SHIFT)) {
pr_err("Region spans EFI memory descriptors, %pa\n", );
return;
-- 
2.34.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v5 0/3] x86/snp: Add kexec support

2024-04-15 Thread Ashish Kalra
From: Ashish Kalra 

The patchset adds bits and pieces to get kexec (and crashkernel) work on
SNP guest.

The series is based off of and tested against Kirill Shutemov's tree:
  https://github.com/intel/tdx.git guest-kexec



v5:
- Removed sev_es_enabled() function and using sev_status directly to
  check for SEV-ES/SEV-SNP guest.
- used --base option to generate patches to specify Kirill's TDX guest
  kexec patches as prerequisite patches to fix kernel test robot
  build errors.

v4:
- Rebased to current tip/master.
- Reviewed-bys from Sathya.
- Remove snp_kexec_unprep_rom_memory() as it is not needed any more as 
  SEV-SNP code is not validating the ROM range in probe_roms() anymore.
- Fix kernel test robot build error/warnings.

v3:
- Rebased;
- moved Keep page tables that maps E820_TYPE_ACPI patch to Kirill's tdx
  guest kexec patch series.
- checking the md attribute instead of checking the efi_setup for
  detecting if running under kexec kernel.
- added new sev_es_enabled() function.
- skip video memory access in decompressor for SEV-ES/SNP systems to 
  prevent guest termination as boot stage2 #VC handler does not handle
  MMIO.

v2:
- address zeroing of unaccepted memory table mappings at all page table levels
  adding phys_pte_init(), phys_pud_init() and phys_p4d_init().
- include skip efi_arch_mem_reserve() in case of kexec as part of this 
  patch set.
- rename last_address_shd_kexec to a more appropriate 
  kexec_last_address_to_make_private.
- remove duplicate code shared with TDX and use common interfaces
  defined for SNP and TDX for kexec/kdump.
- remove set_pte_enc() dependency on pg_level_to_pfn() and make the 
  function simpler.
- rename unshare_pte() to make_pte_private().
- clarify and make the comment for using kexec_last_address_to_make_private  
  more understandable.
- general cleanup. 


Ashish Kalra (3):
  efi/x86: skip efi_arch_mem_reserve() in case of kexec.
  x86/boot/compressed: Skip Video Memory access in Decompressor for
SEV-ES/SNP.
  x86/snp: Convert shared memory back to private on kexec

 arch/x86/boot/compressed/misc.c |   6 +-
 arch/x86/include/asm/sev.h  |   4 +
 arch/x86/kernel/sev.c   | 161 
 arch/x86/mm/mem_encrypt_amd.c   |   3 +
 arch/x86/platform/efi/quirks.c  |  20 +++-
 5 files changed, 190 insertions(+), 4 deletions(-)


base-commit: a18b42d8997abfd77aa1637c0de6850b0c30b1fe
prerequisite-patch-id: bd8e77f0f12223d21cb2f35b77bfcbdd9ad80b0f
prerequisite-patch-id: bfe2fa046349978ac1825275eb205acecfbc22f3
prerequisite-patch-id: 5e60d292457c7cd98fd3e45c23127e9463b56a69
prerequisite-patch-id: 1f97d0a2edb7509dd58276f628d1a4bda62c154c
prerequisite-patch-id: 8db559385c44e8b6670d74196e8d83d2dfad2f40
prerequisite-patch-id: cbdfea1e50ecb3b4cee3a25a27df4d35bd95d532
prerequisite-patch-id: 1cea0996e0dc3bb9f0059c927c405ca31003791e
prerequisite-patch-id: 469a0a3c78b0eca82527cd85e2205fb8fb89d645
prerequisite-patch-id: 2974ef211db5253d9782018e352d2a6ff0b0ef54
prerequisite-patch-id: 2cfffd80947941892421dae99b7fa0f9f9715884
prerequisite-patch-id: 466c2cb9f0a107bbd1dbd8526f4eff2bdb55f1ce
prerequisite-patch-id: d4966ae63e86d24b0bf578da4dae871cd9002b12
prerequisite-patch-id: fccde6f1fa385b5af0195f81fcb95acd71822428
prerequisite-patch-id: 16048ee15e392b0b9217b8923939b0059311abd2
prerequisite-patch-id: 5c9ae9aa294f72f63ae2c3551507dfbd92525803
prerequisite-patch-id: 758bdb686290c018cbd5b7d005354019f9d15248
prerequisite-patch-id: 4125b799fc9577b1a46427e45618fa0174f7a4b3
prerequisite-patch-id: 60760e0c98ab7ccd2ca22ae3e9f20ff5a94c6e91
-- 
2.34.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v4 4/4] x86/snp: Convert shared memory back to private on kexec

2024-04-09 Thread Ashish Kalra
From: Ashish Kalra 

SNP guests allocate shared buffers to perform I/O. It is done by
allocating pages normally from the buddy allocator and converting them
to shared with set_memory_decrypted().

The second kernel has no idea what memory is converted this way. It only
sees E820_TYPE_RAM.

Accessing shared memory via private mapping will cause unrecoverable RMP
page-faults.

On kexec walk direct mapping and convert all shared memory back to
private. It makes all RAM private again and second kernel may use it
normally. Additionally for SNP guests convert all bss decrypted section
pages back to private and switch back ROM regions to shared so that
their revalidation does not fail during kexec kernel boot.

The conversion occurs in two steps: stopping new conversions and
unsharing all memory. In the case of normal kexec, the stopping of
conversions takes place while scheduling is still functioning. This
allows for waiting until any ongoing conversions are finished. The
second step is carried out when all CPUs except one are inactive and
interrupts are disabled. This prevents any conflicts with code that may
access shared memory.

Signed-off-by: Ashish Kalra 
---
 arch/x86/include/asm/sev.h|   4 +
 arch/x86/kernel/sev.c | 161 ++
 arch/x86/mm/mem_encrypt_amd.c |   3 +
 3 files changed, 168 insertions(+)

diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
index 7f57382afee4..78d40d08d201 100644
--- a/arch/x86/include/asm/sev.h
+++ b/arch/x86/include/asm/sev.h
@@ -229,6 +229,8 @@ void snp_accept_memory(phys_addr_t start, phys_addr_t end);
 u64 snp_get_unsupported_features(u64 status);
 u64 sev_get_status(void);
 void sev_show_status(void);
+void snp_kexec_unshare_mem(void);
+void snp_kexec_stop_conversion(bool crash);
 #else
 static inline void sev_es_ist_enter(struct pt_regs *regs) { }
 static inline void sev_es_ist_exit(void) { }
@@ -258,6 +260,8 @@ static inline void snp_accept_memory(phys_addr_t start, 
phys_addr_t end) { }
 static inline u64 snp_get_unsupported_features(u64 status) { return 0; }
 static inline u64 sev_get_status(void) { return 0; }
 static inline void sev_show_status(void) { }
+static inline void snp_kexec_unshare_mem(void) { }
+static inline void snp_kexec_stop_conversion(bool crash) { }
 #endif
 
 #ifdef CONFIG_KVM_AMD_SEV
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index 38ad066179d8..17f616963beb 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -42,6 +42,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #define DR7_RESET_VALUE0x400
 
@@ -92,6 +94,9 @@ static struct ghcb *boot_ghcb __section(".data");
 /* Bitmap of SEV features supported by the hypervisor */
 static u64 sev_hv_features __ro_after_init;
 
+/* Last address to be switched to private during kexec */
+static unsigned long kexec_last_addr_to_make_private;
+
 /* #VC handler runtime per-CPU data */
 struct sev_es_runtime_data {
struct ghcb ghcb_page;
@@ -913,6 +918,162 @@ void snp_accept_memory(phys_addr_t start, phys_addr_t end)
set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE);
 }
 
+static bool set_pte_enc(pte_t *kpte, int level, void *va)
+{
+   pte_t new_pte;
+
+   if (pte_none(*kpte))
+   return false;
+
+   /*
+* Change the physical page attribute from C=0 to C=1. Flush the
+* caches to ensure that data gets accessed with the correct C-bit.
+*/
+   if (pte_present(*kpte))
+   clflush_cache_range(va, page_level_size(level));
+
+   new_pte = __pte(cc_mkenc(pte_val(*kpte)));
+   set_pte_atomic(kpte, new_pte);
+
+   return true;
+}
+
+static bool make_pte_private(pte_t *pte, unsigned long addr, int pages, int 
level)
+{
+   struct sev_es_runtime_data *data;
+   struct ghcb *ghcb;
+
+   data = this_cpu_read(runtime_data);
+   ghcb = >ghcb_page;
+
+   /* Check for GHCB for being part of a PMD range. */
+   if ((unsigned long)ghcb >= addr &&
+   (unsigned long)ghcb <= (addr + (pages * PAGE_SIZE))) {
+   /*
+* Ensure that the current cpu's GHCB is made private
+* at the end of unshared loop so that we continue to use the
+* optimized GHCB protocol and not force the switch to
+* MSR protocol till the very end.
+*/
+   pr_debug("setting boot_ghcb to NULL for this cpu ghcb\n");
+   kexec_last_addr_to_make_private = addr;
+   return true;
+   }
+
+   if (!set_pte_enc(pte, level, (void *)addr))
+   return false;
+
+   snp_set_memory_private(addr, pages);
+
+   return true;
+}
+
+static void unshare_all_memory(void)
+{
+   unsigned long addr, end;
+
+   /*
+* Walk direct mapping and convert all shared memory back to private,
+*/
+
+   addr = PAGE_OFFSET;
+   end  = PAGE_OFFSE

[PATCH v4 1/4] efi/x86: skip efi_arch_mem_reserve() in case of kexec.

2024-04-09 Thread Ashish Kalra
From: Ashish Kalra 

For kexec use case, need to use and stick to the EFI memmap passed
from the first kernel via boot-params/setup data, hence,
skip efi_arch_mem_reserve() during kexec.

Additionally during SNP guest kexec testing discovered that EFI memmap
is corrupted during chained kexec. kexec_enter_virtual_mode() during
late init will remap the efi_memmap physical pages allocated in
efi_arch_mem_reserve() via memblock & then subsequently cause random
EFI memmap corruption once memblock is freed/torn down.

Suggested-by: Dave Young 
[Dave Young: checking the md attribute instead of checking the efi_setup]
Signed-off-by: Ashish Kalra 
---
 arch/x86/platform/efi/quirks.c | 20 ++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index f0cc00032751..982f5e50a4b3 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -258,12 +258,28 @@ void __init efi_arch_mem_reserve(phys_addr_t addr, u64 
size)
int num_entries;
void *new;
 
-   if (efi_mem_desc_lookup(addr, ) ||
-   md.type != EFI_BOOT_SERVICES_DATA) {
+   /*
+* For kexec use case, we need to use the EFI memmap passed from the 
first
+* kernel via setup data, so we need to skip this.
+* Additionally kexec_enter_virtual_mode() during late init will remap
+* the efi_memmap physical pages allocated here via memboot & then
+* subsequently cause random EFI memmap corruption once memblock is 
freed.
+*/
+
+   if (efi_mem_desc_lookup(addr, )) {
pr_err("Failed to lookup EFI memory descriptor for %pa\n", 
);
return;
}
 
+   if (md.type != EFI_BOOT_SERVICES_DATA) {
+   pr_err("Skip reserving non EFI Boot Service Data memory for 
%pa\n", );
+   return;
+   }
+
+   /* Kexec copied the efi memmap from the first kernel, thus skip the 
case */
+   if (md.attribute & EFI_MEMORY_RUNTIME)
+   return;
+
if (addr + size > md.phys_addr + (md.num_pages << EFI_PAGE_SHIFT)) {
pr_err("Region spans EFI memory descriptors, %pa\n", );
return;
-- 
2.34.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v4 3/4] x86/boot/compressed: Skip Video Memory access in Decompressor for SEV-ES/SNP.

2024-04-09 Thread Ashish Kalra
From: Ashish Kalra 

Accessing guest video memory/RAM during kernel decompressor
causes guest termination as boot stage2 #VC handler for
SEV-ES/SNP systems does not support MMIO handling.

This issue is observed with SEV-ES/SNP guest kexec as
kexec -c adds screen_info to the boot parameters
passed to the kexec kernel, which causes console output to
be dumped to both video and serial.

As the decompressor output gets cleared really fast, it is
preferable to get the console output only on serial, hence,
skip accessing video RAM during decompressor stage to
prevent guest termination.

Serial console output during decompressor stage works as
boot stage2 #VC handler already supports handling port I/O.

Suggested-by: Thomas Lendacky 
Signed-off-by: Ashish Kalra 
Reviewed-by: Kuppuswamy Sathyanarayanan 

---
 arch/x86/boot/compressed/misc.c | 6 --
 arch/x86/boot/compressed/misc.h | 1 +
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index b70e4a21c15f..47b4db200e1f 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -427,8 +427,10 @@ asmlinkage __visible void *extract_kernel(void *rmode, 
unsigned char *output)
vidport = 0x3d4;
}
 
-   lines = boot_params_ptr->screen_info.orig_video_lines;
-   cols = boot_params_ptr->screen_info.orig_video_cols;
+   if (!sev_es_enabled()) {
+   lines = boot_params_ptr->screen_info.orig_video_lines;
+   cols = boot_params_ptr->screen_info.orig_video_cols;
+   }
 
init_default_io_ops();
 
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index b353a7be380c..3c12ca987554 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -37,6 +37,7 @@
 #include 
 
 #include "tdx.h"
+#include "sev.h"
 
 #define BOOT_CTYPE_H
 #include 
-- 
2.34.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v4 2/4] x86/sev: add sev_es_enabled() function.

2024-04-09 Thread Ashish Kalra
From: Ashish Kalra 

Add sev_es_enabled() function to detect if SEV-ES
support is enabled.

Signed-off-by: Ashish Kalra 
Reviewed-by: Kuppuswamy Sathyanarayanan 

---
 arch/x86/boot/compressed/sev.c | 5 +
 arch/x86/boot/compressed/sev.h | 2 ++
 2 files changed, 7 insertions(+)

diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
index ec71846d28c9..4ae4cc51e6b8 100644
--- a/arch/x86/boot/compressed/sev.c
+++ b/arch/x86/boot/compressed/sev.c
@@ -134,6 +134,11 @@ bool sev_snp_enabled(void)
return sev_status & MSR_AMD64_SEV_SNP_ENABLED;
 }
 
+bool sev_es_enabled(void)
+{
+   return sev_status & MSR_AMD64_SEV_ES_ENABLED;
+}
+
 static void __page_state_change(unsigned long paddr, enum psc_op op)
 {
u64 val;
diff --git a/arch/x86/boot/compressed/sev.h b/arch/x86/boot/compressed/sev.h
index fc725a981b09..5008c80e66e6 100644
--- a/arch/x86/boot/compressed/sev.h
+++ b/arch/x86/boot/compressed/sev.h
@@ -11,11 +11,13 @@
 #ifdef CONFIG_AMD_MEM_ENCRYPT
 
 bool sev_snp_enabled(void);
+bool sev_es_enabled(void);
 void snp_accept_memory(phys_addr_t start, phys_addr_t end);
 
 #else
 
 static inline bool sev_snp_enabled(void) { return false; }
+static inline bool sev_es_enabled(void) { return false; }
 static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { }
 
 #endif
-- 
2.34.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v4 0/4] x86/snp: Add kexec support

2024-04-09 Thread Ashish Kalra
From: Ashish Kalra 

The patchset adds bits and pieces to get kexec (and crashkernel) work on
SNP guest.

v4:
- Rebased to current tip/master.
- Reviewed-bys from Sathya.
- Remove snp_kexec_unprep_rom_memory() as it is not needed any more as 
  SEV-SNP code is not validating the ROM range in probe_roms() anymore.
- Fix kernel test robot build error/warnings.

v3:
- Rebased;
- moved Keep page tables that maps E820_TYPE_ACPI patch to Kirill's tdx
  guest kexec patch series.
- checking the md attribute instead of checking the efi_setup for
  detecting if running under kexec kernel.
- added new sev_es_enabled() function.
- skip video memory access in decompressor for SEV-ES/SNP systems to 
  prevent guest termination as boot stage2 #VC handler does not handle
  MMIO.

v2:
- address zeroing of unaccepted memory table mappings at all page table levels
  adding phys_pte_init(), phys_pud_init() and phys_p4d_init().
- include skip efi_arch_mem_reserve() in case of kexec as part of this 
  patch set.
- rename last_address_shd_kexec to a more appropriate 
  kexec_last_address_to_make_private.
- remove duplicate code shared with TDX and use common interfaces
  defined for SNP and TDX for kexec/kdump.
- remove set_pte_enc() dependency on pg_level_to_pfn() and make the 
  function simpler.
- rename unshare_pte() to make_pte_private().
- clarify and make the comment for using kexec_last_address_to_make_private  
  more understandable.
- general cleanup. 

Ashish Kalra (4):
  efi/x86: skip efi_arch_mem_reserve() in case of kexec.
  x86/sev: add sev_es_enabled() function.
  x86/boot/compressed: Skip Video Memory access in Decompressor for
SEV-ES/SNP.
  x86/snp: Convert shared memory back to private on kexec

 arch/x86/boot/compressed/misc.c |   6 +-
 arch/x86/boot/compressed/misc.h |   1 +
 arch/x86/boot/compressed/sev.c  |   5 +
 arch/x86/boot/compressed/sev.h  |   2 +
 arch/x86/include/asm/sev.h  |   4 +
 arch/x86/kernel/sev.c   | 161 
 arch/x86/mm/mem_encrypt_amd.c   |   3 +
 arch/x86/platform/efi/quirks.c  |  20 +++-
 8 files changed, 198 insertions(+), 4 deletions(-)

-- 
2.34.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v3 4/4] x86/snp: Convert shared memory back to private on kexec

2024-04-04 Thread Ashish Kalra
From: Ashish Kalra 

SNP guests allocate shared buffers to perform I/O. It is done by
allocating pages normally from the buddy allocator and converting them
to shared with set_memory_decrypted().

The second kernel has no idea what memory is converted this way. It only
sees E820_TYPE_RAM.

Accessing shared memory via private mapping will cause unrecoverable RMP
page-faults.

On kexec walk direct mapping and convert all shared memory back to
private. It makes all RAM private again and second kernel may use it
normally. Additionally for SNP guests convert all bss decrypted section
pages back to private and switch back ROM regions to shared so that
their revalidation does not fail during kexec kernel boot.

The conversion occurs in two steps: stopping new conversions and
unsharing all memory. In the case of normal kexec, the stopping of
conversions takes place while scheduling is still functioning. This
allows for waiting until any ongoing conversions are finished. The
second step is carried out when all CPUs except one are inactive and
interrupts are disabled. This prevents any conflicts with code that may
access shared memory.

Signed-off-by: Ashish Kalra 
---
 arch/x86/include/asm/probe_roms.h |   1 +
 arch/x86/include/asm/sev.h|   4 +
 arch/x86/kernel/probe_roms.c  |  16 +++
 arch/x86/kernel/sev.c | 169 ++
 arch/x86/mm/mem_encrypt_amd.c |   3 +
 5 files changed, 193 insertions(+)

diff --git a/arch/x86/include/asm/probe_roms.h 
b/arch/x86/include/asm/probe_roms.h
index 1c7f3815bbd6..d50b67dbff33 100644
--- a/arch/x86/include/asm/probe_roms.h
+++ b/arch/x86/include/asm/probe_roms.h
@@ -6,4 +6,5 @@ struct pci_dev;
 extern void __iomem *pci_map_biosrom(struct pci_dev *pdev);
 extern void pci_unmap_biosrom(void __iomem *rom);
 extern size_t pci_biosrom_size(struct pci_dev *pdev);
+extern void snp_kexec_unprep_rom_memory(void);
 #endif
diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
index 9477b4053bce..51197a544693 100644
--- a/arch/x86/include/asm/sev.h
+++ b/arch/x86/include/asm/sev.h
@@ -230,6 +230,8 @@ u64 snp_get_unsupported_features(u64 status);
 u64 sev_get_status(void);
 void kdump_sev_callback(void);
 void sev_show_status(void);
+void snp_kexec_unshare_mem(void);
+void snp_kexec_stop_conversion(bool crash);
 #else
 static inline void sev_es_ist_enter(struct pt_regs *regs) { }
 static inline void sev_es_ist_exit(void) { }
@@ -260,6 +262,8 @@ static inline u64 snp_get_unsupported_features(u64 status) 
{ return 0; }
 static inline u64 sev_get_status(void) { return 0; }
 static inline void kdump_sev_callback(void) { }
 static inline void sev_show_status(void) { }
+void snp_kexec_unshare_mem(void) {}
+static void snp_kexec_stop_conversion(bool crash) {}
 #endif
 
 #ifdef CONFIG_KVM_AMD_SEV
diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c
index 319fef37d9dc..457f1e5c8d00 100644
--- a/arch/x86/kernel/probe_roms.c
+++ b/arch/x86/kernel/probe_roms.c
@@ -177,6 +177,22 @@ size_t pci_biosrom_size(struct pci_dev *pdev)
 }
 EXPORT_SYMBOL(pci_biosrom_size);
 
+void snp_kexec_unprep_rom_memory(void)
+{
+   unsigned long vaddr, npages, sz;
+
+   /*
+* Switch back ROM regions to shared so that their validation
+* does not fail during kexec kernel boot.
+*/
+   vaddr = (unsigned long)__va(video_rom_resource.start);
+   sz = (system_rom_resource.end + 1) - video_rom_resource.start;
+   npages = PAGE_ALIGN(sz) >> PAGE_SHIFT;
+
+   snp_set_memory_shared(vaddr, npages);
+}
+EXPORT_SYMBOL(snp_kexec_unprep_rom_memory);
+
 #define ROMSIGNATURE 0xaa55
 
 static int __init romsignature(const unsigned char *rom)
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index b59b09c2f284..1395c9f0fae4 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -41,6 +41,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define DR7_RESET_VALUE0x400
 
@@ -91,6 +92,9 @@ static struct ghcb *boot_ghcb __section(".data");
 /* Bitmap of SEV features supported by the hypervisor */
 static u64 sev_hv_features __ro_after_init;
 
+/* Last address to be switched to private during kexec */
+static unsigned long kexec_last_addr_to_make_private;
+
 /* #VC handler runtime per-CPU data */
 struct sev_es_runtime_data {
struct ghcb ghcb_page;
@@ -927,6 +931,171 @@ void snp_accept_memory(phys_addr_t start, phys_addr_t end)
set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE);
 }
 
+static bool set_pte_enc(pte_t *kpte, int level, void *va)
+{
+   pte_t new_pte;
+
+   if (pte_none(*kpte))
+   return false;
+
+   /*
+* Change the physical page attribute from C=0 to C=1. Flush the
+* caches to ensure that data gets accessed with the correct C-bit.
+*/
+   if (pte_present(*kpte))
+   clflush_cache_range(va, page_level_size(level));
+
+   new_pte = __pte(cc_mkenc(pte_val(*kpte)));
+ 

[PATCH v3 3/4] x86/boot/compressed: Skip Video Memory access in Decompressor for SEV-ES/SNP.

2024-04-04 Thread Ashish Kalra
From: Ashish Kalra 

Accessing guest video memory/RAM during kernel decompressor
causes guest termination as boot stage2 #VC handler for
SEV-ES/SNP systems does not support MMIO handling.

This issue is observed with SEV-ES/SNP guest kexec as
kexec -c adds screen_info to the boot parameters
passed to the kexec kernel, which causes console output to
be dumped to both video and serial.

As the decompressor output gets cleared really fast, it is
preferable to get the console output only on serial, hence,
skip accessing video RAM during decompressor stage to
prevent guest termination.

Serial console output during decompressor stage works as
boot stage2 #VC handler already supports handling port I/O.

Suggested-by: Thomas Lendacky 
Signed-off-by: Ashish Kalra 
---
 arch/x86/boot/compressed/misc.c | 6 --
 arch/x86/boot/compressed/misc.h | 1 +
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index b70e4a21c15f..47b4db200e1f 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -427,8 +427,10 @@ asmlinkage __visible void *extract_kernel(void *rmode, 
unsigned char *output)
vidport = 0x3d4;
}
 
-   lines = boot_params_ptr->screen_info.orig_video_lines;
-   cols = boot_params_ptr->screen_info.orig_video_cols;
+   if (!sev_es_enabled()) {
+   lines = boot_params_ptr->screen_info.orig_video_lines;
+   cols = boot_params_ptr->screen_info.orig_video_cols;
+   }
 
init_default_io_ops();
 
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index b353a7be380c..3c12ca987554 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -37,6 +37,7 @@
 #include 
 
 #include "tdx.h"
+#include "sev.h"
 
 #define BOOT_CTYPE_H
 #include 
-- 
2.34.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v3 2/4] x86/sev: add sev_es_enabled() function.

2024-04-04 Thread Ashish Kalra
From: Ashish Kalra 

Add sev_es_enabled() function to detect if SEV-ES
support is enabled.

Signed-off-by: Ashish Kalra 
---
 arch/x86/boot/compressed/sev.c | 5 +
 arch/x86/boot/compressed/sev.h | 2 ++
 2 files changed, 7 insertions(+)

diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
index ec71846d28c9..4ae4cc51e6b8 100644
--- a/arch/x86/boot/compressed/sev.c
+++ b/arch/x86/boot/compressed/sev.c
@@ -134,6 +134,11 @@ bool sev_snp_enabled(void)
return sev_status & MSR_AMD64_SEV_SNP_ENABLED;
 }
 
+bool sev_es_enabled(void)
+{
+   return sev_status & MSR_AMD64_SEV_ES_ENABLED;
+}
+
 static void __page_state_change(unsigned long paddr, enum psc_op op)
 {
u64 val;
diff --git a/arch/x86/boot/compressed/sev.h b/arch/x86/boot/compressed/sev.h
index fc725a981b09..5008c80e66e6 100644
--- a/arch/x86/boot/compressed/sev.h
+++ b/arch/x86/boot/compressed/sev.h
@@ -11,11 +11,13 @@
 #ifdef CONFIG_AMD_MEM_ENCRYPT
 
 bool sev_snp_enabled(void);
+bool sev_es_enabled(void);
 void snp_accept_memory(phys_addr_t start, phys_addr_t end);
 
 #else
 
 static inline bool sev_snp_enabled(void) { return false; }
+static inline bool sev_es_enabled(void) { return false; }
 static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { }
 
 #endif
-- 
2.34.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v3 1/4] efi/x86: skip efi_arch_mem_reserve() in case of kexec.

2024-04-04 Thread Ashish Kalra
From: Ashish Kalra 

For kexec use case, need to use and stick to the EFI memmap passed
from the first kernel via boot-params/setup data, hence,
skip efi_arch_mem_reserve() during kexec.

Additionally during SNP guest kexec testing discovered that EFI memmap
is corrupted during chained kexec. kexec_enter_virtual_mode() during
late init will remap the efi_memmap physical pages allocated in
efi_arch_mem_reserve() via memblock & then subsequently cause random
EFI memmap corruption once memblock is freed/torn down.

Suggested-by: Dave Young 
[Dave Young: checking the md attribute instead of checking the efi_setup]
Signed-off-by: Ashish Kalra 
---
 arch/x86/platform/efi/quirks.c | 23 ---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index f0cc00032751..2b65b3863912 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -255,15 +255,32 @@ void __init efi_arch_mem_reserve(phys_addr_t addr, u64 
size)
struct efi_memory_map_data data = { 0 };
struct efi_mem_range mr;
efi_memory_desc_t md;
-   int num_entries;
+   int num_entries, ret;
void *new;
 
-   if (efi_mem_desc_lookup(addr, ) ||
-   md.type != EFI_BOOT_SERVICES_DATA) {
+   /*
+* For kexec use case, we need to use the EFI memmap passed from the 
first
+* kernel via setup data, so we need to skip this.
+* Additionally kexec_enter_virtual_mode() during late init will remap
+* the efi_memmap physical pages allocated here via memboot & then
+* subsequently cause random EFI memmap corruption once memblock is 
freed.
+*/
+
+   ret = efi_mem_desc_lookup(addr, );
+   if (ret) {
pr_err("Failed to lookup EFI memory descriptor for %pa\n", 
);
return;
}
 
+   if (md.type != EFI_BOOT_SERVICES_DATA) {
+   pr_err("Skip reserving non EFI Boot Service Data memory for 
%pa\n", );
+   return;
+   }
+
+   /* Kexec copied the efi memmap from the first kernel, thus skip the 
case */
+   if (md.attribute & EFI_MEMORY_RUNTIME)
+   return;
+
if (addr + size > md.phys_addr + (md.num_pages << EFI_PAGE_SHIFT)) {
pr_err("Region spans EFI memory descriptors, %pa\n", );
return;
-- 
2.34.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v3 0/4] x86/snp: Add kexec support

2024-04-04 Thread Ashish Kalra
From: Ashish Kalra 

The patchset adds bits and pieces to get kexec (and crashkernel) work on
SNP guest.

v3:
- Rebased;
- moved Keep page tables that maps E820_TYPE_ACPI patch to Kirill's tdx
  guest kexec patch series.
- checking the md attribute instead of checking the efi_setup for
  detecting if running under kexec kernel.
- added new sev_es_enabled() function.
- skip video memory access in decompressor for SEV-ES/SNP systems to 
  prevent guest termination as boot stage2 #VC handler does not handle
  MMIO.

v2:
- address zeroing of unaccepted memory table mappings at all page table levels
  adding phys_pte_init(), phys_pud_init() and phys_p4d_init().
- include skip efi_arch_mem_reserve() in case of kexec as part of this 
  patch set.
- rename last_address_shd_kexec to a more appropriate 
  kexec_last_address_to_make_private.
- remove duplicate code shared with TDX and use common interfaces
  defined for SNP and TDX for kexec/kdump.
- remove set_pte_enc() dependency on pg_level_to_pfn() and make the 
  function simpler.
- rename unshare_pte() to make_pte_private().
- clarify and make the comment for using kexec_last_address_to_make_private  
  more understandable.
- general cleanup. 

Ashish Kalra (4):
  efi/x86: skip efi_arch_mem_reserve() in case of kexec.
  x86/sev: add sev_es_enabled() function.
  x86/boot/compressed: Skip Video Memory access in Decompressor for
SEV-ES/SNP.
  x86/snp: Convert shared memory back to private on kexec

 arch/x86/boot/compressed/misc.c   |   6 +-
 arch/x86/boot/compressed/misc.h   |   1 +
 arch/x86/boot/compressed/sev.c|   5 +
 arch/x86/boot/compressed/sev.h|   2 +
 arch/x86/include/asm/probe_roms.h |   1 +
 arch/x86/include/asm/sev.h|   4 +
 arch/x86/kernel/probe_roms.c  |  16 +++
 arch/x86/kernel/sev.c | 169 ++
 arch/x86/mm/mem_encrypt_amd.c |   3 +
 arch/x86/platform/efi/quirks.c|  23 +++-
 10 files changed, 225 insertions(+), 5 deletions(-)

-- 
2.34.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v2 3/3] x86/snp: Convert shared memory back to private on kexec

2024-03-18 Thread Ashish Kalra
From: Ashish Kalra 

SNP guests allocate shared buffers to perform I/O. It is done by
allocating pages normally from the buddy allocator and converting them
to shared with set_memory_decrypted().

The second kernel has no idea what memory is converted this way. It only
sees E820_TYPE_RAM.

Accessing shared memory via private mapping will cause unrecoverable RMP
page-faults.

On kexec walk direct mapping and convert all shared memory back to
private. It makes all RAM private again and second kernel may use it
normally. Additionally for SNP guests convert all bss decrypted section
pages back to private and switch back ROM regions to shared so that
their revalidation does not fail during kexec kernel boot.

The conversion occurs in two steps: stopping new conversions and
unsharing all memory. In the case of normal kexec, the stopping of
conversions takes place while scheduling is still functioning. This
allows for waiting until any ongoing conversions are finished. The
second step is carried out when all CPUs except one are inactive and
interrupts are disabled. This prevents any conflicts with code that may
access shared memory.

Signed-off-by: Ashish Kalra 
---
 arch/x86/include/asm/probe_roms.h |   1 +
 arch/x86/include/asm/sev.h|   4 +
 arch/x86/kernel/probe_roms.c  |  16 +++
 arch/x86/kernel/sev.c | 169 ++
 arch/x86/mm/mem_encrypt_amd.c |   3 +
 5 files changed, 193 insertions(+)

diff --git a/arch/x86/include/asm/probe_roms.h 
b/arch/x86/include/asm/probe_roms.h
index 1c7f3815bbd6..d50b67dbff33 100644
--- a/arch/x86/include/asm/probe_roms.h
+++ b/arch/x86/include/asm/probe_roms.h
@@ -6,4 +6,5 @@ struct pci_dev;
 extern void __iomem *pci_map_biosrom(struct pci_dev *pdev);
 extern void pci_unmap_biosrom(void __iomem *rom);
 extern size_t pci_biosrom_size(struct pci_dev *pdev);
+extern void snp_kexec_unprep_rom_memory(void);
 #endif
diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
index d7b27cb34c2b..867518b9bcad 100644
--- a/arch/x86/include/asm/sev.h
+++ b/arch/x86/include/asm/sev.h
@@ -229,6 +229,8 @@ void snp_accept_memory(phys_addr_t start, phys_addr_t end);
 u64 snp_get_unsupported_features(u64 status);
 u64 sev_get_status(void);
 void kdump_sev_callback(void);
+void snp_kexec_unshare_mem(void);
+void snp_kexec_stop_conversion(bool crash);
 #else
 static inline void sev_es_ist_enter(struct pt_regs *regs) { }
 static inline void sev_es_ist_exit(void) { }
@@ -258,6 +260,8 @@ static inline void snp_accept_memory(phys_addr_t start, 
phys_addr_t end) { }
 static inline u64 snp_get_unsupported_features(u64 status) { return 0; }
 static inline u64 sev_get_status(void) { return 0; }
 static inline void kdump_sev_callback(void) { }
+void snp_kexec_unshare_mem(void) {}
+static void snp_kexec_stop_conversion(bool crash) {}
 #endif
 
 #ifdef CONFIG_KVM_AMD_SEV
diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c
index 319fef37d9dc..457f1e5c8d00 100644
--- a/arch/x86/kernel/probe_roms.c
+++ b/arch/x86/kernel/probe_roms.c
@@ -177,6 +177,22 @@ size_t pci_biosrom_size(struct pci_dev *pdev)
 }
 EXPORT_SYMBOL(pci_biosrom_size);
 
+void snp_kexec_unprep_rom_memory(void)
+{
+   unsigned long vaddr, npages, sz;
+
+   /*
+* Switch back ROM regions to shared so that their validation
+* does not fail during kexec kernel boot.
+*/
+   vaddr = (unsigned long)__va(video_rom_resource.start);
+   sz = (system_rom_resource.end + 1) - video_rom_resource.start;
+   npages = PAGE_ALIGN(sz) >> PAGE_SHIFT;
+
+   snp_set_memory_shared(vaddr, npages);
+}
+EXPORT_SYMBOL(snp_kexec_unprep_rom_memory);
+
 #define ROMSIGNATURE 0xaa55
 
 static int __init romsignature(const unsigned char *rom)
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index 1ef7ae806a01..7443a9620a31 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -40,6 +40,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define DR7_RESET_VALUE0x400
 
@@ -71,6 +72,9 @@ static struct ghcb *boot_ghcb __section(".data");
 /* Bitmap of SEV features supported by the hypervisor */
 static u64 sev_hv_features __ro_after_init;
 
+/* Last address to be switched to private during kexec */
+static unsigned long kexec_last_addr_to_make_private;
+
 /* #VC handler runtime per-CPU data */
 struct sev_es_runtime_data {
struct ghcb ghcb_page;
@@ -906,6 +910,171 @@ void snp_accept_memory(phys_addr_t start, phys_addr_t end)
set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE);
 }
 
+static bool set_pte_enc(pte_t *kpte, int level, void *va)
+{
+   pte_t new_pte;
+
+   if (pte_none(*kpte))
+   return false;
+
+   /*
+* Change the physical page attribute from C=0 to C=1. Flush the
+* caches to ensure that data gets accessed with the correct C-bit.
+*/
+   if (pte_present(*kpte))
+   clflush_cache_range(va, page_

[PATCH v2 2/3] x86/mm: Do not zap page table entries mapping unaccepted memory table during kdump.

2024-03-18 Thread Ashish Kalra
From: Ashish Kalra 

During crashkernel boot only pre-allocated crash memory is presented as
E820_TYPE_RAM. This can cause page table entries mapping unaccepted memory
table to be zapped during phys_pte_init(), phys_pmd_init(), phys_pud_init()
and phys_p4d_init() as SNP/TDX guest use E820_TYPE_ACPI to store the
unaccepted memory table and pass it between the kernels on
kexec/kdump.

E820_TYPE_ACPI covers not only ACPI data, but also EFI tables and might
be required by kernel to function properly.

The problem was discovered during debugging kdump for SNP guest. The
unaccepted memory table stored with E820_TYPE_ACPI and passed between
the kernels on kdump was getting zapped as the PMD entry mapping this
is above the E820_TYPE_RAM range for the reserved crashkernel memory.

Signed-off-by: Ashish Kalra 
---
 arch/x86/mm/init_64.c | 16 
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index a0dffaca6d2b..cc294a9e9fd7 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -469,7 +469,9 @@ phys_pte_init(pte_t *pte_page, unsigned long paddr, 
unsigned long paddr_end,
!e820__mapped_any(paddr & PAGE_MASK, paddr_next,
 E820_TYPE_RAM) &&
!e820__mapped_any(paddr & PAGE_MASK, paddr_next,
-E820_TYPE_RESERVED_KERN))
+E820_TYPE_RESERVED_KERN) &&
+   !e820__mapped_any(paddr & PAGE_MASK, paddr_next,
+E820_TYPE_ACPI))
set_pte_init(pte, __pte(0), init);
continue;
}
@@ -524,7 +526,9 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, 
unsigned long paddr_end,
!e820__mapped_any(paddr & PMD_MASK, paddr_next,
 E820_TYPE_RAM) &&
!e820__mapped_any(paddr & PMD_MASK, paddr_next,
-E820_TYPE_RESERVED_KERN))
+E820_TYPE_RESERVED_KERN) &&
+   !e820__mapped_any(paddr & PMD_MASK, paddr_next,
+E820_TYPE_ACPI))
set_pmd_init(pmd, __pmd(0), init);
continue;
}
@@ -611,7 +615,9 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, 
unsigned long paddr_end,
!e820__mapped_any(paddr & PUD_MASK, paddr_next,
 E820_TYPE_RAM) &&
!e820__mapped_any(paddr & PUD_MASK, paddr_next,
-E820_TYPE_RESERVED_KERN))
+E820_TYPE_RESERVED_KERN) &&
+   !e820__mapped_any(paddr & PUD_MASK, paddr_next,
+E820_TYPE_ACPI))
set_pud_init(pud, __pud(0), init);
continue;
}
@@ -698,7 +704,9 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, 
unsigned long paddr_end,
!e820__mapped_any(paddr & P4D_MASK, paddr_next,
 E820_TYPE_RAM) &&
!e820__mapped_any(paddr & P4D_MASK, paddr_next,
-E820_TYPE_RESERVED_KERN))
+E820_TYPE_RESERVED_KERN) &&
+   !e820__mapped_any(paddr & P4D_MASK, paddr_next,
+E820_TYPE_ACPI))
set_p4d_init(p4d, __p4d(0), init);
continue;
}
-- 
2.34.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v2 1/3] efi/x86: skip efi_arch_mem_reserve() in case of kexec.

2024-03-18 Thread Ashish Kalra
From: Ashish Kalra 

For kexec use case, need to use and stick to the EFI memmap passed
from the first kernel via boot-params/setup data, hence,
skip efi_arch_mem_reserve() during kexec.

Additionally during SNP guest kexec testing discovered that EFI memmap
is corrupted during chained kexec. kexec_enter_virtual_mode() during
late init will remap the efi_memmap physical pages allocated in
efi_arch_mem_reserve() via memblock & then subsequently cause random
EFI memmap corruption once memblock is freed/torn down.

Signed-off-by: Ashish Kalra 
---
 arch/x86/platform/efi/quirks.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index f0cc00032751..d4562d074371 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -258,6 +258,16 @@ void __init efi_arch_mem_reserve(phys_addr_t addr, u64 
size)
int num_entries;
void *new;
 
+   /*
+* For kexec use case, we need to use the EFI memmap passed from the 
first
+* kernel via setup data, so we need to skip this.
+* Additionally kexec_enter_virtual_mode() during late init will remap
+* the efi_memmap physical pages allocated here via memboot & then
+* subsequently cause random EFI memmap corruption once memblock is 
freed.
+*/
+   if (efi_setup)
+   return;
+
if (efi_mem_desc_lookup(addr, ) ||
md.type != EFI_BOOT_SERVICES_DATA) {
pr_err("Failed to lookup EFI memory descriptor for %pa\n", 
);
-- 
2.34.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v2 0/3] x86/snp: Add kexec support

2024-03-18 Thread Ashish Kalra
From: Ashish Kalra 

The patchset adds bits and pieces to get kexec (and crashkernel) work on
SNP guest.

v2:
- address zeroing of unaccepted memory table mappings at all page table levels
  adding phys_pte_init(), phys_pud_init() and phys_p4d_init().
- include skip efi_arch_mem_reserve() in case of kexec as part of this 
  patch set.
- rename last_address_shd_kexec to a more appropriate 
  kexec_last_address_to_make_private.
- remove duplicate code shared with TDX and use common interfaces
  defined for SNP and TDX for kexec/kdump.
- remove set_pte_enc() dependency on pg_level_to_pfn() and make the 
  function simpler.
- rename unshare_pte() to make_pte_private().
- clarify and make the comment for using kexec_last_address_to_make_private  
  more understandable.
- general cleanup. 

Ashish Kalra (3):
  efi/x86: skip efi_arch_mem_reserve() in case of kexec.
  x86/mm: Do not zap page table entries mapping unaccepted memory table
during kdump.
  x86/snp: Convert shared memory back to private on kexec

 arch/x86/include/asm/probe_roms.h |   1 +
 arch/x86/include/asm/sev.h|   4 +
 arch/x86/kernel/probe_roms.c  |  16 +++
 arch/x86/kernel/sev.c | 169 ++
 arch/x86/mm/init_64.c |  16 ++-
 arch/x86/mm/mem_encrypt_amd.c |   3 +
 arch/x86/platform/efi/quirks.c|  10 ++
 7 files changed, 215 insertions(+), 4 deletions(-)

-- 
2.34.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 2/2] x86/snp: Convert shared memory back to private on kexec

2024-02-19 Thread Ashish Kalra
From: Ashish Kalra 

SNP guests allocate shared buffers to perform I/O. It is done by
allocating pages normally from the buddy allocator and converting them
to shared with set_memory_decrypted().

The second kernel has no idea what memory is converted this way. It only
sees E820_TYPE_RAM.

Accessing shared memory via private mapping will cause unrecoverable RMP
page-faults.

On kexec walk direct mapping and convert all shared memory back to
private. It makes all RAM private again and second kernel may use it
normally. Additionally for SNP guests convert all bss decrypted section
pages back to private and switch back ROM regions to shared so that
their revalidation does not fail during kexec kernel boot.

The conversion occurs in two steps: stopping new conversions and
unsharing all memory. In the case of normal kexec, the stopping of
conversions takes place while scheduling is still functioning. This
allows for waiting until any ongoing conversions are finished. The
second step is carried out when all CPUs except one are inactive and
interrupts are disabled. This prevents any conflicts with code that may
access shared memory.

Signed-off-by: Ashish Kalra 
---
 arch/x86/include/asm/probe_roms.h |   1 +
 arch/x86/include/asm/sev.h|   8 ++
 arch/x86/kernel/probe_roms.c  |  16 +++
 arch/x86/kernel/sev.c | 211 ++
 arch/x86/mm/mem_encrypt_amd.c |  18 ++-
 5 files changed, 253 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/probe_roms.h 
b/arch/x86/include/asm/probe_roms.h
index 1c7f3815bbd6..d50b67dbff33 100644
--- a/arch/x86/include/asm/probe_roms.h
+++ b/arch/x86/include/asm/probe_roms.h
@@ -6,4 +6,5 @@ struct pci_dev;
 extern void __iomem *pci_map_biosrom(struct pci_dev *pdev);
 extern void pci_unmap_biosrom(void __iomem *rom);
 extern size_t pci_biosrom_size(struct pci_dev *pdev);
+extern void snp_kexec_unprep_rom_memory(void);
 #endif
diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
index 5b4a1ce3d368..dd236d7e9407 100644
--- a/arch/x86/include/asm/sev.h
+++ b/arch/x86/include/asm/sev.h
@@ -81,6 +81,10 @@ extern void vc_no_ghcb(void);
 extern void vc_boot_ghcb(void);
 extern bool handle_vc_boot_ghcb(struct pt_regs *regs);
 
+extern atomic_t conversions_in_progress;
+extern bool conversion_allowed;
+extern unsigned long pg_level_to_pfn(int level, pte_t *kpte, pgprot_t 
*ret_prot);
+
 /* PVALIDATE return codes */
 #define PVALIDATE_FAIL_SIZEMISMATCH6
 
@@ -213,6 +217,8 @@ int snp_issue_guest_request(u64 exit_code, struct 
snp_req_data *input, struct sn
 void snp_accept_memory(phys_addr_t start, phys_addr_t end);
 u64 snp_get_unsupported_features(u64 status);
 u64 sev_get_status(void);
+void snp_kexec_unshare_mem(void);
+void snp_kexec_stop_conversion(bool crash);
 #else
 static inline void sev_es_ist_enter(struct pt_regs *regs) { }
 static inline void sev_es_ist_exit(void) { }
@@ -241,6 +247,8 @@ static inline int snp_issue_guest_request(u64 exit_code, 
struct snp_req_data *in
 static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { }
 static inline u64 snp_get_unsupported_features(u64 status) { return 0; }
 static inline u64 sev_get_status(void) { return 0; }
+void snp_kexec_unshare_mem(void) {}
+static void snp_kexec_stop_conversion(bool crash) {}
 #endif
 
 #endif
diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c
index 319fef37d9dc..457f1e5c8d00 100644
--- a/arch/x86/kernel/probe_roms.c
+++ b/arch/x86/kernel/probe_roms.c
@@ -177,6 +177,22 @@ size_t pci_biosrom_size(struct pci_dev *pdev)
 }
 EXPORT_SYMBOL(pci_biosrom_size);
 
+void snp_kexec_unprep_rom_memory(void)
+{
+   unsigned long vaddr, npages, sz;
+
+   /*
+* Switch back ROM regions to shared so that their validation
+* does not fail during kexec kernel boot.
+*/
+   vaddr = (unsigned long)__va(video_rom_resource.start);
+   sz = (system_rom_resource.end + 1) - video_rom_resource.start;
+   npages = PAGE_ALIGN(sz) >> PAGE_SHIFT;
+
+   snp_set_memory_shared(vaddr, npages);
+}
+EXPORT_SYMBOL(snp_kexec_unprep_rom_memory);
+
 #define ROMSIGNATURE 0xaa55
 
 static int __init romsignature(const unsigned char *rom)
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index c67285824e82..765ab83129eb 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -23,6 +23,9 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
 #include 
 
 #include 
@@ -40,6 +43,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define DR7_RESET_VALUE0x400
 
@@ -71,6 +75,13 @@ static struct ghcb *boot_ghcb __section(".data");
 /* Bitmap of SEV features supported by the hypervisor */
 static u64 sev_hv_features __ro_after_init;
 
+/* Last address to be switched to private during kexec */
+static unsigned long last_address_shd_kexec;
+
+static bool crash_requested;
+atomic_t conversions_in_progress;
+bool conversion_allowed = true;
+
 /* #VC

[PATCH 0/2] x86/snp: Add kexec support

2024-02-19 Thread Ashish Kalra
From: Ashish Kalra 

The patchset adds bits and pieces to get kexec (and crashkernel) work on
SNP guest.

This patchset requires [1] for chained guest kexec to work correctly.

[1]: https://lore.kernel.org/lkml/20240219225451.787816-1-ashish.ka...@amd.com/

Ashish Kalra (2):
  x86/mm: Do not zap PMD entry mapping unaccepted memory table during
kdump.
  x86/snp: Convert shared memory back to private on kexec

 arch/x86/include/asm/probe_roms.h |   1 +
 arch/x86/include/asm/sev.h|   8 ++
 arch/x86/kernel/probe_roms.c  |  16 +++
 arch/x86/kernel/sev.c | 211 ++
 arch/x86/mm/init_64.c |   4 +-
 arch/x86/mm/mem_encrypt_amd.c |  18 ++-
 6 files changed, 256 insertions(+), 2 deletions(-)

-- 
2.34.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 1/2] x86/mm: Do not zap PMD entry mapping unaccepted memory table during kdump.

2024-02-19 Thread Ashish Kalra
From: Ashish Kalra 

During crashkernel boot only pre-allocated crash memory is presented as
E820_TYPE_RAM. This can cause PMD entry mapping unaccepted memory table
to be zapped during phys_pmd_init() as SNP/TDX guest use E820_TYPE_ACPI
to store the unaccepted memory table and pass it between the kernels on
kexec/kdump.

E820_TYPE_ACPI covers not only ACPI data, but also EFI tables and might
be required by kernel to function properly.

The problem was discovered during debugging kdump for SNP guest. The
unaccepted memory table stored with E820_TYPE_ACPI and passed between
the kernels on kdump was getting zapped as the PMD entry mapping this
is above the E820_TYPE_RAM range for the reserved crashkernel memory.

Signed-off-by: Ashish Kalra 
---
 arch/x86/mm/init_64.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index a0dffaca6d2b..207c6e0c 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -524,7 +524,9 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, 
unsigned long paddr_end,
!e820__mapped_any(paddr & PMD_MASK, paddr_next,
 E820_TYPE_RAM) &&
!e820__mapped_any(paddr & PMD_MASK, paddr_next,
-E820_TYPE_RESERVED_KERN))
+E820_TYPE_RESERVED_KERN) &&
+   !e820__mapped_any(paddr & PMD_MASK, paddr_next,
+E820_TYPE_ACPI))
set_pmd_init(pmd, __pmd(0), init);
continue;
}
-- 
2.34.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v2] efi/x86: skip efi_arch_mem_reserve() in case of kexec.

2024-02-19 Thread Ashish Kalra
From: Ashish Kalra 

For kexec use case, need to use and stick to the EFI memmap passed
from the first kernel via boot-params/setup data, hence,
skip efi_arch_mem_reserve() during kexec.

Additionally during SNP guest kexec testing discovered that EFI memmap
is corrupted during chained kexec. kexec_enter_virtual_mode() during
late init will remap the efi_memmap physical pages allocated in
efi_arch_mem_reserve() via memblock & then subsequently cause random
EFI memmap corruption once memblock is freed/teared-down.

Signed-off-by: Ashish Kalra 
---
 arch/x86/platform/efi/quirks.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index f0cc00032751..d4562d074371 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -258,6 +258,16 @@ void __init efi_arch_mem_reserve(phys_addr_t addr, u64 
size)
int num_entries;
void *new;
 
+   /*
+* For kexec use case, need to use the EFI memmap passed from the first
+* kernel via boot-params/setup data and need to skip this.
+* Additionally kexec_enter_virtual_mode() during late init will remap
+* the efi_memmap physical pages allocated here via memblock & then
+* subsequently cause random EFI memmap corruption once memblock is 
freed.
+*/
+   if (efi_setup)
+   return;
+
if (efi_mem_desc_lookup(addr, ) ||
md.type != EFI_BOOT_SERVICES_DATA) {
pr_err("Failed to lookup EFI memory descriptor for %pa\n", 
);
-- 
2.34.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH] efi/x86: skip efi_arch_mem_reserve() in case of kexec.

2024-02-19 Thread Ashish Kalra
From: Ashish Kalra 

For kexec use case, need to use and stick to the EFI memmap passed
from the first kernel via boot-params/setup data, hence,
skip efi_arch_mem_reserve() during kexec.

Additionally during SNP guest kexec testing discovered that EFI memmap
is corrupted during chained kexec. kexec_enter_virtual_mode() during
late init will remap the efi_memmap physical pages allocated in
efi_arch_mem_reserve() via memblock & then subsequently cause random
EFI memmap corruption once memblock is freed/torn down.

Signed-off-by: Ashish Kalra 
---
 arch/x86/platform/efi/quirks.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index f0cc00032751..d4562d074371 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -258,6 +258,16 @@ void __init efi_arch_mem_reserve(phys_addr_t addr, u64 
size)
int num_entries;
void *new;
 
+   /*
+* For kexec use case, need to use the EFI memmap passed from the first
+* kernel via boot-params/setup data and need to skip this.
+* Additionally kexec_enter_virtual_mode() during late init will remap
+* the efi_memmap physical pages allocated here via memboot & then
+* subsequently cause random EFI memmap corruption once memblock is 
freed.
+*/
+   if (efi_setup)
+   return;
+
if (efi_mem_desc_lookup(addr, ) ||
md.type != EFI_BOOT_SERVICES_DATA) {
pr_err("Failed to lookup EFI memory descriptor for %pa\n", 
);
-- 
2.34.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v6 5/5] x86/kvm: Add kexec support for SEV Live Migration.

2021-08-24 Thread Ashish Kalra
From: Ashish Kalra 

Reset the host's shared pages list related to kernel
specific page encryption status settings before we load a
new kernel by kexec. We cannot reset the complete
shared pages list here as we need to retain the
UEFI/OVMF firmware specific settings.

The host's shared pages list is maintained for the
guest to keep track of all unencrypted guest memory regions,
therefore we need to explicitly mark all shared pages as
encrypted again before rebooting into the new guest kernel.

Signed-off-by: Ashish Kalra 
Reviewed-by: Steve Rutherford 
---
 arch/x86/kernel/kvm.c | 25 +
 1 file changed, 25 insertions(+)

diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 7d36b98b567d..025d25efd7e6 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -869,10 +869,35 @@ static void __init kvm_init_platform(void)
if (sev_active() &&
kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) {
unsigned long nr_pages;
+   int i;
 
pv_ops.mmu.notify_page_enc_status_changed =
kvm_sev_hc_page_enc_status;
 
+   /*
+* Reset the host's shared pages list related to kernel
+* specific page encryption status settings before we load a
+* new kernel by kexec. Reset the page encryption status
+* during early boot intead of just before kexec to avoid SMP
+* races during kvm_pv_guest_cpu_reboot().
+* NOTE: We cannot reset the complete shared pages list
+* here as we need to retain the UEFI/OVMF firmware
+* specific settings.
+*/
+
+   for (i = 0; i < e820_table->nr_entries; i++) {
+   struct e820_entry *entry = _table->entries[i];
+
+   if (entry->type != E820_TYPE_RAM)
+   continue;
+
+   nr_pages = DIV_ROUND_UP(entry->size, PAGE_SIZE);
+
+   kvm_sev_hypercall3(KVM_HC_MAP_GPA_RANGE, entry->addr,
+  nr_pages,
+  KVM_MAP_GPA_RANGE_ENCRYPTED | 
KVM_MAP_GPA_RANGE_PAGE_SZ_4K);
+   }
+
/*
 * Ensure that _bss_decrypted section is marked as decrypted in 
the
 * shared pages list.
-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH v13 12/12] x86/kvm: Add guest support for detecting and enabling SEV Live Migration feature.

2021-04-21 Thread Ashish Kalra
To reiterate, in addition to KVM_FEATURE_HC_PAGE_ENC_STATUS, we also need 
to add the new KVM_FEATURE_SEV_LIVE_MIGRATION feature for guest to check
for host-side support for SEV live migration. 

Or will the guest now check KVM_FEATURE_HC_PAGE_ENC_STATUS in CPUID and
then accordingly set bit0 in MSR_KVM_MIGRATION_CONTROL to enable SEV
live migration ?

Thanks,
Ashish

On Wed, Apr 21, 2021 at 06:48:32PM +, Ashish Kalra wrote:
> Hello Paolo,
> 
> The earlier patch#10 of SEV live migration patches which is now part of
> the guest interface patches used to define
> KVM_FEATURE_SEV_LIVE_MIGRATION. 
> 
> So now, will the guest patches need to define this feature ?
> 
> Thanks,
> Ashish
> 
> On Wed, Apr 21, 2021 at 05:38:45PM +0200, Paolo Bonzini wrote:
> > On 21/04/21 16:44, Borislav Petkov wrote:
> > > On Thu, Apr 15, 2021 at 04:01:16PM +0000, Ashish Kalra wrote:
> > > > From: Ashish Kalra 
> > > > 
> > > > The guest support for detecting and enabling SEV Live migration
> > > > feature uses the following logic :
> > > > 
> > > >   - kvm_init_plaform() invokes check_kvm_sev_migration() which
> > > > checks if its booted under the EFI
> > > > 
> > > > - If not EFI,
> > > > 
> > > >   i) check for the KVM_FEATURE_CPUID
> > > 
> > > Where do you do that?
> > > 
> > > $ git grep KVM_FEATURE_CPUID
> > > $
> > > 
> > > Do you mean
> > > 
> > >   kvm_para_has_feature(KVM_FEATURE_SEV_LIVE_MIGRATION)
> > > 
> > > per chance?
> > 
> > Yep.  Or KVM_CPUID_FEATURES perhaps.
> > 
> > > 
> > > > diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
> > > > index 78bb0fae3982..94ef16d263a7 100644
> > > > --- a/arch/x86/kernel/kvm.c
> > > > +++ b/arch/x86/kernel/kvm.c
> > > > @@ -26,6 +26,7 @@
> > > >   #include 
> > > >   #include 
> > > >   #include 
> > > > +#include 
> > > >   #include 
> > > >   #include 
> > > >   #include 
> > > > @@ -429,6 +430,59 @@ static inline void __set_percpu_decrypted(void 
> > > > *ptr, unsigned long size)
> > > > early_set_memory_decrypted((unsigned long) ptr, size);
> > > >   }
> > > > +static int __init setup_kvm_sev_migration(void)
> > > 
> > > kvm_init_sev_migration() or so.
> > > 
> > > ...
> > > 
> > > > @@ -48,6 +50,8 @@ EXPORT_SYMBOL_GPL(sev_enable_key);
> > > >   bool sev_enabled __section(".data");
> > > > +bool sev_live_migration_enabled __section(".data");
> > > 
> > > Pls add a function called something like:
> > > 
> > > bool sev_feature_enabled(enum sev_feature)
> > > 
> > > and gets SEV_FEATURE_LIVE_MIGRATION and then use it instead of adding
> > > yet another boolean which contains whether some aspect of SEV has been
> > > enabled or not.
> > > 
> > > Then add a
> > > 
> > > static enum sev_feature sev_features;
> > > 
> > > in mem_encrypt.c and that function above will query that sev_features
> > > enum for set flags.
> > 
> > Even better: let's stop callings things SEV/SEV_ES.  Long term we want
> > anyway to use things like mem_encrypt_enabled (SEV),
> > guest_instruction_trap_enabled (SEV/ES), etc.
> > 
> > For this one we don't need a bool at all, we can simply check whether the
> > pvop points to paravirt_nop.  Also keep everything but the BSS handling in
> > arch/x86/kernel/kvm.c.  Only the BSS handling should be in
> > arch/x86/mm/mem_encrypt.c.  This way all KVM paravirt hypercalls and MSRs
> > are in kvm.c.
> > 
> > That is:
> > 
> > void kvm_init_platform(void)
> > {
> > if (sev_active() &&
> > kvm_para_has_feature(KVM_FEATURE_SEV_LIVE_MIGRATION)) {
> > pv_ops.mmu.notify_page_enc_status_changed =
> > kvm_sev_hc_page_enc_status;
> > /* this takes care of bss_decrypted */
> > early_set_page_enc_status();
> > if (!efi_enabled(EFI_BOOT))
> > wrmsrl(MSR_KVM_SEV_LIVE_MIGRATION,
> >KVM_SEV_LIVE_MIGRATION_ENABLED);
> > }
> > /* existing kvm_init_platform code goes here */
> > }
> > 
> > // the pvop is changed to take the pfn, so that the vaddr loop
> > // is not KVM specific
> 

Re: [PATCH v13 12/12] x86/kvm: Add guest support for detecting and enabling SEV Live Migration feature.

2021-04-21 Thread Ashish Kalra
Hello Paolo,

The earlier patch#10 of SEV live migration patches which is now part of
the guest interface patches used to define
KVM_FEATURE_SEV_LIVE_MIGRATION. 

So now, will the guest patches need to define this feature ?

Thanks,
Ashish

On Wed, Apr 21, 2021 at 05:38:45PM +0200, Paolo Bonzini wrote:
> On 21/04/21 16:44, Borislav Petkov wrote:
> > On Thu, Apr 15, 2021 at 04:01:16PM +0000, Ashish Kalra wrote:
> > > From: Ashish Kalra 
> > > 
> > > The guest support for detecting and enabling SEV Live migration
> > > feature uses the following logic :
> > > 
> > >   - kvm_init_plaform() invokes check_kvm_sev_migration() which
> > > checks if its booted under the EFI
> > > 
> > > - If not EFI,
> > > 
> > >   i) check for the KVM_FEATURE_CPUID
> > 
> > Where do you do that?
> > 
> > $ git grep KVM_FEATURE_CPUID
> > $
> > 
> > Do you mean
> > 
> > kvm_para_has_feature(KVM_FEATURE_SEV_LIVE_MIGRATION)
> > 
> > per chance?
> 
> Yep.  Or KVM_CPUID_FEATURES perhaps.
> 
> > 
> > > diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
> > > index 78bb0fae3982..94ef16d263a7 100644
> > > --- a/arch/x86/kernel/kvm.c
> > > +++ b/arch/x86/kernel/kvm.c
> > > @@ -26,6 +26,7 @@
> > >   #include 
> > >   #include 
> > >   #include 
> > > +#include 
> > >   #include 
> > >   #include 
> > >   #include 
> > > @@ -429,6 +430,59 @@ static inline void __set_percpu_decrypted(void *ptr, 
> > > unsigned long size)
> > >   early_set_memory_decrypted((unsigned long) ptr, size);
> > >   }
> > > +static int __init setup_kvm_sev_migration(void)
> > 
> > kvm_init_sev_migration() or so.
> > 
> > ...
> > 
> > > @@ -48,6 +50,8 @@ EXPORT_SYMBOL_GPL(sev_enable_key);
> > >   bool sev_enabled __section(".data");
> > > +bool sev_live_migration_enabled __section(".data");
> > 
> > Pls add a function called something like:
> > 
> > bool sev_feature_enabled(enum sev_feature)
> > 
> > and gets SEV_FEATURE_LIVE_MIGRATION and then use it instead of adding
> > yet another boolean which contains whether some aspect of SEV has been
> > enabled or not.
> > 
> > Then add a
> > 
> > static enum sev_feature sev_features;
> > 
> > in mem_encrypt.c and that function above will query that sev_features
> > enum for set flags.
> 
> Even better: let's stop calling things SEV/SEV_ES.  Long term we want
> anyway to use things like mem_encrypt_enabled (SEV),
> guest_instruction_trap_enabled (SEV/ES), etc.
> 
> For this one we don't need a bool at all, we can simply check whether the
> pvop points to paravirt_nop.  Also keep everything but the BSS handling in
> arch/x86/kernel/kvm.c.  Only the BSS handling should be in
> arch/x86/mm/mem_encrypt.c.  This way all KVM paravirt hypercalls and MSRs
> are in kvm.c.
> 
> That is:
> 
> void kvm_init_platform(void)
> {
>   if (sev_active() &&
>   kvm_para_has_feature(KVM_FEATURE_SEV_LIVE_MIGRATION)) {
>   pv_ops.mmu.notify_page_enc_status_changed =
>   kvm_sev_hc_page_enc_status;
>   /* this takes care of bss_decrypted */
>   early_set_page_enc_status();
>   if (!efi_enabled(EFI_BOOT))
>   wrmsrl(MSR_KVM_SEV_LIVE_MIGRATION,
>  KVM_SEV_LIVE_MIGRATION_ENABLED);
>   }
>   /* existing kvm_init_platform code goes here */
> }
> 
> // the pvop is changed to take the pfn, so that the vaddr loop
> // is not KVM specific
> static inline void notify_page_enc_status_changed(unsigned long pfn,
>   int npages, bool enc)
> {
>   PVOP_VCALL3(mmu.page_encryption_changed, pfn, npages, enc);
> }
> 
> static void notify_addr_enc_status_changed(unsigned long addr,
>  int numpages, bool enc)
> {
> #ifdef CONFIG_PARAVIRT
>   if (pv_ops.mmu.notify_page_enc_status_changed == paravirt_nop)
>   return;
> 
>   /* the body of set_memory_enc_dec_hypercall goes here */
>   for (; vaddr < vaddr_end; vaddr = vaddr_next) {
>   ...
>   notify_page_enc_status_changed(pfn, psize >> PAGE_SHIFT,
>  enc);
>   vaddr_next = (vaddr & pmask) + psize;
>   }
> #endif
> }
> 
> static int __set_memory_enc_dec(unsigned long addr,
>  

Re: [PATCH v13 12/12] x86/kvm: Add guest support for detecting and enabling SEV Live Migration feature.

2021-04-21 Thread Ashish Kalra
On Wed, Apr 21, 2021 at 04:44:02PM +0200, Borislav Petkov wrote:
> On Thu, Apr 15, 2021 at 04:01:16PM +0000, Ashish Kalra wrote:
> > From: Ashish Kalra 
> > 
> > The guest support for detecting and enabling SEV Live migration
> > feature uses the following logic :
> > 
> >  - kvm_init_platform() invokes check_kvm_sev_migration() which
> >checks if it's booted under the EFI
> > 
> >- If not EFI,
> > 
> >  i) check for the KVM_FEATURE_CPUID
> 
> Where do you do that?
> 
> $ git grep KVM_FEATURE_CPUID
> $
> 
> Do you mean
> 
>   kvm_para_has_feature(KVM_FEATURE_SEV_LIVE_MIGRATION)
> 
> per chance?
> 

Yes, the above mentions to get KVM_FEATURE_CPUID and then check if live
migration feature is supported, i.e.,
kvm_para_has_feature(KVM_FEATURE_SEV_LIVE_MIGRATION). The above comments
are written more generically.

> > diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
> > index 78bb0fae3982..94ef16d263a7 100644
> > --- a/arch/x86/kernel/kvm.c
> > +++ b/arch/x86/kernel/kvm.c
> > @@ -26,6 +26,7 @@
> >  #include 
> >  #include 
> >  #include 
> > +#include 
> >  #include 
> >  #include 
> >  #include 
> > @@ -429,6 +430,59 @@ static inline void __set_percpu_decrypted(void *ptr, 
> > unsigned long size)
> > early_set_memory_decrypted((unsigned long) ptr, size);
> >  }
> >  
> > +static int __init setup_kvm_sev_migration(void)
> 
> kvm_init_sev_migration() or so.
> 
> ...
> 
> > @@ -48,6 +50,8 @@ EXPORT_SYMBOL_GPL(sev_enable_key);
> >  
> >  bool sev_enabled __section(".data");
> >  
> > +bool sev_live_migration_enabled __section(".data");
> 
> Pls add a function called something like:
> 
> bool sev_feature_enabled(enum sev_feature)
> 
> and gets SEV_FEATURE_LIVE_MIGRATION and then use it instead of adding
> yet another boolean which contains whether some aspect of SEV has been
> enabled or not.
> 
> Then add a
> 
> static enum sev_feature sev_features;
> 
> in mem_encrypt.c and that function above will query that sev_features
> enum for set flags.
> 
> Then, if you feel bored, you could convert sme_active, sev_active,
> sev_es_active, mem_encrypt_active and whetever else code needs to query
> any aspect of SEV being enabled or not, to that function.
> 

Ok.

> > +void __init check_kvm_sev_migration(void)
> > +{
> > +   if (sev_active() &&
> > +   kvm_para_has_feature(KVM_FEATURE_SEV_LIVE_MIGRATION)) {
> 
> Save an indentation level:
> 
>   if (!sev_active() ||
>   !kvm_para_has_feature(KVM_FEATURE_SEV_LIVE_MIGRATION))
>   return;
> 
> > +   unsigned long nr_pages;
> > +   int i;
> > +
> > +   pr_info("KVM enable live migration\n");
> 
> That should be at the end of the function and say:
> 
>   pr_info("KVM live migration enabled.\n");
> 
> > +   WRITE_ONCE(sev_live_migration_enabled, true);
> 
> Why WRITE_ONCE?
> 

Just to ensure that the sev_live_migration_enabled is set to TRUE before
it is used immediately next in the function.

Thanks,
Ashish

> And that needs to go to the end of the function too.
> 
> Thx.
> 
> -- 
> Regards/Gruss,
> Boris.
> 
> https://people.kernel.org/tglx/notes-about-netiquette

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v13 12/12] x86/kvm: Add guest support for detecting and enabling SEV Live Migration feature.

2021-04-15 Thread Ashish Kalra
From: Ashish Kalra 

The guest support for detecting and enabling SEV Live migration
feature uses the following logic :

 - kvm_init_platform() invokes check_kvm_sev_migration() which
   checks if it's booted under the EFI

   - If not EFI,

 i) check for the KVM_FEATURE_CPUID

 ii) if CPUID reports that migration is supported, issue a wrmsrl()
 to enable the SEV live migration support

   - If EFI,

 i) check for the KVM_FEATURE_CPUID

 ii) If CPUID reports that migration is supported, read the UEFI variable 
which
 indicates OVMF support for live migration

 iii) the variable indicates live migration is supported, issue a wrmsrl() 
to
  enable the SEV live migration support

The EFI live migration check is done using a late_initcall() callback.

Also, ensure that _bss_decrypted section is marked as decrypted in the
shared pages list.

Also adds kexec support for SEV Live Migration.

Reset the host's shared pages list related to kernel
specific page encryption status settings before we load a
new kernel by kexec. We cannot reset the complete
shared pages list here as we need to retain the
UEFI/OVMF firmware specific settings.

The host's shared pages list is maintained for the
guest to keep track of all unencrypted guest memory regions,
therefore we need to explicitly mark all shared pages as
encrypted again before rebooting into the new guest kernel.

Signed-off-by: Ashish Kalra 
---
 arch/x86/include/asm/mem_encrypt.h |  8 
 arch/x86/kernel/kvm.c  | 55 +
 arch/x86/mm/mem_encrypt.c  | 64 ++
 3 files changed, 127 insertions(+)

diff --git a/arch/x86/include/asm/mem_encrypt.h 
b/arch/x86/include/asm/mem_encrypt.h
index 31c4df123aa0..19b77f3a62dc 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -21,6 +21,7 @@
 extern u64 sme_me_mask;
 extern u64 sev_status;
 extern bool sev_enabled;
+extern bool sev_live_migration_enabled;
 
 void sme_encrypt_execute(unsigned long encrypted_kernel_vaddr,
 unsigned long decrypted_kernel_vaddr,
@@ -44,8 +45,11 @@ void __init sme_enable(struct boot_params *bp);
 
 int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size);
 int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size);
+void __init early_set_mem_enc_dec_hypercall(unsigned long vaddr, int npages,
+   bool enc);
 
 void __init mem_encrypt_free_decrypted_mem(void);
+void __init check_kvm_sev_migration(void);
 
 /* Architecture __weak replacement functions */
 void __init mem_encrypt_init(void);
@@ -60,6 +64,7 @@ bool sev_es_active(void);
 #else  /* !CONFIG_AMD_MEM_ENCRYPT */
 
 #define sme_me_mask0ULL
+#define sev_live_migration_enabled false
 
 static inline void __init sme_early_encrypt(resource_size_t paddr,
unsigned long size) { }
@@ -84,8 +89,11 @@ static inline int __init
 early_set_memory_decrypted(unsigned long vaddr, unsigned long size) { return 
0; }
 static inline int __init
 early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 
0; }
+static inline void __init
+early_set_mem_enc_dec_hypercall(unsigned long vaddr, int npages, bool enc) {}
 
 static inline void mem_encrypt_free_decrypted_mem(void) { }
+static inline void check_kvm_sev_migration(void) { }
 
 #define __bss_decrypted
 
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 78bb0fae3982..94ef16d263a7 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -26,6 +26,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -429,6 +430,59 @@ static inline void __set_percpu_decrypted(void *ptr, 
unsigned long size)
early_set_memory_decrypted((unsigned long) ptr, size);
 }
 
+static int __init setup_kvm_sev_migration(void)
+{
+   efi_char16_t efi_sev_live_migration_enabled[] = 
L"SevLiveMigrationEnabled";
+   efi_guid_t efi_variable_guid = MEM_ENCRYPT_GUID;
+   efi_status_t status;
+   unsigned long size;
+   bool enabled;
+
+   /*
+* check_kvm_sev_migration() invoked via kvm_init_platform() before
+* this callback would have setup the indicator that live migration
+* feature is supported/enabled.
+*/
+   if (!sev_live_migration_enabled)
+   return 0;
+
+   if (!efi_enabled(EFI_BOOT))
+   return 0;
+
+   if (!efi_enabled(EFI_RUNTIME_SERVICES)) {
+   pr_info("%s : EFI runtime services are not enabled\n", 
__func__);
+   return 0;
+   }
+
+   size = sizeof(enabled);
+
+   /* Get variable contents into buffer */
+   status = efi.get_variable(efi_sev_live_migration_enabled,
+ &efi_variable_guid, NULL, &size, &enabled);
+
+   if (status == EFI_NOT_FOUND) {
+   pr_info("%s : EF

Re: [PATCH v12 13/13] x86/kvm: Add kexec support for SEV Live Migration.

2021-04-13 Thread Ashish Kalra
On Mon, Apr 12, 2021 at 07:25:03PM -0700, Steve Rutherford wrote:
> On Mon, Apr 12, 2021 at 6:48 PM Ashish Kalra  wrote:
> >
> > On Mon, Apr 12, 2021 at 06:23:32PM -0700, Steve Rutherford wrote:
> > > On Mon, Apr 12, 2021 at 5:22 PM Steve Rutherford  
> > > wrote:
> > > >
> > > > On Mon, Apr 12, 2021 at 12:48 PM Ashish Kalra  
> > > > wrote:
> > > > >
> > > > > From: Ashish Kalra 
> > > > >
> > > > > Reset the host's shared pages list related to kernel
> > > > > specific page encryption status settings before we load a
> > > > > new kernel by kexec. We cannot reset the complete
> > > > > shared pages list here as we need to retain the
> > > > > UEFI/OVMF firmware specific settings.
> > > > >
> > > > > The host's shared pages list is maintained for the
> > > > > guest to keep track of all unencrypted guest memory regions,
> > > > > therefore we need to explicitly mark all shared pages as
> > > > > encrypted again before rebooting into the new guest kernel.
> > > > >
> > > > > Signed-off-by: Ashish Kalra 
> > > > > ---
> > > > >  arch/x86/kernel/kvm.c | 24 
> > > > >  1 file changed, 24 insertions(+)
> > > > >
> > > > > diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
> > > > > index bcc82e0c9779..4ad3ed547ff1 100644
> > > > > --- a/arch/x86/kernel/kvm.c
> > > > > +++ b/arch/x86/kernel/kvm.c
> > > > > @@ -39,6 +39,7 @@
> > > > >  #include 
> > > > >  #include 
> > > > >  #include 
> > > > > +#include 
> > > > >
> > > > >  DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled);
> > > > >
> > > > > @@ -384,6 +385,29 @@ static void kvm_pv_guest_cpu_reboot(void *unused)
> > > > >  */
> > > > > if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
> > > > > wrmsrl(MSR_KVM_PV_EOI_EN, 0);
> > > > > +   /*
> > > > > +* Reset the host's shared pages list related to kernel
> > > > > +* specific page encryption status settings before we load a
> > > > > +* new kernel by kexec. NOTE: We cannot reset the complete
> > > > > +* shared pages list here as we need to retain the
> > > > > +* UEFI/OVMF firmware specific settings.
> > > > > +*/
> > > > > +   if (sev_live_migration_enabled & (smp_processor_id() == 0)) {
> > > > What happens if the reboot of CPU0 races with another CPU servicing a
> > > > device request (while the reboot is pending for that CPU)?
> > > > Seems like you could run into a scenario where you have hypercalls 
> > > > racing.
> > > >
> > > > Calling this on every core isn't free, but it is an easy way to avoid 
> > > > this race.
> > > > You could also count cores, and have only last core do the job, but
> > > > that seems more complicated.
> > > On second thought, I think this may be insufficient as a fix, since my
> > > read of kernel/reboot.c seems to imply that devices aren't shutdown
> > > until after these notifiers occur. As such, a single thread might be
> > > able to race with itself. I could be wrong here though.
> > >
> > > The heavy hammer would be to disable migration through the MSR (which
> > > the subsequent boot will re-enable).
> > >
> > > I'm curious if there is a less "blocking" way of handling kexecs (that
> > > strategy would block LM while the guest booted).
> > >
> > > One option that comes to mind would be for the guest to "mute" the
> > > encryption status hypercall after the call to reset the encryption
> > > status. The problem would be that the encryption status for pages
> > > would be very temporarily inaccurate in the window between that call
> > > and the start of the next boot. That isn't ideal, but, on the other
> > > hand, the VM was about to reboot anyway, so a corrupted shared page
> > > for device communication probably isn't super important. Still, I'm
> > > not really a fan of that. This would avoid corrupting the next boot,
> > > which is clearly an improvement.
> > >
> > > Each time the kernel boots it could also choose something 

Re: [PATCH v12 13/13] x86/kvm: Add kexec support for SEV Live Migration.

2021-04-12 Thread Ashish Kalra
On Mon, Apr 12, 2021 at 06:23:32PM -0700, Steve Rutherford wrote:
> On Mon, Apr 12, 2021 at 5:22 PM Steve Rutherford  
> wrote:
> >
> > On Mon, Apr 12, 2021 at 12:48 PM Ashish Kalra  wrote:
> > >
> > > From: Ashish Kalra 
> > >
> > > Reset the host's shared pages list related to kernel
> > > specific page encryption status settings before we load a
> > > new kernel by kexec. We cannot reset the complete
> > > shared pages list here as we need to retain the
> > > UEFI/OVMF firmware specific settings.
> > >
> > > The host's shared pages list is maintained for the
> > > guest to keep track of all unencrypted guest memory regions,
> > > therefore we need to explicitly mark all shared pages as
> > > encrypted again before rebooting into the new guest kernel.
> > >
> > > Signed-off-by: Ashish Kalra 
> > > ---
> > >  arch/x86/kernel/kvm.c | 24 
> > >  1 file changed, 24 insertions(+)
> > >
> > > diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
> > > index bcc82e0c9779..4ad3ed547ff1 100644
> > > --- a/arch/x86/kernel/kvm.c
> > > +++ b/arch/x86/kernel/kvm.c
> > > @@ -39,6 +39,7 @@
> > >  #include 
> > >  #include 
> > >  #include 
> > > +#include 
> > >
> > >  DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled);
> > >
> > > @@ -384,6 +385,29 @@ static void kvm_pv_guest_cpu_reboot(void *unused)
> > >  */
> > > if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
> > > wrmsrl(MSR_KVM_PV_EOI_EN, 0);
> > > +   /*
> > > +* Reset the host's shared pages list related to kernel
> > > +* specific page encryption status settings before we load a
> > > +* new kernel by kexec. NOTE: We cannot reset the complete
> > > +* shared pages list here as we need to retain the
> > > +* UEFI/OVMF firmware specific settings.
> > > +*/
> > > +   if (sev_live_migration_enabled & (smp_processor_id() == 0)) {
> > What happens if the reboot of CPU0 races with another CPU servicing a
> > device request (while the reboot is pending for that CPU)?
> > Seems like you could run into a scenario where you have hypercalls racing.
> >
> > Calling this on every core isn't free, but it is an easy way to avoid this 
> > race.
> > You could also count cores, and have only last core do the job, but
> > that seems more complicated.
> On second thought, I think this may be insufficient as a fix, since my
> read of kernel/reboot.c seems to imply that devices aren't shutdown
> until after these notifiers occur. As such, a single thread might be
> able to race with itself. I could be wrong here though.
> 
> The heavy hammer would be to disable migration through the MSR (which
> the subsequent boot will re-enable).
> 
> I'm curious if there is a less "blocking" way of handling kexecs (that
> strategy would block LM while the guest booted).
> 
> One option that comes to mind would be for the guest to "mute" the
> encryption status hypercall after the call to reset the encryption
> status. The problem would be that the encryption status for pages
> would be very temporarily inaccurate in the window between that call
> and the start of the next boot. That isn't ideal, but, on the other
> hand, the VM was about to reboot anyway, so a corrupted shared page
> for device communication probably isn't super important. Still, I'm
> not really a fan of that. This would avoid corrupting the next boot,
> which is clearly an improvement.
> 
> Each time the kernel boots it could also choose something like a
> generation ID, and pass that down each time it calls the hypercall.
> This would then let userspace identify which requests were coming from
> the subsequent boot.
> 
> Everything here (except, perhaps, disabling migration through the MSR)
> seems kind of complicated. I somewhat hope my interpretation of
> kernel/reboot.c is wrong and this race just is not possible in the
> first place.
> 

Disabling migration through the MSR after resetting the page encryption
status is a reasonable approach. There is a similar window existing for
normal VM boot during which LM is disabled, from the point where OVMF
checks and adds support for SEV LM and the kernel boot checks for the
same and enables LM using the MSR. 

Thanks,
Ashish

> > > +   int i;
> > > +   unsigned long nr_pages;
> > > +
> > > +   for (i = 0; i < e820_table->n

[PATCH v12 13/13] x86/kvm: Add kexec support for SEV Live Migration.

2021-04-12 Thread Ashish Kalra
From: Ashish Kalra 

Reset the host's shared pages list related to kernel
specific page encryption status settings before we load a
new kernel by kexec. We cannot reset the complete
shared pages list here as we need to retain the
UEFI/OVMF firmware specific settings.

The host's shared pages list is maintained for the
guest to keep track of all unencrypted guest memory regions,
therefore we need to explicitly mark all shared pages as
encrypted again before rebooting into the new guest kernel.

Signed-off-by: Ashish Kalra 
---
 arch/x86/kernel/kvm.c | 24 
 1 file changed, 24 insertions(+)

diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index bcc82e0c9779..4ad3ed547ff1 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -39,6 +39,7 @@
 #include 
 #include 
 #include 
+#include 
 
 DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled);
 
@@ -384,6 +385,29 @@ static void kvm_pv_guest_cpu_reboot(void *unused)
 */
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
wrmsrl(MSR_KVM_PV_EOI_EN, 0);
+   /*
+* Reset the host's shared pages list related to kernel
+* specific page encryption status settings before we load a
+* new kernel by kexec. NOTE: We cannot reset the complete
+* shared pages list here as we need to retain the
+* UEFI/OVMF firmware specific settings.
+*/
+   if (sev_live_migration_enabled & (smp_processor_id() == 0)) {
+   int i;
+   unsigned long nr_pages;
+
+   for (i = 0; i < e820_table->nr_entries; i++) {
+   struct e820_entry *entry = &e820_table->entries[i];
+
+   if (entry->type != E820_TYPE_RAM)
+   continue;
+
+   nr_pages = DIV_ROUND_UP(entry->size, PAGE_SIZE);
+
+   kvm_sev_hypercall3(KVM_HC_PAGE_ENC_STATUS,
+  entry->addr, nr_pages, 1);
+   }
+   }
kvm_pv_disable_apf();
kvm_disable_steal_time();
 }
-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v11 13/13] x86/kvm: Add kexec support for SEV Live Migration.

2021-04-05 Thread Ashish Kalra
From: Ashish Kalra 

Reset the host's shared pages list related to kernel
specific page encryption status settings before we load a
new kernel by kexec. We cannot reset the complete
shared pages list here as we need to retain the
UEFI/OVMF firmware specific settings.

The host's shared pages list is maintained for the
guest to keep track of all unencrypted guest memory regions,
therefore we need to explicitly mark all shared pages as
encrypted again before rebooting into the new guest kernel.

Signed-off-by: Ashish Kalra 
---
 arch/x86/kernel/kvm.c | 24 
 1 file changed, 24 insertions(+)

diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index bcc82e0c9779..4ad3ed547ff1 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -39,6 +39,7 @@
 #include 
 #include 
 #include 
+#include 
 
 DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled);
 
@@ -384,6 +385,29 @@ static void kvm_pv_guest_cpu_reboot(void *unused)
 */
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
wrmsrl(MSR_KVM_PV_EOI_EN, 0);
+   /*
+* Reset the host's shared pages list related to kernel
+* specific page encryption status settings before we load a
+* new kernel by kexec. NOTE: We cannot reset the complete
+* shared pages list here as we need to retain the
+* UEFI/OVMF firmware specific settings.
+*/
+   if (sev_live_migration_enabled & (smp_processor_id() == 0)) {
+   int i;
+   unsigned long nr_pages;
+
+   for (i = 0; i < e820_table->nr_entries; i++) {
+   struct e820_entry *entry = &e820_table->entries[i];
+
+   if (entry->type != E820_TYPE_RAM)
+   continue;
+
+   nr_pages = DIV_ROUND_UP(entry->size, PAGE_SIZE);
+
+   kvm_sev_hypercall3(KVM_HC_PAGE_ENC_STATUS,
+  entry->addr, nr_pages, 1);
+   }
+   }
kvm_pv_disable_apf();
kvm_disable_steal_time();
 }
-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v8 17/18] KVM: x86: Add kexec support for SEV Live Migration.

2020-05-05 Thread Ashish Kalra
From: Ashish Kalra 

Reset the host's page encryption bitmap related to kernel
specific page encryption status settings before we load a
new kernel by kexec. We cannot reset the complete
page encryption bitmap here as we need to retain the
UEFI/OVMF firmware specific settings.

The host's page encryption bitmap is maintained for the
guest to keep the encrypted/decrypted state of the guest pages,
therefore we need to explicitly mark all shared pages as
encrypted again before rebooting into the new guest kernel.

Signed-off-by: Ashish Kalra 
---
 arch/x86/kernel/kvm.c | 28 
 1 file changed, 28 insertions(+)

diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 4b29815de873..a8bc30d5b15b 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -35,6 +35,7 @@
 #include 
 #include 
 #include 
+#include 
 
 static int kvmapf = 1;
 
@@ -358,6 +359,33 @@ static void kvm_pv_guest_cpu_reboot(void *unused)
 */
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
wrmsrl(MSR_KVM_PV_EOI_EN, 0);
+   /*
+* Reset the host's page encryption bitmap related to kernel
+* specific page encryption status settings before we load a
+* new kernel by kexec. NOTE: We cannot reset the complete
+* page encryption bitmap here as we need to retain the
+* UEFI/OVMF firmware specific settings.
+*/
+   if (sev_live_migration_enabled() & (smp_processor_id() == 0)) {
+   int i;
+   unsigned long nr_pages;
+
+   for (i = 0; i < e820_table->nr_entries; i++) {
+   struct e820_entry *entry = &e820_table->entries[i];
+   unsigned long start_pfn;
+   unsigned long end_pfn;
+
+   if (entry->type != E820_TYPE_RAM)
+   continue;
+
+   start_pfn = entry->addr >> PAGE_SHIFT;
+   end_pfn = (entry->addr + entry->size) >> PAGE_SHIFT;
+   nr_pages = DIV_ROUND_UP(entry->size, PAGE_SIZE);
+
+   kvm_sev_hypercall3(KVM_HC_PAGE_ENC_STATUS,
+  entry->addr, nr_pages, 1);
+   }
+   }
kvm_pv_disable_apf();
kvm_disable_steal_time();
 }
-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v7 18/18] KVM: x86: Add kexec support for SEV Live Migration.

2020-04-30 Thread Ashish Kalra
From: Ashish Kalra 

Reset the host's page encryption bitmap related to kernel
specific page encryption status settings before we load a
new kernel by kexec. We cannot reset the complete
page encryption bitmap here as we need to retain the
UEFI/OVMF firmware specific settings.

Signed-off-by: Ashish Kalra 
---
 arch/x86/kernel/kvm.c | 28 
 1 file changed, 28 insertions(+)

diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 8b8cc87a3461..68f2de4c1e74 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -35,6 +35,7 @@
 #include 
 #include 
 #include 
+#include 
 
 static int kvmapf = 1;
 
@@ -358,6 +359,33 @@ static void kvm_pv_guest_cpu_reboot(void *unused)
 */
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
wrmsrl(MSR_KVM_PV_EOI_EN, 0);
+   /*
+* Reset the host's page encryption bitmap related to kernel
+* specific page encryption status settings before we load a
+* new kernel by kexec. NOTE: We cannot reset the complete
+* page encryption bitmap here as we need to retain the
+* UEFI/OVMF firmware specific settings.
+*/
+   if (sev_live_migration_enabled() & (smp_processor_id() == 0)) {
+   int i;
+   unsigned long nr_pages;
+
+   for (i = 0; i < e820_table->nr_entries; i++) {
+   struct e820_entry *entry = &e820_table->entries[i];
+   unsigned long start_pfn;
+   unsigned long end_pfn;
+
+   if (entry->type != E820_TYPE_RAM)
+   continue;
+
+   start_pfn = entry->addr >> PAGE_SHIFT;
+   end_pfn = (entry->addr + entry->size) >> PAGE_SHIFT;
+   nr_pages = DIV_ROUND_UP(entry->size, PAGE_SIZE);
+
+   kvm_sev_hypercall3(KVM_HC_PAGE_ENC_STATUS,
+  entry->addr, nr_pages, 1);
+   }
+   }
kvm_pv_disable_apf();
kvm_disable_steal_time();
 }
-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec