QEMU 4.0 and later support a new way of booting Linux by specifying a 64-bit vmlinux ELF file. For details, please see https://patchwork.kernel.org/patch/10741013/.
This patch enhances OSv boot logic and loader.elf to support this new mode. Some tests show OSv boots in ~ 10ms in PVH mode. Here is an example script to execute OSv: qemu-system-x86_64 \ -m 64M -smp 2 \ -nodefaults \ -append '--bootchart /hello' \ -machine q35,accel=kvm,kernel_irqchip \ -kernel ./build/release/loader-stripped.elf \ -vga none -display none \ -serial mon:stdio \ -device virtio-blk-pci,id=blk0,drive=hd0,scsi=off \ -drive file=./build/last/usr.img,if=none,id=hd0,cache=none,aio=threads Signed-off-by: Waldemar Kozaczuk <jwkozac...@gmail.com> --- arch/x64/arch-setup.cc | 16 ++ arch/x64/arch-setup.hh | 2 + arch/x64/boot.S | 9 +- arch/x64/entry-xen.S | 8 + arch/x64/vmlinux.cc | 13 +- arch/x64/xen.cc | 34 ++++ .../xen/interface/arch-x86/hvm/start_info.h | 159 ++++++++++++++++++ bsd/sys/xen/interface/elfnote.h | 10 ++ 8 files changed, 238 insertions(+), 13 deletions(-) create mode 100644 bsd/sys/xen/interface/arch-x86/hvm/start_info.h diff --git a/arch/x64/arch-setup.cc b/arch/x64/arch-setup.cc index 47528408..e5fb7a6e 100644 --- a/arch/x64/arch-setup.cc +++ b/arch/x64/arch-setup.cc @@ -316,3 +316,19 @@ bool arch_setup_console(std::string opt_console) } return true; } + +void reset_bootchart(osv_multiboot_info_type* mb_info) +{ + auto now = processor::ticks(); + u32 now_high = (u32)(now >> 32); + u32 now_low = (u32)now; + + mb_info->tsc_init_hi = now_high; + mb_info->tsc_init = now_low; + + mb_info->tsc_disk_done_hi = now_high; + mb_info->tsc_disk_done = now_low; + + mb_info->tsc_uncompress_done_hi = now_high; + mb_info->tsc_uncompress_done = now_low; +} \ No newline at end of file diff --git a/arch/x64/arch-setup.hh b/arch/x64/arch-setup.hh index 32ba378a..8a83ac0f 100644 --- a/arch/x64/arch-setup.hh +++ b/arch/x64/arch-setup.hh @@ -60,4 +60,6 @@ void arch_setup_free_memory(); void arch_init_drivers(); bool arch_setup_console(std::string opt_console); +void reset_bootchart(osv_multiboot_info_type* mb_info); + #endif /* ARCH_SETUP_HH_ */ diff --git 
a/arch/x64/boot.S b/arch/x64/boot.S index cf04a4bb..c9435d70 100644 --- a/arch/x64/boot.S +++ b/arch/x64/boot.S @@ -148,10 +148,17 @@ start64: # it contains the address of the boot_params structure # that would be set if we came here from vmlinux_entry64 cmp $0x0, %rdi - jz start64_continue + jz detect_pvh_boot call extract_linux_boot_params mov $0x1000, %rbx +detect_pvh_boot: + mov hvm_xen_start_info, %rdi + cmp $0x0, %rdi + jz start64_continue + call hvm_xen_extract_boot_params + mov $0x1000, %rbx + start64_continue: lea .bss, %rdi lea .edata, %rcx diff --git a/arch/x64/entry-xen.S b/arch/x64/entry-xen.S index 10927089..bc37d4cd 100644 --- a/arch/x64/entry-xen.S +++ b/arch/x64/entry-xen.S @@ -30,6 +30,7 @@ elfnote_str(XEN_ELFNOTE_GUEST_VERSION, "?.?") elfnote_str(XEN_ELFNOTE_LOADER, "generic") elfnote_str(XEN_ELFNOTE_FEATURES, "!writable_page_tables") elfnote_str(XEN_ELFNOTE_BSD_SYMTAB, "yes") +elfnote_val(XEN_ELFNOTE_PHYS32_ENTRY, hvm_xen_start-OSV_KERNEL_VM_SHIFT) .data @@ -51,3 +52,10 @@ xen_start: call xen_init mov $0x0, %rdi jmp start64 + +.code32 +hvm_xen_start: + mov %ebx, hvm_xen_start_info-OSV_KERNEL_VM_SHIFT + mov $0x7c00, %esp # Allocate some temporary stack -> TODO: Probably unnecessary + mov $start32-OSV_KERNEL_VM_SHIFT, %eax + jmp *%eax diff --git a/arch/x64/vmlinux.cc b/arch/x64/vmlinux.cc index 6a551a98..c63457ff 100644 --- a/arch/x64/vmlinux.cc +++ b/arch/x64/vmlinux.cc @@ -78,16 +78,5 @@ extern "C" void extract_linux_boot_params(void *boot_params) mb_info->mb.mmap_length += sizeof(e820ent); } - auto now = processor::ticks(); - u32 now_high = (u32)(now >> 32); - u32 now_low = (u32)now; - - mb_info->tsc_init_hi = now_high; - mb_info->tsc_init = now_low; - - mb_info->tsc_disk_done_hi = now_high; - mb_info->tsc_disk_done = now_low; - - mb_info->tsc_uncompress_done_hi = now_high; - mb_info->tsc_uncompress_done = now_low; + reset_bootchart(mb_info); } diff --git a/arch/x64/xen.cc b/arch/x64/xen.cc index c02bf62c..78678af3 100644 --- a/arch/x64/xen.cc 
+++ b/arch/x64/xen.cc @@ -12,17 +12,20 @@ #include "processor.hh" #include "cpuid.hh" #include "exceptions.hh" +#include "arch-setup.hh" #include <osv/interrupt.hh> #include <osv/sched.hh> #include <bsd/porting/pcpu.h> #include <machine/xen/xen-os.h> #include <xen/evtchn.h> +#include <xen/interface/arch-x86/hvm/start_info.h> shared_info_t *HYPERVISOR_shared_info; uint8_t xen_features[XENFEAT_NR_SUBMAPS * 32]; // make sure xen_start_info is not in .bss, or it will be overwritten // by init code, as xen_init() is called before .bss initialization struct start_info* xen_start_info __attribute__((section(".data"))); +struct hvm_start_info* hvm_xen_start_info __attribute__((section(".data"))); namespace xen { @@ -221,4 +224,35 @@ void xen_init(struct start_info* si) { xen_start_info = si; } + +#define OSV_MULTI_BOOT_INFO_ADDR 0x1000 +#define OSV_E820_TABLE_ADDR 0x2000 + +extern "C" +void hvm_xen_extract_boot_params() +{ + // Set location of multiboot info struct at arbitrary place in lower memory + // to copy to (happens to be the same as in boot16.S) + osv_multiboot_info_type* mb_info = reinterpret_cast<osv_multiboot_info_type*>(OSV_MULTI_BOOT_INFO_ADDR); + + // Copy command line pointer from boot params + mb_info->mb.cmdline = hvm_xen_start_info->cmdline_paddr; + + // Copy e820 information from boot params + mb_info->mb.mmap_length = 0; + mb_info->mb.mmap_addr = OSV_E820_TABLE_ADDR; + + struct hvm_memmap_table_entry *source_e820_table = reinterpret_cast<struct hvm_memmap_table_entry *>(hvm_xen_start_info->memmap_paddr); + struct e820ent *dest_e820_table = reinterpret_cast<struct e820ent *>(mb_info->mb.mmap_addr); + + for (uint32_t e820_index = 0; e820_index < hvm_xen_start_info->memmap_entries; e820_index++) { + dest_e820_table[e820_index].ent_size = 20; + dest_e820_table[e820_index].type = source_e820_table[e820_index].type; + dest_e820_table[e820_index].addr = source_e820_table[e820_index].addr; + dest_e820_table[e820_index].size = 
source_e820_table[e820_index].size; + mb_info->mb.mmap_length += sizeof(e820ent); + } + + reset_bootchart(mb_info); +} } diff --git a/bsd/sys/xen/interface/arch-x86/hvm/start_info.h b/bsd/sys/xen/interface/arch-x86/hvm/start_info.h new file mode 100644 index 00000000..50af9ea2 --- /dev/null +++ b/bsd/sys/xen/interface/arch-x86/hvm/start_info.h @@ -0,0 +1,159 @@ +/* + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright (c) 2016, Citrix Systems, Inc. + */ + +#ifndef __XEN_PUBLIC_ARCH_X86_HVM_START_INFO_H__ +#define __XEN_PUBLIC_ARCH_X86_HVM_START_INFO_H__ + +/* + * Start of day structure passed to PVH guests and to HVM guests in %ebx. + * + * NOTE: nothing will be loaded at physical address 0, so a 0 value in any + * of the address fields should be treated as not present. + * + * 0 +----------------+ + * | magic | Contains the magic value XEN_HVM_START_MAGIC_VALUE + * | | ("xEn3" with the 0x80 bit of the "E" set). 
+ * 4 +----------------+ + * | version | Version of this structure. Current version is 1. New + * | | versions are guaranteed to be backwards-compatible. + * 8 +----------------+ + * | flags | SIF_xxx flags. + * 12 +----------------+ + * | nr_modules | Number of modules passed to the kernel. + * 16 +----------------+ + * | modlist_paddr | Physical address of an array of modules + * | | (layout of the structure below). + * 24 +----------------+ + * | cmdline_paddr | Physical address of the command line, + * | | a zero-terminated ASCII string. + * 32 +----------------+ + * | rsdp_paddr | Physical address of the RSDP ACPI data structure. + * 40 +----------------+ + * | memmap_paddr | Physical address of the (optional) memory map. Only + * | | present in version 1 and newer of the structure. + * 48 +----------------+ + * | memmap_entries | Number of entries in the memory map table. Zero + * | | if there is no memory map being provided. Only + * | | present in version 1 and newer of the structure. + * 52 +----------------+ + * | reserved | Version 1 and newer only. + * 56 +----------------+ + * + * The layout of each entry in the module structure is the following: + * + * 0 +----------------+ + * | paddr | Physical address of the module. + * 8 +----------------+ + * | size | Size of the module in bytes. + * 16 +----------------+ + * | cmdline_paddr | Physical address of the command line, + * | | a zero-terminated ASCII string. + * 24 +----------------+ + * | reserved | + * 32 +----------------+ + * + * The layout of each entry in the memory map table is as follows: + * + * 0 +----------------+ + * | addr | Base address + * 8 +----------------+ + * | size | Size of mapping in bytes + * 16 +----------------+ + * | type | Type of mapping as defined between the hypervisor + * | | and guest. See XEN_HVM_MEMMAP_TYPE_* values below. 
+ * 20 +----------------| + * | reserved | + * 24 +----------------+ + * + * The address and sizes are always a 64bit little endian unsigned integer. + * + * NB: Xen on x86 will always try to place all the data below the 4GiB + * boundary. + * + * Version numbers of the hvm_start_info structure have evolved like this: + * + * Version 0: Initial implementation. + * + * Version 1: Added the memmap_paddr/memmap_entries fields (plus 4 bytes of + * padding) to the end of the hvm_start_info struct. These new + * fields can be used to pass a memory map to the guest. The + * memory map is optional and so guests that understand version 1 + * of the structure must check that memmap_entries is non-zero + * before trying to read the memory map. + */ +#define XEN_HVM_START_MAGIC_VALUE 0x336ec578 + +/* + * The values used in the type field of the memory map table entries are + * defined below and match the Address Range Types as defined in the "System + * Address Map Interfaces" section of the ACPI Specification. Please refer to + * section 15 in version 6.2 of the ACPI spec: http://uefi.org/specifications + */ +#define XEN_HVM_MEMMAP_TYPE_RAM 1 +#define XEN_HVM_MEMMAP_TYPE_RESERVED 2 +#define XEN_HVM_MEMMAP_TYPE_ACPI 3 +#define XEN_HVM_MEMMAP_TYPE_NVS 4 +#define XEN_HVM_MEMMAP_TYPE_UNUSABLE 5 +#define XEN_HVM_MEMMAP_TYPE_DISABLED 6 +#define XEN_HVM_MEMMAP_TYPE_PMEM 7 + +/* + * C representation of the x86/HVM start info layout. + * + * The canonical definition of this layout is above, this is just a way to + * represent the layout described there using C types. + */ +struct hvm_start_info { + uint32_t magic; /* Contains the magic value 0x336ec578 */ + /* ("xEn3" with the 0x80 bit of the "E" set).*/ + uint32_t version; /* Version of this structure. */ + uint32_t flags; /* SIF_xxx flags. */ + uint32_t nr_modules; /* Number of modules passed to the kernel. */ + uint64_t modlist_paddr; /* Physical address of an array of */ + /* hvm_modlist_entry. 
*/ + uint64_t cmdline_paddr; /* Physical address of the command line. */ + uint64_t rsdp_paddr; /* Physical address of the RSDP ACPI data */ + /* structure. */ + /* All following fields only present in version 1 and newer */ + uint64_t memmap_paddr; /* Physical address of an array of */ + /* hvm_memmap_table_entry. */ + uint32_t memmap_entries; /* Number of entries in the memmap table. */ + /* Value will be zero if there is no memory */ + /* map being provided. */ + uint32_t reserved; /* Must be zero. */ +}; + +struct hvm_modlist_entry { + uint64_t paddr; /* Physical address of the module. */ + uint64_t size; /* Size of the module in bytes. */ + uint64_t cmdline_paddr; /* Physical address of the command line. */ + uint64_t reserved; +}; + +struct hvm_memmap_table_entry { + uint64_t addr; /* Base address of the memory region */ + uint64_t size; /* Size of the memory region in bytes */ + uint32_t type; /* Mapping type */ + uint32_t reserved; /* Must be zero for Version 1. */ +}; + +#endif /* __XEN_PUBLIC_ARCH_X86_HVM_START_INFO_H__ */ diff --git a/bsd/sys/xen/interface/elfnote.h b/bsd/sys/xen/interface/elfnote.h index 77be41bb..8487cce4 100644 --- a/bsd/sys/xen/interface/elfnote.h +++ b/bsd/sys/xen/interface/elfnote.h @@ -161,6 +161,16 @@ */ #define XEN_ELFNOTE_SUSPEND_CANCEL 14 +/* + * Physical entry point into the kernel. + * + * 32bit entry point into the kernel. When requested to launch the + * guest kernel in a HVM container, Xen will use this entry point to + * launch the guest in 32bit protected mode with paging disabled. + * Ignored otherwise. + */ +#define XEN_ELFNOTE_PHYS32_ENTRY 18 + /* * The number of the highest elfnote defined. */ -- 2.20.1 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. 
To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20190707042237.6831-1-jwkozaczuk%40gmail.com. For more options, visit https://groups.google.com/d/optout.